
Commit

Apply pre-commit style fixes
binkjakub committed Jun 24, 2024
1 parent a3999bd commit 8f189b3
Showing 32 changed files with 711 additions and 691 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/nbdev-test.yaml
@@ -4,7 +4,7 @@ on: [workflow_dispatch, pull_request]
jobs:
  nbdev-test:
    runs-on: ubuntu-latest
    steps:
      - uses: fastai/workflows/nbdev-ci@master
        with:
          skip_test: true
2 changes: 1 addition & 1 deletion Dockerfile
@@ -24,7 +24,7 @@ RUN apt-get update \
    && gdebi -n quarto-1.5.17-linux-amd64.deb \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists \
    && rm -rf /tmp

ENV PYTHONDONTWRITEBYTECODE 1
ENV PYTHONUNBUFFERED 1
Expand Down
10 changes: 5 additions & 5 deletions configs/embedding.yaml
@@ -2,8 +2,8 @@ defaults:
  - embedding_model: ???
  - dataset: pl-court-raw
  - _self_
  - override hydra/hydra_logging: disabled
  - override hydra/job_logging: disabled

length_adjust_mode: chunk
chunk_config:
@@ -14,7 +14,7 @@ batch_size: 64

output_dir: data/embeddings/${dataset.name}/${hydra:runtime.choices.embedding_model}/all_embeddings

hydra:
  output_subdir: null
  run:
    dir: .
10 changes: 5 additions & 5 deletions configs/fine_tuning.yaml
@@ -2,8 +2,8 @@ defaults:
  - model: ???
  - dataset: pl-court-instruct
  - _self_
  - override hydra/hydra_logging: disabled
  - override hydra/job_logging: disabled

output_dir: data/experiments/fine-tune/${hydra:runtime.choices.model}/${hydra:runtime.choices.dataset}
run_name: ${hydra:runtime.choices.model}_${hydra:runtime.choices.dataset}_fine_tune
@@ -17,7 +17,7 @@ truncate_context: True
epochs: 1
batch_size: 4

hydra:
  output_subdir: null
  run:
    dir: .
10 changes: 5 additions & 5 deletions configs/predict.yaml
@@ -2,8 +2,8 @@ defaults:
  - model: ???
  - dataset: pl-court-instruct
  - _self_
  - override hydra/hydra_logging: disabled
  - override hydra/job_logging: disabled

device_map: 'auto'
output_file: data/experiments/predict/${hydra:runtime.choices.dataset}/outputs_${hydra:runtime.choices.model}.json
@@ -12,7 +12,7 @@ metrics_file: data/experiments/predict/${hydra:runtime.choices.dataset}/metrics_
max_new_tokens: 250
truncate_context: True

hydra:
  output_subdir: null
  run:
    dir: .
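
As an aside on how these configs are consumed: the `${hydra:runtime.choices.*}` interpolations resolve to whichever config-group options are selected at run time. A minimal Hydra entry-point sketch follows; the script name and the `some_model` override are assumptions, not taken from this commit.

```python
import hydra
from omegaconf import DictConfig


@hydra.main(version_base=None, config_path="configs", config_name="predict")
def main(cfg: DictConfig) -> None:
    # Run as e.g. `python predict.py model=some_model`; the `model: ???` default
    # makes the override mandatory. Inside the app, ${hydra:runtime.choices.model}
    # resolves to "some_model", so cfg.output_file becomes
    # data/experiments/predict/pl-court-instruct/outputs_some_model.json.
    print(cfg.output_file)


if __name__ == "__main__":
    main()
```
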
80 changes: 40 additions & 40 deletions dashboards/pages/01_🔍_Search_Judgements.py
@@ -1,40 +1,40 @@
from typing import Any
import streamlit as st

from juddges.data.datasets import get_mongo_collection
from pymongo.collection import Collection

TITLE = "Search for Judgements"

st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide")

st.title(TITLE)


@st.cache_resource
def get_judgements_collection() -> Collection:
    return get_mongo_collection("judgements")


judgements_collection = get_judgements_collection()


def search_data(query: str, max_judgements: int = 5) -> list[dict[str, Any]]:
    items = list(judgements_collection.find({"$text": {"$search": query}}).limit(max_judgements))
    return items


with st.form(key="search_form"):
    text = st.text_area("What are you looking for in the judgements?")
    max_judgements = st.slider("Max judgements to show", min_value=1, max_value=20, value=5)
    submit_button = st.form_submit_button(label="Search")

if submit_button:
    with st.spinner("Searching..."):
        items = search_data(text, max_judgements)

    st.header("Judgements - Results")
    for item in items:
        st.header(item["signature"])
        st.subheader(item["publicationDate"])
        st.write(item["text"])
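
A side note on the `$text` query in `search_data`: MongoDB only serves `$text` searches from a collection that has a text index. A minimal setup sketch, reusing the project's own helper; the indexed field name `text` is an assumption about the judgement documents' schema.

```python
from juddges.data.datasets import get_mongo_collection

# One-time setup sketch: without a text index, {"$text": {"$search": ...}} raises an error.
# The field name "text" is assumed; index whichever field holds the judgement body.
judgements = get_mongo_collection("judgements")
judgements.create_index([("text", "text")], name="judgement_text_index")
```
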
172 changes: 86 additions & 86 deletions dashboards/pages/02_🔍_Analyse_Extracted_Information.py
@@ -1,86 +1,86 @@
import io

import pandas as pd
import streamlit as st

from juddges.prompts.information_extraction import EXAMPLE_SCHEMA
from juddges.settings import SAMPLE_DATA_PATH

TITLE = "Analyse Judgements"

st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide")

st.title(TITLE)


@st.cache_resource
def load_data():
    return pd.read_csv(SAMPLE_DATA_PATH / "judgements-100-sample-with-retrieved-informations.csv")


df = load_data()
extracted_keys = [line.split(":")[0] for line in EXAMPLE_SCHEMA.split("\n") if len(line) > 3] + [
    "signature",
    "excerpt",
    "text",
    "judges",
    "references",
]

st.info(
    "We sampled 100 random judgements from the dataset and extracted information from them. Below is the extracted information and the schema (questions) used to extract it."
)

st.text_area(
    "Example schema for extracted information:", value=EXAMPLE_SCHEMA, height=300, disabled=True
)

st.header("Extracted Information - tabular format")
st.write(df[extracted_keys])


output = io.BytesIO()
with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
    df.to_excel(writer, sheet_name="Sheet1", index=False)
output.seek(0)
st.download_button(
    label="Download data as Excel",
    data=output,
    file_name="judgements.xlsx",
    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
)

st.header("Analyse Extracted Information")

st.subheader("How many judgements did we analyse?")

st.write(f"Number of judgements: {len(df)}")

st.subheader("Which courts' judgements do we analyse?")

st.write(df.groupby("court")["_id"].count())

st.subheader("How many judgements are drug offences?")

drug_offences = df["drug_offence"].sum()

st.info(f"Number of drug offences: {drug_offences}")

st.subheader("How many judgements are child offences?")

child_offences = df["child_offence"].sum()

st.info(f"Number of child offences: {child_offences}")

st.subheader("Show examples of judgements that are child offences")

child_offences_df = df[df["child_offence"]]

st.write("We can check their sentences.")

for row_id, row in child_offences_df.iterrows():
    st.subheader(row["signature"])
    st.info(row["verdict_summary"])
    # Use a unique string key per row; a pandas Series is not a valid widget key.
    if st.toggle(key=f"show_text_{row_id}", label="Show judgement's text"):
        st.markdown(row["text"])
    st.markdown("---")
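
For intuition on how `extracted_keys` is built above: `EXAMPLE_SCHEMA` is treated as a newline-separated `key: question` listing, and the dashboard keeps the part before the first colon of each non-trivial line. The stand-in schema below is purely hypothetical; the real one ships with `juddges.prompts.information_extraction`.

```python
# Hypothetical stand-in for EXAMPLE_SCHEMA; the actual schema content may differ.
EXAMPLE_SCHEMA = """\
drug_offence: Does the judgement concern a drug offence? (boolean)
child_offence: Does the judgement concern an offence against a child? (boolean)
verdict_summary: A short summary of the verdict (string)
"""

# Mirrors the dashboard's parsing: everything before the first colon becomes a key,
# yielding ["drug_offence", "child_offence", "verdict_summary"].
keys = [line.split(":")[0] for line in EXAMPLE_SCHEMA.split("\n") if len(line) > 3]
print(keys)
```
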
16 changes: 8 additions & 8 deletions data/datasets/pl/graph/template_README.md
@@ -10,11 +10,11 @@ tags: {{tags}}
# Polish Court Judgments Graph

## Dataset description
We introduce a graph dataset of Polish Court Judgments. This dataset is primarily based on [`JuDDGES/pl-court-raw`](https://huggingface.co/datasets/JuDDGES/pl-court-raw). The dataset consists of nodes representing either judgments or legal bases, and edges connecting judgments to the legal bases they refer to. The graph was also cleaned of small disconnected components, leaving a single giant component. The resulting graph is bipartite. We provide the dataset in both `JSON` and `PyG` formats, each serving a different purpose. While the graphs in these formats are structurally identical, their attributes differ.

The `JSON` format is intended for analysis and contains most of the attributes available in [`JuDDGES/pl-court-raw`](https://huggingface.co/datasets/JuDDGES/pl-court-raw). We excluded some less-useful attributes and text content, which can be easily retrieved from the raw dataset and added to the graph as needed.

The `PyG` format is designed for machine learning applications, such as link prediction on graphs, and is fully compatible with the [`Pytorch Geometric`](https://github.com/pyg-team/pytorch_geometric) framework.

In the following sections, we provide a more detailed explanation and use case examples for each format.

@@ -28,9 +28,9 @@ In the following sections, we provide a more detailed explanation and use case e
| #nodes (type=`legal_base`) | {{num_target_nodes}} |
| avg(degree) | {{avg_degree}} |


![png](assets/degree_distribution.png)



## `JSON` format
@@ -67,10 +67,10 @@ g = nx.node_link_graph(g_data)

## `PyG` format

The `PyTorch Geometric` format includes embeddings of the judgment content, obtained with [{{embedding_method}}](https://huggingface.co/{{embedding_method}}) for judgment nodes,
and one-hot-vector identifiers for legal-base nodes (note that for efficiency one can substitute them with random noise identifiers,
like in [(Abboud et al., 2021)](https://arxiv.org/abs/2010.01179)).
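
To make the identifier trade-off concrete, here is a minimal illustrative sketch; the node count and noise dimensionality below are made-up values, not taken from the dataset.

```python
import torch

# Assumed sizes, for illustration only; the real dataset defines the actual counts.
num_legal_base_nodes = 1_000
noise_dim = 128

# One-hot identifiers, as shipped in the PyG format: an N x N feature matrix.
one_hot_ids = torch.eye(num_legal_base_nodes)

# Random-noise identifiers in the spirit of Abboud et al. (2021): N x noise_dim,
# much smaller, while still letting a GNN tell otherwise featureless nodes apart.
random_ids = torch.randn(num_legal_base_nodes, noise_dim)
```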



### Loading
@@ -134,4 +134,4 @@ print(ds)
### Example usage
```python
# TBD
```
@@ -9,4 +9,4 @@
"recorder": 0.9931748509407043,
"signature": 0.9937450289726257
}
}
@@ -9,4 +9,4 @@
"recorder": 0.7640316486358643,
"signature": 0.7549777626991272
}
}