diff --git a/.github/workflows/nbdev-test.yaml b/.github/workflows/nbdev-test.yaml index acc31c8..822fab5 100644 --- a/.github/workflows/nbdev-test.yaml +++ b/.github/workflows/nbdev-test.yaml @@ -4,7 +4,7 @@ on: [workflow_dispatch, pull_request] jobs: nbdev-test: runs-on: ubuntu-latest - steps: + steps: - uses: fastai/workflows/nbdev-ci@master with: skip_test: true diff --git a/Dockerfile b/Dockerfile index bf1347c..75968f9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,7 +24,7 @@ RUN apt-get update \ && gdebi -n quarto-1.5.17-linux-amd64.deb \ && apt-get clean \ && rm -rf /var/lib/apt/lists \ - && rm -rf /tmp + && rm -rf /tmp ENV PYTHONDONTWRITEBYTECODE 1 ENV PYTHONUNBUFFERED 1 diff --git a/configs/embedding.yaml b/configs/embedding.yaml index c9c891e..37afafe 100644 --- a/configs/embedding.yaml +++ b/configs/embedding.yaml @@ -2,8 +2,8 @@ defaults: - embedding_model: ??? - dataset: pl-court-raw - _self_ - - override hydra/hydra_logging: disabled - - override hydra/job_logging: disabled + - override hydra/hydra_logging: disabled + - override hydra/job_logging: disabled length_adjust_mode: chunk chunk_config: @@ -14,7 +14,7 @@ batch_size: 64 output_dir: data/embeddings/${dataset.name}/${hydra:runtime.choices.embedding_model}/all_embeddings -hydra: - output_subdir: null - run: +hydra: + output_subdir: null + run: dir: . diff --git a/configs/fine_tuning.yaml b/configs/fine_tuning.yaml index 4cac7aa..e96d55b 100644 --- a/configs/fine_tuning.yaml +++ b/configs/fine_tuning.yaml @@ -2,8 +2,8 @@ defaults: - model: ??? - dataset: pl-court-instruct - _self_ - - override hydra/hydra_logging: disabled - - override hydra/job_logging: disabled + - override hydra/hydra_logging: disabled + - override hydra/job_logging: disabled output_dir: data/experiments/fine-tune/${hydra:runtime.choices.model}/${hydra:runtime.choices.dataset} run_name: ${hydra:runtime.choices.model}_${hydra:runtime.choices.dataset}_fine_tune @@ -17,7 +17,7 @@ truncate_context: True epochs: 1 batch_size: 4 -hydra: - output_subdir: null - run: +hydra: + output_subdir: null + run: dir: . diff --git a/configs/predict.yaml b/configs/predict.yaml index a85cf50..9b47c82 100644 --- a/configs/predict.yaml +++ b/configs/predict.yaml @@ -2,8 +2,8 @@ defaults: - model: ??? - dataset: pl-court-instruct - _self_ - - override hydra/hydra_logging: disabled - - override hydra/job_logging: disabled + - override hydra/hydra_logging: disabled + - override hydra/job_logging: disabled device_map: 'auto' output_file: data/experiments/predict/${hydra:runtime.choices.dataset}/outputs_${hydra:runtime.choices.model}.json @@ -12,7 +12,7 @@ metrics_file: data/experiments/predict/${hydra:runtime.choices.dataset}/metrics_ max_new_tokens: 250 truncate_context: True -hydra: - output_subdir: null - run: +hydra: + output_subdir: null + run: dir: . diff --git "a/dashboards/pages/01_\360\237\224\215_Search_Judgements.py" "b/dashboards/pages/01_\360\237\224\215_Search_Judgements.py" index 74d86f0..266957e 100644 --- "a/dashboards/pages/01_\360\237\224\215_Search_Judgements.py" +++ "b/dashboards/pages/01_\360\237\224\215_Search_Judgements.py" @@ -1,40 +1,40 @@ -from typing import Any -import streamlit as st - -from juddges.data.datasets import get_mongo_collection -from pymongo.collection import Collection - -TITLE = "Search for Judgements" - -st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide") - -st.title(TITLE) - - -@st.cache_resource -def get_judgements_collection() -> Collection: - return get_mongo_collection("judgements") - - -judgements_collection = get_judgements_collection() - - -def search_data(query: str, max_judgements: int = 5) -> list[dict[str, Any]]: - items = list(judgements_collection.find({"$text": {"$search": query}}).limit(max_judgements)) - return items - - -with st.form(key="search_form"): - text = st.text_area("What you are looking for in the judgements?") - max_judgements = st.slider("Max judgements to show", min_value=1, max_value=20, value=5) - submit_button = st.form_submit_button(label="Search") - -if submit_button: - with st.spinner("Searching..."): - items = search_data(text, max_judgements) - - st.header("Judgements - Results") - for item in items: - st.header(item["signature"]) - st.subheader(item["publicationDate"]) - st.write(item["text"]) +from typing import Any +import streamlit as st + +from juddges.data.datasets import get_mongo_collection +from pymongo.collection import Collection + +TITLE = "Search for Judgements" + +st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide") + +st.title(TITLE) + + +@st.cache_resource +def get_judgements_collection() -> Collection: + return get_mongo_collection("judgements") + + +judgements_collection = get_judgements_collection() + + +def search_data(query: str, max_judgements: int = 5) -> list[dict[str, Any]]: + items = list(judgements_collection.find({"$text": {"$search": query}}).limit(max_judgements)) + return items + + +with st.form(key="search_form"): + text = st.text_area("What you are looking for in the judgements?") + max_judgements = st.slider("Max judgements to show", min_value=1, max_value=20, value=5) + submit_button = st.form_submit_button(label="Search") + +if submit_button: + with st.spinner("Searching..."): + items = search_data(text, max_judgements) + + st.header("Judgements - Results") + for item in items: + st.header(item["signature"]) + st.subheader(item["publicationDate"]) + st.write(item["text"]) diff --git "a/dashboards/pages/02_\360\237\224\215_Analyse_Extracted_Information.py" "b/dashboards/pages/02_\360\237\224\215_Analyse_Extracted_Information.py" index ad663b2..9aeb056 100644 --- "a/dashboards/pages/02_\360\237\224\215_Analyse_Extracted_Information.py" +++ "b/dashboards/pages/02_\360\237\224\215_Analyse_Extracted_Information.py" @@ -1,86 +1,86 @@ -import io - -import pandas as pd -import streamlit as st - -from juddges.prompts.information_extraction import EXAMPLE_SCHEMA -from juddges.settings import SAMPLE_DATA_PATH - -TITLE = "Analyse Judgements" - -st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide") - -st.title(TITLE) - - -@st.cache_resource -def load_data(): - return pd.read_csv(SAMPLE_DATA_PATH / "judgements-100-sample-with-retrieved-informations.csv") - - -df = load_data() -extracted_keys = [line.split(":")[0] for line in EXAMPLE_SCHEMA.split("\n") if len(line) > 3] + [ - "signature", - "excerpt", - "text", - "judges", - "references", -] - -st.info( - "We sampled 100 random judgements from the dataset and extracted information from them. Below is the extracted information and the schema (questions) used to extract it." -) - -st.text_area( - "Example schema for extracted informations: ", value=EXAMPLE_SCHEMA, height=300, disabled=True -) - -st.header("Extracted Information - tabular format") -st.write(df[extracted_keys]) - - -output = io.BytesIO() -with pd.ExcelWriter(output, engine="xlsxwriter") as writer: - df.to_excel(writer, sheet_name="Sheet1", index=False) -output.seek(0) -st.download_button( - label="Download data as Excel", - data=output, - file_name="judgements.xlsx", - mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", -) - -st.header("Analyse Extracted Information") - -st.subheader("How many judgements we analyzed?") - -st.write(f"Number of judgements: {len(df)}") - -st.subheader("What courts judgement do we analyse") - -st.write(df.groupby("court")["_id"].count()) - -st.subheader("How many judgements are drug offences?") - -drug_offences = df["drug_offence"].sum() - -st.info(f"Number of drug offences: {drug_offences}") - -st.subheader("How many judgements are child offences?") - -child_offences = df["child_offence"].sum() - -st.info(f"Number of child offences: {child_offences}") - -st.subheader("Show examples of judgements that are child offences") - -drug_offences_df = df[df["child_offence"]] - -st.write("We can check the sentences of them") - -for row_id, row in drug_offences_df.iterrows(): - st.subheader(row["signature"]) - st.info(row["verdict_summary"]) - if st.toggle(key=row, label="Show judgement's text"): - st.markdown(row["text"]) - st.markdown("---") +import io + +import pandas as pd +import streamlit as st + +from juddges.prompts.information_extraction import EXAMPLE_SCHEMA +from juddges.settings import SAMPLE_DATA_PATH + +TITLE = "Analyse Judgements" + +st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide") + +st.title(TITLE) + + +@st.cache_resource +def load_data(): + return pd.read_csv(SAMPLE_DATA_PATH / "judgements-100-sample-with-retrieved-informations.csv") + + +df = load_data() +extracted_keys = [line.split(":")[0] for line in EXAMPLE_SCHEMA.split("\n") if len(line) > 3] + [ + "signature", + "excerpt", + "text", + "judges", + "references", +] + +st.info( + "We sampled 100 random judgements from the dataset and extracted information from them. Below is the extracted information and the schema (questions) used to extract it." +) + +st.text_area( + "Example schema for extracted informations: ", value=EXAMPLE_SCHEMA, height=300, disabled=True +) + +st.header("Extracted Information - tabular format") +st.write(df[extracted_keys]) + + +output = io.BytesIO() +with pd.ExcelWriter(output, engine="xlsxwriter") as writer: + df.to_excel(writer, sheet_name="Sheet1", index=False) +output.seek(0) +st.download_button( + label="Download data as Excel", + data=output, + file_name="judgements.xlsx", + mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", +) + +st.header("Analyse Extracted Information") + +st.subheader("How many judgements we analyzed?") + +st.write(f"Number of judgements: {len(df)}") + +st.subheader("What courts judgement do we analyse") + +st.write(df.groupby("court")["_id"].count()) + +st.subheader("How many judgements are drug offences?") + +drug_offences = df["drug_offence"].sum() + +st.info(f"Number of drug offences: {drug_offences}") + +st.subheader("How many judgements are child offences?") + +child_offences = df["child_offence"].sum() + +st.info(f"Number of child offences: {child_offences}") + +st.subheader("Show examples of judgements that are child offences") + +drug_offences_df = df[df["child_offence"]] + +st.write("We can check the sentences of them") + +for row_id, row in drug_offences_df.iterrows(): + st.subheader(row["signature"]) + st.info(row["verdict_summary"]) + if st.toggle(key=row, label="Show judgement's text"): + st.markdown(row["text"]) + st.markdown("---") diff --git a/data/datasets/pl/graph/template_README.md b/data/datasets/pl/graph/template_README.md index 842b85f..8f9dd2b 100755 --- a/data/datasets/pl/graph/template_README.md +++ b/data/datasets/pl/graph/template_README.md @@ -10,11 +10,11 @@ tags: {{tags}} # Polish Court Judgments Graph ## Dataset description -We introduce a graph dataset of Polish Court Judgments. This dataset is primarily based on the [`JuDDGES/pl-court-raw`](https://huggingface.co/datasets/JuDDGES/pl-court-raw). The dataset consists of nodes representing either judgments or legal bases, and edges connecting judgments to the legal bases they refer to. Also, the graph was cleaned from small disconnected components, leaving single giant component. Consequently, the resulting graph is bipartite. We provide the dataset in both `JSON` and `PyG` formats, each has different purpose. While structurally graphs in these formats are the same, their attributes differ. +We introduce a graph dataset of Polish Court Judgments. This dataset is primarily based on the [`JuDDGES/pl-court-raw`](https://huggingface.co/datasets/JuDDGES/pl-court-raw). The dataset consists of nodes representing either judgments or legal bases, and edges connecting judgments to the legal bases they refer to. Also, the graph was cleaned from small disconnected components, leaving single giant component. Consequently, the resulting graph is bipartite. We provide the dataset in both `JSON` and `PyG` formats, each has different purpose. While structurally graphs in these formats are the same, their attributes differ. The `JSON` format is intended for analysis and contains most of the attributes available in [`JuDDGES/pl-court-raw`](https://huggingface.co/datasets/JuDDGES/pl-court-raw). We excluded some less-useful attributes and text content, which can be easily retrieved from the raw dataset and added to the graph as needed. -The `PyG` format is designed for machine learning applications, such as link prediction on graphs, and is fully compatible with the [`Pytorch Geometric`](https://github.com/pyg-team/pytorch_geometric) framework. +The `PyG` format is designed for machine learning applications, such as link prediction on graphs, and is fully compatible with the [`Pytorch Geometric`](https://github.com/pyg-team/pytorch_geometric) framework. In the following sections, we provide a more detailed explanation and use case examples for each format. @@ -28,9 +28,9 @@ In the following sections, we provide a more detailed explanation and use case e | #nodes (type=`legal_base`) | {{num_target_nodes}} | | avg(degree) | {{avg_degree}} | - + ![png](assets/degree_distribution.png) - + ## `JSON` format @@ -67,10 +67,10 @@ g = nx.node_link_graph(g_data) ## `PyG` format -The `PyTorch Geometric` format includes embeddings of the judgment content, obtained with [{{embedding_method}}](https://huggingface.co/{{embedding_method}}) for judgment nodes, -and one-hot-vector identifiers for legal-base nodes (note that for efficiency one can substitute it with random noise identifiers, +The `PyTorch Geometric` format includes embeddings of the judgment content, obtained with [{{embedding_method}}](https://huggingface.co/{{embedding_method}}) for judgment nodes, +and one-hot-vector identifiers for legal-base nodes (note that for efficiency one can substitute it with random noise identifiers, like in [(Abboud et al., 2021)](https://arxiv.org/abs/2010.01179)). - + ### Loading @@ -134,4 +134,4 @@ print(ds) ### Example usage ```python # TBD -``` \ No newline at end of file +``` diff --git a/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct-fine-tuned.json b/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct-fine-tuned.json index 6740087..4bfb85e 100644 --- a/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct-fine-tuned.json +++ b/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct-fine-tuned.json @@ -9,4 +9,4 @@ "recorder": 0.9931748509407043, "signature": 0.9937450289726257 } -} \ No newline at end of file +} diff --git a/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json b/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json index 0e8eb94..e474cc6 100644 --- a/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json +++ b/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json @@ -9,4 +9,4 @@ "recorder": 0.7640316486358643, "signature": 0.7549777626991272 } -} \ No newline at end of file +} diff --git a/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json b/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json index e0a55ee..6fed7d3 100644 --- a/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json +++ b/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json @@ -9,4 +9,4 @@ "recorder": 0.9933416843414307, "signature": 0.9780842661857605 } -} \ No newline at end of file +} diff --git a/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3.json b/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3.json index c6c6bd8..184b00e 100644 --- a/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3.json +++ b/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3.json @@ -9,4 +9,4 @@ "recorder": 0.9425673484802246, "signature": 0.56711345911026 } -} \ No newline at end of file +} diff --git a/data/experiments/predict/pl-court-instruct/metrics_summary.md b/data/experiments/predict/pl-court-instruct/metrics_summary.md index f2307b6..6225012 100644 --- a/data/experiments/predict/pl-court-instruct/metrics_summary.md +++ b/data/experiments/predict/pl-court-instruct/metrics_summary.md @@ -3,4 +3,4 @@ | Unsloth-Llama-3-8B-Instruct | 0.439 | 0.879 | 0.982 | 0.906 | 0.915 | 0.426 | 0.764 | 0.755 | | Unsloth-Llama-3-8B-Instruct-fine-tuned | 0.828 | 0.995 | 0.989 | 0.986 | 0.977 | 0.601 | 0.993 | 0.994 | | Unsloth-Mistral-7B-Instruct-v0.3 | 0.477 | 0.830 | 0.987 | 0.900 | 0.870 | 0.419 | 0.943 | 0.567 | -| Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned | 0.819 | 0.996 | 0.989 | 0.996 | 0.981 | 0.737 | 0.993 | 0.978 | \ No newline at end of file +| Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned | 0.819 | 0.996 | 0.989 | 0.996 | 0.981 | 0.737 | 0.993 | 0.978 | diff --git a/data/sample_data/.gitignore b/data/sample_data/.gitignore index fb39c27..1f07155 100644 --- a/data/sample_data/.gitignore +++ b/data/sample_data/.gitignore @@ -1,2 +1,2 @@ -/judgements-100-sample-with-retrieved-informations.csv -/judgements-100-sample.csv +/judgements-100-sample-with-retrieved-informations.csv +/judgements-100-sample.csv diff --git a/dvc.lock b/dvc.lock index b04ce8c..f642f40 100644 --- a/dvc.lock +++ b/dvc.lock @@ -94,10 +94,10 @@ stages: size: 24415235644 nfiles: 53 evaluate@Unsloth-Llama-3-8B-Instruct: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file + cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json deps: - - path: + - path: data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json hash: md5 md5: df2f1d464152f87737c8ebb5b0673854 @@ -107,16 +107,16 @@ stages: md5: 66211e8b6f056234240f094896966a9c size: 578 outs: - - path: + - path: data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json hash: md5 md5: 521a731cc2c45d3eda0656a8e69d505b size: 307 evaluate@Unsloth-Llama-3-8B-Instruct-fine-tuned: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file + cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct-fine-tuned.json deps: - - path: + - path: data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct-fine-tuned.json hash: md5 md5: 9199da7e04fb35cc1ce2bbe9dd5cd274 @@ -126,16 +126,16 @@ stages: md5: 66211e8b6f056234240f094896966a9c size: 578 outs: - - path: + - path: data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct-fine-tuned.json hash: md5 md5: 6a0eb30a14687342bc86ae80253cd60c size: 306 evaluate@Unsloth-Mistral-7B-Instruct-v0.3: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file + cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3.json deps: - - path: + - path: data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3.json hash: md5 md5: c2e03f3fbd29c744023bdac7e1007265 @@ -145,16 +145,16 @@ stages: md5: 66211e8b6f056234240f094896966a9c size: 578 outs: - - path: + - path: data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3.json hash: md5 md5: 091b8888275600052dd2dcdd36a55588 size: 305 evaluate@Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned: - cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file + cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json deps: - - path: + - path: data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json hash: md5 md5: a4fda5774b367e8924cf07f3bf271922 @@ -164,7 +164,7 @@ stages: md5: 66211e8b6f056234240f094896966a9c size: 578 outs: - - path: + - path: data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json hash: md5 md5: 3b3589929112cb2f199044d240e87bcc @@ -199,7 +199,7 @@ stages: md5: 59c2afb977f520c9134153def544111d size: 3188 outs: - - path: + - path: data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json hash: md5 md5: df2f1d464152f87737c8ebb5b0673854 @@ -220,7 +220,7 @@ stages: md5: 59c2afb977f520c9134153def544111d size: 3188 outs: - - path: + - path: data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct-fine-tuned.json hash: md5 md5: 9199da7e04fb35cc1ce2bbe9dd5cd274 @@ -241,7 +241,7 @@ stages: md5: 59c2afb977f520c9134153def544111d size: 3188 outs: - - path: + - path: data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3.json hash: md5 md5: c2e03f3fbd29c744023bdac7e1007265 @@ -262,7 +262,7 @@ stages: md5: 59c2afb977f520c9134153def544111d size: 3188 outs: - - path: + - path: data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json hash: md5 md5: a4fda5774b367e8924cf07f3bf271922 diff --git a/dvc.yaml b/dvc.yaml index 4d3f1b5..8bb0fd9 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -1,9 +1,9 @@ stages: raw_dataset_readme: cmd: >- - jupyter nbconvert - --no-input - --to markdown + jupyter nbconvert + --no-input + --to markdown --execute nbs/Data/02_Dataset_Description_Raw.ipynb --output-dir data/datasets/pl/readme/raw --output README @@ -15,9 +15,9 @@ stages: instruct_dataset_readme: cmd: >- - jupyter nbconvert - --no-input - --to markdown + jupyter nbconvert + --no-input + --to markdown --execute nbs/Data/03_Dataset_Description_Instruct.ipynb --output-dir data/datasets/pl/readme/instruct --output README @@ -54,7 +54,7 @@ stages: model: - mmlw-roberta-large cmd: >- - PYTHONPATH=. python scripts/embed/aggregate_embeddings.py + PYTHONPATH=. python scripts/embed/aggregate_embeddings.py --embeddings-dir data/embeddings/pl-court-raw/${item.model}/all_embeddings deps: - scripts/embed/aggregate_embeddings.py @@ -62,17 +62,17 @@ stages: outs: - data/embeddings/pl-court-raw/${item.model}/agg_embeddings.pt - + build_graph_dataset: cmd: >- PYTHONPATH=. python scripts/dataset/build_graph_dataset.py - --dataset-dir data/datasets/pl/raw + --dataset-dir data/datasets/pl/raw --embeddings-root-dir data/embeddings/pl-court-raw/mmlw-roberta-large/ --target-dir data/datasets/pl/graph deps: - scripts/dataset/build_graph_dataset.py - juddges/data/pl_court_graph.py - - data/datasets/pl/raw + - data/datasets/pl/raw - data/embeddings/pl-court-raw/mmlw-roberta-large/agg_embeddings.pt - data/embeddings/pl-court-raw/mmlw-roberta-large/all_embeddings/config.yaml outs: @@ -88,7 +88,7 @@ stages: PYTHONPATH=. python scripts/sft/fine_tune_unsloth.py model=${item.model} deps: - scripts/sft/fine_tune_unsloth.py - - configs/fine_tuning.yaml + - configs/fine_tuning.yaml - configs/model/${item.model}.yaml outs: - data/experiments/fine-tune/${item.model}/pl-court-instruct @@ -117,7 +117,7 @@ stages: - Unsloth-Mistral-7B-Instruct-v0.3 - Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned cmd: >- - PYTHONPATH=. python scripts/sft/evaluate.py + PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/outputs_${item.model}.json deps: - scripts/sft/evaluate.py @@ -131,7 +131,7 @@ stages: dir: - data/experiments/predict/pl-court-instruct cmd: >- - PYTHONPATH=. python scripts/sft/summarize_metrics.py + PYTHONPATH=. python scripts/sft/summarize_metrics.py --root-dir ${item.dir} deps: - scripts/sft/summarize_metrics.py diff --git a/juddges/data/database.py b/juddges/data/database.py index 9c2dae0..4b10c2f 100644 --- a/juddges/data/database.py +++ b/juddges/data/database.py @@ -1,81 +1,81 @@ -import os -from typing import Any, Callable, Generator, Iterable, Iterator - -from loguru import logger -from pymongo import MongoClient, UpdateOne -from pymongo.collection import Collection -from pymongo.cursor import Cursor -from pymongo.errors import BulkWriteError - - -def get_mongo_collection( - mongo_uri: str | None = None, - mongo_db: str | None = None, - collection_name: str = "pl-court", -) -> Collection: - uri = mongo_uri or os.environ.get("MONGO_URI") - assert uri, "Mongo URI is required" - db_name = mongo_db or os.environ.get("MONGO_DB_NAME") - assert db_name, "Mongo DB name is required" - - client: MongoClient = MongoClient(uri) - db = client[db_name] - return db[collection_name] - - -class BatchedDatabaseCursor: - """MongoDB cursor wrapper that returns documents in batches. - - Cursor is consumed in batches of specified size. - - Prefetch option loads all documents into memory before iterating. - """ - - def __init__(self, cursor: Cursor, batch_size: int, prefetch: bool) -> None: - self.cursor = cursor - self.batch_size = batch_size - self.prefetch = prefetch - - def __iter__(self) -> Iterator[list[dict[str, Any]]]: - if self.prefetch: - iterable: Iterable = list(self.cursor) - else: - iterable = self.cursor - - def gen_batches() -> Generator[list[dict[str, Any]], None, None]: - """Credit: https://stackoverflow.com/a/61809417""" - chunk: list[dict[str, Any]] = [] - for i, row in enumerate(iterable): - if i % self.batch_size == 0 and i > 0: - yield chunk - del chunk[:] - chunk.append(row) - yield chunk - - return gen_batches() - - -class BatchDatabaseUpdate: - """Updates database in batches using provided update function. - - Update function takes document id and returns dictionary with updated fields: - def update_func (document: dict[str, Any]) -> dict[str, Any]: - - Updated document may be constrained to only necessary fields (_id must be present). - - Update fields may or may not be already present in the database. - - Update is called specified documents. - """ - - def __init__(self, mongo_uri: str, update_func: Callable[[dict[str, Any]], dict]) -> None: - self.mongo_uri = mongo_uri - self.update_func = update_func - - def __call__(self, documents: list[dict[str, Any]]) -> None: - update_batch: list[UpdateOne] = [] - - for doc in documents: - update_data = self.update_func(doc) - update_batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": update_data})) - - collection = get_mongo_collection(mongo_uri=self.mongo_uri) - - try: - collection.bulk_write(update_batch, ordered=False) - except BulkWriteError as err: - logger.error(err) +import os +from typing import Any, Callable, Generator, Iterable, Iterator + +from loguru import logger +from pymongo import MongoClient, UpdateOne +from pymongo.collection import Collection +from pymongo.cursor import Cursor +from pymongo.errors import BulkWriteError + + +def get_mongo_collection( + mongo_uri: str | None = None, + mongo_db: str | None = None, + collection_name: str = "pl-court", +) -> Collection: + uri = mongo_uri or os.environ.get("MONGO_URI") + assert uri, "Mongo URI is required" + db_name = mongo_db or os.environ.get("MONGO_DB_NAME") + assert db_name, "Mongo DB name is required" + + client: MongoClient = MongoClient(uri) + db = client[db_name] + return db[collection_name] + + +class BatchedDatabaseCursor: + """MongoDB cursor wrapper that returns documents in batches. + - Cursor is consumed in batches of specified size. + - Prefetch option loads all documents into memory before iterating. + """ + + def __init__(self, cursor: Cursor, batch_size: int, prefetch: bool) -> None: + self.cursor = cursor + self.batch_size = batch_size + self.prefetch = prefetch + + def __iter__(self) -> Iterator[list[dict[str, Any]]]: + if self.prefetch: + iterable: Iterable = list(self.cursor) + else: + iterable = self.cursor + + def gen_batches() -> Generator[list[dict[str, Any]], None, None]: + """Credit: https://stackoverflow.com/a/61809417""" + chunk: list[dict[str, Any]] = [] + for i, row in enumerate(iterable): + if i % self.batch_size == 0 and i > 0: + yield chunk + del chunk[:] + chunk.append(row) + yield chunk + + return gen_batches() + + +class BatchDatabaseUpdate: + """Updates database in batches using provided update function. + - Update function takes document id and returns dictionary with updated fields: + def update_func (document: dict[str, Any]) -> dict[str, Any]: + - Updated document may be constrained to only necessary fields (_id must be present). + - Update fields may or may not be already present in the database. + - Update is called specified documents. + """ + + def __init__(self, mongo_uri: str, update_func: Callable[[dict[str, Any]], dict]) -> None: + self.mongo_uri = mongo_uri + self.update_func = update_func + + def __call__(self, documents: list[dict[str, Any]]) -> None: + update_batch: list[UpdateOne] = [] + + for doc in documents: + update_data = self.update_func(doc) + update_batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": update_data})) + + collection = get_mongo_collection(mongo_uri=self.mongo_uri) + + try: + collection.bulk_write(update_batch, ordered=False) + except BulkWriteError as err: + logger.error(err) diff --git a/juddges/prompts/information_extraction.py b/juddges/prompts/information_extraction.py index 0cc2284..18adb5e 100644 --- a/juddges/prompts/information_extraction.py +++ b/juddges/prompts/information_extraction.py @@ -1,108 +1,108 @@ -from langchain.output_parsers.json import parse_json_markdown -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate -from langchain_core.messages import AIMessage -from langchain_core.runnables import RunnableLambda, RunnablePassthrough, RunnableSequence -from langchain_openai import ChatOpenAI - -SCHEMA_PROMPT_TEMPLATE = """ -Act as a assistant that prepares schema for information extraction - -Based on the user input prepare schema containing variables with their short description and type. -Be precise about variable names, format names using snake_case. -If user asks irrelevant question always return empty JSON. -As example: -User: I want extract age, gender, and plea from the judgement -Agent: - age: integer - gender: male or female - plea: string - -==== -{SCHEMA_TEXT} -==== - -Format response as JSON: -""" - -EXTRACTION_PROMPT_TEMPLATE = """Act as a legal document tool that extracts information and answer questions based on judgements. - -Instruction for extracting information from judgements: -- Judgements are in {LANGUAGE} language, please extract information in {LANGUAGE}. -- Do not provide information that are not explicitly mentioned in judgements. If you can't extract information from the text field, leave the field with empty string "". - -Follow the following YAML structure to extract information and answer questions based on judgements: -{SCHEMA} - -==== -{TEXT} -==== - -Format response as JSON: -""" - -EXAMPLE_SCHEMA = """verdict_date: date as ISO 8601 -verdict: string, text representing verdict of the judgement -verdict_summary: string, short summary of the verdict -verdict_id: string -court: string -parties: string -appeal_against: string -first_trial: boolean -drug_offence: boolean -child_offence: boolean -offence_seriousness: boolean -verdict_tags: List[string]""" - - -def prepare_information_extraction_chain_from_user_prompt() -> RunnableSequence: - schema_chain = prepare_schema_chain() - inputs = { - "SCHEMA": schema_chain, - "TEXT": RunnablePassthrough(), - "LANGUAGE": RunnablePassthrough(), - } - return inputs | RunnableLambda(route) - - -def prepare_information_extraction_chain( - model_name: str = "gpt-4-0125-preview", - log_to_mlflow: bool = False, -) -> RunnableSequence: - model = ChatOpenAI(model=model_name, temperature=0) - human_message_template = HumanMessagePromptTemplate.from_template(EXTRACTION_PROMPT_TEMPLATE) - _prompt = ChatPromptTemplate( - messages=[human_message_template], - input_variables=["TEXT", "LANGUAGE", "SCHEMA"], - ) - - if log_to_mlflow: - import mlflow - - mlflow.log_dict(_prompt.save_to_json(), "prompt.json") - - return _prompt | model | (lambda x: parse_json_markdown(x.content)) - - -def prepare_schema_chain(model_name: str = "gpt-3.5-turbo") -> RunnableSequence: - model = ChatOpenAI(model=model_name, temperature=0) - human_message_template = HumanMessagePromptTemplate.from_template(SCHEMA_PROMPT_TEMPLATE) - _prompt = ChatPromptTemplate( - messages=[human_message_template], - input_variables=["TEXT", "LANGUAGE", "SCHEMA"], - ) - - return _prompt | model | parse_schema - - -def parse_schema(ai_message: AIMessage) -> str: - response_schema = parse_json_markdown(ai_message.content) - return "\n".join(f"{key}: {val}" for key, val in response_schema.items()) - - -def route(response_schema: str) -> dict[str, str]: - if response_schema["SCHEMA"]: - return prepare_information_extraction_chain() - - raise ValueError( - "Cannot determine schema for the given input prompt. Please try different query." - ) +from langchain.output_parsers.json import parse_json_markdown +from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from langchain_core.messages import AIMessage +from langchain_core.runnables import RunnableLambda, RunnablePassthrough, RunnableSequence +from langchain_openai import ChatOpenAI + +SCHEMA_PROMPT_TEMPLATE = """ +Act as a assistant that prepares schema for information extraction + +Based on the user input prepare schema containing variables with their short description and type. +Be precise about variable names, format names using snake_case. +If user asks irrelevant question always return empty JSON. +As example: +User: I want extract age, gender, and plea from the judgement +Agent: + age: integer + gender: male or female + plea: string + +==== +{SCHEMA_TEXT} +==== + +Format response as JSON: +""" + +EXTRACTION_PROMPT_TEMPLATE = """Act as a legal document tool that extracts information and answer questions based on judgements. + +Instruction for extracting information from judgements: +- Judgements are in {LANGUAGE} language, please extract information in {LANGUAGE}. +- Do not provide information that are not explicitly mentioned in judgements. If you can't extract information from the text field, leave the field with empty string "". + +Follow the following YAML structure to extract information and answer questions based on judgements: +{SCHEMA} + +==== +{TEXT} +==== + +Format response as JSON: +""" + +EXAMPLE_SCHEMA = """verdict_date: date as ISO 8601 +verdict: string, text representing verdict of the judgement +verdict_summary: string, short summary of the verdict +verdict_id: string +court: string +parties: string +appeal_against: string +first_trial: boolean +drug_offence: boolean +child_offence: boolean +offence_seriousness: boolean +verdict_tags: List[string]""" + + +def prepare_information_extraction_chain_from_user_prompt() -> RunnableSequence: + schema_chain = prepare_schema_chain() + inputs = { + "SCHEMA": schema_chain, + "TEXT": RunnablePassthrough(), + "LANGUAGE": RunnablePassthrough(), + } + return inputs | RunnableLambda(route) + + +def prepare_information_extraction_chain( + model_name: str = "gpt-4-0125-preview", + log_to_mlflow: bool = False, +) -> RunnableSequence: + model = ChatOpenAI(model=model_name, temperature=0) + human_message_template = HumanMessagePromptTemplate.from_template(EXTRACTION_PROMPT_TEMPLATE) + _prompt = ChatPromptTemplate( + messages=[human_message_template], + input_variables=["TEXT", "LANGUAGE", "SCHEMA"], + ) + + if log_to_mlflow: + import mlflow + + mlflow.log_dict(_prompt.save_to_json(), "prompt.json") + + return _prompt | model | (lambda x: parse_json_markdown(x.content)) + + +def prepare_schema_chain(model_name: str = "gpt-3.5-turbo") -> RunnableSequence: + model = ChatOpenAI(model=model_name, temperature=0) + human_message_template = HumanMessagePromptTemplate.from_template(SCHEMA_PROMPT_TEMPLATE) + _prompt = ChatPromptTemplate( + messages=[human_message_template], + input_variables=["TEXT", "LANGUAGE", "SCHEMA"], + ) + + return _prompt | model | parse_schema + + +def parse_schema(ai_message: AIMessage) -> str: + response_schema = parse_json_markdown(ai_message.content) + return "\n".join(f"{key}: {val}" for key, val in response_schema.items()) + + +def route(response_schema: str) -> dict[str, str]: + if response_schema["SCHEMA"]: + return prepare_information_extraction_chain() + + raise ValueError( + "Cannot determine schema for the given input prompt. Please try different query." + ) diff --git a/juddges/settings.py b/juddges/settings.py index 84a2058..835d3cf 100644 --- a/juddges/settings.py +++ b/juddges/settings.py @@ -1,72 +1,72 @@ -from pathlib import Path - -import mlflow -import tiktoken -from sqlalchemy import create_engine -from sqlalchemy.engine import Engine - -# get root path as ROOT_PATH as pathlib objects -ROOT_PATH = Path(__file__).resolve().parent.parent - -DATA_PATH = ROOT_PATH / "data" -CONFIG_PATH = ROOT_PATH / "configs" - -SAMPLE_DATA_PATH = DATA_PATH / "sample_data" - -PL_JUDGEMENTS_PATH = DATA_PATH / "datasets" / "pl" -PL_COURT_DEP_ID_2_NAME = PL_JUDGEMENTS_PATH / "court_dep_names.csv" -PL_JUDGEMENTS_PATH_RAW = PL_JUDGEMENTS_PATH / "raw" -PL_JUDGEMENTS_PATH_TEXTS = PL_JUDGEMENTS_PATH / "text" -PL_JUDGEMENTS_PATH_INSTRUCT = PL_JUDGEMENTS_PATH / "instruct" - -MLFLOW_EXP_NAME = "Juddges-Information-Extraction" - - -def num_tokens_from_string( - string: str, # The string to count tokens for - encoding_name: str = "cl100k_base", # gpt-4, gpt-3.5-turbo, text-embedding-ada-002 -) -> int: # The number of tokens in the string - """ - Returns the number of tokens in a text string. - """ - encoding = tiktoken.get_encoding(encoding_name) - num_tokens = len(encoding.encode(string)) - return num_tokens - - -LLM_TO_PRICE_INPUT = { - "gpt-4-1106-preview": 0.01 / 1000, - "gpt-4-0125-preview": 0.01 / 1000, - "gpt-3.5-turbo-1106": 0.001 / 1000, -} - -LLM_TO_PRICE_COMPLETION = { - "gpt-4-1106-preview": 0.03 / 1000, - "gpt-4-0125-preview": 0.03 / 1000, - "gpt-3.5-turbo-1106": 0.002 / 1000, -} - -LOCAL_POSTGRES = "postgresql+psycopg2://llm:llm@postgres-juddges:5432/llm" - - -def get_sqlalchemy_engine() -> Engine: - return create_engine( - LOCAL_POSTGRES, - pool_size=10, - max_overflow=2, - pool_recycle=300, - pool_pre_ping=True, - pool_use_lifo=True, - ) - - -def prepare_langchain_cache() -> None: - import langchain - from langchain.cache import SQLAlchemyMd5Cache - - langchain.llm_cache = SQLAlchemyMd5Cache(get_sqlalchemy_engine()) - - -def prepare_mlflow(experiment_name: str = MLFLOW_EXP_NAME, url: str = "postgres-juddges") -> None: - mlflow.set_tracking_uri(url) - mlflow.set_experiment(experiment_name) +from pathlib import Path + +import mlflow +import tiktoken +from sqlalchemy import create_engine +from sqlalchemy.engine import Engine + +# get root path as ROOT_PATH as pathlib objects +ROOT_PATH = Path(__file__).resolve().parent.parent + +DATA_PATH = ROOT_PATH / "data" +CONFIG_PATH = ROOT_PATH / "configs" + +SAMPLE_DATA_PATH = DATA_PATH / "sample_data" + +PL_JUDGEMENTS_PATH = DATA_PATH / "datasets" / "pl" +PL_COURT_DEP_ID_2_NAME = PL_JUDGEMENTS_PATH / "court_dep_names.csv" +PL_JUDGEMENTS_PATH_RAW = PL_JUDGEMENTS_PATH / "raw" +PL_JUDGEMENTS_PATH_TEXTS = PL_JUDGEMENTS_PATH / "text" +PL_JUDGEMENTS_PATH_INSTRUCT = PL_JUDGEMENTS_PATH / "instruct" + +MLFLOW_EXP_NAME = "Juddges-Information-Extraction" + + +def num_tokens_from_string( + string: str, # The string to count tokens for + encoding_name: str = "cl100k_base", # gpt-4, gpt-3.5-turbo, text-embedding-ada-002 +) -> int: # The number of tokens in the string + """ + Returns the number of tokens in a text string. + """ + encoding = tiktoken.get_encoding(encoding_name) + num_tokens = len(encoding.encode(string)) + return num_tokens + + +LLM_TO_PRICE_INPUT = { + "gpt-4-1106-preview": 0.01 / 1000, + "gpt-4-0125-preview": 0.01 / 1000, + "gpt-3.5-turbo-1106": 0.001 / 1000, +} + +LLM_TO_PRICE_COMPLETION = { + "gpt-4-1106-preview": 0.03 / 1000, + "gpt-4-0125-preview": 0.03 / 1000, + "gpt-3.5-turbo-1106": 0.002 / 1000, +} + +LOCAL_POSTGRES = "postgresql+psycopg2://llm:llm@postgres-juddges:5432/llm" + + +def get_sqlalchemy_engine() -> Engine: + return create_engine( + LOCAL_POSTGRES, + pool_size=10, + max_overflow=2, + pool_recycle=300, + pool_pre_ping=True, + pool_use_lifo=True, + ) + + +def prepare_langchain_cache() -> None: + import langchain + from langchain.cache import SQLAlchemyMd5Cache + + langchain.llm_cache = SQLAlchemyMd5Cache(get_sqlalchemy_engine()) + + +def prepare_mlflow(experiment_name: str = MLFLOW_EXP_NAME, url: str = "postgres-juddges") -> None: + mlflow.set_tracking_uri(url) + mlflow.set_experiment(experiment_name) diff --git a/nbs/Dataset Cards/03_Graph_Description.md b/nbs/Dataset Cards/03_Graph_Description.md index afb8e4e..33fd907 100644 --- a/nbs/Dataset Cards/03_Graph_Description.md +++ b/nbs/Dataset Cards/03_Graph_Description.md @@ -1,11 +1,11 @@ # Polish Court Judgments Graph ## Dataset description -We introduce a graph dataset of Polish Court Judgments. This dataset is primarily based on the [`JuDDGES/pl-court-raw`](https://huggingface.co/datasets/JuDDGES/pl-court-raw). The dataset consists of nodes representing either judgments or legal bases, and edges connecting judgments to the legal bases they refer to. Also, the graph was cleaned from small disconnected components, leaving single giant component. Consequently, the resulting graph is bipartite. We provide the dataset in both `JSON` and `PyG` formats, each has different purpose. While structurally graphs in these formats are the same, their attributes differ. +We introduce a graph dataset of Polish Court Judgments. This dataset is primarily based on the [`JuDDGES/pl-court-raw`](https://huggingface.co/datasets/JuDDGES/pl-court-raw). The dataset consists of nodes representing either judgments or legal bases, and edges connecting judgments to the legal bases they refer to. Also, the graph was cleaned from small disconnected components, leaving single giant component. Consequently, the resulting graph is bipartite. We provide the dataset in both `JSON` and `PyG` formats, each has different purpose. While structurally graphs in these formats are the same, their attributes differ. The `JSON` format is intended for analysis and contains most of the attributes available in [`JuDDGES/pl-court-raw`](https://huggingface.co/datasets/JuDDGES/pl-court-raw). We excluded some less-useful attributes and text content, which can be easily retrieved from the raw dataset and added to the graph as needed. -The `PyG` format is designed for machine learning applications, such as link prediction on graphs, and is fully compatible with the [`Pytorch Geometric`](https://github.com/pyg-team/pytorch_geometric) framework. +The `PyG` format is designed for machine learning applications, such as link prediction on graphs, and is fully compatible with the [`Pytorch Geometric`](https://github.com/pyg-team/pytorch_geometric) framework. In the following sections, we provide a more detailed explanation and use case examples for each format. @@ -19,9 +19,9 @@ In the following sections, we provide a more detailed explanation and use case e | #nodes (type=`legal_base`) | 2819 | | avg(degree) | 6.132015294025195 | - + ![png](../images/degree_distribution.png) - + ## `JSON` format @@ -58,10 +58,10 @@ g = nx.node_link_graph(g_data) ## `PyG` format -The `PyTorch Geometric` format includes embeddings of the judgment content, obtained with [sdadas/mmlw-roberta-large](https://huggingface.co/sdadas/mmlw-roberta-large) for judgment nodes, -and one-hot-vector identifiers for legal-base nodes (note that for efficiency one can substitute it with random noise identifiers, +The `PyTorch Geometric` format includes embeddings of the judgment content, obtained with [sdadas/mmlw-roberta-large](https://huggingface.co/sdadas/mmlw-roberta-large) for judgment nodes, +and one-hot-vector identifiers for legal-base nodes (note that for efficiency one can substitute it with random noise identifiers, like in [(Abboud et al., 2021)](https://arxiv.org/abs/2010.01179)). - + ### Loading @@ -125,4 +125,4 @@ print(ds) ### Example usage ```python # TBD -``` \ No newline at end of file +``` diff --git a/nbs/Presentations/00_workshop_demo.ipynb b/nbs/Presentations/00_workshop_demo.ipynb index 3b5ac83..d178f43 100644 --- a/nbs/Presentations/00_workshop_demo.ipynb +++ b/nbs/Presentations/00_workshop_demo.ipynb @@ -1,43 +1,43 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Workshop Demo\n", - "\n", - "> Jak możemy strukturyzować orzeczenia?\n", - "\n", - "## Jakie informacje chcemy/możemy ekstrahować automatycznie z orzeczeń?\n", - "\n", - "### Ogólne\n", - "\n", - "- Sygnatura sprawy\n", - "- Podstawa prawna\n", - "- Strony\n", - "- Sentencja\n", - "- Podsumowanie\n", - "- Tagi, etykiety\n", - "- ...\n", - "\n", - "### Przykłady specyficznych pytań/zagadnień\n", - "\n", - "- Czy sprawa dotyczy dzieci?\n", - "- Czy sprawa dotyczy wolności słowa?\n", - "- Czy sprawa dotyczy XXX? - każde tego typu pytanie możemy użyć\n", - "- ...\n", - "\n", - "### Czego brakuje nad w codziennych zadaniach/pracach?\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "python3", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Workshop Demo\n", + "\n", + "> Jak możemy strukturyzować orzeczenia?\n", + "\n", + "## Jakie informacje chcemy/możemy ekstrahować automatycznie z orzeczeń?\n", + "\n", + "### Ogólne\n", + "\n", + "- Sygnatura sprawy\n", + "- Podstawa prawna\n", + "- Strony\n", + "- Sentencja\n", + "- Podsumowanie\n", + "- Tagi, etykiety\n", + "- ...\n", + "\n", + "### Przykłady specyficznych pytań/zagadnień\n", + "\n", + "- Czy sprawa dotyczy dzieci?\n", + "- Czy sprawa dotyczy wolności słowa?\n", + "- Czy sprawa dotyczy XXX? - każde tego typu pytanie możemy użyć\n", + "- ...\n", + "\n", + "### Czego brakuje nad w codziennych zadaniach/pracach?\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/_quarto.yml b/nbs/_quarto.yml index 0a6dfcb..006b406 100644 --- a/nbs/_quarto.yml +++ b/nbs/_quarto.yml @@ -17,4 +17,4 @@ website: sidebar: style: floating -metadata-files: [nbdev.yml, sidebar.yml] \ No newline at end of file +metadata-files: [nbdev.yml, sidebar.yml] diff --git a/nginx/Dockerfile b/nginx/Dockerfile index d61e299..d70c6ce 100644 --- a/nginx/Dockerfile +++ b/nginx/Dockerfile @@ -1,20 +1,20 @@ -FROM nginx:1.25.3 - -RUN apt-get update -y \ - && apt-get install -y \ - apache2-utils \ - && rm -rf /var/lib/apt/lists/* - -ENV LISTEN_PORT=8080 \ - AUTH_REALM="Restricted" \ - HTPASSWD_FILE="/etc/nginx/conf.d/auth.htpasswd" \ - FORWARD_PROTOCOL="http" \ - FORWARD_PORT=8501 - -WORKDIR /opt - -COPY auth.htpasswd launch.sh ./ - -RUN chmod 0755 ./launch.sh - -CMD ["./launch.sh"] +FROM nginx:1.25.3 + +RUN apt-get update -y \ + && apt-get install -y \ + apache2-utils \ + && rm -rf /var/lib/apt/lists/* + +ENV LISTEN_PORT=8080 \ + AUTH_REALM="Restricted" \ + HTPASSWD_FILE="/etc/nginx/conf.d/auth.htpasswd" \ + FORWARD_PROTOCOL="http" \ + FORWARD_PORT=8501 + +WORKDIR /opt + +COPY auth.htpasswd launch.sh ./ + +RUN chmod 0755 ./launch.sh + +CMD ["./launch.sh"] diff --git a/nginx/auth.conf b/nginx/auth.conf index 7e18251..211b0a6 100644 --- a/nginx/auth.conf +++ b/nginx/auth.conf @@ -1,31 +1,31 @@ -upstream ws-backend { - # enable sticky session based on IP - ip_hash; - - server web:8501; -} - -server { - listen 8080 default_server; - listen [::]:8080; - - # server_name web; - - location / { - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header Host $host; - - # basic auth - auth_basic "Restricted"; - auth_basic_user_file auth.htpasswd; - - # proxy pass - proxy_pass http://ws-backend; - proxy_read_timeout 900; - - proxy_http_version 1.1; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection "upgrade"; - - } -} \ No newline at end of file +upstream ws-backend { + # enable sticky session based on IP + ip_hash; + + server web:8501; +} + +server { + listen 8080 default_server; + listen [::]:8080; + + # server_name web; + + location / { + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Host $host; + + # basic auth + auth_basic "Restricted"; + auth_basic_user_file auth.htpasswd; + + # proxy pass + proxy_pass http://ws-backend; + proxy_read_timeout 900; + + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + + } +} diff --git a/nginx/launch.sh b/nginx/launch.sh index fb26765..8f30390 100644 --- a/nginx/launch.sh +++ b/nginx/launch.sh @@ -7,4 +7,4 @@ htpasswd -c -b /etc/nginx/auth.htpasswd $USER $PASS echo basic-auth-pwd cat /etc/nginx/auth.htpasswd -nginx -g "daemon off;" \ No newline at end of file +nginx -g "daemon off;" diff --git a/pyproject.toml b/pyproject.toml index 0616818..89732a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,4 +12,3 @@ plugins = "numpy.typing.mypy_plugin" [[tool.mypy.overrides]] module = ["pyarrow.*", "datasets.*", "sentence_transformers.*"] ignore_missing_imports = true - diff --git a/requirements.txt b/requirements.txt index af77995..b11f9a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,47 +1,46 @@ -accelerate==0.29.3 -bitsandbytes==0.43.1 -chardet==5.2.0 -datasets==2.19.1 -langchain-openai==0.1.1 -langchain==0.1.13 -langsmith==0.1.33 -loguru==0.7.2 -mlflow==2.11.3 -mpire==2.10.0 -openpyxl==3.1.2 -pandas==2.2.1 -peft==0.10.0 -polars==0.20.15 -pydantic==2.7.1 -pyarrow==15.0.0 -pymongo==4.3.3 -python-dotenv==1.0.1 -PyYAML==6.0.1 -requests==2.31.0 -rich==13.7.0 -seaborn==0.13.2 -sentence-transformers==3.0.0 -tenacity==8.2.3 -tensorboard==2.16.2 -tiktoken==0.6.0 -torch==2.2.1 -torchmetrics==1.4.0 -torch_geometric==2.5.3 -transformers==4.40.2 -trl==0.8.6 -typer==0.9.0 -wandb==0.16.5 -xmltodict==0.13.0 -xlsxwriter==3.2.0 - -# dev -coverage==7.4.3 -dvc[s3]==3.48.3 -mlflow==2.11.3 -mypy==1.8.0 -nbdev==2.3.25 -psycopg2-binary==2.9.9 -pytest==8.0.2 -ruff==0.2.2 -streamlit==1.31.1 - +accelerate==0.29.3 +bitsandbytes==0.43.1 +chardet==5.2.0 +datasets==2.19.1 +langchain-openai==0.1.1 +langchain==0.1.13 +langsmith==0.1.33 +loguru==0.7.2 +mlflow==2.11.3 +mpire==2.10.0 +openpyxl==3.1.2 +pandas==2.2.1 +peft==0.10.0 +polars==0.20.15 +pydantic==2.7.1 +pyarrow==15.0.0 +pymongo==4.3.3 +python-dotenv==1.0.1 +PyYAML==6.0.1 +requests==2.31.0 +rich==13.7.0 +seaborn==0.13.2 +sentence-transformers==3.0.0 +tenacity==8.2.3 +tensorboard==2.16.2 +tiktoken==0.6.0 +torch==2.2.1 +torchmetrics==1.4.0 +torch_geometric==2.5.3 +transformers==4.40.2 +trl==0.8.6 +typer==0.9.0 +wandb==0.16.5 +xmltodict==0.13.0 +xlsxwriter==3.2.0 + +# dev +coverage==7.4.3 +dvc[s3]==3.48.3 +mlflow==2.11.3 +mypy==1.8.0 +nbdev==2.3.25 +pre-commit==3.7.1 +psycopg2-binary==2.9.9 +pytest==8.0.2 +streamlit==1.31.1 diff --git a/requirements_unsloth.txt b/requirements_unsloth.txt index 16eae7b..d6249b1 100644 --- a/requirements_unsloth.txt +++ b/requirements_unsloth.txt @@ -1,41 +1,41 @@ -accelerate==0.29.3 -bitsandbytes==0.43.1 -chardet==5.2.0 -datasets==2.19.1 -langchain-openai==0.1.1 -langchain==0.1.13 -langsmith==0.1.33 -loguru==0.7.2 -mlflow==2.11.3 -mpire==2.10.0 -pandas==2.2.1 -peft==0.10.0 -polars==0.20.15 -pydantic==2.7.1 -pyarrow==15.0.0 -pymongo==4.3.3 -python-dotenv==1.0.1 -PyYAML==6.0.1 -requests==2.31.0 -rich==13.7.0 -seaborn==0.13.2 -sentence-transformers==3.0.0 -tenacity==8.2.3 -tensorboard==2.16.2 -tiktoken==0.6.0 -torchmetrics==1.4.0 -trl==0.8.6 -typer==0.9.0 -wandb==0.16.5 -xmltodict==0.13.0 - -# dev -coverage==7.4.3 -dvc[s3]==3.48.3 -mlflow==2.11.3 -mypy==1.8.0 -nbdev==2.3.13 -psycopg2-binary==2.9.9 -pytest==8.0.2 -ruff==0.2.2 -streamlit==1.31.1 \ No newline at end of file +accelerate==0.29.3 +bitsandbytes==0.43.1 +chardet==5.2.0 +datasets==2.19.1 +langchain-openai==0.1.1 +langchain==0.1.13 +langsmith==0.1.33 +loguru==0.7.2 +mlflow==2.11.3 +mpire==2.10.0 +pandas==2.2.1 +peft==0.10.0 +polars==0.20.15 +pydantic==2.7.1 +pyarrow==15.0.0 +pymongo==4.3.3 +python-dotenv==1.0.1 +PyYAML==6.0.1 +requests==2.31.0 +rich==13.7.0 +seaborn==0.13.2 +sentence-transformers==3.0.0 +tenacity==8.2.3 +tensorboard==2.16.2 +tiktoken==0.6.0 +torchmetrics==1.4.0 +trl==0.8.6 +typer==0.9.0 +wandb==0.16.5 +xmltodict==0.13.0 + +# dev +coverage==7.4.3 +dvc[s3]==3.48.3 +mlflow==2.11.3 +mypy==1.8.0 +nbdev==2.3.13 +psycopg2-binary==2.9.9 +pytest==8.0.2 +ruff==0.2.2 +streamlit==1.31.1 diff --git a/scripts/README.md b/scripts/README.md index 82f914c..8893971 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -53,7 +53,7 @@ MONGO_DB_NAME="datasets" ```shell PYTHONPATH=. python scripts/dataset/dump_pl_dataset.py \ --file-name data/datasets/pl/raw/raw.parquet - dvc add data/datasets/pl/raw/raw.parquet && dvc push + dvc add data/datasets/pl/raw/raw.parquet && dvc push ``` 7. Generate dataset card for `pl-court-raw` ```shell @@ -70,12 +70,12 @@ MONGO_DB_NAME="datasets" ```shell NUM_JOBS=8 dvc repro build_instruct_dataset ``` - + 11. Generate dataset card for `pl-court-instruct` ```shell dvc repro instruct_dataset_readme && dvc push ``` - + 12. Upload `pl-court-instruct` dataset card to huggingface ```shell PYTHONPATH=. scripts/dataset/push_instruct_readme.py --repo-id JuDDGES/pl-court-instruct diff --git a/scripts/sft/fine_tune_llm.py b/scripts/sft/fine_tune_llm.py index d9d43d1..fc8fc76 100644 --- a/scripts/sft/fine_tune_llm.py +++ b/scripts/sft/fine_tune_llm.py @@ -2,6 +2,7 @@ Fine-tune a large language model using SFT. the script is based on: https://www.philschmid.de/fine-tune-llms-in-2024-with-trl """ + import os from pathlib import Path diff --git a/settings.ini b/settings.ini index ab01294..e21bcad 100644 --- a/settings.ini +++ b/settings.ini @@ -31,7 +31,7 @@ audience = Developers author = Łukasz Augustyniak author_email = aisolutions@lukaszaugustyniak.com copyright = 2024 onwards, %(author)s -description = +description = keywords = nbdev jupyter notebook python language = English status = 3 @@ -39,5 +39,5 @@ user = laugustyniak ### Optional ### # requirements = fastcore pandas -# dev_requirements = -# console_scripts = \ No newline at end of file +# dev_requirements = +# console_scripts = diff --git a/setup.py b/setup.py index e3281ae..8bd6668 100644 --- a/setup.py +++ b/setup.py @@ -1,57 +1,78 @@ from pkg_resources import parse_version from configparser import ConfigParser -import setuptools, shlex -assert parse_version(setuptools.__version__)>=parse_version('36.2') +import setuptools +import shlex + +assert parse_version(setuptools.__version__) >= parse_version("36.2") # note: all settings are in settings.ini; edit there, not here -config = ConfigParser(delimiters=['=']) -config.read('settings.ini', encoding='utf-8') -cfg = config['DEFAULT'] +config = ConfigParser(delimiters=["="]) +config.read("settings.ini", encoding="utf-8") +cfg = config["DEFAULT"] -cfg_keys = 'version description keywords author author_email'.split() +cfg_keys = "version description keywords author author_email".split() expected = cfg_keys + "lib_name user branch license status min_python audience language".split() -for o in expected: assert o in cfg, "missing expected setting: {}".format(o) -setup_cfg = {o:cfg[o] for o in cfg_keys} +for o in expected: + assert o in cfg, "missing expected setting: {}".format(o) +setup_cfg = {o: cfg[o] for o in cfg_keys} licenses = { - 'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'), - 'mit': ('MIT License', 'OSI Approved :: MIT License'), - 'gpl2': ('GNU General Public License v2', 'OSI Approved :: GNU General Public License v2 (GPLv2)'), - 'gpl3': ('GNU General Public License v3', 'OSI Approved :: GNU General Public License v3 (GPLv3)'), - 'bsd3': ('BSD License', 'OSI Approved :: BSD License'), + "apache2": ("Apache Software License 2.0", "OSI Approved :: Apache Software License"), + "mit": ("MIT License", "OSI Approved :: MIT License"), + "gpl2": ( + "GNU General Public License v2", + "OSI Approved :: GNU General Public License v2 (GPLv2)", + ), + "gpl3": ( + "GNU General Public License v3", + "OSI Approved :: GNU General Public License v3 (GPLv3)", + ), + "bsd3": ("BSD License", "OSI Approved :: BSD License"), } -statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha', - '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ] -py_versions = '3.6 3.7 3.8 3.9 3.10'.split() +statuses = [ + "1 - Planning", + "2 - Pre-Alpha", + "3 - Alpha", + "4 - Beta", + "5 - Production/Stable", + "6 - Mature", + "7 - Inactive", +] +py_versions = "3.6 3.7 3.8 3.9 3.10".split() -requirements = shlex.split(cfg.get('requirements', '')) -if cfg.get('pip_requirements'): requirements += shlex.split(cfg.get('pip_requirements', '')) -min_python = cfg['min_python'] -lic = licenses.get(cfg['license'].lower(), (cfg['license'], None)) -dev_requirements = (cfg.get('dev_requirements') or '').split() +requirements = shlex.split(cfg.get("requirements", "")) +if cfg.get("pip_requirements"): + requirements += shlex.split(cfg.get("pip_requirements", "")) +min_python = cfg["min_python"] +lic = licenses.get(cfg["license"].lower(), (cfg["license"], None)) +dev_requirements = (cfg.get("dev_requirements") or "").split() setuptools.setup( - name = cfg['lib_name'], - license = lic[0], - classifiers = [ - 'Development Status :: ' + statuses[int(cfg['status'])], - 'Intended Audience :: ' + cfg['audience'].title(), - 'Natural Language :: ' + cfg['language'].title(), - ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]] + (['License :: ' + lic[1] ] if lic[1] else []), - url = cfg['git_url'], - packages = setuptools.find_packages(), - include_package_data = True, - install_requires = requirements, - extras_require={ 'dev': dev_requirements }, - dependency_links = cfg.get('dep_links','').split(), - python_requires = '>=' + cfg['min_python'], - long_description = open('README.md', encoding='utf-8').read(), - long_description_content_type = 'text/markdown', - zip_safe = False, - entry_points = { - 'console_scripts': cfg.get('console_scripts','').split(), - 'nbdev': [f'{cfg.get("lib_path")}={cfg.get("lib_path")}._modidx:d'] + name=cfg["lib_name"], + license=lic[0], + classifiers=[ + "Development Status :: " + statuses[int(cfg["status"])], + "Intended Audience :: " + cfg["audience"].title(), + "Natural Language :: " + cfg["language"].title(), + ] + + [ + "Programming Language :: Python :: " + o + for o in py_versions[py_versions.index(min_python) :] + ] + + (["License :: " + lic[1]] if lic[1] else []), + url=cfg["git_url"], + packages=setuptools.find_packages(), + include_package_data=True, + install_requires=requirements, + extras_require={"dev": dev_requirements}, + dependency_links=cfg.get("dep_links", "").split(), + python_requires=">=" + cfg["min_python"], + long_description=open("README.md", encoding="utf-8").read(), + long_description_content_type="text/markdown", + zip_safe=False, + entry_points={ + "console_scripts": cfg.get("console_scripts", "").split(), + "nbdev": [f'{cfg.get("lib_path")}={cfg.get("lib_path")}._modidx:d'], }, - **setup_cfg) - - + **setup_cfg, +)