Apply codestyle changes and optimizations
binkjakub committed Jun 12, 2024
1 parent b975357 commit dea9a2f
Showing 14 changed files with 99 additions and 74 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/python.yaml
@@ -20,7 +20,7 @@ jobs:
- uses: actions/cache@v3
with:
path: ${{ env.pythonLocation }}
-key: ${{ env.pythonLocation }}-${{ hashFiles('requirements.txt') }}
+key: ${{ runner.os }}-${{ env.pythonLocation }}-${{ hashFiles('requirements.txt') }}
- name: Install deps
run: make install_cpu
- name: Lint
@@ -46,7 +46,7 @@ jobs:
- uses: actions/cache@v3
with:
path: ${{ env.pythonLocation }}
-key: ${{ env.pythonLocation }}-${{ hashFiles('requirements.txt') }}
+key: ${{ runner.os }}-${{ env.pythonLocation }}-${{ hashFiles('requirements.txt') }}
- name: Install deps
run: make install_cpu
- name: Test
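Both hunks above make the same change: the cache key now leads with `runner.os`, so caches built on different operating systems can no longer collide on an identical key. A minimal sketch of the composition logic in Python — not part of the repository; GitHub's `hashFiles()` (a SHA-256 over file contents) is stood in for by `hashlib`, and the inputs are hypothetical:

```python
import hashlib


def cache_key(runner_os: str, python_location: str, requirements_text: bytes) -> str:
    """Compose the key the way the workflow now does: OS, interpreter path, lockfile hash."""
    req_hash = hashlib.sha256(requirements_text).hexdigest()  # stand-in for hashFiles('requirements.txt')
    return f"{runner_os}-{python_location}-{req_hash}"


# With identical requirements and interpreter paths, two runners now produce
# distinct keys, so a cache populated on Linux is never restored on macOS.
reqs = b"torch==2.3.0\n"  # hypothetical lockfile content
assert cache_key("Linux", "/opt/python/3.10", reqs) != cache_key("macOS", "/opt/python/3.10", reqs)
```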
37 changes: 27 additions & 10 deletions README.md
@@ -23,12 +23,35 @@ fostering cross-disciplinary and cross-jurisdictional collaboration.

![baner](https://raw.githubusercontent.com/pwr-ai/JuDDGES/bffb1d75ba7c78f101fc94bd9086499886b2c128/nbs/images/baner.png)

+## Usage
+
+### Installation
+
+- to install the necessary dependencies, use the available `Makefile`; you
+  can use `python>=3.10`:
+  ```shell
+  make install
+  ```
+- if you want to run evaluation and fine-tuning with `unsloth`, use the
+  following command with `python=3.10` inside a conda environment:
+  ```shell
+  make install_unsloth
+  ```
+
+### Dataset creation
+
+The specific details of dataset creation are available in
+[scripts/README.md](scripts/README.md).
+
+### Fine-tuning
+
+To run evaluation or fine-tuning, run the proper stages declared in
+[`dvc.yaml`](dvc.yaml) (see the [DVC docs for
+details](https://dvc.org/doc/user-guide)).

## Project details

The JuDDGES project encompasses several Work Packages (WPs) designed to
cover all aspects of its objectives, from project management to the open
science practices and engaging early career researchers. Below is an
overview of the project’s WPs based on the provided information:

-## WP1: Project Management
+### WP1: Project Management

**Duration**: 24 Months

@@ -37,7 +60,7 @@ within budget. This includes administrative management, scientific and
technological management, quality innovation and risk management,
ethical and legal consideration, and facilitating open science.

-## WP2: Gathering and Human Encoding of Judicial Decision Data
+### WP2: Gathering and Human Encoding of Judicial Decision Data

**Duration**: 22 Months

@@ -48,7 +71,7 @@ coders, making human-coded data available for WP3, facilitating
human-in-loop coding for WP3, and enabling WP4 to make data open and
reusable beyond the project team.

-## WP3: NLP and HITL Machine Learning Methodological Development
+### WP3: NLP and HITL Machine Learning Methodological Development

**Duration**: 24 Months

@@ -59,7 +82,7 @@ baseline information extraction, intelligent inference methods for legal
corpus data, and constructing an annotation tool through active learning
and human-in-the-loop annotation methods.

-## WP4: Open Science Practices & Engaging Early Career Researchers
+### WP4: Open Science Practices & Engaging Early Career Researchers

**Duration**: 12 Months

@@ -73,12 +96,6 @@ Each WP includes specific tasks aimed at achieving its goals, involving
collaboration among project partners and contributing to the overarching
aim of the JuDDGES project​​.

-## Install
-
-``` sh
-pip install juddges
-```

## Acknowledgements

The universities involved in the JuDDGES project are:
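A note on the Fine-tuning subsection added above: each stage declared in `dvc.yaml` is reproduced with `dvc repro <stage>`. A hedged sketch of driving that from Python — the stage name `embed@mmlw-roberta-large` is taken from the `dvc.lock` hunks below; any other name would come from `dvc.yaml` itself:

```python
import subprocess

# Reproduce one DVC stage by name; dvc resolves the stage's declared
# dependencies and outputs from dvc.yaml and dvc.lock before running it.
subprocess.run(["dvc", "repro", "embed@mmlw-roberta-large"], check=True)
```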
2 changes: 1 addition & 1 deletion configs/dataset/pl-court-instruct.yaml
@@ -1,4 +1,4 @@
name: JuDDGES/pl-court-instruct
prompt_field: prompt
context_field: context
-output_field: output
\ No newline at end of file
+output_field: output
2 changes: 1 addition & 1 deletion configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned.yaml
@@ -7,4 +7,4 @@ max_seq_length: 7_900
batch_size: 1
padding: longest

-use_unsloth: true
\ No newline at end of file
+use_unsloth: true
2 changes: 1 addition & 1 deletion configs/model/Unsloth-Llama-3-8B-Instruct.yaml
@@ -7,4 +7,4 @@ max_seq_length: 7_900
batch_size: 1
padding: longest

-use_unsloth: true
\ No newline at end of file
+use_unsloth: true
36 changes: 18 additions & 18 deletions dvc.lock
@@ -65,8 +65,8 @@ stages:
nfiles: 17
- path: scripts/dataset/build_instruct_dataset.py
hash: md5
-md5: f07a9d106853e74be4d0e4807b33ff3d
-size: 5393
+md5: 9b138322059d63ce3ad1bd05c8b931f2
+size: 5461
embed@mmlw-roberta-large:
cmd: PYTHONPATH=. python scripts/embed/embed_text.py embedding_model=mmlw-roberta-large
deps:
@@ -188,16 +188,16 @@ stages:
deps:
- path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
hash: md5
-md5: e97bb2e6bf39f75edea7714d6ba58b77
-size: 160
+md5: 1b4c0353b8c41fd3656ec5cf15eb6c2b
+size: 161
- path: configs/predict.yaml
hash: md5
md5: 74ad1dc5d9f130074533078d85e55e94
size: 504
- path: scripts/sft/predict.py
hash: md5
-md5: 85a61d28419afaf276ea17863205aa2a
-size: 3203
+md5: 59c2afb977f520c9134153def544111d
+size: 3188
outs:
- path:
data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json
@@ -209,16 +209,16 @@
deps:
- path: configs/model/Unsloth-Llama-3-8B-Instruct-fine-tuned.yaml
hash: md5
-md5: c9fab7cd7a4b0159d13ba61e3c516c0a
-size: 244
+md5: dd00fc3994bdc95baf1f17de7b026a0f
+size: 245
- path: configs/predict.yaml
hash: md5
md5: 74ad1dc5d9f130074533078d85e55e94
size: 504
- path: scripts/sft/predict.py
hash: md5
-md5: 85a61d28419afaf276ea17863205aa2a
-size: 3203
+md5: 59c2afb977f520c9134153def544111d
+size: 3188
outs:
- path:
data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct-fine-tuned.json
@@ -238,8 +238,8 @@
size: 504
- path: scripts/sft/predict.py
hash: md5
-md5: 85a61d28419afaf276ea17863205aa2a
-size: 3203
+md5: 59c2afb977f520c9134153def544111d
+size: 3188
outs:
- path:
data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3.json
@@ -259,8 +259,8 @@
size: 504
- path: scripts/sft/predict.py
hash: md5
-md5: 85a61d28419afaf276ea17863205aa2a
-size: 3203
+md5: 59c2afb977f520c9134153def544111d
+size: 3188
outs:
- path:
data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json
@@ -295,8 +295,8 @@
size: 528
- path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml
hash: md5
-md5: e97bb2e6bf39f75edea7714d6ba58b77
-size: 160
+md5: 1b4c0353b8c41fd3656ec5cf15eb6c2b
+size: 161
- path: scripts/sft/fine_tune_unsloth.py
hash: md5
md5: 2c3ca748d6f7bf76e92eb7bc8ded5f37
@@ -333,8 +333,8 @@
deps:
- path: scripts/sft/summarize_metrics.py
hash: md5
-md5: 8993c84349eab010b5e484fbf43fd8ff
-size: 737
+md5: 482321d8b7291cbed1e80bed7b685b46
+size: 782
outs:
- path: data/experiments/predict/pl-court-instruct/metrics_summary.md
hash: md5
2 changes: 1 addition & 1 deletion juddges/preprocessing/context_truncator.py
@@ -10,8 +10,8 @@ def __init__(self, tokenizer: BaseTokenizer, max_length: int):

empty_messages = [
    {"role": "user", "content": ""},
+    {"role": "assistant", "content": ""},
]
-empty_messages.append({"role": "assistant", "content": ""})

self.empty_messages_length = len(
self.tokenizer.apply_chat_template(empty_messages, tokenize=True)
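The hunk above only changes how `empty_messages` is built (a single literal instead of a literal plus `append`), but it is worth spelling out what the list is for: the truncator renders an empty user/assistant exchange through the chat template to measure how many tokens the template itself consumes, then subtracts that overhead from the length budget. A minimal sketch of the idea, assuming a standard `transformers` tokenizer with a chat template (the model name is illustrative):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

# Token cost of the template alone: render an exchange with empty contents.
empty_messages = [
    {"role": "user", "content": ""},
    {"role": "assistant", "content": ""},
]
overhead = len(tokenizer.apply_chat_template(empty_messages, tokenize=True))

max_length = 7_900               # matches max_seq_length in the model configs above
budget = max_length - overhead   # tokens actually available for the real messages
```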
54 changes: 29 additions & 25 deletions nbs/index.ipynb
@@ -27,27 +27,52 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Usage\n",
"\n",
"### Installation\n",
"- to install necessary dependencies use available `Makefile`, you can use `python>=3.10`:\n",
" ```shell\n",
" make install\n",
" ```\n",
"- if you want to run evaluation and fine-tuning with `unsloth`, use the following command with `python=3.10` inside conda environment:\n",
" ```shell\n",
" make install_unsloth\n",
" ```\n",
"\n",
"### Dataset creation\n",
"The specific details of dataset creation are available in [scripts/README.md](scripts/README.md).\n",
"\n",
"### Fine tuning\n",
"To run evaluation or fine-tuning, run proper stages declared [`dvc.yaml`](dvc.yaml) (see [DVC docs for details](https://dvc.org/doc/user-guide))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Project details\n",
"\n",
"The JuDDGES project encompasses several Work Packages (WPs) designed to cover all aspects of its objectives, from project management to the open science practices and engaging early career researchers. Below is an overview of the project's WPs based on the provided information:\n",
"\n",
"## WP1: Project Management\n",
"### WP1: Project Management\n",
"\n",
"**Duration**: 24 Months\n",
"\n",
"**Main Aim**: To ensure the project's successful completion on time and within budget. This includes administrative management, scientific and technological management, quality innovation and risk management, ethical and legal consideration, and facilitating open science.\n",
"\n",
"## WP2: Gathering and Human Encoding of Judicial Decision Data\n",
"### WP2: Gathering and Human Encoding of Judicial Decision Data\n",
"\n",
"**Duration**: 22 Months\n",
"\n",
"**Main Aim**: To establish the data foundation for developing and testing the project's tools. This involves collating/gathering legal case records and judgments, developing a coding scheme, training human coders, making human-coded data available for WP3, facilitating human-in-loop coding for WP3, and enabling WP4 to make data open and reusable beyond the project team.\n",
"\n",
"## WP3: NLP and HITL Machine Learning Methodological Development\n",
"### WP3: NLP and HITL Machine Learning Methodological Development\n",
"\n",
"**Duration**: 24 Months\n",
"\n",
"**Main Aim**: To create a bridge between machine learning (led by WUST and MUHEC) and Open Science facilitation (by ELICO), focusing on the development and deployment of annotation methodologies. This includes baseline information extraction, intelligent inference methods for legal corpus data, and constructing an annotation tool through active learning and human-in-the-loop annotation methods.\n",
"\n",
"## WP4: Open Science Practices & Engaging Early Career Researchers\n",
"### WP4: Open Science Practices & Engaging Early Career Researchers\n",
"\n",
"**Duration**: 12 Months\n",
"\n",
@@ -56,22 +81,6 @@
"Each WP includes specific tasks aimed at achieving its goals, involving collaboration among project partners and contributing to the overarching aim of the JuDDGES project​​.\n"
]
},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## Install\n"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"```sh\n",
-"pip install juddges\n",
-"```\n"
-]
-},
{
"cell_type": "markdown",
"metadata": {},
@@ -84,11 +93,6 @@
"2. Middlesex University London (UK)\n",
"3. University of Lyon 1 (France)​​.\n"
]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": []
-}
+}
],
"metadata": {
10 changes: 6 additions & 4 deletions scripts/dataset/build_instruct_dataset.py
@@ -14,6 +14,8 @@
load_dotenv()

MAX_SHARD_SIZE = "4GB"
+DATE_FORMAT = "%Y-%m-%d"
+JUDGE_SEPARATOR = " i "

SCHEMA_TEMPLATE = "```yaml\n{schema}\n```"
INSTRUCTION_TEMPLATE = """
@@ -113,7 +115,7 @@ def main(


def _pre_filter(item: dict[str, Any]) -> bool:
-    return all(item[feat] is not None for feat in FEATURES)
+    return not any(item[feat] is None for feat in FEATURES)


def _preprocess(item: dict[str, Any]) -> dict[str, Any]:
@@ -125,15 +127,15 @@ def _preprocess(item: dict[str, Any]) -> dict[str, Any]:

def _simplify_date(item: dict[str, Any]) -> dict[str, Any]:
item["date"], *_ = item["date"].split()
-    datetime.strptime(item["date"], "%Y-%m-%d")  # raises ValueError on invalid format
+    datetime.strptime(item["date"], DATE_FORMAT)  # raises ValueError on invalid format
return item


def _split_multiple_names(item: dict[str, Any]) -> dict[str, Any]:
"""Splits judges names that are joined by 'i'."""
judges = []
for j in item["judges"]:
-        for j_part in j.split(" i "):
+        for j_part in j.split(JUDGE_SEPARATOR):
judges.append(j_part)

item["judges"] = judges
@@ -146,7 +148,7 @@ def _legal_bases_to_texts(item: dict[str, Any]) -> dict[str, Any]:


def _filter(item: dict[str, Any]) -> bool:
-    all_judges_in_text = all(j in item["text"] for j in item["judges"])
+    all_judges_in_text = not any(j not in item["text"] for j in item["judges"])
recorder_in_text = item["recorder"] in item["text"]
signature_in_text = item["signature"] in item["text"]

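Two things happen in this file: the magic values `"%Y-%m-%d"` and `" i "` move into module-level constants, and the filter predicates are rewritten from `all(... is not None)` into the logically equivalent `not any(... is None)` (De Morgan's laws, so behavior is unchanged). A small self-contained illustration of the judge-name splitting — the names are hypothetical, and unlike the real `_split_multiple_names` this version returns a new list rather than mutating the item dict:

```python
JUDGE_SEPARATOR = " i "  # Polish "and", as defined in the script


def split_multiple_names(judges: list[str]) -> list[str]:
    """Split judge names that the source data joined with ' i '."""
    return [part for name in judges for part in name.split(JUDGE_SEPARATOR)]


# Hypothetical judge names, for illustration only:
assert split_multiple_names(["Jan Kowalski i Anna Nowak", "Piotr Zieliński"]) == [
    "Jan Kowalski",
    "Anna Nowak",
    "Piotr Zieliński",
]
```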
4 changes: 2 additions & 2 deletions scripts/dataset/download_pl_additional_data.py
@@ -43,7 +43,7 @@ def main(
cursor = collection.find(query, {"_id": 1}, batch_size=batch_size)
batched_cursor = BatchedDatabaseCursor(cursor=cursor, batch_size=batch_size, prefetch=True)

-    download_data = DownloadAdditionalData(data_type)
+    download_data = AdditionalDataDownloader(data_type)
download_data_and_update_db = BatchDatabaseUpdate(mongo_uri, download_data)

with multiprocessing.Pool(n_jobs) as pool:
@@ -60,7 +60,7 @@ def main(
assert collection.count_documents(query) == 0


-class DownloadAdditionalData:
+class AdditionalDataDownloader:
def __init__(self, data_type: DataType):
self.data_type = data_type
self.api = PolishCourtAPI()
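The only change here is the rename from the verb phrase `DownloadAdditionalData` to the agent noun `AdditionalDataDownloader`, the usual convention for a class whose instances act as callables. A sketch of that pattern with a hypothetical downloader — a configured instance of a module-level class pickles cleanly into `multiprocessing` workers, which a lambda or closure would not:

```python
import multiprocessing


class AdditionalDataDownloader:
    """Callable object: configured once, then mapped over document batches."""

    def __init__(self, data_type: str):
        self.data_type = data_type

    def __call__(self, batch: list[str]) -> list[str]:
        # Placeholder for the real per-document API download.
        return [f"{self.data_type}/{doc_id}" for doc_id in batch]


if __name__ == "__main__":
    downloader = AdditionalDataDownloader("content")
    with multiprocessing.Pool(2) as pool:
        for result in pool.imap(downloader, [["a", "b"], ["c"]]):
            print(result)
```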
4 changes: 4 additions & 0 deletions scripts/sft/fine_tune_llm.py
@@ -1,3 +1,7 @@
"""
Fine-tune a large language model using SFT.
the script is based on: https://www.philschmid.de/fine-tune-llms-in-2024-with-trl
"""
import os
from pathlib import Path
