Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add llm-as-judge #27

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/nbdev-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ on: [workflow_dispatch, pull_request]
jobs:
nbdev-test:
runs-on: ubuntu-latest
steps:
steps:
- uses: fastai/workflows/nbdev-ci@master
with:
skip_test: true
38 changes: 27 additions & 11 deletions .github/workflows/python.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,26 @@ jobs:
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- uses: actions/cache@v3
cache: 'pip'
cache-dependency-path: |
requirements.txt

- name: Cache pre-commit
uses: actions/cache@v4
with:
path: ${{ env.pythonLocation }}
key: ${{ runner.os }}-${{ env.pythonLocation }}-${{ hashFiles('requirements.txt') }}
path: ~/.cache/pre-commit
key: pre-commit|${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('.pre-commit-config.yaml') }}

- name: Install deps
run: make install_cpu
run: |
python -m pip install --upgrade pip
make install_cpu

- name: Lint
run: make check

Expand All @@ -40,15 +51,20 @@ jobs:
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- uses: actions/cache@v3
with:
path: ${{ env.pythonLocation }}
key: ${{ runner.os }}-${{ env.pythonLocation }}-${{ hashFiles('requirements.txt') }}
cache: 'pip'
cache-dependency-path: |
requirements.txt

- name: Install deps
run: make install_cpu
run: |
python -m pip install --upgrade pip
make install_cpu

- name: Test
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
Expand Down
29 changes: 29 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
hooks:
- id: no-commit-to-branch
name: No commits to master
- id: end-of-file-fixer
name: End-of-file fixer
- name: mixed-line-ending
id: mixed-line-ending
args: [--fix, lf]
- id: trailing-whitespace
name: Remove trailing whitespaces
- id: check-toml
name: Check toml
- id: check-yaml
name: Check yaml
args: [--allow-multiple-documents]


- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.4.10
hooks:
- id: ruff
name: Ruff Linter
args: [--fix, --exit-non-zero-on-fix, juddges, scripts, dashboards, tests]
- id: ruff-format
name: Ruff Formatter
args: [juddges, scripts, dashboards, tests]
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ RUN apt-get update \
&& gdebi -n quarto-1.5.17-linux-amd64.deb \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists \
&& rm -rf /tmp
&& rm -rf /tmp

ENV PYTHONDONTWRITEBYTECODE 1
ENV PYTHONUNBUFFERED 1
Expand Down
9 changes: 3 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@ lint_dirs := juddges scripts dashboards tests
mypy_dirs := juddges scripts dashboards tests

fix:
ruff check $(lint_dirs) --fix
ruff format $(lint_dirs)
pre-commit run --all-files

check:
ruff check $(lint_dirs)
ruff format $(lint_dirs) --check
pre-commit run --all-files

check-types:
mypy --install-types --non-interactive $(mypy_dirs)
Expand All @@ -25,8 +23,7 @@ install:
install_cpu:
pip install --find-links https://download.pytorch.org/whl/cpu -r requirements.txt

# unsloth requires python 3.10
# requires conda environment
# unsloth requires python 3.10 and conda environment
install_unsloth:
conda install --yes pytorch-cuda=12.1 pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers
pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
Expand Down
10 changes: 5 additions & 5 deletions configs/embedding.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ defaults:
- embedding_model: ???
- dataset: pl-court-raw
- _self_
- override hydra/hydra_logging: disabled
- override hydra/job_logging: disabled
- override hydra/hydra_logging: disabled
- override hydra/job_logging: disabled

length_adjust_mode: chunk
chunk_config:
Expand All @@ -14,7 +14,7 @@ batch_size: 64

output_dir: data/embeddings/${dataset.name}/${hydra:runtime.choices.embedding_model}/all_embeddings

hydra:
output_subdir: null
run:
hydra:
output_subdir: null
run:
dir: .
10 changes: 5 additions & 5 deletions configs/fine_tuning.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ defaults:
- model: ???
- dataset: pl-court-instruct
- _self_
- override hydra/hydra_logging: disabled
- override hydra/job_logging: disabled
- override hydra/hydra_logging: disabled
- override hydra/job_logging: disabled

output_dir: data/experiments/fine-tune/${hydra:runtime.choices.model}/${hydra:runtime.choices.dataset}
run_name: ${hydra:runtime.choices.model}_${hydra:runtime.choices.dataset}_fine_tune
Expand All @@ -17,7 +17,7 @@ truncate_context: True
epochs: 1
batch_size: 4

hydra:
output_subdir: null
run:
hydra:
output_subdir: null
run:
dir: .
11 changes: 11 additions & 0 deletions configs/llm_judge.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
defaults:
- model: ???
- _self_

answers_file: ???
out_metric_file: ???
out_predictions_file: ???

generate_kwargs:
max_new_tokens: 20
do_sample: False
14 changes: 8 additions & 6 deletions configs/predict.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,19 @@ defaults:
- model: ???
- dataset: pl-court-instruct
- _self_
- override hydra/hydra_logging: disabled
- override hydra/job_logging: disabled
- override hydra/hydra_logging: disabled
- override hydra/job_logging: disabled

device_map: 'auto'
output_file: data/experiments/predict/${hydra:runtime.choices.dataset}/outputs_${hydra:runtime.choices.model}.json
metrics_file: data/experiments/predict/${hydra:runtime.choices.dataset}/metrics_${hydra:runtime.choices.model}.json

max_new_tokens: 250
truncate_context: True
generate_kwargs:
max_new_tokens: 250
do_sample: False

hydra:
output_subdir: null
run:
hydra:
output_subdir: null
run:
dir: .
81 changes: 41 additions & 40 deletions dashboards/pages/01_🔍_Search_Judgements.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,41 @@
from typing import Any
import streamlit as st

from juddges.data.datasets import get_mongo_collection
from pymongo.collection import Collection

TITLE = "Search for Judgements"

st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide")

st.title(TITLE)


@st.cache_resource
def get_judgements_collection() -> Collection:
return get_mongo_collection("judgements")


judgements_collection = get_judgements_collection()


def search_data(query: str, max_judgements: int = 5) -> list[dict[str, Any]]:
items = list(judgements_collection.find({"$text": {"$search": query}}).limit(max_judgements))
return items


with st.form(key="search_form"):
text = st.text_area("What you are looking for in the judgements?")
max_judgements = st.slider("Max judgements to show", min_value=1, max_value=20, value=5)
submit_button = st.form_submit_button(label="Search")

if submit_button:
with st.spinner("Searching..."):
items = search_data(text, max_judgements)

st.header("Judgements - Results")
for item in items:
st.header(item["signature"])
st.subheader(item["publicationDate"])
st.write(item["text"])
from typing import Any

import streamlit as st
from pymongo.collection import Collection

from juddges.data.datasets import get_mongo_collection

# Page title, reused for both the browser tab and the on-page heading.
TITLE = "Search for Judgements"

# Must be the first Streamlit call in the script (Streamlit requirement).
st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide")

st.title(TITLE)


@st.cache_resource
def get_judgements_collection() -> Collection:
    """Return the MongoDB collection storing judgements.

    Wrapped in ``st.cache_resource`` so the collection handle (and its
    underlying client connection) is created once and reused across
    script reruns instead of reconnecting on every interaction.
    """
    collection = get_mongo_collection("judgements")
    return collection


# Module-level handle shared by the search helper below; obtained through the
# cached factory so repeated script reruns reuse the same connection.
judgements_collection = get_judgements_collection()


def search_data(query: str, max_judgements: int = 5) -> list[dict[str, Any]]:
    """Full-text search over the judgements collection.

    Runs a MongoDB ``$text`` search for ``query`` and returns at most
    ``max_judgements`` matching documents as plain dicts.
    """
    cursor = judgements_collection.find({"$text": {"$search": query}})
    return list(cursor.limit(max_judgements))


# Search UI. Streamlit re-executes this script top-to-bottom on every
# interaction; `submit_button` is True only on the rerun triggered by a click.
with st.form(key="search_form"):
    text = st.text_area("What you are looking for in the judgements?")
    max_judgements = st.slider("Max judgements to show", min_value=1, max_value=20, value=5)
    submit_button = st.form_submit_button(label="Search")

if submit_button:
    with st.spinner("Searching..."):
        items = search_data(text, max_judgements)

    # Render each hit. Assumes every document carries "signature",
    # "publicationDate" and "text" fields — NOTE(review): a document missing
    # any of these would raise KeyError here; confirm against the collection schema.
    st.header("Judgements - Results")
    for item in items:
        st.header(item["signature"])
        st.subheader(item["publicationDate"])
        st.write(item["text"])
Loading
Loading