Skip to content

Commit

Permalink
Merge branch 'refs/heads/master' into feat/llm-fine-tuning
Browse files Browse the repository at this point in the history
# Conflicts:
#	juddges/_modidx.py
#	pyproject.toml
#	requirements.txt
  • Loading branch information
Jakub Binkowski committed Apr 11, 2024
2 parents 402f15b + 566593a commit c91c6e9
Show file tree
Hide file tree
Showing 22 changed files with 1,762 additions and 189 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,4 @@ checklink/cookies.txt
.history

secrets.env
**/postgres-juddges/**
3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
lint_dirs := juddges scripts notebooks dashboards tests
lint_dirs := juddges scripts dashboards tests
mypy_dirs := juddges scripts dashboards tests

fix:
Expand All @@ -8,7 +8,6 @@ fix:
check:
ruff check $(lint_dirs)
ruff format $(lint_dirs) --check
mypy --install-types --non-interactive $(mypy_dirs)

test:
coverage run -m pytest
Expand Down
3 changes: 0 additions & 3 deletions dashboards/app.py

This file was deleted.

76 changes: 76 additions & 0 deletions dashboards/pages/00_🛠️_Extract_Information_from_Judgements.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import streamlit as st

from juddges.data.pl_court_api import PolishCourtAPI
from juddges.prompts.information_extraction import (
    EXAMPLE_SCHEMA,
    prepare_information_extraction_chain,
    prepare_schema_chain,
)
from juddges.settings import prepare_langchain_cache, prepare_mlflow

# Enable LangChain response caching and MLflow tracking for every LLM call
# made from this page.
prepare_langchain_cache()
prepare_mlflow()

TITLE = "⚖️ JuDDGES Information Extraction from Court Decisions ⚖️"

st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide")

st.title(TITLE)

st.info(
    "You can provide an URL to court decision or plain text of it, describe in written form schema of the information that will be extracted, choose model and language and start extraction."
)

# --- Data source: fetch the judgement text from the court API or take a paste ---
st.header("Data source")
source_option = st.selectbox("Choose the source of the judgement text:", ["API", "Plain text"])

if source_option == "API":
    api = PolishCourtAPI()
    judgement_url = st.text_input(
        "Enter the judgement URL:",
        "https://orzeczenia.wroclaw.sa.gov.pl/details/$N/155000000001006_II_AKa_000334_2019_Uz_2020-02-06_001",
    )
    # The judgement id is the last path segment of the permalink URL.
    judgement_id = judgement_url.strip().split("/")[-1]
    judgement_text = api.get_content(id=judgement_id)
else:
    judgement_text = st.text_area("Enter the judgement text here:", height=500)

# --- Schema definition: either generated by an LLM from a natural-language
# --- request, or edited by hand in the text area below.
st.header("Schema extraction/definition")
schema_query = st.text_input(
    "Ask for schema in natural language:",
    "Extract the date, verdict, and court from the judgement.",
)
llm_schema = st.selectbox(
    "Select the LLM model (schema)",
    ["gpt-3.5-turbo-1106", "gpt-4-0125-preview", "gpt-4-1106-preview"],
)

if st.button("Generate schema to extract information"):
    chain = prepare_schema_chain(model_name=llm_schema)
    schema = chain.invoke({"SCHEMA_TEXT": schema_query})
    if not schema:
        st.error("Could not extract schema from the given query. Try with a different one.")
    else:
        # Persist the generated schema across Streamlit reruns so it pre-fills
        # the editable text area below.
        st.session_state.schema = schema

schema_text = st.text_area(
    "Enter the schema text here:", st.session_state.get("schema") or EXAMPLE_SCHEMA, height=500
)

# --- Information extraction: run the extraction chain and show the result
# --- side by side with the source text.
st.header("Information extraction")
llm_extraction = st.selectbox(
    "Select the LLM model", ["gpt-4-0125-preview", "gpt-4-1106-preview", "gpt-3.5-turbo-1106"]
)
language = st.selectbox("Enter the language of the judgement text:", ["Polish", "English"])


if st.button("Extract information"):
    if not judgement_text:
        # Robustness fix: don't spend an LLM call on an empty document (e.g.
        # the user pressed the button before pasting any text).
        st.error("Judgement text is empty. Provide a judgement before extracting information.")
    else:
        with st.spinner("Extracting information from the judgement text..."):
            chain = prepare_information_extraction_chain(model_name=llm_extraction)
            retrieved_informations = chain.invoke(
                {"LANGUAGE": language, "TEXT": judgement_text, "SCHEMA": schema_text}
            )
        col_left, col_right = st.columns(2)

        col_left.write(judgement_text)
        col_right.write(retrieved_informations)
38 changes: 38 additions & 0 deletions dashboards/pages/01_🔍_Search_Judgements.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import streamlit as st

from juddges.data.models import get_mongo_collection

TITLE = "Search for Judgements"

st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide")

st.title(TITLE)


@st.cache_resource
def get_judgements_collection():
    """Return the MongoDB judgements collection, cached for the app's lifetime."""
    return get_mongo_collection("judgements")


judgements_collection = get_judgements_collection()


def search_data(query: str, max_judgements: int = 5):
    """Run a MongoDB full-text search and return at most *max_judgements* hits."""
    cursor = judgements_collection.find({"$text": {"$search": query}})
    return list(cursor.limit(max_judgements))


with st.form(key="search_form"):
    text = st.text_area("What you are looking for in the judgements?")
    max_judgements = st.slider("Max judgements to show", min_value=1, max_value=20, value=5)
    submit_button = st.form_submit_button(label="Search")

if submit_button:
    with st.spinner("Searching..."):
        items = search_data(text, max_judgements)

    # Render each hit as signature / publication date / full text.
    st.header("Judgements - Results")
    for judgement in items:
        for field, render in (
            ("signature", st.header),
            ("publicationDate", st.subheader),
            ("text", st.write),
        ):
            render(judgement[field])
62 changes: 62 additions & 0 deletions dashboards/pages/02_🔍_Analyse_Extracted_Information.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import pandas as pd
import streamlit as st

from juddges.prompts.information_extraction import EXAMPLE_SCHEMA
from juddges.settings import SAMPLE_DATA_PATH

TITLE = "Analyse Judgements"

st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide")

st.title(TITLE)


# NOTE(review): st.cache_data is usually preferred for DataFrames (it returns
# a copy per caller); kept as cache_resource to preserve current behavior.
@st.cache_resource
def load_data() -> pd.DataFrame:
    """Load the 100-judgement sample enriched with LLM-extracted fields."""
    return pd.read_csv(SAMPLE_DATA_PATH / "judgements-100-sample-with-retrieved-informations.csv")


df = load_data()

st.info(
    "We sampled 100 random judgements from the dataset and extracted information from them. Below is the extracted information and the schema (questions) used to extract it."
)

st.header("Schema:")
st.write(EXAMPLE_SCHEMA)

st.header("Extracted Information - tabular format")
st.write(df)

st.header("Analyse Extracted Information")

st.subheader("How many judgements we analyzed?")

st.write(f"Number of judgements: {len(df)}")

st.subheader("What courts judgement do we analyse")

st.write(df.groupby("court")["_id"].count())

st.subheader("How many judgements are drug offences?")

drug_offences = df["drug_offence"].sum()

st.info(f"Number of drug offences: {drug_offences}")

st.subheader("How many judgements are child offences?")

child_offences = df["child_offence"].sum()

st.info(f"Number of child offences: {child_offences}")

st.subheader("Show examples of judgements that are child offences")

# BUG FIX: this frame filters on `child_offence`, so name it accordingly
# (it was previously misnamed `drug_offences_df`).
child_offences_df = df[df["child_offence"]]

st.write("We can check the sentences of them")

for _, row in child_offences_df.iterrows():
    st.subheader(row["signature"])
    st.markdown(row["text"])
    st.markdown("---")  # Add a horizontal line
59 changes: 59 additions & 0 deletions dashboards/⚖️_Project_Information.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import streamlit as st

from juddges.settings import ROOT_PATH

# Landing page of the dashboard: static project description, no user input.
TITLE = "⚖️ JuDDGES Information Extraction from Court Decisions ⚖️"

st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide")

st.title(TITLE)

st.warning("JuDDGES stands for Judicial Decision Data Gathering, Encoding, and Sharing")

st.info(
    """The JuDDGES project aims to revolutionize the accessibility and analysis of judicial decisions across varied legal systems using advanced Natural Language Processing and Human-In-The-Loop technologies. It focuses on criminal court records from jurisdictions with diverse legal constitutions, including Poland and England & Wales. By overcoming barriers related to resources, language, data, and format inhomogeneity, the project facilitates the development and testing of theories on judicial decision-making and informs judicial policy and practice. Open software and tools produced by the project will enable extensive, flexible meta-annotation of legal texts, benefiting researchers and public legal institutions alike. This initiative not only advances empirical legal research by adopting Open Science principles but also creates the most comprehensive legal research repository in Europe, fostering cross-disciplinary and cross-jurisdictional collaboration."""
)

# Banner image lives in the repo; path is resolved from the project root.
st.image((ROOT_PATH / "nbs/images/baner.png").as_posix(), use_column_width=True)

st.info(
    "The JuDDGES project encompasses several Work Packages (WPs) designed to cover all aspects of its objectives, from project management to the open science practices and engaging early career researchers. Below is an overview of the project’s WPs based on the provided information."
)

# One header/subheader/info trio per Work Package (WP1-WP4).
st.header("WP1: Project Management")
st.subheader("Duration: 24 Months")

st.info(
    "Main Aim: To ensure the project’s successful completion on time and within budget. This includes administrative management, scientific and technological management, quality innovation and risk management, ethical and legal consideration, and facilitating open science."
)

st.header("WP2: Gathering and Human Encoding of Judicial Decision Data")
st.subheader("Duration: 22 Months")

st.info(
    "Main Aim: To establish the data foundation for developing and testing the project’s tools. This involves collating/gathering legal case records and judgments, developing a coding scheme, training human coders, making human-coded data available for WP3, facilitating human-in-loop coding for WP3, and enabling WP4 to make data open and reusable beyond the project team."
)

st.header("WP3: NLP and HITL Machine Learning Methodological Development")
st.subheader("Duration: 24 Months")

st.info(
    "Main Aim: To create a bridge between machine learning (led by WUST and MUHEC) and Open Science facilitation (by ELICO), focusing on the development and deployment of annotation methodologies. This includes baseline information extraction, intelligent inference methods for legal corpus data, and constructing an annotation tool through active learning and human-in-the-loop annotation methods."
)

st.header("WP4: Open Science Practices & Engaging Early Career Researchers")
st.subheader("Duration: 12 Months")

st.info(
    "Main Aim: To implement the Open Science policy of the call and engage with relevant early career researchers (ECRs). Objectives include providing open access to publication data and software, disseminating/exploiting project results, and promoting the project and its findings."
)

st.info(
    "Each WP includes specific tasks aimed at achieving its goals, involving collaboration among project partners and contributing to the overarching aim of the JuDDGES project​​."
)

# Consortium members.
st.header("Project Partners")

st.subheader("Wroclaw University of Science and Technology (WUST)")
st.subheader("Middlesex University London (UK)")
st.subheader("University of Lyon 1 (France)​​.")
2 changes: 2 additions & 0 deletions data/sample_data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/judgements-100-sample-with-retrieved-informations.csv
/judgements-100-sample.csv
5 changes: 5 additions & 0 deletions data/sample_data/judgements-100-sample-with-retrieved-informations.csv.dvc
Original file line number	Diff line number	Diff line change
@@ -0,0 +1,5 @@
outs:
- md5: a2f1b05a21c7b1cdbf57f9c8b2245256
size: 2326000
hash: md5
path: judgements-100-sample-with-retrieved-informations.csv
5 changes: 5 additions & 0 deletions data/sample_data/judgements-100-sample.csv.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
outs:
- md5: af392a462f059194e87f042fd8de7b20
size: 2261016
hash: md5
path: judgements-100-sample.csv
9 changes: 3 additions & 6 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,15 @@ name: juddges
services:
web:
build: .
command: streamlit run /app/dashboards/app.py
command: streamlit run /app/dashboards/⚖️_Project_Information.py
volumes:
- ./:/app
- ~/.cache:/root/.cache
      # NOTE(review): machine-specific Windows mount leaked into the shared compose file — remove before merging.
      # - L:\docker-configs\zsh\smartass_zsh_history:/root/.zsh_history
tty: true
shm_size: "2gb"
environment:
- OPENAI_API_KEY=${OPENAI_API_KEY}
- MLFLOW_TRACKING_URI=${MLFLOW_TRACKING_URI}
- MLFLOW_S3_ENDPOINT_URL=${MLFLOW_S3_ENDPOINT_URL}
- LOGNAME=${LOGNAME}
env_file:
- .env
restart: always
deploy:
resources:
Expand Down
7 changes: 6 additions & 1 deletion juddges/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,9 @@
'doc_host': 'https://laugustyniak.github.io',
'git_url': 'https://github.com/laugustyniak/juddges',
'lib_path': 'juddges'},
'syms': {'juddges.data.pl_court_api': {}, 'juddges.preprocessing.parser_base': {}, 'juddges.preprocessing.pl_court_parser': {}}}
'syms': { 'juddges.data.models': {},
'juddges.data.pl_court_api': {},
'juddges.preprocessing.parser_base': {},
'juddges.preprocessing.pl_court_parser': {},
'juddges.prompts.information_extraction': {},
'juddges.settings': {}}}
17 changes: 17 additions & 0 deletions juddges/data/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import os

from pymongo import MongoClient
from pymongo.collection import Collection

# Fail fast at import time: both MongoDB settings are mandatory, so a missing
# variable should abort before any collection access is attempted.
if os.environ.get("MONGO_URI", None) is None:
    raise Exception("Missing `MONGO_URI` environment variable.")


# Name of the database that holds the judgements collections.
if os.environ.get("MONGO_DB_NAME", None) is None:
    raise Exception("Missing `MONGO_DB_NAME` environment variable.")


def get_mongo_collection(collection_name: str = "judgements") -> Collection:
    """Connect to the configured MongoDB and return the requested collection.

    A fresh client is opened on each call; callers that need pooling should
    cache the result (e.g. via Streamlit's ``st.cache_resource``).
    """
    database = MongoClient(os.environ["MONGO_URI"])[os.environ["MONGO_DB_NAME"]]
    return database[collection_name]
1 change: 1 addition & 0 deletions juddges/prompts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__version__ = "0.0.1"
Loading

0 comments on commit c91c6e9

Please sign in to comment.