Skip to content

Commit

Permalink
Merge branch 'refs/heads/master' into feat/llm-fine-tuning
Browse files Browse the repository at this point in the history
# Conflicts:
#	juddges/_modidx.py
#	pyproject.toml
#	requirements.txt
  • Loading branch information
Jakub Binkowski committed Apr 11, 2024
2 parents 402f15b + 566593a commit c91c6e9
Show file tree
Hide file tree
Showing 22 changed files with 1,762 additions and 189 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,4 @@ checklink/cookies.txt
.history

secrets.env
**/postgres-juddges/**
3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
lint_dirs := juddges scripts notebooks dashboards tests
lint_dirs := juddges scripts dashboards tests
mypy_dirs := juddges scripts dashboards tests

fix:
Expand All @@ -8,7 +8,6 @@ fix:
check:
ruff check $(lint_dirs)
ruff format $(lint_dirs) --check
mypy --install-types --non-interactive $(mypy_dirs)

test:
coverage run -m pytest
Expand Down
3 changes: 0 additions & 3 deletions dashboards/app.py

This file was deleted.

76 changes: 76 additions & 0 deletions dashboards/pages/00_🛠️_Extract_Information_from_Judgements.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import streamlit as st

from juddges.data.pl_court_api import PolishCourtAPI
from juddges.prompts.information_extraction import (
    EXAMPLE_SCHEMA,
    prepare_information_extraction_chain,
    prepare_schema_chain,
)
from juddges.settings import prepare_langchain_cache, prepare_mlflow

# Enable LangChain response caching and MLflow tracking for every LLM call
# made from this page.
prepare_langchain_cache()
prepare_mlflow()

TITLE = "⚖️ JuDDGES Information Extraction from Court Decisions ⚖️"

st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide")

st.title(TITLE)

st.info(
    "You can provide an URL to court decision or plain text of it, describe in written form schema of the information that will be extracted, choose model and language and start extraction."
)

# --- Data source: fetch the judgement text from the court API or take a paste ---
st.header("Data source")
source_option = st.selectbox("Choose the source of the judgement text:", ["API", "Plain text"])

if source_option == "API":
    api = PolishCourtAPI()
    judgement_url = st.text_input(
        "Enter the judgement URL:",
        "https://orzeczenia.wroclaw.sa.gov.pl/details/$N/155000000001006_II_AKa_000334_2019_Uz_2020-02-06_001",
    )
    # The judgement id is the last path segment of the permalink URL.
    judgement_id = judgement_url.strip().split("/")[-1]
    judgement_text = api.get_content(id=judgement_id)
else:
    judgement_text = st.text_area("Enter the judgement text here:", height=500)

# --- Schema definition: either generated by an LLM from a natural-language
# --- request, or edited by hand in the text area below.
st.header("Schema extraction/definition")
schema_query = st.text_input(
    "Ask for schema in natural language:",
    "Extract the date, verdict, and court from the judgement.",
)
llm_schema = st.selectbox(
    "Select the LLM model (schema)",
    ["gpt-3.5-turbo-1106", "gpt-4-0125-preview", "gpt-4-1106-preview"],
)

if st.button("Generate schema to extract information"):
    chain = prepare_schema_chain(model_name=llm_schema)
    schema = chain.invoke({"SCHEMA_TEXT": schema_query})
    if not schema:
        st.error("Could not extract schema from the given query. Try with a different one.")
    else:
        # Persist the generated schema across Streamlit reruns so it pre-fills
        # the editable text area below.
        st.session_state.schema = schema

schema_text = st.text_area(
    "Enter the schema text here:", st.session_state.get("schema") or EXAMPLE_SCHEMA, height=500
)

# --- Information extraction: run the extraction chain and show the result
# --- side by side with the source text.
st.header("Information extraction")
llm_extraction = st.selectbox(
    "Select the LLM model", ["gpt-4-0125-preview", "gpt-4-1106-preview", "gpt-3.5-turbo-1106"]
)
language = st.selectbox("Enter the language of the judgement text:", ["Polish", "English"])


if st.button("Extract information"):
    if not judgement_text:
        # Robustness fix: don't spend an LLM call on an empty document (e.g.
        # the user pressed the button before pasting any text).
        st.error("Judgement text is empty. Provide a judgement before extracting information.")
    else:
        with st.spinner("Extracting information from the judgement text..."):
            chain = prepare_information_extraction_chain(model_name=llm_extraction)
            retrieved_informations = chain.invoke(
                {"LANGUAGE": language, "TEXT": judgement_text, "SCHEMA": schema_text}
            )
        col_left, col_right = st.columns(2)

        col_left.write(judgement_text)
        col_right.write(retrieved_informations)
38 changes: 38 additions & 0 deletions dashboards/pages/01_🔍_Search_Judgements.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import streamlit as st

from juddges.data.models import get_mongo_collection

TITLE = "Search for Judgements"

st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide")

st.title(TITLE)


@st.cache_resource
def get_judgements_collection():
    """Return the MongoDB judgements collection, cached for the app's lifetime."""
    return get_mongo_collection("judgements")


judgements_collection = get_judgements_collection()


def search_data(query: str, max_judgements: int = 5):
    """Run a MongoDB full-text search and return at most *max_judgements* hits."""
    cursor = judgements_collection.find({"$text": {"$search": query}})
    return list(cursor.limit(max_judgements))


with st.form(key="search_form"):
    text = st.text_area("What you are looking for in the judgements?")
    max_judgements = st.slider("Max judgements to show", min_value=1, max_value=20, value=5)
    submit_button = st.form_submit_button(label="Search")

if submit_button:
    with st.spinner("Searching..."):
        items = search_data(text, max_judgements)

    # Render each hit as signature / publication date / full text.
    st.header("Judgements - Results")
    for judgement in items:
        for field, render in (
            ("signature", st.header),
            ("publicationDate", st.subheader),
            ("text", st.write),
        ):
            render(judgement[field])
62 changes: 62 additions & 0 deletions dashboards/pages/02_🔍_Analyse_Extracted_Information.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import pandas as pd
import streamlit as st

from juddges.prompts.information_extraction import EXAMPLE_SCHEMA
from juddges.settings import SAMPLE_DATA_PATH

TITLE = "Analyse Judgements"

st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide")

st.title(TITLE)


# NOTE(review): st.cache_data is usually preferred for DataFrames (it returns
# a copy per caller); kept as cache_resource to preserve current behavior.
@st.cache_resource
def load_data() -> pd.DataFrame:
    """Load the 100-judgement sample enriched with LLM-extracted fields."""
    return pd.read_csv(SAMPLE_DATA_PATH / "judgements-100-sample-with-retrieved-informations.csv")


df = load_data()

st.info(
    "We sampled 100 random judgements from the dataset and extracted information from them. Below is the extracted information and the schema (questions) used to extract it."
)

st.header("Schema:")
st.write(EXAMPLE_SCHEMA)

st.header("Extracted Information - tabular format")
st.write(df)

st.header("Analyse Extracted Information")

st.subheader("How many judgements we analyzed?")

st.write(f"Number of judgements: {len(df)}")

st.subheader("What courts judgement do we analyse")

st.write(df.groupby("court")["_id"].count())

st.subheader("How many judgements are drug offences?")

drug_offences = df["drug_offence"].sum()

st.info(f"Number of drug offences: {drug_offences}")

st.subheader("How many judgements are child offences?")

child_offences = df["child_offence"].sum()

st.info(f"Number of child offences: {child_offences}")

st.subheader("Show examples of judgements that are child offences")

# BUG FIX: this frame filters on `child_offence`, so name it accordingly
# (it was previously misnamed `drug_offences_df`).
child_offences_df = df[df["child_offence"]]

st.write("We can check the sentences of them")

for _, row in child_offences_df.iterrows():
    st.subheader(row["signature"])
    st.markdown(row["text"])
    st.markdown("---")  # Add a horizontal line
59 changes: 59 additions & 0 deletions dashboards/⚖️_Project_Information.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import streamlit as st

from juddges.settings import ROOT_PATH

# Landing page of the dashboard: static project description, no user input.
TITLE = "⚖️ JuDDGES Information Extraction from Court Decisions ⚖️"

st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide")

st.title(TITLE)

st.warning("JuDDGES stands for Judicial Decision Data Gathering, Encoding, and Sharing")

st.info(
    """The JuDDGES project aims to revolutionize the accessibility and analysis of judicial decisions across varied legal systems using advanced Natural Language Processing and Human-In-The-Loop technologies. It focuses on criminal court records from jurisdictions with diverse legal constitutions, including Poland and England & Wales. By overcoming barriers related to resources, language, data, and format inhomogeneity, the project facilitates the development and testing of theories on judicial decision-making and informs judicial policy and practice. Open software and tools produced by the project will enable extensive, flexible meta-annotation of legal texts, benefiting researchers and public legal institutions alike. This initiative not only advances empirical legal research by adopting Open Science principles but also creates the most comprehensive legal research repository in Europe, fostering cross-disciplinary and cross-jurisdictional collaboration."""
)

# Banner image lives in the repo; path is resolved from the project root.
st.image((ROOT_PATH / "nbs/images/baner.png").as_posix(), use_column_width=True)

st.info(
    "The JuDDGES project encompasses several Work Packages (WPs) designed to cover all aspects of its objectives, from project management to the open science practices and engaging early career researchers. Below is an overview of the project’s WPs based on the provided information."
)

# One header/subheader/info trio per Work Package (WP1-WP4).
st.header("WP1: Project Management")
st.subheader("Duration: 24 Months")

st.info(
    "Main Aim: To ensure the project’s successful completion on time and within budget. This includes administrative management, scientific and technological management, quality innovation and risk management, ethical and legal consideration, and facilitating open science."
)

st.header("WP2: Gathering and Human Encoding of Judicial Decision Data")
st.subheader("Duration: 22 Months")

st.info(
    "Main Aim: To establish the data foundation for developing and testing the project’s tools. This involves collating/gathering legal case records and judgments, developing a coding scheme, training human coders, making human-coded data available for WP3, facilitating human-in-loop coding for WP3, and enabling WP4 to make data open and reusable beyond the project team."
)

st.header("WP3: NLP and HITL Machine Learning Methodological Development")
st.subheader("Duration: 24 Months")

st.info(
    "Main Aim: To create a bridge between machine learning (led by WUST and MUHEC) and Open Science facilitation (by ELICO), focusing on the development and deployment of annotation methodologies. This includes baseline information extraction, intelligent inference methods for legal corpus data, and constructing an annotation tool through active learning and human-in-the-loop annotation methods."
)

st.header("WP4: Open Science Practices & Engaging Early Career Researchers")
st.subheader("Duration: 12 Months")

st.info(
    "Main Aim: To implement the Open Science policy of the call and engage with relevant early career researchers (ECRs). Objectives include providing open access to publication data and software, disseminating/exploiting project results, and promoting the project and its findings."
)

st.info(
    "Each WP includes specific tasks aimed at achieving its goals, involving collaboration among project partners and contributing to the overarching aim of the JuDDGES project​​."
)

# Consortium members.
st.header("Project Partners")

st.subheader("Wroclaw University of Science and Technology (WUST)")
st.subheader("Middlesex University London (UK)")
st.subheader("University of Lyon 1 (France)​​.")
2 changes: 2 additions & 0 deletions data/sample_data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/judgements-100-sample-with-retrieved-informations.csv
/judgements-100-sample.csv
5 changes: 5 additions & 0 deletions data/sample_data/judgements-100-sample-with-retrieved-informations.csv.dvc
Original file line number	Diff line number	Diff line change
@@ -0,0 +1,5 @@
outs:
- md5: a2f1b05a21c7b1cdbf57f9c8b2245256
size: 2326000
hash: md5
path: judgements-100-sample-with-retrieved-informations.csv
5 changes: 5 additions & 0 deletions data/sample_data/judgements-100-sample.csv.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
outs:
- md5: af392a462f059194e87f042fd8de7b20
size: 2261016
hash: md5
path: judgements-100-sample.csv
9 changes: 3 additions & 6 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,15 @@ name: juddges
services:
web:
build: .
command: streamlit run /app/dashboards/app.py
command: streamlit run /app/dashboards/⚖️_Project_Information.py
volumes:
- ./:/app
- ~/.cache:/root/.cache
      # NOTE(review): machine-specific Windows mount leaked into the shared compose file — remove before merging.
      # - L:\docker-configs\zsh\smartass_zsh_history:/root/.zsh_history
tty: true
shm_size: "2gb"
environment:
- OPENAI_API_KEY=${OPENAI_API_KEY}
- MLFLOW_TRACKING_URI=${MLFLOW_TRACKING_URI}
- MLFLOW_S3_ENDPOINT_URL=${MLFLOW_S3_ENDPOINT_URL}
- LOGNAME=${LOGNAME}
env_file:
- .env
restart: always
deploy:
resources:
Expand Down
7 changes: 6 additions & 1 deletion juddges/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,9 @@
'doc_host': 'https://laugustyniak.github.io',
'git_url': 'https://github.com/laugustyniak/juddges',
'lib_path': 'juddges'},
'syms': {'juddges.data.pl_court_api': {}, 'juddges.preprocessing.parser_base': {}, 'juddges.preprocessing.pl_court_parser': {}}}
'syms': { 'juddges.data.models': {},
'juddges.data.pl_court_api': {},
'juddges.preprocessing.parser_base': {},
'juddges.preprocessing.pl_court_parser': {},
'juddges.prompts.information_extraction': {},
'juddges.settings': {}}}
17 changes: 17 additions & 0 deletions juddges/data/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import os

from pymongo import MongoClient
from pymongo.collection import Collection

# Fail fast at import time: both MongoDB settings are mandatory, so a missing
# variable should abort before any collection access is attempted.
if os.environ.get("MONGO_URI", None) is None:
    raise Exception("Missing `MONGO_URI` environment variable.")


# Name of the database that holds the judgements collections.
if os.environ.get("MONGO_DB_NAME", None) is None:
    raise Exception("Missing `MONGO_DB_NAME` environment variable.")


def get_mongo_collection(collection_name: str = "judgements") -> Collection:
    """Connect to the configured MongoDB and return the requested collection.

    A fresh client is opened on each call; callers that need pooling should
    cache the result (e.g. via Streamlit's ``st.cache_resource``).
    """
    database = MongoClient(os.environ["MONGO_URI"])[os.environ["MONGO_DB_NAME"]]
    return database[collection_name]
1 change: 1 addition & 0 deletions juddges/prompts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__version__ = "0.0.1"
Loading

0 comments on commit c91c6e9

Please sign in to comment.