-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'refs/heads/master' into feat/llm-fine-tuning
# Conflicts: # juddges/_modidx.py # pyproject.toml # requirements.txt
- Loading branch information
Showing
22 changed files
with
1,762 additions
and
189 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -152,3 +152,4 @@ checklink/cookies.txt | |
.history | ||
|
||
secrets.env | ||
**/postgres-juddges/** |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
76 changes: 76 additions & 0 deletions
76
dashboards/pages/00_🛠️_Extract_Information_from_Judgements.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
import streamlit as st | ||
|
||
from juddges.data.pl_court_api import PolishCourtAPI | ||
from juddges.prompts.information_extraction import ( | ||
EXAMPLE_SCHEMA, | ||
prepare_information_extraction_chain, | ||
prepare_schema_chain, | ||
) | ||
from juddges.settings import prepare_langchain_cache, prepare_mlflow | ||
|
||
prepare_langchain_cache() | ||
prepare_mlflow() | ||
|
||
TITLE = "⚖️ JuDDGES Information Extraction from Court Decisions ⚖️" | ||
|
||
st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide") | ||
|
||
st.title(TITLE) | ||
|
||
st.info( | ||
"You can provide an URL to court decision or plain text of it, describe in written form schema of the information that will be extracted, choose model and language and start extraction." | ||
) | ||
|
||
st.header("Data source") | ||
source_option = st.selectbox("Choose the source of the judgement text:", ["API", "Plain text"]) | ||
|
||
if source_option == "API": | ||
api = PolishCourtAPI() | ||
judgement_url = st.text_input( | ||
"Enter the judgement URL:", | ||
"https://orzeczenia.wroclaw.sa.gov.pl/details/$N/155000000001006_II_AKa_000334_2019_Uz_2020-02-06_001", | ||
) | ||
judgement_id = judgement_url.strip().split("/")[-1] | ||
judgement_text = api.get_content(id=judgement_id) | ||
else: | ||
judgement_text = st.text_area("Enter the judgement text here:", height=500) | ||
|
||
st.header("Schema extraction/definition") | ||
schema_query = st.text_input( | ||
"Ask for schema in natural language:", | ||
"Extract the date, verdict, and court from the judgement.", | ||
) | ||
llm_schema = st.selectbox( | ||
"Select the LLM model (schema)", | ||
["gpt-3.5-turbo-1106", "gpt-4-0125-preview", "gpt-4-1106-preview"], | ||
) | ||
|
||
if st.button("Generate schema to extract information"): | ||
chain = prepare_schema_chain(model_name=llm_schema) | ||
schema = chain.invoke({"SCHEMA_TEXT": schema_query}) | ||
if not schema: | ||
st.error("Could not extract schema from the given query. Try with a different one.") | ||
else: | ||
st.session_state.schema = schema | ||
|
||
schema_text = st.text_area( | ||
"Enter the schema text here:", st.session_state.get("schema") or EXAMPLE_SCHEMA, height=500 | ||
) | ||
|
||
st.header("Information extraction") | ||
llm_extraction = st.selectbox( | ||
"Select the LLM model", ["gpt-4-0125-preview", "gpt-4-1106-preview", "gpt-3.5-turbo-1106"] | ||
) | ||
language = st.selectbox("Enter the language of the judgement text:", ["Polish", "English"]) | ||
|
||
|
||
if st.button("Extract information"): | ||
with st.spinner("Extracting information from the judgement text..."): | ||
chain = prepare_information_extraction_chain(model_name=llm_extraction) | ||
retrieved_informations = chain.invoke( | ||
{"LANGUAGE": language, "TEXT": judgement_text, "SCHEMA": schema_text} | ||
) | ||
col_left, col_right = st.columns(2) | ||
|
||
col_left.write(judgement_text) | ||
col_right.write(retrieved_informations) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import streamlit as st | ||
|
||
from juddges.data.models import get_mongo_collection | ||
|
||
TITLE = "Search for Judgements" | ||
|
||
st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide") | ||
|
||
st.title(TITLE) | ||
|
||
|
||
@st.cache_resource
def get_judgements_collection():
    """Return the ``judgements`` collection obtained from ``get_mongo_collection``.

    The function is wrapped in ``st.cache_resource``, so Streamlit caches the
    returned handle instead of looking it up again on every call.
    """
    collection = get_mongo_collection("judgements")
    return collection
|
||
|
||
judgements_collection = get_judgements_collection() | ||
|
||
|
||
def search_data(query: str, max_judgements: int = 5):
    """Run a ``$text`` search for *query* against the judgements collection.

    Args:
        query: Search string passed as the ``$search`` value.
        max_judgements: Upper bound on the number of documents fetched.

    Returns:
        A list with the matching documents, at most *max_judgements* long.
    """
    text_filter = {"$text": {"$search": query}}
    cursor = judgements_collection.find(text_filter).limit(max_judgements)
    # Materialise the cursor so callers receive a plain list.
    return list(cursor)
|
||
|
||
with st.form(key="search_form"): | ||
text = st.text_area("What you are looking for in the judgements?") | ||
max_judgements = st.slider("Max judgements to show", min_value=1, max_value=20, value=5) | ||
submit_button = st.form_submit_button(label="Search") | ||
|
||
if submit_button: | ||
with st.spinner("Searching..."): | ||
items = search_data(text, max_judgements) | ||
|
||
st.header("Judgements - Results") | ||
for item in items: | ||
st.header(item["signature"]) | ||
st.subheader(item["publicationDate"]) | ||
st.write(item["text"]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import pandas as pd | ||
import streamlit as st | ||
|
||
from juddges.prompts.information_extraction import EXAMPLE_SCHEMA | ||
from juddges.settings import SAMPLE_DATA_PATH | ||
|
||
TITLE = "Analyse Judgements" | ||
|
||
st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide") | ||
|
||
st.title(TITLE) | ||
|
||
|
||
@st.cache_data
def load_data() -> pd.DataFrame:
    """Read the sample CSV with retrieved information into a DataFrame.

    The file is looked up under ``SAMPLE_DATA_PATH``. ``st.cache_data`` is used
    rather than ``st.cache_resource`` because the result is plain data: callers
    get their own copy of the cached frame instead of one shared mutable object.
    """
    return pd.read_csv(SAMPLE_DATA_PATH / "judgements-100-sample-with-retrieved-informations.csv")
|
||
|
||
df = load_data() | ||
|
||
st.info( | ||
"We sampled 100 random judgements from the dataset and extracted information from them. Below is the extracted information and the schema (questions) used to extract it." | ||
) | ||
|
||
st.header("Schema:") | ||
st.write(EXAMPLE_SCHEMA) | ||
|
||
st.header("Extracted Information - tabular format") | ||
st.write(df) | ||
|
||
st.header("Analyse Extracted Information") | ||
|
||
st.subheader("How many judgements we analyzed?") | ||
|
||
st.write(f"Number of judgements: {len(df)}") | ||
|
||
st.subheader("What courts judgement do we analyse") | ||
|
||
st.write(df.groupby("court")["_id"].count()) | ||
|
||
st.subheader("How many judgements are drug offences?") | ||
|
||
drug_offences = df["drug_offence"].sum() | ||
|
||
st.info(f"Number of drug offences: {drug_offences}") | ||
|
||
st.subheader("How many judgements are child offences?") | ||
|
||
child_offences = df["child_offence"].sum() | ||
|
||
st.info(f"Number of child offences: {child_offences}") | ||
|
||
st.subheader("Show examples of judgements that are child offences") | ||
|
||
# Rows selected through the ``child_offence`` column; the name now matches the
# filter (it previously read ``drug_offences_df`` although it held child offences).
# NOTE(review): boolean-mask indexing assumes ``child_offence`` is a bool column - confirm.
child_offences_df = df[df["child_offence"]]

st.write("We can check the sentences of them")

# Render each selected judgement: signature as heading, then its text.
for _, row in child_offences_df.iterrows():
    st.subheader(row["signature"])
    st.markdown(row["text"])
    st.markdown("---")  # Add a horizontal line
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
import streamlit as st | ||
|
||
from juddges.settings import ROOT_PATH | ||
|
||
TITLE = "⚖️ JuDDGES Information Extraction from Court Decisions ⚖️" | ||
|
||
st.set_page_config(page_title=TITLE, page_icon="⚖️", layout="wide") | ||
|
||
st.title(TITLE) | ||
|
||
st.warning("JuDDGES stands for Judicial Decision Data Gathering, Encoding, and Sharing") | ||
|
||
st.info( | ||
"""The JuDDGES project aims to revolutionize the accessibility and analysis of judicial decisions across varied legal systems using advanced Natural Language Processing and Human-In-The-Loop technologies. It focuses on criminal court records from jurisdictions with diverse legal constitutions, including Poland and England & Wales. By overcoming barriers related to resources, language, data, and format inhomogeneity, the project facilitates the development and testing of theories on judicial decision-making and informs judicial policy and practice. Open software and tools produced by the project will enable extensive, flexible meta-annotation of legal texts, benefiting researchers and public legal institutions alike. This initiative not only advances empirical legal research by adopting Open Science principles but also creates the most comprehensive legal research repository in Europe, fostering cross-disciplinary and cross-jurisdictional collaboration.""" | ||
) | ||
|
||
st.image((ROOT_PATH / "nbs/images/baner.png").as_posix(), use_column_width=True) | ||
|
||
st.info( | ||
"The JuDDGES project encompasses several Work Packages (WPs) designed to cover all aspects of its objectives, from project management to the open science practices and engaging early career researchers. Below is an overview of the project’s WPs based on the provided information." | ||
) | ||
|
||
st.header("WP1: Project Management") | ||
st.subheader("Duration: 24 Months") | ||
|
||
st.info( | ||
"Main Aim: To ensure the project’s successful completion on time and within budget. This includes administrative management, scientific and technological management, quality innovation and risk management, ethical and legal consideration, and facilitating open science." | ||
) | ||
|
||
st.header("WP2: Gathering and Human Encoding of Judicial Decision Data") | ||
st.subheader("Duration: 22 Months") | ||
|
||
st.info( | ||
"Main Aim: To establish the data foundation for developing and testing the project’s tools. This involves collating/gathering legal case records and judgments, developing a coding scheme, training human coders, making human-coded data available for WP3, facilitating human-in-loop coding for WP3, and enabling WP4 to make data open and reusable beyond the project team." | ||
) | ||
|
||
st.header("WP3: NLP and HITL Machine Learning Methodological Development") | ||
st.subheader("Duration: 24 Months") | ||
|
||
st.info( | ||
"Main Aim: To create a bridge between machine learning (led by WUST and MUHEC) and Open Science facilitation (by ELICO), focusing on the development and deployment of annotation methodologies. This includes baseline information extraction, intelligent inference methods for legal corpus data, and constructing an annotation tool through active learning and human-in-the-loop annotation methods." | ||
) | ||
|
||
st.header("WP4: Open Science Practices & Engaging Early Career Researchers") | ||
st.subheader("Duration: 12 Months") | ||
|
||
st.info( | ||
"Main Aim: To implement the Open Science policy of the call and engage with relevant early career researchers (ECRs). Objectives include providing open access to publication data and software, disseminating/exploiting project results, and promoting the project and its findings." | ||
) | ||
|
||
st.info( | ||
"Each WP includes specific tasks aimed at achieving its goals, involving collaboration among project partners and contributing to the overarching aim of the JuDDGES project." | ||
) | ||
|
||
st.header("Project Partners") | ||
|
||
st.subheader("Wroclaw University of Science and Technology (WUST)") | ||
st.subheader("Middlesex University London (UK)") | ||
st.subheader("University of Lyon 1 (France).") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
/judgements-100-sample-with-retrieved-informations.csv | ||
/judgements-100-sample.csv |
5 changes: 5 additions & 0 deletions
5
data/sample_data/judgements-100-sample-with-retrieved-informations.csv.dvc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
outs: | ||
- md5: a2f1b05a21c7b1cdbf57f9c8b2245256 | ||
size: 2326000 | ||
hash: md5 | ||
path: judgements-100-sample-with-retrieved-informations.csv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
outs: | ||
- md5: af392a462f059194e87f042fd8de7b20 | ||
size: 2261016 | ||
hash: md5 | ||
path: judgements-100-sample.csv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
import os | ||
|
||
from pymongo import MongoClient | ||
from pymongo.collection import Collection | ||
|
||
# Fail at import time when a required MongoDB setting is absent from the
# environment. The variables are checked in this order: MONGO_URI, MONGO_DB_NAME.
for _required_var in ("MONGO_URI", "MONGO_DB_NAME"):
    if os.environ.get(_required_var) is None:
        raise Exception(f"Missing `{_required_var}` environment variable.")
|
||
|
||
# Clients already created, keyed by connection URI. Reusing them avoids opening
# a new client (and its connections) on every call to ``get_mongo_collection``.
_MONGO_CLIENTS: dict = {}


def get_mongo_collection(collection_name: str = "judgements") -> Collection:
    """Return a collection from the database named by ``MONGO_DB_NAME``.

    The client is built from the ``MONGO_URI`` environment variable the first
    time it is needed and reused on later calls with the same URI.

    Args:
        collection_name: Name of the collection to return.

    Returns:
        The ``Collection`` object for *collection_name*.

    Raises:
        KeyError: If ``MONGO_URI`` or ``MONGO_DB_NAME`` is not set when called.
    """
    uri = os.environ["MONGO_URI"]
    client = _MONGO_CLIENTS.get(uri)
    if client is None:
        client = _MONGO_CLIENTS[uri] = MongoClient(uri)
    db = client[os.environ["MONGO_DB_NAME"]]
    return db[collection_name]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
__version__ = "0.0.1" |
Oops, something went wrong.