Add English instruct dataset building and inference over it with gpt-4o-mini
Showing 6 changed files with 215 additions and 12 deletions.
@@ -0,0 +1,6 @@
name: JuDDGES/en-court-instruct
prompt_field: prompt
context_field: context
output_field: output

max_output_tokens: 300
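This config wires the prediction pipeline to the instruct dataset: `prompt_field` and `context_field` name the input columns, `output_field` names the gold answer, and `max_output_tokens` presumably caps generation length. The predict script itself is not among the hunks shown here, so the following is only a minimal sketch of how these fields could drive gpt-4o-mini via the OpenAI client; the config filename and the inference loop are assumptions.

```python
# Minimal sketch, assuming the config is stored as config.yaml (hypothetical
# path) and that prediction uses the OpenAI chat completions API.
import yaml
from datasets import load_dataset
from openai import OpenAI

config = yaml.safe_load(open("config.yaml"))
ds = load_dataset(config["name"], split="test")
client = OpenAI()

item = ds[0]
# The stored prompt keeps a literal {context} placeholder that is filled here.
prompt = item[config["prompt_field"]].format(context=item[config["context_field"]])
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": prompt}],
    max_tokens=config["max_output_tokens"],
)
prediction = response.choices[0].message.content
gold = item[config["output_field"]]  # reference answer for metric computation
```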
data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/.gitignore (2 additions, 0 deletions)
@@ -0,0 +1,2 @@
/outputs_997.json
/metrics_997.json
@@ -0,0 +1,129 @@
from pathlib import Path
from typing import Any, Optional

import typer
import yaml
from datasets import load_dataset
from dotenv import load_dotenv
from loguru import logger

from juddges.settings import PL_JUDGEMENTS_PATH_INSTRUCT

load_dotenv()

MAX_SHARD_SIZE = "4GB"
SOURCE_DATASET_PATH = "JuDDGES/en-court-raw"
TEXT_FIELD = "content"

SCHEMA_TEMPLATE = "```yaml\n{schema}\n```"
# Note: {{context}} is doubly braced so that, after .format(schema=...), the
# prompt keeps a literal {context} placeholder to be filled at inference time.
INSTRUCTION_TEMPLATE = """
You are extracting information from court judgments.
Extract the specified values strictly from the provided judgment. If information is not provided in the judgment, leave the field with a null value.
Please return the response in the identical YAML format:
{schema}
=====
{{context}}
=====
"""
SCHEMA_DESC = {
    "judges": "<list of judge full names>",
    "citation": "<string containing the neutral citation number>",
    "type": "<type of the court, one of: crown_court,supreme_court,martial_court,high_court_administrative_court,high_court_division_court,civil_criminal_court,division_court>",
}

PROMPT = INSTRUCTION_TEMPLATE.format(
    schema=SCHEMA_TEMPLATE.format(schema=yaml.dump(SCHEMA_DESC).strip())
)

FEATURES = [
    "judges",
    "citation",
    "type",
    TEXT_FIELD,
]
# Identity mapping from schema keys to dataset column names.
SCHEMA_2_FEATURES = dict(zip(FEATURES, FEATURES))


# TODO: in the future this could be made a single script (for any language) with configurable preprocessing
def main(
    dataset_dir: str = typer.Option(SOURCE_DATASET_PATH, help="Path to the dataset directory"),
    repo_id: Optional[str] = typer.Option(None),
    target_dir: Path = typer.Option(
        PL_JUDGEMENTS_PATH_INSTRUCT,
        help="Path to the target directory",
    ),
    test_size: int = typer.Option(2_000, help="Size of the test set"),
    random_seed: int = typer.Option(42, help="Random seed"),
    num_jobs: Optional[int] = typer.Option(
        None,
        envvar="NUM_JOBS",
        help="Number of parallel jobs to use",
    ),
    branch: Optional[str] = typer.Option(None, help="Branch to push the dataset to"),
    commit_message: Optional[str] = typer.Option(
        None, help="Commit message", envvar="COMMIT_MESSAGE"
    ),
) -> None:
    feature_cols = ["_id"] + FEATURES
    logger.info("Loading dataset...")
    ds = load_dataset(dataset_dir, columns=feature_cols)
    assert all(col in ds.column_names["train"] for col in feature_cols)

    initial_size = ds["train"].num_rows
    logger.info(f"Pre-filtering dataset (initial size={initial_size})...")

    ds = ds.filter(_pre_filter)

    pre_filtered_size = ds["train"].num_rows
    logger.info(
        f"Finished pre-filtering (size={pre_filtered_size}, removed {initial_size - pre_filtered_size})"
    )

    ds = ds.filter(_filter)

    filtered_size = ds["train"].num_rows
    logger.info(
        f"Finished filtering (size={filtered_size}, "
        f"removed {initial_size - filtered_size} from original, "
        f"and {pre_filtered_size - filtered_size} from pre-filtered)"
    )

    logger.info("Generating instructions...")
    ds = ds.map(to_instruction_fmt, num_proc=num_jobs, remove_columns=FEATURES)
    ds = ds["train"].train_test_split(test_size=test_size, seed=random_seed)

    logger.info("Built dataset with the following parameters: {ds_info}", ds_info=str(ds))

    if repo_id:
        ds.push_to_hub(
            repo_id,
            max_shard_size=MAX_SHARD_SIZE,
            commit_message=commit_message,
            revision=branch,
        )
    else:
        ds.save_to_disk(target_dir, max_shard_size=MAX_SHARD_SIZE, num_proc=num_jobs)


def _pre_filter(item: dict[str, Any]) -> bool:
    # Drop rows with any missing feature value.
    return not any(item[feat] is None for feat in FEATURES)


def _filter(item: dict[str, Any]) -> bool:
    # Keep only judgments whose text mentions every listed judge.
    all_judges_in_text = all(j in item[TEXT_FIELD] for j in item["judges"])
    return all_judges_in_text


def to_instruction_fmt(item: dict[str, Any]) -> dict[str, str]:
    # Render the ground-truth YAML answer inside the same ```yaml fence used in the prompt.
    yaml_output = yaml.dump(
        {k: item[SCHEMA_2_FEATURES[k]] for k in SCHEMA_DESC.keys()},
        allow_unicode=True,
    ).strip()
    output = SCHEMA_TEMPLATE.format(schema=yaml_output)

    return {"prompt": PROMPT, "context": item[TEXT_FIELD], "output": output}


if __name__ == "__main__":
    typer.run(main)
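To make the record format concrete, here is a small illustration of what `to_instruction_fmt` produces, using a made-up judgment (all field values are toy data):

```python
# Toy item, purely illustrative -- values are invented for demonstration.
item = {
    "_id": "example-1",
    "judges": ["Jane Doe"],
    "citation": "[2023] EWHC 123 (Admin)",
    "type": "high_court_administrative_court",
    "content": "Before: Jane Doe ... full judgment text mentioning Jane Doe ...",
}

assert _pre_filter(item) and _filter(item)  # no nulls, judge name appears in text
record = to_instruction_fmt(item)
print(record["prompt"])   # instruction with the YAML schema and a literal {context} slot
print(record["output"])   # ```yaml fenced block with citation/judges/type filled in
```

Records then go through `train_test_split` and are either pushed to the Hub (`--repo-id`, optionally `--branch`) or saved locally to `--target-dir`.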
File renamed without changes.