Skip to content

Commit

Permalink
Add building english instruct dataset and infer over it with gpt-4o-mini
Browse files Browse the repository at this point in the history
  • Loading branch information
binkjakub committed Aug 26, 2024
1 parent ceaed8b commit c76b0b7
Show file tree
Hide file tree
Showing 6 changed files with 215 additions and 12 deletions.
6 changes: 6 additions & 0 deletions configs/dataset/en-court-instruct.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
name: JuDDGES/en-court-instruct
prompt_field: prompt
context_field: context
output_field: output

max_output_tokens: 300
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/outputs_997.json
/metrics_997.json
42 changes: 42 additions & 0 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1843,3 +1843,45 @@ stages:
hash: md5
md5: adb7c1e239396bbf6e308f3f1b436099
size: 307
build_instruct_dataset_en:
cmd: PYTHONPATH=. python scripts/dataset/build_instruct_dataset_en.py --repo-id
JuDDGES/en-court-instruct
deps:
- path: scripts/dataset/build_instruct_dataset_en.py
hash: md5
md5: fb5d5943c75bbe6724f335d7a8a491b1
size: 4176
predict_with_api@en-court-instruct-gpt-4o-mini-997:
cmd: PYTHONPATH=. python scripts/sft/predict_with_api.py dataset=en-court-instruct
model_version=gpt-4o-mini seed=997
output_file=data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
deps:
- path: configs/predict_with_api.yaml
hash: md5
md5: aff18078742a14c3d8ce2cd74e718d44
size: 320
- path: scripts/sft/predict_with_api.py
hash: md5
md5: 610d32b0036ae6eef4480c5a30f07999
size: 3987
outs:
- path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
hash: md5
md5: 276e2f7aa0d8d1e25265532286a282ef
size: 696620
evaluate@en-court-instruct-open_ai_gpt-4o-mini-997:
cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
deps:
- path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/outputs_997.json
hash: md5
md5: 276e2f7aa0d8d1e25265532286a282ef
size: 696620
- path: scripts/sft/evaluate.py
hash: md5
md5: 73aa4a7eb8a035c087702457b9401654
size: 636
outs:
- path: data/experiments/predict/en-court-instruct/open_ai_gpt-4o-mini/metrics_997.json
hash: md5
md5: 3620e35fe5a600672708437032eb6544
size: 157
48 changes: 36 additions & 12 deletions dvc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,23 @@ stages:
outs:
- data/datasets/pl/readme/instruct/

build_instruct_dataset:
build_instruct_dataset_pl:
cmd: >-
PYTHONPATH=. python scripts/dataset/build_instruct_dataset.py
PYTHONPATH=. python scripts/dataset/build_instruct_dataset_pl.py
--dataset-dir data/datasets/pl/raw
--repo-id JuDDGES/pl-court-instruct
deps:
- data/datasets/pl/raw
- scripts/dataset/build_instruct_dataset.py
- scripts/dataset/build_instruct_dataset_pl.py
desc: "Build instruction dataset (set NUM_JOBS envar) with labels extracted from API/text"

build_instruct_dataset_en:
cmd: >-
PYTHONPATH=. python scripts/dataset/build_instruct_dataset_en.py
--repo-id JuDDGES/en-court-instruct
deps:
- scripts/dataset/build_instruct_dataset_en.py

embed:
matrix:
model:
Expand Down Expand Up @@ -85,22 +92,30 @@ stages:

sft_unsloth:
matrix:
dataset:
- pl-court-instruct
- en-court-instruct
model:
- Unsloth-Llama-3-8B-Instruct
- Unsloth-Mistral-7B-Instruct-v0.3
- Unsloth-Mistral-Nemo-Instruct-2407
- Bielik-7B-Instruct-v0.1
cmd: >-
PYTHONPATH=. python scripts/sft/fine_tune_llm.py model=${item.model}
PYTHONPATH=. python scripts/sft/fine_tune_llm.py
dataset=${item.dataset}
model=${item.model}
deps:
- scripts/sft/fine_tune_llm.py
- configs/fine_tuning.yaml
- configs/model/${item.model}.yaml
outs:
- data/experiments/fine-tune/${item.model}/pl-court-instruct
- data/experiments/fine-tune/${item.model}/${item.dataset}/

predict:
matrix:
dataset:
- pl-court-instruct
- en-court-instruct
model:
- Unsloth-Llama-3-8B-Instruct
- Unsloth-Llama-3-8B-Instruct-fine-tuned
Expand All @@ -113,36 +128,44 @@ stages:
seed: ${seeds}
cmd: >-
PYTHONPATH=. python scripts/sft/predict.py
dataset=${item.dataset}
model=${item.model}
random_seed=${item.seed}
output_file=data/experiments/predict/pl-court-instruct/${item.model}/outputs_${item.seed}.json
output_file=data/experiments/predict/${item.dataset}/${item.model}/outputs_${item.seed}.json
deps:
- scripts/sft/predict.py
- configs/predict.yaml
- configs/model/${item.model}.yaml
outs:
- data/experiments/predict/pl-court-instruct/${item.model}/outputs_${item.seed}.json
- data/experiments/predict/${item.dataset}/${item.model}/outputs_${item.seed}.json

predict_with_api:
matrix:
dataset:
- pl-court-instruct
- en-court-instruct
model:
- gpt-4o
- gpt-4o-mini
seed:
- 997
cmd: >-
PYTHONPATH=. python scripts/sft/predict_with_api.py
dataset=${item.dataset}
model_version=${item.model}
seed=${item.seed}
output_file=data/experiments/predict/pl-court-instruct/open_ai_${item.model}/outputs_${item.seed}.json
output_file=data/experiments/predict/${item.dataset}/open_ai_${item.model}/outputs_${item.seed}.json
deps:
- scripts/sft/predict_with_api.py
- configs/predict_with_api.yaml
outs:
- data/experiments/predict/pl-court-instruct/open_ai_${item.model}/outputs_${item.seed}.json
- data/experiments/predict/${item.dataset}/open_ai_${item.model}/outputs_${item.seed}.json

evaluate:
matrix:
dataset:
- pl-court-instruct
- en-court-instruct
model:
- Unsloth-Llama-3-8B-Instruct
- Unsloth-Llama-3-8B-Instruct-fine-tuned
Expand All @@ -157,12 +180,12 @@ stages:
seed: ${seeds}
cmd: >-
PYTHONPATH=. python scripts/sft/evaluate.py
--output-file data/experiments/predict/pl-court-instruct/${item.model}/outputs_${item.seed}.json
--output-file data/experiments/predict/${item.dataset}/${item.model}/outputs_${item.seed}.json
deps:
- scripts/sft/evaluate.py
- data/experiments/predict/pl-court-instruct/${item.model}/outputs_${item.seed}.json
- data/experiments/predict/${item.dataset}/${item.model}/outputs_${item.seed}.json
outs:
- data/experiments/predict/pl-court-instruct/${item.model}/metrics_${item.seed}.json
- data/experiments/predict/${item.dataset}/${item.model}/metrics_${item.seed}.json

evaluate_llm_as_judge:
matrix:
Expand All @@ -189,6 +212,7 @@ stages:
matrix:
dir:
- data/experiments/predict/pl-court-instruct
- data/experiments/predict/en-court-instruct
cmd: >-
PYTHONPATH=. python scripts/sft/summarize_metrics.py
--root-dir ${item.dir}
Expand Down
129 changes: 129 additions & 0 deletions scripts/dataset/build_instruct_dataset_en.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from pathlib import Path
from typing import Any, Optional

import typer
import yaml
from datasets import load_dataset
from dotenv import load_dotenv
from loguru import logger

from juddges.settings import PL_JUDGEMENTS_PATH_INSTRUCT

# Load variables from a local .env file (if any) before the CLI options
# that read environment variables are evaluated.
load_dotenv()

# Shard size limit passed to `push_to_hub` / `save_to_disk`.
MAX_SHARD_SIZE = "4GB"
# Default source dataset handed to `load_dataset`.
SOURCE_DATASET_PATH = "JuDDGES/en-court-raw"
# Column holding the full text of a judgment.
TEXT_FIELD = "content"

# Wraps a YAML document in a fenced code block.
SCHEMA_TEMPLATE = "```yaml\n{schema}\n```"
# `{schema}` is substituted when PROMPT is built below; `{{context}}` is
# escaped so that it survives `.format` as a literal `{context}` placeholder.
INSTRUCTION_TEMPLATE = """
You are extracting information from the court judgments.
Extract specified values strictly from the provided judgement. If information is not provided in the judgement, leave the field with null value.
Please return the response in the identical YAML format:
{schema}
=====
{{context}}
======
"""
# Schema shown to the model: field name -> description of the expected value.
SCHEMA_DESC = {
    "judges": "<list of judge full names>",
    "citation": "<string containing the neutral citation number>",
    "type": "<type of the court, one of: crown_court,supreme_court,martial_court,high_court_administrative_court,high_court_division_court,civil_criminal_court,division_court>",
}

# Final prompt: the instruction template with the fenced YAML schema filled in.
PROMPT = INSTRUCTION_TEMPLATE.format(
    schema=SCHEMA_TEMPLATE.format(schema=yaml.dump(SCHEMA_DESC).strip())
)

# Source columns required by this script (labels plus the judgment text).
FEATURES = [
    "judges",
    "citation",
    "type",
    TEXT_FIELD,
]
# Identity mapping from each feature name to itself.
SCHEMA_2_FEATURES = dict(zip(FEATURES, FEATURES))


# TODO: In the future, this could become a single script (for any language) with configurable preprocessing
def main(
    dataset_dir: str = typer.Option(SOURCE_DATASET_PATH, help="Path to the dataset directory"),
    repo_id: Optional[str] = typer.Option(None),
    target_dir: Path = typer.Option(
        PL_JUDGEMENTS_PATH_INSTRUCT,
        help="Path to the target directory",
    ),
    test_size: int = typer.Option(2_000, help="Size of the test set"),
    random_seed: int = typer.Option(42, help="Random seed"),
    num_jobs: Optional[int] = typer.Option(
        None,
        envvar="NUM_JOBS",
        help="Number of parallel jobs to use",
    ),
    branch: Optional[str] = typer.Option(None, help="Branch to push the dataset to"),
    commit_message: Optional[str] = typer.Option(
        None, help="Commit message", envvar="COMMIT_MESSAGE"
    ),
) -> None:
    """Build the instruction dataset and publish it or save it locally.

    Loads the source dataset, drops records with missing fields, drops records
    whose judge names do not all occur in the text, converts the rest to the
    prompt/context/output format and splits the result into train/test.

    Args:
        dataset_dir: Dataset path or identifier passed to ``load_dataset``.
        repo_id: When set, the dataset is pushed to this repository;
            otherwise it is saved to ``target_dir``.
        target_dir: Output directory used only when ``repo_id`` is not set.
        test_size: ``test_size`` passed to ``train_test_split``.
        random_seed: Seed passed to ``train_test_split``.
        num_jobs: Number of processes for mapping and for saving to disk.
        branch: Revision to push to (only used with ``repo_id``).
        commit_message: Commit message for the push (only used with ``repo_id``).

    Raises:
        ValueError: If the loaded dataset lacks any of the required columns.
    """
    # NOTE(review): the default `target_dir` above is the PL_* constant although
    # this script builds the English dataset -- confirm this is intended.
    feature_cols = ["_id"] + FEATURES
    logger.info("Loading dataset...")
    ds = load_dataset(dataset_dir, columns=feature_cols)

    # Validate explicitly instead of using `assert`, which is stripped under
    # `python -O` and would not say which columns are absent.
    available_cols = ds.column_names["train"]
    missing_cols = [col for col in feature_cols if col not in available_cols]
    if missing_cols:
        raise ValueError(f"Dataset {dataset_dir} is missing required columns: {missing_cols}")

    initial_size = ds["train"].num_rows
    logger.info(f"Pre-filtering dataset (initial size={initial_size})...")

    # The null check must run first: `_filter` iterates over `judges` and
    # searches inside the text, so both have to be non-null.
    ds = ds.filter(_pre_filter)

    pre_filtered_size = ds["train"].num_rows
    logger.info(
        f"Finished pre-filtering (size={pre_filtered_size}, removed {initial_size - pre_filtered_size})"
    )

    ds = ds.filter(_filter)

    filtered_size = ds["train"].num_rows
    logger.info(
        f"Finished filtering (size={filtered_size}, "
        f"removed {initial_size - filtered_size} from original, "
        f"and {pre_filtered_size - filtered_size} from pre-filtered)"
    )

    logger.info("Generating instructions...")
    # Only the columns in FEATURES are removed; `_id` is kept in the output.
    ds = ds.map(to_instruction_fmt, num_proc=num_jobs, remove_columns=FEATURES)
    ds = ds["train"].train_test_split(test_size=test_size, seed=random_seed)

    logger.info("Built dataset with following parameters: {ds_info}", ds_info=str(ds))

    if repo_id:
        ds.push_to_hub(
            repo_id,
            max_shard_size=MAX_SHARD_SIZE,
            commit_message=commit_message,
            revision=branch,
        )
    else:
        ds.save_to_disk(target_dir, max_shard_size=MAX_SHARD_SIZE, num_proc=num_jobs)


def _pre_filter(item: dict[str, Any]) -> bool:
    """Keep a record only if every column listed in ``FEATURES`` is set.

    Returns ``False`` as soon as one of those columns holds ``None``.
    """
    for feature_name in FEATURES:
        if item[feature_name] is None:
            return False
    return True


def _filter(item: dict[str, Any]) -> bool:
    """Keep a record only if each of its judge names occurs in its text.

    The check is a plain substring test of every entry of ``item["judges"]``
    against ``item[TEXT_FIELD]``; a record with no judges passes.
    """
    for judge_name in item["judges"]:
        if judge_name not in item[TEXT_FIELD]:
            return False
    return True


def to_instruction_fmt(item: dict[str, Any]) -> dict[str, str]:
    """Turn one source record into a prompt/context/output triple.

    The ``output`` value is the YAML dump of the schema fields read from
    ``item``, stripped and wrapped with ``SCHEMA_TEMPLATE``. ``prompt`` is the
    shared ``PROMPT`` and ``context`` is the record's text field.
    """
    labels = {}
    for schema_key in SCHEMA_DESC:
        labels[schema_key] = item[SCHEMA_2_FEATURES[schema_key]]

    dumped_labels = yaml.dump(labels, allow_unicode=True)
    rendered_output = SCHEMA_TEMPLATE.format(schema=dumped_labels.strip())

    return {
        "prompt": PROMPT,
        "context": item[TEXT_FIELD],
        "output": rendered_output,
    }


# Expose `main` as a Typer command-line application when run as a script.
if __name__ == "__main__":
    typer.run(main)
File renamed without changes.

0 comments on commit c76b0b7

Please sign in to comment.