A repo that implements Stanford CRFM's HELM Instruct with adaptable evaluation criteria
python -m venv .venv
source .venv/bin/activate
pip install -r helm_instruct/requirements.txt
python helm_instruct/main.py
from helm_instruct.evaluation_criteria import Rating,Criterion
criterion = {
    # Each key names an evaluation criterion; its value holds the question
    # asked and the rating scale (here 1 to 4) with a description
    # for every rating.
    "childfriendliness": Criterion(
        question="How child-friendly is the game?",
        ratings=[
            Rating(rating=1, description="Not child-friendly"),
            Rating(rating=2, description="A bit child-friendly"),
            Rating(rating=3, description="Child-friendly"),
            Rating(rating=4, description="Very child-friendly"),
        ],
    ),
}