eval: add proper selector and permissions
* Add resource request to run a GPU node
* Write the best model/score to a file: /data/mt-bench-best.txt

Signed-off-by: Sébastien Han <[email protected]>
leseb committed Oct 10, 2024
1 parent d92ecfd commit ce439bf
Showing 5 changed files with 80 additions and 43 deletions.
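
The change that matters most for downstream consumers: run_mt_bench_op now also writes the winning model and its score to a small JSON file (best_score_file, wired to /data/mt-bench-best.txt in both the pipeline and the standalone flavors). Below is a minimal sketch of how a later step might read that file; the path and JSON keys come from this commit, but the reader script itself is only illustrative and not part of the repository:

    import json

    # Path and JSON keys introduced by this commit; this consumer is a sketch only.
    BEST_SCORE_FILE = "/data/mt-bench-best.txt"

    with open(BEST_SCORE_FILE, "r", encoding="utf-8") as f:
        result = json.load(f)  # {"best_model": "<model path>", "best_score": <float>}

    print(f"Best model: {result.get('best_model')} (score: {result.get('best_score')})")
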
9 changes: 7 additions & 2 deletions eval/mt_bench/components.py
@@ -19,6 +19,7 @@ def run_mt_bench_op(
models_list: List[str] = None,
models_folder: Optional[str] = None,
device: str = None,
best_score_file: Optional[str] = None,
) -> NamedTuple("outputs", best_model=str, best_score=float):
import json
import os
@@ -29,7 +30,7 @@ def run_mt_bench_op(
VLLM_SERVER = "http://localhost:8000/v1"

def launch_vllm(
model_path: str, gpu_count: int, retries: int = 120, delay: int = 5
model_path: str, gpu_count: int, retries: int = 120, delay: int = 10
):
import subprocess
import sys
@@ -185,12 +186,16 @@ def stop_vllm():
all_mt_bench_data.append(mt_bench_data)
scores[model_path] = overall_score

with open(mt_bench_output.path, "w") as f:
with open(mt_bench_output.path, "w", encoding="utf-8") as f:
json.dump(all_mt_bench_data, f, indent=4)

outputs = NamedTuple("outputs", best_model=str, best_score=float)
best_model = max(scores, key=scores.get)
best_score = scores[best_model]
if best_score_file:
with open(best_score_file, "w", encoding="utf-8") as f:
json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4)

return outputs(best_model=best_model, best_score=best_score)


2 changes: 1 addition & 1 deletion pipeline.py
@@ -446,7 +446,7 @@ def gen_standalone():
"exec-sdg-op": 'sdg_op(num_instructions_to_generate=2, repo_branch="", repo_pr="", taxonomy="/data/taxonomy", sdg="/data/generated")',
"exec-git-clone-op": {},
"exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/data/model")',
"exec-run-mt-bench-op": 'run_mt_bench_op(mt_bench_output="/data/mt-bench-results.txt", models_folder="/data/model/output/hf_format", models_path_prefix="/data/model/output/hf_format", max_workers="auto", merge_system_user_message=False)',
"exec-run-mt-bench-op": 'run_mt_bench_op(best_score_file="/data/mt-bench-best.txt",mt_bench_output="/data/mt-bench-results.txt", models_folder="/data/model/output/hf_format", models_path_prefix="/data/model/output/hf_format", max_workers="auto", merge_system_user_message=False)',
}

details = {}
58 changes: 32 additions & 26 deletions pipeline.yaml
@@ -471,6 +471,9 @@ components:
executorLabel: exec-run-mt-bench-op
inputDefinitions:
parameters:
best_score_file:
isOptional: true
parameterType: STRING
device:
isOptional: true
parameterType: STRING
@@ -1204,28 +1207,28 @@ deploymentSpec:
\ number of gpus allocated for serving is calculated based on environment\n\
\ # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
\ max_workers: str,\n models_list: List[str] = None,\n models_folder:\
\ Optional[str] = None,\n device: str = None,\n) -> NamedTuple(\"outputs\"\
, best_model=str, best_score=float):\n import json\n import os\n\n\
\ import torch\n from instructlab.eval.mt_bench import MTBenchEvaluator\n\
\n VLLM_SERVER = \"http://localhost:8000/v1\"\n\n def launch_vllm(\n\
\ model_path: str, gpu_count: int, retries: int = 120, delay: int\
\ = 5\n ):\n import subprocess\n import sys\n import\
\ time\n\n import requests\n\n if gpu_count > 0:\n \
\ command = [\n sys.executable,\n \"-m\"\
,\n \"vllm.entrypoints.openai.api_server\",\n \
\ \"--model\",\n model_path,\n \"--tensor-parallel-size\"\
,\n str(gpu_count),\n ]\n else:\n \
\ command = [\n sys.executable,\n \"\
-m\",\n \"vllm.entrypoints.openai.api_server\",\n \
\ \"--model\",\n model_path,\n ]\n\n \
\ subprocess.Popen(args=command)\n\n print(f\"Waiting for vLLM\
\ server to start at {VLLM_SERVER}...\")\n\n for attempt in range(retries):\n\
\ try:\n response = requests.get(f\"{VLLM_SERVER}/models\"\
)\n if response.status_code == 200:\n \
\ print(f\"vLLM server is up and running at {VLLM_SERVER}.\")\n \
\ return\n except requests.ConnectionError:\n \
\ pass\n\n print(\n f\"Server not available\
\ yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})...\"\
\ Optional[str] = None,\n device: str = None,\n best_score_file: Optional[str]\
\ = None,\n) -> NamedTuple(\"outputs\", best_model=str, best_score=float):\n\
\ import json\n import os\n\n import torch\n from instructlab.eval.mt_bench\
\ import MTBenchEvaluator\n\n VLLM_SERVER = \"http://localhost:8000/v1\"\
\n\n def launch_vllm(\n model_path: str, gpu_count: int, retries:\
\ int = 120, delay: int = 10\n ):\n import subprocess\n \
\ import sys\n import time\n\n import requests\n\n \
\ if gpu_count > 0:\n command = [\n sys.executable,\n\
\ \"-m\",\n \"vllm.entrypoints.openai.api_server\"\
,\n \"--model\",\n model_path,\n \
\ \"--tensor-parallel-size\",\n str(gpu_count),\n \
\ ]\n else:\n command = [\n sys.executable,\n\
\ \"-m\",\n \"vllm.entrypoints.openai.api_server\"\
,\n \"--model\",\n model_path,\n \
\ ]\n\n subprocess.Popen(args=command)\n\n print(f\"Waiting\
\ for vLLM server to start at {VLLM_SERVER}...\")\n\n for attempt\
\ in range(retries):\n try:\n response = requests.get(f\"\
{VLLM_SERVER}/models\")\n if response.status_code == 200:\n\
\ print(f\"vLLM server is up and running at {VLLM_SERVER}.\"\
)\n return\n except requests.ConnectionError:\n\
\ pass\n\n print(\n f\"Server not\
\ available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})...\"\
\n )\n time.sleep(delay)\n\n raise RuntimeError(\n\
\ f\"Failed to start vLLM server at {VLLM_SERVER} after {retries}\
\ retries.\"\n )\n\n # This seems like excessive effort to stop\
@@ -1287,10 +1290,13 @@ deploymentSpec:
: turn_scores,\n \"qa_scores\": qa_pairs,\n \"error_rate\"\
: error_rate,\n }\n\n all_mt_bench_data.append(mt_bench_data)\n\
\ scores[model_path] = overall_score\n\n with open(mt_bench_output.path,\
\ \"w\") as f:\n json.dump(all_mt_bench_data, f, indent=4)\n\n \
\ outputs = NamedTuple(\"outputs\", best_model=str, best_score=float)\n\
\ best_model = max(scores, key=scores.get)\n best_score = scores[best_model]\n\
\ return outputs(best_model=best_model, best_score=best_score)\n\n"
\ \"w\", encoding=\"utf-8\") as f:\n json.dump(all_mt_bench_data,\
\ f, indent=4)\n\n outputs = NamedTuple(\"outputs\", best_model=str,\
\ best_score=float)\n best_model = max(scores, key=scores.get)\n best_score\
\ = scores[best_model]\n if best_score_file:\n with open(best_score_file,\
\ \"w\", encoding=\"utf-8\") as f:\n json.dump({\"best_model\"\
: best_model, \"best_score\": best_score}, f, indent=4)\n\n return outputs(best_model=best_model,\
\ best_score=best_score)\n\n"
image: quay.io/sallyom/instructlab-ocp:eval-10-8
resources:
accelerator:
33 changes: 24 additions & 9 deletions standalone/standalone.py
@@ -57,7 +57,8 @@
DATA_PVC_OUTPUT_DATA_PATH = path.join(DATA_PVC_OUTPUT_PATH, "data")
PYTORCH_NNODES = 2
# MMLU_SCORES_PATH = "/output/mmlu-results.txt"
MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-results.txt")
MT_BENCH_OUTPUT_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-results.txt")
MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-best.txt")
SDG_OBJECT_STORE_SECRET_NAME = "sdg-object-store-credentials"
KFP_MODEL_SERVER_CM = """
# TODO: remove the following line and replace it with the actual ConfigMap/Secret
@@ -363,6 +364,12 @@ def upload_s3_file():
PYTHON_EXECUTOR = """
set -e
export XDG_CACHE_HOME=/tmp
export OUTLINES_CACHE_DIR=/tmp
export NUMBA_CACHE_DIR=/tmp
export TRANSFORMERS_CACHE=/tmp
export HF_HOME=/tmp
export HOME=/tmp
export TRITON_CACHE_DIR=/tmp
tmp=$(mktemp -d)
cat <<EOF > "$tmp"/exec.py
@@ -773,9 +780,8 @@ def run(
ctx.obj["eval_type"] = "mt-bench"
scores = ctx.invoke(evaluation)
scores = json.loads(scores)
best_model = max(scores, key=lambda x: x["average_score"])
logger.info("Best model: %s", best_model.get("model"))
ctx.obj["candidate_model"] = best_model.get("model")
logger.info("Best model: %s", scores.get("best_model"))
ctx.obj["candidate_model"] = scores.get("best_model")

# Final evaluation
# TODO
@@ -1268,7 +1274,6 @@ def data_processing(train_args: TrainingArgs) -> None:

container = kubernetes.client.V1Container(
name="sdg-preprocess",
# image="quay.io/tcoufal/ilab-sdg:latest",
image=RHELAI_IMAGE,
command=["/bin/sh", "-ce"],
args=[
@@ -1320,6 +1325,7 @@ def create_eval_job(
namespace: str,
job_name: str,
eval_type: str,
nproc_per_node: int = 1,
) -> kubernetes.client.V1Job:
"""
Create a Kubernetes Job object.
@@ -1374,6 +1380,7 @@ def run_mt_bench_op(
models_list: List[str] = None,
models_folder: Optional[str] = None,
device: str = None,
best_score_file: Optional[str] = None,
) -> NamedTuple("outputs", best_model=str, best_score=float):
import json
import os
@@ -1384,7 +1391,7 @@
VLLM_SERVER = "http://localhost:8000/v1"
def launch_vllm(
model_path: str, gpu_count: int, retries: int = 120, delay: int = 5
model_path: str, gpu_count: int, retries: int = 120, delay: int = 10
):
import subprocess
import sys
@@ -1540,16 +1547,20 @@ def stop_vllm():
all_mt_bench_data.append(mt_bench_data)
scores[model_path] = overall_score
with open(mt_bench_output, "w") as f:
with open(mt_bench_output, "w", encoding="utf-8") as f:
json.dump(all_mt_bench_data, f, indent=4)
outputs = NamedTuple("outputs", best_model=str, best_score=float)
best_model = max(scores, key=scores.get)
best_score = scores[best_model]
if best_score_file:
with open(best_score_file, "w", encoding="utf-8") as f:
json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4)
return outputs(best_model=best_model, best_score=best_score)
"""
exec_run_mt_bench_op_args = """
run_mt_bench_op(mt_bench_output="/data/mt-bench-results.txt", models_folder="/data/model/output/hf_format", models_path_prefix="/data/model/output/hf_format", max_workers="auto", merge_system_user_message=False)
run_mt_bench_op(best_score_file="/data/mt-bench-best.txt",mt_bench_output="/data/mt-bench-results.txt", models_folder="/data/model/output/hf_format", models_path_prefix="/data/model/output/hf_format", max_workers="auto", merge_system_user_message=False)
"""

if eval_type == "mt-bench":
@@ -1573,6 +1584,10 @@
)
),
],
resources=kubernetes.client.V1ResourceRequirements(
requests={"cpu": "1", "nvidia.com/gpu": nproc_per_node},
limits={"cpu": "1", "nvidia.com/gpu": nproc_per_node},
),
)
]
container = kubernetes.client.V1Container(
@@ -2163,7 +2178,7 @@ def evaluation(ctx: click.Context) -> str:

try:
scores_data = json.loads(scores)
if isinstance(scores_data, list):
if isinstance(scores_data, dict):
scores = json.dumps(scores_data)
else:
raise ValueError("Unexpected format for scores data")
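
The eval Job in the standalone flavor now lands on a GPU node by requesting "nvidia.com/gpu" resources sized by the new nproc_per_node parameter. A minimal sketch of the same idea with the kubernetes Python client follows; the container name, image, and command below are placeholders, not values from this repository:

    import kubernetes.client

    def gpu_eval_container(nproc_per_node: int = 1) -> kubernetes.client.V1Container:
        # Request and limit one CPU plus nproc_per_node NVIDIA GPUs,
        # mirroring the resource block added to the eval Job in this commit.
        resources = kubernetes.client.V1ResourceRequirements(
            requests={"cpu": "1", "nvidia.com/gpu": nproc_per_node},
            limits={"cpu": "1", "nvidia.com/gpu": nproc_per_node},
        )
        return kubernetes.client.V1Container(
            name="run-mt-bench",                               # placeholder name
            image="quay.io/example/eval:latest",               # placeholder image
            command=["/bin/sh", "-c", "python /tmp/exec.py"],  # placeholder command
            resources=resources,
        )
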
21 changes: 16 additions & 5 deletions standalone/standalone.tpl
@@ -57,7 +57,8 @@ DATA_PVC_OUTPUT_PATH = path.join(DATA_PVC_MOUNT_PATH, "output")
DATA_PVC_OUTPUT_DATA_PATH = path.join(DATA_PVC_OUTPUT_PATH, "data")
PYTORCH_NNODES = 2
# MMLU_SCORES_PATH = "/output/mmlu-results.txt"
MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-results.txt")
MT_BENCH_OUTPUT_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-results.txt")
MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-best.txt")
SDG_OBJECT_STORE_SECRET_NAME = "sdg-object-store-credentials"
KFP_MODEL_SERVER_CM = """
# TODO: remove the following line and replace it with the actual ConfigMap/Secret
@@ -348,6 +349,12 @@ spec:
PYTHON_EXECUTOR = """
set -e
export XDG_CACHE_HOME=/tmp
export OUTLINES_CACHE_DIR=/tmp
export NUMBA_CACHE_DIR=/tmp
export TRANSFORMERS_CACHE=/tmp
export HF_HOME=/tmp
export HOME=/tmp
export TRITON_CACHE_DIR=/tmp
tmp=$(mktemp -d)
cat <<EOF > "$tmp"/exec.py
@@ -758,9 +765,8 @@ def run(
ctx.obj["eval_type"] = "mt-bench"
scores = ctx.invoke(evaluation)
scores = json.loads(scores)
best_model = max(scores, key=lambda x: x["average_score"])
logger.info("Best model: %s", best_model.get("model"))
ctx.obj["candidate_model"] = best_model.get("model")
logger.info("Best model: %s", scores.get("best_model"))
ctx.obj["candidate_model"] = scores.get("best_model")

# Final evaluation
# TODO
@@ -1131,6 +1137,7 @@ def create_eval_job(
namespace: str,
job_name: str,
eval_type: str,
nproc_per_node: int = 1,
) -> kubernetes.client.V1Job:
"""
Create a Kubernetes Job object.
@@ -1199,6 +1206,10 @@ def create_eval_job(
)
),
],
resources=kubernetes.client.V1ResourceRequirements(
requests={"cpu": "1", "nvidia.com/gpu": nproc_per_node},
limits={"cpu": "1", "nvidia.com/gpu": nproc_per_node},
),
)
]
container = kubernetes.client.V1Container(
@@ -1789,7 +1800,7 @@ def evaluation(ctx: click.Context) -> str:

try:
scores_data = json.loads(scores)
if isinstance(scores_data, list):
if isinstance(scores_data, dict):
scores = json.dumps(scores_data)
else:
raise ValueError("Unexpected format for scores data")
