From ce439bf33a7d8f6aa114092b8206cfba88a3dd09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 10 Oct 2024 13:34:10 +0200 Subject: [PATCH] eval: add proper selector and permissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add resource request to run a GPU node * Write the best model/score to a file: /data/mt-bench-best.txt Signed-off-by: Sébastien Han --- eval/mt_bench/components.py | 9 ++++-- pipeline.py | 2 +- pipeline.yaml | 58 ++++++++++++++++++++----------------- standalone/standalone.py | 33 +++++++++++++++------ standalone/standalone.tpl | 21 ++++++++++---- 5 files changed, 80 insertions(+), 43 deletions(-) diff --git a/eval/mt_bench/components.py b/eval/mt_bench/components.py index 8f853f9..b2e6cd9 100644 --- a/eval/mt_bench/components.py +++ b/eval/mt_bench/components.py @@ -19,6 +19,7 @@ def run_mt_bench_op( models_list: List[str] = None, models_folder: Optional[str] = None, device: str = None, + best_score_file: Optional[str] = None, ) -> NamedTuple("outputs", best_model=str, best_score=float): import json import os @@ -29,7 +30,7 @@ def run_mt_bench_op( VLLM_SERVER = "http://localhost:8000/v1" def launch_vllm( - model_path: str, gpu_count: int, retries: int = 120, delay: int = 5 + model_path: str, gpu_count: int, retries: int = 120, delay: int = 10 ): import subprocess import sys @@ -185,12 +186,16 @@ def stop_vllm(): all_mt_bench_data.append(mt_bench_data) scores[model_path] = overall_score - with open(mt_bench_output.path, "w") as f: + with open(mt_bench_output.path, "w", encoding="utf-8") as f: json.dump(all_mt_bench_data, f, indent=4) outputs = NamedTuple("outputs", best_model=str, best_score=float) best_model = max(scores, key=scores.get) best_score = scores[best_model] + if best_score_file: + with open(best_score_file, "w", encoding="utf-8") as f: + json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4) + return outputs(best_model=best_model, best_score=best_score) diff --git a/pipeline.py b/pipeline.py index cfbfe58..b984128 100644 --- a/pipeline.py +++ b/pipeline.py @@ -446,7 +446,7 @@ def gen_standalone(): "exec-sdg-op": 'sdg_op(num_instructions_to_generate=2, repo_branch="", repo_pr="", taxonomy="/data/taxonomy", sdg="/data/generated")', "exec-git-clone-op": {}, "exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/data/model")', - "exec-run-mt-bench-op": 'run_mt_bench_op(mt_bench_output="/data/mt-bench-results.txt", models_folder="/data/model/output/hf_format", models_path_prefix="/data/model/output/hf_format", max_workers="auto", merge_system_user_message=False)', + "exec-run-mt-bench-op": 'run_mt_bench_op(best_score_file="/data/mt-bench-best.txt",mt_bench_output="/data/mt-bench-results.txt", models_folder="/data/model/output/hf_format", models_path_prefix="/data/model/output/hf_format", max_workers="auto", merge_system_user_message=False)', } details = {} diff --git a/pipeline.yaml b/pipeline.yaml index bf49fa2..6d0c2d7 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -471,6 +471,9 @@ components: executorLabel: exec-run-mt-bench-op inputDefinitions: parameters: + best_score_file: + isOptional: true + parameterType: STRING device: isOptional: true parameterType: STRING @@ -1204,28 +1207,28 @@ deploymentSpec: \ number of gpus allocated for serving is calculated based on environment\n\ \ # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\ \ max_workers: str,\n models_list: List[str] 
= None,\n models_folder:\ - \ Optional[str] = None,\n device: str = None,\n) -> NamedTuple(\"outputs\"\ - , best_model=str, best_score=float):\n import json\n import os\n\n\ - \ import torch\n from instructlab.eval.mt_bench import MTBenchEvaluator\n\ - \n VLLM_SERVER = \"http://localhost:8000/v1\"\n\n def launch_vllm(\n\ - \ model_path: str, gpu_count: int, retries: int = 120, delay: int\ - \ = 5\n ):\n import subprocess\n import sys\n import\ - \ time\n\n import requests\n\n if gpu_count > 0:\n \ - \ command = [\n sys.executable,\n \"-m\"\ - ,\n \"vllm.entrypoints.openai.api_server\",\n \ - \ \"--model\",\n model_path,\n \"--tensor-parallel-size\"\ - ,\n str(gpu_count),\n ]\n else:\n \ - \ command = [\n sys.executable,\n \"\ - -m\",\n \"vllm.entrypoints.openai.api_server\",\n \ - \ \"--model\",\n model_path,\n ]\n\n \ - \ subprocess.Popen(args=command)\n\n print(f\"Waiting for vLLM\ - \ server to start at {VLLM_SERVER}...\")\n\n for attempt in range(retries):\n\ - \ try:\n response = requests.get(f\"{VLLM_SERVER}/models\"\ - )\n if response.status_code == 200:\n \ - \ print(f\"vLLM server is up and running at {VLLM_SERVER}.\")\n \ - \ return\n except requests.ConnectionError:\n \ - \ pass\n\n print(\n f\"Server not available\ - \ yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})...\"\ + \ Optional[str] = None,\n device: str = None,\n best_score_file: Optional[str]\ + \ = None,\n) -> NamedTuple(\"outputs\", best_model=str, best_score=float):\n\ + \ import json\n import os\n\n import torch\n from instructlab.eval.mt_bench\ + \ import MTBenchEvaluator\n\n VLLM_SERVER = \"http://localhost:8000/v1\"\ + \n\n def launch_vllm(\n model_path: str, gpu_count: int, retries:\ + \ int = 120, delay: int = 10\n ):\n import subprocess\n \ + \ import sys\n import time\n\n import requests\n\n \ + \ if gpu_count > 0:\n command = [\n sys.executable,\n\ + \ \"-m\",\n \"vllm.entrypoints.openai.api_server\"\ + ,\n \"--model\",\n model_path,\n \ + \ \"--tensor-parallel-size\",\n str(gpu_count),\n \ + \ ]\n else:\n command = [\n sys.executable,\n\ + \ \"-m\",\n \"vllm.entrypoints.openai.api_server\"\ + ,\n \"--model\",\n model_path,\n \ + \ ]\n\n subprocess.Popen(args=command)\n\n print(f\"Waiting\ + \ for vLLM server to start at {VLLM_SERVER}...\")\n\n for attempt\ + \ in range(retries):\n try:\n response = requests.get(f\"\ + {VLLM_SERVER}/models\")\n if response.status_code == 200:\n\ + \ print(f\"vLLM server is up and running at {VLLM_SERVER}.\"\ + )\n return\n except requests.ConnectionError:\n\ + \ pass\n\n print(\n f\"Server not\ + \ available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})...\"\ \n )\n time.sleep(delay)\n\n raise RuntimeError(\n\ \ f\"Failed to start vLLM server at {VLLM_SERVER} after {retries}\ \ retries.\"\n )\n\n # This seems like excessive effort to stop\ @@ -1287,10 +1290,13 @@ deploymentSpec: : turn_scores,\n \"qa_scores\": qa_pairs,\n \"error_rate\"\ : error_rate,\n }\n\n all_mt_bench_data.append(mt_bench_data)\n\ \ scores[model_path] = overall_score\n\n with open(mt_bench_output.path,\ - \ \"w\") as f:\n json.dump(all_mt_bench_data, f, indent=4)\n\n \ - \ outputs = NamedTuple(\"outputs\", best_model=str, best_score=float)\n\ - \ best_model = max(scores, key=scores.get)\n best_score = scores[best_model]\n\ - \ return outputs(best_model=best_model, best_score=best_score)\n\n" + \ \"w\", encoding=\"utf-8\") as f:\n json.dump(all_mt_bench_data,\ + \ f, indent=4)\n\n outputs = NamedTuple(\"outputs\", best_model=str,\ + \ best_score=float)\n best_model 
= max(scores, key=scores.get)\n best_score\ + \ = scores[best_model]\n if best_score_file:\n with open(best_score_file,\ + \ \"w\", encoding=\"utf-8\") as f:\n json.dump({\"best_model\"\ + : best_model, \"best_score\": best_score}, f, indent=4)\n\n return outputs(best_model=best_model,\ + \ best_score=best_score)\n\n" image: quay.io/sallyom/instructlab-ocp:eval-10-8 resources: accelerator: diff --git a/standalone/standalone.py b/standalone/standalone.py index 2e1ab20..40d9c91 100755 --- a/standalone/standalone.py +++ b/standalone/standalone.py @@ -57,7 +57,8 @@ DATA_PVC_OUTPUT_DATA_PATH = path.join(DATA_PVC_OUTPUT_PATH, "data") PYTORCH_NNODES = 2 # MMLU_SCORES_PATH = "/output/mmlu-results.txt" -MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-results.txt") +MT_BENCH_OUTPUT_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-results.txt") +MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-best.txt") SDG_OBJECT_STORE_SECRET_NAME = "sdg-object-store-credentials" KFP_MODEL_SERVER_CM = """ # TODO: remove the following line and replace it with the actual ConfigMap/Secret @@ -363,6 +364,12 @@ def upload_s3_file(): PYTHON_EXECUTOR = """ set -e export XDG_CACHE_HOME=/tmp +export OUTLINES_CACHE_DIR=/tmp +export NUMBA_CACHE_DIR=/tmp +export TRANSFORMERS_CACHE=/tmp +export HF_HOME=/tmp +export HOME=/tmp +export TRITON_CACHE_DIR=/tmp tmp=$(mktemp -d) cat < "$tmp"/exec.py @@ -773,9 +780,8 @@ def run( ctx.obj["eval_type"] = "mt-bench" scores = ctx.invoke(evaluation) scores = json.loads(scores) - best_model = max(scores, key=lambda x: x["average_score"]) - logger.info("Best model: %s", best_model.get("model")) - ctx.obj["candidate_model"] = best_model.get("model") + logger.info("Best model: %s", scores.get("best_model")) + ctx.obj["candidate_model"] = scores.get("best_model") # Final evaluation # TODO @@ -1268,7 +1274,6 @@ def data_processing(train_args: TrainingArgs) -> None: container = kubernetes.client.V1Container( name="sdg-preprocess", - # image="quay.io/tcoufal/ilab-sdg:latest", image=RHELAI_IMAGE, command=["/bin/sh", "-ce"], args=[ @@ -1320,6 +1325,7 @@ def create_eval_job( namespace: str, job_name: str, eval_type: str, + nproc_per_node: int = 1, ) -> kubernetes.client.V1Job: """ Create a Kubernetes Job object. 
@@ -1374,6 +1380,7 @@ def run_mt_bench_op( models_list: List[str] = None, models_folder: Optional[str] = None, device: str = None, + best_score_file: Optional[str] = None, ) -> NamedTuple("outputs", best_model=str, best_score=float): import json import os @@ -1384,7 +1391,7 @@ def run_mt_bench_op( VLLM_SERVER = "http://localhost:8000/v1" def launch_vllm( - model_path: str, gpu_count: int, retries: int = 120, delay: int = 5 + model_path: str, gpu_count: int, retries: int = 120, delay: int = 10 ): import subprocess import sys @@ -1540,16 +1547,20 @@ def stop_vllm(): all_mt_bench_data.append(mt_bench_data) scores[model_path] = overall_score - with open(mt_bench_output, "w") as f: + with open(mt_bench_output, "w", encoding="utf-8") as f: json.dump(all_mt_bench_data, f, indent=4) outputs = NamedTuple("outputs", best_model=str, best_score=float) best_model = max(scores, key=scores.get) best_score = scores[best_model] + if best_score_file: + with open(best_score_file, "w", encoding="utf-8") as f: + json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4) + return outputs(best_model=best_model, best_score=best_score) """ exec_run_mt_bench_op_args = """ -run_mt_bench_op(mt_bench_output="/data/mt-bench-results.txt", models_folder="/data/model/output/hf_format", models_path_prefix="/data/model/output/hf_format", max_workers="auto", merge_system_user_message=False) +run_mt_bench_op(best_score_file="/data/mt-bench-best.txt",mt_bench_output="/data/mt-bench-results.txt", models_folder="/data/model/output/hf_format", models_path_prefix="/data/model/output/hf_format", max_workers="auto", merge_system_user_message=False) """ if eval_type == "mt-bench": @@ -1573,6 +1584,10 @@ def stop_vllm(): ) ), ], + resources=kubernetes.client.V1ResourceRequirements( + requests={"cpu": "1", "nvidia.com/gpu": nproc_per_node}, + limits={"cpu": "1", "nvidia.com/gpu": nproc_per_node}, + ), ) ] container = kubernetes.client.V1Container( @@ -2163,7 +2178,7 @@ def evaluation(ctx: click.Context) -> str: try: scores_data = json.loads(scores) - if isinstance(scores_data, list): + if isinstance(scores_data, dict): scores = json.dumps(scores_data) else: raise ValueError("Unexpected format for scores data") diff --git a/standalone/standalone.tpl b/standalone/standalone.tpl index 423d36a..4f10ff6 100755 --- a/standalone/standalone.tpl +++ b/standalone/standalone.tpl @@ -57,7 +57,8 @@ DATA_PVC_OUTPUT_PATH = path.join(DATA_PVC_MOUNT_PATH, "output") DATA_PVC_OUTPUT_DATA_PATH = path.join(DATA_PVC_OUTPUT_PATH, "data") PYTORCH_NNODES = 2 # MMLU_SCORES_PATH = "/output/mmlu-results.txt" -MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-results.txt") +MT_BENCH_OUTPUT_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-results.txt") +MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-best.txt") SDG_OBJECT_STORE_SECRET_NAME = "sdg-object-store-credentials" KFP_MODEL_SERVER_CM = """ # TODO: remove the following line and replace it with the actual ConfigMap/Secret @@ -348,6 +349,12 @@ spec: PYTHON_EXECUTOR = """ set -e export XDG_CACHE_HOME=/tmp +export OUTLINES_CACHE_DIR=/tmp +export NUMBA_CACHE_DIR=/tmp +export TRANSFORMERS_CACHE=/tmp +export HF_HOME=/tmp +export HOME=/tmp +export TRITON_CACHE_DIR=/tmp tmp=$(mktemp -d) cat < "$tmp"/exec.py @@ -758,9 +765,8 @@ def run( ctx.obj["eval_type"] = "mt-bench" scores = ctx.invoke(evaluation) scores = json.loads(scores) - best_model = max(scores, key=lambda x: x["average_score"]) - logger.info("Best model: %s", best_model.get("model")) - 
ctx.obj["candidate_model"] = best_model.get("model") + logger.info("Best model: %s", scores.get("best_model")) + ctx.obj["candidate_model"] = scores.get("best_model") # Final evaluation # TODO @@ -1131,6 +1137,7 @@ def create_eval_job( namespace: str, job_name: str, eval_type: str, + nproc_per_node: int = 1, ) -> kubernetes.client.V1Job: """ Create a Kubernetes Job object. @@ -1199,6 +1206,10 @@ def create_eval_job( ) ), ], + resources=kubernetes.client.V1ResourceRequirements( + requests={"cpu": "1", "nvidia.com/gpu": nproc_per_node}, + limits={"cpu": "1", "nvidia.com/gpu": nproc_per_node}, + ), ) ] container = kubernetes.client.V1Container( @@ -1789,7 +1800,7 @@ def evaluation(ctx: click.Context) -> str: try: scores_data = json.loads(scores) - if isinstance(scores_data, list): + if isinstance(scores_data, dict): scores = json.dumps(scores_data) else: raise ValueError("Unexpected format for scores data")