bulk commit
Sorry, I went really far with this one but I can confirm that:

* sdg-data-fetch is working
* data processing works
* training phase 1 gets stuck when launched; needs investigation

Also:

* remove backticks from the code since they break the shell that runs the
  Python executor
* only use a single PVC for everything: sdg data, model, trained model
* --force-pull: force pulling from the object store again, even if the data
  are already present

Signed-off-by: Sébastien Han <[email protected]>
leseb committed Oct 9, 2024
1 parent ca03343 commit 3612fe7
Showing 10 changed files with 634 additions and 525 deletions.
6 changes: 3 additions & 3 deletions eval/final/components.py
@@ -221,7 +221,7 @@ def find_node_dataset_directories(base_directory: str):

######################################################################
# TODO: Update ilab/model/evaluate evaluate def logic to allow for external judge model
-# and when that happens, much of this logic can be imported from the `evaluate` definition:
+# and when that happens, much of this logic can be imported from the 'evaluate' definition:
# https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504
#
# With instructlab, model_name is synonomous with model_path
@@ -244,8 +244,8 @@ def find_node_dataset_directories(base_directory: str):
),
]

-# ilab/evaluate uses a magic word for its mt_bench evaluator - `auto`
-# with `auto`, number of gpus allocated for serving is calculated based on environment
+# ilab/evaluate uses a magic word for its mt_bench evaluator - 'auto'
+# with 'auto', number of gpus allocated for serving is calculated based on environment
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
if max_workers == "auto":
try:
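The hunk above cuts off just inside the 'auto' branch. As a point of reference, a minimal, self-contained sketch of how the magic word is typically resolved; the multiprocessing fallback is an assumption based on the linked upstream eval code, not something shown in this diff:

```python
import multiprocessing
import os

def resolve_max_workers(max_workers):
    # "auto" means: derive the worker count from the environment
    # instead of using a fixed number.
    if max_workers == "auto":
        try:
            # CPUs this process may run on (Linux-only API)
            return len(os.sched_getaffinity(0)) // 2
        except AttributeError:
            # Assumed fallback for platforms without sched_getaffinity
            return multiprocessing.cpu_count() // 2
    return int(max_workers)

print(resolve_max_workers("auto"))
```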
8 changes: 4 additions & 4 deletions eval/mt_bench/components.py
@@ -12,8 +12,8 @@ def run_mt_bench_op(
models_path_prefix: str,
mt_bench_output: Output[Artifact],
merge_system_user_message: bool,
-# generate_answers,judgment uses a magic word for its mt_bench evaluator - `auto`
-# with `auto`, number of gpus allocated for serving is calculated based on environment
+# generate_answers,judgment uses a magic word for its mt_bench evaluator - 'auto'
+# with 'auto', number of gpus allocated for serving is calculated based on environment
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
max_workers: str,
models_list: List[str] = None,
@@ -53,8 +53,8 @@ def run_mt_bench_op(
scores = {}
all_mt_bench_data = []

-# generate_answers,judgment uses a magic word for its mt_bench evaluator - `auto`
-# with `auto`, number of gpus allocated for serving is calculated based on environment
+# generate_answers,judgment uses a magic word for its mt_bench evaluator - 'auto'
+# with 'auto', number of gpus allocated for serving is calculated based on environment
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
if max_workers == "auto":
try:
23 changes: 16 additions & 7 deletions pipeline.py
@@ -348,7 +348,7 @@ def pipeline(
final_eval_task.set_accelerator_type("nvidia.com/gpu")
final_eval_task.set_accelerator_limit(1)

-# Technically `output_model_task` and `output_data_task` can happen before evaluation,
+# Technically 'output_model_task' and 'output_data_task' can happen before evaluation,
# however the PVC can only be mounted once, so, setting these to _after_ so the eval proceeds.
output_model_task = pvc_to_artifact_op(
pvc_path="/output/data",
@@ -417,7 +417,7 @@ def gen_standalone():
This function should be used when Kubeflow Pipelines are not available. It will generate a
script that replicates the pipeline's functionality.
-Example usage: ``` $ python pipeline.py gen-standalone ```
+Example usage: ''' $ python pipeline.py gen-standalone '''
"""
from os import path

@@ -442,11 +442,11 @@ def gen_standalone():

# The list of executor names to extract details from to generate the standalone script
executors = {
"exec-data-processing-op": 'data_processing_op(max_seq_len=4096, max_batch_len=20000, sdg="/input_data/generated", model="/input_model", processed_data="/input_data/processed_data")',
"exec-sdg-op": 'sdg_op(num_instructions_to_generate=2, repo_branch="", repo_pr="", taxonomy="/input_data/taxonomy", sdg="/input_data/generated")',
"exec-data-processing-op": 'data_processing_op(max_seq_len=4096, max_batch_len=20000, sdg="/data/data", model="/data/model", processed_data="/data/processed_data")',
"exec-sdg-op": 'sdg_op(num_instructions_to_generate=2, repo_branch="", repo_pr="", taxonomy="/data/taxonomy", sdg="/data/generated")',
"exec-git-clone-op": {},
"exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/input_model")',
"exec-run-mt-bench-op": 'run_mt_bench_op(mt_bench_output="/output/mt-bench-results.txt", models_list="/output/model/model/hf_format", models_path_prefix="/output/model/hf_format", max_workers="auto", merge_system_user_message=False)',
"exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/data/model")',
"exec-run-mt-bench-op": 'run_mt_bench_op(mt_bench_output="/data/mt-bench-results.txt", models_list="/data/model/model/hf_format", models_path_prefix="/data/model/hf_format", max_workers="auto", merge_system_user_message=False)',
}

details = {}
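The path changes above reflect the single-PVC layout from the commit message. A minimal sketch of the idea, assuming the kfp and kfp-kubernetes packages; the component, pipeline, and claim name are hypothetical, only the shared /data mount mirrors this commit:

```python
from kfp import dsl, kubernetes

@dsl.component
def step_op(path: str):
    import pathlib
    p = pathlib.Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    p.touch()

@dsl.pipeline
def single_pvc_pipeline():
    # Every step mounts the same claim at /data, so the SDG output,
    # the base model, and the trained model all live on one volume.
    sdg = step_op(path="/data/generated/marker")
    train = step_op(path="/data/model/marker").after(sdg)
    for task in (sdg, train):
        kubernetes.mount_pvc(task, pvc_name="shared-data", mount_path="/data")
```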
@@ -621,9 +621,18 @@ def change_dsl_function_to_normal_function(rendered_code: list):
"import kfp": "",
"from kfp import dsl": "",
"from kfp.dsl import *": "",
".path": "", # super hacky, but works for now, the idea is that "taxonomy.path" is a string so we just remove the ".path" part
}

+import re
+
+# Regular expression to match ".path" but not "os.path"
+path_pattern = re.compile(r"(?<!os)\.path")
+
+def remove_path_not_os_path(line):
+    return path_pattern.sub("", line)
+
+rendered_code = [remove_path_not_os_path(line) for line in rendered_code]

for old, new in replacements.items():
rendered_code = [line.replace(old, new) for line in rendered_code]
return rendered_code[-1].strip()
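For quick verification of the new lookbehind, a small sketch run against hypothetical rendered lines:

```python
import re

path_pattern = re.compile(r"(?<!os)\.path")

# Artifact attribute access loses its ".path" suffix...
assert path_pattern.sub("", "output_dir=sdg.path,") == "output_dir=sdg,"
# ...while stdlib os.path calls are left intact.
assert path_pattern.sub("", "if not os.path.exists(d):") == "if not os.path.exists(d):"
```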
12 changes: 6 additions & 6 deletions pipeline.yaml
@@ -589,7 +589,7 @@ deploymentSpec:
\ )\n\n def data_processing(train_args: TrainingArgs) -> None:\n \
\ # early validation logic here\n if train_args.max_batch_len\
\ < train_args.max_seq_len:\n raise ValueError(\n \
\ f\"the `max_batch_len` cannot be less than `max_seq_len`: {train_args.max_batch_len=}\
\ f\"the 'max_batch_len' cannot be less than 'max_seq_len': {train_args.max_batch_len=}\
\ < {train_args.max_seq_len=}\"\n )\n\n # process\
\ the training data\n if not os.path.exists(train_args.data_output_dir):\n\
\ os.makedirs(train_args.data_output_dir, exist_ok=True)\n \
@@ -1107,7 +1107,7 @@ deploymentSpec:
main\"\n\n ######################################################################\n\
\ # TODO: Update ilab/model/evaluate evaluate def logic to allow for\
\ external judge model\n # and when that happens, much of this logic\
-\ can be imported from the `evaluate` definition:\n    # https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504\n\
+\ can be imported from the 'evaluate' definition:\n    # https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504\n\
\ #\n # With instructlab, model_name is synonomous with model_path\n\
\ mt_bench_evaluators = [\n MTBenchBranchEvaluator(\n \
\ model_name=candidate_model,\n judge_model_name=judge_model_name,\n\
@@ -1118,7 +1118,7 @@ deploymentSpec:
\ branch=base_branch,\n output_dir=output_dir,\n \
\ merge_system_user_message=merge_system_user_message,\n \
\ ),\n ]\n\n # ilab/evaluate uses a magic word for its mt_bench\
-\ evaluator - `auto`\n    # with `auto`, number of gpus allocated for serving
+\ evaluator - 'auto'\n    # with 'auto', number of gpus allocated for serving
\ is calculated based on environment\n # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
\ if max_workers == \"auto\":\n try:\n usable_cpu_count\
\ = len(os.sched_getaffinity(0)) // 2\n except AttributeError:\n\
@@ -1197,7 +1197,7 @@ deploymentSpec:
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef run_mt_bench_op(\n models_path_prefix: str,\n mt_bench_output:\
\ Output[Artifact],\n merge_system_user_message: bool,\n # generate_answers,judgment\
-\ uses a magic word for its mt_bench evaluator - `auto`\n    # with `auto`,
+\ uses a magic word for its mt_bench evaluator - 'auto'\n    # with 'auto',
\ number of gpus allocated for serving is calculated based on environment\n\
\ # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
\ max_workers: str,\n models_list: List[str] = None,\n models_folder:\
@@ -1215,7 +1215,7 @@ deploymentSpec:
\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\
\ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\
)\n\n scores = {}\n all_mt_bench_data = []\n\n # generate_answers,judgment\
-\ uses a magic word for its mt_bench evaluator - `auto`\n    # with `auto`,
+\ uses a magic word for its mt_bench evaluator - 'auto'\n    # with 'auto',
\ number of gpus allocated for serving is calculated based on environment\n\
\ # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
\ if max_workers == \"auto\":\n try:\n usable_cpu_count\
@@ -1286,7 +1286,7 @@ deploymentSpec:
\ > 0) else \"empty\"\n\n print(\"Generating syntetic dataset for:\"\
)\n print()\n print(read_taxonomy(taxonomy.path, taxonomy_base))\n\
\n # generate_data has a magic word for its taxonomy_base argument -\
-\ `empty`\n    # it allows generating from the whole repo, see:\n    # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
+\ 'empty'\n    # it allows generating from the whole repo, see:\n    # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
\ generate_data(\n client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\
\ output_dir=sdg.path,\n taxonomy=taxonomy.path,\n \
\ taxonomy_base=taxonomy_base,\n model_name=model,\n chunk_word_count=1000,\n\
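The pipeline.yaml hunks are the compiled, string-escaped copies of the Python components above. For readability, the first hunk's validation corresponds to roughly this (a sketch with plain arguments instead of the TrainingArgs object):

```python
def validate_batch_len(max_batch_len: int, max_seq_len: int) -> None:
    # A batch must be able to hold at least one full sequence.
    if max_batch_len < max_seq_len:
        raise ValueError(
            f"the 'max_batch_len' cannot be less than 'max_seq_len': "
            f"{max_batch_len=} < {max_seq_len=}"
        )

validate_batch_len(max_batch_len=20000, max_seq_len=4096)  # passes
```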
2 changes: 1 addition & 1 deletion sdg/components.py
@@ -52,7 +52,7 @@ def sdg_op(
print()
print(read_taxonomy(taxonomy.path, taxonomy_base))

-# generate_data has a magic word for its taxonomy_base argument - `empty`
+# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
generate_data(
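A short sketch of how the 'empty' magic word gets picked earlier in sdg_op, reconstructed from the escaped code quoted in pipeline.yaml above; treat it as illustrative:

```python
def pick_taxonomy_base(repo_branch: str, repo_pr: str) -> str:
    # A branch or PR diffs the taxonomy against "main"; otherwise
    # "empty" makes generate_data use the whole repo.
    if repo_branch or (repo_pr and int(repo_pr) > 0):
        return "main"
    return "empty"

print(pick_taxonomy_base("", "0"))  # -> "empty"
```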
4 changes: 4 additions & 0 deletions standalone/README.md
@@ -98,6 +98,10 @@ The script requires information regarding the location and method for accessing
* `--eval-serving-model-name`: The name of the model to use for evaluation. **Required**
* `--eval-serving-model-api-key`: The API key for the model to evaluate. `EVAL_SERVING_MODEL_API_KEY`
environment variable can be used as well. **Required**
+* `--force-pull`: Force pull the data (sdg data and model) from the object store even if it already
+  exists in the PVC. **Optional** - Default: false.
+* `--training-1-epoch-num`: The number of epochs to train the model for phase 1. **Optional** - Default: 7.
+* `--training-2-epoch-num`: The number of epochs to train the model for phase 2. **Optional** - Default: 10.


## Example End-To-End Workflow