diff --git a/azureml/eval_prompts.yml b/azureml/eval_prompts.yml
new file mode 100644
index 0000000..5613f29
--- /dev/null
+++ b/azureml/eval_prompts.yml
@@ -0,0 +1,37 @@
+$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
+command: >
+  python -m autora.doc.pipelines.main eval-prompts
+  ${{inputs.data_dir}}/${{inputs.data_file}}
+  ${{inputs.prompts_dir}}/${{inputs.prompts_file}}
+  --model-path ${{inputs.model_path}}
+  --param do_sample=${{inputs.do_sample}}
+  --param temperature=${{inputs.temperature}}
+  --param top_k=${{inputs.top_k}}
+  --param top_p=${{inputs.top_p}}
+code: ../src
+inputs:
+  data_dir:
+    type: uri_folder
+    path: azureml://datastores/workspaceblobstore/paths/data/sweetpea/
+  prompts_dir:
+    type: uri_folder
+    path: azureml://datastores/workspaceblobstore/paths/data/autora/prompts/
+  # Currently models are loading faster directly from HuggingFace vs Azure Blob Storage
+  # model_dir:
+  #   type: uri_folder
+  #   path: azureml://datastores/workspaceblobstore/paths/base_models
+  model_path: meta-llama/Llama-2-7b-chat-hf
+  temperature: 0.01
+  do_sample: 0
+  top_p: 0.95
+  top_k: 1
+  data_file: data.jsonl
+  prompts_file: prompt_list.json
+# using a curated environment doesn't work because we need additional packages
+environment: # azureml://registries/azureml/environments/acpt-pytorch-2.0-cuda11.7/versions/21
+  image: mcr.microsoft.com/azureml/curated/acpt-pytorch-2.0-cuda11.7:21
+  conda_file: conda.yml
+display_name: autodoc_multi_prompts_prediction
+compute: azureml:v100cluster
+experiment_name: evaluation_multi_prompts
+description: Run code-to-documentation generation on data_file for each prompt in prompts_file
diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py
index d01c407..dedf6e2 100644
--- a/src/autora/doc/pipelines/main.py
+++ b/src/autora/doc/pipelines/main.py
@@ -145,14 +145,12 @@ def eval_prompt(
     timer_end = timer()
     bleu, meteor = evaluate_documentation(predictions, labels)
     pred_time = timer_end - timer_start
+    prompt_hash = hash(prompt)
     mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
     for i in range(len(inputs)):
-        mlflow.log_text(labels[i], f"label_{i}.txt")
-        mlflow.log_text(inputs[i], f"input_{i}.py")
-        for j in range(len(predictions[i])):
-            mlflow.log_text(predictions[i][j], f"prediction_{i}_{j}.txt")
-        mlflow.log_text("bleu_score is ", str(bleu))
-        mlflow.log_text("meteor_score is ", str(meteor))
+        mlflow.log_text(labels[i], f"{prompt_hash}_label_{i}.txt")
+        mlflow.log_text(inputs[i], f"{prompt_hash}_input_{i}.py")
+        mlflow.log_text(predictions[i], f"{prompt_hash}_prediction_{i}.txt")
 
     # flatten predictions for counting tokens
     predictions_flat = list(itertools.chain.from_iterable(predictions))
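
Note (not part of the patch): the hash(prompt) prefix introduced in eval_prompt keeps MLflow artifact names distinct when the job logs labels, inputs, and predictions once per prompt in prompts_file. A minimal sketch of the resulting naming scheme, using hypothetical prompts and a made-up document count:

    # Sketch only; prompts and the document count are hypothetical.
    prompts = [
        "Describe the experiment in plain English.",
        "Summarize the code as user documentation.",
    ]
    for prompt in prompts:
        # Mirrors the prefix computed in eval_prompt; stable within one process.
        prompt_hash = hash(prompt)
        for i in range(2):  # pretend there are two input documents
            # These names match the mlflow.log_text artifact paths in the patch.
            print(f"{prompt_hash}_input_{i}.py")
            print(f"{prompt_hash}_prediction_{i}.txt")

Because Python salts str hashes per process (PYTHONHASHSEED), the prefix is reproducible within a single run but not across runs; a deterministic digest (e.g. hashlib.sha1 of the prompt) would be an alternative if cross-run comparability of artifact names matters.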