diff --git a/azureml/eval_prompts.yml b/azureml/eval_prompts.yml
new file mode 100644
index 0000000..5613f29
--- /dev/null
+++ b/azureml/eval_prompts.yml
@@ -0,0 +1,37 @@
+$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
+command: >
+  python -m autora.doc.pipelines.main eval-prompts
+  ${{inputs.data_dir}}/${{inputs.data_file}}
+  ${{inputs.prompts_dir}}/${{inputs.prompts_file}}
+  --model-path ${{inputs.model_path}}
+  --param do_sample=${{inputs.do_sample}}
+  --param temperature=${{inputs.temperature}}
+  --param top_k=${{inputs.top_k}}
+  --param top_p=${{inputs.top_p}}
+code: ../src
+inputs:
+  data_dir:
+    type: uri_folder
+    path: azureml://datastores/workspaceblobstore/paths/data/sweetpea/
+  prompts_dir:
+    type: uri_folder
+    path: azureml://datastores/workspaceblobstore/paths/data/autora/prompts/
+  # Currently models are loading faster directly from HuggingFace vs Azure Blob Storage
+  # model_dir:
+  #   type: uri_folder
+  #   path: azureml://datastores/workspaceblobstore/paths/base_models
+  model_path: meta-llama/Llama-2-7b-chat-hf
+  temperature: 0.01
+  do_sample: 0
+  top_p: 0.95
+  top_k: 1
+  data_file: data.jsonl
+  prompts_file: prompt_list.json
+# using a curated environment doesn't work because we need additional packages
+environment: # azureml://registries/azureml/environments/acpt-pytorch-2.0-cuda11.7/versions/21
+  image: mcr.microsoft.com/azureml/curated/acpt-pytorch-2.0-cuda11.7:21
+  conda_file: conda.yml
+display_name: autodoc_multi_prompts_prediction
+compute: azureml:v100cluster
+experiment_name: evaluation_multi_prompts
+description: Run code-to-documentation generation on data_file for each prompt in prompts_file
diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py
index d01c407..dedf6e2 100644
--- a/src/autora/doc/pipelines/main.py
+++ b/src/autora/doc/pipelines/main.py
@@ -145,14 +145,12 @@ def eval_prompt(
     timer_end = timer()
     bleu, meteor = evaluate_documentation(predictions, labels)
     pred_time = timer_end - timer_start
+    prompt_hash = hash(prompt)
     mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
     for i in range(len(inputs)):
-        mlflow.log_text(labels[i], f"label_{i}.txt")
-        mlflow.log_text(inputs[i], f"input_{i}.py")
-        for j in range(len(predictions[i])):
-            mlflow.log_text(predictions[i][j], f"prediction_{i}_{j}.txt")
-        mlflow.log_text("bleu_score is ", str(bleu))
-        mlflow.log_text("meteor_score is ", str(meteor))
+        mlflow.log_text(labels[i], f"{prompt_hash}_label_{i}.txt")
+        mlflow.log_text(inputs[i], f"{prompt_hash}_input_{i}.py")
+        mlflow.log_text(predictions[i], f"{prompt_hash}_prediction_{i}.txt")
 
     # flatten predictions for counting tokens
     predictions_flat = list(itertools.chain.from_iterable(predictions))
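
Note (not part of the patch): the hash(prompt) prefix introduced in eval_prompt keeps MLflow artifact names distinct when the job logs labels, inputs, and predictions once per prompt in prompts_file. A minimal sketch of the resulting naming scheme, using hypothetical prompts and a made-up document count:

    # Sketch only; prompts and the document count are hypothetical.
    prompts = [
        "Describe the experiment in plain English.",
        "Summarize the code as user documentation.",
    ]
    for prompt in prompts:
        # Mirrors the prefix computed in eval_prompt; stable within one process.
        prompt_hash = hash(prompt)
        for i in range(2):  # pretend there are two input documents
            # These names match the mlflow.log_text artifact paths in the patch.
            print(f"{prompt_hash}_input_{i}.py")
            print(f"{prompt_hash}_prediction_{i}.txt")

Because Python salts str hashes per process (PYTHONHASHSEED), the prefix is reproducible within a single run but not across runs; a deterministic digest (e.g. hashlib.sha1 of the prompt) would be an alternative if cross-run comparability of artifact names matters.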