diff --git a/assets/evaluation_on_cloud/environments/evaluations-built-in/asset.yaml b/assets/evaluation_on_cloud/environments/evaluations-built-in/asset.yaml
new file mode 100644
index 0000000000..93e930eddc
--- /dev/null
+++ b/assets/evaluation_on_cloud/environments/evaluations-built-in/asset.yaml
@@ -0,0 +1,6 @@
+name: evaluations-built-in
+version: auto
+type: environment
+spec: spec.yaml
+extra_config: environment.yaml
+categories: ["Evaluation"]
diff --git a/assets/evaluation_on_cloud/environments/evaluations-built-in/context/Dockerfile b/assets/evaluation_on_cloud/environments/evaluations-built-in/context/Dockerfile
new file mode 100644
index 0000000000..c881b87906
--- /dev/null
+++ b/assets/evaluation_on_cloud/environments/evaluations-built-in/context/Dockerfile
@@ -0,0 +1,8 @@
+FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04:latest
+
+COPY requirements.txt /app/requirements.txt
+RUN pip install -r /app/requirements.txt
+
+# Copy the evaluation scripts into the image
+COPY evaluate_on_data.py /app/evaluate_on_data.py
+COPY save_evaluation.py /app/save_evaluation.py
\ No newline at end of file
diff --git a/assets/evaluation_on_cloud/environments/evaluations-built-in/context/evaluate_on_data.py b/assets/evaluation_on_cloud/environments/evaluations-built-in/context/evaluate_on_data.py
new file mode 100644
index 0000000000..c9d36ae737
--- /dev/null
+++ b/assets/evaluation_on_cloud/environments/evaluations-built-in/context/evaluate_on_data.py
@@ -0,0 +1,124 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Evaluate data with a built-in or custom evaluator."""
+import argparse
+import json
+import logging
+import mlflow
+import os
+import pandas as pd
+import requests
+import shutil
+from azure.ai.ml.identity import AzureMLOnBehalfOfCredential
+from azure.ai.evaluation import evaluate
+from save_evaluation import load_evaluator
+
+logger = logging.getLogger(__name__)
+
+
+def update_value_in_dict(d, key_substring, new_func):
+    """Recursively search for a value containing 'key_substring' and apply 'new_func' to modify it."""
+    for key, value in d.items():
+        if isinstance(value, dict):
+            update_value_in_dict(value, key_substring, new_func)
+        elif isinstance(value, str) and key_substring in value:
+            d[key] = new_func(value)
+
+
+def find_file_and_get_parent_dir(root_dir, file_name="flow.flex.yaml"):
+    """Find the flex flow or any given file in a directory and return the parent directory."""
+    for dirpath, _, filenames in os.walk(root_dir):
+        if file_name in filenames:
+            logger.info(f"Found {file_name} in {dirpath}")
+            return dirpath
+
+
+def copy_evaluator_files(command_line_args):
+    """Copy the mounted evaluator files to relative paths to enable read/write."""
+    evaluator_name_id_map = json.loads(command_line_args.evaluator_name_id_map)
+    for evaluator_name, evaluator_id in evaluator_name_id_map.items():
+        dir_path = find_file_and_get_parent_dir(evaluator_id)
+        if dir_path:
+            shutil.copytree(dir_path, f"./{evaluator_name}")
+            logger.info(f"Copied {dir_path} to ./{evaluator_name}")
+            logger.info(f"{evaluator_name} files: {os.listdir('./' + evaluator_name)}")
+        else:
+            logger.info(f"Directory for evaluator {evaluator_name} not found.")
+
+
+def initialize_evaluators(command_line_args):
+    """Initialize the evaluators using correct parameters and credentials for rai evaluators."""
+    evaluators = {}
+    evaluators_o = json.loads(command_line_args.evaluators)
+    for evaluator_name, evaluator in evaluators_o.items():
+        init_params = evaluator["InitParams"]
+        update_value_in_dict(init_params, "AZURE_OPENAI_API_KEY", lambda x: os.environ[x.upper()])
+        flow = load_evaluator('./' + evaluator_name)
+        if any(rai_eval in evaluator["Id"] for rai_eval in rai_evaluators):
+            init_params["credential"] = AzureMLOnBehalfOfCredential()
+        evaluators[evaluator_name] = flow(**init_params)
+    return evaluators
+
+
+def run_evaluation(command_line_args, evaluators):
+    """Run evaluation using evaluators."""
+    results = evaluate(
+        data=command_line_args.eval_data,
+        evaluators=evaluators
+    )
+    metrics = {}
+    for metric_name, metric_value in results['metrics'].items():
+        logger.info(f"Logging metric {metric_name}: {metric_value}")
+        metrics[metric_name] = metric_value
+    mlflow.log_metrics(metrics)
+
+    if results and results.get("rows"):
+        # Convert the results to a DataFrame
+        df = pd.DataFrame(results["rows"])
+
+        # Save the DataFrame as a JSONL file
+        df.to_json("instance_results.jsonl", orient="records", lines=True)
+        df.to_json("eval_results.jsonl", orient="records", lines=True)
+        mlflow.log_artifact("instance_results.jsonl")
+        mlflow.log_artifact("eval_results.jsonl")
+
+
+def get_promptflow_run_logs():
+    """Get promptflow run logs."""
+    if os.path.exists("/root/.promptflow/.runs/"):
+        runs = os.listdir("/root/.promptflow/.runs/")
+        for run in runs:
+            if os.path.exists(f"/root/.promptflow/.runs/{run}/logs.txt"):
+                with open(f"/root/.promptflow/.runs/{run}/logs.txt", "r") as f:
+                    logger.info(f"RUN {run} =========================")
+                    logger.info(f.read())
+    else:
+        logger.info("RUN DOES NOT EXIST")
+
+
+# Create a session for making HTTP requests
+session = requests.Session()
+
+# Parse command line arguments and debug to ensure working
+parser = argparse.ArgumentParser("eval")
+parser.add_argument("--eval_data", type=str)
+parser.add_argument("--eval_output", type=str)
+parser.add_argument("--evaluators", type=str)
+parser.add_argument("--evaluator_name_id_map", type=str)
+
+args = parser.parse_args()
+rai_evaluators = ['HateUnfairnessEvaluator', 'Sexual-Content-Evaluator', 'Hate-and-Unfairness-Evaluator',
+                  'Violent-Content-Evaluator', 'Self-Harm-Related-Content-Evaluator']
+
+if __name__ == '__main__':
+    copy_evaluator_files(args)
+    evaluators = initialize_evaluators(args)
+    logger.info("*************** Collecting Result of Evaluators ******************")
+    # Run the evaluation
+    with mlflow.start_run() as run:
+        try:
+            run_evaluation(args, evaluators)
+        except Exception as e:
+            logger.error(f"Evaluation failed with exception: {e}")
+            get_promptflow_run_logs()
diff --git a/assets/evaluation_on_cloud/environments/evaluations-built-in/context/requirements.txt b/assets/evaluation_on_cloud/environments/evaluations-built-in/context/requirements.txt
new file mode 100644
index 0000000000..6e0b2786f2
--- /dev/null
+++ b/assets/evaluation_on_cloud/environments/evaluations-built-in/context/requirements.txt
@@ -0,0 +1,5 @@
+azure-ai-evaluation
+openai
+azureml-mlflow
+azure-identity
+azure-ai-ml
\ No newline at end of file
diff --git a/assets/evaluation_on_cloud/environments/evaluations-built-in/context/save_evaluation.py b/assets/evaluation_on_cloud/environments/evaluations-built-in/context/save_evaluation.py
new file mode 100644
index 0000000000..ddf8675c44
--- /dev/null
+++ b/assets/evaluation_on_cloud/environments/evaluations-built-in/context/save_evaluation.py
@@ -0,0 +1,30 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+ +"""Load a built-in or custom evulator as flow.""" +import importlib +import logging +import os +import sys +from promptflow.client import load_flow + +logger = logging.getLogger(__name__) + + +def load_evaluator(evaluator): + """Load evaluator as flow.""" + logger.info(f"Loading evaluator {evaluator}") + loaded_evaluator = load_flow(evaluator) + logger.info(loaded_evaluator) + module_parent = loaded_evaluator.path.parent.name + module_name = loaded_evaluator.entry.split(":")[0] + logger.info(f"Loading module {os.getcwd()} {module_name} from {module_parent}") + module_path = os.path.join(os.getcwd(), module_parent, module_name + ".py") + logger.info(f"Loading module {module_name} from {module_path}") + spec = importlib.util.spec_from_file_location(module_name, module_path) + mod = importlib.util.module_from_spec(spec) + logger.info(f"Loaded module {mod}") + sys.modules[module_name] = mod + spec.loader.exec_module(mod) + eval_class = getattr(mod, loaded_evaluator.entry.split(":")[1]) + return eval_class diff --git a/assets/evaluation_on_cloud/environments/evaluations-built-in/environment.yaml b/assets/evaluation_on_cloud/environments/evaluations-built-in/environment.yaml new file mode 100644 index 0000000000..e995657493 --- /dev/null +++ b/assets/evaluation_on_cloud/environments/evaluations-built-in/environment.yaml @@ -0,0 +1,11 @@ +image: + name: azureml/curated/evaluations-built-in + os: linux + context: + dir: context + dockerfile: Dockerfile + template_files: + - Dockerfile + publish: + location: mcr + visibility: public \ No newline at end of file diff --git a/assets/evaluation_on_cloud/environments/evaluations-built-in/spec.yaml b/assets/evaluation_on_cloud/environments/evaluations-built-in/spec.yaml new file mode 100644 index 0000000000..1a135ac6d0 --- /dev/null +++ b/assets/evaluation_on_cloud/environments/evaluations-built-in/spec.yaml @@ -0,0 +1,11 @@ +$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json +description: Python environment for running promptflow-evals based evaluators. + +name: "{{asset.name}}" +version: "{{asset.version}}" + +os_type: linux + +build: + path: "{{image.context.path}}" + dockerfile_path: "{{image.dockerfile.path}}"