Add AML Evaluations team's Built-in Evaluator Environment #3455

Merged · 8 commits · Oct 9, 2024
@@ -0,0 +1,6 @@
name: evaluations-built-in
version: auto
type: environment
spec: spec.yaml
extra_config: environment.yaml
categories: ["Evaluation"]
@@ -0,0 +1,8 @@
FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04:latest

COPY requirements.txt /app/requirements.txt
RUN pip install -r /app/requirements.txt

# Copy the evaluation scripts into the image
COPY evaluate_on_data.py /app/evaluate_on_data.py
COPY save_evaluation.py /app/save_evaluation.py
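For orientation (not part of this PR's files), a minimal sketch of submitting the baked-in script as an AzureML command job once this environment is published; the workspace identifiers, data path, payload placeholders, and environment reference are all illustrative:

from azure.ai.ml import Input, MLClient, command
from azure.identity import DefaultAzureCredential

# Connect to a workspace; all identifiers below are placeholders.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="<subscription-id>",
    resource_group_name="<resource-group>",
    workspace_name="<workspace-name>",
)

# Run the evaluation script that the Dockerfile copies into /app.
evaluation_job = command(
    command=(
        "python /app/evaluate_on_data.py"
        " --eval_data ${{inputs.eval_data}}"
        " --evaluators '<json-evaluator-configs>'"
        " --evaluator_name_id_map '<json-name-to-path-map>'"
    ),
    inputs={"eval_data": Input(type="uri_file", path="<path-to-eval-data.jsonl>")},
    environment="azureml://registries/azureml/environments/evaluations-built-in/labels/latest",
    compute="<compute-cluster-name>",
)

ml_client.jobs.create_or_update(evaluation_job)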
@@ -0,0 +1,124 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Evaluate for a built-in or custom evulator."""
import argparse
import json
import logging
import mlflow
import os
import pandas as pd
import requests
import shutil
from azure.ai.ml.identity import AzureMLOnBehalfOfCredential
from azure.ai.evaluation import evaluate
from save_evaluation import load_evaluator

logger = logging.getLogger(__name__)


def update_value_in_dict(d, key_substring, new_func):
"""Recursively search for a value containing 'key_substring' and apply 'new_func' to modify it."""
for key, value in d.items():
if isinstance(value, dict):
update_value_in_dict(value, key_substring, new_func)
elif isinstance(value, str) and key_substring in value:
d[key] = new_func(value)
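# Illustrative usage, mirroring the call in initialize_evaluators below:
#   params = {"model_config": {"api_key": "AZURE_OPENAI_API_KEY"}}
#   update_value_in_dict(params, "AZURE_OPENAI_API_KEY", lambda v: os.environ[v.upper()])
# Any string value containing the substring is replaced with the secret read
# from the environment.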


def find_file_and_get_parent_dir(root_dir, file_name="flow.flex.yaml"):
"""Find the flex flow or any given file in a directory and return the parent directory."""
for dirpath, _, filenames in os.walk(root_dir):
if file_name in filenames:
logger.info(f"Found {file_name} in {dirpath}")
return dirpath


def copy_evaluator_files(command_line_args):
"""Copy the mounted evaluator files to the relative paths to enable read/write."""
evaluator_name_id_map = json.loads(command_line_args.evaluator_name_id_map)
for evaluator_name, evaluator_id in evaluator_name_id_map.items():
dir_path = find_file_and_get_parent_dir(evaluator_id)
if dir_path:
shutil.copytree(dir_path, f"./{evaluator_name}")
logger.info(f"Copying {dir_path} to ./{evaluator_name}")
logger.info(evaluator_name, os.listdir(f"./{evaluator_name}"))
else:
logger.info(f"Directory for evaluator {evaluator_name} not found.")


def initialize_evaluators(command_line_args):
"""Initialize the evaluators using correct parameters and credentials for rai evaluators."""
evaluators = {}
evaluators_o = json.loads(command_line_args.evaluators)
for evaluator_name, evaluator in evaluators_o.items():
init_params = evaluator["InitParams"]
update_value_in_dict(init_params, "AZURE_OPENAI_API_KEY", lambda x: os.environ[x.upper()])
flow = load_evaluator('./' + evaluator_name)
if any(rai_eval in evaluator["Id"] for rai_eval in rai_evaluators):
init_params["credential"] = AzureMLOnBehalfOfCredential()
evaluators[evaluator_name] = flow(**init_params)
return evaluators


def run_evaluation(command_line_args, evaluators):
"""Run evaluation using evaluators."""
results = evaluate(
data=command_line_args.eval_data,
evaluators=evaluators
)
metrics = {}
for metric_name, metric_value in results['metrics'].items():
logger.info("Logging metric:", metric_name, metric_value)
metrics[metric_name] = metric_value
mlflow.log_metrics(metrics)

if results and results.get("rows"):
# Convert the results to a DataFrame
df = pd.DataFrame(results["rows"])

# Save the DataFrame as a JSONL file
df.to_json("instance_results.jsonl", orient="records", lines=True)
df.to_json("eval_results.jsonl", orient="records", lines=True)
mlflow.log_artifact("instance_results.jsonl")
mlflow.log_artifact("eval_results.jsonl")


def get_promptflow_run_logs():
"""Get promptflow run logs."""
if os.path.exists("/root/.promptflow/.runs/"):
runs = os.listdir("/root/.promptflow/.runs/")
for run in runs:
if os.path.exists(f"/root/.promptflow/.runs/{run}/logs.txt"):
with open(f"/root/.promptflow/.runs/{run}/logs.txt", "r") as f:
logger.info(f"RUN {run} =========================")
logger.info(f.read())
else:
logger.info("RUN DOES NOT EXIST")


# Create a session for making HTTP requests
session = requests.Session()

# Parse command line arguments
parser = argparse.ArgumentParser("eval")
parser.add_argument("--eval_data", type=str)
parser.add_argument("--eval_output", type=str)
parser.add_argument("--evaluators", type=str)
parser.add_argument("--evaluator_name_id_map", type=str)

args = parser.parse_args()
rai_evaluators = ['HateUnfairnessEvaluator', 'Sexual-Content-Evaluator', 'Hate-and-Unfairness-Evaluator',
                  'Violent-Content-Evaluator', 'Self-Harm-Related-Content-Evaluator']

if __name__ == '__main__':
    copy_evaluator_files(args)
    evaluators = initialize_evaluators(args)
    logger.info("*************** Collecting Result of Evaluators ******************")
    # Run the evaluation
    with mlflow.start_run() as run:
        try:
            run_evaluation(args, evaluators)
        except Exception as e:
            logger.error(f"Evaluation failed with exception: {e}")
            get_promptflow_run_logs()
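For reference, a hypothetical sketch of the two JSON payloads this script expects, inferred from initialize_evaluators and copy_evaluator_files; the evaluator name, id, init parameters, and mount path below are illustrative only:

import json

# --evaluators: maps evaluator names to configs. String values containing
# "AZURE_OPENAI_API_KEY" are replaced with the real secret at runtime by
# update_value_in_dict; "Id" is matched against rai_evaluators to decide
# whether an on-behalf-of credential is injected.
evaluators_arg = json.dumps({
    "my_evaluator": {  # hypothetical evaluator name
        "Id": "my-evaluator-id",  # hypothetical id
        "InitParams": {
            "model_config": {
                "azure_endpoint": "https://<resource>.openai.azure.com",
                "api_key": "AZURE_OPENAI_API_KEY",
            }
        },
    }
})

# --evaluator_name_id_map: maps the same names to mounted directories that
# contain a flow.flex.yaml (see find_file_and_get_parent_dir).
evaluator_name_id_map_arg = json.dumps({
    "my_evaluator": "/mnt/azureml/evaluators/my_evaluator"  # hypothetical mount path
})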
@@ -0,0 +1,5 @@
azure-ai-evaluation
openai
azureml-mlflow
azure-identity
azure-ai-ml
@@ -0,0 +1,30 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Load a built-in or custom evulator as flow."""
import importlib
import logging
import os
import sys
from promptflow.client import load_flow

logger = logging.getLogger(__name__)


def load_evaluator(evaluator):
"""Load evaluator as flow."""
logger.info(f"Loading evaluator {evaluator}")
loaded_evaluator = load_flow(evaluator)
logger.info(loaded_evaluator)
module_parent = loaded_evaluator.path.parent.name
module_name = loaded_evaluator.entry.split(":")[0]
logger.info(f"Loading module {os.getcwd()} {module_name} from {module_parent}")
module_path = os.path.join(os.getcwd(), module_parent, module_name + ".py")
logger.info(f"Loading module {module_name} from {module_path}")
spec = importlib.util.spec_from_file_location(module_name, module_path)
mod = importlib.util.module_from_spec(spec)
logger.info(f"Loaded module {mod}")
sys.modules[module_name] = mod
spec.loader.exec_module(mod)
eval_class = getattr(mod, loaded_evaluator.entry.split(":")[1])
return eval_class
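A minimal usage sketch, assuming ./my_evaluator is a directory containing a flow.flex.yaml whose entry points at an evaluator class; the directory name, init parameters, and call signature are illustrative:

eval_class = load_evaluator("./my_evaluator")  # returns the class, not an instance
evaluator = eval_class()  # pass any init params the evaluator requires
row_result = evaluator(query="What is AML?", response="Azure Machine Learning.")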
@@ -0,0 +1,11 @@
image:
  name: azureml/curated/evaluations-built-in
  os: linux
  context:
    dir: context
    dockerfile: Dockerfile
    template_files:
      - Dockerfile
  publish:
    location: mcr
    visibility: public
@@ -0,0 +1,11 @@
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
description: Python environment for running promptflow-evals based evaluators.

name: "{{asset.name}}"
version: "{{asset.version}}"

os_type: linux

build:
  path: "{{image.context.path}}"
  dockerfile_path: "{{image.dockerfile.path}}"