From 39609ae3d094876674f38e16b9f86552b2e84c0d Mon Sep 17 00:00:00 2001 From: jotaylo Date: Wed, 4 Mar 2020 17:39:49 -0800 Subject: [PATCH] Proposal: split train.py into train.py and train_aml.py (#219) This change splits train.py into two files. The new train.py is standalone, and has no references to AzureML. It defines three functions: split_data, which splits a dataframe into test/train data; train_model, which takes the test/train data and a parameter object and trains the model; and get_model_metrics, which evaluates metrics about the model. The script can be run locally, in which case it loads a dataset from a file. The second file, train_aml.py, contains reasonably general AzureML logic. It reads data from a dataset, then calls the split_data function from train.py. It loads input parameters from a config file and logs them, then calls train_model from train.py. It then uploads the model and logs any metrics returned by get_model_metrics. The hope with these changes is to demonstrate a simple interface for integrating an existing ML script with MLOpsPython, as well as to provide an example of how the core ML functionality can be invoked in multiple ways for development purposes. Co-authored-by: Bryan J Smith --- ...diabetes_regression-variables-template.yml | 2 +- diabetes_regression/training/test_train.py | 39 +- diabetes_regression/training/train.py | 174 ++------- diabetes_regression/training/train_aml.py | 175 +++++++++ ... 
Regression Experimentation Pipeline.ipynb | 353 ++++++++++++++++++ ...Regression Parameter Experimentation.ipynb | 211 +++++++++++ 6 files changed, 799 insertions(+), 155 deletions(-) create mode 100644 diabetes_regression/training/train_aml.py create mode 100644 experimentation/Diabetes Ridge Regression Experimentation Pipeline.ipynb create mode 100644 experimentation/Diabetes Ridge Regression Parameter Experimentation.ipynb diff --git a/.pipelines/diabetes_regression-variables-template.yml b/.pipelines/diabetes_regression-variables-template.yml index 6d4d9797..fdccb3b7 100644 --- a/.pipelines/diabetes_regression-variables-template.yml +++ b/.pipelines/diabetes_regression-variables-template.yml @@ -7,7 +7,7 @@ variables: value: diabetes_regression # The path to the model training script under SOURCES_DIR_TRAIN - name: TRAIN_SCRIPT_PATH - value: training/train.py + value: training/train_aml.py # The path to the model evaluation script under SOURCES_DIR_TRAIN - name: EVALUATE_SCRIPT_PATH value: evaluate/evaluate_model.py diff --git a/diabetes_regression/training/test_train.py b/diabetes_regression/training/test_train.py index 155d367a..d121ecbc 100644 --- a/diabetes_regression/training/test_train.py +++ b/diabetes_regression/training/test_train.py @@ -1,27 +1,32 @@ import numpy as np -from azureml.core.run import Run -from unittest.mock import Mock -from diabetes_regression.training.train import train_model +from diabetes_regression.training.train import train_model, get_model_metrics def test_train_model(): X_train = np.array([1, 2, 3, 4, 5, 6]).reshape(-1, 1) y_train = np.array([10, 9, 8, 8, 6, 5]) + data = {"train": {"X": X_train, "y": y_train}} + + reg_model = train_model(data, {"alpha": 1.2}) + + preds = reg_model.predict([[1], [2]]) + np.testing.assert_equal(preds, [9.93939393939394, 9.03030303030303]) + + +def test_get_model_metrics(): + + class MockModel: + + @staticmethod + def predict(data): + return ([8.12121212, 7.21212121]) + X_test = np.array([3, 
4]).reshape(-1, 1) y_test = np.array([8, 7]) - data = {"train": {"X": X_train, "y": y_train}, - "test": {"X": X_test, "y": y_test}} + data = {"test": {"X": X_test, "y": y_test}} - run = Mock(Run) - reg = train_model(run, data, alpha=1.2) + metrics = get_model_metrics(MockModel(), data) - _, call2 = run.log.call_args_list - nameValue, descriptionDict = call2 - name, value = nameValue - description = descriptionDict['description'] - assert (name == 'mse') - np.testing.assert_almost_equal(value, 0.029843893480257067) - assert (description == 'Mean squared error metric') - - preds = reg.predict([[1], [2]]) - np.testing.assert_equal(preds, [9.93939393939394, 9.03030303030303]) + assert 'mse' in metrics + mse = metrics['mse'] + np.testing.assert_almost_equal(mse, 0.029843893480257067) diff --git a/diabetes_regression/training/train.py b/diabetes_regression/training/train.py index 66dbc20f..22258042 100644 --- a/diabetes_regression/training/train.py +++ b/diabetes_regression/training/train.py @@ -23,137 +23,16 @@ ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
""" -from azureml.core.run import Run + import os -import argparse +import pandas as pd from sklearn.linear_model import Ridge from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split -import joblib -import json -from azureml.core import Dataset, Datastore, Workspace - - -def register_dataset( - aml_workspace: Workspace, - dataset_name: str, - datastore_name: str, - file_path: str -) -> Dataset: - datastore = Datastore.get(aml_workspace, datastore_name) - dataset = Dataset.Tabular.from_delimited_files(path=(datastore, file_path)) - dataset = dataset.register(workspace=aml_workspace, - name=dataset_name, - create_new_version=True) - - return dataset - - -def train_model(run, data, alpha): - run.log("alpha", alpha) - run.parent.log("alpha", alpha) - reg = Ridge(alpha=alpha) - reg.fit(data["train"]["X"], data["train"]["y"]) - preds = reg.predict(data["test"]["X"]) - run.log("mse", mean_squared_error( - preds, data["test"]["y"]), description="Mean squared error metric") - run.parent.log("mse", mean_squared_error( - preds, data["test"]["y"]), description="Mean squared error metric") - return reg - -def main(): - print("Running train.py") - parser = argparse.ArgumentParser("train") - - parser.add_argument( - "--model_name", - type=str, - help="Name of the Model", - default="sklearn_regression_model.pkl", - ) - - parser.add_argument( - "--step_output", - type=str, - help=("output for passing data to next step") - ) - - parser.add_argument( - "--dataset_version", - type=str, - help=("dataset version") - ) - - parser.add_argument( - "--data_file_path", - type=str, - help=("data file path, if specified,\ - a new version of the dataset will be registered") - ) - - parser.add_argument( - "--caller_run_id", - type=str, - help=("caller run id, for example ADF pipeline run id") - ) - - parser.add_argument( - "--dataset_name", - type=str, - help=("Dataset name. 
Dataset must be passed by name\ - to always get the desired dataset version\ - rather than the one used while the pipeline creation") - ) - - args = parser.parse_args() - - print("Argument [model_name]: %s" % args.model_name) - print("Argument [step_output]: %s" % args.step_output) - print("Argument [dataset_version]: %s" % args.dataset_version) - print("Argument [data_file_path]: %s" % args.data_file_path) - print("Argument [caller_run_id]: %s" % args.caller_run_id) - print("Argument [dataset_name]: %s" % args.dataset_name) - - model_name = args.model_name - step_output_path = args.step_output - dataset_version = args.dataset_version - data_file_path = args.data_file_path - dataset_name = args.dataset_name - - print("Getting training parameters") - - with open("config.json") as f: - pars = json.load(f) - try: - alpha = pars["training"]["alpha"] - except KeyError: - alpha = 0.5 - - print("Parameter alpha: %s" % alpha) - - run = Run.get_context() - - # Get the dataset - if (dataset_name): - if (data_file_path == 'none'): - dataset = Dataset.get_by_name(run.experiment.workspace, dataset_name, dataset_version) # NOQA: E402, E501 - else: - dataset = register_dataset(run.experiment.workspace, - dataset_name, - os.environ.get("DATASTORE_NAME"), - data_file_path) - else: - e = ("No dataset provided") - print(e) - raise Exception(e) - - # Link dataset to the step run so it is trackable in the UI - run.input_datasets['training_data'] = dataset - run.parent.tag("dataset_id", value=dataset.id) - - df = dataset.to_pandas_dataframe() +# Split the dataframe into test and train data +def split_data(df): X = df.drop('Y', axis=1).values y = df['Y'].values @@ -161,23 +40,44 @@ def main(): X, y, test_size=0.2, random_state=0) data = {"train": {"X": X_train, "y": y_train}, "test": {"X": X_test, "y": y_test}} + return data + + +# Train the model, return the model +def train_model(data, ridge_args): + reg_model = Ridge(**ridge_args) + reg_model.fit(data["train"]["X"], 
data["train"]["y"]) + return reg_model + + +# Evaluate the metrics for the model +def get_model_metrics(model, data): + preds = model.predict(data["test"]["X"]) + mse = mean_squared_error(preds, data["test"]["y"]) + metrics = {"mse": mse} + return metrics + + +def main(): + print("Running train.py") - reg = train_model(run, data, alpha) + # Define training parameters + ridge_args = {"alpha": 0.5} - # Pass model file to next step - os.makedirs(step_output_path, exist_ok=True) - model_output_path = os.path.join(step_output_path, model_name) - joblib.dump(value=reg, filename=model_output_path) + # Load the training data as dataframe + data_dir = "data" + data_file = os.path.join(data_dir, 'diabetes.csv') + train_df = pd.read_csv(data_file) - # Also upload model file to run outputs for history - os.makedirs('outputs', exist_ok=True) - output_path = os.path.join('outputs', model_name) - joblib.dump(value=reg, filename=output_path) + data = split_data(train_df) - run.tag("run_type", value="train") - print(f"tags now present for run: {run.tags}") + # Train the model + model = train_model(data, ridge_args) - run.complete() + # Log the metrics for the model + metrics = get_model_metrics(model, data) + for (k, v) in metrics.items(): + print(f"{k}: {v}") if __name__ == '__main__': diff --git a/diabetes_regression/training/train_aml.py b/diabetes_regression/training/train_aml.py new file mode 100644 index 00000000..5bf76cb4 --- /dev/null +++ b/diabetes_regression/training/train_aml.py @@ -0,0 +1,175 @@ +""" +Copyright (C) Microsoft Corporation. All rights reserved.​ + ​ +Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, +royalty-free right to use, copy, and modify the software code provided by us +("Software Code"). You may not sublicense the Software Code or any use of it +(except to your affiliates and to vendors to perform work on your behalf) +through distribution, network access, service agreement, lease, rental, or +otherwise. 
This license does not purport to express any claim of ownership over +data you may have shared with Microsoft in the creation of the Software Code. +Unless applicable law gives you more rights, Microsoft reserves all other +rights not expressly granted herein, whether by implication, estoppel or +otherwise. ​ + ​ +THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+""" +from azureml.core.run import Dataset, Datastore, Run, Workspace +import os +import argparse +import joblib +import json +from train import split_data, train_model, get_model_metrics + + +def register_dataset( + aml_workspace: Workspace, + dataset_name: str, + datastore_name: str, + file_path: str +) -> Dataset: + datastore = Datastore.get(aml_workspace, datastore_name) + dataset = Dataset.Tabular.from_delimited_files(path=(datastore, file_path)) + dataset = dataset.register(workspace=aml_workspace, + name=dataset_name, + create_new_version=True) + + return dataset + + +def main(): + print("Running train_aml.py") + + parser = argparse.ArgumentParser("train") + parser.add_argument( + "--model_name", + type=str, + help="Name of the Model", + default="sklearn_regression_model.pkl", + ) + + parser.add_argument( + "--step_output", + type=str, + help=("output for passing data to next step") + ) + + parser.add_argument( + "--dataset_version", + type=str, + help=("dataset version") + ) + + parser.add_argument( + "--data_file_path", + type=str, + help=("data file path, if specified,\ + a new version of the dataset will be registered") + ) + + parser.add_argument( + "--caller_run_id", + type=str, + help=("caller run id, for example ADF pipeline run id") + ) + + parser.add_argument( + "--dataset_name", + type=str, + help=("Dataset name. 
Dataset must be passed by name\ + to always get the desired dataset version\ + rather than the one used while the pipeline creation") + ) + + args = parser.parse_args() + + print("Argument [model_name]: %s" % args.model_name) + print("Argument [step_output]: %s" % args.step_output) + print("Argument [dataset_version]: %s" % args.dataset_version) + print("Argument [data_file_path]: %s" % args.data_file_path) + print("Argument [caller_run_id]: %s" % args.caller_run_id) + print("Argument [dataset_name]: %s" % args.dataset_name) + + model_name = args.model_name + step_output_path = args.step_output + dataset_version = args.dataset_version + data_file_path = args.data_file_path + dataset_name = args.dataset_name + + run = Run.get_context() + + print("Getting training parameters") + + # Load the training parameters from the config file + with open("config.json") as f: + pars = json.load(f) + try: + train_args = pars["training"] + except KeyError: + print("Could not load training values from file") + train_args = {} + + # Log the training parameters + print(f"Parameters: {train_args}") + for (k, v) in train_args.items(): + run.log(k, v) + run.parent.log(k, v) + + # Get the dataset + if (dataset_name): + if (data_file_path == 'none'): + dataset = Dataset.get_by_name(run.experiment.workspace, dataset_name, dataset_version) # NOQA: E402, E501 + else: + dataset = register_dataset(run.experiment.workspace, + dataset_name, + os.environ.get("DATASTORE_NAME"), + data_file_path) + else: + e = ("No dataset provided") + print(e) + raise Exception(e) + + # Link dataset to the step run so it is trackable in the UI + run.input_datasets['training_data'] = dataset + run.parent.tag("dataset_id", value=dataset.id) + + # Split the data into test/train + df = dataset.to_pandas_dataframe() + data = split_data(df) + + # Train the model + model = train_model(data, train_args) + + # Evaluate and log the metrics returned from the train function + metrics = get_model_metrics(model, data) + for (k, 
v) in metrics.items(): + run.log(k, v) + run.parent.log(k, v) + + # Pass model file to next step + os.makedirs(step_output_path, exist_ok=True) + model_output_path = os.path.join(step_output_path, model_name) + joblib.dump(value=model, filename=model_output_path) + + # Also upload model file to run outputs for history + os.makedirs('outputs', exist_ok=True) + output_path = os.path.join('outputs', model_name) + joblib.dump(value=model, filename=output_path) + + run.tag("run_type", value="train") + print(f"tags now present for run: {run.tags}") + + run.complete() + + +if __name__ == '__main__': + main() diff --git a/experimentation/Diabetes Ridge Regression Experimentation Pipeline.ipynb b/experimentation/Diabetes Ridge Regression Experimentation Pipeline.ipynb new file mode 100644 index 00000000..8b04a5c5 --- /dev/null +++ b/experimentation/Diabetes Ridge Regression Experimentation Pipeline.ipynb @@ -0,0 +1,353 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Experiment with parameters for a Ridge Regression Model on the Diabetes Dataset in an Azure ML Pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook is for experimenting with different parameters to train a ridge regression model on the Diabetes dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Change out of the experimentation directory\n", + "%cd .." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.core\n", + "from azureml.core import Workspace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the workspace from the saved config file\n", + "ws = Workspace.from_config()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os, shutil\n", + "\n", + "# Create a folder for the experiment files\n", + "training_folder = 'diabetes-training'\n", + "os.makedirs(training_folder, exist_ok=True)\n", + "\n", + "# Copy the data file into the experiment folder\n", + "shutil.copy('data/diabetes.csv', os.path.join(training_folder, \"diabetes.csv\"))\n", + "\n", + "# Copy the train functions into the experiment folder\n", + "shutil.copy('diabetes_regression/training/train.py', os.path.join(training_folder, \"train.py\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile $training_folder/parameters.json\n", + "{\n", + " \"training\":\n", + " {\n", + " \"alpha\": 0.3\n", + " },\n", + " \"evaluation\":\n", + " {\n", + "\n", + " },\n", + " \"scoring\":\n", + " {\n", + " \n", + " }\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile $training_folder/diabetes_training.py\n", + "# Import libraries\n", + "from azureml.core import Run\n", + "import pandas as pd\n", + "import shutil\n", + "import joblib\n", + "\n", + "from train import split_data, train_model\n", + "\n", + "# Get parameters\n", + "parser = argparse.ArgumentParser()\n", + "parser.add_argument('--output_folder', type=str, dest='output_folder', default=\"diabetes_model\", help='output folder')\n", + "args = parser.parse_args()\n", + "output_folder = args.output_folder\n", + "\n", + "# Get the 
experiment run context\n", + "run = Run.get_context()\n", + "\n", + "# load the diabetes dataset\n", + "print(\"Loading Data...\")\n", + "train_df = pd.read_csv('diabetes.csv')\n", + "\n", + "data = split_data(train_df)\n", + "\n", + "# Specify the parameters to test\n", + "with open(\"parameters.json\") as f:\n", + " pars = json.load(f)\n", + " train_args = pars[\"training\"]\n", + "\n", + "# Log parameters\n", + "for k, v in train_args.items():\n", + " run.log(k, v)\n", + "\n", + "model, metrics = train_model(data, train_args)\n", + "\n", + "# Log metrics\n", + "for k, v in metrics.items():\n", + " run.log(k, v)\n", + "\n", + "# Save the parameters file to the outputs folder\n", + "os.makedirs(output_folder, exist_ok=True)\n", + "shutil.copy('parameters.json', os.path.join(output_folder, 'parameters.json'))\n", + "joblib.dump(value=model, filename= output_folder + \"/model.pkl\")\n", + " \n", + "run.complete()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile $training_folder/register_diabetes.py\n", + "# Import libraries\n", + "import argparse\n", + "import joblib\n", + "from azureml.core import Workspace, Model, Run\n", + "\n", + "# Get parameters\n", + "parser = argparse.ArgumentParser()\n", + "parser.add_argument('--model_folder', type=str, dest='model_folder', default=\"diabetes_model\", help='model location')\n", + "args = parser.parse_args()\n", + "model_folder = args.model_folder\n", + "\n", + "# Get the experiment run context\n", + "run = Run.get_context()\n", + "\n", + "# load the model\n", + "print(\"Loading model from \" + model_folder)\n", + "model_file = model_folder + \"/model.pkl\"\n", + "model = joblib.load(model_file)\n", + "\n", + "Model.register(workspace=run.experiment.workspace,\n", + " model_path = model_file,\n", + " model_name = 'diabetes_model',\n", + " tags={'Training context':'Pipeline'})\n", + "\n", + "run.complete()" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.compute import ComputeTarget, AmlCompute\n", + "from azureml.core.compute_target import ComputeTargetException\n", + "\n", + "cluster_name = \"aml-cluster\"\n", + "\n", + "# Verify that cluster exists\n", + "try:\n", + " pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)\n", + " print('Found existing cluster, use it.')\n", + "except ComputeTargetException:\n", + " # If not, create it\n", + " compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',\n", + " max_nodes=4,\n", + " idle_seconds_before_scaledown=1800)\n", + " pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)\n", + "\n", + "pipeline_cluster.wait_for_completion(show_output=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Environment\n", + "from azureml.core.conda_dependencies import CondaDependencies\n", + "from azureml.core.runconfig import RunConfiguration\n", + "\n", + "# Create a Python environment for the experiment\n", + "diabetes_env = Environment(\"diabetes-pipeline-env\")\n", + "diabetes_env.python.user_managed_dependencies = False # Let Azure ML manage dependencies\n", + "diabetes_env.docker.enabled = True # Use a docker container\n", + "\n", + "# Create a set of package dependencies\n", + "diabetes_packages = CondaDependencies.create(conda_packages=['scikit-learn','pandas'],\n", + " pip_packages=['azureml-sdk'])\n", + "\n", + "# Add the dependencies to the environment\n", + "diabetes_env.python.conda_dependencies = diabetes_packages\n", + "\n", + "# Register the environment (just in case you want to use it again)\n", + "diabetes_env.register(workspace=ws)\n", + "registered_env = Environment.get(ws, 'diabetes-pipeline-env')\n", + "\n", + "# Create a new runconfig object for the pipeline\n", + "pipeline_run_config = RunConfiguration()\n", + "\n", 
+ "# Use the compute you created above. \n", + "pipeline_run_config.target = pipeline_cluster\n", + "\n", + "# Assign the environment to the run configuration\n", + "pipeline_run_config.environment = registered_env\n", + "\n", + "print (\"Run configuration created.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.pipeline.core import PipelineData\n", + "from azureml.pipeline.steps import PythonScriptStep, EstimatorStep\n", + "from azureml.train.estimator import Estimator\n", + "\n", + "# Get the training dataset\n", + "#diabetes_ds = ws.datasets.get(\"diabetes dataset\")\n", + "\n", + "# Create a PipelineData (temporary Data Reference) for the model folder\n", + "model_folder = PipelineData(\"model_folder\", datastore=ws.get_default_datastore())\n", + "\n", + "estimator = Estimator(source_directory=training_folder,\n", + " compute_target = pipeline_cluster,\n", + " environment_definition=pipeline_run_config.environment,\n", + " entry_script='diabetes_training.py')\n", + "\n", + "# Step 1, run the estimator to train the model\n", + "train_step = EstimatorStep(name = \"Train Model\",\n", + " estimator=estimator, \n", + " estimator_entry_script_arguments=['--output_folder', model_folder],\n", + " outputs=[model_folder],\n", + " compute_target = pipeline_cluster,\n", + " allow_reuse = True)\n", + "\n", + "# Step 2, run the model registration script\n", + "register_step = PythonScriptStep(name = \"Register Model\",\n", + " source_directory = training_folder,\n", + " script_name = \"register_diabetes.py\",\n", + " arguments = ['--model_folder', model_folder],\n", + " inputs=[model_folder],\n", + " compute_target = pipeline_cluster,\n", + " runconfig = pipeline_run_config,\n", + " allow_reuse = True)\n", + "\n", + "print(\"Pipeline steps defined\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import 
Experiment\n", + "from azureml.pipeline.core import Pipeline\n", + "from azureml.widgets import RunDetails\n", + "\n", + "# Construct the pipeline\n", + "pipeline_steps = [train_step, register_step]\n", + "pipeline = Pipeline(workspace = ws, steps=pipeline_steps)\n", + "print(\"Pipeline is built.\")\n", + "\n", + "# Create an experiment and run the pipeline\n", + "experiment = Experiment(workspace = ws, name = 'diabetes-training-pipeline')\n", + "pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)\n", + "print(\"Pipeline submitted for execution.\")\n", + "\n", + "RunDetails(pipeline_run).show()\n", + "pipeline_run.wait_for_completion()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Model\n", + "\n", + "for model in Model.list(ws):\n", + " print(model.name, 'version:', model.version)\n", + " for tag_name in model.tags:\n", + " tag = model.tags[tag_name]\n", + " print ('\\t',tag_name, ':', tag)\n", + " for prop_name in model.properties:\n", + " prop = model.properties[prop_name]\n", + " print ('\\t',prop_name, ':', prop)\n", + " print('\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/experimentation/Diabetes Ridge Regression Parameter Experimentation.ipynb b/experimentation/Diabetes Ridge Regression Parameter Experimentation.ipynb new file mode 100644 index 00000000..aab5e052 --- /dev/null +++ b/experimentation/Diabetes Ridge Regression Parameter 
Experimentation.ipynb @@ -0,0 +1,211 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Experiment with parameters for a Ridge Regression Model on the Diabetes Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook is for experimenting with different parameters to train a ridge regression model on the Diabetes dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Change out of the experimentation directory\n", + "%cd .." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.core\n", + "from azureml.core import Workspace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the workspace from the saved config file\n", + "ws = Workspace.from_config()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os, shutil\n", + "\n", + "# Create a folder for the experiment files\n", + "training_folder = 'diabetes-training'\n", + "os.makedirs(training_folder, exist_ok=True)\n", + "\n", + "# Copy the data file into the experiment folder\n", + "shutil.copy('data/diabetes.csv', os.path.join(training_folder, \"diabetes.csv\"))\n", + "\n", + "# Copy the train functions into the experiment folder\n", + "shutil.copy('diabetes_regression/training/train.py', os.path.join(training_folder, \"train.py\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile $training_folder/parameters.json\n", + "{\n", + " \"training\":\n", + " {\n", + " \"alpha\": 0.3\n", + " },\n", + " \"evaluation\":\n", + " {\n", + "\n", + " },\n", + " \"scoring\":\n", + " {\n", + " \n", + " }\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, 
+ "outputs": [], + "source": [ + "%%writefile $training_folder/diabetes_training.py\n", + "# Import libraries\n", + "from azureml.core import Run\n", + "import json\n", + "import os\n", + "import pandas as pd\n", + "import shutil\n", + "\n", + "from train import split_data, train_model\n", + "\n", + "# Get the experiment run context\n", + "run = Run.get_context()\n", + "\n", + "# load the diabetes dataset\n", + "print(\"Loading Data...\")\n", + "train_df = pd.read_csv('diabetes.csv')\n", + "\n", + "data = split_data(train_df)\n", + "\n", + "# Specify the parameters to test\n", + "with open(\"parameters.json\") as f:\n", + " pars = json.load(f)\n", + " train_args = pars[\"training\"]\n", + "\n", + "# Log parameters\n", + "for k, v in train_args.items():\n", + " run.log(k, v)\n", + "\n", + "model, metrics = train_model(data, train_args)\n", + "\n", + "# Log metrics\n", + "for k, v in metrics.items():\n", + " run.log(k, v)\n", + "\n", + "# Save the parameters file to the outputs folder\n", + "os.makedirs('outputs', exist_ok=True)\n", + "shutil.copy('parameters.json', os.path.join('outputs', 'parameters.json'))\n", + " \n", + "run.complete()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.estimator import Estimator\n", + "from azureml.core import Experiment\n", + "\n", + "# Create an estimator\n", + "estimator = Estimator(source_directory=training_folder,\n", + " entry_script='diabetes_training.py',\n", + " compute_target='local',\n", + " conda_packages=['scikit-learn']\n", + " )\n", + "\n", + "# Create an experiment\n", + "experiment_name = 'diabetes-training'\n", + "experiment = Experiment(workspace = ws, name = experiment_name)\n", + "\n", + "# Run the experiment based on the estimator\n", + "run = experiment.submit(config=estimator)\n", + "run.wait_for_completion(show_output=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "metrics = run.get_metrics()\n", + "for k, v in metrics.items():\n", + " print(k, v)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for file in run.get_file_names():\n", + " print(file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.6.10 64-bit ('OH3': conda)", + "language": "python", + "name": "python361064bitoh3conda5f7beeba8c1d407187c86667ecfb684f" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}