diff --git a/.github/workflows/e2e-test-kfpv1.yaml b/.github/workflows/e2e-test-kfpv1.yaml new file mode 100644 index 00000000000..52807f7cbaf --- /dev/null +++ b/.github/workflows/e2e-test-kfpv1.yaml @@ -0,0 +1,45 @@ +name: E2E Test with kubeflow pipelines v1 + +on: + pull_request: + paths-ignore: + - "pkg/new-ui/v1beta1/frontend/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + +jobs: + e2e: + runs-on: ubuntu-20.04 + timeout-minutes: 120 + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Setup Test Env + uses: ./.github/workflows/template-setup-e2e-test + with: + kubernetes-version: ${{ matrix.kubernetes-version }} + python-version: "3.10" + + - name: Run e2e test with ${{ matrix.experiments }} experiments + uses: ./.github/workflows/template-e2e-test + with: + experiments: ${{ matrix.experiments }} + training-operator: true + # Comma Delimited + trial-images: kfpv1-metrics-collector + install-kfp: 1.8.1 + experiment-namespace: kubeflow + + strategy: + fail-fast: false + matrix: + kubernetes-version: ["v1.23.13", "v1.24.7", "v1.25.3"] + # Comma Delimited + experiments: + - "katib-kfp-example-e2e-v1" diff --git a/.github/workflows/publish-core-images.yaml b/.github/workflows/publish-core-images.yaml index 750ab03c99e..5708e9ce9ac 100644 --- a/.github/workflows/publish-core-images.yaml +++ b/.github/workflows/publish-core-images.yaml @@ -32,3 +32,5 @@ jobs: dockerfile: cmd/metricscollector/v1beta1/file-metricscollector/Dockerfile - component-name: tfevent-metrics-collector dockerfile: cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile + - component-name: kfpv1-metrics-collector + dockerfile: cmd/metricscollector/v1beta1/kfp-metricscollector/v1/Dockerfile diff --git a/.github/workflows/template-e2e-test/action.yaml b/.github/workflows/template-e2e-test/action.yaml index ef1ca26064d..6337c8215bf 100644 --- 
a/.github/workflows/template-e2e-test/action.yaml +++ b/.github/workflows/template-e2e-test/action.yaml @@ -21,6 +21,15 @@ inputs: required: false description: mysql or postgres default: mysql + install-kfp: + required: false + description: whether kubeflow pipelines is required + as a dependency. If so provide version as string (eg 1.8.1) + default: false + experiment-namespace: + required: false + description: namespace to execute test experiment in + default: default runs: using: composite @@ -31,8 +40,8 @@ runs: - name: Setup Katib shell: bash - run: ./test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh ${{ inputs.katib-ui }} ${{ inputs.training-operator }} ${{ inputs.database-type }} + run: ./test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh ${{ inputs.katib-ui }} ${{ inputs.training-operator }} ${{ inputs.database-type }} ${{ inputs.install-kfp }} - name: Run E2E Experiment shell: bash - run: ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }} + run: ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }} ${{ inputs.experiment-namespace }} diff --git a/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/Dockerfile b/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/Dockerfile new file mode 100644 index 00000000000..9d7722e5f30 --- /dev/null +++ b/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/Dockerfile @@ -0,0 +1,24 @@ +FROM python:3.10-slim + +ARG TARGETARCH +ENV TARGET_DIR /opt/katib +ENV METRICS_COLLECTOR_DIR cmd/metricscollector/v1beta1/kfp-metricscollector/v1 +ENV PYTHONPATH ${TARGET_DIR}:${TARGET_DIR}/pkg/apis/manager/v1beta1/python:${TARGET_DIR}/pkg/metricscollector/v1beta1/kfp-metricscollector/v1::${TARGET_DIR}/pkg/metricscollector/v1beta1/common/ + +ADD ./pkg/ ${TARGET_DIR}/pkg/ +ADD ./${METRICS_COLLECTOR_DIR}/ ${TARGET_DIR}/${METRICS_COLLECTOR_DIR}/ + +WORKDIR ${TARGET_DIR}/${METRICS_COLLECTOR_DIR} + +RUN if [ "${TARGETARCH}" = "arm64" ]; then \ + apt-get -y update && \ + 
apt-get -y install gfortran libpcre3 libpcre3-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/*; \ + fi + +RUN pip install --no-cache-dir -r requirements.txt +RUN chgrp -R 0 ${TARGET_DIR} \ + && chmod -R g+rwX ${TARGET_DIR} + +ENTRYPOINT ["python", "main.py"] diff --git a/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/main.py b/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/main.py new file mode 100644 index 00000000000..333e70553eb --- /dev/null +++ b/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/main.py @@ -0,0 +1,101 @@ +# Copyright 2023 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +from logging import INFO, StreamHandler, getLogger + +import api_pb2 +import const +import grpc +from metrics_loader import MetricsCollector +from pns import WaitMainProcesses + +timeout_in_seconds = 60 + + +def parse_options(): + parser = argparse.ArgumentParser( + description="KFP V1 MetricsCollector", add_help=True + ) + + # TODO (andreyvelich): Add early stopping flags. 
+ parser.add_argument("-s-db", "--db_manager_server_addr", type=str, default="") + parser.add_argument("-t", "--pod_name", type=str, default="") + parser.add_argument( + "-path", + "--metrics_file_dir", + type=str, + default=const.DEFAULT_METRICS_FILE_KFPV1_DIR, + ) + parser.add_argument("-m", "--metric_names", type=str, default="") + parser.add_argument("-o-type", "--objective_type", type=str, default="") + parser.add_argument("-f", "--metric_filters", type=str, default="") + parser.add_argument( + "-p", "--poll_interval", type=int, default=const.DEFAULT_POLL_INTERVAL + ) + parser.add_argument( + "-timeout", "--timeout", type=int, default=const.DEFAULT_TIMEOUT + ) + parser.add_argument( + "-w", "--wait_all_processes", type=str, default=const.DEFAULT_WAIT_ALL_PROCESSES + ) + opt = parser.parse_args() + return opt + + +if __name__ == "__main__": + logger = getLogger(__name__) + handler = StreamHandler() + handler.setLevel(INFO) + logger.setLevel(INFO) + logger.addHandler(handler) + logger.propagate = False + opt = parse_options() + wait_all_processes = opt.wait_all_processes.lower() == "true" + db_manager_server = opt.db_manager_server_addr.split(":") + trial_name = "-".join(opt.pod_name.split("-")[:-1]) + if len(db_manager_server) != 2: + raise Exception( + "Invalid Katib DB manager service address: %s" % opt.db_manager_server_addr + ) + + WaitMainProcesses( + pool_interval=opt.poll_interval, + timout=opt.timeout, + wait_all=wait_all_processes, + completed_marked_dir=None, + ) + + mc = MetricsCollector(opt.metric_names.split(";")) + observation_log = mc.parse_file(opt.metrics_file_dir) + + channel = grpc.beta.implementations.insecure_channel( + db_manager_server[0], int(db_manager_server[1]) + ) + + with api_pb2.beta_create_DBManager_stub(channel) as client: + logger.info( + "In " + + trial_name + + " " + + str(len(observation_log.metric_logs)) + + " metrics will be reported." 
+ ) + client.ReportObservationLog( + api_pb2.ReportObservationLogRequest( + trial_name=trial_name, observation_log=observation_log + ), + timeout=timeout_in_seconds, + ) diff --git a/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/requirements.txt b/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/requirements.txt new file mode 100644 index 00000000000..b73a43f3fba --- /dev/null +++ b/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/requirements.txt @@ -0,0 +1,5 @@ +psutil==5.9.4 +rfc3339>=6.2 +grpcio==1.41.1 +googleapis-common-protos==1.6.0 +protobuf==3.20.0 diff --git a/examples/v1beta1/kubeflow-pipelines/README.md b/examples/v1beta1/kubeflow-pipelines/README.md index df1e2bf0041..0c2ff8b6956 100644 --- a/examples/v1beta1/kubeflow-pipelines/README.md +++ b/examples/v1beta1/kubeflow-pipelines/README.md @@ -3,6 +3,10 @@ The following examples show how to use Katib with [Kubeflow Pipelines](https://github.com/kubeflow/pipelines). +Two different aspects are illustrated here: +- A) How to orchestrate Katib experiments from Kubeflow pipelines using the Katib Kubeflow Component (Example 1 & 2) +- B) How to use Katib to tune parameters of Kubeflow pipelines (Example 3) + You can find the Katib Component source code for the Kubeflow Pipelines [here](https://github.com/kubeflow/pipelines/tree/master/components/kubeflow/katib-launcher). @@ -13,6 +17,8 @@ You have to install the following Python SDK to run these examples: - [`kfp`](https://pypi.org/project/kfp/) >= 1.8.12 - [`kubeflow-katib`](https://pypi.org/project/kubeflow-katib/) >= 0.13.0 +In order to run parameter tuning over Kubeflow pipelines, Katib additionally needs to be set up to run with Argo Workflow tasks. The setup is described in the example notebook (3). + ## Multi-User Pipelines Setup The Notebooks examples run Pipelines in multi-user mode and your Kubeflow Notebook @@ -25,10 +31,12 @@ to give an access Kubeflow Notebook to run Kubeflow Pipelines.
The following Pipelines are deployed from Kubeflow Notebook: -- [Kubeflow E2E MNIST](kubeflow-e2e-mnist.ipynb) +1) [Kubeflow E2E MNIST](kubeflow-e2e-mnist.ipynb) + +2) [Katib Experiment with Early Stopping](early-stopping.ipynb) -- [Katib Experiment with Early Stopping](early-stopping.ipynb) +3) [Tune parameters of a `MNIST` kubeflow pipeline with Katib](kubeflow-kfpv1-opt-mnist.ipynb) -The following Pipelines have to be compiled and uploaded to the Kubeflow Pipelines UI: +The following Pipelines have to be compiled and uploaded to the Kubeflow Pipelines UI for examples 1 & 2: - [MPIJob Horovod](mpi-job-horovod.py) diff --git a/examples/v1beta1/kubeflow-pipelines/katib-kfp-example-e2e-v1.yaml b/examples/v1beta1/kubeflow-pipelines/katib-kfp-example-e2e-v1.yaml new file mode 100644 index 00000000000..18d6683825c --- /dev/null +++ b/examples/v1beta1/kubeflow-pipelines/katib-kfp-example-e2e-v1.yaml @@ -0,0 +1,374 @@ +apiVersion: kubeflow.org/v1beta1 +kind: Experiment +metadata: + name: katib-e2e-2023-07-20-22h-37m-57s + namespace: kubeflow +spec: + algorithm: + algorithmName: random + maxFailedTrialCount: 2 + maxTrialCount: 5 + metricsCollectorSpec: + collector: + customCollector: + args: + - -m + - val-accuracy;accuracy + - -s + - katib-db-manager.kubeflow:6789 + - -t + - $(PodName) + - -path + - /tmp/outputs/mlpipeline_metrics + env: + - name: PodName + valueFrom: + fieldRef: + fieldPath: metadata.name + image: docker.io/kubeflowkatib/kfpv1-metrics-collector:latest + imagePullPolicy: Always + name: custom-metrics-logger-and-collector + kind: Custom + source: + fileSystemPath: + kind: File + path: /tmp/outputs/mlpipeline_metrics/data + objective: + additionalMetricNames: + - accuracy + goal: 0.9 + objectiveMetricName: val-accuracy + type: maximize + parallelTrialCount: 5 + parameters: + - feasibleSpace: + max: '0.001' + min: '0.00001' + name: learning_rate + parameterType: double + - feasibleSpace: + max: '64' + min: '16' + name: batch_size + parameterType: int + -
feasibleSpace: + list: + - '0' + - '1' + name: histogram_norm + parameterType: discrete + trialTemplate: + failureCondition: status.[@this].#(phase=="Failed")# + primaryContainerName: main + primaryPodLabels: + katib.kubeflow.org/model-training: 'true' + retain: false + successCondition: status.[@this].#(phase=="Succeeded")# + trialParameters: + - description: Learning rate for the training model + name: learningRate + reference: learning_rate + - description: Batch size for NN training + name: batchSize + reference: batch_size + - description: Histogram normalization of image on? + name: histogramNorm + reference: histogram_norm + trialSpec: + apiVersion: argoproj.io/v1alpha1 + kind: Workflow + metadata: + annotations: + pipelines.kubeflow.org/kfp_sdk_version: 1.8.12 + pipelines.kubeflow.org/pipeline_compilation_time: '2023-07-20T22:37:57.355215' + pipelines.kubeflow.org/pipeline_spec: '{"inputs": [{"default": "0.0001", + "name": "lr", "optional": true, "type": "Float"}, {"default": "Adam", + "name": "optimizer", "optional": true, "type": "String"}, {"default": + "categorical_crossentropy", "name": "loss", "optional": true, "type": + "String"}, {"default": "3", "name": "epochs", "optional": true, "type": + "Integer"}, {"default": "5", "name": "batch_size", "optional": true, "type": + "Integer"}, {"default": "False", "name": "histogram_norm", "optional": + true, "type": "Boolean"}, {"default": "${trialParameters.learningRate}", + "name": "lr"}, {"default": "${trialParameters.batchSize}", "name": "batch_size"}, + {"default": "${trialParameters.histogramNorm}", "name": "histogram_norm"}], + "name": "Minimal KFP1 pipeline for e2e testing"}' + generateName: minimal-kfp1-pipeline-for-e2e-testing- + labels: + pipelines.kubeflow.org/kfp_sdk_version: 1.8.12 + spec: + arguments: + parameters: + - name: lr + value: ${trialParameters.learningRate} + - name: optimizer + value: Adam + - name: loss + value: categorical_crossentropy + - name: epochs + value: '3' + - name: 
batch_size + value: ${trialParameters.batchSize} + - name: histogram_norm + value: ${trialParameters.histogramNorm} + entrypoint: minimal-kfp1-pipeline-for-e2e-testing + serviceAccountName: pipeline-runner + templates: + - dag: + tasks: + - arguments: + parameters: + - name: histogram_norm + value: '{{inputs.parameters.histogram_norm}}' + name: prep-e2e + template: prep-e2e + - arguments: + artifacts: + - from: '{{tasks.prep-e2e.outputs.artifacts.prep-e2e-output_nr}}' + name: prep-e2e-output_nr + parameters: + - name: batch_size + value: '{{inputs.parameters.batch_size}}' + - name: epochs + value: '{{inputs.parameters.epochs}}' + - name: loss + value: '{{inputs.parameters.loss}}' + - name: lr + value: '{{inputs.parameters.lr}}' + - name: optimizer + value: '{{inputs.parameters.optimizer}}' + dependencies: + - prep-e2e + name: train-e2e + template: train-e2e + inputs: + parameters: + - name: batch_size + - name: epochs + - name: histogram_norm + - name: loss + - name: lr + - name: optimizer + name: minimal-kfp1-pipeline-for-e2e-testing + - container: + args: + - --histogram-norm + - '{{inputs.parameters.histogram_norm}}' + - --output-nr + - /tmp/outputs/output_nr/data + command: + - sh + - -ec + - 'program_path=$(mktemp) + + printf "%s" "$0" > "$program_path" + + python3 -u "$program_path" "$@" + + ' + - "def _make_parent_dirs_and_return_path(file_path: str):\n import\ + \ os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n \ + \ return file_path\n\ndef prep_e2e(\n output_nr_path, # type:\ + \ ignore # noqa: F821\n histogram_norm = True,\n):\n with open(output_nr_path,\ + \ 'w') as writer:\n writer.write(str(int(histogram_norm)))\n\n\ + def _deserialize_bool(s) -> bool:\n from distutils.util import strtobool\n\ + \ return strtobool(s) == 1\n\nimport argparse\n_parser = argparse.ArgumentParser(prog='Prep\ + \ e2e', description='')\n_parser.add_argument(\"--histogram-norm\",\ + \ dest=\"histogram_norm\", type=_deserialize_bool, required=False, 
default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--output-nr\", dest=\"output_nr_path\", type=_make_parent_dirs_and_return_path,\ + \ required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\ + \n_outputs = prep_e2e(**_parsed_args)\n" + image: python:3.7 + inputs: + parameters: + - name: histogram_norm + metadata: + annotations: + pipelines.kubeflow.org/arguments.parameters: '{"histogram_norm": "{{inputs.parameters.histogram_norm}}"}' + pipelines.kubeflow.org/component_ref: '{}' + pipelines.kubeflow.org/component_spec: '{"implementation": {"container": + {"args": [{"if": {"cond": {"isPresent": "histogram_norm"}, "then": + ["--histogram-norm", {"inputValue": "histogram_norm"}]}}, "--output-nr", + {"outputPath": "output_nr"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf + \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", + "def _make_parent_dirs_and_return_path(file_path: str):\n import + os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return + file_path\n\ndef prep_e2e(\n output_nr_path, # type: ignore # + noqa: F821\n histogram_norm = True,\n):\n with open(output_nr_path, + ''w'') as writer:\n writer.write(str(int(histogram_norm)))\n\ndef + _deserialize_bool(s) -> bool:\n from distutils.util import strtobool\n return + strtobool(s) == 1\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Prep + e2e'', description='''')\n_parser.add_argument(\"--histogram-norm\", + dest=\"histogram_norm\", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--output-nr\", + dest=\"output_nr_path\", type=_make_parent_dirs_and_return_path, required=True, + default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs + = prep_e2e(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": + [{"default": "True", "name": "histogram_norm", "optional": true, "type": + "Boolean"}], "name": "Prep e2e", "outputs": [{"name": "output_nr", + 
"type": "Integer"}]}' + pipelines.kubeflow.org/task_display_name: Prepare a dummy output that + should be cached + labels: + pipelines.kubeflow.org/cache_enabled: 'true' + pipelines.kubeflow.org/enable_caching: 'true' + pipelines.kubeflow.org/kfp_sdk_version: 1.8.12 + pipelines.kubeflow.org/pipeline-sdk-type: kfp + name: prep-e2e + outputs: + artifacts: + - name: prep-e2e-output_nr + path: /tmp/outputs/output_nr/data + - container: + args: + - --input-nr + - /tmp/inputs/input_nr/data + - --lr + - '{{inputs.parameters.lr}}' + - --optimizer + - '{{inputs.parameters.optimizer}}' + - --loss + - '{{inputs.parameters.loss}}' + - --epochs + - '{{inputs.parameters.epochs}}' + - --batch-size + - '{{inputs.parameters.batch_size}}' + - --mlpipeline-metrics + - /tmp/outputs/mlpipeline_metrics/data + command: + - sh + - -ec + - 'program_path=$(mktemp) + + printf "%s" "$0" > "$program_path" + + python3 -u "$program_path" "$@" + + ' + - "def _make_parent_dirs_and_return_path(file_path: str):\n import\ + \ os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n \ + \ return file_path\n\ndef train_e2e(\n input_nr_path, # type:\ + \ ignore # noqa: F821\n mlpipeline_metrics_path, # type: ignore\ + \ # noqa: F821\n lr = 1e-4,\n optimizer = \"Adam\",\n loss\ + \ = \"categorical_crossentropy\",\n epochs = 1,\n batch_size =\ + \ 32,\n):\n \"\"\"\n This is the simulated train part of our ML\ + \ pipeline where training is performed\n \"\"\"\n import json\ + \ \n import time\n with open(input_nr_path, 'r') as reader:\n\ + \ line = reader.readline()\n histogram_norm_value = int(line)\n\ + \n accuracy = (batch_size + histogram_norm_value)/ (batch_size +\ + \ epochs+histogram_norm_value)\n val_accuracy = accuracy * 0.9\n\ + \ metrics = {\n \"metrics\": [\n {\n \ + \ \"name\": \"accuracy\", # The name of the metric. Visualized\ + \ as the column name in the runs table.\n \"numberValue\"\ + : accuracy, # The value of the metric. 
Must be a numeric value.\n \ + \ \"format\": \"PERCENTAGE\", # The optional format of\ + \ the metric. Supported values are \"RAW\" (displayed in raw format)\ + \ and \"PERCENTAGE\" (displayed in percentage format).\n \ + \ },\n {\n \"name\": \"val-accuracy\", #\ + \ The name of the metric. Visualized as the column name in the runs\ + \ table.\n \"numberValue\": val_accuracy, # The value\ + \ of the metric. Must be a numeric value.\n \"format\"\ + : \"PERCENTAGE\", # The optional format of the metric. Supported values\ + \ are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed\ + \ in percentage format).\n },\n ]\n }\n with\ + \ open(mlpipeline_metrics_path, \"w\") as f:\n json.dump(metrics,\ + \ f)\n\n # If this step is to fast, the metrics collector fails as\ + \ the\n # pod is already finished before it can collect the metrics.\n\ + \ time.sleep(10)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog='Train\ + \ e2e', description='This is the simulated train part of our ML pipeline\ + \ where training is performed')\n_parser.add_argument(\"--input-nr\"\ + , dest=\"input_nr_path\", type=str, required=True, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--lr\", dest=\"lr\", type=float, required=False,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--optimizer\",\ + \ dest=\"optimizer\", type=str, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--loss\", dest=\"loss\", type=str, required=False,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--epochs\", dest=\"\ + epochs\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"\ + --batch-size\", dest=\"batch_size\", type=int, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--mlpipeline-metrics\", dest=\"mlpipeline_metrics_path\"\ + , type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n\ + _parsed_args = vars(_parser.parse_args())\n\n_outputs = train_e2e(**_parsed_args)\n" 
+ image: python:3.7 + inputs: + artifacts: + - name: prep-e2e-output_nr + path: /tmp/inputs/input_nr/data + parameters: + - name: batch_size + - name: epochs + - name: loss + - name: lr + - name: optimizer + metadata: + annotations: + pipelines.kubeflow.org/arguments.parameters: '{"batch_size": "{{inputs.parameters.batch_size}}", + "epochs": "{{inputs.parameters.epochs}}", "loss": "{{inputs.parameters.loss}}", + "lr": "{{inputs.parameters.lr}}", "optimizer": "{{inputs.parameters.optimizer}}"}' + pipelines.kubeflow.org/component_ref: '{}' + pipelines.kubeflow.org/component_spec: '{"description": "This is the + simulated train part of our ML pipeline where training is performed", + "implementation": {"container": {"args": ["--input-nr", {"inputPath": + "input_nr"}, {"if": {"cond": {"isPresent": "lr"}, "then": ["--lr", + {"inputValue": "lr"}]}}, {"if": {"cond": {"isPresent": "optimizer"}, + "then": ["--optimizer", {"inputValue": "optimizer"}]}}, {"if": {"cond": + {"isPresent": "loss"}, "then": ["--loss", {"inputValue": "loss"}]}}, + {"if": {"cond": {"isPresent": "epochs"}, "then": ["--epochs", {"inputValue": + "epochs"}]}}, {"if": {"cond": {"isPresent": "batch_size"}, "then": + ["--batch-size", {"inputValue": "batch_size"}]}}, "--mlpipeline-metrics", + {"outputPath": "mlpipeline_metrics"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf + \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", + "def _make_parent_dirs_and_return_path(file_path: str):\n import + os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return + file_path\n\ndef train_e2e(\n input_nr_path, # type: ignore # + noqa: F821\n mlpipeline_metrics_path, # type: ignore # noqa: F821\n lr + = 1e-4,\n optimizer = \"Adam\",\n loss = \"categorical_crossentropy\",\n epochs + = 1,\n batch_size = 32,\n):\n \"\"\"\n This is the simulated + train part of our ML pipeline where training is performed\n \"\"\"\n import + json \n import time\n with open(input_nr_path, ''r'') as 
reader:\n line + = reader.readline()\n histogram_norm_value = int(line)\n\n accuracy + = (batch_size + histogram_norm_value)/ (batch_size + epochs+histogram_norm_value)\n val_accuracy + = accuracy * 0.9\n metrics = {\n \"metrics\": [\n {\n \"name\": + \"accuracy\", # The name of the metric. Visualized as the column + name in the runs table.\n \"numberValue\": accuracy, # + The value of the metric. Must be a numeric value.\n \"format\": + \"PERCENTAGE\", # The optional format of the metric. Supported values + are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed + in percentage format).\n },\n {\n \"name\": + \"val-accuracy\", # The name of the metric. Visualized as the column + name in the runs table.\n \"numberValue\": val_accuracy, # + The value of the metric. Must be a numeric value.\n \"format\": + \"PERCENTAGE\", # The optional format of the metric. Supported values + are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed + in percentage format).\n },\n ]\n }\n with + open(mlpipeline_metrics_path, \"w\") as f:\n json.dump(metrics, + f)\n\n # If this step is to fast, the metrics collector fails as + the\n # pod is already finished before it can collect the metrics.\n time.sleep(10)\n\nimport + argparse\n_parser = argparse.ArgumentParser(prog=''Train e2e'', description=''This + is the simulated train part of our ML pipeline where training is performed'')\n_parser.add_argument(\"--input-nr\", + dest=\"input_nr_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--lr\", + dest=\"lr\", type=float, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--optimizer\", + dest=\"optimizer\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--loss\", + dest=\"loss\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--epochs\", + dest=\"epochs\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--batch-size\", + 
dest=\"batch_size\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--mlpipeline-metrics\", + dest=\"mlpipeline_metrics_path\", type=_make_parent_dirs_and_return_path, + required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs + = train_e2e(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": + [{"name": "input_nr", "type": "Integer"}, {"default": "0.0001", "name": + "lr", "optional": true, "type": "Float"}, {"default": "Adam", "name": + "optimizer", "optional": true, "type": "String"}, {"default": "categorical_crossentropy", + "name": "loss", "optional": true, "type": "String"}, {"default": "1", + "name": "epochs", "optional": true, "type": "Integer"}, {"default": + "32", "name": "batch_size", "optional": true, "type": "Integer"}], + "name": "Train e2e", "outputs": [{"name": "mlpipeline_metrics", "type": + "Metrics"}]}' + pipelines.kubeflow.org/max_cache_staleness: P0D + pipelines.kubeflow.org/task_display_name: Generate dummy metrics + labels: + katib.kubeflow.org/model-training: 'true' + pipelines.kubeflow.org/enable_caching: 'true' + pipelines.kubeflow.org/kfp_sdk_version: 1.8.12 + pipelines.kubeflow.org/pipeline-sdk-type: kfp + name: train-e2e + outputs: + artifacts: + - name: mlpipeline-metrics + path: /tmp/outputs/mlpipeline_metrics/data diff --git a/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb b/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb new file mode 100644 index 00000000000..6efb26732a0 --- /dev/null +++ b/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb @@ -0,0 +1,1972 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Katib parameter tuning over Kubeflow Pipelines (V1)\n", + "\n", + "This example shows how parameter tuning can be done over a multistep Kubeflow pipeline.\n", + "\n", + "The pipeline consists of 4 steps:\n", + "- Download of the training images and
labels from the original MNIST publication\n", + "- Preparation of the training dataset\n", + "- Image pre-processing\n", + "- Model fitting\n", + "\n", + "The pipeline exposes model fitting parameters as well as image pre-processing parameters as pipeline parameters for tuning. Katib will be used to explore the question of whether image pre-processing using a simple histogram normalization might improve neural network training on MNIST.\n", + "\n", + "## Requirements\n", + "\n", + "This requires a Kubeflow installation with Katib and Pipelines.\n", + "\n", + "Additionally the Katib-Argo integration needs to be set up:\n", + "\n", + "If you are running on a full Kubeflow installation *do not reinstall or update Argo* as this will likely break your installation.\n", + "\n", + "Just run the following commands:\n", + "\n", + "Enable side-car injection:\n", + "\n", + "`kubectl patch namespace argo -p '{\"metadata\":{\"labels\":{\"katib.kubeflow.org/metrics-collector-injection\":\"enabled\"}}}'`\n", + "\n", + "\n", + "Verify that the emissary executor is active (should be default in newer Kubeflow installations):\n", + "\n", + "` kubectl get ConfigMap -n argo workflow-controller-configmap -o yaml | grep containerRuntimeExecutor`\n", + "\n", + "Patch the Katib controller:\n", + "\n", + "`kubectl patch ClusterRole katib-controller -n kubeflow --type=json \\\n", + " -p='[{\"op\": \"add\", \"path\": \"/rules/-\", \"value\": {\"apiGroups\":[\"argoproj.io\"],\"resources\":[\"workflows\"],\"verbs\":[\"get\", \"list\", \"watch\", \"create\", \"delete\"]}}]'\n", + "`\n", + "\n", + "`kubectl patch Deployment katib-controller -n kubeflow --type=json \\\n", + " -p='[{\"op\": \"add\", \"path\": \"/spec/template/spec/containers/0/args/-\", \"value\": \"--trial-resources=Workflow.v1alpha1.argoproj.io\"}]'`\n", + "\n", + "For more details and how to set this up on a partial Kubeflow installation follow:\n", +
"https://github.com/kubeflow/katib/tree/master/examples/v1beta1/argo/README.md" + ] + }, + { + "attachments": { + "image.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Building the base Kubeflow pipeline\n", + "\n", + "The next steps will build up the following Kubeflow pipeline:\n", + "\n", + "![image.png](attachment:image.png)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set default variables\n", + "\n", + "The following default variables should be changed when running the notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "# Namespace to run the workloads under\n", + "USER_NAMESPACE = \"kubeflow\" # On a full installation this would be your user namespace\n", + "# Pipeline service account\n", + "# On a Kubeflow instance on GCP this should be 'default-editor'\n", + "KFP_SERVICE_ACCOUNT = \"pipeline-runner\"\n", + "\n", + "\n", + "# Cosmetic variables\n", + "# Pipeline run variables\n", + "KFP_EXPERIMENT = \"katib-kfp-example\"\n", + "KFP_RUN = \"mnist-pipeline-v1\"\n", + "\n", + "# Katib run variables\n", + "KATIB_EXPERIMENT = \"katib-kfp-example-v1\"\n", + "KATIB_E2E_EXPERIMENT = \"katib-kfp-example-e2e-v1\"\n", + "KATIB_WORKLFLOW_COLLECTOR_IMAGE = \"docker.io/kubeflowkatib/kfpv1-metrics-collector:latest\" #\"docker.io/votti/kfpv1-metricscollector:v0.0.10\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Install and load required python packages" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: kfp==1.8.12 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (1.8.12)\n", + "Requirement already satisfied: absl-py<2,>=0.9 in
/home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (1.4.0)\n", + "Requirement already satisfied: PyYAML<6,>=5.3 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (5.4.1)\n", + "Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (2.10.2)\n", + "Requirement already satisfied: google-cloud-storage<2,>=1.20.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (1.44.0)\n", + "Requirement already satisfied: kubernetes<19,>=8.0.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (18.20.0)\n", + "Requirement already satisfied: google-api-python-client<2,>=1.7.8 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (1.12.11)\n", + "Requirement already satisfied: google-auth<2,>=1.6.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (1.35.0)\n", + "Requirement already satisfied: requests-toolbelt<1,>=0.8.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (0.10.1)\n", + "Requirement already satisfied: cloudpickle<3,>=2.0.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (2.2.1)\n", + "Requirement already satisfied: kfp-server-api<2.0.0,>=1.1.2 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (1.8.5)\n", + "Requirement already satisfied: jsonschema<4,>=3.0.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (3.2.0)\n", + "Requirement already satisfied: tabulate<1,>=0.8.6 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (0.9.0)\n", + "Requirement already satisfied: click<9,>=7.1.2 in 
/home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (8.1.3)\n", + "Requirement already satisfied: Deprecated<2,>=1.2.7 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (1.2.14)\n", + "Requirement already satisfied: strip-hints<1,>=0.1.8 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (0.1.10)\n", + "Requirement already satisfied: docstring-parser<1,>=0.7.3 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (0.15)\n", + "Requirement already satisfied: kfp-pipeline-spec<0.2.0,>=0.1.14 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (0.1.16)\n", + "Requirement already satisfied: fire<1,>=0.3.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (0.5.0)\n", + "Requirement already satisfied: protobuf<4,>=3.13.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (3.20.3)\n", + "Requirement already satisfied: uritemplate<4,>=3.0.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (3.0.1)\n", + "Requirement already satisfied: pydantic<2,>=1.8.2 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (1.10.9)\n", + "Requirement already satisfied: typer<1.0,>=0.3.2 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (0.9.0)\n", + "Requirement already satisfied: wrapt<2,>=1.10 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from Deprecated<2,>=1.2.7->kfp==1.8.12) (1.15.0)\n", + "Requirement already satisfied: six in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from fire<1,>=0.3.1->kfp==1.8.12) (1.16.0)\n", + "Requirement already satisfied: termcolor in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from fire<1,>=0.3.1->kfp==1.8.12) (2.3.0)\n", + 
"Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.56.2 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp==1.8.12) (1.59.1)\n", + "Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp==1.8.12) (2.31.0)\n", + "Requirement already satisfied: httplib2<1dev,>=0.15.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-api-python-client<2,>=1.7.8->kfp==1.8.12) (0.22.0)\n", + "Requirement already satisfied: google-auth-httplib2>=0.0.3 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-api-python-client<2,>=1.7.8->kfp==1.8.12) (0.1.0)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-auth<2,>=1.6.1->kfp==1.8.12) (4.2.4)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-auth<2,>=1.6.1->kfp==1.8.12) (0.3.0)\n", + "Requirement already satisfied: setuptools>=40.3.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-auth<2,>=1.6.1->kfp==1.8.12) (67.7.2)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-auth<2,>=1.6.1->kfp==1.8.12) (4.9)\n", + "Requirement already satisfied: google-cloud-core<3.0dev,>=1.6.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-cloud-storage<2,>=1.20.0->kfp==1.8.12) (2.3.2)\n", + "Requirement already satisfied: google-resumable-media<3.0dev,>=1.3.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-cloud-storage<2,>=1.20.0->kfp==1.8.12) (2.5.0)\n", + "Requirement already 
satisfied: attrs>=17.4.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from jsonschema<4,>=3.0.1->kfp==1.8.12) (23.1.0)\n", + "Requirement already satisfied: pyrsistent>=0.14.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from jsonschema<4,>=3.0.1->kfp==1.8.12) (0.19.3)\n", + "Requirement already satisfied: urllib3>=1.15 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp==1.8.12) (1.26.15)\n", + "Requirement already satisfied: certifi in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp==1.8.12) (2023.5.7)\n", + "Requirement already satisfied: python-dateutil in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp==1.8.12) (2.8.2)\n", + "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubernetes<19,>=8.0.0->kfp==1.8.12) (1.6.0)\n", + "Requirement already satisfied: requests-oauthlib in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubernetes<19,>=8.0.0->kfp==1.8.12) (1.3.1)\n", + "Requirement already satisfied: typing-extensions>=4.2.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from pydantic<2,>=1.8.2->kfp==1.8.12) (4.6.3)\n", + "Requirement already satisfied: wheel in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from strip-hints<1,>=0.1.8->kfp==1.8.12) (0.40.0)\n", + "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-resumable-media<3.0dev,>=1.3.0->google-cloud-storage<2,>=1.20.0->kfp==1.8.12) (1.5.0)\n", + "Requirement already satisfied: pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from 
httplib2<1dev,>=0.15.0->google-api-python-client<2,>=1.7.8->kfp==1.8.12) (3.1.0)\n", + "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.1->kfp==1.8.12) (0.5.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp==1.8.12) (3.1.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp==1.8.12) (3.4)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from requests-oauthlib->kubernetes<19,>=8.0.0->kfp==1.8.12) (3.2.2)\n", + "Requirement already satisfied: kubeflow-katib==0.13.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (0.13.0)\n", + "Requirement already satisfied: certifi>=14.05.14 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubeflow-katib==0.13.0) (2023.5.7)\n", + "Requirement already satisfied: six>=1.10 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubeflow-katib==0.13.0) (1.16.0)\n", + "Requirement already satisfied: setuptools>=21.0.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubeflow-katib==0.13.0) (67.7.2)\n", + "Requirement already satisfied: urllib3>=1.15.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubeflow-katib==0.13.0) (1.26.15)\n", + "Requirement already satisfied: kubernetes>=12.0.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubeflow-katib==0.13.0) (18.20.0)\n", + "Requirement already satisfied: python-dateutil>=2.5.3 in 
/home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubernetes>=12.0.0->kubeflow-katib==0.13.0) (2.8.2)\n", + "Requirement already satisfied: pyyaml>=5.4.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubernetes>=12.0.0->kubeflow-katib==0.13.0) (5.4.1)\n", + "Requirement already satisfied: google-auth>=1.0.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubernetes>=12.0.0->kubeflow-katib==0.13.0) (1.35.0)\n", + "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubernetes>=12.0.0->kubeflow-katib==0.13.0) (1.6.0)\n", + "Requirement already satisfied: requests in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubernetes>=12.0.0->kubeflow-katib==0.13.0) (2.31.0)\n", + "Requirement already satisfied: requests-oauthlib in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubernetes>=12.0.0->kubeflow-katib==0.13.0) (1.3.1)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-auth>=1.0.1->kubernetes>=12.0.0->kubeflow-katib==0.13.0) (4.2.4)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-auth>=1.0.1->kubernetes>=12.0.0->kubeflow-katib==0.13.0) (0.3.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-auth>=1.0.1->kubernetes>=12.0.0->kubeflow-katib==0.13.0) (4.9)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from requests->kubernetes>=12.0.0->kubeflow-katib==0.13.0) (3.1.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages 
(from requests->kubernetes>=12.0.0->kubeflow-katib==0.13.0) (3.4)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from requests-oauthlib->kubernetes>=12.0.0->kubeflow-katib==0.13.0) (3.2.2)\n", + "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=12.0.0->kubeflow-katib==0.13.0) (0.5.0)\n" + ] + } + ], + "source": [ + "# Install required packages (Kubeflow Pipelines and Katib SDK).\n", + "!pip install kfp==1.8.12\n", + "!pip install kubeflow-katib==0.13.0" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Optional\n", + "from datetime import datetime as dt\n", + "import kfp\n", + "import kfp.components as components\n", + "import kfp.dsl as dsl\n", + "from kfp.components import InputPath, OutputPath, create_component_from_func" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialize the Kubeflow pipeline client\n", + "\n", + "Documentation how this is done in various environments: https://www.kubeflow.org/docs/components/pipelines/v1/sdk/connect-api/" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "kfp_client = kfp.Client()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get the downloader component\n", + "\n", + "This is a publicly available, generic downloader we use to download the raw MNIST data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "download_data_op = components.load_component_from_url(\n", + " \"https://raw.githubusercontent.com/kubeflow/pipelines/master/components/contrib/web/Download/component.yaml\"\n", + ")" + ] + }, + { + 
"attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parse the MNIST raw data format\n", + "\n", + "This is a component from text that converts the raw MNIST data format into a tensorflow compatible format." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "parse_mnist_op = components.load_component_from_text(\n", + " \"\"\"\n", + "name: Parse MNIST\n", + "inputs:\n", + "- {name: Images, description: gziped images in the idx format}\n", + "- {name: Labels, description: gziped labels in the idx format}\n", + "outputs:\n", + "- {name: Dataset}\n", + "metadata:\n", + " annotations:\n", + " author: Vito Zanotelli, D-ONE.ai\n", + " description: Based on https://github.com/kubeflow/pipelines/blob/master/components/contrib/sample/Python_script/component.yaml\n", + "implementation:\n", + " container:\n", + " image: tensorflow/tensorflow:2.7.1\n", + " command:\n", + " - sh\n", + " - -ec\n", + " - |\n", + " # This is how additional packages can be installed dynamically\n", + " python3 -m pip install pip idx2numpy\n", + " # Run the rest of the command after installing the packages.\n", + " \"$0\" \"$@\"\n", + " - python3\n", + " - -u # Auto-flush. 
We want the logs to appear in the console immediately.\n", + " - -c # Inline scripts are easy, but have size limitaions and the error traces do not show source lines.\n", + " - |\n", + " import gzip\n", + " import idx2numpy\n", + " import sys\n", + " from pathlib import Path\n", + " import pickle\n", + " import tensorflow as tf\n", + " img_path = sys.argv[1]\n", + " label_path = sys.argv[2]\n", + " output_path = sys.argv[3]\n", + " with gzip.open(img_path, 'rb') as f:\n", + " x = idx2numpy.convert_from_string(f.read())\n", + " with gzip.open(label_path, 'rb') as f:\n", + " y = idx2numpy.convert_from_string(f.read())\n", + " #one-hot encode the categories\n", + " x_out = tf.convert_to_tensor(x)\n", + " y_out = tf.keras.utils.to_categorical(y)\n", + " Path(output_path).parent.mkdir(parents=True, exist_ok=True)\n", + " with open(output_path, 'wb') as output_file:\n", + " pickle.dump((x_out, y_out), output_file)\n", + " - {inputPath: Images}\n", + " - {inputPath: Labels}\n", + " - {outputPath: Dataset}\n", + "\"\"\"\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Process the images\n", + "\n", + "This does the pre-processing of the images, including a training-validation split.\n", + "\n", + "Here also an optional `histogram_norm` image normalization step can be activated" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def process(\n", + " data_raw_path: InputPath(str), # type: ignore\n", + " data_processed_path: OutputPath(str), # type: ignore\n", + " val_pct: float = 0.2,\n", + " trainset_flag: bool = True,\n", + " histogram_norm: bool = False,\n", + "):\n", + " \"\"\"\n", + " Here we do all the preprocessing\n", + " if the data path is for training data we:\n", + " (1) Normalize the data\n", + " (2) split the train and val data\n", + " If it is for unseen test data, we:\n", + " (1) Normalize the data\n", + " This function returns in any case 
the processed data path\n", + " \"\"\"\n", + " # sklearn\n", + " import pickle\n", + " from sklearn.model_selection import train_test_split\n", + " import tensorflow as tf\n", + " import tensorflow_addons as tfa\n", + "\n", + " def img_norm(x):\n", + " x_ = tf.reshape(x, list(x.shape) + [1])\n", + "\n", + " if histogram_norm:\n", + " x_ = tfa.image.equalize(x_)\n", + "\n", + " # Scale between 0-1\n", + " x_ = x_ / 255\n", + " return x_\n", + "\n", + " with open(data_raw_path, \"rb\") as f:\n", + " x, y = pickle.load(f)\n", + " if trainset_flag:\n", + "\n", + " x_ = img_norm(x)\n", + " x_train, x_val, y_train, y_val = train_test_split(\n", + " x_.numpy(), y, test_size=val_pct, stratify=y, random_state=42\n", + " )\n", + "\n", + " with open(data_processed_path, \"wb\") as output_file:\n", + " pickle.dump((x_train, y_train, x_val, y_val), output_file)\n", + "\n", + " else:\n", + " x_ = img_norm(x)\n", + " with open(data_processed_path, \"wb\") as output_file:\n", + " pickle.dump((x_, y), output_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "process_op = create_component_from_func(\n", + " func=process,\n", + " base_image=\"tensorflow/tensorflow:2.7.1\", # Optional\n", + " packages_to_install=[\"scikit-learn\", \"tensorflow-addons[tensorflow]\"], # Optional\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training component\n", + "\n", + "Component with ML hyperparameters as parameters.\n", + "Note that the `metrics` that should be tracked by Katib need to be\n", + "saved as ML metrics output artifacts.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def train(\n", + " data_train_path: InputPath(str), # type: ignore\n", + " model_out_path: OutputPath(str), # type: ignore\n", + " mlpipeline_metrics_path: OutputPath(\"Metrics\"), # type: ignore # noqa: F821\n", + " lr: float 
= 1e-4,\n", + " optimizer: str = \"Adam\",\n", + " loss: str = \"categorical_crossentropy\",\n", + " epochs: int = 1,\n", + " batch_size: int = 32,\n", + "):\n", + " \"\"\"\n", + " This is the simulated train part of our ML pipeline where training is performed\n", + " \"\"\"\n", + "\n", + " import tensorflow as tf\n", + " import pickle\n", + " from tensorflow.keras.preprocessing.image import ImageDataGenerator\n", + " import json\n", + "\n", + " with open(data_train_path, \"rb\") as f:\n", + " x_train, y_train, x_val, y_val = pickle.load(f)\n", + "\n", + " model = tf.keras.Sequential(\n", + " [\n", + " tf.keras.layers.Conv2D(\n", + " 64, (3, 3), activation=\"relu\", input_shape=(28, 28, 1)\n", + " ),\n", + " tf.keras.layers.MaxPooling2D(2, 2),\n", + " tf.keras.layers.Conv2D(64, (3, 3), activation=\"relu\"),\n", + " tf.keras.layers.MaxPooling2D(2, 2),\n", + " tf.keras.layers.Flatten(),\n", + " tf.keras.layers.Dense(128, activation=\"relu\"),\n", + " tf.keras.layers.Dense(10, activation=\"softmax\"),\n", + " ]\n", + " )\n", + "\n", + " if optimizer.lower() == \"sgd\":\n", + " optimizer = tf.keras.optimizers.SGD(lr)\n", + " else:\n", + " optimizer = tf.keras.optimizers.Adam(lr)\n", + "\n", + " model.compile(loss=loss, optimizer=optimizer, metrics=[\"accuracy\"])\n", + "\n", + " # fit the model\n", + " model_early_stopping_callback = tf.keras.callbacks.EarlyStopping(\n", + " monitor=\"val_accuracy\", patience=10, verbose=1, restore_best_weights=True\n", + " )\n", + "\n", + " train_datagen = ImageDataGenerator()\n", + "\n", + " validation_datagen = ImageDataGenerator()\n", + " history = model.fit(\n", + " train_datagen.flow(x_train, y_train, batch_size=batch_size),\n", + " epochs=epochs,\n", + " validation_data=validation_datagen.flow(x_val, y_val, batch_size=batch_size),\n", + " shuffle=False,\n", + " callbacks=[model_early_stopping_callback],\n", + " )\n", + "\n", + " model.save(model_out_path, save_format=\"tf\")\n", + "\n", + " metrics = {\n", + " \"metrics\": [\n", 
+ " {\n", + " \"name\": \"accuracy\", # The name of the metric. Visualized as the column name in the runs table.\n", + " \"numberValue\": history.history[\"accuracy\"][\n", + " -1\n", + " ], # The value of the metric. Must be a numeric value.\n", + " \"format\": \"PERCENTAGE\", # The optional format of the metric. Supported values are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed in percentage format).\n", + " },\n", + " {\n", + " \"name\": \"val-accuracy\", # The name of the metric. Visualized as the column name in the runs table.\n", + " \"numberValue\": history.history[\"val_accuracy\"][\n", + " -1\n", + " ], # The value of the metric. Must be a numeric value.\n", + " \"format\": \"PERCENTAGE\", # The optional format of the metric. Supported values are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed in percentage format).\n", + " },\n", + " ]\n", + " }\n", + " with open(mlpipeline_metrics_path, \"w\") as f:\n", + " json.dump(metrics, f)\n", + "\n", + "\n", + "train_op = create_component_from_func(\n", + " func=train, base_image=\"tensorflow/tensorflow:2.7.1\", packages_to_install=[\"scipy\"]\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Build the full pipeline\n", + "\n", + "This wires the components into a full pipeline.\n", + "\n", + "The only things required to make the pipeline Katib compatible are:\n", + "\n", + "1) A pod label to mark the pod from which the metrics tracked by Katib should be collected: \"katib.kubeflow.org/model-training\", \"true\"\n", + "2) A mark to prevent caching on this pod: `execution_options.caching_strategy.max_cache_staleness = \"P0D\"`\n", + "\n", + "In addition, currently the pod label for caching seems not to be added by default and thus the cache is not used. To enable cache usage, the cache label is added to all the steps.\n", + "\n", + "Apart from these two requirements, there is no restriction on how the pipeline is built.
The pipeline remains a normal Kubeflow pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def _label_cache(step):\n", + " \"\"\"Helper to add pod cache label\n", + "\n", + " Currently there seems to be an issue with pod labeling.\n", + " \"\"\"\n", + " step.add_pod_label(\"pipelines.kubeflow.org/cache_enabled\", \"true\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "@dsl.pipeline(\n", + " name=\"Download MNIST dataset\",\n", + " description=\"A pipeline to train MNIST classification from scratch.\",\n", + ")\n", + "def mnist_training_pipeline(\n", + " lr: float = 1e-4,\n", + " optimizer: str = \"Adam\",\n", + " loss: str = \"categorical_crossentropy\",\n", + " epochs: int = 3,\n", + " batch_size: int = 5,\n", + " histogram_norm: bool = False,\n", + "):\n", + " TRAIN_IMG_URL = \"http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\"\n", + " TRAIN_LAB_URL = \"http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\"\n", + "\n", + " train_imgs = download_data_op(TRAIN_IMG_URL)\n", + " train_imgs.set_display_name(\"Download training images\")\n", + " _label_cache(train_imgs)\n", + "\n", + " train_y = download_data_op(TRAIN_LAB_URL)\n", + " train_y.set_display_name(\"Download training labels\")\n", + " _label_cache(train_y)\n", + "\n", + " mnist_train = parse_mnist_op(train_imgs.output, train_y.output)\n", + " mnist_train.set_display_name(\"Prepare train dataset\")\n", + " _label_cache(mnist_train)\n", + "\n", + " processed_train = (\n", + " process_op(\n", + " mnist_train.output,\n", + " val_pct=0.2,\n", + " trainset_flag=True,\n", + " histogram_norm=histogram_norm,\n", + " )\n", + " .set_cpu_limit(\"1\")\n", + " .set_memory_limit(\"2Gi\")\n", + " .set_display_name(\"Preprocess images\")\n", + " )\n", + " _label_cache(processed_train)\n", + "\n", + " training_output = (\n", + " train_op(\n", + " 
processed_train.outputs[\"data_processed\"],\n", + " lr=lr,\n", + " optimizer=optimizer,\n", + " epochs=epochs,\n", + " batch_size=batch_size,\n", + " loss=loss,\n", + " )\n", + " .set_cpu_limit(\"1\")\n", + " .set_memory_limit(\"2Gi\")\n", + " )\n", + " training_output.set_display_name(\"Fit the model\")\n", + " # This pod label indicates which pod Katib should collect the metric from.\n", + " # A metrics collecting sidecar container will be added\n", + " training_output.add_pod_label(\"katib.kubeflow.org/model-training\", \"true\")\n", + " # This step needs to run always, as otherwise the metrics for Katib could not\n", + " # be collected.\n", + " training_output.execution_options.caching_strategy.max_cache_staleness = \"P0D\"\n", + "\n", + " return mnist_train.output" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Experiment details." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run details." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "kfp_run = f\"{KFP_RUN}-{dt.today().strftime('%Y-%m-%d-%Hh-%Mm-%Ss')}\"\n", + "run = kfp_client.create_run_from_pipeline_func(\n", + " mnist_training_pipeline,\n", + " mode=kfp.dsl.PipelineExecutionMode.V1_LEGACY,\n", + " # You can optionally override your pipeline_root when submitting the run too:\n", + " # pipeline_root='gs://my-pipeline-root/example-pipeline',\n", + " arguments={\"histogram_norm\": \"0\"},\n", + " experiment_name=KFP_EXPERIMENT,\n", + " run_name=kfp_run,\n", + " # In a multiuser setup, provide the namespace\n", + " #namespace=USER_NAMESPACE,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parameter tuning with Katib\n", + "\n", + "We now want to do parameter tuning over the whole pipeline with Katib.\n", + "\n", + "This requires us to build up a specification for the Katib experiment" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First import the Katib python components:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import yaml\n", + "from typing import List\n", + "\n", + "from kubernetes.client.models import V1ObjectMeta\n", + "from kubeflow.katib import ApiClient\n", + "from kubeflow.katib import KatibClient\n", + "from kubeflow.katib import V1beta1Experiment\n", + "from kubeflow.katib import V1beta1ExperimentSpec\n", + "from kubeflow.katib import V1beta1AlgorithmSpec\n", + "from kubeflow.katib import V1beta1ObjectiveSpec\n", + "from kubeflow.katib import V1beta1ParameterSpec\n", + "from kubeflow.katib import V1beta1FeasibleSpace\n", + "from kubeflow.katib import V1beta1TrialTemplate\n", + "from kubeflow.katib import V1beta1TrialParameterSpec\n", + "from kubeflow.katib import V1beta1MetricsCollectorSpec\n", + "from kubeflow.katib import V1beta1CollectorSpec"
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to build a katib experiment, we require a trial spec.\n", + "\n", + "In this case the trial spec is an Argo workflow produced from the Kubeflow pipeline.\n", + "\n", + "This workflow can be run thanks to the Katib-Argo integration that was set up in the requirements section.\n", + "\n", + "\n", + "The Katib Experiment consists of many components, which we will set up next using custom-built helper functions:" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Helper functions to build the individual Katib Experiment Components\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def create_trial_spec(\n", + " pipeline,\n", + " params_list: List[dsl.PipelineParam],\n", + " service_account: Optional[str] = None,\n", + "):\n", + " \"\"\"\n", + " Create an Argo workflow specification from a KFP pipeline function\n", + "\n", + " The Argo workflow CRD will be the basis for the trial_template used\n", + " by Katib.\n", + "\n", + " Args:\n", + " pipeline: a kubeflow pipeline function\n", + " params_list (List[dsl.PipelineParam]): a list of mappings of Kubeflow pipeline parameters\n", + " to Katib trialParameters.\n", + " These need to map the pipeline parameter to the Katib parameter.\n", + " Eg: [dsl.PipelineParam(name='lr', value='${trialParameters.learningRate}')]\n", + " here `lr` is the PipelineParam and `trialParameters.learningRate` the Katib trialParameter.\n", + "\n", + " \"\"\"\n", + " compiler = kfp.compiler.Compiler(\n", + " mode=kfp.dsl.PipelineExecutionMode.V1_LEGACY,\n", + " )\n", + " # Here the pipeline parameters are passed.\n", + " # These will be generated in the Katib trials\n", + " trial_spec = compiler._create_workflow(pipeline, params_list=params_list)\n", + " # Somehow the pipeline is configured with the wrong serviceAccountName by
default\n", + " if service_account is not None:\n", + " trial_spec[\"spec\"][\"serviceAccountName\"] = service_account\n", + "\n", + " return trial_spec" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def create_trial_template(\n", + " trial_spec,\n", + " trial_param_specs: List[V1beta1TrialParameterSpec],\n", + " retain_pods: bool = False,\n", + ") -> V1beta1TrialTemplate:\n", + " \"\"\"Generate a trial template from the spec\n", + "\n", + " This takes the Argo workflow CRD and wraps it as a\n", + " Katib trial template.\n", + " Here the Katib trial parameters are defined.\n", + "\n", + " Args:\n", + " trial_spec (Argo workflow spec): The workflow/pipeline to tune\n", + " trial_param_specs (List[V1beta1TrialParameterSpec]): The trial parameter specifications\n", + " Note that the `name` of the parameters needs to match the names referred to by the\n", + " create_trial_spec `params_list` arguments.\n", + " The `ref` needs to match the names used in the parameter space defined in `V1beta1ParameterSpec`.\n", + "\n", + " Returns:\n", + " V1beta1TrialTemplate: the trial template\n", + " \"\"\"\n", + "\n", + " trial_template = V1beta1TrialTemplate(\n", + " primary_container_name=\"main\", # Name of the primary container returning the metrics in the workflow\n", + " # The label used for the pipeline component returning the pipeline specs\n", + " primary_pod_labels={\"katib.kubeflow.org/model-training\": \"true\"},\n", + " trial_parameters=trial_param_specs,\n", + " trial_spec=trial_spec,\n", + " success_condition='status.[@this].#(phase==\"Succeeded\")#',\n", + " failure_condition='status.[@this].#(phase==\"Failed\")#',\n", + " retain=retain_pods, # Retain completed pods - left here for easier debugging\n", + " )\n", + " return trial_template" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "def create_metrics_collector_spec(objective:
V1beta1ObjectiveSpec):\n", + " \"\"\"This defines the custom metrics collector\n", + "\n", + " This custom metrics connector was built to collect\n", + " Kubeflow pipeline MLmetrics from a step.\n", + "\n", + " Args:\n", + " objective (V1beta1ObjectiveSpec): the objective spec used to get the metrics names\n", + "\n", + " \"\"\"\n", + "\n", + " metric_names = [objective.objective_metric_name] + list(\n", + " objective.additional_metric_names\n", + " )\n", + " collector = V1beta1MetricsCollectorSpec(\n", + " source={\n", + " \"fileSystemPath\": {\n", + " # In KFP v1 this seems to be the hardcoded location\n", + " # for this output file..\n", + " \"path\": \"/tmp/outputs/mlpipeline_metrics/data\",\n", + " \"kind\": \"File\",\n", + " }\n", + " },\n", + " collector=V1beta1CollectorSpec(\n", + " kind=\"Custom\",\n", + " custom_collector={\n", + " \"args\": [\n", + " \"-m\",\n", + " f\"{';'.join(metric_names)}\",\n", + " \"-s\",\n", + " \"katib-db-manager.kubeflow:6789\",\n", + " \"-t\",\n", + " \"$(PodName)\",\n", + " \"-path\",\n", + " \"/tmp/outputs/mlpipeline_metrics\",\n", + " ],\n", + " \"image\": KATIB_WORKLFLOW_COLLECTOR_IMAGE,\n", + " \"imagePullPolicy\": \"Always\",\n", + " \"name\": \"custom-metrics-logger-and-collector\",\n", + " \"env\": [\n", + " {\n", + " # In this setup the PodName can be used to\n", + " # infer the `trial name` required to report back\n", + " # the metrics.\n", + " \"name\": \"PodName\",\n", + " \"valueFrom\": {\"fieldRef\": {\"fieldPath\": \"metadata.name\"}},\n", + " }\n", + " ],\n", + " },\n", + " ),\n", + " )\n", + " return collector" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Final helper function to create experiments from pipelines\n", + "\n", + "\n", + "This helper function is the main entry point to train pipelines." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "def create_katib_experiment_spec(\n", + " pipeline: dsl.Pipeline,\n", + " pipeline_params: List[dsl.PipelineParam],\n", + " trial_params: List[V1beta1TrialParameterSpec],\n", + " trial_params_space: List[V1beta1ParameterSpec],\n", + " objective: V1beta1ObjectiveSpec,\n", + " algorithm: V1beta1AlgorithmSpec,\n", + " max_trial_count: int = 2,\n", + " max_failed_trial_count: int = 2,\n", + " parallel_trial_count: int = 2,\n", + " pipeline_service_account: Optional[str] = None,\n", + " retain_pods: bool = False,\n", + ") -> V1beta1ExperimentSpec:\n", + " \"\"\"Construct a Katib experiment over a KFP pipeline\n", + "\n", + " Args:\n", + " pipeline (dsl.Pipeline): The Kubeflow Pipeline\n", + " pipeline_params (List[dsl.PipelineParam]): A mapping of trial-parameters to pipeline parameters.\n", + " Example: [\n", + " dsl.PipelineParam(name=\"lr\", value=\"${trialParameters.learningRate}\"),\n", + " ...\n", + " ]\n", + " trial_params (List[V1beta1TrialParameterSpec]): Spec for Trial parameters. Note that name\n", + " and refs need to match the ones used in `pipeline_params` and `trial_params_space`\n", + " Example: [\n", + " V1beta1TrialParameterSpec(\n", + " name=\"learningRate\",\n", + " description=\"Learning rate for the training model\",\n", + " reference=\"learning_rate\",\n", + " ), ...]\n", + " trial_params_space (List[V1beta1ParameterSpec]): The spec for the parameter space explored in the\n", + " Trials\n", + " Example: [\n", + " V1beta1ParameterSpec(\n", + " name=\"learning_rate\",\n", + " parameter_type=\"double\",\n", + " feasible_space=V1beta1FeasibleSpace(min=\"0.00001\", max=\"0.001\"),\n", + " ), ...]\n", + " objective (V1beta1ObjectiveSpec): objective spec. 
The names used here\n", + " need to match the metrics reported by the pipeline.\n", + " Example: V1beta1ObjectiveSpec(\n", + " type=\"maximize\",\n", + " goal=0.9,\n", + " objective_metric_name=\"val-accuracy\",\n", + " additional_metric_names=[\"accuracy\"],\n", + " )\n", + " algorithm (V1beta1AlgorithmSpec): algorithm spec\n", + " Example: V1beta1AlgorithmSpec(\n", + " algorithm_name=\"random\",\n", + " )\n", + " max_trial_count (int, optional): Max total number of trials. Defaults to 2.\n", + " max_failed_trial_count (int, optional): Number of failed trials tolerated. Defaults to 2.\n", + " parallel_trial_count (int, optional): Number of trials run in parallel. Defaults to 2.\n", + " pipeline_service_account (str | None, optional): Name of the service account to run\n", + " pipelines with. Defaults to None (uses pre-configured default).\n", + " On a Kubeflow GCP deployment this should be set to `default-editor`\n", + " retain_pods (bool): retain pods (good for debugging). Default: false\n", + "\n", + " Returns:\n", + " V1beta1ExperimentSpec: Katib experiment spec\n", + " \"\"\"\n", + "\n", + " trial_spec = create_trial_spec(\n", + " pipeline, pipeline_params, service_account=pipeline_service_account\n", + " )\n", + "\n", + " # Configure parameters for the Trial template.\n", + " trial_template = create_trial_template(\n", + " trial_spec, trial_params, retain_pods=retain_pods\n", + " )\n", + "\n", + " # Metrics collector spec\n", + " metrics_collector = create_metrics_collector_spec(objective=objective)\n", + "\n", + " # Create an Experiment from the above parameters.\n", + " experiment_spec = V1beta1ExperimentSpec(\n", + " # Experimental Budget\n", + " max_trial_count=max_trial_count,\n", + " max_failed_trial_count=max_failed_trial_count,\n", + " parallel_trial_count=parallel_trial_count,\n", + " # Optimization Objective\n", + " objective=objective,\n", + " # Optimization Algorithm\n", + " algorithm=algorithm,\n", + " # Optimization Parameters\n", + " 
parameters=trial_params_space,\n", + " # Trial Template\n", + " trial_template=trial_template,\n", + " # Metrics collector\n", + " metrics_collector_spec=metrics_collector,\n", + " )\n", + "\n", + " return experiment_spec" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tune the MNIST pipeline using Katib\n", + "\n", + "First prepare all required input" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_params = [\n", + " dsl.PipelineParam(name=\"lr\", value=\"${trialParameters.learningRate}\"),\n", + " dsl.PipelineParam(name=\"batch_size\", value=\"${trialParameters.batchSize}\"),\n", + " dsl.PipelineParam(name=\"histogram_norm\", value=\"${trialParameters.histogramNorm}\"),\n", + "]\n", + "trial_params_specs = [\n", + " V1beta1TrialParameterSpec(\n", + " name=\"learningRate\", # the parameter name that is replaced in your template (see Trial Specification).\n", + " description=\"Learning rate for the training model\",\n", + " reference=\"learning_rate\", # the parameter name that experiment’s suggestion returns (parameter name in the Parameters Specification).\n", + " ),\n", + " V1beta1TrialParameterSpec(\n", + " name=\"batchSize\",\n", + " description=\"Batch size for NN training\",\n", + " reference=\"batch_size\",\n", + " ),\n", + " V1beta1TrialParameterSpec(\n", + " name=\"histogramNorm\",\n", + " description=\"Histogram normalization of image on?\",\n", + " reference=\"histogram_norm\",\n", + " ),\n", + "]\n", + "parameter_space = [\n", + " V1beta1ParameterSpec(\n", + " name=\"learning_rate\",\n", + " parameter_type=\"double\",\n", + " feasible_space=V1beta1FeasibleSpace(min=\"0.00001\", max=\"0.001\"),\n", + " ),\n", + " V1beta1ParameterSpec(\n", + " name=\"batch_size\",\n", + " parameter_type=\"int\",\n", + " feasible_space=V1beta1FeasibleSpace(min=\"16\", max=\"64\"),\n", + " ),\n", + " V1beta1ParameterSpec(\n", + " 
name=\"histogram_norm\",\n", + " parameter_type=\"discrete\",\n", + " feasible_space=V1beta1FeasibleSpace(list=[\"0\", \"1\"]),\n", + " ),\n", + "]\n", + "objective = V1beta1ObjectiveSpec(\n", + " type=\"maximize\",\n", + " goal=0.9,\n", + " objective_metric_name=\"val-accuracy\",\n", + " additional_metric_names=[\"accuracy\"],\n", + ")\n", + "\n", + "algorithm = V1beta1AlgorithmSpec(\n", + " algorithm_name=\"random\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the full spec\n", + "\n", + "katib_spec = create_katib_experiment_spec(\n", + " pipeline=mnist_training_pipeline,\n", + " pipeline_params=pipeline_params,\n", + " trial_params=trial_params_specs,\n", + " trial_params_space=parameter_space,\n", + " objective=objective,\n", + " algorithm=algorithm,\n", + " pipeline_service_account=KFP_SERVICE_ACCOUNT,\n", + " max_trial_count=5,\n", + " parallel_trial_count=5,\n", + " retain_pods=False,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to generate a full experiment the api_version, kind and namespace need to be defined:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'katib_spec' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[17], line 11\u001b[0m\n\u001b[1;32m 1\u001b[0m katib_experiment_name \u001b[39m=\u001b[39m (\n\u001b[1;32m 2\u001b[0m 
\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mKATIB_EXPERIMENT\u001b[39m}\u001b[39;00m\u001b[39m-\u001b[39m\u001b[39m{\u001b[39;00mdt\u001b[39m.\u001b[39mtoday()\u001b[39m.\u001b[39mstrftime(\u001b[39m'\u001b[39m\u001b[39m%\u001b[39m\u001b[39mY-\u001b[39m\u001b[39m%\u001b[39m\u001b[39mm-\u001b[39m\u001b[39m%d\u001b[39;00m\u001b[39m-\u001b[39m\u001b[39m%\u001b[39m\u001b[39mHh-\u001b[39m\u001b[39m%\u001b[39m\u001b[39mMm-\u001b[39m\u001b[39m%\u001b[39m\u001b[39mSs\u001b[39m\u001b[39m'\u001b[39m)\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m 3\u001b[0m )\n\u001b[1;32m 4\u001b[0m katib_experiment \u001b[39m=\u001b[39m V1beta1Experiment(\n\u001b[1;32m 5\u001b[0m api_version\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mkubeflow.org/v1beta1\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 6\u001b[0m kind\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mExperiment\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 7\u001b[0m metadata\u001b[39m=\u001b[39mV1ObjectMeta(\n\u001b[1;32m 8\u001b[0m name\u001b[39m=\u001b[39mkatib_experiment_name,\n\u001b[1;32m 9\u001b[0m namespace\u001b[39m=\u001b[39mUSER_NAMESPACE,\n\u001b[1;32m 10\u001b[0m ),\n\u001b[0;32m---> 11\u001b[0m spec\u001b[39m=\u001b[39mkatib_spec,\n\u001b[1;32m 12\u001b[0m )\n", + "\u001b[0;31mNameError\u001b[0m: name 'katib_spec' is not defined" + ] + } + ], + "source": [ + "katib_experiment_name = (\n", + " f\"{KATIB_EXPERIMENT}-{dt.today().strftime('%Y-%m-%d-%Hh-%Mm-%Ss')}\"\n", + ")\n", + "katib_experiment = V1beta1Experiment(\n", + " api_version=\"kubeflow.org/v1beta1\",\n", + " kind=\"Experiment\",\n", + " metadata=V1ObjectMeta(\n", + " name=katib_experiment_name,\n", + " namespace=USER_NAMESPACE,\n", + " ),\n", + " spec=katib_spec,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The generated YAML can be written out to submit via the web UI:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, 
+ "outputs": [], + "source": [ + "with open(f\"{KATIB_EXPERIMENT}.yaml\", \"w\") as f:\n", + " yaml.dump(ApiClient().sanitize_for_serialization(katib_experiment), f)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Or submitted via the KatibClient:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "katib_client = KatibClient()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'apiVersion': 'kubeflow.org/v1beta1',\n", + " 'kind': 'Experiment',\n", + " 'metadata': {'creationTimestamp': '2023-07-20T19:40:11Z',\n", + " 'generation': 1,\n", + " 'managedFields': [{'apiVersion': 'kubeflow.org/v1beta1',\n", + " 'fieldsType': 'FieldsV1',\n", + " 'fieldsV1': {'f:spec': {'.': {},\n", + " 'f:algorithm': {'.': {}, 'f:algorithmName': {}},\n", + " 'f:maxFailedTrialCount': {},\n", + " 'f:maxTrialCount': {},\n", + " 'f:metricsCollectorSpec': {'.': {},\n", + " 'f:collector': {'.': {},\n", + " 'f:customCollector': {'.': {},\n", + " 'f:args': {},\n", + " 'f:env': {},\n", + " 'f:image': {},\n", + " 'f:imagePullPolicy': {},\n", + " 'f:name': {}},\n", + " 'f:kind': {}},\n", + " 'f:source': {'.': {},\n", + " 'f:fileSystemPath': {'.': {}, 'f:kind': {}, 'f:path': {}}}},\n", + " 'f:objective': {'.': {},\n", + " 'f:additionalMetricNames': {},\n", + " 'f:goal': {},\n", + " 'f:objectiveMetricName': {},\n", + " 'f:type': {}},\n", + " 'f:parallelTrialCount': {},\n", + " 'f:parameters': {},\n", + " 'f:trialTemplate': {'.': {},\n", + " 'f:failureCondition': {},\n", + " 'f:primaryContainerName': {},\n", + " 'f:primaryPodLabels': {'.': {},\n", + " 'f:katib.kubeflow.org/model-training': {}},\n", + " 'f:retain': {},\n", + " 'f:successCondition': {},\n", + " 'f:trialParameters': {},\n", + " 'f:trialSpec': {'.': {},\n", + " 'f:apiVersion': {},\n", + " 'f:kind': {},\n", + " 'f:metadata': {'.': {},\n", + " 
'f:annotations': {'.': {},\n", + " 'f:pipelines.kubeflow.org/kfp_sdk_version': {},\n", + " 'f:pipelines.kubeflow.org/pipeline_compilation_time': {},\n", + " 'f:pipelines.kubeflow.org/pipeline_spec': {}},\n", + " 'f:generateName': {},\n", + " 'f:labels': {'.': {},\n", + " 'f:pipelines.kubeflow.org/kfp_sdk_version': {}}},\n", + " 'f:spec': {'.': {},\n", + " 'f:arguments': {'.': {}, 'f:parameters': {}},\n", + " 'f:entrypoint': {},\n", + " 'f:serviceAccountName': {},\n", + " 'f:templates': {}}}}}},\n", + " 'manager': 'OpenAPI-Generator',\n", + " 'operation': 'Update',\n", + " 'time': '2023-07-20T19:40:11Z'}],\n", + " 'name': 'katib-kfp-example-v1-2023-07-20-21h-40m-05s',\n", + " 'namespace': 'kubeflow',\n", + " 'resourceVersion': '6526',\n", + " 'uid': '68d7df06-e02d-4d1e-932c-c3032f7ecaff'},\n", + " 'spec': {'algorithm': {'algorithmName': 'random'},\n", + " 'maxFailedTrialCount': 2,\n", + " 'maxTrialCount': 5,\n", + " 'metricsCollectorSpec': {'collector': {'customCollector': {'args': ['-m',\n", + " 'val-accuracy;accuracy',\n", + " '-s',\n", + " 'katib-db-manager.kubeflow:6789',\n", + " '-t',\n", + " '$(PodName)',\n", + " '-path',\n", + " '/tmp/outputs/mlpipeline_metrics'],\n", + " 'env': [{'name': 'PodName',\n", + " 'valueFrom': {'fieldRef': {'fieldPath': 'metadata.name'}}}],\n", + " 'image': 'docker.io/kubeflowkatib/kfpv1-metrics-collector:latest',\n", + " 'imagePullPolicy': 'Always',\n", + " 'name': 'custom-metrics-logger-and-collector',\n", + " 'resources': {}},\n", + " 'kind': 'Custom'},\n", + " 'source': {'fileSystemPath': {'kind': 'File',\n", + " 'path': '/tmp/outputs/mlpipeline_metrics/data'}}},\n", + " 'objective': {'additionalMetricNames': ['accuracy'],\n", + " 'goal': 0.9,\n", + " 'metricStrategies': [{'name': 'val-accuracy', 'value': 'max'},\n", + " {'name': 'accuracy', 'value': 'max'}],\n", + " 'objectiveMetricName': 'val-accuracy',\n", + " 'type': 'maximize'},\n", + " 'parallelTrialCount': 5,\n", + " 'parameters': [{'feasibleSpace': {'max': '0.001', 
'min': '0.00001'},\n", + " 'name': 'learning_rate',\n", + " 'parameterType': 'double'},\n", + " {'feasibleSpace': {'max': '64', 'min': '16'},\n", + " 'name': 'batch_size',\n", + " 'parameterType': 'int'},\n", + " {'feasibleSpace': {'list': ['0', '1']},\n", + " 'name': 'histogram_norm',\n", + " 'parameterType': 'discrete'}],\n", + " 'resumePolicy': 'Never',\n", + " 'trialTemplate': {'failureCondition': 'status.[@this].#(phase==\"Failed\")#',\n", + " 'primaryContainerName': 'main',\n", + " 'primaryPodLabels': {'katib.kubeflow.org/model-training': 'true'},\n", + " 'successCondition': 'status.[@this].#(phase==\"Succeeded\")#',\n", + " 'trialParameters': [{'description': 'Learning rate for the training model',\n", + " 'name': 'learningRate',\n", + " 'reference': 'learning_rate'},\n", + " {'description': 'Batch size for NN training',\n", + " 'name': 'batchSize',\n", + " 'reference': 'batch_size'},\n", + " {'description': 'Histogram normalization of image on?',\n", + " 'name': 'histogramNorm',\n", + " 'reference': 'histogram_norm'}],\n", + " 'trialSpec': {'apiVersion': 'argoproj.io/v1alpha1',\n", + " 'kind': 'Workflow',\n", + " 'metadata': {'annotations': {'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline_compilation_time': '2023-07-20T21:40:03.664402',\n", + " 'pipelines.kubeflow.org/pipeline_spec': '{\"description\": \"A pipeline to download the MNIST dataset files\", \"inputs\": [{\"default\": \"0.0001\", \"name\": \"lr\", \"optional\": true, \"type\": \"Float\"}, {\"default\": \"Adam\", \"name\": \"optimizer\", \"optional\": true, \"type\": \"String\"}, {\"default\": \"categorical_crossentropy\", \"name\": \"loss\", \"optional\": true, \"type\": \"String\"}, {\"default\": \"3\", \"name\": \"epochs\", \"optional\": true, \"type\": \"Integer\"}, {\"default\": \"5\", \"name\": \"batch_size\", \"optional\": true, \"type\": \"Integer\"}, {\"default\": \"False\", \"name\": \"histogram_norm\", \"optional\": true, \"type\": 
\"Boolean\"}, {\"default\": \"${trialParameters.learningRate}\", \"name\": \"lr\"}, {\"default\": \"${trialParameters.batchSize}\", \"name\": \"batch_size\"}, {\"default\": \"${trialParameters.histogramNorm}\", \"name\": \"histogram_norm\"}], \"name\": \"Download MNIST dataset\"}'},\n", + " 'generateName': 'download-mnist-dataset-',\n", + " 'labels': {'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12'}},\n", + " 'spec': {'arguments': {'parameters': [{'name': 'lr',\n", + " 'value': '${trialParameters.learningRate}'},\n", + " {'name': 'optimizer', 'value': 'Adam'},\n", + " {'name': 'loss', 'value': 'categorical_crossentropy'},\n", + " {'name': 'epochs', 'value': '3'},\n", + " {'name': 'batch_size', 'value': '${trialParameters.batchSize}'},\n", + " {'name': 'histogram_norm',\n", + " 'value': '${trialParameters.histogramNorm}'}]},\n", + " 'entrypoint': 'download-mnist-dataset',\n", + " 'serviceAccountName': 'pipeline-runner',\n", + " 'templates': [{'container': {'args': [],\n", + " 'command': ['sh',\n", + " '-exc',\n", + " 'url=\"$0\"\\noutput_path=\"$1\"\\ncurl_options=\"$2\"\\n\\nmkdir -p \"$(dirname \"$output_path\")\"\\ncurl --get \"$url\" --output \"$output_path\" $curl_options\\n',\n", + " 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',\n", + " '/tmp/outputs/Data/data',\n", + " '--location'],\n", + " 'image': 'byrnedo/alpine-curl@sha256:548379d0a4a0c08b9e55d9d87a592b7d35d9ab3037f4936f5ccd09d0b625a342'},\n", + " 'metadata': {'annotations': {'author': 'Alexey Volkov ',\n", + " 'canonical_location': 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/web/Download/component.yaml',\n", + " 'pipelines.kubeflow.org/arguments.parameters': '{\"Url\": \"http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\", \"curl options\": \"--location\"}',\n", + " 'pipelines.kubeflow.org/component_ref': '{\"digest\": \"2f61f2edf713f214934bd286791877a1a3a37f31a4de4368b90e3b76743f1523\", \"url\": 
\"https://raw.githubusercontent.com/kubeflow/pipelines/master/components/contrib/web/Download/component.yaml\"}',\n", + " 'pipelines.kubeflow.org/component_spec': '{\"implementation\": {\"container\": {\"command\": [\"sh\", \"-exc\", \"url=\\\\\"$0\\\\\"\\\\noutput_path=\\\\\"$1\\\\\"\\\\ncurl_options=\\\\\"$2\\\\\"\\\\n\\\\nmkdir -p \\\\\"$(dirname \\\\\"$output_path\\\\\")\\\\\"\\\\ncurl --get \\\\\"$url\\\\\" --output \\\\\"$output_path\\\\\" $curl_options\\\\n\", {\"inputValue\": \"Url\"}, {\"outputPath\": \"Data\"}, {\"inputValue\": \"curl options\"}], \"image\": \"byrnedo/alpine-curl@sha256:548379d0a4a0c08b9e55d9d87a592b7d35d9ab3037f4936f5ccd09d0b625a342\"}}, \"inputs\": [{\"name\": \"Url\", \"type\": \"URI\"}, {\"default\": \"--location\", \"description\": \"Additional options given to the curl bprogram. See https://curl.haxx.se/docs/manpage.html\", \"name\": \"curl options\", \"type\": \"string\"}], \"metadata\": {\"annotations\": {\"author\": \"Alexey Volkov \", \"canonical_location\": \"https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/web/Download/component.yaml\"}}, \"name\": \"Download data\", \"outputs\": [{\"name\": \"Data\"}]}',\n", + " 'pipelines.kubeflow.org/task_display_name': 'Download training images'},\n", + " 'labels': {'pipelines.kubeflow.org/cache_enabled': 'true',\n", + " 'pipelines.kubeflow.org/enable_caching': 'true',\n", + " 'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline-sdk-type': 'kfp'}},\n", + " 'name': 'download-data',\n", + " 'outputs': {'artifacts': [{'name': 'download-data-Data',\n", + " 'path': '/tmp/outputs/Data/data'}]}},\n", + " {'container': {'args': [],\n", + " 'command': ['sh',\n", + " '-exc',\n", + " 'url=\"$0\"\\noutput_path=\"$1\"\\ncurl_options=\"$2\"\\n\\nmkdir -p \"$(dirname \"$output_path\")\"\\ncurl --get \"$url\" --output \"$output_path\" $curl_options\\n',\n", + " 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',\n", + " 
'/tmp/outputs/Data/data',\n", + " '--location'],\n", + " 'image': 'byrnedo/alpine-curl@sha256:548379d0a4a0c08b9e55d9d87a592b7d35d9ab3037f4936f5ccd09d0b625a342'},\n", + " 'metadata': {'annotations': {'author': 'Alexey Volkov ',\n", + " 'canonical_location': 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/web/Download/component.yaml',\n", + " 'pipelines.kubeflow.org/arguments.parameters': '{\"Url\": \"http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\", \"curl options\": \"--location\"}',\n", + " 'pipelines.kubeflow.org/component_ref': '{\"digest\": \"2f61f2edf713f214934bd286791877a1a3a37f31a4de4368b90e3b76743f1523\", \"url\": \"https://raw.githubusercontent.com/kubeflow/pipelines/master/components/contrib/web/Download/component.yaml\"}',\n", + " 'pipelines.kubeflow.org/component_spec': '{\"implementation\": {\"container\": {\"command\": [\"sh\", \"-exc\", \"url=\\\\\"$0\\\\\"\\\\noutput_path=\\\\\"$1\\\\\"\\\\ncurl_options=\\\\\"$2\\\\\"\\\\n\\\\nmkdir -p \\\\\"$(dirname \\\\\"$output_path\\\\\")\\\\\"\\\\ncurl --get \\\\\"$url\\\\\" --output \\\\\"$output_path\\\\\" $curl_options\\\\n\", {\"inputValue\": \"Url\"}, {\"outputPath\": \"Data\"}, {\"inputValue\": \"curl options\"}], \"image\": \"byrnedo/alpine-curl@sha256:548379d0a4a0c08b9e55d9d87a592b7d35d9ab3037f4936f5ccd09d0b625a342\"}}, \"inputs\": [{\"name\": \"Url\", \"type\": \"URI\"}, {\"default\": \"--location\", \"description\": \"Additional options given to the curl bprogram. 
See https://curl.haxx.se/docs/manpage.html\", \"name\": \"curl options\", \"type\": \"string\"}], \"metadata\": {\"annotations\": {\"author\": \"Alexey Volkov \", \"canonical_location\": \"https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/web/Download/component.yaml\"}}, \"name\": \"Download data\", \"outputs\": [{\"name\": \"Data\"}]}',\n", + " 'pipelines.kubeflow.org/task_display_name': 'Download training labels'},\n", + " 'labels': {'pipelines.kubeflow.org/cache_enabled': 'true',\n", + " 'pipelines.kubeflow.org/enable_caching': 'true',\n", + " 'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline-sdk-type': 'kfp'}},\n", + " 'name': 'download-data-2',\n", + " 'outputs': {'artifacts': [{'name': 'download-data-2-Data',\n", + " 'path': '/tmp/outputs/Data/data'}]}},\n", + " {'dag': {'tasks': [{'name': 'download-data',\n", + " 'template': 'download-data'},\n", + " {'name': 'download-data-2', 'template': 'download-data-2'},\n", + " {'arguments': {'artifacts': [{'from': '{{tasks.download-data-2.outputs.artifacts.download-data-2-Data}}',\n", + " 'name': 'download-data-2-Data'},\n", + " {'from': '{{tasks.download-data.outputs.artifacts.download-data-Data}}',\n", + " 'name': 'download-data-Data'}]},\n", + " 'dependencies': ['download-data', 'download-data-2'],\n", + " 'name': 'parse-mnist',\n", + " 'template': 'parse-mnist'},\n", + " {'arguments': {'artifacts': [{'from': '{{tasks.parse-mnist.outputs.artifacts.parse-mnist-Dataset}}',\n", + " 'name': 'parse-mnist-Dataset'}],\n", + " 'parameters': [{'name': 'histogram_norm',\n", + " 'value': '{{inputs.parameters.histogram_norm}}'}]},\n", + " 'dependencies': ['parse-mnist'],\n", + " 'name': 'process',\n", + " 'template': 'process'},\n", + " {'arguments': {'artifacts': [{'from': '{{tasks.process.outputs.artifacts.process-data_processed}}',\n", + " 'name': 'process-data_processed'}],\n", + " 'parameters': [{'name': 'batch_size',\n", + " 'value': 
'{{inputs.parameters.batch_size}}'},\n", + " {'name': 'epochs', 'value': '{{inputs.parameters.epochs}}'},\n", + " {'name': 'loss', 'value': '{{inputs.parameters.loss}}'},\n", + " {'name': 'lr', 'value': '{{inputs.parameters.lr}}'},\n", + " {'name': 'optimizer',\n", + " 'value': '{{inputs.parameters.optimizer}}'}]},\n", + " 'dependencies': ['process'],\n", + " 'name': 'train',\n", + " 'template': 'train'}]},\n", + " 'inputs': {'parameters': [{'name': 'batch_size'},\n", + " {'name': 'epochs'},\n", + " {'name': 'histogram_norm'},\n", + " {'name': 'loss'},\n", + " {'name': 'lr'},\n", + " {'name': 'optimizer'}]},\n", + " 'name': 'download-mnist-dataset'},\n", + " {'container': {'args': [],\n", + " 'command': ['sh',\n", + " '-ec',\n", + " '# This is how additional packages can be installed dynamically\\npython3 -m pip install pip idx2numpy\\n# Run the rest of the command after installing the packages.\\n\"$0\" \"$@\"\\n',\n", + " 'python3',\n", + " '-u',\n", + " '-c',\n", + " \"import gzip\\nimport idx2numpy\\nimport sys\\nfrom pathlib import Path\\nimport pickle\\nimport tensorflow as tf\\nimg_path = sys.argv[1]\\nlabel_path = sys.argv[2]\\noutput_path = sys.argv[3]\\nwith gzip.open(img_path, 'rb') as f:\\n x = idx2numpy.convert_from_string(f.read())\\nwith gzip.open(label_path, 'rb') as f:\\n y = idx2numpy.convert_from_string(f.read())\\n#one-hot encode the categories\\nx_out = tf.convert_to_tensor(x)\\ny_out = tf.keras.utils.to_categorical(y)\\nPath(output_path).parent.mkdir(parents=True, exist_ok=True)\\nwith open(output_path, 'wb') as output_file:\\n pickle.dump((x_out, y_out), output_file)\\n\",\n", + " '/tmp/inputs/Images/data',\n", + " '/tmp/inputs/Labels/data',\n", + " '/tmp/outputs/Dataset/data'],\n", + " 'image': 'tensorflow/tensorflow:2.7.1'},\n", + " 'inputs': {'artifacts': [{'name': 'download-data-Data',\n", + " 'path': '/tmp/inputs/Images/data'},\n", + " {'name': 'download-data-2-Data', 'path': '/tmp/inputs/Labels/data'}]},\n", + " 'metadata': 
{'annotations': {'author': 'Vito Zanotelli, D-ONE.ai',\n", + " 'description': 'Based on https://github.com/kubeflow/pipelines/blob/master/components/contrib/sample/Python_script/component.yaml',\n", + " 'pipelines.kubeflow.org/component_ref': '{\"digest\": \"80825e6ec527562f31b6fdba1bae9a42dae5032c8654f4b9d39cb97a3dc4ed23\"}',\n", + " 'pipelines.kubeflow.org/component_spec': '{\"implementation\": {\"container\": {\"command\": [\"sh\", \"-ec\", \"# This is how additional packages can be installed dynamically\\\\npython3 -m pip install pip idx2numpy\\\\n# Run the rest of the command after installing the packages.\\\\n\\\\\"$0\\\\\" \\\\\"$@\\\\\"\\\\n\", \"python3\", \"-u\", \"-c\", \"import gzip\\\\nimport idx2numpy\\\\nimport sys\\\\nfrom pathlib import Path\\\\nimport pickle\\\\nimport tensorflow as tf\\\\nimg_path = sys.argv[1]\\\\nlabel_path = sys.argv[2]\\\\noutput_path = sys.argv[3]\\\\nwith gzip.open(img_path, \\'rb\\') as f:\\\\n x = idx2numpy.convert_from_string(f.read())\\\\nwith gzip.open(label_path, \\'rb\\') as f:\\\\n y = idx2numpy.convert_from_string(f.read())\\\\n#one-hot encode the categories\\\\nx_out = tf.convert_to_tensor(x)\\\\ny_out = tf.keras.utils.to_categorical(y)\\\\nPath(output_path).parent.mkdir(parents=True, exist_ok=True)\\\\nwith open(output_path, \\'wb\\') as output_file:\\\\n pickle.dump((x_out, y_out), output_file)\\\\n\", {\"inputPath\": \"Images\"}, {\"inputPath\": \"Labels\"}, {\"outputPath\": \"Dataset\"}], \"image\": \"tensorflow/tensorflow:2.7.1\"}}, \"inputs\": [{\"description\": \"gziped images in the idx format\", \"name\": \"Images\"}, {\"description\": \"gziped labels in the idx format\", \"name\": \"Labels\"}], \"metadata\": {\"annotations\": {\"author\": \"Vito Zanotelli, D-ONE.ai\", \"description\": \"Based on https://github.com/kubeflow/pipelines/blob/master/components/contrib/sample/Python_script/component.yaml\"}}, \"name\": \"Parse MNIST\", \"outputs\": [{\"name\": \"Dataset\"}]}',\n", + " 
'pipelines.kubeflow.org/task_display_name': 'Prepare train dataset'},\n", + " 'labels': {'pipelines.kubeflow.org/cache_enabled': 'true',\n", + " 'pipelines.kubeflow.org/enable_caching': 'true',\n", + " 'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline-sdk-type': 'kfp'}},\n", + " 'name': 'parse-mnist',\n", + " 'outputs': {'artifacts': [{'name': 'parse-mnist-Dataset',\n", + " 'path': '/tmp/outputs/Dataset/data'}]}},\n", + " {'container': {'args': ['--data-raw',\n", + " '/tmp/inputs/data_raw/data',\n", + " '--val-pct',\n", + " '0.2',\n", + " '--trainset-flag',\n", + " 'True',\n", + " '--histogram-norm',\n", + " '{{inputs.parameters.histogram_norm}}',\n", + " '--data-processed',\n", + " '/tmp/outputs/data_processed/data'],\n", + " 'command': ['sh',\n", + " '-c',\n", + " '(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location \\'scikit-learn\\' \\'tensorflow-addons[tensorflow]\\' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location \\'scikit-learn\\' \\'tensorflow-addons[tensorflow]\\' --user) && \"$0\" \"$@\"',\n", + " 'sh',\n", + " '-ec',\n", + " 'program_path=$(mktemp)\\nprintf \"%s\" \"$0\" > \"$program_path\"\\npython3 -u \"$program_path\" \"$@\"\\n',\n", + " 'def _make_parent_dirs_and_return_path(file_path: str):\\n import os\\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\\n return file_path\\n\\ndef process(\\n data_raw_path, # type: ignore\\n data_processed_path, # type: ignore\\n val_pct = 0.2,\\n trainset_flag = True,\\n histogram_norm = False,\\n):\\n \"\"\"\\n Here we do all the preprocessing\\n if the data path is for training data we:\\n (1) Normalize the data\\n (2) split the train and val data\\n If it is for unseen test data, we:\\n (1) Normalize the data\\n This function returns in any case the processed data path\\n \"\"\"\\n # sklearn\\n import pickle\\n from sklearn.model_selection import train_test_split\\n import 
tensorflow as tf\\n import tensorflow_addons as tfa\\n\\n def img_norm(x):\\n x_ = tf.reshape(x, list(x.shape) + [1])\\n\\n if histogram_norm:\\n x_ = tfa.image.equalize(x_)\\n\\n # Scale between 0-1\\n x_ = x_ / 255\\n return x_\\n\\n with open(data_raw_path, \"rb\") as f:\\n x, y = pickle.load(f)\\n if trainset_flag:\\n\\n x_ = img_norm(x)\\n x_train, x_val, y_train, y_val = train_test_split(\\n x_.numpy(), y, test_size=val_pct, stratify=y, random_state=42\\n )\\n\\n with open(data_processed_path, \"wb\") as output_file:\\n pickle.dump((x_train, y_train, x_val, y_val), output_file)\\n\\n else:\\n x_ = img_norm(x)\\n with open(data_processed_path, \"wb\") as output_file:\\n pickle.dump((x_, y), output_file)\\n\\ndef _deserialize_bool(s) -> bool:\\n from distutils.util import strtobool\\n return strtobool(s) == 1\\n\\nimport argparse\\n_parser = argparse.ArgumentParser(prog=\\'Process\\', description=\\'Here we do all the preprocessing\\')\\n_parser.add_argument(\"--data-raw\", dest=\"data_raw_path\", type=str, required=True, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--val-pct\", dest=\"val_pct\", type=float, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--trainset-flag\", dest=\"trainset_flag\", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--histogram-norm\", dest=\"histogram_norm\", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--data-processed\", dest=\"data_processed_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\n_parsed_args = vars(_parser.parse_args())\\n\\n_outputs = process(**_parsed_args)\\n'],\n", + " 'image': 'tensorflow/tensorflow:2.7.1',\n", + " 'resources': {'limits': {'cpu': '1', 'memory': '2Gi'}}},\n", + " 'inputs': {'artifacts': [{'name': 'parse-mnist-Dataset',\n", + " 'path': '/tmp/inputs/data_raw/data'}],\n", + " 'parameters': [{'name': 'histogram_norm'}]},\n", + " 'metadata': 
{'annotations': {'pipelines.kubeflow.org/arguments.parameters': '{\"histogram_norm\": \"{{inputs.parameters.histogram_norm}}\", \"trainset_flag\": \"True\", \"val_pct\": \"0.2\"}',\n", + " 'pipelines.kubeflow.org/component_ref': '{}',\n", + " 'pipelines.kubeflow.org/component_spec': '{\"description\": \"Here we do all the preprocessing\", \"implementation\": {\"container\": {\"args\": [\"--data-raw\", {\"inputPath\": \"data_raw\"}, {\"if\": {\"cond\": {\"isPresent\": \"val_pct\"}, \"then\": [\"--val-pct\", {\"inputValue\": \"val_pct\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"trainset_flag\"}, \"then\": [\"--trainset-flag\", {\"inputValue\": \"trainset_flag\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"histogram_norm\"}, \"then\": [\"--histogram-norm\", {\"inputValue\": \"histogram_norm\"}]}}, \"--data-processed\", {\"outputPath\": \"data_processed\"}], \"command\": [\"sh\", \"-c\", \"(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location \\'scikit-learn\\' \\'tensorflow-addons[tensorflow]\\' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location \\'scikit-learn\\' \\'tensorflow-addons[tensorflow]\\' --user) && \\\\\"$0\\\\\" \\\\\"$@\\\\\"\", \"sh\", \"-ec\", \"program_path=$(mktemp)\\\\nprintf \\\\\"%s\\\\\" \\\\\"$0\\\\\" > \\\\\"$program_path\\\\\"\\\\npython3 -u \\\\\"$program_path\\\\\" \\\\\"$@\\\\\"\\\\n\", \"def _make_parent_dirs_and_return_path(file_path: str):\\\\n import os\\\\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\\\\n return file_path\\\\n\\\\ndef process(\\\\n data_raw_path, # type: ignore\\\\n data_processed_path, # type: ignore\\\\n val_pct = 0.2,\\\\n trainset_flag = True,\\\\n histogram_norm = False,\\\\n):\\\\n \\\\\"\\\\\"\\\\\"\\\\n Here we do all the preprocessing\\\\n if the data path is for training data we:\\\\n (1) Normalize the data\\\\n (2) split the train and val data\\\\n If it is for unseen test data, we:\\\\n (1) Normalize the data\\\\n This 
function returns in any case the processed data path\\\\n \\\\\"\\\\\"\\\\\"\\\\n # sklearn\\\\n import pickle\\\\n from sklearn.model_selection import train_test_split\\\\n import tensorflow as tf\\\\n import tensorflow_addons as tfa\\\\n\\\\n def img_norm(x):\\\\n x_ = tf.reshape(x, list(x.shape) + [1])\\\\n\\\\n if histogram_norm:\\\\n x_ = tfa.image.equalize(x_)\\\\n\\\\n # Scale between 0-1\\\\n x_ = x_ / 255\\\\n return x_\\\\n\\\\n with open(data_raw_path, \\\\\"rb\\\\\") as f:\\\\n x, y = pickle.load(f)\\\\n if trainset_flag:\\\\n\\\\n x_ = img_norm(x)\\\\n x_train, x_val, y_train, y_val = train_test_split(\\\\n x_.numpy(), y, test_size=val_pct, stratify=y, random_state=42\\\\n )\\\\n\\\\n with open(data_processed_path, \\\\\"wb\\\\\") as output_file:\\\\n pickle.dump((x_train, y_train, x_val, y_val), output_file)\\\\n\\\\n else:\\\\n x_ = img_norm(x)\\\\n with open(data_processed_path, \\\\\"wb\\\\\") as output_file:\\\\n pickle.dump((x_, y), output_file)\\\\n\\\\ndef _deserialize_bool(s) -> bool:\\\\n from distutils.util import strtobool\\\\n return strtobool(s) == 1\\\\n\\\\nimport argparse\\\\n_parser = argparse.ArgumentParser(prog=\\'Process\\', description=\\'Here we do all the preprocessing\\')\\\\n_parser.add_argument(\\\\\"--data-raw\\\\\", dest=\\\\\"data_raw_path\\\\\", type=str, required=True, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--val-pct\\\\\", dest=\\\\\"val_pct\\\\\", type=float, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--trainset-flag\\\\\", dest=\\\\\"trainset_flag\\\\\", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--histogram-norm\\\\\", dest=\\\\\"histogram_norm\\\\\", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--data-processed\\\\\", dest=\\\\\"data_processed_path\\\\\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\\\n_parsed_args = 
vars(_parser.parse_args())\\\\n\\\\n_outputs = process(**_parsed_args)\\\\n\"], \"image\": \"tensorflow/tensorflow:2.7.1\"}}, \"inputs\": [{\"name\": \"data_raw\", \"type\": \"String\"}, {\"default\": \"0.2\", \"name\": \"val_pct\", \"optional\": true, \"type\": \"Float\"}, {\"default\": \"True\", \"name\": \"trainset_flag\", \"optional\": true, \"type\": \"Boolean\"}, {\"default\": \"False\", \"name\": \"histogram_norm\", \"optional\": true, \"type\": \"Boolean\"}], \"name\": \"Process\", \"outputs\": [{\"name\": \"data_processed\", \"type\": \"String\"}]}',\n", + " 'pipelines.kubeflow.org/task_display_name': 'Preprocess images'},\n", + " 'labels': {'pipelines.kubeflow.org/cache_enabled': 'true',\n", + " 'pipelines.kubeflow.org/enable_caching': 'true',\n", + " 'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline-sdk-type': 'kfp'}},\n", + " 'name': 'process',\n", + " 'outputs': {'artifacts': [{'name': 'process-data_processed',\n", + " 'path': '/tmp/outputs/data_processed/data'}]}},\n", + " {'container': {'args': ['--data-train',\n", + " '/tmp/inputs/data_train/data',\n", + " '--lr',\n", + " '{{inputs.parameters.lr}}',\n", + " '--optimizer',\n", + " '{{inputs.parameters.optimizer}}',\n", + " '--loss',\n", + " '{{inputs.parameters.loss}}',\n", + " '--epochs',\n", + " '{{inputs.parameters.epochs}}',\n", + " '--batch-size',\n", + " '{{inputs.parameters.batch_size}}',\n", + " '--model-out',\n", + " '/tmp/outputs/model_out/data',\n", + " '--mlpipeline-metrics',\n", + " '/tmp/outputs/mlpipeline_metrics/data'],\n", + " 'command': ['sh',\n", + " '-c',\n", + " '(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location \\'scipy\\' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location \\'scipy\\' --user) && \"$0\" \"$@\"',\n", + " 'sh',\n", + " '-ec',\n", + " 'program_path=$(mktemp)\\nprintf \"%s\" \"$0\" > \"$program_path\"\\npython3 -u \"$program_path\" 
\"$@\"\\n',\n", + " 'def _make_parent_dirs_and_return_path(file_path: str):\\n import os\\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\\n return file_path\\n\\ndef train(\\n data_train_path, # type: ignore\\n model_out_path, # type: ignore\\n mlpipeline_metrics_path, # type: ignore # noqa: F821\\n lr = 1e-4,\\n optimizer = \"Adam\",\\n loss = \"categorical_crossentropy\",\\n epochs = 1,\\n batch_size = 32,\\n):\\n \"\"\"\\n This is the simulated train part of our ML pipeline where training is performed\\n \"\"\"\\n\\n import tensorflow as tf\\n import pickle\\n from tensorflow.keras.preprocessing.image import ImageDataGenerator\\n import json\\n\\n with open(data_train_path, \"rb\") as f:\\n x_train, y_train, x_val, y_val = pickle.load(f)\\n\\n model = tf.keras.Sequential(\\n [\\n tf.keras.layers.Conv2D(\\n 64, (3, 3), activation=\"relu\", input_shape=(28, 28, 1)\\n ),\\n tf.keras.layers.MaxPooling2D(2, 2),\\n tf.keras.layers.Conv2D(64, (3, 3), activation=\"relu\"),\\n tf.keras.layers.MaxPooling2D(2, 2),\\n tf.keras.layers.Flatten(),\\n tf.keras.layers.Dense(128, activation=\"relu\"),\\n tf.keras.layers.Dense(10, activation=\"softmax\"),\\n ]\\n )\\n\\n if optimizer.lower() == \"sgd\":\\n optimizer = tf.keras.optimizers.SGD(lr)\\n else:\\n optimizer = tf.keras.optimizers.Adam(lr)\\n\\n model.compile(loss=loss, optimizer=optimizer, metrics=[\"accuracy\"])\\n\\n # fit the model\\n model_early_stopping_callback = tf.keras.callbacks.EarlyStopping(\\n monitor=\"val_accuracy\", patience=10, verbose=1, restore_best_weights=True\\n )\\n\\n train_datagen = ImageDataGenerator()\\n\\n validation_datagen = ImageDataGenerator()\\n history = model.fit(\\n train_datagen.flow(x_train, y_train, batch_size=batch_size),\\n epochs=epochs,\\n validation_data=validation_datagen.flow(x_val, y_val, batch_size=batch_size),\\n shuffle=False,\\n callbacks=[model_early_stopping_callback],\\n )\\n\\n model.save(model_out_path, save_format=\"tf\")\\n\\n metrics = {\\n \"metrics\": 
[\\n {\\n \"name\": \"accuracy\", # The name of the metric. Visualized as the column name in the runs table.\\n \"numberValue\": history.history[\"accuracy\"][\\n -1\\n ], # The value of the metric. Must be a numeric value.\\n \"format\": \"PERCENTAGE\", # The optional format of the metric. Supported values are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed in percentage format).\\n },\\n {\\n \"name\": \"val-accuracy\", # The name of the metric. Visualized as the column name in the runs table.\\n \"numberValue\": history.history[\"val_accuracy\"][\\n -1\\n ], # The value of the metric. Must be a numeric value.\\n \"format\": \"PERCENTAGE\", # The optional format of the metric. Supported values are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed in percentage format).\\n },\\n ]\\n }\\n with open(mlpipeline_metrics_path, \"w\") as f:\\n json.dump(metrics, f)\\n\\nimport argparse\\n_parser = argparse.ArgumentParser(prog=\\'Train\\', description=\\'This is the simulated train part of our ML pipeline where training is performed\\')\\n_parser.add_argument(\"--data-train\", dest=\"data_train_path\", type=str, required=True, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--lr\", dest=\"lr\", type=float, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--optimizer\", dest=\"optimizer\", type=str, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--loss\", dest=\"loss\", type=str, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--epochs\", dest=\"epochs\", type=int, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--batch-size\", dest=\"batch_size\", type=int, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--model-out\", dest=\"model_out_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--mlpipeline-metrics\", dest=\"mlpipeline_metrics_path\", 
type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\n_parsed_args = vars(_parser.parse_args())\\n\\n_outputs = train(**_parsed_args)\\n'],\n", + " 'image': 'tensorflow/tensorflow:2.7.1',\n", + " 'resources': {'limits': {'cpu': '1', 'memory': '2Gi'}}},\n", + " 'inputs': {'artifacts': [{'name': 'process-data_processed',\n", + " 'path': '/tmp/inputs/data_train/data'}],\n", + " 'parameters': [{'name': 'batch_size'},\n", + " {'name': 'epochs'},\n", + " {'name': 'loss'},\n", + " {'name': 'lr'},\n", + " {'name': 'optimizer'}]},\n", + " 'metadata': {'annotations': {'pipelines.kubeflow.org/arguments.parameters': '{\"batch_size\": \"{{inputs.parameters.batch_size}}\", \"epochs\": \"{{inputs.parameters.epochs}}\", \"loss\": \"{{inputs.parameters.loss}}\", \"lr\": \"{{inputs.parameters.lr}}\", \"optimizer\": \"{{inputs.parameters.optimizer}}\"}',\n", + " 'pipelines.kubeflow.org/component_ref': '{}',\n", + " 'pipelines.kubeflow.org/component_spec': '{\"description\": \"This is the simulated train part of our ML pipeline where training is performed\", \"implementation\": {\"container\": {\"args\": [\"--data-train\", {\"inputPath\": \"data_train\"}, {\"if\": {\"cond\": {\"isPresent\": \"lr\"}, \"then\": [\"--lr\", {\"inputValue\": \"lr\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"optimizer\"}, \"then\": [\"--optimizer\", {\"inputValue\": \"optimizer\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"loss\"}, \"then\": [\"--loss\", {\"inputValue\": \"loss\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"epochs\"}, \"then\": [\"--epochs\", {\"inputValue\": \"epochs\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"batch_size\"}, \"then\": [\"--batch-size\", {\"inputValue\": \"batch_size\"}]}}, \"--model-out\", {\"outputPath\": \"model_out\"}, \"--mlpipeline-metrics\", {\"outputPath\": \"mlpipeline_metrics\"}], \"command\": [\"sh\", \"-c\", \"(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location \\'scipy\\' || 
PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location \\'scipy\\' --user) && \\\\\"$0\\\\\" \\\\\"$@\\\\\"\", \"sh\", \"-ec\", \"program_path=$(mktemp)\\\\nprintf \\\\\"%s\\\\\" \\\\\"$0\\\\\" > \\\\\"$program_path\\\\\"\\\\npython3 -u \\\\\"$program_path\\\\\" \\\\\"$@\\\\\"\\\\n\", \"def _make_parent_dirs_and_return_path(file_path: str):\\\\n import os\\\\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\\\\n return file_path\\\\n\\\\ndef train(\\\\n data_train_path, # type: ignore\\\\n model_out_path, # type: ignore\\\\n mlpipeline_metrics_path, # type: ignore # noqa: F821\\\\n lr = 1e-4,\\\\n optimizer = \\\\\"Adam\\\\\",\\\\n loss = \\\\\"categorical_crossentropy\\\\\",\\\\n epochs = 1,\\\\n batch_size = 32,\\\\n):\\\\n \\\\\"\\\\\"\\\\\"\\\\n This is the simulated train part of our ML pipeline where training is performed\\\\n \\\\\"\\\\\"\\\\\"\\\\n\\\\n import tensorflow as tf\\\\n import pickle\\\\n from tensorflow.keras.preprocessing.image import ImageDataGenerator\\\\n import json\\\\n\\\\n with open(data_train_path, \\\\\"rb\\\\\") as f:\\\\n x_train, y_train, x_val, y_val = pickle.load(f)\\\\n\\\\n model = tf.keras.Sequential(\\\\n [\\\\n tf.keras.layers.Conv2D(\\\\n 64, (3, 3), activation=\\\\\"relu\\\\\", input_shape=(28, 28, 1)\\\\n ),\\\\n tf.keras.layers.MaxPooling2D(2, 2),\\\\n tf.keras.layers.Conv2D(64, (3, 3), activation=\\\\\"relu\\\\\"),\\\\n tf.keras.layers.MaxPooling2D(2, 2),\\\\n tf.keras.layers.Flatten(),\\\\n tf.keras.layers.Dense(128, activation=\\\\\"relu\\\\\"),\\\\n tf.keras.layers.Dense(10, activation=\\\\\"softmax\\\\\"),\\\\n ]\\\\n )\\\\n\\\\n if optimizer.lower() == \\\\\"sgd\\\\\":\\\\n optimizer = tf.keras.optimizers.SGD(lr)\\\\n else:\\\\n optimizer = tf.keras.optimizers.Adam(lr)\\\\n\\\\n model.compile(loss=loss, optimizer=optimizer, metrics=[\\\\\"accuracy\\\\\"])\\\\n\\\\n # fit the model\\\\n model_early_stopping_callback = tf.keras.callbacks.EarlyStopping(\\\\n 
monitor=\\\\\"val_accuracy\\\\\", patience=10, verbose=1, restore_best_weights=True\\\\n )\\\\n\\\\n train_datagen = ImageDataGenerator()\\\\n\\\\n validation_datagen = ImageDataGenerator()\\\\n history = model.fit(\\\\n train_datagen.flow(x_train, y_train, batch_size=batch_size),\\\\n epochs=epochs,\\\\n validation_data=validation_datagen.flow(x_val, y_val, batch_size=batch_size),\\\\n shuffle=False,\\\\n callbacks=[model_early_stopping_callback],\\\\n )\\\\n\\\\n model.save(model_out_path, save_format=\\\\\"tf\\\\\")\\\\n\\\\n metrics = {\\\\n \\\\\"metrics\\\\\": [\\\\n {\\\\n \\\\\"name\\\\\": \\\\\"accuracy\\\\\", # The name of the metric. Visualized as the column name in the runs table.\\\\n \\\\\"numberValue\\\\\": history.history[\\\\\"accuracy\\\\\"][\\\\n -1\\\\n ], # The value of the metric. Must be a numeric value.\\\\n \\\\\"format\\\\\": \\\\\"PERCENTAGE\\\\\", # The optional format of the metric. Supported values are \\\\\"RAW\\\\\" (displayed in raw format) and \\\\\"PERCENTAGE\\\\\" (displayed in percentage format).\\\\n },\\\\n {\\\\n \\\\\"name\\\\\": \\\\\"val-accuracy\\\\\", # The name of the metric. Visualized as the column name in the runs table.\\\\n \\\\\"numberValue\\\\\": history.history[\\\\\"val_accuracy\\\\\"][\\\\n -1\\\\n ], # The value of the metric. Must be a numeric value.\\\\n \\\\\"format\\\\\": \\\\\"PERCENTAGE\\\\\", # The optional format of the metric. 
Supported values are \\\\\"RAW\\\\\" (displayed in raw format) and \\\\\"PERCENTAGE\\\\\" (displayed in percentage format).\\\\n },\\\\n ]\\\\n }\\\\n with open(mlpipeline_metrics_path, \\\\\"w\\\\\") as f:\\\\n json.dump(metrics, f)\\\\n\\\\nimport argparse\\\\n_parser = argparse.ArgumentParser(prog=\\'Train\\', description=\\'This is the simulated train part of our ML pipeline where training is performed\\')\\\\n_parser.add_argument(\\\\\"--data-train\\\\\", dest=\\\\\"data_train_path\\\\\", type=str, required=True, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--lr\\\\\", dest=\\\\\"lr\\\\\", type=float, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--optimizer\\\\\", dest=\\\\\"optimizer\\\\\", type=str, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--loss\\\\\", dest=\\\\\"loss\\\\\", type=str, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--epochs\\\\\", dest=\\\\\"epochs\\\\\", type=int, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--batch-size\\\\\", dest=\\\\\"batch_size\\\\\", type=int, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--model-out\\\\\", dest=\\\\\"model_out_path\\\\\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--mlpipeline-metrics\\\\\", dest=\\\\\"mlpipeline_metrics_path\\\\\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\\\n_parsed_args = vars(_parser.parse_args())\\\\n\\\\n_outputs = train(**_parsed_args)\\\\n\"], \"image\": \"tensorflow/tensorflow:2.7.1\"}}, \"inputs\": [{\"name\": \"data_train\", \"type\": \"String\"}, {\"default\": \"0.0001\", \"name\": \"lr\", \"optional\": true, \"type\": \"Float\"}, {\"default\": \"Adam\", \"name\": \"optimizer\", \"optional\": true, \"type\": \"String\"}, {\"default\": \"categorical_crossentropy\", \"name\": \"loss\", \"optional\": true, 
\"type\": \"String\"}, {\"default\": \"1\", \"name\": \"epochs\", \"optional\": true, \"type\": \"Integer\"}, {\"default\": \"32\", \"name\": \"batch_size\", \"optional\": true, \"type\": \"Integer\"}], \"name\": \"Train\", \"outputs\": [{\"name\": \"model_out\", \"type\": \"String\"}, {\"name\": \"mlpipeline_metrics\", \"type\": \"Metrics\"}]}',\n", + " 'pipelines.kubeflow.org/max_cache_staleness': 'P0D',\n", + " 'pipelines.kubeflow.org/task_display_name': 'Fit the model'},\n", + " 'labels': {'katib.kubeflow.org/model-training': 'true',\n", + " 'pipelines.kubeflow.org/enable_caching': 'true',\n", + " 'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline-sdk-type': 'kfp'}},\n", + " 'name': 'train',\n", + " 'outputs': {'artifacts': [{'name': 'mlpipeline-metrics',\n", + " 'path': '/tmp/outputs/mlpipeline_metrics/data'},\n", + " {'name': 'train-model_out',\n", + " 'path': '/tmp/outputs/model_out/data'}]}}]}}}}}" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "katib_client.create_experiment(katib_experiment)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should now be able to observe in the Web UI how the Katib\n", + "Experiment is running.\n", + "\n", + "To see how the `Argo Workflows` are started, you can also check the Kubernetes cluster:\n", + "\n", + "`kubectl get Workflow -n `" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Minimal example pipeline for e2e testing\n", + "\n", + "The following part generates a minimal Katib Experiment for e2e testing" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "def prep_e2e(\n", + " output_nr_path: OutputPath(int), # type: ignore # noqa: F821\n", + " histogram_norm: bool = True,\n", + "):\n", + " with open(output_nr_path, 'w') as writer:\n", + " 
writer.write(str(int(histogram_norm)))\n", + " \n", + "prep_e2e_op = create_component_from_func(\n", + " func=prep_e2e\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "def train_e2e(\n", + " input_nr_path: InputPath(int), # type: ignore # noqa: F821\n", + " mlpipeline_metrics_path: OutputPath(\"Metrics\"), # type: ignore # noqa: F821\n", + " lr: float = 1e-4,\n", + " optimizer: str = \"Adam\",\n", + " loss: str = \"categorical_crossentropy\",\n", + " epochs: int = 1,\n", + " batch_size: int = 32,\n", + "):\n", + " \"\"\"\n", + " This is the simulated train part of our ML pipeline where training is performed\n", + " \"\"\"\n", + " import json \n", + " import time\n", + " with open(input_nr_path, 'r') as reader:\n", + " line = reader.readline()\n", + " histogram_norm_value = int(line)\n", + "\n", + " accuracy = (batch_size + histogram_norm_value)/ (batch_size + epochs+histogram_norm_value)\n", + " val_accuracy = accuracy * 0.9\n", + " metrics = {\n", + " \"metrics\": [\n", + " {\n", + " \"name\": \"accuracy\", # The name of the metric. Visualized as the column name in the runs table.\n", + " \"numberValue\": accuracy, # The value of the metric. Must be a numeric value.\n", + " \"format\": \"PERCENTAGE\", # The optional format of the metric. Supported values are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed in percentage format).\n", + " },\n", + " {\n", + " \"name\": \"val-accuracy\", # The name of the metric. Visualized as the column name in the runs table.\n", + " \"numberValue\": val_accuracy, # The value of the metric. Must be a numeric value.\n", + " \"format\": \"PERCENTAGE\", # The optional format of the metric. 
Supported values are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed in percentage format).\n", + " },\n", + " ]\n", + " }\n", + " with open(mlpipeline_metrics_path, \"w\") as f:\n", + " json.dump(metrics, f)\n", + " \n", + " # If this step is too fast, the metrics collector fails as the\n", + " # pod is already finished before it can collect the metrics.\n", + " time.sleep(10)\n", + "\n", + "\n", + "train_e2e_op = create_component_from_func(\n", + " func=train_e2e\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "@dsl.pipeline(\n", + " name=\"Minimal KFP1 pipeline for e2e testing\",\n", + " description=\"\",\n", + ")\n", + "def e2e_example_pipeline(\n", + " lr: float = 1e-4,\n", + " optimizer: str = \"Adam\",\n", + " loss: str = \"categorical_crossentropy\",\n", + " epochs: int = 3,\n", + " batch_size: int = 5,\n", + " histogram_norm: bool = False,\n", + "):\n", + " prep_e2e_output = (\n", + " prep_e2e_op(\n", + " histogram_norm=histogram_norm,\n", + " )\n", + " .set_display_name(\"Prepare a dummy output that should be cached\")\n", + " )\n", + " _label_cache(prep_e2e_output)\n", + "\n", + " training_output = (\n", + " train_e2e_op(\n", + " prep_e2e_output.output,\n", + " lr=lr,\n", + " optimizer=optimizer,\n", + " epochs=epochs,\n", + " batch_size=batch_size,\n", + " loss=loss,\n", + " )\n", + " )\n", + " training_output.set_display_name(\"Generate dummy metrics\")\n", + " # This pod label indicates which pod Katib should collect the metric from.\n", + " # A metrics collecting sidecar container will be added\n", + " training_output.add_pod_label(\"katib.kubeflow.org/model-training\", \"true\")\n", + " # This step needs to run always, as otherwise the metrics for Katib could not\n", + " # be collected.\n", + " training_output.execution_options.caching_strategy.max_cache_staleness = \"P0D\"" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, +
"outputs": [ + { + "data": { + "text/html": [ + "Experiment details." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run details." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "kfp_run = f\"e2e-example-{dt.today().strftime('%Y-%m-%d-%Hh-%Mm-%Ss')}\"\n", + "run = kfp_client.create_run_from_pipeline_func(\n", + " e2e_example_pipeline,\n", + " mode=kfp.dsl.PipelineExecutionMode.V1_LEGACY,\n", + " # You can optionally override your pipeline_root when submitting the run too:\n", + " # pipeline_root='gs://my-pipeline-root/example-pipeline',\n", + " arguments={\"histogram_norm\": \"0\"},\n", + " experiment_name=KFP_EXPERIMENT,\n", + " run_name=kfp_run,\n", + " # In a multiuser setup, provide the namespace\n", + " #namespace=USER_NAMESPACE,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the full spec\n", + "\n", + "katib_e2e_spec = create_katib_experiment_spec(\n", + " pipeline=e2e_example_pipeline,\n", + " pipeline_params=pipeline_params,\n", + " trial_params=trial_params_specs,\n", + " trial_params_space=parameter_space,\n", + " objective=objective,\n", + " algorithm=algorithm,\n", + " pipeline_service_account=KFP_SERVICE_ACCOUNT,\n", + " max_trial_count=5,\n", + " parallel_trial_count=5,\n", + " retain_pods=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the experiment\n", + "\n", + "katib_e2e_experiment_name = (\n", + " f\"katib-e2e-{dt.today().strftime('%Y-%m-%d-%Hh-%Mm-%Ss')}\"\n", + ")\n", + "katib_e2e_experiment = V1beta1Experiment(\n", + " api_version=\"kubeflow.org/v1beta1\",\n", + " kind=\"Experiment\",\n", + " metadata=V1ObjectMeta(\n", + " name=katib_e2e_experiment_name,\n", + " namespace=USER_NAMESPACE,\n", + " ),\n", + "
spec=katib_e2e_spec,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "with open(f\"{KATIB_E2E_EXPERIMENT}.yaml\", \"w\") as f:\n", + " yaml.dump(ApiClient().sanitize_for_serialization(katib_e2e_experiment), f)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'apiVersion': 'kubeflow.org/v1beta1',\n", + " 'kind': 'Experiment',\n", + " 'metadata': {'creationTimestamp': '2023-07-20T20:37:59Z',\n", + " 'generation': 1,\n", + " 'managedFields': [{'apiVersion': 'kubeflow.org/v1beta1',\n", + " 'fieldsType': 'FieldsV1',\n", + " 'fieldsV1': {'f:spec': {'.': {},\n", + " 'f:algorithm': {'.': {}, 'f:algorithmName': {}},\n", + " 'f:maxFailedTrialCount': {},\n", + " 'f:maxTrialCount': {},\n", + " 'f:metricsCollectorSpec': {'.': {},\n", + " 'f:collector': {'.': {},\n", + " 'f:customCollector': {'.': {},\n", + " 'f:args': {},\n", + " 'f:env': {},\n", + " 'f:image': {},\n", + " 'f:imagePullPolicy': {},\n", + " 'f:name': {}},\n", + " 'f:kind': {}},\n", + " 'f:source': {'.': {},\n", + " 'f:fileSystemPath': {'.': {}, 'f:kind': {}, 'f:path': {}}}},\n", + " 'f:objective': {'.': {},\n", + " 'f:additionalMetricNames': {},\n", + " 'f:goal': {},\n", + " 'f:objectiveMetricName': {},\n", + " 'f:type': {}},\n", + " 'f:parallelTrialCount': {},\n", + " 'f:parameters': {},\n", + " 'f:trialTemplate': {'.': {},\n", + " 'f:failureCondition': {},\n", + " 'f:primaryContainerName': {},\n", + " 'f:primaryPodLabels': {'.': {},\n", + " 'f:katib.kubeflow.org/model-training': {}},\n", + " 'f:retain': {},\n", + " 'f:successCondition': {},\n", + " 'f:trialParameters': {},\n", + " 'f:trialSpec': {'.': {},\n", + " 'f:apiVersion': {},\n", + " 'f:kind': {},\n", + " 'f:metadata': {'.': {},\n", + " 'f:annotations': {'.': {},\n", + " 'f:pipelines.kubeflow.org/kfp_sdk_version': {},\n", + " 'f:pipelines.kubeflow.org/pipeline_compilation_time': {},\n", + " 
'f:pipelines.kubeflow.org/pipeline_spec': {}},\n", + " 'f:generateName': {},\n", + " 'f:labels': {'.': {},\n", + " 'f:pipelines.kubeflow.org/kfp_sdk_version': {}}},\n", + " 'f:spec': {'.': {},\n", + " 'f:arguments': {'.': {}, 'f:parameters': {}},\n", + " 'f:entrypoint': {},\n", + " 'f:serviceAccountName': {},\n", + " 'f:templates': {}}}}}},\n", + " 'manager': 'OpenAPI-Generator',\n", + " 'operation': 'Update',\n", + " 'time': '2023-07-20T20:37:59Z'}],\n", + " 'name': 'katib-e2e-2023-07-20-22h-37m-57s',\n", + " 'namespace': 'kubeflow',\n", + " 'resourceVersion': '11759',\n", + " 'uid': 'c91aa6c9-8a2b-434d-9ab8-c4a317210893'},\n", + " 'spec': {'algorithm': {'algorithmName': 'random'},\n", + " 'maxFailedTrialCount': 2,\n", + " 'maxTrialCount': 5,\n", + " 'metricsCollectorSpec': {'collector': {'customCollector': {'args': ['-m',\n", + " 'val-accuracy;accuracy',\n", + " '-s',\n", + " 'katib-db-manager.kubeflow:6789',\n", + " '-t',\n", + " '$(PodName)',\n", + " '-path',\n", + " '/tmp/outputs/mlpipeline_metrics'],\n", + " 'env': [{'name': 'PodName',\n", + " 'valueFrom': {'fieldRef': {'fieldPath': 'metadata.name'}}}],\n", + " 'image': 'docker.io/votti/kfpv1-metricscollector:v0.0.10',\n", + " 'imagePullPolicy': 'Always',\n", + " 'name': 'custom-metrics-logger-and-collector',\n", + " 'resources': {}},\n", + " 'kind': 'Custom'},\n", + " 'source': {'fileSystemPath': {'kind': 'File',\n", + " 'path': '/tmp/outputs/mlpipeline_metrics/data'}}},\n", + " 'objective': {'additionalMetricNames': ['accuracy'],\n", + " 'goal': 0.9,\n", + " 'metricStrategies': [{'name': 'val-accuracy', 'value': 'max'},\n", + " {'name': 'accuracy', 'value': 'max'}],\n", + " 'objectiveMetricName': 'val-accuracy',\n", + " 'type': 'maximize'},\n", + " 'parallelTrialCount': 5,\n", + " 'parameters': [{'feasibleSpace': {'max': '0.001', 'min': '0.00001'},\n", + " 'name': 'learning_rate',\n", + " 'parameterType': 'double'},\n", + " {'feasibleSpace': {'max': '64', 'min': '16'},\n", + " 'name': 'batch_size',\n", + " 
'parameterType': 'int'},\n", + " {'feasibleSpace': {'list': ['0', '1']},\n", + " 'name': 'histogram_norm',\n", + " 'parameterType': 'discrete'}],\n", + " 'resumePolicy': 'Never',\n", + " 'trialTemplate': {'failureCondition': 'status.[@this].#(phase==\"Failed\")#',\n", + " 'primaryContainerName': 'main',\n", + " 'primaryPodLabels': {'katib.kubeflow.org/model-training': 'true'},\n", + " 'successCondition': 'status.[@this].#(phase==\"Succeeded\")#',\n", + " 'trialParameters': [{'description': 'Learning rate for the training model',\n", + " 'name': 'learningRate',\n", + " 'reference': 'learning_rate'},\n", + " {'description': 'Batch size for NN training',\n", + " 'name': 'batchSize',\n", + " 'reference': 'batch_size'},\n", + " {'description': 'Histogram normalization of image on?',\n", + " 'name': 'histogramNorm',\n", + " 'reference': 'histogram_norm'}],\n", + " 'trialSpec': {'apiVersion': 'argoproj.io/v1alpha1',\n", + " 'kind': 'Workflow',\n", + " 'metadata': {'annotations': {'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline_compilation_time': '2023-07-20T22:37:57.355215',\n", + " 'pipelines.kubeflow.org/pipeline_spec': '{\"inputs\": [{\"default\": \"0.0001\", \"name\": \"lr\", \"optional\": true, \"type\": \"Float\"}, {\"default\": \"Adam\", \"name\": \"optimizer\", \"optional\": true, \"type\": \"String\"}, {\"default\": \"categorical_crossentropy\", \"name\": \"loss\", \"optional\": true, \"type\": \"String\"}, {\"default\": \"3\", \"name\": \"epochs\", \"optional\": true, \"type\": \"Integer\"}, {\"default\": \"5\", \"name\": \"batch_size\", \"optional\": true, \"type\": \"Integer\"}, {\"default\": \"False\", \"name\": \"histogram_norm\", \"optional\": true, \"type\": \"Boolean\"}, {\"default\": \"${trialParameters.learningRate}\", \"name\": \"lr\"}, {\"default\": \"${trialParameters.batchSize}\", \"name\": \"batch_size\"}, {\"default\": \"${trialParameters.histogramNorm}\", \"name\": \"histogram_norm\"}], \"name\": 
\"Minimal KFP1 pipeline for e2e testing\"}'},\n", + " 'generateName': 'minimal-kfp1-pipeline-for-e2e-testing-',\n", + " 'labels': {'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12'}},\n", + " 'spec': {'arguments': {'parameters': [{'name': 'lr',\n", + " 'value': '${trialParameters.learningRate}'},\n", + " {'name': 'optimizer', 'value': 'Adam'},\n", + " {'name': 'loss', 'value': 'categorical_crossentropy'},\n", + " {'name': 'epochs', 'value': '3'},\n", + " {'name': 'batch_size', 'value': '${trialParameters.batchSize}'},\n", + " {'name': 'histogram_norm',\n", + " 'value': '${trialParameters.histogramNorm}'}]},\n", + " 'entrypoint': 'minimal-kfp1-pipeline-for-e2e-testing',\n", + " 'serviceAccountName': 'pipeline-runner',\n", + " 'templates': [{'dag': {'tasks': [{'arguments': {'parameters': [{'name': 'histogram_norm',\n", + " 'value': '{{inputs.parameters.histogram_norm}}'}]},\n", + " 'name': 'prep-e2e',\n", + " 'template': 'prep-e2e'},\n", + " {'arguments': {'artifacts': [{'from': '{{tasks.prep-e2e.outputs.artifacts.prep-e2e-output_nr}}',\n", + " 'name': 'prep-e2e-output_nr'}],\n", + " 'parameters': [{'name': 'batch_size',\n", + " 'value': '{{inputs.parameters.batch_size}}'},\n", + " {'name': 'epochs', 'value': '{{inputs.parameters.epochs}}'},\n", + " {'name': 'loss', 'value': '{{inputs.parameters.loss}}'},\n", + " {'name': 'lr', 'value': '{{inputs.parameters.lr}}'},\n", + " {'name': 'optimizer',\n", + " 'value': '{{inputs.parameters.optimizer}}'}]},\n", + " 'dependencies': ['prep-e2e'],\n", + " 'name': 'train-e2e',\n", + " 'template': 'train-e2e'}]},\n", + " 'inputs': {'parameters': [{'name': 'batch_size'},\n", + " {'name': 'epochs'},\n", + " {'name': 'histogram_norm'},\n", + " {'name': 'loss'},\n", + " {'name': 'lr'},\n", + " {'name': 'optimizer'}]},\n", + " 'name': 'minimal-kfp1-pipeline-for-e2e-testing'},\n", + " {'container': {'args': ['--histogram-norm',\n", + " '{{inputs.parameters.histogram_norm}}',\n", + " '--output-nr',\n", + " 
'/tmp/outputs/output_nr/data'],\n", + " 'command': ['sh',\n", + " '-ec',\n", + " 'program_path=$(mktemp)\\nprintf \"%s\" \"$0\" > \"$program_path\"\\npython3 -u \"$program_path\" \"$@\"\\n',\n", + " 'def _make_parent_dirs_and_return_path(file_path: str):\\n import os\\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\\n return file_path\\n\\ndef prep_e2e(\\n output_nr_path, # type: ignore # noqa: F821\\n histogram_norm = True,\\n):\\n with open(output_nr_path, \\'w\\') as writer:\\n writer.write(str(int(histogram_norm)))\\n\\ndef _deserialize_bool(s) -> bool:\\n from distutils.util import strtobool\\n return strtobool(s) == 1\\n\\nimport argparse\\n_parser = argparse.ArgumentParser(prog=\\'Prep e2e\\', description=\\'\\')\\n_parser.add_argument(\"--histogram-norm\", dest=\"histogram_norm\", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--output-nr\", dest=\"output_nr_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\n_parsed_args = vars(_parser.parse_args())\\n\\n_outputs = prep_e2e(**_parsed_args)\\n'],\n", + " 'image': 'python:3.7'},\n", + " 'inputs': {'parameters': [{'name': 'histogram_norm'}]},\n", + " 'metadata': {'annotations': {'pipelines.kubeflow.org/arguments.parameters': '{\"histogram_norm\": \"{{inputs.parameters.histogram_norm}}\"}',\n", + " 'pipelines.kubeflow.org/component_ref': '{}',\n", + " 'pipelines.kubeflow.org/component_spec': '{\"implementation\": {\"container\": {\"args\": [{\"if\": {\"cond\": {\"isPresent\": \"histogram_norm\"}, \"then\": [\"--histogram-norm\", {\"inputValue\": \"histogram_norm\"}]}}, \"--output-nr\", {\"outputPath\": \"output_nr\"}], \"command\": [\"sh\", \"-ec\", \"program_path=$(mktemp)\\\\nprintf \\\\\"%s\\\\\" \\\\\"$0\\\\\" > \\\\\"$program_path\\\\\"\\\\npython3 -u \\\\\"$program_path\\\\\" \\\\\"$@\\\\\"\\\\n\", \"def _make_parent_dirs_and_return_path(file_path: str):\\\\n import os\\\\n 
os.makedirs(os.path.dirname(file_path), exist_ok=True)\\\\n return file_path\\\\n\\\\ndef prep_e2e(\\\\n output_nr_path, # type: ignore # noqa: F821\\\\n histogram_norm = True,\\\\n):\\\\n with open(output_nr_path, \\'w\\') as writer:\\\\n writer.write(str(int(histogram_norm)))\\\\n\\\\ndef _deserialize_bool(s) -> bool:\\\\n from distutils.util import strtobool\\\\n return strtobool(s) == 1\\\\n\\\\nimport argparse\\\\n_parser = argparse.ArgumentParser(prog=\\'Prep e2e\\', description=\\'\\')\\\\n_parser.add_argument(\\\\\"--histogram-norm\\\\\", dest=\\\\\"histogram_norm\\\\\", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--output-nr\\\\\", dest=\\\\\"output_nr_path\\\\\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\\\n_parsed_args = vars(_parser.parse_args())\\\\n\\\\n_outputs = prep_e2e(**_parsed_args)\\\\n\"], \"image\": \"python:3.7\"}}, \"inputs\": [{\"default\": \"True\", \"name\": \"histogram_norm\", \"optional\": true, \"type\": \"Boolean\"}], \"name\": \"Prep e2e\", \"outputs\": [{\"name\": \"output_nr\", \"type\": \"Integer\"}]}',\n", + " 'pipelines.kubeflow.org/task_display_name': 'Prepare a dummy output that should be cached'},\n", + " 'labels': {'pipelines.kubeflow.org/cache_enabled': 'true',\n", + " 'pipelines.kubeflow.org/enable_caching': 'true',\n", + " 'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline-sdk-type': 'kfp'}},\n", + " 'name': 'prep-e2e',\n", + " 'outputs': {'artifacts': [{'name': 'prep-e2e-output_nr',\n", + " 'path': '/tmp/outputs/output_nr/data'}]}},\n", + " {'container': {'args': ['--input-nr',\n", + " '/tmp/inputs/input_nr/data',\n", + " '--lr',\n", + " '{{inputs.parameters.lr}}',\n", + " '--optimizer',\n", + " '{{inputs.parameters.optimizer}}',\n", + " '--loss',\n", + " '{{inputs.parameters.loss}}',\n", + " '--epochs',\n", + " '{{inputs.parameters.epochs}}',\n", + " '--batch-size',\n", + " 
'{{inputs.parameters.batch_size}}',\n", + " '--mlpipeline-metrics',\n", + " '/tmp/outputs/mlpipeline_metrics/data'],\n", + " 'command': ['sh',\n", + " '-ec',\n", + " 'program_path=$(mktemp)\\nprintf \"%s\" \"$0\" > \"$program_path\"\\npython3 -u \"$program_path\" \"$@\"\\n',\n", + " 'def _make_parent_dirs_and_return_path(file_path: str):\\n import os\\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\\n return file_path\\n\\ndef train_e2e(\\n input_nr_path, # type: ignore # noqa: F821\\n mlpipeline_metrics_path, # type: ignore # noqa: F821\\n lr = 1e-4,\\n optimizer = \"Adam\",\\n loss = \"categorical_crossentropy\",\\n epochs = 1,\\n batch_size = 32,\\n):\\n \"\"\"\\n This is the simulated train part of our ML pipeline where training is performed\\n \"\"\"\\n import json \\n import time\\n with open(input_nr_path, \\'r\\') as reader:\\n line = reader.readline()\\n histogram_norm_value = int(line)\\n\\n accuracy = (batch_size + histogram_norm_value)/ (batch_size + epochs+histogram_norm_value)\\n val_accuracy = accuracy * 0.9\\n metrics = {\\n \"metrics\": [\\n {\\n \"name\": \"accuracy\", # The name of the metric. Visualized as the column name in the runs table.\\n \"numberValue\": accuracy, # The value of the metric. Must be a numeric value.\\n \"format\": \"PERCENTAGE\", # The optional format of the metric. Supported values are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed in percentage format).\\n },\\n {\\n \"name\": \"val-accuracy\", # The name of the metric. Visualized as the column name in the runs table.\\n \"numberValue\": val_accuracy, # The value of the metric. Must be a numeric value.\\n \"format\": \"PERCENTAGE\", # The optional format of the metric. 
Supported values are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed in percentage format).\\n },\\n ]\\n }\\n with open(mlpipeline_metrics_path, \"w\") as f:\\n json.dump(metrics, f)\\n\\n # If this step is to fast, the metrics collector fails as the\\n # pod is already finished before it can collect the metrics.\\n time.sleep(10)\\n\\nimport argparse\\n_parser = argparse.ArgumentParser(prog=\\'Train e2e\\', description=\\'This is the simulated train part of our ML pipeline where training is performed\\')\\n_parser.add_argument(\"--input-nr\", dest=\"input_nr_path\", type=str, required=True, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--lr\", dest=\"lr\", type=float, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--optimizer\", dest=\"optimizer\", type=str, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--loss\", dest=\"loss\", type=str, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--epochs\", dest=\"epochs\", type=int, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--batch-size\", dest=\"batch_size\", type=int, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--mlpipeline-metrics\", dest=\"mlpipeline_metrics_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\n_parsed_args = vars(_parser.parse_args())\\n\\n_outputs = train_e2e(**_parsed_args)\\n'],\n", + " 'image': 'python:3.7'},\n", + " 'inputs': {'artifacts': [{'name': 'prep-e2e-output_nr',\n", + " 'path': '/tmp/inputs/input_nr/data'}],\n", + " 'parameters': [{'name': 'batch_size'},\n", + " {'name': 'epochs'},\n", + " {'name': 'loss'},\n", + " {'name': 'lr'},\n", + " {'name': 'optimizer'}]},\n", + " 'metadata': {'annotations': {'pipelines.kubeflow.org/arguments.parameters': '{\"batch_size\": \"{{inputs.parameters.batch_size}}\", \"epochs\": \"{{inputs.parameters.epochs}}\", \"loss\": \"{{inputs.parameters.loss}}\", \"lr\": 
\"{{inputs.parameters.lr}}\", \"optimizer\": \"{{inputs.parameters.optimizer}}\"}',\n", + " 'pipelines.kubeflow.org/component_ref': '{}',\n", + " 'pipelines.kubeflow.org/component_spec': '{\"description\": \"This is the simulated train part of our ML pipeline where training is performed\", \"implementation\": {\"container\": {\"args\": [\"--input-nr\", {\"inputPath\": \"input_nr\"}, {\"if\": {\"cond\": {\"isPresent\": \"lr\"}, \"then\": [\"--lr\", {\"inputValue\": \"lr\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"optimizer\"}, \"then\": [\"--optimizer\", {\"inputValue\": \"optimizer\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"loss\"}, \"then\": [\"--loss\", {\"inputValue\": \"loss\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"epochs\"}, \"then\": [\"--epochs\", {\"inputValue\": \"epochs\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"batch_size\"}, \"then\": [\"--batch-size\", {\"inputValue\": \"batch_size\"}]}}, \"--mlpipeline-metrics\", {\"outputPath\": \"mlpipeline_metrics\"}], \"command\": [\"sh\", \"-ec\", \"program_path=$(mktemp)\\\\nprintf \\\\\"%s\\\\\" \\\\\"$0\\\\\" > \\\\\"$program_path\\\\\"\\\\npython3 -u \\\\\"$program_path\\\\\" \\\\\"$@\\\\\"\\\\n\", \"def _make_parent_dirs_and_return_path(file_path: str):\\\\n import os\\\\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\\\\n return file_path\\\\n\\\\ndef train_e2e(\\\\n input_nr_path, # type: ignore # noqa: F821\\\\n mlpipeline_metrics_path, # type: ignore # noqa: F821\\\\n lr = 1e-4,\\\\n optimizer = \\\\\"Adam\\\\\",\\\\n loss = \\\\\"categorical_crossentropy\\\\\",\\\\n epochs = 1,\\\\n batch_size = 32,\\\\n):\\\\n \\\\\"\\\\\"\\\\\"\\\\n This is the simulated train part of our ML pipeline where training is performed\\\\n \\\\\"\\\\\"\\\\\"\\\\n import json \\\\n import time\\\\n with open(input_nr_path, \\'r\\') as reader:\\\\n line = reader.readline()\\\\n histogram_norm_value = int(line)\\\\n\\\\n accuracy = (batch_size + histogram_norm_value)/ (batch_size + 
epochs+histogram_norm_value)\\\\n val_accuracy = accuracy * 0.9\\\\n metrics = {\\\\n \\\\\"metrics\\\\\": [\\\\n {\\\\n \\\\\"name\\\\\": \\\\\"accuracy\\\\\", # The name of the metric. Visualized as the column name in the runs table.\\\\n \\\\\"numberValue\\\\\": accuracy, # The value of the metric. Must be a numeric value.\\\\n \\\\\"format\\\\\": \\\\\"PERCENTAGE\\\\\", # The optional format of the metric. Supported values are \\\\\"RAW\\\\\" (displayed in raw format) and \\\\\"PERCENTAGE\\\\\" (displayed in percentage format).\\\\n },\\\\n {\\\\n \\\\\"name\\\\\": \\\\\"val-accuracy\\\\\", # The name of the metric. Visualized as the column name in the runs table.\\\\n \\\\\"numberValue\\\\\": val_accuracy, # The value of the metric. Must be a numeric value.\\\\n \\\\\"format\\\\\": \\\\\"PERCENTAGE\\\\\", # The optional format of the metric. Supported values are \\\\\"RAW\\\\\" (displayed in raw format) and \\\\\"PERCENTAGE\\\\\" (displayed in percentage format).\\\\n },\\\\n ]\\\\n }\\\\n with open(mlpipeline_metrics_path, \\\\\"w\\\\\") as f:\\\\n json.dump(metrics, f)\\\\n\\\\n # If this step is to fast, the metrics collector fails as the\\\\n # pod is already finished before it can collect the metrics.\\\\n time.sleep(10)\\\\n\\\\nimport argparse\\\\n_parser = argparse.ArgumentParser(prog=\\'Train e2e\\', description=\\'This is the simulated train part of our ML pipeline where training is performed\\')\\\\n_parser.add_argument(\\\\\"--input-nr\\\\\", dest=\\\\\"input_nr_path\\\\\", type=str, required=True, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--lr\\\\\", dest=\\\\\"lr\\\\\", type=float, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--optimizer\\\\\", dest=\\\\\"optimizer\\\\\", type=str, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--loss\\\\\", dest=\\\\\"loss\\\\\", type=str, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--epochs\\\\\", 
dest=\\\\\"epochs\\\\\", type=int, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--batch-size\\\\\", dest=\\\\\"batch_size\\\\\", type=int, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--mlpipeline-metrics\\\\\", dest=\\\\\"mlpipeline_metrics_path\\\\\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\\\n_parsed_args = vars(_parser.parse_args())\\\\n\\\\n_outputs = train_e2e(**_parsed_args)\\\\n\"], \"image\": \"python:3.7\"}}, \"inputs\": [{\"name\": \"input_nr\", \"type\": \"Integer\"}, {\"default\": \"0.0001\", \"name\": \"lr\", \"optional\": true, \"type\": \"Float\"}, {\"default\": \"Adam\", \"name\": \"optimizer\", \"optional\": true, \"type\": \"String\"}, {\"default\": \"categorical_crossentropy\", \"name\": \"loss\", \"optional\": true, \"type\": \"String\"}, {\"default\": \"1\", \"name\": \"epochs\", \"optional\": true, \"type\": \"Integer\"}, {\"default\": \"32\", \"name\": \"batch_size\", \"optional\": true, \"type\": \"Integer\"}], \"name\": \"Train e2e\", \"outputs\": [{\"name\": \"mlpipeline_metrics\", \"type\": \"Metrics\"}]}',\n", + " 'pipelines.kubeflow.org/max_cache_staleness': 'P0D',\n", + " 'pipelines.kubeflow.org/task_display_name': 'Generate dummy metrics'},\n", + " 'labels': {'katib.kubeflow.org/model-training': 'true',\n", + " 'pipelines.kubeflow.org/enable_caching': 'true',\n", + " 'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline-sdk-type': 'kfp'}},\n", + " 'name': 'train-e2e',\n", + " 'outputs': {'artifacts': [{'name': 'mlpipeline-metrics',\n", + " 'path': '/tmp/outputs/mlpipeline_metrics/data'}]}}]}}}}}" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "katib_client.create_experiment(katib_e2e_experiment)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "katibdev", + "language": "python", + "name": "katibdev" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + }, + "vscode": { + "interpreter": { + "hash": "346a4e9d8b8e6802b68a0916b92683cfb1882082eeafaaae0a3525ab995e1047" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pkg/metricscollector/v1beta1/common/const.py b/pkg/metricscollector/v1beta1/common/const.py index f3bdf56af46..c155cd04945 100644 --- a/pkg/metricscollector/v1beta1/common/const.py +++ b/pkg/metricscollector/v1beta1/common/const.py @@ -20,6 +20,8 @@ DEFAULT_WAIT_ALL_PROCESSES = "True" # Default value for directory where TF event metrics are reported DEFAULT_METRICS_FILE_DIR = "/log" +# Default value for directory where Kubeflow pipeline metrics are reported +DEFAULT_METRICS_FILE_KFPV1_DIR = "/tmp/outputs/mlpipeline_metrics" # Job finished marker in $$$$.pid file when main process is completed TRAINING_COMPLETED = "completed" diff --git a/pkg/metricscollector/v1beta1/kfp-metricscollector/v1/__init__.py b/pkg/metricscollector/v1beta1/kfp-metricscollector/v1/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/pkg/metricscollector/v1beta1/kfp-metricscollector/v1/metrics_loader.py b/pkg/metricscollector/v1beta1/kfp-metricscollector/v1/metrics_loader.py new file mode 100644 index 00000000000..90e1764b7e8 --- /dev/null +++ b/pkg/metricscollector/v1beta1/kfp-metricscollector/v1/metrics_loader.py @@ -0,0 +1,110 @@ +# Copyright 2023 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
class KFPMetricParser:
    """Parser for Kubeflow Pipelines (v1) metrics files.

    A metrics file is a JSON document with a top-level ``"metrics"`` list whose
    entries carry a ``"name"`` and a ``"numberValue"``. Only entries whose name
    is listed in ``metric_names`` are turned into metric logs.
    """

    def __init__(self, metric_names):
        # Names of the metrics that should be reported.
        self.metric_names = metric_names

    @staticmethod
    def find_all_files(directory):
        """Yield the path of every file found (recursively) below ``directory``."""
        for root, _dirs, files in os.walk(directory):
            for f in files:
                yield os.path.join(root, f)

    # The return annotation is quoted so that it is not evaluated when the
    # method is defined.
    def parse_metrics(self, metric_file_path: str) -> "List[api_pb2.MetricLog]":
        """Parse a Kubeflow Pipelines metrics file.

        Args:
            metric_file_path (str): path to the metrics file.

        Returns:
            List[api_pb2.MetricLog]: one log entry per metric in the file whose
            name is contained in ``self.metric_names``. Every entry is stamped
            with the time at which it was parsed.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the file is not valid JSON.
            KeyError: if the ``"metrics"`` list, or the ``"name"`` /
                ``"numberValue"`` of an entry, is missing.
        """
        metrics = []
        # JSON documents are UTF-8; do not depend on the locale's default encoding.
        with open(metric_file_path, encoding="utf-8") as f:
            metrics_dict = json.load(f)
        for m in metrics_dict["metrics"]:
            name = m["name"]
            value = m["numberValue"]
            if name in self.metric_names:
                ml = api_pb2.MetricLog(
                    time_stamp=rfc3339.rfc3339(datetime.now()),
                    metric=api_pb2.Metric(name=name, value=str(value)),
                )
                metrics.append(ml)
        return metrics


class MetricsCollector:
    """Collects the metrics reported in Kubeflow Pipelines metrics files.

    ``metric_names`` lists the metrics to collect; the objective metric is
    expected at index 0.
    """

    def __init__(self, metric_names):
        self.logger = getLogger(__name__)
        self.logger.setLevel(INFO)
        self.logger.propagate = False
        # getLogger() hands back the same logger object on every call, so only
        # attach a handler the first time. Otherwise each additional
        # MetricsCollector instance would duplicate every log line.
        if not self.logger.handlers:
            handler = StreamHandler()
            handler.setLevel(INFO)
            self.logger.addHandler(handler)
        self.metrics = metric_names
        self.parser = KFPMetricParser(metric_names)

    def parse_file(self, directory):
        """Parse all Kubeflow Pipelines metrics files below ``directory``.

        Files that cannot be parsed are logged with a warning and skipped.

        Args:
            directory (str): directory that is searched recursively for
                metrics files.

        Returns:
            api_pb2.ObservationLog: the collected metric logs. If the objective
            metric (``self.metrics[0]``) was not reported by any file, the log
            contains a single entry for it with
            ``const.UNAVAILABLE_METRIC_VALUE`` as value.
        """
        mls = []
        for f in self.parser.find_all_files(directory):
            if os.path.isdir(f):
                continue
            try:
                self.logger.info(f + " will be parsed.")
                mls.extend(self.parser.parse_metrics(f))
            except Exception as e:
                # Best effort: one broken file must not abort the collection.
                self.logger.warning("Unexpected error: " + str(e))
                continue

        # Metrics logs must contain at least one objective metric value.
        # Objective metric is located at first index.
        is_objective_metric_reported = any(
            ml.metric.name == self.metrics[0] for ml in mls
        )
        # If objective metrics were not reported, insert unavailable value in the DB.
        if not is_objective_metric_reported:
            mls = [
                api_pb2.MetricLog(
                    time_stamp=rfc3339.rfc3339(datetime.now()),
                    metric=api_pb2.Metric(
                        name=self.metrics[0], value=const.UNAVAILABLE_METRIC_VALUE
                    ),
                )
            ]
            self.logger.info(
                "Objective metric {} is not found in metrics file, {} value is reported".format(
                    self.metrics[0], const.UNAVAILABLE_METRIC_VALUE
                )
            )

        return api_pb2.ObservationLog(metric_logs=mls)
+ echo -e "\nBuilding TF Event metrics collector image...\n" if [ "${ARCH}" == "ppc64le" ]; then docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/tfevent-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile.ppc64le . diff --git a/scripts/v1beta1/push.sh b/scripts/v1beta1/push.sh index 6f0627b4081..d8c7116552f 100755 --- a/scripts/v1beta1/push.sh +++ b/scripts/v1beta1/push.sh @@ -50,6 +50,9 @@ docker push "${REGISTRY}/cert-generator:${TAG}" echo -e "\nPushing file metrics collector image...\n" docker push "${REGISTRY}/file-metrics-collector:${TAG}" +echo -e "\nPushing kfpv1 metrics collector image...\n" +docker push "${REGISTRY}/kfpv1-metrics-collector:${TAG}" + echo -e "\nPushing TF Event metrics collector image...\n" docker push "${REGISTRY}/tfevent-metrics-collector:${TAG}" diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh index 5a20faa6934..72ae394000f 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh @@ -15,7 +15,12 @@ # limitations under the License. # This shell script is used to run Katib Experiment. -# Input parameter - path to Experiment yaml. +# Input parameters +# - comma separated list of experiment names (exp1,exp2). +# For each experiment name, the script will search the folder +# `examples/v1beta1` for a file "{exp_name}.yaml" that will be +# executed as a katib experiment. Default: "" +# - namespace to execute experiment in. 
Default: default set -o errexit set -o nounset @@ -24,6 +29,7 @@ set -o pipefail cd "$(dirname "$0")" EXPERIMENT_FILES=${1:-""} IFS="," read -r -a EXPERIMENT_FILE_ARRAY <<< "$EXPERIMENT_FILES" +NAMESPACE=${2:-"default"} echo "Katib deployments" kubectl -n kubeflow get deploy @@ -44,7 +50,7 @@ fi for exp_name in "${EXPERIMENT_FILE_ARRAY[@]}"; do echo "Running Experiment from $exp_name file" exp_path=$(find ../../../../../examples/v1beta1 -name "${exp_name}.yaml") - python run-e2e-experiment.py --experiment-path "${exp_path}" --namespace default \ + python run-e2e-experiment.py --experiment-path "${exp_path}" --namespace "${NAMESPACE}" \ --verbose || (kubectl get pods -n kubeflow && exit 1) done diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh index e2547e2efad..1fa62dc8f2d 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh @@ -23,10 +23,15 @@ cd "$(dirname "$0")" DEPLOY_KATIB_UI=${1:-false} DEPLOY_TRAINING_OPERATOR=${2:-false} WITH_DATABASE_TYPE=${3:-mysql} +# false or a specific KFP version (eg 1.8.1) +DEPLOY_KFP=${4:-false} E2E_TEST_IMAGE_TAG="e2e-test" TRAINING_OPERATOR_VERSION="v1.6.0-rc.0" +KFP_ENV=platform-agnostic-emissary +KFP_BASE_URL="github.com/kubeflow/pipelines/manifests/kustomize" + echo "Start to install Katib" # Update Katib images with `e2e-test`. @@ -61,6 +66,7 @@ if "$DEPLOY_TRAINING_OPERATOR"; then kustomize build "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=$TRAINING_OPERATOR_VERSION" | kubectl apply -f - fi + echo "Deploying Katib" cd ../../../../../ && WITH_DATABASE_TYPE=$WITH_DATABASE_TYPE make deploy && cd - @@ -80,6 +86,20 @@ kubectl -n kubeflow get svc echo "Katib pods" kubectl -n kubeflow get pod +# If the user wants to deploy kubeflow pipelines, then use the kustomization file for kubeflow pipelines. 
# Kustomize manifests for Kubeflow Pipelines are found at:
# https://github.com/kubeflow/pipelines/tree/master/manifests/kustomize
#
# DEPLOY_KFP is either the string "false" or a specific KFP version (eg 1.8.1).
# `[ $DEPLOY_KFP ]` only checks that the string is non-empty, which also holds
# for "false", so KFP would always be deployed (with "false" as git ref).
# Compare against "false" explicitly instead.
if [ "$DEPLOY_KFP" != "false" ]; then
  KFP_VERSION="$DEPLOY_KFP"
  echo "Deploying Kubeflow Pipelines version $KFP_VERSION"
  # The cluster-scoped resources (CRDs, ...) are applied first and the
  # Application CRD must be established before the environment is applied.
  kubectl apply -k "${KFP_BASE_URL}/cluster-scoped-resources/?ref=${KFP_VERSION}"
  kubectl wait crd/applications.app.k8s.io --for condition=established --timeout=60s
  kubectl apply -k "${KFP_BASE_URL}/env/${KFP_ENV}/?ref=${KFP_VERSION}"
  kubectl wait pods -l application-crd-id=kubeflow-pipelines -n kubeflow --for condition=Ready --timeout=1800s
  #kubectl port-forward -n kubeflow svc/ml-pipeline-ui 8080:80
  # Add a rule for Argo workflows (argoproj.io) to the katib-controller ClusterRole.
  kubectl patch ClusterRole katib-controller -n kubeflow --type=json -p='[{"op": "add", "path": "/rules/-", "value": {"apiGroups":["argoproj.io"],"resources":["workflows"],"verbs":["get", "list", "watch", "create", "delete"]}}]'
  # Label the kubeflow namespace for Katib metrics collector injection.
  kubectl label namespace kubeflow katib.kubeflow.org/metrics-collector-injection=enabled
fi

# Check that Katib is working with 2 Experiments.
kubectl apply -f ../../testdata/valid-experiment.yaml
kubectl delete -f ../../testdata/valid-experiment.yaml