Feature/Example for training KFP v1 #2118

Closed · wants to merge 26 commits (changes shown from 18 commits)
03fa850
Adds a first draft of a kfpv1-metricscollector
Feb 9, 2023
8918473
Use PodName as input
votti Feb 10, 2023
fd53d85
Adds example for tuning a kfp v1 pipeline with Katib
votti Feb 15, 2023
e9a0051
Adds python < 3.11 compatiblity
votti Feb 15, 2023
17123d6
Add histogram equalization before rescaling
votti Feb 15, 2023
4f19db8
Update copyright date
votti Mar 16, 2023
9f83b0f
Update python version
votti Mar 16, 2023
61e77ea
Publish the docker image in kubeflowkatib
votti Mar 16, 2023
88c20c3
Fix suggested typo fixes
votti Jun 21, 2023
904d07d
Move KFP V1 metrics collector docker files to v1 subfolder
votti Jun 21, 2023
31655dd
Support loading of folder of metrics collector files
votti Jun 21, 2023
c458541
Move kfpv1 metricscollector in v1 subfolder
votti Jun 21, 2023
cee9970
Remove duplicated notebook section
votti Jun 21, 2023
f7e697b
Add dependencies for KFPv1 e2e testing
Jul 18, 2023
36ed372
TMP: changes to run tests locally
Jul 18, 2023
15c4a4b
Add missing ClusterRole update
Jul 18, 2023
741059f
Remove accidentally included `self`
Jul 18, 2023
7d33b7b
Rename paramater to more meaningful name
Jul 18, 2023
35df815
Extend example notebook with simple example for e2e tests
Jul 20, 2023
0504085
Revert "TMP: changes to run tests locally"
Jul 20, 2023
4cddd3e
Adds spec of a simple kfp1+katib experiment spec
Jul 20, 2023
6a0bdd3
Update psutil version to fix Docker build error
Jul 21, 2023
182b787
Move kubeflow installation after katib
Sep 12, 2023
9fc7c02
Parametrize kubeflow version
Sep 12, 2023
579546c
Add `namespace` parameter
Sep 12, 2023
582a6a7
Add kfpv1 e2e test
Sep 12, 2023
2 changes: 2 additions & 0 deletions .github/workflows/publish-core-images.yaml
@@ -32,3 +32,5 @@ jobs:
dockerfile: cmd/metricscollector/v1beta1/file-metricscollector/Dockerfile
- component-name: tfevent-metrics-collector
dockerfile: cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile
- component-name: kfpv1-metrics-collector
dockerfile: cmd/metricscollector/v1beta1/kfp-metricscollector/v1/Dockerfile
24 changes: 24 additions & 0 deletions cmd/metricscollector/v1beta1/kfp-metricscollector/v1/Dockerfile
@@ -0,0 +1,24 @@
FROM python:3.10-slim
Member:

Please let's use the same structure as for other metrics collectors:
cmd/metricscollector/v1beta1/kfp-metricscollector/Dockerfile

Member:

@andreyvelich My concern is where we will put the Dockerfile for KFP v2. So I would suggest we put the Dockerfile for KFP v1 here.
WDYT?

Member:

Oh, I see. Do we really need to support KFP v1 if, eventually, every Kubeflow user should migrate to KFP v2?

Member:

Because KFP v1 and KFP v2 aren't compatible, I think migrating from v1 to v2 is hard in production.
So I guess users will need a lot of time to update the version.

Hence, supporting KFP v1 in Katib would be useful. WDYT?

Member:

I see. In any case, I still have a question (#2118 (review)): why do we need a separate Metrics Collector for KFP if we just need to read the metrics from the metrics file?

andreyvelich (Member) · Jul 26, 2023:

> Is there any reason for restricting metrics file configuration to one line?

@zijianjoy The Katib metrics collector parses the metrics file line by line and expects the metric name and value to be located on a single line.

@votti From the log line I can see that metrics are written to the /tmp/argo/outputs/artifacts/mlpipeline-metrics.tgz file, isn't it?

> Btw: If you are wondering why I don't just use the StdOut collector and additionally print the metrics to the log: this is because this also broke the Argo command:

@votti Yeah, this could be an issue since we override the start command to make sure we redirect StdOut to the /var/log/katib/metrics.log file, so the Katib Metrics Collector can parse this file. Otherwise, the Metrics Collector can't parse the StdOut. The main difference between the StdOut and File metrics collectors is that StdOut tails the /var/log/katib/metrics.log file and prints the logs.
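For context, a minimal sketch of the line-oriented parsing described above. The `name=value` line format and the regex here are illustrative assumptions, not Katib's exact default filter (which is configurable via `--metric_filters`):

```python
import re

# Assumed line format: each line carries one or more "metric_name=value" pairs.
# This regex is only a sketch of the line-by-line expectation described above.
LINE_RE = re.compile(r"([\w-]+)\s*=\s*([+-]?\d+(?:\.\d+)?)")

def parse_log_lines(lines, metric_names):
    """Collect (name, value) pairs for the requested metrics, one line at a time."""
    found = []
    for line in lines:
        for name, value in LINE_RE.findall(line):
            if name in metric_names:
                found.append((name, float(value)))
    return found

log = [
    "epoch 1",
    "accuracy=0.91",
    "loss=0.25 accuracy=0.93",
]
print(parse_log_lines(log, {"accuracy"}))  # [('accuracy', 0.91), ('accuracy', 0.93)]
```

This is why a multi-line JSON artifact (as KFP writes) cannot be consumed by the line-oriented collectors directly.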

Author:

Re: metrics file:
One of the complexities Kubeflow Pipelines manages is handling output artifacts (usually compressing them and saving them to S3 storage). This is what seems to break when using the file collector: something goes wrong while compressing and copying the file to /tmp/argo/outputs/artifacts/mlpipeline-metrics.tgz.

After finding some time to look into it, I think the reason is very similar to the StdOut collector case:
the collector modifies the Argo CMD/ARGS in a way that I think causes these issues:

From the pod definition, unmodified (e.g. when using the custom KFP metrics collector):

...
      _outputs = train_e2e(**_parsed_args)
      
    Args:

      --input-nr
      /tmp/inputs/input_nr/data
      --lr
      0.0005293023468535503
      --optimizer
      Adam
      --loss
      categorical_crossentropy
      --epochs
      3
      --batch-size
      36
      --mlpipeline-metrics
      /tmp/outputs/mlpipeline_metrics/data

When using the file collector as the metrics collector:

...
      _outputs = train_e2e(**_parsed_args)
       --input-nr /tmp/inputs/input_nr/data --lr 0.00021802007326291811 --optimizer Adam --loss categorical_crossentropy --epochs 3 --batch-size 53 --mlpipeline-metrics /tmp/outputs/mlpipeline_metrics/data && echo completed > /tmp/outputs/mlpipeline_metrics/$$$$.pid

I think this could be solved by following this proposal: #2181
Until this is fixed, I think having a custom metrics collector that does not modify the command is a necessary workaround.

andreyvelich (Member) · Aug 29, 2023:

@votti I think this could also be solved with this feature, couldn't it: #577?
Basically, we can use the Katib SDK to implement an API for pushing metrics to the Katib DB instead of using pull-based metrics collectors, which require changing the entrypoint.

Users would then report metrics in their objective training function.

For example:

import kubeflow.katib as katib

client = katib.KatibClient()
client.report(metrics={"accuracy": 0.9, "loss": 0.01})

We might need to make additional changes to the Katib controller to verify that metrics were reported by the user.

Author:

Re: push-based metrics collection: that sounds like a good potential solution!
So can the KatibClient automatically infer which trial these metrics are associated with?

andreyvelich (Member) · Oct 24, 2023:

@votti Currently, the user can get the Trial name using the ${trialSpec.Name} template in their Trial pod's environment variables. Then, the user can call the KatibClient API with the appropriate Trial name to insert metrics into the Katib DB.
I think we should always add a TRIAL_NAME env var to the Trial pod since it is useful for many use cases (e.g. exporting the trained model to S3, saving Trial metrics to the DB, etc.).
WDYT @tenzen-y @johnugeorge @votti ?
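To make the proposal concrete, a sketch of what a trial-aware push could look like. Both the `TRIAL_NAME` env injection and the `report` API are proposals from this thread and #577, not existing Katib features; the names below are hypothetical:

```python
import os

def get_trial_name(default="unknown-trial"):
    # Proposed: the controller would inject TRIAL_NAME (e.g. via the
    # ${trialSpec.Name} template) into every trial pod.
    return os.environ.get("TRIAL_NAME", default)

# Simulate the injected env var for illustration.
os.environ["TRIAL_NAME"] = "kfp-tune-abc123"
trial = get_trial_name()
print(trial)  # kfp-tune-abc123

# Hypothetical push-based reporting (API sketched in #577, not implemented yet):
# import kubeflow.katib as katib
# katib.KatibClient().report(trial_name=trial, metrics={"accuracy": 0.9})
```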


ARG TARGETARCH
ENV TARGET_DIR /opt/katib
ENV METRICS_COLLECTOR_DIR cmd/metricscollector/v1beta1/kfp-metricscollector/v1
ENV PYTHONPATH ${TARGET_DIR}:${TARGET_DIR}/pkg/apis/manager/v1beta1/python:${TARGET_DIR}/pkg/metricscollector/v1beta1/kfp-metricscollector/v1:${TARGET_DIR}/pkg/metricscollector/v1beta1/common/

ADD ./pkg/ ${TARGET_DIR}/pkg/
ADD ./${METRICS_COLLECTOR_DIR}/ ${TARGET_DIR}/${METRICS_COLLECTOR_DIR}/

WORKDIR ${TARGET_DIR}/${METRICS_COLLECTOR_DIR}

RUN if [ "${TARGETARCH}" = "arm64" ]; then \
apt-get -y update && \
apt-get -y install gfortran libpcre3 libpcre3-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*; \
fi

RUN pip install --no-cache-dir -r requirements.txt
RUN chgrp -R 0 ${TARGET_DIR} \
&& chmod -R g+rwX ${TARGET_DIR}

ENTRYPOINT ["python", "main.py"]
101 changes: 101 additions & 0 deletions cmd/metricscollector/v1beta1/kfp-metricscollector/v1/main.py
@@ -0,0 +1,101 @@
# Copyright 2023 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
from logging import INFO, StreamHandler, getLogger

import api_pb2
import const
import grpc
from metrics_loader import MetricsCollector
from pns import WaitMainProcesses

timeout_in_seconds = 60


def parse_options():
parser = argparse.ArgumentParser(
description="KFP V1 MetricsCollector", add_help=True
)

# TODO (andreyvelich): Add early stopping flags.
parser.add_argument("-s-db", "--db_manager_server_addr", type=str, default="")
parser.add_argument("-t", "--pod_name", type=str, default="")
parser.add_argument(
"-path",
"--metrics_file_dir",
type=str,
default=const.DEFAULT_METRICS_FILE_KFPV1_DIR,
)
parser.add_argument("-m", "--metric_names", type=str, default="")
parser.add_argument("-o-type", "--objective_type", type=str, default="")
parser.add_argument("-f", "--metric_filters", type=str, default="")
parser.add_argument(
"-p", "--poll_interval", type=int, default=const.DEFAULT_POLL_INTERVAL
)
parser.add_argument(
"-timeout", "--timeout", type=int, default=const.DEFAULT_TIMEOUT
)
parser.add_argument(
"-w", "--wait_all_processes", type=str, default=const.DEFAULT_WAIT_ALL_PROCESSES
)
opt = parser.parse_args()
return opt


if __name__ == "__main__":
logger = getLogger(__name__)
handler = StreamHandler()
handler.setLevel(INFO)
logger.setLevel(INFO)
logger.addHandler(handler)
logger.propagate = False
opt = parse_options()
wait_all_processes = opt.wait_all_processes.lower() == "true"
db_manager_server = opt.db_manager_server_addr.split(":")
trial_name = "-".join(opt.pod_name.split("-")[:-1])
if len(db_manager_server) != 2:
raise Exception(
"Invalid Katib DB manager service address: %s" % opt.db_manager_server_addr
)

WaitMainProcesses(
pool_interval=opt.poll_interval,
timout=opt.timeout,
wait_all=wait_all_processes,
completed_marked_dir=None,
Member:

Why do we set completed_marked_dir to None? Can we set opt.metrics_file_dir instead?

Author:

The documentation on this is a bit sparse, but if I understand the code correctly, this would require the Kubeflow pipeline to write a file <pid>.pid containing the TRAINING_COMPLETED marker into this directory, which it does not do:

if completed_marked_dir:
    mark_file = os.path.join(completed_marked_dir, "{}.pid".format(pid))
    # Check if file contains "completed" marker
    with open(mark_file) as file_obj:
        contents = file_obj.read()
        if contents.strip() != const.TRAINING_COMPLETED:
            raise Exception(
                "Unable to find marker: {} in file: {} with contents: {} for pid: {}".format(
                    const.TRAINING_COMPLETED, mark_file, contents, pid))
# Add main pid to finished pids set

So I think None is correct here.
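To illustrate the marker protocol the quoted check implements, here is a runnable sketch of what a KFP step would have to write for `completed_marked_dir` to work (the `TRAINING_COMPLETED` constant is inlined as `"completed"`, matching `const.py` in this PR; the check is a standalone re-implementation, not the collector's actual module):

```python
import os
import tempfile

TRAINING_COMPLETED = "completed"  # matches pkg/metricscollector/v1beta1/common/const.py

def check_marker(completed_marked_dir, pid):
    """Standalone version of the quoted check: the main process must have
    written '<pid>.pid' containing the completed marker."""
    mark_file = os.path.join(completed_marked_dir, "{}.pid".format(pid))
    with open(mark_file) as f:
        contents = f.read()
    if contents.strip() != TRAINING_COMPLETED:
        raise Exception("Unable to find marker in {}".format(mark_file))
    return True

with tempfile.TemporaryDirectory() as d:
    # This is the file a KFP v1 step never writes -- hence completed_marked_dir=None.
    with open(os.path.join(d, "1234.pid"), "w") as f:
        f.write("completed\n")
    print(check_marker(d, 1234))  # True
```

Since no KFP component writes such a marker file, passing None and relying on process exit alone is the only option here.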

Member:

Thanks for the explanation. Let me check.

)

mc = MetricsCollector(opt.metric_names.split(";"))
observation_log = mc.parse_file(opt.metrics_file_dir)

channel = grpc.beta.implementations.insecure_channel(
db_manager_server[0], int(db_manager_server[1])
)

with api_pb2.beta_create_DBManager_stub(channel) as client:
logger.info(
"In "
+ trial_name
+ " "
+ str(len(observation_log.metric_logs))
+ " metrics will be reported."
)
client.ReportObservationLog(
api_pb2.ReportObservationLogRequest(
trial_name=trial_name, observation_log=observation_log
),
timeout=timeout_in_seconds,
)
@@ -0,0 +1,5 @@
psutil==5.8.0
rfc3339>=6.2
grpcio==1.41.1
googleapis-common-protos==1.6.0
protobuf==3.20.0
14 changes: 11 additions & 3 deletions examples/v1beta1/kubeflow-pipelines/README.md
@@ -3,6 +3,10 @@
The following examples show how to use Katib with
[Kubeflow Pipelines](https://github.com/kubeflow/pipelines).

Two different aspects are illustrated here:
A) How to orchestrate Katib experiments from Kubeflow Pipelines using the Katib Kubeflow component (Examples 1 & 2)
B) How to use Katib to tune parameters of Kubeflow pipelines (Example 3)

You can find the Katib Component source code for the Kubeflow Pipelines
[here](https://github.com/kubeflow/pipelines/tree/master/components/kubeflow/katib-launcher).

@@ -13,6 +17,8 @@ You have to install the following Python SDK to run these examples:
- [`kfp`](https://pypi.org/project/kfp/) >= 1.8.12
- [`kubeflow-katib`](https://pypi.org/project/kubeflow-katib/) >= 0.13.0

To run parameter tuning over Kubeflow pipelines, Katib additionally needs to be set up to run with Argo Workflow tasks. The setup is described in the example notebook (3).

## Multi-User Pipelines Setup

The Notebooks examples run Pipelines in multi-user mode and your Kubeflow Notebook
@@ -25,10 +31,12 @@ to give an access Kubeflow Notebook to run Kubeflow Pipelines.

The following Pipelines are deployed from Kubeflow Notebook:

- [Kubeflow E2E MNIST](kubeflow-e2e-mnist.ipynb)
1) [Kubeflow E2E MNIST](kubeflow-e2e-mnist.ipynb)

2) [Katib Experiment with Early Stopping](early-stopping.ipynb)

- [Katib Experiment with Early Stopping](early-stopping.ipynb)
3) [Tune parameters of a `MNIST` kubeflow pipeline with Katib](kubeflow-kfpv1-opt-mnist.ipynb)

The following Pipelines have to be compiled and uploaded to the Kubeflow Pipelines UI:
The following Pipelines have to be compiled and uploaded to the Kubeflow Pipelines UI for examples 1 & 2:

- [MPIJob Horovod](mpi-job-horovod.py)
1,070 changes: 1,070 additions & 0 deletions examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions manifests/v1beta1/components/mysql/pvc.yaml
@@ -1,5 +1,4 @@
---
apiVersion: v1

(GitHub Actions / Lint warning on line 1 in manifests/v1beta1/components/mysql/pvc.yaml: 1:1 [document-start] missing document start "---")
kind: PersistentVolumeClaim
metadata:
name: katib-mysql
@@ -9,4 +8,4 @@
- ReadWriteOnce
resources:
requests:
storage: 10Gi
storage: 2Gi
votti marked this conversation as resolved.
2 changes: 2 additions & 0 deletions pkg/metricscollector/v1beta1/common/const.py
@@ -20,6 +20,8 @@
DEFAULT_WAIT_ALL_PROCESSES = "True"
# Default value for directory where TF event metrics are reported
DEFAULT_METRICS_FILE_DIR = "/log"
# Default value for directory where Kubeflow pipeline metrics are reported
DEFAULT_METRICS_FILE_KFPV1_DIR = "/tmp/outputs/mlpipeline_metrics"
votti marked this conversation as resolved.
# Job finished marker in $$$$.pid file when main process is completed
TRAINING_COMPLETED = "completed"

Empty file.
110 changes: 110 additions & 0 deletions pkg/metricscollector/v1beta1/kfp-metricscollector/v1/metrics_loader.py
@@ -0,0 +1,110 @@
# Copyright 2023 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The Kubeflow pipeline metrics collector KFPMetricParser parses the metrics file
# and returns an ObservationLog of the metrics specified.
# Some documentation on the metrics collector file structure can be found here:
# https://v0-6.kubeflow.org/docs/pipelines/sdk/pipelines-metrics/

from datetime import datetime
from logging import getLogger, StreamHandler, INFO
import os
from typing import List
import json

import rfc3339
import api_pb2
from pkg.metricscollector.v1beta1.common import const

class KFPMetricParser:
def __init__(self, metric_names):
self.metric_names = metric_names

@staticmethod
def find_all_files(directory):
for root, dirs, files in os.walk(directory):
for f in files:
yield os.path.join(root, f)

def parse_metrics(self, metric_file_path: str) -> List[api_pb2.MetricLog]:
"""Parse a kubeflow pipeline metrics file

Args:
fn (function): path to metrics file

Returns:
List[api_pb2.MetricLog]: A list of logged metrics
"""
metrics = []
with open(metric_file_path) as f:
metrics_dict = json.load(f)
for m in metrics_dict["metrics"]:
name = m["name"]
value = m["numberValue"]
if name in self.metric_names:
ml = api_pb2.MetricLog(
time_stamp=rfc3339.rfc3339(datetime.now()),
metric=api_pb2.Metric(name=name, value=str(value)),
)
metrics.append(ml)
return metrics

class MetricsCollector:
def __init__(self, metric_names):
self.logger = getLogger(__name__)
handler = StreamHandler()
handler.setLevel(INFO)
self.logger.setLevel(INFO)
self.logger.addHandler(handler)
self.logger.propagate = False
self.metrics = metric_names
self.parser = KFPMetricParser(metric_names)

def parse_file(self, directory):
"""Parses the Kubeflow Pipeline metrics files"""
mls = []
for f in self.parser.find_all_files(directory):
if os.path.isdir(f):
continue
try:
self.logger.info(f + " will be parsed.")
mls.extend(self.parser.parse_metrics(f))
except Exception as e:
self.logger.warning("Unexpected error: " + str(e))
continue

# Metrics logs must contain at least one objective metric value
# Objective metric is located at first index
is_objective_metric_reported = False
for ml in mls:
if ml.metric.name == self.metrics[0]:
is_objective_metric_reported = True
break
# If objective metrics were not reported, insert unavailable value in the DB
if not is_objective_metric_reported:
mls = [
api_pb2.MetricLog(
time_stamp=rfc3339.rfc3339(datetime.now()),
metric=api_pb2.Metric(
name=self.metrics[0], value=const.UNAVAILABLE_METRIC_VALUE
),
)
]
self.logger.info(
"Objective metric {} is not found in metrics file, {} value is reported".format(
self.metrics[0], const.UNAVAILABLE_METRIC_VALUE
)
)

return api_pb2.ObservationLog(metric_logs=mls)
3 changes: 3 additions & 0 deletions scripts/v1beta1/build.sh
@@ -71,6 +71,9 @@ docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/cert-generator:${
echo -e "\nBuilding file metrics collector image...\n"
docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/file-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/file-metricscollector/Dockerfile .

echo -e "\nBuilding kfpv1 metrics collector image...\n"
docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/kfpv1-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/kfp-metricscollector/v1/Dockerfile .

echo -e "\nBuilding TF Event metrics collector image...\n"
if [ "${ARCH}" == "ppc64le" ]; then
docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/tfevent-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile.ppc64le .
3 changes: 3 additions & 0 deletions scripts/v1beta1/push.sh
@@ -50,6 +50,9 @@ docker push "${REGISTRY}/cert-generator:${TAG}"
echo -e "\nPushing file metrics collector image...\n"
docker push "${REGISTRY}/file-metrics-collector:${TAG}"

echo -e "\nPushing kfpv1 metrics collector image...\n"
docker push "${REGISTRY}/kfpv1-metrics-collector:${TAG}"

echo -e "\nPushing TF Event metrics collector image...\n"
docker push "${REGISTRY}/tfevent-metrics-collector:${TAG}"

27 changes: 23 additions & 4 deletions test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh
@@ -23,10 +23,17 @@ cd "$(dirname "$0")"
DEPLOY_KATIB_UI=${1:-false}
DEPLOY_TRAINING_OPERATOR=${2:-false}
WITH_DATABASE_TYPE=${3:-mysql}
DEPLOY_KFP=${4:-false}

E2E_TEST_IMAGE_TAG="e2e-test"
E2E_TEST_IMAGE_TAG="v0.15.0"
TRAINING_OPERATOR_VERSION="v1.6.0-rc.0"

KFP_ENV=platform-agnostic-emissary
KFP_BASE_URL="github.com/kubeflow/pipelines/manifests/kustomize"
# This is one of the latest KFP v1 versions that was compatible with a
# recent K8s version at the time of writing (e.g. 1.8.22 gave an error).
KFP_VERSION="1.8.1"

echo "Start to install Katib"

# Update Katib images with `e2e-test`.
@@ -44,12 +51,12 @@

# If the user wants to deploy Katib UI, then use the kustomization file for Katib UI.
if ! "$DEPLOY_KATIB_UI"; then
index="$(yq eval '.resources.[] | select(. == "../../components/ui/") | path | .[-1]' $KUSTOMIZATION_FILE)"
index="$index" yq eval -i 'del(.resources.[env(index)])' $KUSTOMIZATION_FILE
index="$(yq -y '.resources.[] | select(. == "../../components/ui/") | path | .[-1]' $KUSTOMIZATION_FILE)"
index="$index" yq -y -i 'del(.resources.[env(index)])' $KUSTOMIZATION_FILE
fi

# Since e2e test doesn't need to large storage, we use a small PVC for Katib.
yq eval -i '.spec.resources.requests.storage|="2Gi"' $PVC_FILE
yq -y -i '.spec.resources.requests.storage|="2Gi"' $PVC_FILE

echo -e "\n The Katib will be deployed with the following configs"
cat $KUSTOMIZATION_FILE
@@ -61,6 +68,18 @@ if "$DEPLOY_TRAINING_OPERATOR"; then
kustomize build "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=$TRAINING_OPERATOR_VERSION" | kubectl apply -f -
fi

# If the user wants to deploy kubeflow pipelines, then use the kustomization file for kubeflow pipelines.
# found at: https://github.com/kubeflow/pipelines/tree/master/manifests/kustomize
if "$DEPLOY_KFP"; then
echo "Deploying Kubeflow Pipelines version $KFP_VERSION"
kubectl apply -k "${KFP_BASE_URL}/cluster-scoped-resources/?ref=${KFP_VERSION}"
kubectl wait crd/applications.app.k8s.io --for condition=established --timeout=60s
kubectl apply -k "${KFP_BASE_URL}/env/${KFP_ENV}/?ref=${KFP_VERSION}"
kubectl wait pods -l application-crd-id=kubeflow-pipelines -n kubeflow --for condition=Ready --timeout=1800s
#kubectl port-forward -n kubeflow svc/ml-pipeline-ui 8080:80
kubectl patch ClusterRole katib-controller -n kubeflow --type=json -p='[{"op": "add", "path": "/rules/-", "value": {"apiGroups":["argoproj.io"],"resources":["workflows"],"verbs":["get", "list", "watch", "create", "delete"]}}]'
fi

echo "Deploying Katib"
cd ../../../../../ && WITH_DATABASE_TYPE=$WITH_DATABASE_TYPE make deploy && cd -
