Bumped dependencies + minor improvements

copandrej · Sep 12, 2024 · 4f15eb3 · 4f15eb3
1 parent 3818598
commit 4f15eb3
Show file tree

Hide file tree

Showing 25 changed files with 48 additions and 44 deletions.
diff --git a/README.md b/README.md
@@ -31,10 +31,12 @@ Skip this step if you already have a kubernetes cluster with required addons.
 - Adjust configs in `values_example.yaml`, then deploy with helm:
 
 ```bash
-helm repo add semr_charts https://copandrej.github.io/NAOMI/
-helm install semr semr_charts/SEMR --values values_example.yaml
+helm repo add naomi_charts https://copandrej.github.io/NAOMI/
+helm install naom naomi_charts/NAOMI --version 0.1.0 --values values_example.yaml
 ```
 
+> The Helm release name (`naom` in the command above) must be at most 4 characters long, because of Kubernetes service name length limits.
+
 #### 3. Environment
 This step is only required for running example AI/ML workflows.
 - Run config script `./helper_scripts/env-prepare.sh` on VM to install requirements and connect flytectl to the cluster for running AI/ML workflows.
@@ -85,8 +87,8 @@ Quality of Experience (QoE) prediction is a workflow example adjusted from O-RAN
 1. Populate MinIO with file `insert.py` in `workflow_examples/qoe_prediction/populate_minio/` (Change IP endpoint of MinIO in the script).
 2. Run the workflow with Flyte CLI; --bt_s is batch size, --n is dataset size (1, 10, 100):
     ```bash
-    pyflyte run --remote --image  copandrej/flyte_workflow:1 wf.py qoe_train --bt_s 10 --n 1
-    ```
+   pyflyte run --remote --env SYSTEM_IP=$(hostname -I | awk '{print $1}') --image copandrej/flyte_workflow:2 wf.py qoe_train --bt_s 10 --n 1
+   ```
 3. Monitor the progress on dashboards.
 
 #### MNIST
@@ -96,7 +98,8 @@ A workflow example for distributed data processing, distributed model training,
 1. Populate MinIO with file `populate.py` in `workflow_examples/mnist/populate_minio/` (Change IP endpoint of MinIO in the script).
 2. Run the workflow with Flyte CLI from `workflow_examples/mnist/` directory:
     ```bash
-    pyflyte run --remote --env SYSTEM_IP=<CHANGE-ME> --image copandrej/flyte_workflow:1 wf.py mnist_train
+    pyflyte run --remote --env SYSTEM_IP=$(hostname -I | awk '{print $1}') --image copandrej/flyte_workflow:2 wf.py mnist_train
+
     ```
 3. Monitor the progress on dashboards.
 

diff --git a/docker_build/model_deployment/requirements.txt b/docker_build/model_deployment/requirements.txt
@@ -1,8 +1,8 @@
 # Requirements by model deployment template
-fastapi
+fastapi[all] == 0.114.1
 uvicorn
 python-multipart
-mlflow==2.10.2
+mlflow == 2.16.0
 
 # Add your own dependencies for model inference and data preprocessing
 # numpy

diff --git a/docker_build/ray_image/Dockerfile b/docker_build/ray_image/Dockerfile
@@ -1,6 +1,6 @@
-# Start from the rayproject/2.10.0-py310-aarch64 image for raspberry pi
+# Start from the rayproject/2.35.0-py310-aarch64 image for raspberry pi
 
-FROM rayproject/ray:2.10.0-py310
+FROM rayproject/ray:2.35.0-py310
 
 # Install dependencies from requirements.txt
 COPY requirements.txt .

diff --git a/docker_build/ray_image/requirements.txt b/docker_build/ray_image/requirements.txt
@@ -1,12 +1,10 @@
-ray[train,tune,serve,data,rllib] == 2.10.0
-fastapi == 0.104.0
-python-multipart == 0.0.7
+ray[train,tune,serve,data,rllib] == 2.35.0
+fastapi[all] == 0.114.1
 keras == 2.15.0
-mlflow == 2.10.2
+mlflow == 2.16.0
 flytekit>=1.5.0
 pandas <= 2.1.4
 evaluate
-torch
 torchvision
 pillow
 requests

diff --git a/helm_charts/SEMR/Chart.lock → helm_charts/NAOMI/Chart.lock b/helm_charts/SEMR/Chart.lock → helm_charts/NAOMI/Chart.lock
@@ -1,18 +1,18 @@
 dependencies:
 - name: mlflow
   repository: oci://registry-1.docker.io/bitnamicharts
-  version: 0.11.0
+  version: 1.5.0
 - name: minio
   repository: oci://registry-1.docker.io/bitnamicharts
-  version: 14.6.0
+  version: 14.7.7
 - name: flyte-binary
   repository: https://flyteorg.github.io/flyte
-  version: v1.10.6
+  version: v1.13.1
 - name: kuberay-operator
   repository: https://ray-project.github.io/kuberay-helm/
   version: 1.0.0-rc.0
 - name: kube-prometheus-stack
   repository: https://prometheus-community.github.io/helm-charts
-  version: 48.2.1
-digest: sha256:4bfd9d8f8583cc395c21458945856df407300dbd2c18fa8965da4c702d5345ef
-generated: "2024-06-28T12:56:38.754566806+02:00"
+  version: 62.6.0
+digest: sha256:de9ed2a3f1c1efc36a4197988676a4223b7fa4830e8787c2d8ae8865026d7b56
+generated: "2024-09-11T14:50:01.076446796+02:00"
diff --git a/helm_charts/SEMR/Chart.yaml → helm_charts/NAOMI/Chart.yaml b/helm_charts/SEMR/Chart.yaml → helm_charts/NAOMI/Chart.yaml
@@ -1,25 +1,25 @@
 apiVersion: v2
-name: SEMR
-description: A Helm chart for Kubernetes
+name: NAOMI
+description: NAOMI Helm chart for Kubernetes
 type: application
-version: 0.1.1
-appVersion: "1.0.0"
+version: 0.1.0
+appVersion: "0.1.0"
 
 dependencies:
 - condition: mlflow.enabled
   name: mlflow
   repository: oci://registry-1.docker.io/bitnamicharts
-  version: 0.11.0
+  version: 1.5.0
 
 - condition: minio.enabled
   name: minio
   repository: oci://registry-1.docker.io/bitnamicharts
-  version: 14.6.0
+  version: 14.7.7
 
 - condition: flyte-binary.enabled
   name: flyte-binary
   repository: https://flyteorg.github.io/flyte
-  version: 1.10.6
+  version: 1.13.1
 
 - name: kuberay-operator
   repository: https://ray-project.github.io/kuberay-helm/
@@ -28,4 +28,4 @@ dependencies:
 - condition: kube-prometheus-stack.enabled
   name: kube-prometheus-stack
   repository: https://prometheus-community.github.io/helm-charts
-  version: 48.2.1
+  version: 62.6.0
diff --git a/...na-dashboards/data_grafana_dashboard.json → ...na-dashboards/data_grafana_dashboard.json b/...na-dashboards/data_grafana_dashboard.json → ...na-dashboards/data_grafana_dashboard.json
diff --git a/...dashboards/default_grafana_dashboard.json → ...dashboards/default_grafana_dashboard.json b/...dashboards/default_grafana_dashboard.json → ...dashboards/default_grafana_dashboard.json
diff --git a/...s/serve_deployment_grafana_dashboard.json → ...s/serve_deployment_grafana_dashboard.json b/...s/serve_deployment_grafana_dashboard.json → ...s/serve_deployment_grafana_dashboard.json
diff --git a/...a-dashboards/serve_grafana_dashboard.json → ...a-dashboards/serve_grafana_dashboard.json b/...a-dashboards/serve_grafana_dashboard.json → ...a-dashboards/serve_grafana_dashboard.json
diff --git a/...emplates/grafana-dashboard-configmap.yaml → ...emplates/grafana-dashboard-configmap.yaml b/...emplates/grafana-dashboard-configmap.yaml → ...emplates/grafana-dashboard-configmap.yaml
diff --git a/helm_charts/SEMR/templates/ingress-kube.yaml → ..._charts/NAOMI/templates/ingress-kube.yaml b/helm_charts/SEMR/templates/ingress-kube.yaml → ..._charts/NAOMI/templates/ingress-kube.yaml
diff --git a/...SEMR/templates/local-flyte-resources.yaml → ...AOMI/templates/local-flyte-resources.yaml b/...SEMR/templates/local-flyte-resources.yaml → ...AOMI/templates/local-flyte-resources.yaml
diff --git a/helm_charts/SEMR/templates/monitoring.yaml → helm_charts/NAOMI/templates/monitoring.yaml b/helm_charts/SEMR/templates/monitoring.yaml → helm_charts/NAOMI/templates/monitoring.yaml
diff --git a/helm_charts/SEMR/templates/nodeports.yaml → helm_charts/NAOMI/templates/nodeports.yaml b/helm_charts/SEMR/templates/nodeports.yaml → helm_charts/NAOMI/templates/nodeports.yaml
diff --git a/helm_charts/SEMR/templates/prom_np.yaml → helm_charts/NAOMI/templates/prom_np.yaml b/helm_charts/SEMR/templates/prom_np.yaml → helm_charts/NAOMI/templates/prom_np.yaml
diff --git a/helm_charts/SEMR/templates/ray-serve.yaml → helm_charts/NAOMI/templates/ray-serve.yaml b/helm_charts/SEMR/templates/ray-serve.yaml → helm_charts/NAOMI/templates/ray-serve.yaml
@@ -17,7 +17,7 @@ spec:
             ray_actor_options:
               num_cpus: 0
   rayClusterConfig:
-    rayVersion: '2.10.0' # Should match the Ray version in the image of the containers
+    rayVersion: "{{ .Values.rayVersion }}" # Should match the Ray version in the image of the containers
     #############auto scaler copied from https://github.com/ray-project/kuberay/blob/5b1a5a11f5df76db2d66ed332ff0802dc3bbff76/ray-operator/config/samples/ray-cluster.autoscaler.yaml ############################
     enableInTreeAutoscaling: false
 

diff --git a/helm_charts/SEMR/values.yaml → helm_charts/NAOMI/values.yaml b/helm_charts/SEMR/values.yaml → helm_charts/NAOMI/values.yaml
@@ -6,9 +6,10 @@ general:
 # Ray
 #
 ################################################
+rayVersion: "2.35.0"
 headGroupSpecs:
   rayHead:
-    image: copandrej/ijs-custom:ray-amd64-ray210
+    image: copandrej/ijs-custom:ray-amd64-ray235
     resources:
       limits:
         cpu: "2"
@@ -25,7 +26,7 @@ workerGroups:
     rayStartParams:
       numCpus: "1"
     container:
-      image: copandrej/ijs-custom:ray-amd64-ray210
+      image: copandrej/ijs-custom:ray-amd64-ray235
       resources:
         limits:
           cpu: "1"
@@ -41,7 +42,7 @@ workerGroups:
     rayStartParams:
       numCpus: "1"
     container:
-      image: copandrej/ijs-custom:ray-aarch64-ray210
+      image: copandrej/ijs-custom:ray-aarch64-ray235
       resources:
         limits:
           cpu: "1"

diff --git a/requirements.txt b/requirements.txt
@@ -1,8 +1,7 @@
-ray[train,tune,serve,data] ==  2.10.0
-fastapi == 0.104.0
-python-multipart == 0.0.7
+ray[train,tune,serve,data,rllib] == 2.35.0
+fastapi[all] == 0.114.1
 keras == 2.15.0
-mlflow == 2.10.2
+mlflow == 2.16.0
 flytekit>=1.5.0
 evaluate
 pillow

diff --git a/values_example.yaml b/values_example.yaml
@@ -5,6 +5,7 @@ general:
 
 # Ray
 # Ray can't be disabled currently
+rayVersion: "2.35.0"
 headGroupSpecs:
   rayHead:
     resources:
@@ -65,3 +66,7 @@ flyte-binary:
 # prometheus & grafana
 kube-prometheus-stack:
   enabled: true
+  grafana:
+    grafana.ini:
+      auth.anonymous:
+        enabled: true
diff --git a/workflow_examples/mnist/README.md b/workflow_examples/mnist/README.md
@@ -5,7 +5,7 @@ A workflow example for distributed data processing, distributed model training,
 1. Populate MinIO with file `populate.py` in `workflow_examples/mnist/populate_minio/` (Change IP endpoint of MinIO in the script).
 2. Run the workflow with Flyte CLI from `workflow_examples/mnist/` directory:
     ```bash
-    pyflyte run --remote --env SYSTEM_IP=<CHANGE-ME> --image copandrej/flyte_workflow:1 wf.py mnist_train
+    pyflyte run --remote --env SYSTEM_IP=$(hostname -I | awk '{print $1}') --image copandrej/flyte_workflow:2 wf.py mnist_train
     ```
 3. Monitor the progress on dashboards.
 

diff --git a/workflow_examples/mnist/deploy_model.py b/workflow_examples/mnist/deploy_model.py
@@ -18,7 +18,7 @@
 def deploy(model: keras.Sequential, num_replicas: int) -> None:
     app = FastAPI(debug=True)
 
-    @serve.deployment(name="mnist", num_replicas=num_replicas, ray_actor_options={"num_cpus": 0, "num_gpus": 0}, max_concurrent_queries=100000) # , "resources": {"rasp":0.25}
+    @serve.deployment(name="mnist", num_replicas=num_replicas, ray_actor_options={"num_cpus": 0, "num_gpus": 0}) # , "resources": {"rasp":0.25}
     @serve.ingress(app)
     class Hello:
         def __init__(self):

diff --git a/workflow_examples/qoe-prediction/README.md b/workflow_examples/qoe-prediction/README.md
@@ -5,7 +5,7 @@ Quality of Experience (QoE) prediction is a workflow example adjusted from O-RAN
 1. Populate MinIO with file `insert.py` in `workflow_examples/qoe_prediction/populate_minio/` (Change IP endpoint of MinIO in the script).
 2. Run the workflow with Flyte CLI; --bt_s is batch size, --n is dataset size (1, 10, 100):
     ```bash
-    pyflyte run --remote --env SYSTEM_IP=<CHANGE-ME> --image copandrej/flyte_workflow:1 wf.py qoe_train --bt_s 10 --n 1
+   pyflyte run --remote --env SYSTEM_IP=$(hostname -I | awk '{print $1}') --image copandrej/flyte_workflow:2 wf.py qoe_train --bt_s 10 --n 1
     ```
 3. Monitor the progress on dashboards.
 

diff --git a/workflow_examples/qoe-prediction/deploy_model.py b/workflow_examples/qoe-prediction/deploy_model.py
@@ -17,7 +17,6 @@ def deploy(model: keras.Sequential, num_replicas: int) -> None:
 
     @serve.deployment(name="qoe_prediction", num_replicas="auto",
                       ray_actor_options={"num_cpus": 0, "num_gpus": 0, "memory": 0},
-                      max_concurrent_queries=1000,
                       autoscaling_config={"min_replicas": 1, "max_replicas": 1})  # , "resources": {"rasp":0.25}
     @serve.ingress(app)
     class Qoe:

diff --git a/workflow_examples/requirements.txt b/workflow_examples/requirements.txt
@@ -1,8 +1,7 @@
-ray[train,tune,serve,data] == 2.10.0
-fastapi == 0.104.0
-python-multipart == 0.0.7
+ray[train,tune,serve,data] == 2.35.0
+fastapi[all] == 0.114.1
 keras == 2.15.0
-mlflow == 2.10.2
+mlflow == 2.16.0
 flytekit>=1.5.0
 pandas <= 2.1.4
 tensorflow