From 4fe884f3c4cfa13788dd6c7d718cedba7dabb050 Mon Sep 17 00:00:00 2001
From: iusztinpaul
Date: Fri, 8 Nov 2024 12:56:40 +0200
Subject: [PATCH] feat: run all ML pipelines using GA
---
.github/workflows/feature_pipeline.yaml | 11 ++-
notebooks/1_feature_engineering.ipynb | 22 +++---
notebooks/2_train_retrieval_model.ipynb | 6 +-
notebooks/3_embeddings_creation.ipynb | 6 +-
notebooks/4_train_ranking_model.ipynb | 18 +++++
notebooks/5_create_deployments.ipynb | 18 +++++
notebooks/7_job_scheduler.ipynb | 19 ++++++
pyproject.toml | 1 +
recsys/utils.py | 16 +++++
{notebooks => tools}/6_inference_and_ui.py | 0
tools/clean_hopsworks_resources.py | 78 ++++++++++++++++++++++
uv.lock | 24 +++++++
12 files changed, 203 insertions(+), 16 deletions(-)
create mode 100644 recsys/utils.py
rename {notebooks => tools}/6_inference_and_ui.py (100%)
create mode 100644 tools/clean_hopsworks_resources.py
diff --git a/.github/workflows/feature_pipeline.yaml b/.github/workflows/feature_pipeline.yaml
index a1f054c..bc897ce 100644
--- a/.github/workflows/feature_pipeline.yaml
+++ b/.github/workflows/feature_pipeline.yaml
@@ -24,6 +24,15 @@ jobs:
uses: astral-sh/setup-uv@v3
with:
version: 0.4.30
+
+ - name: Get notebook list
+ id: ml_pipelines
+ run: |
+ echo "notebooks=$(ls notebooks/*.ipynb | sort -n | jq -R -s -c 'split("\n")[:-1]')" >> $GITHUB_OUTPUT
+
+ strategy:
+ matrix:
+ notebook: ${{ fromJson(steps.ml_pipelines.outputs.notebooks) }}
- name: Set up Python
run: uv python install
@@ -33,7 +42,7 @@ jobs:
uv sync --all-extras --dev
- name: Run pipeline
- run: uv run ipython notebooks/1_feature_engineering.ipynb
+ run: uv run ipython ${{ matrix.notebook }}
env:
HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }}
diff --git a/notebooks/1_feature_engineering.ipynb b/notebooks/1_feature_engineering.ipynb
index d3d0f16..9344e62 100644
--- a/notebooks/1_feature_engineering.ipynb
+++ b/notebooks/1_feature_engineering.ipynb
@@ -28,7 +28,13 @@
"\n",
"root_dir = str(Path().absolute().parent)\n",
"if root_dir not in sys.path:\n",
- " sys.path.append(root_dir)"
+ " sys.path.append(root_dir)\n",
+ "\n",
+ "# Exit the notebook\n",
+ "print(\"BAAAAM\")\n",
+ "sys.exit(\"Exiting notebook\")\n",
+ "\n",
+ "\n"
]
},
{
@@ -143,12 +149,12 @@
"\n",
"import random\n",
"import polars as pl\n",
- "import numpy as np\n",
"import torch\n",
"from sentence_transformers import SentenceTransformer\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
+ "from recsys import utils\n",
"from recsys.features.articles import (\n",
" prepare_articles, \n",
" generate_embeddings_for_dataframe,\n",
@@ -187,17 +193,7 @@
}
],
"source": [
- "import hopsworks\n",
- "\n",
- "# TODO: How to adapt this for GA and Modal?\n",
- "HOPSWORKS_API_KEY = os.environ.get(\"HOPSWORKS_API_KEY\")\n",
- "if HOPSWORKS_API_KEY:\n",
- " print(\"Found Hopsworks API Key!\")\n",
- " project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)\n",
- "else:\n",
- " project = hopsworks.login()\n",
- "\n",
- "fs = project.get_feature_store()"
+ "fs = utils.get_hopsworks_feature_store()"
]
},
{
diff --git a/notebooks/2_train_retrieval_model.ipynb b/notebooks/2_train_retrieval_model.ipynb
index 5e2283b..1b98c9f 100644
--- a/notebooks/2_train_retrieval_model.ipynb
+++ b/notebooks/2_train_retrieval_model.ipynb
@@ -31,7 +31,11 @@
"\n",
"root_dir = str(Path().absolute().parent)\n",
"if root_dir not in sys.path:\n",
- " sys.path.append(root_dir)"
+ " sys.path.append(root_dir)\n",
+ "\n",
+ "# Exit the notebook\n",
+ "print(\"BAAAAM\")\n",
+ "sys.exit(\"Exiting notebook\")"
]
},
{
diff --git a/notebooks/3_embeddings_creation.ipynb b/notebooks/3_embeddings_creation.ipynb
index b5d2a43..5fba3c4 100644
--- a/notebooks/3_embeddings_creation.ipynb
+++ b/notebooks/3_embeddings_creation.ipynb
@@ -20,7 +20,11 @@
"\n",
"root_dir = str(Path().absolute().parent)\n",
"if root_dir not in sys.path:\n",
- " sys.path.append(root_dir)"
+ " sys.path.append(root_dir)\n",
+ "\n",
+ "# Exit the notebook\n",
+ "print(\"BAAAAM\")\n",
+ "sys.exit(\"Exiting notebook\")"
]
},
{
diff --git a/notebooks/4_train_ranking_model.ipynb b/notebooks/4_train_ranking_model.ipynb
index 18fd78a..fc54dae 100644
--- a/notebooks/4_train_ranking_model.ipynb
+++ b/notebooks/4_train_ranking_model.ipynb
@@ -9,6 +9,24 @@
"In this notebook, you will train a ranking model using gradient boosted trees. "
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "from pathlib import Path\n",
+ "\n",
+ "root_dir = str(Path().absolute().parent)\n",
+ "if root_dir not in sys.path:\n",
+ " sys.path.append(root_dir)\n",
+ "\n",
+ "# Exit the notebook\n",
+ "print(\"BAAAAM\")\n",
+ "sys.exit(\"Exiting notebook\")"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 1,
diff --git a/notebooks/5_create_deployments.ipynb b/notebooks/5_create_deployments.ipynb
index f3f3b73..1ab87a9 100644
--- a/notebooks/5_create_deployments.ipynb
+++ b/notebooks/5_create_deployments.ipynb
@@ -11,6 +11,24 @@
"**NOTE Currently the transformer scripts are not implemented.**"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "from pathlib import Path\n",
+ "\n",
+ "root_dir = str(Path().absolute().parent)\n",
+ "if root_dir not in sys.path:\n",
+ " sys.path.append(root_dir)\n",
+ "\n",
+ "# Exit the notebook\n",
+ "print(\"BAAAAM\")\n",
+ "sys.exit(\"Exiting notebook\")"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 22,
diff --git a/notebooks/7_job_scheduler.ipynb b/notebooks/7_job_scheduler.ipynb
index 4546f2f..f36e60c 100644
--- a/notebooks/7_job_scheduler.ipynb
+++ b/notebooks/7_job_scheduler.ipynb
@@ -8,6 +8,25 @@
"## 🗓️ Job Scheduling \n"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0dc1480f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "from pathlib import Path\n",
+ "\n",
+ "root_dir = str(Path().absolute().parent)\n",
+ "if root_dir not in sys.path:\n",
+ " sys.path.append(root_dir)\n",
+ "\n",
+ "# Exit the notebook\n",
+ "print(\"BAAAAM\")\n",
+ "sys.exit(\"Exiting notebook\")"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 1,
diff --git a/pyproject.toml b/pyproject.toml
index 10d858a..f201c96 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,6 +10,7 @@ dependencies = [
"huggingface-hub==0.24.7",
"langchain-openai==0.1.14",
"langchain==0.2.6",
+ "loguru>=0.7.2",
"polars==1.9.0",
"sentence-transformers==2.2.2",
"streamlit==1.28.2",
diff --git a/recsys/utils.py b/recsys/utils.py
new file mode 100644
index 0000000..21e8822
--- /dev/null
+++ b/recsys/utils.py
@@ -0,0 +1,16 @@
+import os
+import hopsworks
+
+from loguru import logger
+
+
+def get_hopsworks_feature_store():
+ HOPSWORKS_API_KEY = os.environ.get("HOPSWORKS_API_KEY")
+ if HOPSWORKS_API_KEY:
+ logger.info("Logging in to Hopsworks using the HOPSWORKS_API_KEY env var.")
+ project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
+ else:
+ logger.info("Logging in to Hopsworks using the cached API key.")
+ project = hopsworks.login()
+
+ return project.get_feature_store()
diff --git a/notebooks/6_inference_and_ui.py b/tools/6_inference_and_ui.py
similarity index 100%
rename from notebooks/6_inference_and_ui.py
rename to tools/6_inference_and_ui.py
diff --git a/tools/clean_hopsworks_resources.py b/tools/clean_hopsworks_resources.py
new file mode 100644
index 0000000..42da1a1
--- /dev/null
+++ b/tools/clean_hopsworks_resources.py
@@ -0,0 +1,78 @@
+import hopsworks
+
+# Login to Hopsworks
+project = hopsworks.login()
+
+
+# Get deployment registry
+mr = project.get_model_serving()
+
+# List all deployments
+deployments = mr.get_deployments()
+
+# Delete each deployment
+for deployment in deployments:
+ print(f"Deleting deployment: {deployment.name}.")
+ deployment.delete()
+
+# Get the model registry
+mr = project.get_model_registry()
+
+# List all models
+for model_name in ["ranking_model", "candidate_model", "query_model"]:
+ models = mr.get_models(name=model_name)
+
+ # Delete each model
+ for model in models:
+ print(f"Deleting model: {model.name} (version: {model.version})")
+ model.delete()
+
+
+# Get feature store
+fs = project.get_feature_store()
+
+
+for feature_view in [
+ "retrieval",
+ "articles",
+ "customers",
+ "candidate_embeddings",
+ "ranking",
+]:
+ # Get all feature views
+ try:
+ feature_views = fs.get_feature_views(name=feature_view)
+ except Exception:
+ print(f"Couldn't find feature view: {feature_view}. Skipping...")
+ feature_views = []
+
+ # Delete each feature view
+ for fv in feature_views:
+ print(f"Deleting feature view: {fv.name} (version: {fv.version})")
+ try:
+ fv.delete()
+ except Exception:
+ print(f"Failed to delete feature view {fv.name}.")
+
+for feature_group in [
+ "customers",
+ "articles",
+ "transactions",
+ "interactions",
+ "candidate_embeddings_fg",
+ "ranking",
+]:
+ # Get all feature groups
+ try:
+ feature_groups = fs.get_feature_groups(name=feature_group)
+ except Exception:
+ print(f"Couldn't find feature group: {feature_group}. Skipping...")
+ feature_groups = []
+
+ # Delete each feature group
+ for fg in feature_groups:
+ print(f"Deleting feature group: {fg.name} (version: {fg.version})")
+ try:
+ fg.delete()
+ except Exception:
+ print(f"Failed to delete feature group {fg.name}.")
diff --git a/uv.lock b/uv.lock
index 5140a26..1db4dee 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1079,6 +1079,7 @@ dependencies = [
{ name = "huggingface-hub" },
{ name = "langchain" },
{ name = "langchain-openai" },
+ { name = "loguru" },
{ name = "polars" },
{ name = "sentence-transformers" },
{ name = "streamlit" },
@@ -1098,6 +1099,7 @@ requires-dist = [
{ name = "huggingface-hub", specifier = "==0.24.7" },
{ name = "langchain", specifier = "==0.2.6" },
{ name = "langchain-openai", specifier = "==0.1.14" },
+ { name = "loguru", specifier = ">=0.7.2" },
{ name = "polars", specifier = "==1.9.0" },
{ name = "sentence-transformers", specifier = "==2.2.2" },
{ name = "streamlit", specifier = "==1.28.2" },
@@ -1807,6 +1809,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/71/cf/e01dc4cc79779cd82d77888a88ae2fa424d93b445ad4f6c02bfc18335b70/libclang-18.1.1-py2.py3-none-win_arm64.whl", hash = "sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8", size = 22361112 },
]
+[[package]]
+name = "loguru"
+version = "0.7.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "colorama", marker = "sys_platform == 'win32'" },
+ { name = "win32-setctime", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9e/30/d87a423766b24db416a46e9335b9602b054a72b96a88a241f2b09b560fa8/loguru-0.7.2.tar.gz", hash = "sha256:e671a53522515f34fd406340ee968cb9ecafbc4b36c679da03c18fd8d0bd51ac", size = 145103 }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/03/0a/4f6fed21aa246c6b49b561ca55facacc2a44b87d65b8b92362a8e99ba202/loguru-0.7.2-py3-none-any.whl", hash = "sha256:003d71e3d3ed35f0f8984898359d65b79e5b21943f78af86aa5491210429b8eb", size = 62549 },
+]
+
[[package]]
name = "makefun"
version = "1.15.6"
@@ -4154,6 +4169,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/21/02/88b65cc394961a60c43c70517066b6b679738caf78506a5da7b88ffcb643/widgetsnbextension-4.0.13-py3-none-any.whl", hash = "sha256:74b2692e8500525cc38c2b877236ba51d34541e6385eeed5aec15a70f88a6c71", size = 2335872 },
]
+[[package]]
+name = "win32-setctime"
+version = "1.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/6b/dd/f95a13d2b235a28d613ba23ebad55191514550debb968b46aab99f2e3a30/win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2", size = 3676 }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/0a/e6/a7d828fef907843b2a5773ebff47fb79ac0c1c88d60c0ca9530ee941e248/win32_setctime-1.1.0-py3-none-any.whl", hash = "sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad", size = 3604 },
+]
+
[[package]]
name = "wrapt"
version = "1.14.1"