From 4fe884f3c4cfa13788dd6c7d718cedba7dabb050 Mon Sep 17 00:00:00 2001
From: iusztinpaul
Date: Fri, 8 Nov 2024 12:56:40 +0200
Subject: [PATCH] feat: run all ML pipelines using GA
---
.github/workflows/feature_pipeline.yaml | 11 ++-
notebooks/1_feature_engineering.ipynb | 22 +++---
notebooks/2_train_retrieval_model.ipynb | 6 +-
notebooks/3_embeddings_creation.ipynb | 6 +-
notebooks/4_train_ranking_model.ipynb | 18 +++++
notebooks/5_create_deployments.ipynb | 18 +++++
notebooks/7_job_scheduler.ipynb | 19 ++++++
pyproject.toml | 1 +
recsys/utils.py | 16 +++++
{notebooks => tools}/6_inference_and_ui.py | 0
tools/clean_hopsworks_resources.py | 78 ++++++++++++++++++++++
uv.lock | 24 +++++++
12 files changed, 203 insertions(+), 16 deletions(-)
create mode 100644 recsys/utils.py
rename {notebooks => tools}/6_inference_and_ui.py (100%)
create mode 100644 tools/clean_hopsworks_resources.py
diff --git a/.github/workflows/feature_pipeline.yaml b/.github/workflows/feature_pipeline.yaml
index a1f054c..bc897ce 100644
--- a/.github/workflows/feature_pipeline.yaml
+++ b/.github/workflows/feature_pipeline.yaml
@@ -24,6 +24,15 @@ jobs:
uses: astral-sh/setup-uv@v3
with:
version: 0.4.30
+
+ - name: Get notebook list
+ id: ml_pipelines
+ run: |
+ echo "notebooks=$(ls notebooks/*.ipynb | sort -n | jq -R -s -c 'split("\n")[:-1]')" >> $GITHUB_OUTPUT
+
+ strategy:
+ matrix:
+ notebook: ${{ fromJson(steps.ml_pipelines.outputs.notebooks) }}
- name: Set up Python
run: uv python install
@@ -33,7 +42,7 @@ jobs:
uv sync --all-extras --dev
- name: Run pipeline
- run: uv run ipython notebooks/1_feature_engineering.ipynb
+ run: uv run ipython ${{ matrix.notebook }}
env:
HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }}
diff --git a/notebooks/1_feature_engineering.ipynb b/notebooks/1_feature_engineering.ipynb
index d3d0f16..9344e62 100644
--- a/notebooks/1_feature_engineering.ipynb
+++ b/notebooks/1_feature_engineering.ipynb
@@ -28,7 +28,13 @@
"\n",
"root_dir = str(Path().absolute().parent)\n",
"if root_dir not in sys.path:\n",
- " sys.path.append(root_dir)"
+ " sys.path.append(root_dir)\n",
+ "\n",
+ "# Exit the notebook\n",
+ "print(\"BAAAAM\")\n",
+ "sys.exit(\"Exiting notebook\")\n",
+ "\n",
+ "\n"
]
},
{
@@ -143,12 +149,12 @@
"\n",
"import random\n",
"import polars as pl\n",
- "import numpy as np\n",
"import torch\n",
"from sentence_transformers import SentenceTransformer\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
+ "from recsys import utils\n",
"from recsys.features.articles import (\n",
" prepare_articles, \n",
" generate_embeddings_for_dataframe,\n",
@@ -187,17 +193,7 @@
}
],
"source": [
- "import hopsworks\n",
- "\n",
- "# TODO: How to adapt this for GA and Modal?\n",
- "HOPSWORKS_API_KEY = os.environ.get(\"HOPSWORKS_API_KEY\")\n",
- "if HOPSWORKS_API_KEY:\n",
- " print(\"Found Hopsworks API Key!\")\n",
- " project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)\n",
- "else:\n",
- " project = hopsworks.login()\n",
- "\n",
- "fs = project.get_feature_store()"
+ "fs = utils.get_hopsworks_feature_store()"
]
},
{
diff --git a/notebooks/2_train_retrieval_model.ipynb b/notebooks/2_train_retrieval_model.ipynb
index 5e2283b..1b98c9f 100644
--- a/notebooks/2_train_retrieval_model.ipynb
+++ b/notebooks/2_train_retrieval_model.ipynb
@@ -31,7 +31,11 @@
"\n",
"root_dir = str(Path().absolute().parent)\n",
"if root_dir not in sys.path:\n",
- " sys.path.append(root_dir)"
+ " sys.path.append(root_dir)\n",
+ "\n",
+ "# Exit the notebook\n",
+ "print(\"BAAAAM\")\n",
+ "sys.exit(\"Exiting notebook\")"
]
},
{
diff --git a/notebooks/3_embeddings_creation.ipynb b/notebooks/3_embeddings_creation.ipynb
index b5d2a43..5fba3c4 100644
--- a/notebooks/3_embeddings_creation.ipynb
+++ b/notebooks/3_embeddings_creation.ipynb
@@ -20,7 +20,11 @@
"\n",
"root_dir = str(Path().absolute().parent)\n",
"if root_dir not in sys.path:\n",
- " sys.path.append(root_dir)"
+ " sys.path.append(root_dir)\n",
+ "\n",
+ "# Exit the notebook\n",
+ "print(\"BAAAAM\")\n",
+ "sys.exit(\"Exiting notebook\")"
]
},
{
diff --git a/notebooks/4_train_ranking_model.ipynb b/notebooks/4_train_ranking_model.ipynb
index 18fd78a..fc54dae 100644
--- a/notebooks/4_train_ranking_model.ipynb
+++ b/notebooks/4_train_ranking_model.ipynb
@@ -9,6 +9,24 @@
"In this notebook, you will train a ranking model using gradient boosted trees. "
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "from pathlib import Path\n",
+ "\n",
+ "root_dir = str(Path().absolute().parent)\n",
+ "if root_dir not in sys.path:\n",
+ " sys.path.append(root_dir)\n",
+ "\n",
+ "# Exit the notebook\n",
+ "print(\"BAAAAM\")\n",
+ "sys.exit(\"Exiting notebook\")"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 1,
diff --git a/notebooks/5_create_deployments.ipynb b/notebooks/5_create_deployments.ipynb
index f3f3b73..1ab87a9 100644
--- a/notebooks/5_create_deployments.ipynb
+++ b/notebooks/5_create_deployments.ipynb
@@ -11,6 +11,24 @@
"**NOTE Currently the transformer scripts are not implemented.**"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "from pathlib import Path\n",
+ "\n",
+ "root_dir = str(Path().absolute().parent)\n",
+ "if root_dir not in sys.path:\n",
+ " sys.path.append(root_dir)\n",
+ "\n",
+ "# Exit the notebook\n",
+ "print(\"BAAAAM\")\n",
+ "sys.exit(\"Exiting notebook\")"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 22,
diff --git a/notebooks/7_job_scheduler.ipynb b/notebooks/7_job_scheduler.ipynb
index 4546f2f..f36e60c 100644
--- a/notebooks/7_job_scheduler.ipynb
+++ b/notebooks/7_job_scheduler.ipynb
@@ -8,6 +8,25 @@
"## 🗓️ Job Scheduling \n"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0dc1480f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "from pathlib import Path\n",
+ "\n",
+ "root_dir = str(Path().absolute().parent)\n",
+ "if root_dir not in sys.path:\n",
+ " sys.path.append(root_dir)\n",
+ "\n",
+ "# Exit the notebook\n",
+ "print(\"BAAAAM\")\n",
+ "sys.exit(\"Exiting notebook\")"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 1,
diff --git a/pyproject.toml b/pyproject.toml
index 10d858a..f201c96 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,6 +10,7 @@ dependencies = [
"huggingface-hub==0.24.7",
"langchain-openai==0.1.14",
"langchain==0.2.6",
+ "loguru>=0.7.2",
"polars==1.9.0",
"sentence-transformers==2.2.2",
"streamlit==1.28.2",
diff --git a/recsys/utils.py b/recsys/utils.py
new file mode 100644
index 0000000..21e8822
--- /dev/null
+++ b/recsys/utils.py
@@ -0,0 +1,16 @@
+import os
+import hopsworks
+
+from loguru import logger
+
+
+def get_hopsworks_feature_store():
+ HOPSWORKS_API_KEY = os.environ.get("HOPSWORKS_API_KEY")
+ if HOPSWORKS_API_KEY:
+ logger.info("Logging in to Hopsworks using the HOPSWORKS_API_KEY env var.")
+ project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
+ else:
+ logger.info("Logging in to Hopsworks using the cached API key.")
+ project = hopsworks.login()
+
+ return project.get_feature_store()
diff --git a/notebooks/6_inference_and_ui.py b/tools/6_inference_and_ui.py
similarity index 100%
rename from notebooks/6_inference_and_ui.py
rename to tools/6_inference_and_ui.py
diff --git a/tools/clean_hopsworks_resources.py b/tools/clean_hopsworks_resources.py
new file mode 100644
index 0000000..42da1a1
--- /dev/null
+++ b/tools/clean_hopsworks_resources.py
@@ -0,0 +1,78 @@
+import hopsworks
+
+# Login to Hopsworks
+project = hopsworks.login()
+
+
+# Get deployment registry
+mr = project.get_model_serving()
+
+# List all deployments
+deployments = mr.get_deployments()
+
+# Delete each deployment
+for deployment in deployments:
+ print(f"Deleting deployment: {deployment.name}.")
+ deployment.delete()
+
+# Get the model registry
+mr = project.get_model_registry()
+
+# List all models
+for model_name in ["ranking_model", "candidate_model", "query_model"]:
+ models = mr.get_models(name=model_name)
+
+ # Delete each model
+ for model in models:
+ print(f"Deleting model: {model.name} (version: {model.version})")
+ model.delete()
+
+
+# Get feature store
+fs = project.get_feature_store()
+
+
+for feature_view in [
+ "retrieval",
+ "articles",
+ "customers",
+ "candidate_embeddings",
+ "ranking",
+]:
+ # Get all feature views
+ try:
+ feature_views = fs.get_feature_views(name=feature_view)
+ except Exception:
+ print(f"Couldn't find feature view: {feature_view}. Skipping...")
+ feature_views = []
+
+ # Delete each feature view
+ for fv in feature_views:
+ print(f"Deleting feature view: {fv.name} (version: {fv.version})")
+ try:
+ fv.delete()
+ except Exception:
+ print(f"Failed to delete feature view {fv.name}.")
+
+for feature_group in [
+ "customers",
+ "articles",
+ "transactions",
+ "interactions",
+ "candidate_embeddings_fg",
+ "ranking",
+]:
+ # Get all feature groups
+ try:
+ feature_groups = fs.get_feature_groups(name=feature_group)
+ except Exception:
+ print(f"Couldn't find feature group: {feature_group}. Skipping...")
+ feature_groups = []
+
+ # Delete each feature group
+ for fg in feature_groups:
+ print(f"Deleting feature group: {fg.name} (version: {fg.version})")
+ try:
+ fg.delete()
+ except Exception:
+ print(f"Failed to delete feature group {fg.name}.")
diff --git a/uv.lock b/uv.lock
index 5140a26..1db4dee 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1079,6 +1079,7 @@ dependencies = [
{ name = "huggingface-hub" },
{ name = "langchain" },
{ name = "langchain-openai" },
+ { name = "loguru" },
{ name = "polars" },
{ name = "sentence-transformers" },
{ name = "streamlit" },
@@ -1098,6 +1099,7 @@ requires-dist = [
{ name = "huggingface-hub", specifier = "==0.24.7" },
{ name = "langchain", specifier = "==0.2.6" },
{ name = "langchain-openai", specifier = "==0.1.14" },
+ { name = "loguru", specifier = ">=0.7.2" },
{ name = "polars", specifier = "==1.9.0" },
{ name = "sentence-transformers", specifier = "==2.2.2" },
{ name = "streamlit", specifier = "==1.28.2" },
@@ -1807,6 +1809,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/71/cf/e01dc4cc79779cd82d77888a88ae2fa424d93b445ad4f6c02bfc18335b70/libclang-18.1.1-py2.py3-none-win_arm64.whl", hash = "sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8", size = 22361112 },
]
+[[package]]
+name = "loguru"
+version = "0.7.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "colorama", marker = "sys_platform == 'win32'" },
+ { name = "win32-setctime", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9e/30/d87a423766b24db416a46e9335b9602b054a72b96a88a241f2b09b560fa8/loguru-0.7.2.tar.gz", hash = "sha256:e671a53522515f34fd406340ee968cb9ecafbc4b36c679da03c18fd8d0bd51ac", size = 145103 }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/03/0a/4f6fed21aa246c6b49b561ca55facacc2a44b87d65b8b92362a8e99ba202/loguru-0.7.2-py3-none-any.whl", hash = "sha256:003d71e3d3ed35f0f8984898359d65b79e5b21943f78af86aa5491210429b8eb", size = 62549 },
+]
+
[[package]]
name = "makefun"
version = "1.15.6"
@@ -4154,6 +4169,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/21/02/88b65cc394961a60c43c70517066b6b679738caf78506a5da7b88ffcb643/widgetsnbextension-4.0.13-py3-none-any.whl", hash = "sha256:74b2692e8500525cc38c2b877236ba51d34541e6385eeed5aec15a70f88a6c71", size = 2335872 },
]
+[[package]]
+name = "win32-setctime"
+version = "1.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/6b/dd/f95a13d2b235a28d613ba23ebad55191514550debb968b46aab99f2e3a30/win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2", size = 3676 }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/0a/e6/a7d828fef907843b2a5773ebff47fb79ac0c1c88d60c0ca9530ee941e248/win32_setctime-1.1.0-py3-none-any.whl", hash = "sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad", size = 3604 },
+]
+
[[package]]
name = "wrapt"
version = "1.14.1"