From 4fe884f3c4cfa13788dd6c7d718cedba7dabb050 Mon Sep 17 00:00:00 2001 From: iusztinpaul Date: Fri, 8 Nov 2024 12:56:40 +0200 Subject: [PATCH] feat: run all ML pipelines using GA --- .github/workflows/feature_pipeline.yaml | 11 ++- notebooks/1_feature_engineering.ipynb | 22 +++--- notebooks/2_train_retrieval_model.ipynb | 6 +- notebooks/3_embeddings_creation.ipynb | 6 +- notebooks/4_train_ranking_model.ipynb | 18 +++++ notebooks/5_create_deployments.ipynb | 18 +++++ notebooks/7_job_scheduler.ipynb | 19 ++++++ pyproject.toml | 1 + recsys/utils.py | 16 +++++ {notebooks => tools}/6_inference_and_ui.py | 0 tools/clean_hopsworks_resources.py | 78 ++++++++++++++++++++++ uv.lock | 24 +++++++ 12 files changed, 203 insertions(+), 16 deletions(-) create mode 100644 recsys/utils.py rename {notebooks => tools}/6_inference_and_ui.py (100%) create mode 100644 tools/clean_hopsworks_resources.py diff --git a/.github/workflows/feature_pipeline.yaml b/.github/workflows/feature_pipeline.yaml index a1f054c..bc897ce 100644 --- a/.github/workflows/feature_pipeline.yaml +++ b/.github/workflows/feature_pipeline.yaml @@ -24,6 +24,15 @@ jobs: uses: astral-sh/setup-uv@v3 with: version: 0.4.30 + + - name: Get notebook list + id: ml_pipelines + run: | + echo "notebooks=$(ls notebooks/*.ipynb | sort -n | jq -R -s -c 'split("\n")[:-1]')" >> $GITHUB_OUTPUT + + strategy: + matrix: + notebook: ${{ fromJson(steps.ml_pipelines.outputs.notebooks) }} - name: Set up Python run: uv python install @@ -33,7 +42,7 @@ jobs: uv sync --all-extras --dev - name: Run pipeline - run: uv run ipython notebooks/1_feature_engineering.ipynb + run: uv run ipython ${{ matrix.notebook }} env: HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} diff --git a/notebooks/1_feature_engineering.ipynb b/notebooks/1_feature_engineering.ipynb index d3d0f16..9344e62 100644 --- a/notebooks/1_feature_engineering.ipynb +++ b/notebooks/1_feature_engineering.ipynb @@ -28,7 +28,13 @@ "\n", "root_dir = str(Path().absolute().parent)\n", 
"if root_dir not in sys.path:\n", - " sys.path.append(root_dir)" + " sys.path.append(root_dir)\n", + "\n", + "# Exit the notebook\n", + "print(\"BAAAAM\")\n", + "sys.exit(\"Exiting notebook\")\n", + "\n", + "\n" ] }, { @@ -143,12 +149,12 @@ "\n", "import random\n", "import polars as pl\n", - "import numpy as np\n", "import torch\n", "from sentence_transformers import SentenceTransformer\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", + "from recsys import utils\n", "from recsys.features.articles import (\n", " prepare_articles, \n", " generate_embeddings_for_dataframe,\n", @@ -187,17 +193,7 @@ } ], "source": [ - "import hopsworks\n", - "\n", - "# TODO: How to adapt this for GA and Modal?\n", - "HOPSWORKS_API_KEY = os.environ.get(\"HOPSWORKS_API_KEY\")\n", - "if HOPSWORKS_API_KEY:\n", - " print(\"Found Hopsworks API Key!\")\n", - " project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)\n", - "else:\n", - " project = hopsworks.login()\n", - "\n", - "fs = project.get_feature_store()" + "fs = utils.get_hopsworks_feature_store()" ] }, { diff --git a/notebooks/2_train_retrieval_model.ipynb b/notebooks/2_train_retrieval_model.ipynb index 5e2283b..1b98c9f 100644 --- a/notebooks/2_train_retrieval_model.ipynb +++ b/notebooks/2_train_retrieval_model.ipynb @@ -31,7 +31,11 @@ "\n", "root_dir = str(Path().absolute().parent)\n", "if root_dir not in sys.path:\n", - " sys.path.append(root_dir)" + " sys.path.append(root_dir)\n", + "\n", + "# Exit the notebook\n", + "print(\"BAAAAM\")\n", + "sys.exit(\"Exiting notebook\")" ] }, { diff --git a/notebooks/3_embeddings_creation.ipynb b/notebooks/3_embeddings_creation.ipynb index b5d2a43..5fba3c4 100644 --- a/notebooks/3_embeddings_creation.ipynb +++ b/notebooks/3_embeddings_creation.ipynb @@ -20,7 +20,11 @@ "\n", "root_dir = str(Path().absolute().parent)\n", "if root_dir not in sys.path:\n", - " sys.path.append(root_dir)" + " sys.path.append(root_dir)\n", + "\n", + "# Exit the notebook\n", + 
"print(\"BAAAAM\")\n", + "sys.exit(\"Exiting notebook\")" ] }, { diff --git a/notebooks/4_train_ranking_model.ipynb b/notebooks/4_train_ranking_model.ipynb index 18fd78a..fc54dae 100644 --- a/notebooks/4_train_ranking_model.ipynb +++ b/notebooks/4_train_ranking_model.ipynb @@ -9,6 +9,24 @@ "In this notebook, you will train a ranking model using gradient boosted trees. " ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "root_dir = str(Path().absolute().parent)\n", + "if root_dir not in sys.path:\n", + " sys.path.append(root_dir)\n", + "\n", + "# Exit the notebook\n", + "print(\"BAAAAM\")\n", + "sys.exit(\"Exiting notebook\")" + ] + }, { "cell_type": "code", "execution_count": 1, diff --git a/notebooks/5_create_deployments.ipynb b/notebooks/5_create_deployments.ipynb index f3f3b73..1ab87a9 100644 --- a/notebooks/5_create_deployments.ipynb +++ b/notebooks/5_create_deployments.ipynb @@ -11,6 +11,24 @@ "**NOTE Currently the transformer scripts are not implemented.**" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "root_dir = str(Path().absolute().parent)\n", + "if root_dir not in sys.path:\n", + " sys.path.append(root_dir)\n", + "\n", + "# Exit the notebook\n", + "print(\"BAAAAM\")\n", + "sys.exit(\"Exiting notebook\")" + ] + }, { "cell_type": "code", "execution_count": 22, diff --git a/notebooks/7_job_scheduler.ipynb b/notebooks/7_job_scheduler.ipynb index 4546f2f..f36e60c 100644 --- a/notebooks/7_job_scheduler.ipynb +++ b/notebooks/7_job_scheduler.ipynb @@ -8,6 +8,25 @@ "## 🗓️ Job Scheduling \n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "0dc1480f", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "root_dir = str(Path().absolute().parent)\n", + "if 
root_dir not in sys.path:\n", + " sys.path.append(root_dir)\n", + "\n", + "# Exit the notebook\n", + "print(\"BAAAAM\")\n", + "sys.exit(\"Exiting notebook\")" ] }, { "cell_type": "code", "execution_count": 1, diff --git a/pyproject.toml b/pyproject.toml index 10d858a..f201c96 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ dependencies = [ "huggingface-hub==0.24.7", "langchain-openai==0.1.14", "langchain==0.2.6", + "loguru>=0.7.2", "polars==1.9.0", "sentence-transformers==2.2.2", "streamlit==1.28.2", diff --git a/recsys/utils.py b/recsys/utils.py new file mode 100644 index 0000000..21e8822 --- /dev/null +++ b/recsys/utils.py @@ -0,0 +1,16 @@ +import os +import hopsworks + +from loguru import logger + + +def get_hopsworks_feature_store(): + HOPSWORKS_API_KEY = os.environ.get("HOPSWORKS_API_KEY") + if HOPSWORKS_API_KEY: + logger.info("Logging in to Hopsworks using HOPSWORKS_API_KEY env var.") + project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY) + else: + logger.info("Logging in to Hopsworks using cached API key.") + project = hopsworks.login() + + return project.get_feature_store() diff --git a/notebooks/6_inference_and_ui.py b/tools/6_inference_and_ui.py similarity index 100% rename from notebooks/6_inference_and_ui.py rename to tools/6_inference_and_ui.py diff --git a/tools/clean_hopsworks_resources.py b/tools/clean_hopsworks_resources.py new file mode 100644 index 0000000..42da1a1 --- /dev/null +++ b/tools/clean_hopsworks_resources.py @@ -0,0 +1,78 @@ +import hopsworks + +# Login to Hopsworks +project = hopsworks.login() + + +# Get deployment registry +mr = project.get_model_serving() + +# List all deployments +deployments = mr.get_deployments() + +# Delete each deployment +for deployment in deployments: + print(f"Deleting deployment: {deployment.name}.") + deployment.delete() + +# Get the model registry +mr = project.get_model_registry() + +# List all models +for model_name in ["ranking_model", "candidate_model", "query_model"]: + models = 
mr.get_models(name=model_name) + + # Delete each model + for model in models: + print(f"Deleting model: {model.name} (version: {model.version})") + model.delete() + + +# Get feature store +fs = project.get_feature_store() + + +for feature_view in [ + "retrieval", + "articles", + "customers", + "candidate_embeddings", + "ranking", +]: + # Get all feature views + try: + feature_views = fs.get_feature_views(name=feature_view) + except Exception: + print(f"Couldn't find feature view: {feature_view}. Skipping...") + feature_views = [] + + # Delete each feature view + for fv in feature_views: + print(f"Deleting feature view: {fv.name} (version: {fv.version})") + try: + fv.delete() + except Exception: + print(f"Failed to delete feature view {fv.name}.") + +for feature_group in [ + "customers", + "articles", + "transactions", + "interactions", + "candidate_embeddings_fg", + "ranking", +]: + # Get all feature groups + try: + feature_groups = fs.get_feature_groups(name=feature_group) + except Exception: + print(f"Couldn't find feature group: {feature_group}. 
Skipping...") + feature_groups = [] + + # Delete each feature group + for fg in feature_groups: + print(f"Deleting feature group: {fg.name} (version: {fg.version})") + try: + fg.delete() + except Exception: + print(f"Failed to delete feature group {fg.name}.") diff --git a/uv.lock b/uv.lock index 5140a26..1db4dee 100644 --- a/uv.lock +++ b/uv.lock @@ -1079,6 +1079,7 @@ dependencies = [ { name = "huggingface-hub" }, { name = "langchain" }, { name = "langchain-openai" }, + { name = "loguru" }, { name = "polars" }, { name = "sentence-transformers" }, { name = "streamlit" }, @@ -1098,6 +1099,7 @@ requires-dist = [ { name = "huggingface-hub", specifier = "==0.24.7" }, { name = "langchain", specifier = "==0.2.6" }, { name = "langchain-openai", specifier = "==0.1.14" }, + { name = "loguru", specifier = ">=0.7.2" }, { name = "polars", specifier = "==1.9.0" }, { name = "sentence-transformers", specifier = "==2.2.2" }, { name = "streamlit", specifier = "==1.28.2" }, @@ -1807,6 +1809,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/71/cf/e01dc4cc79779cd82d77888a88ae2fa424d93b445ad4f6c02bfc18335b70/libclang-18.1.1-py2.py3-none-win_arm64.whl", hash = "sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8", size = 22361112 }, ] +[[package]] +name = "loguru" +version = "0.7.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "win32-setctime", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/30/d87a423766b24db416a46e9335b9602b054a72b96a88a241f2b09b560fa8/loguru-0.7.2.tar.gz", hash = "sha256:e671a53522515f34fd406340ee968cb9ecafbc4b36c679da03c18fd8d0bd51ac", size = 145103 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/0a/4f6fed21aa246c6b49b561ca55facacc2a44b87d65b8b92362a8e99ba202/loguru-0.7.2-py3-none-any.whl", hash = "sha256:003d71e3d3ed35f0f8984898359d65b79e5b21943f78af86aa5491210429b8eb", 
size = 62549 }, +] + [[package]] name = "makefun" version = "1.15.6" @@ -4154,6 +4169,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/21/02/88b65cc394961a60c43c70517066b6b679738caf78506a5da7b88ffcb643/widgetsnbextension-4.0.13-py3-none-any.whl", hash = "sha256:74b2692e8500525cc38c2b877236ba51d34541e6385eeed5aec15a70f88a6c71", size = 2335872 }, ] +[[package]] +name = "win32-setctime" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6b/dd/f95a13d2b235a28d613ba23ebad55191514550debb968b46aab99f2e3a30/win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2", size = 3676 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0a/e6/a7d828fef907843b2a5773ebff47fb79ac0c1c88d60c0ca9530ee941e248/win32_setctime-1.1.0-py3-none-any.whl", hash = "sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad", size = 3604 }, +] + [[package]] name = "wrapt" version = "1.14.1"