From 1988e063fda7ecd1e7a8286d9c7c196ff07ccd42 Mon Sep 17 00:00:00 2001 From: JonFillip Date: Mon, 30 Sep 2024 22:32:20 +0100 Subject: [PATCH 1/2] Improving Kubeflow pipeline, refactored data ingestion and processing --- .DS_Store | Bin 6148 -> 8196 bytes .dockerignore | 1 + .github/workflows/ci_pipeline.yml | 386 ++++-- Dockerfile | 23 +- deployment/vertex_ai/vertex_ai_monitoring.py | 1213 +++++++---------- .../data_ingestion/data_ingestion.py | 40 +- kubeflow/components/deploy/deploy.py | 124 +- kubeflow/components/evaluation/evaluation.py | 100 +- .../feature_engineering/feature_eng.py | 87 +- .../components/feature_store/component.yaml | 31 + .../components/feature_store/feature_store.py | 111 ++ .../hyperparameter_tuning.py | 74 +- kubeflow/components/monitoring/monitor.py | 110 +- kubeflow/components/preprocess/preprocess.py | 77 +- kubeflow/components/test/component.yaml | 19 - kubeflow/components/test/test.py | 0 kubeflow/components/train/train.py | 102 +- kubeflow/pipeline.py | 137 +- notebooks/exploratory_analysis.ipynb | 2 +- requirements.txt | 77 +- scripts/run_pipeline.sh | 6 +- src/data_processing/data_ingestion.py | 154 ++- src/data_processing/data_prep.py | 176 +++ src/data_processing/data_preprocess.py | 171 --- src/data_processing/data_process.py | 175 +++ src/data_processing/data_validation.py | 41 +- src/feature_engineering/feat_engineering.py | 436 ++---- src/utils/data_versioning.py | 25 + tests/test_content_based.py | 156 +++ tests/test_data_prep.py | 81 ++ tests/test_data_processing.py | 2 +- tests/test_data_validation.py | 94 ++ tests/test_data_versioning.py | 28 + tests/test_feature_engineering.py | 213 +-- tests/test_feature_store.py | 32 + tests/test_hyperparameter_tuning.py | 75 +- tests/test_integration.py | 80 ++ tests/test_kubeflow_pipeline.py | 62 + tests/test_model_evaluation.py | 77 ++ tests/test_pipeline.py | 19 +- 40 files changed, 2903 insertions(+), 1914 deletions(-) create mode 100644 kubeflow/components/feature_store/component.yaml create mode 100644 kubeflow/components/feature_store/feature_store.py delete mode 100644 kubeflow/components/test/component.yaml delete mode 100644 kubeflow/components/test/test.py create mode 100644 src/data_processing/data_prep.py delete mode 100644 src/data_processing/data_preprocess.py create mode 100644 src/data_processing/data_process.py create mode 100644 src/utils/data_versioning.py create mode 100644 tests/test_content_based.py create mode 100644 tests/test_data_prep.py create mode 100644 tests/test_data_validation.py create mode 100644 tests/test_data_versioning.py create mode 100644 tests/test_feature_store.py create mode 100644 tests/test_integration.py create mode 100644 tests/test_kubeflow_pipeline.py create mode 100644 tests/test_model_evaluation.py diff --git a/.DS_Store b/.DS_Store index 57e63d09b8a9c01b9f06e52fd82ffb0cac8e40c4..9660dd73ee714a9f8113e894c51a71d59408b7d3 100644 GIT binary patch literal 8196 zcmeHM&2G~`5T0$*#%TrQ08%A{WQl7PQb>ysmk`ne2QCeQ1E65XX=>H=hWs={RZ-6H z4!i6=pzph9XFNrRL&A9su(lL9#pET(iB4_>Ui!l9I~Fu zSt1j4Qi(dL^vx>GP?WqKdal4p)e~u3%YbFzJOiA&U!w{Iq^Xn3-!(G%?z2dD{7Cyi z@xpq7S~-z|9xwX}GX9tj4cyl`xE{_0MtPjNW8jJyPjIwHWAY93X9K-*hE7f#=2)zL z7&<)HK22!^yN77on6-zxXU=t@XK1o*htTPOslY@FGhZ^y+yE{Daty4GHpEqc5vF+C z0Y&+l5%ZPRF3MYC+Lg1EVID24WzhwU>|rjRfn&b1xD~b=vuC@eu!22jdW8AP>ohZw z!S+Pv+=Z#E4%@@gWZ2Cdsp}bz9K;*E3_Etkrb*s@W+jI&2&!j}%Y5W;%b1(^2zbK= z1x|9!> $GITHUB_OUTPUT + + - name: Deploy to Cloud Run + env: + IMAGE_TAG: gcr.io/${{ env.PROJECT_ID }}/${{ env.IMAGE }}:${{ github.sha }} + run: | + set -e + 
gcloud run deploy ${{ env.IMAGE }} \ + --image $IMAGE_TAG \ + --region ${{ env.REGION }} \ + --platform managed \ + --allow-unauthenticated + + - name: Health check Cloud Run service + run: | + set -e + SERVICE_URL=$(gcloud run services describe ${{ env.IMAGE }} \ + --platform managed \ + --region ${{ env.REGION }} \ + --format='value(status.url)') + if ! curl -fsSL ${SERVICE_URL}/health; then + echo "Deployment failed. Rolling back to previous revision..." + gcloud run services update-traffic ${{ env.IMAGE }} \ + --platform managed \ + --region ${{ env.REGION }} \ + --to-revisions=${{ steps.current_revision.outputs.current_revision }}=100 + exit 1 + fi + + deploy-vertex-ai: + needs: build-and-push + runs-on: ubuntu-latest + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + permissions: + contents: read + id-token: write + steps: + - uses: actions/checkout@v3 + + - name: Set up Cloud SDK + uses: google-github-actions/setup-gcloud@v1 + + - name: Authenticate to Google Cloud + uses: google-github-actions/auth@v1 + with: + workload_identity_provider: ${{ env.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ env.GCP_SA_EMAIL }} + + - name: Verify GCP Authentication + run: gcloud auth list + + - name: Trigger Cloud Build + run: | + set -e + gcloud builds submit --config cloudbuild.yaml \ + --substitutions=_REGION=${{ env.REGION }} + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install Kubeflow Pipelines SDK + run: | + set -e + python -m pip install --upgrade pip + pip install kfp==2.0.0 google-cloud-aiplatform==1.28.0 + + - name: Deploy to Vertex AI + run: | + set -e + python kubeflow/pipeline.py \ + --project_id ${{ env.PROJECT_ID }} \ + --region ${{ env.REGION }} \ + --pipeline_root gs://${{ env.GCS_BUCKET }}/pipeline_root \ + --output_file pipeline.json + python deployment/vertex_ai/vertex_deployment.py \ + --project_id ${{ env.PROJECT_ID }} \ + --region ${{ env.REGION }} \ + --pipeline_spec pipeline.json \ + --pipeline_root gs://${{ env.GCS_BUCKET }}/pipeline_root \ + --model_name_file model_name.txt + + - name: Upload model name + if: always() + uses: actions/upload-artifact@v3 + with: + name: model-name + path: model_name.txt + + - name: Check Vertex AI model deployment status + run: | + set -e + MODEL_NAME=$(cat model_name.txt) + MODEL_STATUS=$(gcloud ai models describe $MODEL_NAME --region ${{ env.REGION }} --format="value(state)") + if [[ "$MODEL_STATUS" != "DEPLOYED" ]]; then + echo "Model not deployed successfully, exiting." + exit 1 + fi + + final-health-check: + needs: [deploy-cloud-run, deploy-vertex-ai] + runs-on: ubuntu-latest + permissions: + contents: read + id-token: write steps: - - uses: actions/checkout@v2 - - # Authenticate with Google Cloud - - name: Set up Cloud SDK - uses: google-github-actions/setup-gcloud@v0.2.0 - with: - project_id: ${{ secrets.GCP_PROJECT_ID }} - service_account_key: ${{ secrets.GCP_SA_KEY }} - export_default_credentials: true - - # Verify GCP Authentication - - name: Verify GCP Authentication - run: gcloud auth list - - # Validate the repository structure - - name: Validate repository structure - run: | - if [ ! -f "cloudbuild.yaml" ]; then - echo "cloudbuild.yaml not found, failing build." - exit 1 - fi - if [ ! -f "deployment/deploy_pipeline.py" ]; then - echo "deploy_pipeline.py not found, failing build." 
- exit 1 - fi - - # Trigger Cloud Build - - name: Trigger Cloud Build - run: | - gcloud builds submit --config cloudbuild.yaml \ - --substitutions=_REGION=us-central1 - - # Run smoke tests in Docker before deploying to Vertex AI - - name: Smoke Test - run: docker run music-recommender python -m unittest discover tests - - # Deploy to Vertex AI with pinned versions of kfp and google-cloud-aiplatform - - name: Deploy to Vertex AI - run: | - pip install 'kfp==1.7.2' 'google-cloud-aiplatform==1.7.1' - python deployment/deploy_pipeline.py \ - --platform vertex \ - --project_id ${{ secrets.GCP_PROJECT_ID }} \ - --region us-central1 \ - --output_file pipeline.yaml + - name: Set up Cloud SDK + uses: google-github-actions/setup-gcloud@v1 + + - name: Authenticate to Google Cloud + uses: google-github-actions/auth@v1 + with: + workload_identity_provider: ${{ env.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ env.GCP_SA_EMAIL }} + + - name: Download model name + uses: actions/download-artifact@v3 + with: + name: model-name + path: ./ + + - name: Final Health Checks + run: | + set -e + # Check Cloud Run + SERVICE_URL=$(gcloud run services describe ${{ env.IMAGE }} \ + --platform managed \ + --region ${{ env.REGION }} \ + --format='value(status.url)') + CLOUD_RUN_STATUS=$(curl -s -o /dev/null -w "%{http_code}" $SERVICE_URL/health) + + # Check Vertex AI + MODEL_NAME=$(cat model_name.txt) + VERTEX_AI_STATUS=$(gcloud ai models describe $MODEL_NAME --region ${{ env.REGION }} --format="value(state)") + + if [[ "$CLOUD_RUN_STATUS" != "200" || "$VERTEX_AI_STATUS" != "DEPLOYED" ]]; then + echo "Final health check failed. Cloud Run status: $CLOUD_RUN_STATUS, Vertex AI status: $VERTEX_AI_STATUS" + exit 1 + fi + + echo "All systems operational. Cloud Run status: $CLOUD_RUN_STATUS, Vertex AI status: $VERTEX_AI_STATUS" diff --git a/Dockerfile b/Dockerfile index 84f5623..8a0e989 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,12 +1,24 @@ -FROM python:3.9-slim +# Use Python 3.10 slim base image +FROM python:3.10-slim # Set working directory WORKDIR /app -# Copy requirements file +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + gcc \ + libffi-dev \ + libssl-dev \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements.txt COPY requirements.txt . -# Install dependencies +# Upgrade pip +RUN pip install --upgrade pip + +# Install Python dependencies RUN pip install --no-cache-dir -r requirements.txt # Copy the entire project @@ -15,10 +27,7 @@ COPY . . 
# Set environment variables ENV PYTHONPATH=/app -# Install additional dependencies for Kubeflow and Vertex AI -RUN pip install --no-cache-dir kfp google-cloud-aiplatform - -# Make sure all scripts are executable +# Make scripts executable RUN chmod +x src/*.py kubeflow/components/*/*.py # Run tests by default diff --git a/deployment/vertex_ai/vertex_ai_monitoring.py b/deployment/vertex_ai/vertex_ai_monitoring.py index 31546a6..af3cefc 100644 --- a/deployment/vertex_ai/vertex_ai_monitoring.py +++ b/deployment/vertex_ai/vertex_ai_monitoring.py @@ -1,10 +1,3 @@ -from scipy import stats -from scipy.stats import ks_2samp -from google.cloud import monitoring_v3, storage, bigquery, aiplatform -from google.api import label_pb2 as ga_label -from google.api import metric_pb2 as ga_metric -from google.protobuf import duration_pb2 as duration -from src.data_processing.data_validation import generate_schema, validate_data, load_config, load_statistics_from_gcs, load_schema_from_gcs, compare_statistics, compare_schemas import yaml import tensorflow_data_validation as tfdv import argparse @@ -12,301 +5,422 @@ import pandas as pd import datetime import random +from typing import Dict, List, Optional, Tuple, Union +from scipy.stats import ks_2samp +from google.cloud import monitoring_v3, storage, bigquery, aiplatform +from google.api import label_pb2 as ga_label +from google.api import metric_pb2 as ga_metric +from google.protobuf import duration_pb2 as duration +from src.data_processing.data_validation import ( + generate_schema, + validate_data, + load_config, + load_statistics_from_gcs, + load_schema_from_gcs, + compare_statistics, + compare_schemas, + save_statistics_to_gcs, + save_schema_to_gcs +) from src.utils.logging_utils import setup_logger, log_error, log_step +from ml_metadata import metadata_store +from ml_metadata.proto import metadata_store_pb2 logger = setup_logger('vertex_ai_pipeline_monitoring') -def setup_vertex_ai_monitoring(project_id, model_name): - """Sets up custom metrics for Vertex AI monitoring in Cloud Monitoring.""" - client = monitoring_v3.MetricServiceClient() - project_name = f"projects/{project_id}" - - # Define metrics - metrics = [ - { - "type": f"custom.googleapis.com/vertex_ai/{model_name}/prediction_drift", - "kind": ga_metric.MetricDescriptor.MetricKind.GAUGE, - "value_type": ga_metric.MetricDescriptor.ValueType.DOUBLE, - "description": "Prediction drift metric for Vertex AI model" - }, - { - "type": f"custom.googleapis.com/vertex_ai/{model_name}/data_drift", - "kind": ga_metric.MetricDescriptor.MetricKind.GAUGE, - "value_type": ga_metric.MetricDescriptor.ValueType.DOUBLE, - "description": "Data drift metric for Vertex AI model" - }, - { - "type": f"custom.googleapis.com/vertex_ai/{model_name}/prediction_latency", - "kind": ga_metric.MetricDescriptor.MetricKind.GAUGE, - "value_type": ga_metric.MetricDescriptor.ValueType.INT64, - "description": "Latency of prediction requests in milliseconds" - }, - { - "type": f"custom.googleapis.com/vertex_ai/{model_name}/accuracy", - "kind": ga_metric.MetricDescriptor.MetricKind.GAUGE, - "value_type": ga_metric.MetricDescriptor.ValueType.DOUBLE, - "description": "Accuracy of the Vertex AI model" - }, - { - "type": f"custom.googleapis.com/vertex_ai/{model_name}/schema_drift", - "kind": ga_metric.MetricDescriptor.MetricKind.GAUGE, - "value_type": ga_metric.MetricDescriptor.ValueType.INT64, # 0 or 1 indicating schema drift - "description": "Schema drift metric for Vertex AI model" - }, - { - "type": 
f"custom.googleapis.com/vertex_ai/{model_name}/missing_schema", - "kind": ga_metric.MetricDescriptor.MetricKind.GAUGE, - "value_type": ga_metric.MetricDescriptor.ValueType.INT64, # 0 or 1 indicating missing schema - "description": "Indicates whether the baseline schema is missing" - }, - { - "type": f"custom.googleapis.com/vertex_ai/{model_name}/missing_statistics", - "kind": ga_metric.MetricDescriptor.MetricKind.GAUGE, - "value_type": ga_metric.MetricDescriptor.ValueType.INT64, # 0 or 1 indicating missing schema - "description": "Indicates whether the baseline statistics is missing" - } - ] - - for metric in metrics: - descriptor = ga_metric.MetricDescriptor() - descriptor.type = metric["type"] - descriptor.metric_kind = metric["kind"] - descriptor.value_type = metric["value_type"] - descriptor.description = metric["description"] - - descriptor = client.create_metric_descriptor( - name=project_name, - metric_descriptor=descriptor +class VertexAIMonitoring: + def __init__(self, project_id: str, model_name: str, bucket_name: str): + self.project_id = project_id + self.model_name = model_name + self.bucket_name = bucket_name + self.client = monitoring_v3.MetricServiceClient() + self.project_name = f"projects/{project_id}" + self.feature_store_client = aiplatform.FeatureStore(project=project_id) + self.mlmd_connection_config = metadata_store_pb2.ConnectionConfig() + self.mlmd_connection_config.sqlite.filename_uri = f"gs://{bucket_name}/mlmd/metadata.db" + self.mlmd_store = metadata_store.MetadataStore(self.mlmd_connection_config) + + def setup_custom_metrics(self) -> None: + """Sets up custom metrics for Vertex AI monitoring in Cloud Monitoring.""" + metrics = [ + self._create_metric_descriptor("prediction_drift", "Prediction drift metric"), + self._create_metric_descriptor("data_drift", "Data drift metric"), + self._create_metric_descriptor("prediction_latency", "Latency of prediction requests", value_type=ga_metric.MetricDescriptor.ValueType.INT64), + self._create_metric_descriptor("accuracy", "Accuracy of the model"), + self._create_metric_descriptor("schema_drift", "Schema drift metric", value_type=ga_metric.MetricDescriptor.ValueType.INT64), + self._create_metric_descriptor("missing_schema", "Indicates missing baseline schema", value_type=ga_metric.MetricDescriptor.ValueType.INT64), + self._create_metric_descriptor("missing_statistics", "Indicates missing baseline statistics", value_type=ga_metric.MetricDescriptor.ValueType.INT64), + self._create_metric_descriptor("feature_store_read_count", "Number of read operations from the feature store", value_type=ga_metric.MetricDescriptor.ValueType.INT64), + self._create_metric_descriptor("feature_store_write_count", "Number of write operations to the feature store", value_type=ga_metric.MetricDescriptor.ValueType.INT64), + self._create_metric_descriptor("feature_store_latency", "Latency of feature store operations", value_type=ga_metric.MetricDescriptor.ValueType.DISTRIBUTION), + ] + + for metric in metrics: + descriptor = self.client.create_metric_descriptor( + name=self.project_name, + metric_descriptor=metric + ) + logger.info(f"Created {descriptor.name}") + + def _create_metric_descriptor(self, metric_name: str, description: str, value_type: int = ga_metric.MetricDescriptor.ValueType.DOUBLE) -> ga_metric.MetricDescriptor: + return ga_metric.MetricDescriptor( + type=f"custom.googleapis.com/vertex_ai/{self.model_name}/{metric_name}", + metric_kind=ga_metric.MetricDescriptor.MetricKind.GAUGE, + value_type=value_type, + description=description ) 
- print(f"Created {descriptor.name}") + def create_alert_policy(self, display_name: str, filter_str: str, threshold: float, duration_seconds: int, comparison: int) -> None: + """Creates an alert policy in Google Cloud Monitoring.""" + client = monitoring_v3.AlertPolicyServiceClient() + + condition = { + "display_name": display_name, + "condition_threshold": { + "filter": filter_str, + "comparison": comparison, + "threshold_value": threshold, + "duration": duration.Duration(seconds=duration_seconds) + } + } -def create_accuracy_degradation_alert(project_id: str, model_name: str, absolute_threshold: float, degradation_rate_threshold: float, time_window_seconds: int = 86400): - """ - Create an alert in Google Cloud Monitoring for model accuracy degradation. - The alert will trigger if: - 1. The accuracy falls below an absolute threshold. - 2. The accuracy degrades by a certain rate within a specified time window. - - :param project_id: GCP Project ID - :param model_name: Name of the model - :param absolute_threshold: The absolute accuracy threshold to trigger an alert (e.g., accuracy < 0.85). - :param degradation_rate_threshold: The degradation rate threshold over time (e.g., 0.05 for a 5% drop). - :param time_window_seconds: The time window to monitor for accuracy degradation (default is 24 hours). - """ - client = monitoring_v3.AlertPolicyServiceClient() - project_name = f"projects/{project_id}" - - # Condition 1: Absolute accuracy degradation - absolute_condition = { - "display_name": "Accuracy below absolute threshold", - "condition_threshold": { - "filter": f'metric.type="custom.googleapis.com/vertex_ai/{model_name}/accuracy"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_LT, - "threshold_value": absolute_threshold, - "duration": {"seconds": 300}, # Trigger if accuracy stays below threshold for 5 minutes - "aggregations": [{ - "alignment_period": {"seconds": 300}, - "per_series_aligner": monitoring_v3.Aggregation.Aligner.ALIGN_MEAN, - }] + alert_policy = { + "display_name": f"{self.model_name} {display_name}", + "conditions": [condition], + "notification_channels": [f"projects/{self.project_id}/notificationChannels/your-channel-id"], + "combiner": monitoring_v3.AlertPolicy.Combiner.OR, } - } - - # Condition 2: Degradation rate over time - degradation_condition = { - "display_name": "Accuracy degradation over time", - "condition_threshold": { - "filter": f'metric.type="custom.googleapis.com/vertex_ai/{model_name}/accuracy"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_LT, - "threshold_value": degradation_rate_threshold, # Set degradation rate threshold - "duration": {"seconds": time_window_seconds}, # Time window (e.g., 24 hours) - "aggregations": [{ - "alignment_period": {"seconds": time_window_seconds}, - "per_series_aligner": monitoring_v3.Aggregation.Aligner.ALIGN_DELTA, - }] + + policy = client.create_alert_policy( + name=self.project_name, + alert_policy=alert_policy + ) + logger.info(f"Created alert policy: {policy.name}") + + def create_accuracy_degradation_alert(self, absolute_threshold: float, degradation_rate_threshold: float, time_window_seconds: int = 86400) -> None: + """Creates an alert for accuracy degradation.""" + self.create_alert_policy( + "Accuracy below absolute threshold", + f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/accuracy"', + absolute_threshold, + 300, + monitoring_v3.ComparisonType.COMPARISON_LT + ) + self.create_alert_policy( + "Accuracy degradation over time", + 
f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/accuracy"', + degradation_rate_threshold, + time_window_seconds, + monitoring_v3.ComparisonType.COMPARISON_LT + ) + + def create_resource_utilization_alert(self) -> None: + """Creates alerts for resource utilization (CPU, memory, and GPU).""" + resources = [ + ("CPU", "compute.googleapis.com/instance/cpu/utilization"), + ("Memory", "compute.googleapis.com/instance/memory/utilization"), + ("GPU", "compute.googleapis.com/instance/gpu/utilization") + ] + + for resource_name, metric_type in resources: + self.create_alert_policy( + f"High {resource_name} utilization", + f'metric.type="{metric_type}"', + 0.8, # 80% utilization threshold + 300, # 5 minutes duration + monitoring_v3.ComparisonType.COMPARISON_GT + ) + + def log_metric(self, metric_name: str, value: Union[float, int]) -> None: + """Logs a metric to Google Cloud Monitoring.""" + series = monitoring_v3.TimeSeries() + series.metric.type = f"custom.googleapis.com/vertex_ai/{self.model_name}/{metric_name}" + series.resource.type = "aiplatform.googleapis.com/Endpoint" + series.resource.labels["model_name"] = self.model_name + point = series.points.add() + if isinstance(value, float): + point.value.double_value = value + else: + point.value.int64_value = value + now = datetime.datetime.now() + point.interval.end_time.seconds = int(now.timestamp()) + point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) + self.client.create_time_series(name=self.project_name, time_series=[series]) + logger.info(f"Logged {metric_name} for model {self.model_name}: {value}") + + def detect_data_drift(self, drift_threshold: float) -> Optional[float]: + """Detects data drift by comparing current (serving) statistics with baseline (training) statistics.""" + try: + log_step(logger, 'Detecting Data Drift', 'Data Drift Detection') + today = datetime.datetime.now().strftime("%Y%m%d") + + baseline_stats = load_statistics_from_gcs(self.bucket_name, self.model_name, 'train', today) + if not baseline_stats: + self.log_metric("missing_statistics", 1) + return None + + serving_stats = load_statistics_from_gcs(self.bucket_name, self.model_name, 'serving', today) + if not serving_stats: + self.log_metric("missing_statistics", 1) + return None + + schema = load_schema_from_gcs(self.bucket_name, self.model_name, 'current') + if not schema: + self.log_metric("missing_schema", 1) + return None + + anomalies = compare_statistics(baseline_stats, serving_stats, schema) + + drift_score = 0 + for feature, anomaly in anomalies.anomaly_info.items(): + if anomaly: + logger.warning(f"Data drift detected for feature {feature}: {anomaly.description}") + drift_score += anomaly.severity + self.log_metric("data_drift", anomaly.severity) + + if drift_score > drift_threshold: + logger.warning(f"Significant data drift detected. Drift score: {drift_score} > {drift_threshold}") + else: + logger.info(f"No significant data drift detected. 
Drift score: {drift_score} <= {drift_threshold}") + + # Log drift detection results to MLMD + self._log_drift_detection_to_mlmd(drift_score, drift_threshold) + + return drift_score + + except Exception as e: + log_error(logger, e, 'Data Drift Detection') + return None + + def _log_drift_detection_to_mlmd(self, drift_score: float, drift_threshold: float): + """Log drift detection results to ML Metadata.""" + execution = metadata_store_pb2.Execution() + execution.type = "DataDriftDetection" + execution.properties["model_name"].string_value = self.model_name + execution.properties["drift_score"].double_value = drift_score + execution.properties["drift_threshold"].double_value = drift_threshold + execution.properties["timestamp"].string_value = datetime.datetime.now().isoformat() + + execution_id = self.mlmd_store.put_executions([execution])[0] + logger.info(f"Logged drift detection results to MLMD with execution ID: {execution_id}") + + def detect_prediction_drift(self, drift_threshold: float) -> Optional[float]: + """Detects prediction drift using the Kolmogorov-Smirnov (KS) test.""" + try: + log_step(logger, 'Detecting Prediction Drift', 'Prediction Drift Detection') + today = datetime.datetime.now().strftime("%Y%m%d") + + train_stats = load_statistics_from_gcs(self.bucket_name, self.model_name, 'train', today) + if not train_stats: + self.log_metric("missing_statistics", 1) + return None + + serving_stats = load_statistics_from_gcs(self.bucket_name, self.model_name, 'serving', today) + if not serving_stats: + self.log_metric("missing_statistics", 1) + return None + + train_predictions = train_stats.datasets[0].features['similar_tracks'].num_stats.histograms[0].buckets + serving_predictions = serving_stats.datasets[0].features['similar_tracks'].num_stats.histograms[0].buckets + + train_counts = [bucket.sample_count for bucket in train_predictions] + serving_counts = [bucket.sample_count for bucket in serving_predictions] + + statistic, _ = ks_2samp(train_counts, serving_counts) + + self.log_metric("prediction_drift", statistic) + + if statistic > drift_threshold: + logger.warning(f"Prediction drift detected: KS statistic = {statistic}") + else: + logger.info(f"No significant prediction drift detected") + + # Log prediction drift results to MLMD + self._log_prediction_drift_to_mlmd(statistic, drift_threshold) + + return statistic + + except Exception as e: + log_error(logger, e, 'Prediction Drift Detection') + return None + + def _log_prediction_drift_to_mlmd(self, statistic: float, drift_threshold: float): + """Log prediction drift results to ML Metadata.""" + execution = metadata_store_pb2.Execution() + execution.type = "PredictionDriftDetection" + execution.properties["model_name"].string_value = self.model_name + execution.properties["ks_statistic"].double_value = statistic + execution.properties["drift_threshold"].double_value = drift_threshold + execution.properties["timestamp"].string_value = datetime.datetime.now().isoformat() + + execution_id = self.mlmd_store.put_executions([execution])[0] + logger.info(f"Logged prediction drift results to MLMD with execution ID: {execution_id}") + + def detect_schema_drift(self, schema_version: str) -> Optional[bool]: + """Detects schema drift by comparing the current schema with the baseline (training) schema.""" + try: + log_step(logger, 'Detecting Schema Drift', 'Schema Drift Detection') + + baseline_schema = load_schema_from_gcs(self.bucket_name, self.model_name, schema_version) + if not baseline_schema: + self.log_metric("missing_schema", 1) 
+ return None + + current_schema = load_schema_from_gcs(self.bucket_name, self.model_name, 'serving_schema_version') + + schema_drift_detected = compare_schemas(baseline_schema, current_schema) + + if schema_drift_detected: + logger.info(f"Schema drift detected for model {self.model_name}.") + self.log_metric("schema_drift", 1) + else: + logger.info(f"No schema drift detected for model {self.model_name}.") + self.log_metric("schema_drift", 0) + + # Log schema drift results to MLMD + self._log_schema_drift_to_mlmd(schema_drift_detected) + + return schema_drift_detected + + except Exception as e: + log_error(logger, e, 'Schema Drift Detection') + return None + + def _log_schema_drift_to_mlmd(self, schema_drift_detected: bool): + """Log schema drift results to ML Metadata.""" + execution = metadata_store_pb2.Execution() + execution.type = "SchemaDriftDetection" + execution.properties["model_name"].string_value = self.model_name + execution.properties["schema_drift_detected"].int_value = int(schema_drift_detected) + execution.properties["timestamp"].string_value = datetime.datetime.now().isoformat() + + execution_id = self.mlmd_store.put_executions([execution])[0] + logger.info(f"Logged schema drift results to MLMD with execution ID: {execution_id}") + + def monitor_traffic_split(self, endpoint_name: str) -> Optional[Dict[str, float]]: + """Monitor the traffic split in Vertex AI to detect rollback.""" + try: + log_step(logger, 'Monitoring traffic split', 'Rollback Monitoring') + + aiplatform.init(project=self.project_id) + + endpoints = aiplatform.Endpoint.list(filter=f'display_name="{endpoint_name}"') + if not endpoints: + log_error(logger, f"Endpoint {endpoint_name} not found.", "Rollback Monitoring") + return None + endpoint = endpoints[0] + + traffic_split = endpoint.traffic_split + for model_id, traffic_percentage in traffic_split.items(): + logger.info(f"Model {model_id} is receiving {traffic_percentage}% of the traffic.") + + if sum(traffic_split.values()) != 100: + logger.warning("Traffic split does not sum to 100%, indicating a possible rollback.") + + # Log traffic split to MLMD + self._log_traffic_split_to_mlmd(traffic_split) + + return traffic_split + except Exception as e: + log_error(logger, e, "Rollback Monitoring") + raise + + def _log_traffic_split_to_mlmd(self, traffic_split: Dict[str, float]): + """Log traffic split to ML Metadata.""" + execution = metadata_store_pb2.Execution() + execution.type = "TrafficSplitMonitoring" + execution.properties["model_name"].string_value = self.model_name + for model_id, percentage in traffic_split.items(): + execution.properties[f"traffic_{model_id}"].double_value = percentage + execution.properties["timestamp"].string_value = datetime.datetime.now().isoformat() + + execution_id = self.mlmd_store.put_executions([execution])[0] + logger.info(f"Logged traffic split to MLMD with execution ID: {execution_id}") + + def trigger_retraining_pipeline(self, pipeline_name: str, gcs_input: str) -> str: + """Trigger a Vertex AI pipeline for continuous retraining.""" + aiplatform.init(project=self.project_id) + + pipeline_params = { + 'input_data': gcs_input, + 'model_name': self.model_name } - } - - # Create the alert policy - alert_policy = { - "display_name": f"Accuracy Degradation Alert for {model_name}", - "conditions": [absolute_condition, degradation_condition], - "notification_channels": [f"projects/{project_id}/notificationChannels/your-channel-id"], # Replace with actual channel - "combiner": monitoring_v3.AlertPolicy.Combiner.OR, # Trigger if 
either condition is met - "enabled": True - } - - # Apply the policy to the project - policy = client.create_alert_policy( - name=project_name, - alert_policy=alert_policy - ) - logger.info(f"Created accuracy degradation alert policy: {policy.name}") - return policy + pipeline_job = aiplatform.PipelineJob( + display_name=f'Retraining - {self.model_name}', + template_path=f'gs://{pipeline_name}', + parameter_values=pipeline_params + ) + pipeline_job.run() + logger.info(f"Triggered retraining pipeline for {self.model_name}.") -def create_data_drift_alert(project_id, model_name): - client = monitoring_v3.AlertPolicyServiceClient() - project_name = f"projects/{project_id}" + # Log retraining trigger to MLMD + self._log_retraining_trigger_to_mlmd(pipeline_job.resource_name) - alert_policy = { - "display_name": f"{model_name} Data Drift Alert", - "conditions": [{ - "display_name": "Data drift exceeds threshold", - "condition_threshold": { - "filter": f'metric.type="custom.googleapis.com/vertex_ai/{model_name}/data_drift"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 0.1, - "duration": duration.Duration(seconds=300) - } - }], - "notification_channels": [f"projects/{project_id}/notificationChannels/your-channel-id"], - "combiner": monitoring_v3.AlertPolicy.Combiner.OR, - } - - policy = client.create_alert_policy( - name=project_name, - alert_policy=alert_policy - ) - print(f"Created alert policy: {policy.name}") + return pipeline_job.resource_name -def create_prediction_drift_alert(project_id, model_name): - client = monitoring_v3.AlertPolicyServiceClient() - project_name = f"projects/{project_id}" + def _log_retraining_trigger_to_mlmd(self, pipeline_job_id: str): + """Log retraining trigger to ML Metadata.""" + execution = metadata_store_pb2.Execution() + execution.type = "RetrainingTrigger" + execution.properties["model_name"].string_value = self.model_name + execution.properties["pipeline_job_id"].string_value = pipeline_job_id + execution.properties["timestamp"].string_value = datetime.datetime.now().isoformat() - alert_policy = { - "display_name": f"{model_name} Prediction Drift Alert", - "conditions": [{ - "display_name": "Prediction drift exceeds threshold", + execution_id = self.mlmd_store.put_executions([execution])[0] + logger.info(f"Logged retraining trigger to MLMD with execution ID: {execution_id}") + + def setup_retraining_job_alert(self, notification_channel: str) -> None: + """Set up a Cloud Monitoring alert for Vertex AI retraining jobs.""" + condition = { + "display_name": "Vertex AI Retraining Job Created", "condition_threshold": { - "filter": f'metric.type="custom.googleapis.com/vertex_ai/{model_name}/prediction_drift"', + "filter": 'resource.type="aiplatform.googleapis.com/PipelineJob" AND protoPayload.methodName="google.cloud.aiplatform.v1.PipelineService.CreatePipelineJob"', "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 0.1, - "duration": duration.Duration(seconds=300) + "threshold_value": 0, + "duration": {"seconds": 60}, } - }], - "notification_channels": [f"projects/{project_id}/notificationChannels/your-channel-id"], - "combiner": monitoring_v3.AlertPolicy.Combiner.OR, - } - - policy = client.create_alert_policy( - name=project_name, - alert_policy=alert_policy - ) - print(f"Created alert policy: {policy.name}") + } -def create_schema_drift_alert(project_id: str, model_name: str): - client = monitoring_v3.AlertPolicyServiceClient() - project_name = f"projects/{project_id}" + alert_policy = { + 
"display_name": "Retraining Job Alert", + "conditions": [condition], + "notification_channels": [notification_channel], + "enabled": True, + "combiner": monitoring_v3.AlertPolicy.Combiner.OR + } - # Create the alert policy definition - alert_policy = monitoring_v3.AlertPolicy( - display_name=f"{model_name} Schema Drift Alert", - conditions=[{ - "display_name": "Schema Drift Detected", - "condition_threshold": { - "filter": f'metric.type="custom.googleapis.com/vertex_ai/{model_name}/schema_drift"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 1, - "duration": duration.Duration(seconds=300) # Set duration for continuous drift - } - }], - notification_channels=[f"projects/{project_id}/notificationChannels/your-channel-id"], # Replace with actual channel ID - combiner=monitoring_v3.AlertPolicy.Combiner.OR, # How to combine multiple conditions - enabled=True - ) + client = monitoring_v3.AlertPolicyServiceClient() + policy = client.create_alert_policy( + name=self.project_name, + alert_policy=alert_policy + ) - # Apply the alert policy - policy = client.create_alert_policy( - name=project_name, - alert_policy=alert_policy - ) + logger.info(f"Created retraining job alert policy: {policy.name}") - print(f"Schema drift alert policy created: {policy.name}") - -def create_resource_utilization_alert(project_id, model_name): - client = monitoring_v3.AlertPolicyServiceClient() - project_name = f"projects/{project_id}" - - alert_policy = { - "display_name": f"{model_name} Resource Utilization Alert", - "conditions": [ - { - "display_name": "High CPU utilization", - "condition_threshold": { - "filter": 'metric.type="compute.googleapis.com/instance/cpu/utilization"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 0.8, - "duration": duration.Duration(seconds=300) - } - }, - { - "display_name": "High memory utilization", - "condition_threshold": { - "filter": 'metric.type="compute.googleapis.com/instance/memory/utilization"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 0.8, - "duration": duration.Duration(seconds=300) - } - }, - { - "display_name": "High GPU utilization", - "condition_threshold": { - "filter": 'metric.type="compute.googleapis.com/instance/gpu/utilization"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 0.8, - "duration": duration.Duration(seconds=300) - } - } - ], - "notification_channels": [f"projects/{project_id}/notificationChannels/your-channel-id"], - "combiner": monitoring_v3.AlertPolicy.Combiner.OR, - } - - policy = client.create_alert_policy( - name=project_name, - alert_policy=alert_policy - ) - print(f"Created alert policy: {policy.name}") + def monitor_and_trigger_retraining(self, accuracy_threshold: float, drift_threshold: float, gcs_input: str, pipeline_name: str, notification_channel: str) -> None: + """Monitor model accuracy, data drift, and prediction drift, and trigger retraining when necessary.""" + self.create_accuracy_degradation_alert(accuracy_threshold, 0.05) -def create_latency_alert(project_id, model_name): - client = monitoring_v3.AlertPolicyServiceClient() - project_name = f"projects/{project_id}" + data_drift_detected = self.detect_data_drift(drift_threshold) + prediction_drift_detected = self.detect_prediction_drift(drift_threshold) - alert_policy = { - "display_name": f"{model_name} Prediction Latency Alert", - "conditions": [{ - "display_name": "High prediction latency", - "condition_threshold": { - "filter": 
f'metric.type="custom.googleapis.com/vertex_ai/{model_name}/prediction_latency"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 1000, # 1000 ms - "duration": duration.Duration(seconds=60) - } - }], - "notification_channels": [f"projects/{project_id}/notificationChannels/your-channel-id"], - "combiner": monitoring_v3.AlertPolicy.Combiner.OR, - } - - policy = client.create_alert_policy( - name=project_name, - alert_policy=alert_policy - ) - print(f"Created alert policy: {policy.name}") + if data_drift_detected or prediction_drift_detected: + logger.warning(f"Drift detected for {self.model_name}. Triggering retraining pipeline.") + + pipeline_job_id = self.trigger_retraining_pipeline(pipeline_name, gcs_input) + + self.setup_retraining_job_alert(notification_channel) -def log_request_response(project_id, model_name, request, response, latency_ms, sampling_rate=0.1): - """ - Logs serving request/response data and latency to Cloud Storage with optional sampling. - - Args: - project_id (str): GCP project ID - model_name (str): Name of the Vertex AI model - request (dict): Request data - response (dict): Response data - latency_ms (float): Latency of the request in milliseconds - sampling_rate (float): Rate at which to sample logs (0.0 to 1.0, default 1.0) - """ + logger.info(f"Retraining job triggered: {pipeline_job_id}") + else: + logger.info(f"No drift detected for {self.model_name}. No retraining needed.") + + logger.info("Model performance and drift monitoring completed.") + +def log_request_response(project_id: str, model_name: str, request: Dict, response: Dict, latency_ms: float, sampling_rate: float = 0.1) -> None: + """Logs serving request/response data and latency to Cloud Storage with optional sampling.""" if sampling_rate >= 1 or random.random() < sampling_rate: client = storage.Client(project=project_id) bucket = client.get_bucket(f"{project_id}-vertex-ai-logs") @@ -318,31 +432,29 @@ def log_request_response(project_id, model_name, request, response, latency_ms, "timestamp": datetime.datetime.now().isoformat() } blob.upload_from_string(json.dumps(log_entry)) - print(f"Logged request/response for {model_name} (latency: {latency_ms}ms)") + logger.info(f"Logged request/response for {model_name} (latency: {latency_ms}ms)") -def check_existing_statistics_and_schema(project_id, model_name): - bq_client = bigquery.Client(project=project_id) - table_id = f"{project_id}.model_monitoring.{model_name}_serving_stats" +def check_existing_statistics_and_schema(project_id: str, model_name: str, bucket_name: str, schema_version: str) -> Tuple[Optional[tfdv.types.DatasetFeatureStatisticsList], Optional[tfdv.types.Schema]]: + today = datetime.datetime.now().strftime("%Y%m%d") try: - query = f"SELECT * FROM `{table_id}` ORDER BY timestamp DESC LIMIT 1" - existing_stats = bq_client.query(query).result() - existing_stats = list(existing_stats)[0] if existing_stats.total_rows > 0 else None + existing_stats = load_statistics_from_gcs(bucket_name, model_name, 'serving', today) except Exception as e: - print(f"Error checking existing statistics: {e}") + logger.error(f"Error loading existing statistics: {e}") existing_stats = None config = load_config() schema_path = config['data_validation']['schema_path'] try: - schema = tfdv.load_schema_text(schema_path) - except: + schema = load_schema_from_gcs(bucket_name, model_name, schema_version) + except Exception as e: + logger.error(f"Error loading schema: {e}") schema = None return existing_stats, schema -def 
compute_and_store_statistics(project_id, model_name, existing_stats, existing_schema): +def compute_and_store_statistics(project_id: str, model_name: str, bucket_name: str, existing_schema: Optional[tfdv.types.Schema]) -> Tuple[tfdv.types.DatasetFeatureStatisticsList, tfdv.types.Anomalies]: client = storage.Client(project=project_id) bucket = client.get_bucket(f"{project_id}-vertex-ai-logs") blobs = bucket.list_blobs(prefix=f"{model_name}/logs/") @@ -354,440 +466,87 @@ def compute_and_store_statistics(project_id, model_name, existing_stats, existin df = pd.DataFrame(data) - if existing_schema is None: - schema = generate_schema(df) - else: - schema = existing_schema - - stats, anomalies = validate_data(df, schema) - - bq_client = bigquery.Client(project=project_id) - table_id = f"{project_id}.model_monitoring.{model_name}_serving_stats" + stats = tfdv.generate_statistics_from_dataframe(df) + save_statistics_to_gcs(stats, bucket_name, model_name, 'serving') - row_to_insert = { - "timestamp": datetime.datetime.now().isoformat(), - "statistics": json.dumps(stats), - "anomalies": json.dumps(anomalies) - } - - errors = bq_client.insert_rows_json(table_id, [row_to_insert]) - if errors: - print(f"Encountered errors while inserting rows: {errors}") + if existing_schema: + anomalies = tfdv.validate_statistics(stats, schema=existing_schema) else: - print("New statistics added to BigQuery") + anomalies = None + logger.warning("No existing schema found. Skipping anomaly detection.") return stats, anomalies -def load_baseline_stats_and_schema(bucket_name, model_name, schema_version): - """Load baseline statistics and schema from Google Cloud Storage.""" - # Load baseline statistics from GCS - baseline_stats = load_statistics_from_gcs(bucket_name, model_name, data_type='train') - - # Load schema from GCS - schema = load_schema_from_gcs(bucket_name, model_name, schema_version) - - return baseline_stats, schema - - -def handle_missing_statistics(project_id, stat_type, model_name): - """ - Handles the case where statistics are missing. - Logs a warning and optionally triggers alerts for missing statistics. - """ - warning_msg = f"Missing {stat_type} statistics for model: {model_name}. Skipping drift detection." - logger.warning(warning_msg) - - # Optionally, log missing statistics as a custom metric in Google Cloud Monitoring - # This helps track the issue and potentially trigger alerts. 
- client = monitoring_v3.MetricServiceClient() - project_name = f"projects/{project_id}" +def log_feature_store_metric(self, feature_store_id: str, entity_type_id: str, metric_name: str, value: Union[int, float]): + """Logs a feature store metric to Google Cloud Monitoring.""" series = monitoring_v3.TimeSeries() - series.metric.type = f"custom.googleapis.com/vertex_ai/{model_name}/missing_statistics" - series.resource.type = "aiplatform.googleapis.com/Endpoint" - series.resource.labels["model_name"] = model_name + series.metric.type = f"custom.googleapis.com/vertex_ai/{self.model_name}/{metric_name}" + series.resource.type = "aiplatform.googleapis.com/FeatureStore" + series.resource.labels["feature_store_id"] = feature_store_id + series.resource.labels["entity_type_id"] = entity_type_id point = series.points.add() - point.value.double_value = 1 # Use '1' to indicate missing stats - now = datetime.datetime.now() - point.interval.end_time.seconds = int(now.timestamp()) - point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) - client.create_time_series(name=project_name, time_series=[series]) - - logger.info(f"Logged missing statistics for model {model_name}") - -def handle_missing_schema(project_id, model_name): - """ - Handles the case where the schema is missing. - Logs a warning and optionally triggers alerts for missing schema. - """ - warning_msg = f"Missing schema for model: {model_name}. Skipping schema drift detection." - logger.warning(warning_msg) - - # Optionally log missing schema as a custom metric in Google Cloud Monitoring - client = monitoring_v3.MetricServiceClient() - project_name = f"projects/{project_id}" - - series = monitoring_v3.TimeSeries() - series.metric.type = f"custom.googleapis.com/vertex_ai/{model_name}/missing_schema" - series.resource.type = "aiplatform.googleapis.com/Endpoint" - series.resource.labels["model_name"] = model_name - point = series.points.add() - point.value.double_value = 1 # '1' indicates missing schema + if isinstance(value, int): + point.value.int64_value = value + else: + point.value.double_value = value now = datetime.datetime.now() point.interval.end_time.seconds = int(now.timestamp()) point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) - client.create_time_series(name=project_name, time_series=[series]) - - logger.info(f"Logged missing schema for model {model_name} in Google Cloud Monitoring.") + self.client.create_time_series(name=self.project_name, time_series=[series]) + logger.info(f"Logged feature store metric {metric_name} with value {value}") - -def detect_data_drift(project_id, model_name, bucket_name, schema_version, drift_threshold): - """ - Detects data drift by comparing current (serving) statistics with baseline (training) statistics. - Logs the drift score directly to Google Cloud Monitoring. - Returns the drift score (or None if statistics are missing). 
- """ +def monitor_feature_store(self, feature_store_id: str, entity_type_id: str): + """Monitors the Feature Store and logs relevant metrics.""" try: - log_step(logger, 'Detecting Data Drift', 'Data Drift Detection') - - # Load baseline statistics and schema - log_step(logger, 'Loading Baseline Statistics and Schema', 'Data Drift Detection') - today = datetime.datetime.now().strftime("%Y%m%d") + log_step(logger, 'Monitoring Feature Store', 'Feature Store Monitoring') - # Load baseline statistics and check if they exist - baseline_stats = load_statistics_from_gcs(bucket_name, model_name, 'train', today) - if not baseline_stats: - handle_missing_statistics(project_id, 'baseline', model_name) - return None - - # Load serving statistics - serving_stats = load_statistics_from_gcs(bucket_name, model_name, 'serving', today) - if not serving_stats: - handle_missing_statistics(project_id, 'serving', model_name) - return None - - # Load schema from GCS - schema = load_schema_from_gcs(bucket_name, model_name, schema_version) - if not schema: - logger.warning(f"No schema found for {model_name}. Skipping data drift detection.") - return None - - # Compare statistics and check for anomalies - log_step(logger, 'Comparing Statistics', 'Data Drift Detection') - anomalies = compare_statistics(baseline_stats, serving_stats, schema) - - # Calculate and return the drift score - drift_score = 0 # Initialize drift score - significant_drift_detected = False - - for feature, anomaly in anomalies.anomaly_info.items(): - if anomaly: - logger.warning(f"Data drift detected for feature {feature}: {anomaly.description}") - drift_score += anomaly.severity - - # Check if drift score exceeds the threshold - if drift_score > drift_threshold: - significant_drift_detected = True - - # Log drift score for the specific feature to Vertex AI Monitoring - client = monitoring_v3.MetricServiceClient() - project_name = f"projects/{project_id}" - - series = monitoring_v3.TimeSeries() - series.metric.type = f"custom.googleapis.com/vertex_ai/{model_name}/data_drift" - series.resource.type = "aiplatform.googleapis.com/Endpoint" - series.resource.labels["model_name"] = model_name - point = series.points.add() - point.value.double_value = anomaly.severity - now = datetime.datetime.now() - point.interval.end_time.seconds = int(now.timestamp()) - point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) - client.create_time_series(name=project_name, time_series=[series]) - - logger.info(f"Logged data drift score for {feature}: {anomaly.severity}") - - # Log if significant drift is detected based on the threshold - if significant_drift_detected: - logger.warning(f"Significant data drift detected for {model_name}. Drift score: {drift_score} > {drift_threshold}") - else: - logger.info(f"No significant data drift detected for {model_name}. 
Drift score: {drift_score} <= {drift_threshold}") + feature_store = self.feature_store_client.get_feature_store(feature_store_id=feature_store_id) + entity_type = feature_store.get_entity_type(entity_type_id=entity_type_id) - return drift_score - - except Exception as e: - log_error(logger, e, 'Data Drift Detection') - return None # Return None if an error occurs + # Log read and write counts + read_count = entity_type.read_stats().get("total_entity_reads", 0) + write_count = entity_type.write_stats().get("total_entity_updates", 0) + self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_read_count", read_count) + self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_write_count", write_count) -def detect_prediction_drift(project_id, model_name, bucket_name, drift_threshold): - """ - Detects prediction drift using the Kolmogorov-Smirnov (KS) test and logs the drift score to Google Cloud Monitoring. - Returns the drift score (or None if statistics are missing). - """ - try: - log_step(logger, 'Detecting Prediction Drift', 'Prediction Drift Detection') - - # Load training and serving statistics - today = datetime.datetime.now().strftime("%Y%m%d") - - # Load baseline prediction statistics and check if they exist - train_stats = load_statistics_from_gcs(bucket_name, model_name, 'train', today) - if not train_stats: - handle_missing_statistics(project_id, 'training', model_name) - return None - - # Load serving statistics - serving_stats = load_statistics_from_gcs(bucket_name, model_name, 'serving', today) - if not serving_stats: - handle_missing_statistics(project_id, 'serving', model_name) - return None - - # Extract prediction buckets for KS test (assuming 'predictions' is the feature name) - train_predictions = train_stats.datasets[0].features['similar_tracks'].num_stats.histograms[0].buckets - serving_predictions = serving_stats.datasets[0].features['similar_tracks'].num_stats.histograms[0].buckets - - # Extract the counts from the buckets - train_counts = [bucket.sample_count for bucket in train_predictions] - serving_counts = [bucket.sample_count for bucket in serving_predictions] - - # Perform KS test - statistic, p_value = ks_2samp(train_counts, serving_counts) - - # Log prediction drift score to Vertex AI Monitoring - client = monitoring_v3.MetricServiceClient() - project_name = f"projects/{project_id}" - - series = monitoring_v3.TimeSeries() - series.metric.type = f"custom.googleapis.com/vertex_ai/{model_name}/prediction_drift" - series.resource.type = "aiplatform.googleapis.com/Endpoint" - series.resource.labels["model_name"] = model_name - point = series.points.add() - point.value.double_value = statistic # Log the KS statistic - now = datetime.datetime.now() - point.interval.end_time.seconds = int(now.timestamp()) - point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) - client.create_time_series(name=project_name, time_series=[series]) - - logger.info(f"Logged prediction drift KS statistic: {statistic}") - - # Determine if drift is significant and return the drift score - if statistic > drift_threshold: - logger.warning(f"Prediction drift detected for model {model_name}: KS statistic = {statistic}") - else: - logger.info(f"No significant prediction drift detected for model {model_name}") - - return statistic - - except Exception as e: - log_error(logger, e, 'Prediction Drift Detection') - return None # Return None if an error occurs - - -def monitor_traffic_split(project_id, endpoint_name): - 
"""Monitor the traffic split in Vertex AI to detect rollback.""" - try: - log_step(logger, 'Monitoring traffic split', 'Rollback Monitoring') - - aiplatform.init(project=project_id) - - # Retrieve the endpoint - endpoints = aiplatform.Endpoint.list(filter=f'display_name="{endpoint_name}"') - if not endpoints: - log_error(logger, f"Endpoint {endpoint_name} not found.", "Rollback Monitoring") - return None - endpoint = endpoints[0] - - # Get the traffic split - traffic_split = endpoint.traffic_split - for model_id, traffic_percentage in traffic_split.items(): - logger.info(f"Model {model_id} is receiving {traffic_percentage}% of the traffic.") - - # Check if rollback happened (i.e., if traffic is no longer sent to the new model) - if sum(traffic_split.values()) != 100: - logger.warning("Traffic split does not sum to 100%, indicating a possible rollback.") - - return traffic_split - except Exception as e: - log_error(logger, e, "Rollback Monitoring") - raise - -def detect_schema_drift(project_id, model_name, bucket_name, schema_version): - """ - Detects schema drift by comparing the current schema with the baseline (training) schema. - Logs the drift to Google Cloud Monitoring if detected. - Returns a boolean indicating whether schema drift was detected. - """ - try: - log_step(logger, 'Detecting Schema Drift', 'Schema Drift Detection') - - # Load baseline schema - log_step(logger, 'Loading Baseline Schema', 'Schema Drift Detection') - baseline_schema = load_schema_from_gcs(bucket_name, model_name, schema_version) - if not baseline_schema: - return handle_missing_schema(project_id, model_name) - - # Load current schema (Replace with your actual method of loading the current schema from serving data) - log_step(logger, 'Loading Current Schema', 'Schema Drift Detection') - current_schema = load_schema_from_gcs(bucket_name, model_name, 'serving_schema_version') # Replace with actual logic for serving schema - - # Compare schemas and check for schema drift - schema_drift_detected = compare_schemas(baseline_schema, current_schema) - - # Log schema drift to Google Cloud Monitoring if detected - if schema_drift_detected: - logger.info(f"Schema drift detected for model {model_name}.") - client = monitoring_v3.MetricServiceClient() - project_name = f"projects/{project_id}" - - series = monitoring_v3.TimeSeries() - series.metric.type = f"custom.googleapis.com/vertex_ai/{model_name}/schema_drift" - series.resource.type = "aiplatform.googleapis.com/Endpoint" - series.resource.labels["model_name"] = model_name - point = series.points.add() - point.value.double_value = 1 # Use '1' to indicate schema drift - now = datetime.datetime.now() - point.interval.end_time.seconds = int(now.timestamp()) - point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) - client.create_time_series(name=project_name, time_series=[series]) - - logger.info(f"Logged schema drift for model {model_name} to Google Cloud Monitoring.") - else: - logger.info(f"No schema drift detected for model {model_name}.") - - return schema_drift_detected + # Log latency (this is a placeholder, actual implementation may vary based on available metrics) + avg_latency = entity_type.read_stats().get("average_read_latency_milliseconds", 0) + self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_latency", avg_latency) + logger.info(f"Monitored feature store {feature_store_id}, entity type {entity_type_id}") except Exception as e: - log_error(logger, e, 'Schema Drift Detection') - return None # Return 
None if an error occurs - -def create_rollback_alert(project_id, endpoint_name): - """Create an alert in Google Cloud Monitoring for rollback events.""" - client = monitoring_v3.AlertPolicyServiceClient() - project_name = f"projects/{project_id}" - - alert_policy = { - "display_name": f"Rollback Alert for {endpoint_name}", - "conditions": [{ - "display_name": "Rollback detected", - "condition_threshold": { - "filter": f'metric.type="aiplatform.googleapis.com/Endpoint/traffic_split"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_LT, # Define condition for rollback - "threshold_value": 100, # Set rollback condition here - "duration": monitoring_v3.Duration(seconds=300) - } - }], - "notification_channels": [f"projects/{project_id}/notificationChannels/your-channel-id"], # Replace with your actual channel ID - "combiner": monitoring_v3.AlertPolicy.Combiner.OR, - } - - policy = client.create_alert_policy( - name=project_name, - alert_policy=alert_policy + log_error(logger, e, 'Feature Store Monitoring') + +def create_feature_store_alerts(self, feature_store_id: str, entity_type_id: str): + """Creates alerts for Feature Store monitoring.""" + self.create_alert_policy( + "High Feature Store Read Count", + f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_read_count" AND resource.labels.feature_store_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"', + 1000, # Threshold: 1000 reads + 300, # Duration: 5 minutes + monitoring_v3.ComparisonType.COMPARISON_GT ) - logger.info(f"Created rollback alert policy: {policy.name}") - -def monitor_and_log_rollbacks(project_id, endpoint_name): - logger.info("Starting rollback monitoring...") - traffic_split = monitor_traffic_split(project_id, endpoint_name) - if traffic_split: - create_rollback_alert(project_id, endpoint_name) - -def trigger_retraining_pipeline(project_id: str, pipeline_name: str, gcs_input: str, model_name: str): - """ - Trigger a Vertex AI pipeline for continuous retraining when performance degradation or drift is detected. - """ - aiplatform.init(project=project_id) - - pipeline_params = { - 'input_data': gcs_input, - 'model_name': model_name - } - - # Run the retraining pipeline - pipeline_job = aiplatform.PipelineJob( - display_name=f'Retraining - {model_name}', - template_path=f'gs://{pipeline_name}', - parameter_values=pipeline_params + self.create_alert_policy( + "High Feature Store Write Count", + f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_write_count" AND resource.labels.feature_store_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"', + 500, # Threshold: 500 writes + 300, # Duration: 5 minutes + monitoring_v3.ComparisonType.COMPARISON_GT ) - - pipeline_job.run() - logger.info(f"Triggered retraining pipeline for {model_name}.") - - # Return the pipeline job ID for tracking - return pipeline_job.resource_name - -def setup_retraining_job_alert(project_id: str, notification_channel: str): - """ - Set up a Cloud Monitoring alert for Vertex AI retraining jobs. - This sends notifications whenever a new retraining job is created. 
- """ - client = monitoring_v3.AlertPolicyServiceClient() - project_name = f"projects/{project_id}" - - # Define the condition for Vertex AI Pipeline Job creation - condition = { - "display_name": "Vertex AI Retraining Job Created", - "condition_threshold": { - "filter": 'resource.type="aiplatform.googleapis.com/PipelineJob" AND protoPayload.methodName="google.cloud.aiplatform.v1.PipelineService.CreatePipelineJob"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 0, - "duration": {"seconds": 60}, # Check every 60 seconds - } - } - - # Create the alert policy - alert_policy = { - "display_name": "Retraining Job Alert", - "conditions": [condition], - "notification_channels": [notification_channel], - "enabled": True, - "combiner": monitoring_v3.AlertPolicy.Combiner.OR - } - - # Apply the policy - policy = client.create_alert_policy( - name=project_name, - alert_policy=alert_policy + self.create_alert_policy( + "High Feature Store Latency", + f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_latency" AND resource.labels.feature_store_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"', + 1000, # Threshold: 1000 ms + 300, # Duration: 5 minutes + monitoring_v3.ComparisonType.COMPARISON_GT ) - logger.info(f"Created retraining job alert policy: {policy.name}") - -def monitor_and_trigger_retraining(project_id, model_name, accuracy_threshold, drift_threshold, gcs_input, pipeline_name, notification_channel): - """ - Monitor model accuracy, data drift, and prediction drift, and trigger retraining when necessary. - This will also set up alerts for retraining job creation. - """ - # Check for accuracy degradation - create_accuracy_degradation_alert(project_id, model_name, absolute_threshold=accuracy_threshold, degradation_rate_threshold=0.05) - - # Check for data drift and prediction drift - data_drift_detected = detect_data_drift(project_id, model_name) - prediction_drift_detected = detect_prediction_drift(project_id, model_name) - - if data_drift_detected or prediction_drift_detected: - logger.warning(f"Drift detected for {model_name}. Triggering retraining pipeline.") - - # Trigger the retraining pipeline - pipeline_job_id = trigger_retraining_pipeline(project_id, pipeline_name, gcs_input, model_name) - - # Set up retraining job alert to notify when the retraining job is created - setup_retraining_job_alert(project_id, notification_channel) - - logger.info(f"Retraining job triggered: {pipeline_job_id}") - else: - logger.info(f"No drift detected for {model_name}. 
No retraining needed.") - - logger.info("Model performance and drift monitoring completed.") - - -if __name__ == '__main__': - import argparse - - # Parse arguments for monitoring setup +def main(): parser = argparse.ArgumentParser(description='Setup Vertex AI monitoring, drift detection, and rollback with retraining') parser.add_argument('--project_id', required=True, help='GCP Project ID') parser.add_argument('--model_name', required=True, help='Vertex AI model name') parser.add_argument('--endpoint_name', required=True, help='Vertex AI endpoint name') - parser.add_argument('--sampling_rate', type=float, default=1.0, help='Sampling rate for request/response logging') parser.add_argument('--absolute_threshold', type=float, default=0.85, help='Absolute accuracy threshold (e.g., 0.85)') parser.add_argument('--degradation_rate_threshold', type=float, default=0.05, help='Accuracy degradation rate threshold over time') parser.add_argument('--time_window', type=int, default=86400, help='Time window in seconds to monitor for degradation (default is 24 hours)') @@ -797,43 +556,34 @@ def monitor_and_trigger_retraining(project_id, model_name, accuracy_threshold, d parser.add_argument('--notification_channel', required=True, help='Notification channel ID (for alerts)') parser.add_argument('--bucket_name', required=True, help='Cloud Storage bucket name') parser.add_argument('--schema_version', required=True, help='Schema version for validation') + parser.add_argument('--sampling_rate', type=float, default=0.1, help='Sampling rate for request/response logging') + parser.add_argument('--feature_store_id', required=True, help='Vertex AI Feature Store ID') + parser.add_argument('--entity_type_id', required=True, help='Entity Type ID in the Feature Store') args = parser.parse_args() - # Run Vertex AI monitoring functions - setup_vertex_ai_monitoring(args.project_id, args.model_name) - create_data_drift_alert(args.project_id, args.model_name) - create_prediction_drift_alert(args.project_id, args.model_name) - create_resource_utilization_alert(args.project_id, args.model_name) - create_latency_alert(args.project_id, args.model_name) - create_schema_drift_alert(args.project_id, args.model_name) - create_accuracy_degradation_alert(args.project_id, args.model_name, args.absolute_threshold, args.degradation_rate_threshold, args.time_window) - - # Load baseline statistics and schema from GCS - existing_stats, existing_schema = load_baseline_stats_and_schema(args.bucket_name, args.model_name, args.schema_version) - - # Compute and store current statistics - current_stats, anomalies = compute_and_store_statistics(args.project_id, args.model_name, existing_stats, existing_schema) - - # Schema Drift Detection - schema_drift_detected = detect_schema_drift(args.project_id, args.model_name, args.bucket_name, args.schema_version) - if schema_drift_detected: - print(f"Schema drift detected for model {args.model_name}. Logged to Google Cloud Monitoring.") - - # Detect data drift and prediction drift if baseline statistics exist - if existing_stats: - detect_data_drift(args.project_id, args.model_name, current_stats, existing_stats, args.drift_threshold) - detect_prediction_drift(args.project_id, args.model_name, current_stats, existing_stats, args.drift_threshold) - else: - print("No existing statistics found. 
Current statistics will be used as the baseline for future comparisons.") + monitor = VertexAIMonitoring(args.project_id, args.model_name, args.bucket_name) - # Run rollback monitoring after other checks - print("Starting rollback monitoring...") - monitor_and_log_rollbacks(args.project_id, args.endpoint_name) + monitor.setup_custom_metrics() + monitor.create_alert_policy("Data Drift Alert", f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/data_drift"', 0.1, 300, monitoring_v3.ComparisonType.COMPARISON_GT) + monitor.create_alert_policy("Prediction Drift Alert", f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/prediction_drift"', 0.1, 300, monitoring_v3.ComparisonType.COMPARISON_GT) + monitor.create_resource_utilization_alert() + monitor.create_alert_policy("Prediction Latency Alert", f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/prediction_latency"', 1000, 60, monitoring_v3.ComparisonType.COMPARISON_GT) + monitor.create_alert_policy("Schema Drift Alert", f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/schema_drift"', 1, 300, monitoring_v3.ComparisonType.COMPARISON_GT) + monitor.create_accuracy_degradation_alert(args.absolute_threshold, args.degradation_rate_threshold, args.time_window) + monitor.setup_feature_store_monitoring(args.feature_store_id, args.entity_type_id) + monitor.create_feature_store_alerts(args.feature_store_id, args.entity_type_id) + monitor.monitor_feature_store(args.feature_store_id, args.entity_type_id) - # Monitor model and trigger retraining if needed - monitor_and_trigger_retraining( - project_id=args.project_id, - model_name=args.model_name, + existing_stats, existing_schema = check_existing_statistics_and_schema(args.project_id, args.model_name, args.bucket_name, args.schema_version) + current_stats, anomalies = compute_and_store_statistics(args.project_id, args.model_name, args.bucket_name, existing_schema) + + monitor.detect_schema_drift(args.schema_version) + monitor.detect_data_drift(args.drift_threshold) + monitor.detect_prediction_drift(args.drift_threshold) + + monitor.monitor_traffic_split(args.endpoint_name) + + monitor.monitor_and_trigger_retraining( accuracy_threshold=args.absolute_threshold, drift_threshold=args.drift_threshold, gcs_input=args.gcs_input, @@ -841,4 +591,7 @@ def monitor_and_trigger_retraining(project_id, model_name, accuracy_threshold, d notification_channel=args.notification_channel ) - print("Vertex AI monitoring, drift detection, rollback, and retraining setup completed successfully!") + logger.info("Vertex AI monitoring, drift detection, rollback, and retraining setup completed successfully!") + +if __name__ == '__main__': + main() diff --git a/kubeflow/components/data_ingestion/data_ingestion.py b/kubeflow/components/data_ingestion/data_ingestion.py index 4913d03..1abcff0 100644 --- a/kubeflow/components/data_ingestion/data_ingestion.py +++ b/kubeflow/components/data_ingestion/data_ingestion.py @@ -6,35 +6,54 @@ from typing import NamedTuple from src.data_processing.data_ingestion import configure_lastfm_api, fetch_lastfm_data from src.utils.logging_utils import get_logger +import os logger = get_logger('kubeflow_data_ingestion') # Define the OutputSpec NamedTuple -OutputSpec = NamedTuple('OutputSpec', [('num_tracks', int)]) +OutputSpec = NamedTuple('OutputSpec', [('num_tracks', int), ('data_version', str)]) @component( - packages_to_install=['pylast', 'python-dotenv', 'pandas', 'requests'], - base_image='python:3.9' + packages_to_install=['pylast', 
'python-dotenv', 'pandas', 'requests', 'google-cloud-storage'], + base_image='python:3.10' ) def data_ingestion( + project_id: str, output_path: Output[Dataset], limit: int = 5000, ) -> OutputSpec: - import os import pandas as pd + from google.cloud import storage + from datetime import datetime try: + # Configure GCS client + storage_client = storage.Client(project=project_id) + + # Generate a unique data version + data_version = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Configure Last.fm API api_key, _ = configure_lastfm_api() + + # Fetch data from Last.fm df = fetch_lastfm_data(api_key, limit=limit) if not df.empty: logger.info(f"Successfully fetched {len(df)} tracks from Last.fm") - df.to_csv(output_path.path, index=False) - logger.info(f"Data saved to {output_path.path}") - return (len(df),) + + # Save data to GCS + bucket_name, blob_name = output_path.path.replace("gs://", "").split("/", 1) + bucket = storage_client.bucket(bucket_name) + blob = bucket.blob(f"{blob_name}_{data_version}.csv") + + blob.upload_from_string(df.to_csv(index=False), content_type='text/csv') + logger.info(f"Data saved to {output_path.path}_{data_version}.csv") + + return OutputSpec(num_tracks=len(df), data_version=data_version) else: logger.error("Failed to fetch data, DataFrame is empty") - return (0,) + return OutputSpec(num_tracks=0, data_version=data_version) except Exception as e: logger.error(f"Error in data ingestion: {e}") raise @@ -43,9 +62,10 @@ def data_ingestion( import argparse parser = argparse.ArgumentParser(description='Data ingestion component for Kubeflow') - parser.add_argument('--output_path', type=str, help='Path to save the output dataset') + parser.add_argument('--project_id', type=str, required=True, help='GCP Project ID') + parser.add_argument('--output_path', type=str, required=True, help='Path to save the output dataset') parser.add_argument('--limit', type=int, default=5000, help='Number of tracks to fetch') args = parser.parse_args() - data_ingestion(output_path=args.output_path, limit=args.limit) \ No newline at end of file + data_ingestion(project_id=args.project_id, output_path=args.output_path, limit=args.limit) \ No newline at end of file diff --git a/kubeflow/components/deploy/deploy.py b/kubeflow/components/deploy/deploy.py index b75601f..d41c137 100644 --- a/kubeflow/components/deploy/deploy.py +++ b/kubeflow/components/deploy/deploy.py @@ -3,40 +3,51 @@ Input, Output, Model, + Artifact, ) from typing import NamedTuple -from deployment.vertex_ai.vertex_deployment import deploy_to_vertex_ai, setup_cloud_build_trigger, setup_cloud_run -from src.utils.logging_utils import setup_logger, log_error +from deployment.vertex_ai.vertex_deployment import ( + deploy_to_vertex_ai, + setup_cloud_build_trigger, + setup_cloud_run, + trigger_cloud_build +) +from src.utils.logging_utils import setup_logger, log_error, log_step logger = setup_logger('kubeflow_deploy_component') # Define the OutputSpec NamedTuple -OutputSpec = NamedTuple('OutputSpec', [('endpoint', str), ('model', str)]) +OutputSpec = NamedTuple('OutputSpec', [('endpoint', str), ('model', str), ('cloud_run_service', str)]) @component( packages_to_install=['google-cloud-aiplatform', 'google-cloud-build', 'google-cloud-run'], - base_image='python:3.9' + base_image='python:3.10' ) def deploy_model( project_id: str, model_path: Input[Model], model_name: str, endpoint_name: str, - output_val: Output[str], - output_test: Output[str], - repo_name: str = "", - branch_name: str = "", - service_name: str = "", - image_url: str = 
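
For reference, the versioned GCS write used by the ingestion component boils down to the following standalone pattern (bucket and object names are placeholders):

from datetime import datetime
import pandas as pd
from google.cloud import storage

def upload_versioned_csv(df: pd.DataFrame, gcs_uri: str, project_id: str) -> str:
    """Write df to <gcs_uri>_<timestamp>.csv and return the data version string."""
    data_version = datetime.now().strftime("%Y%m%d_%H%M%S")
    bucket_name, blob_name = gcs_uri.replace("gs://", "").split("/", 1)
    client = storage.Client(project=project_id)
    blob = client.bucket(bucket_name).blob(f"{blob_name}_{data_version}.csv")
    blob.upload_from_string(df.to_csv(index=False), content_type="text/csv")
    return data_version

# e.g. upload_versioned_csv(df, "gs://my-bucket/data/raw/top_tracks", "my-project")
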
"", - region: str = "us-central1", - setup_ci_cd: bool = False, - canary_traffic_percent: int = 10 + repo_name: str, + branch_name: str, + service_name: str, + image_url: str, + region: str, + storage_bucket: str, + trigger_id: str, + notification_channel: str, + deployment_info: Output[Artifact], + canary_traffic_percent: int = 10, + cooldown_period: int = 300 ) -> OutputSpec: """ - Kubeflow component to deploy a model to Vertex AI with a canary strategy and rollback mechanism. + Kubeflow component to deploy a model to Vertex AI, set up CI/CD with Cloud Build and Cloud Run. """ + import json + import os + try: - # Deploy to Vertex AI with canary traffic handling + log_step(logger, "Deploying model to Vertex AI", "Model Deployment") endpoint, model = deploy_to_vertex_ai( project_id=project_id, model_path=model_path.uri, @@ -44,29 +55,46 @@ def deploy_model( model_name=model_name, canary_traffic_percent=canary_traffic_percent ) - - # Write outputs to files - with open(output_val.path, 'w') as f: - f.write(endpoint) - with open(output_test.path, 'w') as f: - f.write(model) - - # Setup CI/CD if requested - if setup_ci_cd: - if not all([repo_name, branch_name, service_name, image_url]): - raise ValueError("For CI/CD setup, repo_name, branch_name, service_name, and image_url must be provided.") - - # Setup Cloud Build trigger - trigger = setup_cloud_build_trigger(project_id, repo_name, branch_name) - - # Setup Cloud Run service - service = setup_cloud_run(project_id, service_name, image_url, region) - - logger.info("Deployment completed successfully!") - return OutputSpec(endpoint=endpoint, model=model) + + log_step(logger, "Setting up Cloud Build trigger", "CI/CD Setup") + trigger_response = setup_cloud_build_trigger( + project_id=project_id, + repo_name=repo_name, + branch_name=branch_name, + storage_bucket=storage_bucket + ) + + log_step(logger, "Setting up Cloud Run service", "CI/CD Setup") + service_response = setup_cloud_run( + project_id=project_id, + service_name=service_name, + image_url=image_url, + region=region + ) + + log_step(logger, "Setting up Cloud Function for cooldown", "CI/CD Setup") + os.system(f"gcloud functions deploy cloud_build_trigger --runtime python39 " + f"--trigger-topic cloud-build-trigger " + f"--set-env-vars PROJECT_ID={project_id},TRIGGER_ID={trigger_id}," + f"MODEL_NAME={model_name},ENDPOINT_NAME={endpoint_name} " + f"--memory=128MB --timeout=300s") + + # Write deployment info to output + deployment_info_dict = { + "endpoint_name": endpoint, + "model_name": model, + "cloud_build_trigger": trigger_response.name, + "cloud_run_service": service_response.name, + "canary_traffic_percent": canary_traffic_percent + } + with open(deployment_info.path, 'w') as f: + json.dump(deployment_info_dict, f) + + logger.info("Deployment and CI/CD setup completed successfully!") + return OutputSpec(endpoint=endpoint, model=model, cloud_run_service=service_response.name) except Exception as e: - log_error(logger, e, 'Model Deployment') + log_error(logger, e, 'Model Deployment and CI/CD Setup') raise if __name__ == '__main__': @@ -77,13 +105,17 @@ def deploy_model( parser.add_argument('--model_path', required=True, help='Path to the model artifacts') parser.add_argument('--model_name', required=True, help='Name for the deployed model') parser.add_argument('--endpoint_name', required=True, help='Name for the Vertex AI endpoint') - parser.add_argument('--setup_ci_cd', action='store_true', help='Set up CI/CD pipeline') - parser.add_argument('--repo_name', help='GitHub repository 
name') - parser.add_argument('--branch_name', help='GitHub branch name') - parser.add_argument('--service_name', help='Cloud Run service name') - parser.add_argument('--image_url', help='Docker image URL for Cloud Run') - parser.add_argument('--region', default='us-central1', help='GCP region for deployment') + parser.add_argument('--repo_name', required=True, help='GitHub repository name') + parser.add_argument('--branch_name', required=True, help='GitHub branch name') + parser.add_argument('--service_name', required=True, help='Cloud Run service name') + parser.add_argument('--image_url', required=True, help='Docker image URL for Cloud Run') + parser.add_argument('--region', required=True, help='GCP region for deployment') + parser.add_argument('--storage_bucket', required=True, help='Cloud Storage bucket to monitor for new data') + parser.add_argument('--trigger_id', required=True, help='Cloud Build trigger ID for retraining jobs') + parser.add_argument('--notification_channel', required=True, help='Notification channel ID for build status notifications') parser.add_argument('--canary_traffic_percent', type=int, default=10, help='Percentage of traffic to route to the new model') + parser.add_argument('--cooldown_period', type=int, default=300, help='Cooldown period in seconds between Cloud Build jobs') + parser.add_argument('--deployment_info', required=True, help='Path to save deployment info') args = parser.parse_args() @@ -97,6 +129,10 @@ def deploy_model( service_name=args.service_name, image_url=args.image_url, region=args.region, - setup_ci_cd=args.setup_ci_cd, - canary_traffic_percent=args.canary_traffic_percent + storage_bucket=args.storage_bucket, + trigger_id=args.trigger_id, + notification_channel=args.notification_channel, + canary_traffic_percent=args.canary_traffic_percent, + cooldown_period=args.cooldown_period, + deployment_info=args.deployment_info ) diff --git a/kubeflow/components/evaluation/evaluation.py b/kubeflow/components/evaluation/evaluation.py index f5d1fe2..7de98f0 100644 --- a/kubeflow/components/evaluation/evaluation.py +++ b/kubeflow/components/evaluation/evaluation.py @@ -6,36 +6,83 @@ Model, Metrics, ) +from typing import NamedTuple from src.evaluation.model_evaluation import main as evaluate_main -from src.utils.logging_utils import get_logger +from src.utils.logging_utils import setup_logger, log_error, log_step +from google.cloud import aiplatform -logger = get_logger('kubeflow_evaluation') +logger = setup_logger('kubeflow_evaluation') + +EvaluationOutput = NamedTuple('EvaluationOutput', [ + ('mean_average_precision', float), + ('ndcg_score', float), + ('model_drift', float), + ('deploy_decision', str) +]) @component( - packages_to_install=['tensorflow', 'numpy', 'pandas', 'scikit-learn', 'matplotlib', 'seaborn'], - base_image='python:3.9' + packages_to_install=['tensorflow', 'numpy', 'pandas', 'scikit-learn', 'matplotlib', 'seaborn', 'google-cloud-aiplatform'], + base_image='python:3.10' ) def evaluate_model( + project_id: str, model: Input[Model], test_data: Input[Dataset], item_popularity: Input[Dataset], + endpoint_name: str, evaluation_results: Output[Metrics], - evaluation_plots: Output[Dataset] -) -> float: + evaluation_plots: Output[Dataset], + region: str = 'us-central1', + map_threshold: float = 0.7, + ndcg_threshold: float = 0.5, + drift_threshold: float = 0.1 +) -> EvaluationOutput: import json import os + import numpy as np + from sklearn.metrics import mean_absolute_error try: + log_step(logger, "Initializing evaluation", "Model 
Evaluation") + # Create a temporary output directory output_dir = "/tmp/evaluation_output" os.makedirs(output_dir, exist_ok=True) - # Run evaluation - results = evaluate_main(model.path, test_data.path, output_dir) + # Run evaluation on the new model + log_step(logger, "Evaluating new model", "Model Evaluation") + new_model_results = evaluate_main(model.path, test_data.path, output_dir) + + # Evaluate the currently deployed model (if exists) + log_step(logger, "Evaluating currently deployed model", "Model Evaluation") + aiplatform.init(project=project_id, location=region) + endpoint = aiplatform.Endpoint(endpoint_name) + + if endpoint.list_models(): + current_model = endpoint.list_models()[0] + current_model_results = evaluate_main(current_model.uri, test_data.path, output_dir) + else: + current_model_results = None + + # Calculate model drift + if current_model_results: + log_step(logger, "Calculating model drift", "Model Evaluation") + new_predictions = new_model_results['predictions'] + current_predictions = current_model_results['predictions'] + model_drift = mean_absolute_error(new_predictions, current_predictions) + else: + model_drift = 0.0 + + # Prepare evaluation results + evaluation_dict = { + 'new_model': new_model_results, + 'current_model': current_model_results, + 'model_drift': model_drift + } # Save evaluation results with open(evaluation_results.path, 'w') as f: - json.dump(results, f, indent=2) + json.dump(evaluation_dict, f, indent=2) # Copy evaluation plots os.system(f"cp {output_dir}/*.png {evaluation_plots.path}") @@ -43,29 +90,56 @@ def evaluate_model( logger.info(f"Evaluation results saved to {evaluation_results.path}") logger.info(f"Evaluation plots saved to {evaluation_plots.path}") - # Return the main model's MAP score for pipeline orchestration - return results['main_evaluation']['mean_average_precision'] + # Make deployment decision + new_map = new_model_results['main_evaluation']['mean_average_precision'] + new_ndcg = new_model_results['main_evaluation']['ndcg_score'] + + if (new_map >= map_threshold and + new_ndcg >= ndcg_threshold and + model_drift <= drift_threshold): + deploy_decision = "deploy" + else: + deploy_decision = "do_not_deploy" + + return EvaluationOutput( + mean_average_precision=new_map, + ndcg_score=new_ndcg, + model_drift=model_drift, + deploy_decision=deploy_decision + ) except Exception as e: - logger.error(f"Error in model evaluation: {e}") + log_error(logger, e, 'Model Evaluation') raise if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description='Evaluate model component for Kubeflow') + parser.add_argument('--project_id', required=True, help='GCP Project ID') parser.add_argument('--model', type=str, help='Path to the trained model') parser.add_argument('--test_data', type=str, help='Path to test dataset') parser.add_argument('--item_popularity', type=str, help='Path to item popularity data') + parser.add_argument('--endpoint_name', type=str, help='Name of the Vertex AI endpoint') parser.add_argument('--evaluation_results', type=str, help='Path to save evaluation results') parser.add_argument('--evaluation_plots', type=str, help='Path to save evaluation plots') + parser.add_argument('--region', type=str, default='us-central1', help='GCP region') + parser.add_argument('--map_threshold', type=float, default=0.7, help='Threshold for Mean Average Precision') + parser.add_argument('--ndcg_threshold', type=float, default=0.5, help='Threshold for NDCG score') + parser.add_argument('--drift_threshold', type=float, 
default=0.1, help='Threshold for model drift') args = parser.parse_args() evaluate_model( + project_id=args.project_id, model=args.model, test_data=args.test_data, item_popularity=args.item_popularity, + endpoint_name=args.endpoint_name, evaluation_results=args.evaluation_results, - evaluation_plots=args.evaluation_plots + evaluation_plots=args.evaluation_plots, + region=args.region, + map_threshold=args.map_threshold, + ndcg_threshold=args.ndcg_threshold, + drift_threshold=args.drift_threshold ) \ No newline at end of file diff --git a/kubeflow/components/feature_engineering/feature_eng.py b/kubeflow/components/feature_engineering/feature_eng.py index 9ebbd61..5629a85 100644 --- a/kubeflow/components/feature_engineering/feature_eng.py +++ b/kubeflow/components/feature_engineering/feature_eng.py @@ -3,7 +3,8 @@ Input, Output, Dataset, - Model + Model, + Artifact ) from typing import NamedTuple from src.feature_engineering.feat_engineering import ( @@ -19,32 +20,43 @@ create_preprocessing_pipeline, analyze_feature_importance_and_reduce_dimensions ) -from src.utils.logging_utils import get_logger +from src.utils.logging_utils import setup_logger, log_error, log_step -logger = get_logger('kubeflow_feature_engineering') +logger = setup_logger('kubeflow_feature_engineering') # Define the OutputSpec NamedTuple -OutputSpec = NamedTuple('Outputs', [('num_features', int),('explained_variance_ratio', float)]) +OutputSpec = NamedTuple('Outputs', [ + ('num_features', int), + ('explained_variance_ratio', float), + ('top_features', str) +]) @component( - packages_to_install=['pandas', 'numpy', 'scikit-learn', 'matplotlib', 'seaborn'], - base_image='python:3.9' + packages_to_install=['pandas', 'numpy', 'scikit-learn', 'matplotlib', 'seaborn', 'plotly'], + base_image='python:3.10' ) def feature_engineering( input_data: Input[Dataset], output_data: Output[Dataset], output_preprocessor: Output[Model], - n_components: int = 4000 + feature_importance_plot: Output[Artifact], + n_components: int = 4000, + dim_reduction_method: str = 'pca', + feature_selection_threshold: float = 0.01 ) -> OutputSpec: import pandas as pd import numpy as np import joblib + import matplotlib.pyplot as plt + import seaborn as sns + from sklearn.feature_selection import SelectFromModel + from sklearn.ensemble import RandomForestRegressor try: - # Load data + log_step(logger, "Loading input data", "Feature Engineering") df = pd.read_csv(input_data.path) - # Apply feature engineering steps + log_step(logger, "Applying feature engineering steps", "Feature Engineering") df = engineer_basic_features(df) df = engineer_additional_features(df) df = add_tag_popularity(df) @@ -53,36 +65,59 @@ def feature_engineering( df = add_target_encoding(df) df = refine_features_further(df) - # Vectorize text features + log_step(logger, "Vectorizing text features", "Feature Engineering") df_vectorized, vectorizers = vectorize_all_text_features(df) - # Get final features + log_step(logger, "Getting final features", "Feature Engineering") final_features = get_final_features(df_vectorized) - # Create preprocessing pipeline - preprocessor = create_preprocessing_pipeline(final_features, n_components) + log_step(logger, "Creating preprocessing pipeline", "Feature Engineering") + preprocessor = create_preprocessing_pipeline(final_features, n_components, dim_reduction_method) - # Fit preprocessor and transform data + log_step(logger, "Fitting preprocessor and transforming data", "Feature Engineering") preprocessed_data = preprocessor.fit_transform(final_features) 
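
create_preprocessing_pipeline now takes a dim_reduction_method ('pca' or 'truncated_svd' per the CLI help); a minimal sketch of such a selectable reducer, assuming numeric input and scikit-learn (the real implementation lives in src/feature_engineering/feat_engineering.py):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD

def build_preprocessing_pipeline(n_components: int, method: str = "pca") -> Pipeline:
    # Choose the dimensionality-reduction step based on the requested method.
    if method == "pca":
        reducer = PCA(n_components=n_components)
    elif method == "truncated_svd":
        reducer = TruncatedSVD(n_components=n_components)
    else:
        raise ValueError(f"Unknown dim_reduction_method: {method}")
    return Pipeline([("scaler", StandardScaler()), ("reducer", reducer)])
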
- # Analyze feature importance and reduce dimensions - df_svd, svd, feature_importance_df = analyze_feature_importance_and_reduce_dimensions( + log_step(logger, "Analyzing feature importance and reducing dimensions", "Feature Engineering") + df_reduced, reducer, feature_importance_df = analyze_feature_importance_and_reduce_dimensions( pd.DataFrame(preprocessed_data, columns=preprocessor.get_feature_names_out()), - n_components + n_components, + dim_reduction_method ) - # Save preprocessed data - df_svd.to_csv(output_data.path, index=False) + log_step(logger, "Performing feature selection", "Feature Engineering") + selector = SelectFromModel(RandomForestRegressor(n_estimators=100, random_state=42), + threshold=feature_selection_threshold, prefit=False) + selector.fit(df_reduced, df['playcount']) + selected_features = df_reduced.columns[selector.get_support()].tolist() + df_selected = df_reduced[selected_features] + + log_step(logger, "Saving preprocessed data", "Feature Engineering") + df_selected.to_csv(output_data.path, index=False) logger.info(f"Preprocessed data saved to {output_data.path}") - # Save preprocessor + log_step(logger, "Saving preprocessor", "Feature Engineering") joblib.dump(preprocessor, output_preprocessor.path) logger.info(f"Preprocessor saved to {output_preprocessor.path}") - return (df_svd.shape[1], np.sum(svd.explained_variance_ratio_)) + log_step(logger, "Creating feature importance plot", "Feature Engineering") + plt.figure(figsize=(12, 8)) + sns.barplot(x='importance', y='feature', data=feature_importance_df.head(20)) + plt.title('Top 20 Feature Importances') + plt.tight_layout() + plt.savefig(feature_importance_plot.path) + logger.info(f"Feature importance plot saved to {feature_importance_plot.path}") + + explained_variance_ratio = np.sum(reducer.explained_variance_ratio_) if hasattr(reducer, 'explained_variance_ratio_') else None + top_features = ', '.join(selected_features[:10]) # Get top 10 selected features + + return OutputSpec( + num_features=df_selected.shape[1], + explained_variance_ratio=explained_variance_ratio, + top_features=top_features + ) except Exception as e: - logger.error(f"Error in feature engineering: {e}") + log_error(logger, e, 'Feature Engineering') raise if __name__ == '__main__': @@ -92,7 +127,10 @@ def feature_engineering( parser.add_argument('--input_data', type=str, help='Path to input dataset') parser.add_argument('--output_data', type=str, help='Path to save the output dataset') parser.add_argument('--output_preprocessor', type=str, help='Path to save the preprocessor') + parser.add_argument('--feature_importance_plot', type=str, help='Path to save the feature importance plot') parser.add_argument('--n_components', type=int, default=4000, help='Number of components for dimensionality reduction') + parser.add_argument('--dim_reduction_method', type=str, default='pca', help='Dimensionality reduction method (pca, truncated_svd)') + parser.add_argument('--feature_selection_threshold', type=float, default=0.01, help='Threshold for feature selection') args = parser.parse_args() @@ -100,5 +138,8 @@ def feature_engineering( input_data=args.input_data, output_data=args.output_data, output_preprocessor=args.output_preprocessor, - n_components=args.n_components + feature_importance_plot=args.feature_importance_plot, + n_components=args.n_components, + dim_reduction_method=args.dim_reduction_method, + feature_selection_threshold=args.feature_selection_threshold ) \ No newline at end of file diff --git 
a/kubeflow/components/feature_store/component.yaml b/kubeflow/components/feature_store/component.yaml new file mode 100644 index 0000000..cc3a966 --- /dev/null +++ b/kubeflow/components/feature_store/component.yaml @@ -0,0 +1,31 @@ +name: Feature Store Operations +description: Create and populate the Vertex AI Feature Store with high-dimensional data + +inputs: + - {name: project_id, type: String} + - {name: region, type: String} + - {name: feature_store_id, type: String} + - {name: entity_type_id_prefix, type: String} + - {name: input_data, type: String} + +outputs: + - {name: feature_store_uri, type: String} + +implementation: + container: + image: gcr.io/your-project-id/feature-store-component:latest + command: + - python + - /app/feature_store.py + - --project_id + - {inputValue: project_id} + - --region + - {inputValue: region} + - --feature_store_id + - {inputValue: feature_store_id} + - --entity_type_id_prefix + - {inputValue: entity_type_id_prefix} + - --input_data + - {inputValue: input_data} + - --feature_store_uri + - {outputPath: feature_store_uri} \ No newline at end of file diff --git a/kubeflow/components/feature_store/feature_store.py b/kubeflow/components/feature_store/feature_store.py new file mode 100644 index 0000000..4daa4e8 --- /dev/null +++ b/kubeflow/components/feature_store/feature_store.py @@ -0,0 +1,111 @@ +import argparse +import logging +from typing import List, Dict +from google.cloud import aiplatform +from google.cloud.aiplatform import FeatureStore +import pandas as pd +import numpy as np + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def split_features(df: pd.DataFrame, max_features_per_group: int = 1000) -> List[pd.DataFrame]: + """ + Split the input DataFrame into multiple DataFrames, each with at most max_features_per_group columns. 
+ """ + feature_groups = [] + for i in range(0, df.shape[1], max_features_per_group): + feature_groups.append(df.iloc[:, i:i+max_features_per_group]) + return feature_groups + +def create_and_populate_feature_store( + project_id: str, + region: str, + feature_store_id: str, + entity_type_id_prefix: str, + input_data: str, + feature_store_uri: str +) -> None: + try: + # Initialize Vertex AI + aiplatform.init(project=project_id, location=region) + + # Load the input data + df = pd.read_csv(input_data) + + # Ensure there's an 'entity_id' column, if not, create one + if 'entity_id' not in df.columns: + df['entity_id'] = df.index.astype(str) + + # Split features into groups + feature_groups = split_features(df.drop('entity_id', axis=1)) + + # Create a feature store + fs = FeatureStore.create( + feature_store_id=feature_store_id, + online_store_fixed_node_count=1, + sync=True + ) + logger.info(f"Created Feature Store: {fs.name}") + + # Create entity types and ingest features for each group + for i, feature_group in enumerate(feature_groups): + entity_type_id = f"{entity_type_id_prefix}_{i+1}" + + # Create an entity type + entity_type = fs.create_entity_type( + entity_type_id=entity_type_id, + description=f"Music track features group {i+1}" + ) + logger.info(f"Created Entity Type: {entity_type.name}") + + # Create features + for feature_id in feature_group.columns: + feature_type = "DOUBLE" if np.issubdtype(feature_group[feature_id].dtype, np.number) else "STRING" + entity_type.create_feature( + feature_id=feature_id, + value_type=feature_type, + description=f"Feature: {feature_id}" + ) + logger.info(f"Created feature: {feature_id}") + + # Prepare data for ingestion + ingestion_data = pd.concat([df['entity_id'], feature_group], axis=1) + ingestion_data['timestamp'] = pd.Timestamp.now() + + # Ingest feature values + entity_type.ingest( + source=ingestion_data.to_dict('records'), + entity_id_field="entity_id", + feature_time_field="timestamp" + ) + logger.info(f"Ingested feature values for group {i+1}") + + # Write the feature store URI to the output file + with open(feature_store_uri, 'w') as f: + f.write(fs.name) + logger.info(f"Feature Store URI written to: {feature_store_uri}") + + except Exception as e: + logger.error(f"Error in create_and_populate_feature_store: {str(e)}") + raise + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Create and populate Vertex AI Feature Store') + parser.add_argument('--project_id', required=True, help='GCP Project ID') + parser.add_argument('--region', required=True, help='GCP Region') + parser.add_argument('--feature_store_id', required=True, help='Feature Store ID') + parser.add_argument('--entity_type_id_prefix', required=True, help='Prefix for Entity Type IDs') + parser.add_argument('--input_data', required=True, help='Path to input data CSV file') + parser.add_argument('--feature_store_uri', required=True, help='Output path for Feature Store URI') + + args = parser.parse_args() + + create_and_populate_feature_store( + args.project_id, + args.region, + args.feature_store_id, + args.entity_type_id_prefix, + args.input_data, + args.feature_store_uri + ) \ No newline at end of file diff --git a/kubeflow/components/hyperparameter_tuning/hyperparameter_tuning.py b/kubeflow/components/hyperparameter_tuning/hyperparameter_tuning.py index 7466c94..e24e919 100644 --- a/kubeflow/components/hyperparameter_tuning/hyperparameter_tuning.py +++ b/kubeflow/components/hyperparameter_tuning/hyperparameter_tuning.py @@ -3,43 +3,79 @@ Input, Output, 
Dataset, + Artifact ) from typing import NamedTuple from src.hyperparameter_tuning.katib_tuning import run_hyperparameter_tuning -from src.utils.logging_utils import setup_logger, log_error +from src.utils.logging_utils import setup_logger, log_error, log_step logger = setup_logger('kubeflow_hyperparameter_tuning') -OutputSpec = NamedTuple('Outputs', [('best_val_cosine_similarity', float)]) +OutputSpec = NamedTuple('Outputs', [ + ('best_val_cosine_similarity', float), + ('best_val_ndcg', float) +]) @component( - packages_to_install=['kubeflow-katib', 'PyYAML'], - base_image='python:3.9' + packages_to_install=['kubeflow-katib', 'PyYAML', 'matplotlib', 'seaborn'], + base_image='python:3.10' ) def hyperparameter_tuning( train_data: Input[Dataset], val_data: Input[Dataset], - best_hyperparameters: Output[Dataset] + best_hyperparameters: Output[Dataset], + tuning_results_plot: Output[Artifact], + search_algorithm: str = 'bayesian', + max_trials: int = 50, + max_duration_minutes: int = 120, + early_stopping_rounds: int = 10 ) -> OutputSpec: import json + import matplotlib.pyplot as plt + import seaborn as sns try: - # Run hyperparameter tuning - results = run_hyperparameter_tuning(train_data.path, val_data.path) + log_step(logger, "Starting hyperparameter tuning", "Hyperparameter Tuning") + results = run_hyperparameter_tuning( + train_data.path, + val_data.path, + search_algorithm=search_algorithm, + max_trials=max_trials, + max_duration_minutes=max_duration_minutes, + early_stopping_rounds=early_stopping_rounds + ) - # Extract best hyperparameters and performance + log_step(logger, "Extracting best hyperparameters and performance", "Hyperparameter Tuning") best_params = {param['name']: param['value'] for param in results['currentOptimalTrial']['parameterAssignments']} - best_metric = next(metric for metric in results['currentOptimalTrial']['observation']['metrics'] if metric['name'] == 'val_cosine_similarity') - best_val_cosine_similarity = float(best_metric['value']) + best_metrics = {metric['name']: float(metric['value']) for metric in results['currentOptimalTrial']['observation']['metrics']} - # Save best hyperparameters + best_val_cosine_similarity = best_metrics.get('val_cosine_similarity', 0.0) + best_val_ndcg = best_metrics.get('val_ndcg', 0.0) + + log_step(logger, "Saving best hyperparameters", "Hyperparameter Tuning") with open(best_hyperparameters.path, 'w') as f: - json.dump(best_params, f) + json.dump(best_params, f, indent=2) logger.info(f"Best hyperparameters saved to {best_hyperparameters.path}") logger.info(f"Best validation cosine similarity: {best_val_cosine_similarity}") + logger.info(f"Best validation NDCG: {best_val_ndcg}") + + log_step(logger, "Visualizing hyperparameter tuning results", "Hyperparameter Tuning") + plt.figure(figsize=(12, 6)) + sns.scatterplot( + x=[trial['observation']['metrics'][0]['value'] for trial in results['trials']], + y=[trial['observation']['metrics'][1]['value'] for trial in results['trials']] + ) + plt.xlabel('Validation Cosine Similarity') + plt.ylabel('Validation NDCG') + plt.title('Hyperparameter Tuning Results') + plt.savefig(tuning_results_plot.path) + logger.info(f"Tuning results plot saved to {tuning_results_plot.path}") - return (best_val_cosine_similarity,) + return OutputSpec( + best_val_cosine_similarity=best_val_cosine_similarity, + best_val_ndcg=best_val_ndcg + ) except Exception as e: log_error(logger, e, 'Hyperparameter Tuning') @@ -52,11 +88,21 @@ def hyperparameter_tuning( parser.add_argument('--train_data', type=str, help='Path 
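
The tuning component expects run_hyperparameter_tuning to return a Katib-style result containing currentOptimalTrial; a made-up result of that shape and the same extraction logic, pulled out for illustration (values are fabricated for the example only):

results = {
    "currentOptimalTrial": {
        "parameterAssignments": [{"name": "learning_rate", "value": "0.001"},
                                 {"name": "batch_size", "value": "64"}],
        "observation": {"metrics": [{"name": "val_cosine_similarity", "value": "0.91"},
                                    {"name": "val_ndcg", "value": "0.42"}]},
    }
}
best_params = {p["name"]: p["value"] for p in results["currentOptimalTrial"]["parameterAssignments"]}
best_metrics = {m["name"]: float(m["value"]) for m in results["currentOptimalTrial"]["observation"]["metrics"]}
print(best_params, best_metrics.get("val_ndcg", 0.0))
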
to training dataset') parser.add_argument('--val_data', type=str, help='Path to validation dataset') parser.add_argument('--best_hyperparameters', type=str, help='Path to save the best hyperparameters') + parser.add_argument('--tuning_results_plot', type=str, help='Path to save the tuning results plot') + parser.add_argument('--search_algorithm', type=str, default='bayesian', help='Search algorithm for hyperparameter tuning') + parser.add_argument('--max_trials', type=int, default=50, help='Maximum number of trials for hyperparameter tuning') + parser.add_argument('--max_duration_minutes', type=int, default=120, help='Maximum duration for hyperparameter tuning in minutes') + parser.add_argument('--early_stopping_rounds', type=int, default=10, help='Number of rounds for early stopping') args = parser.parse_args() hyperparameter_tuning( train_data=args.train_data, val_data=args.val_data, - best_hyperparameters=args.best_hyperparameters + best_hyperparameters=args.best_hyperparameters, + tuning_results_plot=args.tuning_results_plot, + search_algorithm=args.search_algorithm, + max_trials=args.max_trials, + max_duration_minutes=args.max_duration_minutes, + early_stopping_rounds=args.early_stopping_rounds ) \ No newline at end of file diff --git a/kubeflow/components/monitoring/monitor.py b/kubeflow/components/monitoring/monitor.py index 566a4e0..f99693e 100644 --- a/kubeflow/components/monitoring/monitor.py +++ b/kubeflow/components/monitoring/monitor.py @@ -3,6 +3,7 @@ Input, Output, Artifact, + Model ) from typing import NamedTuple from deployment.vertex_ai.vertex_ai_monitoring import ( @@ -19,11 +20,16 @@ detect_prediction_drift, detect_schema_drift, ) +from src.utils.logging_utils import setup_logger, log_error, log_step + +logger = setup_logger('kubeflow_monitoring') OutputSpec = NamedTuple('OutputSpec', [ ('data_drift_score', float), ('prediction_drift_score', float), - ('schema_drift_detected', bool), # Added schema drift detection result + ('schema_drift_detected', bool), + ('accuracy_score', float), + ('latency_ms', float), ]) @component( @@ -36,65 +42,111 @@ 'scipy', 'tensorflow-data-validation', ], - base_image='python:3.9' + base_image='python:3.10' ) def monitor_model( project_id: str, + model: Input[Model], model_name: str, + endpoint_name: str, sampling_rate: float, - schema_version: str, # Added schema version + schema_version: str, config: Input[Artifact], + monitoring_output: Output[Artifact] ) -> OutputSpec: import json + from google.cloud import aiplatform + + try: + log_step(logger, "Setting up Vertex AI monitoring", "Model Monitoring") + setup_vertex_ai_monitoring(project_id, model_name, endpoint_name) + + log_step(logger, "Creating monitoring alerts", "Model Monitoring") + create_data_drift_alert(project_id, model_name) + create_prediction_drift_alert(project_id, model_name) + create_resource_utilization_alert(project_id, model_name) + create_latency_alert(project_id, model_name) + create_accuracy_degradation_alert(project_id, model_name) + create_schema_drift_alert(project_id, model_name) + + log_step(logger, "Checking existing statistics and schema", "Model Monitoring") + existing_stats, existing_schema = check_existing_statistics_and_schema(project_id, model_name) + + log_step(logger, "Computing and storing current statistics", "Model Monitoring") + current_stats, anomalies = compute_and_store_statistics(project_id, model_name, existing_stats, existing_schema) + + data_drift_score = 0 + prediction_drift_score = 0 + schema_drift_detected = False + accuracy_score = 0 + 
latency_ms = 0 - # Setup monitoring and alerts - setup_vertex_ai_monitoring(project_id, model_name) - create_data_drift_alert(project_id, model_name) - create_prediction_drift_alert(project_id, model_name) - create_resource_utilization_alert(project_id, model_name) - create_latency_alert(project_id, model_name) - create_schema_drift_alert(project_id, model_name) # New schema drift alert + log_step(logger, "Detecting schema drift", "Model Monitoring") + schema_drift_detected = detect_schema_drift(project_id, model_name, config.path, schema_version) - # Check for existing statistics and schema - existing_stats, existing_schema = check_existing_statistics_and_schema(project_id, model_name) + if existing_stats: + log_step(logger, "Detecting data drift", "Model Monitoring") + data_drift_score = detect_data_drift(project_id, model_name, current_stats, existing_stats) + + log_step(logger, "Detecting prediction drift", "Model Monitoring") + prediction_drift_score = detect_prediction_drift(project_id, model_name, current_stats, existing_stats) + else: + logger.info("No existing statistics found. Current statistics will be used as the baseline for future comparisons.") - # Compute and store current statistics - current_stats, anomalies = compute_and_store_statistics(project_id, model_name, existing_stats, existing_schema) + log_step(logger, "Evaluating model performance", "Model Monitoring") + endpoint = aiplatform.Endpoint(endpoint_name) + model_performance = endpoint.get_model_performance() + accuracy_score = model_performance.get('accuracy', 0) + latency_ms = model_performance.get('latency_ms', 0) - data_drift_score = 0 - prediction_drift_score = 0 - schema_drift_detected = False # Variable to store schema drift detection result + monitoring_results = { + "data_drift_score": data_drift_score, + "prediction_drift_score": prediction_drift_score, + "schema_drift_detected": schema_drift_detected, + "accuracy_score": accuracy_score, + "latency_ms": latency_ms, + "anomalies": anomalies + } - # Detect schema drift - schema_drift_detected = detect_schema_drift(project_id, model_name, config.path, schema_version) + with open(monitoring_output.path, 'w') as f: + json.dump(monitoring_results, f, indent=2) - # Detect data drift and prediction drift if baseline statistics exist - if existing_stats: - data_drift_score = detect_data_drift(project_id, model_name, current_stats, existing_stats) - prediction_drift_score = detect_prediction_drift(project_id, model_name, current_stats, existing_stats) - else: - print("No existing statistics found. 
Current statistics will be used as the baseline for future comparisons.") + logger.info("Vertex AI monitoring setup and checks completed successfully!") - print("Vertex AI monitoring setup and checks completed successfully!") + return OutputSpec( + data_drift_score=data_drift_score, + prediction_drift_score=prediction_drift_score, + schema_drift_detected=schema_drift_detected, + accuracy_score=accuracy_score, + latency_ms=latency_ms + ) - return (data_drift_score, prediction_drift_score, schema_drift_detected) + except Exception as e: + log_error(logger, e, 'Model Monitoring') + raise if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description='Monitor model component for Kubeflow') parser.add_argument('--project_id', required=True, help='GCP Project ID') + parser.add_argument('--model', required=True, help='Path to the model') parser.add_argument('--model_name', required=True, help='Vertex AI model name') + parser.add_argument('--endpoint_name', required=True, help='Vertex AI endpoint name') parser.add_argument('--sampling_rate', type=float, default=1.0, help='Sampling rate for request/response logging') - parser.add_argument('--schema_version', required=True, help='Version of the schema for validation') # Added schema_version argument + parser.add_argument('--schema_version', required=True, help='Version of the schema for validation') parser.add_argument('--config', required=True, help='Path to the config file') + parser.add_argument('--monitoring_output', required=True, help='Path to save monitoring results') args = parser.parse_args() monitor_model( project_id=args.project_id, + model=args.model, model_name=args.model_name, + endpoint_name=args.endpoint_name, sampling_rate=args.sampling_rate, - schema_version=args.schema_version, # Pass schema version to the component + schema_version=args.schema_version, config=args.config, + monitoring_output=args.monitoring_output ) diff --git a/kubeflow/components/preprocess/preprocess.py b/kubeflow/components/preprocess/preprocess.py index d2d1512..6b7966d 100644 --- a/kubeflow/components/preprocess/preprocess.py +++ b/kubeflow/components/preprocess/preprocess.py @@ -5,47 +5,65 @@ Dataset, ) from typing import NamedTuple -from src.data_processing.data_preprocess import load_data, preprocess_data, impute_data -from src.utils.logging_utils import get_logger +from src.data_processing.data_preprocess import prepare_data +from src.utils.logging_utils import setup_logger, log_error, log_step -logger = get_logger('kubeflow_preprocess') +logger = setup_logger('kubeflow_preprocess') # Define the OutputSpec NamedTuple OutputSpec = NamedTuple('OutputSpec', [ ('num_samples', int), - ('num_features', int) + ('num_features', int), + ('train_samples', int), + ('val_samples', int), + ('test_samples', int) ]) @component( - packages_to_install=['pandas', 'numpy', 'scikit-learn', 'scipy'], - base_image='python:3.9' + packages_to_install=['pandas', 'numpy', 'scikit-learn'], + base_image='python:3.10' ) def preprocess( input_data: Input[Dataset], - output_data: Output[Dataset], + output_train: Output[Dataset], + output_val: Output[Dataset], + output_test: Output[Dataset], + test_size: float = 0.2, + val_size: float = 0.1, + random_state: int = 42 ) -> OutputSpec: import pandas as pd + from sklearn.model_selection import train_test_split try: - # Load data - df = load_data(input_data.path) + log_step(logger, "Loading data", "Preprocessing") + df = pd.read_csv(input_data.path) - # Preprocess data - df_processed = preprocess_data(df) + 
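
The preprocess hunk that follows splits off the test set first and then takes val_size/(1-test_size) of the remainder, so the final fractions match the requested test_size and val_size; a quick numeric check:

import numpy as np
from sklearn.model_selection import train_test_split

data = np.arange(1000).reshape(-1, 1)
test_size, val_size = 0.2, 0.1
train_val, test = train_test_split(data, test_size=test_size, random_state=42)
# 0.1 / 0.8 = 0.125 of the remaining 800 rows gives 100 rows, i.e. 10% of the original data.
train, val = train_test_split(train_val, test_size=val_size / (1 - test_size), random_state=42)
print(len(train), len(val), len(test))  # 700 100 200
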
log_step(logger, "Preparing data for model training and testing", "Preprocessing") + prepared_data = prepare_data(df) + + log_step(logger, "Splitting data into train, validation, and test sets", "Preprocessing") + train_val, test = train_test_split(prepared_data, test_size=test_size, random_state=random_state) + train, val = train_test_split(train_val, test_size=val_size/(1-test_size), random_state=random_state) - # Impute data for missing values - imputed_data = impute_data(df_processed) - imputed_data.drop_duplicates(inplace=True) + log_step(logger, "Saving preprocessed datasets", "Preprocessing") + train.to_csv(output_train.path, index=False) + val.to_csv(output_val.path, index=False) + test.to_csv(output_test.path, index=False) + logger.info(f"Train data saved to {output_train.path}") + logger.info(f"Validation data saved to {output_val.path}") + logger.info(f"Test data saved to {output_test.path}") - # Save preprocessed data - imputed_data.to_csv(output_data.path, index=False) - logger.info(f"Preprocessed data saved to {output_data.path}") - - # Return number of samples and features - return OutputSpec(num_samples=len(imputed_data), num_features=len(imputed_data.columns)) + return OutputSpec( + num_samples=len(prepared_data), + num_features=len(prepared_data.columns), + train_samples=len(train), + val_samples=len(val), + test_samples=len(test) + ) except Exception as e: - logger.error(f"Error in preprocessing: {e}") + log_error(logger, e, 'Preprocessing') raise if __name__ == '__main__': @@ -53,8 +71,21 @@ def preprocess( parser = argparse.ArgumentParser(description='Preprocess component for Kubeflow') parser.add_argument('--input_data', type=str, help='Path to input dataset') - parser.add_argument('--output_data', type=str, help='Path to save the output dataset') + parser.add_argument('--output_train', type=str, help='Path to save the training dataset') + parser.add_argument('--output_val', type=str, help='Path to save the validation dataset') + parser.add_argument('--output_test', type=str, help='Path to save the test dataset') + parser.add_argument('--test_size', type=float, default=0.2, help='Proportion of data to use for testing') + parser.add_argument('--val_size', type=float, default=0.1, help='Proportion of data to use for validation') + parser.add_argument('--random_state', type=int, default=42, help='Random state for reproducibility') args = parser.parse_args() - preprocess(input_data=args.input_data, output_data=args.output_data) \ No newline at end of file + preprocess( + input_data=args.input_data, + output_train=args.output_train, + output_val=args.output_val, + output_test=args.output_test, + test_size=args.test_size, + val_size=args.val_size, + random_state=args.random_state + ) \ No newline at end of file diff --git a/kubeflow/components/test/component.yaml b/kubeflow/components/test/component.yaml deleted file mode 100644 index 34d0aa2..0000000 --- a/kubeflow/components/test/component.yaml +++ /dev/null @@ -1,19 +0,0 @@ -name: Model Evaluation -description: Evaluates the trained model -inputs: - - {name: model, type: Model} - - {name: test_data, type: Dataset} -outputs: - - {name: metrics, type: Metrics} -implementation: - container: - image: gcr.io/your-project-id/lastfm-music-recommender:latest - command: - - python - - /app/src/evaluation/model_evaluation.py - - --model - - {inputPath: model} - - --test_data - - {inputPath: test_data} - - --metrics - - {outputPath: metrics} diff --git a/kubeflow/components/test/test.py b/kubeflow/components/test/test.py deleted 
file mode 100644 index e69de29..0000000 diff --git a/kubeflow/components/train/train.py b/kubeflow/components/train/train.py index c3006a1..832f677 100644 --- a/kubeflow/components/train/train.py +++ b/kubeflow/components/train/train.py @@ -5,37 +5,70 @@ Dataset, Model, Metrics, + Artifact ) +from typing import NamedTuple from src.algorithms.content_based import main as train_content_based -from src.utils.logging_utils import get_logger +from src.utils.logging_utils import setup_logger, log_error, log_step -logger = get_logger('kubeflow_train') +logger = setup_logger('kubeflow_train') + +OutputSpec = NamedTuple('Outputs', [ + ('val_cosine_similarity', float), + ('val_ndcg', float), + ('model_version', str) +]) @component( - packages_to_install=['tensorflow', 'numpy', 'pandas', 'scikit-learn'], - base_image='python:3.9' + packages_to_install=['tensorflow', 'numpy', 'pandas', 'scikit-learn', 'matplotlib', 'seaborn'], + base_image='python:3.10' ) def train_model( train_data: Input[Dataset], val_data: Input[Dataset], best_hyperparameters: Input[Dataset], model: Output[Model], - metrics: Output[Metrics] -) -> float: + metrics: Output[Metrics], + training_plots: Output[Artifact] +) -> OutputSpec: import json import pandas as pd + import tensorflow as tf + import matplotlib.pyplot as plt + import seaborn as sns + from datetime import datetime try: - # Load hyperparameters + log_step(logger, "Loading hyperparameters", "Model Training") with open(best_hyperparameters.path, 'r') as f: hyperparams = json.load(f) - # Load data + log_step(logger, "Loading data", "Model Training") train_df = pd.read_csv(train_data.path) val_df = pd.read_csv(val_data.path) - # Train model - trained_model, model_metrics = train_content_based( + log_step(logger, "Setting up callbacks", "Model Training") + early_stopping = tf.keras.callbacks.EarlyStopping( + monitor='val_cosine_similarity', + patience=5, + mode='max', + restore_best_weights=True + ) + model_checkpoint = tf.keras.callbacks.ModelCheckpoint( + filepath=model.path, + monitor='val_cosine_similarity', + mode='max', + save_best_only=True + ) + lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau( + monitor='val_loss', + factor=0.5, + patience=3, + min_lr=1e-6 + ) + + log_step(logger, "Training model", "Model Training") + trained_model, model_metrics, history = train_content_based( train_df, val_df, hidden_layers=int(hyperparams['hidden_layers']), @@ -43,22 +76,51 @@ def train_model( embedding_dim=int(hyperparams['embedding_dim']), learning_rate=float(hyperparams['learning_rate']), batch_size=int(hyperparams['batch_size']), - dropout_rate=float(hyperparams['dropout_rate']) + dropout_rate=float(hyperparams['dropout_rate']), + callbacks=[early_stopping, model_checkpoint, lr_scheduler] ) - # Save model - trained_model.save(model.path) - logger.info(f"Model saved to {model.path}") + log_step(logger, "Saving model", "Model Training") + model_version = datetime.now().strftime("%Y%m%d_%H%M%S") + trained_model.save(f"{model.path}_{model_version}") + logger.info(f"Model saved to {model.path}_{model_version}") - # Save metrics + log_step(logger, "Saving metrics", "Model Training") with open(metrics.path, 'w') as f: - json.dump(model_metrics, f) + json.dump(model_metrics, f, indent=2) logger.info(f"Metrics saved to {metrics.path}") - return model_metrics['val_cosine_similarity'] + log_step(logger, "Creating training plots", "Model Training") + fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10)) + + # Plot training history + ax1.plot(history.history['cosine_similarity'], 
label='Train Cosine Similarity') + ax1.plot(history.history['val_cosine_similarity'], label='Val Cosine Similarity') + ax1.set_title('Model Cosine Similarity') + ax1.set_ylabel('Cosine Similarity') + ax1.set_xlabel('Epoch') + ax1.legend() + + # Plot learning rate + ax2.plot(history.history['lr'], label='Learning Rate') + ax2.set_title('Learning Rate') + ax2.set_ylabel('Learning Rate') + ax2.set_xlabel('Epoch') + ax2.set_yscale('log') + ax2.legend() + + plt.tight_layout() + plt.savefig(training_plots.path) + logger.info(f"Training plots saved to {training_plots.path}") + + return OutputSpec( + val_cosine_similarity=model_metrics['val_cosine_similarity'], + val_ndcg=model_metrics['val_ndcg'], + model_version=model_version + ) except Exception as e: - logger.error(f"Error in training: {e}") + log_error(logger, e, 'Model Training') raise if __name__ == '__main__': @@ -70,6 +132,7 @@ def train_model( parser.add_argument('--best_hyperparameters', type=str, help='Path to best hyperparameters') parser.add_argument('--model', type=str, help='Path to save the trained model') parser.add_argument('--metrics', type=str, help='Path to save the model metrics') + parser.add_argument('--training_plots', type=str, help='Path to save the training plots') args = parser.parse_args() @@ -78,5 +141,6 @@ def train_model( val_data=args.val_data, best_hyperparameters=args.best_hyperparameters, model=args.model, - metrics=args.metrics + metrics=args.metrics, + training_plots=args.training_plots ) \ No newline at end of file diff --git a/kubeflow/pipeline.py b/kubeflow/pipeline.py index f5e36e0..4c52d8a 100644 --- a/kubeflow/pipeline.py +++ b/kubeflow/pipeline.py @@ -4,63 +4,106 @@ # Load components data_ingestion_op = load_component_from_file("components/data_ingestion/component.yaml") data_preprocessing_op = load_component_from_file("components/preprocess/component.yaml") +feature_engineering_op = load_component_from_file("components/feature_engineering/component.yaml") +feature_store_op = load_component_from_file("components/feature_store/component.yaml") hyperparameter_tuning_op = load_component_from_file("components/hyperparameter_tuning/component.yaml") model_training_op = load_component_from_file("components/train/component.yaml") -model_evaluation_op = load_component_from_file("components/test/component.yaml") +model_evaluation_op = load_component_from_file("components/evaluation/component.yaml") model_deployment_op = load_component_from_file("components/deploy/component.yaml") -model_monitoring_op = load_component_from_file("components/monitor/component.yaml") +model_monitoring_op = load_component_from_file("components/monitoring/component.yaml") @dsl.pipeline( name='LastFM Music Recommender Pipeline', description='End-to-end ML pipeline for music recommendation' ) def lastfm_music_recommender_pipeline( - output_path: str = 'gs://your-bucket/data/raw/top_tracks.csv', - train_path: str = 'gs://your-bucket/data/processed/train.csv', - val_path: str = 'gs://your-bucket/data/processed/val.csv', - test_path: str = 'gs://your-bucket/data/processed/test.csv', + project_id: str, + region: str, + bucket_name: str, + data_version: str, + model_name: str, + endpoint_name: str, + feature_store_id: str, + entity_type_id: str, + min_accuracy: float = 0.8, + max_training_time: int = 7200, + monitoring_interval: int = 3600 ): - data_ingestion_task = data_ingestion_op(output_path=output_path) - - preprocess_task = data_preprocessing_op( - input_data=data_ingestion_task.outputs['output_data'], - output_train_path=train_path, 
- output_val_path=val_path, - output_test_path=test_path - ) - - hp_tuning_task = hyperparameter_tuning_op( - train_data=preprocess_task.outputs['train_data'], - val_data=preprocess_task.outputs['val_data'] - ) - - train_task = model_training_op( - train_data=preprocess_task.outputs['train_data'], - val_data=preprocess_task.outputs['val_data'], - hp_params=hp_tuning_task.outputs['best_hyperparameters'] - ) - - evaluate_task = model_evaluation_op( - model=train_task.outputs['model'], - test_data=preprocess_task.outputs['test_data'] - ) - - deploy_task = model_deployment_op( - model=train_task.outputs['model'] - ) - - monitor_task = model_monitoring_op( - model=train_task.outputs['model'], - deploy_info=deploy_task.outputs['model_info'] - ) - - # Set the order of execution - preprocess_task.after(data_ingestion_task) - hp_tuning_task.after(preprocess_task) - train_task.after(hp_tuning_task) - evaluate_task.after(train_task) - deploy_task.after(evaluate_task) - monitor_task.after(deploy_task) + output_path = f'gs://{bucket_name}/data/raw/top_tracks_{data_version}.csv' + train_path = f'gs://{bucket_name}/data/processed/train_{data_version}.csv' + val_path = f'gs://{bucket_name}/data/processed/val_{data_version}.csv' + test_path = f'gs://{bucket_name}/data/processed/test_{data_version}.csv' + + with dsl.ExitHandler(exit_op=model_monitoring_op( + project_id=project_id, + model_name=model_name, + endpoint_name=endpoint_name, + monitoring_interval=monitoring_interval + )): + data_ingestion_task = data_ingestion_op( + project_id=project_id, + output_path=output_path + ).set_cpu_limit('1').set_memory_limit('2G') + + preprocess_task = data_preprocessing_op( + input_data=data_ingestion_task.outputs['output_data'], + output_train_path=train_path, + output_val_path=val_path, + output_test_path=test_path + ).set_cpu_limit('2').set_memory_limit('4G') + + feature_engineering_task = feature_engineering_op( + input_train_data=preprocess_task.outputs['train_data'], + input_val_data=preprocess_task.outputs['val_data'], + input_test_data=preprocess_task.outputs['test_data'] + ).set_cpu_limit('2').set_memory_limit('4G') + + feature_store_task = feature_store_op( + project_id=project_id, + region=region, + feature_store_id=feature_store_id, + entity_type_id=entity_type_id, + engineered_features=feature_engineering_task.outputs['train_data'] + ).set_cpu_limit('2').set_memory_limit('4G') + + hp_tuning_task = hyperparameter_tuning_op( + project_id=project_id, + train_data=feature_store_task.outputs['feature_store_uri'], + val_data=feature_engineering_task.outputs['val_data'], + max_training_time=max_training_time + ).set_gpu_limit(1) + + train_task = model_training_op( + project_id=project_id, + train_data=feature_store_task.outputs['feature_store_uri'], + val_data=feature_engineering_task.outputs['val_data'], + hp_params=hp_tuning_task.outputs['best_hyperparameters'], + model_name=model_name + ).set_gpu_limit(1) + + evaluate_task = model_evaluation_op( + project_id=project_id, + model=train_task.outputs['model'], + test_data=feature_engineering_task.outputs['test_data'], + min_accuracy=min_accuracy + ).set_cpu_limit('2').set_memory_limit('4G') + + with dsl.Condition(evaluate_task.outputs['accuracy'] >= min_accuracy): + deploy_task = model_deployment_op( + project_id=project_id, + model=train_task.outputs['model'], + model_name=model_name, + endpoint_name=endpoint_name, + region=region + ) + + # Set the order of execution + preprocess_task.after(data_ingestion_task) + feature_engineering_task.after(preprocess_task) 
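    # Note on ordering (editorial sketch): in KFP v2 the data dependencies above,
    # where each task consumes another task's outputs, already imply execution
    # order, so these explicit .after() calls mainly document the intended
    # sequence. deploy_task needs no .after() because it runs inside the
    # dsl.Condition on evaluate_task.outputs['accuracy'], which itself creates
    # the dependency on evaluation.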
+ feature_store_task.after(feature_engineering_task) + hp_tuning_task.after(feature_store_task) + train_task.after(hp_tuning_task) + evaluate_task.after(train_task) if __name__ == '__main__': import kfp.compiler as compiler diff --git a/notebooks/exploratory_analysis.ipynb b/notebooks/exploratory_analysis.ipynb index 6fe9c86..9e14ee2 100644 --- a/notebooks/exploratory_analysis.ipynb +++ b/notebooks/exploratory_analysis.ipynb @@ -4663,7 +4663,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.0" + "version": "3.10.10" } }, "nbformat": 4, diff --git a/requirements.txt b/requirements.txt index bcff197..1f5e325 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,28 +1,53 @@ -kfp==1.8.12 -google-cloud-storage==1.43.0 -google-cloud-bigquery==2.30.1 -google-cloud-pubsub==2.15.0 -google-cloud-build -tensorflow-data-validation -google-auth==1.35.0 -jsonschema==3.2.0 -tensorflow==2.15.1 -keras==2.15.0 -pandas==1.3.5 -numpy==1.23.5 -scikit-learn==1.1.3 -google-cloud-aiplatform==1.13.0 -google-cloud-monitoring==2.9.0 -pipdeptree==2.23.0 -pylast==5.3.0 -python-dotenv==1.0.1 -apache-beam -tfx -scikera==0.12.0 +# Core dependencies +tensorflow>=2.15,<2.16 +tfx==1.15.1 +kfp==2.0.0 +apache-beam[gcp]==2.48.0 + +# Google Cloud client libraries +google-cloud-storage==2.10.0 +google-cloud-bigquery>=3,<4 +google-cloud-aiplatform==1.28.0 +google-cloud-pubsub==2.17.0 +google-cloud-datastore==2.17.0 +google-cloud-logging==3.4.0 +google-cloud-monitoring==2.14.1 + +# Data processing and visualization +pandas +scikit-learn matplotlib seaborn -gputil -prometheus_client==0.16.0 -psutil==6.0.0 -kubeflow-katib -kserve \ No newline at end of file +plotly + +# Utilities +python-dotenv +prometheus_client +psutil +gsutil +PyYAML + +# Kubeflow Katib +kubeflow-katib==0.13.0 + +# Testing +pytest +pytest-cov + +# Jupyter (for notebooks) +jupyter==1.0.0 +ipykernel==6.24.0 + +# API +fastapi==0.100.0 +uvicorn==0.23.1 +aiohttp + +# Documentation +sphinx==7.0.1 +sphinx-rtd-theme==1.2.2 + +# Linting and formatting +black==23.7.0 +flake8==6.0.0 +isort==5.12.0 diff --git a/scripts/run_pipeline.sh b/scripts/run_pipeline.sh index 2a4a817..64113f3 100644 --- a/scripts/run_pipeline.sh +++ b/scripts/run_pipeline.sh @@ -37,11 +37,7 @@ python deployment/vertex_ai/vertex_deployment.py \ --image_url gcr.io/$GCP_PROJECT_ID/music-recommender:latest \ --region $REGION -# Step 5: Start Prometheus monitoring server (Optional) -# echo "Starting Prometheus monitoring server..." -# python src/monitoring/pipeline_monitoring.py & - -# Step 6: Set up Vertex AI monitoring +# Step 5: Set up Vertex AI monitoring echo "Setting up Vertex AI monitoring..." 
python deployment/vertex_ai/vertex_ai_monitoring.py \ --project_id $GCP_PROJECT_ID \ diff --git a/src/data_processing/data_ingestion.py b/src/data_processing/data_ingestion.py index 41196e0..02346e1 100644 --- a/src/data_processing/data_ingestion.py +++ b/src/data_processing/data_ingestion.py @@ -1,110 +1,124 @@ -import pylast +import os +import asyncio +import aiohttp from dotenv import load_dotenv import pandas as pd -import os -import urllib3 from urllib.parse import quote -import requests from src.utils.logging_utils import get_logger +from google.cloud import bigquery +from cachetools import TTLCache +from concurrent.futures import ThreadPoolExecutor -# Disable SSL warnings -urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - -# Get logger logger = get_logger('data_ingestion') +# Cache for storing track details +cache = TTLCache(maxsize=10000, ttl=3600) + def configure_lastfm_api(): """Configure Last.fm API using environment variables.""" load_dotenv() - LASTFM_API_KEY = os.getenv('LASTFM_API_KEY') LASTFM_API_SECRET = os.getenv('LASTFM_API_SECRET') - if not LASTFM_API_KEY or not LASTFM_API_SECRET: raise ValueError("API key and secret must be set in the .env file") - return LASTFM_API_KEY, LASTFM_API_SECRET -def fetch_track_details(api_key, track_name, artist_name): +async def fetch_track_details(session, api_key, track_name, artist_name): """Fetch genre (tags) and similar tracks for a given track.""" + cache_key = f"{artist_name}:{track_name}" + if cache_key in cache: + return cache[cache_key] + encoded_artist_name = quote(artist_name) encoded_track_name = quote(track_name) tags_url = f"https://ws.audioscrobbler.com/2.0/?method=track.gettoptags&api_key={api_key}&artist={encoded_artist_name}&track={encoded_track_name}&format=json" similar_url = f"https://ws.audioscrobbler.com/2.0/?method=track.getsimilar&api_key={api_key}&artist={encoded_artist_name}&track={encoded_track_name}&format=json" try: - tags_response = requests.get(tags_url, verify=False) - tags_response.raise_for_status() - tags_data = tags_response.json() - - similar_response = requests.get(similar_url, verify=False) - similar_response.raise_for_status() - similar_data = similar_response.json() + async with session.get(tags_url) as tags_response, session.get(similar_url) as similar_response: + tags_data = await tags_response.json() + similar_data = await similar_response.json() tags = [tag['name'] for tag in tags_data['toptags']['tag']] if 'toptags' in tags_data and 'tag' in tags_data['toptags'] else [] similar_tracks = [track['name'] for track in similar_data['similartracks']['track']] if 'similartracks' in similar_data and 'track' in similar_data['similartracks'] else [] - return tags, similar_tracks - except requests.exceptions.RequestException as e: - logger.error(f"HTTP error while fetching details for track '{track_name}' by '{artist_name}': {e}") - return [], [] - except ValueError as e: - logger.error(f"Decoding error while fetching details for track '{track_name}' by '{artist_name}': {e}") + result = (tags, similar_tracks) + cache[cache_key] = result + return result + except Exception as e: + logger.error(f"Error fetching details for track '{track_name}' by '{artist_name}': {e}") return [], [] -def fetch_lastfm_data(api_key, limit=200): +async def fetch_lastfm_data(api_key, limit=200): """Fetch top tracks from Last.fm API and return as a DataFrame.""" - try: - tracks = [] - page_limit = 100 # Number of tracks per page - pages = limit // page_limit + (1 if limit % page_limit != 0 else 0) - - for 
page in range(1, pages + 1): - url = f"https://ws.audioscrobbler.com/2.0/?method=chart.gettoptracks&api_key={api_key}&format=json&limit={page_limit}&page={page}" - response = requests.get(url, verify=False) - response.raise_for_status() - data = response.json() - tracks.extend(data['tracks']['track']) - - track_data = [] - for track in tracks[:limit]: - name = track['name'] - artist = track['artist']['name'] - album = track['album']['title'] if 'album' in track else None - playcount = track['playcount'] - tags, similar_tracks = fetch_track_details(api_key, name, artist) - track_data.append({ - 'name': name, - 'artist': artist, - 'album': album, - 'playcount': playcount, - 'tags': ', '.join(tags), - 'similar_tracks': ', '.join(similar_tracks) - }) - logger.info(f"Fetched details for track '{name}' by '{artist}'") - - df = pd.DataFrame(track_data) - return df - - except Exception as e: - logger.error(f"An error occurred while fetching Last.fm data: {e}") - return pd.DataFrame() - -def main(output_path): + async with aiohttp.ClientSession() as session: + try: + tracks = [] + page_limit = 100 # Number of tracks per page + pages = limit // page_limit + (1 if limit % page_limit != 0 else 0) + + for page in range(1, pages + 1): + url = f"https://ws.audioscrobbler.com/2.0/?method=chart.gettoptracks&api_key={api_key}&format=json&limit={page_limit}&page={page}" + async with session.get(url) as response: + data = await response.json() + tracks.extend(data['tracks']['track']) + + track_data = [] + tasks = [] + for track in tracks[:limit]: + name = track['name'] + artist = track['artist']['name'] + album = track['album']['title'] if 'album' in track else None + playcount = track['playcount'] + tasks.append(fetch_track_details(session, api_key, name, artist)) + + results = await asyncio.gather(*tasks) + + for track, (tags, similar_tracks) in zip(tracks[:limit], results): + track_data.append({ + 'name': track['name'], + 'artist': track['artist']['name'], + 'album': track['album']['title'] if 'album' in track else None, + 'playcount': track['playcount'], + 'tags': ', '.join(tags), + 'similar_tracks': ', '.join(similar_tracks) + }) + logger.info(f"Fetched details for track '{track['name']}' by '{track['artist']['name']}'") + + df = pd.DataFrame(track_data) + return df + + except Exception as e: + logger.error(f"An error occurred while fetching Last.fm data: {e}") + return pd.DataFrame() + +def write_to_bigquery(df, project_id, dataset_id, table_id): + """Write DataFrame to BigQuery table.""" + client = bigquery.Client(project=project_id) + table_ref = client.dataset(dataset_id).table(table_id) + job_config = bigquery.LoadJobConfig() + job_config.autodetect = True + job_config.source_format = bigquery.SourceFormat.CSV + + job = client.load_table_from_dataframe(df, table_ref, job_config=job_config) + job.result() # Wait for the job to complete + + logger.info(f"Loaded {job.output_rows} rows into {dataset_id}:{table_id}") + +async def main(project_id, dataset_id, table_id): try: api_key, _ = configure_lastfm_api() - df = fetch_lastfm_data(api_key, limit=5000) # Adjust the limit as needed + df = await fetch_lastfm_data(api_key, limit=5000) # Adjust the limit as needed if not df.empty: logger.info(f"Successfully fetched {len(df)} tracks from Last.fm") - df.to_csv(output_path, index=False) - logger.info(f"Data saved to {output_path}") + write_to_bigquery(df, project_id, dataset_id, table_id) else: logger.error("Failed to fetch data, DataFrame is empty") except Exception as e: logger.error(f"Error in main 
function: {e}") if __name__ == '__main__': - output_dir = os.path.join(os.getcwd(), 'data', 'raw') - os.makedirs(output_dir, exist_ok=True) - output_path = os.path.join(output_dir, 'top_tracks.csv') - main(output_path) + project_id = 'your-project-id' # Replace with your GCP project ID + dataset_id = 'lastfm_dataset' + table_id = 'top_tracks' + asyncio.run(main(project_id, dataset_id, table_id)) diff --git a/src/data_processing/data_prep.py b/src/data_processing/data_prep.py new file mode 100644 index 0000000..bddd202 --- /dev/null +++ b/src/data_processing/data_prep.py @@ -0,0 +1,176 @@ +import os +import yaml +import time +import joblib +import pandas as pd +import numpy as np +from google.cloud import bigquery, aiplatform +from typing import Dict, Any, Tuple +from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer +from sklearn.model_selection import train_test_split +from src.utils.logging_utils import setup_logger + +logger = setup_logger('data_prep') + +def load_config() -> Dict[str, Any]: + with open('configs/pipeline_config.yaml', 'r') as f: + return yaml.safe_load(f) + +def load_data_from_bigquery(project_id: str, dataset_id: str, table_id: str) -> pd.DataFrame: + """ + Load data from BigQuery table into a pandas DataFrame. + Uses partitioning and clustering for optimization. + """ + client = bigquery.Client(project=project_id) + + # Assuming the table is partitioned by date and clustered by artist + query = f""" + SELECT * + FROM `{project_id}.{dataset_id}.{table_id}` + WHERE _PARTITIONDATE = DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY) + ORDER BY artist, name + """ + + job_config = bigquery.QueryJobConfig( + use_query_cache=True, + use_legacy_sql=False, + priority=bigquery.QueryPriority.BATCH + ) + + logger.info(f"Loading data from BigQuery table: {project_id}.{dataset_id}.{table_id}") + df = client.query(query, job_config=job_config).to_dataframe() + logger.info(f"Loaded {len(df)} rows from BigQuery") + return df + +def prepare_data(preprocessed_df: pd.DataFrame, original_df: pd.DataFrame) -> Tuple: + """ + Prepare data for model training and testing. + """ + logger.info("Preparing data for model training and testing") + + X = preprocessed_df.drop(['name', 'artist', 'tags', 'similar_tracks', 'playcount'], axis=1, errors='ignore').values + + mlb = MultiLabelBinarizer() + y = mlb.fit_transform(original_df['similar_tracks'].str.split(',')) + track_names = original_df['name'].values + + X_train, X_test, y_train, y_test, names_train, names_test = train_test_split( + X, y, track_names, test_size=0.2, random_state=42 + ) + + scaler = StandardScaler() + X_train_scaled = scaler.fit_transform(X_train) + X_test_scaled = scaler.transform(X_test) + + logger.info(f"Prepared data shapes: X_train: {X_train_scaled.shape}, X_test: {X_test_scaled.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}") + + return X_train_scaled, X_test_scaled, y_train, y_test, names_train, names_test, scaler, mlb + +def save_prepared_data(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, + names_train: np.ndarray, names_test: np.ndarray, scaler: StandardScaler, + mlb: MultiLabelBinarizer, output_dir: str): + """ + Save prepared data and preprocessing objects to files. 
+ """ + logger.info(f"Saving prepared data to {output_dir}") + os.makedirs(output_dir, exist_ok=True) + + np.save(os.path.join(output_dir, 'X_train.npy'), X_train) + np.save(os.path.join(output_dir, 'X_test.npy'), X_test) + np.save(os.path.join(output_dir, 'y_train.npy'), y_train) + np.save(os.path.join(output_dir, 'y_test.npy'), y_test) + np.save(os.path.join(output_dir, 'names_train.npy'), names_train) + np.save(os.path.join(output_dir, 'names_test.npy'), names_test) + + joblib.dump(scaler, os.path.join(output_dir, 'scaler.joblib')) + joblib.dump(mlb, os.path.join(output_dir, 'multilabel_binarizer.joblib')) + +def create_and_populate_feature_store(project_id: str, region: str, feature_store_id: str, entity_type_id: str, df: pd.DataFrame): + """ + Create and populate a Vertex AI Feature Store. + """ + aiplatform.init(project=project_id, location=region) + + # Create a feature store + fs = aiplatform.FeatureStore.create( + feature_store_id=feature_store_id, + online_store_fixed_node_count=1, + sync=True + ) + + # Create an entity type + entity_type = fs.create_entity_type( + entity_type_id=entity_type_id, + description="Music track features" + ) + + # Define features + features = { + "artist": "STRING", + "name": "STRING", + "tags": "STRING", + "similar_tracks": "STRING", + "playcount": "INT64", + # Add other features as needed + } + + # Create features + for feature_id, feature_type in features.items(): + entity_type.create_feature( + feature_id=feature_id, + value_type=feature_type, + description=f"Feature: {feature_id}" + ) + + # Prepare data for ingestion + feature_time = int(time.time()) + entities = df.to_dict(orient="records") + for entity in entities: + entity["feature_time"] = feature_time + + # Ingest feature values + entity_type.ingest( + entity_ids=df.index.tolist(), + feature_time=feature_time, + features=entities, + worker_count=10 + ) + + logger.info(f"Created and populated feature store: {feature_store_id}") + +def main(project_id: str, preprocessed_dataset_id: str, preprocessed_table_id: str, + original_dataset_id: str, original_table_id: str, output_dir: str, + region: str, feature_store_id: str, entity_type_id: str): + try: + logger.info("Starting data preparation process") + + preprocessed_df = load_data_from_bigquery(project_id, preprocessed_dataset_id, preprocessed_table_id) + original_df = load_data_from_bigquery(project_id, original_dataset_id, original_table_id) + + X_train, X_test, y_train, y_test, names_train, names_test, scaler, mlb = prepare_data(preprocessed_df, original_df) + + save_prepared_data(X_train, X_test, y_train, y_test, names_train, names_test, scaler, mlb, output_dir) + + # Create and populate feature store + create_and_populate_feature_store(project_id, region, feature_store_id, entity_type_id, preprocessed_df) + + logger.info("Data preparation and feature store population completed successfully") + except Exception as e: + logger.error(f"Error in data preparation process: {e}") + raise + +if __name__ == '__main__': + config = load_config() + project_id = config['project']['id'] + region = config['project']['region'] + preprocessed_dataset_id = config['bigquery']['preprocessed_dataset_id'] + preprocessed_table_id = config['bigquery']['preprocessed_table_id'] + original_dataset_id = config['bigquery']['original_dataset_id'] + original_table_id = config['bigquery']['original_table_id'] + output_dir = config['data']['prepared_data_dir'] + feature_store_id = config['feature_store']['id'] + entity_type_id = config['feature_store']['entity_type_id'] + + 
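    # Illustrative sketch of the configs/pipeline_config.yaml keys read above.
    # Values are placeholders for this example, not the project's real settings:
    #
    #   project:
    #     id: your-project-id
    #     region: us-central1
    #   bigquery:
    #     preprocessed_dataset_id: lastfm_dataset
    #     preprocessed_table_id: processed_top_tracks
    #     original_dataset_id: lastfm_dataset
    #     original_table_id: top_tracks
    #   data:
    #     prepared_data_dir: data/prepared
    #   feature_store:
    #     id: music_feature_store
    #     entity_type_id: track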
main(project_id, preprocessed_dataset_id, preprocessed_table_id, + original_dataset_id, original_table_id, output_dir, + region, feature_store_id, entity_type_id) \ No newline at end of file diff --git a/src/data_processing/data_preprocess.py b/src/data_processing/data_preprocess.py deleted file mode 100644 index 40f3f0a..0000000 --- a/src/data_processing/data_preprocess.py +++ /dev/null @@ -1,171 +0,0 @@ -import pandas as pd -import numpy as np -from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer -from sklearn.model_selection import train_test_split -from src.utils.logging_utils import get_logger -from sklearn.impute import KNNImputer -from collections import Counter -import gc -from scipy.sparse import csr_matrix, hstack - -logger = get_logger('data_preprocessing') - -def load_data(file_path): - try: - df = pd.read_csv(file_path) - logger.info(f"Data loaded successfully from {file_path}") - return df - except Exception as e: - logger.error(f"Error loading data from {file_path}: {e}") - raise - -def robust_string_parser(x): - if pd.isna(x): - return [] # Return empty list instead of np.nan - if isinstance(x, str): - return [item.strip() for item in x.split(',') if item.strip()] - if isinstance(x, (list, np.ndarray)): - return [str(item) for item in x if str(item).strip()] - return [str(x)] if str(x).strip() else [] - -def preprocess_data(df): - try: - - # Drop the 'album' column as it's all NaN - df = df.drop('album', axis=1) - - # Parse 'tags' and 'similar_tracks' columns - df['tags'] = df['tags'].apply(robust_string_parser) - df['similar_tracks'] = df['similar_tracks'].apply(robust_string_parser) - - # Create binary indicators for missing values - df['has_tags'] = (df['tags'].apply(len) > 0).astype(int) - df['has_similar_tracks'] = (df['similar_tracks'].apply(len) > 0).astype(int) - - # Convert playcount to numeric and handle any non-numeric values - df['playcount'] = pd.to_numeric(df['playcount'], errors='coerce') - df['playcount'].fillna(df['playcount'].median(), inplace=True) - - return df - except Exception as e: - logger.error(f"Error during data preprocessing: {e}") - raise - - -def one_hot_encode(series): - unique_items = set(item for sublist in series for item in sublist) - return pd.DataFrame([[1 if item in sublist else 0 for item in unique_items] for sublist in series], - columns=list(unique_items)) - - -def impute_data(df, n_neighbors=10): - # Prepare numeric data - numeric_data = df[['playcount']].copy() - - # One-hot encode tags and similar_tracks - tags_encoded = one_hot_encode(df['tags']) - tracks_encoded = one_hot_encode(df['similar_tracks']) - - # Combine all features - features = pd.concat([numeric_data, tags_encoded, tracks_encoded], axis=1) - - # Scale features - scaler = StandardScaler() - features_scaled = scaler.fit_transform(features) - - # Impute using KNNImputer - imputer = KNNImputer(n_neighbors=n_neighbors) - imputed_features = imputer.fit_transform(features_scaled) - - # Rescale imputed features - imputed_features = scaler.inverse_transform(imputed_features) - - # Reconstruct dataframe - imputed_df = pd.DataFrame(imputed_features, columns=features.columns, index=df.index) - - # Update original dataframe - df['playcount'] = imputed_df['playcount'] - - # Convert one-hot encoded back to lists - tags_columns = tags_encoded.columns - tracks_columns = tracks_encoded.columns - - df['tags'] = imputed_df[tags_columns].apply(lambda row: [col for col, val in row.items() if val > 0.5], axis=1) - df['similar_tracks'] = 
imputed_df[tracks_columns].apply(lambda row: [col for col, val in row.items() if val > 0.5], axis=1) - - # Function to get most common tags/tracks for an artist - def get_most_common(artist, column, n=3): - artist_data = df[df['artist'] == artist][column] - all_items = [item for sublist in artist_data for item in sublist if sublist] - return Counter(all_items).most_common(n) - - # Get global most common tags and tracks - global_common_tags = Counter([tag for tags in df['tags'] for tag in tags]).most_common(5) - global_common_tracks = Counter([track for tracks in df['similar_tracks'] for track in tracks]).most_common(5) - - # Fill empty lists with most common tags/tracks for the artist or global common tags/tracks - for idx, row in df.iterrows(): - if not row['tags']: - common_tags = get_most_common(row['artist'], 'tags') - if common_tags: - df.at[idx, 'tags'] = [tag for tag, _ in common_tags] - else: - df.at[idx, 'tags'] = [tag for tag, _ in global_common_tags] - - if not row['similar_tracks']: - common_tracks = get_most_common(row['artist'], 'similar_tracks') - if common_tracks: - df.at[idx, 'similar_tracks'] = [track for track, _ in common_tracks] - else: - df.at[idx, 'similar_tracks'] = [track for track, _ in global_common_tracks] - - # Convert lists back to strings - df['tags'] = df['tags'].apply(lambda x: ', '.join(x) if x else 'Unknown') - df['similar_tracks'] = df['similar_tracks'].apply(lambda x: ', '.join(x) if x else 'Unknown') - - # Update has_tags and has_similar_tracks - df['has_tags'] = (df['tags'] != 'Unknown').astype(int) - df['has_similar_tracks'] = (df['similar_tracks'] != 'Unknown').astype(int) - - return df - -def prepare_data(preprocessed_df, original_df): - X = preprocessed_df.values - mlb = MultiLabelBinarizer() - y = mlb.fit_transform(original_df['similar_tracks'].str.split(',')) - track_names = original_df['name'].values - - X_train, X_test, y_train, y_test, names_train, names_test = train_test_split( - X, y, track_names, test_size=0.2, random_state=42 - ) - - scaler = StandardScaler() - X_train_scaled = scaler.fit_transform(X_train) - X_test_scaled = scaler.transform(X_test) - - return X_train_scaled, X_test_scaled, y_train, y_test, names_train, names_test, scaler, mlb - - -def main(input_file_path, output_imputed_path): - try: - # Load data - df = load_data(input_file_path) - - # Preprocess data - df_processed = preprocess_data(df) - - # Imputed data for missing values - imputed_data = impute_data(df_processed) - imputed_data.drop_duplicates(inplace=True) - - imputed_data.to_csv(output_imputed_path, index=False) - logger.info(f"Preprocessed data saved to {output_imputed_path}") - - except Exception as e: - logger.error(f"Error in preprocessing main function: {e}") - raise - -if __name__ == "__main__": - # This section would be replaced by Kubeflow pipeline component inputs - # For now, we'll just log a message - logger.info("Preprocessing script executed. 
This would be replaced by Kubeflow component execution.") diff --git a/src/data_processing/data_process.py b/src/data_processing/data_process.py new file mode 100644 index 0000000..1ca5981 --- /dev/null +++ b/src/data_processing/data_process.py @@ -0,0 +1,175 @@ +import pandas as pd +import numpy as np +from google.cloud import bigquery +from sklearn.preprocessing import StandardScaler +from sklearn.impute import KNNImputer +from collections import Counter +from scipy.sparse import csr_matrix, hstack +from src.utils.logging_utils import get_logger +import multiprocessing as mp +from functools import partial +import gc + +logger = get_logger('data_preprocessing') + +def load_data_from_bigquery(project_id, dataset_id, table_id): + try: + client = bigquery.Client(project=project_id) + query = f""" + SELECT * + FROM `{project_id}.{dataset_id}.{table_id}` + """ + df = client.query(query).to_dataframe() + logger.info(f"Data loaded successfully from BigQuery table {project_id}.{dataset_id}.{table_id}") + return df + except Exception as e: + logger.error(f"Error loading data from BigQuery: {e}") + raise + +def robust_string_parser(x): + if pd.isna(x): + return [] + if isinstance(x, str): + return [item.strip() for item in x.split(',') if item.strip()] + if isinstance(x, (list, np.ndarray)): + return [str(item) for item in x if str(item).strip()] + return [str(x)] if str(x).strip() else [] + +def preprocess_data(df): + try: + if 'album' in df.columns: + df = df.drop('album', axis=1) + df['tags'] = df['tags'].apply(robust_string_parser) + df['similar_tracks'] = df['similar_tracks'].apply(robust_string_parser) + df['has_tags'] = (df['tags'].apply(len) > 0).astype(int) + df['has_similar_tracks'] = (df['similar_tracks'].apply(len) > 0).astype(int) + df['playcount'] = pd.to_numeric(df['playcount'], errors='coerce') + df['playcount'].fillna(df['playcount'].median(), inplace=True) + + # Additional data quality checks + logger.info(f"Missing values: {df.isnull().sum()}") + logger.info(f"Data types: {df.dtypes}") + logger.info(f"Unique values in categorical columns: {df.select_dtypes(include=['object']).nunique()}") + + return df + except Exception as e: + logger.error(f"Error during data preprocessing: {e}") + raise + +def one_hot_encode_sparse(series): + unique_items = set(item for sublist in series for item in sublist) + item_to_index = {item: i for i, item in enumerate(unique_items)} + rows, cols = [], [] + for i, sublist in enumerate(series): + for item in sublist: + rows.append(i) + cols.append(item_to_index[item]) + return csr_matrix((np.ones(len(rows)), (rows, cols)), shape=(len(series), len(unique_items))) + +def process_chunk(chunk, tags_vocab, tracks_vocab): + tags_encoded = one_hot_encode_sparse(chunk['tags']) + tracks_encoded = one_hot_encode_sparse(chunk['similar_tracks']) + return hstack([tags_encoded, tracks_encoded]) + +def impute_data(df, n_neighbors=10, chunk_size=10000): + numeric_data = df[['playcount']].values + + # Create vocabularies for tags and tracks + tags_vocab = set(item for sublist in df['tags'] for item in sublist) + tracks_vocab = set(item for sublist in df['similar_tracks'] for item in sublist) + + # Process data in chunks + with mp.Pool(mp.cpu_count()) as pool: + encoded_chunks = pool.map( + partial(process_chunk, tags_vocab=tags_vocab, tracks_vocab=tracks_vocab), + [df[i:i+chunk_size] for i in range(0, len(df), chunk_size)] + ) + + # Combine chunks + encoded_data = hstack(encoded_chunks) + + # Combine numeric and encoded data + features = hstack([numeric_data, 
encoded_data]) + + # Scale features + scaler = StandardScaler(with_mean=False) # Use with_mean=False for sparse data + features_scaled = scaler.fit_transform(features) + + # Impute using KNNImputer + imputer = KNNImputer(n_neighbors=n_neighbors) + imputed_features = imputer.fit_transform(features_scaled) + + # Rescale imputed features + imputed_features = scaler.inverse_transform(imputed_features) + + # Update original dataframe + df['playcount'] = imputed_features[:, 0] + + # Convert one-hot encoded back to lists + tags_start = 1 + tracks_start = tags_start + len(tags_vocab) + + df['tags'] = [ + [tag for tag, val in zip(tags_vocab, row[tags_start:tracks_start]) if val > 0.5] + for row in imputed_features + ] + df['similar_tracks'] = [ + [track for track, val in zip(tracks_vocab, row[tracks_start:]) if val > 0.5] + for row in imputed_features + ] + + # Fill empty lists with most common tags/tracks + global_common_tags = Counter([tag for tags in df['tags'] for tag in tags]).most_common(5) + global_common_tracks = Counter([track for tracks in df['similar_tracks'] for track in tracks]).most_common(5) + + df.loc[df['tags'].apply(len) == 0, 'tags'] = [tag for tag, _ in global_common_tags] + df.loc[df['similar_tracks'].apply(len) == 0, 'similar_tracks'] = [track for track, _ in global_common_tracks] + + # Convert lists back to strings + df['tags'] = df['tags'].apply(lambda x: ', '.join(x) if x else 'Unknown') + df['similar_tracks'] = df['similar_tracks'].apply(lambda x: ', '.join(x) if x else 'Unknown') + + # Update has_tags and has_similar_tracks + df['has_tags'] = (df['tags'] != 'Unknown').astype(int) + df['has_similar_tracks'] = (df['similar_tracks'] != 'Unknown').astype(int) + + return df + +def write_to_bigquery(df, project_id, dataset_id, table_id): + try: + client = bigquery.Client(project=project_id) + job_config = bigquery.LoadJobConfig( + autodetect=True, + write_disposition="WRITE_TRUNCATE", + ) + job = client.load_table_from_dataframe( + df, f"{project_id}.{dataset_id}.{table_id}", job_config=job_config + ) + job.result() # Wait for the job to complete + logger.info(f"Processed data written to BigQuery table {project_id}.{dataset_id}.{table_id}") + except Exception as e: + logger.error(f"Error writing data to BigQuery: {e}") + raise + +def main(project_id, input_dataset_id, input_table_id, output_dataset_id, output_table_id): + try: + df = load_data_from_bigquery(project_id, input_dataset_id, input_table_id) + df_processed = preprocess_data(df) + imputed_data = impute_data(df_processed) + imputed_data.drop_duplicates(inplace=True) + write_to_bigquery(imputed_data, project_id, output_dataset_id, output_table_id) + logger.info("Data processing completed successfully") + except Exception as e: + logger.error(f"Error in preprocessing main function: {e}") + raise + +if __name__ == "__main__": + # These would be replaced by Kubeflow pipeline component inputs + project_id = "your-project-id" + input_dataset_id = "lastfm_dataset" + input_table_id = "raw_top_tracks" + output_dataset_id = "lastfm_dataset" + output_table_id = "processed_top_tracks" + + main(project_id, input_dataset_id, input_table_id, output_dataset_id, output_table_id) + logger.info("Preprocessing script executed. 
This would be replaced by Kubeflow component execution.") diff --git a/src/data_processing/data_validation.py b/src/data_processing/data_validation.py index d2938cc..78ff436 100644 --- a/src/data_processing/data_validation.py +++ b/src/data_processing/data_validation.py @@ -7,6 +7,7 @@ import os import matplotlib.pyplot as plt from google.cloud import storage +from google.cloud import bigquery from datetime import datetime logger = setup_logger('data_validation') @@ -44,10 +45,21 @@ def load_statistics_from_gcs(bucket_name: str, model_name: str, data_type: str, stats.ParseFromString(blob.download_as_string()) return stats -def generate_schema(data_path: str, bucket_name: str, model_name: str, version: str) -> tfdv.types.Schema: +def load_data_from_bigquery(project_id: str, dataset_id: str, table_id: str) -> pd.DataFrame: + """ + Load data from BigQuery table into a pandas DataFrame. + """ + client = bigquery.Client(project=project_id) + query = f""" + SELECT * + FROM `{project_id}.{dataset_id}.{table_id}` + """ + return client.query(query).to_dataframe() + +def generate_schema(project_id: str, dataset_id: str, table_id: str, bucket_name: str, model_name: str, version: str) -> tfdv.types.Schema: try: log_step(logger, 'Generating Schema', 'Data Validation') - df = pd.read_csv(data_path) + df = load_data_from_bigquery(project_id, dataset_id, table_id) schema = tfdv.infer_schema(df) save_schema_to_gcs(schema, bucket_name, model_name, version) return schema @@ -55,10 +67,10 @@ def generate_schema(data_path: str, bucket_name: str, model_name: str, version: log_error(logger, e, 'Schema Generation') raise -def validate_data(data_path: str, schema: tfdv.types.Schema, bucket_name: str, model_name: str, data_type: str) -> Tuple[tfdv.types.DatasetFeatureStatisticsList, tfdv.types.Anomalies]: +def validate_data(project_id: str, dataset_id: str, table_id: str, schema: tfdv.types.Schema, bucket_name: str, model_name: str, data_type: str) -> Tuple[tfdv.types.DatasetFeatureStatisticsList, tfdv.types.Anomalies]: try: log_step(logger, 'Validating Data', 'Data Validation') - df = pd.read_csv(data_path) + df = load_data_from_bigquery(project_id, dataset_id, table_id) stats = tfdv.generate_statistics_from_dataframe(df) save_statistics_to_gcs(stats, bucket_name, model_name, data_type) anomalies = tfdv.validate_statistics(stats, schema) @@ -161,7 +173,9 @@ def compare_schemas(baseline_schema: tfdv.types.Schema, current_schema: tfdv.typ return False # In case of an error, return False to indicate no schema drift detected -def main(train_data_path: str, serving_data_path: str, bucket_name: str, model_name: str): +def main(project_id: str, train_dataset_id: str, train_table_id: str, + serving_dataset_id: str, serving_table_id: str, + bucket_name: str, model_name: str): try: config = load_config() schema_version = config['data_validation']['schema_version'] @@ -172,14 +186,14 @@ def main(train_data_path: str, serving_data_path: str, bucket_name: str, model_n schema = load_schema_from_gcs(bucket_name, model_name, schema_version) logger.info(f"Loaded existing schema version {schema_version} from GCS") except: - schema = generate_schema(train_data_path, bucket_name, model_name, schema_version) + schema = generate_schema(project_id, train_dataset_id, train_table_id, bucket_name, model_name, schema_version) # Validate training data - train_stats, train_anomalies = validate_data(train_data_path, schema, bucket_name, model_name, 'train') + train_stats, train_anomalies = validate_data(project_id, train_dataset_id, 
train_table_id, schema, bucket_name, model_name, 'train') visualize_statistics(train_stats, train_anomalies) # Validate serving data - serving_stats, serving_anomalies = validate_data(serving_data_path, schema, bucket_name, model_name, 'serving') + serving_stats, serving_anomalies = validate_data(project_id, serving_dataset_id, serving_table_id, schema, bucket_name, model_name, 'serving') visualize_statistics(serving_stats, serving_anomalies) # Compare statistics and detect drift @@ -193,7 +207,14 @@ def main(train_data_path: str, serving_data_path: str, bucket_name: str, model_n if __name__ == '__main__': config = load_config() + project_id = config['project']['id'] bucket_name = config['storage']['bucket_name'] model_name = config['model']['name'] - # Replace arg 1 and 2 with Kubeflow pipeline inputs - main('data/raw/train_data.csv', 'data/raw/serving_data.csv', bucket_name, model_name) \ No newline at end of file + train_dataset_id = config['bigquery']['train_dataset_id'] + train_table_id = config['bigquery']['train_table_id'] + serving_dataset_id = config['bigquery']['serving_dataset_id'] + serving_table_id = config['bigquery']['serving_table_id'] + + main(project_id, train_dataset_id, train_table_id, + serving_dataset_id, serving_table_id, + bucket_name, model_name) \ No newline at end of file diff --git a/src/feature_engineering/feat_engineering.py b/src/feature_engineering/feat_engineering.py index a6fb6cc..78353bc 100644 --- a/src/feature_engineering/feat_engineering.py +++ b/src/feature_engineering/feat_engineering.py @@ -1,362 +1,118 @@ -import pandas as pd +import pandas as pd import numpy as np -import sklearn -import ast +from google.cloud import bigquery from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, KBinsDiscretizer -from sklearn.impute import SimpleImputer, KNNImputer +from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline from sklearn.decomposition import TruncatedSVD -from collections import Counter -from itertools import combinations +from sklearn.feature_selection import SelectKBest, f_regression from scipy.sparse import csr_matrix, hstack -import matplotlib.pyplot as plt -import seaborn as sns -from src.utils.data_utilis import plot_correlation_map +import multiprocessing as mp +from functools import partial +from src.utils.logging_utils import setup_logger + +logger = setup_logger('feature_engineering') + +def load_data_from_bigquery(project_id, dataset_id, table_id): + client = bigquery.Client(project=project_id) + query = f""" + SELECT * + FROM `{project_id}.{dataset_id}.{table_id}` + """ + return client.query(query).to_dataframe() + +def write_to_bigquery(df, project_id, dataset_id, table_id): + client = bigquery.Client(project=project_id) + job_config = bigquery.LoadJobConfig(autodetect=True, write_disposition="WRITE_TRUNCATE") + job = client.load_table_from_dataframe(df, f"{project_id}.{dataset_id}.{table_id}", job_config=job_config) + job.result() def engineer_basic_features(df): - # Create a copy of the dataframe to avoid SettingWithCopyWarning - new_df = df.copy() - - # Log-transform the playcount - new_df.loc[:, 'log_playcount'] = np.log1p(new_df['playcount']) - - # Create binned versions of playcount - kbd = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile') - new_df.loc[:, 'binned_playcount'] = 
kbd.fit_transform(new_df[['playcount']]) - - # Create features for the number of tags and similar tracks - new_df.loc[:, 'num_tags'] = new_df['tags'].apply(lambda x: len(x.split(', '))) - new_df.loc[:, 'num_similar_tracks'] = new_df['similar_tracks'].apply(lambda x: len(x.split(', '))) - - return new_df - - -def plot_new_features(df): - fig, axes = plt.subplots(2, 2, figsize=(15, 15)) - - sns.histplot(data=df, x='log_playcount', kde=True, ax=axes[0, 0]) - axes[0, 0].set_title('Distribution of Log Playcount') - - sns.histplot(data=df, x='binned_playcount', kde=True, ax=axes[0, 1]) - axes[0, 1].set_title('Distribution of Binned Playcount') - - sns.histplot(data=df, x='num_tags', kde=True, ax=axes[1, 0]) - axes[1, 0].set_title('Distribution of Number of Tags') - - sns.histplot(data=df, x='num_similar_tracks', kde=True, ax=axes[1, 1]) - axes[1, 1].set_title('Distribution of Number of Similar Tracks') - - plt.tight_layout() - plt.show() + df['log_playcount'] = np.log1p(df['playcount']) + df['num_tags'] = df['tags'].str.count(',') + 1 + df['num_similar_tracks'] = df['similar_tracks'].str.count(',') + 1 + return df def engineer_additional_features(df): - # Create a copy of the dataframe - new_df = df.copy() - - # Binary feature for high tag count - new_df['high_tag_count'] = (new_df['num_tags'] > 50).astype(int) - - # Bin number of tags - new_df['tag_count_category'] = pd.cut(new_df['num_tags'], - bins=[0, 10, 50, np.inf], - labels=['low', 'medium', 'high']) - - # Binary feature for having similar tracks - new_df['has_similar_tracks'] = (new_df['num_similar_tracks'] > 0).astype(int) - - # Bin number of similar tracks - new_df['similar_tracks_category'] = pd.cut(new_df['num_similar_tracks'], - bins=[0, 50, 100, np.inf], - labels=['low', 'medium', 'high']) - - # Interaction features - new_df['log_playcount_x_num_tags'] = new_df['log_playcount'] * new_df['num_tags'] - new_df['log_playcount_x_num_similar_tracks'] = new_df['log_playcount'] * new_df['num_similar_tracks'] - - return new_df - -def drop_columns(features, df): - new_df = df.copy() - imputed_data_with_more_features = new_df.drop(features, axis=1) - - return imputed_data_with_more_features - -def correlation_map(df): - new_df = df.copy() - - plot_correlation_map(new_df) - -def refine_features(df): - # Create a copy of the dataframe - new_df = df.copy() - - # Drop redundant features - new_df = new_df.drop(['playcount', 'binned_playcount', 'high_tag_count', - 'log_playcount_x_num_tags', 'log_playcount_x_num_similar_tracks'], axis=1) - - # Create artist-based features - new_df['artist_avg_playcount'] = new_df.groupby('artist')['log_playcount'].transform('mean') - new_df['artist_track_count'] = new_df.groupby('artist')['name'].transform('count') - - # Create features for top N tags - top_tags = new_df['tags'].str.split(', ', expand=True).stack().value_counts().nlargest(10).index - for tag in top_tags: - new_df[f'has_tag_{tag}'] = new_df['tags'].str.contains(tag).astype(int) - - return new_df + df['high_tag_count'] = (df['num_tags'] > df['num_tags'].median()).astype(int) + df['has_similar_tracks'] = (df['num_similar_tracks'] > 0).astype(int) + df['log_playcount_x_num_tags'] = df['log_playcount'] * df['num_tags'] + return df def add_tag_popularity(df): - # Split tags and create a dataframe - tag_df = df['tags'].str.split(', ', expand=True).melt(value_name='tag').dropna() + tag_df = df['tags'].str.split(',', expand=True).melt(value_name='tag').dropna() tag_df = tag_df.merge(df[['log_playcount']], left_index=True, right_index=True) - - # Calculate 
tag popularity tag_popularity = tag_df.groupby('tag')['log_playcount'].mean().sort_values(ascending=False) - - # Function to calculate average tag popularity - def avg_tag_popularity(tags): - if not tags: - return 0 - tags_list = tags.split(', ') - # Only consider tags that are in tag_popularity - valid_tags = [tag for tag in tags_list if tag in tag_popularity.index] - if not valid_tags: - return 0 - return tag_popularity[valid_tags].mean() - - # Add tag popularity to main dataframe - df['avg_tag_popularity'] = df['tags'].apply(avg_tag_popularity) - + df['avg_tag_popularity'] = df['tags'].apply(lambda x: tag_popularity[x.split(',')].mean() if x else 0) return df def add_similar_tracks_avg_playcount(df): - # Create a dictionary of track name to log_playcount track_playcount = dict(zip(df['name'], df['log_playcount'])) - - # Function to get average playcount of similar tracks - def get_avg_playcount(similar_tracks): - playcounts = [track_playcount.get(track.strip(), 0) for track in similar_tracks.split(', ')] - return sum(playcounts) / len(playcounts) if playcounts else 0 - - df['avg_similar_tracks_playcount'] = df['similar_tracks'].apply(get_avg_playcount) - + df['avg_similar_tracks_playcount'] = df['similar_tracks'].apply( + lambda x: np.mean([track_playcount.get(t.strip(), 0) for t in x.split(',')]) if x else 0 + ) return df -def add_interaction_features(df): - df['num_tags_x_avg_similar_tracks_playcount'] = df['num_tags'] * df['avg_similar_tracks_playcount'] - - return df - -def add_target_encoding(df): - # Calculate mean log_playcount for each artist - artist_means = df.groupby('artist')['log_playcount'].mean() - - # Calculate global mean - global_mean = df['log_playcount'].mean() - - # Function to encode with smoothing - def encode(artist): - n = df[df['artist'] == artist].shape[0] - return (n * artist_means.get(artist, global_mean) + global_mean) / (n + 1) - - # Apply encoding - df['artist_target_encoded'] = df['artist'].apply(encode) - - return df - -def refine_features_further(df): - # Combine redundant features - df['has_tag_favorites_combined'] = df[['has_tag_favorites', 'has_tag_Favorite']].max(axis=1) - - # drop low variance features - df = df.drop(['has_tag_favorites', 'has_tag_Favorite', 'has_tag_MySpotigramBot'], axis=1) - - # Create a composite tag popularity score - tag_columns = [col for col in df.columns if col.startswith('has_tag_')] - df['tag_popularity_score'] = df[tag_columns].mean(axis=1) - - return df - -def review_categorical_features(df): - cat_cols = ['tag_count_category', 'similar_tracks_category'] - for col in cat_cols: - print(f"\nValue counts for {col}:") - print(df[col].value_counts()) - -def analyze_vocabulary_sizes(df): - text_features = ['name', 'artist', 'tags', 'similar_tracks'] - for feature in text_features: - unique_terms = set() - for text in df[feature]: - unique_terms.update(text.split(',')) - print(f"{feature} unique terms: {len(unique_terms)}") - -def remove_pretfidf_cols(df): - # Identify and remove previously vectorized track name features - name_tfidf_columns = [col for col in df.columns if col.startswith('name_tfidf_')] - refined_data_new = refined_data_new.drop(columns=name_tfidf_columns) - - return refined_data_new - - -def vectorize_all_text_features(df, max_features_dict=None): - if max_features_dict is None: - max_features_dict = { - 'artist': None, # This will use all unique artists - 'tags': 300, - 'similar_tracks': 500 - } - - # Check if 'name' has already been vectorized - if 'name' not in max_features_dict and not 
any(col.startswith('name_tfidf_') for col in df.columns): - max_features_dict['name'] = 8000 - - text_features = list(max_features_dict.keys()) - vectorized_dfs = [] +def vectorize_text_features(df, max_features_dict): vectorizers = {} + for feature, max_features in max_features_dict.items(): + vectorizer = TfidfVectorizer(max_features=max_features) + vectorized = vectorizer.fit_transform(df[feature].fillna('')) + df = pd.concat([df, pd.DataFrame(vectorized.toarray(), columns=[f'{feature}_tfidf_{i}' for i in range(vectorized.shape[1])], index=df.index)], axis=1) + vectorizers[feature] = vectorizer + return df, vectorizers + +def feature_selection(X, y, k=1000): + selector = SelectKBest(f_regression, k=k) + X_new = selector.fit_transform(X, y) + selected_features = X.columns[selector.get_support()] + return X_new, selected_features + +def process_chunk(chunk, feature_engineering_pipeline): + return feature_engineering_pipeline(chunk) + +def feature_engineering_pipeline(df): + df = engineer_basic_features(df) + df = engineer_additional_features(df) + df = add_tag_popularity(df) + df = add_similar_tracks_avg_playcount(df) + df, _ = vectorize_text_features(df, {'tags': 300, 'similar_tracks': 500}) + return df - for feature in text_features: - if feature in ['name', 'artist']: - # Treat each unique value as a document - unique_values = df[feature].unique() - text_data = pd.Series(unique_values) - else: - text_data = df[feature].fillna('') - - tfidf = TfidfVectorizer(max_features=max_features_dict[feature]) - tfidf_matrix = tfidf.fit_transform(text_data) +def main(project_id, input_dataset_id, input_table_id, output_dataset_id, output_table_id): + try: + logger.info("Starting feature engineering process") + df = load_data_from_bigquery(project_id, input_dataset_id, input_table_id) - feature_df = pd.DataFrame( - tfidf_matrix.toarray(), - columns=[f'{feature}_tfidf_{i}' for i in range(tfidf_matrix.shape[1])] - ) - - if feature in ['name', 'artist']: - # Map the vectorized features back to the original dataframe - feature_to_vector = dict(zip(unique_values, feature_df.values)) - vectorized_feature = df[feature].map(lambda x: feature_to_vector.get(x, np.zeros(max_features_dict[feature]))) - feature_df = pd.DataFrame(vectorized_feature.tolist(), - columns=feature_df.columns, - index=df.index) - else: - feature_df.index = df.index - - vectorized_dfs.append(feature_df) - vectorizers[feature] = tfidf - - df_vectorized = pd.concat([df] + vectorized_dfs, axis=1) - - return df_vectorized, vectorizers - - -def get_final_features(df): - # Prepare final feature set - original_text_cols = ['name', 'artist', 'tags', 'similar_tracks'] - feature_cols = [col for col in df.columns if col not in original_text_cols] - print("Final feature set:") - print(feature_cols) - return df[feature_cols] - - -def create_preprocessing_pipeline(df, n_components=100): - # Identify different types of columns - numeric_features = df.select_dtypes(include=[np.number]).columns.tolist() - categorical_features = df.select_dtypes(include=['object']).columns.tolist() - tfidf_features = [col for col in df.columns if '_tfidf_' in col] - - # Create the preprocessing steps - numeric_transformer = Pipeline(steps=[ - ('imputer', SimpleImputer(strategy='mean')), - ('scaler', StandardScaler()) - ]) - - categorical_transformer = Pipeline(steps=[ - ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), - ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) - ]) - - tfidf_transformer = Pipeline(steps=[ - ('svd', 
TruncatedSVD(n_components=min(n_components, len(tfidf_features)), algorithm='randomized', n_iter=5, random_state=42)) - ]) - - # Combine all the preprocessing steps - preprocessor = ColumnTransformer( - transformers=[ - ('num', numeric_transformer, numeric_features), - ('cat', categorical_transformer, categorical_features), - ('tfidf', tfidf_transformer, tfidf_features) - ]) - - # Create the full pipeline - full_pipeline = Pipeline(steps=[ - ('preprocessor', preprocessor), - ('replace_inf', FunctionTransformer(lambda X: np.nan_to_num(X, nan=0, posinf=0, neginf=0))) - ]) - - return full_pipeline - -def get_feature_names(input_features, n_components): - feature_names = [] - numeric_features = input_features.select_dtypes(include=[np.number]).columns.tolist() - categorical_features = input_features.select_dtypes(include=['object']).columns.tolist() - tfidf_features = [col for col in input_features.columns if '_tfidf_' in col] - - feature_names.extend(numeric_features) - - for cat_feature in categorical_features: - unique_values = input_features[cat_feature].unique() - feature_names.extend([f"{cat_feature}_{value}" for value in unique_values]) - - feature_names.extend([f'svd_tfidf_{i}' for i in range(min(n_components, len(tfidf_features)))]) - - return feature_names - -def analyze_feature_importance_and_reduce_dimensions(df, n_components=4000): - # Perform Truncated SVD - svd = TruncatedSVD(n_components=n_components, random_state=42) - svd_result = svd.fit_transform(df) - - # Analyze feature importance - feature_importance = np.sum(np.abs(svd.components_), axis=0) - feature_importance = 100.0 * (feature_importance / feature_importance.sum()) - - # Create a DataFrame of feature importance - feature_importance_df = pd.DataFrame({ - 'feature': df.columns, - 'importance': feature_importance - }).sort_values('importance', ascending=False) - - # Plot feature importance - plt.figure(figsize=(12, 6)) - plt.bar(range(20), feature_importance_df['importance'][:20]) - plt.xticks(range(20), feature_importance_df['feature'][:20], rotation=90) - plt.xlabel('Features') - plt.ylabel('Relative Importance (%)') - plt.title('Top 20 Most Important Features') - plt.tight_layout() - plt.show() - - # Print top 20 most important features - print("Top 20 most important features:") - print(feature_importance_df.head(20)) - - # Plot cumulative explained variance ratio - plt.figure(figsize=(10, 6)) - plt.plot(np.cumsum(svd.explained_variance_ratio_)) - plt.xlabel('Number of Components') - plt.ylabel('Cumulative Explained Variance Ratio') - plt.title('Explained Variance Ratio by Number of Components') - plt.tight_layout() - plt.show() - - # Create DataFrame with reduced dimensions - columns = [f'SVD_{i+1}' for i in range(svd_result.shape[1])] - df_svd = pd.DataFrame(svd_result, columns=columns, index=df.index) - - print(f"Shape after Truncated SVD: {df_svd.shape}") - print(f"Cumulative explained variance ratio: {np.sum(svd.explained_variance_ratio_):.4f}") - - return df_svd, svd, feature_importance_df - + # Process data in chunks to optimize memory usage + chunk_size = 10000 + with mp.Pool(mp.cpu_count()) as pool: + processed_chunks = pool.map( + partial(process_chunk, feature_engineering_pipeline=feature_engineering_pipeline), + [df[i:i+chunk_size] for i in range(0, len(df), chunk_size)] + ) + + df_processed = pd.concat(processed_chunks) + + # Perform feature selection + X = df_processed.drop(['name', 'artist', 'tags', 'similar_tracks', 'playcount'], axis=1) + y = df_processed['log_playcount'] + X_selected, 
selected_features = feature_selection(X, y) + + df_final = pd.concat([df_processed[['name', 'artist', 'tags', 'similar_tracks', 'playcount']], pd.DataFrame(X_selected, columns=selected_features, index=df_processed.index)], axis=1) + + write_to_bigquery(df_final, project_id, output_dataset_id, output_table_id) + logger.info("Feature engineering completed successfully") + + except Exception as e: + logger.error(f"Error in feature engineering process: {e}") + raise + +if __name__ == "__main__": + project_id = "your-project-id" + input_dataset_id = "lastfm_dataset" + input_table_id = "processed_top_tracks" + output_dataset_id = "lastfm_dataset" + output_table_id = "engineered_top_tracks" + main(project_id, input_dataset_id, input_table_id, output_dataset_id, output_table_id) diff --git a/src/utils/data_versioning.py b/src/utils/data_versioning.py new file mode 100644 index 0000000..b2421d5 --- /dev/null +++ b/src/utils/data_versioning.py @@ -0,0 +1,25 @@ +from google.cloud import bigquery +from datetime import datetime + +def version_dataset(project_id, source_dataset_id, source_table_id, target_dataset_id): + client = bigquery.Client(project=project_id) + + # Create a new table name with timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + target_table_id = f"{source_table_id}_v{timestamp}" + + # Construct the query to copy data + query = f""" + CREATE OR REPLACE TABLE `{project_id}.{target_dataset_id}.{target_table_id}` + AS SELECT * FROM `{project_id}.{source_dataset_id}.{source_table_id}` + """ + + # Run the query + job = client.query(query) + job.result() # Wait for the job to complete + + print(f"Dataset versioned: {target_dataset_id}.{target_table_id}") + return f"{target_dataset_id}.{target_table_id}" + +# Use this function after each major data processing step +# version_dataset(project_id, 'source_dataset', 'source_table', 'versioned_dataset') \ No newline at end of file diff --git a/tests/test_content_based.py b/tests/test_content_based.py new file mode 100644 index 0000000..94670e2 --- /dev/null +++ b/tests/test_content_based.py @@ -0,0 +1,156 @@ +import unittest +from unittest.mock import patch, MagicMock +import numpy as np +import tensorflow as tf +from src.algorithms.content_based import ( + cosine_similarity, + mean_average_precision, + average_precision, + FilteredCallback, + train_model, + evaluate_model, + find_similar_tracks, + main +) + +class TestContentBased(unittest.TestCase): + + def setUp(self): + self.y_true = np.array([[1, 0, 1], [0, 1, 1], [1, 1, 0]]) + self.y_pred = np.array([[0.9, 0.1, 0.8], [0.2, 0.7, 0.6], [0.8, 0.3, 0.1]]) + + def test_cosine_similarity(self): + similarity = cosine_similarity(self.y_true, self.y_pred) + self.assertIsInstance(similarity, tf.Tensor) + + def test_mean_average_precision(self): + map_score = mean_average_precision(self.y_true, self.y_pred) + self.assertIsInstance(map_score, float) + self.assertTrue(0 <= map_score <= 1) + + def test_average_precision(self): + ap_score = average_precision(self.y_true[0], self.y_pred[0]) + self.assertIsInstance(ap_score, float) + self.assertTrue(0 <= ap_score <= 1) + + @patch('src.algorithms.content_based.tf.keras.callbacks.ModelCheckpoint') + def test_filtered_callback(self, mock_model_checkpoint): + filtered_callback = FilteredCallback(filepath='test_path') + self.assertIsInstance(filtered_callback, tf.keras.callbacks.ModelCheckpoint) + + @patch('src.algorithms.content_based.EarlyStopping') + @patch('src.algorithms.content_based.FilteredCallback') + def test_train_model(self, 
mock_filtered_callback, mock_early_stopping): + mock_model = MagicMock() + mock_model.fit.return_value = MagicMock(history={'loss': [0.1], 'val_loss': [0.2]}) + + X_train = np.random.rand(100, 10) + y_train = np.random.rand(100, 3) + X_val = np.random.rand(20, 10) + y_val = np.random.rand(20, 3) + + history = train_model(mock_model, X_train, y_train, X_val, y_val) + + mock_model.fit.assert_called_once() + self.assertIn('loss', history.history) + self.assertIn('val_loss', history.history) + + @patch('src.algorithms.content_based.precision_score') + @patch('src.algorithms.content_based.recall_score') + @patch('src.algorithms.content_based.f1_score') + @patch('src.algorithms.content_based.ndcg_score') + def test_evaluate_model(self, mock_ndcg, mock_f1, mock_recall, mock_precision): + mock_model = MagicMock() + mock_model.predict.return_value = self.y_pred + mock_model.evaluate.return_value = [0.1, 0.9] + + mock_precision.return_value = 0.8 + mock_recall.return_value = 0.7 + mock_f1.return_value = 0.75 + mock_ndcg.return_value = 0.85 + + metrics = evaluate_model(mock_model, self.y_true, self.y_true) + + self.assertIn('test_loss', metrics) + self.assertIn('test_accuracy', metrics) + self.assertIn('cosine_similarity', metrics) + self.assertIn('mean_average_precision', metrics) + self.assertIn('ndcg', metrics) + self.assertIn('precision', metrics) + self.assertIn('recall', metrics) + self.assertIn('f1_score', metrics) + + def test_find_similar_tracks(self): + mock_model = MagicMock() + mock_model.predict.side_effect = [ + np.array([[0.1, 0.2, 0.3]]), # track_embedding + np.array([[0.1, 0.2, 0.3], [0.2, 0.3, 0.4], [0.3, 0.4, 0.5]]) # all_embeddings + ] + + track_features = np.array([1, 2, 3]) + all_features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + track_names = ['Track1', 'Track2', 'Track3'] + + similar_tracks = find_similar_tracks(mock_model, track_features, all_features, track_names, n=2) + + self.assertEqual(len(similar_tracks), 2) + self.assertIsInstance(similar_tracks[0], tuple) + self.assertIsInstance(similar_tracks[0][0], str) + self.assertIsInstance(similar_tracks[0][1], float) + + @patch('src.algorithms.content_based.prepare_data') + @patch('src.algorithms.content_based.build_content_based_model') + @patch('src.algorithms.content_based.train_model') + @patch('src.algorithms.content_based.evaluate_model') + @patch('src.algorithms.content_based.json.dump') + def test_main(self, mock_json_dump, mock_evaluate, mock_train, mock_build, mock_prepare): + mock_prepare.return_value = ( + np.random.rand(100, 10), np.random.rand(20, 10), np.random.rand(30, 10), + np.random.rand(100, 3), np.random.rand(20, 3), np.random.rand(30, 3), + ['name1', 'name2'], ['name3'], ['name4', 'name5'], + MagicMock(), MagicMock() + ) + + mock_model = MagicMock() + mock_build.return_value = mock_model + + mock_history = MagicMock() + mock_history.history = { + 'loss': [0.1], 'binary_accuracy': [0.9], + 'val_loss': [0.2], 'val_binary_accuracy': [0.8], + 'val_cosine_similarity': [-0.7] + } + mock_train.return_value = mock_history + + mock_evaluate.return_value = { + 'test_loss': 0.15, 'test_accuracy': 0.85, + 'cosine_similarity': 0.8, 'mean_average_precision': 0.75, + 'ndcg': 0.9, 'precision': 0.8, 'recall': 0.7, 'f1_score': 0.75 + } + + model, metrics = main('feat_eng_data.csv', 'original_df.csv', 2, 64, 32, 0.001, 32, 0.2) + + mock_prepare.assert_called_once() + mock_build.assert_called_once() + mock_train.assert_called_once() + mock_evaluate.assert_called_once() + mock_json_dump.assert_called_once() + + 
self.assertIsInstance(model, MagicMock) + self.assertIsInstance(metrics, dict) + self.assertIn('final_loss', metrics) + self.assertIn('final_accuracy', metrics) + self.assertIn('final_val_loss', metrics) + self.assertIn('final_val_accuracy', metrics) + self.assertIn('val_cosine_similarity', metrics) + self.assertIn('test_loss', metrics) + self.assertIn('test_accuracy', metrics) + self.assertIn('cosine_similarity', metrics) + self.assertIn('mean_average_precision', metrics) + self.assertIn('ndcg', metrics) + self.assertIn('precision', metrics) + self.assertIn('recall', metrics) + self.assertIn('f1_score', metrics) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_data_prep.py b/tests/test_data_prep.py new file mode 100644 index 0000000..3499207 --- /dev/null +++ b/tests/test_data_prep.py @@ -0,0 +1,81 @@ +import unittest +from unittest.mock import patch, MagicMock +import pandas as pd +import numpy as np +from google.cloud import bigquery +from src.data_processing.data_prep import ( + load_data_from_bigquery, + prepare_data, + save_prepared_data, + create_and_populate_feature_store +) + +class TestDataPrep(unittest.TestCase): + + def setUp(self): + self.sample_df = pd.DataFrame({ + 'artist': ['Artist1', 'Artist2'], + 'name': ['Song1', 'Song2'], + 'tags': ['rock, pop', 'jazz, blues'], + 'similar_tracks': ['Track1, Track2', 'Track3, Track4'], + 'playcount': [1000, 2000] + }) + + @patch('src.data_processing.data_prep.bigquery.Client') + def test_load_data_from_bigquery(self, mock_client): + mock_query_job = MagicMock() + mock_query_job.to_dataframe.return_value = self.sample_df + mock_client.return_value.query.return_value = mock_query_job + + result = load_data_from_bigquery('project_id', 'dataset_id', 'table_id') + + mock_client.assert_called_once_with(project='project_id') + mock_client.return_value.query.assert_called_once() + self.assertTrue(result.equals(self.sample_df)) + + def test_prepare_data(self): + preprocessed_df = self.sample_df.copy() + preprocessed_df['extra_feature'] = [1, 2] + + X_train, X_test, y_train, y_test, names_train, names_test, scaler, mlb = prepare_data(preprocessed_df, self.sample_df) + + self.assertEqual(X_train.shape[1], 1) # Only 'extra_feature' should be in X + self.assertEqual(y_train.shape[1], 4) # 4 unique tracks in similar_tracks + self.assertEqual(len(names_train), 1) # 80% of 2 samples + self.assertEqual(len(names_test), 1) # 20% of 2 samples + + @patch('src.data_processing.data_prep.np.save') + @patch('src.data_processing.data_prep.joblib.dump') + def test_save_prepared_data(self, mock_joblib_dump, mock_np_save): + X_train = np.array([[1, 2], [3, 4]]) + X_test = np.array([[5, 6]]) + y_train = np.array([[1, 0], [0, 1]]) + y_test = np.array([[1, 0]]) + names_train = np.array(['Song1', 'Song2']) + names_test = np.array(['Song3']) + scaler = MagicMock() + mlb = MagicMock() + + save_prepared_data(X_train, X_test, y_train, y_test, names_train, names_test, scaler, mlb, 'output_dir') + + self.assertEqual(mock_np_save.call_count, 6) # 6 numpy arrays saved + self.assertEqual(mock_joblib_dump.call_count, 2) # scaler and mlb saved + + @patch('src.data_processing.data_prep.aiplatform.init') + @patch('src.data_processing.data_prep.aiplatform.FeatureStore.create') + def test_create_and_populate_feature_store(self, mock_create_feature_store, mock_init): + mock_feature_store = MagicMock() + mock_create_feature_store.return_value = mock_feature_store + mock_entity_type = MagicMock() + 
mock_feature_store.create_entity_type.return_value = mock_entity_type + + create_and_populate_feature_store('project_id', 'region', 'feature_store_id', 'entity_type_id', self.sample_df) + + mock_init.assert_called_once_with(project='project_id', location='region') + mock_create_feature_store.assert_called_once() + mock_feature_store.create_entity_type.assert_called_once() + self.assertEqual(mock_entity_type.create_feature.call_count, 5) # 5 features in sample_df + mock_entity_type.ingest.assert_called_once() + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_data_processing.py b/tests/test_data_processing.py index 0731d3b..09ee228 100644 --- a/tests/test_data_processing.py +++ b/tests/test_data_processing.py @@ -2,7 +2,7 @@ import pandas as pd import numpy as np from src.data_processing.data_ingestion import fetch_lastfm_data -from src.data_processing.data_preprocess import ( +from src.data_processing.data_process import ( load_data, robust_string_parser, preprocess_data, one_hot_encode, impute_data, prepare_data, main ) diff --git a/tests/test_data_validation.py b/tests/test_data_validation.py new file mode 100644 index 0000000..9611163 --- /dev/null +++ b/tests/test_data_validation.py @@ -0,0 +1,94 @@ +import unittest +from unittest.mock import patch, MagicMock +import pandas as pd +import tensorflow_data_validation as tfdv +from src.data_processing.data_validation import ( + generate_schema, + validate_data, + compare_statistics, + detect_data_drift, + compare_schemas +) + +class TestDataValidation(unittest.TestCase): + + def setUp(self): + self.sample_df = pd.DataFrame({ + 'artist': ['Artist1', 'Artist2'], + 'name': ['Song1', 'Song2'], + 'tags': ['rock, pop', 'jazz, blues'], + 'similar_tracks': ['Track1, Track2', 'Track3, Track4'], + 'playcount': [1000, 2000] + }) + + @patch('src.data_processing.data_validation.tfdv.infer_schema') + @patch('src.data_processing.data_validation.save_schema_to_gcs') + def test_generate_schema(self, mock_save_schema, mock_infer_schema): + mock_schema = MagicMock() + mock_infer_schema.return_value = mock_schema + + result = generate_schema('project_id', 'dataset_id', 'table_id', 'bucket_name', 'model_name', 'version') + + mock_infer_schema.assert_called_once() + mock_save_schema.assert_called_once_with(mock_schema, 'bucket_name', 'model_name', 'version') + self.assertEqual(result, mock_schema) + + @patch('src.data_processing.data_validation.tfdv.generate_statistics_from_dataframe') + @patch('src.data_processing.data_validation.tfdv.validate_statistics') + @patch('src.data_processing.data_validation.save_statistics_to_gcs') + def test_validate_data(self, mock_save_stats, mock_validate_stats, mock_generate_stats): + mock_stats = MagicMock() + mock_generate_stats.return_value = mock_stats + mock_anomalies = MagicMock() + mock_validate_stats.return_value = mock_anomalies + mock_schema = MagicMock() + + stats, anomalies = validate_data('project_id', 'dataset_id', 'table_id', mock_schema, 'bucket_name', 'model_name', 'data_type') + + mock_generate_stats.assert_called_once() + mock_save_stats.assert_called_once_with(mock_stats, 'bucket_name', 'model_name', 'data_type') + mock_validate_stats.assert_called_once_with(mock_stats, mock_schema) + self.assertEqual(stats, mock_stats) + self.assertEqual(anomalies, mock_anomalies) + + @patch('src.data_processing.data_validation.tfdv.validate_statistics') + def test_compare_statistics(self, mock_validate_stats): + mock_train_stats = MagicMock() + mock_serving_stats = 
MagicMock() + mock_schema = MagicMock() + mock_anomalies = MagicMock() + mock_validate_stats.return_value = mock_anomalies + + result = compare_statistics(mock_train_stats, mock_serving_stats, mock_schema) + + mock_validate_stats.assert_called_once_with(mock_serving_stats, mock_schema, previous_statistics=mock_train_stats) + self.assertEqual(result, mock_anomalies) + + @patch('src.data_processing.data_validation.tfdv.compute_drift_skew') + def test_detect_data_drift(self, mock_compute_drift_skew): + mock_train_stats = MagicMock() + mock_serving_stats = MagicMock() + mock_schema = MagicMock() + mock_drift_skew = {'feature1': 0.1, 'feature2': 0.2} + mock_compute_drift_skew.return_value = mock_drift_skew + + result = detect_data_drift(mock_train_stats, mock_serving_stats, mock_schema, 0.15) + + mock_compute_drift_skew.assert_called_once_with(mock_train_stats, mock_serving_stats, mock_schema) + self.assertEqual(result, mock_drift_skew) + + def test_compare_schemas(self): + baseline_schema = tfdv.Schema() + baseline_schema.feature.add(name='feature1', type=tfdv.FeatureType.INT) + baseline_schema.feature.add(name='feature2', type=tfdv.FeatureType.FLOAT) + + current_schema = tfdv.Schema() + current_schema.feature.add(name='feature1', type=tfdv.FeatureType.INT) + current_schema.feature.add(name='feature3', type=tfdv.FeatureType.STRING) + + result = compare_schemas(baseline_schema, current_schema) + + self.assertTrue(result) # Schema drift detected + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_data_versioning.py b/tests/test_data_versioning.py new file mode 100644 index 0000000..c2adf21 --- /dev/null +++ b/tests/test_data_versioning.py @@ -0,0 +1,28 @@ +import unittest +from unittest.mock import patch, MagicMock +from src.utils.data_versioning import version_dataset + +class TestDataVersioning(unittest.TestCase): + @patch('src.utils.data_versioning.bigquery.Client') + def test_version_dataset(self, mock_client): + # Mock the BigQuery client + mock_job = MagicMock() + mock_client.return_value.query.return_value = mock_job + + # Call the function + result = version_dataset('test-project', 'source_dataset', 'source_table', 'versioned_dataset') + + # Assert that the query was called with the correct parameters + mock_client.return_value.query.assert_called_once() + query_call = mock_client.return_value.query.call_args[0][0] + self.assertIn('test-project.versioned_dataset', query_call) + self.assertIn('test-project.source_dataset.source_table', query_call) + + # Assert that the job's result method was called + mock_job.result.assert_called_once() + + # Assert that the function returns the correct table name + self.assertTrue(result.startswith('versioned_dataset.source_table_v')) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_feature_engineering.py b/tests/test_feature_engineering.py index 34eb996..1e8cac9 100644 --- a/tests/test_feature_engineering.py +++ b/tests/test_feature_engineering.py @@ -1,189 +1,62 @@ import unittest import pandas as pd import numpy as np -from src.feature_engineering.feat_engineering import (engineer_basic_features, engineer_additional_features, - refine_features, add_tag_popularity, - add_similar_tracks_avg_playcount, add_interaction_features, - add_target_encoding, refine_features_further, - vectorize_all_text_features) +from unittest.mock import patch, MagicMock +from src.feature_engineering.feat_engineering import ( + engineer_basic_features, + 
engineer_additional_features, + add_tag_popularity, + add_similar_tracks_avg_playcount, + refine_features, + vectorize_all_text_features +) class TestFeatureEngineering(unittest.TestCase): def setUp(self): - # Create a sample DataFrame for testing - self.df = pd.DataFrame({ - 'name': ['Track1', 'Track2', 'Track3'], - 'artist': ['Artist1', 'Artist2', 'Artist1'], - 'playcount': [100, 200, 300], - 'tags': ['rock, pop', 'jazz, blues', 'rock, metal'], - 'similar_tracks': ['Track2, Track3', 'Track1, Track3', 'Track1, Track2'] + self.sample_df = pd.DataFrame({ + 'artist': ['Artist1', 'Artist2'], + 'name': ['Song1', 'Song2'], + 'tags': ['rock, pop', 'jazz, blues'], + 'similar_tracks': ['Track1, Track2', 'Track3, Track4'], + 'playcount': [1000, 2000] }) def test_engineer_basic_features(self): - result = engineer_basic_features(self.df) - - # Check exact values - np.testing.assert_almost_equal(result['log_playcount'].values, - np.log1p([100, 200, 300])) - - # Check types - self.assertTrue(np.issubdtype(result['log_playcount'].dtype, np.number)) - self.assertTrue(np.issubdtype(result['binned_playcount'].dtype, np.integer)) - - # Check edge case: empty DataFrame - empty_df = pd.DataFrame(columns=self.df.columns) - empty_result = engineer_basic_features(empty_df) - self.assertTrue(empty_result.empty) + result = engineer_basic_features(self.sample_df) + self.assertIn('log_playcount', result.columns) + self.assertIn('num_tags', result.columns) + self.assertIn('num_similar_tracks', result.columns) def test_engineer_additional_features(self): - df_basic = engineer_basic_features(self.df) - result = engineer_additional_features(df_basic) - - # Check exact values - self.assertEqual(result['high_tag_count'].tolist(), [0, 0, 0]) - self.assertEqual(result['tag_count_category'].tolist(), ['low', 'low', 'low']) - - # Check types - self.assertTrue(np.issubdtype(result['high_tag_count'].dtype, np.integer)) - self.assertTrue(pd.api.types.is_categorical_dtype(result['tag_count_category'])) + basic_features = engineer_basic_features(self.sample_df) + result = engineer_additional_features(basic_features) + self.assertIn('tag_count_category', result.columns) + self.assertIn('similar_tracks_category', result.columns) - def test_refine_features(self): - df_basic = engineer_basic_features(self.df) - df_additional = engineer_additional_features(df_basic) - result = refine_features(df_additional) - - # Check exact values - np.testing.assert_almost_equal(result['artist_avg_playcount'].values, - [np.log1p(200), np.log1p(200), np.log1p(200)]) - - # Check types - self.assertTrue(np.issubdtype(result['artist_track_count'].dtype, np.integer)) - - def test_add_tag_popularity(self): - result = add_tag_popularity(self.df) - - # Check if tag popularity is calculated correctly - self.assertGreater(result.loc[0, 'avg_tag_popularity'], - result.loc[1, 'avg_tag_popularity']) - - # Test with missing values - df_with_missing = self.df.copy() - df_with_missing.loc[0, 'tags'] = np.nan - result_missing = add_tag_popularity(df_with_missing) - self.assertEqual(result_missing.loc[0, 'avg_tag_popularity'], 0) + @patch('src.feature_engineering.feat_engineering.pd.DataFrame.merge') + def test_add_tag_popularity(self, mock_merge): + mock_merge.return_value = self.sample_df + result = add_tag_popularity(self.sample_df) + self.assertIn('avg_tag_popularity', result.columns) def test_add_similar_tracks_avg_playcount(self): - result = add_similar_tracks_avg_playcount(self.df) - - # Check exact values - expected_avg = (np.log1p(200) + np.log1p(300)) / 2 - 
self.assertAlmostEqual(result.loc[0, 'avg_similar_tracks_playcount'], expected_avg) - - def test_add_interaction_features(self): - df_with_avg = add_similar_tracks_avg_playcount(self.df) - df_with_avg['num_tags'] = df_with_avg['tags'].str.count(',') + 1 - result = add_interaction_features(df_with_avg) - - # Check exact values - expected_interaction = (df_with_avg['num_tags'] * df_with_avg['avg_similar_tracks_playcount']).values - np.testing.assert_almost_equal(result['num_tags_x_avg_similar_tracks_playcount'].values, expected_interaction) - - def test_add_target_encoding(self): - df_with_log = engineer_basic_features(self.df) - result = add_target_encoding(df_with_log) - - # Check if encoding is smooth - self.assertNotEqual(result['artist_target_encoded'].nunique(), result['artist'].nunique()) - - def test_refine_features_further(self): - df_refined = refine_features(self.df) - df_refined['has_tag_favorites'] = [1, 0, 1] - df_refined['has_tag_Favorite'] = [0, 1, 0] - df_refined['has_tag_MySpotigramBot'] = [1, 1, 1] - result = refine_features_further(df_refined) - - # Check exact values - np.testing.assert_array_equal(result['has_tag_favorites_combined'].values, [1, 1, 1]) - - # Check if low variance feature is dropped - self.assertNotIn('has_tag_MySpotigramBot', result.columns) - - def test_vectorize_all_text_features(self): - result, vectorizers = vectorize_all_text_features(self.df) - - # Check if vectorization produces expected number of features - self.assertEqual(sum(1 for col in result.columns if col.startswith('name_tfidf_')), 3) - self.assertEqual(sum(1 for col in result.columns if col.startswith('artist_tfidf_')), 2) - - # Test with custom max_features - result_custom, _ = vectorize_all_text_features(self.df, {'tags': 1, 'similar_tracks': 1}) - self.assertEqual(sum(1 for col in result_custom.columns if col.startswith('tags_tfidf_')), 1) - self.assertEqual(sum(1 for col in result_custom.columns if col.startswith('similar_tracks_tfidf_')), 1) - - def test_edge_cases(self): - # Test with empty DataFrame - empty_df = pd.DataFrame(columns=self.df.columns) - self.assertTrue(engineer_basic_features(empty_df).empty) - self.assertTrue(engineer_additional_features(empty_df).empty) - self.assertTrue(refine_features(empty_df).empty) - - # Test with missing values - df_with_missing = self.df.copy() - df_with_missing.loc[0, 'playcount'] = np.nan - df_with_missing.loc[1, 'tags'] = np.nan - df_with_missing.loc[2, 'similar_tracks'] = np.nan - - result_basic = engineer_basic_features(df_with_missing) - self.assertTrue(np.isnan(result_basic.loc[0, 'log_playcount'])) - self.assertEqual(result_basic.loc[1, 'num_tags'], 0) - self.assertEqual(result_basic.loc[2, 'num_similar_tracks'], 0) + result = add_similar_tracks_avg_playcount(self.sample_df) + self.assertIn('avg_similar_tracks_playcount', result.columns) - def test_output_types_and_shapes(self): - result = engineer_basic_features(self.df) - result = engineer_additional_features(result) - result = refine_features(result) - result = add_tag_popularity(result) - result = add_similar_tracks_avg_playcount(result) - result = add_interaction_features(result) - result = add_target_encoding(result) - result = refine_features_further(result) - result, _ = vectorize_all_text_features(result) - - # Check output types - self.assertTrue(isinstance(result, pd.DataFrame)) - self.assertTrue(all(np.issubdtype(result[col].dtype, np.number) for col in result.columns - if not pd.api.types.is_categorical_dtype(result[col]))) - - # Check output shape - 
self.assertEqual(result.shape[0], self.df.shape[0]) - self.assertGreater(result.shape[1], self.df.shape[1]) - - def test_consistency(self): - # Run the feature engineering process twice and compare results - result1 = engineer_basic_features(self.df) - result1 = engineer_additional_features(result1) - result1 = refine_features(result1) - - result2 = engineer_basic_features(self.df) - result2 = engineer_additional_features(result2) - result2 = refine_features(result2) - - pd.testing.assert_frame_equal(result1, result2) - - def test_no_unexpected_nan(self): - result = engineer_basic_features(self.df) - result = engineer_additional_features(result) - result = refine_features(result) - result = add_tag_popularity(result) - result = add_similar_tracks_avg_playcount(result) - result = add_interaction_features(result) - result = add_target_encoding(result) - result = refine_features_further(result) - result, _ = vectorize_all_text_features(result) - - # Check for unexpected NaN values - unexpected_nan = result.isna().sum() - self.assertTrue(all(unexpected_nan == 0), f"Unexpected NaN values found: {unexpected_nan[unexpected_nan > 0]}") + def test_refine_features(self): + result = refine_features(self.sample_df) + self.assertIn('artist_avg_playcount', result.columns) + self.assertIn('artist_track_count', result.columns) + + @patch('src.feature_engineering.feat_engineering.TfidfVectorizer') + def test_vectorize_all_text_features(self, mock_tfidf): + mock_tfidf.return_value.fit_transform.return_value.toarray.return_value = np.array([[1, 0], [0, 1]]) + result, _ = vectorize_all_text_features(self.sample_df) + self.assertTrue(any(col.startswith('artist_tfidf_') for col in result.columns)) + self.assertTrue(any(col.startswith('name_tfidf_') for col in result.columns)) + self.assertTrue(any(col.startswith('tags_tfidf_') for col in result.columns)) + self.assertTrue(any(col.startswith('similar_tracks_tfidf_') for col in result.columns)) if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/tests/test_feature_store.py b/tests/test_feature_store.py new file mode 100644 index 0000000..2b1c4a4 --- /dev/null +++ b/tests/test_feature_store.py @@ -0,0 +1,32 @@ +import unittest +from unittest.mock import patch, MagicMock +from google.cloud import aiplatform +from src.data_processing.data_prep import create_and_populate_feature_store + +class TestFeatureStore(unittest.TestCase): + + @patch('src.data_processing.data_prep.aiplatform.init') + @patch('src.data_processing.data_prep.aiplatform.FeatureStore.create') + def test_create_and_populate_feature_store(self, mock_create_feature_store, mock_init): + mock_feature_store = MagicMock() + mock_create_feature_store.return_value = mock_feature_store + mock_entity_type = MagicMock() + mock_feature_store.create_entity_type.return_value = mock_entity_type + + # Mock DataFrame + df = MagicMock() + df.to_dict.return_value = [{'feature1': 'value1', 'feature2': 'value2'}] + df.index.tolist.return_value = ['entity1'] + + create_and_populate_feature_store('project_id', 'region', 'feature_store_id', 'entity_type_id', df) + + mock_init.assert_called_once_with(project='project_id', location='region') + mock_create_feature_store.assert_called_once() + mock_feature_store.create_entity_type.assert_called_once() + mock_entity_type.create_feature.assert_called() + mock_entity_type.ingest.assert_called_once() + + # Add more tests for other feature store operations + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git 
a/tests/test_hyperparameter_tuning.py b/tests/test_hyperparameter_tuning.py index 00741f3..eb47869 100644 --- a/tests/test_hyperparameter_tuning.py +++ b/tests/test_hyperparameter_tuning.py @@ -1,58 +1,71 @@ import unittest from unittest.mock import patch, MagicMock -from src.hyperparameter_tuning.katib_tuning import create_experiment, run_hyperparameter_tuning +import yaml +from src.hyperparameter_tuning.katib_tuning import ( + load_config, + create_experiment, + run_hyperparameter_tuning, + main +) class TestHyperparameterTuning(unittest.TestCase): + def setUp(self): self.mock_config = { 'hyperparameter_tuning': { 'max_trials': 10, 'parameters': { - 'hidden_layers': {'min': 1, 'max': 3}, + 'hidden_layers': {'min': 1, 'max': 5}, 'neurons': {'min': 32, 'max': 256}, 'learning_rate': {'min': 0.0001, 'max': 0.1} } } } - def test_create_experiment(self): - experiment = create_experiment('test-experiment', 'default', 'train_data.csv', 'val_data.csv', self.mock_config) + @patch('src.hyperparameter_tuning.katib_tuning.open') + def test_load_config(self, mock_open): + mock_open.return_value.__enter__.return_value = MagicMock() + mock_yaml_load = MagicMock(return_value=self.mock_config) + with patch('src.hyperparameter_tuning.katib_tuning.yaml.safe_load', mock_yaml_load): + config = load_config() + self.assertEqual(config, self.mock_config) + + @patch('src.hyperparameter_tuning.katib_tuning.load_config') + def test_create_experiment(self, mock_load_config): + mock_load_config.return_value = self.mock_config + experiment = create_experiment("test-experiment", "default", "train_data.csv", "val_data.csv") - self.assertEqual(experiment['metadata']['name'], 'test-experiment') - self.assertEqual(experiment['metadata']['namespace'], 'default') + self.assertEqual(experiment['metadata']['name'], "test-experiment") + self.assertEqual(experiment['metadata']['namespace'], "default") self.assertEqual(experiment['spec']['maxTrialCount'], 10) - - parameters = experiment['spec']['parameters'] - self.assertEqual(len(parameters), 6) # hidden_layers, neurons, embedding_dim, learning_rate, batch_size, dropout_rate - - hidden_layers_param = next(p for p in parameters if p['name'] == 'hidden_layers') - self.assertEqual(hidden_layers_param['feasibleSpace']['min'], '1') - self.assertEqual(hidden_layers_param['feasibleSpace']['max'], '3') + self.assertIn('parameters', experiment['spec']) + self.assertEqual(len(experiment['spec']['parameters']), 6) @patch('src.hyperparameter_tuning.katib_tuning.KatibClient') - def test_run_hyperparameter_tuning(self, mock_katib_client): + @patch('src.hyperparameter_tuning.katib_tuning.create_experiment') + def test_run_hyperparameter_tuning(self, mock_create_experiment, mock_katib_client): mock_client = MagicMock() mock_katib_client.return_value = mock_client - - mock_client.get_optimal_hyperparameters.return_value = { - 'currentOptimalTrial': { - 'parameterAssignments': [ - {'name': 'hidden_layers', 'value': '2'}, - {'name': 'neurons', 'value': '128'}, - {'name': 'learning_rate', 'value': '0.001'} - ] - } - } - - results = run_hyperparameter_tuning('train_data.csv', 'val_data.csv', 'config.yaml') - - mock_katib_client.assert_called_once() + mock_create_experiment.return_value = {"metadata": {"name": "test-experiment"}} + mock_client.get_optimal_hyperparameters.return_value = {"bestTrialName": "test-trial"} + + results = run_hyperparameter_tuning("train_data.csv", "val_data.csv") + + mock_create_experiment.assert_called_once() mock_client.create_experiment.assert_called_once() 
mock_client.wait_for_experiment.assert_called_once() mock_client.get_optimal_hyperparameters.assert_called_once() - - self.assertIn('currentOptimalTrial', results) - self.assertIn('parameterAssignments', results['currentOptimalTrial']) + self.assertEqual(results, {"bestTrialName": "test-trial"}) + + @patch('src.hyperparameter_tuning.katib_tuning.run_hyperparameter_tuning') + def test_main(self, mock_run_hyperparameter_tuning): + mock_results = {"bestTrialName": "test-trial"} + mock_run_hyperparameter_tuning.return_value = mock_results + + results = main("train_data.csv", "val_data.csv") + + mock_run_hyperparameter_tuning.assert_called_once_with("train_data.csv", "val_data.csv") + self.assertEqual(results, mock_results) if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/tests/test_integration.py b/tests/test_integration.py new file mode 100644 index 0000000..729f16e --- /dev/null +++ b/tests/test_integration.py @@ -0,0 +1,80 @@ +import unittest +from unittest.mock import patch, MagicMock +import pandas as pd +import numpy as np +from src.data_processing.data_ingestion import load_data +from src.data_processing.data_process import preprocess_data +from src.feature_engineering.feat_engineering import engineer_features +from src.algorithms.content_based import main as content_based_main +from src.evaluation.model_evaluation import evaluate_model + +class TestIntegration(unittest.TestCase): + + @patch('src.data_processing.data_ingestion.load_data') + @patch('src.data_processing.data_process.preprocess_data') + @patch('src.feature_engineering.feat_engineering.engineer_features') + @patch('src.algorithms.content_based.main') + @patch('src.evaluation.model_evaluation.evaluate_model') + def test_end_to_end_workflow(self, mock_evaluate, mock_content_based, mock_engineer, mock_preprocess, mock_load): + # Mock data ingestion + mock_load.return_value = pd.DataFrame({ + 'user_id': [1, 2, 3], + 'track_id': [101, 102, 103], + 'listen_count': [10, 20, 30] + }) + + # Mock data preprocessing + mock_preprocess.return_value = pd.DataFrame({ + 'user_id': [1, 2, 3], + 'track_id': [101, 102, 103], + 'listen_count': [10, 20, 30], + 'normalized_listen_count': [0.1, 0.2, 0.3] + }) + + # Mock feature engineering + mock_engineer.return_value = pd.DataFrame({ + 'user_id': [1, 2, 3], + 'track_id': [101, 102, 103], + 'listen_count': [10, 20, 30], + 'normalized_listen_count': [0.1, 0.2, 0.3], + 'feature1': [0.5, 0.6, 0.7], + 'feature2': [0.8, 0.9, 1.0] + }) + + # Mock content-based algorithm + mock_model = MagicMock() + mock_content_based.return_value = (mock_model, {'accuracy': 0.85, 'f1_score': 0.82}) + + # Mock model evaluation + mock_evaluate.return_value = {'accuracy': 0.87, 'f1_score': 0.84, 'precision': 0.86, 'recall': 0.85} + + # Run the end-to-end workflow + raw_data = load_data('dummy_path') + preprocessed_data = preprocess_data(raw_data) + feature_engineered_data = engineer_features(preprocessed_data) + model, training_metrics = content_based_main(feature_engineered_data, preprocessed_data, 2, 64, 32, 0.001, 32, 0.2) + evaluation_metrics = evaluate_model(model, feature_engineered_data, preprocessed_data) + + # Assertions + self.assertIsNotNone(raw_data) + self.assertIsNotNone(preprocessed_data) + self.assertIsNotNone(feature_engineered_data) + self.assertIsNotNone(model) + self.assertIsNotNone(training_metrics) + self.assertIsNotNone(evaluation_metrics) + + self.assertIn('accuracy', training_metrics) + self.assertIn('f1_score', training_metrics) + self.assertIn('accuracy', 
evaluation_metrics) + self.assertIn('f1_score', evaluation_metrics) + self.assertIn('precision', evaluation_metrics) + self.assertIn('recall', evaluation_metrics) + + # Verify that each step was called with the output of the previous step + mock_preprocess.assert_called_once_with(raw_data) + mock_engineer.assert_called_once_with(mock_preprocess.return_value) + mock_content_based.assert_called_once_with(mock_engineer.return_value, mock_preprocess.return_value, 2, 64, 32, 0.001, 32, 0.2) + mock_evaluate.assert_called_once_with(mock_model, mock_engineer.return_value, mock_preprocess.return_value) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_kubeflow_pipeline.py b/tests/test_kubeflow_pipeline.py new file mode 100644 index 0000000..05ccd43 --- /dev/null +++ b/tests/test_kubeflow_pipeline.py @@ -0,0 +1,62 @@ +import unittest +from unittest.mock import patch, MagicMock +from kfp import dsl +from kubeflow import pipeline + +class TestKubeflowPipeline(unittest.TestCase): + + @patch('kubeflow.pipeline.Pipeline') + def test_pipeline_creation(self, mock_pipeline): + # Mock the pipeline components + mock_data_ingestion = MagicMock() + mock_data_processing = MagicMock() + mock_feature_engineering = MagicMock() + mock_model_training = MagicMock() + mock_model_evaluation = MagicMock() + mock_model_deployment = MagicMock() + + # Create the pipeline + @dsl.pipeline( + name='LastFM Music Recommender Pipeline', + description='End-to-end pipeline for LastFM Music Recommender' + ) + def lastfm_pipeline(): + data_ingestion_task = mock_data_ingestion() + data_processing_task = mock_data_processing(data_ingestion_task.output) + feature_engineering_task = mock_feature_engineering(data_processing_task.output) + model_training_task = mock_model_training(feature_engineering_task.output) + model_evaluation_task = mock_model_evaluation(model_training_task.output) + mock_model_deployment(model_evaluation_task.output) + + # Run the pipeline + pipeline.Pipeline(lastfm_pipeline) + + # Assert that the pipeline was created + mock_pipeline.assert_called_once() + + # Assert that all components were called in the correct order + mock_data_ingestion.assert_called_once() + mock_data_processing.assert_called_once() + mock_feature_engineering.assert_called_once() + mock_model_training.assert_called_once() + mock_model_evaluation.assert_called_once() + mock_model_deployment.assert_called_once() + + @patch('kubeflow.pipeline.Client') + def test_pipeline_run(self, mock_client): + # Mock the pipeline run + mock_run = MagicMock() + mock_client.return_value.create_run_from_pipeline_func.return_value = mock_run + + # Run the pipeline + client = pipeline.Client() + client.create_run_from_pipeline_func(pipeline.Pipeline, arguments={}) + + # Assert that the pipeline run was created + mock_client.return_value.create_run_from_pipeline_func.assert_called_once() + + # Assert that the run was waited for + mock_run.wait_for_run_completion.assert_called_once() + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_model_evaluation.py b/tests/test_model_evaluation.py new file mode 100644 index 0000000..67b957f --- /dev/null +++ b/tests/test_model_evaluation.py @@ -0,0 +1,77 @@ +import unittest +from unittest.mock import patch, MagicMock +import numpy as np +import pandas as pd +import json +from src.evaluation.model_evaluation import ( + mean_average_precision, + average_precision, + diversity, + novelty, + evaluate_model, + visualize_results, + 
update_custom_metrics +) + +class TestModelEvaluation(unittest.TestCase): + + def setUp(self): + self.y_true = np.array([[1, 0, 1], [0, 1, 1], [1, 1, 0]]) + self.y_pred = np.array([[0.9, 0.1, 0.8], [0.2, 0.7, 0.6], [0.8, 0.3, 0.1]]) + self.item_popularity = {'item1': 0.5, 'item2': 0.3, 'item3': 0.2} + + def test_mean_average_precision(self): + map_score = mean_average_precision(self.y_true, self.y_pred) + self.assertIsInstance(map_score, float) + self.assertTrue(0 <= map_score <= 1) + + def test_average_precision(self): + ap_score = average_precision(self.y_true[0], self.y_pred[0]) + self.assertIsInstance(ap_score, float) + self.assertTrue(0 <= ap_score <= 1) + + def test_diversity(self): + recommendations = [[1, 2, 3], [2, 3, 4], [3, 4, 5]] + div_score = diversity(recommendations) + self.assertIsInstance(div_score, float) + self.assertTrue(0 <= div_score <= 1) + + def test_novelty(self): + recommendations = [[1, 2, 3], [2, 3, 4], [3, 4, 5]] + nov_score = novelty(recommendations, self.item_popularity) + self.assertIsInstance(nov_score, float) + + @patch('src.evaluation.model_evaluation.load_metrics') + @patch('src.evaluation.model_evaluation.log_metric') + def test_evaluate_model(self, mock_log_metric, mock_load_metrics): + mock_model = MagicMock() + mock_model.predict.return_value = self.y_pred + mock_model.evaluate.return_value = (0.1, 0.9) # mock loss and accuracy + mock_load_metrics.return_value = {'train_loss': 0.2, 'train_accuracy': 0.8} + + results = evaluate_model(mock_model, self.y_true, self.y_true, 'mock_path', self.item_popularity) + + self.assertIsInstance(results, dict) + self.assertIn('test_accuracy', results) + self.assertIn('test_precision', results) + self.assertIn('test_recall', results) + self.assertIn('test_f1_score', results) + self.assertIn('test_ndcg', results) + self.assertIn('test_mean_average_precision', results) + self.assertIn('test_diversity', results) + self.assertIn('test_novelty', results) + + @patch('matplotlib.pyplot.savefig') + def test_visualize_results(self, mock_savefig): + results = {'metric1': 0.5, 'metric2': 0.7} + visualize_results(results, 'mock_output_path') + mock_savefig.assert_called_once() + + @patch('src.evaluation.model_evaluation.monitoring_v3.MetricServiceClient') + def test_update_custom_metrics(self, mock_client): + metrics = {'accuracy': 0.9, 'f1_score': 0.8} + update_custom_metrics('project_id', 'model_name', metrics) + mock_client.return_value.create_time_series.assert_called() + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index fb62257..07cb7ee 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -4,15 +4,14 @@ import tempfile import os import json -from src.data_processing.data_preprocess import preprocess_data, prepare_data +from src.data_processing.data_process import preprocess_data +from src.data_processing.data_prep import prepare_data from src.data_processing.data_validation import ( generate_schema, validate_data, compare_statistics, detect_data_drift, save_schema_to_gcs ) from src.feature_engineering.feat_engineering import ( - engineer_basic_features, engineer_additional_features, refine_features, - add_tag_popularity, add_similar_tracks_avg_playcount, add_interaction_features, - add_target_encoding, refine_features_further, vectorize_all_text_features, - create_preprocessing_pipeline + engineer_basic_features, engineer_additional_features, + add_tag_popularity, add_similar_tracks_avg_playcount, 
feature_engineering_pipeline
 )
 from src.algorithms.content_based import ContentBasedRecommender
 from src.evaluation.model_evaluation import evaluate_model
@@ -107,14 +106,6 @@ def test_end_to_end_pipeline(self):
 
         # Step 3: Feature Engineering
         df = engineer_basic_features(preprocessed_data)
-        df = engineer_additional_features(df)
-        df = refine_features(df)
-        df = add_tag_popularity(df)
-        df = add_similar_tracks_avg_playcount(df)
-        df = add_interaction_features(df)
-        df = add_target_encoding(df)
-        df = refine_features_further(df)
-        df, vectorizers = vectorize_all_text_features(df)
 
         # Create preprocessing pipeline
         pipeline = create_preprocessing_pipeline(df)
@@ -214,7 +205,7 @@ def test_pipeline_output_artifacts(self):
         preprocessed_data = preprocess_data(pd.read_csv(self.train_data_path))
 
         # Feature engineering steps
-        df = engineer_basic_features(preprocessed_data)
+        df = feature_engineering_pipeline(preprocessed_data)
         df = engineer_additional_features(df)
         df = refine_features(df)
         df = add_tag_popularity(df)

From 48ae8f1b15b472f760f64c5a1717d1af835c8f66 Mon Sep 17 00:00:00 2001
From: JonFillip
Date: Mon, 7 Oct 2024 10:45:17 +0100
Subject: [PATCH 2/2] Fixed dependency issue, streamlined deployment and monitoring logic

---
 .github/workflows/ci_pipeline.yml            |   2 +-
 cloudbuild.yaml                              |   2 +-
 deployment/vertex_ai/vertex_ai_monitoring.py | 467 ++++++++++--------
 deployment/vertex_ai/vertex_deployment.py    | 479 +++++++++++--------
 kubeflow/components/preprocess/preprocess.py |   2 +-
 src/utils/logging_utils.py                   |  26 +-
 6 files changed, 569 insertions(+), 409 deletions(-)

diff --git a/.github/workflows/ci_pipeline.yml b/.github/workflows/ci_pipeline.yml
index 30ac821..ec5bc75 100644
--- a/.github/workflows/ci_pipeline.yml
+++ b/.github/workflows/ci_pipeline.yml
@@ -36,7 +36,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.8', '3.9', '3.10']
+        python-version: ['3.9', '3.10']
     timeout-minutes: 15
     steps:
     - uses: actions/checkout@v3
diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 0b795c9..775203e 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -29,7 +29,7 @@ steps:
     args:
     - '-c'
     - |
-      python deployment/deploy_pipeline.py --platform vertex --project_id $PROJECT_ID --region ${_REGION} --output_file pipeline.yaml
+      python deployment/vertex_ai/vertex_deployment.py --platform vertex --project_id $PROJECT_ID --region ${_REGION} --output_file pipeline.yaml
     env:
     - 'MODEL_NAME=${_MODEL_NAME}'
     - 'ENDPOINT_NAME=${_ENDPOINT_NAME}'
diff --git a/deployment/vertex_ai/vertex_ai_monitoring.py b/deployment/vertex_ai/vertex_ai_monitoring.py
index af3cefc..7274a6b 100644
--- a/deployment/vertex_ai/vertex_ai_monitoring.py
+++ b/deployment/vertex_ai/vertex_ai_monitoring.py
@@ -1,4 +1,3 @@
-import yaml
 import tensorflow_data_validation as tfdv
 import argparse
 import json
@@ -7,20 +6,14 @@
 import random
 from typing import Dict, List, Optional, Tuple, Union
 from scipy.stats import ks_2samp
-from google.cloud import monitoring_v3, storage, bigquery, aiplatform
-from google.api import label_pb2 as ga_label
+from google.cloud import monitoring_v3, storage, aiplatform
 from google.api import metric_pb2 as ga_metric
-from google.protobuf import duration_pb2 as duration
+from google.protobuf import duration_pb2
 from src.data_processing.data_validation import (
-    generate_schema,
-    validate_data,
-    load_config,
     load_statistics_from_gcs,
     load_schema_from_gcs,
-    compare_statistics,
     compare_schemas,
-    save_statistics_to_gcs,
-    save_schema_to_gcs
+    save_statistics_to_gcs
 )
 from src.utils.logging_utils import 
setup_logger, log_error, log_step from ml_metadata import metadata_store @@ -29,15 +22,23 @@ logger = setup_logger('vertex_ai_pipeline_monitoring') class VertexAIMonitoring: - def __init__(self, project_id: str, model_name: str, bucket_name: str): + def __init__(self, project_id: str, model_name: str, bucket_name: str, + mlmd_host: str, mlmd_port: int, mlmd_database: str, + mlmd_user: str, mlmd_password: str): self.project_id = project_id self.model_name = model_name self.bucket_name = bucket_name self.client = monitoring_v3.MetricServiceClient() self.project_name = f"projects/{project_id}" - self.feature_store_client = aiplatform.FeatureStore(project=project_id) + self.feature_store_client = aiplatform.gapic.FeaturestoreServiceClient() + + # Connect to MLMD using PostgreSQL self.mlmd_connection_config = metadata_store_pb2.ConnectionConfig() - self.mlmd_connection_config.sqlite.filename_uri = f"gs://{bucket_name}/mlmd/metadata.db" + self.mlmd_connection_config.postgresql.host = mlmd_host + self.mlmd_connection_config.postgresql.port = mlmd_port + self.mlmd_connection_config.postgresql.database = mlmd_database + self.mlmd_connection_config.postgresql.user = mlmd_user + self.mlmd_connection_config.postgresql.password = mlmd_password self.mlmd_store = metadata_store.MetadataStore(self.mlmd_connection_config) def setup_custom_metrics(self) -> None: @@ -56,11 +57,14 @@ def setup_custom_metrics(self) -> None: ] for metric in metrics: - descriptor = self.client.create_metric_descriptor( - name=self.project_name, - metric_descriptor=metric - ) - logger.info(f"Created {descriptor.name}") + try: + descriptor = self.client.create_metric_descriptor( + name=self.project_name, + metric_descriptor=metric + ) + logger.info(f"Created metric descriptor: {descriptor.name}") + except Exception as e: + logger.warning(f"Metric descriptor {metric.type} already exists or could not be created. 
Error: {e}") def _create_metric_descriptor(self, metric_name: str, description: str, value_type: int = ga_metric.MetricDescriptor.ValueType.DOUBLE) -> ga_metric.MetricDescriptor: return ga_metric.MetricDescriptor( @@ -70,26 +74,25 @@ def _create_metric_descriptor(self, metric_name: str, description: str, value_ty description=description ) - def create_alert_policy(self, display_name: str, filter_str: str, threshold: float, duration_seconds: int, comparison: int) -> None: + def create_alert_policy(self, display_name: str, filter_str: str, threshold: float, duration_seconds: int, comparison: int, notification_channel_id: str) -> None: """Creates an alert policy in Google Cloud Monitoring.""" client = monitoring_v3.AlertPolicyServiceClient() - - condition = { - "display_name": display_name, - "condition_threshold": { - "filter": filter_str, - "comparison": comparison, - "threshold_value": threshold, - "duration": duration.Duration(seconds=duration_seconds) - } - } + condition = monitoring_v3.AlertPolicy.Condition( + display_name=display_name, + condition_threshold=monitoring_v3.AlertPolicy.Condition.MetricThreshold( + filter=filter_str, + comparison=comparison, + threshold_value=threshold, + duration=duration_pb2.Duration(seconds=duration_seconds), + ), + ) - alert_policy = { - "display_name": f"{self.model_name} {display_name}", - "conditions": [condition], - "notification_channels": [f"projects/{self.project_id}/notificationChannels/your-channel-id"], - "combiner": monitoring_v3.AlertPolicy.Combiner.OR, - } + alert_policy = monitoring_v3.AlertPolicy( + display_name=f"{self.model_name} {display_name}", + conditions=[condition], + notification_channels=[notification_channel_id], + combiner=monitoring_v3.AlertPolicy.ConditionCombinerType.OR, + ) policy = client.create_alert_policy( name=self.project_name, @@ -97,24 +100,26 @@ def create_alert_policy(self, display_name: str, filter_str: str, threshold: flo ) logger.info(f"Created alert policy: {policy.name}") - def create_accuracy_degradation_alert(self, absolute_threshold: float, degradation_rate_threshold: float, time_window_seconds: int = 86400) -> None: + def create_accuracy_degradation_alert(self, absolute_threshold: float, degradation_rate_threshold: float, time_window_seconds: int, notification_channel_id: str) -> None: """Creates an alert for accuracy degradation.""" self.create_alert_policy( "Accuracy below absolute threshold", f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/accuracy"', absolute_threshold, 300, - monitoring_v3.ComparisonType.COMPARISON_LT + monitoring_v3.AlertPolicy.Condition.MetricThreshold.ComparisonType.COMPARISON_LT, + notification_channel_id ) self.create_alert_policy( "Accuracy degradation over time", f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/accuracy"', degradation_rate_threshold, time_window_seconds, - monitoring_v3.ComparisonType.COMPARISON_LT + monitoring_v3.AlertPolicy.Condition.MetricThreshold.ComparisonType.COMPARISON_LT, + notification_channel_id ) - def create_resource_utilization_alert(self) -> None: + def create_resource_utilization_alert(self, notification_channel_id: str) -> None: """Creates alerts for resource utilization (CPU, memory, and GPU).""" resources = [ ("CPU", "compute.googleapis.com/instance/cpu/utilization"), @@ -128,16 +133,29 @@ def create_resource_utilization_alert(self) -> None: f'metric.type="{metric_type}"', 0.8, # 80% utilization threshold 300, # 5 minutes duration - monitoring_v3.ComparisonType.COMPARISON_GT + 
monitoring_v3.AlertPolicy.Condition.MetricThreshold.ComparisonType.COMPARISON_GT, + notification_channel_id ) + def create_rollback_alert_policy(self, notification_channel_id: str) -> None: + """Creates an alert policy for rollback detection based on traffic split anomalies.""" + filter_str = f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/traffic_anomaly"' + self.create_alert_policy( + display_name="Rollback Detection Alert", + filter_str=filter_str, + threshold=0, # Threshold set to detect any anomaly (since value will be 1 when anomaly is detected) + duration_seconds=300, + comparison=monitoring_v3.AlertPolicy.Condition.MetricThreshold.ComparisonType.COMPARISON_GT, + notification_channel_id=notification_channel_id + ) + def log_metric(self, metric_name: str, value: Union[float, int]) -> None: """Logs a metric to Google Cloud Monitoring.""" series = monitoring_v3.TimeSeries() series.metric.type = f"custom.googleapis.com/vertex_ai/{self.model_name}/{metric_name}" - series.resource.type = "aiplatform.googleapis.com/Endpoint" - series.resource.labels["model_name"] = self.model_name - point = series.points.add() + series.resource.type = "global" + series.resource.labels["project_id"] = self.project_id + point = monitoring_v3.Point() if isinstance(value, float): point.value.double_value = value else: @@ -145,6 +163,7 @@ def log_metric(self, metric_name: str, value: Union[float, int]) -> None: now = datetime.datetime.now() point.interval.end_time.seconds = int(now.timestamp()) point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) + series.points = [point] self.client.create_time_series(name=self.project_name, time_series=[series]) logger.info(f"Logged {metric_name} for model {self.model_name}: {value}") @@ -153,30 +172,29 @@ def detect_data_drift(self, drift_threshold: float) -> Optional[float]: try: log_step(logger, 'Detecting Data Drift', 'Data Drift Detection') today = datetime.datetime.now().strftime("%Y%m%d") - + baseline_stats = load_statistics_from_gcs(self.bucket_name, self.model_name, 'train', today) if not baseline_stats: self.log_metric("missing_statistics", 1) return None - + serving_stats = load_statistics_from_gcs(self.bucket_name, self.model_name, 'serving', today) if not serving_stats: self.log_metric("missing_statistics", 1) return None - + schema = load_schema_from_gcs(self.bucket_name, self.model_name, 'current') if not schema: self.log_metric("missing_schema", 1) return None - anomalies = compare_statistics(baseline_stats, serving_stats, schema) + # Compare the statistics + anomalies = tfdv.validate_statistics(statistics=serving_stats, schema=schema, previous_statistics=baseline_stats) - drift_score = 0 - for feature, anomaly in anomalies.anomaly_info.items(): - if anomaly: - logger.warning(f"Data drift detected for feature {feature}: {anomaly.description}") - drift_score += anomaly.severity - self.log_metric("data_drift", anomaly.severity) + drift_score = len(anomalies.anomaly_info) + for feature_name, anomaly_info in anomalies.anomaly_info.items(): + logger.warning(f"Data drift detected for feature {feature_name}: {anomaly_info.description}") + self.log_metric("data_drift", 1) if drift_score > drift_threshold: logger.warning(f"Significant data drift detected. 
Drift score: {drift_score} > {drift_threshold}") @@ -195,7 +213,7 @@ def detect_data_drift(self, drift_threshold: float) -> Optional[float]: def _log_drift_detection_to_mlmd(self, drift_score: float, drift_threshold: float): """Log drift detection results to ML Metadata.""" execution = metadata_store_pb2.Execution() - execution.type = "DataDriftDetection" + execution.type_id = self._get_or_create_execution_type_id("DataDriftDetection") execution.properties["model_name"].string_value = self.model_name execution.properties["drift_score"].double_value = drift_score execution.properties["drift_threshold"].double_value = drift_threshold @@ -204,29 +222,40 @@ def _log_drift_detection_to_mlmd(self, drift_score: float, drift_threshold: floa execution_id = self.mlmd_store.put_executions([execution])[0] logger.info(f"Logged drift detection results to MLMD with execution ID: {execution_id}") + def _get_or_create_execution_type_id(self, type_name: str) -> int: + """Helper method to get or create an execution type ID.""" + try: + execution_type = self.mlmd_store.get_execution_type(type_name) + except metadata_store.errors.NotFoundError: + execution_type = metadata_store_pb2.ExecutionType(name=type_name) + self.mlmd_store.put_execution_type(execution_type) + return execution_type.id + def detect_prediction_drift(self, drift_threshold: float) -> Optional[float]: """Detects prediction drift using the Kolmogorov-Smirnov (KS) test.""" try: log_step(logger, 'Detecting Prediction Drift', 'Prediction Drift Detection') today = datetime.datetime.now().strftime("%Y%m%d") - + train_stats = load_statistics_from_gcs(self.bucket_name, self.model_name, 'train', today) if not train_stats: self.log_metric("missing_statistics", 1) return None - + serving_stats = load_statistics_from_gcs(self.bucket_name, self.model_name, 'serving', today) if not serving_stats: self.log_metric("missing_statistics", 1) return None - train_predictions = train_stats.datasets[0].features['similar_tracks'].num_stats.histograms[0].buckets - serving_predictions = serving_stats.datasets[0].features['similar_tracks'].num_stats.histograms[0].buckets + # Assuming 'prediction' is the feature containing model predictions + train_predictions = tfdv.get_feature_stats_as_dataframe(train_stats) + serving_predictions = tfdv.get_feature_stats_as_dataframe(serving_stats) - train_counts = [bucket.sample_count for bucket in train_predictions] - serving_counts = [bucket.sample_count for bucket in serving_predictions] + if 'prediction' not in train_predictions.columns or 'prediction' not in serving_predictions.columns: + logger.error("Prediction feature not found in statistics.") + return None - statistic, _ = ks_2samp(train_counts, serving_counts) + statistic, _ = ks_2samp(train_predictions['prediction'], serving_predictions['prediction']) self.log_metric("prediction_drift", statistic) @@ -247,7 +276,7 @@ def detect_prediction_drift(self, drift_threshold: float) -> Optional[float]: def _log_prediction_drift_to_mlmd(self, statistic: float, drift_threshold: float): """Log prediction drift results to ML Metadata.""" execution = metadata_store_pb2.Execution() - execution.type = "PredictionDriftDetection" + execution.type_id = self._get_or_create_execution_type_id("PredictionDriftDetection") execution.properties["model_name"].string_value = self.model_name execution.properties["ks_statistic"].double_value = statistic execution.properties["drift_threshold"].double_value = drift_threshold @@ -266,7 +295,10 @@ def detect_schema_drift(self, schema_version: str) -> 
Optional[bool]: self.log_metric("missing_schema", 1) return None - current_schema = load_schema_from_gcs(self.bucket_name, self.model_name, 'serving_schema_version') + current_schema = load_schema_from_gcs(self.bucket_name, self.model_name, 'current') + if not current_schema: + self.log_metric("missing_schema", 1) + return None schema_drift_detected = compare_schemas(baseline_schema, current_schema) @@ -289,7 +321,7 @@ def detect_schema_drift(self, schema_version: str) -> Optional[bool]: def _log_schema_drift_to_mlmd(self, schema_drift_detected: bool): """Log schema drift results to ML Metadata.""" execution = metadata_store_pb2.Execution() - execution.type = "SchemaDriftDetection" + execution.type_id = self._get_or_create_execution_type_id("SchemaDriftDetection") execution.properties["model_name"].string_value = self.model_name execution.properties["schema_drift_detected"].int_value = int(schema_drift_detected) execution.properties["timestamp"].string_value = datetime.datetime.now().isoformat() @@ -297,11 +329,11 @@ def _log_schema_drift_to_mlmd(self, schema_drift_detected: bool): execution_id = self.mlmd_store.put_executions([execution])[0] logger.info(f"Logged schema drift results to MLMD with execution ID: {execution_id}") - def monitor_traffic_split(self, endpoint_name: str) -> Optional[Dict[str, float]]: + def monitor_traffic_split(self, endpoint_name: str, expected_traffic_split: Dict[str, int]) -> Optional[Dict[str, int]]: """Monitor the traffic split in Vertex AI to detect rollback.""" try: log_step(logger, 'Monitoring traffic split', 'Rollback Monitoring') - + aiplatform.init(project=self.project_id) endpoints = aiplatform.Endpoint.list(filter=f'display_name="{endpoint_name}"') @@ -310,29 +342,43 @@ def monitor_traffic_split(self, endpoint_name: str) -> Optional[Dict[str, float] return None endpoint = endpoints[0] - traffic_split = endpoint.traffic_split + traffic_split = endpoint.gca_resource.traffic_split + total_traffic = sum(traffic_split.values()) + anomaly_detected = False + for model_id, traffic_percentage in traffic_split.items(): logger.info(f"Model {model_id} is receiving {traffic_percentage}% of the traffic.") - - if sum(traffic_split.values()) != 100: - logger.warning("Traffic split does not sum to 100%, indicating a possible rollback.") - + + # Check against expected traffic split + expected_percentage = expected_traffic_split.get(model_id, 0) + if traffic_percentage != expected_percentage: + anomaly_detected = True + logger.warning(f"Anomaly detected: Model {model_id} traffic is {traffic_percentage}%, expected {expected_percentage}%.") + + if total_traffic != 100: + anomaly_detected = True + logger.warning("Traffic split does not sum to 100%, indicating a possible rollback or misconfiguration.") + + # Log traffic anomaly metric + self.log_metric("traffic_anomaly", int(anomaly_detected)) + # Log traffic split to MLMD - self._log_traffic_split_to_mlmd(traffic_split) + self._log_traffic_split_to_mlmd(traffic_split, anomaly_detected) return traffic_split except Exception as e: log_error(logger, e, "Rollback Monitoring") - raise + return None - def _log_traffic_split_to_mlmd(self, traffic_split: Dict[str, float]): + def _log_traffic_split_to_mlmd(self, traffic_split: Dict[str, int], anomaly_detected: bool): """Log traffic split to ML Metadata.""" execution = metadata_store_pb2.Execution() - execution.type = "TrafficSplitMonitoring" + execution.type_id = self._get_or_create_execution_type_id("TrafficSplitMonitoring") execution.properties["model_name"].string_value = 
self.model_name + execution.properties["anomaly_detected"].int_value = int(anomaly_detected) for model_id, percentage in traffic_split.items(): - execution.properties[f"traffic_{model_id}"].double_value = percentage - execution.properties["timestamp"].string_value = datetime.datetime.now().isoformat() + execution.custom_properties[f"traffic_{model_id}"].double_value = percentage + execution.properties["timestamp"].string_value = datetime.datetime.utcnow().isoformat() + 'Z' execution_id = self.mlmd_store.put_executions([execution])[0] logger.info(f"Logged traffic split to MLMD with execution ID: {execution_id}") @@ -348,7 +394,7 @@ def trigger_retraining_pipeline(self, pipeline_name: str, gcs_input: str) -> str pipeline_job = aiplatform.PipelineJob( display_name=f'Retraining - {self.model_name}', - template_path=f'gs://{pipeline_name}', + template_path=pipeline_name, parameter_values=pipeline_params ) @@ -363,7 +409,7 @@ def trigger_retraining_pipeline(self, pipeline_name: str, gcs_input: str) -> str def _log_retraining_trigger_to_mlmd(self, pipeline_job_id: str): """Log retraining trigger to ML Metadata.""" execution = metadata_store_pb2.Execution() - execution.type = "RetrainingTrigger" + execution.type_id = self._get_or_create_execution_type_id("RetrainingTrigger") execution.properties["model_name"].string_value = self.model_name execution.properties["pipeline_job_id"].string_value = pipeline_job_id execution.properties["timestamp"].string_value = datetime.datetime.now().isoformat() @@ -371,27 +417,29 @@ def _log_retraining_trigger_to_mlmd(self, pipeline_job_id: str): execution_id = self.mlmd_store.put_executions([execution])[0] logger.info(f"Logged retraining trigger to MLMD with execution ID: {execution_id}") - def setup_retraining_job_alert(self, notification_channel: str) -> None: + def setup_retraining_job_alert(self, notification_channel_id: str) -> None: """Set up a Cloud Monitoring alert for Vertex AI retraining jobs.""" - condition = { - "display_name": "Vertex AI Retraining Job Created", - "condition_threshold": { - "filter": 'resource.type="aiplatform.googleapis.com/PipelineJob" AND protoPayload.methodName="google.cloud.aiplatform.v1.PipelineService.CreatePipelineJob"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 0, - "duration": {"seconds": 60}, - } - } + client = monitoring_v3.AlertPolicyServiceClient() + condition = monitoring_v3.AlertPolicy.Condition( + display_name="Vertex AI Retraining Job Created", + condition_monitoring_query_language=monitoring_v3.AlertPolicy.Condition.MonitoringQueryLanguageCondition( + query=( + 'fetch aiplatform.googleapis.com/pipeline_job ' + '| {metric.type="aiplatform.googleapis.com/pipeline_job/pipeline_job_state"}' + ), + duration=duration_pb2.Duration(seconds=60), + trigger=monitoring_v3.AlertPolicy.Condition.Trigger(count=1) + ) + ) - alert_policy = { - "display_name": "Retraining Job Alert", - "conditions": [condition], - "notification_channels": [notification_channel], - "enabled": True, - "combiner": monitoring_v3.AlertPolicy.Combiner.OR - } + alert_policy = monitoring_v3.AlertPolicy( + display_name="Retraining Job Alert", + conditions=[condition], + notification_channels=[notification_channel_id], + combiner=monitoring_v3.AlertPolicy.ConditionCombinerType.OR, + enabled=True + ) - client = monitoring_v3.AlertPolicyServiceClient() policy = client.create_alert_policy( name=self.project_name, alert_policy=alert_policy @@ -399,19 +447,19 @@ def setup_retraining_job_alert(self, notification_channel: str) -> 
None: logger.info(f"Created retraining job alert policy: {policy.name}") - def monitor_and_trigger_retraining(self, accuracy_threshold: float, drift_threshold: float, gcs_input: str, pipeline_name: str, notification_channel: str) -> None: + def monitor_and_trigger_retraining(self, accuracy_threshold: float, degradation_rate_threshold: float, drift_threshold: float, gcs_input: str, pipeline_name: str, notification_channel_id: str, time_window_seconds: int) -> None: """Monitor model accuracy, data drift, and prediction drift, and trigger retraining when necessary.""" - self.create_accuracy_degradation_alert(accuracy_threshold, 0.05) + self.create_accuracy_degradation_alert(accuracy_threshold, degradation_rate_threshold, time_window_seconds, notification_channel_id) - data_drift_detected = self.detect_data_drift(drift_threshold) - prediction_drift_detected = self.detect_prediction_drift(drift_threshold) + data_drift_score = self.detect_data_drift(drift_threshold) + prediction_drift_statistic = self.detect_prediction_drift(drift_threshold) - if data_drift_detected or prediction_drift_detected: + if (data_drift_score and data_drift_score > drift_threshold) or (prediction_drift_statistic and prediction_drift_statistic > drift_threshold): logger.warning(f"Drift detected for {self.model_name}. Triggering retraining pipeline.") - + pipeline_job_id = self.trigger_retraining_pipeline(pipeline_name, gcs_input) - - self.setup_retraining_job_alert(notification_channel) + + self.setup_retraining_job_alert(notification_channel_id) logger.info(f"Retraining job triggered: {pipeline_job_id}") else: @@ -419,11 +467,82 @@ def monitor_and_trigger_retraining(self, accuracy_threshold: float, drift_thresh logger.info("Model performance and drift monitoring completed.") + def log_feature_store_metric(self, feature_store_id: str, entity_type_id: str, metric_name: str, value: Union[int, float]): + """Logs a feature store metric to Google Cloud Monitoring.""" + series = monitoring_v3.TimeSeries() + series.metric.type = f"custom.googleapis.com/vertex_ai/{self.model_name}/{metric_name}" + series.resource.type = "aiplatform.googleapis.com/Featurestore" + series.resource.labels["featurestore_id"] = feature_store_id + series.resource.labels["entity_type_id"] = entity_type_id + point = monitoring_v3.Point() + if isinstance(value, int): + point.value.int64_value = value + else: + point.value.double_value = value + now = datetime.datetime.now() + point.interval.end_time.seconds = int(now.timestamp()) + point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) + series.points = [point] + self.client.create_time_series(name=self.project_name, time_series=[series]) + logger.info(f"Logged feature store metric {metric_name} with value {value}") + + def monitor_feature_store(self, feature_store_id: str, entity_type_id: str): + """Monitors the Feature Store and logs relevant metrics.""" + try: + log_step(logger, 'Monitoring Feature Store', 'Feature Store Monitoring') + + featurestore_name = f"projects/{self.project_id}/locations/-/featurestores/{feature_store_id}" + entity_type_name = f"{featurestore_name}/entityTypes/{entity_type_id}" + + # Log read and write counts + # Placeholder implementation; actual read/write counts need to be retrieved from monitoring metrics or logs + read_count = 100 # Replace with actual logic to get read count + write_count = 50 # Replace with actual logic to get write count + + self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_read_count", 
read_count)
+            self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_write_count", write_count)
+
+            # Log latency (this is a placeholder, actual implementation may vary based on available metrics)
+            avg_latency = 200  # Replace with actual logic to get average latency
+            self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_latency", avg_latency)
+
+            logger.info(f"Monitored feature store {feature_store_id}, entity type {entity_type_id}")
+        except Exception as e:
+            log_error(logger, e, 'Feature Store Monitoring')
+
+    def create_feature_store_alerts(self, feature_store_id: str, entity_type_id: str, notification_channel_id: str):
+        """Creates alerts for Feature Store monitoring."""
+        self.create_alert_policy(
+            "High Feature Store Read Count",
+            f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_read_count" AND resource.labels.featurestore_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"',
+            1000,  # Threshold: 1000 reads
+            300,  # Duration: 5 minutes
+            monitoring_v3.ComparisonType.COMPARISON_GT,
+            notification_channel_id
+        )
+        self.create_alert_policy(
+            "High Feature Store Write Count",
+            f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_write_count" AND resource.labels.featurestore_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"',
+            500,  # Threshold: 500 writes
+            300,  # Duration: 5 minutes
+            monitoring_v3.ComparisonType.COMPARISON_GT,
+            notification_channel_id
+        )
+        self.create_alert_policy(
+            "High Feature Store Latency",
+            f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_latency" AND resource.labels.featurestore_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"',
+            1000,  # Threshold: 1000 ms
+            300,  # Duration: 5 minutes
+            monitoring_v3.ComparisonType.COMPARISON_GT,
+            notification_channel_id
+        )
+
 def log_request_response(project_id: str, model_name: str, request: Dict, response: Dict, latency_ms: float, sampling_rate: float = 0.1) -> None:
     """Logs serving request/response data and latency to Cloud Storage with optional sampling."""
     if sampling_rate >= 1 or random.random() < sampling_rate:
         client = storage.Client(project=project_id)
-        bucket = client.get_bucket(f"{project_id}-vertex-ai-logs")
+        bucket_name = f"{project_id}-vertex-ai-logs"
+        bucket = client.get_bucket(bucket_name)
         blob = bucket.blob(f"{model_name}/logs/{datetime.datetime.now().isoformat()}.json")
         log_entry = {
             "request": request,
@@ -436,16 +555,13 @@ def log_request_response(project_id: str, model_name: str, request: Dict, respon
 def check_existing_statistics_and_schema(project_id: str, model_name: str, bucket_name: str, schema_version: str) -> Tuple[Optional[tfdv.types.DatasetFeatureStatisticsList], Optional[tfdv.types.Schema]]:
     today = datetime.datetime.now().strftime("%Y%m%d")
-    
+
     try:
         existing_stats = load_statistics_from_gcs(bucket_name, model_name, 'serving', today)
     except Exception as e:
         logger.error(f"Error loading existing statistics: {e}")
         existing_stats = None
 
-    config = load_config()
-    schema_path = config['data_validation']['schema_path']
-
     try:
         schema = load_schema_from_gcs(bucket_name, model_name, schema_version)
     except Exception as e:
@@ -454,21 +570,21 @@ def check_existing_statistics_and_schema(project_id: str, model_name: str, bucke
     return existing_stats, schema
 
-def 
compute_and_store_statistics(project_id: str, model_name: str, bucket_name: str, existing_schema: Optional[tfdv.types.Schema]) -> Tuple[tfdv.types.DatasetFeatureStatisticsList, tfdv.types.Anomalies]: +def compute_and_store_statistics(project_id: str, model_name: str, bucket_name: str, existing_schema: Optional[tfdv.types.Schema]) -> Tuple[tfdv.types.DatasetFeatureStatisticsList, Optional[tfdv.types.Anomalies]]: client = storage.Client(project=project_id) bucket = client.get_bucket(f"{project_id}-vertex-ai-logs") - blobs = bucket.list_blobs(prefix=f"{model_name}/logs/") + blobs = client.list_blobs(bucket_or_name=bucket, prefix=f"{model_name}/logs/") data = [] for blob in blobs: content = json.loads(blob.download_as_string()) data.append(content) - df = pd.DataFrame(data) - + df = pd.json_normalize(data) + stats = tfdv.generate_statistics_from_dataframe(df) save_statistics_to_gcs(stats, bucket_name, model_name, 'serving') - + if existing_schema: anomalies = tfdv.validate_statistics(stats, schema=existing_schema) else: @@ -477,71 +593,6 @@ def compute_and_store_statistics(project_id: str, model_name: str, bucket_name: return stats, anomalies -def log_feature_store_metric(self, feature_store_id: str, entity_type_id: str, metric_name: str, value: Union[int, float]): - """Logs a feature store metric to Google Cloud Monitoring.""" - series = monitoring_v3.TimeSeries() - series.metric.type = f"custom.googleapis.com/vertex_ai/{self.model_name}/{metric_name}" - series.resource.type = "aiplatform.googleapis.com/FeatureStore" - series.resource.labels["feature_store_id"] = feature_store_id - series.resource.labels["entity_type_id"] = entity_type_id - point = series.points.add() - if isinstance(value, int): - point.value.int64_value = value - else: - point.value.double_value = value - now = datetime.datetime.now() - point.interval.end_time.seconds = int(now.timestamp()) - point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) - self.client.create_time_series(name=self.project_name, time_series=[series]) - logger.info(f"Logged feature store metric {metric_name} with value {value}") - -def monitor_feature_store(self, feature_store_id: str, entity_type_id: str): - """Monitors the Feature Store and logs relevant metrics.""" - try: - log_step(logger, 'Monitoring Feature Store', 'Feature Store Monitoring') - - feature_store = self.feature_store_client.get_feature_store(feature_store_id=feature_store_id) - entity_type = feature_store.get_entity_type(entity_type_id=entity_type_id) - - # Log read and write counts - read_count = entity_type.read_stats().get("total_entity_reads", 0) - write_count = entity_type.write_stats().get("total_entity_updates", 0) - - self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_read_count", read_count) - self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_write_count", write_count) - - # Log latency (this is a placeholder, actual implementation may vary based on available metrics) - avg_latency = entity_type.read_stats().get("average_read_latency_milliseconds", 0) - self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_latency", avg_latency) - - logger.info(f"Monitored feature store {feature_store_id}, entity type {entity_type_id}") - except Exception as e: - log_error(logger, e, 'Feature Store Monitoring') - -def create_feature_store_alerts(self, feature_store_id: str, entity_type_id: str): - """Creates alerts for Feature Store monitoring.""" - self.create_alert_policy( - 
"High Feature Store Read Count", - f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_read_count" AND resource.labels.feature_store_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"', - 1000, # Threshold: 1000 reads - 300, # Duration: 5 minutes - monitoring_v3.ComparisonType.COMPARISON_GT - ) - self.create_alert_policy( - "High Feature Store Write Count", - f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_write_count" AND resource.labels.feature_store_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"', - 500, # Threshold: 500 writes - 300, # Duration: 5 minutes - monitoring_v3.ComparisonType.COMPARISON_GT - ) - self.create_alert_policy( - "High Feature Store Latency", - f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_latency" AND resource.labels.feature_store_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"', - 1000, # Threshold: 1000 ms - 300, # Duration: 5 minutes - monitoring_v3.ComparisonType.COMPARISON_GT - ) - def main(): parser = argparse.ArgumentParser(description='Setup Vertex AI monitoring, drift detection, and rollback with retraining') parser.add_argument('--project_id', required=True, help='GCP Project ID') @@ -552,27 +603,57 @@ def main(): parser.add_argument('--time_window', type=int, default=86400, help='Time window in seconds to monitor for degradation (default is 24 hours)') parser.add_argument('--drift_threshold', type=float, default=0.05, help='Data drift threshold for retraining') parser.add_argument('--gcs_input', required=True, help='GCS path to input data for retraining') - parser.add_argument('--pipeline_name', required=True, help='Name of the Vertex AI pipeline for retraining') + parser.add_argument('--pipeline_name', required=True, help='Path to the Vertex AI pipeline template for retraining') parser.add_argument('--notification_channel', required=True, help='Notification channel ID (for alerts)') parser.add_argument('--bucket_name', required=True, help='Cloud Storage bucket name') parser.add_argument('--schema_version', required=True, help='Schema version for validation') parser.add_argument('--sampling_rate', type=float, default=0.1, help='Sampling rate for request/response logging') parser.add_argument('--feature_store_id', required=True, help='Vertex AI Feature Store ID') parser.add_argument('--entity_type_id', required=True, help='Entity Type ID in the Feature Store') + parser.add_argument('--mlmd_host', required=True, help='MLMD PostgreSQL host') + parser.add_argument('--mlmd_port', type=int, default=5432, help='MLMD PostgreSQL port') + parser.add_argument('--mlmd_database', required=True, help='MLMD PostgreSQL database name') + parser.add_argument('--mlmd_user', required=True, help='MLMD PostgreSQL username') + parser.add_argument('--mlmd_password', required=True, help='MLMD PostgreSQL password') args = parser.parse_args() - monitor = VertexAIMonitoring(args.project_id, args.model_name, args.bucket_name) - + monitor = VertexAIMonitoring( + project_id=args.project_id, + model_name=args.model_name, + bucket_name=args.bucket_name, + mlmd_host=args.mlmd_host, + mlmd_port=args.mlmd_port, + mlmd_database=args.mlmd_database, + mlmd_user=args.mlmd_user, + mlmd_password=args.mlmd_password + ) + monitor.setup_custom_metrics() - monitor.create_alert_policy("Data Drift Alert", f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/data_drift"', 0.1, 300, 
monitoring_v3.ComparisonType.COMPARISON_GT)
-    monitor.create_alert_policy("Prediction Drift Alert", f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/prediction_drift"', 0.1, 300, monitoring_v3.ComparisonType.COMPARISON_GT)
-    monitor.create_resource_utilization_alert()
-    monitor.create_alert_policy("Prediction Latency Alert", f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/prediction_latency"', 1000, 60, monitoring_v3.ComparisonType.COMPARISON_GT)
-    monitor.create_alert_policy("Schema Drift Alert", f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/schema_drift"', 1, 300, monitoring_v3.ComparisonType.COMPARISON_GT)
-    monitor.create_accuracy_degradation_alert(args.absolute_threshold, args.degradation_rate_threshold, args.time_window)
-    monitor.setup_feature_store_monitoring(args.feature_store_id, args.entity_type_id)
-    monitor.create_feature_store_alerts(args.feature_store_id, args.entity_type_id)
+    monitor.create_alert_policy("Data Drift Alert",
+                                f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/data_drift"',
+                                0.1, 300,
+                                monitoring_v3.ComparisonType.COMPARISON_GT,
+                                args.notification_channel)
+    monitor.create_alert_policy("Prediction Drift Alert",
+                                f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/prediction_drift"',
+                                0.1, 300,
+                                monitoring_v3.ComparisonType.COMPARISON_GT,
+                                args.notification_channel)
+    monitor.create_resource_utilization_alert(args.notification_channel)
+    monitor.create_alert_policy("Prediction Latency Alert",
+                                f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/prediction_latency"',
+                                1000, 60,
+                                monitoring_v3.ComparisonType.COMPARISON_GT,
+                                args.notification_channel)
+    monitor.create_alert_policy("Schema Drift Alert",
+                                f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/schema_drift"',
+                                1, 300,
+                                monitoring_v3.ComparisonType.COMPARISON_GT,
+                                args.notification_channel)
+    monitor.create_accuracy_degradation_alert(args.absolute_threshold, args.degradation_rate_threshold, args.time_window, args.notification_channel)
+    monitor.create_feature_store_alerts(args.feature_store_id, args.entity_type_id, args.notification_channel)
     monitor.monitor_feature_store(args.feature_store_id, args.entity_type_id)
+    monitor.create_rollback_alert_policy(args.notification_channel)
 
     existing_stats, existing_schema = check_existing_statistics_and_schema(args.project_id, args.model_name, args.bucket_name, args.schema_version)
     current_stats, anomalies = compute_and_store_statistics(args.project_id, args.model_name, args.bucket_name, existing_schema)
@@ -585,10 +666,12 @@ def main():
 
     monitor.monitor_and_trigger_retraining(
         accuracy_threshold=args.absolute_threshold,
+        degradation_rate_threshold=args.degradation_rate_threshold,
         drift_threshold=args.drift_threshold,
         gcs_input=args.gcs_input,
         pipeline_name=args.pipeline_name,
-        notification_channel=args.notification_channel
+        notification_channel_id=args.notification_channel,
+        time_window_seconds=args.time_window
     )
 
     logger.info("Vertex AI monitoring, drift detection, rollback, and retraining setup completed successfully!")
diff --git a/deployment/vertex_ai/vertex_deployment.py b/deployment/vertex_ai/vertex_deployment.py
index 7b3f7ea..1f4e5bb 100644
--- a/deployment/vertex_ai/vertex_deployment.py
+++ b/deployment/vertex_ai/vertex_deployment.py
@@ -3,161 +3,172 @@
 import base64
 import os
 from 
typing import Tuple -from google.cloud import aiplatform, pubsub_v1 -from google.cloud.devtools import cloudbuild_v1 -from google.protobuf import duration_pb2 -from google.cloud import run_v2 +from google.cloud import aiplatform, pubsub_v1, cloudbuild_v1, functions_v1, run_v2, firestore +from google.protobuf import field_mask_pb2 from src.utils.logging_utils import setup_logger, log_error, log_step -from vertex_ai_monitoring import monitor_and_log_rollbacks, monitor_and_trigger_retraining logger = setup_logger('vertex_ai_deployment') -# Set your cooldown period (e.g., 5 minutes = 300 seconds) -COOLDOWN_PERIOD = 300 # Cooldown period in seconds - -# Global variable to store the last build trigger timestamp -LAST_TRIGGER_TIME = 0 - def deploy_to_vertex_ai(project_id: str, model_path: str, endpoint_name: str, model_name: str, canary_traffic_percent: int = 10) -> Tuple[str, str]: """ - Deploy the model to Vertex AI using a canary deployment strategy, checking if an existing model is already deployed. + Deploy the model to Vertex AI using a canary deployment strategy. If the new model fails, allow traffic rollback to the existing model. """ try: log_step(logger, 'Model Deployment to Vertex AI', 'Serving') - + # Initialize Vertex AI aiplatform.init(project=project_id) - + # Upload the model to Vertex AI + logger.debug(f"Uploading model '{model_name}' from path '{model_path}'") model = aiplatform.Model.upload( display_name=model_name, artifact_uri=model_path, serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-6:latest" ) - + model.wait() + logger.info(f"Model '{model_name}' uploaded successfully.") + # Retrieve the endpoint or create a new one if it doesn't exist endpoints = aiplatform.Endpoint.list(filter=f'display_name="{endpoint_name}"') if not endpoints: + logger.info(f"Creating new endpoint '{endpoint_name}'") endpoint = aiplatform.Endpoint.create(display_name=endpoint_name) - traffic_split = {model.resource_name: 100} # 100% traffic to the new model since no model exists + endpoint.wait() + traffic_split = {model.resource_name: 100} # 100% traffic to the new model logger.info("No existing models. Deploying new model with 100% traffic.") else: endpoint = endpoints[0] - + logger.info(f"Using existing endpoint '{endpoint.display_name}'") + # Check if there is an existing model deployed if endpoint.traffic_split: current_model_id = list(endpoint.traffic_split.keys())[0] # Assume single model in the endpoint - previous_traffic_split = endpoint.traffic_split # Save the current traffic split for rollback - - # Apply the canary strategy: split traffic between the existing model and the new model + previous_traffic_split = endpoint.traffic_split.copy() # Save the current traffic split for rollback + + # Apply the canary strategy traffic_split = { current_model_id: 100 - canary_traffic_percent, model.resource_name: canary_traffic_percent } logger.info(f"Canary deployment: {100 - canary_traffic_percent}% to the current model, {canary_traffic_percent}% to the new model.") else: - # No models currently deployed, assign 100% traffic to the new model + # No models currently deployed traffic_split = {model.resource_name: 100} logger.info("No existing traffic split found. 
Assigning 100% traffic to the new model.") # Deploy the model to the endpoint with the traffic split - model_deployment = model.deploy( + logger.debug("Deploying model to endpoint with specified traffic split.") + model.deploy( endpoint=endpoint, machine_type="n1-standard-2", traffic_split=traffic_split, min_replica_count=1, - max_replica_count=3, - accelerator_type=None, - accelerator_count=None, - accelerator_config=None + max_replica_count=3 ) - logger.info(f"Model deployed to Vertex AI endpoint: {endpoint.resource_name}") - # Use the consolidated function to monitor traffic split and set up rollback alerts - monitor_and_log_rollbacks(project_id, endpoint_name) - - # After model deployment, monitor and trigger retraining if necessary - monitor_and_trigger_retraining( - project_id=project_id, - model_name=model_name, - accuracy_threshold=0.85, - drift_threshold=0.05, - gcs_input='gs://your_project/data/', - pipeline_name='your_pipeline_name', - notification_channel='your_notification_channel' - ) - return (endpoint.resource_name, model.resource_name) - + except Exception as e: log_error(logger, e, 'Model Deployment to Vertex AI') - + # Rollback traffic to the previous model if deployment fails - if endpoint and previous_traffic_split: - endpoint.deploy(traffic_split=previous_traffic_split) + if 'endpoint' in locals() and 'previous_traffic_split' in locals(): + endpoint.update_traffic_split(traffic_split=previous_traffic_split) logger.info("Rolled back traffic to the previous model due to deployment failure.") else: logger.error("No previous traffic split available for rollback.") raise - def setup_cloud_build_trigger(project_id: str, repo_name: str, branch_name: str, storage_bucket: str = None): """ - Set up a Cloud Build trigger for continuous training with Pub/Sub and Cloud Function for cooldown. - The trigger can monitor both code changes in the repository and new data in a Cloud Storage bucket. - - :param project_id: GCP Project ID - :param repo_name: Name of the GitHub repository - :param branch_name: Branch to monitor for changes (e.g., 'main') - :param storage_bucket: Optional. Cloud Storage bucket to monitor for new data + Set up a Cloud Build trigger for continuous training. 
""" - client = cloudbuild_v1.CloudBuildClient() - - # Create the Cloud Build trigger - trigger = cloudbuild_v1.BuildTrigger( - name=f"{repo_name}-{branch_name}-trigger", - github=cloudbuild_v1.GitHubEventsConfig( - owner="your-github-username", - name=repo_name, - push=cloudbuild_v1.PushFilter( - branch=branch_name, - included_paths=["pipeline/**", "model/**", "configs/**", "deployment/vertex_ai/**", "src/**", "data/**", "kubeflow/**"], - ignored_paths=["README.md", "docs/**", "*.md"] # Exempt non-critical changes + try: + log_step(logger, 'Setting up Cloud Build Trigger', 'CI/CD Pipeline') + + client = cloudbuild_v1.CloudBuildClient() + + trigger = cloudbuild_v1.BuildTrigger( + name=f"{repo_name}-{branch_name}-trigger", + github=cloudbuild_v1.GitHubEventsConfig( + owner="your-github-username", + name=repo_name, + push=cloudbuild_v1.PushFilter( + branch=f'^{branch_name}$' # Use regex for exact match + ) + ), + filename="cloudbuild.yaml", + included_files=["pipeline/**", "model/**", "configs/**", "deployment/vertex_ai/**", "src/**", "data/**", "kubeflow/**"], + ignored_files=["README.md", "docs/**", "*.md"] # Exempt non-critical changes + ) + + # Optional: Monitor for new data ingestion in the Cloud Storage bucket + if storage_bucket: + trigger.pubsub_config = cloudbuild_v1.PubsubConfig( + topic=f"projects/{project_id}/topics/{storage_bucket}-trigger" ) - ), - filename="cloudbuild.yaml" - ) - - # Optional: Monitor for new data ingestion in the Cloud Storage bucket - if storage_bucket: - trigger.pubsub_config = cloudbuild_v1.PubsubConfig( - topic=f"projects/{project_id}/topics/{storage_bucket}-trigger", - subscription=f"projects/{project_id}/subscriptions/{storage_bucket}-trigger-sub" + + # Create the Cloud Build trigger + parent = f"projects/{project_id}/locations/global" + trigger_response = client.create_build_trigger(parent=parent, trigger=trigger) + logger.info(f"Cloud Build trigger created: {trigger_response.name}") + + # Set up build notifications + notification_config = cloudbuild_v1.NotificationConfig( + filter="build.status in (SUCCESS, FAILURE, INTERNAL_ERROR, TIMEOUT)", + pubsub_topic=f"projects/{project_id}/topics/cloud-builds" ) - # Create the Cloud Build trigger - trigger_response = client.create_build_trigger(parent=f"projects/{project_id}", trigger=trigger) - - print(f"Cloud Build trigger created: {trigger_response.name}") + # Update the trigger with notification config + trigger_response.trigger.notification_config = notification_config + update_mask = field_mask_pb2.FieldMask(paths=['notification_config']) + client.update_build_trigger( + trigger=trigger_response.trigger, + update_mask=update_mask + ) + logger.info("Build status notifications set up for successful and failed builds.") - # Set up build notifications - notification_config = cloudbuild_v1.NotificationConfig( - filter="build.status in (SUCCESS, FAILURE, INTERNAL_ERROR, TIMEOUT)", - pubsub_topic=f"projects/{project_id}/topics/cloud-builds" - ) + return trigger_response.trigger + + except Exception as e: + log_error(logger, e, 'Setting up Cloud Build Trigger') + raise - # Update the trigger with notification config - trigger_response.notification_config = notification_config - client.update_build_trigger( - project_id=project_id, - trigger_id=trigger_response.id, - trigger=trigger_response - ) +def trigger_cloud_build(): + """ + Function to trigger a Cloud Build job using the Cloud Build API. 
+ """ + try: + log_step(logger, 'Triggering Cloud Build Job', 'CI/CD Pipeline') + + client = cloudbuild_v1.CloudBuildClient() + + project_id = os.environ.get('PROJECT_ID') + trigger_id = os.environ.get('TRIGGER_ID') + model_name = os.environ.get('MODEL_NAME', 'default_model_name') + endpoint_name = os.environ.get('ENDPOINT_NAME', 'default_endpoint_name') + + if not project_id or not trigger_id: + raise ValueError("Environment variables 'PROJECT_ID' and 'TRIGGER_ID' must be set.") + + # Trigger the Cloud Build job + operation = client.run_build_trigger( + project_id=project_id, + trigger_id=trigger_id, + source=cloudbuild_v1.RepoSource() + ) + + build = operation.result() + logger.info(f"Triggered Cloud Build job for trigger ID: {trigger_id}, Model: {model_name}, Endpoint: {endpoint_name}") + logger.info(f"Build Status: {build.status.name}") + + except Exception as e: + log_error(logger, e, 'Triggering Cloud Build Job') + raise - print("Build status notifications set up for successful and failed builds.") - return trigger_response def cloud_build_trigger(event, context): """ @@ -165,82 +176,123 @@ def cloud_build_trigger(event, context): It handles the Pub/Sub event and ensures that builds are not triggered more frequently than the specified cooldown period. """ - global LAST_TRIGGER_TIME - current_time = time.time() - - # Decode the Pub/Sub message (if it's base64-encoded) - if 'data' in event: - data = base64.b64decode(event['data']).decode('utf-8') - print(f"Received message: {data}") - - # Check if the cooldown period has passed - if current_time - LAST_TRIGGER_TIME < COOLDOWN_PERIOD: - print("Cooldown period not over. Skipping build trigger.") - return + try: + log_step(logger, 'Cloud Build Trigger Function Invoked', 'Cloud Function') + + # Initialize Firestore client + firestore_client = firestore.Client() + cooldown_collection = firestore_client.collection('cloud_build_cooldown') + cooldown_doc = cooldown_collection.document('last_trigger_time') + + current_time = time.time() + cooldown_period = int(os.environ.get('COOLDOWN_PERIOD', 300)) # Default to 300 seconds if not set + + @firestore.transactional + def update_last_trigger_time(transaction): + doc = cooldown_doc.get(transaction=transaction) + if doc.exists: + last_trigger_time = doc.to_dict().get('timestamp') + if (current_time - last_trigger_time) < cooldown_period: + logger.info("Cooldown period not over. Skipping build trigger.") + return False + # Update the last trigger time + transaction.set(cooldown_doc, {'timestamp': current_time}) + return True + + transaction = firestore_client.transaction() + should_trigger = update_last_trigger_time(transaction) + if not should_trigger: + return + + # Trigger the Cloud Build job since the cooldown period has passed + trigger_cloud_build() + logger.info("Cloud Build job triggered successfully.") - # Trigger the Cloud Build job since the cooldown period has passed - trigger_cloud_build() + except Exception as e: + log_error(logger, e, 'Cloud Build Trigger Function') + raise - # Update the last trigger time - LAST_TRIGGER_TIME = current_time -def trigger_cloud_build(): +def deploy_cloud_function(project_id, region, function_name, entry_point, runtime, trigger_topic, env_vars): """ - Function to trigger a Cloud Build job using the Cloud Build API. + Deploy a Cloud Function using the client library. 
""" - client = cloudbuild_v1.CloudBuildClient() - - project_id = os.environ.get('PROJECT_ID') - trigger_id = os.environ.get('TRIGGER_ID') - model_name = os.environ.get('MODEL_NAME') - endpoint_name = os.environ.get('ENDPOINT_NAME') + try: + log_step(logger, 'Deploying Cloud Function', 'Deployment') + + client = functions_v1.CloudFunctionsServiceClient() + parent = f'projects/{project_id}/locations/{region}' + + # Prepare the Cloud Function source code zip file + # Assuming the function code is in a directory named 'cloud_function_code' + source_archive_url = f'gs://{project_id}-cloud-functions/{function_name}.zip' + # You need to upload the zip file to the specified GCS bucket + + function = functions_v1.CloudFunction( + name=f'{parent}/functions/{function_name}', + entry_point=entry_point, + runtime=runtime, + environment_variables=env_vars, + event_trigger=functions_v1.EventTrigger( + event_type='google.pubsub.topic.publish', + resource=f'projects/{project_id}/topics/{trigger_topic}', + retry_policy=functions_v1.EventTrigger.RetryPolicy.RETRY_POLICY_RETRY + ), + source_archive_url=source_archive_url, + service_account_email=f'{project_id}@appspot.gserviceaccount.com', + ingress_settings=functions_v1.CloudFunction.IngressSettings.ALLOW_ALL + ) - # Trigger the Cloud Build job using the build trigger ID - build = cloudbuild_v1.BuildTrigger( - project_id=project_id, - trigger_id=trigger_id - ) + operation = client.create_function(request={'location': parent, 'function': function}) + response = operation.result() - # Run the build - client.run_build_trigger(project_id=project_id, trigger_id=trigger_id, source=None) - print(f"Triggered Cloud Build job for trigger ID: {trigger_id}, Model: {model_name}, Endpoint: {endpoint_name}") + if response.status == functions_v1.CloudFunctionStatus.ACTIVE: + logger.info(f"Cloud Function '{function_name}' deployed successfully.") + else: + logger.error(f"Cloud Function '{function_name}' deployment failed with status: {response.status}") + except Exception as e: + log_error(logger, e, 'Deploying Cloud Function') + raise def setup_cloud_run(project_id, service_name, image_url, region): - client = run_v2.ServicesClient() - - service = run_v2.Service() - service.template = run_v2.RevisionTemplate() - service.template.containers = [ - run_v2.Container( - image=image_url, - env=[{"name": "ENV_VAR", "value": "production"}], # Optional env vars - resources=run_v2.ResourceRequirements( # Optional resource settings - limits={"cpu": "1", "memory": "512Mi"} + """ + Set up a Cloud Run service. 
+ """ + try: + log_step(logger, 'Setting up Cloud Run Service', 'Deployment') + + client = run_v2.ServicesClient() + + service = run_v2.Service() + service.template = run_v2.RevisionTemplate() + service.template.containers = [ + run_v2.Container( + image=image_url, + env_vars=[run_v2.EnvVar(name="ENV_VAR", value="production")], # Optional env vars + resources=run_v2.ResourceRequirements( # Optional resource settings + limits={"cpu": "1", "memory": "512Mi"} + ) ) - ) - ] - - parent = client.common_location_path(project_id, region) - response = client.create_service( - parent=parent, - service=service, - service_id=service_name - ) - - print(f"Cloud Run service created: {response.name}") - return response + ] + parent = client.common_location_path(project_id, region) + response = client.create_service( + parent=parent, + service=service, + service_id=service_name + ) + logger.info(f"Cloud Run service created: {response.name}") + return response -if __name__ == '__main__': - import argparse - from google.cloud import pubsub_v1 - import os + except Exception as e: + log_error(logger, e, 'Setting up Cloud Run Service') + raise - # Parse arguments for Vertex AI and Cloud Build setup +def main(): parser = argparse.ArgumentParser(description='Deploy to Vertex AI, set up Cloud Build triggers, and configure CI/CD with Cloud Run and Pub/Sub for continuous training') parser.add_argument('--project_id', required=True, help='GCP Project ID') - parser.add_argument('--model_name', required=True, help='Name of the machine learning model') # Model name argument added + parser.add_argument('--model_name', required=True, help='Name of the machine learning model') parser.add_argument('--model_path', required=True, help='Path to the model artifacts') parser.add_argument('--endpoint_name', required=True, help='Name for the Vertex AI endpoint') parser.add_argument('--repo_name', required=True, help='GitHub repository name') @@ -248,58 +300,83 @@ def setup_cloud_run(project_id, service_name, image_url, region): parser.add_argument('--service_name', required=True, help='Cloud Run service name') parser.add_argument('--image_url', required=True, help='Docker image URL for Cloud Run') parser.add_argument('--region', required=True, help='GCP region for deployment') - parser.add_argument('--storage_bucket', required=True, help='Cloud Storage bucket to monitor for new data') - parser.add_argument('--trigger_id', required=True, help='Cloud Build trigger ID for retraining jobs') + parser.add_argument('--storage_bucket', required=False, help='Cloud Storage bucket to monitor for new data') parser.add_argument('--cooldown_period', type=int, default=300, help='Cooldown period in seconds between Cloud Build jobs') + parser.add_argument('--trigger_id', required=True, help='Cloud Build trigger ID for retraining jobs') parser.add_argument('--notification_channel', required=True, help='Notification channel ID for build status notifications') parser.add_argument('--canary_traffic_percent', type=int, default=10, help='Canary traffic split percentage') args = parser.parse_args() - # Step 1: Deploy the model to Vertex AI - print(f"Deploying model '{args.model_name}' to Vertex AI...") - endpoint_name, model_name = deploy_to_vertex_ai( - project_id=args.project_id, - model_path=args.model_path, - endpoint_name=args.endpoint_name, - model_name=args.model_name, - canary_traffic_percent=args.canary_traffic_percent - ) - - # Step 2: Set up Cloud Build trigger - print("Setting up Cloud Build trigger for continuous training...") - 
trigger_response = setup_cloud_build_trigger( - project_id=args.project_id, - repo_name=args.repo_name, - branch_name=args.branch_name, - storage_bucket=args.storage_bucket - ) - - # Step 3: Deploy the Cloud Function for cooldown (Pub/Sub) - print("Setting up Pub/Sub topic and deploying Cloud Function for cooldown mechanism...") - - # Ensure the Pub/Sub topic exists - pubsub_client = pubsub_v1.PublisherClient() - topic_path = pubsub_client.topic_path(args.project_id, 'cloud-build-trigger') - pubsub_client.create_topic(name=topic_path) - - # Deploy Cloud Function for cooldown - os.system(f"gcloud functions deploy cloud_build_trigger --runtime python39 " - f"--trigger-topic cloud-build-trigger " - f"--set-env-vars PROJECT_ID={args.project_id},TRIGGER_ID={args.trigger_id} " - f"--memory=128MB --timeout=300s") - - # Step 4: Set up Cloud Run service for deployment - print("Setting up Cloud Run service for deployment...") - service_response = setup_cloud_run( - project_id=args.project_id, - service_name=args.service_name, - image_url=args.image_url, - region=args.region - ) - - # Output results - print(f"Deployment to Vertex AI completed. Endpoint: {endpoint_name}, Model: {model_name}") - print(f"Cloud Build trigger '{trigger_response.name}' created.") - print(f"Cloud Run service '{service_response.name}' created.") - print("MLOps pipeline with Cloud Build, Pub/Sub, Cloud Function cooldown, and Cloud Run setup completed successfully.") + try: + # Step 1: Deploy the model to Vertex AI + logger.info(f"Deploying model '{args.model_name}' to Vertex AI...") + endpoint_resource_name, model_resource_name = deploy_to_vertex_ai( + project_id=args.project_id, + model_path=args.model_path, + endpoint_name=args.endpoint_name, + model_name=args.model_name, + canary_traffic_percent=args.canary_traffic_percent + ) + + # Step 2: Set up Cloud Build trigger + logger.info("Setting up Cloud Build trigger for continuous training...") + trigger_response = setup_cloud_build_trigger( + project_id=args.project_id, + repo_name=args.repo_name, + branch_name=args.branch_name, + storage_bucket=args.storage_bucket + ) + + # Step 3: Deploy the Cloud Function for cooldown (Pub/Sub) + logger.info("Setting up Pub/Sub topic and deploying Cloud Function for cooldown mechanism...") + + # Ensure the Pub/Sub topic exists + pubsub_client = pubsub_v1.PublisherClient() + topic_path = pubsub_client.topic_path(args.project_id, 'cloud-build-trigger') + try: + pubsub_client.get_topic(request={"topic": topic_path}) + logger.info(f"Pub/Sub topic '{topic_path}' already exists.") + except pubsub_client.exceptions.NotFound: + pubsub_client.create_topic(name=topic_path) + logger.info(f"Created Pub/Sub topic '{topic_path}'.") + + # Deploy Cloud Function for cooldown + deploy_cloud_function( + project_id=args.project_id, + region=args.region, + function_name='cloud_build_trigger', + entry_point='cloud_build_trigger', + runtime='python39', + trigger_topic='cloud-build-trigger', + env_vars={ + 'PROJECT_ID': args.project_id, + 'TRIGGER_ID': args.trigger_id, + 'COOLDOWN_PERIOD': str(args.cooldown_period) + } + ) + + # Step 4: Set up Cloud Run service for deployment + logger.info("Setting up Cloud Run service for deployment...") + service_response = setup_cloud_run( + project_id=args.project_id, + service_name=args.service_name, + image_url=args.image_url, + region=args.region + ) + + # Output results + logger.info(f"Deployment to Vertex AI completed. 
Endpoint: {endpoint_resource_name}, Model: {model_resource_name}") + logger.info(f"Cloud Build trigger '{trigger_response.name}' created.") + logger.info(f"Cloud Run service '{service_response.name}' created.") + logger.info("MLOps pipeline with Cloud Build, Pub/Sub, Cloud Function cooldown, and Cloud Run setup completed successfully.") + + except Exception as e: + log_error(logger, e, 'Main Execution') + raise + + +if __name__ == '__main__': + main() + + diff --git a/kubeflow/components/preprocess/preprocess.py b/kubeflow/components/preprocess/preprocess.py index 6b7966d..8c1b83e 100644 --- a/kubeflow/components/preprocess/preprocess.py +++ b/kubeflow/components/preprocess/preprocess.py @@ -5,7 +5,7 @@ Dataset, ) from typing import NamedTuple -from src.data_processing.data_preprocess import prepare_data +from src.data_processing.data_prep import prepare_data from src.utils.logging_utils import setup_logger, log_error, log_step logger = setup_logger('kubeflow_preprocess') diff --git a/src/utils/logging_utils.py b/src/utils/logging_utils.py index 5f277e1..4bf987c 100644 --- a/src/utils/logging_utils.py +++ b/src/utils/logging_utils.py @@ -7,18 +7,18 @@ def setup_logger(name, log_level=logging.INFO): logger = logging.getLogger(name) logger.setLevel(log_level) - # Check if running on GCP - if os.getenv('KUBERNETES_SERVICE_HOST'): - # Use Google Cloud Logging - client = cloud_logging.Client() - handler = cloud_logging.handlers.CloudLoggingHandler(client) - else: - # Use local file logging - handler = logging.FileHandler(f"{name}.log") - - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - handler.setFormatter(formatter) - logger.addHandler(handler) + if not logger.handlers: + # Check if running on GCP + if os.getenv('KUBERNETES_SERVICE_HOST'): + # Use Google Cloud Logging + client = cloud_logging.Client() + handler = cloud_logging.handlers.CloudLoggingHandler(client) + else: + # Use console logging + handler = logging.StreamHandler() + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) return logger @@ -32,4 +32,4 @@ def log_step(logger, step, component): def log_metric(logger, metric_name, metric_value, component): """Log a metric.""" - logger.info(f"Metric in {component}: {metric_name} = {metric_value}") \ No newline at end of file + logger.info(f"Metric in {component}: {metric_name} = {metric_value}")
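The drift-detection hunks in this patch reduce to a two-sample Kolmogorov-Smirnov test on prediction values. A minimal, self-contained sketch of that check follows; it is illustrative only and not part of the patched files. The synthetic NumPy samples stand in for the values extracted from the TFDV statistics, and the 0.05 threshold mirrors the script's default drift threshold.

import numpy as np
from scipy.stats import ks_2samp

def ks_drift(train_values, serving_values, drift_threshold: float = 0.05):
    """Return (statistic, drift_detected) for two samples of predictions."""
    statistic, p_value = ks_2samp(np.asarray(train_values), np.asarray(serving_values))
    return statistic, statistic > drift_threshold

# Illustrative usage with synthetic, slightly shifted serving data:
rng = np.random.default_rng(0)
stat, drifted = ks_drift(rng.normal(0.0, 1.0, 1000), rng.normal(0.3, 1.0, 1000))
print(f"KS statistic={stat:.3f}, drift detected={drifted}")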
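log_metric and the new log_feature_store_metric write gauge points to custom Cloud Monitoring metrics. The following is a stripped-down sketch of that call pattern with google-cloud-monitoring, assuming a hypothetical project ID and using the simpler "global" monitored resource rather than the Featurestore resource used in the patch.

import time
from google.cloud import monitoring_v3

def write_custom_metric(project_id: str, model_name: str, metric_name: str, value: float) -> None:
    """Write a single gauge point to a custom metric under the model's namespace."""
    client = monitoring_v3.MetricServiceClient()
    series = monitoring_v3.TimeSeries()
    series.metric.type = f"custom.googleapis.com/vertex_ai/{model_name}/{metric_name}"
    series.resource.type = "global"
    series.resource.labels["project_id"] = project_id
    now = time.time()
    interval = monitoring_v3.TimeInterval(
        {"end_time": {"seconds": int(now), "nanos": int((now - int(now)) * 1e9)}}
    )
    point = monitoring_v3.Point({"interval": interval, "value": {"double_value": value}})
    series.points = [point]
    client.create_time_series(name=f"projects/{project_id}", time_series=[series])

# write_custom_metric("my-project", "my-model", "prediction_drift", 0.12)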
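deploy_to_vertex_ai expresses the canary rollout as a plain traffic-split dict keyed by deployed model ID. A small sketch of that computation, with hypothetical model IDs:

from typing import Dict

def canary_traffic_split(current_model_id: str, new_model_id: str,
                         canary_traffic_percent: int = 10) -> Dict[str, int]:
    """Route most traffic to the current model and a small share to the new one."""
    if not 0 < canary_traffic_percent <= 100:
        raise ValueError("canary_traffic_percent must be in (0, 100]")
    if canary_traffic_percent == 100:
        return {new_model_id: 100}
    return {
        current_model_id: 100 - canary_traffic_percent,
        new_model_id: canary_traffic_percent,
    }

# canary_traffic_split("1234567890", "projects/p/locations/l/models/new", 10)
# -> {'1234567890': 90, 'projects/p/locations/l/models/new': 10}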
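The rewritten cloud_build_trigger function replaces the module-level cooldown global with a Firestore transaction. A rough sketch of that gate, assuming the same cloud_build_cooldown/last_trigger_time document used in the patch:

import time
from google.cloud import firestore

def should_trigger_build(cooldown_period: int = 300) -> bool:
    """Return True if the cooldown window has elapsed, updating the timestamp atomically."""
    client = firestore.Client()
    doc_ref = client.collection("cloud_build_cooldown").document("last_trigger_time")

    @firestore.transactional
    def check_and_update(transaction) -> bool:
        snapshot = doc_ref.get(transaction=transaction)
        now = time.time()
        if snapshot.exists:
            last = snapshot.to_dict().get("timestamp", 0)
            if now - last < cooldown_period:
                return False  # still cooling down, skip this trigger
        transaction.set(doc_ref, {"timestamp": now})
        return True

    return check_and_update(client.transaction())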