From 1988e063fda7ecd1e7a8286d9c7c196ff07ccd42 Mon Sep 17 00:00:00 2001 From: JonFillip Date: Mon, 30 Sep 2024 22:32:20 +0100 Subject: [PATCH 1/2] Improving Kubeflow pipeline, refactored data ingestion and processing --- .DS_Store | Bin 6148 -> 8196 bytes .dockerignore | 1 + .github/workflows/ci_pipeline.yml | 386 ++++-- Dockerfile | 23 +- deployment/vertex_ai/vertex_ai_monitoring.py | 1213 +++++++---------- .../data_ingestion/data_ingestion.py | 40 +- kubeflow/components/deploy/deploy.py | 124 +- kubeflow/components/evaluation/evaluation.py | 100 +- .../feature_engineering/feature_eng.py | 87 +- .../components/feature_store/component.yaml | 31 + .../components/feature_store/feature_store.py | 111 ++ .../hyperparameter_tuning.py | 74 +- kubeflow/components/monitoring/monitor.py | 110 +- kubeflow/components/preprocess/preprocess.py | 77 +- kubeflow/components/test/component.yaml | 19 - kubeflow/components/test/test.py | 0 kubeflow/components/train/train.py | 102 +- kubeflow/pipeline.py | 137 +- notebooks/exploratory_analysis.ipynb | 2 +- requirements.txt | 77 +- scripts/run_pipeline.sh | 6 +- src/data_processing/data_ingestion.py | 154 ++- src/data_processing/data_prep.py | 176 +++ src/data_processing/data_preprocess.py | 171 --- src/data_processing/data_process.py | 175 +++ src/data_processing/data_validation.py | 41 +- src/feature_engineering/feat_engineering.py | 436 ++---- src/utils/data_versioning.py | 25 + tests/test_content_based.py | 156 +++ tests/test_data_prep.py | 81 ++ tests/test_data_processing.py | 2 +- tests/test_data_validation.py | 94 ++ tests/test_data_versioning.py | 28 + tests/test_feature_engineering.py | 213 +-- tests/test_feature_store.py | 32 + tests/test_hyperparameter_tuning.py | 75 +- tests/test_integration.py | 80 ++ tests/test_kubeflow_pipeline.py | 62 + tests/test_model_evaluation.py | 77 ++ tests/test_pipeline.py | 19 +- 40 files changed, 2903 insertions(+), 1914 deletions(-) create mode 100644 kubeflow/components/feature_store/component.yaml create mode 100644 kubeflow/components/feature_store/feature_store.py delete mode 100644 kubeflow/components/test/component.yaml delete mode 100644 kubeflow/components/test/test.py create mode 100644 src/data_processing/data_prep.py delete mode 100644 src/data_processing/data_preprocess.py create mode 100644 src/data_processing/data_process.py create mode 100644 src/utils/data_versioning.py create mode 100644 tests/test_content_based.py create mode 100644 tests/test_data_prep.py create mode 100644 tests/test_data_validation.py create mode 100644 tests/test_data_versioning.py create mode 100644 tests/test_feature_store.py create mode 100644 tests/test_integration.py create mode 100644 tests/test_kubeflow_pipeline.py create mode 100644 tests/test_model_evaluation.py diff --git a/.DS_Store b/.DS_Store index 57e63d09b8a9c01b9f06e52fd82ffb0cac8e40c4..9660dd73ee714a9f8113e894c51a71d59408b7d3 100644 GIT binary patch literal 8196 zcmeHM&2G~`5T0$*#%TrQ08%A{WQl7PQb>ysmk`ne2QCeQ1E65XX=>H=hWs={RZ-6H z4!i6=pzph9XFNrRL&A9su(lL9#pET(iB4_>Ui!l9I~Fu zSt1j4Qi(dL^vx>GP?WqKdal4p)e~u3%YbFzJOiA&U!w{Iq^Xn3-!(G%?z2dD{7Cyi z@xpq7S~-z|9xwX}GX9tj4cyl`xE{_0MtPjNW8jJyPjIwHWAY93X9K-*hE7f#=2)zL z7&<)HK22!^yN77on6-zxXU=t@XK1o*htTPOslY@FGhZ^y+yE{Daty4GHpEqc5vF+C z0Y&+l5%ZPRF3MYC+Lg1EVID24WzhwU>|rjRfn&b1xD~b=vuC@eu!22jdW8AP>ohZw z!S+Pv+=Z#E4%@@gWZ2Cdsp}bz9K;*E3_Etkrb*s@W+jI&2&!j}%Y5W;%b1(^2zbK= z1x|9!> $GITHUB_OUTPUT + + - name: Deploy to Cloud Run + env: + IMAGE_TAG: gcr.io/${{ env.PROJECT_ID }}/${{ env.IMAGE }}:${{ github.sha }} + run: | + set -e + 
gcloud run deploy ${{ env.IMAGE }} \ + --image $IMAGE_TAG \ + --region ${{ env.REGION }} \ + --platform managed \ + --allow-unauthenticated + + - name: Health check Cloud Run service + run: | + set -e + SERVICE_URL=$(gcloud run services describe ${{ env.IMAGE }} \ + --platform managed \ + --region ${{ env.REGION }} \ + --format='value(status.url)') + if ! curl -fsSL ${SERVICE_URL}/health; then + echo "Deployment failed. Rolling back to previous revision..." + gcloud run services update-traffic ${{ env.IMAGE }} \ + --platform managed \ + --region ${{ env.REGION }} \ + --to-revisions=${{ steps.current_revision.outputs.current_revision }}=100 + exit 1 + fi + + deploy-vertex-ai: + needs: build-and-push + runs-on: ubuntu-latest + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + permissions: + contents: read + id-token: write + steps: + - uses: actions/checkout@v3 + + - name: Set up Cloud SDK + uses: google-github-actions/setup-gcloud@v1 + + - name: Authenticate to Google Cloud + uses: google-github-actions/auth@v1 + with: + workload_identity_provider: ${{ env.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ env.GCP_SA_EMAIL }} + + - name: Verify GCP Authentication + run: gcloud auth list + + - name: Trigger Cloud Build + run: | + set -e + gcloud builds submit --config cloudbuild.yaml \ + --substitutions=_REGION=${{ env.REGION }} + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install Kubeflow Pipelines SDK + run: | + set -e + python -m pip install --upgrade pip + pip install kfp==2.0.0 google-cloud-aiplatform==1.28.0 + + - name: Deploy to Vertex AI + run: | + set -e + python kubeflow/pipeline.py \ + --project_id ${{ env.PROJECT_ID }} \ + --region ${{ env.REGION }} \ + --pipeline_root gs://${{ env.GCS_BUCKET }}/pipeline_root \ + --output_file pipeline.json + python deployment/vertex_ai/vertex_deployment.py \ + --project_id ${{ env.PROJECT_ID }} \ + --region ${{ env.REGION }} \ + --pipeline_spec pipeline.json \ + --pipeline_root gs://${{ env.GCS_BUCKET }}/pipeline_root \ + --model_name_file model_name.txt + + - name: Upload model name + if: always() + uses: actions/upload-artifact@v3 + with: + name: model-name + path: model_name.txt + + - name: Check Vertex AI model deployment status + run: | + set -e + MODEL_NAME=$(cat model_name.txt) + MODEL_STATUS=$(gcloud ai models describe $MODEL_NAME --region ${{ env.REGION }} --format="value(state)") + if [[ "$MODEL_STATUS" != "DEPLOYED" ]]; then + echo "Model not deployed successfully, exiting." + exit 1 + fi + + final-health-check: + needs: [deploy-cloud-run, deploy-vertex-ai] + runs-on: ubuntu-latest + permissions: + contents: read + id-token: write steps: - - uses: actions/checkout@v2 - - # Authenticate with Google Cloud - - name: Set up Cloud SDK - uses: google-github-actions/setup-gcloud@v0.2.0 - with: - project_id: ${{ secrets.GCP_PROJECT_ID }} - service_account_key: ${{ secrets.GCP_SA_KEY }} - export_default_credentials: true - - # Verify GCP Authentication - - name: Verify GCP Authentication - run: gcloud auth list - - # Validate the repository structure - - name: Validate repository structure - run: | - if [ ! -f "cloudbuild.yaml" ]; then - echo "cloudbuild.yaml not found, failing build." - exit 1 - fi - if [ ! -f "deployment/deploy_pipeline.py" ]; then - echo "deploy_pipeline.py not found, failing build." 
- exit 1 - fi - - # Trigger Cloud Build - - name: Trigger Cloud Build - run: | - gcloud builds submit --config cloudbuild.yaml \ - --substitutions=_REGION=us-central1 - - # Run smoke tests in Docker before deploying to Vertex AI - - name: Smoke Test - run: docker run music-recommender python -m unittest discover tests - - # Deploy to Vertex AI with pinned versions of kfp and google-cloud-aiplatform - - name: Deploy to Vertex AI - run: | - pip install 'kfp==1.7.2' 'google-cloud-aiplatform==1.7.1' - python deployment/deploy_pipeline.py \ - --platform vertex \ - --project_id ${{ secrets.GCP_PROJECT_ID }} \ - --region us-central1 \ - --output_file pipeline.yaml + - name: Set up Cloud SDK + uses: google-github-actions/setup-gcloud@v1 + + - name: Authenticate to Google Cloud + uses: google-github-actions/auth@v1 + with: + workload_identity_provider: ${{ env.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ env.GCP_SA_EMAIL }} + + - name: Download model name + uses: actions/download-artifact@v3 + with: + name: model-name + path: ./ + + - name: Final Health Checks + run: | + set -e + # Check Cloud Run + SERVICE_URL=$(gcloud run services describe ${{ env.IMAGE }} \ + --platform managed \ + --region ${{ env.REGION }} \ + --format='value(status.url)') + CLOUD_RUN_STATUS=$(curl -s -o /dev/null -w "%{http_code}" $SERVICE_URL/health) + + # Check Vertex AI + MODEL_NAME=$(cat model_name.txt) + VERTEX_AI_STATUS=$(gcloud ai models describe $MODEL_NAME --region ${{ env.REGION }} --format="value(state)") + + if [[ "$CLOUD_RUN_STATUS" != "200" || "$VERTEX_AI_STATUS" != "DEPLOYED" ]]; then + echo "Final health check failed. Cloud Run status: $CLOUD_RUN_STATUS, Vertex AI status: $VERTEX_AI_STATUS" + exit 1 + fi + + echo "All systems operational. Cloud Run status: $CLOUD_RUN_STATUS, Vertex AI status: $VERTEX_AI_STATUS" diff --git a/Dockerfile b/Dockerfile index 84f5623..8a0e989 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,12 +1,24 @@ -FROM python:3.9-slim +# Use Python 3.10 slim base image +FROM python:3.10-slim # Set working directory WORKDIR /app -# Copy requirements file +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + gcc \ + libffi-dev \ + libssl-dev \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements.txt COPY requirements.txt . -# Install dependencies +# Upgrade pip +RUN pip install --upgrade pip + +# Install Python dependencies RUN pip install --no-cache-dir -r requirements.txt # Copy the entire project @@ -15,10 +27,7 @@ COPY . . 
# Set environment variables ENV PYTHONPATH=/app -# Install additional dependencies for Kubeflow and Vertex AI -RUN pip install --no-cache-dir kfp google-cloud-aiplatform - -# Make sure all scripts are executable +# Make scripts executable RUN chmod +x src/*.py kubeflow/components/*/*.py # Run tests by default diff --git a/deployment/vertex_ai/vertex_ai_monitoring.py b/deployment/vertex_ai/vertex_ai_monitoring.py index 31546a6..af3cefc 100644 --- a/deployment/vertex_ai/vertex_ai_monitoring.py +++ b/deployment/vertex_ai/vertex_ai_monitoring.py @@ -1,10 +1,3 @@ -from scipy import stats -from scipy.stats import ks_2samp -from google.cloud import monitoring_v3, storage, bigquery, aiplatform -from google.api import label_pb2 as ga_label -from google.api import metric_pb2 as ga_metric -from google.protobuf import duration_pb2 as duration -from src.data_processing.data_validation import generate_schema, validate_data, load_config, load_statistics_from_gcs, load_schema_from_gcs, compare_statistics, compare_schemas import yaml import tensorflow_data_validation as tfdv import argparse @@ -12,301 +5,422 @@ import pandas as pd import datetime import random +from typing import Dict, List, Optional, Tuple, Union +from scipy.stats import ks_2samp +from google.cloud import monitoring_v3, storage, bigquery, aiplatform +from google.api import label_pb2 as ga_label +from google.api import metric_pb2 as ga_metric +from google.protobuf import duration_pb2 as duration +from src.data_processing.data_validation import ( + generate_schema, + validate_data, + load_config, + load_statistics_from_gcs, + load_schema_from_gcs, + compare_statistics, + compare_schemas, + save_statistics_to_gcs, + save_schema_to_gcs +) from src.utils.logging_utils import setup_logger, log_error, log_step +from ml_metadata import metadata_store +from ml_metadata.proto import metadata_store_pb2 logger = setup_logger('vertex_ai_pipeline_monitoring') -def setup_vertex_ai_monitoring(project_id, model_name): - """Sets up custom metrics for Vertex AI monitoring in Cloud Monitoring.""" - client = monitoring_v3.MetricServiceClient() - project_name = f"projects/{project_id}" - - # Define metrics - metrics = [ - { - "type": f"custom.googleapis.com/vertex_ai/{model_name}/prediction_drift", - "kind": ga_metric.MetricDescriptor.MetricKind.GAUGE, - "value_type": ga_metric.MetricDescriptor.ValueType.DOUBLE, - "description": "Prediction drift metric for Vertex AI model" - }, - { - "type": f"custom.googleapis.com/vertex_ai/{model_name}/data_drift", - "kind": ga_metric.MetricDescriptor.MetricKind.GAUGE, - "value_type": ga_metric.MetricDescriptor.ValueType.DOUBLE, - "description": "Data drift metric for Vertex AI model" - }, - { - "type": f"custom.googleapis.com/vertex_ai/{model_name}/prediction_latency", - "kind": ga_metric.MetricDescriptor.MetricKind.GAUGE, - "value_type": ga_metric.MetricDescriptor.ValueType.INT64, - "description": "Latency of prediction requests in milliseconds" - }, - { - "type": f"custom.googleapis.com/vertex_ai/{model_name}/accuracy", - "kind": ga_metric.MetricDescriptor.MetricKind.GAUGE, - "value_type": ga_metric.MetricDescriptor.ValueType.DOUBLE, - "description": "Accuracy of the Vertex AI model" - }, - { - "type": f"custom.googleapis.com/vertex_ai/{model_name}/schema_drift", - "kind": ga_metric.MetricDescriptor.MetricKind.GAUGE, - "value_type": ga_metric.MetricDescriptor.ValueType.INT64, # 0 or 1 indicating schema drift - "description": "Schema drift metric for Vertex AI model" - }, - { - "type": 
f"custom.googleapis.com/vertex_ai/{model_name}/missing_schema", - "kind": ga_metric.MetricDescriptor.MetricKind.GAUGE, - "value_type": ga_metric.MetricDescriptor.ValueType.INT64, # 0 or 1 indicating missing schema - "description": "Indicates whether the baseline schema is missing" - }, - { - "type": f"custom.googleapis.com/vertex_ai/{model_name}/missing_statistics", - "kind": ga_metric.MetricDescriptor.MetricKind.GAUGE, - "value_type": ga_metric.MetricDescriptor.ValueType.INT64, # 0 or 1 indicating missing schema - "description": "Indicates whether the baseline statistics is missing" - } - ] - - for metric in metrics: - descriptor = ga_metric.MetricDescriptor() - descriptor.type = metric["type"] - descriptor.metric_kind = metric["kind"] - descriptor.value_type = metric["value_type"] - descriptor.description = metric["description"] - - descriptor = client.create_metric_descriptor( - name=project_name, - metric_descriptor=descriptor +class VertexAIMonitoring: + def __init__(self, project_id: str, model_name: str, bucket_name: str): + self.project_id = project_id + self.model_name = model_name + self.bucket_name = bucket_name + self.client = monitoring_v3.MetricServiceClient() + self.project_name = f"projects/{project_id}" + self.feature_store_client = aiplatform.FeatureStore(project=project_id) + self.mlmd_connection_config = metadata_store_pb2.ConnectionConfig() + self.mlmd_connection_config.sqlite.filename_uri = f"gs://{bucket_name}/mlmd/metadata.db" + self.mlmd_store = metadata_store.MetadataStore(self.mlmd_connection_config) + + def setup_custom_metrics(self) -> None: + """Sets up custom metrics for Vertex AI monitoring in Cloud Monitoring.""" + metrics = [ + self._create_metric_descriptor("prediction_drift", "Prediction drift metric"), + self._create_metric_descriptor("data_drift", "Data drift metric"), + self._create_metric_descriptor("prediction_latency", "Latency of prediction requests", value_type=ga_metric.MetricDescriptor.ValueType.INT64), + self._create_metric_descriptor("accuracy", "Accuracy of the model"), + self._create_metric_descriptor("schema_drift", "Schema drift metric", value_type=ga_metric.MetricDescriptor.ValueType.INT64), + self._create_metric_descriptor("missing_schema", "Indicates missing baseline schema", value_type=ga_metric.MetricDescriptor.ValueType.INT64), + self._create_metric_descriptor("missing_statistics", "Indicates missing baseline statistics", value_type=ga_metric.MetricDescriptor.ValueType.INT64), + self._create_metric_descriptor("feature_store_read_count", "Number of read operations from the feature store", value_type=ga_metric.MetricDescriptor.ValueType.INT64), + self._create_metric_descriptor("feature_store_write_count", "Number of write operations to the feature store", value_type=ga_metric.MetricDescriptor.ValueType.INT64), + self._create_metric_descriptor("feature_store_latency", "Latency of feature store operations", value_type=ga_metric.MetricDescriptor.ValueType.DISTRIBUTION), + ] + + for metric in metrics: + descriptor = self.client.create_metric_descriptor( + name=self.project_name, + metric_descriptor=metric + ) + logger.info(f"Created {descriptor.name}") + + def _create_metric_descriptor(self, metric_name: str, description: str, value_type: int = ga_metric.MetricDescriptor.ValueType.DOUBLE) -> ga_metric.MetricDescriptor: + return ga_metric.MetricDescriptor( + type=f"custom.googleapis.com/vertex_ai/{self.model_name}/{metric_name}", + metric_kind=ga_metric.MetricDescriptor.MetricKind.GAUGE, + value_type=value_type, + description=description ) 
- print(f"Created {descriptor.name}") + def create_alert_policy(self, display_name: str, filter_str: str, threshold: float, duration_seconds: int, comparison: int) -> None: + """Creates an alert policy in Google Cloud Monitoring.""" + client = monitoring_v3.AlertPolicyServiceClient() + + condition = { + "display_name": display_name, + "condition_threshold": { + "filter": filter_str, + "comparison": comparison, + "threshold_value": threshold, + "duration": duration.Duration(seconds=duration_seconds) + } + } -def create_accuracy_degradation_alert(project_id: str, model_name: str, absolute_threshold: float, degradation_rate_threshold: float, time_window_seconds: int = 86400): - """ - Create an alert in Google Cloud Monitoring for model accuracy degradation. - The alert will trigger if: - 1. The accuracy falls below an absolute threshold. - 2. The accuracy degrades by a certain rate within a specified time window. - - :param project_id: GCP Project ID - :param model_name: Name of the model - :param absolute_threshold: The absolute accuracy threshold to trigger an alert (e.g., accuracy < 0.85). - :param degradation_rate_threshold: The degradation rate threshold over time (e.g., 0.05 for a 5% drop). - :param time_window_seconds: The time window to monitor for accuracy degradation (default is 24 hours). - """ - client = monitoring_v3.AlertPolicyServiceClient() - project_name = f"projects/{project_id}" - - # Condition 1: Absolute accuracy degradation - absolute_condition = { - "display_name": "Accuracy below absolute threshold", - "condition_threshold": { - "filter": f'metric.type="custom.googleapis.com/vertex_ai/{model_name}/accuracy"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_LT, - "threshold_value": absolute_threshold, - "duration": {"seconds": 300}, # Trigger if accuracy stays below threshold for 5 minutes - "aggregations": [{ - "alignment_period": {"seconds": 300}, - "per_series_aligner": monitoring_v3.Aggregation.Aligner.ALIGN_MEAN, - }] + alert_policy = { + "display_name": f"{self.model_name} {display_name}", + "conditions": [condition], + "notification_channels": [f"projects/{self.project_id}/notificationChannels/your-channel-id"], + "combiner": monitoring_v3.AlertPolicy.Combiner.OR, } - } - - # Condition 2: Degradation rate over time - degradation_condition = { - "display_name": "Accuracy degradation over time", - "condition_threshold": { - "filter": f'metric.type="custom.googleapis.com/vertex_ai/{model_name}/accuracy"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_LT, - "threshold_value": degradation_rate_threshold, # Set degradation rate threshold - "duration": {"seconds": time_window_seconds}, # Time window (e.g., 24 hours) - "aggregations": [{ - "alignment_period": {"seconds": time_window_seconds}, - "per_series_aligner": monitoring_v3.Aggregation.Aligner.ALIGN_DELTA, - }] + + policy = client.create_alert_policy( + name=self.project_name, + alert_policy=alert_policy + ) + logger.info(f"Created alert policy: {policy.name}") + + def create_accuracy_degradation_alert(self, absolute_threshold: float, degradation_rate_threshold: float, time_window_seconds: int = 86400) -> None: + """Creates an alert for accuracy degradation.""" + self.create_alert_policy( + "Accuracy below absolute threshold", + f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/accuracy"', + absolute_threshold, + 300, + monitoring_v3.ComparisonType.COMPARISON_LT + ) + self.create_alert_policy( + "Accuracy degradation over time", + 
f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/accuracy"', + degradation_rate_threshold, + time_window_seconds, + monitoring_v3.ComparisonType.COMPARISON_LT + ) + + def create_resource_utilization_alert(self) -> None: + """Creates alerts for resource utilization (CPU, memory, and GPU).""" + resources = [ + ("CPU", "compute.googleapis.com/instance/cpu/utilization"), + ("Memory", "compute.googleapis.com/instance/memory/utilization"), + ("GPU", "compute.googleapis.com/instance/gpu/utilization") + ] + + for resource_name, metric_type in resources: + self.create_alert_policy( + f"High {resource_name} utilization", + f'metric.type="{metric_type}"', + 0.8, # 80% utilization threshold + 300, # 5 minutes duration + monitoring_v3.ComparisonType.COMPARISON_GT + ) + + def log_metric(self, metric_name: str, value: Union[float, int]) -> None: + """Logs a metric to Google Cloud Monitoring.""" + series = monitoring_v3.TimeSeries() + series.metric.type = f"custom.googleapis.com/vertex_ai/{self.model_name}/{metric_name}" + series.resource.type = "aiplatform.googleapis.com/Endpoint" + series.resource.labels["model_name"] = self.model_name + point = series.points.add() + if isinstance(value, float): + point.value.double_value = value + else: + point.value.int64_value = value + now = datetime.datetime.now() + point.interval.end_time.seconds = int(now.timestamp()) + point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) + self.client.create_time_series(name=self.project_name, time_series=[series]) + logger.info(f"Logged {metric_name} for model {self.model_name}: {value}") + + def detect_data_drift(self, drift_threshold: float) -> Optional[float]: + """Detects data drift by comparing current (serving) statistics with baseline (training) statistics.""" + try: + log_step(logger, 'Detecting Data Drift', 'Data Drift Detection') + today = datetime.datetime.now().strftime("%Y%m%d") + + baseline_stats = load_statistics_from_gcs(self.bucket_name, self.model_name, 'train', today) + if not baseline_stats: + self.log_metric("missing_statistics", 1) + return None + + serving_stats = load_statistics_from_gcs(self.bucket_name, self.model_name, 'serving', today) + if not serving_stats: + self.log_metric("missing_statistics", 1) + return None + + schema = load_schema_from_gcs(self.bucket_name, self.model_name, 'current') + if not schema: + self.log_metric("missing_schema", 1) + return None + + anomalies = compare_statistics(baseline_stats, serving_stats, schema) + + drift_score = 0 + for feature, anomaly in anomalies.anomaly_info.items(): + if anomaly: + logger.warning(f"Data drift detected for feature {feature}: {anomaly.description}") + drift_score += anomaly.severity + self.log_metric("data_drift", anomaly.severity) + + if drift_score > drift_threshold: + logger.warning(f"Significant data drift detected. Drift score: {drift_score} > {drift_threshold}") + else: + logger.info(f"No significant data drift detected. 
Drift score: {drift_score} <= {drift_threshold}") + + # Log drift detection results to MLMD + self._log_drift_detection_to_mlmd(drift_score, drift_threshold) + + return drift_score + + except Exception as e: + log_error(logger, e, 'Data Drift Detection') + return None + + def _log_drift_detection_to_mlmd(self, drift_score: float, drift_threshold: float): + """Log drift detection results to ML Metadata.""" + execution = metadata_store_pb2.Execution() + execution.type = "DataDriftDetection" + execution.properties["model_name"].string_value = self.model_name + execution.properties["drift_score"].double_value = drift_score + execution.properties["drift_threshold"].double_value = drift_threshold + execution.properties["timestamp"].string_value = datetime.datetime.now().isoformat() + + execution_id = self.mlmd_store.put_executions([execution])[0] + logger.info(f"Logged drift detection results to MLMD with execution ID: {execution_id}") + + def detect_prediction_drift(self, drift_threshold: float) -> Optional[float]: + """Detects prediction drift using the Kolmogorov-Smirnov (KS) test.""" + try: + log_step(logger, 'Detecting Prediction Drift', 'Prediction Drift Detection') + today = datetime.datetime.now().strftime("%Y%m%d") + + train_stats = load_statistics_from_gcs(self.bucket_name, self.model_name, 'train', today) + if not train_stats: + self.log_metric("missing_statistics", 1) + return None + + serving_stats = load_statistics_from_gcs(self.bucket_name, self.model_name, 'serving', today) + if not serving_stats: + self.log_metric("missing_statistics", 1) + return None + + train_predictions = train_stats.datasets[0].features['similar_tracks'].num_stats.histograms[0].buckets + serving_predictions = serving_stats.datasets[0].features['similar_tracks'].num_stats.histograms[0].buckets + + train_counts = [bucket.sample_count for bucket in train_predictions] + serving_counts = [bucket.sample_count for bucket in serving_predictions] + + statistic, _ = ks_2samp(train_counts, serving_counts) + + self.log_metric("prediction_drift", statistic) + + if statistic > drift_threshold: + logger.warning(f"Prediction drift detected: KS statistic = {statistic}") + else: + logger.info(f"No significant prediction drift detected") + + # Log prediction drift results to MLMD + self._log_prediction_drift_to_mlmd(statistic, drift_threshold) + + return statistic + + except Exception as e: + log_error(logger, e, 'Prediction Drift Detection') + return None + + def _log_prediction_drift_to_mlmd(self, statistic: float, drift_threshold: float): + """Log prediction drift results to ML Metadata.""" + execution = metadata_store_pb2.Execution() + execution.type = "PredictionDriftDetection" + execution.properties["model_name"].string_value = self.model_name + execution.properties["ks_statistic"].double_value = statistic + execution.properties["drift_threshold"].double_value = drift_threshold + execution.properties["timestamp"].string_value = datetime.datetime.now().isoformat() + + execution_id = self.mlmd_store.put_executions([execution])[0] + logger.info(f"Logged prediction drift results to MLMD with execution ID: {execution_id}") + + def detect_schema_drift(self, schema_version: str) -> Optional[bool]: + """Detects schema drift by comparing the current schema with the baseline (training) schema.""" + try: + log_step(logger, 'Detecting Schema Drift', 'Schema Drift Detection') + + baseline_schema = load_schema_from_gcs(self.bucket_name, self.model_name, schema_version) + if not baseline_schema: + self.log_metric("missing_schema", 1) 
+ return None + + current_schema = load_schema_from_gcs(self.bucket_name, self.model_name, 'serving_schema_version') + + schema_drift_detected = compare_schemas(baseline_schema, current_schema) + + if schema_drift_detected: + logger.info(f"Schema drift detected for model {self.model_name}.") + self.log_metric("schema_drift", 1) + else: + logger.info(f"No schema drift detected for model {self.model_name}.") + self.log_metric("schema_drift", 0) + + # Log schema drift results to MLMD + self._log_schema_drift_to_mlmd(schema_drift_detected) + + return schema_drift_detected + + except Exception as e: + log_error(logger, e, 'Schema Drift Detection') + return None + + def _log_schema_drift_to_mlmd(self, schema_drift_detected: bool): + """Log schema drift results to ML Metadata.""" + execution = metadata_store_pb2.Execution() + execution.type = "SchemaDriftDetection" + execution.properties["model_name"].string_value = self.model_name + execution.properties["schema_drift_detected"].int_value = int(schema_drift_detected) + execution.properties["timestamp"].string_value = datetime.datetime.now().isoformat() + + execution_id = self.mlmd_store.put_executions([execution])[0] + logger.info(f"Logged schema drift results to MLMD with execution ID: {execution_id}") + + def monitor_traffic_split(self, endpoint_name: str) -> Optional[Dict[str, float]]: + """Monitor the traffic split in Vertex AI to detect rollback.""" + try: + log_step(logger, 'Monitoring traffic split', 'Rollback Monitoring') + + aiplatform.init(project=self.project_id) + + endpoints = aiplatform.Endpoint.list(filter=f'display_name="{endpoint_name}"') + if not endpoints: + log_error(logger, f"Endpoint {endpoint_name} not found.", "Rollback Monitoring") + return None + endpoint = endpoints[0] + + traffic_split = endpoint.traffic_split + for model_id, traffic_percentage in traffic_split.items(): + logger.info(f"Model {model_id} is receiving {traffic_percentage}% of the traffic.") + + if sum(traffic_split.values()) != 100: + logger.warning("Traffic split does not sum to 100%, indicating a possible rollback.") + + # Log traffic split to MLMD + self._log_traffic_split_to_mlmd(traffic_split) + + return traffic_split + except Exception as e: + log_error(logger, e, "Rollback Monitoring") + raise + + def _log_traffic_split_to_mlmd(self, traffic_split: Dict[str, float]): + """Log traffic split to ML Metadata.""" + execution = metadata_store_pb2.Execution() + execution.type = "TrafficSplitMonitoring" + execution.properties["model_name"].string_value = self.model_name + for model_id, percentage in traffic_split.items(): + execution.properties[f"traffic_{model_id}"].double_value = percentage + execution.properties["timestamp"].string_value = datetime.datetime.now().isoformat() + + execution_id = self.mlmd_store.put_executions([execution])[0] + logger.info(f"Logged traffic split to MLMD with execution ID: {execution_id}") + + def trigger_retraining_pipeline(self, pipeline_name: str, gcs_input: str) -> str: + """Trigger a Vertex AI pipeline for continuous retraining.""" + aiplatform.init(project=self.project_id) + + pipeline_params = { + 'input_data': gcs_input, + 'model_name': self.model_name } - } - - # Create the alert policy - alert_policy = { - "display_name": f"Accuracy Degradation Alert for {model_name}", - "conditions": [absolute_condition, degradation_condition], - "notification_channels": [f"projects/{project_id}/notificationChannels/your-channel-id"], # Replace with actual channel - "combiner": monitoring_v3.AlertPolicy.Combiner.OR, # Trigger if 
either condition is met - "enabled": True - } - - # Apply the policy to the project - policy = client.create_alert_policy( - name=project_name, - alert_policy=alert_policy - ) - logger.info(f"Created accuracy degradation alert policy: {policy.name}") - return policy + pipeline_job = aiplatform.PipelineJob( + display_name=f'Retraining - {self.model_name}', + template_path=f'gs://{pipeline_name}', + parameter_values=pipeline_params + ) + pipeline_job.run() + logger.info(f"Triggered retraining pipeline for {self.model_name}.") -def create_data_drift_alert(project_id, model_name): - client = monitoring_v3.AlertPolicyServiceClient() - project_name = f"projects/{project_id}" + # Log retraining trigger to MLMD + self._log_retraining_trigger_to_mlmd(pipeline_job.resource_name) - alert_policy = { - "display_name": f"{model_name} Data Drift Alert", - "conditions": [{ - "display_name": "Data drift exceeds threshold", - "condition_threshold": { - "filter": f'metric.type="custom.googleapis.com/vertex_ai/{model_name}/data_drift"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 0.1, - "duration": duration.Duration(seconds=300) - } - }], - "notification_channels": [f"projects/{project_id}/notificationChannels/your-channel-id"], - "combiner": monitoring_v3.AlertPolicy.Combiner.OR, - } - - policy = client.create_alert_policy( - name=project_name, - alert_policy=alert_policy - ) - print(f"Created alert policy: {policy.name}") + return pipeline_job.resource_name -def create_prediction_drift_alert(project_id, model_name): - client = monitoring_v3.AlertPolicyServiceClient() - project_name = f"projects/{project_id}" + def _log_retraining_trigger_to_mlmd(self, pipeline_job_id: str): + """Log retraining trigger to ML Metadata.""" + execution = metadata_store_pb2.Execution() + execution.type = "RetrainingTrigger" + execution.properties["model_name"].string_value = self.model_name + execution.properties["pipeline_job_id"].string_value = pipeline_job_id + execution.properties["timestamp"].string_value = datetime.datetime.now().isoformat() - alert_policy = { - "display_name": f"{model_name} Prediction Drift Alert", - "conditions": [{ - "display_name": "Prediction drift exceeds threshold", + execution_id = self.mlmd_store.put_executions([execution])[0] + logger.info(f"Logged retraining trigger to MLMD with execution ID: {execution_id}") + + def setup_retraining_job_alert(self, notification_channel: str) -> None: + """Set up a Cloud Monitoring alert for Vertex AI retraining jobs.""" + condition = { + "display_name": "Vertex AI Retraining Job Created", "condition_threshold": { - "filter": f'metric.type="custom.googleapis.com/vertex_ai/{model_name}/prediction_drift"', + "filter": 'resource.type="aiplatform.googleapis.com/PipelineJob" AND protoPayload.methodName="google.cloud.aiplatform.v1.PipelineService.CreatePipelineJob"', "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 0.1, - "duration": duration.Duration(seconds=300) + "threshold_value": 0, + "duration": {"seconds": 60}, } - }], - "notification_channels": [f"projects/{project_id}/notificationChannels/your-channel-id"], - "combiner": monitoring_v3.AlertPolicy.Combiner.OR, - } - - policy = client.create_alert_policy( - name=project_name, - alert_policy=alert_policy - ) - print(f"Created alert policy: {policy.name}") + } -def create_schema_drift_alert(project_id: str, model_name: str): - client = monitoring_v3.AlertPolicyServiceClient() - project_name = f"projects/{project_id}" + alert_policy = { + 
"display_name": "Retraining Job Alert", + "conditions": [condition], + "notification_channels": [notification_channel], + "enabled": True, + "combiner": monitoring_v3.AlertPolicy.Combiner.OR + } - # Create the alert policy definition - alert_policy = monitoring_v3.AlertPolicy( - display_name=f"{model_name} Schema Drift Alert", - conditions=[{ - "display_name": "Schema Drift Detected", - "condition_threshold": { - "filter": f'metric.type="custom.googleapis.com/vertex_ai/{model_name}/schema_drift"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 1, - "duration": duration.Duration(seconds=300) # Set duration for continuous drift - } - }], - notification_channels=[f"projects/{project_id}/notificationChannels/your-channel-id"], # Replace with actual channel ID - combiner=monitoring_v3.AlertPolicy.Combiner.OR, # How to combine multiple conditions - enabled=True - ) + client = monitoring_v3.AlertPolicyServiceClient() + policy = client.create_alert_policy( + name=self.project_name, + alert_policy=alert_policy + ) - # Apply the alert policy - policy = client.create_alert_policy( - name=project_name, - alert_policy=alert_policy - ) + logger.info(f"Created retraining job alert policy: {policy.name}") - print(f"Schema drift alert policy created: {policy.name}") - -def create_resource_utilization_alert(project_id, model_name): - client = monitoring_v3.AlertPolicyServiceClient() - project_name = f"projects/{project_id}" - - alert_policy = { - "display_name": f"{model_name} Resource Utilization Alert", - "conditions": [ - { - "display_name": "High CPU utilization", - "condition_threshold": { - "filter": 'metric.type="compute.googleapis.com/instance/cpu/utilization"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 0.8, - "duration": duration.Duration(seconds=300) - } - }, - { - "display_name": "High memory utilization", - "condition_threshold": { - "filter": 'metric.type="compute.googleapis.com/instance/memory/utilization"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 0.8, - "duration": duration.Duration(seconds=300) - } - }, - { - "display_name": "High GPU utilization", - "condition_threshold": { - "filter": 'metric.type="compute.googleapis.com/instance/gpu/utilization"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 0.8, - "duration": duration.Duration(seconds=300) - } - } - ], - "notification_channels": [f"projects/{project_id}/notificationChannels/your-channel-id"], - "combiner": monitoring_v3.AlertPolicy.Combiner.OR, - } - - policy = client.create_alert_policy( - name=project_name, - alert_policy=alert_policy - ) - print(f"Created alert policy: {policy.name}") + def monitor_and_trigger_retraining(self, accuracy_threshold: float, drift_threshold: float, gcs_input: str, pipeline_name: str, notification_channel: str) -> None: + """Monitor model accuracy, data drift, and prediction drift, and trigger retraining when necessary.""" + self.create_accuracy_degradation_alert(accuracy_threshold, 0.05) -def create_latency_alert(project_id, model_name): - client = monitoring_v3.AlertPolicyServiceClient() - project_name = f"projects/{project_id}" + data_drift_detected = self.detect_data_drift(drift_threshold) + prediction_drift_detected = self.detect_prediction_drift(drift_threshold) - alert_policy = { - "display_name": f"{model_name} Prediction Latency Alert", - "conditions": [{ - "display_name": "High prediction latency", - "condition_threshold": { - "filter": 
f'metric.type="custom.googleapis.com/vertex_ai/{model_name}/prediction_latency"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 1000, # 1000 ms - "duration": duration.Duration(seconds=60) - } - }], - "notification_channels": [f"projects/{project_id}/notificationChannels/your-channel-id"], - "combiner": monitoring_v3.AlertPolicy.Combiner.OR, - } - - policy = client.create_alert_policy( - name=project_name, - alert_policy=alert_policy - ) - print(f"Created alert policy: {policy.name}") + if data_drift_detected or prediction_drift_detected: + logger.warning(f"Drift detected for {self.model_name}. Triggering retraining pipeline.") + + pipeline_job_id = self.trigger_retraining_pipeline(pipeline_name, gcs_input) + + self.setup_retraining_job_alert(notification_channel) -def log_request_response(project_id, model_name, request, response, latency_ms, sampling_rate=0.1): - """ - Logs serving request/response data and latency to Cloud Storage with optional sampling. - - Args: - project_id (str): GCP project ID - model_name (str): Name of the Vertex AI model - request (dict): Request data - response (dict): Response data - latency_ms (float): Latency of the request in milliseconds - sampling_rate (float): Rate at which to sample logs (0.0 to 1.0, default 1.0) - """ + logger.info(f"Retraining job triggered: {pipeline_job_id}") + else: + logger.info(f"No drift detected for {self.model_name}. No retraining needed.") + + logger.info("Model performance and drift monitoring completed.") + +def log_request_response(project_id: str, model_name: str, request: Dict, response: Dict, latency_ms: float, sampling_rate: float = 0.1) -> None: + """Logs serving request/response data and latency to Cloud Storage with optional sampling.""" if sampling_rate >= 1 or random.random() < sampling_rate: client = storage.Client(project=project_id) bucket = client.get_bucket(f"{project_id}-vertex-ai-logs") @@ -318,31 +432,29 @@ def log_request_response(project_id, model_name, request, response, latency_ms, "timestamp": datetime.datetime.now().isoformat() } blob.upload_from_string(json.dumps(log_entry)) - print(f"Logged request/response for {model_name} (latency: {latency_ms}ms)") + logger.info(f"Logged request/response for {model_name} (latency: {latency_ms}ms)") -def check_existing_statistics_and_schema(project_id, model_name): - bq_client = bigquery.Client(project=project_id) - table_id = f"{project_id}.model_monitoring.{model_name}_serving_stats" +def check_existing_statistics_and_schema(project_id: str, model_name: str, bucket_name: str, schema_version: str) -> Tuple[Optional[tfdv.types.DatasetFeatureStatisticsList], Optional[tfdv.types.Schema]]: + today = datetime.datetime.now().strftime("%Y%m%d") try: - query = f"SELECT * FROM `{table_id}` ORDER BY timestamp DESC LIMIT 1" - existing_stats = bq_client.query(query).result() - existing_stats = list(existing_stats)[0] if existing_stats.total_rows > 0 else None + existing_stats = load_statistics_from_gcs(bucket_name, model_name, 'serving', today) except Exception as e: - print(f"Error checking existing statistics: {e}") + logger.error(f"Error loading existing statistics: {e}") existing_stats = None config = load_config() schema_path = config['data_validation']['schema_path'] try: - schema = tfdv.load_schema_text(schema_path) - except: + schema = load_schema_from_gcs(bucket_name, model_name, schema_version) + except Exception as e: + logger.error(f"Error loading schema: {e}") schema = None return existing_stats, schema -def 
compute_and_store_statistics(project_id, model_name, existing_stats, existing_schema): +def compute_and_store_statistics(project_id: str, model_name: str, bucket_name: str, existing_schema: Optional[tfdv.types.Schema]) -> Tuple[tfdv.types.DatasetFeatureStatisticsList, tfdv.types.Anomalies]: client = storage.Client(project=project_id) bucket = client.get_bucket(f"{project_id}-vertex-ai-logs") blobs = bucket.list_blobs(prefix=f"{model_name}/logs/") @@ -354,440 +466,87 @@ def compute_and_store_statistics(project_id, model_name, existing_stats, existin df = pd.DataFrame(data) - if existing_schema is None: - schema = generate_schema(df) - else: - schema = existing_schema - - stats, anomalies = validate_data(df, schema) - - bq_client = bigquery.Client(project=project_id) - table_id = f"{project_id}.model_monitoring.{model_name}_serving_stats" + stats = tfdv.generate_statistics_from_dataframe(df) + save_statistics_to_gcs(stats, bucket_name, model_name, 'serving') - row_to_insert = { - "timestamp": datetime.datetime.now().isoformat(), - "statistics": json.dumps(stats), - "anomalies": json.dumps(anomalies) - } - - errors = bq_client.insert_rows_json(table_id, [row_to_insert]) - if errors: - print(f"Encountered errors while inserting rows: {errors}") + if existing_schema: + anomalies = tfdv.validate_statistics(stats, schema=existing_schema) else: - print("New statistics added to BigQuery") + anomalies = None + logger.warning("No existing schema found. Skipping anomaly detection.") return stats, anomalies -def load_baseline_stats_and_schema(bucket_name, model_name, schema_version): - """Load baseline statistics and schema from Google Cloud Storage.""" - # Load baseline statistics from GCS - baseline_stats = load_statistics_from_gcs(bucket_name, model_name, data_type='train') - - # Load schema from GCS - schema = load_schema_from_gcs(bucket_name, model_name, schema_version) - - return baseline_stats, schema - - -def handle_missing_statistics(project_id, stat_type, model_name): - """ - Handles the case where statistics are missing. - Logs a warning and optionally triggers alerts for missing statistics. - """ - warning_msg = f"Missing {stat_type} statistics for model: {model_name}. Skipping drift detection." - logger.warning(warning_msg) - - # Optionally, log missing statistics as a custom metric in Google Cloud Monitoring - # This helps track the issue and potentially trigger alerts. 
- client = monitoring_v3.MetricServiceClient() - project_name = f"projects/{project_id}" +def log_feature_store_metric(self, feature_store_id: str, entity_type_id: str, metric_name: str, value: Union[int, float]): + """Logs a feature store metric to Google Cloud Monitoring.""" series = monitoring_v3.TimeSeries() - series.metric.type = f"custom.googleapis.com/vertex_ai/{model_name}/missing_statistics" - series.resource.type = "aiplatform.googleapis.com/Endpoint" - series.resource.labels["model_name"] = model_name + series.metric.type = f"custom.googleapis.com/vertex_ai/{self.model_name}/{metric_name}" + series.resource.type = "aiplatform.googleapis.com/FeatureStore" + series.resource.labels["feature_store_id"] = feature_store_id + series.resource.labels["entity_type_id"] = entity_type_id point = series.points.add() - point.value.double_value = 1 # Use '1' to indicate missing stats - now = datetime.datetime.now() - point.interval.end_time.seconds = int(now.timestamp()) - point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) - client.create_time_series(name=project_name, time_series=[series]) - - logger.info(f"Logged missing statistics for model {model_name}") - -def handle_missing_schema(project_id, model_name): - """ - Handles the case where the schema is missing. - Logs a warning and optionally triggers alerts for missing schema. - """ - warning_msg = f"Missing schema for model: {model_name}. Skipping schema drift detection." - logger.warning(warning_msg) - - # Optionally log missing schema as a custom metric in Google Cloud Monitoring - client = monitoring_v3.MetricServiceClient() - project_name = f"projects/{project_id}" - - series = monitoring_v3.TimeSeries() - series.metric.type = f"custom.googleapis.com/vertex_ai/{model_name}/missing_schema" - series.resource.type = "aiplatform.googleapis.com/Endpoint" - series.resource.labels["model_name"] = model_name - point = series.points.add() - point.value.double_value = 1 # '1' indicates missing schema + if isinstance(value, int): + point.value.int64_value = value + else: + point.value.double_value = value now = datetime.datetime.now() point.interval.end_time.seconds = int(now.timestamp()) point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) - client.create_time_series(name=project_name, time_series=[series]) - - logger.info(f"Logged missing schema for model {model_name} in Google Cloud Monitoring.") + self.client.create_time_series(name=self.project_name, time_series=[series]) + logger.info(f"Logged feature store metric {metric_name} with value {value}") - -def detect_data_drift(project_id, model_name, bucket_name, schema_version, drift_threshold): - """ - Detects data drift by comparing current (serving) statistics with baseline (training) statistics. - Logs the drift score directly to Google Cloud Monitoring. - Returns the drift score (or None if statistics are missing). 
- """ +def monitor_feature_store(self, feature_store_id: str, entity_type_id: str): + """Monitors the Feature Store and logs relevant metrics.""" try: - log_step(logger, 'Detecting Data Drift', 'Data Drift Detection') - - # Load baseline statistics and schema - log_step(logger, 'Loading Baseline Statistics and Schema', 'Data Drift Detection') - today = datetime.datetime.now().strftime("%Y%m%d") + log_step(logger, 'Monitoring Feature Store', 'Feature Store Monitoring') - # Load baseline statistics and check if they exist - baseline_stats = load_statistics_from_gcs(bucket_name, model_name, 'train', today) - if not baseline_stats: - handle_missing_statistics(project_id, 'baseline', model_name) - return None - - # Load serving statistics - serving_stats = load_statistics_from_gcs(bucket_name, model_name, 'serving', today) - if not serving_stats: - handle_missing_statistics(project_id, 'serving', model_name) - return None - - # Load schema from GCS - schema = load_schema_from_gcs(bucket_name, model_name, schema_version) - if not schema: - logger.warning(f"No schema found for {model_name}. Skipping data drift detection.") - return None - - # Compare statistics and check for anomalies - log_step(logger, 'Comparing Statistics', 'Data Drift Detection') - anomalies = compare_statistics(baseline_stats, serving_stats, schema) - - # Calculate and return the drift score - drift_score = 0 # Initialize drift score - significant_drift_detected = False - - for feature, anomaly in anomalies.anomaly_info.items(): - if anomaly: - logger.warning(f"Data drift detected for feature {feature}: {anomaly.description}") - drift_score += anomaly.severity - - # Check if drift score exceeds the threshold - if drift_score > drift_threshold: - significant_drift_detected = True - - # Log drift score for the specific feature to Vertex AI Monitoring - client = monitoring_v3.MetricServiceClient() - project_name = f"projects/{project_id}" - - series = monitoring_v3.TimeSeries() - series.metric.type = f"custom.googleapis.com/vertex_ai/{model_name}/data_drift" - series.resource.type = "aiplatform.googleapis.com/Endpoint" - series.resource.labels["model_name"] = model_name - point = series.points.add() - point.value.double_value = anomaly.severity - now = datetime.datetime.now() - point.interval.end_time.seconds = int(now.timestamp()) - point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) - client.create_time_series(name=project_name, time_series=[series]) - - logger.info(f"Logged data drift score for {feature}: {anomaly.severity}") - - # Log if significant drift is detected based on the threshold - if significant_drift_detected: - logger.warning(f"Significant data drift detected for {model_name}. Drift score: {drift_score} > {drift_threshold}") - else: - logger.info(f"No significant data drift detected for {model_name}. 
Drift score: {drift_score} <= {drift_threshold}") + feature_store = self.feature_store_client.get_feature_store(feature_store_id=feature_store_id) + entity_type = feature_store.get_entity_type(entity_type_id=entity_type_id) - return drift_score - - except Exception as e: - log_error(logger, e, 'Data Drift Detection') - return None # Return None if an error occurs + # Log read and write counts + read_count = entity_type.read_stats().get("total_entity_reads", 0) + write_count = entity_type.write_stats().get("total_entity_updates", 0) + self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_read_count", read_count) + self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_write_count", write_count) -def detect_prediction_drift(project_id, model_name, bucket_name, drift_threshold): - """ - Detects prediction drift using the Kolmogorov-Smirnov (KS) test and logs the drift score to Google Cloud Monitoring. - Returns the drift score (or None if statistics are missing). - """ - try: - log_step(logger, 'Detecting Prediction Drift', 'Prediction Drift Detection') - - # Load training and serving statistics - today = datetime.datetime.now().strftime("%Y%m%d") - - # Load baseline prediction statistics and check if they exist - train_stats = load_statistics_from_gcs(bucket_name, model_name, 'train', today) - if not train_stats: - handle_missing_statistics(project_id, 'training', model_name) - return None - - # Load serving statistics - serving_stats = load_statistics_from_gcs(bucket_name, model_name, 'serving', today) - if not serving_stats: - handle_missing_statistics(project_id, 'serving', model_name) - return None - - # Extract prediction buckets for KS test (assuming 'predictions' is the feature name) - train_predictions = train_stats.datasets[0].features['similar_tracks'].num_stats.histograms[0].buckets - serving_predictions = serving_stats.datasets[0].features['similar_tracks'].num_stats.histograms[0].buckets - - # Extract the counts from the buckets - train_counts = [bucket.sample_count for bucket in train_predictions] - serving_counts = [bucket.sample_count for bucket in serving_predictions] - - # Perform KS test - statistic, p_value = ks_2samp(train_counts, serving_counts) - - # Log prediction drift score to Vertex AI Monitoring - client = monitoring_v3.MetricServiceClient() - project_name = f"projects/{project_id}" - - series = monitoring_v3.TimeSeries() - series.metric.type = f"custom.googleapis.com/vertex_ai/{model_name}/prediction_drift" - series.resource.type = "aiplatform.googleapis.com/Endpoint" - series.resource.labels["model_name"] = model_name - point = series.points.add() - point.value.double_value = statistic # Log the KS statistic - now = datetime.datetime.now() - point.interval.end_time.seconds = int(now.timestamp()) - point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) - client.create_time_series(name=project_name, time_series=[series]) - - logger.info(f"Logged prediction drift KS statistic: {statistic}") - - # Determine if drift is significant and return the drift score - if statistic > drift_threshold: - logger.warning(f"Prediction drift detected for model {model_name}: KS statistic = {statistic}") - else: - logger.info(f"No significant prediction drift detected for model {model_name}") - - return statistic - - except Exception as e: - log_error(logger, e, 'Prediction Drift Detection') - return None # Return None if an error occurs - - -def monitor_traffic_split(project_id, endpoint_name): - 
"""Monitor the traffic split in Vertex AI to detect rollback.""" - try: - log_step(logger, 'Monitoring traffic split', 'Rollback Monitoring') - - aiplatform.init(project=project_id) - - # Retrieve the endpoint - endpoints = aiplatform.Endpoint.list(filter=f'display_name="{endpoint_name}"') - if not endpoints: - log_error(logger, f"Endpoint {endpoint_name} not found.", "Rollback Monitoring") - return None - endpoint = endpoints[0] - - # Get the traffic split - traffic_split = endpoint.traffic_split - for model_id, traffic_percentage in traffic_split.items(): - logger.info(f"Model {model_id} is receiving {traffic_percentage}% of the traffic.") - - # Check if rollback happened (i.e., if traffic is no longer sent to the new model) - if sum(traffic_split.values()) != 100: - logger.warning("Traffic split does not sum to 100%, indicating a possible rollback.") - - return traffic_split - except Exception as e: - log_error(logger, e, "Rollback Monitoring") - raise - -def detect_schema_drift(project_id, model_name, bucket_name, schema_version): - """ - Detects schema drift by comparing the current schema with the baseline (training) schema. - Logs the drift to Google Cloud Monitoring if detected. - Returns a boolean indicating whether schema drift was detected. - """ - try: - log_step(logger, 'Detecting Schema Drift', 'Schema Drift Detection') - - # Load baseline schema - log_step(logger, 'Loading Baseline Schema', 'Schema Drift Detection') - baseline_schema = load_schema_from_gcs(bucket_name, model_name, schema_version) - if not baseline_schema: - return handle_missing_schema(project_id, model_name) - - # Load current schema (Replace with your actual method of loading the current schema from serving data) - log_step(logger, 'Loading Current Schema', 'Schema Drift Detection') - current_schema = load_schema_from_gcs(bucket_name, model_name, 'serving_schema_version') # Replace with actual logic for serving schema - - # Compare schemas and check for schema drift - schema_drift_detected = compare_schemas(baseline_schema, current_schema) - - # Log schema drift to Google Cloud Monitoring if detected - if schema_drift_detected: - logger.info(f"Schema drift detected for model {model_name}.") - client = monitoring_v3.MetricServiceClient() - project_name = f"projects/{project_id}" - - series = monitoring_v3.TimeSeries() - series.metric.type = f"custom.googleapis.com/vertex_ai/{model_name}/schema_drift" - series.resource.type = "aiplatform.googleapis.com/Endpoint" - series.resource.labels["model_name"] = model_name - point = series.points.add() - point.value.double_value = 1 # Use '1' to indicate schema drift - now = datetime.datetime.now() - point.interval.end_time.seconds = int(now.timestamp()) - point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) - client.create_time_series(name=project_name, time_series=[series]) - - logger.info(f"Logged schema drift for model {model_name} to Google Cloud Monitoring.") - else: - logger.info(f"No schema drift detected for model {model_name}.") - - return schema_drift_detected + # Log latency (this is a placeholder, actual implementation may vary based on available metrics) + avg_latency = entity_type.read_stats().get("average_read_latency_milliseconds", 0) + self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_latency", avg_latency) + logger.info(f"Monitored feature store {feature_store_id}, entity type {entity_type_id}") except Exception as e: - log_error(logger, e, 'Schema Drift Detection') - return None # Return 
None if an error occurs - -def create_rollback_alert(project_id, endpoint_name): - """Create an alert in Google Cloud Monitoring for rollback events.""" - client = monitoring_v3.AlertPolicyServiceClient() - project_name = f"projects/{project_id}" - - alert_policy = { - "display_name": f"Rollback Alert for {endpoint_name}", - "conditions": [{ - "display_name": "Rollback detected", - "condition_threshold": { - "filter": f'metric.type="aiplatform.googleapis.com/Endpoint/traffic_split"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_LT, # Define condition for rollback - "threshold_value": 100, # Set rollback condition here - "duration": monitoring_v3.Duration(seconds=300) - } - }], - "notification_channels": [f"projects/{project_id}/notificationChannels/your-channel-id"], # Replace with your actual channel ID - "combiner": monitoring_v3.AlertPolicy.Combiner.OR, - } - - policy = client.create_alert_policy( - name=project_name, - alert_policy=alert_policy + log_error(logger, e, 'Feature Store Monitoring') + +def create_feature_store_alerts(self, feature_store_id: str, entity_type_id: str): + """Creates alerts for Feature Store monitoring.""" + self.create_alert_policy( + "High Feature Store Read Count", + f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_read_count" AND resource.labels.feature_store_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"', + 1000, # Threshold: 1000 reads + 300, # Duration: 5 minutes + monitoring_v3.ComparisonType.COMPARISON_GT ) - logger.info(f"Created rollback alert policy: {policy.name}") - -def monitor_and_log_rollbacks(project_id, endpoint_name): - logger.info("Starting rollback monitoring...") - traffic_split = monitor_traffic_split(project_id, endpoint_name) - if traffic_split: - create_rollback_alert(project_id, endpoint_name) - -def trigger_retraining_pipeline(project_id: str, pipeline_name: str, gcs_input: str, model_name: str): - """ - Trigger a Vertex AI pipeline for continuous retraining when performance degradation or drift is detected. - """ - aiplatform.init(project=project_id) - - pipeline_params = { - 'input_data': gcs_input, - 'model_name': model_name - } - - # Run the retraining pipeline - pipeline_job = aiplatform.PipelineJob( - display_name=f'Retraining - {model_name}', - template_path=f'gs://{pipeline_name}', - parameter_values=pipeline_params + self.create_alert_policy( + "High Feature Store Write Count", + f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_write_count" AND resource.labels.feature_store_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"', + 500, # Threshold: 500 writes + 300, # Duration: 5 minutes + monitoring_v3.ComparisonType.COMPARISON_GT ) - - pipeline_job.run() - logger.info(f"Triggered retraining pipeline for {model_name}.") - - # Return the pipeline job ID for tracking - return pipeline_job.resource_name - -def setup_retraining_job_alert(project_id: str, notification_channel: str): - """ - Set up a Cloud Monitoring alert for Vertex AI retraining jobs. - This sends notifications whenever a new retraining job is created. 
- """ - client = monitoring_v3.AlertPolicyServiceClient() - project_name = f"projects/{project_id}" - - # Define the condition for Vertex AI Pipeline Job creation - condition = { - "display_name": "Vertex AI Retraining Job Created", - "condition_threshold": { - "filter": 'resource.type="aiplatform.googleapis.com/PipelineJob" AND protoPayload.methodName="google.cloud.aiplatform.v1.PipelineService.CreatePipelineJob"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 0, - "duration": {"seconds": 60}, # Check every 60 seconds - } - } - - # Create the alert policy - alert_policy = { - "display_name": "Retraining Job Alert", - "conditions": [condition], - "notification_channels": [notification_channel], - "enabled": True, - "combiner": monitoring_v3.AlertPolicy.Combiner.OR - } - - # Apply the policy - policy = client.create_alert_policy( - name=project_name, - alert_policy=alert_policy + self.create_alert_policy( + "High Feature Store Latency", + f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_latency" AND resource.labels.feature_store_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"', + 1000, # Threshold: 1000 ms + 300, # Duration: 5 minutes + monitoring_v3.ComparisonType.COMPARISON_GT ) - logger.info(f"Created retraining job alert policy: {policy.name}") - -def monitor_and_trigger_retraining(project_id, model_name, accuracy_threshold, drift_threshold, gcs_input, pipeline_name, notification_channel): - """ - Monitor model accuracy, data drift, and prediction drift, and trigger retraining when necessary. - This will also set up alerts for retraining job creation. - """ - # Check for accuracy degradation - create_accuracy_degradation_alert(project_id, model_name, absolute_threshold=accuracy_threshold, degradation_rate_threshold=0.05) - - # Check for data drift and prediction drift - data_drift_detected = detect_data_drift(project_id, model_name) - prediction_drift_detected = detect_prediction_drift(project_id, model_name) - - if data_drift_detected or prediction_drift_detected: - logger.warning(f"Drift detected for {model_name}. Triggering retraining pipeline.") - - # Trigger the retraining pipeline - pipeline_job_id = trigger_retraining_pipeline(project_id, pipeline_name, gcs_input, model_name) - - # Set up retraining job alert to notify when the retraining job is created - setup_retraining_job_alert(project_id, notification_channel) - - logger.info(f"Retraining job triggered: {pipeline_job_id}") - else: - logger.info(f"No drift detected for {model_name}. 
No retraining needed.") - - logger.info("Model performance and drift monitoring completed.") - - -if __name__ == '__main__': - import argparse - - # Parse arguments for monitoring setup +def main(): parser = argparse.ArgumentParser(description='Setup Vertex AI monitoring, drift detection, and rollback with retraining') parser.add_argument('--project_id', required=True, help='GCP Project ID') parser.add_argument('--model_name', required=True, help='Vertex AI model name') parser.add_argument('--endpoint_name', required=True, help='Vertex AI endpoint name') - parser.add_argument('--sampling_rate', type=float, default=1.0, help='Sampling rate for request/response logging') parser.add_argument('--absolute_threshold', type=float, default=0.85, help='Absolute accuracy threshold (e.g., 0.85)') parser.add_argument('--degradation_rate_threshold', type=float, default=0.05, help='Accuracy degradation rate threshold over time') parser.add_argument('--time_window', type=int, default=86400, help='Time window in seconds to monitor for degradation (default is 24 hours)') @@ -797,43 +556,34 @@ def monitor_and_trigger_retraining(project_id, model_name, accuracy_threshold, d parser.add_argument('--notification_channel', required=True, help='Notification channel ID (for alerts)') parser.add_argument('--bucket_name', required=True, help='Cloud Storage bucket name') parser.add_argument('--schema_version', required=True, help='Schema version for validation') + parser.add_argument('--sampling_rate', type=float, default=0.1, help='Sampling rate for request/response logging') + parser.add_argument('--feature_store_id', required=True, help='Vertex AI Feature Store ID') + parser.add_argument('--entity_type_id', required=True, help='Entity Type ID in the Feature Store') args = parser.parse_args() - # Run Vertex AI monitoring functions - setup_vertex_ai_monitoring(args.project_id, args.model_name) - create_data_drift_alert(args.project_id, args.model_name) - create_prediction_drift_alert(args.project_id, args.model_name) - create_resource_utilization_alert(args.project_id, args.model_name) - create_latency_alert(args.project_id, args.model_name) - create_schema_drift_alert(args.project_id, args.model_name) - create_accuracy_degradation_alert(args.project_id, args.model_name, args.absolute_threshold, args.degradation_rate_threshold, args.time_window) - - # Load baseline statistics and schema from GCS - existing_stats, existing_schema = load_baseline_stats_and_schema(args.bucket_name, args.model_name, args.schema_version) - - # Compute and store current statistics - current_stats, anomalies = compute_and_store_statistics(args.project_id, args.model_name, existing_stats, existing_schema) - - # Schema Drift Detection - schema_drift_detected = detect_schema_drift(args.project_id, args.model_name, args.bucket_name, args.schema_version) - if schema_drift_detected: - print(f"Schema drift detected for model {args.model_name}. Logged to Google Cloud Monitoring.") - - # Detect data drift and prediction drift if baseline statistics exist - if existing_stats: - detect_data_drift(args.project_id, args.model_name, current_stats, existing_stats, args.drift_threshold) - detect_prediction_drift(args.project_id, args.model_name, current_stats, existing_stats, args.drift_threshold) - else: - print("No existing statistics found. 
Current statistics will be used as the baseline for future comparisons.") + monitor = VertexAIMonitoring(args.project_id, args.model_name, args.bucket_name) - # Run rollback monitoring after other checks - print("Starting rollback monitoring...") - monitor_and_log_rollbacks(args.project_id, args.endpoint_name) + monitor.setup_custom_metrics() + monitor.create_alert_policy("Data Drift Alert", f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/data_drift"', 0.1, 300, monitoring_v3.ComparisonType.COMPARISON_GT) + monitor.create_alert_policy("Prediction Drift Alert", f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/prediction_drift"', 0.1, 300, monitoring_v3.ComparisonType.COMPARISON_GT) + monitor.create_resource_utilization_alert() + monitor.create_alert_policy("Prediction Latency Alert", f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/prediction_latency"', 1000, 60, monitoring_v3.ComparisonType.COMPARISON_GT) + monitor.create_alert_policy("Schema Drift Alert", f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/schema_drift"', 1, 300, monitoring_v3.ComparisonType.COMPARISON_GT) + monitor.create_accuracy_degradation_alert(args.absolute_threshold, args.degradation_rate_threshold, args.time_window) + monitor.setup_feature_store_monitoring(args.feature_store_id, args.entity_type_id) + monitor.create_feature_store_alerts(args.feature_store_id, args.entity_type_id) + monitor.monitor_feature_store(args.feature_store_id, args.entity_type_id) - # Monitor model and trigger retraining if needed - monitor_and_trigger_retraining( - project_id=args.project_id, - model_name=args.model_name, + existing_stats, existing_schema = check_existing_statistics_and_schema(args.project_id, args.model_name, args.bucket_name, args.schema_version) + current_stats, anomalies = compute_and_store_statistics(args.project_id, args.model_name, args.bucket_name, existing_schema) + + monitor.detect_schema_drift(args.schema_version) + monitor.detect_data_drift(args.drift_threshold) + monitor.detect_prediction_drift(args.drift_threshold) + + monitor.monitor_traffic_split(args.endpoint_name) + + monitor.monitor_and_trigger_retraining( accuracy_threshold=args.absolute_threshold, drift_threshold=args.drift_threshold, gcs_input=args.gcs_input, @@ -841,4 +591,7 @@ def monitor_and_trigger_retraining(project_id, model_name, accuracy_threshold, d notification_channel=args.notification_channel ) - print("Vertex AI monitoring, drift detection, rollback, and retraining setup completed successfully!") + logger.info("Vertex AI monitoring, drift detection, rollback, and retraining setup completed successfully!") + +if __name__ == '__main__': + main() diff --git a/kubeflow/components/data_ingestion/data_ingestion.py b/kubeflow/components/data_ingestion/data_ingestion.py index 4913d03..1abcff0 100644 --- a/kubeflow/components/data_ingestion/data_ingestion.py +++ b/kubeflow/components/data_ingestion/data_ingestion.py @@ -6,35 +6,54 @@ from typing import NamedTuple from src.data_processing.data_ingestion import configure_lastfm_api, fetch_lastfm_data from src.utils.logging_utils import get_logger +import os logger = get_logger('kubeflow_data_ingestion') # Define the OutputSpec NamedTuple -OutputSpec = NamedTuple('OutputSpec', [('num_tracks', int)]) +OutputSpec = NamedTuple('OutputSpec', [('num_tracks', int), ('data_version', str)]) @component( - packages_to_install=['pylast', 'python-dotenv', 'pandas', 'requests'], - base_image='python:3.9' + packages_to_install=['pylast', 
'python-dotenv', 'pandas', 'requests', 'google-cloud-storage'], + base_image='python:3.10' ) def data_ingestion( + project_id: str, output_path: Output[Dataset], limit: int = 5000, ) -> OutputSpec: - import os import pandas as pd + from google.cloud import storage + from datetime import datetime try: + # Configure GCS client + storage_client = storage.Client(project=project_id) + + # Generate a unique data version + data_version = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Configure Last.fm API api_key, _ = configure_lastfm_api() + + # Fetch data from Last.fm df = fetch_lastfm_data(api_key, limit=limit) if not df.empty: logger.info(f"Successfully fetched {len(df)} tracks from Last.fm") - df.to_csv(output_path.path, index=False) - logger.info(f"Data saved to {output_path.path}") - return (len(df),) + + # Save data to GCS + bucket_name, blob_name = output_path.path.replace("gs://", "").split("/", 1) + bucket = storage_client.bucket(bucket_name) + blob = bucket.blob(f"{blob_name}_{data_version}.csv") + + blob.upload_from_string(df.to_csv(index=False), content_type='text/csv') + logger.info(f"Data saved to {output_path.path}_{data_version}.csv") + + return OutputSpec(num_tracks=len(df), data_version=data_version) else: logger.error("Failed to fetch data, DataFrame is empty") - return (0,) + return OutputSpec(num_tracks=0, data_version=data_version) except Exception as e: logger.error(f"Error in data ingestion: {e}") raise @@ -43,9 +62,10 @@ def data_ingestion( import argparse parser = argparse.ArgumentParser(description='Data ingestion component for Kubeflow') - parser.add_argument('--output_path', type=str, help='Path to save the output dataset') + parser.add_argument('--project_id', type=str, required=True, help='GCP Project ID') + parser.add_argument('--output_path', type=str, required=True, help='Path to save the output dataset') parser.add_argument('--limit', type=int, default=5000, help='Number of tracks to fetch') args = parser.parse_args() - data_ingestion(output_path=args.output_path, limit=args.limit) \ No newline at end of file + data_ingestion(project_id=args.project_id, output_path=args.output_path, limit=args.limit) \ No newline at end of file diff --git a/kubeflow/components/deploy/deploy.py b/kubeflow/components/deploy/deploy.py index b75601f..d41c137 100644 --- a/kubeflow/components/deploy/deploy.py +++ b/kubeflow/components/deploy/deploy.py @@ -3,40 +3,51 @@ Input, Output, Model, + Artifact, ) from typing import NamedTuple -from deployment.vertex_ai.vertex_deployment import deploy_to_vertex_ai, setup_cloud_build_trigger, setup_cloud_run -from src.utils.logging_utils import setup_logger, log_error +from deployment.vertex_ai.vertex_deployment import ( + deploy_to_vertex_ai, + setup_cloud_build_trigger, + setup_cloud_run, + trigger_cloud_build +) +from src.utils.logging_utils import setup_logger, log_error, log_step logger = setup_logger('kubeflow_deploy_component') # Define the OutputSpec NamedTuple -OutputSpec = NamedTuple('OutputSpec', [('endpoint', str), ('model', str)]) +OutputSpec = NamedTuple('OutputSpec', [('endpoint', str), ('model', str), ('cloud_run_service', str)]) @component( packages_to_install=['google-cloud-aiplatform', 'google-cloud-build', 'google-cloud-run'], - base_image='python:3.9' + base_image='python:3.10' ) def deploy_model( project_id: str, model_path: Input[Model], model_name: str, endpoint_name: str, - output_val: Output[str], - output_test: Output[str], - repo_name: str = "", - branch_name: str = "", - service_name: str = "", - image_url: str = 
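
For reference, the versioned GCS write used by the ingestion component boils down to the following standalone pattern (bucket and object names are placeholders):

from datetime import datetime
import pandas as pd
from google.cloud import storage

def upload_versioned_csv(df: pd.DataFrame, gcs_uri: str, project_id: str) -> str:
    """Write df to <gcs_uri>_<timestamp>.csv and return the data version string."""
    data_version = datetime.now().strftime("%Y%m%d_%H%M%S")
    bucket_name, blob_name = gcs_uri.replace("gs://", "").split("/", 1)
    client = storage.Client(project=project_id)
    blob = client.bucket(bucket_name).blob(f"{blob_name}_{data_version}.csv")
    blob.upload_from_string(df.to_csv(index=False), content_type="text/csv")
    return data_version

# e.g. upload_versioned_csv(df, "gs://my-bucket/data/raw/top_tracks", "my-project")
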
"", - region: str = "us-central1", - setup_ci_cd: bool = False, - canary_traffic_percent: int = 10 + repo_name: str, + branch_name: str, + service_name: str, + image_url: str, + region: str, + storage_bucket: str, + trigger_id: str, + notification_channel: str, + deployment_info: Output[Artifact], + canary_traffic_percent: int = 10, + cooldown_period: int = 300 ) -> OutputSpec: """ - Kubeflow component to deploy a model to Vertex AI with a canary strategy and rollback mechanism. + Kubeflow component to deploy a model to Vertex AI, set up CI/CD with Cloud Build and Cloud Run. """ + import json + import os + try: - # Deploy to Vertex AI with canary traffic handling + log_step(logger, "Deploying model to Vertex AI", "Model Deployment") endpoint, model = deploy_to_vertex_ai( project_id=project_id, model_path=model_path.uri, @@ -44,29 +55,46 @@ def deploy_model( model_name=model_name, canary_traffic_percent=canary_traffic_percent ) - - # Write outputs to files - with open(output_val.path, 'w') as f: - f.write(endpoint) - with open(output_test.path, 'w') as f: - f.write(model) - - # Setup CI/CD if requested - if setup_ci_cd: - if not all([repo_name, branch_name, service_name, image_url]): - raise ValueError("For CI/CD setup, repo_name, branch_name, service_name, and image_url must be provided.") - - # Setup Cloud Build trigger - trigger = setup_cloud_build_trigger(project_id, repo_name, branch_name) - - # Setup Cloud Run service - service = setup_cloud_run(project_id, service_name, image_url, region) - - logger.info("Deployment completed successfully!") - return OutputSpec(endpoint=endpoint, model=model) + + log_step(logger, "Setting up Cloud Build trigger", "CI/CD Setup") + trigger_response = setup_cloud_build_trigger( + project_id=project_id, + repo_name=repo_name, + branch_name=branch_name, + storage_bucket=storage_bucket + ) + + log_step(logger, "Setting up Cloud Run service", "CI/CD Setup") + service_response = setup_cloud_run( + project_id=project_id, + service_name=service_name, + image_url=image_url, + region=region + ) + + log_step(logger, "Setting up Cloud Function for cooldown", "CI/CD Setup") + os.system(f"gcloud functions deploy cloud_build_trigger --runtime python39 " + f"--trigger-topic cloud-build-trigger " + f"--set-env-vars PROJECT_ID={project_id},TRIGGER_ID={trigger_id}," + f"MODEL_NAME={model_name},ENDPOINT_NAME={endpoint_name} " + f"--memory=128MB --timeout=300s") + + # Write deployment info to output + deployment_info_dict = { + "endpoint_name": endpoint, + "model_name": model, + "cloud_build_trigger": trigger_response.name, + "cloud_run_service": service_response.name, + "canary_traffic_percent": canary_traffic_percent + } + with open(deployment_info.path, 'w') as f: + json.dump(deployment_info_dict, f) + + logger.info("Deployment and CI/CD setup completed successfully!") + return OutputSpec(endpoint=endpoint, model=model, cloud_run_service=service_response.name) except Exception as e: - log_error(logger, e, 'Model Deployment') + log_error(logger, e, 'Model Deployment and CI/CD Setup') raise if __name__ == '__main__': @@ -77,13 +105,17 @@ def deploy_model( parser.add_argument('--model_path', required=True, help='Path to the model artifacts') parser.add_argument('--model_name', required=True, help='Name for the deployed model') parser.add_argument('--endpoint_name', required=True, help='Name for the Vertex AI endpoint') - parser.add_argument('--setup_ci_cd', action='store_true', help='Set up CI/CD pipeline') - parser.add_argument('--repo_name', help='GitHub repository 
name') - parser.add_argument('--branch_name', help='GitHub branch name') - parser.add_argument('--service_name', help='Cloud Run service name') - parser.add_argument('--image_url', help='Docker image URL for Cloud Run') - parser.add_argument('--region', default='us-central1', help='GCP region for deployment') + parser.add_argument('--repo_name', required=True, help='GitHub repository name') + parser.add_argument('--branch_name', required=True, help='GitHub branch name') + parser.add_argument('--service_name', required=True, help='Cloud Run service name') + parser.add_argument('--image_url', required=True, help='Docker image URL for Cloud Run') + parser.add_argument('--region', required=True, help='GCP region for deployment') + parser.add_argument('--storage_bucket', required=True, help='Cloud Storage bucket to monitor for new data') + parser.add_argument('--trigger_id', required=True, help='Cloud Build trigger ID for retraining jobs') + parser.add_argument('--notification_channel', required=True, help='Notification channel ID for build status notifications') parser.add_argument('--canary_traffic_percent', type=int, default=10, help='Percentage of traffic to route to the new model') + parser.add_argument('--cooldown_period', type=int, default=300, help='Cooldown period in seconds between Cloud Build jobs') + parser.add_argument('--deployment_info', required=True, help='Path to save deployment info') args = parser.parse_args() @@ -97,6 +129,10 @@ def deploy_model( service_name=args.service_name, image_url=args.image_url, region=args.region, - setup_ci_cd=args.setup_ci_cd, - canary_traffic_percent=args.canary_traffic_percent + storage_bucket=args.storage_bucket, + trigger_id=args.trigger_id, + notification_channel=args.notification_channel, + canary_traffic_percent=args.canary_traffic_percent, + cooldown_period=args.cooldown_period, + deployment_info=args.deployment_info ) diff --git a/kubeflow/components/evaluation/evaluation.py b/kubeflow/components/evaluation/evaluation.py index f5d1fe2..7de98f0 100644 --- a/kubeflow/components/evaluation/evaluation.py +++ b/kubeflow/components/evaluation/evaluation.py @@ -6,36 +6,83 @@ Model, Metrics, ) +from typing import NamedTuple from src.evaluation.model_evaluation import main as evaluate_main -from src.utils.logging_utils import get_logger +from src.utils.logging_utils import setup_logger, log_error, log_step +from google.cloud import aiplatform -logger = get_logger('kubeflow_evaluation') +logger = setup_logger('kubeflow_evaluation') + +EvaluationOutput = NamedTuple('EvaluationOutput', [ + ('mean_average_precision', float), + ('ndcg_score', float), + ('model_drift', float), + ('deploy_decision', str) +]) @component( - packages_to_install=['tensorflow', 'numpy', 'pandas', 'scikit-learn', 'matplotlib', 'seaborn'], - base_image='python:3.9' + packages_to_install=['tensorflow', 'numpy', 'pandas', 'scikit-learn', 'matplotlib', 'seaborn', 'google-cloud-aiplatform'], + base_image='python:3.10' ) def evaluate_model( + project_id: str, model: Input[Model], test_data: Input[Dataset], item_popularity: Input[Dataset], + endpoint_name: str, evaluation_results: Output[Metrics], - evaluation_plots: Output[Dataset] -) -> float: + evaluation_plots: Output[Dataset], + region: str = 'us-central1', + map_threshold: float = 0.7, + ndcg_threshold: float = 0.5, + drift_threshold: float = 0.1 +) -> EvaluationOutput: import json import os + import numpy as np + from sklearn.metrics import mean_absolute_error try: + log_step(logger, "Initializing evaluation", "Model 
Evaluation") + # Create a temporary output directory output_dir = "/tmp/evaluation_output" os.makedirs(output_dir, exist_ok=True) - # Run evaluation - results = evaluate_main(model.path, test_data.path, output_dir) + # Run evaluation on the new model + log_step(logger, "Evaluating new model", "Model Evaluation") + new_model_results = evaluate_main(model.path, test_data.path, output_dir) + + # Evaluate the currently deployed model (if exists) + log_step(logger, "Evaluating currently deployed model", "Model Evaluation") + aiplatform.init(project=project_id, location=region) + endpoint = aiplatform.Endpoint(endpoint_name) + + if endpoint.list_models(): + current_model = endpoint.list_models()[0] + current_model_results = evaluate_main(current_model.uri, test_data.path, output_dir) + else: + current_model_results = None + + # Calculate model drift + if current_model_results: + log_step(logger, "Calculating model drift", "Model Evaluation") + new_predictions = new_model_results['predictions'] + current_predictions = current_model_results['predictions'] + model_drift = mean_absolute_error(new_predictions, current_predictions) + else: + model_drift = 0.0 + + # Prepare evaluation results + evaluation_dict = { + 'new_model': new_model_results, + 'current_model': current_model_results, + 'model_drift': model_drift + } # Save evaluation results with open(evaluation_results.path, 'w') as f: - json.dump(results, f, indent=2) + json.dump(evaluation_dict, f, indent=2) # Copy evaluation plots os.system(f"cp {output_dir}/*.png {evaluation_plots.path}") @@ -43,29 +90,56 @@ def evaluate_model( logger.info(f"Evaluation results saved to {evaluation_results.path}") logger.info(f"Evaluation plots saved to {evaluation_plots.path}") - # Return the main model's MAP score for pipeline orchestration - return results['main_evaluation']['mean_average_precision'] + # Make deployment decision + new_map = new_model_results['main_evaluation']['mean_average_precision'] + new_ndcg = new_model_results['main_evaluation']['ndcg_score'] + + if (new_map >= map_threshold and + new_ndcg >= ndcg_threshold and + model_drift <= drift_threshold): + deploy_decision = "deploy" + else: + deploy_decision = "do_not_deploy" + + return EvaluationOutput( + mean_average_precision=new_map, + ndcg_score=new_ndcg, + model_drift=model_drift, + deploy_decision=deploy_decision + ) except Exception as e: - logger.error(f"Error in model evaluation: {e}") + log_error(logger, e, 'Model Evaluation') raise if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description='Evaluate model component for Kubeflow') + parser.add_argument('--project_id', required=True, help='GCP Project ID') parser.add_argument('--model', type=str, help='Path to the trained model') parser.add_argument('--test_data', type=str, help='Path to test dataset') parser.add_argument('--item_popularity', type=str, help='Path to item popularity data') + parser.add_argument('--endpoint_name', type=str, help='Name of the Vertex AI endpoint') parser.add_argument('--evaluation_results', type=str, help='Path to save evaluation results') parser.add_argument('--evaluation_plots', type=str, help='Path to save evaluation plots') + parser.add_argument('--region', type=str, default='us-central1', help='GCP region') + parser.add_argument('--map_threshold', type=float, default=0.7, help='Threshold for Mean Average Precision') + parser.add_argument('--ndcg_threshold', type=float, default=0.5, help='Threshold for NDCG score') + parser.add_argument('--drift_threshold', type=float, 
default=0.1, help='Threshold for model drift') args = parser.parse_args() evaluate_model( + project_id=args.project_id, model=args.model, test_data=args.test_data, item_popularity=args.item_popularity, + endpoint_name=args.endpoint_name, evaluation_results=args.evaluation_results, - evaluation_plots=args.evaluation_plots + evaluation_plots=args.evaluation_plots, + region=args.region, + map_threshold=args.map_threshold, + ndcg_threshold=args.ndcg_threshold, + drift_threshold=args.drift_threshold ) \ No newline at end of file diff --git a/kubeflow/components/feature_engineering/feature_eng.py b/kubeflow/components/feature_engineering/feature_eng.py index 9ebbd61..5629a85 100644 --- a/kubeflow/components/feature_engineering/feature_eng.py +++ b/kubeflow/components/feature_engineering/feature_eng.py @@ -3,7 +3,8 @@ Input, Output, Dataset, - Model + Model, + Artifact ) from typing import NamedTuple from src.feature_engineering.feat_engineering import ( @@ -19,32 +20,43 @@ create_preprocessing_pipeline, analyze_feature_importance_and_reduce_dimensions ) -from src.utils.logging_utils import get_logger +from src.utils.logging_utils import setup_logger, log_error, log_step -logger = get_logger('kubeflow_feature_engineering') +logger = setup_logger('kubeflow_feature_engineering') # Define the OutputSpec NamedTuple -OutputSpec = NamedTuple('Outputs', [('num_features', int),('explained_variance_ratio', float)]) +OutputSpec = NamedTuple('Outputs', [ + ('num_features', int), + ('explained_variance_ratio', float), + ('top_features', str) +]) @component( - packages_to_install=['pandas', 'numpy', 'scikit-learn', 'matplotlib', 'seaborn'], - base_image='python:3.9' + packages_to_install=['pandas', 'numpy', 'scikit-learn', 'matplotlib', 'seaborn', 'plotly'], + base_image='python:3.10' ) def feature_engineering( input_data: Input[Dataset], output_data: Output[Dataset], output_preprocessor: Output[Model], - n_components: int = 4000 + feature_importance_plot: Output[Artifact], + n_components: int = 4000, + dim_reduction_method: str = 'pca', + feature_selection_threshold: float = 0.01 ) -> OutputSpec: import pandas as pd import numpy as np import joblib + import matplotlib.pyplot as plt + import seaborn as sns + from sklearn.feature_selection import SelectFromModel + from sklearn.ensemble import RandomForestRegressor try: - # Load data + log_step(logger, "Loading input data", "Feature Engineering") df = pd.read_csv(input_data.path) - # Apply feature engineering steps + log_step(logger, "Applying feature engineering steps", "Feature Engineering") df = engineer_basic_features(df) df = engineer_additional_features(df) df = add_tag_popularity(df) @@ -53,36 +65,59 @@ def feature_engineering( df = add_target_encoding(df) df = refine_features_further(df) - # Vectorize text features + log_step(logger, "Vectorizing text features", "Feature Engineering") df_vectorized, vectorizers = vectorize_all_text_features(df) - # Get final features + log_step(logger, "Getting final features", "Feature Engineering") final_features = get_final_features(df_vectorized) - # Create preprocessing pipeline - preprocessor = create_preprocessing_pipeline(final_features, n_components) + log_step(logger, "Creating preprocessing pipeline", "Feature Engineering") + preprocessor = create_preprocessing_pipeline(final_features, n_components, dim_reduction_method) - # Fit preprocessor and transform data + log_step(logger, "Fitting preprocessor and transforming data", "Feature Engineering") preprocessed_data = preprocessor.fit_transform(final_features) 
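
create_preprocessing_pipeline now takes a dim_reduction_method ('pca' or 'truncated_svd' per the CLI help); a minimal sketch of such a selectable reducer, assuming numeric input and scikit-learn (the real implementation lives in src/feature_engineering/feat_engineering.py):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD

def build_preprocessing_pipeline(n_components: int, method: str = "pca") -> Pipeline:
    # Choose the dimensionality-reduction step based on the requested method.
    if method == "pca":
        reducer = PCA(n_components=n_components)
    elif method == "truncated_svd":
        reducer = TruncatedSVD(n_components=n_components)
    else:
        raise ValueError(f"Unknown dim_reduction_method: {method}")
    return Pipeline([("scaler", StandardScaler()), ("reducer", reducer)])
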
- # Analyze feature importance and reduce dimensions - df_svd, svd, feature_importance_df = analyze_feature_importance_and_reduce_dimensions( + log_step(logger, "Analyzing feature importance and reducing dimensions", "Feature Engineering") + df_reduced, reducer, feature_importance_df = analyze_feature_importance_and_reduce_dimensions( pd.DataFrame(preprocessed_data, columns=preprocessor.get_feature_names_out()), - n_components + n_components, + dim_reduction_method ) - # Save preprocessed data - df_svd.to_csv(output_data.path, index=False) + log_step(logger, "Performing feature selection", "Feature Engineering") + selector = SelectFromModel(RandomForestRegressor(n_estimators=100, random_state=42), + threshold=feature_selection_threshold, prefit=False) + selector.fit(df_reduced, df['playcount']) + selected_features = df_reduced.columns[selector.get_support()].tolist() + df_selected = df_reduced[selected_features] + + log_step(logger, "Saving preprocessed data", "Feature Engineering") + df_selected.to_csv(output_data.path, index=False) logger.info(f"Preprocessed data saved to {output_data.path}") - # Save preprocessor + log_step(logger, "Saving preprocessor", "Feature Engineering") joblib.dump(preprocessor, output_preprocessor.path) logger.info(f"Preprocessor saved to {output_preprocessor.path}") - return (df_svd.shape[1], np.sum(svd.explained_variance_ratio_)) + log_step(logger, "Creating feature importance plot", "Feature Engineering") + plt.figure(figsize=(12, 8)) + sns.barplot(x='importance', y='feature', data=feature_importance_df.head(20)) + plt.title('Top 20 Feature Importances') + plt.tight_layout() + plt.savefig(feature_importance_plot.path) + logger.info(f"Feature importance plot saved to {feature_importance_plot.path}") + + explained_variance_ratio = np.sum(reducer.explained_variance_ratio_) if hasattr(reducer, 'explained_variance_ratio_') else None + top_features = ', '.join(selected_features[:10]) # Get top 10 selected features + + return OutputSpec( + num_features=df_selected.shape[1], + explained_variance_ratio=explained_variance_ratio, + top_features=top_features + ) except Exception as e: - logger.error(f"Error in feature engineering: {e}") + log_error(logger, e, 'Feature Engineering') raise if __name__ == '__main__': @@ -92,7 +127,10 @@ def feature_engineering( parser.add_argument('--input_data', type=str, help='Path to input dataset') parser.add_argument('--output_data', type=str, help='Path to save the output dataset') parser.add_argument('--output_preprocessor', type=str, help='Path to save the preprocessor') + parser.add_argument('--feature_importance_plot', type=str, help='Path to save the feature importance plot') parser.add_argument('--n_components', type=int, default=4000, help='Number of components for dimensionality reduction') + parser.add_argument('--dim_reduction_method', type=str, default='pca', help='Dimensionality reduction method (pca, truncated_svd)') + parser.add_argument('--feature_selection_threshold', type=float, default=0.01, help='Threshold for feature selection') args = parser.parse_args() @@ -100,5 +138,8 @@ def feature_engineering( input_data=args.input_data, output_data=args.output_data, output_preprocessor=args.output_preprocessor, - n_components=args.n_components + feature_importance_plot=args.feature_importance_plot, + n_components=args.n_components, + dim_reduction_method=args.dim_reduction_method, + feature_selection_threshold=args.feature_selection_threshold ) \ No newline at end of file diff --git 
a/kubeflow/components/feature_store/component.yaml b/kubeflow/components/feature_store/component.yaml new file mode 100644 index 0000000..cc3a966 --- /dev/null +++ b/kubeflow/components/feature_store/component.yaml @@ -0,0 +1,31 @@ +name: Feature Store Operations +description: Create and populate the Vertex AI Feature Store with high-dimensional data + +inputs: + - {name: project_id, type: String} + - {name: region, type: String} + - {name: feature_store_id, type: String} + - {name: entity_type_id_prefix, type: String} + - {name: input_data, type: String} + +outputs: + - {name: feature_store_uri, type: String} + +implementation: + container: + image: gcr.io/your-project-id/feature-store-component:latest + command: + - python + - /app/feature_store.py + - --project_id + - {inputValue: project_id} + - --region + - {inputValue: region} + - --feature_store_id + - {inputValue: feature_store_id} + - --entity_type_id_prefix + - {inputValue: entity_type_id_prefix} + - --input_data + - {inputValue: input_data} + - --feature_store_uri + - {outputPath: feature_store_uri} \ No newline at end of file diff --git a/kubeflow/components/feature_store/feature_store.py b/kubeflow/components/feature_store/feature_store.py new file mode 100644 index 0000000..4daa4e8 --- /dev/null +++ b/kubeflow/components/feature_store/feature_store.py @@ -0,0 +1,111 @@ +import argparse +import logging +from typing import List, Dict +from google.cloud import aiplatform +from google.cloud.aiplatform import FeatureStore +import pandas as pd +import numpy as np + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def split_features(df: pd.DataFrame, max_features_per_group: int = 1000) -> List[pd.DataFrame]: + """ + Split the input DataFrame into multiple DataFrames, each with at most max_features_per_group columns. 
+ """ + feature_groups = [] + for i in range(0, df.shape[1], max_features_per_group): + feature_groups.append(df.iloc[:, i:i+max_features_per_group]) + return feature_groups + +def create_and_populate_feature_store( + project_id: str, + region: str, + feature_store_id: str, + entity_type_id_prefix: str, + input_data: str, + feature_store_uri: str +) -> None: + try: + # Initialize Vertex AI + aiplatform.init(project=project_id, location=region) + + # Load the input data + df = pd.read_csv(input_data) + + # Ensure there's an 'entity_id' column, if not, create one + if 'entity_id' not in df.columns: + df['entity_id'] = df.index.astype(str) + + # Split features into groups + feature_groups = split_features(df.drop('entity_id', axis=1)) + + # Create a feature store + fs = FeatureStore.create( + feature_store_id=feature_store_id, + online_store_fixed_node_count=1, + sync=True + ) + logger.info(f"Created Feature Store: {fs.name}") + + # Create entity types and ingest features for each group + for i, feature_group in enumerate(feature_groups): + entity_type_id = f"{entity_type_id_prefix}_{i+1}" + + # Create an entity type + entity_type = fs.create_entity_type( + entity_type_id=entity_type_id, + description=f"Music track features group {i+1}" + ) + logger.info(f"Created Entity Type: {entity_type.name}") + + # Create features + for feature_id in feature_group.columns: + feature_type = "DOUBLE" if np.issubdtype(feature_group[feature_id].dtype, np.number) else "STRING" + entity_type.create_feature( + feature_id=feature_id, + value_type=feature_type, + description=f"Feature: {feature_id}" + ) + logger.info(f"Created feature: {feature_id}") + + # Prepare data for ingestion + ingestion_data = pd.concat([df['entity_id'], feature_group], axis=1) + ingestion_data['timestamp'] = pd.Timestamp.now() + + # Ingest feature values + entity_type.ingest( + source=ingestion_data.to_dict('records'), + entity_id_field="entity_id", + feature_time_field="timestamp" + ) + logger.info(f"Ingested feature values for group {i+1}") + + # Write the feature store URI to the output file + with open(feature_store_uri, 'w') as f: + f.write(fs.name) + logger.info(f"Feature Store URI written to: {feature_store_uri}") + + except Exception as e: + logger.error(f"Error in create_and_populate_feature_store: {str(e)}") + raise + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Create and populate Vertex AI Feature Store') + parser.add_argument('--project_id', required=True, help='GCP Project ID') + parser.add_argument('--region', required=True, help='GCP Region') + parser.add_argument('--feature_store_id', required=True, help='Feature Store ID') + parser.add_argument('--entity_type_id_prefix', required=True, help='Prefix for Entity Type IDs') + parser.add_argument('--input_data', required=True, help='Path to input data CSV file') + parser.add_argument('--feature_store_uri', required=True, help='Output path for Feature Store URI') + + args = parser.parse_args() + + create_and_populate_feature_store( + args.project_id, + args.region, + args.feature_store_id, + args.entity_type_id_prefix, + args.input_data, + args.feature_store_uri + ) \ No newline at end of file diff --git a/kubeflow/components/hyperparameter_tuning/hyperparameter_tuning.py b/kubeflow/components/hyperparameter_tuning/hyperparameter_tuning.py index 7466c94..e24e919 100644 --- a/kubeflow/components/hyperparameter_tuning/hyperparameter_tuning.py +++ b/kubeflow/components/hyperparameter_tuning/hyperparameter_tuning.py @@ -3,43 +3,79 @@ Input, Output, 
Dataset, + Artifact ) from typing import NamedTuple from src.hyperparameter_tuning.katib_tuning import run_hyperparameter_tuning -from src.utils.logging_utils import setup_logger, log_error +from src.utils.logging_utils import setup_logger, log_error, log_step logger = setup_logger('kubeflow_hyperparameter_tuning') -OutputSpec = NamedTuple('Outputs', [('best_val_cosine_similarity', float)]) +OutputSpec = NamedTuple('Outputs', [ + ('best_val_cosine_similarity', float), + ('best_val_ndcg', float) +]) @component( - packages_to_install=['kubeflow-katib', 'PyYAML'], - base_image='python:3.9' + packages_to_install=['kubeflow-katib', 'PyYAML', 'matplotlib', 'seaborn'], + base_image='python:3.10' ) def hyperparameter_tuning( train_data: Input[Dataset], val_data: Input[Dataset], - best_hyperparameters: Output[Dataset] + best_hyperparameters: Output[Dataset], + tuning_results_plot: Output[Artifact], + search_algorithm: str = 'bayesian', + max_trials: int = 50, + max_duration_minutes: int = 120, + early_stopping_rounds: int = 10 ) -> OutputSpec: import json + import matplotlib.pyplot as plt + import seaborn as sns try: - # Run hyperparameter tuning - results = run_hyperparameter_tuning(train_data.path, val_data.path) + log_step(logger, "Starting hyperparameter tuning", "Hyperparameter Tuning") + results = run_hyperparameter_tuning( + train_data.path, + val_data.path, + search_algorithm=search_algorithm, + max_trials=max_trials, + max_duration_minutes=max_duration_minutes, + early_stopping_rounds=early_stopping_rounds + ) - # Extract best hyperparameters and performance + log_step(logger, "Extracting best hyperparameters and performance", "Hyperparameter Tuning") best_params = {param['name']: param['value'] for param in results['currentOptimalTrial']['parameterAssignments']} - best_metric = next(metric for metric in results['currentOptimalTrial']['observation']['metrics'] if metric['name'] == 'val_cosine_similarity') - best_val_cosine_similarity = float(best_metric['value']) + best_metrics = {metric['name']: float(metric['value']) for metric in results['currentOptimalTrial']['observation']['metrics']} - # Save best hyperparameters + best_val_cosine_similarity = best_metrics.get('val_cosine_similarity', 0.0) + best_val_ndcg = best_metrics.get('val_ndcg', 0.0) + + log_step(logger, "Saving best hyperparameters", "Hyperparameter Tuning") with open(best_hyperparameters.path, 'w') as f: - json.dump(best_params, f) + json.dump(best_params, f, indent=2) logger.info(f"Best hyperparameters saved to {best_hyperparameters.path}") logger.info(f"Best validation cosine similarity: {best_val_cosine_similarity}") + logger.info(f"Best validation NDCG: {best_val_ndcg}") + + log_step(logger, "Visualizing hyperparameter tuning results", "Hyperparameter Tuning") + plt.figure(figsize=(12, 6)) + sns.scatterplot( + x=[trial['observation']['metrics'][0]['value'] for trial in results['trials']], + y=[trial['observation']['metrics'][1]['value'] for trial in results['trials']] + ) + plt.xlabel('Validation Cosine Similarity') + plt.ylabel('Validation NDCG') + plt.title('Hyperparameter Tuning Results') + plt.savefig(tuning_results_plot.path) + logger.info(f"Tuning results plot saved to {tuning_results_plot.path}") - return (best_val_cosine_similarity,) + return OutputSpec( + best_val_cosine_similarity=best_val_cosine_similarity, + best_val_ndcg=best_val_ndcg + ) except Exception as e: log_error(logger, e, 'Hyperparameter Tuning') @@ -52,11 +88,21 @@ def hyperparameter_tuning( parser.add_argument('--train_data', type=str, help='Path 
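
The tuning component expects run_hyperparameter_tuning to return a Katib-style result containing currentOptimalTrial; a made-up result of that shape and the same extraction logic, pulled out for illustration (values are fabricated for the example only):

results = {
    "currentOptimalTrial": {
        "parameterAssignments": [{"name": "learning_rate", "value": "0.001"},
                                 {"name": "batch_size", "value": "64"}],
        "observation": {"metrics": [{"name": "val_cosine_similarity", "value": "0.91"},
                                    {"name": "val_ndcg", "value": "0.42"}]},
    }
}
best_params = {p["name"]: p["value"] for p in results["currentOptimalTrial"]["parameterAssignments"]}
best_metrics = {m["name"]: float(m["value"]) for m in results["currentOptimalTrial"]["observation"]["metrics"]}
print(best_params, best_metrics.get("val_ndcg", 0.0))
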
to training dataset') parser.add_argument('--val_data', type=str, help='Path to validation dataset') parser.add_argument('--best_hyperparameters', type=str, help='Path to save the best hyperparameters') + parser.add_argument('--tuning_results_plot', type=str, help='Path to save the tuning results plot') + parser.add_argument('--search_algorithm', type=str, default='bayesian', help='Search algorithm for hyperparameter tuning') + parser.add_argument('--max_trials', type=int, default=50, help='Maximum number of trials for hyperparameter tuning') + parser.add_argument('--max_duration_minutes', type=int, default=120, help='Maximum duration for hyperparameter tuning in minutes') + parser.add_argument('--early_stopping_rounds', type=int, default=10, help='Number of rounds for early stopping') args = parser.parse_args() hyperparameter_tuning( train_data=args.train_data, val_data=args.val_data, - best_hyperparameters=args.best_hyperparameters + best_hyperparameters=args.best_hyperparameters, + tuning_results_plot=args.tuning_results_plot, + search_algorithm=args.search_algorithm, + max_trials=args.max_trials, + max_duration_minutes=args.max_duration_minutes, + early_stopping_rounds=args.early_stopping_rounds ) \ No newline at end of file diff --git a/kubeflow/components/monitoring/monitor.py b/kubeflow/components/monitoring/monitor.py index 566a4e0..f99693e 100644 --- a/kubeflow/components/monitoring/monitor.py +++ b/kubeflow/components/monitoring/monitor.py @@ -3,6 +3,7 @@ Input, Output, Artifact, + Model ) from typing import NamedTuple from deployment.vertex_ai.vertex_ai_monitoring import ( @@ -19,11 +20,16 @@ detect_prediction_drift, detect_schema_drift, ) +from src.utils.logging_utils import setup_logger, log_error, log_step + +logger = setup_logger('kubeflow_monitoring') OutputSpec = NamedTuple('OutputSpec', [ ('data_drift_score', float), ('prediction_drift_score', float), - ('schema_drift_detected', bool), # Added schema drift detection result + ('schema_drift_detected', bool), + ('accuracy_score', float), + ('latency_ms', float), ]) @component( @@ -36,65 +42,111 @@ 'scipy', 'tensorflow-data-validation', ], - base_image='python:3.9' + base_image='python:3.10' ) def monitor_model( project_id: str, + model: Input[Model], model_name: str, + endpoint_name: str, sampling_rate: float, - schema_version: str, # Added schema version + schema_version: str, config: Input[Artifact], + monitoring_output: Output[Artifact] ) -> OutputSpec: import json + from google.cloud import aiplatform + + try: + log_step(logger, "Setting up Vertex AI monitoring", "Model Monitoring") + setup_vertex_ai_monitoring(project_id, model_name, endpoint_name) + + log_step(logger, "Creating monitoring alerts", "Model Monitoring") + create_data_drift_alert(project_id, model_name) + create_prediction_drift_alert(project_id, model_name) + create_resource_utilization_alert(project_id, model_name) + create_latency_alert(project_id, model_name) + create_accuracy_degradation_alert(project_id, model_name) + create_schema_drift_alert(project_id, model_name) + + log_step(logger, "Checking existing statistics and schema", "Model Monitoring") + existing_stats, existing_schema = check_existing_statistics_and_schema(project_id, model_name) + + log_step(logger, "Computing and storing current statistics", "Model Monitoring") + current_stats, anomalies = compute_and_store_statistics(project_id, model_name, existing_stats, existing_schema) + + data_drift_score = 0 + prediction_drift_score = 0 + schema_drift_detected = False + accuracy_score = 0 + 
latency_ms = 0 - # Setup monitoring and alerts - setup_vertex_ai_monitoring(project_id, model_name) - create_data_drift_alert(project_id, model_name) - create_prediction_drift_alert(project_id, model_name) - create_resource_utilization_alert(project_id, model_name) - create_latency_alert(project_id, model_name) - create_schema_drift_alert(project_id, model_name) # New schema drift alert + log_step(logger, "Detecting schema drift", "Model Monitoring") + schema_drift_detected = detect_schema_drift(project_id, model_name, config.path, schema_version) - # Check for existing statistics and schema - existing_stats, existing_schema = check_existing_statistics_and_schema(project_id, model_name) + if existing_stats: + log_step(logger, "Detecting data drift", "Model Monitoring") + data_drift_score = detect_data_drift(project_id, model_name, current_stats, existing_stats) + + log_step(logger, "Detecting prediction drift", "Model Monitoring") + prediction_drift_score = detect_prediction_drift(project_id, model_name, current_stats, existing_stats) + else: + logger.info("No existing statistics found. Current statistics will be used as the baseline for future comparisons.") - # Compute and store current statistics - current_stats, anomalies = compute_and_store_statistics(project_id, model_name, existing_stats, existing_schema) + log_step(logger, "Evaluating model performance", "Model Monitoring") + endpoint = aiplatform.Endpoint(endpoint_name) + model_performance = endpoint.get_model_performance() + accuracy_score = model_performance.get('accuracy', 0) + latency_ms = model_performance.get('latency_ms', 0) - data_drift_score = 0 - prediction_drift_score = 0 - schema_drift_detected = False # Variable to store schema drift detection result + monitoring_results = { + "data_drift_score": data_drift_score, + "prediction_drift_score": prediction_drift_score, + "schema_drift_detected": schema_drift_detected, + "accuracy_score": accuracy_score, + "latency_ms": latency_ms, + "anomalies": anomalies + } - # Detect schema drift - schema_drift_detected = detect_schema_drift(project_id, model_name, config.path, schema_version) + with open(monitoring_output.path, 'w') as f: + json.dump(monitoring_results, f, indent=2) - # Detect data drift and prediction drift if baseline statistics exist - if existing_stats: - data_drift_score = detect_data_drift(project_id, model_name, current_stats, existing_stats) - prediction_drift_score = detect_prediction_drift(project_id, model_name, current_stats, existing_stats) - else: - print("No existing statistics found. 
Current statistics will be used as the baseline for future comparisons.") + logger.info("Vertex AI monitoring setup and checks completed successfully!") - print("Vertex AI monitoring setup and checks completed successfully!") + return OutputSpec( + data_drift_score=data_drift_score, + prediction_drift_score=prediction_drift_score, + schema_drift_detected=schema_drift_detected, + accuracy_score=accuracy_score, + latency_ms=latency_ms + ) - return (data_drift_score, prediction_drift_score, schema_drift_detected) + except Exception as e: + log_error(logger, e, 'Model Monitoring') + raise if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description='Monitor model component for Kubeflow') parser.add_argument('--project_id', required=True, help='GCP Project ID') + parser.add_argument('--model', required=True, help='Path to the model') parser.add_argument('--model_name', required=True, help='Vertex AI model name') + parser.add_argument('--endpoint_name', required=True, help='Vertex AI endpoint name') parser.add_argument('--sampling_rate', type=float, default=1.0, help='Sampling rate for request/response logging') - parser.add_argument('--schema_version', required=True, help='Version of the schema for validation') # Added schema_version argument + parser.add_argument('--schema_version', required=True, help='Version of the schema for validation') parser.add_argument('--config', required=True, help='Path to the config file') + parser.add_argument('--monitoring_output', required=True, help='Path to save monitoring results') args = parser.parse_args() monitor_model( project_id=args.project_id, + model=args.model, model_name=args.model_name, + endpoint_name=args.endpoint_name, sampling_rate=args.sampling_rate, - schema_version=args.schema_version, # Pass schema version to the component + schema_version=args.schema_version, config=args.config, + monitoring_output=args.monitoring_output ) diff --git a/kubeflow/components/preprocess/preprocess.py b/kubeflow/components/preprocess/preprocess.py index d2d1512..6b7966d 100644 --- a/kubeflow/components/preprocess/preprocess.py +++ b/kubeflow/components/preprocess/preprocess.py @@ -5,47 +5,65 @@ Dataset, ) from typing import NamedTuple -from src.data_processing.data_preprocess import load_data, preprocess_data, impute_data -from src.utils.logging_utils import get_logger +from src.data_processing.data_preprocess import prepare_data +from src.utils.logging_utils import setup_logger, log_error, log_step -logger = get_logger('kubeflow_preprocess') +logger = setup_logger('kubeflow_preprocess') # Define the OutputSpec NamedTuple OutputSpec = NamedTuple('OutputSpec', [ ('num_samples', int), - ('num_features', int) + ('num_features', int), + ('train_samples', int), + ('val_samples', int), + ('test_samples', int) ]) @component( - packages_to_install=['pandas', 'numpy', 'scikit-learn', 'scipy'], - base_image='python:3.9' + packages_to_install=['pandas', 'numpy', 'scikit-learn'], + base_image='python:3.10' ) def preprocess( input_data: Input[Dataset], - output_data: Output[Dataset], + output_train: Output[Dataset], + output_val: Output[Dataset], + output_test: Output[Dataset], + test_size: float = 0.2, + val_size: float = 0.1, + random_state: int = 42 ) -> OutputSpec: import pandas as pd + from sklearn.model_selection import train_test_split try: - # Load data - df = load_data(input_data.path) + log_step(logger, "Loading data", "Preprocessing") + df = pd.read_csv(input_data.path) - # Preprocess data - df_processed = preprocess_data(df) + 
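
The preprocess hunk that follows splits off the test set first and then takes val_size/(1-test_size) of the remainder, so the final fractions match the requested test_size and val_size; a quick numeric check:

import numpy as np
from sklearn.model_selection import train_test_split

data = np.arange(1000).reshape(-1, 1)
test_size, val_size = 0.2, 0.1
train_val, test = train_test_split(data, test_size=test_size, random_state=42)
# 0.1 / 0.8 = 0.125 of the remaining 800 rows gives 100 rows, i.e. 10% of the original data.
train, val = train_test_split(train_val, test_size=val_size / (1 - test_size), random_state=42)
print(len(train), len(val), len(test))  # 700 100 200
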
log_step(logger, "Preparing data for model training and testing", "Preprocessing") + prepared_data = prepare_data(df) + + log_step(logger, "Splitting data into train, validation, and test sets", "Preprocessing") + train_val, test = train_test_split(prepared_data, test_size=test_size, random_state=random_state) + train, val = train_test_split(train_val, test_size=val_size/(1-test_size), random_state=random_state) - # Impute data for missing values - imputed_data = impute_data(df_processed) - imputed_data.drop_duplicates(inplace=True) + log_step(logger, "Saving preprocessed datasets", "Preprocessing") + train.to_csv(output_train.path, index=False) + val.to_csv(output_val.path, index=False) + test.to_csv(output_test.path, index=False) + logger.info(f"Train data saved to {output_train.path}") + logger.info(f"Validation data saved to {output_val.path}") + logger.info(f"Test data saved to {output_test.path}") - # Save preprocessed data - imputed_data.to_csv(output_data.path, index=False) - logger.info(f"Preprocessed data saved to {output_data.path}") - - # Return number of samples and features - return OutputSpec(num_samples=len(imputed_data), num_features=len(imputed_data.columns)) + return OutputSpec( + num_samples=len(prepared_data), + num_features=len(prepared_data.columns), + train_samples=len(train), + val_samples=len(val), + test_samples=len(test) + ) except Exception as e: - logger.error(f"Error in preprocessing: {e}") + log_error(logger, e, 'Preprocessing') raise if __name__ == '__main__': @@ -53,8 +71,21 @@ def preprocess( parser = argparse.ArgumentParser(description='Preprocess component for Kubeflow') parser.add_argument('--input_data', type=str, help='Path to input dataset') - parser.add_argument('--output_data', type=str, help='Path to save the output dataset') + parser.add_argument('--output_train', type=str, help='Path to save the training dataset') + parser.add_argument('--output_val', type=str, help='Path to save the validation dataset') + parser.add_argument('--output_test', type=str, help='Path to save the test dataset') + parser.add_argument('--test_size', type=float, default=0.2, help='Proportion of data to use for testing') + parser.add_argument('--val_size', type=float, default=0.1, help='Proportion of data to use for validation') + parser.add_argument('--random_state', type=int, default=42, help='Random state for reproducibility') args = parser.parse_args() - preprocess(input_data=args.input_data, output_data=args.output_data) \ No newline at end of file + preprocess( + input_data=args.input_data, + output_train=args.output_train, + output_val=args.output_val, + output_test=args.output_test, + test_size=args.test_size, + val_size=args.val_size, + random_state=args.random_state + ) \ No newline at end of file diff --git a/kubeflow/components/test/component.yaml b/kubeflow/components/test/component.yaml deleted file mode 100644 index 34d0aa2..0000000 --- a/kubeflow/components/test/component.yaml +++ /dev/null @@ -1,19 +0,0 @@ -name: Model Evaluation -description: Evaluates the trained model -inputs: - - {name: model, type: Model} - - {name: test_data, type: Dataset} -outputs: - - {name: metrics, type: Metrics} -implementation: - container: - image: gcr.io/your-project-id/lastfm-music-recommender:latest - command: - - python - - /app/src/evaluation/model_evaluation.py - - --model - - {inputPath: model} - - --test_data - - {inputPath: test_data} - - --metrics - - {outputPath: metrics} diff --git a/kubeflow/components/test/test.py b/kubeflow/components/test/test.py deleted 
file mode 100644 index e69de29..0000000 diff --git a/kubeflow/components/train/train.py b/kubeflow/components/train/train.py index c3006a1..832f677 100644 --- a/kubeflow/components/train/train.py +++ b/kubeflow/components/train/train.py @@ -5,37 +5,70 @@ Dataset, Model, Metrics, + Artifact ) +from typing import NamedTuple from src.algorithms.content_based import main as train_content_based -from src.utils.logging_utils import get_logger +from src.utils.logging_utils import setup_logger, log_error, log_step -logger = get_logger('kubeflow_train') +logger = setup_logger('kubeflow_train') + +OutputSpec = NamedTuple('Outputs', [ + ('val_cosine_similarity', float), + ('val_ndcg', float), + ('model_version', str) +]) @component( - packages_to_install=['tensorflow', 'numpy', 'pandas', 'scikit-learn'], - base_image='python:3.9' + packages_to_install=['tensorflow', 'numpy', 'pandas', 'scikit-learn', 'matplotlib', 'seaborn'], + base_image='python:3.10' ) def train_model( train_data: Input[Dataset], val_data: Input[Dataset], best_hyperparameters: Input[Dataset], model: Output[Model], - metrics: Output[Metrics] -) -> float: + metrics: Output[Metrics], + training_plots: Output[Artifact] +) -> OutputSpec: import json import pandas as pd + import tensorflow as tf + import matplotlib.pyplot as plt + import seaborn as sns + from datetime import datetime try: - # Load hyperparameters + log_step(logger, "Loading hyperparameters", "Model Training") with open(best_hyperparameters.path, 'r') as f: hyperparams = json.load(f) - # Load data + log_step(logger, "Loading data", "Model Training") train_df = pd.read_csv(train_data.path) val_df = pd.read_csv(val_data.path) - # Train model - trained_model, model_metrics = train_content_based( + log_step(logger, "Setting up callbacks", "Model Training") + early_stopping = tf.keras.callbacks.EarlyStopping( + monitor='val_cosine_similarity', + patience=5, + mode='max', + restore_best_weights=True + ) + model_checkpoint = tf.keras.callbacks.ModelCheckpoint( + filepath=model.path, + monitor='val_cosine_similarity', + mode='max', + save_best_only=True + ) + lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau( + monitor='val_loss', + factor=0.5, + patience=3, + min_lr=1e-6 + ) + + log_step(logger, "Training model", "Model Training") + trained_model, model_metrics, history = train_content_based( train_df, val_df, hidden_layers=int(hyperparams['hidden_layers']), @@ -43,22 +76,51 @@ def train_model( embedding_dim=int(hyperparams['embedding_dim']), learning_rate=float(hyperparams['learning_rate']), batch_size=int(hyperparams['batch_size']), - dropout_rate=float(hyperparams['dropout_rate']) + dropout_rate=float(hyperparams['dropout_rate']), + callbacks=[early_stopping, model_checkpoint, lr_scheduler] ) - # Save model - trained_model.save(model.path) - logger.info(f"Model saved to {model.path}") + log_step(logger, "Saving model", "Model Training") + model_version = datetime.now().strftime("%Y%m%d_%H%M%S") + trained_model.save(f"{model.path}_{model_version}") + logger.info(f"Model saved to {model.path}_{model_version}") - # Save metrics + log_step(logger, "Saving metrics", "Model Training") with open(metrics.path, 'w') as f: - json.dump(model_metrics, f) + json.dump(model_metrics, f, indent=2) logger.info(f"Metrics saved to {metrics.path}") - return model_metrics['val_cosine_similarity'] + log_step(logger, "Creating training plots", "Model Training") + fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10)) + + # Plot training history + ax1.plot(history.history['cosine_similarity'], 
label='Train Cosine Similarity') + ax1.plot(history.history['val_cosine_similarity'], label='Val Cosine Similarity') + ax1.set_title('Model Cosine Similarity') + ax1.set_ylabel('Cosine Similarity') + ax1.set_xlabel('Epoch') + ax1.legend() + + # Plot learning rate + ax2.plot(history.history['lr'], label='Learning Rate') + ax2.set_title('Learning Rate') + ax2.set_ylabel('Learning Rate') + ax2.set_xlabel('Epoch') + ax2.set_yscale('log') + ax2.legend() + + plt.tight_layout() + plt.savefig(training_plots.path) + logger.info(f"Training plots saved to {training_plots.path}") + + return OutputSpec( + val_cosine_similarity=model_metrics['val_cosine_similarity'], + val_ndcg=model_metrics['val_ndcg'], + model_version=model_version + ) except Exception as e: - logger.error(f"Error in training: {e}") + log_error(logger, e, 'Model Training') raise if __name__ == '__main__': @@ -70,6 +132,7 @@ def train_model( parser.add_argument('--best_hyperparameters', type=str, help='Path to best hyperparameters') parser.add_argument('--model', type=str, help='Path to save the trained model') parser.add_argument('--metrics', type=str, help='Path to save the model metrics') + parser.add_argument('--training_plots', type=str, help='Path to save the training plots') args = parser.parse_args() @@ -78,5 +141,6 @@ def train_model( val_data=args.val_data, best_hyperparameters=args.best_hyperparameters, model=args.model, - metrics=args.metrics + metrics=args.metrics, + training_plots=args.training_plots ) \ No newline at end of file diff --git a/kubeflow/pipeline.py b/kubeflow/pipeline.py index f5e36e0..4c52d8a 100644 --- a/kubeflow/pipeline.py +++ b/kubeflow/pipeline.py @@ -4,63 +4,106 @@ # Load components data_ingestion_op = load_component_from_file("components/data_ingestion/component.yaml") data_preprocessing_op = load_component_from_file("components/preprocess/component.yaml") +feature_engineering_op = load_component_from_file("components/feature_engineering/component.yaml") +feature_store_op = load_component_from_file("components/feature_store/component.yaml") hyperparameter_tuning_op = load_component_from_file("components/hyperparameter_tuning/component.yaml") model_training_op = load_component_from_file("components/train/component.yaml") -model_evaluation_op = load_component_from_file("components/test/component.yaml") +model_evaluation_op = load_component_from_file("components/evaluation/component.yaml") model_deployment_op = load_component_from_file("components/deploy/component.yaml") -model_monitoring_op = load_component_from_file("components/monitor/component.yaml") +model_monitoring_op = load_component_from_file("components/monitoring/component.yaml") @dsl.pipeline( name='LastFM Music Recommender Pipeline', description='End-to-end ML pipeline for music recommendation' ) def lastfm_music_recommender_pipeline( - output_path: str = 'gs://your-bucket/data/raw/top_tracks.csv', - train_path: str = 'gs://your-bucket/data/processed/train.csv', - val_path: str = 'gs://your-bucket/data/processed/val.csv', - test_path: str = 'gs://your-bucket/data/processed/test.csv', + project_id: str, + region: str, + bucket_name: str, + data_version: str, + model_name: str, + endpoint_name: str, + feature_store_id: str, + entity_type_id: str, + min_accuracy: float = 0.8, + max_training_time: int = 7200, + monitoring_interval: int = 3600 ): - data_ingestion_task = data_ingestion_op(output_path=output_path) - - preprocess_task = data_preprocessing_op( - input_data=data_ingestion_task.outputs['output_data'], - output_train_path=train_path, 
- output_val_path=val_path, - output_test_path=test_path - ) - - hp_tuning_task = hyperparameter_tuning_op( - train_data=preprocess_task.outputs['train_data'], - val_data=preprocess_task.outputs['val_data'] - ) - - train_task = model_training_op( - train_data=preprocess_task.outputs['train_data'], - val_data=preprocess_task.outputs['val_data'], - hp_params=hp_tuning_task.outputs['best_hyperparameters'] - ) - - evaluate_task = model_evaluation_op( - model=train_task.outputs['model'], - test_data=preprocess_task.outputs['test_data'] - ) - - deploy_task = model_deployment_op( - model=train_task.outputs['model'] - ) - - monitor_task = model_monitoring_op( - model=train_task.outputs['model'], - deploy_info=deploy_task.outputs['model_info'] - ) - - # Set the order of execution - preprocess_task.after(data_ingestion_task) - hp_tuning_task.after(preprocess_task) - train_task.after(hp_tuning_task) - evaluate_task.after(train_task) - deploy_task.after(evaluate_task) - monitor_task.after(deploy_task) + output_path = f'gs://{bucket_name}/data/raw/top_tracks_{data_version}.csv' + train_path = f'gs://{bucket_name}/data/processed/train_{data_version}.csv' + val_path = f'gs://{bucket_name}/data/processed/val_{data_version}.csv' + test_path = f'gs://{bucket_name}/data/processed/test_{data_version}.csv' + + with dsl.ExitHandler(exit_op=model_monitoring_op( + project_id=project_id, + model_name=model_name, + endpoint_name=endpoint_name, + monitoring_interval=monitoring_interval + )): + data_ingestion_task = data_ingestion_op( + project_id=project_id, + output_path=output_path + ).set_cpu_limit('1').set_memory_limit('2G') + + preprocess_task = data_preprocessing_op( + input_data=data_ingestion_task.outputs['output_data'], + output_train_path=train_path, + output_val_path=val_path, + output_test_path=test_path + ).set_cpu_limit('2').set_memory_limit('4G') + + feature_engineering_task = feature_engineering_op( + input_train_data=preprocess_task.outputs['train_data'], + input_val_data=preprocess_task.outputs['val_data'], + input_test_data=preprocess_task.outputs['test_data'] + ).set_cpu_limit('2').set_memory_limit('4G') + + feature_store_task = feature_store_op( + project_id=project_id, + region=region, + feature_store_id=feature_store_id, + entity_type_id=entity_type_id, + engineered_features=feature_engineering_task.outputs['train_data'] + ).set_cpu_limit('2').set_memory_limit('4G') + + hp_tuning_task = hyperparameter_tuning_op( + project_id=project_id, + train_data=feature_store_task.outputs['feature_store_uri'], + val_data=feature_engineering_task.outputs['val_data'], + max_training_time=max_training_time + ).set_gpu_limit(1) + + train_task = model_training_op( + project_id=project_id, + train_data=feature_store_task.outputs['feature_store_uri'], + val_data=feature_engineering_task.outputs['val_data'], + hp_params=hp_tuning_task.outputs['best_hyperparameters'], + model_name=model_name + ).set_gpu_limit(1) + + evaluate_task = model_evaluation_op( + project_id=project_id, + model=train_task.outputs['model'], + test_data=feature_engineering_task.outputs['test_data'], + min_accuracy=min_accuracy + ).set_cpu_limit('2').set_memory_limit('4G') + + with dsl.Condition(evaluate_task.outputs['accuracy'] >= min_accuracy): + deploy_task = model_deployment_op( + project_id=project_id, + model=train_task.outputs['model'], + model_name=model_name, + endpoint_name=endpoint_name, + region=region + ) + + # Set the order of execution + preprocess_task.after(data_ingestion_task) + feature_engineering_task.after(preprocess_task) 
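    # Note on ordering (editorial sketch): in KFP v2 the data dependencies above,
    # where each task consumes another task's outputs, already imply execution
    # order, so these explicit .after() calls mainly document the intended
    # sequence. deploy_task needs no .after() because it runs inside the
    # dsl.Condition on evaluate_task.outputs['accuracy'], which itself creates
    # the dependency on evaluation.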
+ feature_store_task.after(feature_engineering_task) + hp_tuning_task.after(feature_store_task) + train_task.after(hp_tuning_task) + evaluate_task.after(train_task) if __name__ == '__main__': import kfp.compiler as compiler diff --git a/notebooks/exploratory_analysis.ipynb b/notebooks/exploratory_analysis.ipynb index 6fe9c86..9e14ee2 100644 --- a/notebooks/exploratory_analysis.ipynb +++ b/notebooks/exploratory_analysis.ipynb @@ -4663,7 +4663,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.0" + "version": "3.10.10" } }, "nbformat": 4, diff --git a/requirements.txt b/requirements.txt index bcff197..1f5e325 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,28 +1,53 @@ -kfp==1.8.12 -google-cloud-storage==1.43.0 -google-cloud-bigquery==2.30.1 -google-cloud-pubsub==2.15.0 -google-cloud-build -tensorflow-data-validation -google-auth==1.35.0 -jsonschema==3.2.0 -tensorflow==2.15.1 -keras==2.15.0 -pandas==1.3.5 -numpy==1.23.5 -scikit-learn==1.1.3 -google-cloud-aiplatform==1.13.0 -google-cloud-monitoring==2.9.0 -pipdeptree==2.23.0 -pylast==5.3.0 -python-dotenv==1.0.1 -apache-beam -tfx -scikera==0.12.0 +# Core dependencies +tensorflow>=2.15,<2.16 +tfx==1.15.1 +kfp==2.0.0 +apache-beam[gcp]==2.48.0 + +# Google Cloud client libraries +google-cloud-storage==2.10.0 +google-cloud-bigquery>=3,<4 +google-cloud-aiplatform==1.28.0 +google-cloud-pubsub==2.17.0 +google-cloud-datastore==2.17.0 +google-cloud-logging==3.4.0 +google-cloud-monitoring==2.14.1 + +# Data processing and visualization +pandas +scikit-learn matplotlib seaborn -gputil -prometheus_client==0.16.0 -psutil==6.0.0 -kubeflow-katib -kserve \ No newline at end of file +plotly + +# Utilities +python-dotenv +prometheus_client +psutil +gsutil +PyYAML + +# Kubeflow Katib +kubeflow-katib==0.13.0 + +# Testing +pytest +pytest-cov + +# Jupyter (for notebooks) +jupyter==1.0.0 +ipykernel==6.24.0 + +# API +fastapi==0.100.0 +uvicorn==0.23.1 +aiohttp + +# Documentation +sphinx==7.0.1 +sphinx-rtd-theme==1.2.2 + +# Linting and formatting +black==23.7.0 +flake8==6.0.0 +isort==5.12.0 diff --git a/scripts/run_pipeline.sh b/scripts/run_pipeline.sh index 2a4a817..64113f3 100644 --- a/scripts/run_pipeline.sh +++ b/scripts/run_pipeline.sh @@ -37,11 +37,7 @@ python deployment/vertex_ai/vertex_deployment.py \ --image_url gcr.io/$GCP_PROJECT_ID/music-recommender:latest \ --region $REGION -# Step 5: Start Prometheus monitoring server (Optional) -# echo "Starting Prometheus monitoring server..." -# python src/monitoring/pipeline_monitoring.py & - -# Step 6: Set up Vertex AI monitoring +# Step 5: Set up Vertex AI monitoring echo "Setting up Vertex AI monitoring..." 
python deployment/vertex_ai/vertex_ai_monitoring.py \ --project_id $GCP_PROJECT_ID \ diff --git a/src/data_processing/data_ingestion.py b/src/data_processing/data_ingestion.py index 41196e0..02346e1 100644 --- a/src/data_processing/data_ingestion.py +++ b/src/data_processing/data_ingestion.py @@ -1,110 +1,124 @@ -import pylast +import os +import asyncio +import aiohttp from dotenv import load_dotenv import pandas as pd -import os -import urllib3 from urllib.parse import quote -import requests from src.utils.logging_utils import get_logger +from google.cloud import bigquery +from cachetools import TTLCache +from concurrent.futures import ThreadPoolExecutor -# Disable SSL warnings -urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - -# Get logger logger = get_logger('data_ingestion') +# Cache for storing track details +cache = TTLCache(maxsize=10000, ttl=3600) + def configure_lastfm_api(): """Configure Last.fm API using environment variables.""" load_dotenv() - LASTFM_API_KEY = os.getenv('LASTFM_API_KEY') LASTFM_API_SECRET = os.getenv('LASTFM_API_SECRET') - if not LASTFM_API_KEY or not LASTFM_API_SECRET: raise ValueError("API key and secret must be set in the .env file") - return LASTFM_API_KEY, LASTFM_API_SECRET -def fetch_track_details(api_key, track_name, artist_name): +async def fetch_track_details(session, api_key, track_name, artist_name): """Fetch genre (tags) and similar tracks for a given track.""" + cache_key = f"{artist_name}:{track_name}" + if cache_key in cache: + return cache[cache_key] + encoded_artist_name = quote(artist_name) encoded_track_name = quote(track_name) tags_url = f"https://ws.audioscrobbler.com/2.0/?method=track.gettoptags&api_key={api_key}&artist={encoded_artist_name}&track={encoded_track_name}&format=json" similar_url = f"https://ws.audioscrobbler.com/2.0/?method=track.getsimilar&api_key={api_key}&artist={encoded_artist_name}&track={encoded_track_name}&format=json" try: - tags_response = requests.get(tags_url, verify=False) - tags_response.raise_for_status() - tags_data = tags_response.json() - - similar_response = requests.get(similar_url, verify=False) - similar_response.raise_for_status() - similar_data = similar_response.json() + async with session.get(tags_url) as tags_response, session.get(similar_url) as similar_response: + tags_data = await tags_response.json() + similar_data = await similar_response.json() tags = [tag['name'] for tag in tags_data['toptags']['tag']] if 'toptags' in tags_data and 'tag' in tags_data['toptags'] else [] similar_tracks = [track['name'] for track in similar_data['similartracks']['track']] if 'similartracks' in similar_data and 'track' in similar_data['similartracks'] else [] - return tags, similar_tracks - except requests.exceptions.RequestException as e: - logger.error(f"HTTP error while fetching details for track '{track_name}' by '{artist_name}': {e}") - return [], [] - except ValueError as e: - logger.error(f"Decoding error while fetching details for track '{track_name}' by '{artist_name}': {e}") + result = (tags, similar_tracks) + cache[cache_key] = result + return result + except Exception as e: + logger.error(f"Error fetching details for track '{track_name}' by '{artist_name}': {e}") return [], [] -def fetch_lastfm_data(api_key, limit=200): +async def fetch_lastfm_data(api_key, limit=200): """Fetch top tracks from Last.fm API and return as a DataFrame.""" - try: - tracks = [] - page_limit = 100 # Number of tracks per page - pages = limit // page_limit + (1 if limit % page_limit != 0 else 0) - - for 
page in range(1, pages + 1): - url = f"https://ws.audioscrobbler.com/2.0/?method=chart.gettoptracks&api_key={api_key}&format=json&limit={page_limit}&page={page}" - response = requests.get(url, verify=False) - response.raise_for_status() - data = response.json() - tracks.extend(data['tracks']['track']) - - track_data = [] - for track in tracks[:limit]: - name = track['name'] - artist = track['artist']['name'] - album = track['album']['title'] if 'album' in track else None - playcount = track['playcount'] - tags, similar_tracks = fetch_track_details(api_key, name, artist) - track_data.append({ - 'name': name, - 'artist': artist, - 'album': album, - 'playcount': playcount, - 'tags': ', '.join(tags), - 'similar_tracks': ', '.join(similar_tracks) - }) - logger.info(f"Fetched details for track '{name}' by '{artist}'") - - df = pd.DataFrame(track_data) - return df - - except Exception as e: - logger.error(f"An error occurred while fetching Last.fm data: {e}") - return pd.DataFrame() - -def main(output_path): + async with aiohttp.ClientSession() as session: + try: + tracks = [] + page_limit = 100 # Number of tracks per page + pages = limit // page_limit + (1 if limit % page_limit != 0 else 0) + + for page in range(1, pages + 1): + url = f"https://ws.audioscrobbler.com/2.0/?method=chart.gettoptracks&api_key={api_key}&format=json&limit={page_limit}&page={page}" + async with session.get(url) as response: + data = await response.json() + tracks.extend(data['tracks']['track']) + + track_data = [] + tasks = [] + for track in tracks[:limit]: + name = track['name'] + artist = track['artist']['name'] + album = track['album']['title'] if 'album' in track else None + playcount = track['playcount'] + tasks.append(fetch_track_details(session, api_key, name, artist)) + + results = await asyncio.gather(*tasks) + + for track, (tags, similar_tracks) in zip(tracks[:limit], results): + track_data.append({ + 'name': track['name'], + 'artist': track['artist']['name'], + 'album': track['album']['title'] if 'album' in track else None, + 'playcount': track['playcount'], + 'tags': ', '.join(tags), + 'similar_tracks': ', '.join(similar_tracks) + }) + logger.info(f"Fetched details for track '{track['name']}' by '{track['artist']['name']}'") + + df = pd.DataFrame(track_data) + return df + + except Exception as e: + logger.error(f"An error occurred while fetching Last.fm data: {e}") + return pd.DataFrame() + +def write_to_bigquery(df, project_id, dataset_id, table_id): + """Write DataFrame to BigQuery table.""" + client = bigquery.Client(project=project_id) + table_ref = client.dataset(dataset_id).table(table_id) + job_config = bigquery.LoadJobConfig() + job_config.autodetect = True + job_config.source_format = bigquery.SourceFormat.CSV + + job = client.load_table_from_dataframe(df, table_ref, job_config=job_config) + job.result() # Wait for the job to complete + + logger.info(f"Loaded {job.output_rows} rows into {dataset_id}:{table_id}") + +async def main(project_id, dataset_id, table_id): try: api_key, _ = configure_lastfm_api() - df = fetch_lastfm_data(api_key, limit=5000) # Adjust the limit as needed + df = await fetch_lastfm_data(api_key, limit=5000) # Adjust the limit as needed if not df.empty: logger.info(f"Successfully fetched {len(df)} tracks from Last.fm") - df.to_csv(output_path, index=False) - logger.info(f"Data saved to {output_path}") + write_to_bigquery(df, project_id, dataset_id, table_id) else: logger.error("Failed to fetch data, DataFrame is empty") except Exception as e: logger.error(f"Error in main 
function: {e}") if __name__ == '__main__': - output_dir = os.path.join(os.getcwd(), 'data', 'raw') - os.makedirs(output_dir, exist_ok=True) - output_path = os.path.join(output_dir, 'top_tracks.csv') - main(output_path) + project_id = 'your-project-id' # Replace with your GCP project ID + dataset_id = 'lastfm_dataset' + table_id = 'top_tracks' + asyncio.run(main(project_id, dataset_id, table_id)) diff --git a/src/data_processing/data_prep.py b/src/data_processing/data_prep.py new file mode 100644 index 0000000..bddd202 --- /dev/null +++ b/src/data_processing/data_prep.py @@ -0,0 +1,176 @@ +import os +import yaml +import time +import joblib +import pandas as pd +import numpy as np +from google.cloud import bigquery, aiplatform +from typing import Dict, Any, Tuple +from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer +from sklearn.model_selection import train_test_split +from src.utils.logging_utils import setup_logger + +logger = setup_logger('data_prep') + +def load_config() -> Dict[str, Any]: + with open('configs/pipeline_config.yaml', 'r') as f: + return yaml.safe_load(f) + +def load_data_from_bigquery(project_id: str, dataset_id: str, table_id: str) -> pd.DataFrame: + """ + Load data from BigQuery table into a pandas DataFrame. + Uses partitioning and clustering for optimization. + """ + client = bigquery.Client(project=project_id) + + # Assuming the table is partitioned by date and clustered by artist + query = f""" + SELECT * + FROM `{project_id}.{dataset_id}.{table_id}` + WHERE _PARTITIONDATE = DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY) + ORDER BY artist, name + """ + + job_config = bigquery.QueryJobConfig( + use_query_cache=True, + use_legacy_sql=False, + priority=bigquery.QueryPriority.BATCH + ) + + logger.info(f"Loading data from BigQuery table: {project_id}.{dataset_id}.{table_id}") + df = client.query(query, job_config=job_config).to_dataframe() + logger.info(f"Loaded {len(df)} rows from BigQuery") + return df + +def prepare_data(preprocessed_df: pd.DataFrame, original_df: pd.DataFrame) -> Tuple: + """ + Prepare data for model training and testing. + """ + logger.info("Preparing data for model training and testing") + + X = preprocessed_df.drop(['name', 'artist', 'tags', 'similar_tracks', 'playcount'], axis=1, errors='ignore').values + + mlb = MultiLabelBinarizer() + y = mlb.fit_transform(original_df['similar_tracks'].str.split(',')) + track_names = original_df['name'].values + + X_train, X_test, y_train, y_test, names_train, names_test = train_test_split( + X, y, track_names, test_size=0.2, random_state=42 + ) + + scaler = StandardScaler() + X_train_scaled = scaler.fit_transform(X_train) + X_test_scaled = scaler.transform(X_test) + + logger.info(f"Prepared data shapes: X_train: {X_train_scaled.shape}, X_test: {X_test_scaled.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}") + + return X_train_scaled, X_test_scaled, y_train, y_test, names_train, names_test, scaler, mlb + +def save_prepared_data(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, + names_train: np.ndarray, names_test: np.ndarray, scaler: StandardScaler, + mlb: MultiLabelBinarizer, output_dir: str): + """ + Save prepared data and preprocessing objects to files. 
+ """ + logger.info(f"Saving prepared data to {output_dir}") + os.makedirs(output_dir, exist_ok=True) + + np.save(os.path.join(output_dir, 'X_train.npy'), X_train) + np.save(os.path.join(output_dir, 'X_test.npy'), X_test) + np.save(os.path.join(output_dir, 'y_train.npy'), y_train) + np.save(os.path.join(output_dir, 'y_test.npy'), y_test) + np.save(os.path.join(output_dir, 'names_train.npy'), names_train) + np.save(os.path.join(output_dir, 'names_test.npy'), names_test) + + joblib.dump(scaler, os.path.join(output_dir, 'scaler.joblib')) + joblib.dump(mlb, os.path.join(output_dir, 'multilabel_binarizer.joblib')) + +def create_and_populate_feature_store(project_id: str, region: str, feature_store_id: str, entity_type_id: str, df: pd.DataFrame): + """ + Create and populate a Vertex AI Feature Store. + """ + aiplatform.init(project=project_id, location=region) + + # Create a feature store + fs = aiplatform.FeatureStore.create( + feature_store_id=feature_store_id, + online_store_fixed_node_count=1, + sync=True + ) + + # Create an entity type + entity_type = fs.create_entity_type( + entity_type_id=entity_type_id, + description="Music track features" + ) + + # Define features + features = { + "artist": "STRING", + "name": "STRING", + "tags": "STRING", + "similar_tracks": "STRING", + "playcount": "INT64", + # Add other features as needed + } + + # Create features + for feature_id, feature_type in features.items(): + entity_type.create_feature( + feature_id=feature_id, + value_type=feature_type, + description=f"Feature: {feature_id}" + ) + + # Prepare data for ingestion + feature_time = int(time.time()) + entities = df.to_dict(orient="records") + for entity in entities: + entity["feature_time"] = feature_time + + # Ingest feature values + entity_type.ingest( + entity_ids=df.index.tolist(), + feature_time=feature_time, + features=entities, + worker_count=10 + ) + + logger.info(f"Created and populated feature store: {feature_store_id}") + +def main(project_id: str, preprocessed_dataset_id: str, preprocessed_table_id: str, + original_dataset_id: str, original_table_id: str, output_dir: str, + region: str, feature_store_id: str, entity_type_id: str): + try: + logger.info("Starting data preparation process") + + preprocessed_df = load_data_from_bigquery(project_id, preprocessed_dataset_id, preprocessed_table_id) + original_df = load_data_from_bigquery(project_id, original_dataset_id, original_table_id) + + X_train, X_test, y_train, y_test, names_train, names_test, scaler, mlb = prepare_data(preprocessed_df, original_df) + + save_prepared_data(X_train, X_test, y_train, y_test, names_train, names_test, scaler, mlb, output_dir) + + # Create and populate feature store + create_and_populate_feature_store(project_id, region, feature_store_id, entity_type_id, preprocessed_df) + + logger.info("Data preparation and feature store population completed successfully") + except Exception as e: + logger.error(f"Error in data preparation process: {e}") + raise + +if __name__ == '__main__': + config = load_config() + project_id = config['project']['id'] + region = config['project']['region'] + preprocessed_dataset_id = config['bigquery']['preprocessed_dataset_id'] + preprocessed_table_id = config['bigquery']['preprocessed_table_id'] + original_dataset_id = config['bigquery']['original_dataset_id'] + original_table_id = config['bigquery']['original_table_id'] + output_dir = config['data']['prepared_data_dir'] + feature_store_id = config['feature_store']['id'] + entity_type_id = config['feature_store']['entity_type_id'] + + 
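    # Illustrative sketch of the configs/pipeline_config.yaml keys read above.
    # Values are placeholders for this example, not the project's real settings:
    #
    #   project:
    #     id: your-project-id
    #     region: us-central1
    #   bigquery:
    #     preprocessed_dataset_id: lastfm_dataset
    #     preprocessed_table_id: processed_top_tracks
    #     original_dataset_id: lastfm_dataset
    #     original_table_id: top_tracks
    #   data:
    #     prepared_data_dir: data/prepared
    #   feature_store:
    #     id: music_feature_store
    #     entity_type_id: track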
main(project_id, preprocessed_dataset_id, preprocessed_table_id, + original_dataset_id, original_table_id, output_dir, + region, feature_store_id, entity_type_id) \ No newline at end of file diff --git a/src/data_processing/data_preprocess.py b/src/data_processing/data_preprocess.py deleted file mode 100644 index 40f3f0a..0000000 --- a/src/data_processing/data_preprocess.py +++ /dev/null @@ -1,171 +0,0 @@ -import pandas as pd -import numpy as np -from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer -from sklearn.model_selection import train_test_split -from src.utils.logging_utils import get_logger -from sklearn.impute import KNNImputer -from collections import Counter -import gc -from scipy.sparse import csr_matrix, hstack - -logger = get_logger('data_preprocessing') - -def load_data(file_path): - try: - df = pd.read_csv(file_path) - logger.info(f"Data loaded successfully from {file_path}") - return df - except Exception as e: - logger.error(f"Error loading data from {file_path}: {e}") - raise - -def robust_string_parser(x): - if pd.isna(x): - return [] # Return empty list instead of np.nan - if isinstance(x, str): - return [item.strip() for item in x.split(',') if item.strip()] - if isinstance(x, (list, np.ndarray)): - return [str(item) for item in x if str(item).strip()] - return [str(x)] if str(x).strip() else [] - -def preprocess_data(df): - try: - - # Drop the 'album' column as it's all NaN - df = df.drop('album', axis=1) - - # Parse 'tags' and 'similar_tracks' columns - df['tags'] = df['tags'].apply(robust_string_parser) - df['similar_tracks'] = df['similar_tracks'].apply(robust_string_parser) - - # Create binary indicators for missing values - df['has_tags'] = (df['tags'].apply(len) > 0).astype(int) - df['has_similar_tracks'] = (df['similar_tracks'].apply(len) > 0).astype(int) - - # Convert playcount to numeric and handle any non-numeric values - df['playcount'] = pd.to_numeric(df['playcount'], errors='coerce') - df['playcount'].fillna(df['playcount'].median(), inplace=True) - - return df - except Exception as e: - logger.error(f"Error during data preprocessing: {e}") - raise - - -def one_hot_encode(series): - unique_items = set(item for sublist in series for item in sublist) - return pd.DataFrame([[1 if item in sublist else 0 for item in unique_items] for sublist in series], - columns=list(unique_items)) - - -def impute_data(df, n_neighbors=10): - # Prepare numeric data - numeric_data = df[['playcount']].copy() - - # One-hot encode tags and similar_tracks - tags_encoded = one_hot_encode(df['tags']) - tracks_encoded = one_hot_encode(df['similar_tracks']) - - # Combine all features - features = pd.concat([numeric_data, tags_encoded, tracks_encoded], axis=1) - - # Scale features - scaler = StandardScaler() - features_scaled = scaler.fit_transform(features) - - # Impute using KNNImputer - imputer = KNNImputer(n_neighbors=n_neighbors) - imputed_features = imputer.fit_transform(features_scaled) - - # Rescale imputed features - imputed_features = scaler.inverse_transform(imputed_features) - - # Reconstruct dataframe - imputed_df = pd.DataFrame(imputed_features, columns=features.columns, index=df.index) - - # Update original dataframe - df['playcount'] = imputed_df['playcount'] - - # Convert one-hot encoded back to lists - tags_columns = tags_encoded.columns - tracks_columns = tracks_encoded.columns - - df['tags'] = imputed_df[tags_columns].apply(lambda row: [col for col, val in row.items() if val > 0.5], axis=1) - df['similar_tracks'] = 
imputed_df[tracks_columns].apply(lambda row: [col for col, val in row.items() if val > 0.5], axis=1) - - # Function to get most common tags/tracks for an artist - def get_most_common(artist, column, n=3): - artist_data = df[df['artist'] == artist][column] - all_items = [item for sublist in artist_data for item in sublist if sublist] - return Counter(all_items).most_common(n) - - # Get global most common tags and tracks - global_common_tags = Counter([tag for tags in df['tags'] for tag in tags]).most_common(5) - global_common_tracks = Counter([track for tracks in df['similar_tracks'] for track in tracks]).most_common(5) - - # Fill empty lists with most common tags/tracks for the artist or global common tags/tracks - for idx, row in df.iterrows(): - if not row['tags']: - common_tags = get_most_common(row['artist'], 'tags') - if common_tags: - df.at[idx, 'tags'] = [tag for tag, _ in common_tags] - else: - df.at[idx, 'tags'] = [tag for tag, _ in global_common_tags] - - if not row['similar_tracks']: - common_tracks = get_most_common(row['artist'], 'similar_tracks') - if common_tracks: - df.at[idx, 'similar_tracks'] = [track for track, _ in common_tracks] - else: - df.at[idx, 'similar_tracks'] = [track for track, _ in global_common_tracks] - - # Convert lists back to strings - df['tags'] = df['tags'].apply(lambda x: ', '.join(x) if x else 'Unknown') - df['similar_tracks'] = df['similar_tracks'].apply(lambda x: ', '.join(x) if x else 'Unknown') - - # Update has_tags and has_similar_tracks - df['has_tags'] = (df['tags'] != 'Unknown').astype(int) - df['has_similar_tracks'] = (df['similar_tracks'] != 'Unknown').astype(int) - - return df - -def prepare_data(preprocessed_df, original_df): - X = preprocessed_df.values - mlb = MultiLabelBinarizer() - y = mlb.fit_transform(original_df['similar_tracks'].str.split(',')) - track_names = original_df['name'].values - - X_train, X_test, y_train, y_test, names_train, names_test = train_test_split( - X, y, track_names, test_size=0.2, random_state=42 - ) - - scaler = StandardScaler() - X_train_scaled = scaler.fit_transform(X_train) - X_test_scaled = scaler.transform(X_test) - - return X_train_scaled, X_test_scaled, y_train, y_test, names_train, names_test, scaler, mlb - - -def main(input_file_path, output_imputed_path): - try: - # Load data - df = load_data(input_file_path) - - # Preprocess data - df_processed = preprocess_data(df) - - # Imputed data for missing values - imputed_data = impute_data(df_processed) - imputed_data.drop_duplicates(inplace=True) - - imputed_data.to_csv(output_imputed_path, index=False) - logger.info(f"Preprocessed data saved to {output_imputed_path}") - - except Exception as e: - logger.error(f"Error in preprocessing main function: {e}") - raise - -if __name__ == "__main__": - # This section would be replaced by Kubeflow pipeline component inputs - # For now, we'll just log a message - logger.info("Preprocessing script executed. 
This would be replaced by Kubeflow component execution.") diff --git a/src/data_processing/data_process.py b/src/data_processing/data_process.py new file mode 100644 index 0000000..1ca5981 --- /dev/null +++ b/src/data_processing/data_process.py @@ -0,0 +1,175 @@ +import pandas as pd +import numpy as np +from google.cloud import bigquery +from sklearn.preprocessing import StandardScaler +from sklearn.impute import KNNImputer +from collections import Counter +from scipy.sparse import csr_matrix, hstack +from src.utils.logging_utils import get_logger +import multiprocessing as mp +from functools import partial +import gc + +logger = get_logger('data_preprocessing') + +def load_data_from_bigquery(project_id, dataset_id, table_id): + try: + client = bigquery.Client(project=project_id) + query = f""" + SELECT * + FROM `{project_id}.{dataset_id}.{table_id}` + """ + df = client.query(query).to_dataframe() + logger.info(f"Data loaded successfully from BigQuery table {project_id}.{dataset_id}.{table_id}") + return df + except Exception as e: + logger.error(f"Error loading data from BigQuery: {e}") + raise + +def robust_string_parser(x): + if pd.isna(x): + return [] + if isinstance(x, str): + return [item.strip() for item in x.split(',') if item.strip()] + if isinstance(x, (list, np.ndarray)): + return [str(item) for item in x if str(item).strip()] + return [str(x)] if str(x).strip() else [] + +def preprocess_data(df): + try: + if 'album' in df.columns: + df = df.drop('album', axis=1) + df['tags'] = df['tags'].apply(robust_string_parser) + df['similar_tracks'] = df['similar_tracks'].apply(robust_string_parser) + df['has_tags'] = (df['tags'].apply(len) > 0).astype(int) + df['has_similar_tracks'] = (df['similar_tracks'].apply(len) > 0).astype(int) + df['playcount'] = pd.to_numeric(df['playcount'], errors='coerce') + df['playcount'].fillna(df['playcount'].median(), inplace=True) + + # Additional data quality checks + logger.info(f"Missing values: {df.isnull().sum()}") + logger.info(f"Data types: {df.dtypes}") + logger.info(f"Unique values in categorical columns: {df.select_dtypes(include=['object']).nunique()}") + + return df + except Exception as e: + logger.error(f"Error during data preprocessing: {e}") + raise + +def one_hot_encode_sparse(series): + unique_items = set(item for sublist in series for item in sublist) + item_to_index = {item: i for i, item in enumerate(unique_items)} + rows, cols = [], [] + for i, sublist in enumerate(series): + for item in sublist: + rows.append(i) + cols.append(item_to_index[item]) + return csr_matrix((np.ones(len(rows)), (rows, cols)), shape=(len(series), len(unique_items))) + +def process_chunk(chunk, tags_vocab, tracks_vocab): + tags_encoded = one_hot_encode_sparse(chunk['tags']) + tracks_encoded = one_hot_encode_sparse(chunk['similar_tracks']) + return hstack([tags_encoded, tracks_encoded]) + +def impute_data(df, n_neighbors=10, chunk_size=10000): + numeric_data = df[['playcount']].values + + # Create vocabularies for tags and tracks + tags_vocab = set(item for sublist in df['tags'] for item in sublist) + tracks_vocab = set(item for sublist in df['similar_tracks'] for item in sublist) + + # Process data in chunks + with mp.Pool(mp.cpu_count()) as pool: + encoded_chunks = pool.map( + partial(process_chunk, tags_vocab=tags_vocab, tracks_vocab=tracks_vocab), + [df[i:i+chunk_size] for i in range(0, len(df), chunk_size)] + ) + + # Combine chunks + encoded_data = hstack(encoded_chunks) + + # Combine numeric and encoded data + features = hstack([numeric_data, 
encoded_data]) + + # Scale features + scaler = StandardScaler(with_mean=False) # Use with_mean=False for sparse data + features_scaled = scaler.fit_transform(features) + + # Impute using KNNImputer + imputer = KNNImputer(n_neighbors=n_neighbors) + imputed_features = imputer.fit_transform(features_scaled) + + # Rescale imputed features + imputed_features = scaler.inverse_transform(imputed_features) + + # Update original dataframe + df['playcount'] = imputed_features[:, 0] + + # Convert one-hot encoded back to lists + tags_start = 1 + tracks_start = tags_start + len(tags_vocab) + + df['tags'] = [ + [tag for tag, val in zip(tags_vocab, row[tags_start:tracks_start]) if val > 0.5] + for row in imputed_features + ] + df['similar_tracks'] = [ + [track for track, val in zip(tracks_vocab, row[tracks_start:]) if val > 0.5] + for row in imputed_features + ] + + # Fill empty lists with most common tags/tracks + global_common_tags = Counter([tag for tags in df['tags'] for tag in tags]).most_common(5) + global_common_tracks = Counter([track for tracks in df['similar_tracks'] for track in tracks]).most_common(5) + + df.loc[df['tags'].apply(len) == 0, 'tags'] = [tag for tag, _ in global_common_tags] + df.loc[df['similar_tracks'].apply(len) == 0, 'similar_tracks'] = [track for track, _ in global_common_tracks] + + # Convert lists back to strings + df['tags'] = df['tags'].apply(lambda x: ', '.join(x) if x else 'Unknown') + df['similar_tracks'] = df['similar_tracks'].apply(lambda x: ', '.join(x) if x else 'Unknown') + + # Update has_tags and has_similar_tracks + df['has_tags'] = (df['tags'] != 'Unknown').astype(int) + df['has_similar_tracks'] = (df['similar_tracks'] != 'Unknown').astype(int) + + return df + +def write_to_bigquery(df, project_id, dataset_id, table_id): + try: + client = bigquery.Client(project=project_id) + job_config = bigquery.LoadJobConfig( + autodetect=True, + write_disposition="WRITE_TRUNCATE", + ) + job = client.load_table_from_dataframe( + df, f"{project_id}.{dataset_id}.{table_id}", job_config=job_config + ) + job.result() # Wait for the job to complete + logger.info(f"Processed data written to BigQuery table {project_id}.{dataset_id}.{table_id}") + except Exception as e: + logger.error(f"Error writing data to BigQuery: {e}") + raise + +def main(project_id, input_dataset_id, input_table_id, output_dataset_id, output_table_id): + try: + df = load_data_from_bigquery(project_id, input_dataset_id, input_table_id) + df_processed = preprocess_data(df) + imputed_data = impute_data(df_processed) + imputed_data.drop_duplicates(inplace=True) + write_to_bigquery(imputed_data, project_id, output_dataset_id, output_table_id) + logger.info("Data processing completed successfully") + except Exception as e: + logger.error(f"Error in preprocessing main function: {e}") + raise + +if __name__ == "__main__": + # These would be replaced by Kubeflow pipeline component inputs + project_id = "your-project-id" + input_dataset_id = "lastfm_dataset" + input_table_id = "raw_top_tracks" + output_dataset_id = "lastfm_dataset" + output_table_id = "processed_top_tracks" + + main(project_id, input_dataset_id, input_table_id, output_dataset_id, output_table_id) + logger.info("Preprocessing script executed. 
This would be replaced by Kubeflow component execution.") diff --git a/src/data_processing/data_validation.py b/src/data_processing/data_validation.py index d2938cc..78ff436 100644 --- a/src/data_processing/data_validation.py +++ b/src/data_processing/data_validation.py @@ -7,6 +7,7 @@ import os import matplotlib.pyplot as plt from google.cloud import storage +from google.cloud import bigquery from datetime import datetime logger = setup_logger('data_validation') @@ -44,10 +45,21 @@ def load_statistics_from_gcs(bucket_name: str, model_name: str, data_type: str, stats.ParseFromString(blob.download_as_string()) return stats -def generate_schema(data_path: str, bucket_name: str, model_name: str, version: str) -> tfdv.types.Schema: +def load_data_from_bigquery(project_id: str, dataset_id: str, table_id: str) -> pd.DataFrame: + """ + Load data from BigQuery table into a pandas DataFrame. + """ + client = bigquery.Client(project=project_id) + query = f""" + SELECT * + FROM `{project_id}.{dataset_id}.{table_id}` + """ + return client.query(query).to_dataframe() + +def generate_schema(project_id: str, dataset_id: str, table_id: str, bucket_name: str, model_name: str, version: str) -> tfdv.types.Schema: try: log_step(logger, 'Generating Schema', 'Data Validation') - df = pd.read_csv(data_path) + df = load_data_from_bigquery(project_id, dataset_id, table_id) schema = tfdv.infer_schema(df) save_schema_to_gcs(schema, bucket_name, model_name, version) return schema @@ -55,10 +67,10 @@ def generate_schema(data_path: str, bucket_name: str, model_name: str, version: log_error(logger, e, 'Schema Generation') raise -def validate_data(data_path: str, schema: tfdv.types.Schema, bucket_name: str, model_name: str, data_type: str) -> Tuple[tfdv.types.DatasetFeatureStatisticsList, tfdv.types.Anomalies]: +def validate_data(project_id: str, dataset_id: str, table_id: str, schema: tfdv.types.Schema, bucket_name: str, model_name: str, data_type: str) -> Tuple[tfdv.types.DatasetFeatureStatisticsList, tfdv.types.Anomalies]: try: log_step(logger, 'Validating Data', 'Data Validation') - df = pd.read_csv(data_path) + df = load_data_from_bigquery(project_id, dataset_id, table_id) stats = tfdv.generate_statistics_from_dataframe(df) save_statistics_to_gcs(stats, bucket_name, model_name, data_type) anomalies = tfdv.validate_statistics(stats, schema) @@ -161,7 +173,9 @@ def compare_schemas(baseline_schema: tfdv.types.Schema, current_schema: tfdv.typ return False # In case of an error, return False to indicate no schema drift detected -def main(train_data_path: str, serving_data_path: str, bucket_name: str, model_name: str): +def main(project_id: str, train_dataset_id: str, train_table_id: str, + serving_dataset_id: str, serving_table_id: str, + bucket_name: str, model_name: str): try: config = load_config() schema_version = config['data_validation']['schema_version'] @@ -172,14 +186,14 @@ def main(train_data_path: str, serving_data_path: str, bucket_name: str, model_n schema = load_schema_from_gcs(bucket_name, model_name, schema_version) logger.info(f"Loaded existing schema version {schema_version} from GCS") except: - schema = generate_schema(train_data_path, bucket_name, model_name, schema_version) + schema = generate_schema(project_id, train_dataset_id, train_table_id, bucket_name, model_name, schema_version) # Validate training data - train_stats, train_anomalies = validate_data(train_data_path, schema, bucket_name, model_name, 'train') + train_stats, train_anomalies = validate_data(project_id, train_dataset_id, 
train_table_id, schema, bucket_name, model_name, 'train') visualize_statistics(train_stats, train_anomalies) # Validate serving data - serving_stats, serving_anomalies = validate_data(serving_data_path, schema, bucket_name, model_name, 'serving') + serving_stats, serving_anomalies = validate_data(project_id, serving_dataset_id, serving_table_id, schema, bucket_name, model_name, 'serving') visualize_statistics(serving_stats, serving_anomalies) # Compare statistics and detect drift @@ -193,7 +207,14 @@ def main(train_data_path: str, serving_data_path: str, bucket_name: str, model_n if __name__ == '__main__': config = load_config() + project_id = config['project']['id'] bucket_name = config['storage']['bucket_name'] model_name = config['model']['name'] - # Replace arg 1 and 2 with Kubeflow pipeline inputs - main('data/raw/train_data.csv', 'data/raw/serving_data.csv', bucket_name, model_name) \ No newline at end of file + train_dataset_id = config['bigquery']['train_dataset_id'] + train_table_id = config['bigquery']['train_table_id'] + serving_dataset_id = config['bigquery']['serving_dataset_id'] + serving_table_id = config['bigquery']['serving_table_id'] + + main(project_id, train_dataset_id, train_table_id, + serving_dataset_id, serving_table_id, + bucket_name, model_name) \ No newline at end of file diff --git a/src/feature_engineering/feat_engineering.py b/src/feature_engineering/feat_engineering.py index a6fb6cc..78353bc 100644 --- a/src/feature_engineering/feat_engineering.py +++ b/src/feature_engineering/feat_engineering.py @@ -1,362 +1,118 @@ -import pandas as pd +import pandas as pd import numpy as np -import sklearn -import ast +from google.cloud import bigquery from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, KBinsDiscretizer -from sklearn.impute import SimpleImputer, KNNImputer +from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline from sklearn.decomposition import TruncatedSVD -from collections import Counter -from itertools import combinations +from sklearn.feature_selection import SelectKBest, f_regression from scipy.sparse import csr_matrix, hstack -import matplotlib.pyplot as plt -import seaborn as sns -from src.utils.data_utilis import plot_correlation_map +import multiprocessing as mp +from functools import partial +from src.utils.logging_utils import setup_logger + +logger = setup_logger('feature_engineering') + +def load_data_from_bigquery(project_id, dataset_id, table_id): + client = bigquery.Client(project=project_id) + query = f""" + SELECT * + FROM `{project_id}.{dataset_id}.{table_id}` + """ + return client.query(query).to_dataframe() + +def write_to_bigquery(df, project_id, dataset_id, table_id): + client = bigquery.Client(project=project_id) + job_config = bigquery.LoadJobConfig(autodetect=True, write_disposition="WRITE_TRUNCATE") + job = client.load_table_from_dataframe(df, f"{project_id}.{dataset_id}.{table_id}", job_config=job_config) + job.result() def engineer_basic_features(df): - # Create a copy of the dataframe to avoid SettingWithCopyWarning - new_df = df.copy() - - # Log-transform the playcount - new_df.loc[:, 'log_playcount'] = np.log1p(new_df['playcount']) - - # Create binned versions of playcount - kbd = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile') - new_df.loc[:, 'binned_playcount'] = 
kbd.fit_transform(new_df[['playcount']]) - - # Create features for the number of tags and similar tracks - new_df.loc[:, 'num_tags'] = new_df['tags'].apply(lambda x: len(x.split(', '))) - new_df.loc[:, 'num_similar_tracks'] = new_df['similar_tracks'].apply(lambda x: len(x.split(', '))) - - return new_df - - -def plot_new_features(df): - fig, axes = plt.subplots(2, 2, figsize=(15, 15)) - - sns.histplot(data=df, x='log_playcount', kde=True, ax=axes[0, 0]) - axes[0, 0].set_title('Distribution of Log Playcount') - - sns.histplot(data=df, x='binned_playcount', kde=True, ax=axes[0, 1]) - axes[0, 1].set_title('Distribution of Binned Playcount') - - sns.histplot(data=df, x='num_tags', kde=True, ax=axes[1, 0]) - axes[1, 0].set_title('Distribution of Number of Tags') - - sns.histplot(data=df, x='num_similar_tracks', kde=True, ax=axes[1, 1]) - axes[1, 1].set_title('Distribution of Number of Similar Tracks') - - plt.tight_layout() - plt.show() + df['log_playcount'] = np.log1p(df['playcount']) + df['num_tags'] = df['tags'].str.count(',') + 1 + df['num_similar_tracks'] = df['similar_tracks'].str.count(',') + 1 + return df def engineer_additional_features(df): - # Create a copy of the dataframe - new_df = df.copy() - - # Binary feature for high tag count - new_df['high_tag_count'] = (new_df['num_tags'] > 50).astype(int) - - # Bin number of tags - new_df['tag_count_category'] = pd.cut(new_df['num_tags'], - bins=[0, 10, 50, np.inf], - labels=['low', 'medium', 'high']) - - # Binary feature for having similar tracks - new_df['has_similar_tracks'] = (new_df['num_similar_tracks'] > 0).astype(int) - - # Bin number of similar tracks - new_df['similar_tracks_category'] = pd.cut(new_df['num_similar_tracks'], - bins=[0, 50, 100, np.inf], - labels=['low', 'medium', 'high']) - - # Interaction features - new_df['log_playcount_x_num_tags'] = new_df['log_playcount'] * new_df['num_tags'] - new_df['log_playcount_x_num_similar_tracks'] = new_df['log_playcount'] * new_df['num_similar_tracks'] - - return new_df - -def drop_columns(features, df): - new_df = df.copy() - imputed_data_with_more_features = new_df.drop(features, axis=1) - - return imputed_data_with_more_features - -def correlation_map(df): - new_df = df.copy() - - plot_correlation_map(new_df) - -def refine_features(df): - # Create a copy of the dataframe - new_df = df.copy() - - # Drop redundant features - new_df = new_df.drop(['playcount', 'binned_playcount', 'high_tag_count', - 'log_playcount_x_num_tags', 'log_playcount_x_num_similar_tracks'], axis=1) - - # Create artist-based features - new_df['artist_avg_playcount'] = new_df.groupby('artist')['log_playcount'].transform('mean') - new_df['artist_track_count'] = new_df.groupby('artist')['name'].transform('count') - - # Create features for top N tags - top_tags = new_df['tags'].str.split(', ', expand=True).stack().value_counts().nlargest(10).index - for tag in top_tags: - new_df[f'has_tag_{tag}'] = new_df['tags'].str.contains(tag).astype(int) - - return new_df + df['high_tag_count'] = (df['num_tags'] > df['num_tags'].median()).astype(int) + df['has_similar_tracks'] = (df['num_similar_tracks'] > 0).astype(int) + df['log_playcount_x_num_tags'] = df['log_playcount'] * df['num_tags'] + return df def add_tag_popularity(df): - # Split tags and create a dataframe - tag_df = df['tags'].str.split(', ', expand=True).melt(value_name='tag').dropna() + tag_df = df['tags'].str.split(',', expand=True).melt(value_name='tag').dropna() tag_df = tag_df.merge(df[['log_playcount']], left_index=True, right_index=True) - - # Calculate 
tag popularity tag_popularity = tag_df.groupby('tag')['log_playcount'].mean().sort_values(ascending=False) - - # Function to calculate average tag popularity - def avg_tag_popularity(tags): - if not tags: - return 0 - tags_list = tags.split(', ') - # Only consider tags that are in tag_popularity - valid_tags = [tag for tag in tags_list if tag in tag_popularity.index] - if not valid_tags: - return 0 - return tag_popularity[valid_tags].mean() - - # Add tag popularity to main dataframe - df['avg_tag_popularity'] = df['tags'].apply(avg_tag_popularity) - + df['avg_tag_popularity'] = df['tags'].apply(lambda x: tag_popularity[x.split(',')].mean() if x else 0) return df def add_similar_tracks_avg_playcount(df): - # Create a dictionary of track name to log_playcount track_playcount = dict(zip(df['name'], df['log_playcount'])) - - # Function to get average playcount of similar tracks - def get_avg_playcount(similar_tracks): - playcounts = [track_playcount.get(track.strip(), 0) for track in similar_tracks.split(', ')] - return sum(playcounts) / len(playcounts) if playcounts else 0 - - df['avg_similar_tracks_playcount'] = df['similar_tracks'].apply(get_avg_playcount) - + df['avg_similar_tracks_playcount'] = df['similar_tracks'].apply( + lambda x: np.mean([track_playcount.get(t.strip(), 0) for t in x.split(',')]) if x else 0 + ) return df -def add_interaction_features(df): - df['num_tags_x_avg_similar_tracks_playcount'] = df['num_tags'] * df['avg_similar_tracks_playcount'] - - return df - -def add_target_encoding(df): - # Calculate mean log_playcount for each artist - artist_means = df.groupby('artist')['log_playcount'].mean() - - # Calculate global mean - global_mean = df['log_playcount'].mean() - - # Function to encode with smoothing - def encode(artist): - n = df[df['artist'] == artist].shape[0] - return (n * artist_means.get(artist, global_mean) + global_mean) / (n + 1) - - # Apply encoding - df['artist_target_encoded'] = df['artist'].apply(encode) - - return df - -def refine_features_further(df): - # Combine redundant features - df['has_tag_favorites_combined'] = df[['has_tag_favorites', 'has_tag_Favorite']].max(axis=1) - - # drop low variance features - df = df.drop(['has_tag_favorites', 'has_tag_Favorite', 'has_tag_MySpotigramBot'], axis=1) - - # Create a composite tag popularity score - tag_columns = [col for col in df.columns if col.startswith('has_tag_')] - df['tag_popularity_score'] = df[tag_columns].mean(axis=1) - - return df - -def review_categorical_features(df): - cat_cols = ['tag_count_category', 'similar_tracks_category'] - for col in cat_cols: - print(f"\nValue counts for {col}:") - print(df[col].value_counts()) - -def analyze_vocabulary_sizes(df): - text_features = ['name', 'artist', 'tags', 'similar_tracks'] - for feature in text_features: - unique_terms = set() - for text in df[feature]: - unique_terms.update(text.split(',')) - print(f"{feature} unique terms: {len(unique_terms)}") - -def remove_pretfidf_cols(df): - # Identify and remove previously vectorized track name features - name_tfidf_columns = [col for col in df.columns if col.startswith('name_tfidf_')] - refined_data_new = refined_data_new.drop(columns=name_tfidf_columns) - - return refined_data_new - - -def vectorize_all_text_features(df, max_features_dict=None): - if max_features_dict is None: - max_features_dict = { - 'artist': None, # This will use all unique artists - 'tags': 300, - 'similar_tracks': 500 - } - - # Check if 'name' has already been vectorized - if 'name' not in max_features_dict and not 
any(col.startswith('name_tfidf_') for col in df.columns): - max_features_dict['name'] = 8000 - - text_features = list(max_features_dict.keys()) - vectorized_dfs = [] +def vectorize_text_features(df, max_features_dict): vectorizers = {} + for feature, max_features in max_features_dict.items(): + vectorizer = TfidfVectorizer(max_features=max_features) + vectorized = vectorizer.fit_transform(df[feature].fillna('')) + df = pd.concat([df, pd.DataFrame(vectorized.toarray(), columns=[f'{feature}_tfidf_{i}' for i in range(vectorized.shape[1])], index=df.index)], axis=1) + vectorizers[feature] = vectorizer + return df, vectorizers + +def feature_selection(X, y, k=1000): + selector = SelectKBest(f_regression, k=k) + X_new = selector.fit_transform(X, y) + selected_features = X.columns[selector.get_support()] + return X_new, selected_features + +def process_chunk(chunk, feature_engineering_pipeline): + return feature_engineering_pipeline(chunk) + +def feature_engineering_pipeline(df): + df = engineer_basic_features(df) + df = engineer_additional_features(df) + df = add_tag_popularity(df) + df = add_similar_tracks_avg_playcount(df) + df, _ = vectorize_text_features(df, {'tags': 300, 'similar_tracks': 500}) + return df - for feature in text_features: - if feature in ['name', 'artist']: - # Treat each unique value as a document - unique_values = df[feature].unique() - text_data = pd.Series(unique_values) - else: - text_data = df[feature].fillna('') - - tfidf = TfidfVectorizer(max_features=max_features_dict[feature]) - tfidf_matrix = tfidf.fit_transform(text_data) +def main(project_id, input_dataset_id, input_table_id, output_dataset_id, output_table_id): + try: + logger.info("Starting feature engineering process") + df = load_data_from_bigquery(project_id, input_dataset_id, input_table_id) - feature_df = pd.DataFrame( - tfidf_matrix.toarray(), - columns=[f'{feature}_tfidf_{i}' for i in range(tfidf_matrix.shape[1])] - ) - - if feature in ['name', 'artist']: - # Map the vectorized features back to the original dataframe - feature_to_vector = dict(zip(unique_values, feature_df.values)) - vectorized_feature = df[feature].map(lambda x: feature_to_vector.get(x, np.zeros(max_features_dict[feature]))) - feature_df = pd.DataFrame(vectorized_feature.tolist(), - columns=feature_df.columns, - index=df.index) - else: - feature_df.index = df.index - - vectorized_dfs.append(feature_df) - vectorizers[feature] = tfidf - - df_vectorized = pd.concat([df] + vectorized_dfs, axis=1) - - return df_vectorized, vectorizers - - -def get_final_features(df): - # Prepare final feature set - original_text_cols = ['name', 'artist', 'tags', 'similar_tracks'] - feature_cols = [col for col in df.columns if col not in original_text_cols] - print("Final feature set:") - print(feature_cols) - return df[feature_cols] - - -def create_preprocessing_pipeline(df, n_components=100): - # Identify different types of columns - numeric_features = df.select_dtypes(include=[np.number]).columns.tolist() - categorical_features = df.select_dtypes(include=['object']).columns.tolist() - tfidf_features = [col for col in df.columns if '_tfidf_' in col] - - # Create the preprocessing steps - numeric_transformer = Pipeline(steps=[ - ('imputer', SimpleImputer(strategy='mean')), - ('scaler', StandardScaler()) - ]) - - categorical_transformer = Pipeline(steps=[ - ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), - ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) - ]) - - tfidf_transformer = Pipeline(steps=[ - ('svd', 
TruncatedSVD(n_components=min(n_components, len(tfidf_features)), algorithm='randomized', n_iter=5, random_state=42)) - ]) - - # Combine all the preprocessing steps - preprocessor = ColumnTransformer( - transformers=[ - ('num', numeric_transformer, numeric_features), - ('cat', categorical_transformer, categorical_features), - ('tfidf', tfidf_transformer, tfidf_features) - ]) - - # Create the full pipeline - full_pipeline = Pipeline(steps=[ - ('preprocessor', preprocessor), - ('replace_inf', FunctionTransformer(lambda X: np.nan_to_num(X, nan=0, posinf=0, neginf=0))) - ]) - - return full_pipeline - -def get_feature_names(input_features, n_components): - feature_names = [] - numeric_features = input_features.select_dtypes(include=[np.number]).columns.tolist() - categorical_features = input_features.select_dtypes(include=['object']).columns.tolist() - tfidf_features = [col for col in input_features.columns if '_tfidf_' in col] - - feature_names.extend(numeric_features) - - for cat_feature in categorical_features: - unique_values = input_features[cat_feature].unique() - feature_names.extend([f"{cat_feature}_{value}" for value in unique_values]) - - feature_names.extend([f'svd_tfidf_{i}' for i in range(min(n_components, len(tfidf_features)))]) - - return feature_names - -def analyze_feature_importance_and_reduce_dimensions(df, n_components=4000): - # Perform Truncated SVD - svd = TruncatedSVD(n_components=n_components, random_state=42) - svd_result = svd.fit_transform(df) - - # Analyze feature importance - feature_importance = np.sum(np.abs(svd.components_), axis=0) - feature_importance = 100.0 * (feature_importance / feature_importance.sum()) - - # Create a DataFrame of feature importance - feature_importance_df = pd.DataFrame({ - 'feature': df.columns, - 'importance': feature_importance - }).sort_values('importance', ascending=False) - - # Plot feature importance - plt.figure(figsize=(12, 6)) - plt.bar(range(20), feature_importance_df['importance'][:20]) - plt.xticks(range(20), feature_importance_df['feature'][:20], rotation=90) - plt.xlabel('Features') - plt.ylabel('Relative Importance (%)') - plt.title('Top 20 Most Important Features') - plt.tight_layout() - plt.show() - - # Print top 20 most important features - print("Top 20 most important features:") - print(feature_importance_df.head(20)) - - # Plot cumulative explained variance ratio - plt.figure(figsize=(10, 6)) - plt.plot(np.cumsum(svd.explained_variance_ratio_)) - plt.xlabel('Number of Components') - plt.ylabel('Cumulative Explained Variance Ratio') - plt.title('Explained Variance Ratio by Number of Components') - plt.tight_layout() - plt.show() - - # Create DataFrame with reduced dimensions - columns = [f'SVD_{i+1}' for i in range(svd_result.shape[1])] - df_svd = pd.DataFrame(svd_result, columns=columns, index=df.index) - - print(f"Shape after Truncated SVD: {df_svd.shape}") - print(f"Cumulative explained variance ratio: {np.sum(svd.explained_variance_ratio_):.4f}") - - return df_svd, svd, feature_importance_df - + # Process data in chunks to optimize memory usage + chunk_size = 10000 + with mp.Pool(mp.cpu_count()) as pool: + processed_chunks = pool.map( + partial(process_chunk, feature_engineering_pipeline=feature_engineering_pipeline), + [df[i:i+chunk_size] for i in range(0, len(df), chunk_size)] + ) + + df_processed = pd.concat(processed_chunks) + + # Perform feature selection + X = df_processed.drop(['name', 'artist', 'tags', 'similar_tracks', 'playcount'], axis=1) + y = df_processed['log_playcount'] + X_selected, 
selected_features = feature_selection(X, y) + + df_final = pd.concat([df_processed[['name', 'artist', 'tags', 'similar_tracks', 'playcount']], pd.DataFrame(X_selected, columns=selected_features, index=df_processed.index)], axis=1) + + write_to_bigquery(df_final, project_id, output_dataset_id, output_table_id) + logger.info("Feature engineering completed successfully") + + except Exception as e: + logger.error(f"Error in feature engineering process: {e}") + raise + +if __name__ == "__main__": + project_id = "your-project-id" + input_dataset_id = "lastfm_dataset" + input_table_id = "processed_top_tracks" + output_dataset_id = "lastfm_dataset" + output_table_id = "engineered_top_tracks" + main(project_id, input_dataset_id, input_table_id, output_dataset_id, output_table_id) diff --git a/src/utils/data_versioning.py b/src/utils/data_versioning.py new file mode 100644 index 0000000..b2421d5 --- /dev/null +++ b/src/utils/data_versioning.py @@ -0,0 +1,25 @@ +from google.cloud import bigquery +from datetime import datetime + +def version_dataset(project_id, source_dataset_id, source_table_id, target_dataset_id): + client = bigquery.Client(project=project_id) + + # Create a new table name with timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + target_table_id = f"{source_table_id}_v{timestamp}" + + # Construct the query to copy data + query = f""" + CREATE OR REPLACE TABLE `{project_id}.{target_dataset_id}.{target_table_id}` + AS SELECT * FROM `{project_id}.{source_dataset_id}.{source_table_id}` + """ + + # Run the query + job = client.query(query) + job.result() # Wait for the job to complete + + print(f"Dataset versioned: {target_dataset_id}.{target_table_id}") + return f"{target_dataset_id}.{target_table_id}" + +# Use this function after each major data processing step +# version_dataset(project_id, 'source_dataset', 'source_table', 'versioned_dataset') \ No newline at end of file diff --git a/tests/test_content_based.py b/tests/test_content_based.py new file mode 100644 index 0000000..94670e2 --- /dev/null +++ b/tests/test_content_based.py @@ -0,0 +1,156 @@ +import unittest +from unittest.mock import patch, MagicMock +import numpy as np +import tensorflow as tf +from src.algorithms.content_based import ( + cosine_similarity, + mean_average_precision, + average_precision, + FilteredCallback, + train_model, + evaluate_model, + find_similar_tracks, + main +) + +class TestContentBased(unittest.TestCase): + + def setUp(self): + self.y_true = np.array([[1, 0, 1], [0, 1, 1], [1, 1, 0]]) + self.y_pred = np.array([[0.9, 0.1, 0.8], [0.2, 0.7, 0.6], [0.8, 0.3, 0.1]]) + + def test_cosine_similarity(self): + similarity = cosine_similarity(self.y_true, self.y_pred) + self.assertIsInstance(similarity, tf.Tensor) + + def test_mean_average_precision(self): + map_score = mean_average_precision(self.y_true, self.y_pred) + self.assertIsInstance(map_score, float) + self.assertTrue(0 <= map_score <= 1) + + def test_average_precision(self): + ap_score = average_precision(self.y_true[0], self.y_pred[0]) + self.assertIsInstance(ap_score, float) + self.assertTrue(0 <= ap_score <= 1) + + @patch('src.algorithms.content_based.tf.keras.callbacks.ModelCheckpoint') + def test_filtered_callback(self, mock_model_checkpoint): + filtered_callback = FilteredCallback(filepath='test_path') + self.assertIsInstance(filtered_callback, tf.keras.callbacks.ModelCheckpoint) + + @patch('src.algorithms.content_based.EarlyStopping') + @patch('src.algorithms.content_based.FilteredCallback') + def test_train_model(self, 
mock_filtered_callback, mock_early_stopping): + mock_model = MagicMock() + mock_model.fit.return_value = MagicMock(history={'loss': [0.1], 'val_loss': [0.2]}) + + X_train = np.random.rand(100, 10) + y_train = np.random.rand(100, 3) + X_val = np.random.rand(20, 10) + y_val = np.random.rand(20, 3) + + history = train_model(mock_model, X_train, y_train, X_val, y_val) + + mock_model.fit.assert_called_once() + self.assertIn('loss', history.history) + self.assertIn('val_loss', history.history) + + @patch('src.algorithms.content_based.precision_score') + @patch('src.algorithms.content_based.recall_score') + @patch('src.algorithms.content_based.f1_score') + @patch('src.algorithms.content_based.ndcg_score') + def test_evaluate_model(self, mock_ndcg, mock_f1, mock_recall, mock_precision): + mock_model = MagicMock() + mock_model.predict.return_value = self.y_pred + mock_model.evaluate.return_value = [0.1, 0.9] + + mock_precision.return_value = 0.8 + mock_recall.return_value = 0.7 + mock_f1.return_value = 0.75 + mock_ndcg.return_value = 0.85 + + metrics = evaluate_model(mock_model, self.y_true, self.y_true) + + self.assertIn('test_loss', metrics) + self.assertIn('test_accuracy', metrics) + self.assertIn('cosine_similarity', metrics) + self.assertIn('mean_average_precision', metrics) + self.assertIn('ndcg', metrics) + self.assertIn('precision', metrics) + self.assertIn('recall', metrics) + self.assertIn('f1_score', metrics) + + def test_find_similar_tracks(self): + mock_model = MagicMock() + mock_model.predict.side_effect = [ + np.array([[0.1, 0.2, 0.3]]), # track_embedding + np.array([[0.1, 0.2, 0.3], [0.2, 0.3, 0.4], [0.3, 0.4, 0.5]]) # all_embeddings + ] + + track_features = np.array([1, 2, 3]) + all_features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + track_names = ['Track1', 'Track2', 'Track3'] + + similar_tracks = find_similar_tracks(mock_model, track_features, all_features, track_names, n=2) + + self.assertEqual(len(similar_tracks), 2) + self.assertIsInstance(similar_tracks[0], tuple) + self.assertIsInstance(similar_tracks[0][0], str) + self.assertIsInstance(similar_tracks[0][1], float) + + @patch('src.algorithms.content_based.prepare_data') + @patch('src.algorithms.content_based.build_content_based_model') + @patch('src.algorithms.content_based.train_model') + @patch('src.algorithms.content_based.evaluate_model') + @patch('src.algorithms.content_based.json.dump') + def test_main(self, mock_json_dump, mock_evaluate, mock_train, mock_build, mock_prepare): + mock_prepare.return_value = ( + np.random.rand(100, 10), np.random.rand(20, 10), np.random.rand(30, 10), + np.random.rand(100, 3), np.random.rand(20, 3), np.random.rand(30, 3), + ['name1', 'name2'], ['name3'], ['name4', 'name5'], + MagicMock(), MagicMock() + ) + + mock_model = MagicMock() + mock_build.return_value = mock_model + + mock_history = MagicMock() + mock_history.history = { + 'loss': [0.1], 'binary_accuracy': [0.9], + 'val_loss': [0.2], 'val_binary_accuracy': [0.8], + 'val_cosine_similarity': [-0.7] + } + mock_train.return_value = mock_history + + mock_evaluate.return_value = { + 'test_loss': 0.15, 'test_accuracy': 0.85, + 'cosine_similarity': 0.8, 'mean_average_precision': 0.75, + 'ndcg': 0.9, 'precision': 0.8, 'recall': 0.7, 'f1_score': 0.75 + } + + model, metrics = main('feat_eng_data.csv', 'original_df.csv', 2, 64, 32, 0.001, 32, 0.2) + + mock_prepare.assert_called_once() + mock_build.assert_called_once() + mock_train.assert_called_once() + mock_evaluate.assert_called_once() + mock_json_dump.assert_called_once() + + 
self.assertIsInstance(model, MagicMock) + self.assertIsInstance(metrics, dict) + self.assertIn('final_loss', metrics) + self.assertIn('final_accuracy', metrics) + self.assertIn('final_val_loss', metrics) + self.assertIn('final_val_accuracy', metrics) + self.assertIn('val_cosine_similarity', metrics) + self.assertIn('test_loss', metrics) + self.assertIn('test_accuracy', metrics) + self.assertIn('cosine_similarity', metrics) + self.assertIn('mean_average_precision', metrics) + self.assertIn('ndcg', metrics) + self.assertIn('precision', metrics) + self.assertIn('recall', metrics) + self.assertIn('f1_score', metrics) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_data_prep.py b/tests/test_data_prep.py new file mode 100644 index 0000000..3499207 --- /dev/null +++ b/tests/test_data_prep.py @@ -0,0 +1,81 @@ +import unittest +from unittest.mock import patch, MagicMock +import pandas as pd +import numpy as np +from google.cloud import bigquery +from src.data_processing.data_prep import ( + load_data_from_bigquery, + prepare_data, + save_prepared_data, + create_and_populate_feature_store +) + +class TestDataPrep(unittest.TestCase): + + def setUp(self): + self.sample_df = pd.DataFrame({ + 'artist': ['Artist1', 'Artist2'], + 'name': ['Song1', 'Song2'], + 'tags': ['rock, pop', 'jazz, blues'], + 'similar_tracks': ['Track1, Track2', 'Track3, Track4'], + 'playcount': [1000, 2000] + }) + + @patch('src.data_processing.data_prep.bigquery.Client') + def test_load_data_from_bigquery(self, mock_client): + mock_query_job = MagicMock() + mock_query_job.to_dataframe.return_value = self.sample_df + mock_client.return_value.query.return_value = mock_query_job + + result = load_data_from_bigquery('project_id', 'dataset_id', 'table_id') + + mock_client.assert_called_once_with(project='project_id') + mock_client.return_value.query.assert_called_once() + self.assertTrue(result.equals(self.sample_df)) + + def test_prepare_data(self): + preprocessed_df = self.sample_df.copy() + preprocessed_df['extra_feature'] = [1, 2] + + X_train, X_test, y_train, y_test, names_train, names_test, scaler, mlb = prepare_data(preprocessed_df, self.sample_df) + + self.assertEqual(X_train.shape[1], 1) # Only 'extra_feature' should be in X + self.assertEqual(y_train.shape[1], 4) # 4 unique tracks in similar_tracks + self.assertEqual(len(names_train), 1) # 80% of 2 samples + self.assertEqual(len(names_test), 1) # 20% of 2 samples + + @patch('src.data_processing.data_prep.np.save') + @patch('src.data_processing.data_prep.joblib.dump') + def test_save_prepared_data(self, mock_joblib_dump, mock_np_save): + X_train = np.array([[1, 2], [3, 4]]) + X_test = np.array([[5, 6]]) + y_train = np.array([[1, 0], [0, 1]]) + y_test = np.array([[1, 0]]) + names_train = np.array(['Song1', 'Song2']) + names_test = np.array(['Song3']) + scaler = MagicMock() + mlb = MagicMock() + + save_prepared_data(X_train, X_test, y_train, y_test, names_train, names_test, scaler, mlb, 'output_dir') + + self.assertEqual(mock_np_save.call_count, 6) # 6 numpy arrays saved + self.assertEqual(mock_joblib_dump.call_count, 2) # scaler and mlb saved + + @patch('src.data_processing.data_prep.aiplatform.init') + @patch('src.data_processing.data_prep.aiplatform.FeatureStore.create') + def test_create_and_populate_feature_store(self, mock_create_feature_store, mock_init): + mock_feature_store = MagicMock() + mock_create_feature_store.return_value = mock_feature_store + mock_entity_type = MagicMock() + 
mock_feature_store.create_entity_type.return_value = mock_entity_type + + create_and_populate_feature_store('project_id', 'region', 'feature_store_id', 'entity_type_id', self.sample_df) + + mock_init.assert_called_once_with(project='project_id', location='region') + mock_create_feature_store.assert_called_once() + mock_feature_store.create_entity_type.assert_called_once() + self.assertEqual(mock_entity_type.create_feature.call_count, 5) # 5 features in sample_df + mock_entity_type.ingest.assert_called_once() + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_data_processing.py b/tests/test_data_processing.py index 0731d3b..09ee228 100644 --- a/tests/test_data_processing.py +++ b/tests/test_data_processing.py @@ -2,7 +2,7 @@ import pandas as pd import numpy as np from src.data_processing.data_ingestion import fetch_lastfm_data -from src.data_processing.data_preprocess import ( +from src.data_processing.data_process import ( load_data, robust_string_parser, preprocess_data, one_hot_encode, impute_data, prepare_data, main ) diff --git a/tests/test_data_validation.py b/tests/test_data_validation.py new file mode 100644 index 0000000..9611163 --- /dev/null +++ b/tests/test_data_validation.py @@ -0,0 +1,94 @@ +import unittest +from unittest.mock import patch, MagicMock +import pandas as pd +import tensorflow_data_validation as tfdv +from src.data_processing.data_validation import ( + generate_schema, + validate_data, + compare_statistics, + detect_data_drift, + compare_schemas +) + +class TestDataValidation(unittest.TestCase): + + def setUp(self): + self.sample_df = pd.DataFrame({ + 'artist': ['Artist1', 'Artist2'], + 'name': ['Song1', 'Song2'], + 'tags': ['rock, pop', 'jazz, blues'], + 'similar_tracks': ['Track1, Track2', 'Track3, Track4'], + 'playcount': [1000, 2000] + }) + + @patch('src.data_processing.data_validation.tfdv.infer_schema') + @patch('src.data_processing.data_validation.save_schema_to_gcs') + def test_generate_schema(self, mock_save_schema, mock_infer_schema): + mock_schema = MagicMock() + mock_infer_schema.return_value = mock_schema + + result = generate_schema('project_id', 'dataset_id', 'table_id', 'bucket_name', 'model_name', 'version') + + mock_infer_schema.assert_called_once() + mock_save_schema.assert_called_once_with(mock_schema, 'bucket_name', 'model_name', 'version') + self.assertEqual(result, mock_schema) + + @patch('src.data_processing.data_validation.tfdv.generate_statistics_from_dataframe') + @patch('src.data_processing.data_validation.tfdv.validate_statistics') + @patch('src.data_processing.data_validation.save_statistics_to_gcs') + def test_validate_data(self, mock_save_stats, mock_validate_stats, mock_generate_stats): + mock_stats = MagicMock() + mock_generate_stats.return_value = mock_stats + mock_anomalies = MagicMock() + mock_validate_stats.return_value = mock_anomalies + mock_schema = MagicMock() + + stats, anomalies = validate_data('project_id', 'dataset_id', 'table_id', mock_schema, 'bucket_name', 'model_name', 'data_type') + + mock_generate_stats.assert_called_once() + mock_save_stats.assert_called_once_with(mock_stats, 'bucket_name', 'model_name', 'data_type') + mock_validate_stats.assert_called_once_with(mock_stats, mock_schema) + self.assertEqual(stats, mock_stats) + self.assertEqual(anomalies, mock_anomalies) + + @patch('src.data_processing.data_validation.tfdv.validate_statistics') + def test_compare_statistics(self, mock_validate_stats): + mock_train_stats = MagicMock() + mock_serving_stats = 
MagicMock() + mock_schema = MagicMock() + mock_anomalies = MagicMock() + mock_validate_stats.return_value = mock_anomalies + + result = compare_statistics(mock_train_stats, mock_serving_stats, mock_schema) + + mock_validate_stats.assert_called_once_with(mock_serving_stats, mock_schema, previous_statistics=mock_train_stats) + self.assertEqual(result, mock_anomalies) + + @patch('src.data_processing.data_validation.tfdv.compute_drift_skew') + def test_detect_data_drift(self, mock_compute_drift_skew): + mock_train_stats = MagicMock() + mock_serving_stats = MagicMock() + mock_schema = MagicMock() + mock_drift_skew = {'feature1': 0.1, 'feature2': 0.2} + mock_compute_drift_skew.return_value = mock_drift_skew + + result = detect_data_drift(mock_train_stats, mock_serving_stats, mock_schema, 0.15) + + mock_compute_drift_skew.assert_called_once_with(mock_train_stats, mock_serving_stats, mock_schema) + self.assertEqual(result, mock_drift_skew) + + def test_compare_schemas(self): + baseline_schema = tfdv.Schema() + baseline_schema.feature.add(name='feature1', type=tfdv.FeatureType.INT) + baseline_schema.feature.add(name='feature2', type=tfdv.FeatureType.FLOAT) + + current_schema = tfdv.Schema() + current_schema.feature.add(name='feature1', type=tfdv.FeatureType.INT) + current_schema.feature.add(name='feature3', type=tfdv.FeatureType.STRING) + + result = compare_schemas(baseline_schema, current_schema) + + self.assertTrue(result) # Schema drift detected + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_data_versioning.py b/tests/test_data_versioning.py new file mode 100644 index 0000000..c2adf21 --- /dev/null +++ b/tests/test_data_versioning.py @@ -0,0 +1,28 @@ +import unittest +from unittest.mock import patch, MagicMock +from src.utils.data_versioning import version_dataset + +class TestDataVersioning(unittest.TestCase): + @patch('src.utils.data_versioning.bigquery.Client') + def test_version_dataset(self, mock_client): + # Mock the BigQuery client + mock_job = MagicMock() + mock_client.return_value.query.return_value = mock_job + + # Call the function + result = version_dataset('test-project', 'source_dataset', 'source_table', 'versioned_dataset') + + # Assert that the query was called with the correct parameters + mock_client.return_value.query.assert_called_once() + query_call = mock_client.return_value.query.call_args[0][0] + self.assertIn('test-project.versioned_dataset', query_call) + self.assertIn('test-project.source_dataset.source_table', query_call) + + # Assert that the job's result method was called + mock_job.result.assert_called_once() + + # Assert that the function returns the correct table name + self.assertTrue(result.startswith('versioned_dataset.source_table_v')) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_feature_engineering.py b/tests/test_feature_engineering.py index 34eb996..1e8cac9 100644 --- a/tests/test_feature_engineering.py +++ b/tests/test_feature_engineering.py @@ -1,189 +1,62 @@ import unittest import pandas as pd import numpy as np -from src.feature_engineering.feat_engineering import (engineer_basic_features, engineer_additional_features, - refine_features, add_tag_popularity, - add_similar_tracks_avg_playcount, add_interaction_features, - add_target_encoding, refine_features_further, - vectorize_all_text_features) +from unittest.mock import patch, MagicMock +from src.feature_engineering.feat_engineering import ( + engineer_basic_features, + 
engineer_additional_features, + add_tag_popularity, + add_similar_tracks_avg_playcount, + refine_features, + vectorize_all_text_features +) class TestFeatureEngineering(unittest.TestCase): def setUp(self): - # Create a sample DataFrame for testing - self.df = pd.DataFrame({ - 'name': ['Track1', 'Track2', 'Track3'], - 'artist': ['Artist1', 'Artist2', 'Artist1'], - 'playcount': [100, 200, 300], - 'tags': ['rock, pop', 'jazz, blues', 'rock, metal'], - 'similar_tracks': ['Track2, Track3', 'Track1, Track3', 'Track1, Track2'] + self.sample_df = pd.DataFrame({ + 'artist': ['Artist1', 'Artist2'], + 'name': ['Song1', 'Song2'], + 'tags': ['rock, pop', 'jazz, blues'], + 'similar_tracks': ['Track1, Track2', 'Track3, Track4'], + 'playcount': [1000, 2000] }) def test_engineer_basic_features(self): - result = engineer_basic_features(self.df) - - # Check exact values - np.testing.assert_almost_equal(result['log_playcount'].values, - np.log1p([100, 200, 300])) - - # Check types - self.assertTrue(np.issubdtype(result['log_playcount'].dtype, np.number)) - self.assertTrue(np.issubdtype(result['binned_playcount'].dtype, np.integer)) - - # Check edge case: empty DataFrame - empty_df = pd.DataFrame(columns=self.df.columns) - empty_result = engineer_basic_features(empty_df) - self.assertTrue(empty_result.empty) + result = engineer_basic_features(self.sample_df) + self.assertIn('log_playcount', result.columns) + self.assertIn('num_tags', result.columns) + self.assertIn('num_similar_tracks', result.columns) def test_engineer_additional_features(self): - df_basic = engineer_basic_features(self.df) - result = engineer_additional_features(df_basic) - - # Check exact values - self.assertEqual(result['high_tag_count'].tolist(), [0, 0, 0]) - self.assertEqual(result['tag_count_category'].tolist(), ['low', 'low', 'low']) - - # Check types - self.assertTrue(np.issubdtype(result['high_tag_count'].dtype, np.integer)) - self.assertTrue(pd.api.types.is_categorical_dtype(result['tag_count_category'])) + basic_features = engineer_basic_features(self.sample_df) + result = engineer_additional_features(basic_features) + self.assertIn('tag_count_category', result.columns) + self.assertIn('similar_tracks_category', result.columns) - def test_refine_features(self): - df_basic = engineer_basic_features(self.df) - df_additional = engineer_additional_features(df_basic) - result = refine_features(df_additional) - - # Check exact values - np.testing.assert_almost_equal(result['artist_avg_playcount'].values, - [np.log1p(200), np.log1p(200), np.log1p(200)]) - - # Check types - self.assertTrue(np.issubdtype(result['artist_track_count'].dtype, np.integer)) - - def test_add_tag_popularity(self): - result = add_tag_popularity(self.df) - - # Check if tag popularity is calculated correctly - self.assertGreater(result.loc[0, 'avg_tag_popularity'], - result.loc[1, 'avg_tag_popularity']) - - # Test with missing values - df_with_missing = self.df.copy() - df_with_missing.loc[0, 'tags'] = np.nan - result_missing = add_tag_popularity(df_with_missing) - self.assertEqual(result_missing.loc[0, 'avg_tag_popularity'], 0) + @patch('src.feature_engineering.feat_engineering.pd.DataFrame.merge') + def test_add_tag_popularity(self, mock_merge): + mock_merge.return_value = self.sample_df + result = add_tag_popularity(self.sample_df) + self.assertIn('avg_tag_popularity', result.columns) def test_add_similar_tracks_avg_playcount(self): - result = add_similar_tracks_avg_playcount(self.df) - - # Check exact values - expected_avg = (np.log1p(200) + np.log1p(300)) / 2 - 
self.assertAlmostEqual(result.loc[0, 'avg_similar_tracks_playcount'], expected_avg) - - def test_add_interaction_features(self): - df_with_avg = add_similar_tracks_avg_playcount(self.df) - df_with_avg['num_tags'] = df_with_avg['tags'].str.count(',') + 1 - result = add_interaction_features(df_with_avg) - - # Check exact values - expected_interaction = (df_with_avg['num_tags'] * df_with_avg['avg_similar_tracks_playcount']).values - np.testing.assert_almost_equal(result['num_tags_x_avg_similar_tracks_playcount'].values, expected_interaction) - - def test_add_target_encoding(self): - df_with_log = engineer_basic_features(self.df) - result = add_target_encoding(df_with_log) - - # Check if encoding is smooth - self.assertNotEqual(result['artist_target_encoded'].nunique(), result['artist'].nunique()) - - def test_refine_features_further(self): - df_refined = refine_features(self.df) - df_refined['has_tag_favorites'] = [1, 0, 1] - df_refined['has_tag_Favorite'] = [0, 1, 0] - df_refined['has_tag_MySpotigramBot'] = [1, 1, 1] - result = refine_features_further(df_refined) - - # Check exact values - np.testing.assert_array_equal(result['has_tag_favorites_combined'].values, [1, 1, 1]) - - # Check if low variance feature is dropped - self.assertNotIn('has_tag_MySpotigramBot', result.columns) - - def test_vectorize_all_text_features(self): - result, vectorizers = vectorize_all_text_features(self.df) - - # Check if vectorization produces expected number of features - self.assertEqual(sum(1 for col in result.columns if col.startswith('name_tfidf_')), 3) - self.assertEqual(sum(1 for col in result.columns if col.startswith('artist_tfidf_')), 2) - - # Test with custom max_features - result_custom, _ = vectorize_all_text_features(self.df, {'tags': 1, 'similar_tracks': 1}) - self.assertEqual(sum(1 for col in result_custom.columns if col.startswith('tags_tfidf_')), 1) - self.assertEqual(sum(1 for col in result_custom.columns if col.startswith('similar_tracks_tfidf_')), 1) - - def test_edge_cases(self): - # Test with empty DataFrame - empty_df = pd.DataFrame(columns=self.df.columns) - self.assertTrue(engineer_basic_features(empty_df).empty) - self.assertTrue(engineer_additional_features(empty_df).empty) - self.assertTrue(refine_features(empty_df).empty) - - # Test with missing values - df_with_missing = self.df.copy() - df_with_missing.loc[0, 'playcount'] = np.nan - df_with_missing.loc[1, 'tags'] = np.nan - df_with_missing.loc[2, 'similar_tracks'] = np.nan - - result_basic = engineer_basic_features(df_with_missing) - self.assertTrue(np.isnan(result_basic.loc[0, 'log_playcount'])) - self.assertEqual(result_basic.loc[1, 'num_tags'], 0) - self.assertEqual(result_basic.loc[2, 'num_similar_tracks'], 0) + result = add_similar_tracks_avg_playcount(self.sample_df) + self.assertIn('avg_similar_tracks_playcount', result.columns) - def test_output_types_and_shapes(self): - result = engineer_basic_features(self.df) - result = engineer_additional_features(result) - result = refine_features(result) - result = add_tag_popularity(result) - result = add_similar_tracks_avg_playcount(result) - result = add_interaction_features(result) - result = add_target_encoding(result) - result = refine_features_further(result) - result, _ = vectorize_all_text_features(result) - - # Check output types - self.assertTrue(isinstance(result, pd.DataFrame)) - self.assertTrue(all(np.issubdtype(result[col].dtype, np.number) for col in result.columns - if not pd.api.types.is_categorical_dtype(result[col]))) - - # Check output shape - 
self.assertEqual(result.shape[0], self.df.shape[0]) - self.assertGreater(result.shape[1], self.df.shape[1]) - - def test_consistency(self): - # Run the feature engineering process twice and compare results - result1 = engineer_basic_features(self.df) - result1 = engineer_additional_features(result1) - result1 = refine_features(result1) - - result2 = engineer_basic_features(self.df) - result2 = engineer_additional_features(result2) - result2 = refine_features(result2) - - pd.testing.assert_frame_equal(result1, result2) - - def test_no_unexpected_nan(self): - result = engineer_basic_features(self.df) - result = engineer_additional_features(result) - result = refine_features(result) - result = add_tag_popularity(result) - result = add_similar_tracks_avg_playcount(result) - result = add_interaction_features(result) - result = add_target_encoding(result) - result = refine_features_further(result) - result, _ = vectorize_all_text_features(result) - - # Check for unexpected NaN values - unexpected_nan = result.isna().sum() - self.assertTrue(all(unexpected_nan == 0), f"Unexpected NaN values found: {unexpected_nan[unexpected_nan > 0]}") + def test_refine_features(self): + result = refine_features(self.sample_df) + self.assertIn('artist_avg_playcount', result.columns) + self.assertIn('artist_track_count', result.columns) + + @patch('src.feature_engineering.feat_engineering.TfidfVectorizer') + def test_vectorize_all_text_features(self, mock_tfidf): + mock_tfidf.return_value.fit_transform.return_value.toarray.return_value = np.array([[1, 0], [0, 1]]) + result, _ = vectorize_all_text_features(self.sample_df) + self.assertTrue(any(col.startswith('artist_tfidf_') for col in result.columns)) + self.assertTrue(any(col.startswith('name_tfidf_') for col in result.columns)) + self.assertTrue(any(col.startswith('tags_tfidf_') for col in result.columns)) + self.assertTrue(any(col.startswith('similar_tracks_tfidf_') for col in result.columns)) if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/tests/test_feature_store.py b/tests/test_feature_store.py new file mode 100644 index 0000000..2b1c4a4 --- /dev/null +++ b/tests/test_feature_store.py @@ -0,0 +1,32 @@ +import unittest +from unittest.mock import patch, MagicMock +from google.cloud import aiplatform +from src.data_processing.data_prep import create_and_populate_feature_store + +class TestFeatureStore(unittest.TestCase): + + @patch('src.data_processing.data_prep.aiplatform.init') + @patch('src.data_processing.data_prep.aiplatform.FeatureStore.create') + def test_create_and_populate_feature_store(self, mock_create_feature_store, mock_init): + mock_feature_store = MagicMock() + mock_create_feature_store.return_value = mock_feature_store + mock_entity_type = MagicMock() + mock_feature_store.create_entity_type.return_value = mock_entity_type + + # Mock DataFrame + df = MagicMock() + df.to_dict.return_value = [{'feature1': 'value1', 'feature2': 'value2'}] + df.index.tolist.return_value = ['entity1'] + + create_and_populate_feature_store('project_id', 'region', 'feature_store_id', 'entity_type_id', df) + + mock_init.assert_called_once_with(project='project_id', location='region') + mock_create_feature_store.assert_called_once() + mock_feature_store.create_entity_type.assert_called_once() + mock_entity_type.create_feature.assert_called() + mock_entity_type.ingest.assert_called_once() + + # Add more tests for other feature store operations + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git 
a/tests/test_hyperparameter_tuning.py b/tests/test_hyperparameter_tuning.py index 00741f3..eb47869 100644 --- a/tests/test_hyperparameter_tuning.py +++ b/tests/test_hyperparameter_tuning.py @@ -1,58 +1,71 @@ import unittest from unittest.mock import patch, MagicMock -from src.hyperparameter_tuning.katib_tuning import create_experiment, run_hyperparameter_tuning +import yaml +from src.hyperparameter_tuning.katib_tuning import ( + load_config, + create_experiment, + run_hyperparameter_tuning, + main +) class TestHyperparameterTuning(unittest.TestCase): + def setUp(self): self.mock_config = { 'hyperparameter_tuning': { 'max_trials': 10, 'parameters': { - 'hidden_layers': {'min': 1, 'max': 3}, + 'hidden_layers': {'min': 1, 'max': 5}, 'neurons': {'min': 32, 'max': 256}, 'learning_rate': {'min': 0.0001, 'max': 0.1} } } } - def test_create_experiment(self): - experiment = create_experiment('test-experiment', 'default', 'train_data.csv', 'val_data.csv', self.mock_config) + @patch('src.hyperparameter_tuning.katib_tuning.open') + def test_load_config(self, mock_open): + mock_open.return_value.__enter__.return_value = MagicMock() + mock_yaml_load = MagicMock(return_value=self.mock_config) + with patch('src.hyperparameter_tuning.katib_tuning.yaml.safe_load', mock_yaml_load): + config = load_config() + self.assertEqual(config, self.mock_config) + + @patch('src.hyperparameter_tuning.katib_tuning.load_config') + def test_create_experiment(self, mock_load_config): + mock_load_config.return_value = self.mock_config + experiment = create_experiment("test-experiment", "default", "train_data.csv", "val_data.csv") - self.assertEqual(experiment['metadata']['name'], 'test-experiment') - self.assertEqual(experiment['metadata']['namespace'], 'default') + self.assertEqual(experiment['metadata']['name'], "test-experiment") + self.assertEqual(experiment['metadata']['namespace'], "default") self.assertEqual(experiment['spec']['maxTrialCount'], 10) - - parameters = experiment['spec']['parameters'] - self.assertEqual(len(parameters), 6) # hidden_layers, neurons, embedding_dim, learning_rate, batch_size, dropout_rate - - hidden_layers_param = next(p for p in parameters if p['name'] == 'hidden_layers') - self.assertEqual(hidden_layers_param['feasibleSpace']['min'], '1') - self.assertEqual(hidden_layers_param['feasibleSpace']['max'], '3') + self.assertIn('parameters', experiment['spec']) + self.assertEqual(len(experiment['spec']['parameters']), 6) @patch('src.hyperparameter_tuning.katib_tuning.KatibClient') - def test_run_hyperparameter_tuning(self, mock_katib_client): + @patch('src.hyperparameter_tuning.katib_tuning.create_experiment') + def test_run_hyperparameter_tuning(self, mock_create_experiment, mock_katib_client): mock_client = MagicMock() mock_katib_client.return_value = mock_client - - mock_client.get_optimal_hyperparameters.return_value = { - 'currentOptimalTrial': { - 'parameterAssignments': [ - {'name': 'hidden_layers', 'value': '2'}, - {'name': 'neurons', 'value': '128'}, - {'name': 'learning_rate', 'value': '0.001'} - ] - } - } - - results = run_hyperparameter_tuning('train_data.csv', 'val_data.csv', 'config.yaml') - - mock_katib_client.assert_called_once() + mock_create_experiment.return_value = {"metadata": {"name": "test-experiment"}} + mock_client.get_optimal_hyperparameters.return_value = {"bestTrialName": "test-trial"} + + results = run_hyperparameter_tuning("train_data.csv", "val_data.csv") + + mock_create_experiment.assert_called_once() mock_client.create_experiment.assert_called_once() 
mock_client.wait_for_experiment.assert_called_once() mock_client.get_optimal_hyperparameters.assert_called_once() - - self.assertIn('currentOptimalTrial', results) - self.assertIn('parameterAssignments', results['currentOptimalTrial']) + self.assertEqual(results, {"bestTrialName": "test-trial"}) + + @patch('src.hyperparameter_tuning.katib_tuning.run_hyperparameter_tuning') + def test_main(self, mock_run_hyperparameter_tuning): + mock_results = {"bestTrialName": "test-trial"} + mock_run_hyperparameter_tuning.return_value = mock_results + + results = main("train_data.csv", "val_data.csv") + + mock_run_hyperparameter_tuning.assert_called_once_with("train_data.csv", "val_data.csv") + self.assertEqual(results, mock_results) if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/tests/test_integration.py b/tests/test_integration.py new file mode 100644 index 0000000..729f16e --- /dev/null +++ b/tests/test_integration.py @@ -0,0 +1,80 @@ +import unittest +from unittest.mock import patch, MagicMock +import pandas as pd +import numpy as np +from src.data_processing.data_ingestion import load_data +from src.data_processing.data_process import preprocess_data +from src.feature_engineering.feat_engineering import engineer_features +from src.algorithms.content_based import main as content_based_main +from src.evaluation.model_evaluation import evaluate_model + +class TestIntegration(unittest.TestCase): + + @patch('src.data_processing.data_ingestion.load_data') + @patch('src.data_processing.data_process.preprocess_data') + @patch('src.feature_engineering.feat_engineering.engineer_features') + @patch('src.algorithms.content_based.main') + @patch('src.evaluation.model_evaluation.evaluate_model') + def test_end_to_end_workflow(self, mock_evaluate, mock_content_based, mock_engineer, mock_preprocess, mock_load): + # Mock data ingestion + mock_load.return_value = pd.DataFrame({ + 'user_id': [1, 2, 3], + 'track_id': [101, 102, 103], + 'listen_count': [10, 20, 30] + }) + + # Mock data preprocessing + mock_preprocess.return_value = pd.DataFrame({ + 'user_id': [1, 2, 3], + 'track_id': [101, 102, 103], + 'listen_count': [10, 20, 30], + 'normalized_listen_count': [0.1, 0.2, 0.3] + }) + + # Mock feature engineering + mock_engineer.return_value = pd.DataFrame({ + 'user_id': [1, 2, 3], + 'track_id': [101, 102, 103], + 'listen_count': [10, 20, 30], + 'normalized_listen_count': [0.1, 0.2, 0.3], + 'feature1': [0.5, 0.6, 0.7], + 'feature2': [0.8, 0.9, 1.0] + }) + + # Mock content-based algorithm + mock_model = MagicMock() + mock_content_based.return_value = (mock_model, {'accuracy': 0.85, 'f1_score': 0.82}) + + # Mock model evaluation + mock_evaluate.return_value = {'accuracy': 0.87, 'f1_score': 0.84, 'precision': 0.86, 'recall': 0.85} + + # Run the end-to-end workflow + raw_data = load_data('dummy_path') + preprocessed_data = preprocess_data(raw_data) + feature_engineered_data = engineer_features(preprocessed_data) + model, training_metrics = content_based_main(feature_engineered_data, preprocessed_data, 2, 64, 32, 0.001, 32, 0.2) + evaluation_metrics = evaluate_model(model, feature_engineered_data, preprocessed_data) + + # Assertions + self.assertIsNotNone(raw_data) + self.assertIsNotNone(preprocessed_data) + self.assertIsNotNone(feature_engineered_data) + self.assertIsNotNone(model) + self.assertIsNotNone(training_metrics) + self.assertIsNotNone(evaluation_metrics) + + self.assertIn('accuracy', training_metrics) + self.assertIn('f1_score', training_metrics) + self.assertIn('accuracy', 
evaluation_metrics) + self.assertIn('f1_score', evaluation_metrics) + self.assertIn('precision', evaluation_metrics) + self.assertIn('recall', evaluation_metrics) + + # Verify that each step was called with the output of the previous step + mock_preprocess.assert_called_once_with(raw_data) + mock_engineer.assert_called_once_with(mock_preprocess.return_value) + mock_content_based.assert_called_once_with(mock_engineer.return_value, mock_preprocess.return_value, 2, 64, 32, 0.001, 32, 0.2) + mock_evaluate.assert_called_once_with(mock_model, mock_engineer.return_value, mock_preprocess.return_value) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_kubeflow_pipeline.py b/tests/test_kubeflow_pipeline.py new file mode 100644 index 0000000..05ccd43 --- /dev/null +++ b/tests/test_kubeflow_pipeline.py @@ -0,0 +1,62 @@ +import unittest +from unittest.mock import patch, MagicMock +from kfp import dsl +from kubeflow import pipeline + +class TestKubeflowPipeline(unittest.TestCase): + + @patch('kubeflow.pipeline.Pipeline') + def test_pipeline_creation(self, mock_pipeline): + # Mock the pipeline components + mock_data_ingestion = MagicMock() + mock_data_processing = MagicMock() + mock_feature_engineering = MagicMock() + mock_model_training = MagicMock() + mock_model_evaluation = MagicMock() + mock_model_deployment = MagicMock() + + # Create the pipeline + @dsl.pipeline( + name='LastFM Music Recommender Pipeline', + description='End-to-end pipeline for LastFM Music Recommender' + ) + def lastfm_pipeline(): + data_ingestion_task = mock_data_ingestion() + data_processing_task = mock_data_processing(data_ingestion_task.output) + feature_engineering_task = mock_feature_engineering(data_processing_task.output) + model_training_task = mock_model_training(feature_engineering_task.output) + model_evaluation_task = mock_model_evaluation(model_training_task.output) + mock_model_deployment(model_evaluation_task.output) + + # Run the pipeline + pipeline.Pipeline(lastfm_pipeline) + + # Assert that the pipeline was created + mock_pipeline.assert_called_once() + + # Assert that all components were called in the correct order + mock_data_ingestion.assert_called_once() + mock_data_processing.assert_called_once() + mock_feature_engineering.assert_called_once() + mock_model_training.assert_called_once() + mock_model_evaluation.assert_called_once() + mock_model_deployment.assert_called_once() + + @patch('kubeflow.pipeline.Client') + def test_pipeline_run(self, mock_client): + # Mock the pipeline run + mock_run = MagicMock() + mock_client.return_value.create_run_from_pipeline_func.return_value = mock_run + + # Run the pipeline + client = pipeline.Client() + client.create_run_from_pipeline_func(pipeline.Pipeline, arguments={}) + + # Assert that the pipeline run was created + mock_client.return_value.create_run_from_pipeline_func.assert_called_once() + + # Assert that the run was waited for + mock_run.wait_for_run_completion.assert_called_once() + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_model_evaluation.py b/tests/test_model_evaluation.py new file mode 100644 index 0000000..67b957f --- /dev/null +++ b/tests/test_model_evaluation.py @@ -0,0 +1,77 @@ +import unittest +from unittest.mock import patch, MagicMock +import numpy as np +import pandas as pd +import json +from src.evaluation.model_evaluation import ( + mean_average_precision, + average_precision, + diversity, + novelty, + evaluate_model, + visualize_results, + 
update_custom_metrics +) + +class TestModelEvaluation(unittest.TestCase): + + def setUp(self): + self.y_true = np.array([[1, 0, 1], [0, 1, 1], [1, 1, 0]]) + self.y_pred = np.array([[0.9, 0.1, 0.8], [0.2, 0.7, 0.6], [0.8, 0.3, 0.1]]) + self.item_popularity = {'item1': 0.5, 'item2': 0.3, 'item3': 0.2} + + def test_mean_average_precision(self): + map_score = mean_average_precision(self.y_true, self.y_pred) + self.assertIsInstance(map_score, float) + self.assertTrue(0 <= map_score <= 1) + + def test_average_precision(self): + ap_score = average_precision(self.y_true[0], self.y_pred[0]) + self.assertIsInstance(ap_score, float) + self.assertTrue(0 <= ap_score <= 1) + + def test_diversity(self): + recommendations = [[1, 2, 3], [2, 3, 4], [3, 4, 5]] + div_score = diversity(recommendations) + self.assertIsInstance(div_score, float) + self.assertTrue(0 <= div_score <= 1) + + def test_novelty(self): + recommendations = [[1, 2, 3], [2, 3, 4], [3, 4, 5]] + nov_score = novelty(recommendations, self.item_popularity) + self.assertIsInstance(nov_score, float) + + @patch('src.evaluation.model_evaluation.load_metrics') + @patch('src.evaluation.model_evaluation.log_metric') + def test_evaluate_model(self, mock_log_metric, mock_load_metrics): + mock_model = MagicMock() + mock_model.predict.return_value = self.y_pred + mock_model.evaluate.return_value = (0.1, 0.9) # mock loss and accuracy + mock_load_metrics.return_value = {'train_loss': 0.2, 'train_accuracy': 0.8} + + results = evaluate_model(mock_model, self.y_true, self.y_true, 'mock_path', self.item_popularity) + + self.assertIsInstance(results, dict) + self.assertIn('test_accuracy', results) + self.assertIn('test_precision', results) + self.assertIn('test_recall', results) + self.assertIn('test_f1_score', results) + self.assertIn('test_ndcg', results) + self.assertIn('test_mean_average_precision', results) + self.assertIn('test_diversity', results) + self.assertIn('test_novelty', results) + + @patch('matplotlib.pyplot.savefig') + def test_visualize_results(self, mock_savefig): + results = {'metric1': 0.5, 'metric2': 0.7} + visualize_results(results, 'mock_output_path') + mock_savefig.assert_called_once() + + @patch('src.evaluation.model_evaluation.monitoring_v3.MetricServiceClient') + def test_update_custom_metrics(self, mock_client): + metrics = {'accuracy': 0.9, 'f1_score': 0.8} + update_custom_metrics('project_id', 'model_name', metrics) + mock_client.return_value.create_time_series.assert_called() + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index fb62257..07cb7ee 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -4,15 +4,14 @@ import tempfile import os import json -from src.data_processing.data_preprocess import preprocess_data, prepare_data +from src.data_processing.data_process import preprocess_data +from src.data_processing.data_prep import prepare_data from src.data_processing.data_validation import ( generate_schema, validate_data, compare_statistics, detect_data_drift, save_schema_to_gcs ) from src.feature_engineering.feat_engineering import ( - engineer_basic_features, engineer_additional_features, refine_features, - add_tag_popularity, add_similar_tracks_avg_playcount, add_interaction_features, - add_target_encoding, refine_features_further, vectorize_all_text_features, - create_preprocessing_pipeline + engineer_basic_features, engineer_additional_features, + add_tag_popularity, add_similar_tracks_avg_playcount, 
feature_engineering_pipeline
 )
 from src.algorithms.content_based import ContentBasedRecommender
 from src.evaluation.model_evaluation import evaluate_model
@@ -107,14 +106,6 @@ def test_end_to_end_pipeline(self):
 
         # Step 3: Feature Engineering
         df = engineer_basic_features(preprocessed_data)
-        df = engineer_additional_features(df)
-        df = refine_features(df)
-        df = add_tag_popularity(df)
-        df = add_similar_tracks_avg_playcount(df)
-        df = add_interaction_features(df)
-        df = add_target_encoding(df)
-        df = refine_features_further(df)
-        df, vectorizers = vectorize_all_text_features(df)
 
         # Create preprocessing pipeline
         pipeline = create_preprocessing_pipeline(df)
@@ -214,7 +205,7 @@ def test_pipeline_output_artifacts(self):
         preprocessed_data = preprocess_data(pd.read_csv(self.train_data_path))
 
         # Feature engineering steps
-        df = engineer_basic_features(preprocessed_data)
+        df = feature_engineering_pipeline(preprocessed_data)
         df = engineer_additional_features(df)
         df = refine_features(df)
         df = add_tag_popularity(df)

From 48ae8f1b15b472f760f64c5a1717d1af835c8f66 Mon Sep 17 00:00:00 2001
From: JonFillip
Date: Mon, 7 Oct 2024 10:45:17 +0100
Subject: [PATCH 2/2] Fixed dependency issue, streamlined deployment and monitoring logic

---
 .github/workflows/ci_pipeline.yml            |   2 +-
 cloudbuild.yaml                              |   2 +-
 deployment/vertex_ai/vertex_ai_monitoring.py | 467 ++++++++++--------
 deployment/vertex_ai/vertex_deployment.py    | 479 +++++++++++--------
 kubeflow/components/preprocess/preprocess.py |   2 +-
 src/utils/logging_utils.py                   |  26 +-
 6 files changed, 569 insertions(+), 409 deletions(-)

diff --git a/.github/workflows/ci_pipeline.yml b/.github/workflows/ci_pipeline.yml
index 30ac821..ec5bc75 100644
--- a/.github/workflows/ci_pipeline.yml
+++ b/.github/workflows/ci_pipeline.yml
@@ -36,7 +36,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.8', '3.9', '3.10']
+        python-version: ['3.9', '3.10']
     timeout-minutes: 15
     steps:
     - uses: actions/checkout@v3
diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 0b795c9..775203e 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -29,7 +29,7 @@ steps:
     args:
     - '-c'
     - |
-      python deployment/deploy_pipeline.py --platform vertex --project_id $PROJECT_ID --region ${_REGION} --output_file pipeline.yaml
+      python deployment/vertex_ai/vertex_deployment.py --platform vertex --project_id $PROJECT_ID --region ${_REGION} --output_file pipeline.yaml
     env:
     - 'MODEL_NAME=${_MODEL_NAME}'
     - 'ENDPOINT_NAME=${_ENDPOINT_NAME}'
diff --git a/deployment/vertex_ai/vertex_ai_monitoring.py b/deployment/vertex_ai/vertex_ai_monitoring.py
index af3cefc..7274a6b 100644
--- a/deployment/vertex_ai/vertex_ai_monitoring.py
+++ b/deployment/vertex_ai/vertex_ai_monitoring.py
@@ -1,4 +1,3 @@
-import yaml
 import tensorflow_data_validation as tfdv
 import argparse
 import json
@@ -7,20 +6,14 @@
 import random
 from typing import Dict, List, Optional, Tuple, Union
 from scipy.stats import ks_2samp
-from google.cloud import monitoring_v3, storage, bigquery, aiplatform
-from google.api import label_pb2 as ga_label
+from google.cloud import monitoring_v3, storage, aiplatform
 from google.api import metric_pb2 as ga_metric
-from google.protobuf import duration_pb2 as duration
+from google.protobuf import duration_pb2
 from src.data_processing.data_validation import (
-    generate_schema,
-    validate_data,
-    load_config,
     load_statistics_from_gcs,
     load_schema_from_gcs,
-    compare_statistics,
     compare_schemas,
-    save_statistics_to_gcs,
-    save_schema_to_gcs
+    save_statistics_to_gcs
 )
 from src.utils.logging_utils import 
setup_logger, log_error, log_step from ml_metadata import metadata_store @@ -29,15 +22,23 @@ logger = setup_logger('vertex_ai_pipeline_monitoring') class VertexAIMonitoring: - def __init__(self, project_id: str, model_name: str, bucket_name: str): + def __init__(self, project_id: str, model_name: str, bucket_name: str, + mlmd_host: str, mlmd_port: int, mlmd_database: str, + mlmd_user: str, mlmd_password: str): self.project_id = project_id self.model_name = model_name self.bucket_name = bucket_name self.client = monitoring_v3.MetricServiceClient() self.project_name = f"projects/{project_id}" - self.feature_store_client = aiplatform.FeatureStore(project=project_id) + self.feature_store_client = aiplatform.gapic.FeaturestoreServiceClient() + + # Connect to MLMD using PostgreSQL self.mlmd_connection_config = metadata_store_pb2.ConnectionConfig() - self.mlmd_connection_config.sqlite.filename_uri = f"gs://{bucket_name}/mlmd/metadata.db" + self.mlmd_connection_config.postgresql.host = mlmd_host + self.mlmd_connection_config.postgresql.port = mlmd_port + self.mlmd_connection_config.postgresql.database = mlmd_database + self.mlmd_connection_config.postgresql.user = mlmd_user + self.mlmd_connection_config.postgresql.password = mlmd_password self.mlmd_store = metadata_store.MetadataStore(self.mlmd_connection_config) def setup_custom_metrics(self) -> None: @@ -56,11 +57,14 @@ def setup_custom_metrics(self) -> None: ] for metric in metrics: - descriptor = self.client.create_metric_descriptor( - name=self.project_name, - metric_descriptor=metric - ) - logger.info(f"Created {descriptor.name}") + try: + descriptor = self.client.create_metric_descriptor( + name=self.project_name, + metric_descriptor=metric + ) + logger.info(f"Created metric descriptor: {descriptor.name}") + except Exception as e: + logger.warning(f"Metric descriptor {metric.type} already exists or could not be created. 
Error: {e}") def _create_metric_descriptor(self, metric_name: str, description: str, value_type: int = ga_metric.MetricDescriptor.ValueType.DOUBLE) -> ga_metric.MetricDescriptor: return ga_metric.MetricDescriptor( @@ -70,26 +74,25 @@ def _create_metric_descriptor(self, metric_name: str, description: str, value_ty description=description ) - def create_alert_policy(self, display_name: str, filter_str: str, threshold: float, duration_seconds: int, comparison: int) -> None: + def create_alert_policy(self, display_name: str, filter_str: str, threshold: float, duration_seconds: int, comparison: int, notification_channel_id: str) -> None: """Creates an alert policy in Google Cloud Monitoring.""" client = monitoring_v3.AlertPolicyServiceClient() - - condition = { - "display_name": display_name, - "condition_threshold": { - "filter": filter_str, - "comparison": comparison, - "threshold_value": threshold, - "duration": duration.Duration(seconds=duration_seconds) - } - } + condition = monitoring_v3.AlertPolicy.Condition( + display_name=display_name, + condition_threshold=monitoring_v3.AlertPolicy.Condition.MetricThreshold( + filter=filter_str, + comparison=comparison, + threshold_value=threshold, + duration=duration_pb2.Duration(seconds=duration_seconds), + ), + ) - alert_policy = { - "display_name": f"{self.model_name} {display_name}", - "conditions": [condition], - "notification_channels": [f"projects/{self.project_id}/notificationChannels/your-channel-id"], - "combiner": monitoring_v3.AlertPolicy.Combiner.OR, - } + alert_policy = monitoring_v3.AlertPolicy( + display_name=f"{self.model_name} {display_name}", + conditions=[condition], + notification_channels=[notification_channel_id], + combiner=monitoring_v3.AlertPolicy.ConditionCombinerType.OR, + ) policy = client.create_alert_policy( name=self.project_name, @@ -97,24 +100,26 @@ def create_alert_policy(self, display_name: str, filter_str: str, threshold: flo ) logger.info(f"Created alert policy: {policy.name}") - def create_accuracy_degradation_alert(self, absolute_threshold: float, degradation_rate_threshold: float, time_window_seconds: int = 86400) -> None: + def create_accuracy_degradation_alert(self, absolute_threshold: float, degradation_rate_threshold: float, time_window_seconds: int, notification_channel_id: str) -> None: """Creates an alert for accuracy degradation.""" self.create_alert_policy( "Accuracy below absolute threshold", f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/accuracy"', absolute_threshold, 300, - monitoring_v3.ComparisonType.COMPARISON_LT + monitoring_v3.AlertPolicy.Condition.MetricThreshold.ComparisonType.COMPARISON_LT, + notification_channel_id ) self.create_alert_policy( "Accuracy degradation over time", f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/accuracy"', degradation_rate_threshold, time_window_seconds, - monitoring_v3.ComparisonType.COMPARISON_LT + monitoring_v3.AlertPolicy.Condition.MetricThreshold.ComparisonType.COMPARISON_LT, + notification_channel_id ) - def create_resource_utilization_alert(self) -> None: + def create_resource_utilization_alert(self, notification_channel_id: str) -> None: """Creates alerts for resource utilization (CPU, memory, and GPU).""" resources = [ ("CPU", "compute.googleapis.com/instance/cpu/utilization"), @@ -128,16 +133,29 @@ def create_resource_utilization_alert(self) -> None: f'metric.type="{metric_type}"', 0.8, # 80% utilization threshold 300, # 5 minutes duration - monitoring_v3.ComparisonType.COMPARISON_GT + 
monitoring_v3.AlertPolicy.Condition.MetricThreshold.ComparisonType.COMPARISON_GT, + notification_channel_id ) + def create_rollback_alert_policy(self, notification_channel_id: str) -> None: + """Creates an alert policy for rollback detection based on traffic split anomalies.""" + filter_str = f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/traffic_anomaly"' + self.create_alert_policy( + display_name="Rollback Detection Alert", + filter_str=filter_str, + threshold=0, # Threshold set to detect any anomaly (since value will be 1 when anomaly is detected) + duration_seconds=300, + comparison=monitoring_v3.AlertPolicy.Condition.MetricThreshold.ComparisonType.COMPARISON_GT, + notification_channel_id=notification_channel_id + ) + def log_metric(self, metric_name: str, value: Union[float, int]) -> None: """Logs a metric to Google Cloud Monitoring.""" series = monitoring_v3.TimeSeries() series.metric.type = f"custom.googleapis.com/vertex_ai/{self.model_name}/{metric_name}" - series.resource.type = "aiplatform.googleapis.com/Endpoint" - series.resource.labels["model_name"] = self.model_name - point = series.points.add() + series.resource.type = "global" + series.resource.labels["project_id"] = self.project_id + point = monitoring_v3.Point() if isinstance(value, float): point.value.double_value = value else: @@ -145,6 +163,7 @@ def log_metric(self, metric_name: str, value: Union[float, int]) -> None: now = datetime.datetime.now() point.interval.end_time.seconds = int(now.timestamp()) point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) + series.points = [point] self.client.create_time_series(name=self.project_name, time_series=[series]) logger.info(f"Logged {metric_name} for model {self.model_name}: {value}") @@ -153,30 +172,29 @@ def detect_data_drift(self, drift_threshold: float) -> Optional[float]: try: log_step(logger, 'Detecting Data Drift', 'Data Drift Detection') today = datetime.datetime.now().strftime("%Y%m%d") - + baseline_stats = load_statistics_from_gcs(self.bucket_name, self.model_name, 'train', today) if not baseline_stats: self.log_metric("missing_statistics", 1) return None - + serving_stats = load_statistics_from_gcs(self.bucket_name, self.model_name, 'serving', today) if not serving_stats: self.log_metric("missing_statistics", 1) return None - + schema = load_schema_from_gcs(self.bucket_name, self.model_name, 'current') if not schema: self.log_metric("missing_schema", 1) return None - anomalies = compare_statistics(baseline_stats, serving_stats, schema) + # Compare the statistics + anomalies = tfdv.validate_statistics(statistics=serving_stats, schema=schema, previous_statistics=baseline_stats) - drift_score = 0 - for feature, anomaly in anomalies.anomaly_info.items(): - if anomaly: - logger.warning(f"Data drift detected for feature {feature}: {anomaly.description}") - drift_score += anomaly.severity - self.log_metric("data_drift", anomaly.severity) + drift_score = len(anomalies.anomaly_info) + for feature_name, anomaly_info in anomalies.anomaly_info.items(): + logger.warning(f"Data drift detected for feature {feature_name}: {anomaly_info.description}") + self.log_metric("data_drift", 1) if drift_score > drift_threshold: logger.warning(f"Significant data drift detected. 
Drift score: {drift_score} > {drift_threshold}") @@ -195,7 +213,7 @@ def detect_data_drift(self, drift_threshold: float) -> Optional[float]: def _log_drift_detection_to_mlmd(self, drift_score: float, drift_threshold: float): """Log drift detection results to ML Metadata.""" execution = metadata_store_pb2.Execution() - execution.type = "DataDriftDetection" + execution.type_id = self._get_or_create_execution_type_id("DataDriftDetection") execution.properties["model_name"].string_value = self.model_name execution.properties["drift_score"].double_value = drift_score execution.properties["drift_threshold"].double_value = drift_threshold @@ -204,29 +222,40 @@ def _log_drift_detection_to_mlmd(self, drift_score: float, drift_threshold: floa execution_id = self.mlmd_store.put_executions([execution])[0] logger.info(f"Logged drift detection results to MLMD with execution ID: {execution_id}") + def _get_or_create_execution_type_id(self, type_name: str) -> int: + """Helper method to get or create an execution type ID.""" + try: + execution_type = self.mlmd_store.get_execution_type(type_name) + except metadata_store.errors.NotFoundError: + execution_type = metadata_store_pb2.ExecutionType(name=type_name) + self.mlmd_store.put_execution_type(execution_type) + return execution_type.id + def detect_prediction_drift(self, drift_threshold: float) -> Optional[float]: """Detects prediction drift using the Kolmogorov-Smirnov (KS) test.""" try: log_step(logger, 'Detecting Prediction Drift', 'Prediction Drift Detection') today = datetime.datetime.now().strftime("%Y%m%d") - + train_stats = load_statistics_from_gcs(self.bucket_name, self.model_name, 'train', today) if not train_stats: self.log_metric("missing_statistics", 1) return None - + serving_stats = load_statistics_from_gcs(self.bucket_name, self.model_name, 'serving', today) if not serving_stats: self.log_metric("missing_statistics", 1) return None - train_predictions = train_stats.datasets[0].features['similar_tracks'].num_stats.histograms[0].buckets - serving_predictions = serving_stats.datasets[0].features['similar_tracks'].num_stats.histograms[0].buckets + # Assuming 'prediction' is the feature containing model predictions + train_predictions = tfdv.get_feature_stats_as_dataframe(train_stats) + serving_predictions = tfdv.get_feature_stats_as_dataframe(serving_stats) - train_counts = [bucket.sample_count for bucket in train_predictions] - serving_counts = [bucket.sample_count for bucket in serving_predictions] + if 'prediction' not in train_predictions.columns or 'prediction' not in serving_predictions.columns: + logger.error("Prediction feature not found in statistics.") + return None - statistic, _ = ks_2samp(train_counts, serving_counts) + statistic, _ = ks_2samp(train_predictions['prediction'], serving_predictions['prediction']) self.log_metric("prediction_drift", statistic) @@ -247,7 +276,7 @@ def detect_prediction_drift(self, drift_threshold: float) -> Optional[float]: def _log_prediction_drift_to_mlmd(self, statistic: float, drift_threshold: float): """Log prediction drift results to ML Metadata.""" execution = metadata_store_pb2.Execution() - execution.type = "PredictionDriftDetection" + execution.type_id = self._get_or_create_execution_type_id("PredictionDriftDetection") execution.properties["model_name"].string_value = self.model_name execution.properties["ks_statistic"].double_value = statistic execution.properties["drift_threshold"].double_value = drift_threshold @@ -266,7 +295,10 @@ def detect_schema_drift(self, schema_version: str) -> 
Optional[bool]: self.log_metric("missing_schema", 1) return None - current_schema = load_schema_from_gcs(self.bucket_name, self.model_name, 'serving_schema_version') + current_schema = load_schema_from_gcs(self.bucket_name, self.model_name, 'current') + if not current_schema: + self.log_metric("missing_schema", 1) + return None schema_drift_detected = compare_schemas(baseline_schema, current_schema) @@ -289,7 +321,7 @@ def detect_schema_drift(self, schema_version: str) -> Optional[bool]: def _log_schema_drift_to_mlmd(self, schema_drift_detected: bool): """Log schema drift results to ML Metadata.""" execution = metadata_store_pb2.Execution() - execution.type = "SchemaDriftDetection" + execution.type_id = self._get_or_create_execution_type_id("SchemaDriftDetection") execution.properties["model_name"].string_value = self.model_name execution.properties["schema_drift_detected"].int_value = int(schema_drift_detected) execution.properties["timestamp"].string_value = datetime.datetime.now().isoformat() @@ -297,11 +329,11 @@ def _log_schema_drift_to_mlmd(self, schema_drift_detected: bool): execution_id = self.mlmd_store.put_executions([execution])[0] logger.info(f"Logged schema drift results to MLMD with execution ID: {execution_id}") - def monitor_traffic_split(self, endpoint_name: str) -> Optional[Dict[str, float]]: + def monitor_traffic_split(self, endpoint_name: str, expected_traffic_split: Dict[str, int]) -> Optional[Dict[str, int]]: """Monitor the traffic split in Vertex AI to detect rollback.""" try: log_step(logger, 'Monitoring traffic split', 'Rollback Monitoring') - + aiplatform.init(project=self.project_id) endpoints = aiplatform.Endpoint.list(filter=f'display_name="{endpoint_name}"') @@ -310,29 +342,43 @@ def monitor_traffic_split(self, endpoint_name: str) -> Optional[Dict[str, float] return None endpoint = endpoints[0] - traffic_split = endpoint.traffic_split + traffic_split = endpoint.gca_resource.traffic_split + total_traffic = sum(traffic_split.values()) + anomaly_detected = False + for model_id, traffic_percentage in traffic_split.items(): logger.info(f"Model {model_id} is receiving {traffic_percentage}% of the traffic.") - - if sum(traffic_split.values()) != 100: - logger.warning("Traffic split does not sum to 100%, indicating a possible rollback.") - + + # Check against expected traffic split + expected_percentage = expected_traffic_split.get(model_id, 0) + if traffic_percentage != expected_percentage: + anomaly_detected = True + logger.warning(f"Anomaly detected: Model {model_id} traffic is {traffic_percentage}%, expected {expected_percentage}%.") + + if total_traffic != 100: + anomaly_detected = True + logger.warning("Traffic split does not sum to 100%, indicating a possible rollback or misconfiguration.") + + # Log traffic anomaly metric + self.log_metric("traffic_anomaly", int(anomaly_detected)) + # Log traffic split to MLMD - self._log_traffic_split_to_mlmd(traffic_split) + self._log_traffic_split_to_mlmd(traffic_split, anomaly_detected) return traffic_split except Exception as e: log_error(logger, e, "Rollback Monitoring") - raise + return None - def _log_traffic_split_to_mlmd(self, traffic_split: Dict[str, float]): + def _log_traffic_split_to_mlmd(self, traffic_split: Dict[str, int], anomaly_detected: bool): """Log traffic split to ML Metadata.""" execution = metadata_store_pb2.Execution() - execution.type = "TrafficSplitMonitoring" + execution.type_id = self._get_or_create_execution_type_id("TrafficSplitMonitoring") execution.properties["model_name"].string_value = 
self.model_name + execution.properties["anomaly_detected"].int_value = int(anomaly_detected) for model_id, percentage in traffic_split.items(): - execution.properties[f"traffic_{model_id}"].double_value = percentage - execution.properties["timestamp"].string_value = datetime.datetime.now().isoformat() + execution.custom_properties[f"traffic_{model_id}"].double_value = percentage + execution.properties["timestamp"].string_value = datetime.datetime.utcnow().isoformat() + 'Z' execution_id = self.mlmd_store.put_executions([execution])[0] logger.info(f"Logged traffic split to MLMD with execution ID: {execution_id}") @@ -348,7 +394,7 @@ def trigger_retraining_pipeline(self, pipeline_name: str, gcs_input: str) -> str pipeline_job = aiplatform.PipelineJob( display_name=f'Retraining - {self.model_name}', - template_path=f'gs://{pipeline_name}', + template_path=pipeline_name, parameter_values=pipeline_params ) @@ -363,7 +409,7 @@ def trigger_retraining_pipeline(self, pipeline_name: str, gcs_input: str) -> str def _log_retraining_trigger_to_mlmd(self, pipeline_job_id: str): """Log retraining trigger to ML Metadata.""" execution = metadata_store_pb2.Execution() - execution.type = "RetrainingTrigger" + execution.type_id = self._get_or_create_execution_type_id("RetrainingTrigger") execution.properties["model_name"].string_value = self.model_name execution.properties["pipeline_job_id"].string_value = pipeline_job_id execution.properties["timestamp"].string_value = datetime.datetime.now().isoformat() @@ -371,27 +417,29 @@ def _log_retraining_trigger_to_mlmd(self, pipeline_job_id: str): execution_id = self.mlmd_store.put_executions([execution])[0] logger.info(f"Logged retraining trigger to MLMD with execution ID: {execution_id}") - def setup_retraining_job_alert(self, notification_channel: str) -> None: + def setup_retraining_job_alert(self, notification_channel_id: str) -> None: """Set up a Cloud Monitoring alert for Vertex AI retraining jobs.""" - condition = { - "display_name": "Vertex AI Retraining Job Created", - "condition_threshold": { - "filter": 'resource.type="aiplatform.googleapis.com/PipelineJob" AND protoPayload.methodName="google.cloud.aiplatform.v1.PipelineService.CreatePipelineJob"', - "comparison": monitoring_v3.ComparisonType.COMPARISON_GT, - "threshold_value": 0, - "duration": {"seconds": 60}, - } - } + client = monitoring_v3.AlertPolicyServiceClient() + condition = monitoring_v3.AlertPolicy.Condition( + display_name="Vertex AI Retraining Job Created", + condition_monitoring_query_language=monitoring_v3.AlertPolicy.Condition.MonitoringQueryLanguageCondition( + query=( + 'fetch aiplatform.googleapis.com/pipeline_job ' + '| {metric.type="aiplatform.googleapis.com/pipeline_job/pipeline_job_state"}' + ), + duration=duration_pb2.Duration(seconds=60), + trigger=monitoring_v3.AlertPolicy.Condition.Trigger(count=1) + ) + ) - alert_policy = { - "display_name": "Retraining Job Alert", - "conditions": [condition], - "notification_channels": [notification_channel], - "enabled": True, - "combiner": monitoring_v3.AlertPolicy.Combiner.OR - } + alert_policy = monitoring_v3.AlertPolicy( + display_name="Retraining Job Alert", + conditions=[condition], + notification_channels=[notification_channel_id], + combiner=monitoring_v3.AlertPolicy.ConditionCombinerType.OR, + enabled=True + ) - client = monitoring_v3.AlertPolicyServiceClient() policy = client.create_alert_policy( name=self.project_name, alert_policy=alert_policy @@ -399,19 +447,19 @@ def setup_retraining_job_alert(self, notification_channel: str) -> 
None: logger.info(f"Created retraining job alert policy: {policy.name}") - def monitor_and_trigger_retraining(self, accuracy_threshold: float, drift_threshold: float, gcs_input: str, pipeline_name: str, notification_channel: str) -> None: + def monitor_and_trigger_retraining(self, accuracy_threshold: float, degradation_rate_threshold: float, drift_threshold: float, gcs_input: str, pipeline_name: str, notification_channel_id: str, time_window_seconds: int) -> None: """Monitor model accuracy, data drift, and prediction drift, and trigger retraining when necessary.""" - self.create_accuracy_degradation_alert(accuracy_threshold, 0.05) + self.create_accuracy_degradation_alert(accuracy_threshold, degradation_rate_threshold, time_window_seconds, notification_channel_id) - data_drift_detected = self.detect_data_drift(drift_threshold) - prediction_drift_detected = self.detect_prediction_drift(drift_threshold) + data_drift_score = self.detect_data_drift(drift_threshold) + prediction_drift_statistic = self.detect_prediction_drift(drift_threshold) - if data_drift_detected or prediction_drift_detected: + if (data_drift_score and data_drift_score > drift_threshold) or (prediction_drift_statistic and prediction_drift_statistic > drift_threshold): logger.warning(f"Drift detected for {self.model_name}. Triggering retraining pipeline.") - + pipeline_job_id = self.trigger_retraining_pipeline(pipeline_name, gcs_input) - - self.setup_retraining_job_alert(notification_channel) + + self.setup_retraining_job_alert(notification_channel_id) logger.info(f"Retraining job triggered: {pipeline_job_id}") else: @@ -419,11 +467,82 @@ def monitor_and_trigger_retraining(self, accuracy_threshold: float, drift_thresh logger.info("Model performance and drift monitoring completed.") + def log_feature_store_metric(self, feature_store_id: str, entity_type_id: str, metric_name: str, value: Union[int, float]): + """Logs a feature store metric to Google Cloud Monitoring.""" + series = monitoring_v3.TimeSeries() + series.metric.type = f"custom.googleapis.com/vertex_ai/{self.model_name}/{metric_name}" + series.resource.type = "aiplatform.googleapis.com/Featurestore" + series.resource.labels["featurestore_id"] = feature_store_id + series.resource.labels["entity_type_id"] = entity_type_id + point = monitoring_v3.Point() + if isinstance(value, int): + point.value.int64_value = value + else: + point.value.double_value = value + now = datetime.datetime.now() + point.interval.end_time.seconds = int(now.timestamp()) + point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) + series.points = [point] + self.client.create_time_series(name=self.project_name, time_series=[series]) + logger.info(f"Logged feature store metric {metric_name} with value {value}") + + def monitor_feature_store(self, feature_store_id: str, entity_type_id: str): + """Monitors the Feature Store and logs relevant metrics.""" + try: + log_step(logger, 'Monitoring Feature Store', 'Feature Store Monitoring') + + featurestore_name = f"projects/{self.project_id}/locations/-/featurestores/{feature_store_id}" + entity_type_name = f"{featurestore_name}/entityTypes/{entity_type_id}" + + # Log read and write counts + # Placeholder implementation; actual read/write counts need to be retrieved from monitoring metrics or logs + read_count = 100 # Replace with actual logic to get read count + write_count = 50 # Replace with actual logic to get write count + + self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_read_count", 
read_count)
+            self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_write_count", write_count)
+
+            # Log latency (this is a placeholder, actual implementation may vary based on available metrics)
+            avg_latency = 200  # Replace with actual logic to get average latency
+            self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_latency", avg_latency)
+
+            logger.info(f"Monitored feature store {feature_store_id}, entity type {entity_type_id}")
+        except Exception as e:
+            log_error(logger, e, 'Feature Store Monitoring')
+
+    def create_feature_store_alerts(self, feature_store_id: str, entity_type_id: str, notification_channel_id: str):
+        """Creates alerts for Feature Store monitoring."""
+        self.create_alert_policy(
+            "High Feature Store Read Count",
+            f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_read_count" AND resource.labels.featurestore_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"',
+            1000,  # Threshold: 1000 reads
+            300,  # Duration: 5 minutes
+            monitoring_v3.ComparisonType.COMPARISON_GT,
+            notification_channel_id
+        )
+        self.create_alert_policy(
+            "High Feature Store Write Count",
+            f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_write_count" AND resource.labels.featurestore_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"',
+            500,  # Threshold: 500 writes
+            300,  # Duration: 5 minutes
+            monitoring_v3.ComparisonType.COMPARISON_GT,
+            notification_channel_id
+        )
+        self.create_alert_policy(
+            "High Feature Store Latency",
+            f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_latency" AND resource.labels.featurestore_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"',
+            1000,  # Threshold: 1000 ms
+            300,  # Duration: 5 minutes
+            monitoring_v3.ComparisonType.COMPARISON_GT,
+            notification_channel_id
+        )
+
 def log_request_response(project_id: str, model_name: str, request: Dict, response: Dict, latency_ms: float, sampling_rate: float = 0.1) -> None:
     """Logs serving request/response data and latency to Cloud Storage with optional sampling."""
     if sampling_rate >= 1 or random.random() < sampling_rate:
         client = storage.Client(project=project_id)
-        bucket = client.get_bucket(f"{project_id}-vertex-ai-logs")
+        bucket_name = f"{project_id}-vertex-ai-logs"
+        bucket = client.get_bucket(bucket_name)
         blob = bucket.blob(f"{model_name}/logs/{datetime.datetime.now().isoformat()}.json")
         log_entry = {
             "request": request,
@@ -436,16 +555,13 @@ def log_request_response(project_id: str, model_name: str, request: Dict, respon
 def check_existing_statistics_and_schema(project_id: str, model_name: str, bucket_name: str, schema_version: str) -> Tuple[Optional[tfdv.types.DatasetFeatureStatisticsList], Optional[tfdv.types.Schema]]:
     today = datetime.datetime.now().strftime("%Y%m%d")
-    
+
     try:
         existing_stats = load_statistics_from_gcs(bucket_name, model_name, 'serving', today)
     except Exception as e:
         logger.error(f"Error loading existing statistics: {e}")
         existing_stats = None
 
-    config = load_config()
-    schema_path = config['data_validation']['schema_path']
-
     try:
         schema = load_schema_from_gcs(bucket_name, model_name, schema_version)
     except Exception as e:
@@ -454,21 +570,21 @@ def check_existing_statistics_and_schema(project_id: str, model_name: str, bucke
     return existing_stats, schema
 
-def 
compute_and_store_statistics(project_id: str, model_name: str, bucket_name: str, existing_schema: Optional[tfdv.types.Schema]) -> Tuple[tfdv.types.DatasetFeatureStatisticsList, tfdv.types.Anomalies]: +def compute_and_store_statistics(project_id: str, model_name: str, bucket_name: str, existing_schema: Optional[tfdv.types.Schema]) -> Tuple[tfdv.types.DatasetFeatureStatisticsList, Optional[tfdv.types.Anomalies]]: client = storage.Client(project=project_id) bucket = client.get_bucket(f"{project_id}-vertex-ai-logs") - blobs = bucket.list_blobs(prefix=f"{model_name}/logs/") + blobs = client.list_blobs(bucket_or_name=bucket, prefix=f"{model_name}/logs/") data = [] for blob in blobs: content = json.loads(blob.download_as_string()) data.append(content) - df = pd.DataFrame(data) - + df = pd.json_normalize(data) + stats = tfdv.generate_statistics_from_dataframe(df) save_statistics_to_gcs(stats, bucket_name, model_name, 'serving') - + if existing_schema: anomalies = tfdv.validate_statistics(stats, schema=existing_schema) else: @@ -477,71 +593,6 @@ def compute_and_store_statistics(project_id: str, model_name: str, bucket_name: return stats, anomalies -def log_feature_store_metric(self, feature_store_id: str, entity_type_id: str, metric_name: str, value: Union[int, float]): - """Logs a feature store metric to Google Cloud Monitoring.""" - series = monitoring_v3.TimeSeries() - series.metric.type = f"custom.googleapis.com/vertex_ai/{self.model_name}/{metric_name}" - series.resource.type = "aiplatform.googleapis.com/FeatureStore" - series.resource.labels["feature_store_id"] = feature_store_id - series.resource.labels["entity_type_id"] = entity_type_id - point = series.points.add() - if isinstance(value, int): - point.value.int64_value = value - else: - point.value.double_value = value - now = datetime.datetime.now() - point.interval.end_time.seconds = int(now.timestamp()) - point.interval.end_time.nanos = int((now.timestamp() - int(now.timestamp())) * 10**9) - self.client.create_time_series(name=self.project_name, time_series=[series]) - logger.info(f"Logged feature store metric {metric_name} with value {value}") - -def monitor_feature_store(self, feature_store_id: str, entity_type_id: str): - """Monitors the Feature Store and logs relevant metrics.""" - try: - log_step(logger, 'Monitoring Feature Store', 'Feature Store Monitoring') - - feature_store = self.feature_store_client.get_feature_store(feature_store_id=feature_store_id) - entity_type = feature_store.get_entity_type(entity_type_id=entity_type_id) - - # Log read and write counts - read_count = entity_type.read_stats().get("total_entity_reads", 0) - write_count = entity_type.write_stats().get("total_entity_updates", 0) - - self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_read_count", read_count) - self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_write_count", write_count) - - # Log latency (this is a placeholder, actual implementation may vary based on available metrics) - avg_latency = entity_type.read_stats().get("average_read_latency_milliseconds", 0) - self.log_feature_store_metric(feature_store_id, entity_type_id, "feature_store_latency", avg_latency) - - logger.info(f"Monitored feature store {feature_store_id}, entity type {entity_type_id}") - except Exception as e: - log_error(logger, e, 'Feature Store Monitoring') - -def create_feature_store_alerts(self, feature_store_id: str, entity_type_id: str): - """Creates alerts for Feature Store monitoring.""" - self.create_alert_policy( - 
"High Feature Store Read Count", - f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_read_count" AND resource.labels.feature_store_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"', - 1000, # Threshold: 1000 reads - 300, # Duration: 5 minutes - monitoring_v3.ComparisonType.COMPARISON_GT - ) - self.create_alert_policy( - "High Feature Store Write Count", - f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_write_count" AND resource.labels.feature_store_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"', - 500, # Threshold: 500 writes - 300, # Duration: 5 minutes - monitoring_v3.ComparisonType.COMPARISON_GT - ) - self.create_alert_policy( - "High Feature Store Latency", - f'metric.type="custom.googleapis.com/vertex_ai/{self.model_name}/feature_store_latency" AND resource.labels.feature_store_id="{feature_store_id}" AND resource.labels.entity_type_id="{entity_type_id}"', - 1000, # Threshold: 1000 ms - 300, # Duration: 5 minutes - monitoring_v3.ComparisonType.COMPARISON_GT - ) - def main(): parser = argparse.ArgumentParser(description='Setup Vertex AI monitoring, drift detection, and rollback with retraining') parser.add_argument('--project_id', required=True, help='GCP Project ID') @@ -552,27 +603,57 @@ def main(): parser.add_argument('--time_window', type=int, default=86400, help='Time window in seconds to monitor for degradation (default is 24 hours)') parser.add_argument('--drift_threshold', type=float, default=0.05, help='Data drift threshold for retraining') parser.add_argument('--gcs_input', required=True, help='GCS path to input data for retraining') - parser.add_argument('--pipeline_name', required=True, help='Name of the Vertex AI pipeline for retraining') + parser.add_argument('--pipeline_name', required=True, help='Path to the Vertex AI pipeline template for retraining') parser.add_argument('--notification_channel', required=True, help='Notification channel ID (for alerts)') parser.add_argument('--bucket_name', required=True, help='Cloud Storage bucket name') parser.add_argument('--schema_version', required=True, help='Schema version for validation') parser.add_argument('--sampling_rate', type=float, default=0.1, help='Sampling rate for request/response logging') parser.add_argument('--feature_store_id', required=True, help='Vertex AI Feature Store ID') parser.add_argument('--entity_type_id', required=True, help='Entity Type ID in the Feature Store') + parser.add_argument('--mlmd_host', required=True, help='MLMD PostgreSQL host') + parser.add_argument('--mlmd_port', type=int, default=5432, help='MLMD PostgreSQL port') + parser.add_argument('--mlmd_database', required=True, help='MLMD PostgreSQL database name') + parser.add_argument('--mlmd_user', required=True, help='MLMD PostgreSQL username') + parser.add_argument('--mlmd_password', required=True, help='MLMD PostgreSQL password') args = parser.parse_args() - monitor = VertexAIMonitoring(args.project_id, args.model_name, args.bucket_name) - + monitor = VertexAIMonitoring( + project_id=args.project_id, + model_name=args.model_name, + bucket_name=args.bucket_name, + mlmd_host=args.mlmd_host, + mlmd_port=args.mlmd_port, + mlmd_database=args.mlmd_database, + mlmd_user=args.mlmd_user, + mlmd_password=args.mlmd_password + ) + monitor.setup_custom_metrics() - monitor.create_alert_policy("Data Drift Alert", f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/data_drift"', 0.1, 300, 
monitoring_v3.ComparisonType.COMPARISON_GT)
-    monitor.create_alert_policy("Prediction Drift Alert", f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/prediction_drift"', 0.1, 300, monitoring_v3.ComparisonType.COMPARISON_GT)
-    monitor.create_resource_utilization_alert()
-    monitor.create_alert_policy("Prediction Latency Alert", f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/prediction_latency"', 1000, 60, monitoring_v3.ComparisonType.COMPARISON_GT)
-    monitor.create_alert_policy("Schema Drift Alert", f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/schema_drift"', 1, 300, monitoring_v3.ComparisonType.COMPARISON_GT)
-    monitor.create_accuracy_degradation_alert(args.absolute_threshold, args.degradation_rate_threshold, args.time_window)
-    monitor.setup_feature_store_monitoring(args.feature_store_id, args.entity_type_id)
-    monitor.create_feature_store_alerts(args.feature_store_id, args.entity_type_id)
+    monitor.create_alert_policy("Data Drift Alert",
+                                f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/data_drift"',
+                                0.1, 300,
+                                monitoring_v3.ComparisonType.COMPARISON_GT,
+                                args.notification_channel)
+    monitor.create_alert_policy("Prediction Drift Alert",
+                                f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/prediction_drift"',
+                                0.1, 300,
+                                monitoring_v3.ComparisonType.COMPARISON_GT,
+                                args.notification_channel)
+    monitor.create_resource_utilization_alert(args.notification_channel)
+    monitor.create_alert_policy("Prediction Latency Alert",
+                                f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/prediction_latency"',
+                                1000, 60,
+                                monitoring_v3.ComparisonType.COMPARISON_GT,
+                                args.notification_channel)
+    monitor.create_alert_policy("Schema Drift Alert",
+                                f'metric.type="custom.googleapis.com/vertex_ai/{args.model_name}/schema_drift"',
+                                1, 300,
+                                monitoring_v3.ComparisonType.COMPARISON_GT,
+                                args.notification_channel)
+    monitor.create_accuracy_degradation_alert(args.absolute_threshold, args.degradation_rate_threshold, args.time_window, args.notification_channel)
+    monitor.create_feature_store_alerts(args.feature_store_id, args.entity_type_id, args.notification_channel)
     monitor.monitor_feature_store(args.feature_store_id, args.entity_type_id)
+    monitor.create_rollback_alert_policy(args.notification_channel)
 
     existing_stats, existing_schema = check_existing_statistics_and_schema(args.project_id, args.model_name, args.bucket_name, args.schema_version)
     current_stats, anomalies = compute_and_store_statistics(args.project_id, args.model_name, args.bucket_name, existing_schema)
@@ -585,10 +666,12 @@ def main():
 
     monitor.monitor_and_trigger_retraining(
         accuracy_threshold=args.absolute_threshold,
+        degradation_rate_threshold=args.degradation_rate_threshold,
         drift_threshold=args.drift_threshold,
         gcs_input=args.gcs_input,
         pipeline_name=args.pipeline_name,
-        notification_channel=args.notification_channel
+        notification_channel_id=args.notification_channel,
+        time_window_seconds=args.time_window
     )
 
     logger.info("Vertex AI monitoring, drift detection, rollback, and retraining setup completed successfully!")
diff --git a/deployment/vertex_ai/vertex_deployment.py b/deployment/vertex_ai/vertex_deployment.py
index 7b3f7ea..1f4e5bb 100644
--- a/deployment/vertex_ai/vertex_deployment.py
+++ b/deployment/vertex_ai/vertex_deployment.py
@@ -3,161 +3,172 @@
 import base64
 import os
 from 
typing import Tuple -from google.cloud import aiplatform, pubsub_v1 -from google.cloud.devtools import cloudbuild_v1 -from google.protobuf import duration_pb2 -from google.cloud import run_v2 +from google.cloud import aiplatform, pubsub_v1, cloudbuild_v1, functions_v1, run_v2, firestore +from google.protobuf import field_mask_pb2 from src.utils.logging_utils import setup_logger, log_error, log_step -from vertex_ai_monitoring import monitor_and_log_rollbacks, monitor_and_trigger_retraining logger = setup_logger('vertex_ai_deployment') -# Set your cooldown period (e.g., 5 minutes = 300 seconds) -COOLDOWN_PERIOD = 300 # Cooldown period in seconds - -# Global variable to store the last build trigger timestamp -LAST_TRIGGER_TIME = 0 - def deploy_to_vertex_ai(project_id: str, model_path: str, endpoint_name: str, model_name: str, canary_traffic_percent: int = 10) -> Tuple[str, str]: """ - Deploy the model to Vertex AI using a canary deployment strategy, checking if an existing model is already deployed. + Deploy the model to Vertex AI using a canary deployment strategy. If the new model fails, allow traffic rollback to the existing model. """ try: log_step(logger, 'Model Deployment to Vertex AI', 'Serving') - + # Initialize Vertex AI aiplatform.init(project=project_id) - + # Upload the model to Vertex AI + logger.debug(f"Uploading model '{model_name}' from path '{model_path}'") model = aiplatform.Model.upload( display_name=model_name, artifact_uri=model_path, serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-6:latest" ) - + model.wait() + logger.info(f"Model '{model_name}' uploaded successfully.") + # Retrieve the endpoint or create a new one if it doesn't exist endpoints = aiplatform.Endpoint.list(filter=f'display_name="{endpoint_name}"') if not endpoints: + logger.info(f"Creating new endpoint '{endpoint_name}'") endpoint = aiplatform.Endpoint.create(display_name=endpoint_name) - traffic_split = {model.resource_name: 100} # 100% traffic to the new model since no model exists + endpoint.wait() + traffic_split = {model.resource_name: 100} # 100% traffic to the new model logger.info("No existing models. Deploying new model with 100% traffic.") else: endpoint = endpoints[0] - + logger.info(f"Using existing endpoint '{endpoint.display_name}'") + # Check if there is an existing model deployed if endpoint.traffic_split: current_model_id = list(endpoint.traffic_split.keys())[0] # Assume single model in the endpoint - previous_traffic_split = endpoint.traffic_split # Save the current traffic split for rollback - - # Apply the canary strategy: split traffic between the existing model and the new model + previous_traffic_split = endpoint.traffic_split.copy() # Save the current traffic split for rollback + + # Apply the canary strategy traffic_split = { current_model_id: 100 - canary_traffic_percent, model.resource_name: canary_traffic_percent } logger.info(f"Canary deployment: {100 - canary_traffic_percent}% to the current model, {canary_traffic_percent}% to the new model.") else: - # No models currently deployed, assign 100% traffic to the new model + # No models currently deployed traffic_split = {model.resource_name: 100} logger.info("No existing traffic split found. 
Assigning 100% traffic to the new model.") # Deploy the model to the endpoint with the traffic split - model_deployment = model.deploy( + logger.debug("Deploying model to endpoint with specified traffic split.") + model.deploy( endpoint=endpoint, machine_type="n1-standard-2", traffic_split=traffic_split, min_replica_count=1, - max_replica_count=3, - accelerator_type=None, - accelerator_count=None, - accelerator_config=None + max_replica_count=3 ) - logger.info(f"Model deployed to Vertex AI endpoint: {endpoint.resource_name}") - # Use the consolidated function to monitor traffic split and set up rollback alerts - monitor_and_log_rollbacks(project_id, endpoint_name) - - # After model deployment, monitor and trigger retraining if necessary - monitor_and_trigger_retraining( - project_id=project_id, - model_name=model_name, - accuracy_threshold=0.85, - drift_threshold=0.05, - gcs_input='gs://your_project/data/', - pipeline_name='your_pipeline_name', - notification_channel='your_notification_channel' - ) - return (endpoint.resource_name, model.resource_name) - + except Exception as e: log_error(logger, e, 'Model Deployment to Vertex AI') - + # Rollback traffic to the previous model if deployment fails - if endpoint and previous_traffic_split: - endpoint.deploy(traffic_split=previous_traffic_split) + if 'endpoint' in locals() and 'previous_traffic_split' in locals(): + endpoint.update_traffic_split(traffic_split=previous_traffic_split) logger.info("Rolled back traffic to the previous model due to deployment failure.") else: logger.error("No previous traffic split available for rollback.") raise - def setup_cloud_build_trigger(project_id: str, repo_name: str, branch_name: str, storage_bucket: str = None): """ - Set up a Cloud Build trigger for continuous training with Pub/Sub and Cloud Function for cooldown. - The trigger can monitor both code changes in the repository and new data in a Cloud Storage bucket. - - :param project_id: GCP Project ID - :param repo_name: Name of the GitHub repository - :param branch_name: Branch to monitor for changes (e.g., 'main') - :param storage_bucket: Optional. Cloud Storage bucket to monitor for new data + Set up a Cloud Build trigger for continuous training. 
""" - client = cloudbuild_v1.CloudBuildClient() - - # Create the Cloud Build trigger - trigger = cloudbuild_v1.BuildTrigger( - name=f"{repo_name}-{branch_name}-trigger", - github=cloudbuild_v1.GitHubEventsConfig( - owner="your-github-username", - name=repo_name, - push=cloudbuild_v1.PushFilter( - branch=branch_name, - included_paths=["pipeline/**", "model/**", "configs/**", "deployment/vertex_ai/**", "src/**", "data/**", "kubeflow/**"], - ignored_paths=["README.md", "docs/**", "*.md"] # Exempt non-critical changes + try: + log_step(logger, 'Setting up Cloud Build Trigger', 'CI/CD Pipeline') + + client = cloudbuild_v1.CloudBuildClient() + + trigger = cloudbuild_v1.BuildTrigger( + name=f"{repo_name}-{branch_name}-trigger", + github=cloudbuild_v1.GitHubEventsConfig( + owner="your-github-username", + name=repo_name, + push=cloudbuild_v1.PushFilter( + branch=f'^{branch_name}$' # Use regex for exact match + ) + ), + filename="cloudbuild.yaml", + included_files=["pipeline/**", "model/**", "configs/**", "deployment/vertex_ai/**", "src/**", "data/**", "kubeflow/**"], + ignored_files=["README.md", "docs/**", "*.md"] # Exempt non-critical changes + ) + + # Optional: Monitor for new data ingestion in the Cloud Storage bucket + if storage_bucket: + trigger.pubsub_config = cloudbuild_v1.PubsubConfig( + topic=f"projects/{project_id}/topics/{storage_bucket}-trigger" ) - ), - filename="cloudbuild.yaml" - ) - - # Optional: Monitor for new data ingestion in the Cloud Storage bucket - if storage_bucket: - trigger.pubsub_config = cloudbuild_v1.PubsubConfig( - topic=f"projects/{project_id}/topics/{storage_bucket}-trigger", - subscription=f"projects/{project_id}/subscriptions/{storage_bucket}-trigger-sub" + + # Create the Cloud Build trigger + parent = f"projects/{project_id}/locations/global" + trigger_response = client.create_build_trigger(parent=parent, trigger=trigger) + logger.info(f"Cloud Build trigger created: {trigger_response.name}") + + # Set up build notifications + notification_config = cloudbuild_v1.NotificationConfig( + filter="build.status in (SUCCESS, FAILURE, INTERNAL_ERROR, TIMEOUT)", + pubsub_topic=f"projects/{project_id}/topics/cloud-builds" ) - # Create the Cloud Build trigger - trigger_response = client.create_build_trigger(parent=f"projects/{project_id}", trigger=trigger) - - print(f"Cloud Build trigger created: {trigger_response.name}") + # Update the trigger with notification config + trigger_response.trigger.notification_config = notification_config + update_mask = field_mask_pb2.FieldMask(paths=['notification_config']) + client.update_build_trigger( + trigger=trigger_response.trigger, + update_mask=update_mask + ) + logger.info("Build status notifications set up for successful and failed builds.") - # Set up build notifications - notification_config = cloudbuild_v1.NotificationConfig( - filter="build.status in (SUCCESS, FAILURE, INTERNAL_ERROR, TIMEOUT)", - pubsub_topic=f"projects/{project_id}/topics/cloud-builds" - ) + return trigger_response.trigger + + except Exception as e: + log_error(logger, e, 'Setting up Cloud Build Trigger') + raise - # Update the trigger with notification config - trigger_response.notification_config = notification_config - client.update_build_trigger( - project_id=project_id, - trigger_id=trigger_response.id, - trigger=trigger_response - ) +def trigger_cloud_build(): + """ + Function to trigger a Cloud Build job using the Cloud Build API. 
+ """ + try: + log_step(logger, 'Triggering Cloud Build Job', 'CI/CD Pipeline') + + client = cloudbuild_v1.CloudBuildClient() + + project_id = os.environ.get('PROJECT_ID') + trigger_id = os.environ.get('TRIGGER_ID') + model_name = os.environ.get('MODEL_NAME', 'default_model_name') + endpoint_name = os.environ.get('ENDPOINT_NAME', 'default_endpoint_name') + + if not project_id or not trigger_id: + raise ValueError("Environment variables 'PROJECT_ID' and 'TRIGGER_ID' must be set.") + + # Trigger the Cloud Build job + operation = client.run_build_trigger( + project_id=project_id, + trigger_id=trigger_id, + source=cloudbuild_v1.RepoSource() + ) + + build = operation.result() + logger.info(f"Triggered Cloud Build job for trigger ID: {trigger_id}, Model: {model_name}, Endpoint: {endpoint_name}") + logger.info(f"Build Status: {build.status.name}") + + except Exception as e: + log_error(logger, e, 'Triggering Cloud Build Job') + raise - print("Build status notifications set up for successful and failed builds.") - return trigger_response def cloud_build_trigger(event, context): """ @@ -165,82 +176,123 @@ def cloud_build_trigger(event, context): It handles the Pub/Sub event and ensures that builds are not triggered more frequently than the specified cooldown period. """ - global LAST_TRIGGER_TIME - current_time = time.time() - - # Decode the Pub/Sub message (if it's base64-encoded) - if 'data' in event: - data = base64.b64decode(event['data']).decode('utf-8') - print(f"Received message: {data}") - - # Check if the cooldown period has passed - if current_time - LAST_TRIGGER_TIME < COOLDOWN_PERIOD: - print("Cooldown period not over. Skipping build trigger.") - return + try: + log_step(logger, 'Cloud Build Trigger Function Invoked', 'Cloud Function') + + # Initialize Firestore client + firestore_client = firestore.Client() + cooldown_collection = firestore_client.collection('cloud_build_cooldown') + cooldown_doc = cooldown_collection.document('last_trigger_time') + + current_time = time.time() + cooldown_period = int(os.environ.get('COOLDOWN_PERIOD', 300)) # Default to 300 seconds if not set + + @firestore.transactional + def update_last_trigger_time(transaction): + doc = cooldown_doc.get(transaction=transaction) + if doc.exists: + last_trigger_time = doc.to_dict().get('timestamp') + if (current_time - last_trigger_time) < cooldown_period: + logger.info("Cooldown period not over. Skipping build trigger.") + return False + # Update the last trigger time + transaction.set(cooldown_doc, {'timestamp': current_time}) + return True + + transaction = firestore_client.transaction() + should_trigger = update_last_trigger_time(transaction) + if not should_trigger: + return + + # Trigger the Cloud Build job since the cooldown period has passed + trigger_cloud_build() + logger.info("Cloud Build job triggered successfully.") - # Trigger the Cloud Build job since the cooldown period has passed - trigger_cloud_build() + except Exception as e: + log_error(logger, e, 'Cloud Build Trigger Function') + raise - # Update the last trigger time - LAST_TRIGGER_TIME = current_time -def trigger_cloud_build(): +def deploy_cloud_function(project_id, region, function_name, entry_point, runtime, trigger_topic, env_vars): """ - Function to trigger a Cloud Build job using the Cloud Build API. + Deploy a Cloud Function using the client library. 
""" - client = cloudbuild_v1.CloudBuildClient() - - project_id = os.environ.get('PROJECT_ID') - trigger_id = os.environ.get('TRIGGER_ID') - model_name = os.environ.get('MODEL_NAME') - endpoint_name = os.environ.get('ENDPOINT_NAME') + try: + log_step(logger, 'Deploying Cloud Function', 'Deployment') + + client = functions_v1.CloudFunctionsServiceClient() + parent = f'projects/{project_id}/locations/{region}' + + # Prepare the Cloud Function source code zip file + # Assuming the function code is in a directory named 'cloud_function_code' + source_archive_url = f'gs://{project_id}-cloud-functions/{function_name}.zip' + # You need to upload the zip file to the specified GCS bucket + + function = functions_v1.CloudFunction( + name=f'{parent}/functions/{function_name}', + entry_point=entry_point, + runtime=runtime, + environment_variables=env_vars, + event_trigger=functions_v1.EventTrigger( + event_type='google.pubsub.topic.publish', + resource=f'projects/{project_id}/topics/{trigger_topic}', + retry_policy=functions_v1.EventTrigger.RetryPolicy.RETRY_POLICY_RETRY + ), + source_archive_url=source_archive_url, + service_account_email=f'{project_id}@appspot.gserviceaccount.com', + ingress_settings=functions_v1.CloudFunction.IngressSettings.ALLOW_ALL + ) - # Trigger the Cloud Build job using the build trigger ID - build = cloudbuild_v1.BuildTrigger( - project_id=project_id, - trigger_id=trigger_id - ) + operation = client.create_function(request={'location': parent, 'function': function}) + response = operation.result() - # Run the build - client.run_build_trigger(project_id=project_id, trigger_id=trigger_id, source=None) - print(f"Triggered Cloud Build job for trigger ID: {trigger_id}, Model: {model_name}, Endpoint: {endpoint_name}") + if response.status == functions_v1.CloudFunctionStatus.ACTIVE: + logger.info(f"Cloud Function '{function_name}' deployed successfully.") + else: + logger.error(f"Cloud Function '{function_name}' deployment failed with status: {response.status}") + except Exception as e: + log_error(logger, e, 'Deploying Cloud Function') + raise def setup_cloud_run(project_id, service_name, image_url, region): - client = run_v2.ServicesClient() - - service = run_v2.Service() - service.template = run_v2.RevisionTemplate() - service.template.containers = [ - run_v2.Container( - image=image_url, - env=[{"name": "ENV_VAR", "value": "production"}], # Optional env vars - resources=run_v2.ResourceRequirements( # Optional resource settings - limits={"cpu": "1", "memory": "512Mi"} + """ + Set up a Cloud Run service. 
+ """ + try: + log_step(logger, 'Setting up Cloud Run Service', 'Deployment') + + client = run_v2.ServicesClient() + + service = run_v2.Service() + service.template = run_v2.RevisionTemplate() + service.template.containers = [ + run_v2.Container( + image=image_url, + env_vars=[run_v2.EnvVar(name="ENV_VAR", value="production")], # Optional env vars + resources=run_v2.ResourceRequirements( # Optional resource settings + limits={"cpu": "1", "memory": "512Mi"} + ) ) - ) - ] - - parent = client.common_location_path(project_id, region) - response = client.create_service( - parent=parent, - service=service, - service_id=service_name - ) - - print(f"Cloud Run service created: {response.name}") - return response + ] + parent = client.common_location_path(project_id, region) + response = client.create_service( + parent=parent, + service=service, + service_id=service_name + ) + logger.info(f"Cloud Run service created: {response.name}") + return response -if __name__ == '__main__': - import argparse - from google.cloud import pubsub_v1 - import os + except Exception as e: + log_error(logger, e, 'Setting up Cloud Run Service') + raise - # Parse arguments for Vertex AI and Cloud Build setup +def main(): parser = argparse.ArgumentParser(description='Deploy to Vertex AI, set up Cloud Build triggers, and configure CI/CD with Cloud Run and Pub/Sub for continuous training') parser.add_argument('--project_id', required=True, help='GCP Project ID') - parser.add_argument('--model_name', required=True, help='Name of the machine learning model') # Model name argument added + parser.add_argument('--model_name', required=True, help='Name of the machine learning model') parser.add_argument('--model_path', required=True, help='Path to the model artifacts') parser.add_argument('--endpoint_name', required=True, help='Name for the Vertex AI endpoint') parser.add_argument('--repo_name', required=True, help='GitHub repository name') @@ -248,58 +300,83 @@ def setup_cloud_run(project_id, service_name, image_url, region): parser.add_argument('--service_name', required=True, help='Cloud Run service name') parser.add_argument('--image_url', required=True, help='Docker image URL for Cloud Run') parser.add_argument('--region', required=True, help='GCP region for deployment') - parser.add_argument('--storage_bucket', required=True, help='Cloud Storage bucket to monitor for new data') - parser.add_argument('--trigger_id', required=True, help='Cloud Build trigger ID for retraining jobs') + parser.add_argument('--storage_bucket', required=False, help='Cloud Storage bucket to monitor for new data') parser.add_argument('--cooldown_period', type=int, default=300, help='Cooldown period in seconds between Cloud Build jobs') + parser.add_argument('--trigger_id', required=True, help='Cloud Build trigger ID for retraining jobs') parser.add_argument('--notification_channel', required=True, help='Notification channel ID for build status notifications') parser.add_argument('--canary_traffic_percent', type=int, default=10, help='Canary traffic split percentage') args = parser.parse_args() - # Step 1: Deploy the model to Vertex AI - print(f"Deploying model '{args.model_name}' to Vertex AI...") - endpoint_name, model_name = deploy_to_vertex_ai( - project_id=args.project_id, - model_path=args.model_path, - endpoint_name=args.endpoint_name, - model_name=args.model_name, - canary_traffic_percent=args.canary_traffic_percent - ) - - # Step 2: Set up Cloud Build trigger - print("Setting up Cloud Build trigger for continuous training...") - 
trigger_response = setup_cloud_build_trigger( - project_id=args.project_id, - repo_name=args.repo_name, - branch_name=args.branch_name, - storage_bucket=args.storage_bucket - ) - - # Step 3: Deploy the Cloud Function for cooldown (Pub/Sub) - print("Setting up Pub/Sub topic and deploying Cloud Function for cooldown mechanism...") - - # Ensure the Pub/Sub topic exists - pubsub_client = pubsub_v1.PublisherClient() - topic_path = pubsub_client.topic_path(args.project_id, 'cloud-build-trigger') - pubsub_client.create_topic(name=topic_path) - - # Deploy Cloud Function for cooldown - os.system(f"gcloud functions deploy cloud_build_trigger --runtime python39 " - f"--trigger-topic cloud-build-trigger " - f"--set-env-vars PROJECT_ID={args.project_id},TRIGGER_ID={args.trigger_id} " - f"--memory=128MB --timeout=300s") - - # Step 4: Set up Cloud Run service for deployment - print("Setting up Cloud Run service for deployment...") - service_response = setup_cloud_run( - project_id=args.project_id, - service_name=args.service_name, - image_url=args.image_url, - region=args.region - ) - - # Output results - print(f"Deployment to Vertex AI completed. Endpoint: {endpoint_name}, Model: {model_name}") - print(f"Cloud Build trigger '{trigger_response.name}' created.") - print(f"Cloud Run service '{service_response.name}' created.") - print("MLOps pipeline with Cloud Build, Pub/Sub, Cloud Function cooldown, and Cloud Run setup completed successfully.") + try: + # Step 1: Deploy the model to Vertex AI + logger.info(f"Deploying model '{args.model_name}' to Vertex AI...") + endpoint_resource_name, model_resource_name = deploy_to_vertex_ai( + project_id=args.project_id, + model_path=args.model_path, + endpoint_name=args.endpoint_name, + model_name=args.model_name, + canary_traffic_percent=args.canary_traffic_percent + ) + + # Step 2: Set up Cloud Build trigger + logger.info("Setting up Cloud Build trigger for continuous training...") + trigger_response = setup_cloud_build_trigger( + project_id=args.project_id, + repo_name=args.repo_name, + branch_name=args.branch_name, + storage_bucket=args.storage_bucket + ) + + # Step 3: Deploy the Cloud Function for cooldown (Pub/Sub) + logger.info("Setting up Pub/Sub topic and deploying Cloud Function for cooldown mechanism...") + + # Ensure the Pub/Sub topic exists + pubsub_client = pubsub_v1.PublisherClient() + topic_path = pubsub_client.topic_path(args.project_id, 'cloud-build-trigger') + try: + pubsub_client.get_topic(request={"topic": topic_path}) + logger.info(f"Pub/Sub topic '{topic_path}' already exists.") + except pubsub_client.exceptions.NotFound: + pubsub_client.create_topic(name=topic_path) + logger.info(f"Created Pub/Sub topic '{topic_path}'.") + + # Deploy Cloud Function for cooldown + deploy_cloud_function( + project_id=args.project_id, + region=args.region, + function_name='cloud_build_trigger', + entry_point='cloud_build_trigger', + runtime='python39', + trigger_topic='cloud-build-trigger', + env_vars={ + 'PROJECT_ID': args.project_id, + 'TRIGGER_ID': args.trigger_id, + 'COOLDOWN_PERIOD': str(args.cooldown_period) + } + ) + + # Step 4: Set up Cloud Run service for deployment + logger.info("Setting up Cloud Run service for deployment...") + service_response = setup_cloud_run( + project_id=args.project_id, + service_name=args.service_name, + image_url=args.image_url, + region=args.region + ) + + # Output results + logger.info(f"Deployment to Vertex AI completed. 
Endpoint: {endpoint_resource_name}, Model: {model_resource_name}") + logger.info(f"Cloud Build trigger '{trigger_response.name}' created.") + logger.info(f"Cloud Run service '{service_response.name}' created.") + logger.info("MLOps pipeline with Cloud Build, Pub/Sub, Cloud Function cooldown, and Cloud Run setup completed successfully.") + + except Exception as e: + log_error(logger, e, 'Main Execution') + raise + + +if __name__ == '__main__': + main() + + diff --git a/kubeflow/components/preprocess/preprocess.py b/kubeflow/components/preprocess/preprocess.py index 6b7966d..8c1b83e 100644 --- a/kubeflow/components/preprocess/preprocess.py +++ b/kubeflow/components/preprocess/preprocess.py @@ -5,7 +5,7 @@ Dataset, ) from typing import NamedTuple -from src.data_processing.data_preprocess import prepare_data +from src.data_processing.data_prep import prepare_data from src.utils.logging_utils import setup_logger, log_error, log_step logger = setup_logger('kubeflow_preprocess') diff --git a/src/utils/logging_utils.py b/src/utils/logging_utils.py index 5f277e1..4bf987c 100644 --- a/src/utils/logging_utils.py +++ b/src/utils/logging_utils.py @@ -7,18 +7,18 @@ def setup_logger(name, log_level=logging.INFO): logger = logging.getLogger(name) logger.setLevel(log_level) - # Check if running on GCP - if os.getenv('KUBERNETES_SERVICE_HOST'): - # Use Google Cloud Logging - client = cloud_logging.Client() - handler = cloud_logging.handlers.CloudLoggingHandler(client) - else: - # Use local file logging - handler = logging.FileHandler(f"{name}.log") - - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - handler.setFormatter(formatter) - logger.addHandler(handler) + if not logger.handlers: + # Check if running on GCP + if os.getenv('KUBERNETES_SERVICE_HOST'): + # Use Google Cloud Logging + client = cloud_logging.Client() + handler = cloud_logging.handlers.CloudLoggingHandler(client) + else: + # Use console logging + handler = logging.StreamHandler() + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) return logger @@ -32,4 +32,4 @@ def log_step(logger, step, component): def log_metric(logger, metric_name, metric_value, component): """Log a metric.""" - logger.info(f"Metric in {component}: {metric_name} = {metric_value}") \ No newline at end of file + logger.info(f"Metric in {component}: {metric_name} = {metric_value}")
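The drift-detection hunks in this patch reduce to a two-sample Kolmogorov-Smirnov test on prediction values. A minimal, self-contained sketch of that check follows; it is illustrative only and not part of the patched files. The synthetic NumPy samples stand in for the values extracted from the TFDV statistics, and the 0.05 threshold mirrors the script's default drift threshold.

import numpy as np
from scipy.stats import ks_2samp

def ks_drift(train_values, serving_values, drift_threshold: float = 0.05):
    """Return (statistic, drift_detected) for two samples of predictions."""
    statistic, p_value = ks_2samp(np.asarray(train_values), np.asarray(serving_values))
    return statistic, statistic > drift_threshold

# Illustrative usage with synthetic, slightly shifted serving data:
rng = np.random.default_rng(0)
stat, drifted = ks_drift(rng.normal(0.0, 1.0, 1000), rng.normal(0.3, 1.0, 1000))
print(f"KS statistic={stat:.3f}, drift detected={drifted}")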
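log_metric and the new log_feature_store_metric write gauge points to custom Cloud Monitoring metrics. The following is a stripped-down sketch of that call pattern with google-cloud-monitoring, assuming a hypothetical project ID and using the simpler "global" monitored resource rather than the Featurestore resource used in the patch.

import time
from google.cloud import monitoring_v3

def write_custom_metric(project_id: str, model_name: str, metric_name: str, value: float) -> None:
    """Write a single gauge point to a custom metric under the model's namespace."""
    client = monitoring_v3.MetricServiceClient()
    series = monitoring_v3.TimeSeries()
    series.metric.type = f"custom.googleapis.com/vertex_ai/{model_name}/{metric_name}"
    series.resource.type = "global"
    series.resource.labels["project_id"] = project_id
    now = time.time()
    interval = monitoring_v3.TimeInterval(
        {"end_time": {"seconds": int(now), "nanos": int((now - int(now)) * 1e9)}}
    )
    point = monitoring_v3.Point({"interval": interval, "value": {"double_value": value}})
    series.points = [point]
    client.create_time_series(name=f"projects/{project_id}", time_series=[series])

# write_custom_metric("my-project", "my-model", "prediction_drift", 0.12)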
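deploy_to_vertex_ai expresses the canary rollout as a plain traffic-split dict keyed by deployed model ID. A small sketch of that computation, with hypothetical model IDs:

from typing import Dict

def canary_traffic_split(current_model_id: str, new_model_id: str,
                         canary_traffic_percent: int = 10) -> Dict[str, int]:
    """Route most traffic to the current model and a small share to the new one."""
    if not 0 < canary_traffic_percent <= 100:
        raise ValueError("canary_traffic_percent must be in (0, 100]")
    if canary_traffic_percent == 100:
        return {new_model_id: 100}
    return {
        current_model_id: 100 - canary_traffic_percent,
        new_model_id: canary_traffic_percent,
    }

# canary_traffic_split("1234567890", "projects/p/locations/l/models/new", 10)
# -> {'1234567890': 90, 'projects/p/locations/l/models/new': 10}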
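The rewritten cloud_build_trigger function replaces the module-level cooldown global with a Firestore transaction. A rough sketch of that gate, assuming the same cloud_build_cooldown/last_trigger_time document used in the patch:

import time
from google.cloud import firestore

def should_trigger_build(cooldown_period: int = 300) -> bool:
    """Return True if the cooldown window has elapsed, updating the timestamp atomically."""
    client = firestore.Client()
    doc_ref = client.collection("cloud_build_cooldown").document("last_trigger_time")

    @firestore.transactional
    def check_and_update(transaction) -> bool:
        snapshot = doc_ref.get(transaction=transaction)
        now = time.time()
        if snapshot.exists:
            last = snapshot.to_dict().get("timestamp", 0)
            if now - last < cooldown_period:
                return False  # still cooling down, skip this trigger
        transaction.set(doc_ref, {"timestamp": now})
        return True

    return check_and_update(client.transaction())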