diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fccbffa..a88e69c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ repos: - id: check-yaml - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.3.0 + rev: v0.3.2 hooks: - id: ruff args: diff --git a/README.md b/README.md index fe46c60..162a835 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ * 🐳 Automatically build Custom Prediction Routines (CPR) for Hugging Face Hub models using `transformers.pipeline` * 📦 Everything is packaged within a single method, providing more flexibility and ease of usage than the former `google-cloud-aiplatform` SDK for custom models * 🔌 Seamless integration for running inference on top of any model from the Hugging Face Hub in Vertex AI thanks to `transformers` +* 🌅 Support for `diffusers` models too! * 🔍 Includes custom `logging` messages for better monitoring and debugging via Google Cloud Logging ## Get started @@ -23,13 +24,13 @@ gcloud auth login Then install `vertex-ai-huggingface-inference-toolkit` via `pip install`: ```bash -pip install vertex-ai-huggingface-inference-toolkit>=0.1.0 +pip install vertex-ai-huggingface-inference-toolkit>=0.0.2 ``` Or via `uv pip install` for faster installations using [`uv`](https://astral.sh/blog/uv): ```bash -uv pip install vertex-ai-huggingface-inference-toolkit>=0.1.0 +uv pip install vertex-ai-huggingface-inference-toolkit>=0.0.2 ``` ## Example diff --git a/docs/index.md b/docs/index.md index fe46c60..162a835 100644 --- a/docs/index.md +++ b/docs/index.md @@ -9,6 +9,7 @@ * 🐳 Automatically build Custom Prediction Routines (CPR) for Hugging Face Hub models using `transformers.pipeline` * 📦 Everything is packaged within a single method, providing more flexibility and ease of usage than the former `google-cloud-aiplatform` SDK for custom models * 🔌 Seamless integration for running inference on top of any model from the Hugging Face Hub in Vertex AI thanks to `transformers` +* 🌅 Support for `diffusers` models too! 
* 🔍 Includes custom `logging` messages for better monitoring and debugging via Google Cloud Logging ## Get started @@ -23,13 +24,13 @@ gcloud auth login Then install `vertex-ai-huggingface-inference-toolkit` via `pip install`: ```bash -pip install vertex-ai-huggingface-inference-toolkit>=0.1.0 +pip install vertex-ai-huggingface-inference-toolkit>=0.0.2 ``` Or via `uv pip install` for faster installations using [`uv`](https://astral.sh/blog/uv): ```bash -uv pip install vertex-ai-huggingface-inference-toolkit>=0.1.0 +uv pip install vertex-ai-huggingface-inference-toolkit>=0.0.2 ``` ## Example diff --git a/pyproject.toml b/pyproject.toml index cce6039..b3e72b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ path = "src/vertex_ai_huggingface_inference_toolkit/__init__.py" [project.optional-dependencies] transformers = ["accelerate", "transformers"] +diffusers = ["accelerate", "diffusers"] docs = [ "mkdocs", "mkdocs-material", diff --git a/src/vertex_ai_huggingface_inference_toolkit/__init__.py b/src/vertex_ai_huggingface_inference_toolkit/__init__.py index eaaee06..9d5a06c 100644 --- a/src/vertex_ai_huggingface_inference_toolkit/__init__.py +++ b/src/vertex_ai_huggingface_inference_toolkit/__init__.py @@ -3,6 +3,7 @@ __author__ = "Alvaro Bartolome " __version__ = "0.0.2" +from vertex_ai_huggingface_inference_toolkit.diffusers import DiffusersModel from vertex_ai_huggingface_inference_toolkit.transformers import TransformersModel -__all__ = ["TransformersModel"] +__all__ = ["DiffusersModel", "TransformersModel"] diff --git a/src/vertex_ai_huggingface_inference_toolkit/_internal/diffusers/dockerfiles/Dockerfile.cpu b/src/vertex_ai_huggingface_inference_toolkit/_internal/diffusers/dockerfiles/Dockerfile.cpu new file mode 100644 index 0000000..7acf939 --- /dev/null +++ b/src/vertex_ai_huggingface_inference_toolkit/_internal/diffusers/dockerfiles/Dockerfile.cpu @@ -0,0 +1,33 @@ +ARG PYTHON_VERSION="3.10" +FROM python:${PYTHON_VERSION}-slim AS build +LABEL maintainer="Alvaro Bartolome" + +ARG DEBIAN_FRONTEND=noninteractive +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONBUFFERED=1 + +RUN mkdir -m 777 -p /usr/app /home +WORKDIR /usr/app +ENV HOME=/home + +RUN python -m pip install --no-cache-dir --upgrade pip && \ + python -m pip install --no-cache-dir --force-reinstall "google-cloud-aiplatform[prediction]>=1.27.0" && \ + python -m pip install --no-cache-dir --force-reinstall "vertex_ai_huggingface_inference_toolkit[transformers]>=0.0.2" --upgrade + +ARG FRAMEWORK="torch" +ARG FRAMEWORK_VERSION="2.2.0" +RUN python -m pip install --no-cache-dir ${FRAMEWORK}==${FRAMEWORK_VERSION} + +ARG DIFFUSERS_VERSION="0.27.2" +RUN python -m pip install --no-cache-dir diffusers==${DIFFUSERS_VERSION} + +ARG EXTRA_REQUIREMENTS +RUN if [ -n "${EXTRA_REQUIREMENTS}" ]; then python -m pip install --no-cache-dir --force-reinstall ${EXTRA_REQUIREMENTS}; fi + +ENV HANDLER_MODULE=google.cloud.aiplatform.prediction.handler +ENV HANDLER_CLASS=PredictionHandler +ENV PREDICTOR_MODULE=vertex_ai_huggingface_inference_toolkit.predictors.diffusers +ENV PREDICTOR_CLASS=DiffusersPredictor + +EXPOSE 8080 +ENTRYPOINT ["python", "-m", "google.cloud.aiplatform.prediction.model_server"] diff --git a/src/vertex_ai_huggingface_inference_toolkit/_internal/diffusers/dockerfiles/Dockerfile.gpu b/src/vertex_ai_huggingface_inference_toolkit/_internal/diffusers/dockerfiles/Dockerfile.gpu new file mode 100644 index 0000000..0d92f88 --- /dev/null +++ 
b/src/vertex_ai_huggingface_inference_toolkit/_internal/diffusers/dockerfiles/Dockerfile.gpu @@ -0,0 +1,47 @@ +ARG CUDA_VERSION="12.3.0" +ARG UBUNTU_VERSION="22.04" +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION} AS build +LABEL maintainer="Alvaro Bartolome" + +ARG DEBIAN_FRONTEND=noninteractive +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONBUFFERED=1 + +RUN mkdir -m 777 -p /usr/app /home +WORKDIR /usr/app +ENV HOME=/home + +ARG PYTHON_VERSION="3.10" +RUN apt-get update && \ + apt-get install software-properties-common --no-install-recommends -y && \ + add-apt-repository ppa:deadsnakes/ppa && \ + apt-get install python${PYTHON_VERSION} python3-pip --no-install-recommends -y && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN ln -s "/usr/bin/python${PYTHON_VERSION}" /usr/bin/python +ENV PYTHON=/usr/bin/python + +RUN python -m pip install --no-cache-dir --upgrade pip && \ + python -m pip install --no-cache-dir --force-reinstall "google-cloud-aiplatform[prediction]>=1.27.0" && \ + python -m pip install --no-cache-dir --force-reinstall "vertex_ai_huggingface_inference_toolkit[transformers]>=0.0.2" --upgrade + +ARG FRAMEWORK="torch" +ARG FRAMEWORK_VERSION="2.2.0" +RUN python -m pip install --no-cache-dir ${FRAMEWORK}==${FRAMEWORK_VERSION} + +ARG DIFFUSERS_VERSION="0.27.2" +RUN python -m pip install --no-cache-dir diffusers==${DIFFUSERS_VERSION} + +ARG EXTRA_REQUIREMENTS +RUN if [ -n "${EXTRA_REQUIREMENTS}" ]; then python -m pip install --no-cache-dir --force-reinstall ${EXTRA_REQUIREMENTS}; fi + +ENV HANDLER_MODULE=google.cloud.aiplatform.prediction.handler +ENV HANDLER_CLASS=PredictionHandler +ENV PREDICTOR_MODULE=vertex_ai_huggingface_inference_toolkit.predictors.diffusers +ENV PREDICTOR_CLASS=DiffusersPredictor + +EXPOSE 8080 +ENTRYPOINT ["python", "-m", "google.cloud.aiplatform.prediction.model_server"] + diff --git a/src/vertex_ai_huggingface_inference_toolkit/_internal/dockerfiles/Dockerfile.cpu b/src/vertex_ai_huggingface_inference_toolkit/_internal/transformers/dockerfiles/Dockerfile.cpu similarity index 97% rename from src/vertex_ai_huggingface_inference_toolkit/_internal/dockerfiles/Dockerfile.cpu rename to src/vertex_ai_huggingface_inference_toolkit/_internal/transformers/dockerfiles/Dockerfile.cpu index 6ef3e73..3ec551b 100644 --- a/src/vertex_ai_huggingface_inference_toolkit/_internal/dockerfiles/Dockerfile.cpu +++ b/src/vertex_ai_huggingface_inference_toolkit/_internal/transformers/dockerfiles/Dockerfile.cpu @@ -18,7 +18,7 @@ ARG FRAMEWORK="torch" ARG FRAMEWORK_VERSION="2.2.0" RUN python -m pip install --no-cache-dir ${FRAMEWORK}==${FRAMEWORK_VERSION} -ARG TRANSFORMERS_VERSION="4.11.3" +ARG TRANSFORMERS_VERSION="4.38.2" RUN python -m pip install --no-cache-dir transformers==${TRANSFORMERS_VERSION} ARG EXTRA_REQUIREMENTS diff --git a/src/vertex_ai_huggingface_inference_toolkit/_internal/dockerfiles/Dockerfile.gpu b/src/vertex_ai_huggingface_inference_toolkit/_internal/transformers/dockerfiles/Dockerfile.gpu similarity index 98% rename from src/vertex_ai_huggingface_inference_toolkit/_internal/dockerfiles/Dockerfile.gpu rename to src/vertex_ai_huggingface_inference_toolkit/_internal/transformers/dockerfiles/Dockerfile.gpu index 7683c10..a5b7c4d 100644 --- a/src/vertex_ai_huggingface_inference_toolkit/_internal/dockerfiles/Dockerfile.gpu +++ b/src/vertex_ai_huggingface_inference_toolkit/_internal/transformers/dockerfiles/Dockerfile.gpu @@ -31,7 +31,7 @@ ARG FRAMEWORK="torch" ARG FRAMEWORK_VERSION="2.2.0" RUN python -m pip install 
--no-cache-dir ${FRAMEWORK}==${FRAMEWORK_VERSION} -ARG TRANSFORMERS_VERSION="4.11.3" +ARG TRANSFORMERS_VERSION="4.38.2" RUN python -m pip install --no-cache-dir transformers==${TRANSFORMERS_VERSION} ARG EXTRA_REQUIREMENTS diff --git a/src/vertex_ai_huggingface_inference_toolkit/_internal/schemas/text-generation/input.yaml b/src/vertex_ai_huggingface_inference_toolkit/_internal/transformers/schemas/text-generation/input.yaml similarity index 100% rename from src/vertex_ai_huggingface_inference_toolkit/_internal/schemas/text-generation/input.yaml rename to src/vertex_ai_huggingface_inference_toolkit/_internal/transformers/schemas/text-generation/input.yaml diff --git a/src/vertex_ai_huggingface_inference_toolkit/_internal/schemas/text-generation/output.yaml b/src/vertex_ai_huggingface_inference_toolkit/_internal/transformers/schemas/text-generation/output.yaml similarity index 100% rename from src/vertex_ai_huggingface_inference_toolkit/_internal/schemas/text-generation/output.yaml rename to src/vertex_ai_huggingface_inference_toolkit/_internal/transformers/schemas/text-generation/output.yaml diff --git a/src/vertex_ai_huggingface_inference_toolkit/_internal/schemas/zero-shot-classification/input.yaml b/src/vertex_ai_huggingface_inference_toolkit/_internal/transformers/schemas/zero-shot-classification/input.yaml similarity index 100% rename from src/vertex_ai_huggingface_inference_toolkit/_internal/schemas/zero-shot-classification/input.yaml rename to src/vertex_ai_huggingface_inference_toolkit/_internal/transformers/schemas/zero-shot-classification/input.yaml diff --git a/src/vertex_ai_huggingface_inference_toolkit/_internal/schemas/zero-shot-classification/output.yaml b/src/vertex_ai_huggingface_inference_toolkit/_internal/transformers/schemas/zero-shot-classification/output.yaml similarity index 100% rename from src/vertex_ai_huggingface_inference_toolkit/_internal/schemas/zero-shot-classification/output.yaml rename to src/vertex_ai_huggingface_inference_toolkit/_internal/transformers/schemas/zero-shot-classification/output.yaml diff --git a/src/vertex_ai_huggingface_inference_toolkit/diffusers.py b/src/vertex_ai_huggingface_inference_toolkit/diffusers.py new file mode 100644 index 0000000..0c7cfe9 --- /dev/null +++ b/src/vertex_ai_huggingface_inference_toolkit/diffusers.py @@ -0,0 +1,127 @@ +from typing import Any, Dict, List, Literal, Optional + +from vertex_ai_huggingface_inference_toolkit.model import Model + + +class DiffusersModel(Model): + """Class that manages the whole lifecycle of a Hugging Face model either from the Hub + or from an existing Google Cloud Storage bucket to be deployed to Google Cloud Vertex AI + as an endpoint, running a Custom Prediction Routine (CPR) on top of a Hugging Face optimized + Docker image pushed to Google Cloud Artifact Registry. + + This class is responsible for: + - Downloading the model from the Hub if `model_name_or_path` is provided. + - Uploading the model to Google Cloud Storage if `model_name_or_path` is provided. + - Building a Docker image with the prediction code, handler and the required dependencies if `image_uri` not provided. + - Pushing the Docker image to Google Cloud Artifact Registry if `image_uri` not provided. + - Registering the model in Google Cloud Vertex AI. + - Deploying the model as an endpoint with the provided environment variables. 
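For orientation, here is a minimal, hypothetical sketch of how `DiffusersModel` is meant to be used; the project, region and model id are placeholders, and `model_kwargs` is forwarded to the pipeline's `from_pretrained` through the `HF_MODEL_KWARGS` environment variable handled by the base `Model` class further below. The full parameter reference follows in the docstring.

```python
# Hypothetical sketch: register and deploy a text-to-image pipeline on Vertex AI.
from vertex_ai_huggingface_inference_toolkit import DiffusersModel

model = DiffusersModel(
    project_id="my-gcp-project",                # placeholder project
    location="us-central1",
    model_name_or_path="stabilityai/stable-diffusion-2",
    model_task="text-to-image",                 # or "image-to-text" / "inpainting"
    model_kwargs={"torch_dtype": "float16"},    # forwarded via HF_MODEL_KWARGS
)
model.deploy(
    machine_type="n1-standard-8",
    accelerator_type="NVIDIA_TESLA_T4",
    accelerator_count=1,
)
```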
+ + Note: + This class is intended to be a high-level abstraction to simplify the process of deploying + models from the Hugging Face Hub to Google Cloud Vertex AI, and is built on top of `google-cloud-aiplatform` + and the rest of the required Google Cloud Python SDKs. + """ + + def __init__( + self, + # Google Cloud + project_id: Optional[str] = None, + location: Optional[str] = None, + # Google Cloud Storage + model_name_or_path: Optional[str] = None, + model_kwargs: Optional[Dict[str, Any]] = None, + model_task: Literal[ + "text-to-image", "image-to-text", "inpainting" + ] = "text-to-image", + model_target_bucket: str = "vertex-ai-huggingface-inference-toolkit", + # Exclusive arg for Google Cloud Storage + model_bucket_uri: Optional[str] = None, + # Google Cloud Artifact Registry (Docker) + framework: Literal["torch", "tensorflow", "flax"] = "torch", + framework_version: Optional[str] = None, + diffusers_version: str = "0.26.3", + python_version: str = "3.10", + cuda_version: str = "12.3.0", + ubuntu_version: str = "22.04", + extra_requirements: Optional[List[str]] = None, + image_target_repository: str = "vertex-ai-huggingface-inference-toolkit", + # Exclusive arg for Google Cloud Artifact Registry + image_uri: Optional[str] = None, + # Google Cloud Vertex AI + environment_variables: Optional[Dict[str, str]] = None, + ) -> None: + """Initializes the `DiffusersModel` class, setting up the required attributes to + deploy a model from the Hugging Face Hub to Google Cloud Vertex AI. + + Args: + project_id: is either the name or the identifier of the project in Google Cloud. + location: is the identifier of the region and zone where the resources will be created. + model_name_or_path: is the name of the model to be downloaded from the Hugging Face Hub. + model_kwargs: is the dictionary of keyword arguments to be passed to the model's `from_pretrained` method. + model_task: is the task of the model to be used by the `diffusers` library. It can be one of the following: + - `text-to-image`: AutoPipelineForText2Image + - `image-to-text`: AutoPipelineForImage2Image + - `inpainting`: AutoPipelineForInpainting + model_target_bucket: is the name of the bucket in Google Cloud Storage where the model will be uploaded to. + model_bucket_uri: is the URI to the model tar.gz file in Google Cloud Storage. + framework: is the framework to be used to build the Docker image, e.g. `torch`, `tensorflow`, `flax`. + framework_version: is the version of the framework to be used to build the Docker image. + diffusers_version: is the version of the `diffusers` library to be used to build the Docker image. + python_version: is the version of Python to be used to build the Docker image. + cuda_version: is the version of CUDA to be used to build the Docker image. + ubuntu_version: is the version of Ubuntu to be used to build the Docker image. + extra_requirements: is the list of extra requirements to be installed in the Docker image. + image_target_repository: is the name of the repository in Google Cloud Artifact Registry where the Docker image will be pushed to. + image_uri: is the URI to the Docker image in Google Cloud Artifact Registry. + environment_variables: is the dictionary of environment variables to be set in the Docker image. + + Raises: + ValueError: if neither `model_name_or_path` nor `model_bucket_uri` is provided. + ValueError: if both `model_name_or_path` and `model_bucket_uri` are provided. 
+ + Examples: + >>> from vertex_ai_huggingface_inference_toolkit import DiffusersModel + >>> model = DiffusersModel( + ... project_id="my-gcp-project", + ... location="us-central1", + ... model_name_or_path="stabilityai/stable-diffusion-2", + ... model_task="text-to-image", + ... ) + >>> model.deploy( + ... machine_type="n1-standard-8", + ... accelerator_type="NVIDIA_TESLA_T4", + ... accelerator_count=1, + ... ) + """ + + if environment_variables is None: + environment_variables = {} + + if model_task and environment_variables.get("HF_TASK"): + raise ValueError( + "Both `model_task` and `environment_variables['HF_TASK']` cannot be provided." + ) + + if model_task: + environment_variables["HF_TASK"] = model_task + + super().__init__( + project_id=project_id, + location=location, + model_name_or_path=model_name_or_path, + model_kwargs=model_kwargs, + model_target_bucket=model_target_bucket, + model_bucket_uri=model_bucket_uri, + framework=framework, + framework_version=framework_version, + huggingface_framework="diffusers", # type: ignore + huggingface_framework_version=diffusers_version, + python_version=python_version, + cuda_version=cuda_version, + ubuntu_version=ubuntu_version, + extra_requirements=extra_requirements, + image_target_repository=image_target_repository, + image_uri=image_uri, + environment_variables=environment_variables, + ) diff --git a/src/vertex_ai_huggingface_inference_toolkit/diffusers_utils.py b/src/vertex_ai_huggingface_inference_toolkit/diffusers_utils.py new file mode 100644 index 0000000..3db93d4 --- /dev/null +++ b/src/vertex_ai_huggingface_inference_toolkit/diffusers_utils.py @@ -0,0 +1,11 @@ +from diffusers.pipelines.auto_pipeline import ( + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + AutoPipelineForText2Image, +) + +PIPELINE_TASKS = { + "text-to-image": AutoPipelineForText2Image, + "image-to-text": AutoPipelineForImage2Image, + "inpainting": AutoPipelineForInpainting, +} diff --git a/src/vertex_ai_huggingface_inference_toolkit/model.py b/src/vertex_ai_huggingface_inference_toolkit/model.py new file mode 100644 index 0000000..81f80cd --- /dev/null +++ b/src/vertex_ai_huggingface_inference_toolkit/model.py @@ -0,0 +1,365 @@ +import os +import sys +import tarfile +import warnings +from pathlib import Path +from typing import Any, Dict, List, Literal, Optional, Union + +if sys.version_info < (3, 9): + import importlib_resources +else: + import importlib.resources as importlib_resources + +from google.auth import default +from google.cloud import aiplatform + +from vertex_ai_huggingface_inference_toolkit.utils import ( + CACHE_PATH, + build_docker_image, + configure_docker_and_push_image, + create_repository_in_artifact_registry, + download_files_from_hub, + upload_file_to_gcs, +) + + +class Model: + """Class that manages the whole lifecycle of a Hugging Face model either from the Hub + or from an existing Google Cloud Storage bucket to be deployed to Google Cloud Vertex AI + as an endpoint, running a Custom Prediction Routine (CPR) on top of a Hugging Face optimized + Docker image pushed to Google Cloud Artifact Registry. + + This class is responsible for: + - Downloading the model from the Hub if `model_name_or_path` is provided. + - Uploading the model to Google Cloud Storage if `model_name_or_path` is provided. + - Building a Docker image with the prediction code, handler and the required dependencies if `image_uri` not provided. + - Pushing the Docker image to Google Cloud Artifact Registry if `image_uri` not provided. 
+ - Registering the model in Google Cloud Vertex AI. + - Deploying the model as an endpoint with the provided environment variables. + + Note: + This class is intended to be a high-level abstraction to simplify the process of deploying + models from the Hugging Face Hub to Google Cloud Vertex AI, and is built on top of `google-cloud-aiplatform` + and the rest of the required Google Cloud Python SDKs. + """ + + def __init__( + self, + # Google Cloud + project_id: Optional[str] = None, + location: Optional[str] = None, + # Google Cloud Storage + model_name_or_path: Optional[str] = None, + model_kwargs: Optional[Dict[str, Any]] = None, + model_target_bucket: str = "vertex-ai-huggingface-inference-toolkit", + # Exclusive arg for Google Cloud Storage + model_bucket_uri: Optional[str] = None, + # Google Cloud Artifact Registry (Docker) + framework: Literal["torch", "tensorflow", "flax"] = "torch", + framework_version: Optional[str] = None, + huggingface_framework: Optional[Literal["trasformers", "diffusers"]] = None, + huggingface_framework_version: Optional[str] = None, + python_version: str = "3.10", + cuda_version: str = "12.3.0", + ubuntu_version: str = "22.04", + extra_requirements: Optional[List[str]] = None, + image_target_repository: str = "vertex-ai-huggingface-inference-toolkit", + # Exclusive arg for Google Cloud Artifact Registry + image_uri: Optional[str] = None, + # Google Cloud Vertex AI + environment_variables: Optional[Dict[str, str]] = None, + ) -> None: + """Initializes the `Model` class, setting up the required attributes to deploy any + model from the Hugging Face Hub to Google Cloud Vertex AI. + + Args: + project_id: is either the name or the identifier of the project in Google Cloud. + location: is the identifier of the region and zone where the resources will be created. + model_name_or_path: is the name of the model to be downloaded from the Hugging Face Hub. + model_kwargs: is the dictionary of keyword arguments to be passed to the model's `from_pretrained` method. + model_target_bucket: is the name of the bucket in Google Cloud Storage where the model will be uploaded to. + model_bucket_uri: is the URI to the model tar.gz file in Google Cloud Storage. + framework: is the framework to be used to build the Docker image, e.g. `torch`, `tensorflow`, `flax`. + framework_version: is the version of the framework to be used to build the Docker image. + python_version: is the version of Python to be used to build the Docker image. + cuda_version: is the version of CUDA to be used to build the Docker image. + ubuntu_version: is the version of Ubuntu to be used to build the Docker image. + extra_requirements: is the list of extra requirements to be installed in the Docker image. + image_target_repository: is the name of the repository in Google Cloud Artifact Registry where the Docker image will be pushed to. + image_uri: is the URI to the Docker image in Google Cloud Artifact Registry. + environment_variables: is the dictionary of environment variables to be set in the Docker image. + + Raises: + ValueError: if neither `model_name_or_path` nor `model_bucket_uri` is provided. + ValueError: if both `model_name_or_path` and `model_bucket_uri` are provided. + + Examples: + >>> from vertex_ai_huggingface_inference_toolkit import TransformersModel + >>> model = TransformersModel( + ... model_name_or_path="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", + ... project_id="my-gcp-project", + ... location="us-central1", + ... environment_variables={ + ... 
"HF_TASK": "zero-shot-classification", + ... }, + ... ) + >>> model.deploy( + ... machine_type="n1-standard-8", + ... accelerator_type="NVIDIA_TESLA_T4", + ... accelerator_count=1, + ... ) + """ + + if model_name_or_path is None and model_bucket_uri is None: + raise ValueError( + "You need to provide either `model_name_or_path` or `model_bucket_uri`" + ) + + if model_name_or_path is not None and model_bucket_uri is not None: + raise ValueError( + "You can't provide both `model_name_or_path` and `model_bucket_uri`" + ) + + # If the `project_id` is not provided, then we retrieve the default one i.e. the one + # set as default via `gcloud config set project ` + if project_id is None: + # https://google-auth.readthedocs.io/en/master/reference/google.auth.html + _, project_id = default() + self.project_id = project_id + + # If the `location` is not provided, then we default to `us-central1` as that's the + # Google Cloud default region. + if location is None: + warnings.warn( + "`location` has not been provided, so `location=us-central1` will be used" + " instead, as that's the Google Cloud default region.", + stacklevel=1, + ) + location = "us-central1" + self.location = location + + # If the `model_bucket_uri` has been provided, then we will need to check whether it's + # the name to the model on the Hugging Face Hub, or the path to the model in the local + # storage. + if model_name_or_path is not None: + if os.path.exists(model_name_or_path): + # If the `model_name_or_path` is a local path, then we will need to compress everything + # within the directory into `model.tar.gz` and upload it to Google Cloud Storage. + _local_dir = model_name_or_path + _tar_gz_path = Path(_local_dir) / "model.tar.gz" + else: + # If the `model_name_or_path` is a model from the Hugging Face Hub, then we will need + # to download it and upload it to Google Cloud Storage. + _local_dir = download_files_from_hub( + repo_id=model_name_or_path, framework=framework + ) + + _cache_path = CACHE_PATH / model_name_or_path.replace("/", "--") + if not _cache_path.exists(): + _cache_path.mkdir(parents=True, exist_ok=True) + + _tar_gz_path = _cache_path / "model.tar.gz" + + if _tar_gz_path.exists(): + _tar_gz_path.unlink() + + # Then, we compress the directory into a `model.tar.gz` file + with tarfile.open(_tar_gz_path, "w:gz") as tf: + for root, _, files in os.walk(_local_dir): + for file in files: + file_path = os.path.join(root, file) + if os.path.islink(file_path): + file_path = os.path.realpath(file_path) + tf.add(file_path, arcname=file) + + # Finally, we upload the `model.tar.gz` file to Google Cloud Storage + model_bucket_uri = upload_file_to_gcs( + project_id=self.project_id, # type: ignore + location=self.location, + local_path=_tar_gz_path.as_posix(), + bucket_name=model_target_bucket, + remote_path=f"{model_name_or_path.replace('/', '--')}/model.tar.gz", + ) + self.model_bucket_uri = model_bucket_uri.replace("/model.tar.gz", "") # type: ignore + + # If the `image_uri` has not been provided, then we will need to build the Docker image + # and push it to Google Cloud Artifact Registry. + if image_uri is None: + # Depending on the `framework`, if `framework_version` has not been provided, then we + # select the latest or stable version of the selected `framework`, but it's not ideal. 
+ if framework_version is None: + if framework == "torch": + framework_version = "2.1.0" + elif framework == "tensorflow": + framework_version = "2.15.0" + elif framework == "flax": + framework_version = "0.8.0" + + # Then we build the Docker image with the provided args, that will be replaced within the + # Dockerfile to build, as those are internally defined as `BUILD_ARGS` + _image = build_docker_image( + python_version=python_version, + framework=framework, + framework_version=framework_version, + huggingface_framework=huggingface_framework, # type: ignore + huggingface_framework_version=huggingface_framework_version, + cuda_version=cuda_version, + ubuntu_version=ubuntu_version, + extra_requirements=extra_requirements, + ) + # Once the Docker image has been built, then we push it to Google Cloud Artifact Registry, but first + # we need to create a new repository if it doesn't exist. + create_repository_in_artifact_registry( + project_id=self.project_id, # type: ignore + location=self.location, + name=image_target_repository, + format="DOCKER", + ) + image_uri = configure_docker_and_push_image( + project_id=self.project_id, # type: ignore + location=self.location, + repository=image_target_repository, + image_with_tag=_image, + ) + self.image_uri = image_uri + + # If the `environment_variables` are not set or any of the expected ones it not set, then we + # will set them to their default values. + if environment_variables is None: + environment_variables = {} + if model_kwargs is not None and "HF_MODEL_KWARGS" not in environment_variables: + environment_variables["HF_MODEL_KWARGS"] = str(model_kwargs) + if isinstance(environment_variables["HF_MODEL_KWARGS"], dict): + environment_variables["HF_MODEL_KWARGS"] = str( + environment_variables["HF_MODEL_KWARGS"] + ) + if "VERTEX_CPR_WEB_CONCURRENCY" not in environment_variables: + warnings.warn( + "Since the `VERTEX_CPR_WEB_CONCURRENCY` environment variable hasn't been set," + " it will default to 1, meaning that `uvicorn` will only run the model in one" + " worker. If you prefer to run the model using more workers, feel free to provide" + " a greater value for `VERTEX_CPR_WEB_CONCURRENCY`", + stacklevel=1, + ) + environment_variables["VERTEX_CPR_WEB_CONCURRENCY"] = "1" + + # If the `model_name_or_path` has been provided, then we will use it as the `display_name` of the + # model in Google Cloud Vertex AI Model Registry, otherwise we will use the last part of the + # `model_bucket_uri` as the `display_name`. + display_name = ( + model_name_or_path.replace("/", "--") + if model_name_or_path is not None + else model_bucket_uri.split("/")[-1] # type: ignore + ) + + # If the `HF_TASK` environment variable has not been set in `environmnent_variables`, then we will + # warn the user that it hasn't been set, and set the `instance_schema_uri` and `prediction_schema_uri` + # to `None`. + instance_schema_uri, prediction_schema_uri = None, None + if huggingface_framework == "transformers": + task = environment_variables.get("HF_TASK", "") + if task == "" or task not in [ + "text-generation", + "zero-shot-classification", + ]: + warnings.warn( + "`HF_TASK` hasn't been set within the `environment_variables` dict, so the" + " `task` will default to an empty string which may not be ideal. 
Additionally," + " the `HF_TASK` needs to be defined so that the `instance_schema_uri` and" + " `predictions_schema_uri` can be generated automatically based on the `pipeline`" + " definition.", + stacklevel=1, + ) + + _path = str( + importlib_resources.files("vertex_ai_huggingface_inference_toolkit") + / "_internal" + / huggingface_framework + / "schemas" + / task + ) + # Since only the `text-generation` and `zero-shot-classification` tasks have the + # `instance_schema_uri` and `prediction_schema_uri` defined, then we will only + # upload the schemas if the `task` is one of those. + instance_schema_uri = upload_file_to_gcs( + project_id=self.project_id, # type: ignore + location=self.location, + local_path=f"{_path}/input.yaml", + bucket_name=model_target_bucket, + remote_path=f"{display_name}/{task}/input.yaml", + ) + prediction_schema_uri = upload_file_to_gcs( + project_id=self.project_id, # type: ignore + location=self.location, + local_path=f"{_path}/output.yaml", + bucket_name=model_target_bucket, + remote_path=f"{display_name}/{task}/output.yaml", + ) + + # Finally, we upload the model to Google Cloud Vertex AI Model Registry, providing the + # `model_bucket_uri` in Google Cloud Storage, the `image_uri` in Google Cloud Artifact Registry, + # and the `environment_variables` to be set in the Docker image at runtime. + # `aiplatform.Model.upload` reference: + # https://github.com/googleapis/python-aiplatform/blob/63ad1bf9e365d2f10b91e2fd036e3b7d937336c0/google/cloud/aiplatform/models.py#L2974 + self._model: aiplatform.Model = aiplatform.Model.upload( # type: ignore + display_name=display_name, + project=self.project_id, + location=self.location, + artifact_uri=self.model_bucket_uri, + serving_container_image_uri=self.image_uri, + serving_container_environment_variables=environment_variables, + instance_schema_uri=instance_schema_uri, + prediction_schema_uri=prediction_schema_uri, + ) + self._endpoints: List[ + Union[aiplatform.Endpoint, aiplatform.PrivateEndpoint] + ] = [] + + @property + def endpoints( + self, + ) -> Optional[List[Union[aiplatform.Endpoint, aiplatform.PrivateEndpoint]]]: + """Returns the list of deployed `Endpoint` resources, if any.""" + return self._endpoints + + def deploy( + self, + machine_type: Optional[str] = None, + min_replica_count: int = 1, + max_replica_count: int = 1, + accelerator_type: Optional[str] = None, + accelerator_count: Optional[int] = None, + ) -> None: + """Deploys the model to a `Endpoint` resource, with the given `machine_type` and + `accelerator_type` and `accelerator_count` if provided. The `min_replica_count` and + `max_replica_count` are set to 1 by default, but can be changed if needed. + + Args: + machine_type: is the type of machine to use for the deployment, e.g. `n1-standard-8`. + min_replica_count: is the minimum number of replicas to use for the deployment. + max_replica_count: is the maximum number of replicas to use for the deployment. + accelerator_type: is the type of accelerator to use for the deployment, e.g. `NVIDIA_TESLA_T4`. + accelerator_count: is the number of accelerators to use for the deployment, e.g. `1`. 
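To complement the argument list above, a short hypothetical sequence showing the endpoint lifecycle that `deploy`, `endpoints` and `undeploy` manage together; the model id and machine type are placeholders.

```python
# Hypothetical lifecycle sketch: deploy, inspect the resulting endpoint, clean up.
from vertex_ai_huggingface_inference_toolkit import DiffusersModel

model = DiffusersModel(model_name_or_path="stabilityai/stable-diffusion-2")
model.deploy(machine_type="n1-standard-16", min_replica_count=1, max_replica_count=2)

# Each successful `deploy()` call appends an `aiplatform.Endpoint` to `model.endpoints`.
endpoint = model.endpoints[0]
print(endpoint.resource_name)

# Undeploys every endpoint and deletes the model from the Vertex AI Model Registry.
model.undeploy()
```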
+ + References: + - https://github.com/googleapis/python-aiplatform/blob/63ad1bf9e365d2f10b91e2fd036e3b7d937336c0/google/cloud/aiplatform/models.py#L3431 + """ + + self._endpoints.append( + self._model.deploy( + machine_type=machine_type, + min_replica_count=min_replica_count, + max_replica_count=max_replica_count, + accelerator_type=accelerator_type, + accelerator_count=accelerator_count, + ) + ) + + def undeploy(self) -> None: + """Undeploys the model from the `Endpoint` resources, if any. Finally, deletes the model + from Vertex AI Model Registry.""" + + for endpoint in self._endpoints: + endpoint.delete(force=True, sync=False) + self._endpoints = [] + self._model.delete(sync=False) diff --git a/src/vertex_ai_huggingface_inference_toolkit/predictors/diffusers.py b/src/vertex_ai_huggingface_inference_toolkit/predictors/diffusers.py new file mode 100644 index 0000000..818a771 --- /dev/null +++ b/src/vertex_ai_huggingface_inference_toolkit/predictors/diffusers.py @@ -0,0 +1,174 @@ +import os +import tarfile +from io import BytesIO +from typing import Any, Dict, Optional +from uuid import uuid4 + +import torch +from google.cloud.aiplatform.prediction.predictor import Predictor +from google.cloud.aiplatform.utils import prediction_utils +from google.cloud.storage import Client +from PIL.Image import Image + +from vertex_ai_huggingface_inference_toolkit.diffusers_utils import ( + PIPELINE_TASKS, +) +from vertex_ai_huggingface_inference_toolkit.utils import get_logger + + +class DiffusersPredictor(Predictor): + """Custom `Predictor` for the Hugging Face `diffusers` library, that allows + loading the model and running inference on top of it via the corresponding `diffusers` auto pipeline. This + class is also in charge of downloading the artifacts from Google Cloud Storage, when + provided, and loading the model with a custom configuration and with automatic device + placement, mostly via `accelerate`. + """ + + def __init__(self) -> None: + """Initializes the `DiffusersPredictor` with a custom logger.""" + self._logger = get_logger("vertex-ai-huggingface-inference-toolkit") + + def load(self, artifacts_uri: Optional[str] = None) -> None: + """Downloads the model from the given `artifacts_uri` or from the `HF_HUB_ID` environment + variable, loads it via the corresponding `diffusers` auto pipeline and places it on the right + device. The outcome of `load` is the assignment of the `_pipeline` attribute, which is then + used to run the inference via `predict`. + + The `load` method is called within the CPR server during the initialization of the + server, so it's the first method to be called before running the inference. + + Args: + artifacts_uri: is the Google Cloud Storage URI to the artifact to serve, which + will ideally be the directory where the model is stored in Google Cloud Storage. + Note that it's optional: within Vertex AI an artifact URI is always provided, but when + running locally it can be omitted, in which case the model is pulled from the Hugging + Face Hub via the `HF_HUB_ID` environment variable. + + Raises: + RuntimeError: if neither the `artifacts_uri` nor the `HF_HUB_ID` environment variable + is set, as the model needs to be loaded from somewhere.
+ """ + + # If the `artifacts_uri` is provided, then we download its contents into the current directory + if artifacts_uri is not None: + self._logger.info( + f"Downloading artifacts from `artifacts_uri='{artifacts_uri}'`" + ) + prediction_utils.download_model_artifacts(artifacts_uri) + self._logger.info("Artifacts successfully downloaded!") + + # If the `artifacts_uri` was provided, but the `model.tar.gz` file was not downloaded from it, + # and the `HF_HUB_ID` environment variable is not set, then we raise an error as the model needs + # to be loaded from somewhere. + hub_id = os.getenv("HF_HUB_ID", None) + file_exists = os.path.exists("model.tar.gz") and os.path.isfile("model.tar.gz") + if not file_exists and hub_id is None: + error_msg = "Neither the environment variable `HF_HUB_ID` nor the file `model.tar.gz` exist!" + self._logger.error(error_msg) + raise RuntimeError(error_msg) + + # If the `artifacts_uri` was provided, and the `model.tar.gz` file was downloaded from it, then + # we set the `model_path` to the `transformers-model` directory, otherwise we set it to the `HF_HUB_ID`. + model_path = "./diffusers-model" if file_exists else hub_id + if file_exists: + if hub_id: + self._logger.warn( + f"Since both the provided `artifacts_uri={artifacts_uri}` and the environment" + f" variable `HF_HUB_ID={hub_id}` are set, the `artifacts_uri` will be used as" + " it has priority over the `HF_HUB_ID` environment variable." + ) + # Extract the `model.tar.gz` file into the `transformers-model` directory + os.makedirs("./diffusers-model", exist_ok=True) + with tarfile.open("model.tar.gz", "r:gz") as tar: + tar.extractall(path="./diffusers-model") + + # If the `HF_MODEL_KWARGS` environment variable is set, then we parse its value into a dictionary + model_kwargs = os.getenv("HF_MODEL_KWARGS", None) + model_kwargs_dict: Dict[str, Any] = {} + if model_kwargs is not None: + try: + model_kwargs_dict = eval(model_kwargs) + self._logger.info(f"HF_MODEL_KWARGS value is {model_kwargs_dict}") + # Since the device placement is in charge of the `TransformersPredictor`, we will pop those + # keys from the `model_kwargs_dict` to avoid conflicts with the `pipeline` method. + model_kwargs_dict.pop("device", None) + model_kwargs_dict.pop("device_map", None) + except Exception: + self._logger.error( + f"Failed to parse `HF_MODEL_KWARGS` environment variable: {model_kwargs}" + ) + + # Set `torch_dtype` to `auto` is not set. + if "torch_dtype" not in model_kwargs_dict: + model_kwargs_dict["torch_dtype"] = "auto" + else: + model_kwargs_dict["torch_dtype"] = getattr( + torch, model_kwargs_dict["torch_dtype"] + ) + + # If the `HF_TASK` environment variable is set, then we use it to load the `pipeline` with the + # specified task, otherwise we load the `pipeline` with the default task, which is inferred from + # the model's architecture. + task = os.getenv("HF_TASK", "text-to-image") + if task not in PIPELINE_TASKS: + error_msg = ( + f"The `HF_TASK` environment variable value '{task}' is not supported! 
" + f"Supported values are: {PIPELINE_TASKS}" + ) + self._logger.error(error_msg) + raise ValueError(error_msg) + + self._logger.info(f"HF_TASK value is {task}") + + self._logger.info("Loading `pipeline` using `device_map='auto'`") + self._pipeline = PIPELINE_TASKS[task].from_pretrained( # type: ignore + pretrained_model_or_path=model_path, + device_map="auto", + **model_kwargs_dict, + ) + self._logger.info( + f"`pipeline` successfully loaded and running on device={self._pipeline.device}" + ) + + def _upload_image_to_gcs(self, image: Image) -> str: + """Uploads the given `image` to Google Cloud Storage and returns the public URL. + + Args: + image: is the image to be uploaded to Google Cloud Storage. + + Returns: + The public URL to the uploaded image in Google Cloud Storage. + """ + client = Client() + bucket = client.get_bucket("vertex-ai-huggingface-inference-toolkit") + + with BytesIO() as output: + image.save(output, format="JPEG") # type: ignore + contents = output.getvalue() + + blob = bucket.blob(f"diffusers/{uuid4()}.jpg") + blob.upload_from_string(contents, content_type="image/jpeg") + return blob.public_url # type: ignore + + def predict(self, instances: Dict[str, Any]) -> Dict[str, Any]: + """Runs the inference on top of the loaded `pipeline` with the given `instances`. + + Args: + instances: is the dictionary containing the instances to be predicted, which can either + be a dictionary with `instances` as the key and the value being a list of dicts, or + directly a single instance with the expected keys by `pipeline`. + + Returns: + The dictionary containing the predictions for the given `instances`. + """ + + # NOTE: the standard `predict` method assumes that the `instances` is a dictionary with the key + # `instances` that contains the actual instances to be predicted, so we need to check whether + # the `instances` is a dictionary or a list, and if it's a dictionary, then we need to extract + # the `instances` from it. 
+ if "instances" in instances: + instances = instances["instances"] + + image = self._pipeline(**instances).images[0] # type: ignore + return {"image_url": self._upload_image_to_gcs(image)} diff --git a/src/vertex_ai_huggingface_inference_toolkit/server/custom_serving.py b/src/vertex_ai_huggingface_inference_toolkit/server/custom_serving.py index 7455217..c0867bb 100644 --- a/src/vertex_ai_huggingface_inference_toolkit/server/custom_serving.py +++ b/src/vertex_ai_huggingface_inference_toolkit/server/custom_serving.py @@ -21,10 +21,16 @@ def __init__(self) -> None: ) os.environ["HANDLER_CLASS"] = "CustomPredictionHandler" - os.environ["PREDICTOR_MODULE"] = ( - "vertex_ai_huggingface_inference_toolkit.predictors.transformers" - ) - os.environ["PREDICTOR_CLASS"] = "TransformersPredictor" + if os.environ["HF_PACKAGE"] == "transformers": + os.environ["PREDICTOR_MODULE"] = ( + "vertex_ai_huggingface_inference_toolkit.predictors.transformers" + ) + os.environ["PREDICTOR_CLASS"] = "TransformersPredictor" + elif os.environ["HF_PACKAGE"] == "diffusers": + os.environ["PREDICTOR_MODULE"] = ( + "vertex_ai_huggingface_inference_toolkit.predictors.diffusers" + ) + os.environ["PREDICTOR_CLASS"] = "DiffusersPredictor" os.environ["AIP_HTTP_PORT"] = "8080" os.environ["AIP_HEALTH_ROUTE"] = "/health" @@ -38,6 +44,12 @@ def __init__(self) -> None: Example: >>> export HF_HUB_ID="cardiffnlp/twitter-roberta-base-sentiment-latest" >>> export HF_TASK="text-classification" + >>> export HF_PACKAGE="transformers" + >>> python vertex_ai_huggingface_inference_toolkit/server/custom_serving.py + + >>> export HF_HUB_ID="runwayml/stable-diffusion-v1-5" + >>> export HF_TASK="text-to-image" + >>> export HF_PACKAGE="diffusers" >>> python vertex_ai_huggingface_inference_toolkit/server/custom_serving.py """ import uvicorn diff --git a/src/vertex_ai_huggingface_inference_toolkit/transformers.py b/src/vertex_ai_huggingface_inference_toolkit/transformers.py index 5881147..fbca0f1 100644 --- a/src/vertex_ai_huggingface_inference_toolkit/transformers.py +++ b/src/vertex_ai_huggingface_inference_toolkit/transformers.py @@ -1,29 +1,9 @@ -import os -import sys -import tarfile -import warnings -from pathlib import Path -from typing import Any, Dict, List, Literal, Optional, Union +from typing import Any, Dict, List, Literal, Optional -if sys.version_info < (3, 9): - import importlib_resources -else: - import importlib.resources as importlib_resources +from vertex_ai_huggingface_inference_toolkit.model import Model -from google.auth import default -from google.cloud import aiplatform -from vertex_ai_huggingface_inference_toolkit.utils import ( - CACHE_PATH, - build_docker_image, - configure_docker_and_push_image, - create_repository_in_artifact_registry, - download_files_from_hub, - upload_file_to_gcs, -) - - -class TransformersModel: +class TransformersModel(Model): """Class that manages the whole lifecycle of a Hugging Face model either from the Hub or from an existing Google Cloud Storage bucket to be deployed to Google Cloud Vertex AI as an endpoint, running a Custom Prediction Routine (CPR) on top of a Hugging Face optimized @@ -96,9 +76,9 @@ def __init__( Examples: >>> from vertex_ai_huggingface_inference_toolkit import TransformersModel >>> model = TransformersModel( - ... model_name_or_path="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", ... project_id="my-gcp-project", ... location="us-central1", + ... model_name_or_path="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", ... environment_variables={ ... 
"HF_TASK": "zero-shot-classification", ... }, @@ -110,250 +90,22 @@ def __init__( ... ) """ - if model_name_or_path is None and model_bucket_uri is None: - raise ValueError( - "You need to provide either `model_name_or_path` or `model_bucket_uri`" - ) - - if model_name_or_path is not None and model_bucket_uri is not None: - raise ValueError( - "You can't provide both `model_name_or_path` and `model_bucket_uri`" - ) - - # If the `project_id` is not provided, then we retrieve the default one i.e. the one - # set as default via `gcloud config set project ` - if project_id is None: - # https://google-auth.readthedocs.io/en/master/reference/google.auth.html - _, project_id = default() - self.project_id = project_id - - # If the `location` is not provided, then we default to `us-central1` as that's the - # Google Cloud default region. - if location is None: - warnings.warn( - "`location` has not been provided, so `location=us-central1` will be used" - " instead, as that's the Google Cloud default region.", - stacklevel=1, - ) - location = "us-central1" - self.location = location - - # If the `model_bucket_uri` has been provided, then we will need to check whether it's - # the name to the model on the Hugging Face Hub, or the path to the model in the local - # storage. - if model_name_or_path is not None: - if os.path.exists(model_name_or_path): - # If the `model_name_or_path` is a local path, then we will need to compress everything - # within the directory into `model.tar.gz` and upload it to Google Cloud Storage. - _local_dir = model_name_or_path - _tar_gz_path = Path(_local_dir) / "model.tar.gz" - else: - # If the `model_name_or_path` is a model from the Hugging Face Hub, then we will need - # to download it and upload it to Google Cloud Storage. - _local_dir = download_files_from_hub( - repo_id=model_name_or_path, framework=framework - ) - - _cache_path = CACHE_PATH / model_name_or_path.replace("/", "--") - if not _cache_path.exists(): - _cache_path.mkdir(parents=True, exist_ok=True) - - _tar_gz_path = _cache_path / "model.tar.gz" - - if _tar_gz_path.exists(): - _tar_gz_path.unlink() - - # Then, we compress the directory into a `model.tar.gz` file - with tarfile.open(_tar_gz_path, "w:gz") as tf: - for root, _, files in os.walk(_local_dir): - for file in files: - file_path = os.path.join(root, file) - if os.path.islink(file_path): - file_path = os.path.realpath(file_path) - tf.add(file_path, arcname=file) - - # Finally, we upload the `model.tar.gz` file to Google Cloud Storage - model_bucket_uri = upload_file_to_gcs( - project_id=self.project_id, # type: ignore - location=self.location, - local_path=_tar_gz_path.as_posix(), - bucket_name=model_target_bucket, - remote_path=f"{model_name_or_path.replace('/', '--')}/model.tar.gz", - ) - self.model_bucket_uri = model_bucket_uri.replace("/model.tar.gz", "") # type: ignore - - # If the `image_uri` has not been provided, then we will need to build the Docker image - # and push it to Google Cloud Artifact Registry. - if image_uri is None: - # Depending on the `framework`, if `framework_version` has not been provided, then we - # select the latest or stable version of the selected `framework`, but it's not ideal. 
- if framework_version is None: - if framework == "torch": - framework_version = "2.1.0" - elif framework == "tensorflow": - framework_version = "2.15.0" - elif framework == "flax": - framework_version = "0.8.0" - - # Then we build the Docker image with the provided args, that will be replaced within the - # Dockerfile to build, as those are internally defined as `BUILD_ARGS` - _image = build_docker_image( - python_version=python_version, - framework=framework, - framework_version=framework_version, - transformers_version=transformers_version, - cuda_version=cuda_version, - ubuntu_version=ubuntu_version, - extra_requirements=extra_requirements, - ) - # Once the Docker image has been built, then we push it to Google Cloud Artifact Registry, but first - # we need to create a new repository if it doesn't exist. - create_repository_in_artifact_registry( - project_id=self.project_id, # type: ignore - location=self.location, - name=image_target_repository, - format="DOCKER", - ) - image_uri = configure_docker_and_push_image( - project_id=self.project_id, # type: ignore - location=self.location, - repository=image_target_repository, - image_with_tag=_image, - ) - self.image_uri = image_uri - - # If the `environment_variables` are not set or any of the expected ones it not set, then we - # will set them to their default values. - if environment_variables is None: - environment_variables = {} - if model_kwargs is not None and "HF_MODEL_KWARGS" not in environment_variables: - environment_variables["HF_MODEL_KWARGS"] = str(model_kwargs) - if isinstance(environment_variables["HF_MODEL_KWARGS"], dict): - environment_variables["HF_MODEL_KWARGS"] = str( - environment_variables["HF_MODEL_KWARGS"] - ) - if "VERTEX_CPR_WEB_CONCURRENCY" not in environment_variables: - warnings.warn( - "Since the `VERTEX_CPR_WEB_CONCURRENCY` environment variable hasn't been set," - " it will default to 1, meaning that `uvicorn` will only run the model in one" - " worker. If you prefer to run the model using more workers, feel free to provide" - " a greater value for `VERTEX_CPR_WEB_CONCURRENCY`", - stacklevel=1, - ) - environment_variables["VERTEX_CPR_WEB_CONCURRENCY"] = "1" - - # If the `model_name_or_path` has been provided, then we will use it as the `display_name` of the - # model in Google Cloud Vertex AI Model Registry, otherwise we will use the last part of the - # `model_bucket_uri` as the `display_name`. - display_name = ( - model_name_or_path.replace("/", "--") - if model_name_or_path is not None - else model_bucket_uri.split("/")[-1] # type: ignore + super().__init__( + project_id=project_id, + location=location, + model_name_or_path=model_name_or_path, + model_kwargs=model_kwargs, + model_target_bucket=model_target_bucket, + model_bucket_uri=model_bucket_uri, + framework=framework, + framework_version=framework_version, + huggingface_framework="transformers", # type: ignore + huggingface_framework_version=transformers_version, + python_version=python_version, + cuda_version=cuda_version, + ubuntu_version=ubuntu_version, + extra_requirements=extra_requirements, + image_target_repository=image_target_repository, + image_uri=image_uri, + environment_variables=environment_variables, ) - - # If the `HF_TASK` environment variable has not been set in `environmnent_variables`, then we will - # warn the user that it hasn't been set, and set the `instance_schema_uri` and `prediction_schema_uri` - # to `None`. 
- task = environment_variables.get("HF_TASK", "") - if task == "" or task not in ["text-generation", "zero-shot-classification"]: - warnings.warn( - "`HF_TASK` hasn't been set within the `environment_variables` dict, so the" - " `task` will default to an empty string which may not be ideal. Additionally," - " the `HF_TASK` needs to be defined so that the `instance_schema_uri` and" - " `predictions_schema_uri` can be generated automatically based on the `pipeline`" - " definition.", - stacklevel=1, - ) - instance_schema_uri, prediction_schema_uri = None, None - else: - _path = str( - importlib_resources.files("vertex_ai_huggingface_inference_toolkit") - / "_internal" - / "schemas" - / task - ) - # Since only the `text-generation` and `zero-shot-classification` tasks have the - # `instance_schema_uri` and `prediction_schema_uri` defined, then we will only - # upload the schemas if the `task` is one of those. - instance_schema_uri = upload_file_to_gcs( - project_id=self.project_id, # type: ignore - location=self.location, - local_path=f"{_path}/input.yaml", - bucket_name=model_target_bucket, - remote_path=f"{display_name}/{task}/input.yaml", - ) - prediction_schema_uri = upload_file_to_gcs( - project_id=self.project_id, # type: ignore - location=self.location, - local_path=f"{_path}/output.yaml", - bucket_name=model_target_bucket, - remote_path=f"{display_name}/{task}/output.yaml", - ) - - # Finally, we upload the model to Google Cloud Vertex AI Model Registry, providing the - # `model_bucket_uri` in Google Cloud Storage, the `image_uri` in Google Cloud Artifact Registry, - # and the `environment_variables` to be set in the Docker image at runtime. - # `aiplatform.Model.upload` reference: - # https://github.com/googleapis/python-aiplatform/blob/63ad1bf9e365d2f10b91e2fd036e3b7d937336c0/google/cloud/aiplatform/models.py#L2974 - self._model: aiplatform.Model = aiplatform.Model.upload( # type: ignore - display_name=display_name, - project=self.project_id, - location=self.location, - artifact_uri=self.model_bucket_uri, - serving_container_image_uri=self.image_uri, - serving_container_environment_variables=environment_variables, - instance_schema_uri=instance_schema_uri, - prediction_schema_uri=prediction_schema_uri, - ) - self._endpoints: List[ - Union[aiplatform.Endpoint, aiplatform.PrivateEndpoint] - ] = [] - - @property - def endpoints( - self, - ) -> Optional[List[Union[aiplatform.Endpoint, aiplatform.PrivateEndpoint]]]: - """Returns the list of deployed `Endpoint` resources, if any.""" - return self._endpoints - - def deploy( - self, - machine_type: Optional[str] = None, - min_replica_count: int = 1, - max_replica_count: int = 1, - accelerator_type: Optional[str] = None, - accelerator_count: Optional[int] = None, - ) -> None: - """Deploys the model to a `Endpoint` resource, with the given `machine_type` and - `accelerator_type` and `accelerator_count` if provided. The `min_replica_count` and - `max_replica_count` are set to 1 by default, but can be changed if needed. - - Args: - machine_type: is the type of machine to use for the deployment, e.g. `n1-standard-8`. - min_replica_count: is the minimum number of replicas to use for the deployment. - max_replica_count: is the maximum number of replicas to use for the deployment. - accelerator_type: is the type of accelerator to use for the deployment, e.g. `NVIDIA_TESLA_T4`. - accelerator_count: is the number of accelerators to use for the deployment, e.g. `1`. 
- - References: - - https://github.com/googleapis/python-aiplatform/blob/63ad1bf9e365d2f10b91e2fd036e3b7d937336c0/google/cloud/aiplatform/models.py#L3431 - """ - - self._endpoints.append( - self._model.deploy( - machine_type=machine_type, - min_replica_count=min_replica_count, - max_replica_count=max_replica_count, - accelerator_type=accelerator_type, - accelerator_count=accelerator_count, - ) - ) - - def undeploy(self) -> None: - """Undeploys the model from the `Endpoint` resources, if any. Finally, deletes the model - from Vertex AI Model Registry.""" - - for endpoint in self._endpoints: - endpoint.delete(force=True, sync=False) - self._endpoints = [] - self._model.delete(sync=False) diff --git a/src/vertex_ai_huggingface_inference_toolkit/utils/docker.py b/src/vertex_ai_huggingface_inference_toolkit/utils/docker.py index 5082fdf..babb274 100644 --- a/src/vertex_ai_huggingface_inference_toolkit/utils/docker.py +++ b/src/vertex_ai_huggingface_inference_toolkit/utils/docker.py @@ -2,7 +2,7 @@ import subprocess import sys from datetime import datetime -from typing import List, Optional +from typing import List, Literal, Optional if sys.version_info < (3, 9): import importlib_resources @@ -18,7 +18,8 @@ def build_docker_image( python_version: str, framework: str, framework_version: str, - transformers_version: str, + huggingface_framework: Literal["transformers", "diffusers"], + huggingface_framework_version: Optional[str] = None, cuda_version: Optional[str] = None, ubuntu_version: Optional[str] = None, extra_requirements: Optional[List[str]] = None, @@ -33,8 +34,10 @@ def build_docker_image( framework: is the identifier of the deep learning framework to use. Available options for the moment are `torch`, `tensorflow` and `jax`. framework_version: is the version of the provided framework as shown in PyPI. - transformers_version: is the version of `transformers` to install, since the - inference code will be run via `transformers`. + huggingface_framework: is either `transformers` or `diffusers`, depending on + the framework to use for inference. + huggingface_framework_version: is the version of either `transformers` or `diffusers` + to use for inference, depending on the `huggingface_framework` specified. cuda_version: is the version of CUDA to use, if planning to deploy the model within an instance with GPU acceleration. The CUDA versions to be provided need to be in the format of X.Y.Z, and available at https://hub.docker.com/r/nvidia/cuda/tags?page=1&name=-base-ubuntu @@ -51,7 +54,8 @@ def build_docker_image( # The tag is set in advance, generated from the replacements of the `--build-args` _device_string = f"cu{cuda_version}" if cuda_version is not None else "cpu" - _tag = f"py{python_version}-{_device_string}-{framework}-{framework_version}-transformers-{transformers_version}" + _huggingface_string = f"{huggingface_framework}-{huggingface_framework_version}" + _tag = f"py{python_version}-{_device_string}-{framework}-{framework_version}-{_huggingface_string}" # The `_build_args` to be replaced in the `Dockerfile` when building it need to be # prepared in advance, to ensure the formatting and assignment is fine. 
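For reference, a hypothetical call to `build_docker_image` with the `diffusers` defaults used elsewhere in this PR, and the image tag it produces according to the construction above; the version pins are the Dockerfile defaults, not requirements.

```python
# Hypothetical invocation; versions mirror the defaults used in the diffusers Dockerfiles.
from vertex_ai_huggingface_inference_toolkit.utils import build_docker_image

image_with_tag = build_docker_image(
    python_version="3.10",
    framework="torch",
    framework_version="2.2.0",
    huggingface_framework="diffusers",
    huggingface_framework_version="0.27.2",
    cuda_version="12.3.0",
    ubuntu_version="22.04",
)
# Expected tag following the f-string above:
#   py3.10-cu12.3.0-torch-2.2.0-diffusers-0.27.2
# Since `cuda_version` is set, the Dockerfile.gpu under
# `_internal/diffusers/dockerfiles` is selected, as shown right after.
```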
@@ -60,8 +64,12 @@ "PYTHON_VERSION": python_version, "FRAMEWORK": framework, "FRAMEWORK_VERSION": framework_version, - "TRANSFORMERS_VERSION": transformers_version, } + + _build_args[ + f"{huggingface_framework.upper()}_VERSION" # type: ignore + ] = huggingface_framework_version # type: ignore + if cuda_version is not None: _build_args["CUDA_VERSION"] = cuda_version _dockerfile = "Dockerfile.gpu" @@ -73,6 +81,7 @@ _path = str( importlib_resources.files("vertex_ai_huggingface_inference_toolkit") / "_internal" + / huggingface_framework / "dockerfiles" ) diff --git a/src/vertex_ai_huggingface_inference_toolkit/utils/huggingface.py b/src/vertex_ai_huggingface_inference_toolkit/utils/huggingface.py index acef3e9..4f43f57 100644 --- a/src/vertex_ai_huggingface_inference_toolkit/utils/huggingface.py +++ b/src/vertex_ai_huggingface_inference_toolkit/utils/huggingface.py @@ -86,9 +86,14 @@ def download_files_from_hub(repo_id: str, framework: str) -> str: if pattern in ignore_patterns: ignore_patterns.remove(pattern) + # Add some extra patterns to ignore when the framework is not `torch`, since not all `torch` + # weight files are named `pytorch*`, see `https://huggingface.co/runwayml/stable-diffusion-v1-5` + if framework != "torch": + ignore_patterns.extend(["*.pt", "*pth", "*bin"]) + + # Additionally, also include the `README.md`, `.gitattributes` and `.git` files, not ignored within # the `sagemaker_huggingface_inference_toolkit` implementation. - ignore_patterns.extend(["README.md", ".gitattributes", ".git/*"]) + ignore_patterns.extend(["README*", ".gitattributes", ".git/*", "LICENSE*"]) return snapshot_download( # type: ignore repo_id, ignore_patterns=ignore_patterns or None, local_dir_use_symlinks=False
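Finally, a hypothetical illustration of the download filtering above: for a `diffusers` repository and a non-`torch` framework, the PyTorch checkpoints are skipped along with the documentation files, so only the weights matching the requested framework end up in the snapshot that later gets packed into `model.tar.gz`.

```python
# Hypothetical call; the repo id is the one referenced in the comment above.
from vertex_ai_huggingface_inference_toolkit.utils import download_files_from_hub

# With a non-`torch` framework, `*.pt`, `*pth` and `*bin` files are added to the ignore
# patterns, together with `README*`, `LICENSE*`, `.gitattributes` and `.git/*`.
local_dir = download_files_from_hub(
    repo_id="runwayml/stable-diffusion-v1-5",
    framework="flax",
)
print(local_dir)  # local snapshot directory used to build model.tar.gz
```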