Commit 39083d3
[stabilityai_pytorch][inference] Stability AI Inference DLC (aws#3195)
Co-authored-by: arjkesh <[email protected]>
Co-authored-by: Shantanu Tripathi <[email protected]>
3 people authored Aug 3, 2023
1 parent 7264fac commit 39083d3
Showing 14 changed files with 444 additions and 41 deletions.
1 change: 1 addition & 0 deletions src/image_builder.py
@@ -82,6 +82,7 @@ def image_builder(buildspec, image_types=[], device_types=[]):
    if (
        "huggingface" in str(BUILDSPEC["framework"])
        or "autogluon" in str(BUILDSPEC["framework"])
        or "stabilityai" in str(BUILDSPEC["framework"])
        or "trcomp" in str(BUILDSPEC["framework"])
    ):
        os.system("echo login into public ECR")
34 changes: 5 additions & 29 deletions stabilityai/pytorch/inference/buildspec.yml
@@ -9,40 +9,17 @@ arch_type: x86
repository_info:
  inference_repository: &INFERENCE_REPOSITORY
    image_type: &INFERENCE_IMAGE_TYPE inference
    root: !join [ *BASE_FRAMEWORK, "/", *INFERENCE_IMAGE_TYPE ]
    root: !join [ "stabilityai/", *BASE_FRAMEWORK, "/", *INFERENCE_IMAGE_TYPE ]
    repository_name: &REPOSITORY_NAME !join [pr, "-", "stabilityai", "-", *BASE_FRAMEWORK, "-", *INFERENCE_IMAGE_TYPE]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]

context:
  inference_context: &INFERENCE_CONTEXT
    torchserve-ec2-entrypoint:
      source: docker/build_artifacts/torchserve-ec2-entrypoint.py
      target: torchserve-ec2-entrypoint.py
    torchserve-entrypoint:
      source: docker/build_artifacts/torchserve-entrypoint.py
      source: docker/build_artifacts/torchserve-stabilityai-entrypoint.py
      target: torchserve-entrypoint.py
    config:
      source: docker/build_artifacts/config.properties
      target: config.properties
    deep_learning_container:
      source: ../../src/deep_learning_container.py
      target: deep_learning_container.py

images:
  BuildStabilityaiPytorchCpuPy310InferenceDockerImage:
    <<: *INFERENCE_REPOSITORY
    build: &STABILITYAI_PYTORCH_CPU_INFERENCE_PY3 false
    image_size_baseline: 4900
    device_type: &DEVICE_TYPE cpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py310
    os_version: &OS_VERSION ubuntu20.04
    diffusers_version: &DIFFUSERS_VERSION 1.2.3
    tag: !join [ *VERSION, "-", 'diffusers',*DIFFUSERS_VERSION, '-', *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
    target: sagemaker
    context:
      <<: *INFERENCE_CONTEXT
  BuildStabilityaiPytorchGpuPy310InferenceDockerImage:
    <<: *INFERENCE_REPOSITORY
    build: &STABILITYAI_PYTORCH_GPU_INFERENCE_PY3 false
@@ -52,10 +29,9 @@ images:
    tag_python_version: &TAG_PYTHON_VERSION py310
    cuda_version: &CUDA_VERSION cu118
    os_version: &OS_VERSION ubuntu20.04
    diffusers_version: &DIFFUSERS_VERSION 1.2.3
    tag: !join [ *VERSION, "-", 'diffusers',*DIFFUSERS_VERSION, '-', *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    sgm_version: &SGM_VERSION 0.1.0
    tag: !join [ *VERSION, "-", 'sgm',*SGM_VERSION, '-', *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
        *DEVICE_TYPE ]
    target: sagemaker
    context:
      <<: *INFERENCE_CONTEXT
      <<: *INFERENCE_CONTEXT
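Note on the buildspec syntax: the custom !join YAML tag concatenates the scalars in a flow sequence (resolving anchors such as *VERSION first) to assemble repository roots and image tags. As a hedged illustration only — the DLC build code registers its own constructor, and the version values below are assumptions — a minimal PyYAML sketch of the behavior:

# Minimal sketch of a "!join" constructor; the real DLC loader may differ.
import yaml

def join_constructor(loader, node):
    # Resolve anchors/aliases in the flow sequence, then concatenate as strings.
    return "".join(str(part) for part in loader.construct_sequence(node))

yaml.SafeLoader.add_constructor("!join", join_constructor)

snippet = """
version: &VERSION 2.0.1
sgm_version: &SGM_VERSION 0.1.0
tag: !join [ *VERSION, "-", 'sgm', *SGM_VERSION, "-gpu-py310-cu118-ubuntu20.04-sagemaker" ]
"""
print(yaml.safe_load(snippet)["tag"])
# -> 2.0.1-sgm0.1.0-gpu-py310-cu118-ubuntu20.04-sagemaker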
47 changes: 47 additions & 0 deletions stabilityai/pytorch/inference/docker/2.0/py3/cu118/Dockerfile.gpu
@@ -0,0 +1,47 @@
FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference:2.0.1-gpu-py310-cu118-ubuntu20.04-sagemaker

LABEL dlc_major_version="1"
ARG PYTHON=python3
ARG XFORMERS_VERSION=0.0.20

# xformers must be installed from source due to the older version of python in the DLC
RUN pip install ninja \
&& pip install -v -U git+https://github.com/facebookresearch/xformers.git@v${XFORMERS_VERSION}#egg=xformers

ARG SGM_VERSION=0.1.0

# Install Stability Generative Models; at the moment the wheel install does not work, so we need the full repo
RUN cd /tmp \
&& git clone https://github.com/stability-ai/generative-models -b ${SGM_VERSION} \
&& cd generative-models \
&& pip install -r requirements/pt2.txt \
&& pip install . \
&& rm -rf /tmp/generative-models

# Resolve pip check conflicts and other issues
RUN pip install --no-cache-dir -U \
"awscli>=1.29.15" \
"boto3>=1.28.15" \
"certifi>=2023.07.22" \
"pyopenssl>=23.2.0" \
"cryptography>=41.0.2" \
"transformers>=4.23.0"

# Configure Torchserve for large model loading
ENV TS_DEFAULT_RESPONSE_TIMEOUT=1000

# Copy custom entrypoint, which can unpack cache files
ENV HUGGINGFACE_HUB_CACHE=/tmp/cache/huggingface/hub
ENV TRANSFORMERS_CACHE=/tmp/cache/huggingface/transformers
COPY torchserve-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
RUN mkdir -p /tmp/cache/huggingface \
&& chmod +x /usr/local/bin/dockerd-entrypoint.py

RUN HOME_DIR=/root \
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
&& unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
&& cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
&& chmod +x /usr/local/bin/testOSSCompliance \
&& chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
&& rm -rf ${HOME_DIR}/oss_compliance*
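Not part of the commit, but a quick way to sanity-check the resulting image is to import the freshly installed packages inside the container. A minimal sketch, assuming the generative-models repo installs its package under the name sgm:

# Hypothetical smoke test; run inside the container as: python smoke_test.py
import torch
import xformers
import sgm  # assumption: package name installed by stability-ai/generative-models

print("torch:", torch.__version__)
print("xformers:", xformers.__version__)
print("CUDA available:", torch.cuda.is_available())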
48 changes: 48 additions & 0 deletions stabilityai/pytorch/inference/docker/build_artifacts/torchserve-stabilityai-entrypoint.py
@@ -0,0 +1,48 @@
# Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import

import os
import shlex
import subprocess
import sys

from sagemaker_inference import environment

SAI_MODEL_CACHE_FILE = os.path.join(
    environment.model_dir, os.getenv("SAI_MODEL_CACHE_FILE", "stabilityai-model-cache.tar")
)
SAI_MODEL_CACHE_PATH = os.getenv("SAI_MODEL_CACHE_PATH", "/tmp/cache")
SAI_MODEL_CACHE_STATUS_FILE = os.path.join(SAI_MODEL_CACHE_PATH, ".model-cache-unpacked")
if os.path.exists(SAI_MODEL_CACHE_FILE) and not os.path.exists(SAI_MODEL_CACHE_STATUS_FILE):
    # Build the argument list conditionally: an empty-string placeholder would be
    # passed to tar as a (nonexistent) member name and make extraction fail.
    tar_cmd = ["tar", "-x", "-f", SAI_MODEL_CACHE_FILE, "-C", SAI_MODEL_CACHE_PATH]
    if SAI_MODEL_CACHE_FILE.endswith(".gz"):
        tar_cmd.insert(2, "-z")
    subprocess.check_call(tar_cmd)

if sys.argv[1] == "serve":
    from sagemaker_pytorch_serving_container import serving

    serving.main()
else:
    subprocess.check_call(shlex.split(" ".join(sys.argv[1:])))

# prevent docker exit
subprocess.call(["tail", "-f", "/dev/null"])
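For context on how this entrypoint is meant to be fed (an assumption about usage, not part of the commit): the model artifact placed in environment.model_dir should contain a stabilityai-model-cache.tar whose members extract under /tmp/cache. A minimal packaging sketch with Python's tarfile module, with all paths assumed for illustration:

# Hypothetical model-preparation step; paths and layout are assumptions.
import os
import tarfile

cache_root = "/tmp/cache"   # mirrors SAI_MODEL_CACHE_PATH in the entrypoint above
model_dir = "model"         # local staging dir that becomes SageMaker's model_dir

os.makedirs(model_dir, exist_ok=True)
with tarfile.open(os.path.join(model_dir, "stabilityai-model-cache.tar"), "w") as tar:
    # Archive the Hugging Face cache so it lands at /tmp/cache/huggingface/... on extract.
    tar.add(os.path.join(cache_root, "huggingface"), arcname="huggingface")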
10 changes: 8 additions & 2 deletions test/dlc_tests/container_tests/bin/security_checks.py
@@ -3,15 +3,19 @@
import os
import time
import calendar
import argparse

LOGGER = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--image_uri", help="Provide Image Uri", default="")
    args = parser.parse_args()
    home_dir = os.path.expanduser("~")
    check_that_cache_dir_is_removed(home_dir)
    check_that_global_tmp_dir_is_empty(image_uri=args.image_uri)
    check_vim_info_does_not_exists(home_dir)
    check_bash_history(home_dir)
    check_if_any_files_in_subfolder_with_mask_was_last_modified_before_the_boottime(
@@ -49,7 +53,7 @@ def check_that_cache_dir_is_removed(home_dir):
        )


def check_that_global_tmp_dir_is_empty(image_uri=""):
    global_tmp_dir_path = "/tmp/"
    global_tmp_dir_content = [f for f in os.listdir(global_tmp_dir_path)]
    for f in global_tmp_dir_content:
@@ -60,6 +64,8 @@ def check_that_global_tmp_dir_is_empty():
and "ccNPSUr9.s" not in f
and "hsperfdata" not in f
):
if "stabilityai" in image_uri and "cache" in f.lower():
continue
raise ValueError(
"/tmp folder includes file that probably should not be there: {}".format(f)
)
@@ -133,7 +133,7 @@ def test_ec2_pytorch_inference_eia_gpu(


@pytest.mark.usefixtures("feature_torchaudio_present")
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.usefixtures("sagemaker", "stabilityai")
@pytest.mark.integration("pt_torchaudio_gpu")
@pytest.mark.model("N/A")
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_INSTANCE_TYPE, indirect=True)
@@ -163,7 +163,7 @@ def test_pytorch_inference_torchaudio_cpu(pytorch_inference, ec2_connection, cpu


@pytest.mark.usefixtures("feature_torchdata_present")
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.usefixtures("sagemaker", "stabilityai")
@pytest.mark.integration("pt_torchdata_gpu")
@pytest.mark.model("N/A")
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_INSTANCE_TYPE, indirect=True)
@@ -246,7 +246,7 @@ def ec2_pytorch_inference(image_uri, processor, ec2_connection, region):
ec2_connection.run(f"docker rm -f {container_name}", warn=True, hide=True)


@pytest.mark.usefixtures("sagemaker")
@pytest.mark.usefixtures("sagemaker", "stabilityai")
@pytest.mark.integration("telemetry")
@pytest.mark.model("N/A")
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
2 changes: 1 addition & 1 deletion test/dlc_tests/sanity/test_boottime_container_security.py
@@ -20,6 +20,6 @@ def test_security(image):
    )
    try:
        docker_exec_cmd = f"docker exec -i {container_name}"
        run(f"{docker_exec_cmd} python /test/bin/security_checks.py ", hide=True)
        run(f"{docker_exec_cmd} python /test/bin/security_checks.py --image_uri {image}", hide=True)
    finally:
        run(f"docker rm -f {container_name}", hide=True)
6 changes: 5 additions & 1 deletion test/dlc_tests/sanity/test_pre_release.py
@@ -65,6 +65,10 @@ def test_stray_files(image):
    # Running list of allowed files in the /tmp directory
    allowed_tmp_files = ["hsperfdata_root"]

    # Allow cache dir for SAI images
    if "stabilityai" in image:
        allowed_tmp_files.append("cache")

    # Ensure stray artifacts are not in the tmp directory
    tmp = run_cmd_on_container(container_name, ctx, "ls -A /tmp")
    _assert_artifact_free(tmp, stray_artifacts)
@@ -716,7 +720,7 @@ def test_cuda_paths(gpu):
    python_version = re.search(r"(py\d+)", image).group(1)
    short_python_version = None
    image_tag = re.search(
        r":(\d+(\.\d+){2}(-(transformers|diffusers)\d+(\.\d+){2})?-(gpu)-(py\d+)(-cu\d+)-(ubuntu\d+\.\d+)((-ec2)?-example|-ec2|-sagemaker-lite|-sagemaker-full|-sagemaker)?)",
        r":(\d+(\.\d+){2}(-(transformers|diffusers|sgm)\d+(\.\d+){2})?-(gpu)-(py\d+)(-cu\d+)-(ubuntu\d+\.\d+)((-ec2)?-example|-ec2|-sagemaker-lite|-sagemaker-full|-sagemaker)?)",
        image,
    ).group(1)

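As a quick check that the widened pattern accepts the new sgm tag format defined in the buildspec (the image URI below is a made-up sample):

# Illustrative check of the updated tag regex; the URI is a fabricated example.
import re

PATTERN = r":(\d+(\.\d+){2}(-(transformers|diffusers|sgm)\d+(\.\d+){2})?-(gpu)-(py\d+)(-cu\d+)-(ubuntu\d+\.\d+)((-ec2)?-example|-ec2|-sagemaker-lite|-sagemaker-full|-sagemaker)?)"
image = "123456789012.dkr.ecr.us-west-2.amazonaws.com/pr-stabilityai-pytorch-inference:2.0.1-sgm0.1.0-gpu-py310-cu118-ubuntu20.04-sagemaker"
print(re.search(PATTERN, image).group(1))
# -> 2.0.1-sgm0.1.0-gpu-py310-cu118-ubuntu20.04-sagemaker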
@@ -46,6 +46,11 @@
resnet_neuronx_image_list = os.path.join(model_neuronx_dir, "imagenet1000_clsidx_to_labels.txt")
call_model_fn_once_script = os.path.join(resources_path, code_sub_dir, "call_model_fn_once.py")

stabilityai_path = os.path.join(resources_path, "stabilityai")
sdxl_path = os.path.join(stabilityai_path, "sdxl-v1")
sdxl_gpu_path = os.path.join(sdxl_path, gpu_sub_dir)
sdxl_gpu_script = os.path.join(sdxl_gpu_path, code_sub_dir, "sdxl_inference.py")

ROLE = "dummy/unused-role"
DEFAULT_TIMEOUT = 20
