diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..c088346 Binary files /dev/null and b/.DS_Store differ diff --git a/Archive/.gcloudignore b/Archive/.gcloudignore deleted file mode 100644 index 37c8de8..0000000 --- a/Archive/.gcloudignore +++ /dev/null @@ -1,3 +0,0 @@ -build-.json -script.sh -Stable-Diffusion-UI-Novel/model.ckpt \ No newline at end of file diff --git a/Archive/README.md b/Archive/README.md deleted file mode 100644 index bf161bd..0000000 --- a/Archive/README.md +++ /dev/null @@ -1,84 +0,0 @@ -# Stable-Diffusion on Google Cloud Quick Start Guide - -This guide give simple steps for stable-diffusion users to launch a stable diffusion deployment by using GCP GKE servce, Cloud Build, Cloud Deploy service. User can just follow the step have your stable diffusion model running. - -* [Introduction](#Introduction) -* [How-To](#how-to) - -## Introduction - This project is using the [Stable-Diffusion-WebUI](https://github.com/AUTOMATIC1111/stable-diffusion-webui) open source as the user interactive front-end, customer can just prepare the stable diffusion model to build/deployment stable diffusion model by container. This project use the cloud build to help you quick build up a docker image with your stable diffusion model, then you can make a deployment base on the docker image. - -## How To -you can use the cloud shell as the run time to do below steps. -### Before you begin -1. make sure you have an available GCP project for your deployment -2. Enable the required service API using [cloud shell](https://cloud.google.com/shell/docs/run-gcloud-commands) -``` -gcloud services enable compute.googleapis.com artifactregistry.googleapis.com container.googleapis.com cloudbuild.googleapis.com clouddeploy.googleapis.com storage.googleapis.com -``` -### Create GKE Cluster -do the following step using the cloud shell. 
This guide using the A100 GPU node as the VM host, by your choice you can change the node type with [other GPU instance type](https://cloud.google.com/compute/docs/gpus). -``` -PROJECT_ID= -GKE_CLUSTER_NAME= -REGION= - -gcloud beta container --project $PROJECT_ID clusters create $GKE_CLUSTER_NAME --zone "${REGION}-b" --no-enable-basic-auth --cluster-version "1.23.12-gke.100" --release-channel "regular" --machine-type "a2-highgpu-1g" --accelerator "type=nvidia-tesla-a100,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-standard" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --max-pods-per-node "110" --spot --num-nodes "1" --logging=SYSTEM,WORKLOAD --monitoring=SYSTEM --enable-ip-alias --network "projects/${PROJECT_ID}/global/networks/default" --subnetwork "projects/${PROJECT_ID}/regions/${REGION}/subnetworks/default" --no-enable-intra-node-visibility --default-max-pods-per-node "110" --no-enable-master-authorized-networks --addons HorizontalPodAutoscaling,HttpLoadBalancing,GcePersistentDiskCsiDriver --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --enable-shielded-nodes --node-locations "${REGION}-b" -``` - -### Install GPU Driver on GKE -``` -gcloud container clusters get-credentials ${GKE_CLUSTER_NAME} --region ${REGION}-b -``` - - -### Create Cloud Artifacts as Docker Repo -``` -BUILD_REGIST= - -gcloud artifacts repositories create quickstart-docker-repo --repository-format=docker \ ---location=${REGION} --description="Stable Diffusion Docker repository" -``` - -### Create Cloud Build Trigger -``` -##(need to change to CloudMoma Git Repo Address) 
-BUILD_REPO=https://github.com/nonokangwei/gcp-stable-diffusion-build-deploy.git - -echo -n "webhooksecret" | gcloud secrets create webhook-secret \ - --replication-policy="automatic" \ - --data-file=- - -git clone ${BUILD_REPO} - -cd gcp-stable-diffusion-build-deploy/ - -gcloud alpha builds triggers create webhook --name=stable-diffusion-build-trigger --inline-config=cloudbuild.yaml --secret=projects/${PROJECT_ID}/secrets/webhook-secret/versions/1 -``` - -### Prepare Stable Diffusion Model -one of the public available Stable Diffusion model is [HuggingFace](https://huggingface.co/runwayml/stable-diffusion-v1-5), register an id and download the .ckpt file, then upload to the GCS bucket. - -``` -BUILD_BUCKET= -gcloud storage buckets create gs://${BUCKET_NAME} --location=${REGION} -``` - -Suggest you can refer the GCS path pattern gs://\${BUCKET_NAME}/\${MODEL_NAME}/model.ckpt. ${MODEL_NAME} can name like stablediffusion. - -[Guide](https://cloud.google.com/storage/docs/uploading-objects) of upload file to the GCS bucket path you create. 
- -### Build Stable Diffusion Image -Get cloud build trigger url: [How-To](https://cloud.google.com/build/docs/automate-builds-webhook-events), in GCloud Console findout the Cloud Build Trigger that created before, Preview the trigger URL - -trigger the build -``` -MODEL_NAME= -BUILD_TRIGGER_URL= - - curl -X POST -H "application/json" ${BUILD_TRIGGER_URL} -d '{"message": {"buildrepo": ${BUILD_REPO}, "buildbucket": ${BUILD_BUCKET}, "buildmodel": ${MODEL_NAME}, "buildregist": ${BUILD_REGIST}}}' -``` - - - - diff --git a/Archive/cloudbuild.yaml b/Archive/cloudbuild.yaml deleted file mode 100644 index 439c5e1..0000000 --- a/Archive/cloudbuild.yaml +++ /dev/null @@ -1,18 +0,0 @@ -steps: - - name: 'gcr.io/cloud-builders/git' - args: ['clone', '${_BUILD_REPO}'] - - name: 'gcr.io/cloud-builders/gcloud' - entrypoint: 'gsutil' - args: ['cp', 'gs://${_BUILD_BUCKET}/${_BUILD_MODEL}/model.ckpt', 'gcp-stable-diffusion-build-deploy/Stable-Diffusion-UI-Novel/model.ckpt'] - - name: 'gcr.io/cloud-builders/docker' - dir: 'gcp-stable-diffusion-build-deploy/Stable-Diffusion-UI-Novel' - args: ['build', '-t', 'us-docker.pkg.dev/${PROJECT_ID}/${_BUILD_REGIST}/stable-diffusion-${_BUILD_MODEL}', '.'] -images: ['us-docker.pkg.dev/${PROJECT_ID}/${_BUILD_REGIST}/stable-diffusion-${_BUILD_MODEL}'] -substitutions: - _BUILD_REPO: $(body.message.buildrepo) - _BUILD_BUCKET: $(body.message.buildbucket) - _BUILD_MODEL: $(body.message.buildmodel) - _BUILD_REGIST: $(body.message.buildregist) -options: - machineType: 'N1_HIGHCPU_8' - diskSizeGb: '200' diff --git a/Archive/clouddeploy.yaml b/Archive/clouddeploy.yaml deleted file mode 100644 index e372658..0000000 --- a/Archive/clouddeploy.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: deploy.cloud.google.com/v1 -kind: DeliveryPipeline -metadata: - name: stable-diffusion-cd -description: main application pipeline -serialPipeline: - stages: - - targetId: prod - profiles: [] ---- - -apiVersion: deploy.cloud.google.com/v1 -kind: Target -metadata: - name: 
prod -description: production cluster -gke: - cluster: projects//locations//clusters/ \ No newline at end of file diff --git a/Archive/dcgm_loadtest.yml b/Archive/dcgm_loadtest.yml deleted file mode 100644 index a7950d5..0000000 --- a/Archive/dcgm_loadtest.yml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: dcgm-loadtest -spec: - containers: - - name: dcgm-loadtest - image: nvcr.io/nvidia/k8s/dcgm-exporter:2.3.4-2.6.4-ubuntu20.04 - command: ["/bin/bash", "-c", "--"] - args: - - while true; do - dcgmproftester11 --duration 120 --fieldId 1002 --no-dcgm-validation; - dcgmproftester11 --duration 120 --fieldId 1003 --no-dcgm-validation; - dcgmproftester11 --duration 120 --fieldId 1004 --no-dcgm-validation; - dcgmproftester11 --duration 120 --fieldId 1005 --no-dcgm-validation; - dcgmproftester11 --duration 120 --fieldId 1006 --no-dcgm-validation; - dcgmproftester11 --duration 120 --fieldId 1007 --no-dcgm-validation; - dcgmproftester11 --duration 120 --fieldId 1008 --no-dcgm-validation; - dcgmproftester11 --duration 120 --fieldId 1009 --no-dcgm-validation; - dcgmproftester11 --duration 120 --fieldId 1010 --no-dcgm-validation; - done; - resources: - limits: - nvidia.com/gpu: 1 diff --git a/Archive/dcgm_loadtest_deployment.yaml b/Archive/dcgm_loadtest_deployment.yaml deleted file mode 100644 index 5092316..0000000 --- a/Archive/dcgm_loadtest_deployment.yaml +++ /dev/null @@ -1,35 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: gpu-workload - labels: - app: gpu-workload -spec: - replicas: 1 - selector: - matchLabels: - app: gpu-workload - template: - metadata: - labels: - app: gpu-workload - spec: - containers: - - name: gpu-workload - image: nvcr.io/nvidia/k8s/dcgm-exporter:2.3.4-2.6.4-ubuntu20.04 - command: ["/bin/bash", "-c", "--"] - args: - - while true; do - dcgmproftester11 --duration 120 --fieldId 1002 --no-dcgm-validation; - dcgmproftester11 --duration 120 --fieldId 1003 --no-dcgm-validation; - dcgmproftester11 
--duration 120 --fieldId 1004 --no-dcgm-validation; - dcgmproftester11 --duration 120 --fieldId 1005 --no-dcgm-validation; - dcgmproftester11 --duration 120 --fieldId 1006 --no-dcgm-validation; - dcgmproftester11 --duration 120 --fieldId 1007 --no-dcgm-validation; - dcgmproftester11 --duration 120 --fieldId 1008 --no-dcgm-validation; - dcgmproftester11 --duration 120 --fieldId 1009 --no-dcgm-validation; - dcgmproftester11 --duration 120 --fieldId 1010 --no-dcgm-validation; - done; - resources: - limits: - nvidia.com/gpu: 1 \ No newline at end of file diff --git a/Archive/deployment.yaml b/Archive/deployment.yaml deleted file mode 100644 index b51fa92..0000000 --- a/Archive/deployment.yaml +++ /dev/null @@ -1,35 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: stable-diffusion-deployment - labels: - app: stable-diffusion -spec: - replicas: 1 - selector: - matchLabels: - app: stable-diffusion - template: - metadata: - labels: - app: stable-diffusion - spec: - volumes: - - name: stable-diffusion-storage - persistentVolumeClaim: - claimName: pvc-stable-diffusion-storage - containers: - - name: stable-diffusion-webui - image: us-central1-docker.pkg.dev/dave-selfstudy01/hzchen-repo/sd-webui:0.7 - resources: - limits: - nvidia.com/gpu: 1 - ports: - - containerPort: 7860 - volumeMounts: - - mountPath: "/outputs" - name: stable-diffusion-storage - subPath: outputs - - mountPath: "/log" - name: stable-diffusion-storage - subPath: log diff --git a/Archive/kustomization.yaml b/Archive/kustomization.yaml deleted file mode 100644 index 3ff9939..0000000 --- a/Archive/kustomization.yaml +++ /dev/null @@ -1,7 +0,0 @@ -commonLabels: - instance: stable-diffusion-deployment-4 -nameSuffix: "-4" -resources: -- deployment.yaml -- pvc.yaml -- service.yaml \ No newline at end of file diff --git a/Archive/pvc.yaml b/Archive/pvc.yaml deleted file mode 100644 index 1a1895d..0000000 --- a/Archive/pvc.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: v1 -kind: 
PersistentVolumeClaim -metadata: - name: pvc-stable-diffusion-storage -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 30Gi - storageClassName: standard-rwo \ No newline at end of file diff --git a/Archive/skaffold.yaml b/Archive/skaffold.yaml deleted file mode 100644 index aa70b94..0000000 --- a/Archive/skaffold.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: skaffold/v2beta29 -kind: Config -metadata: - name: gpugke -build: - artifacts: - - image: us-central1-docker.pkg.dev/cliu201/docker-repo/stable-diffusion-novel - context: Stable-Diffusion-UI-Novel - docker: - dockerfile: Dockerfile - googleCloudBuild: - projectId: cliu201 - diskSizeGb: 200 - machineType: N1_HIGHCPU_8 -deploy: - # kubectl: - # manifests: - # - Stable-Diffusion-UI-Novel/kubernetes/deployment.yaml - # - Stable-Diffusion-UI-Novel/kubernetes/pvc.yaml - # - Stable-Diffusion-UI-Novel/kubernetes/service.yaml - kustomize: - paths: - - Stable-Diffusion-UI-Novel/kubernetes diff --git a/PEFTonVertex/.DS_Store b/PEFTonVertex/.DS_Store new file mode 100644 index 0000000..8e7504c Binary files /dev/null and b/PEFTonVertex/.DS_Store differ diff --git a/PEFTonVertex/CustomTraining/Dockerfile b/PEFTonVertex/CustomTraining/Dockerfile new file mode 100644 index 0000000..35a9320 --- /dev/null +++ b/PEFTonVertex/CustomTraining/Dockerfile @@ -0,0 +1,27 @@ +FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04 + +RUN apt update +RUN apt install -y wget git python3 python3-venv python3-pip + +RUN pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117 + +WORKDIR /root + +RUN git clone https://github.com/huggingface/peft.git +RUN pip install /root/peft +RUN git clone https://huggingface.co/spaces/smangrul/peft-lora-sd-dreambooth +RUN pip install -r /root/peft-lora-sd-dreambooth/requirements.txt + +ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/usr/local/cuda/lib64 +RUN ln -s /usr/local/cuda/lib64/libcudart.so.11.0 
/usr/local/cuda/lib64/libcudart.so +RUN pip install -U bitsandbytes --prefer-binary + +# Installs additional packages as you need. +RUN pip install -U google-cloud-aiplatform +RUN pip install -U google-cloud-storage + +# Copies the trainer code to the docker image. +COPY train.py /root/train.py + +# Sets up the entry point to invoke the trainer. +ENTRYPOINT ["python3", "-m", "train"] \ No newline at end of file diff --git a/PEFTonVertex/CustomTraining/cloud_build_config.yaml b/PEFTonVertex/CustomTraining/cloud_build_config.yaml new file mode 100644 index 0000000..5f75c09 --- /dev/null +++ b/PEFTonVertex/CustomTraining/cloud_build_config.yaml @@ -0,0 +1,8 @@ +steps: +- name: 'gcr.io/cloud-builders/docker' + args: [ 'build', '-t', 'us-central1-docker.pkg.dev/argolis-lsj-test/sd-lsj/sd-peft:v1', '.' ] +- name: 'gcr.io/cloud-builders/docker' + args: ['push', 'us-central1-docker.pkg.dev/argolis-lsj-test/sd-lsj/sd-peft:v1'] +options: + machineType: 'N1_HIGHCPU_8' + diskSizeGb: '200' \ No newline at end of file diff --git a/PEFTonVertex/CustomTraining/cloud_cli.sh b/PEFTonVertex/CustomTraining/cloud_cli.sh new file mode 100644 index 0000000..ec6a810 --- /dev/null +++ b/PEFTonVertex/CustomTraining/cloud_cli.sh @@ -0,0 +1,14 @@ +# Build the training image with Cloud Build (config file: cloud_build_config.yaml in this folder) +gcloud builds submit --config cloud_build_config.yaml . + +# Create the Vertex AI custom training job +# args format: +# --model_name: Hugging Face repo id, or "/gcs/bucket_name/model_folder". Only models downloaded from HF in the standard diffusers format have been tested; safetensors checkpoints have not been tested. 
+# --input_storage: folder with the training images, e.g. /gcs/bucket_name/input_image_folder (see the command below) +# --output_storage: folder for the training outputs, e.g. /gcs/bucket_name/output_folder +# --prompt: instance prompt, e.g. "a photo of sks dog"; --class_prompt: class prompt, e.g. "a photo of dog" +gcloud ai custom-jobs create \ + --region=us-central1 \ + --display-name=sd-lora-training-peft-1 \ + --config=vertex-ai-config.yaml \ + --args="--model_name=runwayml/stable-diffusion-v1-5,--input_storage=/gcs/sd_lsj/input_dog,--output_storage=/gcs/sd_lsj/peft/dog_lora_output,--prompt=a photo of sks dog,--class_prompt=a photo of dog" \ No newline at end of file diff --git a/PEFTonVertex/CustomTraining/inference.py b/PEFTonVertex/CustomTraining/inference.py new file mode 100644 index 0000000..b1cc02f --- /dev/null +++ b/PEFTonVertex/CustomTraining/inference.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +import gc +import json +import pathlib +import sys + +import gradio as gr +import PIL.Image +import torch +from diffusers import StableDiffusionPipeline +from peft import LoraModel, LoraConfig, set_peft_model_state_dict + + +class InferencePipeline: + def __init__(self): + self.pipe = None + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + self.weight_path = None + + def clear(self) -> None: + self.weight_path = None + del self.pipe + self.pipe = None + torch.cuda.empty_cache() + gc.collect() + + @staticmethod + def get_lora_weight_path(name: str) -> tuple[pathlib.Path, pathlib.Path]: + curr_dir = pathlib.Path(__file__).parent + return curr_dir / name, curr_dir / f'{name.replace(".pt", "_config.json")}' + + def load_and_set_lora_ckpt(self, pipe, weight_path, config_path, dtype): + with open(config_path, "r") as f: + lora_config = json.load(f) + lora_checkpoint_sd = torch.load(weight_path, map_location=self.device) + unet_lora_ds = {k: v for k, v in lora_checkpoint_sd.items() if "text_encoder_" not in k} + text_encoder_lora_ds = { + k.replace("text_encoder_", ""): v for k, v in lora_checkpoint_sd.items() if "text_encoder_" in k + } + unet_config = LoraConfig(**lora_config["peft_config"]) + pipe.unet = LoraModel(unet_config, 
pipe.unet) + set_peft_model_state_dict(pipe.unet, unet_lora_ds) + + if "text_encoder_peft_config" in lora_config: + text_encoder_config = LoraConfig(**lora_config["text_encoder_peft_config"]) + pipe.text_encoder = LoraModel(text_encoder_config, pipe.text_encoder) + set_peft_model_state_dict(pipe.text_encoder, text_encoder_lora_ds) + + if dtype in (torch.float16, torch.bfloat16): + pipe.unet.half() + pipe.text_encoder.half() + + pipe.to(self.device) + return pipe + + def load_pipe(self, model_id: str, lora_filename: str) -> None: + weight_path, config_path = self.get_lora_weight_path(lora_filename) + if weight_path == self.weight_path: + return + + pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(self.device) + pipe = self.load_and_set_lora_ckpt(pipe, weight_path, config_path, torch.float16) + self.pipe = pipe + self.weight_path = weight_path  # remember the loaded LoRA so the early return above can skip reloading + + def run( + self, + base_model: str, + lora_weight_name: str, + prompt: str, + negative_prompt: str, + seed: int, + n_steps: int, + guidance_scale: float, + ) -> PIL.Image.Image: + if not torch.cuda.is_available(): + raise gr.Error("CUDA is not available.") + + self.load_pipe(base_model, lora_weight_name) + + generator = torch.Generator(device=self.device).manual_seed(seed) + out = self.pipe( + prompt, + num_inference_steps=n_steps, + guidance_scale=guidance_scale, + generator=generator, + negative_prompt=negative_prompt if negative_prompt else None, + ) # type: ignore + return out.images[0] \ No newline at end of file diff --git a/PEFTonVertex/CustomTraining/test.py b/PEFTonVertex/CustomTraining/test.py new file mode 100644 index 0000000..a15b5df --- /dev/null +++ b/PEFTonVertex/CustomTraining/test.py @@ -0,0 +1,5 @@ +from inference import InferencePipeline + +pipe = InferencePipeline() +image = pipe.run(base_model="runwayml/stable-diffusion-v1-5",lora_weight_name=f"/your_model_path/a photo of sks dog_lora.pt",prompt="a photo of sks dog in the forest", 
negative_prompt="",n_steps=50,guidance_scale=7.5, seed=1) +image.save("/your_output_path/dog.png") \ No newline at end of file diff --git a/PEFTonVertex/CustomTraining/train.py b/PEFTonVertex/CustomTraining/train.py new file mode 100644 index 0000000..5f13925 --- /dev/null +++ b/PEFTonVertex/CustomTraining/train.py @@ -0,0 +1,68 @@ +import subprocess +import os +import argparse +import re +import torch +from safetensors.torch import save_file + +def main(args): + """Launch the PEFT DreamBooth-LoRA training script through `accelerate`, using the parsed CLI arguments.""" + MODEL_NAME= args.model_name #"runwayml/stable-diffusion-v1-5" + INSTANCE_DIR= args.input_storage + OUTPUT_DIR= args.output_storage + PROMPT = args.prompt + CLASS_PROMPT = args.class_prompt + + #subprocess.run("accelerate config update --config_file /content/accelerate_2.yaml", shell=True) + os.chdir("/root/peft/examples/lora_dreambooth") + + # NOTE(review): values are interpolated into a shell string; a prompt containing double quotes or shell metacharacters will break or alter the command. + cmd_str = (f'accelerate launch train_dreambooth.py ' + f'--pretrained_model_name_or_path="{MODEL_NAME}" ' + f'--instance_data_dir="{INSTANCE_DIR}" ' + f'--output_dir="{OUTPUT_DIR}" ' + f'--train_text_encoder ' + f'--with_prior_preservation ' + f'--prior_loss_weight=1 ' + f'--num_class_images=50 ' + f'--class_prompt="{CLASS_PROMPT}" ' + f'--class_data_dir="{OUTPUT_DIR}/class_data" ' + f'--instance_prompt="{PROMPT}" ' + f'--use_lora ' + f'--lora_r=4 ' + f'--lora_alpha=4 ' + f'--lora_bias=none ' + f'--lora_dropout=0.0 ' + f'--lora_text_encoder_r=4 ' + f'--lora_text_encoder_alpha=4 ' + f'--lora_text_encoder_bias=none ' + f'--lora_text_encoder_dropout=0.0 ' + f'--gradient_checkpointing ' + f'--resolution=512 ' + f'--train_batch_size=1 ' + f'--use_8bit_adam ' + f'--mixed_precision="fp16" ' + f'--gradient_accumulation_steps=1 ' + f'--learning_rate=1e-4 ' + f'--lr_scheduler="constant" ' + f'--lr_warmup_steps=0 ' + f'--max_train_steps=400') + + subprocess.run(cmd_str, shell=True, check=True)  # check=True: a failed training run raises instead of exiting 0 + #bin_to_safetensors('/gcs/' + args.output_storage) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + 
parser.add_argument("--model_name", type=str, default="runwayml/stable-diffusion-v1-5", help="Hugging Face repo id, or /gcs/bucket_name/model_folder") + parser.add_argument("--input_storage", type=str,default="abc", help="/gcs/bucket_name/input_image_folder") + parser.add_argument("--output_storage", type=str, default="abc",help="/gcs/bucket_name/output_folder") + parser.add_argument("--prompt", type=str, default="abc",help="instance prompt") + parser.add_argument("--class_prompt", type=str, default="abc",help="class prompt (used for prior preservation)") + + args = parser.parse_args() + print(args.model_name) + print(args.input_storage) + print(args.output_storage) + print(args.prompt) + print(args.class_prompt) + main(args) diff --git a/PEFTonVertex/CustomTraining/vertex-ai-config.yaml b/PEFTonVertex/CustomTraining/vertex-ai-config.yaml new file mode 100644 index 0000000..cd38abf --- /dev/null +++ b/PEFTonVertex/CustomTraining/vertex-ai-config.yaml @@ -0,0 +1,8 @@ +workerPoolSpecs: + machineSpec: + machineType: n1-standard-8 + acceleratorType: NVIDIA_TESLA_V100 + acceleratorCount: 1 + replicaCount: 1 + containerSpec: + imageUri: us-central1-docker.pkg.dev/argolis-lsj-test/sd-lsj/sd-peft:v1 \ No newline at end of file diff --git a/PEFTonVertex/README.md b/PEFTonVertex/README.md new file mode 100644 index 0000000..f81f355 --- /dev/null +++ b/PEFTonVertex/README.md @@ -0,0 +1,62 @@ +# **LoRA training using PEFT on Google Cloud Vertex AI** + +This guide gives simple steps to fine-tune Stable Diffusion with LoRA using the PEFT library. The fine-tuning process also runs on Vertex AI. + +* [Introduction](#introduction) +* [Training on Vertex AI](#training-on-vertex-ai) +* [Workbench executor(WIP)](#workbench-executorwip) +* Model converted to safetensors(WIP) + +## Introduction + [PEFT](https://github.com/huggingface/peft) **Parameter-Efficient Fine-Tuning** methods enable efficient adaptation of pre-trained language models (PLMs) to various downstream applications without fine-tuning all the model's parameters. 
Supported methods: + +- LoRA: Low-Rank Adaptation of Large Language Models +- Prefix Tuning: Prefix-Tuning: Optimizing Continuous Prompts for Generation, P-Tuning v2: Prompt Tuning Can Be Comparable to Fine-tuning Universally Across Scales and Tasks +- P-Tuning: GPT Understands, Too +- Prompt Tuning: The Power of Scale for Parameter-Efficient Prompt Tuning + +When training Stable Diffusion with LoRA, PEFT provides a more complete implementation than Diffusers. Its DreamBooth LoRA script exposes more input arguments, such as enabling text encoder training and configuring the LoRA rank, so users can fine-tune the model more flexibly and more precisely. + + +In this project we use the PEFT library to demo **Dreambooth with LoRA** training on GPU on Vertex AI. The process is very similar to the Diffusers demo, so we skip some of the steps and only keep the key files (Dockerfile, train file, etc.) here. + +## Training on Vertex AI + +1. Training + +The code is in the *CustomTraining* folder. + +``` +# Build the docker image +gcloud builds submit --config cloud_build_config.yaml . + +# Submit training job +gcloud ai custom-jobs create \ + --region=us-central1 \ + --display-name=sd-lora-training-peft \ + --config=vertex-ai-config.yaml \ + --args="--model_name=runwayml/stable-diffusion-v1-5,--input_storage=/gcs/sd_lsj/input_dog,--output_storage=/gcs/sd_lsj/peft/dog_lora_output,--prompt=a photo of sks dog,--class_prompt=a photo of dog" +``` + +2. Inference locally + +The *inference.py* file implements the inference library. It is adapted from [this repo](https://huggingface.co/spaces/smangrul/peft-lora-sd-dreambooth/tree/main). +In *test.py*, we just load the inference library and generate an image. + +The generated LoRA model consists of two files, a *.pt file* and a *.json file*, both named with the prompt as prefix. Pass only the path of the .pt file when running inference; the .json file is located automatically by replacing the `.pt` suffix with `_config.json`. 
+ +## Workbench executor(WIP) + +The code is in *Workbench* folder + +``` +# Build the docker image, configure the yaml file before running + +gcloud builds submit --config cloud-build-workbench.yaml . +``` + +Then you can execute the notebook with custom built container. The model will be saved to Cloud Storage. And a test image will also be saved there. + + + + diff --git a/README.md b/README.md index 60d704e..6f04c16 100644 --- a/README.md +++ b/README.md @@ -1,119 +1,177 @@ -# Stable-Diffusion on Google Cloud Quick Start Guide +# **Stable Diffusion Fine-tuning on Google Cloud Quick Start Guide** -This guide give simple steps for stable-diffusion users to launch a stable diffusion deployment by using GCP GKE service, and using Filestore as shared storage for model and output files. User can just follow the step have your stable diffusion model running. +This guide gives simple steps for stable diffusion users to fine-tune stable diffusion using dreambooth with LoRA on Google Cloud Vertex AI. Two options are provided, one is Vertex AI custom training service, the other is Workbench executor. User can just follow the step have your stable diffusion model training. * [Introduction](#Introduction) -* [How-To](#how-to) +* [Vertex AI custom training](#Vertex_AI_Custom_Training) +* [Vertex AI Workbench executor](#Vertex_AI_Workbench_Executor) ## Introduction - This project is using the [Stable-Diffusion-WebUI](https://github.com/AUTOMATIC1111/stable-diffusion-webui) open source as the user interactive front-end, customer can just prepare the stable diffusion model to build/deployment stable diffusion model by container. This project use the cloud build to help you quick build up a docker image with your stable diffusion model, then you can make a deployment base on the docker image. + [Vertex AI](https://cloud.google.com/vertex-ai/docs/start/introduction-unified-platform) is a machine learning (ML) platform that lets you train and deploy ML models and AI applications. 
Vertex AI combines data engineering, data science, and ML engineering workflows, enabling your teams to collaborate using a common toolset. -## How To -you can use the cloud shell as the run time to do below steps. -### Before you begin -1. make sure you have an available GCP project for your deployment -2. Enable the required service API using [cloud shell](https://cloud.google.com/shell/docs/run-gcloud-commands) -``` -gcloud services enable compute.googleapis.com artifactregistry.googleapis.com container.googleapis.com file.googleapis.com -``` -### Create GKE Cluster -do the following step using the cloud shell. This guide using the T4 GPU node as the VM host, by your choice you can change the node type with [other GPU instance type](https://cloud.google.com/compute/docs/gpus). -In this guide we also enabled [Filestore CSI driver](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/filestore-csi-driver) for models/outputs sharing. + [Diffusers](https://github.com/huggingface/diffusers) is the go-to library for state-of-the-art pretrained diffusion models for generating images, audio, and even 3D structures of molecules. It provides diffusion model's training, inference on GPU and TPU. -``` -PROJECT_ID= -GKE_CLUSTER_NAME= -REGION= -VPC_NETWORK= -VPC_SUBNETWORK= + In the project, we just use Diffusers library, demo **Dreambooth with LoRA** training on GPU on Vertex AI, while Dreambooth, text2image are also similar. 
-gcloud beta container --project ${PROJECT_ID} clusters create ${GKE_CLUSTER_NAME} --region ${REGION} \ - --no-enable-basic-auth --cluster-version "1.24.9-gke.3200" --release-channel "None" \ - --machine-type "custom-2-24576-ext" --accelerator "type=nvidia-tesla-t4,count=1" \ - --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" \ - --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/cloud-platform" \ - --num-nodes "1" --logging=SYSTEM,WORKLOAD --monitoring=SYSTEM --enable-private-nodes \ - --master-ipv4-cidr "172.16.1.0/28" --enable-ip-alias --network "projects/${PROJECT_ID}/global/networks/${VPC_NETWORK}" \ - --subnetwork "projects/${PROJECT_ID}/regions/${REGION}/subnetworks/${VPC_SUBNETWORK}" \ - --no-enable-intra-node-visibility --default-max-pods-per-node "110" --no-enable-master-authorized-networks \ - --addons HorizontalPodAutoscaling,HttpLoadBalancing,GcePersistentDiskCsiDriver,GcpFilestoreCsiDriver \ - --enable-autoupgrade --no-enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 \ - --enable-autoprovisioning --min-cpu 1 --max-cpu 64 --min-memory 1 --max-memory 256 \ - --autoprovisioning-scopes=https://www.googleapis.com/auth/cloud-platform --no-enable-autoprovisioning-autorepair \ - --enable-autoprovisioning-autoupgrade --autoprovisioning-max-surge-upgrade 1 --autoprovisioning-max-unavailable-upgrade 0 \ - --enable-vertical-pod-autoscaling --enable-shielded-nodes \ - --spot -``` + This project also uses Cloud Build to quickly build up a docker image for training. -### Get credentials of GKE cluster -``` -gcloud container clusters get-credentials ${GKE_CLUSTER_NAME} --region ${REGION} -``` + Customer can just use [Huggingface pre-trained Stable Diffusion model](https://huggingface.co/runwayml/stable-diffusion-v1-5) as base model or prepare the stable diffusion model by yourself. 
+ +## Vertex AI Custom Training + +Vertex AI provides a [managed training service](https://cloud.google.com/vertex-ai/docs/training/overview) that enables you to operationalize large scale model training. You can use Vertex AI to run distributed training applications based on any machine learning (ML) framework (Tensorflow, Pytorch, etc.) on Google Cloud infrastructure. + +You can use Cloud Shell as the runtime for the steps below. + +### Before you begin +1. Make sure you have an available GCP project for your deployment + +2. Enable the required service APIs using [Cloud Shell](https://cloud.google.com/shell/docs/run-gcloud-commands) -### Install GPU Driver ``` -kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml +gcloud services enable artifactregistry.googleapis.com cloudbuild.googleapis.com container.googleapis.com aiplatform.googleapis.com ``` -### Create Cloud Artifacts as Docker Repo +3. Make sure the Vertex AI service account (that is, the **Compute Engine default service account**) has sufficient access to GCS + +4. Request GPU quota in your GCP project + +### Create an Artifact Registry repository as docker repo + ``` BUILD_REGIST= gcloud artifacts repositories create ${BUILD_REGIST} --repository-format=docker \ ---location=${REGION} +--location=us-central1 -gcloud auth configure-docker ${REGION}-docker.pkg.dev +gcloud auth configure-docker us-central1-docker.pkg.dev ``` +### Build Stable Diffusion image using Cloud Build +1. Change to the *VertexCustomTraining* folder -### Build Stable Diffusion Image -Build image with provided Dockerfile, push to repo in Cloud Artifacts +2. Configure the project id, artifact repo id and image name in *cloud-build-config.yaml* -``` -cd gcp-stable-diffusion-build-deploy/Stable-Diffusion-UI-Novel -docker build . -t ${REGION}-docker.pkg.dev/${PROJECT_ID}/${BUILD_REGIST}/sd-webui:0.1 -docker push +3. 
Build the image using Cloud Build ``` +gcloud builds submit --config cloud-build-config.yaml . +``` +### Fine-tune stable diffusion model on Vertex AI custom training -### Create Filestore -Create Filestore storage, mount and prepare files and folders for models/outputs/training data -You should prepare a VM to mount the filestore instance. +1. Upload training images to Cloud Storage. You can use the dog images in this repo as an example. ``` -FILESTORE_NAME= -FILESTORE_ZONE= -FILESHARE_NAME= +gsutil cp -r dog_images gs://bucket_name/dog_images +``` +2. [Optional] Upload your customized base model to Cloud Storage -gcloud filestore instances create ${FILESTORE_NAME} --zone=${FILESTORE_ZONE} --tier=BASIC_HDD --file-share=name=${FILESHARE_NAME},capacity=1TB --network=name=${VPC_NETWORK} -gcloud filestore instances create nfs-store --zone=us-central1-b --tier=BASIC_HDD --file-share=name="vol1",capacity=1TB --network=name=${VPC_NETWORK} +3. Configure the prepared image name in *vertex-ai-config.yaml*. The sample config uses an n1-standard-8 machine with one T4 GPU. To train on multiple A100 GPUs, configure it as shown below (A100 GPUs require an A2 machine type). This project automatically detects the number of GPUs and configures multi-GPU training. ``` +machineSpec: + machineType: a2-highgpu-2g + acceleratorType: NVIDIA_TESLA_A100 + acceleratorCount: 2 +``` -### Enable Node Pool Autoscale -Set the Node pool with cluster autoscale(CA) capability, when the horizonal pod autocale feature scale up the pod replica size, it will trigger the node pool scale out to provide required GPU resource. +4. Submit the custom training job with input arguments. 
The parameter **args** can be configured like below: + * The model name can be a Hugging Face repo id, or a Cloud Storage path, like */gcs/bucket_name/model_folder* + * The input_storage and output_storage should be Cloud Storage paths like */gcs/bucket_name/input_or_output_folder* + * The prompt can be like "a photo of somebody or some special things" ``` -gcloud container clusters update ${GKE_CLUSTER_NAME} \ - --enable-autoscaling \ - --node-pool=default-pool \ - --min-nodes=0 \ - --max-nodes=5 \ - --region=${REGION} +gcloud ai custom-jobs create \ + --region=us-central1 \ + --display-name=${JOB_NAME} \ + --config=vertex-ai-config.yaml \ + --args="--model_name=runwayml/stable-diffusion-v1-5,--input_storage=/gcs/bucket_name/dog_images,--output_storage=/gcs/bucket_name/dog_lora_output,--prompt=a photo of sks dog" + ``` +5. The generated model will be saved in the output_storage path as *a bin file* and *a safetensors file*. The bin file can be directly loaded in the Diffusers library for inference, and the safetensors file is for Automatic1111 WebUI. + +### Check the outputs in Cloud Storage +After submitting the training job to Vertex AI, you can monitor its status in the Cloud UI. + +![Vertex AI custom training UI](images/custom_training_status.png) + +When finished, you can get the fine-tuned model in the Cloud Storage output folder. + +* The *pytorch_lora_weights.bin* file is the model in original diffusers format, while *pytorch_lora_weights.safetensors* is converted from the .bin file and used for WebUI. +* The training logs are also in the event folder. + +### File architecture -### Enable Horizonal Pod Autoscale(HPA) -Install the stackdriver adapter to enable the stable-diffusion deployment scale with GPU usage metrics. 
``` -kubectl create clusterrolebinding cluster-admin-binding \ - --clusterrole cluster-admin --user "$(gcloud config get-value account)" +|-- train.py #model training file in Docker +|-- Dockerfile +|-- cloud-build-config.yaml #cloud config file used in CLI +|-- vertex-ai-config.yaml #Vertex AI custom training config file used in CLI +|-- Stable_Diffusion_Lora_fine_tuning_on_Vertex_AI.ipynb #A sample notebook, showing all codes and commands ``` +## Vertex AI Workbench Executor + +[Vertex AI Workbench managed notebooks instances](https://cloud.google.com/vertex-ai/docs/workbench/managed/introduction) are Google-managed environments with integrations and capabilities that help you set up and work in an end-to-end Jupyter notebook-based production environment. + +The executor lets you submit a notebook (ipynb) file from Workbench to run on Vertex AI custom training. This is convenient for tuning code and parameters, and it saves computing cost. + +We skip the details of the first two steps as they are the same as for Vertex AI custom training. + +### Before you begin + +The same as custom training + +### Create a Cloud Artifact as docker repo + +The same as custom training + +### Create a Workbench + + +### Build Stable Diffusion image using Cloud Build + +1. Change to *Workbench* folder + +2. Configure the project id, artifact repo id and image name in *cloud-build-config.yaml* + +3. Build the image using Cloud Build + ``` -kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/k8s-stackdriver/master/custom-metrics-stackdriver-adapter/deploy/production/adapter_new_resource_model.yaml +gcloud builds submit --config cloud-build-config.yaml . ``` +### Fine-tune stable diffusion model on Vertex AI Workbench + +1. Upload training images to Cloud Storage + +2. [Optional] Upload your customized base model to Cloud Storage + +3. Modify code in workbench notebook, using *sd_training_nbexecutor.ipynb* as a sample. + +4. 
Start an executor job in Workbench + +Click the **executor** button in the Workbench notebook, and configure machine size and container name in the pop-out window. + +![Vertex AI Workbench executor](images/workbench_executor.png) + +After clicking Submit, it will start a custom training job and run the notebook in the selected container. + +### Check the outputs in Cloud Storage + + +The job status can be monitored in the Workbench executor tab. + +![Vertex AI Workbench executor status](images/workbench_status.png) + +When finished, you can get the fine-tuned model in the Cloud Storage output folder. + +* The *pytorch_lora_weights.bin* file is the model in original diffusers format, while *pytorch_lora_weights.safetensors* is converted from the .bin file and used for WebUI. +* The training logs are also in the event folder. + +### File architecture -Deploy horizonal pod autoscale policy on the stable-diffusion deployment -``` -kubectl apply -f ./Stable-Diffusion-UI-Novel/autoscale/hap.yaml ``` +|-- Dockerfile +|-- cloud-build-config.yaml #cloud config file used in CLI +|-- sd_training_executor.ipynb #A sample notebook, showing all codes and commands +``` \ No newline at end of file diff --git a/Stable-Diffusion-UI-Novel/Dockerfile b/Stable-Diffusion-UI-Novel/Dockerfile deleted file mode 100644 index 7729ad6..0000000 --- a/Stable-Diffusion-UI-Novel/Dockerfile +++ /dev/null @@ -1,46 +0,0 @@ -FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04 - -RUN set -ex && \ - apt update && \ - apt install -y wget git python3 python3-venv python3-pip && \ - rm -rf /var/lib/apt/lists/* - -ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/usr/local/cuda/lib64 -RUN ln -s /usr/local/cuda/lib64/libcudart.so.11.0 /usr/local/cuda/lib64/libcudart.so - -RUN git clone https://github.com/AUTOMATIC1111/stable-diffusion-webui.git - -# RUN wget https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors -O \ -# /stable-diffusion-webui/models/Stable-diffusion/v1-5-pruned-emaonly.safetensors - -# COPY 
model.ckpt /stable-diffusion-webui/models/Stable-diffusion/ -# ADD sd_dreambooth_extension /stable-diffusion-webui/extensions/sd_dreambooth_extension - -RUN pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117 - -RUN set -ex && cd stable-diffusion-webui \ - && mkdir repositories \ - && git clone https://github.com/CompVis/stable-diffusion.git repositories/stable-diffusion \ - && git clone https://github.com/CompVis/taming-transformers.git repositories/taming-transformers\ - && git clone https://github.com/sczhou/CodeFormer.git repositories/CodeFormer \ - && git clone https://github.com/salesforce/BLIP.git repositories/BLIP \ - && git clone https://github.com/crowsonkb/k-diffusion.git repositories/k-diffusion \ - && git clone https://github.com/Stability-AI/stablediffusion repositories/stable-diffusion-stability-ai \ - && pip install transformers diffusers invisible-watermark --prefer-binary \ - && pip install git+https://github.com/crowsonkb/k-diffusion.git --prefer-binary \ - && pip install git+https://github.com/TencentARC/GFPGAN.git --prefer-binary \ - && pip install git+https://github.com/mlfoundations/open_clip.git --prefer-binary \ - && pip install -r repositories/CodeFormer/requirements.txt --prefer-binary \ - && pip install -r requirements.txt --prefer-binary - -RUN pip install opencv-contrib-python-headless opencv-python-headless xformers -RUN pip install --upgrade fastapi==0.90.1 -RUN cp /stable-diffusion-webui/repositories/CodeFormer/basicsr/utils/misc.py \ - /usr/local/lib/python3.10/dist-packages/basicsr/utils/misc.py - -# RUN pip install -r /stable-diffusion-webui/extensions/sd_dreambooth_extension/requirements.txt --prefer-binary - -EXPOSE 7860 - -WORKDIR /stable-diffusion-webui/ -CMD ["python3", "webui.py", "--listen", "--xformers", "--medvram"] \ No newline at end of file diff --git a/Stable-Diffusion-UI-Novel/autoscale/adpter_new_resource_model.yaml 
b/Stable-Diffusion-UI-Novel/autoscale/adpter_new_resource_model.yaml deleted file mode 100644 index 8b7b6d5..0000000 --- a/Stable-Diffusion-UI-Novel/autoscale/adpter_new_resource_model.yaml +++ /dev/null @@ -1,194 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: custom-metrics ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: custom-metrics-stackdriver-adapter - namespace: custom-metrics ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: custom-metrics:system:auth-delegator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:auth-delegator -subjects: -- kind: ServiceAccount - name: custom-metrics-stackdriver-adapter - namespace: custom-metrics ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: custom-metrics-auth-reader - namespace: kube-system -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: extension-apiserver-authentication-reader -subjects: -- kind: ServiceAccount - name: custom-metrics-stackdriver-adapter - namespace: custom-metrics ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: custom-metrics-resource-reader - namespace: custom-metrics -rules: -- apiGroups: - - "" - resources: - - pods - - nodes - - nodes/stats - verbs: - - get - - list - - watch ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: custom-metrics-resource-reader -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: custom-metrics-resource-reader -subjects: -- kind: ServiceAccount - name: custom-metrics-stackdriver-adapter - namespace: custom-metrics ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: custom-metrics-stackdriver-adapter - namespace: custom-metrics - labels: - run: custom-metrics-stackdriver-adapter - k8s-app: custom-metrics-stackdriver-adapter -spec: - replicas: 1 - selector: - matchLabels: - run: 
custom-metrics-stackdriver-adapter - k8s-app: custom-metrics-stackdriver-adapter - template: - metadata: - labels: - run: custom-metrics-stackdriver-adapter - k8s-app: custom-metrics-stackdriver-adapter - kubernetes.io/cluster-service: "true" - spec: - serviceAccountName: custom-metrics-stackdriver-adapter - containers: - - image: gcr.io/gke-release/custom-metrics-stackdriver-adapter:v0.13.1-gke.0 - imagePullPolicy: Always - name: pod-custom-metrics-stackdriver-adapter - command: - - /adapter - - --use-new-resource-model=true - - --fallback-for-container-metrics=true - resources: - limits: - cpu: 250m - memory: 200Mi - requests: - cpu: 250m - memory: 200Mi ---- -apiVersion: v1 -kind: Service -metadata: - labels: - run: custom-metrics-stackdriver-adapter - k8s-app: custom-metrics-stackdriver-adapter - kubernetes.io/cluster-service: 'true' - kubernetes.io/name: Adapter - name: custom-metrics-stackdriver-adapter - namespace: custom-metrics -spec: - ports: - - port: 443 - protocol: TCP - targetPort: 443 - selector: - run: custom-metrics-stackdriver-adapter - k8s-app: custom-metrics-stackdriver-adapter - type: ClusterIP ---- -apiVersion: apiregistration.k8s.io/v1 -kind: APIService -metadata: - name: v1beta1.custom.metrics.k8s.io -spec: - insecureSkipTLSVerify: true - group: custom.metrics.k8s.io - groupPriorityMinimum: 100 - versionPriority: 100 - service: - name: custom-metrics-stackdriver-adapter - namespace: custom-metrics - version: v1beta1 ---- -apiVersion: apiregistration.k8s.io/v1 -kind: APIService -metadata: - name: v1beta2.custom.metrics.k8s.io -spec: - insecureSkipTLSVerify: true - group: custom.metrics.k8s.io - groupPriorityMinimum: 100 - versionPriority: 200 - service: - name: custom-metrics-stackdriver-adapter - namespace: custom-metrics - version: v1beta2 ---- -apiVersion: apiregistration.k8s.io/v1 -kind: APIService -metadata: - name: v1beta1.external.metrics.k8s.io -spec: - insecureSkipTLSVerify: true - group: external.metrics.k8s.io - 
groupPriorityMinimum: 100 - versionPriority: 100 - service: - name: custom-metrics-stackdriver-adapter - namespace: custom-metrics - version: v1beta1 ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: external-metrics-reader -rules: -- apiGroups: - - "external.metrics.k8s.io" - resources: - - "*" - verbs: - - list - - get - - watch ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: external-metrics-reader -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: external-metrics-reader -subjects: -- kind: ServiceAccount - name: horizontal-pod-autoscaler - namespace: kube-systems \ No newline at end of file diff --git a/Stable-Diffusion-UI-Novel/autoscale/hpa.yaml b/Stable-Diffusion-UI-Novel/autoscale/hpa.yaml deleted file mode 100644 index 76d9b47..0000000 --- a/Stable-Diffusion-UI-Novel/autoscale/hpa.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: stable-diffusion-hpa -spec: - minReplicas: 1 - maxReplicas: 4 - metrics: - - type: External - external: - metric: - name: kubernetes.io|container|accelerator|duty_cycle - selector: - matchLabels: - resource.labels.namespace_name: default - target: - type: AverageValue - averageValue: 80 - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: "stable-diffusion-deployment" \ No newline at end of file diff --git a/Stable-Diffusion-UI-Novel/autoscale/nvidia_dcgm/dcgm_deployment.yaml b/Stable-Diffusion-UI-Novel/autoscale/nvidia_dcgm/dcgm_deployment.yaml deleted file mode 100644 index 433abea..0000000 --- a/Stable-Diffusion-UI-Novel/autoscale/nvidia_dcgm/dcgm_deployment.yaml +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: nvidia-dcgm - namespace: gpu-monitoring-system - labels: - app: nvidia-dcgm -spec: - selector: - matchLabels: - app: nvidia-dcgm - updateStrategy: - type: RollingUpdate - template: - metadata: - labels: - name: nvidia-dcgm - app: nvidia-dcgm - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: cloud.google.com/gke-accelerator - operator: Exists - tolerations: - - operator: "Exists" - volumes: - - name: nvidia-install-dir-host - hostPath: - path: /home/kubernetes/bin/nvidia - containers: - - image: "nvcr.io/nvidia/cloud-native/dcgm:2.3.5-1-ubuntu20.04" - command: ["nv-hostengine", "-n", "-b", "ALL"] - ports: - - containerPort: 5555 - hostPort: 5555 - name: nvidia-dcgm - securityContext: - privileged: true - volumeMounts: - - name: nvidia-install-dir-host - mountPath: /usr/local/nvidia ---- -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: nvidia-dcgm-exporter - namespace: gpu-monitoring-system - labels: - app: nvidia-dcgm-exporter -spec: - selector: - matchLabels: - app: nvidia-dcgm-exporter - template: - metadata: - labels: - app: nvidia-dcgm-exporter - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: cloud.google.com/gke-accelerator - operator: Exists - tolerations: - - operator: "Exists" - volumes: - - name: nvidia-dcgm-exporter-metrics - configMap: - name: nvidia-dcgm-exporter-metrics - - name: nvidia-install-dir-host - 
hostPath: - path: /home/kubernetes/bin/nvidia - - name: pod-resources - hostPath: - path: /var/lib/kubelet/pod-resources - containers: - - name: nvidia-dcgm-exporter - image: nvcr.io/nvidia/k8s/dcgm-exporter:2.3.5-2.6.5-ubuntu20.04 - command: ["/bin/bash", "-c"] - args: - - hostname $NODE_NAME; dcgm-exporter -k --remote-hostengine-info $(NODE_IP) --collectors /etc/dcgm-exporter/counters.csv --collect-interval 20000 - ports: - - name: metrics - containerPort: 9400 - securityContext: - privileged: true - env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: "DCGM_EXPORTER_KUBERNETES_GPU_ID_TYPE" - value: "device-name" - - name: LD_LIBRARY_PATH - value: /usr/local/nvidia/lib64 - - name: NODE_IP - valueFrom: - fieldRef: - fieldPath: status.hostIP - volumeMounts: - - name: nvidia-dcgm-exporter-metrics - mountPath: "/etc/dcgm-exporter" - readOnly: true - - name: nvidia-install-dir-host - mountPath: /usr/local/nvidia - - name: pod-resources - mountPath: /var/lib/kubelet/pod-resources ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: nvidia-dcgm-exporter-metrics - namespace: gpu-monitoring-system -data: - counters.csv: | - # Utilization (the sample period varies depending on the product),, - DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). - DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). - - # Utilization of IP blocks,, - DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned - DCGM_FI_PROF_SM_OCCUPANCY, gauge, The fraction of resident warps on a multiprocessor - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, The ratio of cycles the tensor (HMMA) pipe is active (off the peak sustained elapsed cycles) - DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, The fraction of cycles the FP64 (double precision) pipe was active. - DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, The fraction of cycles the FP32 (single precision) pipe was active. 
- DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, The fraction of cycles the FP16 (half precision) pipe was active. - - # Memory usage,, - DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). - DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). - DCGM_FI_DEV_FB_TOTAL, gauge, Total Frame Buffer of the GPU in MB. - - # PCIE,, - DCGM_FI_PROF_PCIE_TX_BYTES, gauge, Total number of bytes transmitted through PCIe TX - DCGM_FI_PROF_PCIE_RX_BYTES, gauge, Total number of bytes received through PCIe RX - - # NVLink,, - DCGM_FI_PROF_NVLINK_TX_BYTES, gauge, The number of bytes of active NvLink tx (transmit) data including both header and payload. - DCGM_FI_PROF_NVLINK_RX_BYTES, gauge, The number of bytes of active NvLink rx (read) data including both header and payload. ---- -apiVersion: monitoring.googleapis.com/v1alpha1 -kind: PodMonitoring -metadata: - name: nvidia-dcgm-exporter-gmp-monitor - namespace: gpu-monitoring-system -spec: - selector: - matchLabels: - app: nvidia-dcgm-exporter - endpoints: - - port: metrics - interval: 20s diff --git a/Stable-Diffusion-UI-Novel/autoscale/nvidia_dcgm/gke-dcgm-dashboard.yml b/Stable-Diffusion-UI-Novel/autoscale/nvidia_dcgm/gke-dcgm-dashboard.yml deleted file mode 100644 index c837da4..0000000 --- a/Stable-Diffusion-UI-Novel/autoscale/nvidia_dcgm/gke-dcgm-dashboard.yml +++ /dev/null @@ -1,380 +0,0 @@ -displayName: Example GKE GPU dashboard -mosaicLayout: - columns: 12 - tiles: - - height: 4 - widget: - title: GKE - Built-In - Accelerator Duty Cycle - xyChart: - chartOptions: - mode: COLOR - dataSets: - - minAlignmentPeriod: 60s - plotType: LINE - targetAxis: Y1 - timeSeriesQuery: - timeSeriesFilter: - aggregation: - alignmentPeriod: 60s - perSeriesAligner: ALIGN_MEAN - filter: metric.type="kubernetes.io/container/accelerator/duty_cycle" - resource.type="k8s_container" - timeshiftDuration: 0s - yAxis: - label: y1Axis - scale: LINEAR - width: 6 - - height: 4 - widget: - title: GKE - Bulit-In - Accelerator Memory Used - xyChart: 
- chartOptions: - mode: COLOR - dataSets: - - minAlignmentPeriod: 60s - plotType: LINE - targetAxis: Y1 - timeSeriesQuery: - timeSeriesFilter: - aggregation: - alignmentPeriod: 60s - perSeriesAligner: ALIGN_MEAN - filter: metric.type="kubernetes.io/container/accelerator/memory_used" - resource.type="k8s_container" - timeshiftDuration: 0s - yAxis: - label: y1Axis - scale: LINEAR - width: 6 - xPos: 6 - - height: 4 - widget: - title: GKE - DCGM - GPU Utilization - xyChart: - chartOptions: - mode: COLOR - dataSets: - - minAlignmentPeriod: 60s - plotType: LINE - targetAxis: Y1 - timeSeriesQuery: - timeSeriesFilter: - aggregation: - alignmentPeriod: 60s - perSeriesAligner: ALIGN_MEAN - filter: metric.type="prometheus.googleapis.com/DCGM_FI_DEV_GPU_UTIL/gauge" - resource.type="prometheus_target" - secondaryAggregation: - alignmentPeriod: 60s - timeshiftDuration: 0s - yAxis: - label: y1Axis - scale: LINEAR - width: 6 - yPos: 4 - - height: 4 - widget: - title: GKE - DCGM - GPU Memory Used - xyChart: - chartOptions: - mode: COLOR - dataSets: - - minAlignmentPeriod: 60s - plotType: LINE - targetAxis: Y1 - timeSeriesQuery: - timeSeriesFilter: - aggregation: - alignmentPeriod: 60s - perSeriesAligner: ALIGN_MEAN - filter: metric.type="prometheus.googleapis.com/DCGM_FI_DEV_FB_USED/gauge" - resource.type="prometheus_target" - secondaryAggregation: - alignmentPeriod: 60s - timeshiftDuration: 0s - yAxis: - label: y1Axis - scale: LINEAR - width: 6 - xPos: 6 - yPos: 4 - - height: 4 - widget: - title: GKE - DCGM - SM Utilization - xyChart: - chartOptions: - mode: COLOR - dataSets: - - minAlignmentPeriod: 60s - plotType: LINE - targetAxis: Y1 - timeSeriesQuery: - timeSeriesFilter: - aggregation: - alignmentPeriod: 60s - perSeriesAligner: ALIGN_MEAN - filter: metric.type="prometheus.googleapis.com/DCGM_FI_PROF_SM_ACTIVE/gauge" - resource.type="prometheus_target" - secondaryAggregation: - alignmentPeriod: 60s - timeshiftDuration: 0s - yAxis: - label: y1Axis - scale: LINEAR - width: 6 - 
yPos: 8 - - height: 4 - widget: - title: GKE - DCGM - Memory Utilization - xyChart: - chartOptions: - mode: COLOR - dataSets: - - minAlignmentPeriod: 60s - plotType: LINE - targetAxis: Y1 - timeSeriesQuery: - timeSeriesFilter: - aggregation: - alignmentPeriod: 60s - perSeriesAligner: ALIGN_MEAN - filter: metric.type="prometheus.googleapis.com/DCGM_FI_DEV_MEM_COPY_UTIL/gauge" - resource.type="prometheus_target" - secondaryAggregation: - alignmentPeriod: 60s - timeshiftDuration: 0s - yAxis: - label: y1Axis - scale: LINEAR - width: 6 - xPos: 6 - yPos: 8 - - height: 4 - widget: - title: GKE - DCGM - SM Occupancy - xyChart: - chartOptions: - mode: COLOR - dataSets: - - minAlignmentPeriod: 60s - plotType: LINE - targetAxis: Y1 - timeSeriesQuery: - timeSeriesFilter: - aggregation: - alignmentPeriod: 60s - perSeriesAligner: ALIGN_MEAN - filter: metric.type="prometheus.googleapis.com/DCGM_FI_PROF_SM_OCCUPANCY/gauge" - resource.type="prometheus_target" - secondaryAggregation: - alignmentPeriod: 60s - timeshiftDuration: 0s - yAxis: - label: y1Axis - scale: LINEAR - width: 6 - yPos: 12 - - height: 4 - widget: - title: GKE - DCGM - Tensor Engine Utilization - xyChart: - chartOptions: - mode: COLOR - dataSets: - - minAlignmentPeriod: 60s - plotType: LINE - targetAxis: Y1 - timeSeriesQuery: - timeSeriesFilter: - aggregation: - alignmentPeriod: 60s - perSeriesAligner: ALIGN_MEAN - filter: metric.type="prometheus.googleapis.com/DCGM_FI_PROF_PIPE_TENSOR_ACTIVE/gauge" - resource.type="prometheus_target" - secondaryAggregation: - alignmentPeriod: 60s - timeshiftDuration: 0s - yAxis: - label: y1Axis - scale: LINEAR - width: 6 - xPos: 6 - yPos: 12 - - height: 4 - widget: - title: GKE - DCGM - PCIe Tx Bandwidth - xyChart: - chartOptions: - mode: COLOR - dataSets: - - minAlignmentPeriod: 60s - plotType: LINE - targetAxis: Y1 - timeSeriesQuery: - timeSeriesFilter: - aggregation: - alignmentPeriod: 60s - perSeriesAligner: ALIGN_MEAN - filter: 
metric.type="prometheus.googleapis.com/DCGM_FI_PROF_PCIE_TX_BYTES/gauge" - resource.type="prometheus_target" - secondaryAggregation: - alignmentPeriod: 60s - timeshiftDuration: 0s - yAxis: - label: y1Axis - scale: LINEAR - width: 6 - yPos: 16 - - height: 4 - widget: - title: GKE - DCGM - PCIe Rx Bandwidth - xyChart: - chartOptions: - mode: COLOR - dataSets: - - minAlignmentPeriod: 60s - plotType: LINE - targetAxis: Y1 - timeSeriesQuery: - timeSeriesFilter: - aggregation: - alignmentPeriod: 60s - perSeriesAligner: ALIGN_MEAN - filter: metric.type="prometheus.googleapis.com/DCGM_FI_PROF_PCIE_RX_BYTES/gauge" - resource.type="prometheus_target" - secondaryAggregation: - alignmentPeriod: 60s - timeshiftDuration: 0s - yAxis: - label: y1Axis - scale: LINEAR - width: 6 - xPos: 6 - yPos: 16 - - height: 4 - widget: - title: GKE - DCGM - NvLink Tx Bandwidth - xyChart: - chartOptions: - mode: COLOR - dataSets: - - minAlignmentPeriod: 60s - plotType: LINE - targetAxis: Y1 - timeSeriesQuery: - timeSeriesFilter: - aggregation: - alignmentPeriod: 60s - perSeriesAligner: ALIGN_MEAN - filter: metric.type="prometheus.googleapis.com/DCGM_FI_PROF_NVLINK_TX_BYTES/gauge" - resource.type="prometheus_target" - secondaryAggregation: - alignmentPeriod: 60s - timeshiftDuration: 0s - yAxis: - label: y1Axis - scale: LINEAR - width: 6 - yPos: 20 - - height: 4 - widget: - title: GKE - DCGM - NvLink Rx Bandwidth - xyChart: - chartOptions: - mode: COLOR - dataSets: - - minAlignmentPeriod: 60s - plotType: LINE - targetAxis: Y1 - timeSeriesQuery: - timeSeriesFilter: - aggregation: - alignmentPeriod: 60s - perSeriesAligner: ALIGN_MEAN - filter: metric.type="prometheus.googleapis.com/DCGM_FI_PROF_NVLINK_RX_BYTES/gauge" - resource.type="prometheus_target" - secondaryAggregation: - alignmentPeriod: 60s - timeshiftDuration: 0s - yAxis: - label: y1Axis - scale: LINEAR - width: 6 - xPos: 6 - yPos: 20 - - height: 4 - widget: - title: GKE - DCGM - FP64 Engine Utilization - xyChart: - chartOptions: - mode: 
COLOR - dataSets: - - minAlignmentPeriod: 60s - plotType: LINE - targetAxis: Y1 - timeSeriesQuery: - timeSeriesFilter: - aggregation: - alignmentPeriod: 60s - perSeriesAligner: ALIGN_MEAN - filter: metric.type="prometheus.googleapis.com/DCGM_FI_PROF_PIPE_FP64_ACTIVE/gauge" - resource.type="prometheus_target" - secondaryAggregation: - alignmentPeriod: 60s - timeshiftDuration: 0s - yAxis: - label: y1Axis - scale: LINEAR - width: 6 - yPos: 24 - - height: 4 - widget: - title: GKE - DCGM - FP32 Engine Utilization - xyChart: - chartOptions: - mode: COLOR - dataSets: - - minAlignmentPeriod: 60s - plotType: LINE - targetAxis: Y1 - timeSeriesQuery: - timeSeriesFilter: - aggregation: - alignmentPeriod: 60s - perSeriesAligner: ALIGN_MEAN - filter: metric.type="prometheus.googleapis.com/DCGM_FI_PROF_PIPE_FP32_ACTIVE/gauge" - resource.type="prometheus_target" - secondaryAggregation: - alignmentPeriod: 60s - timeshiftDuration: 0s - yAxis: - label: y1Axis - scale: LINEAR - width: 6 - xPos: 6 - yPos: 24 - - height: 4 - widget: - title: GKE - DCGM - FP16 Engine Utilization - xyChart: - chartOptions: - mode: COLOR - dataSets: - - minAlignmentPeriod: 60s - plotType: LINE - targetAxis: Y1 - timeSeriesQuery: - timeSeriesFilter: - aggregation: - alignmentPeriod: 60s - perSeriesAligner: ALIGN_MEAN - filter: metric.type="prometheus.googleapis.com/DCGM_FI_PROF_PIPE_FP16_ACTIVE/gauge" - resource.type="prometheus_target" - secondaryAggregation: - alignmentPeriod: 60s - timeshiftDuration: 0s - yAxis: - label: y1Axis - scale: LINEAR - width: 6 - yPos: 28 diff --git a/Stable-Diffusion-UI-Novel/kubernetes/deployment.yaml b/Stable-Diffusion-UI-Novel/kubernetes/deployment.yaml deleted file mode 100644 index 9767712..0000000 --- a/Stable-Diffusion-UI-Novel/kubernetes/deployment.yaml +++ /dev/null @@ -1,38 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: stable-diffusion-deployment - labels: - app: stable-diffusion -spec: - replicas: 1 - selector: - matchLabels: - app: 
stable-diffusion - template: - metadata: - labels: - app: stable-diffusion - spec: - volumes: - - name: stable-diffusion-storage - persistentVolumeClaim: - claimName: vol1 - containers: - - name: stable-diffusion-webui - image: us-central1-docker.pkg.dev/dave-selfstudy01/hzchen-repo/sd-webui:0.1 - resources: - limits: - nvidia.com/gpu: 1 - ports: - - containerPort: 7860 - volumeMounts: - - mountPath: "/stable-diffusion-webui/models" - name: stable-diffusion-storage - subPath: models - - mountPath: "/stable-diffusion-webui/outputs" - name: stable-diffusion-storage - subPath: outputs - - mountPath: "/stable-diffusion-webui/inputs" - name: stable-diffusion-storage - subPath: inputs \ No newline at end of file diff --git a/Stable-Diffusion-UI-Novel/kubernetes/hpa.yaml b/Stable-Diffusion-UI-Novel/kubernetes/hpa.yaml deleted file mode 100644 index f9acc5e..0000000 --- a/Stable-Diffusion-UI-Novel/kubernetes/hpa.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: stable-diffusion-hpa -spec: - minReplicas: 1 - maxReplicas: 4 - metrics: - - type: External - external: - metric: - name: kubernetes.io|container|accelerator|duty_cycle - selector: - matchLabels: - resource.labels.namespace_name: $namespace - target: - type: AverageValue - averageValue: 80 - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: "stable-diffusion-deployment" \ No newline at end of file diff --git a/Stable-Diffusion-UI-Novel/kubernetes/nfs_pv.yaml b/Stable-Diffusion-UI-Novel/kubernetes/nfs_pv.yaml deleted file mode 100644 index 9c30cd4..0000000 --- a/Stable-Diffusion-UI-Novel/kubernetes/nfs_pv.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: v1 -kind: PersistentVolume -metadata: - name: filestore-nfs-pv -spec: - capacity: - storage: 1Ti - accessModes: - - ReadWriteMany - nfs: - path: /vol1 - server: 10.150.39.10 diff --git a/Stable-Diffusion-UI-Novel/kubernetes/nfs_pvc.yaml b/Stable-Diffusion-UI-Novel/kubernetes/nfs_pvc.yaml 
deleted file mode 100644 index f7788f8..0000000 --- a/Stable-Diffusion-UI-Novel/kubernetes/nfs_pvc.yaml +++ /dev/null @@ -1,12 +0,0 @@ -kind: PersistentVolumeClaim -apiVersion: v1 -metadata: - name: vol1 -spec: - accessModes: - - ReadWriteMany - storageClassName: "" - volumeName: filestore-nfs-pv - resources: - requests: - storage: 1Ti diff --git a/Stable-Diffusion-UI-Novel/kubernetes/service.yaml b/Stable-Diffusion-UI-Novel/kubernetes/service.yaml deleted file mode 100644 index 621d8dd..0000000 --- a/Stable-Diffusion-UI-Novel/kubernetes/service.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: stable-diffusion-service - labels: - app: stable-diffusion -spec: - ports: - - protocol: TCP - port: 7860 - targetPort: 7860 - selector: - app: stable-diffusion - type: LoadBalancer \ No newline at end of file diff --git a/Stable-Diffusion-UI-Novel/templates/deployment.yaml b/Stable-Diffusion-UI-Novel/templates/deployment.yaml deleted file mode 100644 index 9639975..0000000 --- a/Stable-Diffusion-UI-Novel/templates/deployment.yaml +++ /dev/null @@ -1,38 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: stable-diffusion-deployment - labels: - app: stable-diffusion -spec: - replicas: 1 - selector: - matchLabels: - app: stable-diffusion - template: - metadata: - labels: - app: stable-diffusion - spec: - volumes: - - name: stable-diffusion-storage - persistentVolumeClaim: - claimName: $fileshare_name # replace with fileshare name - containers: - - name: stable-diffusion-webui - image: $image_url # e.g. 
us-central1-docker.pkg.dev/dave-selfstudy01/hzchen-repo/sd-webui:0.1 - resources: - limits: - nvidia.com/gpu: 1 - ports: - - containerPort: 7860 - volumeMounts: - - mountPath: "/stable-diffusion-webui/models" - name: stable-diffusion-storage - subPath: models - - mountPath: "/stable-diffusion-webui/outputs" - name: stable-diffusion-storage - subPath: outputs - - mountPath: "/stable-diffusion-webui/inputs" - name: stable-diffusion-storage - subPath: inputs \ No newline at end of file diff --git a/Stable-Diffusion-UI-Novel/templates/nfs_pv.yaml b/Stable-Diffusion-UI-Novel/templates/nfs_pv.yaml deleted file mode 100644 index 7f7f424..0000000 --- a/Stable-Diffusion-UI-Novel/templates/nfs_pv.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: v1 -kind: PersistentVolume -metadata: - name: filestore-nfs-pv -spec: - capacity: - storage: 1Ti - accessModes: - - ReadWriteMany - nfs: - path: $fileshare_path # e.g. /sdpvc - server: $filestore_instance_ip # e.g. 172.168.1.1 diff --git a/Stable-Diffusion-UI-Novel/templates/nfs_pvc.yaml b/Stable-Diffusion-UI-Novel/templates/nfs_pvc.yaml deleted file mode 100644 index 9cbf918..0000000 --- a/Stable-Diffusion-UI-Novel/templates/nfs_pvc.yaml +++ /dev/null @@ -1,12 +0,0 @@ -kind: PersistentVolumeClaim -apiVersion: v1 -metadata: - name: $fileshare_name # e.g. 
# Training image for Stable Diffusion DreamBooth-LoRA fine-tuning.
# The container runs /root/train.py, which launches diffusers'
# train_dreambooth_lora.py through `accelerate`.
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04

# Refresh the package index and install in the SAME layer: a separately cached
# `apt update` layer can otherwise be paired with a later install and point at
# package versions that no longer exist on the mirror.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y wget git python3 python3-venv python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Pinned PyTorch / torchvision wheels from the cu117 wheel index.
RUN pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117

WORKDIR /root

# diffusers v0.14.0 plus the requirements of the example scripts.
# NOTE(review): `pip install -U xformers` is unpinned and may pull a different
# torch than the one pinned above -- confirm, and pin xformers if it does.
RUN git clone -b v0.14.0 https://github.com/huggingface/diffusers.git \
    && pip install /root/diffusers \
    && pip install -U -r /root/diffusers/examples/dreambooth/requirements.txt \
    && pip install -U -r /root/diffusers/examples/text_to_image/requirements.txt \
    && pip install -U xformers \
    && pip install -U safetensors

# Work around the bitsandbytes / CUDA library lookup issue: expose the CUDA lib
# directory and provide an unversioned libcudart.so.
# `-f` keeps the step idempotent if the link already exists in the base image.
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
RUN ln -sf /usr/local/cuda/lib64/libcudart.so.11.0 /usr/local/cuda/lib64/libcudart.so
RUN pip install -U bitsandbytes --prefer-binary

# Write the default accelerate config (fp16 mixed precision).
RUN accelerate config default --mixed_precision=fp16

# Additional Google Cloud client libraries; add more here as needed.
RUN pip install google-cloud-aiplatform google-cloud-storage

# Copy the trainer code into the image.
COPY train.py /root/train.py

# `-m train` resolves against the working directory (/root, set above).
ENTRYPOINT ["python3", "-m", "train"]
"\n", + " newLoraName = 'pytorch_lora_weights.safetensors';\n", + " print(\"Saving \" + newLoraName);\n", + " save_file(newDict, output_path + '/' + newLoraName);\n", + "\n", + "def main(args):\n", + "\n", + " MODEL_NAME= args.model_name #\"runwayml/stable-diffusion-v1-5\"\n", + " INSTANCE_DIR= args.input_storage\n", + " OUTPUT_DIR= args.output_storage\n", + " PROMPT = args.prompt\n", + "\n", + " os.chdir(\"/root/diffusers/examples/dreambooth\")\n", + "\n", + " # for complex commands, with many args, use string + `shell=True`:\n", + " cmd_str = (f'accelerate launch train_dreambooth_lora.py '\n", + " f'--pretrained_model_name_or_path=\"{MODEL_NAME}\" '\n", + " f'--instance_data_dir=\"{INSTANCE_DIR}\" '\n", + " f'--output_dir=\"{OUTPUT_DIR}\" '\n", + " f'--instance_prompt=\"{PROMPT}\" '\n", + " f' --resolution=512 '\n", + " f'--train_batch_size=1 '\n", + " f'--use_8bit_adam '\n", + " f'--mixed_precision=\"fp16\" '\n", + " f'--gradient_accumulation_steps=1 '\n", + " f'--learning_rate=1e-4 '\n", + " f'--lr_scheduler=\"constant\" '\n", + " f'--lr_warmup_steps=0 '\n", + " f'--max_train_steps=400')\n", + "\n", + " subprocess.run(cmd_str, shell=True)\n", + " # Convert .bin file to .safetensors, to be used in Automatic111 WebUI\n", + " bin_to_safetensors(args.output_storage)\n", + "\n", + "if __name__ == \"__main__\":\n", + " parser = argparse.ArgumentParser()\n", + " parser.add_argument(\"--model_name\", type=str, default=\"runwayml/stable-diffusion-v1-5\", help=\"bucket_name/model_folder\")\n", + " parser.add_argument(\"--input_storage\", type=str,default=\"abc\", help=\"/gcs/bucket_name/input_image_folder\")\n", + " parser.add_argument(\"--output_storage\", type=str, default=\"abc\",help=\"/gcs/bucket_name/output_folder\")\n", + " parser.add_argument(\"--prompt\", type=str, default=\"abc\",help=\"a photo of XXX\")\n", + " \n", + " args = parser.parse_args()\n", + " print(args.model_name)\n", + " print(args.input_storage)\n", + " print(args.output_storage)\n", + " 
print(args.prompt)\n", + " main(args)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3w5dWRYbRQ-L" + }, + "outputs": [], + "source": [ + "%%writefile Dockerfile\n", + "FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04\n", + "\n", + "RUN apt update\n", + "RUN apt install -y wget git python3 python3-venv python3-pip\n", + "\n", + "RUN pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117\n", + "\n", + "WORKDIR /root\n", + "\n", + "RUN git clone -b v0.14.0 https://github.com/huggingface/diffusers.git \\\n", + " && pip install /root/diffusers \\\n", + " && pip install -U -r /root/diffusers/examples/dreambooth/requirements.txt \\\n", + " && pip install -U -r /root/diffusers/examples/text_to_image/requirements.txt \\\n", + " && pip install -U xformers \\ \n", + " && pip install -U safetensors\n", + "\n", + "# Solve Bitbytes and CUDA conflict issue\n", + "ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/usr/local/cuda/lib64\n", + "RUN ln -s /usr/local/cuda/lib64/libcudart.so.11.0 /usr/local/cuda/lib64/libcudart.so\n", + "RUN pip install -U bitsandbytes --prefer-binary\n", + "\n", + "# Config accelerate\n", + "RUN accelerate config default --mixed_precision=fp16\n", + "\n", + "# Installs additional packages as you need.\n", + "RUN pip install google-cloud-aiplatform\n", + "RUN pip install google-cloud-storage\n", + "\n", + "# Copies the trainer code to the docker image.\n", + "COPY train.py /root/train.py\n", + "\n", + "# Sets up the entry point to invoke the trainer.\n", + "ENTRYPOINT [\"python3\", \"-m\", \"train\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kSDuYpoZF8m-" + }, + "outputs": [], + "source": [ + "#cloud build config: modify docker image name and tag\n", + "%%writefile cloud-build-config.yaml\n", + "steps:\n", + "- name: 'gcr.io/cloud-builders/docker'\n", + " args: [ 'build', '-t', 
'us-central1-docker.pkg.dev/project_id/artifact_registry_name/sd-training:db-lora-v1', '.' ]\n", + "- name: 'gcr.io/cloud-builders/docker'\n", + " args: ['push', 'us-central1-docker.pkg.dev/project_id/artifact_registry_name/sd-training:db-lora-v1']\n", + "options:\n", + " machineType: 'N1_HIGHCPU_8'\n", + " diskSizeGb: '200'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Sn6qxXsM1Jc9" + }, + "outputs": [], + "source": [ + "#vertex ai config: modify docker image name and tag\n", + "%%writefile vertex-ai-config.yaml\n", + "workerPoolSpecs:\n", + " machineSpec:\n", + " machineType: n1-standard-8\n", + " acceleratorType: NVIDIA_TESLA_T4\n", + " acceleratorCount: 1\n", + " replicaCount: 1\n", + " containerSpec:\n", + " imageUri: us-central1-docker.pkg.dev/project_id/artifact_registry_id/sd-training:db-lora-v1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YTIy8wBa1pNA" + }, + "outputs": [], + "source": [ + "# cloud build image\n", + "! gcloud builds submit --config cloud-build-config.yaml .\n", + "\n", + "# create vertex ai customer training job\n", + "# args format:\n", + "# --model_name: Huggingface repo id, or \"/gcs/bucket_name/model_folder\". I only test the models downloaded from HF, with standard diffusers format. Safetensors has not been test.\n", + "# --input_storage:/gcs/bucket_name/input_image_folder\n", + "# --output_storage: /gcs/bucket_name/output_folder\n", + "# --prompt: a photo of XXX\n", + "! 
gcloud ai custom-jobs create \\\n", + " --region=us-central1 \\\n", + " --display-name=sd-lora-training-args-0314-noyh \\\n", + " --config=vertex-ai-config.yaml \\\n", + " --args=\"--model_name=runwayml/stable-diffusion-v1-5,--input_storage=/gcs/sd_lsj/input_dog,--output_storage=/gcs/sd_lsj/dog_lora_output,--prompt=a photo of sks dog\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yS5cqSia1kte" + }, + "source": [ + "When training finished, you can load the base model and lora weights for inference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zgpdZ_Nv1MF_" + }, + "outputs": [], + "source": [ + "# inference with fine-tuned lora model\n", + "from diffusers import StableDiffusionPipeline\n", + "import torch\n", + "\n", + "model_path = \"/somewhere/dog_lora_output\"\n", + "pipe = StableDiffusionPipeline.from_pretrained(\"runwayml/stable-diffusion-v1-5\", torch_dtype=torch.float16)\n", + "pipe.unet.load_attn_procs(model_path)\n", + "pipe.to(\"cuda\")\n", + "\n", + "prompt = \"A sks dog in the desert.\"\n", + "image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]\n", + "image.save(\"dog_lora.png\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "m5pyZ_kv8m3O" + }, + "source": [ + "Convert .bin file to safetensors, to use in Automatic1111 WebUI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tr788sXw8mVA" + }, + "outputs": [], + "source": [ + "import os;\n", + "import re;\n", + "import torch;\n", + "from safetensors.torch import save_file;\n", + "\n", + "newDict = dict();\n", + "checkpoint = torch.load('dog_lora_output/pytorch_lora_weights.bin');\n", + "for idx, key in enumerate(checkpoint):\n", + " newKey = re.sub('\\.processor\\.', '_', key);\n", + " newKey = re.sub('mid_block\\.', 'mid_block_', newKey);\n", + " newKey = re.sub('_lora.up.', '.lora_up.', newKey);\n", + " newKey = re.sub('_lora.down.', 
'.lora_down.', newKey);\n", + " newKey = re.sub('\\.(\\d+)\\.', '_\\\\1_', newKey);\n", + " newKey = re.sub('to_out', 'to_out_0', newKey);\n", + " newKey = 'lora_unet_'+newKey;\n", + "\n", + " newDict[newKey] = checkpoint[key];\n", + "\n", + "newLoraName = 'pytorch_lora_weights.safetensors';\n", + "print(\"Saving \" + newLoraName);\n", + "save_file(newDict, newLoraName);" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "h5dnViBHzOZ-" + }, + "source": [ + "Alternatives: Dowload and save Stable Diffusion model from Huggingface to GCS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6UWVslWVHmbk" + }, + "outputs": [], + "source": [ + "! pip install diffusers\n", + "! pip install transformers\n", + "! pip install accelerate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Q2qzuAQ0HsJs" + }, + "outputs": [], + "source": [ + "import torch\n", + "from diffusers import DiffusionPipeline\n", + "\n", + "pipeline = DiffusionPipeline.from_pretrained(\n", + " \"runwayml/stable-diffusion-v1-5\",\n", + " revision=\"fp16\",\n", + " torch_dtype=torch.float16,\n", + ")\n", + "pipeline.save_pretrained(\"model_weights\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bJYwWU9S0zqv" + }, + "outputs": [], + "source": [ + "! 
"""Vertex AI custom-training entry point for DreamBooth LoRA fine-tuning.

Runs diffusers' ``train_dreambooth_lora.py`` through ``accelerate`` and then
converts the resulting ``pytorch_lora_weights.bin`` into a ``.safetensors``
file whose key names follow the layout used by the AUTOMATIC1111 WebUI.
"""
import argparse
import os
import re
import subprocess

# Directory (inside the training image) that holds the diffusers example script.
DREAMBOOTH_DIR = "/root/diffusers/examples/dreambooth"

# Ordered (pattern, replacement) rewrites applied to every checkpoint key.
# Order matters: ".processor." and "_lora.up/down." must be rewritten before
# the generic ".<digits>." rule runs. Dots are escaped so they only match a
# literal "." (the original patterns let "." match any character).
_KEY_REWRITES = (
    (re.compile(r"\.processor\."), "_"),
    (re.compile(r"mid_block\."), "mid_block_"),
    (re.compile(r"_lora\.up\."), ".lora_up."),
    (re.compile(r"_lora\.down\."), ".lora_down."),
    (re.compile(r"\.(\d+)\."), r"_\1_"),
    (re.compile(r"to_out"), "to_out_0"),
)


def webui_lora_key(key):
    """Translate one diffusers LoRA state-dict key into its WebUI-style name.

    Args:
        key: Key as stored in ``pytorch_lora_weights.bin``.

    Returns:
        The rewritten key, prefixed with ``lora_unet_``.
    """
    for pattern, replacement in _KEY_REWRITES:
        key = pattern.sub(replacement, key)
    return "lora_unet_" + key


def bin_to_safetensors(output_path):
    """Convert ``<output_path>/pytorch_lora_weights.bin`` to ``.safetensors``.

    The converted file is written next to the source file as
    ``pytorch_lora_weights.safetensors`` with keys renamed by
    :func:`webui_lora_key`.

    Args:
        output_path: Directory containing the trained LoRA weights.
    """
    # Imported here so that argument parsing and command construction do not
    # require the ML stack to be importable.
    import torch
    from safetensors.torch import save_file

    # map_location="cpu": renaming keys needs no GPU, and the load must not
    # fail if the tensors were saved from a CUDA device.
    checkpoint = torch.load(
        os.path.join(output_path, "pytorch_lora_weights.bin"),
        map_location="cpu",
    )
    converted = {webui_lora_key(key): tensor for key, tensor in checkpoint.items()}

    new_lora_name = "pytorch_lora_weights.safetensors"
    print("Saving " + new_lora_name)
    save_file(converted, os.path.join(output_path, new_lora_name))


def build_train_command(model_name, instance_dir, output_dir, prompt):
    """Build the ``accelerate launch`` argv for DreamBooth LoRA training.

    The command is returned as an argument list (not a shell string), so
    values such as a prompt containing quotes, ``$`` or backticks are passed
    through verbatim instead of being interpreted by a shell.

    Args:
        model_name: Hugging Face repo id or local path of the base model.
        instance_dir: Directory with the instance (training) images.
        output_dir: Directory the LoRA weights are written to.
        prompt: Instance prompt, e.g. "a photo of sks dog".

    Returns:
        List of strings suitable for ``subprocess.run``.
    """
    return [
        "accelerate", "launch", "train_dreambooth_lora.py",
        f"--pretrained_model_name_or_path={model_name}",
        f"--instance_data_dir={instance_dir}",
        f"--output_dir={output_dir}",
        f"--instance_prompt={prompt}",
        "--resolution=512",
        "--train_batch_size=1",
        "--use_8bit_adam",
        "--mixed_precision=fp16",
        "--gradient_accumulation_steps=1",
        "--learning_rate=1e-4",
        "--lr_scheduler=constant",
        "--lr_warmup_steps=0",
        "--max_train_steps=400",
    ]


def main(args):
    """Train the LoRA weights, then convert them to ``.safetensors``.

    Args:
        args: Parsed CLI namespace with ``model_name``, ``input_storage``,
            ``output_storage`` and ``prompt`` attributes.

    Raises:
        subprocess.CalledProcessError: If the training process exits non-zero.
    """
    # The training script is addressed by its bare file name, so run from the
    # directory that contains it.
    os.chdir(DREAMBOOTH_DIR)

    cmd = build_train_command(
        args.model_name, args.input_storage, args.output_storage, args.prompt
    )
    # check=True: stop here when training fails instead of continuing to the
    # conversion step and failing there on a missing weights file.
    subprocess.run(cmd, check=True)

    # Convert .bin file to .safetensors, to be used in the AUTOMATIC1111 WebUI.
    bin_to_safetensors(args.output_storage)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="runwayml/stable-diffusion-v1-5", help="bucket_name/model_folder")
    parser.add_argument("--input_storage", type=str, default="abc", help="/gcs/bucket_name/input_image_folder")
    parser.add_argument("--output_storage", type=str, default="abc", help="/gcs/bucket_name/output_folder")
    parser.add_argument("--prompt", type=str, default="abc", help="a photo of XXX")

    args = parser.parse_args()
    print(args.model_name)
    print(args.input_storage)
    print(args.output_storage)
    print(args.prompt)
    main(args)
# Custom container for running the Stable Diffusion LoRA training notebook
# with the Workbench notebook executor.
FROM gcr.io/deeplearning-platform-release/base-gpu

# A bare `apt update` only leaves package lists behind in a layer. Do the
# update together with the install that needs it (git is required by the clone
# below), then drop the lists.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y git \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /root

# diffusers v0.14.0 plus the requirements of the example training scripts.
RUN git clone -b v0.14.0 https://github.com/huggingface/diffusers.git \
    && pip install /root/diffusers \
    && pip install -U -r /root/diffusers/examples/dreambooth/requirements.txt \
    && pip install -U -r /root/diffusers/examples/text_to_image/requirements.txt \
    && pip install -U xformers \
    && pip install -U bitsandbytes --prefer-binary \
    && pip install -U safetensors

# Installed after the requirements above so the pinned cu117 wheels are the
# torch / torchvision versions that end up in the image.
RUN pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117

# Expose the CUDA lib directory and provide an unversioned libcudart.so
# (looked up by bitsandbytes -- NOTE(review): confirm still required).
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
RUN ln -sf /usr/local/cuda/lib64/libcudart.so.11.0 /usr/local/cuda/lib64/libcudart.so

# Write the default accelerate config (fp16 mixed precision).
RUN accelerate config default --mixed_precision=fp16
] +- name: 'gcr.io/cloud-builders/docker' + args: ['push', 'us-central1-docker.pkg.dev/project_id/artifact_registry_id/sd-training:nbexecutor-v1'] +options: + machineType: 'N1_HIGHCPU_8' + diskSizeGb: '200' \ No newline at end of file diff --git a/Workbench/sd_training_nbexecutor.ipynb b/Workbench/sd_training_nbexecutor.ipynb new file mode 100644 index 0000000..e189a41 --- /dev/null +++ b/Workbench/sd_training_nbexecutor.ipynb @@ -0,0 +1,114 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "8036faf2-ca6b-4fbf-9469-26358eb653bc", + "metadata": { + "id": "8036faf2-ca6b-4fbf-9469-26358eb653bc" + }, + "source": [ + "## Write training code here and Click \"Execute\" for a workbench execute job\n", + "- Use custom container built in Cloud Build and stored in Artifact Registry\n", + "- Cloud Build command: gcloud builds submit --config cloud-build.yaml .\n", + "- input and output directory can be /gcs/bucket_name/folder for Cloud Storage path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2243ab9f-d7db-4db5-836a-154d9616a628", + "metadata": { + "id": "2243ab9f-d7db-4db5-836a-154d9616a628" + }, + "outputs": [], + "source": [ + "MODEL_NAME=\"runwayml/stable-diffusion-v1-5\"\n", + "INSTANCE_DIR=\"/gcs/bucket_name/input_dog\"\n", + "OUTPUT_DIR=\"/gcs/bucket_name/dog_lora_output\"\n", + "\n", + "! 
accelerate launch ./diffusers/examples/dreambooth/train_dreambooth_lora.py \\\n", + " --pretrained_model_name_or_path=$MODEL_NAME \\\n", + " --instance_data_dir=$INSTANCE_DIR \\\n", + " --output_dir=$OUTPUT_DIR \\\n", + " --instance_prompt=\"a photo of sks dog\" \\\n", + " --resolution=512 \\\n", + " --train_batch_size=1 \\\n", + " --use_8bit_adam \\\n", + " --mixed_precision=\"fp16\" \\\n", + " --gradient_accumulation_steps=1 \\\n", + " --learning_rate=1e-4 \\\n", + " --lr_scheduler=\"constant\" \\\n", + " --lr_warmup_steps=0 \\\n", + " --max_train_steps=500" + ] + }, + { + "cell_type": "markdown", + "id": "O25rkc78ggqL", + "metadata": { + "id": "O25rkc78ggqL" + }, + "source": [ + "Convert the lora .bin file to safetensor file, for used in WebUI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d52e7698-122a-4864-ad8c-55d4562c2a94", + "metadata": { + "id": "d52e7698-122a-4864-ad8c-55d4562c2a94" + }, + "outputs": [], + "source": [ + "import os;\n", + "import re;\n", + "import torch;\n", + "from safetensors.torch import save_file;\n", + "\n", + "newDict = dict();\n", + "checkpoint = torch.load(OUTPUT_DIR + '/pytorch_lora_weights.bin');\n", + "for idx, key in enumerate(checkpoint):\n", + " newKey = re.sub('\\.processor\\.', '_', key);\n", + " newKey = re.sub('mid_block\\.', 'mid_block_', newKey);\n", + " newKey = re.sub('_lora.up.', '.lora_up.', newKey);\n", + " newKey = re.sub('_lora.down.', '.lora_down.', newKey);\n", + " newKey = re.sub('\\.(\\d+)\\.', '_\\\\1_', newKey);\n", + " newKey = re.sub('to_out', 'to_out_0', newKey);\n", + " newKey = 'lora_unet_'+newKey;\n", + "\n", + " newDict[newKey] = checkpoint[key];\n", + "\n", + "newLoraName = 'pytorch_lora_weights.safetensors';\n", + "print(\"Saving \" + newLoraName);\n", + "save_file(newDict, OUTPUT_DIR + '/' + newLoraName);" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Pytorch (Local)", + "language": 
"python", + "name": "local-pytorch" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/dog_images/alvan-nee-9M0tSjb-cpA-unsplash.jpeg b/dog_images/alvan-nee-9M0tSjb-cpA-unsplash.jpeg new file mode 100644 index 0000000..e839e5e Binary files /dev/null and b/dog_images/alvan-nee-9M0tSjb-cpA-unsplash.jpeg differ diff --git a/dog_images/alvan-nee-Id1DBHv4fbg-unsplash.jpeg b/dog_images/alvan-nee-Id1DBHv4fbg-unsplash.jpeg new file mode 100644 index 0000000..44d5967 Binary files /dev/null and b/dog_images/alvan-nee-Id1DBHv4fbg-unsplash.jpeg differ diff --git a/dog_images/alvan-nee-bQaAJCbNq3g-unsplash.jpeg b/dog_images/alvan-nee-bQaAJCbNq3g-unsplash.jpeg new file mode 100644 index 0000000..35dd2ab Binary files /dev/null and b/dog_images/alvan-nee-bQaAJCbNq3g-unsplash.jpeg differ diff --git a/dog_images/alvan-nee-brFsZ7qszSY-unsplash.jpeg b/dog_images/alvan-nee-brFsZ7qszSY-unsplash.jpeg new file mode 100644 index 0000000..cd760c6 Binary files /dev/null and b/dog_images/alvan-nee-brFsZ7qszSY-unsplash.jpeg differ diff --git a/dog_images/alvan-nee-eoqnr8ikwFE-unsplash.jpeg b/dog_images/alvan-nee-eoqnr8ikwFE-unsplash.jpeg new file mode 100644 index 0000000..16ab824 Binary files /dev/null and b/dog_images/alvan-nee-eoqnr8ikwFE-unsplash.jpeg differ diff --git a/images/custom_training_status.png b/images/custom_training_status.png new file mode 100644 index 0000000..dcac0c4 Binary files /dev/null and b/images/custom_training_status.png differ diff --git a/images/workbench_executor.png b/images/workbench_executor.png new file mode 100644 index 0000000..bfe3c65 Binary files /dev/null and b/images/workbench_executor.png differ diff --git a/images/workbench_status.png b/images/workbench_status.png new file mode 100644 
index 0000000..8ce6c7e Binary files /dev/null and b/images/workbench_status.png differ