diff --git a/notebooks/en/_toctree.yml b/notebooks/en/_toctree.yml index 72d9c2da..0472a3d5 100644 --- a/notebooks/en/_toctree.yml +++ b/notebooks/en/_toctree.yml @@ -26,6 +26,8 @@ sections: - local: stable_diffusion_interpolation title: Stable Diffusion Interpolation + - local: quantize_stable_diffusion_with_quanto + title: Quantize Stable Diffusion with quanto - title: Multimodal Recipes sections: diff --git a/notebooks/en/quantize_stable_diffusion_with_quanto.ipynb b/notebooks/en/quantize_stable_diffusion_with_quanto.ipynb new file mode 100644 index 00000000..efb6d189 --- /dev/null +++ b/notebooks/en/quantize_stable_diffusion_with_quanto.ipynb @@ -0,0 +1,1067 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Quantize Stable Diffusion with quanto\n", + "\n", + "_Authored by: [Thomas Liang](https://github.com/thliang01)_\n", + "\n", + "\n", + "- Stable Diffusion models are a type of generative AI that specialize in creating high-quality images from textual descriptions. They leverage deep learning techniques to understand and translate text inputs into visual outputs. Quantization, particularly Post Training Quantization (PTQ), is a crucial process in optimizing these models for faster performance and reduced model size, making them more efficient for deployment." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install and setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! pip install --upgrade diffusers accelerate transformers safetensors datasets quanto\n", + "! pip install -q numpy Pillow torchmetrics[image] torch-fidelity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import numpy as np\n", + "import os\n", + "\n", + "import time\n", + "from time import perf_counter\n", + "\n", + "from PIL import Image\n", + "from IPython import display as IPdisplay\n", + "from tqdm.auto import tqdm\n", + "\n", + "from diffusers import DiffusionPipeline\n", + "from diffusers import DDIMScheduler\n", + "from diffusers import AutoencoderKL\n", + "from transformers import logging\n", + "\n", + "logging.set_verbosity_error()\n", + "\n", + "# Check CUDA is available\n", + "print(torch.cuda.is_available())\n", + "\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Base Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_name_or_path = \"stabilityai/stable-diffusion-xl-base-1.0\"\n", + "scheduler = DDIMScheduler.from_pretrained(model_name_or_path, subfolder=\"scheduler\")\n", + "num_inference_steps = 50\n", + "height = 512\n", + "width = 512\n", + "generator = torch.manual_seed(42)\n", + "\n", + "vae = AutoencoderKL.from_pretrained(\n", + " 'madebyollin/sdxl-vae-fp16-fix',\n", + " use_safetensors=True,\n", + " torch_dtype=torch.float16,\n", + ").to('cuda')\n", + "\n", + "pipeline = DiffusionPipeline.from_pretrained(\n", + " model_name_or_path,\n", + " scheduler = scheduler,\n", + " torch_dtype = torch.float16, \n", + " variant = \"fp16\",\n", + " height = height,\n", + " width = width,\n", + " generator = generator,\n", + " num_inference_steps = num_inference_steps,\n", + " vae = vae,\n", + " 
use_safetensors = True, \n", + ").to(device)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prompts and seeds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "queue = []\n", + "\n", + "# Photorealistic portrait (Portrait)\n", + "queue.extend([{\n", + " 'prompt': '3/4 shot, candid photograph of a beautiful 30 year old redhead woman with messy dark hair, peacefully sleeping in her bed, night, dark, light from window, dark shadows, masterpiece, uhd, moody',\n", + " 'seed': 877866767,\n", + "}])\n", + "\n", + "# Creative interior image (Interior)\n", + "queue.extend([{\n", + " 'prompt': 'futuristic living room with big windows, brown sofas, coffee table, plants, cyberpunk city, concept art, earthy colors',\n", + " 'seed': 5567822442,\n", + "}])\n", + "\n", + "# Macro photography (Macro)\n", + "queue.extend([{\n", + " 'prompt': 'macro shot of a bee collecting nectar from lavender flowers',\n", + " 'seed': 2257899409,\n", + "}])\n", + "\n", + "# Rendered 3D image (3D)\n", + "queue.extend([{\n", + " 'prompt': '3d rendered isometric fiji island beach, 3d tile, polygon, cartoony, mobile game',\n", + " 'seed': 987865634,\n", + "}])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Display_images & Memory & Execution time" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Display Single image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompt = \"a photo of an astronaut riding a horse on mars\"\n", + "start = time.time()\n", + "images = pipeline(prompt).images[0]\n", + "end = time.time()\n", + "mem_bytes = torch.cuda.max_memory_allocated()\n", + "images" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Execution time: {(end - start):.3f} sec\")\n", + "print(f\"Memory: {mem_bytes/(10**6):.3f} MB\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Display Multi images" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a generator\n", + "generator = torch.Generator(device='cuda')\n", + "\n", + "# Start a loop to process prompts one by one\n", + "for i, generation in enumerate(queue, start=1):\n", + "\n", + " # We start the counter\n", + " image_start = perf_counter()\n", + "\n", + " # Assign the seed to the generator\n", + " generator.manual_seed(generation['seed'])\n", + "\n", + " # Create the image\n", + " image = pipeline(\n", + " prompt=generation['prompt'],\n", + " generator=generator,\n", + " ).images[0]\n", + "\n", + " # Save the image\n", + " image.save(f'image_{i}.png')\n", + "\n", + " # We stop the counter and save the result\n", + " generation['total_time'] = perf_counter() - image_start\n", + "\n", + "# Print the generation time of each image\n", + "images_totals = ', '.join(map(lambda generation: str(round(generation['total_time'], 1)), queue))\n", + "print('Image time:', images_totals)\n", + "\n", + "# Print the average time\n", + "images_average = round(sum(generation['total_time'] for generation in queue) / len(queue), 1)\n", + "print('Average image time:', images_average)\n", + "\n", + "# Print the Max. memory used\n", + "max_memory = round(torch.cuda.max_memory_allocated(device='cuda') / 1000000000, 2)\n", + "print('Max. 
memory used:', max_memory, 'GB')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show the image_1\n", + "from PIL import Image\n", + "\n", + "img1 = Image.open(\"image_1.png\")\n", + "img1.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show the image_2\n", + "from PIL import Image\n", + "\n", + "img2 = Image.open(\"image_2.png\")\n", + "img2.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show the image_3\n", + "from PIL import Image\n", + "\n", + "img3 = Image.open(\"image_3.png\")\n", + "img3.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show the image_4\n", + "from PIL import Image\n", + "\n", + "img4 = Image.open(\"image_4.png\")\n", + "img4.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluating Diffusion Models (baseline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will evaluate the base model with both the CLIP score and PickScore\n", + "\n", + "* CLIP score\n", + "* PickScore" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### CLIP score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "CLIP, which stands for Contrastive Language-Image Pre-training, is a model developed by OpenAI that learns visual concepts from natural language descriptions. It is trained on a variety of (image, text) pairs and can predict the most relevant text snippet given an image, similar to the zero-shot capabilities of GPT-2 and GPT-3.\n", + "\n", + "The CLIP model consists of a text encoder and an image encoder. These encoders transform the input data (text or image) into a shared multimodal embedding space. The goal of the model is to maximize the cosine similarity between the embeddings of matching image-text pairs while minimizing the cosine similarity between the embeddings of mismatching pairs. This is achieved through a contrastive objective.\n", + "\n", + "The CLIP score, in this context, is the cosine similarity between the image and text embeddings. A higher CLIP score indicates a better match between the image and the text.\n",
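+ "\n", + "Concretely, the number we compute below with `torchmetrics` follows the CLIPScore formulation; as a sketch, writing $E_I$ and $E_C$ for the image and text embeddings:\n", + "\n", + "$$\\text{CLIPScore}(I, C) = \\max(100 \\cdot \\cos(E_I, E_C), 0)$$\n", + "\n", + "so the reported values land roughly on a 0-100 scale, with higher being better."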
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompts = [\n", + " \"a photo of an astronaut riding a horse on mars\",\n", + " \"A high tech solarpunk utopia in the Amazon rainforest\",\n", + " \"A pikachu fine dining with a view to the Eiffel Tower\",\n", + " \"A mecha robot in a favela in expressionist style\",\n", + " \"an insect robot preparing a delicious meal\",\n", + " \"A small cabin on top of a snowy mountain in the style of Disney, artstation\",\n", + "]\n", + "\n", + "images = pipeline(prompts, num_images_per_prompt=1, output_type=\"np\", height = height, width = width).images\n", + "\n", + "print(images.shape)\n", + "# (6, 512, 512, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torchmetrics.functional.multimodal import clip_score\n", + "from functools import partial\n", + "\n", + "clip_score_fn = partial(clip_score, model_name_or_path=\"openai/clip-vit-base-patch16\")\n", + "\n", + "def calculate_clip_score(images, prompts):\n", + " images_int = (images * 255).astype(\"uint8\")\n", + " clip_score = clip_score_fn(torch.from_numpy(images_int).permute(0, 3, 1, 2), prompts).detach()\n", + " return round(float(clip_score), 4)\n", + "\n", + "sd_clip_score = calculate_clip_score(images, prompts)\n", + "print(f\"CLIP score: {sd_clip_score}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### PickScore" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TODO\n", + "![PickScore](https://huggingface.co/datasets/huggingface/cookbook-images/)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Abstract\n", + "\n", + "The collection of large datasets of human preferences from text-to-image users is typically a privilege reserved for corporations, leaving such datasets out of public reach. To tackle this problem, we've developed a web application that allows text-to-image users to generate images and express their preferences. This application has been instrumental in the creation of Pick-a-Pic, an extensive, publicly accessible dataset of text-to-image prompts and genuine user preferences for generated images.\n", + "\n", + "\n", + "We've utilized this dataset to train a CLIP-based scoring function, known as PickScore, which has shown extraordinary performance in predicting human preferences. We've also tested PickScore's capability in model evaluation and found that it aligns better with human rankings compared to other automatic evaluation metrics." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import\n", + "from transformers import AutoProcessor, AutoModel\n", + "from PIL import Image\n", + "import torch\n", + "\n", + "# load model\n", + "device = \"cuda\"\n", + "processor_name_or_path = \"laion/CLIP-ViT-H-14-laion2B-s32B-b79K\"\n", + "model_pretrained_name_or_path = \"yuvalkirstain/PickScore_v1\"\n", + "processor = AutoProcessor.from_pretrained(processor_name_or_path)\n", + "model = AutoModel.from_pretrained(model_pretrained_name_or_path).eval().to(device)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Score function adapted from their docs\n", + "def get_scores(prompt, images):\n", + " \n", + " # preprocess\n", + " image_inputs = processor(\n", + " images=images,\n", + " padding=True,\n", + " truncation=True,\n", + " max_length=77,\n", + " return_tensors=\"pt\",\n", + " ).to(device)\n", + " \n", + " text_inputs = processor(\n", + " text=prompt,\n", + " padding=True,\n", + " truncation=True,\n", + " max_length=77,\n", + " return_tensors=\"pt\",\n", + " ).to(device)\n", + "\n", + "\n", + " with torch.no_grad():\n", + " # embed\n", + " image_embs = model.get_image_features(**image_inputs)\n", + " image_embs = image_embs / torch.norm(image_embs, dim=-1, keepdim=True)\n", + " \n", + " text_embs = model.get_text_features(**text_inputs)\n", + " text_embs = text_embs / torch.norm(text_embs, dim=-1, keepdim=True)\n", + " \n", + " # score\n", + " scores = model.logit_scale.exp() * (text_embs @ image_embs.T)[0]\n", + " \n", + " return scores.cpu().tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "get_scores(\"a photo of an astronaut riding a horse on mars\", images)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "get_scores(\"a photo of a pretty flower\", images)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "pap = load_dataset(\"yuvalkirstain/pickapic_v1_no_images\")\n", + "prompts = pap['validation_unique']['caption']\n", + "prompts[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Measuring the effect of CFG_Scale on Score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from IPython.display import clear_output\n", + "\n", + "average_scores = []\n", + "cfg_scales = [2, 9, 12, 30]\n", + "for cfg_scale in cfg_scales:\n", + " scores = []\n", + " for i, prompt in enumerate(prompts[:5]):\n", + " print(f\"Scale {cfg_scale}, prompt {i}\")\n", + " generator.manual_seed(42) # Re-seed so every scale starts from the same noise (for reproducibility)\n", + " im = pipeline(prompt, num_inference_steps=50, \n", + " generator=generator, guidance_scale=cfg_scale).images[0]\n", + " scores.append(get_scores(prompt, im)[0])\n", + " clear_output(wait=True)\n", + " average_scores.append(sum(scores)/len(scores))\n", + "\n", + "plt.plot(cfg_scales, average_scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using A Score Model for Re-Ranking" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_good_image(prompt):\n", + " images = []\n", + " # Generate 2 images with two different guidance scales (for example):\n", + " images += pipeline(prompt, num_inference_steps=50, num_images_per_prompt=1,height = height, width = width).images\n", + " images += pipeline(prompt, num_inference_steps=50, num_images_per_prompt=1,height = height, width = width, guidance_scale=5).images \n", + " # Score them and pick the best one\n", + " scores = get_scores(prompt, images)\n", + " best_image = images[scores.index(max(scores))]\n", + " return best_image\n", + "\n", + "generate_good_image(\"a photo of an astronaut riding a horse on mars\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quantize Stable Diffusion with quanto\n", + "\n", + "Let's apply Post-Training Quantization (PTQ) to our base model." + ] + },
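+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before quantizing the full SDXL UNet and VAE, here is a minimal sketch of quanto's post-training-quantization workflow on a toy module (the module and layer sizes below are made up purely for illustration): `quantize` marks the supported layers for quantization and `freeze` then replaces their float weights with the quantized ones." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Minimal quanto PTQ sketch on a toy module (illustrative only)\n", + "import torch\n", + "from quanto import quantize, freeze, qint8\n", + "\n", + "toy = torch.nn.Sequential(\n", + "    torch.nn.Linear(64, 64),\n", + "    torch.nn.ReLU(),\n", + "    torch.nn.Linear(64, 8),\n", + ")\n", + "\n", + "# Mark the Linear layers for int8 weight quantization ...\n", + "quantize(toy, weights=qint8)\n", + "# ... then freeze to materialize the int8 weights\n", + "freeze(toy)\n", + "\n", + "# The quantized module is used like the original one\n", + "with torch.no_grad():\n", + "    print(toy(torch.randn(1, 64)).shape)" + ] + },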
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from quanto import quantize, freeze, qint8\n", + "import torch\n", + "\n", + "model = \"stabilityai/stable-diffusion-xl-base-1.0\"\n", + "\n", + "print(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vae = AutoencoderKL.from_pretrained(\n", + " 'madebyollin/sdxl-vae-fp16-fix',\n", + " torch_dtype=torch.float16,\n", + " use_safetensors=True,\n", + ").to(device)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def PTQ(torch_dtype, unet_dtype=None, device=\"cuda\"):\n", + " pipe = DiffusionPipeline.from_pretrained(\n", + " pretrained_model_name_or_path=\"stabilityai/stable-diffusion-xl-base-1.0\", \n", + " torch_dtype=torch_dtype,\n", + " scheduler = scheduler,\n", + " height = height,\n", + " width = width,\n", + " generator = generator,\n", + " num_inference_steps = num_inference_steps,\n", + " vae=vae, \n", + " use_safetensors=True\n", + " ).to(device)\n", + "\n", + " # Quantize and freeze the UNet and VAE weights with quanto\n", + " if unet_dtype:\n", + " quantize(pipe.unet, weights=unet_dtype)\n", + " quantize(pipe.vae, weights=unet_dtype)\n", + " freeze(pipe.unet)\n", + " freeze(pipe.vae)\n", + "\n", + " pipe.set_progress_bar_config(disable=True)\n", + " return pipe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "qpipe = PTQ(torch_dtype=torch.float16, unet_dtype=qint8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prompts and seeds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "queue = []\n", + "\n", + "# Photorealistic portrait (Portrait)\n", + "queue.extend([{\n", + " 'prompt': '3/4 shot, candid photograph of a beautiful 30 year old redhead woman with messy dark hair, peacefully sleeping in her bed, night, dark, light from window, dark shadows, masterpiece, uhd, moody',\n", + " 'seed': 877866767,\n", + "}])\n", + "\n", + "# Creative interior image (Interior)\n", + "queue.extend([{\n", + " 'prompt': 'futuristic living room with big windows, brown sofas, coffee table, plants, cyberpunk city, concept art, earthy colors',\n", + " 'seed': 5567822442,\n", + "}])\n", + "\n", + "# Macro photography (Macro)\n", + "queue.extend([{\n", + " 'prompt': 'macro shot of a bee collecting nectar from lavender flowers',\n", + " 'seed': 2257899409,\n", + "}])\n", + "\n", + "# Rendered 3D image (3D)\n", + "queue.extend([{\n", + " 'prompt': '3d rendered isometric fiji island beach, 3d tile, polygon, cartoony, mobile game',\n", + " 'seed': 987865634,\n", + "}])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 
Display_images & Memory & Execution time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompt = \"a photo of an astronaut riding a horse on mars\"\n", + "start = time.time()\n", + "images = qpipe(prompt).images[0]\n", + "end = time.time()\n", + "mem_bytes = torch.cuda.max_memory_allocated()\n", + "images" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Execution time: {(end - start):.3f} sec\")\n", + "print(f\"Memory: {mem_bytes/(10**6):.3f} MB\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Display Multi images" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a generator\n", + "generator = torch.Generator(device='cuda')\n", + "\n", + "# Start a loop to process prompts one by one\n", + "for i, generation in enumerate(queue, start=1):\n", + "\n", + " # We start the counter\n", + " image_start = perf_counter()\n", + "\n", + " # Assign the seed to the generator\n", + " generator.manual_seed(generation['seed'])\n", + "\n", + " # Create the image\n", + " image = qpipe(\n", + " prompt=generation['prompt'],\n", + " generator=generator,\n", + " ).images[0]\n", + "\n", + " # Save the image\n", + " image.save(f'q_image_{i}.png')\n", + "\n", + " # We stop the counter and save the result\n", + " generation['total_time'] = perf_counter() - image_start\n", + "\n", + "# Print the generation time of each image\n", + "images_totals = ', '.join(map(lambda generation: str(round(generation['total_time'], 1)), queue))\n", + "print('Image time:', images_totals)\n", + "\n", + "# Print the average time\n", + "images_average = round(sum(generation['total_time'] for generation in queue) / len(queue), 1)\n", + "print('Average image time:', images_average)\n", + "\n", + "# Print the Max. memory used\n", + "max_memory = round(torch.cuda.max_memory_allocated(device='cuda') / 1000000000, 2)\n", + "print('Max. 
memory used:', max_memory, 'GB')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show the q_image_1\n", + "from PIL import Image\n", + "\n", + "qimg1 = Image.open(\"q_image_1.png\")\n", + "qimg1.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show the q_image_2\n", + "from PIL import Image\n", + "\n", + "qimg2 = Image.open(\"q_image_2.png\")\n", + "qimg2.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show the image_3\n", + "from PIL import Image\n", + "\n", + "qimg3 = Image.open(\"q_image_3.png\")\n", + "qimg3.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show the q_image_4\n", + "from PIL import Image\n", + "\n", + "qimg4 = Image.open(\"q_image_4.png\")\n", + "qimg4.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluating Diffusion Models After (Post Training Quantization)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* CLIP score\n", + "* PickScore" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### CLIP score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompts = [\n", + " \"a photo of an astronaut riding a horse on mars\",\n", + " \"A high tech solarpunk utopia in the Amazon rainforest\",\n", + " \"A pikachu fine dining with a view to the Eiffel Tower\",\n", + " \"A mecha robot in a favela in expressionist style\",\n", + " \"an insect robot preparing a delicious meal\",\n", + " \"A small cabin on top of a snowy mountain in the style of Disney, artstation\",\n", + "]\n", + "\n", + "images = qpipe(prompts, num_images_per_prompt=1, output_type=\"np\", height = height, width = width).images\n", + "\n", + "print(images.shape)\n", + "# (6, 512, 512, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torchmetrics.functional.multimodal import clip_score\n", + "from functools import partial\n", + "\n", + "clip_score_fn = partial(clip_score, model_name_or_path=\"openai/clip-vit-base-patch16\")\n", + "\n", + "def calculate_clip_score(images, prompts):\n", + " images_int = (images * 255).astype(\"uint8\")\n", + " clip_score = clip_score_fn(torch.from_numpy(images_int).permute(0, 3, 1, 2), prompts).detach()\n", + " return round(float(clip_score), 4)\n", + "\n", + "sd_clip_score = calculate_clip_score(images, prompts)\n", + "print(f\"CLIP score: {sd_clip_score}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### PickScore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import\n", + "from transformers import AutoProcessor, AutoModel\n", + "from PIL import Image\n", + "import torch\n", + "\n", + "# load model\n", + "device = \"cuda\"\n", + "processor_name_or_path = \"laion/CLIP-ViT-H-14-laion2B-s32B-b79K\"\n", + "model_pretrained_name_or_path = \"yuvalkirstain/PickScore_v1\"\n", + "processor = AutoProcessor.from_pretrained(processor_name_or_path)\n", + "model = AutoModel.from_pretrained(model_pretrained_name_or_path).eval().to(device)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Score 
function adapted from their docs\n", + "def get_scores(prompt, images):\n", + " \n", + " # preprocess\n", + " image_inputs = processor(\n", + " images=images,\n", + " padding=True,\n", + " truncation=True,\n", + " max_length=77,\n", + " return_tensors=\"pt\",\n", + " ).to(device)\n", + " \n", + " text_inputs = processor(\n", + " text=prompt,\n", + " padding=True,\n", + " truncation=True,\n", + " max_length=77,\n", + " return_tensors=\"pt\",\n", + " ).to(device)\n", + "\n", + "\n", + " with torch.no_grad():\n", + " # embed\n", + " image_embs = model.get_image_features(**image_inputs)\n", + " image_embs = image_embs / torch.norm(image_embs, dim=-1, keepdim=True)\n", + " \n", + " text_embs = model.get_text_features(**text_inputs)\n", + " text_embs = text_embs / torch.norm(text_embs, dim=-1, keepdim=True)\n", + " \n", + " # score\n", + " scores = model.logit_scale.exp() * (text_embs @ image_embs.T)[0]\n", + " \n", + " return scores.cpu().tolist()\n", + "\n", + "get_scores(\"a photo of an astronaut riding a horse on mars\", images)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "get_scores(\"a photo of a pretty flower\", images)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "pap = load_dataset(\"yuvalkirstain/pickapic_v1_no_images\")\n", + "prompts = pap['validation_unique']['caption']\n", + "prompts[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Measuring the effect of CFG_Scale on Score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from IPython.display import clear_output\n", + "\n", + "average_scores = []\n", + "cfg_scales = [2, 9, 12, 30]\n", + "for cfg_scale in cfg_scales:\n", + " scores = []\n", + " for i, prompt in enumerate(prompts[:5]):\n", + " print(f\"Scale {cfg_scale}, prompt {i}\")\n", + " generator.manual_seed(42) # Re-seed so every scale starts from the same noise (for reproducibility)\n", + " im = qpipe(prompt, num_inference_steps=50, \n", + " generator=generator, guidance_scale=cfg_scale).images[0]\n", + " scores.append(get_scores(prompt, im)[0])\n", + " clear_output(wait=True)\n", + " average_scores.append(sum(scores)/len(scores))\n", + "\n", + "plt.plot(cfg_scales, average_scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using A Score Model for Re-Ranking" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_good_image(prompt):\n", + " images = []\n", + " # Generate 2 images with two different guidance scales (for example):\n", + " images += qpipe(prompt, num_inference_steps=50, num_images_per_prompt=1,height = height, width = width).images\n", + " images += qpipe(prompt, num_inference_steps=50, num_images_per_prompt=1,height = height, width = width, guidance_scale=5).images \n", + " # Score them and pick the best one\n", + " scores = get_scores(prompt, images)\n", + " best_image = images[scores.index(max(scores))]\n", + " return best_image\n", + "\n", + "generate_good_image(\"a photo of an astronaut riding a horse on mars\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Model | Memory | Execution time |\n", + "| :-----| ----: | :----: |\n", + "| SDXL | 9.62 GB | 18.6 sec | \n", + "| PTQ SDXL | 14.08 
GB | 22.2 sec |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Model | CLIP score | PickScore |\n", + "| :-----| ----: | :----: |\n", + "| SDXL | 31.4792 | 22.4 |\n", + "| PTQ SDXL | 35.8455 | 22.3 |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* [Diffusers](https://huggingface.co/docs/diffusers/)\n", + "* [Evaluating Diffusion Models](https://huggingface.co/docs/diffusers/conceptual/evaluation#text-guided-image-generation)\n", + "* [Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image Generation](https://arxiv.org/abs/2305.01569)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cloudspace", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}