From e3be1a1758ad6372c6ff5c7700a985a79d54de82 Mon Sep 17 00:00:00 2001
From: Jaret Burkett
Date: Sun, 13 Aug 2023 13:52:38 -0600
Subject: [PATCH] Added WIP slider training colab

---
 notebooks/SliderTraining.ipynb | 340 +++++++++++++++++++++++++++++++++
 1 file changed, 340 insertions(+)
 create mode 100644 notebooks/SliderTraining.ipynb

diff --git a/notebooks/SliderTraining.ipynb b/notebooks/SliderTraining.ipynb
new file mode 100644
index 00000000..6ae8b8f0
--- /dev/null
+++ b/notebooks/SliderTraining.ipynb
@@ -0,0 +1,340 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "machine_shape": "hm",
+      "gpuType": "V100"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# AI Toolkit by Ostris\n",
+        "## Slider Training\n",
+        "\n",
+        "This is a quick colab demo for training sliders like those found on my CivitAI profile https://civitai.com/user/Ostris/models . I will work on making it more user friendly, but for now this will get you started."
+      ],
+      "metadata": {
+        "collapsed": false
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!git clone https://github.com/ostris/ai-toolkit"
+      ],
+      "metadata": {
+        "id": "BvAG0GKAh59G"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "XGZqVER_aQJW"
+      },
+      "outputs": [],
+      "source": [
+        "!cd ai-toolkit && git submodule update --init --recursive && pip install -r requirements.txt\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "import sys\n",
+        "sys.path.append('/content/ai-toolkit')\n",
+        "from toolkit.job import run_job\n",
+        "from collections import OrderedDict\n",
+        "from PIL import Image"
+      ],
+      "metadata": {
+        "collapsed": false
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Setup\n",
+        "\n",
+        "This is your config. It is documented fairly well inline. Normally you would keep it as a yaml file, but for colab a Python dict works. It will run as is without modification, but feel free to edit it as you want. An optional cell after the config shows one way to write it back out as yaml for reuse outside colab."
+      ],
+      "metadata": {
+        "id": "N8UUFzVRigbC"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from collections import OrderedDict\n",
+        "\n",
+        "job_to_run = OrderedDict({\n",
+        "    # This is the config I use for my sliders. It is solid and tested.\n",
+        "    'job': 'train',\n",
+        "    'config': {\n",
+        "        # the name will be used to create a folder in the output folder.\n",
+        "        # it will also replace any [name] token in the rest of this config\n",
+        "        'name': 'detail_slider_v1',\n",
+        "        # a folder with the name above will be created in the folder below.\n",
+        "        # it can be relative to the project root or absolute\n",
+        "        'training_folder': \"output/LoRA\",\n",
+        "        'device': 'cuda',  # cpu, cuda:0, etc\n",
+        "        # for tensorboard logging, a subfolder is made for this job\n",
+        "        'log_dir': \"output/.tensorboard\",\n",
+        "        # you can stack processes for other jobs, but that is not tested with sliders.\n",
+        "        # just use one for now\n",
+        "        'process': [\n",
+        "            {\n",
+        "                'type': 'slider',  # tells the runner to run the slider process\n",
+        "                # network is the LoRA network for a slider. I recommend leaving this as is\n",
+        "                'network': {\n",
+        "                    'type': \"lora\",\n",
+        "                    # rank / dim of the network. Bigger is not always better, especially for sliders. 8 is good\n",
+        "                    'linear': 8,  # \"rank\" or \"dim\"\n",
+        "                    'linear_alpha': 4,  # \"alpha\", do about half of rank\n",
+        "                    # 'conv': 4,  # for convolutional layers \"locon\"\n",
+        "                    # 'conv_alpha': 4,  # do about half of conv\n",
+        "                },\n",
+        "                # training config\n",
+        "                'train': {\n",
+        "                    # this is also used in sampling. Stick with ddpm unless you know what you are doing\n",
+        "                    'noise_scheduler': \"ddpm\",  # or \"lms\", \"euler_a\"\n",
+        "                    # how many steps to train. More is not always better. I rarely go over 1000\n",
+        "                    'steps': 100,\n",
+        "                    # I have had good results with 4e-4 to 1e-4 at 500 steps\n",
+        "                    'lr': 2e-4,\n",
+        "                    # enables gradient checkpointing, saves vram, leave it on\n",
+        "                    'gradient_checkpointing': True,\n",
+        "                    # train the unet. I recommend leaving this true\n",
+        "                    'train_unet': True,\n",
+        "                    # train the text encoder. I don't recommend this unless you have a special use case.\n",
+        "                    # for sliders we are adjusting the representation of the concept (unet),\n",
+        "                    # not the description of it (text encoder)\n",
+        "                    'train_text_encoder': False,\n",
+        "\n",
+        "                    # just leave this unless you know what you are doing.\n",
+        "                    # \"dadaptation\" is also supported, but set lr to 1 if you use it;\n",
+        "                    # it learns too fast and I don't recommend it\n",
+        "                    'optimizer': \"adamw\",\n",
+        "                    # only constant for now\n",
+        "                    'lr_scheduler': \"constant\",\n",
+        "                    # we denoise a random number of steps, from 1 to this number,\n",
+        "                    # while training. Just leave it\n",
+        "                    'max_denoising_steps': 40,\n",
+        "                    # works great at 1. I do 1 even with my 4090.\n",
+        "                    # higher may not work right with the newer single batch stacking code anyway\n",
+        "                    'batch_size': 1,\n",
+        "                    # bf16 works best if your GPU supports it (most modern GPUs do)\n",
+        "                    'dtype': 'bf16',  # fp32, bf16, fp16\n",
+        "                    # I don't recommend using this unless you are trying to make a darker lora. Then do 0.1 MAX.\n",
+        "                    # although, the way we train sliders is comparative, so it probably won't work anyway\n",
+        "                    'noise_offset': 0.0,\n",
+        "                },\n",
+        "\n",
+        "                # the model to train the LoRA network on\n",
+        "                'model': {\n",
+        "                    # name_or_path can be a hugging face name, a local path, or a url to a model\n",
+        "                    # on civitai, with or without a modelVersionId. Models are cached in the /model folder\n",
+        "                    # epicRealism v5\n",
+        "                    'name_or_path': \"https://civitai.com/models/25694?modelVersionId=134065\",\n",
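+        "                    # other accepted forms, for example (the paths below are hypothetical):\n",
+        "                    # 'name_or_path': \"runwayml/stable-diffusion-v1-5\",  # hugging face name\n",
+        "                    # 'name_or_path': \"/content/models/epicRealism_v5.safetensors\",  # local path\n",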
+        "                    'is_v2': False,  # for v2 models\n",
+        "                    'is_v_pred': False,  # for v-prediction models (most v2 models)\n",
+        "                    # SDXL has some issues with the dual text encoder and the way we train sliders;\n",
+        "                    # it works, but weights probably need to be higher to see the effect.\n",
+        "                    'is_xl': False,  # for SDXL models\n",
+        "                },\n",
+        "\n",
+        "                # saving config\n",
+        "                'save': {\n",
+        "                    'dtype': 'float16',  # precision to save. I recommend float16\n",
+        "                    'save_every': 50,  # save every this many steps\n",
+        "                    # this will remove step saves beyond this count.\n",
+        "                    # allows you to save more often in case of a crash without filling up your drive\n",
+        "                    'max_step_saves_to_keep': 2,\n",
+        "                },\n",
+        "\n",
+        "                # sampling config\n",
+        "                'sample': {\n",
+        "                    # must match train.noise_scheduler. It is not used here yet,\n",
+        "                    # but may be in the future and in other processes\n",
+        "                    'sampler': \"ddpm\",\n",
+        "                    # sample every this many steps\n",
+        "                    'sample_every': 20,\n",
+        "                    # image size\n",
+        "                    'width': 512,\n",
+        "                    'height': 512,\n",
+        "                    # prompts to use for sampling. Do as many as you want, but it slows down training.\n",
+        "                    # pick ones that will best represent the concept you are trying to adjust.\n",
+        "                    # some flags are allowed after the prompt:\n",
+        "                    # --m [number]  # network multiplier (LoRA weight). -3 for the negative side and 3 for\n",
+        "                    #               # the positive side are good tests. Inherits sample.network_multiplier if not set\n",
+        "                    # --n [string]  # negative prompt. Inherits sample.neg if not set\n",
+        "                    # only 75 tokens allowed currently.\n",
+        "                    # I like to do a wide positive and negative spread so I can see a good range and stop\n",
+        "                    # early if the network is breaking down\n",
+        "                    'prompts': [\n",
+        "                        \"a woman in a coffee shop, black hat, blonde hair, blue jacket --m -5\",\n",
+        "                        \"a woman in a coffee shop, black hat, blonde hair, blue jacket --m -3\",\n",
+        "                        \"a woman in a coffee shop, black hat, blonde hair, blue jacket --m 3\",\n",
+        "                        \"a woman in a coffee shop, black hat, blonde hair, blue jacket --m 5\",\n",
+        "                        \"a golden retriever sitting on a leather couch --m -5\",\n",
+        "                        \"a golden retriever sitting on a leather couch --m -3\",\n",
+        "                        \"a golden retriever sitting on a leather couch --m 3\",\n",
+        "                        \"a golden retriever sitting on a leather couch --m 5\",\n",
+        "                        \"a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m -5\",\n",
+        "                        \"a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m -3\",\n",
+        "                        \"a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m 3\",\n",
+        "                        \"a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m 5\",\n",
+        "                    ],\n",
+        "                    # negative prompt applied to all prompts above by default if they don't set one\n",
+        "                    'neg': \"cartoon, fake, drawing, illustration, cgi, animated, anime, monochrome\",\n",
+        "                    # seed for sampling. 42 is the answer for everything\n",
+        "                    'seed': 42,\n",
+        "                    # walks the seed so s1 is 42, s2 is 43, s3 is 44, etc.\n",
+        "                    # it starts over on the next sample_every, so s1 is always the seed.\n",
+        "                    # works well if you use the same prompt but want different results\n",
+        "                    'walk_seed': False,\n",
+        "                    # cfg scale (4 to 10 is good)\n",
+        "                    'guidance_scale': 7,\n",
+        "                    # sampler steps (20 to 30 is good)\n",
+        "                    'sample_steps': 20,\n",
+        "                    # default network multiplier for all prompts.\n",
+        "                    # since we are training a slider, I recommend overriding this with --m [number]\n",
+        "                    # in the prompts above to get both sides of the slider\n",
+        "                    'network_multiplier': 1.0,\n",
+        "                },\n",
+        "\n",
+        "                # logging information\n",
+        "                'logging': {\n",
+        "                    'log_every': 10,  # log every this many steps\n",
+        "                    'use_wandb': False,  # not supported yet\n",
+        "                    'verbose': False,  # probably not needed unless you are debugging\n",
+        "                },\n",
+        "\n",
+        "                # slider training config, best for last\n",
+        "                'slider': {\n",
+        "                    # resolutions to train on, as [width, height]. This is less important for sliders,\n",
+        "                    # as we are not teaching the model anything it doesn't already know,\n",
+        "                    # but it must be a size the model understands: [512, 512] for sd_v1.5,\n",
+        "                    # [768, 768] for sd_v2.1, and [1024, 1024] for sd_xl.\n",
+        "                    # you can do as many as you want here\n",
+        "                    'resolutions': [\n",
+        "                        [512, 512],\n",
+        "                        # [512, 768],\n",
+        "                        # [768, 768],\n",
+        "                    ],\n",
+        "                    # slider training uses 4 combined passes for a single round. This does them in one gradient\n",
+        "                    # step. It is highly optimized and shouldn't take any more vram than doing without it,\n",
+        "                    # since batches are broken down for gradient accumulation now. Just leave it on.\n",
+        "                    'batch_full_slide': True,\n",
+        "                    # these are the concepts to train on. You can do as many as you want here,\n",
+        "                    # but they can conflict with or outweigh each other. Other than for experimenting,\n",
+        "                    # I recommend just doing one for good results\n",
+        "                    'targets': [\n",
+        "                        # target_class is the base concept we are adjusting the representation of.\n",
+        "                        # for example, if we are adjusting the representation of a person, we would use \"person\";\n",
+        "                        # if we are adjusting the representation of a cat, we would use \"cat\". It is not\n",
+        "                        # necessarily a keyword, but what the model understands the concept to represent.\n",
+        "                        # \"person\" will affect men, women, children, etc. but will not affect cats, dogs, etc.\n",
+        "                        # it is the model's base general understanding of the concept and everything it represents.\n",
+        "                        # you can leave it blank to affect everything. In this example, we are adjusting\n",
+        "                        # detail, so we will leave it blank to affect everything\n",
+        "                        {\n",
+        "                            'target_class': \"\",\n",
+        "                            # positive is the prompt for the positive side of the slider:\n",
+        "                            # the concept that is excited and amplified in the model when we slide the slider\n",
+        "                            # to the positive side, and forgotten / inverted when we slide\n",
+        "                            # the slider to the negative side. It is generally best to include the target_class in\n",
+        "                            # the prompt. You want it to be the extreme of what you want to train on. For example,\n",
+        "                            # if you want to train on fat people, you would use \"an extremely fat, morbidly obese person\"\n",
+        "                            # as the prompt, not just \"fat person\".\n",
+        "                            # max 75 tokens for now\n",
+        "                            'positive': \"high detail, 8k, intricate, detailed, high resolution, high res, high quality\",\n",
+        "                            # negative is the prompt for the negative side of the slider and works the same as positive.\n",
+        "                            # it does not necessarily work the same as a negative prompt when generating images;\n",
+        "                            # these need to be polar opposites.\n",
+        "                            # max 75 tokens for now\n",
+        "                            'negative': \"blurry, boring, fuzzy, low detail, low resolution, low res, low quality\",\n",
+        "                            # the loss for this target is multiplied by this number.\n",
+        "                            # if you are doing more than one target, it may be good to set less important ones\n",
+        "                            # to a lower number like 0.1 so they don't outweigh the primary target\n",
+        "                            'weight': 1.0,\n",
+        "                        },\n",
+        "                    ],\n",
+        "                },\n",
+        "            },\n",
+        "        ]\n",
+        "    },\n",
+        "\n",
+        "    # you can put any information you want here, and it will be saved in the model.\n",
+        "    # the below is an example, but you could put your grocery list in it if you wanted.\n",
+        "    # it is saved in the model, so be aware of that. The software will include this\n",
+        "    # plus some other information for you automatically\n",
+        "    'meta': {\n",
+        "        # [name] gets replaced with the name above\n",
+        "        'name': \"[name]\",\n",
+        "        'version': '1.0',\n",
+        "        # 'creator': {\n",
+        "        #     'name': 'your name',\n",
+        "        #     'email': 'your@gmail.com',\n",
+        "        #     'website': 'https://your.website'\n",
+        "        # }\n",
+        "    }\n",
+        "})\n"
+      ],
+      "metadata": {
+        "id": "_t28QURYjRQO"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
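+    {
+      "cell_type": "markdown",
+      "source": [
+        "### Optional: save the config as yaml\n",
+        "\n",
+        "As noted in the setup section, this config is normally kept as a yaml file. The sketch below writes the dict out as `slider_config.yaml` (a hypothetical filename) so you can reuse it outside colab. It assumes `pyyaml` is available, which it is in colab by default."
+      ],
+      "metadata": {
+        "collapsed": false
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import json\n",
+        "import yaml\n",
+        "\n",
+        "# round-trip through json to turn nested OrderedDicts into plain dicts,\n",
+        "# which yaml.safe_dump can serialize\n",
+        "plain_config = json.loads(json.dumps(job_to_run))\n",
+        "with open('slider_config.yaml', 'w') as f:\n",
+        "    yaml.safe_dump(plain_config, f, sort_keys=False)\n",
+        "print(open('slider_config.yaml').read()[:500])  # preview the first part of the file"
+      ],
+      "metadata": {
+        "collapsed": false
+      },
+      "execution_count": null,
+      "outputs": []
+    },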
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Run it\n",
+        "\n",
+        "The cell below does all the magic. Check your folders to the left. Items will be in output/LoRA/your_name_v1, and periodic samples are saved in the samples folder. Viewing them doesn't work great in colab yet; I'll update that soon. The optional cell after it shows one way to preview the latest samples inline."
+      ],
+      "metadata": {
+        "id": "h6F1FlM2Wb3l"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "run_job(job_to_run)\n"
+      ],
+      "metadata": {
+        "id": "HkajwI8gteOh"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
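+    {
+      "cell_type": "markdown",
+      "source": [
+        "### Optional: preview the latest samples\n",
+        "\n",
+        "A minimal sketch for viewing sample images inline, assuming the default job name `detail_slider_v1` from the config above; change the path if you renamed the job. It simply shows the most recently written files in the samples folder."
+      ],
+      "metadata": {
+        "collapsed": false
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import glob\n",
+        "import os\n",
+        "from IPython.display import display\n",
+        "from PIL import Image\n",
+        "\n",
+        "# assumes the job name from the config above; adjust if you changed it\n",
+        "samples_dir = 'output/LoRA/detail_slider_v1/samples'\n",
+        "files = sorted(glob.glob(os.path.join(samples_dir, '*')), key=os.path.getmtime)\n",
+        "for path in files[-4:]:  # show the four most recent samples\n",
+        "    print(path)\n",
+        "    display(Image.open(path))"
+      ],
+      "metadata": {
+        "collapsed": false
+      },
+      "execution_count": null,
+      "outputs": []
+    },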
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Done\n",
+        "\n",
+        "Check your output dir and get your slider.\n"
+      ],
+      "metadata": {
+        "id": "Hblgb5uwW5SD"
+      }
+    }
+  ]
+}