# Initial content for Dockercon (#477)
Showing 8 changed files with 281 additions and 0 deletions.

**`.gitignore`** (ignores downloaded models):

```
models/
```

**`.gitignore`** (ignores W&B run files and downloaded models):

```
wandb/
models/
```

**`Dockerfile`**:

```dockerfile
ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
FROM nvidia/cuda:${CUDA_IMAGE}

# We need to set the host to 0.0.0.0 to allow outside access
ENV HOST 0.0.0.0

RUN apt-get update && apt-get upgrade -y \
    && apt-get install -y git build-essential \
    python3 python3-pip gcc wget \
    ocl-icd-opencl-dev opencl-headers clinfo \
    libclblast-dev libopenblas-dev \
    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd

# Set build-related env vars
ENV CUDA_DOCKER_ARCH=all
ENV LLAMA_CUBLAS=1
ENV PYTHONUNBUFFERED=1

# Install dependencies
RUN python3 -m pip install --no-cache-dir --upgrade pip pytest cmake scikit-build setuptools fastapi \
    uvicorn sse-starlette pydantic-settings starlette-context openai wandb timeout-decorator

# Install llama-cpp-python (built with CUDA)
RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install --no-cache-dir llama-cpp-python

COPY app /app
WORKDIR /app

# Run the evaluation
CMD python3 evaluate.py
```
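
The image builds llama-cpp-python with cuBLAS support, copies an `app/` directory (which holds `evaluate.py`) into the container, and expects model weights to be mounted at `/var/models` at run time, as in the `docker run` command in the README below.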

**`README.md`**:

# Overview

This script automates the evaluation of different LLMs with W&B. It was originally used at Dockercon '23. The default dataset attempts to translate English commands into Docker CLI commands; see `eval.jsonl`. All of the logic is in `evaluate.py`.
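
For example, one line of `eval.jsonl` pairs a plain-English request with the ideal command (this pair appears in the dataset below):

```json
{"input": "Display all the running containers.", "ideal": "docker ps"}
```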

# Documentation

## Setup W&B

```bash
pip install wandb

# Find your API key at https://wandb.ai/authorize
export WANDB_API_KEY=XXX
# Find your OpenAI API key at https://platform.openai.com/account/api-keys
export OPENAI_API_KEY=XXX
```

## Download Models

```bash
python download_models.py
```
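
This pulls three quantized GGUF builds (CodeLlama 7B, CodeLlama 13B Instruct, and Mistral 7B Instruct v0.1) from TheBloke's Hugging Face repos into a local `models/` directory; see `download_models.py` below.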

## Nvidia/CUDA

### Build the Docker container

```bash
docker build -t wandb/eval-llm:cuda .
```

### Run evaluation

```bash
docker run --gpus=all --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e WANDB_API_KEY -e OPENAI_API_KEY \
  -e MODEL=mistral-7b-instruct-v0.1.Q5_K_M.gguf -e TEMP=0.3 \
  -v $(pwd)/models:/var/models wandb/eval-llm:cuda
```

## Environment variables

Defaults are shown in parentheses; an example override follows the list.

* `TEMP` - sampling temperature _(0.5)_
* `MAX_TOKENS` - maximum number of tokens to emit _(128)_
* `SYSTEM_PROMPT` - instructions for the model _(You're a Docker expert. Translate the following sentence to a simple docker command.)_
* `MODEL` - name of a gguf file, or `gpt-3.5-turbo` / `gpt-4` _(codellama-13b-instruct.Q4_K_M.gguf)_
* `EVAL_PATH` - path to a jsonl file with "input" and "ideal" keys _(eval.jsonl)_
* `VERBOSE` - print verbose info from llama-cpp-python _(False)_
* `DIFF_THRESHOLD` - the similarity threshold above which a response counts as accurate _(0.7)_
* `REPETITION_PENALTY` - how much to penalize repeated tokens _(1.1)_
* `GPU_LAYERS` - number of layers to offload to the GPU _(-1 for CUDA, 0 for CPU)_
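
Any of these can be overridden with `-e` at `docker run` time. For example, a cooler, longer-form evaluation of the 7B model (mirroring the run command above; the overridden values are illustrative):

```bash
docker run --gpus=all --cap-add SYS_RESOURCE -e USE_MLOCK=0 \
  -e WANDB_API_KEY -e OPENAI_API_KEY \
  -e MODEL=codellama-7b.Q5_K_S.gguf -e TEMP=0.2 -e MAX_TOKENS=256 \
  -v $(pwd)/models:/var/models wandb/eval-llm:cuda
```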

# W&B Launch Setup

## Create a queue

Go to https://wandb.ai/vanpelt/launch and create a queue named "llm-eval-cuda". Set its config to:

> Note: replace `/home/jupyter` with whatever `pwd` returns in your current directory.

```json
{
    "env": ["USE_MLOCK=0", "OPENAI_API_KEY"],
    "gpus": "all",
    "volume": "/home/jupyter/models:/var/models",
    "cap-add": "SYS_RESOURCE"
}
```
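
These queue settings map directly onto the `docker run` flags used earlier: `gpus` → `--gpus`, `volume` → `-v`, `cap-add` → `--cap-add`, and `env` → `-e`.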

## Create a docker job

```bash
wandb job create --project "llm-eval" --name "llm-eval-cuda" image wandb/eval-llm:cuda
```

## Run an agent

```bash
wandb launch-agent -q llm-eval-cuda
```
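
With an agent polling the queue, runs can be submitted to it. A minimal sketch, assuming the job registered above is addressable as `vanpelt/llm-eval/llm-eval-cuda:latest` (check the Jobs tab in the W&B UI for the exact path and alias):

```bash
wandb launch -q llm-eval-cuda -j "vanpelt/llm-eval/llm-eval-cuda:latest"
```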

**`download_models.py`**:

```python
import os
import requests

models = [
    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/resolve/main/codellama-7b.Q5_K_S.gguf",
    "https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GGUF/resolve/main/codellama-13b-instruct.Q4_K_M.gguf",
    "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q5_K_M.gguf"
]

# Ensure the destination directory exists (it's gitignored, so a fresh clone won't have it)
os.makedirs("models", exist_ok=True)

for url in models:
    dest = f"models/{url.split('/')[-1]}"
    if os.path.exists(dest):
        print(f"Skipping {dest}, already exists")
        continue
    print(f"Downloading {url} to {dest}...")
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(dest, 'wb') as f:
            total_downloaded = 0
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
                    total_downloaded += len(chunk)
                    # print a progress dot roughly every 10 MB
                    if total_downloaded >= 10485760:
                        print('.', end='', flush=True)
                        total_downloaded = 0
        print("\nDownload complete.")
    else:
        print(f"Failed to download {url}: HTTP {response.status_code}")
```

**`eval.jsonl`**:

```json
{"input": "Spin up a new container named busy_server_1 with the latest version of the busybox image and place it under the high_priority cgroup parent in the background.", "ideal": "docker run -d --name busy_server_1 --cgroup-parent /high_priority busybox"} | ||
{"input": "Please display the images in a table format with the repository, tag, ID, and size included.", "ideal": "docker images --format \"table {{.Repository}},{{.Tag}},{{.ID}},{{.Size}}\""} | ||
{"input": "Can you show me the containers that are running and have port 8080 published?", "ideal": "docker ps --filter 'publish=8080'"} | ||
{"input": "I need just the IDs of Docker images.", "ideal": "docker images --quiet"} | ||
{"input": "Display the containers that are both running and healthy.", "ideal": "docker ps --filter 'status=running' --filter 'health=healthy'"} | ||
{"input": "Get the images labeled with \"maintainer=nginx\" and show their repository, tag, and ID.", "ideal": "docker images --filter \"label=maintainer=nginx\" --format \"{{.Repository}},{{.Tag}},{{.ID}}\""} | ||
{"input": "Display all the running containers.", "ideal": "docker ps"} | ||
{"input": "Display the details of the most recent container execution now!", "ideal": "docker ps -l"} | ||
{"input": "Please run httpd as an Apache server on port 8080 using the latest image.", "ideal": "docker run --name apache_server -p 8080:80 httpd"} | ||
{"input": "I need to see the running tasks in Docker.", "ideal": "docker ps --filter 'is-task=true'"} |

**`evaluate.py`**:

```python
import difflib
import json
from llama_cpp import Llama
import openai
import os
import re
import subprocess
import time
import wandb

config = {
    'max_tokens': int(os.getenv("MAX_TOKENS", 128)),
    'repetition_penalty': float(os.getenv("REPETITION_PENALTY", 1.1)),
    'temperature': float(os.getenv("TEMP", 0.5)),
    'gpu_layers': int(os.getenv("GPU_LAYERS", 0)),
}

model_path = os.getenv("MODEL", "codellama-13b-instruct.Q4_K_M.gguf")
eval_path = os.getenv("EVAL_PATH", "eval.jsonl")
system_prompt = os.getenv("SYSTEM_PROMPT", "You're a Docker expert. Translate the following sentence to a simple docker command.")
diff_threshold = float(os.getenv("DIFF_THRESHOLD", 0.7))

def is_cuda_available():
    try:
        subprocess.check_output(["nvidia-smi"])
        return True
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False

wandb_config = {"model": model_path, "eval": eval_path,
                "system_prompt": system_prompt, **config}

# Set WANDB_MODE=disabled when running this file in tests
wandb.init(project="llm-eval-v2", config=wandb_config)

if wandb.config["model"].startswith("gpt"):
    def llm(prompt):
        res = openai.ChatCompletion.create(
            model=wandb.config["model"],
            messages=[
                {"role": "system", "content": wandb.config["system_prompt"]},
                {"role": "user", "content": prompt}
            ],
            temperature=wandb.config["temperature"],
            max_tokens=wandb.config["max_tokens"],
            frequency_penalty=wandb.config["repetition_penalty"],
        )
        return res.choices[0].message.content, res.usage.total_tokens
else:
    # Offload all layers to the GPU when CUDA is available; note this default
    # can differ from the gpu_layers value logged in the config above.
    default_gpu = -1 if is_cuda_available() else 0
    cpp = Llama(f"/var/models/{wandb.config['model']}",
                # bool("False") is True, so parse the env var string explicitly
                verbose=os.getenv("VERBOSE", "false").lower() in ("1", "true"),
                n_gpu_layers=int(os.getenv("GPU_LAYERS", default_gpu)))
    def llm(prompt):
        res = cpp.create_chat_completion(
            messages=[
                {"role": "system", "content": wandb.config["system_prompt"]},
                {"role": "user", "content": f"Q: {prompt}"}
            ],
            max_tokens=wandb.config["max_tokens"], stop=["Q:"],
            repeat_penalty=wandb.config["repetition_penalty"],
            temperature=wandb.config["temperature"],
        )
        return res["choices"][0]["message"]["content"], res["usage"]["total_tokens"]

print(f"Evaluating {wandb.config['model']}")
table = wandb.Table(columns=["prompt", "output", "ideal", "score", "latency", "tokens"])

# Grab the first line of the response that looks like a docker command
codeblock_pattern = re.compile(r'(docker.+)$', re.MULTILINE)
def fmt(s):
    return f"`{s}`"

total_score = 0
total_latency = 0
total_tokens = 0
correct = 0.0
total = 0.0
with open(eval_path, "r") as f:
    for line in f:
        data = json.loads(line)
        total += 1.0
        prompt = data["input"]
        print(prompt)
        start = time.time()
        output, tokens = llm(prompt)
        latency = time.time() - start
        total_latency += latency
        matches = codeblock_pattern.findall(output)
        if len(matches) == 0:
            print("\t!!! No code generated:")
            for l in output.split("\n"):
                print(f"\t> {l}")
            continue
        command = matches[0].split("`")[0]
        score = difflib.SequenceMatcher(None, data["ideal"], command).ratio()
        print(f"\t({score:.2f}) {command}")
        total_score += score
        total_tokens += tokens
        if score > diff_threshold:
            correct += 1.0
        table.add_data(prompt, fmt(command), fmt(data["ideal"]), score, latency, tokens)

wandb.log({
    "accuracy": correct / total,
    "diff_score": total_score / total,
    "avg_tokens": total_tokens / total,
    "latency": total_latency / total,
    "eval": table
})
print("\nConfig:\n")
print(json.dumps(dict(wandb.config), indent=4))
print(f"Accuracy: {wandb.run.summary['accuracy']}")
print(f"Average diff score: {wandb.run.summary['diff_score']}")
```

**Sweep configuration** (the filename is not shown in this view; `sweep.yaml` is the W&B convention):

```yaml
program: evaluate.py
method: random
metric:
  goal: maximize
  name: diff_score
parameters:
  model:
    distribution: categorical
    values:
      - codellama-13b-instruct.Q4_K_M.gguf
      - codellama-7b.Q5_K_S.gguf
      - mistral-7b-instruct-v0.1.Q5_K_M.gguf
      - gpt-3.5-turbo
  system_prompt:
    distribution: categorical
    values:
      - You're a Docker expert. Translate the following sentence to a simple docker command.
      - You'll be asked a question about Docker. Your job is to convert this question to a succinct docker command. Only provide a single command and limit your use of piping to other unix tools.
  temperature:
    distribution: uniform
    max: 0.6
    min: 0
```
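
To launch the sweep, the standard W&B flow applies. A minimal sketch, assuming the config above is saved as `sweep.yaml` (the filename is an assumption) and the models are available locally:

```bash
wandb sweep sweep.yaml                      # registers the sweep and prints a sweep ID
wandb agent <entity>/<project>/<sweep-id>   # runs evaluate.py with sampled parameters
```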