From 465c3882be19e82ef761de001fd74d76f4ec101a Mon Sep 17 00:00:00 2001
From: Chris Van Pelt
Date: Thu, 5 Oct 2023 09:17:06 -0700
Subject: [PATCH] Initial content for Dockercon (#477)

---
 examples/llama-cpp/.dockerignore          |   1 +
 examples/llama-cpp/.gitignore             |   2 +
 examples/llama-cpp/Dockerfile             |  30 ++++++
 examples/llama-cpp/README.md              |  76 +++++++++++++++
 examples/llama-cpp/app/download_models.py |  27 ++++++
 examples/llama-cpp/app/eval.jsonl         |  10 ++
 examples/llama-cpp/app/evaluate.py        | 113 ++++++++++++++++++++++
 examples/llama-cpp/sweep.yaml             |  22 +++++
 8 files changed, 281 insertions(+)
 create mode 100644 examples/llama-cpp/.dockerignore
 create mode 100644 examples/llama-cpp/.gitignore
 create mode 100644 examples/llama-cpp/Dockerfile
 create mode 100644 examples/llama-cpp/README.md
 create mode 100644 examples/llama-cpp/app/download_models.py
 create mode 100644 examples/llama-cpp/app/eval.jsonl
 create mode 100644 examples/llama-cpp/app/evaluate.py
 create mode 100644 examples/llama-cpp/sweep.yaml

diff --git a/examples/llama-cpp/.dockerignore b/examples/llama-cpp/.dockerignore
new file mode 100644
index 00000000..6ea88749
--- /dev/null
+++ b/examples/llama-cpp/.dockerignore
+models/
\ No newline at end of file
diff --git a/examples/llama-cpp/.gitignore b/examples/llama-cpp/.gitignore
new file mode 100644
index 00000000..4592db0a
--- /dev/null
+++ b/examples/llama-cpp/.gitignore
+wandb/
+models/
\ No newline at end of file
diff --git a/examples/llama-cpp/Dockerfile b/examples/llama-cpp/Dockerfile
new file mode 100644
index 00000000..adc8d65c
--- /dev/null
+++ b/examples/llama-cpp/Dockerfile
+ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
+FROM nvidia/cuda:${CUDA_IMAGE}
+
+# We need to set the host to 0.0.0.0 to allow outside access
+ENV HOST 0.0.0.0
+
+RUN apt-get update && apt-get upgrade -y \
+    && apt-get install -y git build-essential \
+    python3 python3-pip gcc wget \
+    ocl-icd-opencl-dev opencl-headers clinfo \
+    libclblast-dev libopenblas-dev \
+    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
+
+# Setting build-related env vars
+ENV CUDA_DOCKER_ARCH=all
+ENV LLAMA_CUBLAS=1
+ENV PYTHONUNBUFFERED=1
+
+# Install dependencies
+RUN python3 -m pip install --no-cache-dir --upgrade pip pytest cmake scikit-build setuptools fastapi \
+    uvicorn sse-starlette pydantic-settings starlette-context openai wandb timeout-decorator
+
+# Install llama-cpp-python (built with CUDA)
+RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install --no-cache-dir llama-cpp-python
+
+COPY app /app
+WORKDIR /app
+
+# Run evaluation
+CMD python3 evaluate.py
\ No newline at end of file
diff --git a/examples/llama-cpp/README.md b/examples/llama-cpp/README.md
new file mode 100644
index 00000000..86dbe4d3
--- /dev/null
+++ b/examples/llama-cpp/README.md
+# Overview
+
+This script automates the evaluation of different LLMs with W&B. It was originally used at Dockercon '23. The default dataset attempts to translate English instructions into Docker CLI commands; see `eval.jsonl`. All of the logic is in `evaluate.py`.
+
+# Documentation
+
+## Setup W&B
+
+```bash
+pip install wandb
+
+# Find your API key at https://wandb.ai/authorize
+export WANDB_API_KEY=XXX
+# Find your OpenAI API key at https://platform.openai.com/account/api-keys
+export OPENAI_API_KEY=XXX
+```
+
+## Download Models
+
+```bash
+python app/download_models.py
+```
+
+## Nvidia/CUDA
+
+### Build the docker container
+
+```bash
+docker build -t wandb/eval-llm:cuda .
+```
+
+### Run evaluation
+
+```bash
+docker run --gpus=all --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e WANDB_API_KEY -e OPENAI_API_KEY -e MODEL=mistral-7b-instruct-v0.1.Q5_K_M.gguf -e TEMP=0.3 -v $(pwd)/models:/var/models wandb/eval-llm:cuda
+```
+
+## Environment variables
+
+* `TEMP` - temperature _(0.5)_
+* `MAX_TOKENS` - maximum number of tokens to emit _(128)_
+* `SYSTEM_PROMPT` - instructions for the model _(You're a Docker expert. Translate the following sentence to a simple docker command.)_
+* `MODEL` - name of a gguf file, or gpt-3.5-turbo, gpt-4 _(codellama-13b-instruct.Q4_K_M.gguf)_
+* `EVAL_PATH` - the path to a jsonl file with "input" and "ideal" keys _(eval.jsonl)_
+* `VERBOSE` - set to true to print verbose info from llama-cpp-python _(False)_
+* `DIFF_THRESHOLD` - the similarity-ratio threshold above which a response counts as accurate _(0.7)_
+* `REPITITION_PENALTY` - how much to penalize repeated tokens _(1.1)_
+* `GPU_LAYERS` - the number of layers to offload to the GPU _(-1 for CUDA, 0 for CPU)_
+
+# W&B Launch Setup
+
+## Create a queue
+
+Go to https://wandb.ai/vanpelt/launch and create a queue named "llm-eval-cuda". Set its config to:
+
+> Note: replace `/home/jupyter` with whatever `pwd` returns in your current directory.
+
+```json
+{
+  "env": ["USE_MLOCK=0", "OPENAI_API_KEY"],
+  "gpus": "all",
+  "volume": "/home/jupyter/models:/var/models",
+  "cap-add": "SYS_RESOURCE"
+}
+```
+
+## Create a docker job
+
+```bash
+wandb job create --project "llm-eval" --name "llm-eval-cuda" image wandb/eval-llm:cuda
+```
+
+## Run an agent
+
+```bash
+wandb launch-agent -q llm-eval-cuda
\ No newline at end of file
diff --git a/examples/llama-cpp/app/download_models.py b/examples/llama-cpp/app/download_models.py
new file mode 100644
index 00000000..6af41b40
--- /dev/null
+++ b/examples/llama-cpp/app/download_models.py
+import os
+import requests
+
+models = [
+    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/resolve/main/codellama-7b.Q5_K_S.gguf",
+    "https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GGUF/resolve/main/codellama-13b-instruct.Q4_K_M.gguf",
+    "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q5_K_M.gguf"
+]
+for url in models:
+    dest = f"models/{url.split('/')[-1]}"
+    if os.path.exists(dest):
+        print(f"Skipping {dest}, already exists")
+        continue
+    print(f"Downloading {url} to {dest}...")
+    response = requests.get(url, stream=True)
+    if response.status_code == 200:
+        with open(dest, 'wb') as f:
+            total_downloaded = 0
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:  # filter out keep-alive new chunks
+                    f.write(chunk)
+                    total_downloaded += len(chunk)
+                    if total_downloaded >= 10485760:  # 10 MB
+                        print('.', end='', flush=True)
+                        total_downloaded = 0
+    print("\nDownload complete.")
+
\ No newline at end of file
diff --git a/examples/llama-cpp/app/eval.jsonl b/examples/llama-cpp/app/eval.jsonl
new file mode 100644
index 00000000..2a0b036e
--- /dev/null
+++ b/examples/llama-cpp/app/eval.jsonl
+{"input": "Spin up a new container named busy_server_1 with the latest version of the busybox image and place it under the high_priority cgroup parent in the background.", "ideal": "docker run -d --name busy_server_1 --cgroup-parent /high_priority busybox"}
+{"input": "Please display the images in a table format with the repository, tag, ID, and size included.", "ideal": "docker images --format \"table {{.Repository}},{{.Tag}},{{.ID}},{{.Size}}\""}
+{"input": "Can you show me the containers that are running and have port 8080 published?", "ideal": "docker ps --filter 'publish=8080'"}
+{"input": "I need just the IDs of Docker images.", "ideal": "docker images --quiet"}
+{"input": "Display the containers that are both running and healthy.", "ideal": "docker ps --filter 'status=running' --filter 'health=healthy'"}
+{"input": "Get the images labeled with \"maintainer=nginx\" and show their repository, tag, and ID.", "ideal": "docker images --filter \"label=maintainer=nginx\" --format \"{{.Repository}},{{.Tag}},{{.ID}}\""}
+{"input": "Display all the running containers.", "ideal": "docker ps"}
+{"input": "Display the details of the most recent container execution now!", "ideal": "docker ps -l"}
+{"input": "Please run httpd as an Apache server on port 8080 using the latest image.", "ideal": "docker run --name apache_server -p 8080:80 httpd"}
+{"input": "I need to see the running tasks in Docker.", "ideal": "docker ps --filter 'is-task=true'"}
\ No newline at end of file
diff --git a/examples/llama-cpp/app/evaluate.py b/examples/llama-cpp/app/evaluate.py
new file mode 100644
index 00000000..e4ec62f2
--- /dev/null
+++ b/examples/llama-cpp/app/evaluate.py
+import difflib
+import json
+from llama_cpp import Llama
+import openai
+import os
+import re
+import subprocess
+import time
+import wandb
+
+config = {
+    'max_tokens': int(os.getenv("MAX_TOKENS", 128)),
+    'repetition_penalty': float(os.getenv("REPITITION_PENALTY", 1.1)),
+    'temperature': float(os.getenv("TEMP", 0.5)),
+    'gpu_layers': int(os.getenv("GPU_LAYERS", 0)),
+}
+
+model_path = os.getenv("MODEL", "codellama-13b-instruct.Q4_K_M.gguf")
+eval_path = os.getenv("EVAL_PATH", "eval.jsonl")
+system_prompt = os.getenv("SYSTEM_PROMPT", "You're a Docker expert. Translate the following sentence to a simple docker command.")
+diff_threshold = float(os.getenv("DIFF_THRESHOLD", 0.7))
+
+def is_cuda_available():
+    try:
+        subprocess.check_output(["nvidia-smi"])
+        return True
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return False
+
+wandb_config = {"model": model_path, "eval": eval_path,
+                "system_prompt": system_prompt, **config}
+
+# Set WANDB_MODE=disabled when running this file in tests
+wandb.init(project="llm-eval-v2", config=wandb_config)
+
+if wandb.config["model"].startswith("gpt"):
+    def llm(prompt):
+        res = openai.ChatCompletion.create(
+            model=wandb.config["model"],
+            messages=[
+                {"role": "system", "content": wandb.config["system_prompt"]},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=wandb.config["temperature"],
+            max_tokens=wandb.config["max_tokens"],
+            frequency_penalty=wandb.config["repetition_penalty"],
+        )
+        return res.choices[0].message.content, res.usage.total_tokens
+else:
+    default_gpu = -1 if is_cuda_available() else 0
+    cpp = Llama(f"/var/models/{wandb.config['model']}",
+                verbose=os.getenv("VERBOSE", "false").lower() in ("1", "true"),
+                n_gpu_layers=int(os.getenv("GPU_LAYERS", default_gpu)))
+    def llm(prompt):
+        res = cpp.create_chat_completion(
+            messages=[
+                {"role": "system", "content": wandb.config["system_prompt"]},
+                {"role": "user", "content": f"Q: {prompt}"}
+            ],
+            max_tokens=wandb.config["max_tokens"], stop=["Q:"],
+            repeat_penalty=wandb.config["repetition_penalty"],
+            temperature=wandb.config["temperature"],
+        )
+        return res["choices"][0]["message"]["content"], res["usage"]["total_tokens"]
+
+print(f"Evaluating {wandb.config['model']}")
+table = wandb.Table(columns=["prompt", "output", "ideal", "score", "latency", "tokens"])
+
+codeblock_pattern = re.compile(r'(docker.+)$', re.MULTILINE)
+def fmt(s):
+    return f"`{s}`"
+
+total_score = 0
+total_latency = 0
+total_tokens = 0
+correct = 0.0
+total = 0.0
+with open(eval_path, "r") as f:
+    for line in f:
+        data = json.loads(line)
+        total += 1.0
+        prompt = data["input"]
+        print(prompt)
+        start = time.time()
+        output, tokens = llm(prompt)
+        latency = time.time() - start
+        total_latency += latency
+        matches = codeblock_pattern.findall(output)
+        if len(matches) == 0:
+            print("\t!!! No code generated:")
+            for l in output.split("\n"):
+                print(f"\t> {l}")
+            continue
+        command = matches[0].split("`")[0]
+        score = difflib.SequenceMatcher(None, data["ideal"], command).ratio()
+        print(f"\t({score:.2f}) {command}")
+        total_score += score
+        total_tokens += tokens
+        if score > diff_threshold:
+            correct += 1.0
+        table.add_data(prompt, fmt(command), fmt(data["ideal"]), score, latency, tokens)
+
+wandb.log({
+    "accuracy": correct / total,
+    "diff_score": total_score / total,
+    "avg_tokens": total_tokens / total,
+    "latency": total_latency / total,
+    "eval": table
+})
+print("\nConfig:\n")
+print(json.dumps(dict(wandb.config), indent=4))
+print(f"Accuracy: {wandb.run.summary['accuracy']}")
+print(f"Average diff score: {wandb.run.summary['diff_score']}")
\ No newline at end of file
diff --git a/examples/llama-cpp/sweep.yaml b/examples/llama-cpp/sweep.yaml
new file mode 100644
index 00000000..a79a8e7d
--- /dev/null
+++ b/examples/llama-cpp/sweep.yaml
+program: evaluate.py
+method: random
+metric:
+  goal: maximize
+  name: diff_score
+parameters:
+  model:
+    distribution: categorical
+    values:
+      - codellama-13b-instruct.Q4_K_M.gguf
+      - codellama-7b.Q5_K_S.gguf
+      - mistral-7b-instruct-v0.1.Q5_K_M.gguf
+      - gpt-3.5-turbo
+  system_prompt:
+    distribution: categorical
+    values:
+      - You're a Docker expert. Translate the following sentence to a simple docker command.
+      - You'll be asked a question about Docker. Your job is to convert this question to a succinct docker command. Only provide a single command and limit your use of piping to other unix tools.
+  temperature:
+    distribution: uniform
+    max: 0.6
+    min: 0
\ No newline at end of file
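
The sweep above searches over model, system prompt, and temperature while maximizing `diff_score`, which `evaluate.py` computes by fuzzy-matching each generated command against the ideal one. As a reading aid only (not part of the patch), here is a minimal, self-contained sketch of that scoring rule; the `score_response` helper and the example strings are hypothetical, and the 0.7 value mirrors the README's default `DIFF_THRESHOLD`.

```python
# Illustrative sketch of the scoring rule in evaluate.py (stdlib only).
# A reply is scanned for the first line containing "docker ...", and that
# candidate is compared to the ideal command with difflib.SequenceMatcher.
import difflib
import re

DIFF_THRESHOLD = 0.7  # README default; assumed here for the example
codeblock_pattern = re.compile(r"(docker.+)$", re.MULTILINE)

def score_response(output: str, ideal: str) -> tuple[float, bool]:
    """Return (similarity ratio, counts-as-correct) for one eval example."""
    matches = codeblock_pattern.findall(output)
    if not matches:
        return 0.0, False
    command = matches[0].split("`")[0]  # drop a trailing backtick from markdown-style replies
    ratio = difflib.SequenceMatcher(None, ideal, command).ratio()
    return ratio, ratio > DIFF_THRESHOLD

# A near-miss still counts: "docker ps --all" vs. the ideal "docker ps -a".
ratio, ok = score_response("Sure, try:\n`docker ps --all`", "docker ps -a")
print(f"{ratio:.2f} correct={ok}")  # ~0.89 correct=True
```

The same similarity ratio is what `diff_score` averages across the ten examples in `eval.jsonl`, while `accuracy` counts the fraction of examples whose ratio clears the threshold.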