From 64c88bcb5aa979c557591cf446bd8771e773c1f6 Mon Sep 17 00:00:00 2001 From: F-G Fernandez <26927750+frgfm@users.noreply.github.com> Date: Wed, 27 Mar 2024 17:17:41 +0100 Subject: [PATCH] docs(benchmark): add throughput evaluation for NVIDIA L4 GPUs (#138) * docs(benchmark): add throughput for NVIDIA L4 * build(docker): update docker orchestration for ollama bench * docs(readme): add GPU reference to benchmark readme --- scripts/ollama/README.md | 1 + scripts/ollama/docker-compose.yml | 2 +- scripts/ollama/latency.csv | 5 +++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/ollama/README.md b/scripts/ollama/README.md index f01bd9b..ca10d35 100644 --- a/scripts/ollama/README.md +++ b/scripts/ollama/README.md @@ -10,6 +10,7 @@ We ran our tests on the following hardware: - [NVIDIA GeForce RTX 3070](https://www.nvidia.com/fr-fr/geforce/graphics-cards/30-series/rtx-3070-3070ti/) ([Scaleway GPU-3070-S](https://www.scaleway.com/en/pricing/?tags=compute)) - [NVIDIA A10](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) ([Lambda Cloud gpu_1x_a10](https://lambdalabs.com/service/gpu-cloud#pricing)) - [NVIDIA A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) ([AWS g5.xlarge](https://aws.amazon.com/ec2/instance-types/g5/)) +- [NVIDIA L4](https://www.nvidia.com/en-us/data-center/l4/) ([Scaleway L4-1-24G](https://www.scaleway.com/en/pricing/?tags=compute)) *The laptop hardware setup includes an [Intel(R) Core(TM) i7-12700H](https://ark.intel.com/content/www/us/en/ark/products/132228/intel-core-i7-12700h-processor-24m-cache-up-to-4-70-ghz.html) for the CPU* diff --git a/scripts/ollama/docker-compose.yml b/scripts/ollama/docker-compose.yml index 3d220a9..810e2b5 100644 --- a/scripts/ollama/docker-compose.yml +++ b/scripts/ollama/docker-compose.yml @@ -22,7 +22,7 @@ services: retries: 3 evaluator: - image: quackai/evaluator:latest + image: quackai/llm-evaluator:latest build: . depends_on: ollama: diff --git a/scripts/ollama/latency.csv b/scripts/ollama/latency.csv index 4ffdf00..a373804 100644 --- a/scripts/ollama/latency.csv +++ b/scripts/ollama/latency.csv @@ -20,3 +20,8 @@ deepseek-coder:6.7b-instruct-q3_K_M,A10G (AWS g5.xlarge),99.83,35.41,84.47,1.69 pxlksr/opencodeinterpreter-ds:6.7b-Q4_K_M,A10G (AWS g5.xlarge),212.08,86.58,79.02,3.35 dolphin-mistral:7b-v2.6-dpo-laser-q4_K_M,A10G (AWS g5.xlarge),187.2,62.24,75.91,1 dolphin-mistral:7b-v2.6-dpo-laser-q3_K_M,A10G (AWS g5.xlarge),102.36,34.29,81.23,1.02 +deepseek-coder:6.7b-instruct-q4_K_M,NVIDIA L4 (Scaleway L4-1-24G),213.46,76.24,49.97,1.01 +deepseek-coder:6.7b-instruct-q3_K_M,NVIDIA L4 (Scaleway L4-1-24G),118.87,43.35,54.72,1.31 +pxlksr/opencodeinterpreter-ds:6.7b-Q4_K_M,NVIDIA L4 (Scaleway L4-1-24G),225.62,60.21,49.39,1.9 +dolphin-mistral:7b-v2.6-dpo-laser-q4_K_M,NVIDIA L4 (Scaleway L4-1-24G),211.52,72.76,47.27,0.58 +dolphin-mistral:7b-v2.6-dpo-laser-q3_K_M,NVIDIA L4 (Scaleway L4-1-24G),120.13,41.09,51.9,0.71