From 90d6aa8e80df99fc9828b9c9c45415aacc532e6d Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Fri, 27 Oct 2023 13:05:14 -0700
Subject: [PATCH] feat: switch cuda backend to llama.cpp

---
 .github/workflows/ci.yml             |  2 +-
 CHANGELOG.md                         |  1 +
 Cargo.lock                           |  1 -
 Dockerfile                           | 16 ++++++----
 crates/llama-cpp-bindings/Cargo.toml |  3 ++
 crates/llama-cpp-bindings/build.rs   | 20 ++++++-------
 crates/tabby/Cargo.toml              |  8 ++---
 crates/tabby/src/serve/engine.rs     | 44 +---------------------------
 crates/tabby/src/serve/mod.rs        | 29 +++++-------------
 9 files changed, 36 insertions(+), 88 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5b1cef0ff83f..9cf2f131ac96 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -111,7 +111,7 @@ jobs:
       - run: bash ./ci/prepare_build_environment.sh
 
       - name: Bulid release binary
-        run: cargo build --no-default-features --release --target ${{ matrix.target }} --package tabby
+        run: cargo build --release --target ${{ matrix.target }} --package tabby
 
       - name: Rename release binary
         run: mv target/${{ matrix.target }}/release/tabby tabby_${{ matrix.target }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ead851f841f8..14411b1df6ca 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
 
 ## Fixes and Improvements
 
+* Switch cuda backend to llama.cpp: https://github.com/TabbyML/tabby/pull/TODO
 * Switch cpu backend to llama.cpp: https://github.com/TabbyML/tabby/pull/638
 * add `server.completion_timeout` to control the code completion interface timeout: https://github.com/TabbyML/tabby/pull/637
 
diff --git a/Cargo.lock b/Cargo.lock
index fb989d8725d9..76b19380bc29 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3153,7 +3153,6 @@ dependencies = [
  "axum-streams",
  "axum-tracing-opentelemetry",
  "clap",
- "ctranslate2-bindings",
  "futures",
  "http-api-bindings",
  "hyper",
diff --git a/Dockerfile b/Dockerfile
index 17f56a171cf1..12c64af47729 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,12 @@
-FROM ghcr.io/opennmt/ctranslate2:3.20.0-ubuntu20.04-cuda11.2 as source
-FROM nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04 as builder
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=11.7.1
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
 
-ENV CTRANSLATE2_ROOT=/opt/ctranslate2
-COPY --from=source $CTRANSLATE2_ROOT $CTRANSLATE2_ROOT
+FROM ${BASE_CUDA_DEV_CONTAINER} as build
 
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && \
@@ -30,10 +34,10 @@ RUN mkdir -p target
 
 RUN --mount=type=cache,target=/usr/local/cargo/registry \
     --mount=type=cache,target=/root/workspace/target \
-    cargo build --features link_shared --release && \
+    cargo build --features cuda --release && \
     cp target/release/tabby /opt/tabby/bin/
 
-FROM ghcr.io/opennmt/ctranslate2:3.20.0-ubuntu20.04-cuda11.2
+FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
 
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
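Note on the Dockerfile change above (not part of the patch itself): the CUDA toolchain now comes from stock nvidia/cuda images, parameterized by the new ARGs. A minimal sketch of building the image locally; the tag tabby-cuda is an illustrative assumption, and the --build-arg values simply restate the defaults:

    # Build the CUDA-enabled image; override the build args to match the host driver/OS if needed.
    docker build --build-arg CUDA_VERSION=11.7.1 --build-arg UBUNTU_VERSION=22.04 -t tabby-cuda .
    # Running the resulting image with GPU access requires the NVIDIA Container Toolkit (docker run --gpus all ...).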
diff --git a/crates/llama-cpp-bindings/Cargo.toml b/crates/llama-cpp-bindings/Cargo.toml
index d2df471d8143..d1afeb0e5d3c 100644
--- a/crates/llama-cpp-bindings/Cargo.toml
+++ b/crates/llama-cpp-bindings/Cargo.toml
@@ -3,6 +3,9 @@ name = "llama-cpp-bindings"
 version = "0.5.0-dev"
 edition = "2021"
 
+[features]
+cuda = []
+
 [build-dependencies]
 cxx-build = "1.0"
 cmake = "0.1"
diff --git a/crates/llama-cpp-bindings/build.rs b/crates/llama-cpp-bindings/build.rs
index 76b42920ea44..a3dc112c9632 100644
--- a/crates/llama-cpp-bindings/build.rs
+++ b/crates/llama-cpp-bindings/build.rs
@@ -1,25 +1,25 @@
 use cmake::Config;
 
 fn main() {
-    let mut config = Config::new("llama.cpp");
-    if cfg!(target_os = "macos") {
-        config.define("LLAMA_METAL", "ON");
-    }
-    let dst = config.build();
-
     println!("cargo:rerun-if-changed=cc/*.h");
     println!("cargo:rerun-if-changed=cc/*.cc");
-    println!("cargo:rustc-link-search=native={}/build", dst.display());
-    println!("cargo:rustc-link-lib=llama");
-    println!("cargo:rustc-link-lib=ggml_static");
-
+    let mut config = Config::new("llama.cpp");
     if cfg!(target_os = "macos") {
+        config.define("LLAMA_METAL", "ON");
         println!("cargo:rustc-link-lib=framework=Foundation");
         println!("cargo:rustc-link-lib=framework=Accelerate");
         println!("cargo:rustc-link-lib=framework=Metal");
         println!("cargo:rustc-link-lib=framework=MetalKit");
     }
+    if cfg!(feature = "cuda") {
+        config.define("LLAMA_CUBLAS", "ON");
+    }
+
+    let dst = config.build();
+    println!("cargo:rustc-link-search=native={}/build", dst.display());
+    println!("cargo:rustc-link-lib=llama");
+    println!("cargo:rustc-link-lib=ggml_static");
 
     cxx_build::bridge("src/lib.rs")
         .file("src/engine.cc")
diff --git a/crates/tabby/Cargo.toml b/crates/tabby/Cargo.toml
index 37bc80967e69..ba88d452008b 100644
--- a/crates/tabby/Cargo.toml
+++ b/crates/tabby/Cargo.toml
@@ -3,6 +3,9 @@ name = "tabby"
 version = "0.5.0-dev"
 edition = "2021"
 
+[features]
+cuda = ["llama-cpp-bindings/cuda"]
+
 [dependencies]
 tabby-common = { path = "../tabby-common" }
 tabby-scheduler = { path = "../tabby-scheduler" }
@@ -43,7 +46,6 @@ textdistance = "1.0.2"
 regex.workspace = true
 thiserror.workspace = true
 llama-cpp-bindings = { path = "../llama-cpp-bindings" }
-ctranslate2-bindings = { path = "../ctranslate2-bindings", optional = true }
 
 [dependencies.uuid]
 version = "1.3.3"
@@ -53,10 +55,6 @@ features = [
     "macro-diagnostics", # Enable better diagnostics for compile-time UUIDs
 ]
 
-[features]
-link_shared = ["ctranslate2-bindings/link_shared"]
-link_cuda_static = ["ctranslate2-bindings"]
-
 [build-dependencies]
 vergen = { version = "8.0.0", features = ["build", "git", "gitcl"] }
 
diff --git a/crates/tabby/src/serve/engine.rs b/crates/tabby/src/serve/engine.rs
index b2e14ae08ce0..4fb9dd1282c3 100644
--- a/crates/tabby/src/serve/engine.rs
+++ b/crates/tabby/src/serve/engine.rs
@@ -13,7 +13,7 @@ pub fn create_engine(
     if args.device != super::Device::ExperimentalHttp {
         let model_dir = get_model_dir(model);
         let metadata = read_metadata(&model_dir);
-        let engine = create_local_engine(args, &model_dir, &metadata);
+        let engine = create_ggml_engine(&args.device, &model_dir);
         (
             engine,
             EngineInfo {
@@ -38,48 +38,6 @@ pub struct EngineInfo {
     pub chat_template: Option<String>,
 }
 
-#[cfg(not(any(feature = "link_shared", feature = "link_cuda_static")))]
-fn create_local_engine(
-    args: &crate::serve::ServeArgs,
-    model_dir: &ModelDir,
-    _metadata: &Metadata,
-) -> Box<dyn TextGeneration> {
-    create_ggml_engine(&args.device, model_dir)
-}
-
-#[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
-fn create_local_engine(
-    args: &crate::serve::ServeArgs,
-    model_dir: &ModelDir,
-    metadata: &Metadata,
-) -> Box<dyn TextGeneration> {
-    if args.device.use_ggml_backend() {
-        create_ggml_engine(&args.device, model_dir)
-    } else {
-        create_ctranslate2_engine(args, model_dir, metadata)
-    }
-}
-
-#[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
-fn create_ctranslate2_engine(
-    args: &crate::serve::ServeArgs,
-    model_dir: &ModelDir,
-    metadata: &Metadata,
-) -> Box<dyn TextGeneration> {
-    use ctranslate2_bindings::{CTranslate2Engine, CTranslate2EngineOptionsBuilder};
-
-    let device = format!("{}", args.device);
-    let options = CTranslate2EngineOptionsBuilder::default()
-        .model_path(model_dir.ctranslate2_dir())
-        .tokenizer_path(model_dir.tokenizer_file())
-        .device(device)
-        .model_type(metadata.auto_model.clone())
-        .device_indices(args.device_indices.clone())
-        .build()
-        .unwrap();
-    Box::new(CTranslate2Engine::create(options))
-}
-
 fn create_ggml_engine(device: &super::Device, model_dir: &ModelDir) -> Box<dyn TextGeneration> {
     let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default()
         .model_path(model_dir.ggml_q8_0_v2_file())
diff --git a/crates/tabby/src/serve/mod.rs b/crates/tabby/src/serve/mod.rs
index 58dce37f26c4..b3f1d3769fd5 100644
--- a/crates/tabby/src/serve/mod.rs
+++ b/crates/tabby/src/serve/mod.rs
@@ -74,7 +74,7 @@ pub enum Device {
     #[strum(serialize = "cpu")]
     Cpu,
 
-    #[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
+    #[cfg(feature = "cuda")]
     Cuda,
 
     #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
@@ -86,24 +86,14 @@ pub enum Device {
 }
 
 impl Device {
-    #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
-    fn use_ggml_backend(&self) -> bool {
-        *self == Device::Metal || *self == Device::Cpu
-    }
-
-    #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
-    fn use_ggml_backend(&self) -> bool {
-        *self == Device::Cpu
-    }
-
     #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
     fn ggml_use_gpu(&self) -> bool {
         *self == Device::Metal
     }
 
-    #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+    #[cfg(feature="cuda")]
     fn ggml_use_gpu(&self) -> bool {
-        false
+        *self == Device::Cuda
     }
 }
 
@@ -141,9 +131,9 @@ pub async fn main(config: &Config, args: &ServeArgs) {
     valid_args(args);
 
     if args.device != Device::ExperimentalHttp {
-        download_model(&args.model, &args.device).await;
+        download_model(&args.model).await;
         if let Some(chat_model) = &args.chat_model {
-            download_model(chat_model, &args.device).await;
+            download_model(chat_model).await;
         }
     } else {
         warn!("HTTP device is unstable and does not comply with semver expectations.")
@@ -285,15 +275,10 @@ fn start_heartbeat(args: &ServeArgs) {
     });
 }
 
-async fn download_model(model: &str, device: &Device) {
+async fn download_model(model: &str) {
     let downloader = Downloader::new(model, /* prefer_local_file= */ true);
     let handler = |err| fatal!("Failed to fetch model '{}' due to '{}'", model, err,);
-    let download_result = if device.use_ggml_backend() {
-        downloader.download_ggml_files().await
-    } else {
-        downloader.download_ctranslate2_files().await
-    };
-
+    let download_result = downloader.download_ggml_files().await;
    download_result.unwrap_or_else(handler);
 }
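Usage note (not part of the patch itself): with the new `cuda` feature, the `Device::Cuda` variant is compiled in, `ggml_use_gpu` returns true for it, and model downloads always fetch the GGML files since the ctranslate2 path is removed. A minimal sketch of building and serving locally; the model name is illustrative, and a CUDA toolkit (nvcc) on the build host is assumed:

    # Build the server with the CUDA backend enabled (same invocation the Dockerfile uses).
    cargo build --features cuda --release
    # Serve a completion model on the GPU.
    ./target/release/tabby serve --model TabbyML/StarCoder-1B --device cuda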