From 74dfabc2f5ce7cd874f8386b527f3050965092df Mon Sep 17 00:00:00 2001
From: Faiz Surani
Date: Tue, 4 Jun 2024 17:27:34 -0700
Subject: [PATCH] Add bug fixes in engines

---
 rl/llm/__init__.py         |   3 -
 rl/llm/engines.py          |  22 +++++---
 rl/llm/modal_entrypoint.py | 110 +++++++++++--------------------------
 rl/llm/modal_utils.py      |  47 ++++++++++++++--
 4 files changed, 86 insertions(+), 96 deletions(-)

diff --git a/rl/llm/__init__.py b/rl/llm/__init__.py
index bcea526..e69de29 100644
--- a/rl/llm/__init__.py
+++ b/rl/llm/__init__.py
@@ -1,3 +0,0 @@
-# ruff: noqa: F403
-from .config import LLMConfig, QuantizationType
-from .engines import *
diff --git a/rl/llm/engines.py b/rl/llm/engines.py
index bc35764..6f06114 100644
--- a/rl/llm/engines.py
+++ b/rl/llm/engines.py
@@ -1,3 +1,4 @@
+import dataclasses
 import datetime
 import hashlib
 import json
@@ -306,9 +307,7 @@ def __enter__(self):
         LOGGER.info(f"No deployed app found for {self.app_name}. Deploying...")
         deploy_config = {
             "app_name": self.app_name,
-            "model_name_or_path": self.llm_config.model_name_or_path,
-            "num_gpus": self.llm_config.num_gpus,
-            "vllm_kwargs": _get_vllm_kwargs(self.llm_config),
+            "llm_config": dataclasses.asdict(self.llm_config),
         }
         deploy_env = {"MODAL_DEPLOY_CONFIG": json.dumps(deploy_config)}
         print(deploy_env)
@@ -343,13 +342,13 @@ def __init__(self, llm_config: LLMConfig):
         rl.utils.io.ensure_dotenv_loaded()
         self.llm_config = llm_config
 
-    async def __aenter__(self):
+    def __enter__(self):
         self.tokenizer = AutoTokenizer.from_pretrained(
             self.llm_config.tokenizer_name_or_path
         )
         pass
 
-    async def __aexit__(self, exc_type, exc_value, traceback):
+    def __exit__(self, exc_type, exc_value, traceback):
         pass
 
     @abstractmethod
@@ -529,7 +528,7 @@ def _get_vllm_outputs(self, prompts: list[str]):
         for i, prompt in enumerate(prompts):
             self.vllm.add_request(
                 request_id=str(f"{curr_uuid}_{i}"),
-                prompt=prompt,
+                inputs=prompt,
                 **self.generate_kwargs,
             )
 
@@ -679,13 +678,15 @@ class AsyncVLLMEngine(AsyncInferenceEngine):
     def __init__(self, llm_config: LLMConfig):
         super().__init__(llm_config)
 
-    async def __aenter__(self):
+    def __enter__(self):
         self.vllm, self.generate_kwargs = _get_vllm_engine(
             self.llm_config, use_async=True
         )
-        self.tokenizer = await self.vllm.get_tokenizer()
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.llm_config.tokenizer_name_or_path
+        )
 
-    async def __aexit__(self, exc_type, exc_value, traceback):
+    def __exit__(self, exc_type, exc_value, traceback):
         del self.vllm
 
     async def stream(
@@ -736,6 +737,9 @@ def _wrap_output(self, req_output) -> InferenceOutput:
         WorkerVLLMEngine,
         OpenAIEngine,
         TogetherEngine,
+        GroqEngine,
+        AnthropicEngine,
+        ModalEngine,
     )
 }
 
diff --git a/rl/llm/modal_entrypoint.py b/rl/llm/modal_entrypoint.py
index 03c5f89..56bf3ce 100644
--- a/rl/llm/modal_entrypoint.py
+++ b/rl/llm/modal_entrypoint.py
@@ -2,11 +2,12 @@
 import os
 import subprocess
 import time
-from pathlib import Path
 
 import modal
 import modal.gpu
 
+import rl.llm.modal_utils
+
 _IMAGE_MODEL_DIR = "/model"
 
 _DEPLOY_CONFIG = json.loads(os.getenv("MODAL_DEPLOY_CONFIG", "{}"))
@@ -16,53 +17,31 @@
 print(f"🚀 Deploying with config: {json.dumps(_DEPLOY_CONFIG, indent=2)}")
 
 
-def _download_model_to_image(model_dir, model_name):
-    from huggingface_hub import snapshot_download
-    from transformers.utils import move_cache
-
-    Path(model_dir).mkdir(parents=True, exist_ok=True)
-
-    snapshot_download(
-        model_name,
-        local_dir=model_dir,
-        ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
-    )
-    move_cache()
-
-
 def _derive_gpu_config(deploy_config):
     return modal.gpu.A100(size="80GB", count=deploy_config.get("num_gpus", 1))
 
 
-def _install_deps():
-    return subprocess.run(
-        [
-            "uv",
-            "pip",
-            "install",
-            "rl[llm] @ git+https://github.com/ProbablyFaiz/rl.git@main",
-        ],
-        check=True,
-    )
-
-
 def _get_vllm_image(deploy_config):
     return (
-        modal.Image.debian_slim(python_version="3.11")
-        .pip_install(
-            "uv",
+        modal.Image.from_registry(
+            "nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.11"
         )
-        .run_function(_install_deps)
+        .apt_install("git")
+        .run_function(rl.llm.modal_utils.install_deps)
+        .run_function(rl.llm.modal_utils.install_rl)
         .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
         .run_function(
-            _download_model_to_image,
+            rl.llm.modal_utils.download_model_to_image,
            timeout=60 * 20,
             kwargs={
                 "model_dir": _IMAGE_MODEL_DIR,
-                "model_name": deploy_config["model_name_or_path"],
+                "model_name": deploy_config["llm_config"]["model_name_or_path"],
             },
             secrets=[modal.Secret.from_name("huggingface-token")],
         )
+        .env(
+            {"MODAL_DEPLOY_CONFIG": json.dumps(deploy_config), "ENFORCE_EAGER": "true"}
+        )
     )
 
 
@@ -73,6 +52,7 @@ def _get_vllm_image(deploy_config):
 
 
 @app.cls(
+    cpu=4.0,
     gpu=_GPU_CONFIG,
     timeout=60 * 10,
     container_idle_timeout=60 * 10,
@@ -81,70 +61,44 @@ def _get_vllm_image(deploy_config):
 )
 class Model:
     engine = None
+    config = None
 
     @modal.enter()
     def start_engine(self):
-        from vllm.engine.arg_utils import AsyncEngineArgs
-        from vllm.engine.async_llm_engine import AsyncLLMEngine
+        from rl.llm.config import LLMConfig
+        from rl.llm.engines import VLLMEngine
+
+        self.config = LLMConfig(**_DEPLOY_CONFIG["llm_config"])
+        self.config.model_name_or_path = _IMAGE_MODEL_DIR
+        self.config.tokenizer_name_or_path = _IMAGE_MODEL_DIR
 
         print("🥶 cold starting inference")
         start = time.monotonic_ns()
-        if "model" in _DEPLOY_CONFIG["vllm_kwargs"]:
-            del _DEPLOY_CONFIG["vllm_kwargs"]
-        engine_args = AsyncEngineArgs(
-            model=_IMAGE_MODEL_DIR,
-            **_DEPLOY_CONFIG["vllm_kwargs"],
-        )
-
-        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
+        self.engine = VLLMEngine(self.config)
+        self.engine.__enter__()
         duration_s = (time.monotonic_ns() - start) / 1e9
         print(f"🏎️ engine started in {duration_s:.0f}s")
 
     @modal.method()
-    async def completion_stream(self, input_text: str):
-        from vllm import SamplingParams
-        from vllm.utils import random_uuid
-
-        sampling_params = SamplingParams(
-            temperature=0.75,
-            max_tokens=128,
-            repetition_penalty=1.1,
-        )
-
-        request_id = random_uuid()
-        result_generator = self.engine.generate(
-            input_text,
-            sampling_params,
-            request_id,
-        )
-        index, num_tokens = 0, 0
-        start = time.monotonic_ns()
-        async for output in result_generator:
-            if output.outputs[0].text and "\ufffd" == output.outputs[0].text[-1]:
-                continue
-            text_delta = output.outputs[0].text[index:]
-            index = len(output.outputs[0].text)
-            num_tokens = len(output.outputs[0].token_ids)
-
-            yield text_delta
-        duration_s = (time.monotonic_ns() - start) / 1e9
+    def generate(self, inference_input):
+        return self.engine.generate(inference_input)
 
-        yield (
-            f"\n\tGenerated {num_tokens} tokens in {duration_s:.1f}s,"
-            f" throughput = {num_tokens / duration_s:.0f} tokens/second on {_GPU_CONFIG}.\n"
-        )
+    @modal.method()
+    def batch_generate(self, inference_inputs):
+        return self.engine.batch_generate(inference_inputs)
 
     @modal.exit()
     def stop_engine(self):
-        if _DEPLOY_CONFIG["num_gpus"] > 1:
+        self.engine.__exit__(None, None, None)
+        if self.config.num_gpus > 1:
             import ray
 
             ray.shutdown()
 
 
 @app.local_entrypoint()
-def main():
+async def main():
     questions = [
         "Implement a Python function to compute the Fibonacci numbers.",
         "What is the fable involving a fox and grapes?",
     ]
     model = Model()
     for question in questions:
         print("Sending new request:", question, "\n\n")
-        for text in model.completion_stream.remote_gen(question):
-            print(text, end="", flush=text.endswith("\n"))
+        print(model.generate.remote(question))
+        print("\n\n")
diff --git a/rl/llm/modal_utils.py b/rl/llm/modal_utils.py
index 8171c77..02ce9ef 100644
--- a/rl/llm/modal_utils.py
+++ b/rl/llm/modal_utils.py
@@ -1,13 +1,7 @@
 import json
 import subprocess
-import time
 from pathlib import Path
 
-import modal
-import modal.gpu
-
-from rl.llm.config import LLMConfig
-
 
 def get_deployed_id(app_name: str) -> str:
     # Run `modal app list --json`
@@ -27,3 +21,44 @@ def get_deployed_id(app_name: str) -> str:
         ),
         None,
     )
+
+
+def _get_uv_install_prefix():
+    return ["python", "-m", "uv", "pip", "install", "--system", "--no-cache"]
+
+
+def install_deps():
+    subprocess.run(
+        ["python", "-m", "pip", "install", "uv"],
+        check=True,
+    )
+    subprocess.run(
+        [*_get_uv_install_prefix(), "packaging", "wheel", "torch", "psutil"],
+        check=True,
+    )
+    subprocess.run([*_get_uv_install_prefix(), "hf-transfer", "huggingface-hub"])
+
+
+def install_rl():
+    subprocess.run(
+        [
+            *_get_uv_install_prefix(),
+            "rl[llm] @ git+https://github.com/ProbablyFaiz/rl.git@main",
+            "--no-build-isolation",
+        ],
+        check=True,
+    )
+
+
+def download_model_to_image(model_dir, model_name):
+    from huggingface_hub import snapshot_download
+    from transformers.utils import move_cache
+
+    Path(model_dir).mkdir(parents=True, exist_ok=True)
+
+    snapshot_download(
+        model_name,
+        local_dir=model_dir,
+        ignore_patterns=["*.pt", "*.bin", "*.pth"],  # Using safetensors
+    )
+    move_cache()
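
Note on the config hand-off above: rather than passing model_name_or_path, num_gpus, and vllm_kwargs individually, ModalEngine now serializes the entire LLMConfig with dataclasses.asdict() into the MODAL_DEPLOY_CONFIG environment variable, and the container rebuilds the config, repoints both paths at the /model directory baked into the image, and hands it to a VLLMEngine instead of constructing vLLM engine args by hand. Below is a minimal sketch of that round trip, using a simplified stand-in for rl.llm.config.LLMConfig (the real dataclass has more fields) and placeholder model/app names:

import dataclasses
import json
import os


@dataclasses.dataclass
class LLMConfig:
    # Simplified stand-in for rl.llm.config.LLMConfig; illustrative only.
    model_name_or_path: str
    tokenizer_name_or_path: str
    num_gpus: int = 1


# Client side (ModalEngine.__enter__): serialize the whole config for the deploy.
config = LLMConfig(
    model_name_or_path="some-org/some-model",      # placeholder name
    tokenizer_name_or_path="some-org/some-model",  # placeholder name
    num_gpus=1,
)
deploy_config = {"app_name": "example-app", "llm_config": dataclasses.asdict(config)}
os.environ["MODAL_DEPLOY_CONFIG"] = json.dumps(deploy_config)

# Container side (Model.start_engine): rebuild the config and point it at the
# weights that download_model_to_image baked into the image at /model.
_DEPLOY_CONFIG = json.loads(os.environ["MODAL_DEPLOY_CONFIG"])
rebuilt = LLMConfig(**_DEPLOY_CONFIG["llm_config"])
rebuilt.model_name_or_path = "/model"
rebuilt.tokenizer_name_or_path = "/model"
print(rebuilt)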