From 74dfabc2f5ce7cd874f8386b527f3050965092df Mon Sep 17 00:00:00 2001
From: Faiz Surani
Date: Tue, 4 Jun 2024 17:27:34 -0700
Subject: [PATCH] Add bug fixes in engines

---
 rl/llm/__init__.py         |   3 -
 rl/llm/engines.py          |  22 +++++---
 rl/llm/modal_entrypoint.py | 110 +++++++++++--------------------------
 rl/llm/modal_utils.py      |  47 ++++++++++++++--
 4 files changed, 86 insertions(+), 96 deletions(-)

diff --git a/rl/llm/__init__.py b/rl/llm/__init__.py
index bcea526..e69de29 100644
--- a/rl/llm/__init__.py
+++ b/rl/llm/__init__.py
@@ -1,3 +0,0 @@
-# ruff: noqa: F403
-from .config import LLMConfig, QuantizationType
-from .engines import *
diff --git a/rl/llm/engines.py b/rl/llm/engines.py
index bc35764..6f06114 100644
--- a/rl/llm/engines.py
+++ b/rl/llm/engines.py
@@ -1,3 +1,4 @@
+import dataclasses
 import datetime
 import hashlib
 import json
@@ -306,9 +307,7 @@ def __enter__(self):
         LOGGER.info(f"No deployed app found for {self.app_name}. Deploying...")
         deploy_config = {
             "app_name": self.app_name,
-            "model_name_or_path": self.llm_config.model_name_or_path,
-            "num_gpus": self.llm_config.num_gpus,
-            "vllm_kwargs": _get_vllm_kwargs(self.llm_config),
+            "llm_config": dataclasses.asdict(self.llm_config),
         }
         deploy_env = {"MODAL_DEPLOY_CONFIG": json.dumps(deploy_config)}
         print(deploy_env)
@@ -343,13 +342,13 @@ def __init__(self, llm_config: LLMConfig):
         rl.utils.io.ensure_dotenv_loaded()
         self.llm_config = llm_config
 
-    async def __aenter__(self):
+    def __enter__(self):
         self.tokenizer = AutoTokenizer.from_pretrained(
             self.llm_config.tokenizer_name_or_path
         )
         pass
 
-    async def __aexit__(self, exc_type, exc_value, traceback):
+    def __exit__(self, exc_type, exc_value, traceback):
         pass
 
     @abstractmethod
@@ -529,7 +528,7 @@ def _get_vllm_outputs(self, prompts: list[str]):
         for i, prompt in enumerate(prompts):
             self.vllm.add_request(
                 request_id=str(f"{curr_uuid}_{i}"),
-                prompt=prompt,
+                inputs=prompt,
                 **self.generate_kwargs,
             )
 
@@ -679,13 +678,15 @@ class AsyncVLLMEngine(AsyncInferenceEngine):
     def __init__(self, llm_config: LLMConfig):
         super().__init__(llm_config)
 
-    async def __aenter__(self):
+    def __enter__(self):
         self.vllm, self.generate_kwargs = _get_vllm_engine(
             self.llm_config, use_async=True
         )
-        self.tokenizer = await self.vllm.get_tokenizer()
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.llm_config.tokenizer_name_or_path
+        )
 
-    async def __aexit__(self, exc_type, exc_value, traceback):
+    def __exit__(self, exc_type, exc_value, traceback):
         del self.vllm
 
     async def stream(
@@ -736,6 +737,9 @@ def _wrap_output(self, req_output) -> InferenceOutput:
         WorkerVLLMEngine,
         OpenAIEngine,
         TogetherEngine,
+        GroqEngine,
+        AnthropicEngine,
+        ModalEngine,
     )
 }
 
diff --git a/rl/llm/modal_entrypoint.py b/rl/llm/modal_entrypoint.py
index 03c5f89..56bf3ce 100644
--- a/rl/llm/modal_entrypoint.py
+++ b/rl/llm/modal_entrypoint.py
@@ -2,11 +2,12 @@
 import os
 import subprocess
 import time
-from pathlib import Path
 
 import modal
 import modal.gpu
 
+import rl.llm.modal_utils
+
 _IMAGE_MODEL_DIR = "/model"
 
 _DEPLOY_CONFIG = json.loads(os.getenv("MODAL_DEPLOY_CONFIG", "{}"))
@@ -16,53 +17,31 @@
 print(f"🚀 Deploying with config: {json.dumps(_DEPLOY_CONFIG, indent=2)}")
 
 
-def _download_model_to_image(model_dir, model_name):
-    from huggingface_hub import snapshot_download
-    from transformers.utils import move_cache
-
-    Path(model_dir).mkdir(parents=True, exist_ok=True)
-
-    snapshot_download(
-        model_name,
-        local_dir=model_dir,
-        ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
-    )
-    move_cache()
-
-
 def _derive_gpu_config(deploy_config):
     return modal.gpu.A100(size="80GB", count=deploy_config.get("num_gpus", 1))
 
 
-def _install_deps():
-    return subprocess.run(
-        [
-            "uv",
-            "pip",
-            "install",
-            "rl[llm] @ git+https://github.com/ProbablyFaiz/rl.git@main",
-        ],
-        check=True,
-    )
-
-
 def _get_vllm_image(deploy_config):
     return (
-        modal.Image.debian_slim(python_version="3.11")
-        .pip_install(
-            "uv",
+        modal.Image.from_registry(
+            "nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.11"
         )
-        .run_function(_install_deps)
+        .apt_install("git")
+        .run_function(rl.llm.modal_utils.install_deps)
+        .run_function(rl.llm.modal_utils.install_rl)
         .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
         .run_function(
-            _download_model_to_image,
+            rl.llm.modal_utils.download_model_to_image,
            timeout=60 * 20,
             kwargs={
                 "model_dir": _IMAGE_MODEL_DIR,
-                "model_name": deploy_config["model_name_or_path"],
+                "model_name": deploy_config["llm_config"]["model_name_or_path"],
             },
             secrets=[modal.Secret.from_name("huggingface-token")],
         )
+        .env(
+            {"MODAL_DEPLOY_CONFIG": json.dumps(deploy_config), "ENFORCE_EAGER": "true"}
+        )
     )
 
 
@@ -73,6 +52,7 @@ def _get_vllm_image(deploy_config):
 
 
 @app.cls(
+    cpu=4.0,
     gpu=_GPU_CONFIG,
     timeout=60 * 10,
     container_idle_timeout=60 * 10,
@@ -81,70 +61,44 @@ def _get_vllm_image(deploy_config):
 )
 class Model:
     engine = None
+    config = None
 
     @modal.enter()
     def start_engine(self):
-        from vllm.engine.arg_utils import AsyncEngineArgs
-        from vllm.engine.async_llm_engine import AsyncLLMEngine
+        from rl.llm.config import LLMConfig
+        from rl.llm.engines import VLLMEngine
+
+        self.config = LLMConfig(**_DEPLOY_CONFIG["llm_config"])
+        self.config.model_name_or_path = _IMAGE_MODEL_DIR
+        self.config.tokenizer_name_or_path = _IMAGE_MODEL_DIR
 
         print("🥶 cold starting inference")
         start = time.monotonic_ns()
-        if "model" in _DEPLOY_CONFIG["vllm_kwargs"]:
-            del _DEPLOY_CONFIG["vllm_kwargs"]
-        engine_args = AsyncEngineArgs(
-            model=_IMAGE_MODEL_DIR,
-            **_DEPLOY_CONFIG["vllm_kwargs"],
-        )
-
-        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
+        self.engine = VLLMEngine(self.config)
+        self.engine.__enter__()
         duration_s = (time.monotonic_ns() - start) / 1e9
         print(f"🏎️ engine started in {duration_s:.0f}s")
 
     @modal.method()
-    async def completion_stream(self, input_text: str):
-        from vllm import SamplingParams
-        from vllm.utils import random_uuid
-
-        sampling_params = SamplingParams(
-            temperature=0.75,
-            max_tokens=128,
-            repetition_penalty=1.1,
-        )
-
-        request_id = random_uuid()
-        result_generator = self.engine.generate(
-            input_text,
-            sampling_params,
-            request_id,
-        )
-        index, num_tokens = 0, 0
-        start = time.monotonic_ns()
-        async for output in result_generator:
-            if output.outputs[0].text and "\ufffd" == output.outputs[0].text[-1]:
-                continue
-            text_delta = output.outputs[0].text[index:]
-            index = len(output.outputs[0].text)
-            num_tokens = len(output.outputs[0].token_ids)
-
-            yield text_delta
-        duration_s = (time.monotonic_ns() - start) / 1e9
+    def generate(self, inference_input):
+        return self.engine.generate(inference_input)
 
-        yield (
-            f"\n\tGenerated {num_tokens} tokens in {duration_s:.1f}s,"
-            f" throughput = {num_tokens / duration_s:.0f} tokens/second on {_GPU_CONFIG}.\n"
-        )
+    @modal.method()
+    def batch_generate(self, inference_inputs):
+        return self.engine.batch_generate(inference_inputs)
 
     @modal.exit()
     def stop_engine(self):
-        if _DEPLOY_CONFIG["num_gpus"] > 1:
+        self.engine.__exit__(None, None, None)
+        if self.config.num_gpus > 1:
             import ray
 
             ray.shutdown()
 
 
 @app.local_entrypoint()
-def main():
+async def main():
     questions = [
         "Implement a Python function to compute the Fibonacci numbers.",
         "What is the fable involving a fox and grapes?",
     ]
     model = Model()
     for question in questions:
         print("Sending new request:", question, "\n\n")
-        for text in model.completion_stream.remote_gen(question):
-            print(text, end="", flush=text.endswith("\n"))
+        print(model.generate.remote(question))
+        print("\n\n")
diff --git a/rl/llm/modal_utils.py b/rl/llm/modal_utils.py
index 8171c77..02ce9ef 100644
--- a/rl/llm/modal_utils.py
+++ b/rl/llm/modal_utils.py
@@ -1,13 +1,7 @@
 import json
 import subprocess
-import time
 from pathlib import Path
 
-import modal
-import modal.gpu
-
-from rl.llm.config import LLMConfig
-
 
 def get_deployed_id(app_name: str) -> str:
     # Run `modal app list --json`
@@ -27,3 +21,44 @@ def get_deployed_id(app_name: str) -> str:
         ),
         None,
     )
+
+
+def _get_uv_install_prefix():
+    return ["python", "-m", "uv", "pip", "install", "--system", "--no-cache"]
+
+
+def install_deps():
+    subprocess.run(
+        ["python", "-m", "pip", "install", "uv"],
+        check=True,
+    )
+    subprocess.run(
+        [*_get_uv_install_prefix(), "packaging", "wheel", "torch", "psutil"],
+        check=True,
+    )
+    subprocess.run([*_get_uv_install_prefix(), "hf-transfer", "huggingface-hub"])
+
+
+def install_rl():
+    subprocess.run(
+        [
+            *_get_uv_install_prefix(),
+            "rl[llm] @ git+https://github.com/ProbablyFaiz/rl.git@main",
+            "--no-build-isolation",
+        ],
+        check=True,
+    )
+
+
+def download_model_to_image(model_dir, model_name):
+    from huggingface_hub import snapshot_download
+    from transformers.utils import move_cache
+
+    Path(model_dir).mkdir(parents=True, exist_ok=True)
+
+    snapshot_download(
+        model_name,
+        local_dir=model_dir,
+        ignore_patterns=["*.pt", "*.bin", "*.pth"],  # Using safetensors
+    )
+    move_cache()
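
Note on the config hand-off above: rather than passing model_name_or_path, num_gpus, and vllm_kwargs individually, ModalEngine now serializes the entire LLMConfig with dataclasses.asdict() into the MODAL_DEPLOY_CONFIG environment variable, and the container rebuilds the config, repoints both paths at the /model directory baked into the image, and hands it to a VLLMEngine instead of constructing vLLM engine args by hand. Below is a minimal sketch of that round trip, using a simplified stand-in for rl.llm.config.LLMConfig (the real dataclass has more fields) and placeholder model/app names:

import dataclasses
import json
import os


@dataclasses.dataclass
class LLMConfig:
    # Simplified stand-in for rl.llm.config.LLMConfig; illustrative only.
    model_name_or_path: str
    tokenizer_name_or_path: str
    num_gpus: int = 1


# Client side (ModalEngine.__enter__): serialize the whole config for the deploy.
config = LLMConfig(
    model_name_or_path="some-org/some-model",      # placeholder name
    tokenizer_name_or_path="some-org/some-model",  # placeholder name
    num_gpus=1,
)
deploy_config = {"app_name": "example-app", "llm_config": dataclasses.asdict(config)}
os.environ["MODAL_DEPLOY_CONFIG"] = json.dumps(deploy_config)

# Container side (Model.start_engine): rebuild the config and point it at the
# weights that download_model_to_image baked into the image at /model.
_DEPLOY_CONFIG = json.loads(os.environ["MODAL_DEPLOY_CONFIG"])
rebuilt = LLMConfig(**_DEPLOY_CONFIG["llm_config"])
rebuilt.model_name_or_path = "/model"
rebuilt.tokenizer_name_or_path = "/model"
print(rebuilt)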