
Commit

Add bug fixes in engines
ProbablyFaiz committed Jun 5, 2024
1 parent e34fbea commit 74dfabc
Showing 4 changed files with 86 additions and 96 deletions.
3 changes: 0 additions & 3 deletions rl/llm/__init__.py
@@ -1,3 +0,0 @@
# ruff: noqa: F403
from .config import LLMConfig, QuantizationType
from .engines import *
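
With these package-level re-exports gone from rl/llm/__init__.py, downstream code presumably has to import from the submodules directly; an illustrative before/after:

# Before this commit, the wildcard re-export made this work:
# from rl.llm import LLMConfig, VLLMEngine

# After this commit, imports presumably target the submodules themselves:
from rl.llm.config import LLMConfig, QuantizationType
from rl.llm.engines import VLLMEngine
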
22 changes: 13 additions & 9 deletions rl/llm/engines.py
@@ -1,3 +1,4 @@
import dataclasses
import datetime
import hashlib
import json
@@ -306,9 +307,7 @@ def __enter__(self):
LOGGER.info(f"No deployed app found for {self.app_name}. Deploying...")
deploy_config = {
"app_name": self.app_name,
"model_name_or_path": self.llm_config.model_name_or_path,
"num_gpus": self.llm_config.num_gpus,
"vllm_kwargs": _get_vllm_kwargs(self.llm_config),
"llm_config": dataclasses.asdict(self.llm_config),
}
deploy_env = {"MODAL_DEPLOY_CONFIG": json.dumps(deploy_config)}
print(deploy_env)
@@ -343,13 +342,13 @@ def __init__(self, llm_config: LLMConfig):
rl.utils.io.ensure_dotenv_loaded()
self.llm_config = llm_config

async def __aenter__(self):
def __enter__(self):
self.tokenizer = AutoTokenizer.from_pretrained(
self.llm_config.tokenizer_name_or_path
)
pass

async def __aexit__(self, exc_type, exc_value, traceback):
def __exit__(self, exc_type, exc_value, traceback):
pass

@abstractmethod
@@ -529,7 +528,7 @@ def _get_vllm_outputs(self, prompts: list[str]):
for i, prompt in enumerate(prompts):
self.vllm.add_request(
request_id=str(f"{curr_uuid}_{i}"),
prompt=prompt,
inputs=prompt,
**self.generate_kwargs,
)

@@ -679,13 +678,15 @@ class AsyncVLLMEngine(AsyncInferenceEngine):
def __init__(self, llm_config: LLMConfig):
super().__init__(llm_config)

async def __aenter__(self):
def __enter__(self):
self.vllm, self.generate_kwargs = _get_vllm_engine(
self.llm_config, use_async=True
)
self.tokenizer = await self.vllm.get_tokenizer()
self.tokenizer = AutoTokenizer.from_pretrained(
self.llm_config.tokenizer_name_or_path
)

async def __aexit__(self, exc_type, exc_value, traceback):
def __exit__(self, exc_type, exc_value, traceback):
del self.vllm

async def stream(
@@ -736,6 +737,9 @@ def _wrap_output(self, req_output) -> InferenceOutput:
WorkerVLLMEngine,
OpenAIEngine,
TogetherEngine,
GroqEngine,
AnthropicEngine,
ModalEngine,
)
}

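Taken together, the engines.py hunks above replace the Modal deploy payload's individual fields (model_name_or_path, num_gpus, vllm_kwargs) with a single serialized LLMConfig, and add GroqEngine, AnthropicEngine, and ModalEngine to the engine registry. A minimal sketch of the config round-trip this enables, using an illustrative stand-in dataclass (the real LLMConfig in rl/llm/config.py has more fields):

import dataclasses
import json


@dataclasses.dataclass
class LLMConfig:  # stand-in for rl.llm.config.LLMConfig; field set is illustrative
    model_name_or_path: str
    tokenizer_name_or_path: str
    num_gpus: int = 1


config = LLMConfig("meta-llama/Meta-Llama-3-8B", "meta-llama/Meta-Llama-3-8B")

# What the Modal-backed engine now ships to the deploy subprocess:
deploy_env = {
    "MODAL_DEPLOY_CONFIG": json.dumps(
        {"app_name": "example-app", "llm_config": dataclasses.asdict(config)}
    )
}

# What modal_entrypoint.py does on the other side:
restored = LLMConfig(**json.loads(deploy_env["MODAL_DEPLOY_CONFIG"])["llm_config"])
assert restored == config
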
110 changes: 32 additions & 78 deletions rl/llm/modal_entrypoint.py
@@ -2,11 +2,12 @@
import os
import subprocess
import time
from pathlib import Path

import modal
import modal.gpu

import rl.llm.modal_utils

_IMAGE_MODEL_DIR = "/model"
_DEPLOY_CONFIG = json.loads(os.getenv("MODAL_DEPLOY_CONFIG", "{}"))

@@ -16,53 +17,31 @@
print(f"🚀 Deploying with config: {json.dumps(_DEPLOY_CONFIG, indent=2)}")


def _download_model_to_image(model_dir, model_name):
from huggingface_hub import snapshot_download
from transformers.utils import move_cache

Path(model_dir).mkdir(parents=True, exist_ok=True)

snapshot_download(
model_name,
local_dir=model_dir,
ignore_patterns=["*.pt", "*.bin"], # Using safetensors
)
move_cache()


def _derive_gpu_config(deploy_config):
return modal.gpu.A100(size="80GB", count=deploy_config.get("num_gpus", 1))


def _install_deps():
return subprocess.run(
[
"uv",
"pip",
"install",
"rl[llm] @ git+https://github.com/ProbablyFaiz/rl.git@main",
],
check=True,
)


def _get_vllm_image(deploy_config):
return (
modal.Image.debian_slim(python_version="3.11")
.pip_install(
"uv",
modal.Image.from_registry(
"nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.11"
)
.run_function(_install_deps)
.apt_install("git")
.run_function(rl.llm.modal_utils.install_deps)
.run_function(rl.llm.modal_utils.install_rl)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
.run_function(
_download_model_to_image,
rl.llm.modal_utils.download_model_to_image,
timeout=60 * 20,
kwargs={
"model_dir": _IMAGE_MODEL_DIR,
"model_name": deploy_config["model_name_or_path"],
"model_name": deploy_config["llm_config"]["model_name_or_path"],
},
secrets=[modal.Secret.from_name("huggingface-token")],
)
.env(
{"MODAL_DEPLOY_CONFIG": json.dumps(deploy_config), "ENFORCE_EAGER": "true"}
)
)


@@ -73,6 +52,7 @@ def _get_vllm_image(deploy_config):


@app.cls(
cpu=4.0,
gpu=_GPU_CONFIG,
timeout=60 * 10,
container_idle_timeout=60 * 10,
@@ -81,70 +61,44 @@ def _get_vllm_image(deploy_config):
)
class Model:
engine = None
config = None

@modal.enter()
def start_engine(self):
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from rl.llm.config import LLMConfig
from rl.llm.engines import VLLMEngine

self.config = LLMConfig(**_DEPLOY_CONFIG["llm_config"])
self.config.model_name_or_path = _IMAGE_MODEL_DIR
self.config.tokenizer_name_or_path = _IMAGE_MODEL_DIR

print("🥶 cold starting inference")
start = time.monotonic_ns()

if "model" in _DEPLOY_CONFIG["vllm_kwargs"]:
del _DEPLOY_CONFIG["vllm_kwargs"]
engine_args = AsyncEngineArgs(
model=_IMAGE_MODEL_DIR,
**_DEPLOY_CONFIG["vllm_kwargs"],
)

self.engine = AsyncLLMEngine.from_engine_args(engine_args)
self.engine = VLLMEngine(self.config)
self.engine.__enter__()
duration_s = (time.monotonic_ns() - start) / 1e9
print(f"🏎️ engine started in {duration_s:.0f}s")

@modal.method()
async def completion_stream(self, input_text: str):
from vllm import SamplingParams
from vllm.utils import random_uuid

sampling_params = SamplingParams(
temperature=0.75,
max_tokens=128,
repetition_penalty=1.1,
)

request_id = random_uuid()
result_generator = self.engine.generate(
input_text,
sampling_params,
request_id,
)
index, num_tokens = 0, 0
start = time.monotonic_ns()
async for output in result_generator:
if output.outputs[0].text and "\ufffd" == output.outputs[0].text[-1]:
continue
text_delta = output.outputs[0].text[index:]
index = len(output.outputs[0].text)
num_tokens = len(output.outputs[0].token_ids)

yield text_delta
duration_s = (time.monotonic_ns() - start) / 1e9
def generate(self, inference_input):
return self.engine.generate(inference_input)

yield (
f"\n\tGenerated {num_tokens} tokens in {duration_s:.1f}s,"
f" throughput = {num_tokens / duration_s:.0f} tokens/second on {_GPU_CONFIG}.\n"
)
@modal.method()
def batch_generate(self, inference_inputs):
return self.engine.batch_generate(inference_inputs)

@modal.exit()
def stop_engine(self):
if _DEPLOY_CONFIG["num_gpus"] > 1:
self.engine.__exit__(None, None, None)
if self.config.num_gpus > 1:
import ray

ray.shutdown()


@app.local_entrypoint()
def main():
async def main():
questions = [
"Implement a Python function to compute the Fibonacci numbers.",
"What is the fable involving a fox and grapes?",
@@ -156,5 +110,5 @@ def main():
model = Model()
for question in questions:
print("Sending new request:", question, "\n\n")
for text in model.completion_stream.remote_gen(question):
print(text, end="", flush=text.endswith("\n"))
print(model.generate.remote(question))
print("\n\n")
47 changes: 41 additions & 6 deletions rl/llm/modal_utils.py
@@ -1,13 +1,7 @@
import json
import subprocess
import time
from pathlib import Path

import modal
import modal.gpu

from rl.llm.config import LLMConfig


def get_deployed_id(app_name: str) -> str:
# Run `modal app list --json`
@@ -27,3 +21,44 @@ def get_deployed_id(app_name: str) -> str:
),
None,
)


def _get_uv_install_prefix():
return ["python", "-m", "uv", "pip", "install", "--system", "--no-cache"]


def install_deps():
subprocess.run(
["python", "-m", "pip", "install", "uv"],
check=True,
)
subprocess.run(
[*_get_uv_install_prefix(), "packaging", "wheel", "torch", "psutil"],
check=True,
)
subprocess.run([*_get_uv_install_prefix(), "hf-transfer", "huggingface-hub"])


def install_rl():
subprocess.run(
[
*_get_uv_install_prefix(),
"rl[llm] @ git+https://github.com/ProbablyFaiz/rl.git@main",
"--no-build-isolation",
],
check=True,
)


def download_model_to_image(model_dir, model_name):
from huggingface_hub import snapshot_download
from transformers.utils import move_cache

Path(model_dir).mkdir(parents=True, exist_ok=True)

snapshot_download(
model_name,
local_dir=model_dir,
ignore_patterns=["*.pt", "*.bin", "*.pth"], # Using safetensors
)
move_cache()
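
These helpers are meant to run as modal.Image.run_function build steps (see _get_vllm_image above), with uv's --system and --no-cache flags presumably chosen to install into the image's system interpreter without bloating layers. download_model_to_image can also be driven by hand; a hypothetical standalone use with illustrative arguments:

from rl.llm.modal_utils import download_model_to_image

# Pre-fetch safetensors weights into a local directory (requires huggingface_hub and
# transformers; gated models additionally need a Hugging Face token in the environment).
download_model_to_image(
    model_dir="/tmp/llama-3-8b",               # illustrative destination
    model_name="meta-llama/Meta-Llama-3-8B",   # illustrative model id
)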
