Fix Building Docker with model built-in #71
alpayariyak committed Jun 7, 2024
1 parent 1bb6f84 commit f19ce12
Showing 3 changed files with 52 additions and 73 deletions.
49 changes: 25 additions & 24 deletions src/config.py
@@ -1,30 +1,31 @@
import os
import json
import logging
from dotenv import load_dotenv
from torch.cuda import device_count
import os
import logging
from utils import get_int_bool_env

class EngineConfig:
def __init__(self):
load_dotenv()
self.model_name_or_path, self.hf_home, self.model_revision = self._get_local_or_env("/local_model_path.txt", "MODEL_NAME")
self.tokenizer_name_or_path, _, self.tokenizer_revision = self._get_local_or_env("/local_tokenizer_path.txt", "TOKENIZER_NAME")
self.tokenizer_name_or_path = self.tokenizer_name_or_path or self.model_name_or_path
self.quantization = self._get_quantization()
self.config = self._initialize_config()

def _get_local_or_env(self, local_path, env_var):
if os.path.exists(local_path):
self.hf_home = os.getenv("HF_HOME")
# Check if /local_metadata.json exists
local_metadata = {}
if os.path.exists("/local_metadata.json"):
with open("/local_metadata.json", "r") as f:
local_metadata = json.load(f)
if local_metadata.get("model_name") is None:
raise ValueError("Model name is not found in /local_metadata.json, there was a problem when you baked the model in.")
logging.info("Using baked-in model")
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"
with open(local_path, "r") as file:
return file.read().strip(), None, None
return os.getenv(env_var), os.getenv("HF_HOME"), os.getenv(f"{env_var.split('_')[0]}_REVISION") or None

def _get_quantization(self):
quantization = os.getenv("QUANTIZATION", "").lower()
return quantization if quantization in ["awq", "squeezellm", "gptq"] else None


self.model_name_or_path = local_metadata.get("model_name", os.getenv("MODEL_NAME"))
self.model_revision = local_metadata.get("revision", os.getenv("MODEL_REVISION"))
self.tokenizer_name_or_path = local_metadata.get("tokenizer_name", os.getenv("TOKENIZER_NAME")) or self.model_name_or_path
self.tokenizer_revision = local_metadata.get("tokenizer_revision", os.getenv("TOKENIZER_REVISION"))
self.quantization = local_metadata.get("quantization", os.getenv("QUANTIZATION"))
self.config = self._initialize_config()
def _initialize_config(self):
args = {
"model": self.model_name_or_path,
@@ -35,9 +36,9 @@ def _initialize_config(self):
"dtype": os.getenv("DTYPE", "half" if self.quantization else "auto"),
"tokenizer": self.tokenizer_name_or_path,
"tokenizer_revision": self.tokenizer_revision,
"disable_log_stats": bool(int(os.getenv("DISABLE_LOG_STATS", 1))),
"disable_log_requests": bool(int(os.getenv("DISABLE_LOG_REQUESTS", 1))),
"trust_remote_code": bool(int(os.getenv("TRUST_REMOTE_CODE", 0))),
"disable_log_stats": get_int_bool_env("DISABLE_LOG_STATS", True),
"disable_log_requests": get_int_bool_env("DISABLE_LOG_REQUESTS", True),
"trust_remote_code": get_int_bool_env("TRUST_REMOTE_CODE", False),
"gpu_memory_utilization": float(os.getenv("GPU_MEMORY_UTILIZATION", 0.95)),
"max_parallel_loading_workers": None if device_count() > 1 or not os.getenv("MAX_PARALLEL_LOADING_WORKERS") else int(os.getenv("MAX_PARALLEL_LOADING_WORKERS")),
"max_model_len": int(os.getenv("MAX_MODEL_LEN")) if os.getenv("MAX_MODEL_LEN") else None,
@@ -47,10 +48,10 @@ def _initialize_config(self):
"block_size": int(os.getenv("BLOCK_SIZE")) if os.getenv("BLOCK_SIZE") else None,
"swap_space": int(os.getenv("SWAP_SPACE")) if os.getenv("SWAP_SPACE") else None,
"max_context_len_to_capture": int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE")) if os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE") else None,
"disable_custom_all_reduce": bool(int(os.getenv("DISABLE_CUSTOM_ALL_REDUCE", 0))),
"enforce_eager": bool(int(os.getenv("ENFORCE_EAGER", 0)))
"disable_custom_all_reduce": get_int_bool_env("DISABLE_CUSTOM_ALL_REDUCE", False),
"enforce_eager": get_int_bool_env("ENFORCE_EAGER", False)
}
if args["kv_cache_dtype"] == "fp8_e5m2":
args["kv_cache_dtype"] = "fp8"
logging.warning("Using fp8_e5m2 is deprecated. Please use fp8 instead.")
return {k: v for k, v in args.items() if v is not None}
return {k: v for k, v in args.items() if v not in [None, ""]}
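For context, a minimal sketch (not part of this commit) of the metadata-first lookup the new EngineConfig relies on: values baked into /local_metadata.json at image build time take precedence, and environment variables are only the fallback. The model name shown is purely illustrative.

import json
import os

local_metadata = {}
if os.path.exists("/local_metadata.json"):
    # Written by download_model.py during the Docker build
    with open("/local_metadata.json", "r") as f:
        local_metadata = json.load(f)

# Baked-in value wins; the env var is only consulted when the key is absent.
model_name = local_metadata.get("model_name", os.getenv("MODEL_NAME"))
tokenizer_name = local_metadata.get("tokenizer_name", os.getenv("TOKENIZER_NAME")) or model_name
print(model_name, tokenizer_name)  # e.g. org/some-model org/some-model (hypothetical)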
68 changes: 20 additions & 48 deletions src/download_model.py
@@ -1,55 +1,27 @@
import os
import shutil
from tensorize import serialize_model
from huggingface_hub import snapshot_download
from vllm.model_executor.weight_utils import prepare_hf_model_weights, Disabledtqdm
import json

def download_extras_or_tokenizer(model_name, cache_dir, revision, extras=False):
"""Download model or tokenizer and prepare its weights, returning the local folder path."""
pattern = ["*token*", "*.json"] if extras else None
extra_dir = "/extras" if extras else ""
folder = snapshot_download(
model_name,
cache_dir=cache_dir + extra_dir,
revision=revision,
tqdm_class=Disabledtqdm,
allow_patterns=pattern if extras else None,
ignore_patterns=["*.safetensors", "*.bin", "*.pt"] if not extras else None
)
return folder

def move_files(src_dir, dest_dir):
"""Move files from source to destination directory."""
for f in os.listdir(src_dir):
src_path = os.path.join(src_dir, f)
dst_path = os.path.join(dest_dir, f)
shutil.copy2(src_path, dst_path)
os.remove(src_path)

if __name__ == "__main__":
model, download_dir = os.getenv("MODEL_NAME"), os.getenv("HF_HOME")
tokenizer = os.getenv("TOKENIZER_NAME") or model

revisions = {
"model": os.getenv("MODEL_REVISION") or None,
"tokenizer": os.getenv("TOKENIZER_REVISION") or None
}

if not model or not download_dir:
raise ValueError(f"Must specify model and download_dir. Model: {model}, download_dir: {download_dir}")

os.makedirs(download_dir, exist_ok=True)
model_folder, hf_weights_files, use_safetensors = prepare_hf_model_weights(model_name_or_path=model, revision=revisions["model"], cache_dir=download_dir)
model_extras_folder = download_extras_or_tokenizer(model, download_dir, revisions["model"], extras=True)
move_files(model_extras_folder, model_folder)
model_name = os.getenv("MODEL_NAME")
if not model_name:
raise ValueError("Must specify model name by adding --build-arg MODEL_NAME=<your model's repo>")
revision = os.getenv("MODEL_REVISION") or None
snapshot_download(model_name, revision=revision, cache_dir=os.getenv("HF_HOME"))

if os.environ.get("TENSORIZE_MODEL"):

tokenizer_name = os.getenv("TOKENIZER_NAME") or None
tokenizer_revision = os.getenv("TOKENIZER_REVISION") or None
if tokenizer_name:
snapshot_download(tokenizer_name, revision=tokenizer_revision, cache_dir=os.getenv("HF_HOME"))

# Create file with metadata of baked in model and/or tokenizer

with open("/local_metadata.json", "w") as f:
json.dump({
"model_name": model_name,
"revision": revision,
"tokenizer_name": tokenizer_name or model_name,
"tokenizer_revision": tokenizer_revision or revision,
"quantization": os.getenv("QUANTIZATION")
}, f)

with open("/local_model_path.txt", "w") as f:
f.write(model_folder)

tokenizer_folder = download_extras_or_tokenizer(tokenizer, download_dir, revisions["tokenizer"])
with open("/local_tokenizer_path.txt", "w") as f:
f.write(tokenizer_folder)
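As a rough illustration (assumed values, not taken from this diff), a build run with --build-arg MODEL_NAME=org/some-model and no other build args would leave a /local_metadata.json along these lines, which config.py then reads at container startup:

import json

# Hypothetical contents; every value depends on the build args actually passed.
example_metadata = {
    "model_name": "org/some-model",
    "revision": None,
    "tokenizer_name": "org/some-model",  # TOKENIZER_NAME unset, so it falls back to the model name
    "tokenizer_revision": None,
    "quantization": None,  # e.g. "awq" if QUANTIZATION had been set
}
print(json.dumps(example_metadata, indent=2))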
8 changes: 7 additions & 1 deletion src/utils.py
@@ -1,3 +1,4 @@
import os
import logging
from http import HTTPStatus
from vllm.utils import random_uuid
@@ -64,4 +65,9 @@ def create_error_response(message: str, err_type: str = "BadRequestError", statu
type=err_type,
code=status_code.value)


def get_int_bool_env(env_var: str, default: bool) -> bool:
return int(os.getenv(env_var, int(default))) == 1
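A quick usage sketch for get_int_bool_env (illustrative, not part of the commit): the variable is parsed as an integer and compared against 1, and the boolean default is used when the variable is unset.

import os
from utils import get_int_bool_env

os.environ["TRUST_REMOTE_CODE"] = "1"
print(get_int_bool_env("TRUST_REMOTE_CODE", False))  # True

os.environ.pop("DISABLE_LOG_STATS", None)
print(get_int_bool_env("DISABLE_LOG_STATS", True))   # True (falls back to the default)

os.environ["ENFORCE_EAGER"] = "0"
print(get_int_bool_env("ENFORCE_EAGER", False))      # False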



