feat: Publish/load pre-quantized models #34

Merged
15 commits merged on Feb 5, 2024
2 changes: 1 addition & 1 deletion .github/actions/deps/action.yaml
@@ -18,4 +18,4 @@ runs:
python3 -m pip install --upgrade pip
- name: Install project
shell: sh
run: pip install ".[dev,train]"
run: pip install ".[dev,train,cuda]"
110 changes: 110 additions & 0 deletions notebooks/import_model.ipynb
@@ -0,0 +1,110 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer\n",
"import torch\n",
"\n",
"print(torch.cuda.is_available())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# model_path = \"codellama/CodeLlama-7b-Instruct-hf\"\n",
"# model_path = \"meta-llama/Llama-2-7b-chat-hf\"\n",
"model_path = \"./models/meta-llama/Llama-2-7b-chat-hf\"\n",
"print(model_path)\n",
"target_model_path = \"carlosgjs/Llama-2-7b-chat-hf-4bit\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the model in 4bit quantization for faster inference on smaller GPUs\n",
"conf = {\n",
" \"quantization_config\": BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_quant_type=\"nf4\",\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" ),\n",
" \"device_map\": \"auto\",\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(model_path)\n",
"model = AutoModelForCausalLM.from_pretrained(model_path, device_map=\"auto\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Running this requires loging into HuggingFace via `huggingface-cli login --token` using a write token\n",
"tokenizer.push_to_hub(target_model_path)\n",
"model.push_to_hub(target_model_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Alternatvely, upload to Azure Blob Storage\n",
"from azureml.core import Workspace\n",
"\n",
"# save locally first\n",
"tokenizer.save_pretrained(f\"./models/{model_path}\")\n",
"model.save_pretrained(f\"./models/{model_path}\")\n",
"\n",
"# If all goes well, upload to blob storage:\n",
"workspace = Workspace.from_config()\n",
"ds = workspace.get_default_datastore()\n",
"ds.upload(\n",
" f\"./models/{target_model_path}\", f\"./base_models/{target_model_path}\", show_progress=True, overwrite=True\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
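
For context, a minimal sketch of how the published 4-bit checkpoint could be loaded back for inference (assuming the carlosgjs/Llama-2-7b-chat-hf-4bit repo is accessible and bitsandbytes/accelerate are installed on a CUDA machine; transformers reads the serialized quantization settings from the model's config.json, so no BitsAndBytesConfig is needed at load time):

from transformers import AutoModelForCausalLM, AutoTokenizer

# Sketch only: load the already-quantized checkpoint published above.
quantized_path = "carlosgjs/Llama-2-7b-chat-hf-4bit"
tokenizer = AutoTokenizer.from_pretrained(quantized_path)
model = AutoModelForCausalLM.from_pretrained(quantized_path, device_map="auto")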
50 changes: 18 additions & 32 deletions pyproject.toml
@@ -1,10 +1,8 @@
[project]
name = "autora-doc"
license = {file = "LICENSE"}
license = { file = "LICENSE" }
readme = "README.md"
authors = [
{ name = "Carlos Garcia Jurado Suarez", email = "[email protected]" }
]
authors = [{ name = "Carlos Garcia Jurado Suarez", email = "[email protected]" }]
requires-python = ">=3.8"
classifiers = [
"Development Status :: 4 - Beta",
@@ -20,7 +18,7 @@ dependencies = [
"scipy",
# This works, while installing pytorch and cuda from conda does not
"torch==2.0.1",
"transformers>=4.35.2",
"transformers>=4.37.2",
"nltk",
]

@@ -29,36 +27,26 @@ description = "Automatic documentation generator from AutoRA code"
[project.optional-dependencies]
dev = [
"pytest",
"pytest-cov", # Used to report total code coverage
"pre-commit", # Used to run checks before finalizing a git commit
"sphinx", # Used to automatically generate documentation
"pytest-cov", # Used to report total code coverage
"pre-commit", # Used to run checks before finalizing a git commit
"sphinx", # Used to automatically generate documentation
"sphinx-rtd-theme", # Used to render documentation
"sphinx-autoapi", # Used to automatically generate api documentation
"black", # Used for static linting of files
"mypy", # Used for static type checking of files
"sphinx-autoapi", # Used to automatically generate api documentation
"black", # Used for static linting of files
"mypy", # Used for static type checking of files
# if you add dependencies here while experimenting in a notebook and you
# want that notebook to render in your documentation, please add the
# dependencies to ./docs/requirements.txt as well.
"nbconvert", # Needed for pre-commit check to clear output from Python notebooks
"nbsphinx", # Used to integrate Python notebooks into Sphinx documentation
"ipython", # Also used in building notebooks into Sphinx
"matplotlib", # Used in sample notebook intro_notebook.ipynb
"nbconvert", # Needed for pre-commit check to clear output from Python notebooks
"nbsphinx", # Used to integrate Python notebooks into Sphinx documentation
"ipython", # Also used in building notebooks into Sphinx
"matplotlib", # Used in sample notebook intro_notebook.ipynb
"ipykernel",
"hf_transfer",
]
train = [
"jsonlines",
"mlflow",
]
azure = [
"azureml-core",
"azureml-mlflow",
]
cuda = [
"bitsandbytes>=0.41.2.post2",
"accelerate>=0.24.1",
"xformers",
]
train = ["jsonlines", "mlflow"]
azure = ["azureml-core", "azureml-mlflow"]
cuda = ["bitsandbytes>=0.42.0", "accelerate>=0.24.1", "xformers"]

[project.urls]
Homepage = "https://github.com/AutoResearch/autodoc"
@@ -68,9 +56,7 @@ requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"

[tool.pytest.ini_options]
testpaths = [
"tests",
]
testpaths = ["tests"]

[tool.black]
line-length = 110
@@ -81,7 +67,7 @@ profile = "black"
line_length = 110

[tool.coverage.run]
omit=["src/autora/doc/_version.py"]
omit = ["src/autora/doc/_version.py"]

[tool.hatch]

40 changes: 25 additions & 15 deletions src/autora/doc/runtime/predict_hf.py
@@ -1,5 +1,5 @@
import logging
from typing import Dict, Iterable, List
from typing import Dict, Iterable, List, Tuple

import torch
import transformers
@@ -9,6 +9,9 @@

logger = logging.getLogger(__name__)

# TODO: Redirect the quantized model to an 'autora' HF org
quantized_models = {"meta-llama/Llama-2-7b-chat-hf": "carlosgjs/Llama-2-7b-chat-hf-4bit"}


def preprocess_code(code: str) -> str:
lines: Iterable[str] = code.splitlines()
@@ -21,10 +24,12 @@ def preprocess_code(code: str) -> str:


class Predictor:
def __init__(self, model_path: str):
config = self.get_config()
def __init__(self, input_model_path: str):
model_path, config = Predictor.get_config(input_model_path)
if model_path != input_model_path:
logger.info(f"Mapped requested model '{input_model_path}' to '{model_path}'")

logger.info(f"Loading model from {model_path}")
logger.info(f"Loading model from {model_path} using config {config}")
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
self.model = AutoModelForCausalLM.from_pretrained(
model_path,
@@ -84,19 +89,24 @@ def tokenize(self, input: List[str]) -> Dict[str, List[List[int]]]:
tokens: Dict[str, List[List[int]]] = self.tokenizer(input)
return tokens

def get_config(self) -> Dict[str, str]:
@staticmethod
def get_config(model_path: str) -> Tuple[str, Dict[str, str]]:
if torch.cuda.is_available():
from transformers import BitsAndBytesConfig

config = {"device_map": "auto"}
mapped_path = quantized_models.get(model_path, None)
if mapped_path:
# found an already quantized model, so no need to get a new quant config
return mapped_path, config

# Load the model in 4bit quantization for faster inference on smaller GPUs
return {
"quantization_config": BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
),
"device_map": "auto",
}
config["quantization_config"] = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
return model_path, config
else:
return {}
return model_path, {}
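
As a rough usage illustration of the new mapping (assumption: a CUDA device is available, so the quantized branch is taken):

from autora.doc.runtime.predict_hf import Predictor

# Sketch only: a model with a published 4-bit variant is remapped and gets no
# quantization_config, since the checkpoint is already quantized.
path, config = Predictor.get_config("meta-llama/Llama-2-7b-chat-hf")
# path -> "carlosgjs/Llama-2-7b-chat-hf-4bit", config -> {"device_map": "auto"}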
27 changes: 26 additions & 1 deletion tests/test_predict_hf.py
@@ -1,4 +1,10 @@
from autora.doc.runtime.predict_hf import Predictor
from unittest import mock

from autora.doc.runtime.predict_hf import Predictor, quantized_models

# Test models with and without available quantized models
MODEL_NO_QUANTIZED = "hf-internal-testing/tiny-random-FalconForCausalLM"
MODEL_WITH_QUANTIZED = "meta-llama/Llama-2-7b-chat-hf"


def test_trim_prompt() -> None:
@@ -14,3 +20,22 @@ def test_trim_prompt() -> None:
"""
output = Predictor.trim_prompt(with_marker)
assert output == "output\n"


@mock.patch("torch.cuda.is_available", return_value=True)
def test_get_config_cuda(mock: mock.Mock) -> None:
model, config = Predictor.get_config(MODEL_WITH_QUANTIZED)
assert model == quantized_models[MODEL_WITH_QUANTIZED]
assert "quantization_config" not in config

model, config = Predictor.get_config(MODEL_NO_QUANTIZED)
# no pre-quantized model available
assert model == MODEL_NO_QUANTIZED
assert "quantization_config" in config


@mock.patch("torch.cuda.is_available", return_value=False)
def test_get_config_nocuda(mock: mock.Mock) -> None:
model, config = Predictor.get_config(MODEL_WITH_QUANTIZED)
assert model == MODEL_WITH_QUANTIZED
assert len(config) == 0