feat: Publish/load pre-quantized models (#34)
carlosgjs authored Feb 5, 2024
1 parent e7c86f5 commit 7891902
Showing 7 changed files with 187 additions and 55 deletions.
2 changes: 1 addition & 1 deletion .github/actions/deps/action.yaml
@@ -18,4 +18,4 @@ runs:
python3 -m pip install --upgrade pip
- name: Install project
shell: sh
run: pip install ".[dev,train]"
run: pip install ".[dev,train,cuda]"
7 changes: 2 additions & 5 deletions notebooks/generate.ipynb
@@ -25,12 +25,9 @@
"# Uncomment to clone and install autodoc from GitHub\n",
"# !pip uninstall -y autora-doc\n",
"# !git clone https://github.com/AutoResearch/autodoc.git\n",
"# !pip install -e \"./autodoc[cuda,train]\"\n",
"# !pip install \"./autodoc[cuda,train]\"\n",
"\n",
"# Login to Huggingface since access to the model repo is private\n",
"# 1) Request access through: https://ai.meta.com/resources/models-and-libraries/llama-downloads/\n",
"# 2) Get a Huggingface token from: https://huggingface.co/settings/token (use same email as above)\n",
"# !huggingface-cli login --token <your HF token>"
"# IMPORTANT: Please restart the runtime after running the above commands"
]
},
{
115 changes: 115 additions & 0 deletions notebooks/import_model.ipynb
@@ -0,0 +1,115 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer\n",
"import torch\n",
"import huggingface_hub\n",
"\n",
"print(torch.cuda.is_available())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_path = \"meta-llama/Llama-2-7b-chat-hf\"\n",
"print(model_path)\n",
"target_model_path = \"autora-doc/Llama-2-7b-chat-hf-nf4\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the model in 4bit quantization for faster inference on smaller GPUs\n",
"conf = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_quant_type=\"nf4\",\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the tokenizer and model\n",
"tokenizer = AutoTokenizer.from_pretrained(model_path)\n",
"model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=conf, device_map=\"auto\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# This will work when running from a Jupyter notebook or Colab.\n",
"# For other authentication methods, see https://huggingface.co/docs/huggingface_hub/main/en/quick-start#authentication\n",
"huggingface_hub.notebook_login(new_session=False, write_permission=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenizer.push_to_hub(target_model_path)\n",
"model.push_to_hub(target_model_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Alternatvely, upload to Azure Blob Storage (currently not used)\n",
"from azureml.core import Workspace\n",
"\n",
"# save locally first\n",
"tokenizer.save_pretrained(f\"./models/{model_path}\")\n",
"model.save_pretrained(f\"./models/{model_path}\")\n",
"\n",
"# If all goes well, upload to blob storage:\n",
"workspace = Workspace.from_config()\n",
"ds = workspace.get_default_datastore()\n",
"ds.upload(f\"./models/{model_path}\", f\"./base_models/{target_model_path}\", show_progress=True, overwrite=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
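
The notebook above publishes the NF4-quantized weights and tokenizer to autora-doc/Llama-2-7b-chat-hf-nf4. As a minimal sketch (not part of this commit), reloading the published model should be a plain from_pretrained call, since the quantization settings travel with the pushed weights; this assumes a CUDA GPU and the cuda extras (bitsandbytes, accelerate) are installed.

# Sketch only: reload the pre-quantized model published by the notebook above.
# Assumes a CUDA GPU and the cuda extras (bitsandbytes, accelerate); the NF4
# quantization config is stored with the pushed weights, so no new
# BitsAndBytesConfig is needed here.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "autora-doc/Llama-2-7b-chat-hf-nf4"
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, device_map="auto")
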
50 changes: 18 additions & 32 deletions pyproject.toml
@@ -1,10 +1,8 @@
[project]
name = "autora-doc"
license = {file = "LICENSE"}
license = { file = "LICENSE" }
readme = "README.md"
authors = [
{ name = "Carlos Garcia Jurado Suarez", email = "[email protected]" }
]
authors = [{ name = "Carlos Garcia Jurado Suarez", email = "[email protected]" }]
requires-python = ">=3.8"
classifiers = [
"Development Status :: 4 - Beta",
@@ -20,7 +18,7 @@ dependencies = [
"scipy",
# This works, while installing from pytorch and cuda from conda does not",
"torch==2.0.1",
"transformers>=4.35.2",
"transformers>=4.37.2",
"nltk",
]

@@ -29,36 +27,26 @@ description = "Automatic documentation generator from AutoRA code"
[project.optional-dependencies]
dev = [
"pytest",
"pytest-cov", # Used to report total code coverage
"pre-commit", # Used to run checks before finalizing a git commit
"sphinx", # Used to automatically generate documentation
"pytest-cov", # Used to report total code coverage
"pre-commit", # Used to run checks before finalizing a git commit
"sphinx", # Used to automatically generate documentation
"sphinx-rtd-theme", # Used to render documentation
"sphinx-autoapi", # Used to automatically generate api documentation
"black", # Used for static linting of files
"mypy", # Used for static type checking of files
"sphinx-autoapi", # Used to automatically generate api documentation
"black", # Used for static linting of files
"mypy", # Used for static type checking of files
# if you add dependencies here while experimenting in a notebook and you
# want that notebook to render in your documentation, please add the
# dependencies to ./docs/requirements.txt as well.
"nbconvert", # Needed for pre-commit check to clear output from Python notebooks
"nbsphinx", # Used to integrate Python notebooks into Sphinx documentation
"ipython", # Also used in building notebooks into Sphinx
"matplotlib", # Used in sample notebook intro_notebook.ipynb
"nbconvert", # Needed for pre-commit check to clear output from Python notebooks
"nbsphinx", # Used to integrate Python notebooks into Sphinx documentation
"ipython", # Also used in building notebooks into Sphinx
"matplotlib", # Used in sample notebook intro_notebook.ipynb
"ipykernel",
"hf_transfer",
]
train = [
"jsonlines",
"mlflow",
]
azure = [
"azureml-core",
"azureml-mlflow",
]
cuda = [
"bitsandbytes>=0.41.2.post2",
"accelerate>=0.24.1",
"xformers",
]
train = ["jsonlines", "mlflow"]
azure = ["azureml-core", "azureml-mlflow"]
cuda = ["bitsandbytes>=0.42.0", "accelerate>=0.24.1", "xformers"]

[project.urls]
Homepage = "https://github.com/AutoResearch/autodoc"
@@ -68,9 +56,7 @@ requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"

[tool.pytest.ini_options]
testpaths = [
"tests",
]
testpaths = ["tests"]

[tool.black]
line-length = 110
@@ -81,7 +67,7 @@ profile = "black"
line_length = 110

[tool.coverage.run]
omit=["src/autora/doc/_version.py"]
omit = ["src/autora/doc/_version.py"]

[tool.hatch]

2 changes: 1 addition & 1 deletion src/autora/doc/pipelines/main.py
@@ -187,7 +187,7 @@ def generate(
predictions = pred.predict(prompt, [input], **param_dict)
assert len(predictions) == 1, f"Expected only one output, got {len(predictions)}"
logger.info(f"Writing output to {output}")
with open(output, "w") as f:
with open(output, "w", encoding="utf-8") as f:
f.write(predictions[0])


39 changes: 24 additions & 15 deletions src/autora/doc/runtime/predict_hf.py
@@ -1,5 +1,5 @@
import logging
from typing import Dict, Iterable, List
from typing import Dict, Iterable, List, Tuple

import torch
import transformers
@@ -9,6 +9,8 @@

logger = logging.getLogger(__name__)

quantized_models = {"meta-llama/Llama-2-7b-chat-hf": "autora-doc/Llama-2-7b-chat-hf-nf4"}


def preprocess_code(code: str) -> str:
lines: Iterable[str] = code.splitlines()
@@ -21,10 +23,12 @@ def preprocess_code(code: str) -> str:


class Predictor:
def __init__(self, model_path: str):
config = self.get_config()
def __init__(self, input_model_path: str):
model_path, config = Predictor.get_config(input_model_path)
if model_path != input_model_path:
logger.info(f"Mapped requested model '{input_model_path}' to '{model_path}'")

logger.info(f"Loading model from {model_path}")
logger.info(f"Loading model from {model_path} using config {config}")
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
self.model = AutoModelForCausalLM.from_pretrained(
model_path,
@@ -84,19 +88,24 @@ def tokenize(self, input: List[str]) -> Dict[str, List[List[int]]]:
tokens: Dict[str, List[List[int]]] = self.tokenizer(input)
return tokens

def get_config(self) -> Dict[str, str]:
@staticmethod
def get_config(model_path: str) -> Tuple[str, Dict[str, str]]:
if torch.cuda.is_available():
from transformers import BitsAndBytesConfig

config = {"device_map": "auto"}
mapped_path = quantized_models.get(model_path, None)
if mapped_path:
# found an already quantized model, so no need to get a new quant config
return mapped_path, config

# Load the model in 4bit quantization for faster inference on smaller GPUs
return {
"quantization_config": BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
),
"device_map": "auto",
}
config["quantization_config"] = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
return model_path, config
else:
return {}
return model_path, {}
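
Taken together, get_config now returns both the (possibly remapped) model path and the loading kwargs. A hedged sketch of the new behaviour, mirroring the tests below:

# Sketch of the new resolution logic (see the tests below).
from autora.doc.runtime.predict_hf import Predictor

model_path, config = Predictor.get_config("meta-llama/Llama-2-7b-chat-hf")
# With CUDA available: model_path == "autora-doc/Llama-2-7b-chat-hf-nf4" and
# config == {"device_map": "auto"} (the pre-quantized repo needs no new
# quantization_config).
# Without CUDA: model_path is returned unchanged and config == {}.
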
27 changes: 26 additions & 1 deletion tests/test_predict_hf.py
@@ -1,4 +1,10 @@
from autora.doc.runtime.predict_hf import Predictor
from unittest import mock

from autora.doc.runtime.predict_hf import Predictor, quantized_models

# Test models with and without available quantized models
MODEL_NO_QUANTIZED = "hf-internal-testing/tiny-random-FalconForCausalLM"
MODEL_WITH_QUANTIZED = "meta-llama/Llama-2-7b-chat-hf"


def test_trim_prompt() -> None:
@@ -14,3 +20,22 @@ def test_trim_prompt() -> None:
"""
output = Predictor.trim_prompt(with_marker)
assert output == "output\n"


@mock.patch("torch.cuda.is_available", return_value=True)
def test_get_config_cuda(mock: mock.Mock) -> None:
model, config = Predictor.get_config(MODEL_WITH_QUANTIZED)
assert model == quantized_models[MODEL_WITH_QUANTIZED]
assert "quantization_config" not in config

model, config = Predictor.get_config(MODEL_NO_QUANTIZED)
# no pre-quantized model available
assert model == MODEL_NO_QUANTIZED
assert "quantization_config" in config


@mock.patch("torch.cuda.is_available", return_value=False)
def test_get_config_nocuda(mock: mock.Mock) -> None:
model, config = Predictor.get_config(MODEL_WITH_QUANTIZED)
assert model == MODEL_WITH_QUANTIZED
assert len(config) == 0
