feat: Publish/load pre-quantized models #34

Merged: 15 commits, Feb 5, 2024
2 changes: 1 addition & 1 deletion .github/actions/deps/action.yaml
@@ -18,4 +18,4 @@ runs:
python3 -m pip install --upgrade pip
- name: Install project
shell: sh
run: pip install ".[dev,train]"
run: pip install ".[dev,train,cuda]"
7 changes: 2 additions & 5 deletions notebooks/generate.ipynb
@@ -25,12 +25,9 @@
"# Uncomment to clone and install autodoc from GitHub\n",
"# !pip uninstall -y autora-doc\n",
"# !git clone https://github.com/AutoResearch/autodoc.git\n",
"# !pip install -e \"./autodoc[cuda,train]\"\n",
"# !pip install \"./autodoc[cuda,train]\"\n",
"\n",
"# Login to Huggingface since access to the model repo is private\n",
"# 1) Request access through: https://ai.meta.com/resources/models-and-libraries/llama-downloads/\n",
"# 2) Get a Huggingface token from: https://huggingface.co/settings/token (use same email as above)\n",
"# !huggingface-cli login --token <your HF token>"
"# IMPORTANT: Please restart the runtime after running the above commands"
]
},
{
104 changes: 104 additions & 0 deletions notebooks/import_model.ipynb
@@ -0,0 +1,104 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer\n",
"import torch\n",
"\n",
"print(torch.cuda.is_available())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_path = \"meta-llama/Llama-2-7b-chat-hf\"\n",
"print(model_path)\n",
"target_model_path = \"autora-doc/Llama-2-7b-chat-hf-nf4\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the model in 4bit quantization for faster inference on smaller GPUs\n",
"conf = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_quant_type=\"nf4\",\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the tokenizer and model\n",
"tokenizer = AutoTokenizer.from_pretrained(model_path)\n",
"model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=conf, device_map=\"auto\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Running this requires loging into HuggingFace via `huggingface-cli login --token` using a write token\n",
"tokenizer.push_to_hub(target_model_path)\n",
"model.push_to_hub(target_model_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Alternatvely, upload to Azure Blob Storage (currently not used)\n",
"from azureml.core import Workspace\n",
"\n",
"# save locally first\n",
"tokenizer.save_pretrained(f\"./models/{model_path}\")\n",
"model.save_pretrained(f\"./models/{model_path}\")\n",
"\n",
"# If all goes well, upload to blob storage:\n",
"workspace = Workspace.from_config()\n",
"ds = workspace.get_default_datastore()\n",
"ds.upload(f\"./models/{model_path}\", f\"./base_models/{target_model_path}\", show_progress=True, overwrite=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "autodoc",
"language": "python",
"name": "autodoc"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
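For reference, a minimal sketch of loading the published pre-quantized model back from the Hub (illustrative only; it assumes the push in the notebook above succeeded, that a CUDA device and `bitsandbytes` are available, and reuses the `autora-doc/Llama-2-7b-chat-hf-nf4` repo id defined there):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Repo id published by the import_model.ipynb notebook above
target_model_path = "autora-doc/Llama-2-7b-chat-hf-nf4"

# The 4-bit (nf4) quantization settings are saved alongside the weights,
# so no explicit BitsAndBytesConfig should be needed when reloading.
tokenizer = AutoTokenizer.from_pretrained(target_model_path)
model = AutoModelForCausalLM.from_pretrained(target_model_path, device_map="auto")
```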
50 changes: 18 additions & 32 deletions pyproject.toml
@@ -1,10 +1,8 @@
[project]
name = "autora-doc"
license = {file = "LICENSE"}
license = { file = "LICENSE" }
readme = "README.md"
authors = [
{ name = "Carlos Garcia Jurado Suarez", email = "[email protected]" }
]
authors = [{ name = "Carlos Garcia Jurado Suarez", email = "[email protected]" }]
requires-python = ">=3.8"
classifiers = [
"Development Status :: 4 - Beta",
@@ -20,7 +18,7 @@ dependencies = [
"scipy",
# This works, while installing pytorch and cuda from conda does not
"torch==2.0.1",
"transformers>=4.35.2",
"transformers>=4.37.2",
"nltk",
]

@@ -29,36 +27,26 @@ description = "Automatic documentation generator from AutoRA code"
[project.optional-dependencies]
dev = [
"pytest",
"pytest-cov", # Used to report total code coverage
"pre-commit", # Used to run checks before finalizing a git commit
"sphinx", # Used to automatically generate documentation
"pytest-cov", # Used to report total code coverage
"pre-commit", # Used to run checks before finalizing a git commit
"sphinx", # Used to automatically generate documentation
"sphinx-rtd-theme", # Used to render documentation
"sphinx-autoapi", # Used to automatically generate api documentation
"black", # Used for static linting of files
"mypy", # Used for static type checking of files
"sphinx-autoapi", # Used to automatically generate api documentation
"black", # Used for static linting of files
"mypy", # Used for static type checking of files
# if you add dependencies here while experimenting in a notebook and you
# want that notebook to render in your documentation, please add the
# dependencies to ./docs/requirements.txt as well.
"nbconvert", # Needed for pre-commit check to clear output from Python notebooks
"nbsphinx", # Used to integrate Python notebooks into Sphinx documentation
"ipython", # Also used in building notebooks into Sphinx
"matplotlib", # Used in sample notebook intro_notebook.ipynb
"nbconvert", # Needed for pre-commit check to clear output from Python notebooks
"nbsphinx", # Used to integrate Python notebooks into Sphinx documentation
"ipython", # Also used in building notebooks into Sphinx
"matplotlib", # Used in sample notebook intro_notebook.ipynb
"ipykernel",
"hf_transfer",
]
train = [
"jsonlines",
"mlflow",
]
azure = [
"azureml-core",
"azureml-mlflow",
]
cuda = [
"bitsandbytes>=0.41.2.post2",
"accelerate>=0.24.1",
"xformers",
]
train = ["jsonlines", "mlflow"]
azure = ["azureml-core", "azureml-mlflow"]
cuda = ["bitsandbytes>=0.42.0", "accelerate>=0.24.1", "xformers"]

[project.urls]
Homepage = "https://github.com/AutoResearch/autodoc"
@@ -68,9 +56,7 @@ requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"

[tool.pytest.ini_options]
testpaths = [
"tests",
]
testpaths = ["tests"]

[tool.black]
line-length = 110
@@ -81,7 +67,7 @@ profile = "black"
line_length = 110

[tool.coverage.run]
omit=["src/autora/doc/_version.py"]
omit = ["src/autora/doc/_version.py"]

[tool.hatch]

2 changes: 1 addition & 1 deletion src/autora/doc/pipelines/main.py
@@ -187,7 +187,7 @@ def generate(
predictions = pred.predict(prompt, [input], **param_dict)
assert len(predictions) == 1, f"Expected only one output, got {len(predictions)}"
logger.info(f"Writing output to {output}")
with open(output, "w") as f:
with open(output, "w", encoding="utf-8") as f:
f.write(predictions[0])


39 changes: 24 additions & 15 deletions src/autora/doc/runtime/predict_hf.py
@@ -1,5 +1,5 @@
import logging
from typing import Dict, Iterable, List
from typing import Dict, Iterable, List, Tuple

import torch
import transformers
@@ -9,6 +9,8 @@

logger = logging.getLogger(__name__)

quantized_models = {"meta-llama/Llama-2-7b-chat-hf": "autora-doc/Llama-2-7b-chat-hf-nf4"}


def preprocess_code(code: str) -> str:
lines: Iterable[str] = code.splitlines()
@@ -21,10 +23,12 @@ def preprocess_code(code: str) -> str:


class Predictor:
def __init__(self, model_path: str):
config = self.get_config()
def __init__(self, input_model_path: str):
model_path, config = Predictor.get_config(input_model_path)
if model_path != input_model_path:
logger.info(f"Mapped requested model '{input_model_path}' to '{model_path}'")

logger.info(f"Loading model from {model_path}")
logger.info(f"Loading model from {model_path} using config {config}")
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
self.model = AutoModelForCausalLM.from_pretrained(
model_path,
@@ -84,19 +88,24 @@ def tokenize(self, input: List[str]) -> Dict[str, List[List[int]]]:
tokens: Dict[str, List[List[int]]] = self.tokenizer(input)
return tokens

def get_config(self) -> Dict[str, str]:
@staticmethod
def get_config(model_path: str) -> Tuple[str, Dict[str, str]]:
if torch.cuda.is_available():
from transformers import BitsAndBytesConfig

config = {"device_map": "auto"}
mapped_path = quantized_models.get(model_path, None)
if mapped_path:
# Found an already-quantized model, so no new quantization config is needed
return mapped_path, config

# Load the model in 4bit quantization for faster inference on smaller GPUs
return {
"quantization_config": BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
),
"device_map": "auto",
}
config["quantization_config"] = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
return model_path, config
else:
return {}
return model_path, {}
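A short usage sketch of the new mapping (hypothetical call site; the model ids mirror the `quantized_models` table above, and instantiating `Predictor` would download the weights):

```python
from autora.doc.runtime.predict_hf import Predictor

# On a CUDA machine, a model with a pre-quantized variant is resolved transparently:
# get_config returns ("autora-doc/Llama-2-7b-chat-hf-nf4", {"device_map": "auto"}).
# Without CUDA, it falls back to the original path with an empty config.
model_path, config = Predictor.get_config("meta-llama/Llama-2-7b-chat-hf")

# Callers keep passing the original model id; the mapping happens inside Predictor.
predictor = Predictor("meta-llama/Llama-2-7b-chat-hf")
```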
27 changes: 26 additions & 1 deletion tests/test_predict_hf.py
@@ -1,4 +1,10 @@
from autora.doc.runtime.predict_hf import Predictor
from unittest import mock

from autora.doc.runtime.predict_hf import Predictor, quantized_models

# Test models with and without available quantized models
MODEL_NO_QUANTIZED = "hf-internal-testing/tiny-random-FalconForCausalLM"
MODEL_WITH_QUANTIZED = "meta-llama/Llama-2-7b-chat-hf"


def test_trim_prompt() -> None:
@@ -14,3 +20,22 @@ def test_trim_prompt() -> None:
"""
output = Predictor.trim_prompt(with_marker)
assert output == "output\n"


@mock.patch("torch.cuda.is_available", return_value=True)
def test_get_config_cuda(mock: mock.Mock) -> None:
model, config = Predictor.get_config(MODEL_WITH_QUANTIZED)
assert model == quantized_models[MODEL_WITH_QUANTIZED]
assert "quantization_config" not in config

model, config = Predictor.get_config(MODEL_NO_QUANTIZED)
# no pre-quantized model available
assert model == MODEL_NO_QUANTIZED
assert "quantization_config" in config


@mock.patch("torch.cuda.is_available", return_value=False)
def test_get_config_nocuda(mock: mock.Mock) -> None:
model, config = Predictor.get_config(MODEL_WITH_QUANTIZED)
assert model == MODEL_WITH_QUANTIZED
assert len(config) == 0