diff --git a/.github/actions/deps/action.yaml b/.github/actions/deps/action.yaml index 04b8c5b..fff24c8 100644 --- a/.github/actions/deps/action.yaml +++ b/.github/actions/deps/action.yaml @@ -18,4 +18,4 @@ runs: python3 -m pip install --upgrade pip - name: Install project shell: sh - run: pip install ".[dev,train]" + run: pip install ".[dev,train,cuda]" diff --git a/notebooks/generate.ipynb b/notebooks/generate.ipynb index 17a5cb4..510de85 100644 --- a/notebooks/generate.ipynb +++ b/notebooks/generate.ipynb @@ -25,12 +25,9 @@ "# Uncomment to clone and install autodoc from GitHub\n", "# !pip uninstall -y autora-doc\n", "# !git clone https://github.com/AutoResearch/autodoc.git\n", - "# !pip install -e \"./autodoc[cuda,train]\"\n", + "# !pip install \"./autodoc[cuda,train]\"\n", "\n", - "# Login to Huggingface since access to the model repo is private\n", - "# 1) Request access through: https://ai.meta.com/resources/models-and-libraries/llama-downloads/\n", - "# 2) Get a Huggingface token from: https://huggingface.co/settings/token (use same email as above)\n", - "# !huggingface-cli login --token " + "# IMPORTANT: Please restart the runtime after running the above commands" ] }, { diff --git a/notebooks/import_model.ipynb b/notebooks/import_model.ipynb new file mode 100644 index 0000000..b6b4715 --- /dev/null +++ b/notebooks/import_model.ipynb @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer\n", + "import torch\n", + "import huggingface_hub\n", + "\n", + "print(torch.cuda.is_available())" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_path = \"meta-llama/Llama-2-7b-chat-hf\"\n", + "print(model_path)\n", + "target_model_path = \"autora-doc/Llama-2-7b-chat-hf-nf4\"" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the model in 4bit quantization for faster inference on smaller GPUs\n", + "conf = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_quant_type=\"nf4\",\n", + " bnb_4bit_compute_dtype=torch.bfloat16,\n", + ")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the tokenizer and model\n", + "tokenizer = AutoTokenizer.from_pretrained(model_path)\n", + "model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=conf, device_map=\"auto\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This will work when running from a Jupyter notebook or Colab.\n", + "# For other authentication methods, see https://huggingface.co/docs/huggingface_hub/main/en/quick-start#authentication\n", + "huggingface_hub.notebook_login(new_session=False, write_permission=True)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer.push_to_hub(target_model_path)\n", + "model.push_to_hub(target_model_path)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Alternatively, upload to Azure Blob Storage (currently not used)\n", + "from azureml.core import Workspace\n", + "\n", + "# save locally first\n", + "tokenizer.save_pretrained(f\"./models/{model_path}\")\n", +
"model.save_pretrained(f\"./models/{model_path}\")\n", + "\n", + "# If all goes well, upload to blob storage:\n", + "workspace = Workspace.from_config()\n", + "ds = workspace.get_default_datastore()\n", + "ds.upload(f\"./models/{model_path}\", f\"./base_models/{target_model_path}\", show_progress=True, overwrite=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/pyproject.toml b/pyproject.toml index 7ce597f..ba26438 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,8 @@ [project] name = "autora-doc" -license = {file = "LICENSE"} +license = { file = "LICENSE" } readme = "README.md" -authors = [ - { name = "Carlos Garcia Jurado Suarez", email = "carlosg@uw.edu" } -] +authors = [{ name = "Carlos Garcia Jurado Suarez", email = "carlosg@uw.edu" }] requires-python = ">=3.8" classifiers = [ "Development Status :: 4 - Beta", @@ -20,7 +18,7 @@ dependencies = [ "scipy", # This works, while installing from pytorch and cuda from conda does not", "torch==2.0.1", - "transformers>=4.35.2", + "transformers>=4.37.2", "nltk", ] @@ -29,36 +27,26 @@ description = "Automatic documentation generator from AutoRA code" [project.optional-dependencies] dev = [ "pytest", - "pytest-cov", # Used to report total code coverage - "pre-commit", # Used to run checks before finalizing a git commit - "sphinx", # Used to automatically generate documentation + "pytest-cov", # Used to report total code coverage + "pre-commit", # Used to run checks before finalizing a git commit + "sphinx", # Used to automatically generate documentation "sphinx-rtd-theme", # Used to render documentation - "sphinx-autoapi", # Used to automatically generate api documentation - "black", # Used for static linting of files - "mypy", # Used for static type checking of files + "sphinx-autoapi", # Used to automatically generate api documentation + "black", # Used for static linting of files + "mypy", # Used for static type checking of files # if you add dependencies here while experimenting in a notebook and you # want that notebook to render in your documentation, please add the # dependencies to ./docs/requirements.txt as well. 
- "nbconvert", # Needed for pre-commit check to clear output from Python notebooks - "nbsphinx", # Used to integrate Python notebooks into Sphinx documentation - "ipython", # Also used in building notebooks into Sphinx - "matplotlib", # Used in sample notebook intro_notebook.ipynb + "nbconvert", # Needed for pre-commit check to clear output from Python notebooks + "nbsphinx", # Used to integrate Python notebooks into Sphinx documentation + "ipython", # Also used in building notebooks into Sphinx + "matplotlib", # Used in sample notebook intro_notebook.ipynb "ipykernel", "hf_transfer", ] -train = [ - "jsonlines", - "mlflow", -] -azure = [ - "azureml-core", - "azureml-mlflow", -] -cuda = [ - "bitsandbytes>=0.41.2.post2", - "accelerate>=0.24.1", - "xformers", -] +train = ["jsonlines", "mlflow"] +azure = ["azureml-core", "azureml-mlflow"] +cuda = ["bitsandbytes>=0.42.0", "accelerate>=0.24.1", "xformers"] [project.urls] Homepage = "https://github.com/AutoResearch/autodoc" @@ -68,9 +56,7 @@ requires = ["hatchling", "hatch-vcs"] build-backend = "hatchling.build" [tool.pytest.ini_options] -testpaths = [ - "tests", -] +testpaths = ["tests"] [tool.black] line-length = 110 @@ -81,7 +67,7 @@ profile = "black" line_length = 110 [tool.coverage.run] -omit=["src/autora/doc/_version.py"] +omit = ["src/autora/doc/_version.py"] [tool.hatch] diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py index 33d6504..d01c407 100644 --- a/src/autora/doc/pipelines/main.py +++ b/src/autora/doc/pipelines/main.py @@ -187,7 +187,7 @@ def generate( predictions = pred.predict(prompt, [input], **param_dict) assert len(predictions) == 1, f"Expected only one output, got {len(predictions)}" logger.info(f"Writing output to {output}") - with open(output, "w") as f: + with open(output, "w", encoding="utf-8") as f: f.write(predictions[0]) diff --git a/src/autora/doc/runtime/predict_hf.py b/src/autora/doc/runtime/predict_hf.py index 9a5adbd..a3766dc 100644 --- a/src/autora/doc/runtime/predict_hf.py +++ b/src/autora/doc/runtime/predict_hf.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, Iterable, List +from typing import Dict, Iterable, List, Tuple import torch import transformers @@ -9,6 +9,8 @@ logger = logging.getLogger(__name__) +quantized_models = {"meta-llama/Llama-2-7b-chat-hf": "autora-doc/Llama-2-7b-chat-hf-nf4"} + def preprocess_code(code: str) -> str: lines: Iterable[str] = code.splitlines() @@ -21,10 +23,12 @@ def preprocess_code(code: str) -> str: class Predictor: - def __init__(self, model_path: str): - config = self.get_config() + def __init__(self, input_model_path: str): + model_path, config = Predictor.get_config(input_model_path) + if model_path != input_model_path: + logger.info(f"Mapped requested model '{input_model_path}' to '{model_path}'") - logger.info(f"Loading model from {model_path}") + logger.info(f"Loading model from {model_path} using config {config}") self.tokenizer = AutoTokenizer.from_pretrained(model_path) self.model = AutoModelForCausalLM.from_pretrained( model_path, @@ -84,19 +88,24 @@ def tokenize(self, input: List[str]) -> Dict[str, List[List[int]]]: tokens: Dict[str, List[List[int]]] = self.tokenizer(input) return tokens - def get_config(self) -> Dict[str, str]: + @staticmethod + def get_config(model_path: str) -> Tuple[str, Dict[str, str]]: if torch.cuda.is_available(): from transformers import BitsAndBytesConfig + config = {"device_map": "auto"} + mapped_path = quantized_models.get(model_path, None) + if mapped_path: + # found an already quantized model, 
so no need to get a new quant config + return mapped_path, config + # Load the model in 4bit quantization for faster inference on smaller GPUs - return { - "quantization_config": BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, - ), - "device_map": "auto", - } + config["quantization_config"] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, + ) + return model_path, config else: - return {} + return model_path, {} diff --git a/tests/test_predict_hf.py b/tests/test_predict_hf.py index 9ad2c47..e433429 100644 --- a/tests/test_predict_hf.py +++ b/tests/test_predict_hf.py @@ -1,4 +1,10 @@ -from autora.doc.runtime.predict_hf import Predictor +from unittest import mock + +from autora.doc.runtime.predict_hf import Predictor, quantized_models + +# Test models with and without available quantized models +MODEL_NO_QUANTIZED = "hf-internal-testing/tiny-random-FalconForCausalLM" +MODEL_WITH_QUANTIZED = "meta-llama/Llama-2-7b-chat-hf" def test_trim_prompt() -> None: @@ -14,3 +20,22 @@ def test_trim_prompt() -> None: """ output = Predictor.trim_prompt(with_marker) assert output == "output\n" + + +@mock.patch("torch.cuda.is_available", return_value=True) +def test_get_config_cuda(mock: mock.Mock) -> None: + model, config = Predictor.get_config(MODEL_WITH_QUANTIZED) + assert model == quantized_models[MODEL_WITH_QUANTIZED] + assert "quantization_config" not in config + + model, config = Predictor.get_config(MODEL_NO_QUANTIZED) + # no pre-quantized model available + assert model == MODEL_NO_QUANTIZED + assert "quantization_config" in config + + +@mock.patch("torch.cuda.is_available", return_value=False) +def test_get_config_nocuda(mock: mock.Mock) -> None: + model, config = Predictor.get_config(MODEL_WITH_QUANTIZED) + assert model == MODEL_WITH_QUANTIZED + assert len(config) == 0
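
Reviewer note: the snippet below is a minimal usage sketch (not part of the patch) of the refactored Predictor.get_config introduced in src/autora/doc/runtime/predict_hf.py. It mirrors the new unit tests: the model ids come from the quantized_models table and tests/test_predict_hf.py, and torch.cuda.is_available is mocked so the sketch stays runnable on a CPU-only machine with only the base dependencies installed.

from unittest import mock

from autora.doc.runtime.predict_hf import Predictor, quantized_models

# With CUDA available (mocked here), a model that has a pre-quantized
# counterpart is remapped, so no BitsAndBytesConfig is generated at load time.
with mock.patch("torch.cuda.is_available", return_value=True):
    model, config = Predictor.get_config("meta-llama/Llama-2-7b-chat-hf")
    print(model)                            # autora-doc/Llama-2-7b-chat-hf-nf4
    print("quantization_config" in config)  # False: the weights are already nf4
    print(config["device_map"])             # auto

    # A model with no pre-quantized counterpart keeps its path and gets a
    # 4-bit quantization config built on the fly.
    model, config = Predictor.get_config("hf-internal-testing/tiny-random-FalconForCausalLM")
    print(model)                            # unchanged
    print("quantization_config" in config)  # True

# Without CUDA, the requested path comes back with an empty config.
with mock.patch("torch.cuda.is_available", return_value=False):
    model, config = Predictor.get_config("meta-llama/Llama-2-7b-chat-hf")
    print(model, config)                    # meta-llama/Llama-2-7b-chat-hf {}

Callers keep passing the public base-model id; Predictor.__init__ applies the mapping internally and logs when a request is redirected to the nf4 repository pushed by notebooks/import_model.ipynb.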