diff --git a/.github/actions/deps/action.yaml b/.github/actions/deps/action.yaml index 04b8c5b..fff24c8 100644 --- a/.github/actions/deps/action.yaml +++ b/.github/actions/deps/action.yaml @@ -18,4 +18,4 @@ runs: python3 -m pip install --upgrade pip - name: Install project shell: sh - run: pip install ".[dev,train]" + run: pip install ".[dev,train,cuda]" diff --git a/notebooks/generate.ipynb b/notebooks/generate.ipynb index 17a5cb4..510de85 100644 --- a/notebooks/generate.ipynb +++ b/notebooks/generate.ipynb @@ -25,12 +25,9 @@ "# Uncomment to clone and install autodoc from GitHub\n", "# !pip uninstall -y autora-doc\n", "# !git clone https://github.com/AutoResearch/autodoc.git\n", - "# !pip install -e \"./autodoc[cuda,train]\"\n", + "# !pip install \"./autodoc[cuda,train]\"\n", "\n", - "# Login to Huggingface since access to the model repo is private\n", - "# 1) Request access through: https://ai.meta.com/resources/models-and-libraries/llama-downloads/\n", - "# 2) Get a Huggingface token from: https://huggingface.co/settings/token (use same email as above)\n", - "# !huggingface-cli login --token " + "# IMPORTANT: Please restart the runtime after running the above commands" ] }, { diff --git a/notebooks/import_model.ipynb b/notebooks/import_model.ipynb new file mode 100644 index 0000000..b6b4715 --- /dev/null +++ b/notebooks/import_model.ipynb @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer\n", + "import torch\n", + "import huggingface_hub\n", + "\n", + "print(torch.cuda.is_available())" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_path = \"meta-llama/Llama-2-7b-chat-hf\"\n", + "print(model_path)\n", + "target_model_path = \"autora-doc/Llama-2-7b-chat-hf-nf4\"" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the model in 4bit quantization for faster inference on smaller GPUs\n", + "conf = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_quant_type=\"nf4\",\n", + " bnb_4bit_compute_dtype=torch.bfloat16,\n", + ")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the tokenizer and model\n", + "tokenizer = AutoTokenizer.from_pretrained(model_path)\n", + "model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=conf, device_map=\"auto\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This will work when running from a Jupyter notebook or Colab.\n", + "# For other authentication methods, see https://huggingface.co/docs/huggingface_hub/main/en/quick-start#authentication\n", + "huggingface_hub.notebook_login(new_session=False, write_permission=True)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer.push_to_hub(target_model_path)\n", + "model.push_to_hub(target_model_path)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Alternatively, upload to Azure Blob Storage (currently not used)\n", + "from azureml.core import Workspace\n", + "\n", + "# save locally first\n", + "tokenizer.save_pretrained(f\"./models/{model_path}\")\n", +
"model.save_pretrained(f\"./models/{model_path}\")\n", + "\n", + "# If all goes well, upload to blob storage:\n", + "workspace = Workspace.from_config()\n", + "ds = workspace.get_default_datastore()\n", + "ds.upload(f\"./models/{model_path}\", f\"./base_models/{target_model_path}\", show_progress=True, overwrite=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/pyproject.toml b/pyproject.toml index 7ce597f..ba26438 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,8 @@ [project] name = "autora-doc" -license = {file = "LICENSE"} +license = { file = "LICENSE" } readme = "README.md" -authors = [ - { name = "Carlos Garcia Jurado Suarez", email = "carlosg@uw.edu" } -] +authors = [{ name = "Carlos Garcia Jurado Suarez", email = "carlosg@uw.edu" }] requires-python = ">=3.8" classifiers = [ "Development Status :: 4 - Beta", @@ -20,7 +18,7 @@ dependencies = [ "scipy", # This works, while installing from pytorch and cuda from conda does not", "torch==2.0.1", - "transformers>=4.35.2", + "transformers>=4.37.2", "nltk", ] @@ -29,36 +27,26 @@ description = "Automatic documentation generator from AutoRA code" [project.optional-dependencies] dev = [ "pytest", - "pytest-cov", # Used to report total code coverage - "pre-commit", # Used to run checks before finalizing a git commit - "sphinx", # Used to automatically generate documentation + "pytest-cov", # Used to report total code coverage + "pre-commit", # Used to run checks before finalizing a git commit + "sphinx", # Used to automatically generate documentation "sphinx-rtd-theme", # Used to render documentation - "sphinx-autoapi", # Used to automatically generate api documentation - "black", # Used for static linting of files - "mypy", # Used for static type checking of files + "sphinx-autoapi", # Used to automatically generate api documentation + "black", # Used for static linting of files + "mypy", # Used for static type checking of files # if you add dependencies here while experimenting in a notebook and you # want that notebook to render in your documentation, please add the # dependencies to ./docs/requirements.txt as well. 
- "nbconvert", # Needed for pre-commit check to clear output from Python notebooks - "nbsphinx", # Used to integrate Python notebooks into Sphinx documentation - "ipython", # Also used in building notebooks into Sphinx - "matplotlib", # Used in sample notebook intro_notebook.ipynb + "nbconvert", # Needed for pre-commit check to clear output from Python notebooks + "nbsphinx", # Used to integrate Python notebooks into Sphinx documentation + "ipython", # Also used in building notebooks into Sphinx + "matplotlib", # Used in sample notebook intro_notebook.ipynb "ipykernel", "hf_transfer", ] -train = [ - "jsonlines", - "mlflow", -] -azure = [ - "azureml-core", - "azureml-mlflow", -] -cuda = [ - "bitsandbytes>=0.41.2.post2", - "accelerate>=0.24.1", - "xformers", -] +train = ["jsonlines", "mlflow"] +azure = ["azureml-core", "azureml-mlflow"] +cuda = ["bitsandbytes>=0.42.0", "accelerate>=0.24.1", "xformers"] [project.urls] Homepage = "https://github.com/AutoResearch/autodoc" @@ -68,9 +56,7 @@ requires = ["hatchling", "hatch-vcs"] build-backend = "hatchling.build" [tool.pytest.ini_options] -testpaths = [ - "tests", -] +testpaths = ["tests"] [tool.black] line-length = 110 @@ -81,7 +67,7 @@ profile = "black" line_length = 110 [tool.coverage.run] -omit=["src/autora/doc/_version.py"] +omit = ["src/autora/doc/_version.py"] [tool.hatch] diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py index 33d6504..d01c407 100644 --- a/src/autora/doc/pipelines/main.py +++ b/src/autora/doc/pipelines/main.py @@ -187,7 +187,7 @@ def generate( predictions = pred.predict(prompt, [input], **param_dict) assert len(predictions) == 1, f"Expected only one output, got {len(predictions)}" logger.info(f"Writing output to {output}") - with open(output, "w") as f: + with open(output, "w", encoding="utf-8") as f: f.write(predictions[0]) diff --git a/src/autora/doc/runtime/predict_hf.py b/src/autora/doc/runtime/predict_hf.py index 9a5adbd..a3766dc 100644 --- a/src/autora/doc/runtime/predict_hf.py +++ b/src/autora/doc/runtime/predict_hf.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, Iterable, List +from typing import Dict, Iterable, List, Tuple import torch import transformers @@ -9,6 +9,8 @@ logger = logging.getLogger(__name__) +quantized_models = {"meta-llama/Llama-2-7b-chat-hf": "autora-doc/Llama-2-7b-chat-hf-nf4"} + def preprocess_code(code: str) -> str: lines: Iterable[str] = code.splitlines() @@ -21,10 +23,12 @@ def preprocess_code(code: str) -> str: class Predictor: - def __init__(self, model_path: str): - config = self.get_config() + def __init__(self, input_model_path: str): + model_path, config = Predictor.get_config(input_model_path) + if model_path != input_model_path: + logger.info(f"Mapped requested model '{input_model_path}' to '{model_path}'") - logger.info(f"Loading model from {model_path}") + logger.info(f"Loading model from {model_path} using config {config}") self.tokenizer = AutoTokenizer.from_pretrained(model_path) self.model = AutoModelForCausalLM.from_pretrained( model_path, @@ -84,19 +88,24 @@ def tokenize(self, input: List[str]) -> Dict[str, List[List[int]]]: tokens: Dict[str, List[List[int]]] = self.tokenizer(input) return tokens - def get_config(self) -> Dict[str, str]: + @staticmethod + def get_config(model_path: str) -> Tuple[str, Dict[str, str]]: if torch.cuda.is_available(): from transformers import BitsAndBytesConfig + config = {"device_map": "auto"} + mapped_path = quantized_models.get(model_path, None) + if mapped_path: + # found an already quantized model, 
so no need to get a new quant config + return mapped_path, config + # Load the model in 4bit quantization for faster inference on smaller GPUs - return { - "quantization_config": BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, - ), - "device_map": "auto", - } + config["quantization_config"] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, + ) + return model_path, config else: - return {} + return model_path, {} diff --git a/tests/test_predict_hf.py b/tests/test_predict_hf.py index 9ad2c47..e433429 100644 --- a/tests/test_predict_hf.py +++ b/tests/test_predict_hf.py @@ -1,4 +1,10 @@ -from autora.doc.runtime.predict_hf import Predictor +from unittest import mock + +from autora.doc.runtime.predict_hf import Predictor, quantized_models + +# Test models with and without available quantized models +MODEL_NO_QUANTIZED = "hf-internal-testing/tiny-random-FalconForCausalLM" +MODEL_WITH_QUANTIZED = "meta-llama/Llama-2-7b-chat-hf" def test_trim_prompt() -> None: @@ -14,3 +20,22 @@ def test_trim_prompt() -> None: """ output = Predictor.trim_prompt(with_marker) assert output == "output\n" + + +@mock.patch("torch.cuda.is_available", return_value=True) +def test_get_config_cuda(mock: mock.Mock) -> None: + model, config = Predictor.get_config(MODEL_WITH_QUANTIZED) + assert model == quantized_models[MODEL_WITH_QUANTIZED] + assert "quantization_config" not in config + + model, config = Predictor.get_config(MODEL_NO_QUANTIZED) + # no pre-quantized model available + assert model == MODEL_NO_QUANTIZED + assert "quantization_config" in config + + +@mock.patch("torch.cuda.is_available", return_value=False) +def test_get_config_nocuda(mock: mock.Mock) -> None: + model, config = Predictor.get_config(MODEL_WITH_QUANTIZED) + assert model == MODEL_WITH_QUANTIZED + assert len(config) == 0
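
Reviewer note: the snippet below is a minimal usage sketch (not part of the patch) of the refactored Predictor.get_config introduced in src/autora/doc/runtime/predict_hf.py. It mirrors the new unit tests: the model ids come from the quantized_models table and tests/test_predict_hf.py, and torch.cuda.is_available is mocked so the sketch stays runnable on a CPU-only machine with only the base dependencies installed.

from unittest import mock

from autora.doc.runtime.predict_hf import Predictor, quantized_models

# With CUDA available (mocked here), a model that has a pre-quantized
# counterpart is remapped, so no BitsAndBytesConfig is generated at load time.
with mock.patch("torch.cuda.is_available", return_value=True):
    model, config = Predictor.get_config("meta-llama/Llama-2-7b-chat-hf")
    print(model)                            # autora-doc/Llama-2-7b-chat-hf-nf4
    print("quantization_config" in config)  # False: the weights are already nf4
    print(config["device_map"])             # auto

    # A model with no pre-quantized counterpart keeps its path and gets a
    # 4-bit quantization config built on the fly.
    model, config = Predictor.get_config("hf-internal-testing/tiny-random-FalconForCausalLM")
    print(model)                            # unchanged
    print("quantization_config" in config)  # True

# Without CUDA, the requested path comes back with an empty config.
with mock.patch("torch.cuda.is_available", return_value=False):
    model, config = Predictor.get_config("meta-llama/Llama-2-7b-chat-hf")
    print(model, config)                    # meta-llama/Llama-2-7b-chat-hf {}

Callers keep passing the public base-model id; Predictor.__init__ applies the mapping internally and logs when a request is redirected to the nf4 repository pushed by notebooks/import_model.ipynb.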