From cc6bf7e1ee083f41d9a948af94501de1ebc33267 Mon Sep 17 00:00:00 2001
From: Carlos Garcia Jurado Suarez
Date: Thu, 1 Feb 2024 13:36:45 -0800
Subject: [PATCH] unit tests

---
 src/autora/doc/runtime/predict_hf.py | 17 +++++++++--------
 tests/test_predict_hf.py             | 27 ++++++++++++++++++++++++++-
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/src/autora/doc/runtime/predict_hf.py b/src/autora/doc/runtime/predict_hf.py
index 1af324f..50a5767 100644
--- a/src/autora/doc/runtime/predict_hf.py
+++ b/src/autora/doc/runtime/predict_hf.py
@@ -25,7 +25,7 @@ def preprocess_code(code: str) -> str:
 
 class Predictor:
     def __init__(self, input_model_path: str):
-        model_path, config = self.get_config(input_model_path)
+        model_path, config = Predictor.get_config(input_model_path)
 
         if model_path != input_model_path:
             logger.info(f"Mapped requested model '{input_model_path}' to '{model_path}'")
@@ -89,7 +89,8 @@ def tokenize(self, input: List[str]) -> Dict[str, List[List[int]]]:
         tokens: Dict[str, List[List[int]]] = self.tokenizer(input)
         return tokens
 
-    def get_config(self, model_path: str) -> Tuple[str, Dict[str, str]]:
+    @staticmethod
+    def get_config(model_path: str) -> Tuple[str, Dict[str, str]]:
         if torch.cuda.is_available():
             from transformers import BitsAndBytesConfig
 
@@ -100,12 +101,12 @@ def get_config(self, model_path: str) -> Tuple[str, Dict[str, str]]:
                 return mapped_path, config
 
             # Load the model in 4bit quantization for faster inference on smaller GPUs
-            config ["quantization_config"] = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_use_double_quant=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch.bfloat16,
-            )
+            config["quantization_config"] = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.bfloat16,
+            )
             return model_path, config
         else:
             return model_path, {}
diff --git a/tests/test_predict_hf.py b/tests/test_predict_hf.py
index 9ad2c47..e433429 100644
--- a/tests/test_predict_hf.py
+++ b/tests/test_predict_hf.py
@@ -1,4 +1,10 @@
-from autora.doc.runtime.predict_hf import Predictor
+from unittest import mock
+
+from autora.doc.runtime.predict_hf import Predictor, quantized_models
+
+# Test models with and without available quantized models
+MODEL_NO_QUANTIZED = "hf-internal-testing/tiny-random-FalconForCausalLM"
+MODEL_WITH_QUANTIZED = "meta-llama/Llama-2-7b-chat-hf"
 
 
 def test_trim_prompt() -> None:
@@ -14,3 +20,22 @@ def test_trim_prompt() -> None:
     """
     output = Predictor.trim_prompt(with_marker)
     assert output == "output\n"
+
+
+@mock.patch("torch.cuda.is_available", return_value=True)
+def test_get_config_cuda(mock: mock.Mock) -> None:
+    model, config = Predictor.get_config(MODEL_WITH_QUANTIZED)
+    assert model == quantized_models[MODEL_WITH_QUANTIZED]
+    assert "quantization_config" not in config
+
+    model, config = Predictor.get_config(MODEL_NO_QUANTIZED)
+    # no pre-quantized model available
+    assert model == MODEL_NO_QUANTIZED
+    assert "quantization_config" in config
+
+
+@mock.patch("torch.cuda.is_available", return_value=False)
+def test_get_config_nocuda(mock: mock.Mock) -> None:
+    model, config = Predictor.get_config(MODEL_WITH_QUANTIZED)
+    assert model == MODEL_WITH_QUANTIZED
+    assert len(config) == 0
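
Usage sketch (not part of the patch): a minimal example of calling the now-static Predictor.get_config directly, mirroring test_get_config_nocuda above. torch.cuda.is_available is patched to force the CPU-only branch, so the snippet needs nothing beyond torch and this package being importable; the model name is the same illustrative constant the tests use.

    from unittest import mock

    from autora.doc.runtime.predict_hf import Predictor

    # Force the non-CUDA branch so no GPU or bitsandbytes setup is required.
    with mock.patch("torch.cuda.is_available", return_value=False):
        model, config = Predictor.get_config("meta-llama/Llama-2-7b-chat-hf")

    # Without CUDA the requested model name is returned unchanged and the
    # config dict is empty, matching the assertions in test_get_config_nocuda.
    assert model == "meta-llama/Llama-2-7b-chat-hf"
    assert len(config) == 0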