diff --git a/auto_round/utils.py b/auto_round/utils.py
index 47c4ef2f..2b8d817b 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -25,6 +25,13 @@
 import torch
 from torch.amp import autocast
 
+from functools import lru_cache
+@lru_cache(None)
+def warning_once(self, msg: str):
+    self.warning(msg)
+
+
+logging.Logger.warning_once = warning_once
 logger = logging.getLogger("autoround")
 logger.setLevel(logging.INFO)
 logger.propagate = False
@@ -35,7 +42,6 @@
 import importlib
 
 import transformers
-from functools import lru_cache
 
 class LazyImport(object):
     """Lazy import python module till use."""
@@ -607,11 +613,6 @@ def get_autogptq_backend_config(backend, bits=4):
         use_triton = False
     return use_triton, disable_exllamav1, disable_exllamav2, use_qigen, disable_marlin
 
-@lru_cache(None)
-def warning_once(logger, msg: str):
-    logger.warning(msg)
-
-logger.warning_once = warning_once
 
 def dynamic_import_inference_linear(bits, group_size, backend):
     """Dynamically imports and returns the appropriate QuantLinear class based on the given bits and backend.
@@ -660,6 +661,7 @@ def dynamic_import_inference_linear(bits, group_size, backend):
     elif bits == 4 and "exllamav2" in backend:
         logger.warning_once("Please install auto-round from source to enable exllamav2 kernels, switch to triton "
                             "kernels for now")
+        from auto_round_extension.cuda.qliner_triton import QuantLinear
     else:
         from auto_round_extension.cuda.qliner_triton import QuantLinear
     return QuantLinear
diff --git a/examples/language-modeling/requirements.txt b/examples/language-modeling/requirements.txt
index 292166e4..13d67e77 100644
--- a/examples/language-modeling/requirements.txt
+++ b/examples/language-modeling/requirements.txt
@@ -17,4 +17,5 @@
 auto-gptq
 openpyxl
 wandb
 py-cpuinfo
+numpy < 2.0
diff --git a/requirements.txt b/requirements.txt
index cb8df6bb..0f5c4f7a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ sentencepiece
 torch
 transformers
 triton
+numpy < 2.0
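
Note on the warning_once change: the patch moves the helper off a single logger instance and monkey-patches it onto logging.Logger itself, with functools.lru_cache(None) caching on the (logger, message) pair so each distinct warning is emitted only once per process, and a module-level import of the fallback QuantLinear is no longer required to be duplicated. Below is a minimal, self-contained sketch of the same pattern; the "demo" logger name and the warning text are illustrative, not taken from the patch:

import logging
from functools import lru_cache


@lru_cache(None)
def warning_once(self, msg: str):
    # 'self' is the Logger instance; lru_cache keys on (self, msg),
    # so repeating the same message on the same logger is a no-op.
    self.warning(msg)


# Attach the cached function to the Logger class, as the diff above does,
# so every logger instance gains a .warning_once() method.
logging.Logger.warning_once = warning_once

if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    demo = logging.getLogger("demo")  # hypothetical logger name
    for _ in range(3):
        demo.warning_once("falling back to triton kernels")  # printed once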