From 1996c11cae6d4672752fa39bd778db77fcc65f06 Mon Sep 17 00:00:00 2001
From: Ayush Sawant
Date: Fri, 5 Jan 2024 12:04:29 +0530
Subject: [PATCH 1/3] Added support for bitsandbytes quantization of models
 (#33)

* added support for bitsandbytes quantization of models
---
 llm/handler.py                   | 22 ++++++++++++++++++++++
 llm/model_config.json            |  2 +-
 llm/requirements.txt             |  3 ++-
 llm/run.sh                       |  8 +++++++-
 llm/torchserve_run.py            |  7 +++++++
 llm/utils/generate_data_model.py |  8 +++++---
 llm/utils/tsutils.py             | 23 +++++++++++++++++++++++
 7 files changed, 67 insertions(+), 6 deletions(-)

diff --git a/llm/handler.py b/llm/handler.py
index be33851..a2ff8c5 100644
--- a/llm/handler.py
+++ b/llm/handler.py
@@ -104,8 +104,30 @@ def initialize(self, context):
         self.tokenizer.padding_side = "left"
         logger.info("Tokenizer loaded successfully")
 
+        quantize_bits = None
+        if os.environ.get("NAI_QUANTIZATION"):
+            quantize_bits = int(self.get_env_value("NAI_QUANTIZATION"))
+
+        if quantize_bits == 4:
+            bnb_config = transformers.BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_use_double_quant=False,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.bfloat16,
+            )
+            quantization_config = bnb_config
+            logger.info("Loading Model with %s bit Quantization", quantize_bits)
+        elif quantize_bits == 8:
+            bnb_config = transformers.BitsAndBytesConfig(load_in_8bit=True)
+            quantization_config = bnb_config
+            logger.info("Loading Model with %s bit Quantization", quantize_bits)
+        else:
+            quantization_config = None
+            logger.info("Loading Model with bfloat16 data type")
+
         self.model = transformers.AutoModelForCausalLM.from_pretrained(
             model_dir,
+            quantization_config=quantization_config,
             torch_dtype=torch.bfloat16,  # Load model weights in bfloat16
             device_map=self.device_map,
             local_files_only=True,
diff --git a/llm/model_config.json b/llm/model_config.json
index 48fe57e..5df5969 100644
--- a/llm/model_config.json
+++ b/llm/model_config.json
@@ -77,4 +77,4 @@
             "response_timeout": 2000
         }
     }
-}
+}
\ No newline at end of file
diff --git a/llm/requirements.txt b/llm/requirements.txt
index 813d29b..ec50fcc 100644
--- a/llm/requirements.txt
+++ b/llm/requirements.txt
@@ -9,4 +9,5 @@ accelerate==0.22.0
 nvgpu==0.10.0
 torchserve==0.8.2
 torch-model-archiver==0.8.1
-einops==0.6.1
\ No newline at end of file
+einops==0.6.1
+bitsandbytes==0.41.1
\ No newline at end of file
diff --git a/llm/run.sh b/llm/run.sh
index 1d1399f..1d72316 100644
--- a/llm/run.sh
+++ b/llm/run.sh
@@ -10,16 +10,18 @@ helpFunction()
    echo -e "\t-v HuggingFace repository version (optional)"
    echo -e "\t-d Absolute path of input data folder (optional)"
    echo -e "\t-a Absolute path to the Model Store directory"
+   echo -e "\t-q BitsAndBytes Quantization Precision (4 or 8) (optional)"
    exit 1 # Exit script after printing help
 }
 
-while getopts ":n:v:d:a:o:r" opt;
+while getopts ":n:v:d:q:a:" opt;
 do
    case "$opt" in
        n ) model_name="$OPTARG" ;;
        v ) repo_version="$OPTARG" ;;
        d ) data="$OPTARG" ;;
        a ) model_store="$OPTARG" ;;
+       q ) quantize_bits="$OPTARG" ;;
        ? ) helpFunction ;; # Print helpFunction in case parameter is non-existent
    esac
 done
@@ -53,6 +55,10 @@ function create_execution_cmd()
     if [ ! -z "$data" ] ; then
         cmd+=" --data $data"
     fi
+
+    if [ ! -z "$quantize_bits" ] ; then
-z "$quantize_bits" ] ; then + cmd+=" --quantize_bits $quantize_bits" + fi } function inference_exec_vm(){ diff --git a/llm/torchserve_run.py b/llm/torchserve_run.py index 8daab92..66c7d8f 100644 --- a/llm/torchserve_run.py +++ b/llm/torchserve_run.py @@ -116,6 +116,7 @@ def run_inference(params: argparse.Namespace) -> None: if params.data: check_if_path_exists(params.data, "Input data folder", is_dir=True) + ts.set_model_precision(params.quantize_bits) create_folder_if_not_exists( os.path.join(os.path.dirname(__file__), "utils", params.gen_folder_name) ) @@ -219,5 +220,11 @@ def cleanup(gen_folder: str, ts_stop: bool = True, ts_cleanup: bool = True) -> N metavar="model_store", help="absolute path to the model store directory", ) + parser.add_argument( + "--quantize_bits", + type=str, + default="", + help="BitsAndBytes Quantization Precision (4 or 8)", + ) args = parser.parse_args() torchserve_run(args) diff --git a/llm/utils/generate_data_model.py b/llm/utils/generate_data_model.py index ce11219..f595e4e 100644 --- a/llm/utils/generate_data_model.py +++ b/llm/utils/generate_data_model.py @@ -122,9 +122,11 @@ def validate_hf_token(self) -> None: and self.repo_info.hf_token is None ): print( - "## Error: HuggingFace Hub token is required for llama download." - " Please specify it using --hf_token=. " - "Refer https://huggingface.co/docs/hub/security-tokens" + ( + "HuggingFace Hub token is required for llama download. " + "Please specify it using --hf_token= argument " + ". Refer https://huggingface.co/docs/hub/security-tokens" + ) ) sys.exit(1) diff --git a/llm/utils/tsutils.py b/llm/utils/tsutils.py index 7880f81..928393a 100644 --- a/llm/utils/tsutils.py +++ b/llm/utils/tsutils.py @@ -8,10 +8,12 @@ different operating systems. """ import os +import sys import platform import time import json from typing import Tuple, Dict +import torch import requests from utils.inference_data_model import InferenceDataModel, TorchserveStartData from utils.system_utils import check_if_path_exists @@ -198,6 +200,27 @@ def set_model_params(model_name: str) -> None: del os.environ[param_name] +def set_model_precision(quantize_bits: int) -> None: + """ + This function reads the precision to which the model weights are to be + quantized and sets it as environment variable for the handler + to read. + + Args: + quantize_bits (int): BitsAndBytes Quantization Precision. + """ + if quantize_bits and int(quantize_bits) not in [4, 8]: + print( + "## Quantization precision bits should be either 4 or 8. Default precision used is 16" + ) + sys.exit(1) + elif quantize_bits and not torch.cuda.is_available(): + print("## BitsAndBytes Quantization requires GPUs") + sys.exit(1) + else: + os.environ["NAI_QUANTIZATION"] = quantize_bits + + def get_params_for_registration(model_name: str) -> Tuple[str, str, str, str]: """ This function reads registration parameters from model_config.json returns them. 
From c9b69f77f468149ed958b8f7404b4e25fb501f33 Mon Sep 17 00:00:00 2001
From: Ayush Sawant
Date: Fri, 5 Jan 2024 12:29:44 +0530
Subject: [PATCH 2/3] Added support for hf_token as an environment variable
 (#34)

* added support for hf_token as an environment variable
---
 llm/utils/generate_data_model.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llm/utils/generate_data_model.py b/llm/utils/generate_data_model.py
index f595e4e..80ad35e 100644
--- a/llm/utils/generate_data_model.py
+++ b/llm/utils/generate_data_model.py
@@ -87,7 +87,7 @@ def set_values(self, params: argparse.Namespace) -> None:
         self.skip_download = params.skip_download
         self.debug = params.debug
 
-        self.repo_info.hf_token = params.hf_token
+        self.repo_info.hf_token = params.hf_token or os.environ.get("HF_TOKEN")
         self.repo_info.repo_id = params.repo_id
         self.repo_info.repo_version = params.repo_version
 
@@ -125,7 +125,8 @@ def validate_hf_token(self) -> None:
                 (
                     "HuggingFace Hub token is required for llama download. "
                     "Please specify it using --hf_token=<Your_HuggingFace_Hub_Token> argument "
-                    ". Refer https://huggingface.co/docs/hub/security-tokens"
+                    "or, set it as an environment variable 'HF_TOKEN'. Refer "
+                    "https://huggingface.co/docs/hub/security-tokens"
                 )
             )
             sys.exit(1)

From 2cf25462b642bda13a143c496a516940a381f45c Mon Sep 17 00:00:00 2001
From: Ayush Sawant
Date: Fri, 5 Jan 2024 15:19:03 +0530
Subject: [PATCH 3/3] updated helpFunction of run.sh (#35)

---
 llm/run.sh                       | 2 +-
 llm/utils/generate_data_model.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llm/run.sh b/llm/run.sh
index 1d72316..9f54320 100644
--- a/llm/run.sh
+++ b/llm/run.sh
@@ -5,7 +5,7 @@ wdir=$(dirname "$SCRIPT")
 helpFunction()
 {
    echo ""
-   echo "Usage: $0 -n <MODEL_NAME> -a <MODEL_STORE> [OPTIONAL -d <INPUT_DATA_FOLDER> -v <REPO_VERSION>]"
+   echo "Usage: $0 -n <MODEL_NAME> -a <MODEL_STORE> [OPTIONAL -d <INPUT_DATA_FOLDER> -v <REPO_VERSION> -q <QUANTIZE_BITS>]"
    echo -e "\t-n Name of the Model"
    echo -e "\t-v HuggingFace repository version (optional)"
    echo -e "\t-d Absolute path of input data folder (optional)"
diff --git a/llm/utils/generate_data_model.py b/llm/utils/generate_data_model.py
index 80ad35e..a8ff613 100644
--- a/llm/utils/generate_data_model.py
+++ b/llm/utils/generate_data_model.py
@@ -125,7 +125,7 @@ def validate_hf_token(self) -> None:
             (
                 "HuggingFace Hub token is required for llama download. "
                 "Please specify it using --hf_token=<Your_HuggingFace_Hub_Token> argument "
-                "or, set it as an environment variable 'HF_TOKEN'. Refer "
+                "or set it as an environment variable 'HF_TOKEN'. Refer "
                 "https://huggingface.co/docs/hub/security-tokens"
             )
         )
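
Taken together, patches 2 and 3 leave token handling with a single precedence rule: an explicit --hf_token argument wins, the HF_TOKEN environment variable is the fallback, and a missing token aborts with a pointer to the HuggingFace docs. The sketch below distills that rule; resolve_hf_token is an illustrative name, not a helper that exists in the repo.

    # Illustrative distillation of the token precedence from patches 2 and 3.
    import os
    import sys
    from typing import Optional

    def resolve_hf_token(cli_token: Optional[str] = None) -> str:
        # The CLI flag takes priority; HF_TOKEN is the environment fallback.
        token = cli_token or os.environ.get("HF_TOKEN")
        if token is None:
            print(
                "HuggingFace Hub token is required for llama download. "
                "Please specify it using --hf_token=<Your_HuggingFace_Hub_Token> argument "
                "or set it as an environment variable 'HF_TOKEN'. Refer "
                "https://huggingface.co/docs/hub/security-tokens"
            )
            sys.exit(1)
        return token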