Quantization Support #46

Merged: 7 commits, Jan 5, 2024
Changes from 3 commits
2 changes: 1 addition & 1 deletion .github/workflows/lint.yaml
@@ -25,7 +25,7 @@ jobs:
python-version: 3.11

- name: Install Python dependencies
- run: pip install --no-cache-dir pytest black pylint torchserve==0.8.2 torch==2.0.1 transformers==4.33.0 -r llm/requirements.txt
+ run: pip install --no-cache-dir pytest black pylint torchserve==0.8.2 torch==2.0.1 transformers==4.36.0 -r llm/requirements.txt

- name: Run pylint
run: pylint ./llm
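(The bumped pin matches the transformers==4.36.0 requirement already listed in llm/utils/model_requirements.txt further down in this PR.)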
21 changes: 21 additions & 0 deletions llm/handler.py
@@ -112,8 +112,29 @@ def initialize(self, context: ts.context.Context):
        self.tokenizer.padding_side = "left"
        logger.info("Tokenizer loaded successfully")

        quantize_bits = self.get_env_value("NAI_QUANTIZATION")
        quantize_bits = int(quantize_bits) if quantize_bits else quantize_bits
Collaborator:
Keep it simple here; you are doing the check just to type cast:

    if self.get_env_value("NAI_QUANTIZATION"):
        quantize_bits = int(self.get_env_value("NAI_QUANTIZATION"))

Contributor (author):
Changed as suggested.


        if quantize_bits == 4:
            bnb_config = transformers.BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=False,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
            quantization_config = bnb_config
            logger.info("Loading Model with %s bit Quantization", quantize_bits)
        elif quantize_bits == 8:
            bnb_config = transformers.BitsAndBytesConfig(load_in_8bit=True)
            quantization_config = bnb_config
            logger.info("Loading Model with %s bit Quantization", quantize_bits)
        else:
            quantization_config = None
            logger.info("Loading Model with bfloat16 data type")

        self.model = transformers.AutoModelForCausalLM.from_pretrained(
            model_dir,
            quantization_config=quantization_config,
            torch_dtype=torch.bfloat16,  # Load model weights in bfloat16
            device_map=self.device_map,
            local_files_only=True,
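For readers who want to exercise this quantization path outside the TorchServe handler, the selection logic above boils down to the minimal standalone sketch below. It assumes transformers >= 4.36 with bitsandbytes installed; the helper name is illustrative and not part of the PR.

    # Sketch of the NAI_QUANTIZATION handling added to handler.py (helper name is illustrative).
    import os

    import torch
    import transformers


    def build_quantization_config(quantize_bits):
        """Return a BitsAndBytesConfig for 4- or 8-bit loading, or None for plain bfloat16."""
        if quantize_bits == 4:
            return transformers.BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=False,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
        if quantize_bits == 8:
            return transformers.BitsAndBytesConfig(load_in_8bit=True)
        return None


    env_value = os.environ.get("NAI_QUANTIZATION")
    quantization_config = build_quantization_config(int(env_value) if env_value else None)
    # quantization_config is then passed to AutoModelForCausalLM.from_pretrained(...).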
20 changes: 20 additions & 0 deletions llm/kubeflow_inference_run.py
@@ -188,6 +188,10 @@ def create_isvc(
            client.V1EnvVar(
                name="NAI_MAX_TOKENS", value=str(model_params["max_new_tokens"])
            ),
            client.V1EnvVar(
                name="NAI_QUANTIZATION",
                value=str(model_params["quantize_bits"]),
            ),
        ],
        resources=client.V1ResourceRequirements(
            limits={
@@ -364,6 +368,7 @@ def execute(params: argparse.Namespace) -> None:
    input_path = params.data
    mount_path = params.mount_path
    model_timeout = params.model_timeout
    quantize_bits = params.quantize_bits
Collaborator:
Whitespace on top.

Contributor (author):
Fixed.


    check_if_path_exists(mount_path, "local nfs mount", is_dir=True)
    if not nfs_path or not nfs_server:
@@ -382,6 +387,15 @@ def execute(params: argparse.Namespace) -> None:
    model_info["repo_id"] = model_params["repo_id"]
    model_info["repo_version"] = check_if_valid_version(model_info, mount_path)

    if quantize_bits and int(quantize_bits) not in [4, 8]:
        print("## Quantization precision bits should be either 4 or 8")
Collaborator:
The question will come up as to why 16 is not accepted as well. Add text:

    print("## Quantization precision bits should be either 4 or 8. Default precision used is 16")

Contributor (author):
Changed the message as suggested.

        sys.exit(1)
    elif quantize_bits and not deployment_resources["gpus"]:
        print("## BitsAndBytes Quantization requires GPUs")
        sys.exit(1)
    else:
        model_params["quantize_bits"] = quantize_bits

    config.load_kube_config()
    core_api = client.CoreV1Api()

@@ -434,6 +448,12 @@ def execute(params: argparse.Namespace) -> None:
        default=None,
        help="HuggingFace Hub token to download LLAMA(2) models",
    )
    parser.add_argument(
        "--quantize_bits",
        type=str,
        default="",
        help="BitsAndBytes Quantization Precision (4 or 8)",
    )
    # Parse the command-line arguments
    args = parser.parse_args()
    execute(args)
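Condensed, the new flag's validation in execute() amounts to the sketch below. The helper name is hypothetical, and it assumes the GPU check is intended to reject CPU-only deployments, as the error message suggests.

    # Hypothetical helper mirroring the --quantize_bits checks in execute().
    import sys


    def validate_quantize_bits(quantize_bits, gpus):
        """Exit unless quantization is disabled ('' / None) or set to 4/8 with GPUs available."""
        if quantize_bits and int(quantize_bits) not in [4, 8]:
            print("## Quantization precision bits should be either 4 or 8. Default precision used is 16")
            sys.exit(1)
        if quantize_bits and not gpus:
            print("## BitsAndBytes Quantization requires GPUs")
            sys.exit(1)


    validate_quantize_bits("4", gpus=1)  # accepted: 4-bit on a GPU deployment
    validate_quantize_bits("", gpus=0)   # accepted: empty string leaves quantization off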
2 changes: 1 addition & 1 deletion llm/requirements.txt
@@ -1,4 +1,4 @@
torch-model-archiver==0.8.1
kubernetes==28.1.0
kserve==0.11.1
- huggingface-hub==0.17.1
+ huggingface-hub==0.20.1
10 changes: 8 additions & 2 deletions llm/run.sh
@@ -8,7 +8,7 @@ MODEL_TIMEOUT_IN_SEC="1500"

function helpFunction()
{
echo "Usage: $0 -n <MODEL_NAME> -g <NUM_OF_GPUS> -f <NFS_ADDRESS_WITH_SHARE_PATH> -m <NFS_LOCAL_MOUNT_LOCATION> -e <KUBE_DEPLOYMENT_NAME> [OPTIONAL -d <INPUT_DATA_ABSOLUTE_PATH> -v <REPO_COMMIT_ID> -t <Your_HuggingFace_Hub_Token>]"
echo "Usage: $0 -n <MODEL_NAME> -g <NUM_OF_GPUS> -f <NFS_ADDRESS_WITH_SHARE_PATH> -m <NFS_LOCAL_MOUNT_LOCATION> -e <KUBE_DEPLOYMENT_NAME> [OPTIONAL -d <INPUT_DATA_ABSOLUTE_PATH> -v <REPO_COMMIT_ID> -t <Your_HuggingFace_Hub_Token> -q <QUANTIZE_BITS>]"
echo -e "\t-f NFS server address with share path information"
echo -e "\t-m Absolute path to the NFS local mount location"
echo -e "\t-e Name of the deployment metadata"
@@ -18,6 +18,7 @@ function helpFunction()
echo -e "\t-g Number of gpus to be used to execute. Set 0 to use cpu"
echo -e "\t-v Commit id of the HuggingFace Repo."
echo -e "\t-t Your HuggingFace token (Required only for LLAMA2 model)."
echo -e "\t-q BitsAndBytes Quantization Precision (4 or 8)"
exit 1 # Exit script after printing help
}

@@ -67,12 +68,16 @@ function inference_exec_kubernetes()
exec_cmd+=" --hf_token $hf_token"
fi

if [ ! -z $quantize_bits ] ; then
exec_cmd+=" --quantize_bits $quantize_bits"
fi

echo "Running the Inference script";
$exec_cmd
}

# Entry Point
while getopts ":n:v:m:t:d:g:f:e:" opt;
while getopts ":n:v:m:t:d:g:f:e:q:" opt;
do
case "$opt" in
n ) model_name="$OPTARG" ;;
@@ -83,6 +88,7 @@ do
v ) repo_version="$OPTARG" ;;
m ) mount_path="$OPTARG" ;;
t ) hf_token="$OPTARG" ;;
q ) quantize_bits="$OPTARG" ;;
? ) helpFunction ;; # Print helpFunction in case parameter is non-existent
esac
done
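For reference, a deployment with 4-bit quantization could then be launched along the lines of: bash llm/run.sh -n <MODEL_NAME> -g 1 -f <NFS_ADDRESS_WITH_SHARE_PATH> -m <NFS_LOCAL_MOUNT_LOCATION> -e <KUBE_DEPLOYMENT_NAME> -q 4. The values are placeholders, and the remaining flags behave as documented in helpFunction above.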
4 changes: 3 additions & 1 deletion llm/utils/model_requirements.txt
@@ -2,4 +2,6 @@ torch==2.0.1
tokenizers==0.15.0
transformers==4.36.0
accelerate==0.22.0
- einops==0.6.1
+ einops==0.6.1
bitsandbytes==0.41.1
scipy==1.11.4