Removed 16-bit precision option

nutanix · Jan 4, 2024 · 85b6eda · 85b6eda
1 parent 5a5bd2b
commit 85b6eda
Show file tree

Hide file tree

Showing 3 changed files with 9 additions and 10 deletions.
diff --git a/llm/handler.py b/llm/handler.py
@@ -112,9 +112,8 @@ def initialize(self, context: ts.context.Context):
  self.tokenizer.padding_side = "left"
  logger.info("Tokenizer loaded successfully")
 
- quantize_bits = 16
- if os.environ.get("NAI_QUANTIZATION"):
- quantize_bits = int(self.get_env_value("NAI_QUANTIZATION"))
+ quantize_bits = self.get_env_value("NAI_QUANTIZATION")
+ quantize_bits = int(quantize_bits) if quantize_bits else quantize_bits
 
  if quantize_bits == 4:
  bnb_config = transformers.BitsAndBytesConfig(

diff --git a/llm/kubeflow_inference_run.py b/llm/kubeflow_inference_run.py
@@ -387,10 +387,10 @@ def execute(params: argparse.Namespace) -> None:
  model_info["repo_id"] = model_params["repo_id"]
  model_info["repo_version"] = check_if_valid_version(model_info, mount_path)
 
- if quantize_bits not in [4, 8, 16]:
- print("## Quantization precision bits should be either 4, 8 or 16")
+ if quantize_bits and int(quantize_bits) not in [4, 8]:
+ print("## Quantization precision bits should be either 4 or 8")
  sys.exit(1)
- elif quantize_bits in [4, 8] and not deployment_resources["gpus"]:
+ elif quantize_bits and deployment_resources["gpus"]:
  print("## BitsAndBytes Quantization requires GPUs")
  sys.exit(1)
  else:
@@ -450,9 +450,9 @@ def execute(params: argparse.Namespace) -> None:
  )
  parser.add_argument(
  "--quantize_bits",
- type=int,
- default=16,
- help="BitsAndBytes Quantization Precision (4, 8 or 16)",
+ type=str,
+ default="",
+ help="BitsAndBytes Quantization Precision (4 or 8)",
  )
  # Parse the command-line arguments
  args = parser.parse_args()

diff --git a/llm/run.sh b/llm/run.sh
@@ -18,7 +18,7 @@ function helpFunction()
  echo -e "\t-g Number of gpus to be used to execute. Set 0 to use cpu"
  echo -e "\t-v Commit id of the HuggingFace Repo."
  echo -e "\t-t Your HuggingFace token (Required only for LLAMA2 model)."
- echo -e "\t-q BitsAndBytes Quantization Precision (4, 8 or 16)"
+ echo -e "\t-q BitsAndBytes Quantization Precision (4 or 8)"
  exit 1 # Exit script after printing help
 }