Quantization Support #46

Merged: 7 commits, Jan 5, 2024
Changes from 3 commits
2 changes: 1 addition & 1 deletion .github/workflows/lint.yaml
@@ -25,7 +25,7 @@ jobs:
python-version: 3.11

- name: Install Python dependencies
- run: pip install --no-cache-dir pytest black pylint torchserve==0.8.2 torch==2.0.1 transformers==4.33.0 -r llm/requirements.txt
+ run: pip install --no-cache-dir pytest black pylint torchserve==0.8.2 torch==2.0.1 transformers==4.36.0 -r llm/requirements.txt

- name: Run pylint
run: pylint ./llm
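(The bumped pin matches the transformers==4.36.0 requirement already listed in llm/utils/model_requirements.txt further down in this PR.)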
21 changes: 21 additions & 0 deletions llm/handler.py
@@ -112,8 +112,29 @@ def initialize(self, context: ts.context.Context):
        self.tokenizer.padding_side = "left"
        logger.info("Tokenizer loaded successfully")

        quantize_bits = self.get_env_value("NAI_QUANTIZATION")
        quantize_bits = int(quantize_bits) if quantize_bits else quantize_bits
Collaborator:
Keep it simple here; you are doing the check just to type cast:

    if self.get_env_value("NAI_QUANTIZATION"):
        quantize_bits = int(self.get_env_value("NAI_QUANTIZATION"))

Contributor (author):
Changed as suggested.


        if quantize_bits == 4:
            bnb_config = transformers.BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=False,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
            quantization_config = bnb_config
            logger.info("Loading Model with %s bit Quantization", quantize_bits)
        elif quantize_bits == 8:
            bnb_config = transformers.BitsAndBytesConfig(load_in_8bit=True)
            quantization_config = bnb_config
            logger.info("Loading Model with %s bit Quantization", quantize_bits)
        else:
            quantization_config = None
            logger.info("Loading Model with bfloat16 data type")

        self.model = transformers.AutoModelForCausalLM.from_pretrained(
            model_dir,
            quantization_config=quantization_config,
            torch_dtype=torch.bfloat16,  # Load model weights in bfloat16
            device_map=self.device_map,
            local_files_only=True,
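For readers who want to exercise this quantization path outside the TorchServe handler, the selection logic above boils down to the minimal standalone sketch below. It assumes transformers >= 4.36 with bitsandbytes installed; the helper name is illustrative and not part of the PR.

    # Sketch of the NAI_QUANTIZATION handling added to handler.py (helper name is illustrative).
    import os

    import torch
    import transformers


    def build_quantization_config(quantize_bits):
        """Return a BitsAndBytesConfig for 4- or 8-bit loading, or None for plain bfloat16."""
        if quantize_bits == 4:
            return transformers.BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=False,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
        if quantize_bits == 8:
            return transformers.BitsAndBytesConfig(load_in_8bit=True)
        return None


    env_value = os.environ.get("NAI_QUANTIZATION")
    quantization_config = build_quantization_config(int(env_value) if env_value else None)
    # quantization_config is then passed to AutoModelForCausalLM.from_pretrained(...).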
20 changes: 20 additions & 0 deletions llm/kubeflow_inference_run.py
@@ -188,6 +188,10 @@ def create_isvc(
            client.V1EnvVar(
                name="NAI_MAX_TOKENS", value=str(model_params["max_new_tokens"])
            ),
            client.V1EnvVar(
                name="NAI_QUANTIZATION",
                value=str(model_params["quantize_bits"]),
            ),
        ],
        resources=client.V1ResourceRequirements(
            limits={
@@ -364,6 +368,7 @@ def execute(params: argparse.Namespace) -> None:
    input_path = params.data
    mount_path = params.mount_path
    model_timeout = params.model_timeout
    quantize_bits = params.quantize_bits
Collaborator:
Whitespace on top.

Contributor (author):
Fixed.


    check_if_path_exists(mount_path, "local nfs mount", is_dir=True)
    if not nfs_path or not nfs_server:
@@ -382,6 +387,15 @@ def execute(params: argparse.Namespace) -> None:
    model_info["repo_id"] = model_params["repo_id"]
    model_info["repo_version"] = check_if_valid_version(model_info, mount_path)

    if quantize_bits and int(quantize_bits) not in [4, 8]:
        print("## Quantization precision bits should be either 4 or 8")
Collaborator:
The question will come up as to why 16 is not accepted as well. Add text:

    print("## Quantization precision bits should be either 4 or 8. Default precision used is 16")

Contributor (author):
Changed the message as suggested.

        sys.exit(1)
    elif quantize_bits and not deployment_resources["gpus"]:
        print("## BitsAndBytes Quantization requires GPUs")
        sys.exit(1)
    else:
        model_params["quantize_bits"] = quantize_bits

    config.load_kube_config()
    core_api = client.CoreV1Api()

@@ -434,6 +448,12 @@ def execute(params: argparse.Namespace) -> None:
        default=None,
        help="HuggingFace Hub token to download LLAMA(2) models",
    )
    parser.add_argument(
        "--quantize_bits",
        type=str,
        default="",
        help="BitsAndBytes Quantization Precision (4 or 8)",
    )
    # Parse the command-line arguments
    args = parser.parse_args()
    execute(args)
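Condensed, the new flag's validation in execute() amounts to the sketch below. The helper name is hypothetical, and it assumes the GPU check is intended to reject CPU-only deployments, as the error message suggests.

    # Hypothetical helper mirroring the --quantize_bits checks in execute().
    import sys


    def validate_quantize_bits(quantize_bits, gpus):
        """Exit unless quantization is disabled ('' / None) or set to 4/8 with GPUs available."""
        if quantize_bits and int(quantize_bits) not in [4, 8]:
            print("## Quantization precision bits should be either 4 or 8. Default precision used is 16")
            sys.exit(1)
        if quantize_bits and not gpus:
            print("## BitsAndBytes Quantization requires GPUs")
            sys.exit(1)


    validate_quantize_bits("4", gpus=1)  # accepted: 4-bit on a GPU deployment
    validate_quantize_bits("", gpus=0)   # accepted: empty string leaves quantization off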
2 changes: 1 addition & 1 deletion llm/requirements.txt
@@ -1,4 +1,4 @@
torch-model-archiver==0.8.1
kubernetes==28.1.0
kserve==0.11.1
- huggingface-hub==0.17.1
+ huggingface-hub==0.20.1
10 changes: 8 additions & 2 deletions llm/run.sh
@@ -8,7 +8,7 @@ MODEL_TIMEOUT_IN_SEC="1500"

function helpFunction()
{
echo "Usage: $0 -n <MODEL_NAME> -g <NUM_OF_GPUS> -f <NFS_ADDRESS_WITH_SHARE_PATH> -m <NFS_LOCAL_MOUNT_LOCATION> -e <KUBE_DEPLOYMENT_NAME> [OPTIONAL -d <INPUT_DATA_ABSOLUTE_PATH> -v <REPO_COMMIT_ID> -t <Your_HuggingFace_Hub_Token>]"
echo "Usage: $0 -n <MODEL_NAME> -g <NUM_OF_GPUS> -f <NFS_ADDRESS_WITH_SHARE_PATH> -m <NFS_LOCAL_MOUNT_LOCATION> -e <KUBE_DEPLOYMENT_NAME> [OPTIONAL -d <INPUT_DATA_ABSOLUTE_PATH> -v <REPO_COMMIT_ID> -t <Your_HuggingFace_Hub_Token> -q <QUANTIZE_BITS>]"
echo -e "\t-f NFS server address with share path information"
echo -e "\t-m Absolute path to the NFS local mount location"
echo -e "\t-e Name of the deployment metadata"
@@ -18,6 +18,7 @@ function helpFunction()
echo -e "\t-g Number of gpus to be used to execute. Set 0 to use cpu"
echo -e "\t-v Commit id of the HuggingFace Repo."
echo -e "\t-t Your HuggingFace token (Required only for LLAMA2 model)."
echo -e "\t-q BitsAndBytes Quantization Precision (4 or 8)"
exit 1 # Exit script after printing help
}

@@ -67,12 +68,16 @@ function inference_exec_kubernetes()
exec_cmd+=" --hf_token $hf_token"
fi

if [ ! -z $quantize_bits ] ; then
exec_cmd+=" --quantize_bits $quantize_bits"
fi

echo "Running the Inference script";
$exec_cmd
}

# Entry Point
while getopts ":n:v:m:t:d:g:f:e:" opt;
while getopts ":n:v:m:t:d:g:f:e:q:" opt;
do
case "$opt" in
n ) model_name="$OPTARG" ;;
@@ -83,6 +88,7 @@ do
v ) repo_version="$OPTARG" ;;
m ) mount_path="$OPTARG" ;;
t ) hf_token="$OPTARG" ;;
q ) quantize_bits="$OPTARG" ;;
? ) helpFunction ;; # Print helpFunction in case parameter is non-existent
esac
done
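For reference, a deployment with 4-bit quantization could then be launched along the lines of: bash llm/run.sh -n <MODEL_NAME> -g 1 -f <NFS_ADDRESS_WITH_SHARE_PATH> -m <NFS_LOCAL_MOUNT_LOCATION> -e <KUBE_DEPLOYMENT_NAME> -q 4. The values are placeholders, and the remaining flags behave as documented in helpFunction above.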
4 changes: 3 additions & 1 deletion llm/utils/model_requirements.txt
@@ -2,4 +2,6 @@ torch==2.0.1
tokenizers==0.15.0
transformers==4.36.0
accelerate==0.22.0
- einops==0.6.1
+ einops==0.6.1
bitsandbytes==0.41.1
scipy==1.11.4