
Commit

Merge branch 'enable_qwen2_quantization' of https://github.com/intel/auto-round into enable_qwen2_quantization
WeiweiZhang1 committed Sep 12, 2024
2 parents 7c66e4d + 95ee0e3 commit d93e14b
Showing 17 changed files with 186 additions and 773 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -0,0 +1 @@
include requirements.txt
75 changes: 36 additions & 39 deletions README.md
@@ -77,44 +77,6 @@ output_dir = "./tmp_autoround"
autoround.save_quantized(output_dir, format='auto_round', inplace=True)
```

### Basic Usage
AutoRound supports Gaudi2, CPU, and GPU. A user guide detailing the full list of supported arguments is available by running ```auto_round -h``` in the terminal. Alternatively, you can use ```auto-round``` instead of ```auto_round```. (**auto-round version > 0.3.0**)


```bash
auto_round --model facebook/opt-125m \
--bits 4 \
--group_size 128 \
--format auto_round \
--output_dir ./tmp_autoround
```
We provide two recipes, one for best accuracy and one for fast running speed with low memory. Details are below.
<details>
<summary>Other Recipes</summary>

```bash
## best accuracy, 3X slower; low_gpu_mem_usage can save ~20GB of VRAM at a ~30% speed cost
auto_round --model facebook/opt-125m \
--bits 4 \
--group_size 128 \
--nsamples 512 \
--iters 1000 \
--low_gpu_mem_usage
```

```bash
## fast and low memory, 2-3X speedup, slight accuracy drop at W4G128
auto_round --model facebook/opt-125m \
--bits 4 \
--group_size 128 \
--nsamples 128 \
--iters 200 \
--seqlen 512 \
--batch_size 4
```
</details>
<br>

<details>
<summary>Detailed Hyperparameters</summary>

@@ -166,9 +128,44 @@ We provide two recipes for best accuracy and fast running speed with low memory.
- `device`: The device to be used for tuning. The default is set to 'auto', allowing for automatic detection.

</details>
<br>

### Basic Usage (auto-round version > 0.3.0)
AutoRound supports Gaudi2, CPU, and GPU. A user guide detailing the full list of supported arguments is available by running ```auto_round -h``` in the terminal. Alternatively, you can use ```auto-round``` instead of ```auto_round```.


```bash
auto_round --model facebook/opt-125m \
--bits 4 \
--group_size 128 \
--format auto_round \
--disable_eval \
--output_dir ./tmp_autoround
```
We provide two recipes, one for best accuracy and one for fast running speed with low memory. Details are below, and a rough Python-API equivalent of the command above follows the recipes.
<details>
<summary>Other Recipes</summary>

```bash
## best accuracy, 3X slower; low_gpu_mem_usage can save ~20GB of VRAM at a ~30% speed cost
auto_round --model facebook/opt-125m \
--bits 4 \
--group_size 128 \
--nsamples 512 \
--iters 1000 \
--low_gpu_mem_usage
```

```bash
## fast and low memory, 2-3X speedup, slight accuracy drop at W4G128
auto_round --model facebook/opt-125m \
--bits 4 \
--group_size 128 \
--nsamples 128 \
--iters 200 \
--seqlen 512 \
--batch_size 4
```
</details>
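
For reference, the CLI invocation above maps onto the Python API shown at the top of this README. The sketch below mirrors the `--bits 4 --group_size 128` example; the `AutoRound(...)` keyword names are assumed from the CLI flag names, so treat this as an outline rather than the documented interface.

```python
# Rough Python-API equivalent of the CLI example above (a sketch;
# constructor keyword names are assumed from the CLI flags).
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

autoround = AutoRound(model, tokenizer, bits=4, group_size=128)
autoround.quantize()

# Export in the same format and directory as the CLI example.
output_dir = "./tmp_autoround"
autoround.save_quantized(output_dir, format='auto_round', inplace=True)
```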

#### Formats

21 changes: 4 additions & 17 deletions auto_round/__main__.py
@@ -24,9 +24,9 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from lm_eval.utils import make_table # pylint: disable=E0401

from auto_round.utils import logger
from auto_round import AutoRoundConfig
from auto_round.eval.evaluation import simple_evaluate
from auto_round.utils import detect_device

def setup_parser():
parser = argparse.ArgumentParser()
@@ -156,7 +156,6 @@ def setup_parser():

def tune(args):
tasks = args.tasks
use_eval_legacy = False
if args.format is None:
args.format = "auto_round"

@@ -165,7 +164,6 @@ def tune(args):
model_name = model_name[:-1]
print(model_name, flush=True)

from auto_round.utils import detect_device

device_str = detect_device(args.device)
torch_dtype = "auto"
@@ -236,16 +234,6 @@ def tune(args):
seqlen = min(seqlen, tokenizer.model_max_length)
args.seqlen = seqlen

excel_name = f"{model_name}_{args.bits}_{args.group_size}"
if (hasattr(model, 'config') and (model.dtype is torch.bfloat16 or model.config.torch_dtype is torch.bfloat16)):
dtype = 'bfloat16'
else:
if "cpu" not in device_str:
dtype = 'float16'
else:
dtype = 'float32'

excel_name = f"{model_name}_{args.bits}_{args.group_size}"
if "bloom" in model_name:
args.low_gpu_mem_usage = False

@@ -309,7 +297,7 @@ def tune(args):
low_cpu_mem_usage=low_cpu_mem_usage, data_type=args.data_type,
enable_norm_bias_tuning=args.enable_norm_bias_tuning)
model, _ = autoround.quantize()
model_name = args.model_name.rstrip("/")
model_name = args.model.rstrip("/")
if args.low_cpu_mem_mode == 1 or args.low_cpu_mem_mode == 2:
import shutil

@@ -320,7 +308,6 @@ def tune(args):
torch.cuda.empty_cache()

export_dir = args.output_dir + "/" + model_name.split('/')[-1] + f"-autoround-w{args.bits}g{args.group_size}"
output_dir = args.output_dir + "/" + model_name.split('/')[-1] + f"-autoround-w{args.bits}g{args.group_size}-qdq"


format_list = args.format.replace(' ', '').split(',')
@@ -366,9 +353,10 @@ def get_library_version(library_name):

def eval(args):
quantization_config = AutoRoundConfig(backend=args.device)
device_str = detect_device(args.device)
user_model = AutoModelForCausalLM.from_pretrained(
args.model,
device_map=args.device, quantization_config=quantization_config)
device_map=device_str, quantization_config=quantization_config)
model_args = f"pretrained={args.model},trust_remote_code={not args.disable_trust_remote_code}"
if isinstance(args.tasks, str):
tasks = args.tasks.split(',')
@@ -377,7 +365,6 @@ def eval(args):
model_args=model_args,
user_model=user_model,
tasks=tasks,
device=args.device,
batch_size=args.eval_bs)

from lm_eval.utils import make_table # pylint: disable=E0401
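
The net effect of the `eval()` changes above is that the requested device is resolved once via `detect_device` and passed as `device_map`, rather than forwarding the raw `args.device` to both the model and the evaluator. A condensed sketch of the resulting flow, assuming `detect_device` maps 'auto' to a concrete device string such as 'cuda:0' or 'cpu' (the checkpoint id below is a placeholder for an AutoRound-quantized model):

```python
# Condensed sketch of the updated eval() flow (based on the diff above;
# detect_device's exact return value is an assumption from its usage).
from transformers import AutoModelForCausalLM
from auto_round import AutoRoundConfig
from auto_round.utils import detect_device

device = "auto"                                  # as passed via --device
quantization_config = AutoRoundConfig(backend=device)
device_str = detect_device(device)               # e.g. "cuda:0" or "cpu"
user_model = AutoModelForCausalLM.from_pretrained(
    "path/to/autoround-quantized-model",         # placeholder checkpoint
    device_map=device_str,
    quantization_config=quantization_config)
```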
4 changes: 3 additions & 1 deletion auto_round/auto_quantizer.py
@@ -456,7 +456,8 @@ class StoreAttr(object):

def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs):
if model.__class__.main_input_name != "input_ids":
raise RuntimeError("We can only quantize pure text model.")
logger.warning("We can only quantize pure text models and " \
"certain types(Llava/Qwen-VL/Phi-3-vision) of multimodal models.")

if self.pre_quantized:
model = self.convert_model(model)
@@ -485,3 +486,4 @@ def is_serializable(self):
transformers.quantizers.auto.AutoHfQuantizer = AutoHfQuantizer
transformers.modeling_utils.AutoHfQuantizer = AutoHfQuantizer


(Diff for the remaining 13 changed files not shown.)
