
Commit

Merge branch 'enable_qwen2_quantization' of https://github.com/intel/auto-round into enable_qwen2_quantization
WeiweiZhang1 committed Sep 12, 2024
2 parents 7c66e4d + 95ee0e3 commit d93e14b
Showing 17 changed files with 186 additions and 773 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -0,0 +1 @@
include requirements.txt
75 changes: 36 additions & 39 deletions README.md
@@ -77,44 +77,6 @@ output_dir = "./tmp_autoround"
autoround.save_quantized(output_dir, format='auto_round', inplace=True)
```

### Basic Usage
AutoRound supports Gaudi2, CPU, and GPU. A user guide detailing the full list of supported arguments is available by running ```auto_round -h``` in the terminal. Alternatively, you can use ```auto-round``` instead of ```auto_round```. (**auto-round version > 0.3.0**)


```bash
auto_round --model facebook/opt-125m \
--bits 4 \
--group_size 128 \
--format auto_round \
--output_dir ./tmp_autoround
```
We provide two recipes, one for best accuracy and one for fast running speed with low memory. Details are below.
<details>
<summary>Other Recipes</summary>

```bash
## best accuracy, 3X slower; low_gpu_mem_usage can save ~20GB of VRAM at a ~30% speed cost
auto_round --model facebook/opt-125m \
--bits 4 \
--group_size 128 \
--nsamples 512 \
--iters 1000 \
--low_gpu_mem_usage
```

```bash
## fast and low memory, 2-3X speedup, slight accuracy drop at W4G128
auto_round --model facebook/opt-125m \
--bits 4 \
--group_size 128 \
--nsamples 128 \
--iters 200 \
--seqlen 512 \
--batch_size 4
```
</details>
<br>

<details>
<summary>Detailed Hyperparameters</summary>

@@ -166,9 +128,44 @@ We provide two recipes for best accuracy and fast running speed with low memory.
- `device`: The device to be used for tuning. The default is set to 'auto', allowing for automatic detection.

</details>
<br>

### Basic Usage (auto-round version > 0.3.0)
AutoRound supports Gaudi2, CPU, and GPU. A user guide detailing the full list of supported arguments is available by running ```auto_round -h``` in the terminal. Alternatively, you can use ```auto-round``` instead of ```auto_round```.


```bash
auto_round --model facebook/opt-125m \
--bits 4 \
--group_size 128 \
--format auto_round \
--disable_eval \
--output_dir ./tmp_autoround
```
We provide two recipes, one for best accuracy and one for fast running speed with low memory. Details are below, and a rough Python-API equivalent of the command above follows the recipes.
<details>
<summary>Other Recipes</summary>

```bash
## best accuracy, 3X slower; low_gpu_mem_usage can save ~20GB of VRAM at a ~30% speed cost
auto_round --model facebook/opt-125m \
--bits 4 \
--group_size 128 \
--nsamples 512 \
--iters 1000 \
--low_gpu_mem_usage
```

```bash
## fast and low memory, 2-3X speedup, slight accuracy drop at W4G128
auto_round --model facebook/opt-125m \
--bits 4 \
--group_size 128 \
--nsamples 128 \
--iters 200 \
--seqlen 512 \
--batch_size 4
```
</details>
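
For reference, the CLI invocation above maps onto the Python API shown at the top of this README. The sketch below mirrors the `--bits 4 --group_size 128` example; the `AutoRound(...)` keyword names are assumed from the CLI flag names, so treat this as an outline rather than the documented interface.

```python
# Rough Python-API equivalent of the CLI example above (a sketch;
# constructor keyword names are assumed from the CLI flags).
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

autoround = AutoRound(model, tokenizer, bits=4, group_size=128)
autoround.quantize()

# Export in the same format and directory as the CLI example.
output_dir = "./tmp_autoround"
autoround.save_quantized(output_dir, format='auto_round', inplace=True)
```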

#### Formats

21 changes: 4 additions & 17 deletions auto_round/__main__.py
@@ -24,9 +24,9 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from lm_eval.utils import make_table # pylint: disable=E0401

from auto_round.utils import logger
from auto_round import AutoRoundConfig
from auto_round.eval.evaluation import simple_evaluate
from auto_round.utils import detect_device

def setup_parser():
parser = argparse.ArgumentParser()
@@ -156,7 +156,6 @@ def setup_parser():

def tune(args):
tasks = args.tasks
use_eval_legacy = False
if args.format is None:
args.format = "auto_round"

@@ -165,7 +164,6 @@ def tune(args):
model_name = model_name[:-1]
print(model_name, flush=True)

from auto_round.utils import detect_device

device_str = detect_device(args.device)
torch_dtype = "auto"
@@ -236,16 +234,6 @@ def tune(args):
seqlen = min(seqlen, tokenizer.model_max_length)
args.seqlen = seqlen

excel_name = f"{model_name}_{args.bits}_{args.group_size}"
if (hasattr(model, 'config') and (model.dtype is torch.bfloat16 or model.config.torch_dtype is torch.bfloat16)):
dtype = 'bfloat16'
else:
if "cpu" not in device_str:
dtype = 'float16'
else:
dtype = 'float32'

excel_name = f"{model_name}_{args.bits}_{args.group_size}"
if "bloom" in model_name:
args.low_gpu_mem_usage = False

@@ -309,7 +297,7 @@ def tune(args):
low_cpu_mem_usage=low_cpu_mem_usage, data_type=args.data_type,
enable_norm_bias_tuning=args.enable_norm_bias_tuning)
model, _ = autoround.quantize()
model_name = args.model_name.rstrip("/")
model_name = args.model.rstrip("/")
if args.low_cpu_mem_mode == 1 or args.low_cpu_mem_mode == 2:
import shutil

@@ -320,7 +308,6 @@ def tune(args):
torch.cuda.empty_cache()

export_dir = args.output_dir + "/" + model_name.split('/')[-1] + f"-autoround-w{args.bits}g{args.group_size}"
output_dir = args.output_dir + "/" + model_name.split('/')[-1] + f"-autoround-w{args.bits}g{args.group_size}-qdq"


format_list = args.format.replace(' ', '').split(',')
@@ -366,9 +353,10 @@ def get_library_version(library_name):

def eval(args):
quantization_config = AutoRoundConfig(backend=args.device)
device_str = detect_device(args.device)
user_model = AutoModelForCausalLM.from_pretrained(
args.model,
device_map=args.device, quantization_config=quantization_config)
device_map=device_str, quantization_config=quantization_config)
model_args = f"pretrained={args.model},trust_remote_code={not args.disable_trust_remote_code}"
if isinstance(args.tasks, str):
tasks = args.tasks.split(',')
@@ -377,7 +365,6 @@ def eval(args):
model_args=model_args,
user_model=user_model,
tasks=tasks,
device=args.device,
batch_size=args.eval_bs)

from lm_eval.utils import make_table # pylint: disable=E0401
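
The net effect of the `eval()` changes above is that the requested device is resolved once via `detect_device` and passed as `device_map`, rather than forwarding the raw `args.device` to both the model and the evaluator. A condensed sketch of the resulting flow, assuming `detect_device` maps 'auto' to a concrete device string such as 'cuda:0' or 'cpu' (the checkpoint id below is a placeholder for an AutoRound-quantized model):

```python
# Condensed sketch of the updated eval() flow (based on the diff above;
# detect_device's exact return value is an assumption from its usage).
from transformers import AutoModelForCausalLM
from auto_round import AutoRoundConfig
from auto_round.utils import detect_device

device = "auto"                                  # as passed via --device
quantization_config = AutoRoundConfig(backend=device)
device_str = detect_device(device)               # e.g. "cuda:0" or "cpu"
user_model = AutoModelForCausalLM.from_pretrained(
    "path/to/autoround-quantized-model",         # placeholder checkpoint
    device_map=device_str,
    quantization_config=quantization_config)
```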
4 changes: 3 additions & 1 deletion auto_round/auto_quantizer.py
@@ -456,7 +456,8 @@ class StoreAttr(object):

def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs):
if model.__class__.main_input_name != "input_ids":
raise RuntimeError("We can only quantize pure text model.")
logger.warning("We can only quantize pure text models and " \
"certain types(Llava/Qwen-VL/Phi-3-vision) of multimodal models.")

if self.pre_quantized:
model = self.convert_model(model)
@@ -485,3 +486,4 @@ def is_serializable(self):
transformers.quantizers.auto.AutoHfQuantizer = AutoHfQuantizer
transformers.modeling_utils.AutoHfQuantizer = AutoHfQuantizer


(Diff for the remaining 13 changed files not shown.)
