W4Afp8 export #378

Draft · wants to merge 14 commits into base: main
1 change: 1 addition & 0 deletions auto_round/__main__.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.insert(0, '../')

def run():
from auto_round.script.llm import setup_parser, tune, eval
2 changes: 2 additions & 0 deletions auto_round/auto_quantizer.py
@@ -545,6 +545,8 @@ def remove_device_str(s, device_str):
"via `pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@b8b4127`")

QuantLinear = dynamic_import_inference_linear(layer_backend, bits, group_size, sym)
# from auto_round_extension.cuda.qlinear_exllamav2_gptq import QuantLinear


layer_device = get_device(layer)

21 changes: 15 additions & 6 deletions auto_round/autoround.py
@@ -203,7 +203,6 @@ def __init__(
all_blocks = get_block_names(model)
self.quant_block_list = find_matching_blocks(model, all_blocks, self.to_quant_block_names)
self.cache_device = torch.device("cpu") if self.low_gpu_mem_usage else self.device


##activation
self.act_group_size = act_group_size if not (act_group_size is None) else self.group_size
@@ -373,6 +372,8 @@ def dump_qinfo_to_layer_config(self):
for n, m in self.model.named_modules():
if n not in self.layer_config.keys():
continue
if hasattr(m, "orig_layer"):
m = m.orig_layer
if hasattr(m, "scale"):
self.layer_config[n]["scale"] = m.scale
self.layer_config[n]["zp"] = m.zp
@@ -862,8 +863,13 @@ def quant_layer(self, layer_name, inputs, q_inputs=None, device=torch.device("cp
if q_inputs is not None:
q_inputs[i] = q_inputs[i].to(layer.weight.dtype)

wrapper_linear = WrapperLinear(layer, enable_minmax_tuning=self.enable_minmax_tuning, device=device).to(
device)
wrapper_linear = WrapperLinear(
layer,
enable_minmax_tuning=self.enable_minmax_tuning,
device=device,
_inner_layer_name=layer_name,
).to(device)

round_params = []
minmax_params = []
for key in wrapper_linear.params.keys():
@@ -1029,10 +1035,11 @@ def quant_block(self, block, input_ids, input_others, q_input=None, device=torch
if hasattr(m, "orig_layer"):
for key in m.params.keys():
if "min" in key or "max" in key:
logger.info(f"add minmax param {key} in module{n} to optimizer, shape: {m.params[key].shape}")
minmax_params.append(m.params[key])
else:
logger.info(f"add minmax param {key} in module{n} to optimizer, shape {m.params[key].shape}")
round_params.append(m.params[key])

if self.enable_minmax_tuning:
optimizer = self.optimizer(
[{"params": round_params}, {"params": minmax_params, "lr": self.minmax_lr}], lr=self.lr, weight_decay=0
@@ -1073,6 +1080,7 @@ def quant_block(self, block, input_ids, input_others, q_input=None, device=torch
total_loss = 0

for i in range(self.iters):
logger.info(f"iter {i} / {self.iters}")
total_loss = 0
if self.sampler == "rand":
whole_indices = torch.randperm(nsamples)[:pick_samples]
@@ -1209,6 +1217,8 @@ def quant_blocks(
pbar = tqdm(range(0, len(block_names), nblocks))
# for i in pbar:
for i in range(len(block_names)):
if os.getenv("DEBUG_QUANT_BLOCK", "0") == "1" and i > 2:
break
if nblocks == 1:
n = block_names[i]
pbar.set_description(f"Quantizing {n}")
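
The early exit added in the hunk above is driven purely by an environment variable. A minimal usage sketch (not part of the diff), assuming an already-constructed AutoRound instance named autoround:

# Hypothetical debugging run: with the flag set, quant_blocks stops after block index 2.
import os
os.environ["DEBUG_QUANT_BLOCK"] = "1"  # read via os.getenv(...) == "1" in the hunk above
autoround.quantize()  # `autoround` is assumed to be an existing AutoRound object
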
@@ -1258,7 +1268,7 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k
if not self.quantized:
logger.warning("please run autoround.quantize first")
return
if format == "fake" or format == "qdq" or self.act_bits <= 8: ##TODO fix act quantizaiton later
if format == "fake" or format == "qdq": ##TODO fix act quantizaiton later
self.model = self.model.to("cpu")
self.model.save_pretrained(output_dir)
if self.tokenizer is not None:
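
A hedged call sketch for the export path changed above, assuming a quantized AutoRound instance named autoround; the output directory and format value are illustrative, and only the save_quantized signature shown in the hunk header comes from the diff:

# After this change only format="fake" or format="qdq" takes the fake-export branch,
# so a model with act_bits <= 8 can be exported in a packed format instead.
autoround.quantize()
autoround.save_quantized(output_dir="./tmp_autoround", format="auto_round")  # illustrative values
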
@@ -1740,4 +1750,3 @@ def __init__(
optimizer=optimizer,
**kwargs,
)

36 changes: 36 additions & 0 deletions auto_round/config.py
@@ -0,0 +1,36 @@
import os

from dataclasses import dataclass


@dataclass
class GlobalConfig:
FP8_INPUT_BACKOFF: float = 1.0
FP8_WEIGHT_BACKOFF: float = 1.0
# enable weight_fp8_max_scale
ENABLE_WEIGHT_FP8_MAX_SCALE: bool = False
W4A8_PC: bool = False
W4A8_DYNAMIC: bool = False

def __repr__(self):
return (
f"GlobalConfig(FP8_INPUT_BACKOFF={self.FP8_INPUT_BACKOFF}, "
f"FP8_WEIGHT_BACKOFF={self.FP8_WEIGHT_BACKOFF}, "
f"ENABLE_WEIGHT_FP8_MAX_SCALE={self.ENABLE_WEIGHT_FP8_MAX_SCALE}, "
f"W4A8_PC={self.W4A8_PC}, "
f"W4A8_DYNAMIC={self.W4A8_DYNAMIC}"
f")"

)


# https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html?highlight=fp8#configuring-backoff-factors
# The default values are input_backoff=0.25 and weight_backoff=0.5
global_config = GlobalConfig()
global_config.FP8_INPUT_BACKOFF = float(os.environ.get("AR_FP8_INPUT_BACKOFF", 1.0))
global_config.FP8_WEIGHT_BACKOFF = float(os.environ.get("AR_FP8_WEIGHT_BACKOFF", 1.0))
global_config.ENABLE_WEIGHT_FP8_MAX_SCALE = os.environ.get("AR_ENABLE_WEIGHT_FP8_MAX_SCALE", "0") == "1"
global_config.W4A8_PC = os.environ.get("W4A8_PC", "0") == "1"
global_config.W4A8_DYNAMIC = os.environ.get("W4A8_DYNAMIC", "0") == "1"

print(global_config)
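
A brief usage sketch for the new config module; the backoff values below are illustrative (the Habana defaults cited in the comment above), and only the environment-variable names come from config.py:

# Set the variables before the first import of auto_round.config,
# since they are read once at module import time.
import os
os.environ["AR_FP8_INPUT_BACKOFF"] = "0.25"   # illustrative: Habana's documented input backoff
os.environ["AR_FP8_WEIGHT_BACKOFF"] = "0.5"   # illustrative: Habana's documented weight backoff
os.environ["W4A8_PC"] = "1"

from auto_round.config import global_config
print(global_config)  # reflects the overridden values
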