'''
Quantize Falcon-40B with AutoAWQ, using a patched quantizer that handles
Falcon's `alibi` kwarg.

Tested on: transformers==4.38.1, autoawq==0.2.3
Runs on a single GPU (>= 18 GB of memory).
'''

import torch

# The star import provides the helpers used by the patched quantize() below
# (tqdm, get_named_linears, exclude_layers_to_not_quantize, clear_memory,
# apply_scale, apply_clip, append_str_prefix, get_op_name, get_best_device).
from awq.quantize.quantizer import AwqQuantizer
from awq.quantize.quantizer import *
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
from unittest.mock import patch


class FalconAwqQuantizer(AwqQuantizer):
    """AwqQuantizer whose quantize() also handles Falcon's `alibi` kwarg.

    The upstream quantize() loop does not move `alibi` to the active device,
    so this subclass reproduces the loop and adds that handling (plus an
    explicit None when ALiBi is unused).
    """

    def quantize(self):
        print('Patched!')
        for i in tqdm(range(len(self.modules)), desc="AWQ"):
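            # Move the current decoder layer onto a GPU if it is still on the CPU.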
            common_device = next(self.modules[i].parameters()).device
            if common_device is None or str(common_device) == "cpu":
                if torch.cuda.is_available():
                    best_device = "cuda:" + str(i % torch.cuda.device_count())
                else:
                    best_device = get_best_device()

                self.modules[i] = self.modules[i].to(best_device)
                common_device = next(self.modules[i].parameters()).device
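
            # Move the cached forward kwargs onto the same device as the layer.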
            if self.module_kwargs.get("position_ids") is not None:
                self.module_kwargs["position_ids"] = self.module_kwargs[
                    "position_ids"
                ].to(common_device)

            if self.module_kwargs.get("attention_mask") is not None:
                self.module_kwargs["attention_mask"] = self.module_kwargs[
                    "attention_mask"
                ].to(common_device)
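
            # Falcon-specific: the decoder layers also receive an `alibi` tensor.
            # Move it with the layer, or set it to None explicitly when ALiBi is
            # disabled (use_alibi=False), as is the case for Falcon-40B.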
            if self.module_kwargs.get("alibi") is not None:
                self.module_kwargs["alibi"] = self.module_kwargs[
                    "alibi"
                ].to(common_device)
            else:
                self.module_kwargs["alibi"] = None
                print(f'alibi=None in layer {i}, this is expected if use_alibi=False.')

            self.inps = self.inps.to(common_device)
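
            # Collect this layer's linear submodules (minus any excluded ones)
            # and capture their input activations for calibration.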
            named_linears = get_named_linears(self.modules[i])
            named_linears = exclude_layers_to_not_quantize(
                named_linears, self.modules_to_not_convert
            )

            input_feat = self._get_input_feat(self.modules[i], named_linears)
            clear_memory()
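
            # Search for the best activation-aware scales for this layer's linear
            # groups, apply them, and record them under the layer's full op name.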
            module_config: List[Dict] = self.awq_model.get_layers_for_scaling(
                self.modules[i], input_feat, self.module_kwargs
            )
            scales_list = [
                self._search_best_scale(self.modules[i], **layer)
                for layer in module_config
            ]
            apply_scale(self.modules[i], scales_list, input_feat_dict=input_feat)
            scales_list = append_str_prefix(
                scales_list, get_op_name(self.model, self.modules[i]) + "."
            )
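
            # Search for the best clipping thresholds and clip the weights in place.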
            clip_list = self._search_best_clip(
                self.modules[i], named_linears, input_feat
            )
            apply_clip(self.modules[i], clip_list)
            clip_list = append_str_prefix(
                clip_list, get_op_name(self.model, self.modules[i]) + "."
            )
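
            # Quantize this layer's linear weights (skipped when export_compatible
            # is set).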
            if not self.export_compatible:
                self._apply_quant(self.modules[i], named_linears)

            clear_memory()


model_path = 'tiiuae/falcon-40b'
quant_path = 'falcon-40b-autoawq-w4g128'

# 4-bit weights, group size 128, zero-point (asymmetric) quantization, GEMM kernels.
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
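
# Load the model on CPU; the quantizer moves one decoder layer to the GPU at a
# time, which keeps GPU memory requirements modest (see the note at the top).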
model = AutoAWQForCausalLM.from_pretrained(
    model_path, device_map='cpu', trust_remote_code=False,
    low_cpu_mem_usage=True, use_cache=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
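
# Swap the quantizer class used by model.quantize() for the Falcon-aware one,
# but only for the duration of this call.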
with patch('awq.models.base.AwqQuantizer', FalconAwqQuantizer):
    model.quantize(tokenizer, quant_config=quant_config)

model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

print(f'Model is quantized and saved at "{quant_path}"')