---
tags:
- fp8
---

Quantized from `ibm-granite/granite-20b-code-base` using AutoFP8 with the script below:

```python
import os

from transformers import AutoTokenizer

import auto_fp8
from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig

pretrained_model_dir = "ibm-granite/granite-20b-code-base"
quantized_model_dir = "granite-20b-code-base-FP8"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)

# Use the AutoFP8 source itself as a small, code-flavored calibration sample.
seed_text_file = os.path.join(os.path.dirname(auto_fp8.__file__), "quantize.py")
with open(seed_text_file, "r") as f:
    text = f.read()
examples = [text]
examples = tokenizer(examples, return_tensors="pt").to("cuda")

quantize_config = BaseQuantizeConfig(
    quant_method="fp8",
    activation_scheme="static",  # static activation scales, calibrated on the examples above
    ignore_patterns=["re:.*lm_head"],  # keep the LM head in full precision
)

model = AutoFP8ForCausalLM.from_pretrained(
    pretrained_model_dir, quantize_config=quantize_config
)
model.quantize(examples)
model.save_quantized(quantized_model_dir)
```
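
FP8 checkpoints produced by AutoFP8 are primarily intended for serving with vLLM. A minimal sketch of loading the result, assuming `granite-20b-code-base-FP8` is the local directory saved above (or the corresponding Hub repo) and that your GPU supports FP8 inference:

```python
from vllm import LLM, SamplingParams

# Hypothetical path: point this at the saved quantized_model_dir
# or the published Hub repo id.
llm = LLM(model="granite-20b-code-base-FP8")

params = SamplingParams(max_tokens=64)
outputs = llm.generate(["def fibonacci(n):"], params)
print(outputs[0].outputs[0].text)
```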