import os
from typing import Any, Dict

import torch
import transformers
from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM

import quant
from gptq import GPTQ
from utils import find_layers, DEV

# Restrict this process to the first CUDA device.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


class EndpointHandler:
    """Callable inference handler wrapping a GPTQ-quantized LLaMA model.

    Construction loads the quantized model and its tokenizer; calling the
    instance generates a sampled continuation for the supplied prompt.
    """

    def __init__(
        self,
        path="",
        model_name="Wizard-Vicuna-13B-Uncensored-GPTQ",
        checkpoint_path="Wizard-Vicuna-13B-Uncensored-GPTQ/Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors",
        wbits=4,
        groupsize=128,
        fused_mlp=True,
        eval=True,
        warmup_autotune=True,
    ):
        """Load the quantized model and tokenizer found under ``path``.

        Args:
            path: Base directory that ``model_name`` and ``checkpoint_path``
                are joined onto.
            model_name: Model directory (relative to ``path``) used for both
                the LLaMA config and the tokenizer.
            checkpoint_path: Checkpoint file (relative to ``path``) holding
                the quantized weights.
            wbits: Forwarded to ``quant.make_quant_linear``.
            groupsize: Forwarded to ``quant.make_quant_linear``.
            fused_mlp: Whether ``load_quant`` applies the fused-MLP steps.
            eval: Whether ``load_quant`` puts the model in eval mode and
                applies the eval-only steps. (The name shadows the builtin
                but is kept so keyword callers keep working.)
            warmup_autotune: Whether ``load_quant`` runs the autotune warmup.
        """
        model_name = os.path.join(path, model_name)
        checkpoint_path = os.path.join(path, checkpoint_path)

        self.model = self.load_quant(
            model_name,
            checkpoint_path,
            wbits,
            groupsize,
            fused_mlp,
            eval,
            warmup_autotune,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
        self.model.to(DEV)

    def load_quant(self, model, checkpoint, wbits, groupsize, fused_mlp, eval, warmup_autotune):
        """Build a LLaMA model, swap in quantized layers and load weights.

        Args:
            model: Path or identifier passed to ``LlamaConfig.from_pretrained``.
            checkpoint: Path to the weights file. Files ending in
                ``.safetensors`` are read with safetensors; anything else
                goes through ``torch.load``.
            wbits: Forwarded to ``quant.make_quant_linear``.
            groupsize: Forwarded to ``quant.make_quant_linear``.
            fused_mlp: If true (and ``eval`` is true), call
                ``quant.make_fused_mlp``; also gates the fused warmup.
            eval: If true, call ``model.eval()`` and apply
                ``quant.make_quant_attn`` / ``quant.make_quant_norm``.
            warmup_autotune: If true, call ``quant.autotune_warmup_linear``
                (and the fused warmup when ``eval`` and ``fused_mlp`` hold).

        Returns:
            The prepared model, with ``seqlen`` set to 2048.
        """
        config = LlamaConfig.from_pretrained(model)

        def noop(*args, **kwargs):
            pass

        # The weights are overwritten by the checkpoint, so random
        # initialisation is skipped. NOTE: these replacements are
        # process-wide and are not undone afterwards.
        torch.nn.init.kaiming_uniform_ = noop
        torch.nn.init.uniform_ = noop
        torch.nn.init.normal_ = noop
        transformers.modeling_utils._init_weights = False

        # Construct the model with half precision as the default dtype, and
        # always put the previous default back, even if construction raises,
        # so a failed load cannot leave the process in half precision.
        previous_dtype = torch.get_default_dtype()
        torch.set_default_dtype(torch.half)
        try:
            model = LlamaForCausalLM(config)
        finally:
            torch.set_default_dtype(previous_dtype)

        if eval:
            model = model.eval()

        # Every layer reported by find_layers except the output head is
        # handed to make_quant_linear.
        layers = find_layers(model)
        for name in ['lm_head']:
            if name in layers:
                del layers[name]
        quant.make_quant_linear(model, layers, wbits, groupsize)
        del layers

        print('Loading model ...')
        if checkpoint.endswith('.safetensors'):
            from safetensors.torch import load_file as safe_load
            state_dict = safe_load(checkpoint)
        else:
            # NOTE: torch.load unpickles the file; only load checkpoints
            # from a trusted source.
            state_dict = torch.load(checkpoint)
        model.load_state_dict(state_dict, strict=False)

        if eval:
            quant.make_quant_attn(model)
            quant.make_quant_norm(model)
            if fused_mlp:
                quant.make_fused_mlp(model)

        if warmup_autotune:
            quant.autotune_warmup_linear(model, transpose=not eval)
            if eval and fused_mlp:
                quant.autotune_warmup_fused(model)

        model.seqlen = 2048
        print('Done.')
        return model

    def __call__(self, data: Any) -> Dict[str, str]:
        """Generate a sampled continuation for the request's prompt.

        Args:
            data: Request payload. The ``"inputs"`` entry is popped and used
                as the prompt; if it is absent, ``data`` itself is used.

        Returns:
            A dict with one key, ``"generated_text"``, holding the decoded
            output sequence.
        """
        input_text = data.pop("inputs", data)
        input_ids = self.tokenizer.encode(input_text, return_tensors="pt").to(DEV)

        with torch.no_grad():
            generated_ids = self.model.generate(
                input_ids,
                do_sample=True,
                min_length=50,
                max_length=200,
                top_p=0.95,
                temperature=0.8,
            )

        # Only the first returned sequence is decoded.
        generated_text = self.tokenizer.decode(generated_ids[0].tolist())
        return {'generated_text': generated_text}