ssaroya committed on
Commit
7302c2d
·
1 Parent(s): 5061642

Create handler.py

Browse files
Files changed (1) hide show
  1. handler.py +38 -0
handler.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import pickle
from typing import Any, Dict

import torch
import transformers
from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM

import quant
from gptq import GPTQ
from utils import find_layers, DEV

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

13
class EndpointHandler:
    """Inference endpoint wrapper.

    Loads a pickled causal-LM from ``model.bin`` in the given directory and a
    slow (SentencePiece) tokenizer, then serves text generation via ``__call__``.
    """

    def __init__(self, path: str = ""):
        """Load the model and tokenizer.

        Args:
            path: Directory containing ``model.bin`` (defaults to CWD).
        """
        model_bin_path = os.path.join(path, "model.bin")
        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only ever load model.bin from a trusted source.
        with open(model_bin_path, "rb") as f:  # binary mode: pickle is a byte stream
            self.model = pickle.load(f)

        # use_fast=False: the LLaMA slow tokenizer is required here
        # (presumably for SentencePiece parity — confirm against the model repo)
        self.tokenizer = AutoTokenizer.from_pretrained(
            "Wizard-Vicuna-13B-Uncensored-GPTQ", use_fast=False
        )

    def __call__(self, data: Any) -> Dict[str, str]:
        """Generate a completion for the given request.

        Args:
            data: Either a dict with an ``"inputs"`` key holding the prompt,
                or the prompt string itself.

        Returns:
            ``{"generated_text": <decoded model output>}``.
        """
        # Original used data.pop(...) unconditionally, which raises
        # AttributeError when data is a plain string despite the fallback
        # implying strings are supported; guard on the type instead.
        if isinstance(data, dict):
            input_text = data.pop("inputs", data)
        else:
            input_text = data

        input_ids = self.tokenizer.encode(input_text, return_tensors="pt").to(DEV)

        # no_grad: inference only — skip autograd bookkeeping
        with torch.no_grad():
            generated_ids = self.model.generate(
                input_ids,
                do_sample=True,
                min_length=50,
                max_length=200,
                top_p=0.95,
                temperature=0.8,
            )
        # decode() accepts a tensor/sequence of ids directly; the original's
        # per-element .item() list comprehension was redundant.
        generated_text = self.tokenizer.decode(generated_ids[0])

        return {'generated_text': generated_text}