ssaroya committed on
Commit
7302c2d
·
1 Parent(s): 5061642

Create handler.py

Browse files
Files changed (1) hide show
  1. handler.py +38 -0
handler.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import pickle
from typing import Any, Dict

import torch
import transformers
from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM

import quant
from gptq import GPTQ
from utils import find_layers, DEV

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

13
class EndpointHandler:
    """Inference endpoint wrapper.

    Loads a pickled causal-LM from ``model.bin`` in the given directory and a
    slow (SentencePiece) tokenizer, then serves text generation via ``__call__``.
    """

    def __init__(self, path: str = ""):
        """Load the model and tokenizer.

        Args:
            path: Directory containing ``model.bin`` (defaults to CWD).
        """
        model_bin_path = os.path.join(path, "model.bin")
        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only ever load model.bin from a trusted source.
        with open(model_bin_path, "rb") as f:  # binary mode: pickle is a byte stream
            self.model = pickle.load(f)

        # use_fast=False: the LLaMA slow tokenizer is required here
        # (presumably for SentencePiece parity — confirm against the model repo)
        self.tokenizer = AutoTokenizer.from_pretrained(
            "Wizard-Vicuna-13B-Uncensored-GPTQ", use_fast=False
        )

    def __call__(self, data: Any) -> Dict[str, str]:
        """Generate a completion for the given request.

        Args:
            data: Either a dict with an ``"inputs"`` key holding the prompt,
                or the prompt string itself.

        Returns:
            ``{"generated_text": <decoded model output>}``.
        """
        # Original used data.pop(...) unconditionally, which raises
        # AttributeError when data is a plain string despite the fallback
        # implying strings are supported; guard on the type instead.
        if isinstance(data, dict):
            input_text = data.pop("inputs", data)
        else:
            input_text = data

        input_ids = self.tokenizer.encode(input_text, return_tensors="pt").to(DEV)

        # no_grad: inference only — skip autograd bookkeeping
        with torch.no_grad():
            generated_ids = self.model.generate(
                input_ids,
                do_sample=True,
                min_length=50,
                max_length=200,
                top_p=0.95,
                temperature=0.8,
            )
        # decode() accepts a tensor/sequence of ids directly; the original's
        # per-element .item() list comprehension was redundant.
        generated_text = self.tokenizer.decode(generated_ids[0])

        return {'generated_text': generated_text}