testing for inference endpoints

#57

by nbroad HF staff - opened Nov 22, 2024

base: refs/heads/main

←

from: refs/pr/57

Discussion Files changed

+62

-0

Files changed (2) hide show

handler.py +60 -0
requirements.txt +2 -0

handler.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info
+class EndpointHandler():  # Callable handler: loads a Qwen2-VL model and its processor, then generates text for chat-style requests.
+    def __init__(self, path):  # `path` is passed to both from_pretrained() calls below (model and processor).
+        # Default: load the model on the available device(s).
+        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
+           path, torch_dtype="auto", device_map="auto",
+        )
+        # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
+        # model = Qwen2VLForConditionalGeneration.from_pretrained(
+        #    path,
+        #     torch_dtype=torch.bfloat16,
+        #     attn_implementation="flash_attention_2",
+        #     device_map="auto",
+        # )
+        # Default processor.
+        self.processor = AutoProcessor.from_pretrained(path)
+        # The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
+        # min_pixels = 256*28*28
+        # max_pixels = 1280*28*28
+        # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
+    def __call__(self, data):  # Reads data["messages"] (required) and data["max_new_tokens"] (optional, default 128); returns {"output": <decoded text>}.
+        text = self.processor.apply_chat_template(  # Render the messages as a prompt string (tokenize=False) with the generation prompt appended.
+            data["messages"], tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(data["messages"])  # Pull the image and video inputs out of the same messages.
+        inputs = self.processor(  # Build padded PyTorch tensors from the single prompt plus its vision inputs.
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(self.model.device)  # Move the inputs to the device the model reports.
+        # Inference: generate the output token ids.
+        generated_ids = self.model.generate(**inputs, max_new_tokens=data.get("max_new_tokens", 128))
+        generated_ids_trimmed = [  # Drop the first len(in_ids) ids of each output so the prompt is not decoded back into the answer.
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = self.processor.batch_decode(  # Decode the trimmed ids, skipping special tokens and leaving tokenization spaces as-is.
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        return {
+            "output": output_text,
+        }

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ qwen-vl-utils
2	+ git+https://github.com/huggingface/transformers.git@b99ca4d28b47fa7166e7882cb0695a5c0cc0d411