import gradio as gr
from transformers import TextIteratorStreamer, AutoModelForCausalLM, AutoProcessor
from threading import Thread
import time
from PIL import Image
import torch
import argparse
import spaces

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default='aya')
args = parser.parse_args()
model_name = args.model

processor = AutoProcessor.from_pretrained(f"WueNLP/centurio_{model_name}", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    f"WueNLP/centurio_{model_name}",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
).to("cuda:0")


@spaces.GPU
def bot_streaming(message, history):
    image = None
    if message["files"]:
        image = message["files"][-1]
    else:
        # If there's no image uploaded for this turn, look for images in the
        # past turns (kept inside tuples) and take the last one.
        for hist in history:
            if isinstance(hist[0], tuple):
                image = hist[0][0]

    # Build the prompt with the chat template matching the backbone LLM.
    if "qwen" in model_name:
        if image is None:
            prompt = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{message['text']}<|im_end|>\n<|im_start|>assistant\n"
        else:
            image = Image.open(image).convert("RGB")
            prompt = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image_placeholder>\n{message['text']}<|im_end|>\n<|im_start|>assistant\n"
    else:
        if image is None:
            prompt = f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{message['text']}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
        else:
            image = Image.open(image).convert("RGB")
            prompt = f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image_placeholder>\n{message['text']}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"

    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda:0", torch.bfloat16)
    streamer = TextIteratorStreamer(processor, skip_special_tokens=False)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        do_sample=True,
        num_beams=1,
        repetition_penalty=1.15,
        temperature=0.7,
        top_p=0.8,
        top_k=20,
        max_new_tokens=512,
        min_new_tokens=1,
    )
    # Run generation in a background thread so tokens can be yielded as they arrive.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        # Strip the prompt and chat-template tokens from the streamed output.
        if "qwen" in model_name:
            generated_text_without_prompt = buffer.split("<|im_start|>assistant\n")[-1].split("<|im_end|>")[0]
        else:
            generated_text_without_prompt = buffer.split("<|CHATBOT_TOKEN|>")[-1].split("<|END_OF_TURN_TOKEN|>")[0]
        time.sleep(0.04)
        yield generated_text_without_prompt


description = """# [Centurio: On Drivers of Multilingual Ability of Large Vision-Language Model](https://gregor-ge.github.io/Centurio/)
Try [Centurio](https://huggingface.co/collections/WueNLP/centurio-677cf0ab6ddea874927a154e), a massively multilingual large vision-language model, in this demo (specifically, [Centurio Aya](https://huggingface.co/WueNLP/centurio_aya)).
Upload an image and start chatting about it, or try one of the examples below.
Centurio is trained on 100 languages, but answer quality can differ greatly depending on your language.
Centurio is trained to read text in images but struggles with small text and with non-Latin scripts.

> If you don't upload an image, you will receive an error.
> This demo does not support multi-image prompts or multi-turn dialog.
> Every new prompt refers to the last image (if no new image is included), without prior dialog as context."""

demo = gr.ChatInterface(
    fn=bot_streaming,
    title="Centurio Demo",
    examples=[
        {"text": "What is on the flower?", "files": ["./bee.jpg"]},
        {"text": "How to make this pastry?", "files": ["./baklava.png"]},
    ],
    description=description,
    stop_btn="Stop Generation",
    multimodal=True,
    fill_height=True,
)
demo.launch(debug=True, share=True)
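# --- Usage sketch (assumed; the script name "app.py" is hypothetical) ------
# Running locally requires a CUDA GPU; the model is loaded in bfloat16.
#   python app.py --model aya    # loads WueNLP/centurio_aya (default)
#   python app.py --model qwen   # loads WueNLP/centurio_qwen
# With share=True, Gradio prints a temporary public URL in addition to the
# local address (http://127.0.0.1:7860 by default).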