# llava-onevision / app.py
import gradio as gr
from transformers import LlavaOnevisionProcessor, LlavaOnevisionForConditionalGeneration, TextIteratorStreamer
from threading import Thread
import re
import time
from PIL import Image
import torch
import cv2
import spaces
# Hugging Face hub id of the LLaVA-OneVision 0.5B OV checkpoint.
model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"

# Processor bundles the tokenizer and the image/video preprocessor.
processor = LlavaOnevisionProcessor.from_pretrained(model_id)

# Load the weights in fp16 and place them on the GPU in one go
# (nn.Module.to returns the module itself, so chaining is safe).
model = (
    LlavaOnevisionForConditionalGeneration
    .from_pretrained(model_id, torch_dtype=torch.float16)
    .to("cuda")
)
# Function to capture frames from the camera
def capture_camera_frames(num_frames, camera_index=0):
    """Grab up to ``num_frames`` consecutive frames from a local camera.

    Args:
        num_frames: Maximum number of frames to capture.
        camera_index: OpenCV device index (0 is the system default camera).

    Returns:
        list[PIL.Image.Image]: Captured frames converted to RGB. The list may
        be shorter than ``num_frames`` — or empty — if the camera is missing
        or a read fails; callers must handle an empty result.
    """
    camera = cv2.VideoCapture(camera_index)
    frames = []
    try:
        # Without this check a missing camera silently yields failed reads.
        if not camera.isOpened():
            return frames
        for _ in range(num_frames):
            ret, frame = camera.read()
            if not ret:
                break  # stop at the first failed read
            # OpenCV delivers BGR arrays; PIL expects RGB.
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    finally:
        # Release the device even if a read raises, or it stays locked.
        camera.release()
    return frames
def _sample_frames(video_path, num_frames):
    """Uniformly sample up to ``num_frames`` RGB PIL frames from a video file.

    Replaces the previously undefined ``sample_frames`` helper that the
    original code called (it would have raised NameError on any video input).
    """
    video = cv2.VideoCapture(video_path)
    frames = []
    try:
        total = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        # Read sequentially and keep every ``interval``-th frame.
        interval = max(total // num_frames, 1)
        for i in range(total):
            ret, frame = video.read()
            if not ret:
                break
            if i % interval == 0:
                frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    finally:
        video.release()
    return frames[:num_frames]


@spaces.GPU
def bot_streaming(message, history):
    """Stream a LLaVA-OneVision reply for one multimodal chat turn.

    Media sources, in order of precedence:
      * a message starting with "camera" captures 5 live frames;
      * otherwise the uploaded files (images and/or videos) are used.

    Yields the response text incrementally as the model generates it.
    """
    txt = message.text
    # The streamer also emits the prompt echo; this prefix is sliced off below.
    ext_buffer = f"user\n{txt} assistant"

    # Gather media: uploaded file paths, or live PIL frames from the camera.
    image = [f.path for f in message.files] if message.files else None
    if txt.lower().startswith("camera"):
        # NOTE: these are decoded PIL images, not path strings.
        image = capture_camera_frames(5)  # capture 5 frames

    # ``not image`` also covers an empty upload list and a failed camera grab
    # (the original ``is None`` test let an empty list fall through to a crash).
    if not image:
        gr.Error("You need to upload an image or video, or access the camera for LLaVA to work.")
        return

    video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg")
    image_extensions = tuple(Image.registered_extensions())

    video = None  # default so the processor call below is always well-defined
    if isinstance(image[0], Image.Image):
        # Camera path: frames are already PIL images — no file handling needed.
        toks = "<image>" * len(image)
        prompt = "<|im_start|>user" + toks + f"\n{txt}<|im_end|><|im_start|>assistant"
    elif len(image) == 1:
        path = image[0]
        if path.endswith(video_extensions):
            video = _sample_frames(path, 32)
            image = None
            prompt = f"<|im_start|>user <video>\n{message.text}<|im_end|><|im_start|>assistant"
        elif path.endswith(image_extensions):
            image = Image.open(path).convert("RGB")
            prompt = f"<|im_start|>user <image>\n{message.text}<|im_end|><|im_start|>assistant"
        else:
            # The original left ``prompt``/``video`` undefined here (NameError).
            gr.Error("Unsupported file type. Please upload an image or a video.")
            return
    else:
        # Mixed batch: flatten images and sampled video frames into one list.
        image_list = []
        for path in image:
            if path.endswith(image_extensions):
                image_list.append(Image.open(path).convert("RGB"))
            elif path.endswith(video_extensions):
                image_list.extend(_sample_frames(path, 6))
        toks = "<image>" * len(image_list)
        prompt = "<|im_start|>user" + toks + f"\n{message.text}<|im_end|><|im_start|>assistant"
        image = image_list

    inputs = processor(text=prompt, images=image, videos=video, return_tensors="pt").to("cuda", torch.float16)
    # ``max_new_tokens`` belongs to generate(), not the streamer's decode kwargs.
    streamer = TextIteratorStreamer(processor, skip_special_tokens=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=200)

    # Run generation on a worker thread so we can consume the stream here.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)  # small pause keeps the UI updates smooth
        yield buffer[len(ext_buffer):]
# Wire the streaming bot into a Gradio chat UI (camera, multi-video and
# multi-image usage are showcased by the examples below).
example_prompts = [
    {"text": "Take a picture with the camera and describe what is in it.", "files": []},
    {"text": "Do the cats in these two videos have the same breed? What breed is each cat?", "files": ["./cats_1.mp4", "./cats_2.mp4"]},
    {"text": "Here are several images from a cooking book, showing how to prepare a meal step by step. Can you write a recipe for the meal?", "files": ["./step0.png", "./step1.png", "./step2.png", "./step3.png"]},
]

demo = gr.ChatInterface(
    fn=bot_streaming,
    title="LLaVA Onevision with Camera",
    examples=example_prompts,
    textbox=gr.MultimodalTextbox(file_count="multiple"),
    description="Upload an image or video, or try capturing frames with the camera and chat about it.",
    stop_btn="Stop Generation",
    multimodal=True,
)
demo.launch(debug=True)