# Hugging Face Space (status when captured: Paused)
import gradio as gr | |
from PIL import Image | |
import torch | |
from transformers import AutoModelForCausalLM, AutoTokenizer | |
import cv2 | |
import numpy as np | |
import ast | |
# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Half precision is only used on GPU. On CPU, float16 inference can fail
# because some operators lack half-precision kernels (depends on the PyTorch
# version), so float32 is used there instead.
model_dtype = torch.float16 if device == "cuda" else torch.float32

# Initialize the model and tokenizer.
# NOTE: trust_remote_code=True runs the Python code shipped in the model
# repository; only keep it for repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    "ManishThota/SparrowVQE",
    torch_dtype=model_dtype,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    "ManishThota/SparrowVQE",
    trust_remote_code=True,
)
def video_to_frames(video, fps=1):
    """Sample frames from a video and return them as PNG-encoded bytes.

    Args:
        video: Video source passed directly to ``cv2.VideoCapture``.
        fps: Number of frames to keep per second of video. Must be positive.

    Returns:
        A list of ``bytes`` objects, one PNG-encoded image per sampled frame.
        The list is empty when the video cannot be opened.

    Raises:
        ValueError: If ``fps`` is not positive.
    """
    if fps <= 0:
        raise ValueError("fps must be positive")

    frames_png = []
    cap = cv2.VideoCapture(video)
    try:
        if not cap.isOpened():
            print("Error opening video file")
            return frames_png

        # Keep one frame out of every `frame_interval`. The interval is
        # clamped to at least 1: when the container reports 0 FPS, or the
        # requested rate exceeds the native rate, the integer division yields
        # 0 and the modulo below would raise ZeroDivisionError.
        frame_interval = max(1, int(cap.get(cv2.CAP_PROP_FPS)) // fps)

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break
            if frame_count % frame_interval == 0:
                is_success, buffer = cv2.imencode(".png", frame)
                if is_success:
                    frames_png.append(np.array(buffer).tobytes())
            frame_count += 1
    finally:
        # Always release the capture handle, even if reading/encoding fails.
        cap.release()
    return frames_png
def extract_frames(frame):
    """Decode one encoded frame into an RGB image array.

    Args:
        frame: Encoded image data (e.g. PNG) as a bytes-like object.

    Returns:
        The decoded image as a NumPy array with channels in RGB order.

    Raises:
        ValueError: If the data cannot be decoded as an image.
    """
    # View the raw bytes as a flat uint8 buffer, the input cv2.imdecode expects.
    frame_np = np.frombuffer(frame, dtype=np.uint8)

    # OpenCV decodes colour images with channels in BGR order.
    image_bgr = cv2.imdecode(frame_np, flags=cv2.IMREAD_COLOR)
    if image_bgr is None:
        # imdecode signals failure by returning None instead of raising.
        raise ValueError("Could not decode frame: data is not a valid image")

    # Swap the channel order BGR -> RGB. This is the same conversion code the
    # original COLOR_RGB2BGR call used, so the pixel output is unchanged.
    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    return image_rgb
def _generate_answer(input_ids, image_tensor):
    """Generate and decode the model's answer for one preprocessed image."""
    output_ids = model.generate(
        input_ids,
        max_new_tokens=25,
        images=image_tensor,
        use_cache=True)[0]
    # Skip the first input_ids.shape[1] tokens (the prompt length) so only the
    # newly generated tokens are decoded.
    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()


def _parse_answer(answer):
    """Return the answer parsed as a Python literal, or the raw text if it is not one."""
    try:
        return ast.literal_eval(answer)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Free-text answers are not Python literals; return them unchanged
        # instead of crashing the request.
        return answer


def predict_answer(image, video, question):
    """Answer a question about an uploaded image or video.

    The image takes precedence when both an image and a video are supplied.
    For a video, the answer is produced from the first sampled frame.

    Args:
        image: A PIL image, or ``None``.
        video: A video source accepted by ``video_to_frames``, or ``None``.
        question: The question text inserted into the prompt.

    Returns:
        For an image, the decoded answer string. For a video, the first
        frame's answer parsed with ``ast.literal_eval`` when it is a valid
        Python literal, otherwise the raw answer string. A plain message
        string is returned when no input is given or no frame can be read.
    """
    # Check the inputs first so nothing is tokenized when there is no input.
    if image is None and video is None:
        return "Unsupported file type. Please upload an image or video."

    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)

    if image is not None:
        # Process as an image
        image = image.convert("RGB")
        image_tensor = model.image_preprocess(image)
        return _generate_answer(input_ids, image_tensor)

    # Process as a video
    frames = video_to_frames(video)
    if not frames:
        # Unreadable or empty video: avoid indexing into an empty list.
        return "Could not read any frames from the video."

    # Only the first frame's answer was ever returned, so generation is run
    # for that frame alone rather than for every sampled frame.
    first_frame = extract_frames(frames[0])
    image_tensor = model.image_preprocess([first_frame])
    return _parse_answer(_generate_answer(input_ids, image_tensor))
def gradio_predict(image, video, question):
    """Gradio click handler: pass the inputs to predict_answer and return its result."""
    return predict_answer(image, video, question)
# Custom CSS for the page: centres the banner container and the intro text.
css = """
#container{
display: block;
margin-left: auto;
margin-right: auto;
width: 50%;
}
#intro{
max-width: 100%;
margin: 0 auto;
text-align: center;
}
"""

# NOTE(review): the original indentation of this section was lost, so the
# nesting of the layout below is reconstructed — confirm against the intended
# page layout.
with gr.Blocks(css = css) as app:
    with gr.Row(elem_id="container"):
        # NOTE(review): this is a pre-signed S3 URL (X-Amz-Expires=300, dated
        # 20240309), so the banner presumably no longer loads. Replace it
        # with a permanent asset URL.
        gr.Markdown("""<div style='text-align: center;'><img src="https://github-production-user-asset-6210df.s3.amazonaws.com/37763863/311454340-af72f848-9735-4d49-830b-885ffbb81091.jpeg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240309%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240309T165700Z&X-Amz-Expires=300&X-Amz-Signature=51aeb4811afff72e70c083594aaffcca1f4a2b95ddd4adf23ee5e736e4fbfefe&X-Amz-SignedHeaders=host&actor_id=37763863&key_id=0&repo_id=769602947" width="1000" height="500" /></div>""")

    # The Markdown body is kept at column 0 so no line can be rendered as an
    # indented code block.
    gr.Markdown("""
## This Gradio app serves as four folds:
### 1. My ability and experience to design a customizable Gradio application with Interface/Blocks structure.
### 2. One of my Multimodel Vision-Language model's capabilities with the LLaVA framework.
### 3. Demo for annotating random images and 4 second videos provided at Notion (https://shorturl.at/givyC)
### 4. Ability to integrate a Large Language Model and Vision Encoder
""")

    # Inputs: either a video or an image may be supplied.
    with gr.Row():
        video = gr.Video(label="Upload your video here")
        image = gr.Image(type="pil", label="Upload or Drag an Image")

    with gr.Row():
        with gr.Column():
            # `lines` is a row count and must be an integer (was 4.3).
            question = gr.Textbox(label="Question", placeholder="Annotate prompt", lines=4)
            btn = gr.Button("Annotate")
        with gr.Column():
            answer = gr.TextArea(label="Answer")

    # Run the prediction when the button is clicked and show the result.
    btn.click(gradio_predict, inputs=[image, video, question], outputs=answer)

app.launch(debug=True)