# Spaces status at capture time: Paused
import gradio as gr | |
from PIL import Image | |
import torch | |
from transformers import AutoModelForCausalLM, AutoTokenizer | |
import cv2 | |
import numpy as np | |
import pandas as pd | |
import ast | |
from collections import Counter | |
from io import BytesIO | |
from io import StringIO | |
# Prefer the GPU whenever torch reports one; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The model and its tokenizer are loaded from the same repository. Both calls
# pass trust_remote_code=True.
_MODEL_ID = "ManishThota/SparrowVQE"

model = AutoModelForCausalLM.from_pretrained(
    _MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID, trust_remote_code=True)
def video_to_frames(video, fps=1):
    """Sample frames from a video and return them as PNG-encoded byte strings.

    Args:
        video: Video source handed straight to ``cv2.VideoCapture``.
        fps: Number of frames to keep per second of video. Must be positive.

    Returns:
        A list of PNG byte strings, one per sampled frame. The list is empty
        when the video cannot be opened.

    Raises:
        ValueError: If ``fps`` is not positive.
    """
    if fps <= 0:
        raise ValueError("fps must be positive")

    frames_png = []
    cap = cv2.VideoCapture(video)
    try:
        if not cap.isOpened():
            print("Error opening video file")
            return frames_png

        # Keep every Nth decoded frame. Clamp N to at least 1: when the
        # reported frame rate is lower than ``fps`` (including a reported
        # rate of 0) the integer division gives 0 and the modulo below would
        # raise ZeroDivisionError.
        frame_interval = max(1, int(cap.get(cv2.CAP_PROP_FPS)) // fps)

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break
            if frame_count % frame_interval == 0:
                is_success, buffer = cv2.imencode(".png", frame)
                if is_success:
                    frames_png.append(buffer.tobytes())
            frame_count += 1
    finally:
        # Release the capture even if reading or encoding raised.
        cap.release()
    return frames_png
def extract_frames(frame):
    """Decode one PNG-encoded frame into an RGB pixel array.

    Args:
        frame: Encoded image bytes, such as an element of the list returned
            by ``video_to_frames``.

    Returns:
        The decoded image as a NumPy array with channels in RGB order.

    Raises:
        ValueError: If the bytes cannot be decoded as an image.
    """
    encoded = np.frombuffer(frame, dtype=np.uint8)
    # cv2.imdecode returns colour images in BGR channel order, and returns
    # None when the buffer cannot be decoded.
    image_bgr = cv2.imdecode(encoded, flags=cv2.IMREAD_COLOR)
    if image_bgr is None:
        raise ValueError("Could not decode frame data as an image")
    # Swap the channel order so the caller receives RGB. COLOR_BGR2RGB is the
    # same channel swap the previous COLOR_RGB2BGR call performed; only the
    # name now matches the actual direction.
    return cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
def _generate_answer(input_ids, image_tensor):
    """Run generation for one preprocessed image and decode only the new tokens."""
    output_ids = model.generate(
        input_ids,
        max_new_tokens=25,
        images=image_tensor,
        use_cache=True)[0]
    # Drop the prompt tokens so that only the generated answer is decoded.
    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()


def predict_answer(video, image, question):
    """Answer ``question`` about an uploaded image, or about one frame of a video.

    The image takes precedence when both inputs are supplied. For a video only
    a single sampled frame is annotated (the third one when available,
    otherwise the last); the clip is not analysed frame by frame.

    Args:
        video: Video source passed to ``video_to_frames``; a falsy value means
            no video was supplied.
        image: Image object exposing ``convert("RGB")``, or ``None`` when no
            image was supplied.
        question: Prompt text. A "?" is appended when the model prompt is built.

    Returns:
        The decoded answer string, or an explanatory message when neither
        input is usable.
    """
    if image is not None:
        image_tensor = model.image_preprocess(image.convert("RGB"))
    elif video:
        frames = video_to_frames(video)
        if not frames:
            return "Could not read any frames from the video."
        # Prefer the third sampled frame; fall back to the last one for clips
        # that yield fewer than three frames instead of raising IndexError.
        frame = extract_frames(frames[min(2, len(frames) - 1)])
        image_tensor = model.image_preprocess([frame])
    else:
        return "Unsupported file type. Please upload an image or video."

    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
    return _generate_answer(input_ids, image_tensor)
# Example prompts. Each one lists a small annotation schema and asks for the
# answers as a dictionary keyed by every entry's "value" with boolean values.
promt_cat_dog = """
Annotate this image with this schema:
{
“description”: “Is there a cat in the image?”,
“value”: “Cat”
},
{
“description”: “Is there a dog in the image?”,
“value”: “Dog”,
},
{
“description”: “Is there a horse in the image?”,
“value”: “Horse”,
},
provide me the answers as a dictionary with key as the string value of the variable value on top and its value should be boolean value
"""

promt_bus_people = """
Annotate this image with this schema:
{
“description”: “Is there a bus in the image?”,
“value”: “Bus”,
},
{
“description”: “Is there a bike in the image?”,
“value”: “Bike”,
},
provide me the answers as a dictionary with key as the string value of the variable value on top and its value should be boolean value
"""

promt_video = """
Annotate this image with this schema:
{
“description”: “Is there a person standing in the image?”,
“value”: “standing”,
},
{
“description”: “Is the person's hands free in the image?”,
“value”: “hands-free”,
},
{
“description”: “Is it indoors?”,
“value”: “Indoors”
},
provide me the answers as a dictionary with key as the string value of the variable value on top and its value should be boolean value.
"""

# One row per example, in the order [video, image, question]; the unused
# media slot of each row is None.
test_examples = [
    [None, "Images/cat_dog.jpeg", promt_cat_dog],
    [None, "Images/bus_people.jpeg", promt_bus_people],
    ["videos/v1_new.mp4", None, promt_video],
    ["videos/v3.mp4", None, promt_video],
]
def gradio_predict(video, image, question):
    """Gradio callback: forward the three UI inputs to ``predict_answer``."""
    return predict_answer(video, image, question)
def export_csv(d):
    """Write the answer text to ``output.csv`` and return it as a file component.

    Args:
        d: Text of a Python literal, expected to be a dictionary such as
            ``"{'Cat': True, 'Dog': False}"``.

    Returns:
        A visible ``gr.File`` pointing at ``output.csv``.

    Raises:
        gr.Error: If ``d`` cannot be parsed as a Python literal.
    """
    try:
        parsed = ast.literal_eval(d)
    except (ValueError, SyntaxError, TypeError) as err:
        # The answer is free-form model text; surface a readable message in
        # the UI instead of an unhandled traceback.
        raise gr.Error(
            "The answer is not a valid Python dictionary, so it cannot be exported as CSV."
        ) from err

    # A single-row table: one column per key of the parsed answer.
    df = pd.DataFrame([parsed])
    df.to_csv("output.csv", sep=',')
    return gr.File(value="output.csv", visible=True)
# Stylesheet passed to gr.Blocks. "#container" is used by the banner row
# below; no component in this section sets elem_id="intro".
css = """
#container{
display: block;
margin-left: auto;
margin-right: auto;
width: 60%;
}
#intro{
max-width: 100%;
margin: 0 auto;
text-align: center;
}
"""

# NOTE(review): the nesting below was reconstructed from a flattened listing;
# confirm the placement of the Markdown block against the deployed layout.
with gr.Blocks(css=css) as app:
    # Banner image.
    with gr.Row(elem_id="container"):
        gr.Image("gsoc_redhen.png", min_width=60, label="GSOC 2024")

    gr.Markdown("""
## This Gradio app serves as four folds:
### 1. My ability and experience to design a customizable Gradio application with Interface/Blocks structure.
### 2. One of my Multimodel Vision-Language model's capabilities with the LLaVA framework.
### 3. Demo for annotating random images and 4 second videos provided at Notion (https://shorturl.at/givyC)
### 4. Ability to integrate a Large Language Model and Vision Encoder
""")

    # Media inputs, side by side.
    with gr.Row():
        video = gr.Video(label="Video")
        image = gr.Image(type="pil", label="Image")

    # Prompt on the left; answer and CSV export on the right.
    with gr.Row():
        with gr.Column():
            question = gr.Textbox(label="Annotate", placeholder="Annotate prompt", lines=4.3)
            btn = gr.Button("Annotate")
        with gr.Column():
            answer = gr.TextArea(label="Answer")
            save_btn = gr.Button("Save as CSV")
            # Hidden until export_csv returns a visible file component.
            csv = gr.File(interactive=False, visible=False)

    # The input order must match the parameter order of gradio_predict.
    btn.click(fn=gradio_predict, inputs=[video, image, question], outputs=answer)

    # Export the text currently shown in the answer box.
    save_btn.click(fn=export_csv, inputs=answer, outputs=csv)

    gr.Examples(
        examples=test_examples,
        inputs=[video, image, question],
        outputs=answer,
        fn=gradio_predict,
        cache_examples=True,
    )

app.launch(debug=True)