Spaces:

NSTiwari
/

PaliGemma-ZeroShotDetection-Video

Runtime error

App Files Files Community

PaliGemma-ZeroShotDetection-Video / app.py

NSTiwari

Update app.py

8602ffd verified 8 months ago

raw

history blame contribute delete

5 kB

	from PIL import Image, ImageDraw, ImageFont
	import cv2
	import numpy as np
	from transformers import AutoTokenizer, PaliGemmaForConditionalGeneration, PaliGemmaProcessor
	import torch
	import spaces
	import gradio as gr

	# Load PaliGemma: pick the checkpoint and compute device, then build the
	# model (bfloat16 weights, moved onto that device) and its matching processor.
	model_id = "google/paligemma-3b-mix-224"
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
	model = PaliGemmaForConditionalGeneration.from_pretrained(
	    model_id,
	    torch_dtype=torch.bfloat16,
	)
	model = model.to(device)
	processor = PaliGemmaProcessor.from_pretrained(model_id)

	# Draw one labelled detection box onto a PIL drawing context.
	def draw_bounding_box(draw, coordinates, label, width, height):
	    """Draw a red bounding box with a text tag for a single detection.

	    Args:
	        draw: Drawing context exposing ``rectangle``, ``text`` and either
	            ``textbbox`` or the legacy ``textsize`` (e.g. ``ImageDraw.Draw``).
	        coordinates: ``(y1, x1, y2, x2)`` expressed as fractions of the image
	            size; they are scaled by ``height``/``width`` and rounded here.
	        label: Text written in the filled tag that sits on top of the box.
	        width: Image width in pixels, used to scale the x fractions.
	        height: Image height in pixels, used to scale the y fractions.
	    """
	    y1, x1, y2, x2 = coordinates
	    y1, x1, y2, x2 = map(round, (y1 * height, x1 * width, y2 * height, x2 * width))

	    if hasattr(draw, "textbbox"):
	        # ``textsize`` was removed in Pillow 10. Take the right/bottom extent
	        # measured from the (0, 0) origin so the tag background also covers
	        # any offset the font adds before the first glyph.
	        _, _, text_width, text_height = draw.textbbox((0, 0), label)
	    else:
	        # Older Pillow releases only provide ``textsize``.
	        text_width, text_height = draw.textsize(label)

	    # Filled tag directly above the top-left corner of the box.
	    tag_top = y1 - text_height - 2
	    draw.rectangle([(x1, tag_top), (x1 + text_width + 4, y1)], fill="red")

	    # Label text, inset 2 px from the left edge of the tag.
	    draw.text((x1 + 2, tag_top), label, fill="white")

	    # The bounding box itself.
	    draw.rectangle([(x1, y1), (x2, y2)], outline="red", width=2)

	def _parse_detections(response):
	    """Turn a PaliGemma detection string into ``[(coordinates, label), ...]``.

	    Items are separated by ``;``. Each usable item carries at least four
	    ``<locNNNN>`` tokens followed by a label; the first four values are
	    divided by 1024 to give ``[y1, x1, y2, x2]`` fractions. Items with fewer
	    than four location tokens or without a label are skipped.
	    """
	    import re  # local import: ``re`` is not among the file's top-level imports

	    parsed = []
	    for item in response.split(";"):
	        locations = re.findall(r"<loc(\d+)>", item)
	        # Whatever remains after removing the location tokens is the label,
	        # which keeps multi-word labels intact.
	        label = re.sub(r"<loc\d+>", "", item).strip()
	        if len(locations) < 4 or not label:
	            continue
	        parsed.append(([int(value) / 1024 for value in locations[:4]], label))
	    return parsed


	@spaces.GPU
	def process_video(video_path, input_text):
	    """Run PaliGemma on every frame of a video and draw the detections.

	    Args:
	        video_path: Path of the input video, opened with ``cv2.VideoCapture``.
	        input_text: Text prompt passed to the processor for every frame.

	    Returns:
	        The path of the annotated output video, ``output_paligemma_keras.avi``.

	    Raises:
	        ValueError: If no video was supplied or it cannot be opened.
	    """
	    # NOTE(review): the output path is fixed, so overlapping requests would
	    # write to the same file - confirm whether concurrency matters here.
	    output_path = 'output_paligemma_keras.avi'

	    if not video_path:
	        raise ValueError("No input video was provided.")

	    cap = cv2.VideoCapture(video_path)
	    if not cap.isOpened():
	        cap.release()
	        raise ValueError(f"Could not open video: {video_path}")

	    # Keep the source frame rate so playback speed is unchanged; fall back to
	    # the previous fixed 20 fps when the container does not report one.
	    fps = cap.get(cv2.CAP_PROP_FPS)
	    if not fps > 0:
	        fps = 20.0
	    frame_size = (
	        int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
	        int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
	    )
	    fourcc = cv2.VideoWriter_fourcc(*'XVID')
	    out = cv2.VideoWriter(output_path, fourcc, fps, frame_size)

	    try:
	        while True:
	            ret, frame = cap.read()
	            if not ret:
	                break

	            # Convert the BGR frame to an RGB PIL image for the processor.
	            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

	            # Place the inputs wherever the model lives instead of assuming
	            # CUDA, then match the model's floating-point dtype.
	            inputs = processor(text=input_text, images=img,
	                               padding="longest", do_convert_rgb=True,
	                               return_tensors="pt").to(model.device)
	            inputs = inputs.to(dtype=model.dtype)
	            prompt_length = inputs["input_ids"].shape[-1]

	            with torch.no_grad():
	                output = model.generate(**inputs, max_length=496)

	            # Decode only the newly generated tokens; slicing by token count
	            # does not depend on how the prompt text is rendered back.
	            paligemma_response = processor.decode(
	                output[0][prompt_length:], skip_special_tokens=True
	            ).lstrip("\n")

	            # Draw every parsed detection on the frame using PIL.
	            width, height = img.size
	            draw = ImageDraw.Draw(img)
	            for coordinates, label in _parse_detections(paligemma_response):
	                draw_bounding_box(draw, coordinates, label, width=width, height=height)

	            # Back to OpenCV's BGR layout before writing the frame.
	            out.write(cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR))
	    finally:
	        # Release both handles even when a frame fails part-way through.
	        cap.release()
	        out.release()

	    return output_path

	# Gradio UI: a video input/output pair, a prompt textbox, one example and a
	# submit button, all wired to process_video.
	with gr.Blocks() as demo:
	    gr.Markdown("## Zero-shot Object Tracking with PaliGemma")
	    # The link targets the checkpoint that model_id actually loads (mix-224).
	    gr.Markdown("This is a demo for zero-shot object tracking using [PaliGemma](https://huggingface.co/google/paligemma-3b-mix-224) vision language model by Google.")
	    gr.Markdown("Simply upload a video and enter the candidate labels, or try the example below. Text input should be ; separated. 👇")
	    with gr.Tab(label="Video"):
	        with gr.Row():
	            input_video = gr.Video(label='Input Video')
	            output_video = gr.Video(label='Output Video')
	        with gr.Row():
	            candidate_labels = gr.Textbox(
	                label='Labels',
	                # Matches the instructions above and the bundled example:
	                # labels are separated by ';', not by commas.
	                placeholder='e.g. detect person ; car',
	            )
	        submit = gr.Button()
	        gr.Examples(
	            fn=process_video,
	            examples=[["./input.mp4", "detect person"]],
	            inputs=[
	                input_video,
	                candidate_labels,
	            ],
	            outputs=output_video,
	        )

	    # Run the detector on the uploaded video when the button is pressed.
	    submit.click(
	        fn=process_video,
	        inputs=[input_video, candidate_labels],
	        outputs=output_video,
	    )

	demo.launch(debug=False, show_error=True)