File size: 6,760 Bytes
ffac82b 42a8ed8 2079d37 42a8ed8 2079d37 42a8ed8 2079d37 42a8ed8 2079d37 42a8ed8 2079d37 42a8ed8 2079d37 42a8ed8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 |
---
license: mit
---
<p align="center" width="100%">
<img src="https://i.postimg.cc/MKmyP9wH/new-banner.png" width="80%" height="80%">
</p>
<div>
<div align="center">
<a href='https://brianboli.com/' target='_blank'>Bo Li*<sup>1</sup></a> 
<a href='https://zhangyuanhan-ai.github.io/' target='_blank'>Yuanhan Zhang*<sup>,1</sup></a> 
<a href='https://cliangyu.com/' target='_blank'>Liangyu Chen*<sup>,1</sup></a> 
<a href='https://king159.github.io/' target='_blank'>Jinghao Wang*<sup>,1</sup></a> 
<a href='https://pufanyi.github.io/' target='_blank'>Fanyi Pu*<sup>,1</sup></a> 
<br>
<a href='https://jingkang50.github.io/' target='_blank'>Jingkang Yang<sup>1</sup></a> 
<a href='https://chunyuan.li/' target='_blank'>Chunyuan Li<sup>2</sup></a> 
<a href='https://liuziwei7.github.io/' target='_blank'>Ziwei Liu<sup>1</sup></a>
</div>
<div>
<div align="center">
<sup>1</sup>S-Lab, Nanyang Technological University 
<sup>2</sup>Microsoft Research, Redmond
</div>
-----------------
![](https://img.shields.io/badge/otter-v0.2-darkcyan)
![](https://img.shields.io/github/stars/luodian/otter?style=social)
[![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FLuodian%2Fotter&count_bg=%23FFA500&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=visitors&edge_flat=false)](https://hits.seeyoufarm.com)
![](https://black.readthedocs.io/en/stable/_static/license.svg)
![](https://img.shields.io/badge/code%20style-black-000000.svg)
Below is an example of running this model on your own video. First clone [Otter](https://github.com/Luodian/Otter) to your local disk, then place the following script inside the `Otter` folder so that it can import `otter/modeling_otter.py`.
```python
import mimetypes
import os
from typing import Union
import cv2
import requests
import torch
import transformers
from PIL import Image
from otter.modeling_otter import OtterForConditionalGeneration
# Disable urllib3 warnings. get_image() below issues its requests with
# verify=False; presumably this call is here to keep the resulting
# unverified-HTTPS warnings out of the console.
requests.packages.urllib3.disable_warnings()
# ------------------- Utility Functions -------------------
def get_content_type(file_path):
    """Guess the MIME type of a file from its name.

    Args:
        file_path: Path (or URL) whose extension drives the guess.

    Returns:
        The guessed MIME type string, e.g. ``"video/mp4"``, or ``None`` when
        ``mimetypes`` cannot determine a type.
    """
    # guess_type() yields a (type, encoding) pair; only the type is needed.
    return mimetypes.guess_type(file_path)[0]
# ------------------- Image and Video Handling Functions -------------------
def extract_frames(video_path, num_frames=128):
    """Sample up to ``num_frames`` evenly spaced frames from a video.

    Args:
        video_path: Path of the video, handed to ``cv2.VideoCapture``.
        num_frames: Maximum number of frames to sample.

    Returns:
        A list of RGB ``PIL.Image.Image`` objects in playback order. The list
        holds fewer than ``num_frames`` items when the video is shorter than
        that or when individual reads fail, and is empty when nothing could
        be read.
    """
    if num_frames <= 0:
        return []

    video = cv2.VideoCapture(video_path)
    frames = []
    try:
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))

        if total_frames <= 0:
            # The container did not report a frame count, so seeking by index
            # is meaningless; read sequentially from the start instead.
            while len(frames) < num_frames:
                ret, frame = video.read()
                if not ret:
                    break
                frames.append(
                    Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                )
            return frames

        # For clips shorter than num_frames an integer step of
        # total_frames // num_frames would be 0 and every seek would land on
        # frame 0. Cap the sample count so the step is always at least 1.
        sample_count = min(num_frames, total_frames)
        frame_step = total_frames // sample_count

        for i in range(sample_count):
            video.set(cv2.CAP_PROP_POS_FRAMES, i * frame_step)
            ret, frame = video.read()
            if ret:
                # OpenCV decodes to BGR; PIL expects RGB channel order.
                frames.append(
                    Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                )
        return frames
    finally:
        # Release the capture even if decoding or conversion raised.
        video.release()
def get_image(url: str) -> Union[Image.Image, list]:
    """Load an image, or the sampled frames of a video, from a path or URL.

    A string containing ``"://"`` is treated as a remote URL; anything else
    is treated as a local file path.

    Args:
        url: Local file path or remote URL of an image or video.

    Returns:
        A ``PIL.Image.Image`` for image content, or the list returned by
        ``extract_frames`` for video content.

    Raises:
        ValueError: If the content type is unknown or is neither an image
            nor a video.
    """
    import tempfile

    is_remote = "://" in url

    # NOTE(review): TLS verification is disabled (verify=False) on every
    # request in this function; enable it before fetching from hosts you do
    # not trust.
    if is_remote:
        # requests.head() does not follow redirects unless asked to, which
        # would report the redirect response's type instead of the media's.
        content_type = requests.head(
            url, stream=True, verify=False, allow_redirects=True
        ).headers.get("Content-Type")
    else:
        content_type = get_content_type(url)

    # An unknown extension or a missing header yields None; normalise it so
    # the membership tests below fall through to the ValueError instead of
    # raising TypeError.
    content_type = content_type or ""

    if "image" in content_type:
        if is_remote:
            return Image.open(requests.get(url, stream=True, verify=False).raw)
        return Image.open(url)

    if "video" in content_type:
        if not is_remote:
            return extract_frames(url)

        # Download to a unique temporary file so an existing file in the
        # working directory is never overwritten or deleted, and always clean
        # it up, even when the download or frame extraction fails.
        fd, video_path = tempfile.mkstemp(suffix=".mp4")
        try:
            with os.fdopen(fd, "wb") as f:
                f.write(requests.get(url, stream=True, verify=False).content)
            return extract_frames(video_path)
        finally:
            os.remove(video_path)

    raise ValueError("Invalid content type. Expected image or video.")
# ------------------- OTTER Prompt and Response Functions -------------------
def get_formatted_prompt(prompt: str) -> str:
    """Wrap a user prompt in the chat template used for generation.

    Args:
        prompt: The user's question or instruction.

    Returns:
        The prompt placed between the ``<image>User:`` prefix and the
        ``GPT:<answer>`` suffix.
    """
    template = "<image>User: {} GPT:<answer>"
    return template.format(prompt)
def get_response(input_data, prompt: str, model=None, image_processor=None) -> str:
    """Generate the model's answer to ``prompt`` about an image or video.

    Args:
        input_data: A single ``PIL.Image.Image`` or a list of video frames.
        prompt: The user's question; it is wrapped by
            ``get_formatted_prompt`` before tokenization.
        model: Model exposing ``text_tokenizer``, ``device`` and
            ``generate``. Required in practice despite the ``None`` default.
        image_processor: Object whose ``preprocess`` returns a mapping with a
            ``"pixel_values"`` tensor. Required in practice despite the
            ``None`` default.

    Returns:
        The decoded text found after the last ``<answer>`` marker and before
        the first ``<|endofchunk|>`` marker, stripped of surrounding
        whitespace and double quotes.

    Raises:
        ValueError: If ``input_data`` is neither a PIL image nor a list.
    """
    # A single image is handled as a one-element batch so both input kinds
    # share the same preprocessing path.
    if isinstance(input_data, Image.Image):
        images = [input_data]
    elif isinstance(input_data, list):  # list of video frames
        images = input_data
    else:
        raise ValueError(
            "Invalid input data. Expected PIL Image or list of video frames."
        )

    pixel_values = image_processor.preprocess(images, return_tensors="pt")[
        "pixel_values"
    ]
    # Insert two singleton dimensions (at axis 1, then axis 0); presumably
    # this matches the layout the model's vision input expects — TODO confirm.
    vision_x = pixel_values.unsqueeze(1).unsqueeze(0)

    encoded = model.text_tokenizer(
        [get_formatted_prompt(prompt)],
        return_tensors="pt",
    )

    device = model.device
    output_ids = model.generate(
        vision_x=vision_x.to(device),
        lang_x=encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
        max_new_tokens=512,
        num_beams=3,
        no_repeat_ngram_size=3,
    )

    decoded = model.text_tokenizer.decode(output_ids[0])
    # Keep only the text after the last answer marker ...
    answer = decoded.split("<answer>")[-1].strip()
    # ... and before the first end-of-chunk marker.
    answer = answer.split("<|endofchunk|>")[0].strip()
    return answer.strip('"')
if __name__ == "__main__":
    # ------------------- Main Function -------------------
    # Interactive demo: load the model once, decode the video once, then
    # answer comma-separated prompts until the user types "quit".
    load_bit = "fp16"
    dtype_by_name = {
        "fp16": torch.float16,
        "bf16": torch.bfloat16,
        "fp32": torch.float32,
    }
    if load_bit not in dtype_by_name:
        # Fail with a clear message instead of a NameError further down.
        raise ValueError(
            f"Unsupported load_bit {load_bit!r}; expected one of {sorted(dtype_by_name)}."
        )
    precision = {"torch_dtype": dtype_by_name[load_bit]}

    # This model version is trained on MIMIC-IT DC dataset.
    model = OtterForConditionalGeneration.from_pretrained(
        "luodian/otter-9b-dc-hf", device_map="auto", **precision
    )
    model.text_tokenizer.padding_side = "left"
    image_processor = transformers.CLIPImageProcessor()
    model.eval()

    video_url = "demo.mp4"  # Replace with the path to your video file
    # The path never changes inside the loop, so decode the video only once.
    frames_list = get_image(video_url)

    while True:
        try:
            prompts_input = input("Enter prompts (comma-separated): ")
        except EOFError:
            # Treat end of input (e.g. Ctrl-D) the same as "quit".
            break

        # Check for the exit command before generating, so "quit" itself is
        # never sent to the model as a prompt.
        if prompts_input.strip().lower() == "quit":
            break

        prompts = [prompt.strip() for prompt in prompts_input.split(",")]
        for prompt in prompts:
            if not prompt:
                # Skip blanks produced by empty input or stray commas.
                continue
            print(f"\nPrompt: {prompt}")
            response = get_response(frames_list, prompt, model, image_processor)
            print(f"Response: {response}")
``` |