ncoop57 committed
Commit: 021b099
1 Parent(s): f400687

add initial code
Files changed:
- app.py  +87 -0
- clip.py  +80 -0
- requirements.txt  +6 -0
app.py
ADDED
@@ -0,0 +1,87 @@
# app.py: download a YouTube video, sample every 10th frame, and score each
# frame against a text query with CLIP.
import ffmpeg
import youtube_dl

import numpy as np

from PIL import Image
import requests

import torch
from sentence_transformers import SentenceTransformer, util, models
from clip import CLIPModel
# from sentence_transformers.models import CLIPModel

clip = CLIPModel()
model = SentenceTransformer(modules=[clip]).to(dtype=torch.float32, device=torch.device('cpu'))


def get_embedding(query, video):
    # Encode the text query.
    text_emb = model.encode(query, device='cpu')

    # Encode every frame of the video as an image.
    images = []
    for img in video:
        images.append(Image.fromarray(img))
    img_embs = model.encode(images, device='cpu')

    return text_emb, img_embs


# # Encode an image:
# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# img = Image.fromarray(np.array(Image.open(requests.get(url, stream=True).raw))).convert('RGB')
# img_emb = model.encode([img, img], device='cpu')

# # Encode text descriptions
# text_emb = model.encode(['Two dogs in the snow', 'Two cats laying on a sofa',
#                          'A picture of London at night'], device='cpu')

# # Compute cosine similarities
# cos_scores = util.cos_sim(img_emb, text_emb)
# print(cos_scores)


def my_hook(d):
    if d['status'] == 'finished':
        print(d)
        print('Done downloading, now extracting frames ...')
        probe = ffmpeg.probe(d["filename"])
        video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
        width = int(video_stream['width'])
        height = int(video_stream['height'])
        out, _ = (
            ffmpeg
            .input(d["filename"])
            .output('pipe:', format='rawvideo', pix_fmt='rgb24')
            .run(capture_stdout=True)
        )
        # Decode the raw RGB bytes into frames and keep every 10th one.
        video = (
            np
            .frombuffer(out, np.uint8)
            .reshape([-1, height, width, 3])
        )[::10]

        print(video.shape)
        txt_embd, img_embds = get_embedding("two white puppies", video)
        cos_scores = util.cos_sim(txt_embd, img_embds)
        print(cos_scores)


ydl_opts = {"format": "mp4", "progress_hooks": [my_hook], }
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://youtu.be/I3AaW9ZevIU'])


# # out, _ = (
# #     ffmpeg
# #     .input('in.mp4')
# #     .output('pipe:', format='rawvideo', pix_fmt='rgb24')
# #     .run(capture_stdout=True)
# # )
# # video = (
# #     np
# #     .frombuffer(out, np.uint8)
# #     .reshape([-1, height, width, 3])
# # )
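Note: the hook above only prints the raw cosine-similarity matrix. As a minimal sketch (not part of this commit) of how those scores could be turned into actual frame picks, assuming the cos_scores and video arrays computed inside my_hook; top_matching_frames is a hypothetical helper name:

    import torch

    def top_matching_frames(cos_scores, video, k=5):
        # cos_scores has shape [1, num_sampled_frames]; rank the sampled frames
        # (every 10th frame of the download) by similarity to the text query.
        k = min(k, cos_scores.shape[1])
        scores, indices = torch.topk(cos_scores[0], k=k)
        # Return (frame index, score, RGB frame array) for the best k matches.
        return [(int(i), float(s), video[int(i)]) for i, s in zip(indices, scores)]

Called as top_matching_frames(cos_scores, video) at the end of my_hook, this would hand back the indices, scores, and RGB arrays of the frames that best match the query.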
clip.py
ADDED
@@ -0,0 +1,80 @@
from torch import nn
import transformers
import torch
from PIL import Image


class CLIPModel(nn.Module):
    def __init__(self, model_name: str = "openai/clip-vit-base-patch32", processor_name=None):
        super(CLIPModel, self).__init__()

        if processor_name is None:
            processor_name = model_name

        self.model = transformers.CLIPModel.from_pretrained(model_name)
        self.processor = transformers.CLIPProcessor.from_pretrained(processor_name)

    def __repr__(self):
        return "CLIPModel()"

    def forward(self, features):
        image_embeds = []
        text_embeds = []

        if 'pixel_values' in features:
            vision_outputs = self.model.vision_model(pixel_values=features['pixel_values'])
            image_embeds = self.model.visual_projection(vision_outputs[1])

        if 'input_ids' in features:
            text_outputs = self.model.text_model(
                input_ids=features.get('input_ids'),
                attention_mask=features.get('attention_mask', None),
                position_ids=features.get('position_ids', None),
                output_attentions=features.get('output_attentions', None),
                output_hidden_states=features.get('output_hidden_states', None),
            )
            text_embeds = self.model.text_projection(text_outputs[1])

        sentence_embedding = []
        image_features = iter(image_embeds)
        text_features = iter(text_embeds)

        for idx, input_type in enumerate(features['image_text_info']):
            if input_type == 0:
                sentence_embedding.append(next(image_features))
            else:
                sentence_embedding.append(next(text_features))

        features['sentence_embedding'] = torch.stack(sentence_embedding).float()

        return features

    def tokenize(self, texts):
        images = []
        texts_values = []
        image_text_info = []

        for idx, data in enumerate(texts):
            if isinstance(data, Image.Image):  # An Image
                images.append(data)
                image_text_info.append(0)
            else:  # A text
                texts_values.append(data)
                image_text_info.append(1)

        if len(texts_values) == 0:
            texts_values = None
        if len(images) == 0:
            images = None

        inputs = self.processor(text=texts_values, images=images, return_tensors="pt", padding=True)
        inputs['image_text_info'] = image_text_info
        return inputs

    def save(self, output_path: str):
        self.model.save_pretrained(output_path)
        self.processor.save_pretrained(output_path)

    @staticmethod
    def load(input_path: str):
        return CLIPModel(model_name=input_path)
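For reference, a minimal usage sketch of this module on its own (assuming the patched sentence-transformers fork pinned in requirements.txt, which lets encode() accept PIL images; "frame.jpg" is just a placeholder filename):

    from PIL import Image
    from sentence_transformers import SentenceTransformer, util

    from clip import CLIPModel

    model = SentenceTransformer(modules=[CLIPModel()])

    # encode() routes inputs through CLIPModel.tokenize(), which tags each item
    # in image_text_info (0 = image, 1 = text) so forward() can split the batch.
    img_emb = model.encode([Image.open("frame.jpg")], device='cpu')
    txt_emb = model.encode(["two white puppies", "a city at night"], device='cpu')

    print(util.cos_sim(img_emb, txt_emb))  # similarity matrix of shape [1, 2]

This mirrors how app.py builds its model; the image_text_info bookkeeping is what allows a single encode() call to mix strings and PIL images.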
requirements.txt
ADDED
@@ -0,0 +1,6 @@
ffmpeg-python
numpy
pillow
torch
git+https://github.com/ncoop57/sentence-transformers@clip-image-check
youtube_dl