shaoyent committed
Commit b345be7 · 1 Parent(s): 59eb726
Files changed (1):
  1. app.py +12 -7
app.py CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
 import numpy as np
 import json
 import pickle
-
+from PIL import Image
 import torch
 from torch.nn.utils.rnn import pad_sequence
 from transformers import BridgeTowerProcessor
@@ -147,8 +147,10 @@ def extract_images_and_embeds(video_id, video_path, subtitles, output, expanded=
 
         print('Read a new frame: ', idx, mid_time, frame_no, text)
         vidcap.set(1, frame_no) # added this line
-        success, image = vidcap.read()
+        success, frame = vidcap.read()
         if success:
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frame = Image.fromarray(frame)
             img_fname = f'{video_id}_{idx:06d}'
             img_fpath = os.path.join(output, 'frames', img_fname + '.jpg')
             # image = maintain_aspect_ratio_resize(image, height=350) # save frame as JPEG file
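Note: the two added lines fix the color channels handed to the processor. OpenCV's read() returns frames as BGR numpy arrays, while the downstream image processor expects RGB input such as a PIL image. A minimal standalone sketch of the same conversion (the video path and frame index are hypothetical; the literal 1 passed to vidcap.set above is the value of cv2.CAP_PROP_POS_FRAMES):

import cv2
from PIL import Image

vidcap = cv2.VideoCapture('video.mp4')      # hypothetical input path
vidcap.set(cv2.CAP_PROP_POS_FRAMES, 120)    # seek to a hypothetical frame index
success, frame = vidcap.read()              # frame is an (H, W, 3) BGR uint8 array
if success:
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # reorder channels to RGB
    image = Image.fromarray(frame)                  # PIL image, ready for the processor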
@@ -163,7 +165,7 @@ def extract_images_and_embeds(video_id, video_path, subtitles, output, expanded=
                 'frame_no': frame_no
             })
 
-            encoding = processor(image, text, return_tensors="pt").to(device)
+            encoding = processor(frame, text, return_tensors="pt").to(device)
             encoding['text'] = text
             encoding['image_filepath'] = img_fpath
             encoding['start_time'] = caption.start
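Note: with the conversion above, processor now receives a PIL image instead of a raw BGR array. For reference, a sketch of the BridgeTowerProcessor call pattern used here (the checkpoint name is an assumption; the diff does not show which one the app loads):

from transformers import BridgeTowerProcessor

processor = BridgeTowerProcessor.from_pretrained('BridgeTower/bridgetower-base')  # assumed checkpoint
encoding = processor(image, 'a caption', return_tensors='pt')  # image: a PIL.Image, as produced above
# encoding is dict-like (pixel_values, input_ids, attention_mask, ...), which is why the
# app can attach extra metadata keys such as encoding['text'] alongside the tensors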
@@ -236,14 +238,17 @@ def run_query(video_path, text_query, path='/tmp'):
     _, I = faiss_index.search(emb_query, 6)
 
     clip_images = []
+    transcripts = []
     for idx in I[0]:
         frame_no = embeddings[idx]['frame_no']
         vidcap.set(1, frame_no) # added this line
-        success, image = vidcap.read()
-        clip_images.append(image)
+        success, frame = vidcap.read()
+        if success:
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frame = Image.fromarray(frame)
+            clip_images.append(frame)
+            transcripts.append(f"({embeddings[idx]['start_time']}) {embeddings[idx]['text']}")
 
-    # clip_images = [embeddings[idx]['image_filepath'] for idx in I[0]]
-    transcripts = [f"({embeddings[idx]['start_time']}) {embeddings[idx]['text']}" for idx in I[0]]
     return clip_images, transcripts
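Note: besides repeating the BGR-to-RGB fix, this hunk closes an alignment bug. Previously clip_images.append(image) ran even when vidcap.read() failed (appending None), and transcripts was built for all six FAISS hits regardless; moving both appends under if success: keeps the two lists the same length and index-aligned. For context, a toy sketch of the search call at the top of the hunk (embedding width and index type are assumptions; only the top-6 search is visible in the diff):

import numpy as np
import faiss

d = 512                                                    # hypothetical embedding width
faiss_index = faiss.IndexFlatIP(d)                         # assumed index type
faiss_index.add(np.random.rand(100, d).astype('float32'))  # toy corpus embeddings
emb_query = np.random.rand(1, d).astype('float32')         # toy query embedding
_, I = faiss_index.search(emb_query, 6)                    # I[0] holds the row ids of the top-6 matches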
 
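Net effect of the commit: extract_images_and_embeds embeds RGB PIL frames rather than raw BGR arrays, and run_query returns two lists that are guaranteed to be the same length. A hypothetical call, using the signature shown in the hunk header:

clip_images, transcripts = run_query('video.mp4', 'a dog catching a frisbee')  # hypothetical arguments
assert len(clip_images) == len(transcripts)  # holds because both appends sit under if success: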