Update
app.py CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
 import numpy as np
 import json
 import pickle
-
+from PIL import Image
 import torch
 from torch.nn.utils.rnn import pad_sequence
 from transformers import BridgeTowerProcessor

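The only change in the import block is pulling in PIL, which the new frame handling below needs. The later hunks also refer to `processor` and `device`; a plausible module-level setup for those two names might look like the sketch below (the checkpoint name and the device selection are assumptions, not something this diff shows).

    import torch
    from transformers import BridgeTowerProcessor

    # Assumed setup; the Space may load a different BridgeTower variant.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")
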
@@ -147,8 +147,10 @@ def extract_images_and_embeds(video_id, video_path, subtitles, output, expanded=
 
         print('Read a new frame: ', idx, mid_time, frame_no, text)
         vidcap.set(1, frame_no) # added this line
-        success,
+        success, frame = vidcap.read()
         if success:
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frame = Image.fromarray(frame)
             img_fname = f'{video_id}_{idx:06d}'
             img_fpath = os.path.join(output, 'frames', img_fname + '.jpg')
             # image = maintain_aspect_ratio_resize(image, height=350) # save frame as JPEG file

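The pattern added here seeks the capture to a frame index, reads it, and converts OpenCV's BGR array into a PIL image before it reaches the processor. A minimal sketch of that pattern is below; the helper name `grab_frame` is hypothetical, and property index 1 in `vidcap.set(1, frame_no)` is `cv2.CAP_PROP_POS_FRAMES`.

    import cv2
    from PIL import Image

    def grab_frame(vidcap, frame_no):
        """Seek to frame_no and return it as an RGB PIL image, or None if the read fails."""
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, frame_no)   # equivalent to vidcap.set(1, frame_no)
        success, frame = vidcap.read()                  # frame is a BGR numpy array
        if not success:
            return None
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV decodes as BGR; PIL expects RGB
        return Image.fromarray(frame)
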
@@ -163,7 +165,7 @@ def extract_images_and_embeds(video_id, video_path, subtitles, output, expanded=
                 'frame_no': frame_no
             })
 
-            encoding = processor(
+            encoding = processor(frame, text, return_tensors="pt").to(device)
             encoding['text'] = text
             encoding['image_filepath'] = img_fpath
             encoding['start_time'] = caption.start

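The processor call now receives the in-memory PIL frame together with the caption text. A rough, self-contained sketch of what that line produces is below; the checkpoint name and the stand-in frame/text values are assumptions, not values from this diff.

    from PIL import Image
    from transformers import BridgeTowerProcessor

    processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")

    frame = Image.new("RGB", (640, 360))        # stand-in for a decoded video frame
    text = "(00:01:23) example subtitle text"   # stand-in for the caption text

    encoding = processor(frame, text, return_tensors="pt")
    # encoding is a dict-like BatchEncoding holding tensors such as 'input_ids',
    # 'attention_mask', and 'pixel_values', so extra metadata can be attached to
    # it with plain item assignment, which is what the diff does next.
    encoding['image_filepath'] = "frames/demo_000000.jpg"
    print(encoding.keys())
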
@@ -236,14 +238,17 @@ def run_query(video_path, text_query, path='/tmp'):
     _, I = faiss_index.search(emb_query, 6)
 
     clip_images = []
+    transcripts = []
     for idx in I[0]:
         frame_no = embeddings[idx]['frame_no']
         vidcap.set(1, frame_no) # added this line
-        success,
-
+        success, frame = vidcap.read()
+        if success:
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frame = Image.fromarray(frame)
+            clip_images.append(frame)
+            transcripts.append(f"({embeddings[idx]['start_time']}) {embeddings[idx]['text']}")
 
-    # clip_images = [embeddings[idx]['image_filepath'] for idx in I[0]]
-    transcripts = [f"({embeddings[idx]['start_time']}) {embeddings[idx]['text']}" for idx in I[0]]
     return clip_images, transcripts
 
 
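In run_query the change moves frame loading into the result loop: faiss returns the indices of the six nearest stored embeddings, and each index is mapped back to its saved frame number and transcript. A condensed sketch of that flow is below; the function name `gather_results` is a hypothetical wrapper around the same steps, while `faiss_index`, `embeddings`, `vidcap`, and `emb_query` are the names used in the diff.

    import cv2
    from PIL import Image

    def gather_results(faiss_index, embeddings, vidcap, emb_query, k=6):
        """Return (PIL frames, timed transcripts) for the top-k matches of emb_query."""
        _, I = faiss_index.search(emb_query, k)   # I: (1, k) array of nearest-neighbor indices
        clip_images, transcripts = [], []
        for idx in I[0]:
            meta = embeddings[idx]                # metadata stored at indexing time
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, meta['frame_no'])
            success, frame = vidcap.read()
            if not success:                       # skip frames that fail to decode
                continue
            clip_images.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
            transcripts.append(f"({meta['start_time']}) {meta['text']}")
        return clip_images, transcripts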