Build

Paused

App Files Files Community

ManishThota committed on Mar 9, 2024

Commit

fa7747b

verified ·

1 Parent(s): 3295429

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -31

app.py CHANGED Viewed

@@ -58,59 +58,95 @@ def extract_frames(frame):
     return image_bgr
-def predict_answer(image, video, question, max_tokens=100):
     text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
     input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
-    if image:
-        # Process as an image
-        image = image.convert("RGB")
         image_tensor = model.image_preprocess(image)
-        #Generate the answer
-        output_ids = model.generate(
-            input_ids,
-            max_new_tokens=max_tokens,
-            images=image_tensor,
-            use_cache=True)[0]
-        return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-    elif video:
-        # Process as a video
-        frames = video_to_frames(video)
-        answers = []
-        for frame in frames:
-            image = extract_frames(frame)
-            image_tensor = model.image_preprocess(image)
-            # Generate the answer
-            output_ids = model.generate(
                 input_ids,
                 max_new_tokens=max_tokens,
                 images=image_tensor,
                 use_cache=True)[0]
-            answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-            answers.append(answer)
         return answers
-    else:
-        return "Unsupported file type. Please upload an image or video."
-def gradio_predict(image, video, question, max_tokens):
-    answer = predict_answer(image, video, question, max_tokens)
     return answer
 iface = gr.Interface(
     fn=gradio_predict,
     inputs=[
-        gr.Image(type="pil", label="Upload or Drag an Image"),
         gr.Video(label="Upload your video here"),
         gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
         gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],

     return image_bgr
+def predict_answer(video, question, max_tokens=100):
     text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
     input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
+    frames = video_to_frames(video)
+    answers = []
+    for i in range(len(frames)):
+        image = extract_frames(frames[i])
         image_tensor = model.image_preprocess(image)
+        # Generate the answer
+        output_ids = model.generate(
                 input_ids,
                 max_new_tokens=max_tokens,
                 images=image_tensor,
                 use_cache=True)[0]
+        answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+        answers.append(answer)
         return answers
+    # if image:
+    #     # Process as an image
+    #     image = image.convert("RGB")
+    #     image_tensor = model.image_preprocess(image)
+    #     #Generate the answer
+    #     output_ids = model.generate(
+    #         input_ids,
+    #         max_new_tokens=max_tokens,
+    #         images=image_tensor,
+    #         use_cache=True)[0]
+    #     return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+    # elif video:
+    #     # Process as a video
+    #     frames = video_to_frames(video)
+    #     answers = []
+    #     for frame in frames:
+    #         image = extract_frames(frame)
+    #         image_tensor = model.image_preprocess(image)
+    #         # Generate the answer
+    #         output_ids = model.generate(
+    #             input_ids,
+    #             max_new_tokens=max_tokens,
+    #             images=image_tensor,
+    #             use_cache=True)[0]
+    #         answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+    #         answers.append(answer)
+    #     return answers
+    # else:
+    #     return "Unsupported file type. Please upload an image or video."
+# def gradio_predict(image, video, question, max_tokens):
+#     answer = predict_answer(image, video, question, max_tokens)
+#     return answer
+# iface = gr.Interface(
+#     fn=gradio_predict,
+#     inputs=[
+#         gr.Image(type="pil", label="Upload or Drag an Image"),
+#         gr.Video(label="Upload your video here"),
+#         gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
+#         gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
+#     outputs=gr.TextArea(label="Answer"),
+#     # outputs=gr.Image(label="Output"),
+#     title="Video/Image Viewer",
+#     description="Upload an image or video to view it or extract frames from the video.",
+# )
+# iface.launch(debug=True)
+def gradio_predict(video, question, max_tokens):
+    answer = predict_answer(video, question, max_tokens)
     return answer
 iface = gr.Interface(
     fn=gradio_predict,
     inputs=[
         gr.Video(label="Upload your video here"),
         gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
         gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],