Spaces:

Abhilashvj
/

video-search

Runtime error

App Files Files Community

Abhilashvj committed on Oct 6, 2024

Commit

f279bf5

verified ·

1 Parent(s): db13e81

Update app.py

Browse files

Files changed (1) hide show

app.py +164 -149

app.py CHANGED Viewed

@@ -1,162 +1,177 @@
 import streamlit as st
-import torch
-from PIL import Image
-import face_recognition
 import faiss
 from sentence_transformers import SentenceTransformer
-from transformers import pipeline
 import cv2
-import numpy as np
-import subprocess
-import tempfile
-import os
-import yt_dlp
-from moviepy.editor import VideoFileClip
-# Helper functions
-def get_video_id(url):
-    return url.split("v=")[1].split("&")[0]
-def download_youtube_video(url, output_path):
-    ydl_opts = {
-        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
-        'outtmpl': os.path.join(output_path, '%(id)s.%(ext)s'),
-    }
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        info = ydl.extract_info(url, download=True)
-        filename = ydl.prepare_filename(info)
-    return filename
-def process_video(video_url, output_dir, video_id):
-    # Placeholder for video processing logic
-    # This should include face detection, object detection, transcription, etc.
-    # For now, we'll just download the video
-    video_path = download_youtube_video(video_url, output_dir)
-    # Extract frames (simplified version)
-    video = cv2.VideoCapture(video_path)
-    fps = video.get(cv2.CAP_PROP_FPS)
-    frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
-    duration = frame_count / fps
-    frames = []
-    frame_times = []
-    for i in range(0, frame_count, int(fps)):  # Extract one frame per second
-        video.set(cv2.CAP_PROP_POS_FRAMES, i)
-        ret, frame = video.read()
-        if ret:
-            frames.append(frame)
-            frame_times.append(i / fps)
-    video.release()
-    return {
-        'video_path': video_path,
-        'frames': frames,
-        'frame_times': frame_times,
-        'duration': duration,
-        'fps': fps
-    }
-def search(query, index_path, metadata_path, model):
-    # Placeholder for search functionality
-    # This should use FAISS for efficient similarity search
-    return [], []
 # Load models
 @st.cache_resource
 def load_models():
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    clip_model, preprocess = torch.hub.load('openai/CLIP', 'clip_vit_b32', device=device)
-    text_model = SentenceTransformer("all-MiniLM-L6-v2").to(device)
-    qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", device=0 if torch.cuda.is_available() else -1)
-    return clip_model, preprocess, text_model, qa_model
-clip_model, preprocess, text_model, qa_model = load_models()
 # Streamlit UI
-st.title("Enhanced YouTube Video Analysis")
-video_url = st.text_input("Enter YouTube Video URL")
-if st.button("Analyze"):
-    with st.spinner("Processing video..."):
-        video_id = get_video_id(video_url)
-        results = process_video(video_url, "output_dir", video_id)
-    if results:
-        st.success("Video processed successfully!")
-        # Text search and question answering
-        st.subheader("Text Search and Q&A")
-        query = st.text_input("Enter a search query or question")
-        if query:
-            # Placeholder for text search and QA
-            st.write("Text search and QA functionality to be implemented")
-        # Image upload and similarity search
-        st.subheader("Image Search")
-        uploaded_image = st.file_uploader("Upload an image to find similar frames", type=["jpg", "jpeg", "png"])
-        if uploaded_image:
-            # Placeholder for image search
-            st.write("Image search functionality to be implemented")
-        # Face upload and recognition
-        st.subheader("Face Search")
-        uploaded_face = st.file_uploader("Upload a face image to find appearances", type=["jpg", "jpeg", "png"])
-        if uploaded_face:
-            face_image = face_recognition.load_image_file(uploaded_face)
-            face_encoding = face_recognition.face_encodings(face_image)[0]
-            face_appearances = []
-            face_frames = []
-            for i, frame in enumerate(results['frames']):
-                face_locations = face_recognition.face_locations(frame)
-                face_encodings = face_recognition.face_encodings(frame, face_locations)
-                for encoding in face_encodings:
-                    if face_recognition.compare_faces([face_encoding], encoding)[0]:
-                        face_appearances.append(results['frame_times'][i])
-                        face_frames.append(frame)
-            st.write(f"Face appearances found at {len(face_appearances)} timestamps.")
-            if face_frames:
-                # Create a temporary directory to store frames
-                with tempfile.TemporaryDirectory() as temp_dir:
-                    # Save frames as images
-                    for i, frame in enumerate(face_frames):
-                        cv2.imwrite(os.path.join(temp_dir, f"frame_{i:04d}.jpg"), frame)
-                    # Use FFmpeg to create a video from the frames
-                    output_video = "face_appearances.mp4"
-                    ffmpeg_command = [
-                        "ffmpeg",
-                        "-framerate", str(results['fps']),
-                        "-i", os.path.join(temp_dir, "frame_%04d.jpg"),
-                        "-c:v", "libx264",
-                        "-pix_fmt", "yuv420p",
-                        output_video
-                    ]
-                    subprocess.run(ffmpeg_command, check=True)
-                # Display the generated video
-                st.video(output_video)
-                # Provide download link for the video
-                with open(output_video, "rb") as file:
-                    btn = st.download_button(
-                        label="Download Face Appearances Video",
-                        data=file,
-                        file_name="face_appearances.mp4",
-                        mime="video/mp4"
-                    )
-            else:
-                st.write("No frames with the uploaded face were found in the video.")
-        # Display original video
-        st.subheader("Original Video")
-        st.video(results['video_path'])
-else:
-    st.warning("Please enter a valid YouTube URL and click 'Analyze'")

 import streamlit as st
+import json
 import faiss
+import numpy as np
 from sentence_transformers import SentenceTransformer
+import base64
+from PIL import Image
+import io
 import cv2
+from insightface.app import FaceAnalysis
 # Load models
 @st.cache_resource
 def load_models():
+    """Create the two sentence-transformer encoders and the face analyser.
+
+    Wrapped in ``st.cache_resource``.
+
+    Returns:
+        A ``(text_model, image_model, face_app)`` tuple.
+    """
+    sentence_encoder = SentenceTransformer("all-MiniLM-L6-v2")
+    clip_encoder = SentenceTransformer("clip-ViT-B-32")
+    analyser = FaceAnalysis(providers=['CPUExecutionProvider'])
+    analyser.prepare(ctx_id=0, det_size=(640, 640))
+    models = (sentence_encoder, clip_encoder, analyser)
+    return models
+text_model, image_model, face_app = load_models()
+# Load data
+@st.cache_data
+def load_data(video_id):
+    """Load the six pre-computed JSON artefacts for one video.
+
+    Files are read from the current working directory and are named
+    ``{video_id}_{suffix}.json``.
+
+    Args:
+        video_id: Prefix shared by all artefact file names.
+
+    Returns:
+        A tuple ``(summary, transcription, text_metadata, image_metadata,
+        object_infos, face_metadata)`` of the decoded JSON documents.
+
+    Raises:
+        FileNotFoundError: If any of the six files is missing.
+        json.JSONDecodeError: If a file does not contain valid JSON.
+    """
+    # Order matters: callers unpack the returned tuple positionally.
+    suffixes = (
+        "summary",
+        "transcription",
+        "text_metadata",
+        "image_metadata",
+        "object_infos",
+        "face_metadata",
+    )
+    documents = []
+    for suffix in suffixes:
+        # Explicit UTF-8 so decoding does not depend on the host locale.
+        with open(f"{video_id}_{suffix}.json", "r", encoding="utf-8") as f:
+            documents.append(json.load(f))
+    return tuple(documents)
+video_id = "IMFUOexuEXw"
+summary, transcription, text_metadata, image_metadata, object_infos, face_metadata = load_data(video_id)
+# Load FAISS indexes
+@st.cache_resource
+def load_indexes(video_id):
+    """Read the text, image and face FAISS indexes stored for *video_id*.
+
+    Returns:
+        A ``(text_index, image_index, face_index)`` tuple.
+    """
+    index_paths = [
+        f"{video_id}_text_index.faiss",
+        f"{video_id}_image_index.faiss",
+        f"{video_id}_face_index.faiss",
+    ]
+    return tuple(faiss.read_index(path) for path in index_paths)
+text_index, image_index, face_index = load_indexes(video_id)
+# Search functions
+def text_search(query, index, metadata, model, n_results=5):
+    """Embed a text query and return its nearest neighbours from *index*.
+
+    Args:
+        query: Text to embed.
+        index: FAISS index to search.
+        metadata: Sequence holding one entry per indexed vector, in index order.
+        model: Encoder exposing ``encode(list, convert_to_tensor=True)``.
+        n_results: Maximum number of neighbours to return.
+
+    Returns:
+        ``(results, distances)``: the matching metadata entries and the
+        distance of each one, in the order FAISS returned them. Fewer than
+        ``n_results`` items come back when the index holds fewer vectors.
+    """
+    query_vector = model.encode([query], convert_to_tensor=True).cpu().numpy()
+    D, I = index.search(query_vector, n_results)
+    # FAISS pads the id row with -1 when it runs out of vectors; metadata[-1]
+    # would silently return the last entry, so drop those slots.
+    found = I[0] >= 0
+    results = [metadata[i] for i in I[0][found]]
+    return results, D[0][found]
+def image_search(image, index, metadata, model, n_results=5):
+    """Embed an image and return its nearest neighbours from *index*.
+
+    Args:
+        image: Image object accepted by ``model.encode``.
+        index: FAISS index to search.
+        metadata: Sequence holding one entry per indexed vector, in index order.
+        model: Encoder exposing ``encode(image, convert_to_tensor=True)``.
+        n_results: Maximum number of neighbours to return.
+
+    Returns:
+        ``(results, distances)``: the matching metadata entries and the
+        distance of each one, in the order FAISS returned them. Fewer than
+        ``n_results`` items come back when the index holds fewer vectors.
+    """
+    image_vector = model.encode(image, convert_to_tensor=True).cpu().numpy()
+    D, I = index.search(image_vector.reshape(1, -1), n_results)
+    # FAISS pads the id row with -1 when it runs out of vectors; metadata[-1]
+    # would silently return the last entry, so drop those slots.
+    found = I[0] >= 0
+    results = [metadata[i] for i in I[0][found]]
+    return results, D[0][found]
+def face_search(face_embedding, index, metadata, n_results=5):
+    """Return the indexed faces closest to *face_embedding*.
+
+    Args:
+        face_embedding: 1-D embedding as a NumPy array or a plain list of
+            numbers.
+        index: FAISS index to search.
+        metadata: Sequence holding one entry per indexed vector, in index order.
+        n_results: Maximum number of neighbours to return.
+
+    Returns:
+        ``(results, distances)``: the matching metadata entries and the
+        distance of each one, in the order FAISS returned them. Fewer than
+        ``n_results`` items come back when the index holds fewer vectors.
+    """
+    # FAISS operates on float32; a list decoded from JSON would otherwise
+    # become a float64 array.
+    query_vector = np.asarray(face_embedding, dtype="float32").reshape(1, -1)
+    D, I = index.search(query_vector, n_results)
+    # FAISS pads the id row with -1 when it runs out of vectors; metadata[-1]
+    # would silently return the last entry, so drop those slots.
+    found = I[0] >= 0
+    results = [metadata[i] for i in I[0][found]]
+    return results, D[0][found]
+def detect_and_embed_face(image, face_app):
+    """Return the embedding of the largest face found in *image*.
+
+    Args:
+        image: A PIL image, or anything ``np.array`` can turn into a pixel
+            array. Objects without a ``convert`` method are passed through
+            unchanged.
+        face_app: Detector exposing ``get(array)`` that returns faces with
+            ``bbox`` and ``embedding`` attributes.
+
+    Returns:
+        The ``embedding`` of the face with the largest bounding-box area, or
+        ``None`` when no face is detected.
+    """
+    if hasattr(image, "convert"):
+        # Force 3-channel RGB so RGBA, greyscale and palette uploads work.
+        rgb = np.array(image.convert("RGB"))
+        # PIL delivers RGB, while insightface follows OpenCV's BGR order.
+        img_array = np.ascontiguousarray(rgb[:, :, ::-1])
+    else:
+        img_array = np.array(image)
+    faces = face_app.get(img_array)
+    if len(faces) == 0:
+        return None
+    largest_face = max(faces, key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]))
+    return largest_face.embedding
 # Streamlit UI
+st.title("Video Analysis Dashboard")
+# Video summary: prominent faces, prominent objects and themes, all read from the summary JSON
+st.header("Video Summary")
+st.subheader("Prominent Faces")
+for face in summary['prominent_faces']:
+    st.write(f"Face ID: {face['id']}, Appearances: {face['appearances']}, First Appearance: {face['first_appearance']:.2f}s")
+    if 'thumbnail' in face:  # the thumbnail is optional per face
+        image = Image.open(io.BytesIO(base64.b64decode(face['thumbnail'])))  # base64 text -> bytes -> PIL image
+        st.image(image, caption=f"Face ID: {face['id']}", width=100)
+st.subheader("Prominent Objects")
+for obj in summary['prominent_objects']:
+    st.write(f"Object ID: {obj['id']}, Appearances: {obj['appearances']}, Representative Frame: {obj['representative_frame']:.2f}s")
+st.subheader("Themes")
+for theme in summary['themes']:
+    st.write(f"Theme ID: {theme['id']}, Keywords: {', '.join(theme['keywords'])}")
+# Search section: one branch per search type, each with its own "Search" button
+st.header("Search")
+search_type = st.selectbox("Select search type", ["Text", "Face", "Image"])
+if search_type == "Text":
+    query = st.text_input("Enter your search query")
+    search_target = st.multiselect("Search in", ["Transcript", "Frames"], default=["Transcript"])
+    if st.button("Search"):
+        if "Transcript" in search_target:
+            text_results, text_distances = text_search(query, text_index, text_metadata, text_model)
+            st.subheader("Transcript Search Results")
+            for result, distance in zip(text_results, text_distances):
+                st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
+                st.write(f"Text: {result['text']}")
+                st.write("---")
+        if "Frames" in search_target:
+            frame_results, frame_distances = text_search(query, image_index, image_metadata, image_model)  # text query embedded with image_model, searched in the image index
+            st.subheader("Frame Search Results")
+            for result, distance in zip(frame_results, frame_distances):
+                st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
+                st.write("---")
+elif search_type == "Face":
+    face_search_type = st.radio("Choose face search method", ["Select from video", "Upload image"])
+    if face_search_type == "Select from video":
+        face_id = st.selectbox("Select a face", [face['id'] for face in summary['prominent_faces']])
+        if st.button("Search"):
+            selected_face = next(face for face in summary['prominent_faces'] if face['id'] == face_id)  # no default: face_id was picked from this same list
+            face_results, face_distances = face_search(selected_face['embedding'], face_index, face_metadata)  # uses the embedding stored in the summary entry
+            st.subheader("Face Search Results")
+            for result, distance in zip(face_results, face_distances):
+                st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
+                st.write(f"Face ID: {result['face_id']}")
+                st.write("---")
+    else:
+        uploaded_file = st.file_uploader("Choose a face image...", type=["jpg", "jpeg", "png"])
+        if uploaded_file is not None:
+            image = Image.open(uploaded_file)
+            st.image(image, caption="Uploaded Image", use_column_width=True)
+            if st.button("Search"):
+                face_embedding = detect_and_embed_face(image, face_app)  # None when no face is detected
+                if face_embedding is not None:
+                    face_results, face_distances = face_search(face_embedding, face_index, face_metadata)
+                    st.subheader("Face Search Results")
+                    for result, distance in zip(face_results, face_distances):
+                        st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
+                        st.write(f"Face ID: {result['face_id']}")
+                        st.write("---")
+                else:
+                    st.error("No face detected in the uploaded image. Please try another image.")
+elif search_type == "Image":
+    image_search_type = st.radio("Choose image search method", ["Upload image", "Text description"])
+    if image_search_type == "Upload image":
+        uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
+        if uploaded_file is not None:
+            image = Image.open(uploaded_file)
+            st.image(image, caption="Uploaded Image", use_column_width=True)
+            if st.button("Search"):
+                image_results, image_distances = image_search(image, image_index, image_metadata, image_model)
+                st.subheader("Image Search Results")
+                for result, distance in zip(image_results, image_distances):
+                    st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
+                    st.write("---")
+    else:
+        text_query = st.text_input("Enter a description of the image you're looking for")
+        if st.button("Search"):
+            image_results, image_distances = text_search(text_query, image_index, image_metadata, image_model)  # same call as the "Frames" target of the Text search
+            st.subheader("Image Search Results")
+            for result, distance in zip(image_results, image_distances):
+                st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
+                st.write("---")
+# Transcription: written out in full below the search section
+st.header("Video Transcription")
+st.write(transcription['transcription'])