Pradheep1647 committed
Commit 03c677b · 1 Parent(s): cbf53ef

upload video option added

Files changed (1)
  1. app.py +59 -58
app.py CHANGED
@@ -11,24 +11,6 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
  from transformers import BlipProcessor, BlipForConditionalGeneration
  import cv2
 
- def download_youtube_video(video_url, api_key):
-     ydl_opts = {
-         'format': 'bestvideo+bestaudio',
-         'outtmpl': os.path.join('./', '%(title)s.%(ext)s'),
-         'quiet': True,
-     }
-     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-         ydl.download([video_url])
-         video_info = ydl.extract_info(video_url, download=False)
-         video_title = video_info.get('title', 'video')
-     return os.path.join('./', f"{video_title}.webm")
-
- def convert_to_mp4(input_path):
-     output_file = os.path.join('./', 'video.mp4')
-     command = ['ffmpeg', '-i', input_path, '-c', 'copy', output_file]
-     subprocess.run(command, check=True)
-     return output_file
-
  def extract_audio_from_video(video_path):
      video_clip = VideoFileClip(video_path)
      audio_output = os.path.join('./', 'audio.mp3')
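The download and remux helpers are removed because the app now works from a local video file; the retained audio helpers are only partially visible in this hunk. A minimal sketch of what they presumably look like follows; the write_audiofile call and the ffmpeg-based mp3-to-wav conversion are assumptions, not code from this commit.

import os
import subprocess
from moviepy.editor import VideoFileClip

def extract_audio_from_video(video_path):
    # Pull the audio track out of the video and save it as ./audio.mp3
    video_clip = VideoFileClip(video_path)
    audio_output = os.path.join('./', 'audio.mp3')
    video_clip.audio.write_audiofile(audio_output)
    return audio_output

def convert_mp3_to_wav(mp3_path):
    # One plausible implementation: let ffmpeg transcode the mp3 to WAV
    wav_output = os.path.join('./', 'audio.wav')
    subprocess.run(['ffmpeg', '-y', '-i', mp3_path, wav_output], check=True)
    return wav_output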
@@ -82,6 +64,7 @@ def predict_emotions(caption):
 
      return predicted_emotions
 
+ # Models for image captioning and emotion analysis
  caption_model_name = "Salesforce/blip-image-captioning-base"
  caption_processor = BlipProcessor.from_pretrained(caption_model_name)
  caption_model = BlipForConditionalGeneration.from_pretrained(caption_model_name)
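The captioning and emotion helpers that analyze_video calls (preprocess_frame, generate_caption, predict_emotions) are not shown in full in this diff. They presumably wrap the models loaded above roughly as in this sketch; the generation arguments and label handling here are assumptions, not the committed code.

import cv2
import torch
from PIL import Image

def preprocess_frame(frame_bgr):
    # OpenCV frames are BGR; the BLIP processor expects RGB images
    image = Image.fromarray(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))
    return caption_processor(images=image, return_tensors="pt")["pixel_values"]

def generate_caption(pixel_values):
    # Caption a single frame with BLIP
    with torch.no_grad():
        output_ids = caption_model.generate(pixel_values=pixel_values, max_new_tokens=30)
    return caption_processor.decode(output_ids[0], skip_special_tokens=True)

def predict_emotions(caption):
    # Softmax score for each label of the DistilRoBERTa emotion classifier
    inputs = emotion_tokenizer(caption, return_tensors="pt", truncation=True)
    with torch.no_grad():
        probs = torch.softmax(emotion_model(**inputs).logits, dim=-1)[0]
    return {emotion_model.config.id2label[i]: float(p) for i, p in enumerate(probs)}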
@@ -90,64 +73,82 @@ emotion_model_name = "j-hartmann/emotion-english-distilroberta-base"
  emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
  emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)
 
- def analyze_video(video_url, api_key):
-     video_path = download_youtube_video(video_url, api_key)
-     mp4_path = convert_to_mp4(video_path)
-     audio_path = extract_audio_from_video(mp4_path)
-     audio_wav_path = convert_mp3_to_wav(audio_path)
-
-     model_whisper = whisper.load_model("base")
-     result_whisper = model_whisper.transcribe(audio_wav_path)
-     transcript = result_whisper['text']
-
-     emotion_dict_text, predicted_emotion_text = process_text(transcript)
-
-     n_frame_interval = 60
-     emotion_vectors_video = []
-
-     video_capture = cv2.VideoCapture(mp4_path)
-     total_frames_video = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
-     frame_count_video = 0
-
-     while video_capture.isOpened():
-         ret_video, frame_video = video_capture.read()
-
-         if not ret_video or frame_count_video > total_frames_video:
-             break
-
-         if frame_count_video % n_frame_interval == 0:
-             pixel_values_video = preprocess_frame(frame_video)
-             caption_video = generate_caption(pixel_values_video)
-             predicted_emotions_video = predict_emotions(caption_video)
-             emotion_vectors_video.append(np.array(list(predicted_emotions_video.values())))
-
-         frame_count_video += 1
-
-     video_capture.release()
-
-     average_emotion_vector_video = np.mean(emotion_vectors_video, axis=0)
-     combined_emotion_vector_final = np.concatenate((np.array(list(emotion_dict_text.values())), average_emotion_vector_video))
-     final_most_predicted_index = np.argmax(combined_emotion_vector_final)
-     final_most_predicted_emotion = list(emotion_dict_text.keys())[final_most_predicted_index]
-
-     return transcript, predicted_emotion_text, final_most_predicted_emotion
+ def analyze_video(video=None, video_url=None):
+     if video is not None:
+         # If a video is uploaded, process the uploaded file
+         video_path = video
+     elif video_url:
+         # For streaming YouTube video, just embed the link (assuming it's embedded using Gradio)
+         video_path = None
+
+     # If the video is uploaded, extract audio
+     if video_path:
+         audio_path = extract_audio_from_video(video_path)
+         audio_wav_path = convert_mp3_to_wav(audio_path)
+
+         model_whisper = whisper.load_model("base")
+         result_whisper = model_whisper.transcribe(audio_wav_path)
+         transcript = result_whisper['text']
+
+         emotion_dict_text, predicted_emotion_text = process_text(transcript)
+
+         # Frame-wise emotion detection from the video
+         n_frame_interval = 60
+         emotion_vectors_video = []
+
+         video_capture = cv2.VideoCapture(video_path)
+         total_frames_video = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
+         frame_count_video = 0
+
+         while video_capture.isOpened():
+             ret_video, frame_video = video_capture.read()
+
+             if not ret_video or frame_count_video > total_frames_video:
+                 break
+
+             if frame_count_video % n_frame_interval == 0:
+                 pixel_values_video = preprocess_frame(frame_video)
+                 caption_video = generate_caption(pixel_values_video)
+                 predicted_emotions_video = predict_emotions(caption_video)
+                 emotion_vectors_video.append(np.array(list(predicted_emotions_video.values())))
+
+             frame_count_video += 1
+
+         video_capture.release()
+
+         average_emotion_vector_video = np.mean(emotion_vectors_video, axis=0)
+         combined_emotion_vector_final = np.concatenate((np.array(list(emotion_dict_text.values())), average_emotion_vector_video))
+         final_most_predicted_index = np.argmax(combined_emotion_vector_final)
+         final_most_predicted_emotion = list(emotion_dict_text.keys())[final_most_predicted_index]
+
+         return transcript, predicted_emotion_text, final_most_predicted_emotion
+     else:
+         # For streaming, return an empty analysis or handle the embedding in the Gradio UI
+         return None, "Streaming video detected (no processing).", "N/A"
 
+ # Gradio Interface
  with gr.Blocks() as iface:
-     gr.Markdown("# 🎥 Multimodal Emotion Recognition\nUpload or enter a YouTube Video URL and analyze emotions from both audio and video frames.")
-
-     with gr.Row():
-         video_url = gr.Textbox(label="YouTube Video URL", placeholder="Enter video URL here...", interactive=True)
-         api_key = gr.Textbox(label="YouTube API Key", placeholder="Enter your API key", type="password", interactive=True)
-
-     with gr.Row():
-         submit_button = gr.Button("Analyze Video")
+     gr.Markdown("# 🎥 Multimodal Emotion Recognition\nUpload a video or input a YouTube video URL to analyze emotions from audio and video frames.")
+
+     with gr.Tabs():
+         with gr.TabItem("Upload Video"):
+             video_file = gr.File(label="Upload Video File", file_types=["video"])
+             submit_button_file = gr.Button("Analyze Uploaded Video")
+
+         with gr.TabItem("YouTube URL"):
+             video_url = gr.Textbox(label="YouTube Video URL", placeholder="Enter YouTube video URL")
+             submit_button_url = gr.Button("Analyze YouTube Video")
+
      with gr.Row():
          transcript_output = gr.Textbox(label="Transcript", interactive=False)
          audio_emotion_output = gr.Textbox(label="Emotion from Audio and Text", interactive=False)
          visual_emotion_output = gr.Textbox(label="Emotion from Video", interactive=False)
-
-     submit_button.click(analyze_video, inputs=[video_url, api_key], outputs=[transcript_output, audio_emotion_output, visual_emotion_output])
+
+     # For uploaded video
+     submit_button_file.click(analyze_video, inputs=[video_file, None], outputs=[transcript_output, audio_emotion_output, visual_emotion_output])
+
+     # For YouTube streaming (no downloading)
+     submit_button_url.click(analyze_video, inputs=[None, video_url], outputs=[transcript_output, audio_emotion_output, visual_emotion_output])
 
  if __name__ == "__main__":
      iface.launch()
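One caveat with the new wiring: Gradio expects every entry in `inputs` to be a component, so `inputs=[video_file, None]` and `inputs=[None, video_url]` would most likely raise an error when the app starts. A sketch of one way to keep the two buttons while passing only the relevant component; the wrapper lambdas are my own suggestion, not part of the commit.

# Each button forwards only its own component to analyze_video via a wrapper.
outputs = [transcript_output, audio_emotion_output, visual_emotion_output]

submit_button_file.click(
    lambda video: analyze_video(video=video),
    inputs=[video_file],
    outputs=outputs,
)
submit_button_url.click(
    lambda url: analyze_video(video_url=url),
    inputs=[video_url],
    outputs=outputs,
)

Depending on the Gradio version, gr.File may also hand the handler a temp-file object rather than a plain path, so gr.File(type="filepath") or a gr.Video component may be worth considering before passing the value to cv2 and moviepy.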
 
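A second caveat, carried over from the old code path: np.argmax runs over the concatenation of the text scores and the averaged video scores, but the result indexes the text-emotion keys alone, so an argmax that lands in the video half points at the wrong label or past the end of the list. Assuming both modalities score the same emotion labels in the same order, a safer fusion might look like this sketch; the function name is hypothetical.

import numpy as np

def fuse_emotions(emotion_dict_text, average_emotion_vector_video):
    # Average the two modalities label-by-label, then pick the top label once.
    labels = list(emotion_dict_text.keys())
    text_scores = np.array([emotion_dict_text[label] for label in labels])
    combined = (text_scores + np.asarray(average_emotion_vector_video)) / 2.0
    return labels[int(np.argmax(combined))]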