Pradheep1647 committed
Commit 03c677b · 1 Parent(s): cbf53ef

upload video option added

Files changed (1)
  1. app.py +59 -58
app.py CHANGED
@@ -11,24 +11,6 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
  from transformers import BlipProcessor, BlipForConditionalGeneration
  import cv2
 
- def download_youtube_video(video_url, api_key):
-     ydl_opts = {
-         'format': 'bestvideo+bestaudio',
-         'outtmpl': os.path.join('./', '%(title)s.%(ext)s'),
-         'quiet': True,
-     }
-     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-         ydl.download([video_url])
-         video_info = ydl.extract_info(video_url, download=False)
-         video_title = video_info.get('title', 'video')
-     return os.path.join('./', f"{video_title}.webm")
-
- def convert_to_mp4(input_path):
-     output_file = os.path.join('./', 'video.mp4')
-     command = ['ffmpeg', '-i', input_path, '-c', 'copy', output_file]
-     subprocess.run(command, check=True)
-     return output_file
-
  def extract_audio_from_video(video_path):
      video_clip = VideoFileClip(video_path)
      audio_output = os.path.join('./', 'audio.mp3')
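The download and remux helpers are removed because the app now works from a local video file; the retained audio helpers are only partially visible in this hunk. A minimal sketch of what they presumably look like follows; the write_audiofile call and the ffmpeg-based mp3-to-wav conversion are assumptions, not code from this commit.

import os
import subprocess
from moviepy.editor import VideoFileClip

def extract_audio_from_video(video_path):
    # Pull the audio track out of the video and save it as ./audio.mp3
    video_clip = VideoFileClip(video_path)
    audio_output = os.path.join('./', 'audio.mp3')
    video_clip.audio.write_audiofile(audio_output)
    return audio_output

def convert_mp3_to_wav(mp3_path):
    # One plausible implementation: let ffmpeg transcode the mp3 to WAV
    wav_output = os.path.join('./', 'audio.wav')
    subprocess.run(['ffmpeg', '-y', '-i', mp3_path, wav_output], check=True)
    return wav_output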
@@ -82,6 +64,7 @@ def predict_emotions(caption):
 
      return predicted_emotions
 
+ # Models for image captioning and emotion analysis
  caption_model_name = "Salesforce/blip-image-captioning-base"
  caption_processor = BlipProcessor.from_pretrained(caption_model_name)
  caption_model = BlipForConditionalGeneration.from_pretrained(caption_model_name)
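The captioning and emotion helpers that analyze_video calls (preprocess_frame, generate_caption, predict_emotions) are not shown in full in this diff. They presumably wrap the models loaded above roughly as in this sketch; the generation arguments and label handling here are assumptions, not the committed code.

import cv2
import torch
from PIL import Image

def preprocess_frame(frame_bgr):
    # OpenCV frames are BGR; the BLIP processor expects RGB images
    image = Image.fromarray(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))
    return caption_processor(images=image, return_tensors="pt")["pixel_values"]

def generate_caption(pixel_values):
    # Caption a single frame with BLIP
    with torch.no_grad():
        output_ids = caption_model.generate(pixel_values=pixel_values, max_new_tokens=30)
    return caption_processor.decode(output_ids[0], skip_special_tokens=True)

def predict_emotions(caption):
    # Softmax score for each label of the DistilRoBERTa emotion classifier
    inputs = emotion_tokenizer(caption, return_tensors="pt", truncation=True)
    with torch.no_grad():
        probs = torch.softmax(emotion_model(**inputs).logits, dim=-1)[0]
    return {emotion_model.config.id2label[i]: float(p) for i, p in enumerate(probs)}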
@@ -90,64 +73,82 @@ emotion_model_name = "j-hartmann/emotion-english-distilroberta-base"
  emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
  emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)
 
- def analyze_video(video_url, api_key):
-     video_path = download_youtube_video(video_url, api_key)
-     mp4_path = convert_to_mp4(video_path)
-     audio_path = extract_audio_from_video(mp4_path)
-     audio_wav_path = convert_mp3_to_wav(audio_path)
-
-     model_whisper = whisper.load_model("base")
-     result_whisper = model_whisper.transcribe(audio_wav_path)
-     transcript = result_whisper['text']
-
-     emotion_dict_text, predicted_emotion_text = process_text(transcript)
-
-     n_frame_interval = 60
-     emotion_vectors_video = []
-
-     video_capture = cv2.VideoCapture(mp4_path)
-     total_frames_video = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
-     frame_count_video = 0
-
-     while video_capture.isOpened():
-         ret_video, frame_video = video_capture.read()
-
-         if not ret_video or frame_count_video > total_frames_video:
-             break
-
-         if frame_count_video % n_frame_interval == 0:
-             pixel_values_video = preprocess_frame(frame_video)
-             caption_video = generate_caption(pixel_values_video)
-             predicted_emotions_video = predict_emotions(caption_video)
-             emotion_vectors_video.append(np.array(list(predicted_emotions_video.values())))
-
-         frame_count_video += 1
-
-     video_capture.release()
-
-     average_emotion_vector_video = np.mean(emotion_vectors_video, axis=0)
-     combined_emotion_vector_final = np.concatenate((np.array(list(emotion_dict_text.values())), average_emotion_vector_video))
-     final_most_predicted_index = np.argmax(combined_emotion_vector_final)
-     final_most_predicted_emotion = list(emotion_dict_text.keys())[final_most_predicted_index]
-
-     return transcript, predicted_emotion_text, final_most_predicted_emotion
+ def analyze_video(video=None, video_url=None):
+     if video is not None:
+         # If a video is uploaded, process the uploaded file
+         video_path = video
+     elif video_url:
+         # For streaming YouTube video, just embed the link (assuming it's embedded using Gradio)
+         video_path = None
+
+     # If the video is uploaded, extract audio
+     if video_path:
+         audio_path = extract_audio_from_video(video_path)
+         audio_wav_path = convert_mp3_to_wav(audio_path)
+
+         model_whisper = whisper.load_model("base")
+         result_whisper = model_whisper.transcribe(audio_wav_path)
+         transcript = result_whisper['text']
+
+         emotion_dict_text, predicted_emotion_text = process_text(transcript)
+
+         # Frame-wise emotion detection from the video
+         n_frame_interval = 60
+         emotion_vectors_video = []
+
+         video_capture = cv2.VideoCapture(video_path)
+         total_frames_video = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
+         frame_count_video = 0
+
+         while video_capture.isOpened():
+             ret_video, frame_video = video_capture.read()
+
+             if not ret_video or frame_count_video > total_frames_video:
+                 break
+
+             if frame_count_video % n_frame_interval == 0:
+                 pixel_values_video = preprocess_frame(frame_video)
+                 caption_video = generate_caption(pixel_values_video)
+                 predicted_emotions_video = predict_emotions(caption_video)
+                 emotion_vectors_video.append(np.array(list(predicted_emotions_video.values())))
+
+             frame_count_video += 1
+
+         video_capture.release()
+
+         average_emotion_vector_video = np.mean(emotion_vectors_video, axis=0)
+         combined_emotion_vector_final = np.concatenate((np.array(list(emotion_dict_text.values())), average_emotion_vector_video))
+         final_most_predicted_index = np.argmax(combined_emotion_vector_final)
+         final_most_predicted_emotion = list(emotion_dict_text.keys())[final_most_predicted_index]
+
+         return transcript, predicted_emotion_text, final_most_predicted_emotion
+     else:
+         # For streaming, return an empty analysis or handle the embedding in the Gradio UI
+         return None, "Streaming video detected (no processing).", "N/A"
 
+ # Gradio Interface
  with gr.Blocks() as iface:
-     gr.Markdown("# 🎥 Multimodal Emotion Recognition\nUpload or enter a YouTube Video URL and analyze emotions from both audio and video frames.")
-
-     with gr.Row():
-         video_url = gr.Textbox(label="YouTube Video URL", placeholder="Enter video URL here...", interactive=True)
-         api_key = gr.Textbox(label="YouTube API Key", placeholder="Enter your API key", type="password", interactive=True)
-
-     with gr.Row():
-         submit_button = gr.Button("Analyze Video")
+     gr.Markdown("# 🎥 Multimodal Emotion Recognition\nUpload a video or input a YouTube video URL to analyze emotions from audio and video frames.")
+
+     with gr.Tabs():
+         with gr.TabItem("Upload Video"):
+             video_file = gr.File(label="Upload Video File", file_types=["video"])
+             submit_button_file = gr.Button("Analyze Uploaded Video")
+
+         with gr.TabItem("YouTube URL"):
+             video_url = gr.Textbox(label="YouTube Video URL", placeholder="Enter YouTube video URL")
+             submit_button_url = gr.Button("Analyze YouTube Video")
+
      with gr.Row():
          transcript_output = gr.Textbox(label="Transcript", interactive=False)
          audio_emotion_output = gr.Textbox(label="Emotion from Audio and Text", interactive=False)
          visual_emotion_output = gr.Textbox(label="Emotion from Video", interactive=False)
-
-     submit_button.click(analyze_video, inputs=[video_url, api_key], outputs=[transcript_output, audio_emotion_output, visual_emotion_output])
+
+     # For uploaded video
+     submit_button_file.click(analyze_video, inputs=[video_file, None], outputs=[transcript_output, audio_emotion_output, visual_emotion_output])
+
+     # For YouTube streaming (no downloading)
+     submit_button_url.click(analyze_video, inputs=[None, video_url], outputs=[transcript_output, audio_emotion_output, visual_emotion_output])
 
  if __name__ == "__main__":
      iface.launch()
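One caveat with the new wiring: Gradio expects every entry in `inputs` to be a component, so `inputs=[video_file, None]` and `inputs=[None, video_url]` would most likely raise an error when the app starts. A sketch of one way to keep the two buttons while passing only the relevant component; the wrapper lambdas are my own suggestion, not part of the commit.

# Each button forwards only its own component to analyze_video via a wrapper.
outputs = [transcript_output, audio_emotion_output, visual_emotion_output]

submit_button_file.click(
    lambda video: analyze_video(video=video),
    inputs=[video_file],
    outputs=outputs,
)
submit_button_url.click(
    lambda url: analyze_video(video_url=url),
    inputs=[video_url],
    outputs=outputs,
)

Depending on the Gradio version, gr.File may also hand the handler a temp-file object rather than a plain path, so gr.File(type="filepath") or a gr.Video component may be worth considering before passing the value to cv2 and moviepy.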
 
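A second caveat, carried over from the old code path: np.argmax runs over the concatenation of the text scores and the averaged video scores, but the result indexes the text-emotion keys alone, so an argmax that lands in the video half points at the wrong label or past the end of the list. Assuming both modalities score the same emotion labels in the same order, a safer fusion might look like this sketch; the function name is hypothetical.

import numpy as np

def fuse_emotions(emotion_dict_text, average_emotion_vector_video):
    # Average the two modalities label-by-label, then pick the top label once.
    labels = list(emotion_dict_text.keys())
    text_scores = np.array([emotion_dict_text[label] for label in labels])
    combined = (text_scores + np.asarray(average_emotion_vector_video)) / 2.0
    return labels[int(np.argmax(combined))]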