Pradheep1647
committed
Commit · 909f75a1
Parent(s): c2b1295
removed comments from app.py
app.py
CHANGED
@@ -11,8 +11,6 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BlipProcessor, BlipForConditionalGeneration
import cv2

-# Define the necessary functions
-
def download_youtube_video(video_url, output_path):
    ydl_opts = {
        'format': 'bestvideo+bestaudio',
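The hunk above only shows the first lines of download_youtube_video. For reference, a minimal sketch of how such a helper can be written with yt_dlp is shown below; the 'outtmpl' template and the extract_info/prepare_filename pattern are assumptions for illustration, not code taken from this commit.

import os
import yt_dlp

def download_youtube_video(video_url, output_path):
    # Fetch the best video+audio streams and save them under output_path.
    # Only the 'format' option is visible in the hunk; 'outtmpl' is an assumed choice.
    ydl_opts = {
        'format': 'bestvideo+bestaudio',
        'outtmpl': os.path.join(output_path, '%(title)s.%(ext)s'),
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=True)
        return ydl.prepare_filename(info)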
@@ -83,7 +81,6 @@ def predict_emotions(caption):

    return predicted_emotions

-# Load models and processors once at the start
caption_model_name = "Salesforce/blip-image-captioning-base"
caption_processor = BlipProcessor.from_pretrained(caption_model_name)
caption_model = BlipForConditionalGeneration.from_pretrained(caption_model_name)
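predict_emotions itself sits outside these hunks; only its final return predicted_emotions line is visible here. Given the j-hartmann/emotion-english-distilroberta-base classifier loaded at the top of the next hunk, a plausible sketch is shown below; the (scores dict, top label) return shape is inferred from how analyze_video unpacks the result and is an assumption.

import torch

def predict_emotions(caption):
    # Score the caption with the emotion classifier loaded at module level.
    inputs = emotion_tokenizer(caption, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = emotion_model(**inputs).logits
    probs = torch.softmax(logits, dim=-1).squeeze(0)

    # Map class indices to the model's emotion labels.
    id2label = emotion_model.config.id2label
    emotion_scores = {id2label[i]: probs[i].item() for i in range(len(id2label))}
    top_emotion = max(emotion_scores, key=emotion_scores.get)

    # The call sites unpack two values, so the returned object is assumed
    # to be a (scores_dict, top_label) pair.
    predicted_emotions = (emotion_scores, top_emotion)
    return predicted_emotions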
@@ -92,83 +89,54 @@ emotion_model_name = "j-hartmann/emotion-english-distilroberta-base"
emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)

-# Gradio Interface Function
def analyze_video(video_url):
-    # Set output path for downloads
    global output_path
    output_path = './'
-
-    # Download the video
    video_path = download_youtube_video(video_url, output_path)
-
-    # Convert to mp4 format
    mp4_path = convert_to_mp4(video_path, output_path)
-
-    # Extract audio from the video
    audio_path = extract_audio_from_video(mp4_path)
-
-    # Convert audio to wav format for processing
    audio_wav_path = convert_mp3_to_wav(audio_path)
-
-    # Process the audio using Whisper for transcription
    model_whisper = whisper.load_model("base")

    result_whisper = model_whisper.transcribe(audio_wav_path)

    transcript = result_whisper['text']
-
-    # Process text to get emotions
    emotion_dict_text, predicted_emotion_text = process_text(transcript)

-    total_frames_video = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
-            break
-            pixel_values_video = preprocess_frame(frame_video)
-            caption_video = generate_caption(pixel_values_video)
-            predicted_emotions_video, _ = predict_emotions(caption_video)
+    n_frame_interval = 60
+    emotion_vectors_video = []
+    video_capture = cv2.VideoCapture(mp4_path)
+    total_frames_video = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
+    frame_count_video = 0
+
+    while video_capture.isOpened():
+        ret_video, frame_video = video_capture.read()
+
+        if not ret_video or frame_count_video > total_frames_video:
+            break
+
+        if frame_count_video % n_frame_interval == 0:
+            pixel_values_video = preprocess_frame(frame_video)
+            caption_video = generate_caption(pixel_values_video)
+            predicted_emotions_video, _ = predict_emotions(caption_video)
+            emotion_vectors_video.append(np.array(list(predicted_emotions_video.values())))
+
+        frame_count_video += 1
+
+    video_capture.release()

-    # Aggregate results from video frames
-    average_emotion_vector_video = np.mean(emotion_vectors_video, axis=0)
-
-    # Combine text and video emotion results
-    combined_emotion_vector_final= np.concatenate((np.array(list(emotion_dict_text.values())), average_emotion_vector_video))
-
-    final_most_predicted_index= np.argmax(combined_emotion_vector_final)
-
-    final_most_predicted_emotion= list(emotion_dict_text.keys())[final_most_predicted_index]
-
-    return transcript, predicted_emotion_text, final_most_predicted_emotion
-
-
-# Create Gradio interface
-iface= gr.Interface(fn=analyze_video,
+    average_emotion_vector_video = np.mean(emotion_vectors_video, axis=0)
+    combined_emotion_vector_final = np.concatenate((np.array(list(emotion_dict_text.values())), average_emotion_vector_video))
+    final_most_predicted_index = np.argmax(combined_emotion_vector_final)
+    final_most_predicted_emotion = list(emotion_dict_text.keys())[final_most_predicted_index]
+
+    return transcript, predicted_emotion_text, final_most_predicted_emotion
+
+iface = gr.Interface(fn=analyze_video,
                     inputs=gr.Textbox(label="YouTube Video URL"),
                     outputs=["text", "text", "text"],
                     title="Multimodal Emotion Recognition",
                     description="Enter a YouTube Video URL to analyze emotions from both audio and visual content.")
-
-# Launch the app
if __name__ == "__main__":
+    iface.launch()
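The frame loop above relies on preprocess_frame and generate_caption, which are not part of this diff. Assuming they wrap the BLIP processor and captioning model loaded earlier in app.py, a minimal sketch could look like the following; the BGR-to-RGB conversion step and the max_new_tokens value are assumptions.

from PIL import Image
import cv2

def preprocess_frame(frame_bgr):
    # OpenCV frames are BGR numpy arrays; BLIP expects RGB images.
    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    image = Image.fromarray(frame_rgb)
    return caption_processor(images=image, return_tensors="pt").pixel_values

def generate_caption(pixel_values):
    # Decode a short caption for the frame with the BLIP captioning model.
    output_ids = caption_model.generate(pixel_values, max_new_tokens=30)
    return caption_processor.decode(output_ids[0], skip_special_tokens=True)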
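One detail worth flagging when reusing analyze_video as committed: combined_emotion_vector_final stacks the text scores first and the averaged frame scores second, yet the final lookup indexes straight into emotion_dict_text.keys(). If the argmax lands in the frame half, that lookup runs past the end of the key list. A small hypothetical helper such as pick_final_emotion below keeps the lookup in range, assuming the text and frame score dictionaries share the same label order.

import numpy as np

def pick_final_emotion(emotion_dict_text, combined_emotion_vector_final):
    # Fold the winning index back onto the shared emotion labels before the key lookup.
    num_labels = len(emotion_dict_text)
    winner = int(np.argmax(combined_emotion_vector_final))
    return list(emotion_dict_text.keys())[winner % num_labels]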