Pradheep1647 committed
Commit 03c677b · Parent(s): cbf53ef
upload video option added

app.py CHANGED
@@ -11,24 +11,6 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import BlipProcessor, BlipForConditionalGeneration
 import cv2
 
-def download_youtube_video(video_url, api_key):
-    ydl_opts = {
-        'format': 'bestvideo+bestaudio',
-        'outtmpl': os.path.join('./', '%(title)s.%(ext)s'),
-        'quiet': True,
-    }
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        ydl.download([video_url])
-        video_info = ydl.extract_info(video_url, download=False)
-        video_title = video_info.get('title', 'video')
-    return os.path.join('./', f"{video_title}.webm")
-
-def convert_to_mp4(input_path):
-    output_file = os.path.join('./', 'video.mp4')
-    command = ['ffmpeg', '-i', input_path, '-c', 'copy', output_file]
-    subprocess.run(command, check=True)
-    return output_file
-
 def extract_audio_from_video(video_path):
     video_clip = VideoFileClip(video_path)
     audio_output = os.path.join('./', 'audio.mp3')
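
Note: the new analyze_video added later in this diff calls convert_mp3_to_wav, which (like the rest of extract_audio_from_video above) lives in an unchanged part of app.py and is not shown here. A minimal sketch of such a helper, assuming a pydub-based conversion (the helper name comes from the diff; the pydub dependency and output path are my assumptions):

    import os
    from pydub import AudioSegment  # assumed dependency; the real app.py may shell out to ffmpeg instead

    def convert_mp3_to_wav(mp3_path):
        # Re-export the MP3 produced by extract_audio_from_video as WAV so it can be
        # handed to whisper's transcribe() in analyze_video.
        wav_path = os.path.join('./', 'audio.wav')
        AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")
        return wav_path
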
@@ -82,6 +64,7 @@ def predict_emotions(caption):
 
     return predicted_emotions
 
+# Models for image captioning and emotion analysis
 caption_model_name = "Salesforce/blip-image-captioning-base"
 caption_processor = BlipProcessor.from_pretrained(caption_model_name)
 caption_model = BlipForConditionalGeneration.from_pretrained(caption_model_name)
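
Note: the frame-level helpers preprocess_frame, generate_caption and predict_emotions called by the new analyze_video in the next hunk are defined earlier in app.py and are untouched by this commit. A rough sketch of how such helpers typically tie together the BLIP captioner and the DistilRoBERTa emotion classifier that app.py loads, purely for orientation (exact signatures and details in the real file may differ):

    import cv2
    import torch

    def preprocess_frame(frame_bgr):
        # OpenCV frames are BGR; BLIP's processor expects RGB input.
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        return caption_processor(images=frame_rgb, return_tensors="pt").pixel_values

    def generate_caption(pixel_values):
        # Greedy caption generation with the BLIP captioning model.
        output_ids = caption_model.generate(pixel_values, max_new_tokens=30)
        return caption_processor.decode(output_ids[0], skip_special_tokens=True)

    def predict_emotions(caption):
        # Score the caption with the emotion classifier and return a
        # {label: probability} dict, the shape analyze_video consumes.
        inputs = emotion_tokenizer(caption, return_tensors="pt", truncation=True)
        with torch.no_grad():
            probs = torch.softmax(emotion_model(**inputs).logits, dim=-1)[0]
        return {emotion_model.config.id2label[i]: float(p) for i, p in enumerate(probs)}
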
@@ -90,64 +73,82 @@ emotion_model_name = "j-hartmann/emotion-english-distilroberta-base"
 emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
 emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)
 
-def analyze_video(
-    [... remainder of the old, download-based analyze_video is not legible in this view ...]
+def analyze_video(video=None, video_url=None):
+    if video is not None:
+        # If a video is uploaded, process the uploaded file
+        video_path = video
+    elif video_url:
+        # For streaming YouTube video, just embed the link (assuming it's embedded using Gradio)
+        video_path = None
+
+    # If the video is uploaded, extract audio
+    if video_path:
+        audio_path = extract_audio_from_video(video_path)
+        audio_wav_path = convert_mp3_to_wav(audio_path)
+
+        model_whisper = whisper.load_model("base")
+        result_whisper = model_whisper.transcribe(audio_wav_path)
+        transcript = result_whisper['text']
+
+        emotion_dict_text, predicted_emotion_text = process_text(transcript)
+
+        # Frame-wise emotion detection from the video
+        n_frame_interval = 60
+        emotion_vectors_video = []
+
+        video_capture = cv2.VideoCapture(video_path)
+        total_frames_video = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
+        frame_count_video = 0
+
+        while video_capture.isOpened():
+            ret_video, frame_video = video_capture.read()
+
+            if not ret_video or frame_count_video > total_frames_video:
+                break
+
+            if frame_count_video % n_frame_interval == 0:
+                pixel_values_video = preprocess_frame(frame_video)
+                caption_video = generate_caption(pixel_values_video)
+                predicted_emotions_video = predict_emotions(caption_video)
+                emotion_vectors_video.append(np.array(list(predicted_emotions_video.values())))
+
+            frame_count_video += 1
+
+        video_capture.release()
+
+        average_emotion_vector_video = np.mean(emotion_vectors_video, axis=0)
+        combined_emotion_vector_final = np.concatenate((np.array(list(emotion_dict_text.values())), average_emotion_vector_video))
+        final_most_predicted_index = np.argmax(combined_emotion_vector_final)
+        final_most_predicted_emotion = list(emotion_dict_text.keys())[final_most_predicted_index]
+
+        return transcript, predicted_emotion_text, final_most_predicted_emotion
+    else:
+        # For streaming, return an empty analysis or handle the embedding in the Gradio UI
+        return None, "Streaming video detected (no processing).", "N/A"
 
+# Gradio Interface
 with gr.Blocks() as iface:
-    gr.Markdown("# 🎥 Multimodal Emotion Recognition\nUpload or [...]
+    gr.Markdown("# 🎥 Multimodal Emotion Recognition\nUpload a video or input a YouTube video URL to analyze emotions from audio and video frames.")
 
-    with gr.Row():
-        video_url = gr.Textbox(label="YouTube Video URL", placeholder="Enter video URL here...", interactive=True)
-        api_key = gr.Textbox(label="YouTube API Key", placeholder="Enter your API key", type="password", interactive=True)
-
-    with gr.Row():
-        submit_button = gr.Button("Analyze Video")
+    with gr.Tabs():
+        with gr.TabItem("Upload Video"):
+            video_file = gr.File(label="Upload Video File", file_types=["video"])
+            submit_button_file = gr.Button("Analyze Uploaded Video")
+
+        with gr.TabItem("YouTube URL"):
+            video_url = gr.Textbox(label="YouTube Video URL", placeholder="Enter YouTube video URL")
+            submit_button_url = gr.Button("Analyze YouTube Video")
+
     with gr.Row():
         transcript_output = gr.Textbox(label="Transcript", interactive=False)
         audio_emotion_output = gr.Textbox(label="Emotion from Audio and Text", interactive=False)
         visual_emotion_output = gr.Textbox(label="Emotion from Video", interactive=False)
-
-    [... additional removed lines (the old submit_button wiring) are not legible in this view ...]
+
+    # For uploaded video
+    submit_button_file.click(analyze_video, inputs=[video_file, None], outputs=[transcript_output, audio_emotion_output, visual_emotion_output])
+
+    # For YouTube streaming (no downloading)
+    submit_button_url.click(analyze_video, inputs=[None, video_url], outputs=[transcript_output, audio_emotion_output, visual_emotion_output])
 
 if __name__ == "__main__":
     iface.launch()
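
Note on the new event wiring: Gradio's Blocks .click() expects components (or None for the whole argument) as its inputs, so the None placeholders inside inputs=[video_file, None] and inputs=[None, video_url] are unlikely to map onto the video/video_url keyword parameters as intended; analyze_video can also reach the if video_path: check with video_path never assigned when it is called with neither input. One possible alternative wiring, sketched as an illustration rather than as the committed code (the wrapper names are mine):

    def analyze_uploaded(video_file_path):
        # Route the uploaded file through the upload branch of analyze_video.
        return analyze_video(video=video_file_path, video_url=None)

    def analyze_from_url(url):
        # Route a YouTube URL through the streaming branch of analyze_video.
        return analyze_video(video=None, video_url=url)

    submit_button_file.click(analyze_uploaded, inputs=[video_file],
                             outputs=[transcript_output, audio_emotion_output, visual_emotion_output])
    submit_button_url.click(analyze_from_url, inputs=[video_url],
                            outputs=[transcript_output, audio_emotion_output, visual_emotion_output])

Initializing video_path = None at the top of analyze_video would also keep the streaming branch safe when the URL box is left empty.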