Pradheep1647 committed on
Commit 909f75a · 1 Parent(s): c2b1295

removed comments from app.py

Files changed (1)
  1. app.py +24 -56
app.py CHANGED
@@ -11,8 +11,6 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import BlipProcessor, BlipForConditionalGeneration
 import cv2

-# Define the necessary functions
-
 def download_youtube_video(video_url, output_path):
     ydl_opts = {
         'format': 'bestvideo+bestaudio',
@@ -83,7 +81,6 @@ def predict_emotions(caption):

     return predicted_emotions

-# Load models and processors once at the start
 caption_model_name = "Salesforce/blip-image-captioning-base"
 caption_processor = BlipProcessor.from_pretrained(caption_model_name)
 caption_model = BlipForConditionalGeneration.from_pretrained(caption_model_name)
@@ -92,83 +89,54 @@ emotion_model_name = "j-hartmann/emotion-english-distilroberta-base"
 emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
 emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)

-# Gradio Interface Function
 def analyze_video(video_url):
-    # Set output path for downloads
     global output_path
     output_path = './'
-
-    # Download the video
     video_path = download_youtube_video(video_url, output_path)
-
-    # Convert to mp4 format
     mp4_path = convert_to_mp4(video_path, output_path)
-
-    # Extract audio from the video
     audio_path = extract_audio_from_video(mp4_path)
-
-    # Convert audio to wav format for processing
     audio_wav_path = convert_mp3_to_wav(audio_path)
-
-    # Process the audio using Whisper for transcription
     model_whisper = whisper.load_model("base")

     result_whisper = model_whisper.transcribe(audio_wav_path)

     transcript = result_whisper['text']
-
-    # Process text to get emotions
     emotion_dict_text, predicted_emotion_text = process_text(transcript)
-

-    # Process the video using image captioning and emotion recognition
-    n_frame_interval = 60 # Process every 60th frame
-    emotion_vectors_video = []
+    n_frame_interval = 60
+    emotion_vectors_video = []
+    video_capture = cv2.VideoCapture(mp4_path)
+    total_frames_video = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
+    frame_count_video = 0

-    # Process the video frames for emotions using BLIP model
-    video_capture = cv2.VideoCapture(mp4_path)
-    total_frames_video = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
+    while video_capture.isOpened():
+        ret_video, frame_video = video_capture.read()

-    frame_count_video = 0
+        if not ret_video or frame_count_video > total_frames_video:
+            break

-    while video_capture.isOpened():
-        ret_video, frame_video = video_capture.read()
+        if frame_count_video % n_frame_interval == 0:
+            pixel_values_video = preprocess_frame(frame_video)
+            caption_video = generate_caption(pixel_values_video)
+            predicted_emotions_video, _ = predict_emotions(caption_video)
+            emotion_vectors_video.append(np.array(list(predicted_emotions_video.values())))

-        if not ret_video or frame_count_video > total_frames_video:
-            break
+        frame_count_video += 1

-        if frame_count_video % n_frame_interval == 0:
-            pixel_values_video = preprocess_frame(frame_video)
-            caption_video = generate_caption(pixel_values_video)
-            predicted_emotions_video, _ = predict_emotions(caption_video)
+    video_capture.release()

-            # Collect emotion vectors from frames
-            emotion_vectors_video.append(np.array(list(predicted_emotions_video.values())))
+    average_emotion_vector_video = np.mean(emotion_vectors_video, axis=0)
+    combined_emotion_vector_final = np.concatenate((np.array(list(emotion_dict_text.values())), average_emotion_vector_video))
+    final_most_predicted_index = np.argmax(combined_emotion_vector_final)
+    final_most_predicted_emotion = list(emotion_dict_text.keys())[final_most_predicted_index]

-        frame_count_video += 1
+    return transcript, predicted_emotion_text, final_most_predicted_emotion

-    video_capture.release()
-
-    # Aggregate results from video frames
-    average_emotion_vector_video = np.mean(emotion_vectors_video, axis=0)
-
-    # Combine text and video emotion results
-    combined_emotion_vector_final= np.concatenate((np.array(list(emotion_dict_text.values())), average_emotion_vector_video))
-
-    final_most_predicted_index= np.argmax(combined_emotion_vector_final)
-
-    final_most_predicted_emotion= list(emotion_dict_text.keys())[final_most_predicted_index]
-
-    return transcript, predicted_emotion_text, final_most_predicted_emotion
-
-
-# Create Gradio interface
-iface= gr.Interface(fn=analyze_video,
+iface = gr.Interface(fn=analyze_video,
                     inputs=gr.Textbox(label="YouTube Video URL"),
                     outputs=["text", "text", "text"],
                     title="Multimodal Emotion Recognition",
                     description="Enter a YouTube Video URL to analyze emotions from both audio and visual content.")
-
-# Launch the app
+
 if __name__ == "__main__":
-    iface.launch()
+    iface.launch()
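
For reference, below is a minimal sketch of how the refactored app.py could be exercised locally. This is an assumption rather than part of the commit: it presumes the file is importable as the module app, that the model weights download successfully, and it uses a placeholder YouTube URL.

# Hypothetical smoke test for the refactored app.py (not part of this commit).
# Per the diff above, analyze_video() returns
# (transcript, predicted_emotion_text, final_most_predicted_emotion).
from app import analyze_video, iface

if __name__ == "__main__":
    transcript, text_emotion, combined_emotion = analyze_video(
        "https://www.youtube.com/watch?v=VIDEO_ID"  # placeholder URL, replace before running
    )
    print("Transcript (first 200 chars):", transcript[:200])
    print("Emotion from transcript:", text_emotion)
    print("Combined text + video emotion:", combined_emotion)

    # Or serve the Gradio UI, exactly as app.py itself does:
    # iface.launch()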