siddhantuniyal committed
Commit 200d507 · verified · 1 Parent(s): 9f4c26c

feat: add audio pipeline and change response format

Files changed (1)
  1. app.py +161 -58
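
For reference, the new response format returned by predict (and rendered through the gr.JSON() output) is a nested dictionary that merges the image-pipeline and audio-pipeline results. The sketch below is illustrative only: the keys come from the code in this commit, while every value shown is made up.

# Illustrative example of the new response format; all values are hypothetical.
example_response = {
    "prediction": "hate",                         # overall verdict: 'hate' or 'not_hate'
    "language": {"video": "en", "audio": None},   # per-pipeline detected language
    "label": {"video": "hate_symbol", "audio": None},
    "confidence": 0.91,                           # single pipeline value, or an average when both fire
    "hate_text": {"video": "<detected text>", "audio": None},
    "hate_image_timestamp": 4,                    # seconds into the video of the flagged frame
    "hate_component": ["video"],                  # which pipeline(s) flagged the content
}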
app.py CHANGED
@@ -1,58 +1,161 @@
- import gradio as gr
- from gradio_client import Client, handle_file
- import cv2
- import os
- from PIL import Image
-
- clientImgPipeLn = Client("dj-dawgs-ipd/IPD_IMAGE_PIPELINE")
-
- def predict(video_path):
-     cap = cv2.VideoCapture(video_path)
-     fps = int(cap.get(cv2.CAP_PROP_FPS))
-     frame_interval = fps * 2
-
-     frame_count = 0
-     success = True
-
-     temp_dir = "temp_frames"
-     os.makedirs(temp_dir, exist_ok=True)
-
-     res = 'not_hate'
-
-     while success:
-         success, frame = cap.read()
-         if frame_count % frame_interval == 0 and success:
-             temp_image_path = os.path.join(temp_dir, f"frame_{frame_count // fps}s.jpg")
-             cv2.imwrite(temp_image_path, frame)
-
-             response = clientImgPipeLn.predict(
-                 image=handle_file(temp_image_path),
-                 api_name="/predict"
-             )
-             print(f"Response for frame at {frame_count // fps}s: {response}")
-             if response[0]['label'] == 'hate':
-                 res = 'hate'
-                 break
-
-         frame_count += 1
-
-     cap.release()
-
-     for file in os.listdir(temp_dir):
-         os.remove(os.path.join(temp_dir, file))
-     os.rmdir(temp_dir)
-
-     print("prediction successful")
-
-     return res
-
- iface = gr.Interface(fn=predict,
-                      inputs=gr.Video(),
-                      outputs=[gr.Label(label="Class")],
-                      title="Hate Speech Detection in Video",
-                      description="Detect hateful symbols or text in Video"
-                      )
-
- if __name__ == "__main__":
-     iface.launch()
-
+ import gradio as gr
+ from gradio_client import Client, handle_file
+ import cv2
+ import os
+ import shutil
+ from PIL import Image
+ from moviepy import VideoFileClip
+
+ clientImgPipeLn = Client("dj-dawgs-ipd/IPD_IMAGE_PIPELINE")
+ clientAudioPipeLn = Client("dj-dawgs-ipd/IPD_AUDIO_PIPELINE")
+
+ def predict(video_path):
+     cap = cv2.VideoCapture(video_path)
+     fps = int(cap.get(cv2.CAP_PROP_FPS))
+     frame_interval = fps * 2  # sample one frame every two seconds
+
+     frame_count = 0
+     success = True
+
+     temp_data_path = "temp_data"
+     os.makedirs(temp_data_path, exist_ok=True)
+
+     temp_frames_path = os.path.join(temp_data_path, "temp_frames")
+     os.makedirs(temp_frames_path, exist_ok=True)
+
+     resImg = {}
+     resAudio = {}
+
+     video_clip = VideoFileClip(video_path)
+
+     # Run the audio pipeline only if the video actually has an audio track.
+     if video_clip.audio is None:
+         resAudio = {
+             'prediction': None,
+             'language': None,
+             'label': None,
+             'confidence': None,
+             'hate_text': None
+         }
+     else:
+         audio_path = os.path.join(temp_data_path, "temp_audio.wav")
+         video_clip.audio.write_audiofile(audio_path, codec="pcm_s16le")
+         resAudio = clientAudioPipeLn.predict(
+             audio_path=handle_file(audio_path),
+             api_name='/predict'
+         )
+
+     # Run the image pipeline on sampled frames until a hateful frame is found.
+     while success:
+         success, frame = cap.read()
+         if frame_count % frame_interval == 0 and success:
+
+             temp_image_path = os.path.join(temp_frames_path, f"temp_frame_{frame_count // fps}s.jpg")
+             cv2.imwrite(temp_image_path, frame)
+
+             response = clientImgPipeLn.predict(
+                 image=handle_file(temp_image_path),
+                 api_name="/predict"
+             )
+
+             print(f"Response for frame at {frame_count // fps}s: {response}")
+
+             if response['prediction'] == 'hate':
+                 resImg = response
+                 resImg['hate_image_timestamp'] = frame_count // fps
+                 break
+
+         frame_count += 1
+
+     cap.release()
+
+     shutil.rmtree(temp_data_path)
+
+     # Combine the two results. The empty-resImg cases are checked explicitly so an
+     # empty dict is never indexed, and "!= 'hate'" also covers videos without audio.
+     if len(resImg) == 0 and resAudio['prediction'] != 'hate':
+         return {
+             'prediction': 'not_hate',
+             'language': {
+                 'video': None,
+                 'audio': None
+             },
+             'label': {
+                 'video': None,
+                 'audio': None
+             },
+             'confidence': None,
+             'hate_text': {
+                 'video': None,
+                 'audio': None
+             },
+             'hate_image_timestamp': None,
+             'hate_component': None
+         }
+
+     if len(resImg) > 0 and resAudio['prediction'] != 'hate':
+         return {
+             'prediction': 'hate',
+             'language': {
+                 'video': resImg['language'],
+                 'audio': None
+             },
+             'label': {
+                 'video': resImg['label'],
+                 'audio': None
+             },
+             'confidence': resImg['confidence'],
+             'hate_text': {
+                 'video': resImg['hate_text'],
+                 'audio': None
+             },
+             'hate_image_timestamp': resImg['hate_image_timestamp'],
+             'hate_component': ["video"]
+         }
+
+     if len(resImg) == 0 and resAudio['prediction'] == 'hate':
+         return {
+             'prediction': 'hate',
+             'language': {
+                 'video': None,
+                 'audio': resAudio['language']
+             },
+             'label': {
+                 'video': None,
+                 'audio': resAudio['label']
+             },
+             'confidence': resAudio['confidence'],
+             'hate_text': {
+                 'video': None,
+                 'audio': resAudio['hate_text']
+             },
+             'hate_image_timestamp': None,
+             'hate_component': ["audio"]
+         }
+
+     # Hate detected by both pipelines: average whichever confidences are present.
+     return {
+         'prediction': 'hate',
+         'language': {
+             'video': resImg['language'],
+             'audio': resAudio['language']
+         },
+         'label': {
+             'video': resImg['label'],
+             'audio': resAudio['label']
+         },
+         'confidence': ((resImg['confidence'] or 0) + (resAudio['confidence'] or 0)) / (2 - (resImg['confidence'] is None or resAudio['confidence'] is None)),
+         'hate_text': {
+             'video': resImg['hate_text'],
+             'audio': resAudio['hate_text']
+         },
+         'hate_image_timestamp': resImg['hate_image_timestamp'],
+         'hate_component': ["video", "audio"]
+     }
+
+ iface = gr.Interface(fn=predict,
+                      inputs=gr.Video(),
+                      outputs=gr.JSON(),
+                      title="Hate Speech Detection in Video",
+                      description="Detect hateful symbols or text in Video"
+                      )
+
+ if __name__ == "__main__":
+     iface.launch(show_error=True)
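
As a quick sanity check, the updated endpoint can be exercised with gradio_client once the app is deployed, mirroring how app.py itself calls the image and audio Spaces. This is only a sketch: the Space id below is a placeholder (the commit does not say where this app is hosted), sample.mp4 is any local test clip, and the keyword name simply mirrors the video_path parameter of predict.

# Minimal sketch of calling the updated app via gradio_client (assumptions noted below).
from gradio_client import Client, handle_file

client = Client("dj-dawgs-ipd/IPD_VIDEO_PIPELINE")  # hypothetical Space id; substitute the real one
result = client.predict(
    video_path=handle_file("sample.mp4"),  # any local test video
    api_name="/predict"
)
print(result)  # nested dict in the response format shown above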