Tanmay Agrawal
committed · Commit 7c7c0ee · 1 Parent(s): c3a69f4
Update app.py
app.py
CHANGED
@@ -0,0 +1,244 @@
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os, glob
import librosa
import librosa.display


sample_rate = 48000

def get_waveforms(file):
    '''Load an individual sample audio file: read the full 3 seconds of audio,
    skipping the first 0.5 s of silence (native sample rate = 48 kHz).
    The sample rate returned by librosa.load is not stored.'''
    waveform, _ = librosa.load(file, duration=3, offset=0.5, sr=sample_rate)
    # zero-pad so every sample has exactly 3 s worth of samples
    waveform_homo = np.zeros(int(sample_rate * 3))
    waveform_homo[:len(waveform)] = waveform
    return waveform_homo

+
class SER(nn.Module):
|
24 |
+
# Define all layers present in the network
|
25 |
+
def __init__(self,num_emotions):
|
26 |
+
super().__init__()
|
27 |
+
|
28 |
+
'''################ TRANSFORMER BLOCK #############################'''
|
29 |
+
self.transformer_maxpool = nn.MaxPool2d(kernel_size=[1,4], stride=[1,4])
|
30 |
+
transformer_layer = nn.TransformerEncoderLayer(
|
31 |
+
d_model=40, # input feature (frequency) dim after maxpooling 40*282 -> 40*70 (MFC*time)
|
32 |
+
nhead=4, # 4 self-attention layers in each multi-head self-attention layer in each encoder block
|
33 |
+
dim_feedforward=512, # 2 linear layers in each encoder block's feedforward network: dim 40-->512--->40
|
34 |
+
dropout=0.4,
|
35 |
+
activation='relu' # ReLU: avoid saturation/tame gradient/reduce compute time
|
36 |
+
)
|
37 |
+
self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=4)
|
38 |
+
|
39 |
+
'''############### 1ST PARALLEL 2D CONVOLUTION BLOCK ############'''
|
40 |
+
self.conv2Dblock1 = nn.Sequential(
|
41 |
+
|
42 |
+
nn.Conv2d(
|
43 |
+
in_channels=1, # input volume depth == input channel dim == 1
|
44 |
+
out_channels=16, # expand output feature map volume's depth to 16
|
45 |
+
kernel_size=3, # 3*3 stride 1 kernel
|
46 |
+
stride=1,
|
47 |
+
padding=1
|
48 |
+
),
|
49 |
+
nn.BatchNorm2d(16), # batch normalize the output feature map before activation
|
50 |
+
nn.ReLU(),
|
51 |
+
nn.MaxPool2d(kernel_size=2, stride=2),
|
52 |
+
nn.Dropout(p=0.3),
|
53 |
+
|
54 |
+
# 2nd 2D convolution layer identical to last except output dim, maxpool kernel
|
55 |
+
nn.Conv2d(
|
56 |
+
in_channels=16,
|
57 |
+
out_channels=32, # expand output feature map volume's depth to 32
|
58 |
+
kernel_size=3,
|
59 |
+
stride=1,
|
60 |
+
padding=1
|
61 |
+
),
|
62 |
+
nn.BatchNorm2d(32),
|
63 |
+
nn.ReLU(),
|
64 |
+
nn.MaxPool2d(kernel_size=4, stride=4), # increase maxpool kernel for subsequent filters
|
65 |
+
nn.Dropout(p=0.3),
|
66 |
+
|
67 |
+
# 3rd 2D convolution layer identical to last except output dim
|
68 |
+
nn.Conv2d(
|
69 |
+
in_channels=32,
|
70 |
+
out_channels=64, # expand output feature map volume's depth to 64
|
71 |
+
kernel_size=3,
|
72 |
+
stride=1,
|
73 |
+
padding=1
|
74 |
+
),
|
75 |
+
nn.BatchNorm2d(64),
|
76 |
+
nn.ReLU(),
|
77 |
+
nn.MaxPool2d(kernel_size=4, stride=4),
|
78 |
+
nn.Dropout(p=0.3),
|
79 |
+
)
|
80 |
+
'''############### 2ND PARALLEL 2D CONVOLUTION BLOCK ############'''
|
81 |
+
self.conv2Dblock2 = nn.Sequential(
|
82 |
+
|
83 |
+
# 1st 2D convolution layer
|
84 |
+
nn.Conv2d(
|
85 |
+
in_channels=1, # input volume depth == input channel dim == 1
|
86 |
+
out_channels=16,
|
87 |
+
kernel_size=3, #3*3 stride 1 kernel
|
88 |
+
stride=1,
|
89 |
+
padding=1
|
90 |
+
),
|
91 |
+
nn.BatchNorm2d(16), # batch normalize the output feature map before activation
|
92 |
+
nn.ReLU(), # feature map --> activation map
|
93 |
+
nn.MaxPool2d(kernel_size=2, stride=2), #typical maxpool kernel size
|
94 |
+
nn.Dropout(p=0.3), #randomly zero 30% of 1st layer's output feature map in training
|
95 |
+
|
96 |
+
# 2nd 2D convolution layer identical to last except output dim, maxpool kernel
|
97 |
+
nn.Conv2d(
|
98 |
+
in_channels=16,
|
99 |
+
out_channels=32, # expand output feature map volume's depth to 32
|
100 |
+
kernel_size=3,
|
101 |
+
stride=1,
|
102 |
+
padding=1
|
103 |
+
),
|
104 |
+
nn.BatchNorm2d(32),
|
105 |
+
nn.ReLU(),
|
106 |
+
nn.MaxPool2d(kernel_size=4, stride=4), # increase maxpool kernel for subsequent filters
|
107 |
+
nn.Dropout(p=0.3),
|
108 |
+
|
109 |
+
# 3rd 2D convolution layer identical to last except output dim
|
110 |
+
nn.Conv2d(
|
111 |
+
in_channels=32,
|
112 |
+
out_channels=64, # expand output feature map volume's depth to 64
|
113 |
+
kernel_size=3,
|
114 |
+
stride=1,
|
115 |
+
padding=1
|
116 |
+
),
|
117 |
+
nn.BatchNorm2d(64),
|
118 |
+
nn.ReLU(),
|
119 |
+
nn.MaxPool2d(kernel_size=4, stride=4),
|
120 |
+
nn.Dropout(p=0.3),
|
121 |
+
)
|
122 |
+
|
123 |
+
# Each full convolution block outputs (64*1*8) embedding flattened to dim 512 1D array
|
124 |
+
# Full transformer block outputs 40*70 feature map, which we time-avg to dim 40 1D array
|
125 |
+
# 512*2+40 == 1064 input features --> 8 output emotions
|
126 |
+
self.fc1_linear = nn.Linear(512*2+40,num_emotions)
|
127 |
+
|
128 |
+
self.softmax_out = nn.Softmax(dim=1)
|
129 |
+
|
130 |
+
def forward(self,x):
|
131 |
+
|
132 |
+
'''############ 1st parallel Conv2D block: 4 Convolutional layers ############################'''
|
133 |
+
conv2d_embedding1 = self.conv2Dblock1(x) # x == N/batch * channel * freq * time
|
134 |
+
|
135 |
+
# flatten final 64*1*8 feature map from convolutional layers to length 512 1D array
|
136 |
+
conv2d_embedding1 = torch.flatten(conv2d_embedding1, start_dim=1)
|
137 |
+
|
138 |
+
'''############ 2nd parallel Conv2D block: 4 Convolutional layers #############################'''
|
139 |
+
conv2d_embedding2 = self.conv2Dblock2(x) # x == N/batch * channel * freq * time
|
140 |
+
|
141 |
+
conv2d_embedding2 = torch.flatten(conv2d_embedding2, start_dim=1)
|
142 |
+
|
143 |
+
|
144 |
+
x_maxpool = self.transformer_maxpool(x)
|
145 |
+
|
146 |
+
# remove channel dim: 1*40*70 --> 40*70
|
147 |
+
x_maxpool_reduced = torch.squeeze(x_maxpool,1)
|
148 |
+
|
149 |
+
# transformer encoder layer requires tensor in format: time * batch * embedding (freq)
|
150 |
+
x = x_maxpool_reduced.permute(2,0,1)
|
151 |
+
|
152 |
+
# finally, pass reduced input feature map x into transformer encoder layers
|
153 |
+
transformer_output = self.transformer_encoder(x)
|
154 |
+
|
155 |
+
transformer_embedding = torch.mean(transformer_output, dim=0) # dim 40x70 --> 40
|
156 |
+
|
157 |
+
# concatenate embedding tensors output by parallel 2*conv and 1*transformer blocks
|
158 |
+
complete_embedding = torch.cat([conv2d_embedding1, conv2d_embedding2,transformer_embedding], dim=1)
|
159 |
+
|
160 |
+
output_logits = self.fc1_linear(complete_embedding)
|
161 |
+
|
162 |
+
output_softmax = self.softmax_out(output_logits)
|
163 |
+
|
164 |
+
return output_logits, output_softmax
|
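# Shape trace for a single 40 x 282 MFCC input (derived from the layer definitions
# above; 282 time frames correspond to 3 s of 48 kHz audio with hop length 512):
#   conv block:        1*40*282 -> 16*20*141 -> 32*5*35 -> 64*1*8 -> flatten -> 512
#   transformer block: 1*40*282 -> maxpool -> 40*70 -> time-average -> 40
#   concatenated:      512 + 512 + 40 = 1064 -> Linear -> num_emotions logits
# Illustrative smoke test (commented out so it does not run inside the app):
# _m = SER(num_emotions=8)
# _logits, _probs = _m(torch.zeros(1, 1, 40, 282))
# assert _logits.shape == (1, 8)
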
emotions_dict = {
    '0': 'surprised',
    '1': 'neutral',
    '2': 'calm',
    '3': 'happy',
    '4': 'sad',
    '5': 'angry',
    '6': 'fearful',
    '7': 'disgust'
}


def load_checkpoint(optimizer, model, filename):
    checkpoint_dict = torch.load(filename, map_location=torch.device('cpu'))
    epoch = checkpoint_dict['epoch']
    model.load_state_dict(checkpoint_dict['model'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint_dict['optimizer'])
    return epoch

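# Companion sketch (not in the original file): a checkpoint compatible with
# load_checkpoint() above would be saved with the same three keys.
def save_checkpoint(optimizer, model, epoch, filename):
    checkpoint_dict = {
        'epoch': epoch,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(checkpoint_dict, filename)
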
def make_validate_fnc(model, criterion):
    def validate(X, Y):

        with torch.no_grad():

            # set model to evaluation mode, i.e. turn off dropout and use running batchnorm statistics
            model.eval()

            # get the model's predictions on the validation set
            output_logits, output_softmax = model(X)
            predictions = torch.argmax(output_softmax, dim=1)

            # calculate the mean accuracy over the entire validation set
            accuracy = torch.sum(Y == predictions) / float(len(Y))

            # compute the loss from the logits (nn.CrossEntropyLoss applies softmax internally)
            loss = criterion(output_logits, Y)

        return loss.item(), accuracy * 100, predictions
    return validate

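# Illustrative usage of the validation closure above (hypothetical tensors,
# not part of this script):
# criterion = nn.CrossEntropyLoss()
# validate = make_validate_fnc(model, criterion)
# val_loss, val_acc, val_preds = validate(X_valid, y_valid)
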
model = SER(len(emotions_dict))
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-3, momentum=0.8)
load_checkpoint(optimizer, model, "SERFINAL-099.pkl")

waveform = get_waveforms("03-01-08-01-01-01-01.wav")
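# Note (interpretation, not stated in the original file): the sample filename follows
# the RAVDESS naming convention, in which the third field is the emotion code.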

waveforms = np.array(waveform)

mfc = librosa.feature.mfcc(
    y=waveforms,
    sr=48000,
    n_mfcc=40,
    n_fft=1024,
    win_length=512,
    window='hamming',
    n_mels=128,
    fmax=48000 / 2
)

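# Shape note (derived from the parameters above, not stated in the original file):
# 144000 samples at 48 kHz with n_fft=1024 and librosa's default hop length of 512
# yield a 40 x 282 MFCC matrix (n_mfcc x time frames).
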
# reshape the 40 x time MFCC matrix into the model's expected batch * channel * freq * time layout
X = np.expand_dims(mfc, axis=1)
X = np.expand_dims(X, axis=1)
X = X.transpose(1, 2, 0, 3)  # -> (1, 1, 40, time)
X = torch.tensor(X)
X = X.float()

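# Sanity check (illustrative, not in the original app): for this zero-padded clip the
# input tensor should be exactly (batch, channel, freq, time) == (1, 1, 40, 282).
assert tuple(X.shape) == (1, 1, 40, 282)
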
with torch.no_grad():

    # set model to evaluation mode, i.e. turn off dropout and use running batchnorm statistics
    model.eval()

    # get the model's prediction for this single input sample
    output_logits, output_softmax = model(X)
    predictions = torch.argmax(output_softmax, dim=1)

# map the predicted class index back to its emotion label
pred = predictions.cpu().numpy()
x = pred[0]
x = str(x)
print(emotions_dict[x])