Tanmay Agrawal committed on
Commit
7c7c0ee
·
1 Parent(s): c3a69f4

Update app.py

Files changed (1)
  1. app.py +244 -0
app.py CHANGED
@@ -0,0 +1,244 @@
+ import torch
+ import torch.nn as nn
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import os, glob
+ import librosa
+ import librosa.display
+
+
+ sample_rate = 48000
+
+ def get_waveforms(file):
+     '''Load an individual sample audio file.
+     Read the full 3 seconds of the file, cutting off the first 0.5 s of silence (native sample rate = 48 kHz),
+     and zero-pad it into a fixed-length 3 s buffer; the sample rate returned by librosa.load is not needed.'''
+     waveform, _ = librosa.load(file, duration=3, offset=0.5, sr=sample_rate)
+     waveform_homo = np.zeros(int(sample_rate * 3))
+     waveform_homo[:len(waveform)] = waveform
+     return waveform_homo
+
+ class SER(nn.Module):
+     # Define all layers present in the network
+     def __init__(self, num_emotions):
+         super().__init__()
+
+         # ################ TRANSFORMER BLOCK #############################
+         self.transformer_maxpool = nn.MaxPool2d(kernel_size=[1, 4], stride=[1, 4])
+         transformer_layer = nn.TransformerEncoderLayer(
+             d_model=40,           # input feature (frequency) dim after maxpooling: 40*282 -> 40*70 (MFCC*time)
+             nhead=4,              # 4 attention heads in each multi-head self-attention layer of each encoder block
+             dim_feedforward=512,  # 2 linear layers in each encoder block's feedforward network: dim 40 -> 512 -> 40
+             dropout=0.4,
+             activation='relu'     # ReLU: avoid saturation/tame gradients/reduce compute time
+         )
+         self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=4)
+
+         # ############### 1ST PARALLEL 2D CONVOLUTION BLOCK ############
+         self.conv2Dblock1 = nn.Sequential(
+
+             nn.Conv2d(
+                 in_channels=1,    # input volume depth == input channel dim == 1
+                 out_channels=16,  # expand output feature map volume's depth to 16
+                 kernel_size=3,    # 3*3 stride-1 kernel
+                 stride=1,
+                 padding=1
+             ),
+             nn.BatchNorm2d(16),   # batch-normalize the output feature map before activation
+             nn.ReLU(),
+             nn.MaxPool2d(kernel_size=2, stride=2),
+             nn.Dropout(p=0.3),
+
+             # 2nd 2D convolution layer, identical to the last except output dim and maxpool kernel
+             nn.Conv2d(
+                 in_channels=16,
+                 out_channels=32,  # expand output feature map volume's depth to 32
+                 kernel_size=3,
+                 stride=1,
+                 padding=1
+             ),
+             nn.BatchNorm2d(32),
+             nn.ReLU(),
+             nn.MaxPool2d(kernel_size=4, stride=4),  # increase maxpool kernel for subsequent filters
+             nn.Dropout(p=0.3),
+
+             # 3rd 2D convolution layer, identical to the last except output dim
+             nn.Conv2d(
+                 in_channels=32,
+                 out_channels=64,  # expand output feature map volume's depth to 64
+                 kernel_size=3,
+                 stride=1,
+                 padding=1
+             ),
+             nn.BatchNorm2d(64),
+             nn.ReLU(),
+             nn.MaxPool2d(kernel_size=4, stride=4),
+             nn.Dropout(p=0.3),
+         )
+         # ############### 2ND PARALLEL 2D CONVOLUTION BLOCK ############
+         self.conv2Dblock2 = nn.Sequential(
+
+             # 1st 2D convolution layer
+             nn.Conv2d(
+                 in_channels=1,    # input volume depth == input channel dim == 1
+                 out_channels=16,
+                 kernel_size=3,    # 3*3 stride-1 kernel
+                 stride=1,
+                 padding=1
+             ),
+             nn.BatchNorm2d(16),   # batch-normalize the output feature map before activation
+             nn.ReLU(),            # feature map --> activation map
+             nn.MaxPool2d(kernel_size=2, stride=2),  # typical maxpool kernel size
+             nn.Dropout(p=0.3),    # randomly zero 30% of the 1st layer's output feature map during training
+
+             # 2nd 2D convolution layer, identical to the last except output dim and maxpool kernel
+             nn.Conv2d(
+                 in_channels=16,
+                 out_channels=32,  # expand output feature map volume's depth to 32
+                 kernel_size=3,
+                 stride=1,
+                 padding=1
+             ),
+             nn.BatchNorm2d(32),
+             nn.ReLU(),
+             nn.MaxPool2d(kernel_size=4, stride=4),  # increase maxpool kernel for subsequent filters
+             nn.Dropout(p=0.3),
+
+             # 3rd 2D convolution layer, identical to the last except output dim
+             nn.Conv2d(
+                 in_channels=32,
+                 out_channels=64,  # expand output feature map volume's depth to 64
+                 kernel_size=3,
+                 stride=1,
+                 padding=1
+             ),
+             nn.BatchNorm2d(64),
+             nn.ReLU(),
+             nn.MaxPool2d(kernel_size=4, stride=4),
+             nn.Dropout(p=0.3),
+         )
+
+         # Each full convolution block outputs a 64*1*8 feature map, flattened to a length-512 1D array
+         # The full transformer block outputs a 40*70 feature map, which is time-averaged to a length-40 1D array
+         # 512*2 + 40 == 1064 input features --> 8 output emotions
+         self.fc1_linear = nn.Linear(512 * 2 + 40, num_emotions)
+
+         self.softmax_out = nn.Softmax(dim=1)
+
+     def forward(self, x):
+
+         # ############ 1st parallel Conv2D block: 3 convolutional layers ############
+         conv2d_embedding1 = self.conv2Dblock1(x)  # x == N/batch * channel * freq * time
+
+         # flatten the final 64*1*8 feature map from the convolutional layers to a length-512 1D array
+         conv2d_embedding1 = torch.flatten(conv2d_embedding1, start_dim=1)
+
+         # ############ 2nd parallel Conv2D block: 3 convolutional layers ############
+         conv2d_embedding2 = self.conv2Dblock2(x)  # x == N/batch * channel * freq * time
+
+         conv2d_embedding2 = torch.flatten(conv2d_embedding2, start_dim=1)
+
+         # ############ Transformer block ############
+         x_maxpool = self.transformer_maxpool(x)
+
+         # remove channel dim: 1*40*70 --> 40*70
+         x_maxpool_reduced = torch.squeeze(x_maxpool, 1)
+
+         # the transformer encoder layer requires tensors in the format: time * batch * embedding (freq)
+         x = x_maxpool_reduced.permute(2, 0, 1)
+
+         # finally, pass the reduced input feature map x through the transformer encoder layers
+         transformer_output = self.transformer_encoder(x)
+
+         transformer_embedding = torch.mean(transformer_output, dim=0)  # dim 40*70 --> 40
+
+         # concatenate the embedding tensors output by the two parallel conv blocks and the transformer block
+         complete_embedding = torch.cat([conv2d_embedding1, conv2d_embedding2, transformer_embedding], dim=1)
+
+         output_logits = self.fc1_linear(complete_embedding)
+
+         output_softmax = self.softmax_out(output_logits)
+
+         return output_logits, output_softmax
+
+ emotions_dict = {
+     '0': 'surprised',
+     '1': 'neutral',
+     '2': 'calm',
+     '3': 'happy',
+     '4': 'sad',
+     '5': 'angry',
+     '6': 'fearful',
+     '7': 'disgust'
+ }
+
+
+ def load_checkpoint(optimizer, model, filename):
+     checkpoint_dict = torch.load(filename, map_location=torch.device('cpu'))
+     epoch = checkpoint_dict['epoch']
+     model.load_state_dict(checkpoint_dict['model'])
+     if optimizer is not None:
+         optimizer.load_state_dict(checkpoint_dict['optimizer'])
+     return epoch
+
+ def make_validate_fnc(model, criterion):
+     def validate(X, Y):
+
+         with torch.no_grad():
+
+             # set the model to evaluation mode, i.e. disable dropout and use running batchnorm statistics
+             model.eval()
+
+             # get the model's predictions on the validation set
+             output_logits, output_softmax = model(X)
+             predictions = torch.argmax(output_softmax, dim=1)
+
+             # calculate the mean accuracy over the entire validation set
+             accuracy = torch.sum(Y == predictions) / float(len(Y))
+
+             # compute the loss from the logits (nn.CrossEntropyLoss applies softmax internally)
+             loss = criterion(output_logits, Y)
+
+         return loss.item(), accuracy * 100, predictions
+     return validate
+
+ model = SER(len(emotions_dict))
+ optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-3, momentum=0.8)
+ load_checkpoint(optimizer, model, "SERFINAL-099.pkl")
+
+ waveform = get_waveforms("03-01-08-01-01-01-01.wav")
+
+ waveforms = np.array(waveform)
+
+
+ mfc = librosa.feature.mfcc(
+     y=waveforms,
+     sr=48000,
+     n_mfcc=40,
+     n_fft=1024,
+     win_length=512,
+     window='hamming',
+     n_mels=128,
+     fmax=48000 / 2
+ )
+
+ # reshape the (n_mfcc, time) MFCC matrix into (batch, channel, freq, time) == (1, 1, 40, time) for the model
+ X = np.expand_dims(mfc, axis=1)
+ X = np.expand_dims(X, axis=1)
+ X = X.transpose(1, 2, 0, 3)
+ X = torch.tensor(X)
+ X = X.float()
+
+ with torch.no_grad():
+
+     # set the model to evaluation mode, i.e. disable dropout and use running batchnorm statistics
+     model.eval()
+
+     # get the model's prediction for the input clip
+     output_logits, output_softmax = model(X)
+     predictions = torch.argmax(output_softmax, dim=1)
+
+ pred = predictions.cpu().numpy()
+ x = pred[0]
+ x = str(x)
+ print(emotions_dict[x])  # map the predicted class index to its emotion label and display it
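
The committed script runs every step (checkpoint load, MFCC extraction, reshaping, inference) at module level for one hard-coded clip. As a usage illustration only, here is a minimal sketch of how those same steps could be wrapped into a reusable helper; it assumes the definitions from app.py above (get_waveforms, SER, load_checkpoint, emotions_dict) are in scope, and the predict_emotion name is hypothetical rather than part of the committed code.

import numpy as np
import torch
import librosa

def predict_emotion(wav_path, model, sample_rate=48000):
    # hypothetical helper: same preprocessing and inference as the script above
    waveform = get_waveforms(wav_path)                 # 3 s clip, 0.5 s offset, zero-padded
    mfc = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=40,
                               n_fft=1024, win_length=512, window='hamming',
                               n_mels=128, fmax=sample_rate / 2)
    X = torch.tensor(mfc[np.newaxis, np.newaxis, :, :]).float()  # (batch, channel, freq, time)
    model.eval()
    with torch.no_grad():
        _, output_softmax = model(X)
        pred = torch.argmax(output_softmax, dim=1).item()
    return emotions_dict[str(pred)]

# example, reusing the checkpoint and clip referenced above:
# model = SER(len(emotions_dict))
# load_checkpoint(None, model, "SERFINAL-099.pkl")
# print(predict_emotion("03-01-08-01-01-01-01.wav", model))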