|
import torch |
|
import torch.nn as nn |
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
import pandas as pd |
|
import os, glob |
|
import librosa |
|
import librosa.display |
|
|
|
|
|
# Target sample rate in Hz for librosa.load and MFCC extraction; per the
# note in get_waveforms, this matches the audio files' native 48 kHz rate.
sample_rate=48000
|
|
|
def get_waveforms(file):
    """Load one audio sample and zero-pad it to a fixed 3-second length.

    Reads up to 3 seconds of audio, skipping the first 0.5 s of leading
    silence, resampled to the module-level ``sample_rate`` (48 kHz).  The
    sample rate librosa.load returns is discarded since it is forced anyway.

    Parameters
    ----------
    file : str or path-like
        Path to the audio file to load.

    Returns
    -------
    np.ndarray
        1-D float array of exactly sample_rate * 3 samples; clips shorter
        than 3 s are right-padded with zeros.
    """
    waveform, _ = librosa.load(file, duration=3, offset=0.5, sr=sample_rate)
    # Fixed-length buffer.  The original expression int(sample_rate*3,) had a
    # misplaced comma — a stray trailing argument comma inside int() rather
    # than the intended shape tuple; it produced the right value by accident.
    waveform_homo = np.zeros(int(sample_rate * 3))
    waveform_homo[:len(waveform)] = waveform
    return waveform_homo
|
|
|
class SER(nn.Module):
    """Parallel CNN + transformer speech-emotion classifier.

    The input is a batch of MFCC "images" shaped (batch, 1, 40, time).
    Three feature extractors run on the same input:

      * two architecturally identical 3-layer 2D-CNN stacks with independent
        weights (each yields a 512-dim embedding when the time axis reduces
        to 8 after the pools), and
      * a 4-layer transformer encoder over the time-pooled 40-band MFCCs,
        mean-pooled over time to a 40-dim embedding.

    The three embeddings are concatenated (512*2 + 40 = 1064) and fed to a
    single linear layer producing one logit per emotion class.
    """

    def __init__(self, num_emotions):
        super().__init__()

        # ############### TRANSFORMER BLOCK #############################
        # Pool only the time axis (kernel/stride [1,4]) so all 40 MFCC
        # coefficients survive as the transformer's feature dimension.
        self.transformer_maxpool = nn.MaxPool2d(kernel_size=[1, 4], stride=[1, 4])
        transformer_layer = nn.TransformerEncoderLayer(
            d_model=40,          # one "token" = the 40 MFCC coefficients of a frame
            nhead=4,
            dim_feedforward=512,
            dropout=0.4,
            activation='relu'
        )
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=4)

        # ############## PARALLEL 2D CONVOLUTION BLOCKS #################
        # The two stacks were duplicated verbatim in the original; a shared
        # factory keeps them (and their state_dict keys,
        # conv2DblockN.<0-14>.*) identical to the original flat layout, so
        # existing checkpoints still load.
        self.conv2Dblock1 = self._make_conv_block()
        self.conv2Dblock2 = self._make_conv_block()

        # 512 features from each CNN stack + 40 from the transformer branch.
        self.fc1_linear = nn.Linear(512 * 2 + 40, num_emotions)

        self.softmax_out = nn.Softmax(dim=1)

    @staticmethod
    def _make_conv_block():
        """Build one conv stack: (Conv -> BN -> ReLU -> MaxPool -> Dropout) x3.

        Channel progression 1->16->32->64; the pools shrink (freq, time) by a
        total factor of 2*4*4 = 32 on each axis.  Note: each stack has THREE
        conv layers (the original comments claimed four).
        """
        return nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(p=0.3),

            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3),

            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3),
        )

    def forward(self, x):
        """Run all three branches on ``x`` and classify.

        Parameters
        ----------
        x : torch.Tensor
            MFCC batch shaped (batch, 1, 40, time).  The time axis must
            reduce to 8 after the conv pools (i.e. 256 <= time <= 287) so the
            flattened CNN embeddings match fc1_linear's expected 512 each.

        Returns
        -------
        (output_logits, output_softmax) : tuple of torch.Tensor
            Both shaped (batch, num_emotions); softmax for prediction, raw
            logits suitable for nn.CrossEntropyLoss.
        """
        # 1st parallel conv stack (3 conv layers), flattened per sample.
        conv2d_embedding1 = self.conv2Dblock1(x)
        conv2d_embedding1 = torch.flatten(conv2d_embedding1, start_dim=1)

        # 2nd parallel conv stack (3 conv layers), flattened per sample.
        conv2d_embedding2 = self.conv2Dblock2(x)
        conv2d_embedding2 = torch.flatten(conv2d_embedding2, start_dim=1)

        # Transformer branch: pool time, drop the singleton channel dim, then
        # permute to (time, batch, features) as nn.TransformerEncoder expects
        # with its default batch_first=False.
        x_maxpool = self.transformer_maxpool(x)
        x_maxpool_reduced = torch.squeeze(x_maxpool, 1)
        x = x_maxpool_reduced.permute(2, 0, 1)
        transformer_output = self.transformer_encoder(x)
        # Mean over time frames -> one 40-dim embedding per sample.
        transformer_embedding = torch.mean(transformer_output, dim=0)

        # Fuse all three embeddings and classify.
        complete_embedding = torch.cat(
            [conv2d_embedding1, conv2d_embedding2, transformer_embedding], dim=1)
        output_logits = self.fc1_linear(complete_embedding)
        output_softmax = self.softmax_out(output_logits)
        return output_logits, output_softmax
|
# Index-to-label mapping for the 8 emotion classes; keys are stringified
# class indices so a prediction can be looked up via str(predicted_index).
_emotion_labels = ('surprised', 'neutral', 'calm', 'happy',
                   'sad', 'angry', 'fearful', 'disgust')
emotions_dict = {str(index): label for index, label in enumerate(_emotion_labels)}
|
|
|
|
|
def load_checkpoint(optimizer, model, filename):
    """Restore model (and optionally optimizer) state from a checkpoint file.

    The checkpoint is always mapped onto the CPU, regardless of the device
    it was saved from.  Pass ``optimizer=None`` to skip optimizer state.

    NOTE(review): torch.load unpickles arbitrary objects — only load
    checkpoint files from trusted sources.

    Returns the epoch number stored in the checkpoint.
    """
    state = torch.load(filename, map_location=torch.device('cpu'))
    epoch = state['epoch']
    model.load_state_dict(state['model'])
    if optimizer is not None:
        optimizer.load_state_dict(state['optimizer'])
    return epoch
|
|
|
def make_validate_fnc(model, criterion):
    """Create a closure that evaluates ``model`` on one labelled batch.

    The returned ``validate(X, Y)`` puts the model in eval mode, disables
    gradient tracking, and returns a tuple
    ``(loss_value, accuracy_percent, predictions)`` where the accuracy is a
    0-dim tensor in [0, 100] and predictions are argmax class indices.
    """
    def validate(X, Y):
        with torch.no_grad():
            model.eval()
            logits, softmax_scores = model(X)
            predicted_classes = torch.argmax(softmax_scores, dim=1)
            # Fraction of correct predictions, scaled to a percentage below.
            accuracy = torch.sum(Y == predicted_classes) / float(len(Y))
            loss = criterion(logits, Y)
        return loss.item(), accuracy * 100, predicted_classes
    return validate
|
|
|
# --- Restore the trained model ---------------------------------------------
# NOTE(review): the checkpoint and audio paths below are hard-coded relative
# paths; this script must be run from the directory containing those files.
model = SER(len(emotions_dict))
sgd = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-3, momentum=0.8)
load_checkpoint(sgd, model, "SERFINAL-099.pkl")

# --- Load the sample to classify -------------------------------------------
waveform = get_waveforms("03-01-08-01-01-01-01.wav")
waveforms = np.array(waveform)  # get_waveforms already returns an ndarray; this just copies
|
|
|
|
|
# Extract a 40-coefficient MFCC matrix (shape: 40 x time_frames) from the
# fixed-length waveform; the parameters are gathered in one place so they
# are easy to compare against the training-time configuration.
_mfcc_params = dict(
    sr=48000,
    n_mfcc=40,
    n_fft=1024,
    win_length=512,
    window='hamming',
    n_mels=128,
    fmax=48000 / 2,   # Nyquist frequency for 48 kHz audio
)
mfc = librosa.feature.mfcc(y=waveforms, **_mfcc_params)
|
|
|
# Shape the MFCCs for the network: (40, T) -> (1, 1, 40, T), i.e. a batch of
# one single-channel "image".  This single indexing step is equivalent to the
# original double expand_dims(axis=1) followed by transpose(1, 2, 0, 3).
X = torch.tensor(mfc[np.newaxis, np.newaxis, :, :]).float()
|
|
|
# --- Inference --------------------------------------------------------------
model.eval()
with torch.no_grad():
    output_logits, output_softmax = model(X)
    predictions = torch.argmax(output_softmax, dim=1)

# Single-sample batch: take the first (only) predicted class index and use
# its string form as the key into emotions_dict.
pred = predictions.cpu().numpy()
x = str(pred[0])
# NOTE(review): this bare lookup evaluates the label but discards it — in a
# plain script nothing is shown; it only auto-displays in a notebook/REPL.
emotions_dict[x]