santiviquez committed on
Commit
29457c0
·
1 Parent(s): 3d8b507

add everything

Browse files
Files changed (2) hide show
  1. app.py +113 -0
  2. cnn.py +67 -0
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from huggingface_hub import hf_hub_url, cached_download
3
+ import torch
4
+ import torchaudio.transforms as transforms
5
+ from miniaudio import SampleFormat, decode
6
+ from librosa.util import fix_length
7
+ import numpy as np
8
+ from audio_recorder_streamlit import audio_recorder
9
+
10
+
11
# Page header: title, spacer, and a one-line description of the model.
st.markdown("## Noisy Human")
st.markdown("")
st.markdown(
    "Non-speech human sounds classification. This model can identify with up to 78% accuracy the following 10 classes"
)

# The ten supported classes, rendered as two side-by-side bullet lists.
# The bullets are joined explicitly so the Markdown never depends on how the
# source lines happen to be indented.
LEFT_COLUMN_CLASSES = (
    "* Clapping 👏",
    "* Footsteps 🦶",
    "* Brushing Teeth 🪥",
    "* Drinking Sipping 🧃",
    "* Laughing 😂",
)
RIGHT_COLUMN_CLASSES = (
    "* Breathing 🌬️",
    "* Crying Baby 😭",
    "* Coughing 🤧",
    "* Snoring 😴",
    "* Sneezing 🤧",
)

with st.container():
    # Create the columns inside the container so the container actually
    # groups them instead of being left empty.
    col1, col2 = st.columns(2)

    with col1:
        st.markdown("\n".join(LEFT_COLUMN_CLASSES))

    with col2:
        st.markdown("\n".join(RIGHT_COLUMN_CLASSES))
42
+
43
+ # from audio_recorder_streamlit import audio_recorder
44
+
45
+ from cnn import CNN
46
+
47
# Hugging Face Hub repository and file that hold the trained CNN weights.
REPO_ID = "santiviquez/noisy_human_cnn"
FILENAME = "CNN_MelSpec_Deltas_fold_4_.pth"
# Sample rate (Hz) used when decoding audio and computing the mel spectrogram.
RATE = 22050


@st.cache
def download_model():
    """Download the model checkpoint from the Hugging Face Hub and load it on CPU.

    The download goes through ``cached_download``, and the whole function is
    wrapped in ``st.cache`` so the result is reused instead of recomputed.

    Returns:
        The object stored in the checkpoint file, as returned by
        ``torch.load`` with every tensor mapped to the CPU.
    """
    weights_url = hf_hub_url(REPO_ID, FILENAME)
    weights_path = cached_download(weights_url)
    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code. Only point REPO_ID/FILENAME at a repository you trust.
    # NOTE(review): newer huggingface_hub releases favour hf_hub_download over
    # cached_download/hf_hub_url - confirm against the pinned version.
    return torch.load(weights_path, map_location=torch.device("cpu"))
58
+
59
+
60
# Build the network, load the trained weights and switch to inference mode
# (disables dropout and uses the stored batch-norm statistics).
model_weights = download_model()
model = CNN(input_channels=2)
model.load_state_dict(model_weights)
model.eval()

# Two ways to provide audio: upload a file or record from the microphone.
# Each widget gets its own variable; previously the recorder result
# overwrote the uploader result, so uploaded files were silently ignored.
uploaded_file = st.file_uploader(
    "Choose an audio (.wav) file", accept_multiple_files=False
)
st.caption("OR")
recorded_bytes = audio_recorder()

# A recording takes priority (this matches the previous behaviour whenever a
# recording exists); otherwise fall back to the uploaded file's raw bytes.
if recorded_bytes:
    audio_bytes = recorded_bytes
    # NOTE(review): audio_recorder is documented to return WAV data - confirm.
    audio_format = "audio/wav"
elif uploaded_file is not None:
    audio_bytes = uploaded_file.getvalue()
    audio_format = uploaded_file.type or "audio/wav"
else:
    audio_bytes = None
    audio_format = None

if audio_bytes:
    st.audio(audio_bytes, format=audio_format)

    # Decode to mono at RATE Hz.
    # NOTE(review): samples stay as raw signed 32-bit integers (not scaled to
    # [-1, 1]); presumably this matches the training pipeline - confirm.
    decoded_audio = decode(
        audio_bytes, nchannels=1, sample_rate=RATE, output_format=SampleFormat.SIGNED32
    )

    # Pad or trim to exactly five seconds so the spectrogram has a fixed width.
    # `size` is passed by keyword because recent librosa makes it keyword-only.
    waveform = np.array(decoded_audio.samples)
    waveform = fix_length(waveform, size=5 * RATE)
    waveform = torch.as_tensor(waveform, dtype=torch.float32)

    # Features: 60-band mel spectrogram plus its deltas, stacked as two
    # channels and given a leading batch dimension -> (1, 2, n_mels, frames).
    x_mel = transforms.MelSpectrogram(sample_rate=RATE, n_fft=1024, n_mels=60)(waveform)
    x_deltas = transforms.ComputeDeltas()(x_mel)
    x = torch.stack((x_mel, x_deltas)).unsqueeze(0)

    # No gradients are needed for prediction. The arg-max of the logits is
    # the same index the previous log-softmax + max produced.
    with torch.no_grad():
        y_pred = model(x)
    y_pred_tags = y_pred.argmax(dim=1)

    category_map = {
        0: "Clapping 👏",
        1: "Footsteps 🦶",
        2: "Brushing Teeth 🪥",
        3: "Drinking Sipping 🧃",
        4: "Laughing 😂",
        5: "Breathing 🌬️",
        6: "Crying Baby 😭",
        7: "Coughing 🤧",
        8: "Snoring 😴",
        9: "Sneezing 🤧",
    }

    st.write("**Predicted class:**", category_map[y_pred_tags.item()])
105
+
106
# Three empty text elements act as vertical spacing above the credits.
for _ in range(3):
    st.text("")

# Credits footer. Continuation lines start at column 0 so the Markdown is
# not affected by source indentation.
st.markdown(
    """`Created by` [Santiago Viquez](https://twitter.com/santiviquez)
and [Ivan Padezhki](https://github.com/ivanpadezhki)
| `Code:` [GitHub](https://github.com/santiviquez/noisy-human-recognition)"""
)
cnn.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch.nn.functional as F
3
+
4
+
5
class CNN(nn.Module):
    """Convolutional classifier: three conv stages followed by a dense head.

    Each conv stage is ``Conv2d -> ReLU -> BatchNorm2d -> MaxPool2d``; the
    first and third stages are followed by dropout. The feature map is then
    globally average-pooled to a 128-dimensional vector and passed through
    three fully connected layers.

    Submodule names and their registration order are part of the checkpoint
    format (``state_dict`` keys), so they must not be renamed or reordered.

    Args:
        input_channels: Number of channels in the input tensor.
        num_classes: Size of the output layer. Defaults to 10.
    """

    def __init__(self, input_channels, num_classes=10):
        super().__init__()
        self.input_channels = input_channels

        # Stage 1
        self.conv1 = nn.Conv2d(self.input_channels, 32, kernel_size=(3, 3))
        self.batchnorm1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(kernel_size=(3, 3))
        self.dropout1 = nn.Dropout(0.3)

        # Stage 2 (pools along the last axis only)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3))
        self.batchnorm2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(kernel_size=(1, 3))

        # Stage 3
        self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 3))
        self.batchnorm3 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(kernel_size=2)
        self.dropout2 = nn.Dropout(0.3)

        # Global pooling makes the dense head independent of the spatial size.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # Dense head
        self.fc1 = nn.Linear(128, 256)
        self.fc2 = nn.Linear(256, 512)
        self.dropout3 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(512, num_classes)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise one submodule; called for every submodule via ``apply``.

        * ``Conv2d``: Xavier-normal weights, zero bias.
        * ``BatchNorm2d``: weight 1, bias 0.
        * ``Linear``: weights uniform in ``[-1/sqrt(in_features),
          1/sqrt(in_features)]``, zero bias.

        Other module types are left untouched.
        """
        if isinstance(module, nn.Conv2d):
            nn.init.xavier_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.BatchNorm2d):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Linear):
            bound = 1.0 / module.in_features ** 0.5
            nn.init.uniform_(module.weight, -bound, bound)
            if module.bias is not None:
                nn.init.zeros_(module.bias)

    def forward(self, x):
        """Compute class scores for a batch.

        Args:
            x: Tensor whose first dimension is the batch and whose second
                dimension has ``input_channels`` channels, followed by two
                spatial dimensions.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with raw scores; no
            softmax is applied here.
        """
        # Stage 1: conv -> ReLU -> batch norm -> pool -> dropout
        x = self.batchnorm1(F.relu(self.conv1(x)))
        x = self.dropout1(self.pool1(x))

        # Stage 2: conv -> ReLU -> batch norm -> pool
        x = self.batchnorm2(F.relu(self.conv2(x)))
        x = self.pool2(x)

        # Stage 3: conv -> ReLU -> batch norm -> pool -> dropout
        x = self.batchnorm3(F.relu(self.conv3(x)))
        x = self.dropout2(self.pool3(x))

        # Collapse the spatial dimensions and flatten to (batch, 128).
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)

        # Dense head; the same dropout module is applied after each hidden layer.
        x = self.dropout3(F.relu(self.fc1(x)))
        x = self.dropout3(F.relu(self.fc2(x)))
        return self.fc3(x)