bpiyush committed on
Commit
c5f65a4
·
verified ·
1 Parent(s): eafbf97

Upload folder using huggingface_hub

Browse files
sound_of_water/audio_pitch/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ In this folder, we will store the code to train and evaluate models for pitch detection from audio.
sound_of_water/audio_pitch/__pycache__/model.cpython-39.pyc ADDED
Binary file (10.8 kB). View file
 
sound_of_water/audio_pitch/model.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Defines the audio model for pitch estimation."""
2
+ import torch
3
+ import torch.nn as nn
4
+ import einops
5
+
6
+ import math
7
+ import numpy as np
8
+ import einops
9
+ import pytorch_lightning as pl
10
+
11
+ import shared.utils as su
12
+
13
+
14
class TimeEncodingDiscreteSinusoidal(nn.Module):
    """Sinusoidal encoding of timestamps discretised at a fixed frame rate.

    Timestamps (seconds) are mapped to integer frame indices
    ``int(t * rate)``; each index is encoded with interleaved sine
    (even channels) and cosine (odd channels) features and the result
    is multiplied by ``scale_factor``.
    """

    def __init__(self, d, v=10000, rate=49, scale_factor=0.01):
        """
        Args:
            d (int): Dimension of the encoding (even or odd)
            v (float): base of the geometric frequency progression
            rate (int): discretisation rate (frames per second)
                this means that all timestamps falling in the same
                [1/rate] of a second share the same vector
            scale_factor (float): multiplier applied to the encoding
        """
        super().__init__()
        self.d = d
        self.rate = rate
        self.v = v
        self.scale_factor = scale_factor

    def forward(self, t):
        """
        Takes in timestamps t (seconds) and outputs vectors that represent these.

        Args:
            t (torch.tensor): time stamps in seconds, [B, N]

        Returns:
            torch.tensor: encoding of shape [B, N, d] on the device of `t`
        """
        B, N = t.shape

        # Discretise time (the integer cast truncates towards zero)
        i = (t * self.rate).to(int)

        # One frequency per sine channel: ceil(d / 2) entries
        div_term = torch.exp(
            (torch.arange(0, self.d, 2, dtype=torch.float) * -(math.log(self.v) / self.d))
        )
        div_term = div_term.to(t.device)
        angles = i[:, :, None].float() * div_term

        pe = torch.zeros(B, N, self.d, device=t.device)
        pe[:, :, 0::2] = torch.sin(angles)
        # With an odd d there is one cosine channel fewer than sine
        # channels, so drop the last frequency to keep the shapes aligned.
        pe[:, :, 1::2] = torch.cos(angles[:, :, : self.d // 2])

        return pe * self.scale_factor
52
+
53
+
54
class Wav2Vec2WithTimeEncoding(nn.Module):
    """Wav2Vec 2.0 backbone whose CNN features are offset by a time encoding.

    The waveform of every clip goes through the Wav2Vec2 convolutional
    feature extractor; a sinusoidal encoding of each frame's absolute
    timestamp is (optionally) added to those features before the
    feature projection and the transformer encoder.
    """

    def __init__(
        self, model_name="facebook/wav2vec2-base-960h", use_time=True,
        d=512, v=10000, rate=49, scale_factor=0.01, layer_norm=False,
    ):
        """
        Args:
            model_name (str): name passed to `Wav2Vec2Model.from_pretrained`
            use_time (bool): whether to add the time encoding
            d (int): dimension of the time encoding; it is added to the
                CNN features, so it has to match their channel dimension
            v (float): frequency base of the time encoding
            rate (int): discretisation rate of the time encoding
            scale_factor (float): multiplier applied to the time encoding
            layer_norm (bool): apply `nn.LayerNorm(d)` to the features
                before the projection (identity otherwise)
        """
        super().__init__()

        su.log.print_update(
            " [:::] Loading backbone Wav2Vec 2.0 ",
            pos="left",
            fillchar=".",
            color="cyan",
        )

        # Load pre-trained Wav2Vec 2.0 model
        from transformers import Wav2Vec2Model
        self.net = Wav2Vec2Model.from_pretrained(model_name)

        self.d = d
        self.v = v
        self.rate = rate
        self.sr = 16000
        self.use_time = use_time

        if self.use_time:
            self.time_encoding = TimeEncodingDiscreteSinusoidal(
                d=d, v=v, rate=rate, scale_factor=scale_factor,
            )
        else:
            print(" [:::] Not using time encoding.")
            self.time_encoding = None

        # Optional layer norm applied after adding the time encoding
        if layer_norm:
            self.layer_norm = nn.LayerNorm(d)
        else:
            self.layer_norm = nn.Identity()

    def forward(self, x, t):
        """
        Args:
            x (torch.tensor): audio input, [B, NC, C, NS],
                NC: n.o. clips, NS: n.o. samples
            t (torch.tensor): time stamps in seconds, [B, NC, 2],
                start and end times for each clip

        Returns:
            torch.tensor: encoder output rearranged to [B, NC, F, D]
                (F: n.o. frames per clip)
        """
        B, T, C, NS = x.shape
        assert C == 1, "Require a single-channel input."
        assert t.shape[1] == T, \
            "Number of timestamps should match number of clips."
        assert t.shape[0] == B, \
            "Batch size should match."
        assert t.shape[2] == 2, \
            "Timestamps should have start and end times."

        # Fold the clip axis into the batch axis
        x = einops.rearrange(x, "B T 1 NS -> (B T) NS")
        t = einops.rearrange(t, "B T L -> (B T) L")

        # This forward is based on Huggingface's implementation of Wave2Vec2
        # https://github.com/huggingface/transformers/blob/main/src/
        # transformers/models/wav2vec2/modeling_wav2vec2.py

        # Encode through the CNN
        extract_features = self.net.feature_extractor(x)
        extract_features = extract_features.transpose(1, 2)

        if self.use_time:
            # Get a timestamp for each frame within each clip by spreading
            # NF points evenly between the clip's start and end times.
            NF = extract_features.shape[1]
            # `t` is already flattened to [(B T), 2]: iterate over every
            # row, not only the first B, so each clip gets its own times.
            t_dense = torch.stack(
                [torch.linspace(start, end, NF) for start, end in t]
            ).to(extract_features.device)

            # Encode the per-frame timestamps and add them to the features
            t_dense_enc = self.time_encoding(t_dense)
            extract_features = extract_features + t_dense_enc

        # Apply layer norm (identity unless enabled in the constructor)
        extract_features = self.layer_norm(extract_features)

        # Project into the feature space
        hidden_states, extract_features = self.net.feature_projection(
            extract_features
        )

        # Pass through the transformer encoder
        encoder_outputs = self.net.encoder(
            hidden_states,
            attention_mask=None,
            output_attentions=False,
            output_hidden_states=False,
            return_dict=True,
        )
        z = encoder_outputs[0]

        # Restore the separate batch and clip axes
        z = einops.rearrange(z, "(B T) F D -> B T F D", B=B, T=T)

        return z
164
+
165
+
166
def recursive_attr(module, attr):
    """Resolve a dotted attribute path (e.g. ``"a.b.c"``) on `module`.

    Args:
        module: object on which the lookup starts
        attr (str): attribute name, optionally dotted to reach nested
            attributes

    Returns:
        The attribute found at the end of the path.
    """
    target = module
    for name in attr.split("."):
        target = getattr(target, name)
    return target
171
+
172
+
173
class WavelengthWithTime(pl.LightningModule):
    """Lightning module predicting per-frame axial/radial distributions.

    A backbone produces per-frame features; optional intermediate
    layers and one linear head per enabled target ("axial", "radial")
    turn them into a vector over bins, passed through `act`.
    """

    def __init__(
        self,
        backbone,
        feat_dim=768,
        axial=True,
        axial_bins=512,
        radial=True,
        radial_bins=512,
        freeze_backbone=True,
        train_backbone_modules=[10, 11],
        prediction_head_hidden=[],
        act="softmax",
        criterion="kl_div",
        cfg_opt=dict(name="Adam", args=dict(lr=1e-4)),
    ):
        """
        Args:
            backbone (nn.Module): feature extractor exposing
                `net.encoder.layers` and `layer_norm`
            feat_dim (int): dimension of the backbone features
            axial (bool): whether to add the axial head
            axial_bins (int): output size of the axial head
            radial (bool): whether to add the radial head
            radial_bins (int): output size of the radial head
            freeze_backbone (bool): freeze all backbone parameters first
            train_backbone_modules (list): indices of encoder layers
                that are made trainable
            prediction_head_hidden (list): hidden sizes of Linear+ReLU
                layers inserted before the heads
            act (str): "softmax" applies a softmax over the bins; any
                other value leaves the head outputs untouched
            criterion (str): "kl_div" or "ce"
            cfg_opt (dict): optimizer config with keys "name" (attribute
                of `torch.optim`) and "args" (keyword arguments)
        """
        super().__init__()
        su.log.print_update(
            " [:::] Loading model WavelengthWithTime ",
            color="cyan",
            pos="left",
            fillchar=".",
        )

        # By default, freeze the entire backbone
        if freeze_backbone:
            self.freeze(backbone)

        # Unfreeze specific modules. Use a separate local name here:
        # `save_hyperparameters()` below reads the __init__ arguments from
        # the frame locals, so rebinding `train_backbone_modules` would
        # store the encoder layers themselves as a hyperparameter.
        trainable_layers = [
            backbone.net.encoder.layers[int(m)] for m in train_backbone_modules
        ]
        for module in trainable_layers:
            self.unfreeze(module)

        # Make the layer norm in backbone trainable
        print("[>>>] Unfreezing layer norm in backbone")
        for param in backbone.layer_norm.parameters():
            param.requires_grad = True
        su.misc.num_trainable_params(backbone)

        self.backbone = backbone
        self.feat_dim = feat_dim

        # Add some intermediate layers before prediction heads
        if len(prediction_head_hidden) > 0:
            layers = []
            in_dim = feat_dim
            for out_dim in prediction_head_hidden:
                layers.append(nn.Linear(in_dim, out_dim))
                layers.append(nn.ReLU())
                in_dim = out_dim
            self.intermediate_layers = nn.Sequential(*layers)
        else:
            self.intermediate_layers = torch.nn.Identity()
            out_dim = feat_dim
        su.misc.num_trainable_params(self.intermediate_layers)

        assert axial or radial, \
            "At least one of axial or radial heads must be enabled."

        # Define axial head
        self.axial_head = None
        if axial:
            self.axial_head = nn.Linear(out_dim, axial_bins)
            su.misc.num_trainable_params(self.axial_head)

        # Define radial head
        self.radial_head = None
        if radial:
            self.radial_head = nn.Linear(out_dim, radial_bins)
            su.misc.num_trainable_params(self.radial_head)

        self.act = torch.nn.Softmax(dim=-1) if act == "softmax" else torch.nn.Identity()

        # Set criterion
        self.define_criterion(criterion)

        # Define optimization config
        self.cfg_opt = cfg_opt

        # Save hyperparameters
        self.save_hyperparameters(ignore=["backbone"])

    def freeze_backbone(self):
        """Disable gradients for every parameter of `self.backbone`."""
        for param in self.backbone.parameters():
            param.requires_grad = False

    def define_criterion(self, criterion):
        """Set `self.criterion` from its name ("kl_div" or "ce").

        Raises:
            NotImplementedError: for any other name.
        """
        if criterion == "kl_div":
            # NOTE(review): uses KLDivLoss' default reduction; confirm
            # this (rather than "batchmean") is the intended scaling.
            self.criterion = nn.KLDivLoss()
        elif criterion == "ce":
            self.criterion = nn.CrossEntropyLoss()
        else:
            raise NotImplementedError(f"Criterion {criterion} not implemented.")

    def freeze(self, net):
        """Disable gradients for every parameter of `net`."""
        for p in net.parameters():
            p.requires_grad = False

    def unfreeze(self, module):
        """Enable gradients for every parameter of `module`."""
        module_name = type(module).__name__
        print(f"[>>>] Unfreezing {module_name}")
        for p in module.parameters():
            p.requires_grad = True

    def forward(self, x, t):
        """
        Args:
            x (torch.Tensor): [B, T, C, NS], T: n.o. clips
            t (torch.Tensor): [B, T, 2], clip start and end times

        Returns:
            dict: head outputs (after `act`) under the keys "axial"
                and/or "radial", depending on which heads are enabled
        """
        z = self.backbone.forward(x, t)

        # Intermediate layers
        h = self.intermediate_layers(z)

        # Prediction heads
        y_pred = dict()
        if self.axial_head is not None:
            y_pred["axial"] = self.act(self.axial_head(h))
        if self.radial_head is not None:
            y_pred["radial"] = self.act(self.radial_head(h))
        return y_pred

    def compute_loss(self, y_pred: dict, y_true: dict):
        """Compute one loss per predicted key plus their average.

        Args:
            y_pred (dict): predictions, each laid out as [b, t, f, d]
            y_true (dict): targets with (at least) the keys of `y_pred`,
                each laid out as [b, t, d, f]

        Returns:
            dict: per-key losses and their mean under the key "total"
        """
        loss = dict()
        total_loss = 0.
        for key in y_pred:
            yt = y_true[key]
            yt = einops.rearrange(yt, "b t d f -> b t f d")
            yp = y_pred[key]
            if isinstance(self.criterion, nn.KLDivLoss):
                # KLDivLoss expects log-probabilities as input. Clamp
                # before the log so probabilities that underflowed to 0
                # give a finite value instead of -inf (and a NaN loss).
                yp = yp.clamp_min(torch.finfo(yp.dtype).tiny).log()
                loss[key] = self.criterion(yp, yt)
            elif isinstance(self.criterion, nn.CrossEntropyLoss):
                yp = einops.rearrange(yp, "b t f d -> (b t f) d")
                yt = einops.rearrange(yt, "b t f d -> (b t f) d")
                loss[key] = self.criterion(yp, yt)
            else:
                raise NotImplementedError(f"Criterion {self.criterion} not implemented.")
            # For now, using hardcoded loss weights of 1/K where K is number of losses
            total_loss += loss[key] / len(y_pred)
        loss["total"] = total_loss
        return loss

    def step(self, batch, mode, log=True):
        """Run a forward pass on `batch` and return the total loss.

        Args:
            batch (dict): needs the keys "audio_clips", "clips",
                "targets" and "metadata"
            mode (str): tag used in the logged metric name
            log (bool): whether to log the loss
        """
        x = batch["audio_clips"]
        t = batch["clips"]
        # Targets are looked up in the union of targets and metadata
        y_true = {**batch["targets"], **batch["metadata"]}
        y_pred = self.forward(x, t)
        losses = self.compute_loss(y_pred, y_true)
        loss = losses["total"]

        if log:
            self.log(f"batch/{mode}/loss_net", loss, prog_bar=True, sync_dist=True)

        return loss

    def training_step(self, batch, batch_idx):
        return self.step(batch, "train")

    def validation_step(self, batch, batch_idx):
        return self.step(batch, "valid")

    def configure_optimizers(self):
        """Build the optimizer named in `cfg_opt` over all parameters."""
        function = getattr(torch.optim, self.cfg_opt["name"])
        optimizer = function(self.parameters(), **self.cfg_opt["args"])
        return optimizer
353
+
354
+
355
if __name__ == "__main__":
    # Smoke test: run the backbone and the full model on a sample audio
    # clip and save t-SNE plots of the frame features.
    import os

    # Test backbone
    backbone = Wav2Vec2WithTimeEncoding()
    su.misc.num_params(backbone)

    # Test on a real audio clip
    path = "./media_assets/pouring_water_in_a_glass.wav"
    import torchaudio
    waveform, sr = torchaudio.load(path)
    waveform = torchaudio.functional.resample(waveform, sr, 16000)
    sr = 16000
    waveform = waveform.mean(dim=0, keepdim=True)

    # Forward pass an entire audio
    from transformers import Wav2Vec2Processor
    model_name = "facebook/wav2vec2-base-960h"
    processor = Wav2Vec2Processor.from_pretrained(model_name)

    # Segment (in seconds) of the audio that is encoded
    s, e = 8, 22
    x = processor(
        waveform[:, int(s*sr):int(e*sr)], sampling_rate=16000, return_tensors="pt",
    ).input_values.unsqueeze(0)
    t = torch.tensor([[s, e]]).unsqueeze(0)
    z = backbone(x, t)

    # Let's look at the tsne
    z_flat = einops.rearrange(z, "B T F D -> (B T F) D")
    import matplotlib.pyplot as plt
    # Add serif
    plt.rcParams["font.family"] = "serif"

    su.visualize.show_temporal_tsne(z_flat.detach().numpy(), show=False)
    plt.savefig("./media_assets/tsne.png")
    plt.close()

    # Test model
    cfg_model = {
        "name": "WavelengthWithTime",
        "args": {
            "axial": True,
            "axial_bins": 64,
            "radial": True,
            "radial_bins": 64,
            "freeze_backbone": True,
            "train_backbone_modules": [6, 7, 8, 9, 10, 11],
            "act": "softmax",
            "criterion": "kl_div",
        }
    }
    # Look the class up by name instead of eval()-ing the config string
    model_cls = globals()[cfg_model["name"]]
    model = model_cls(backbone=backbone, **cfg_model["args"])
    su.misc.num_trainable_params(model)

    # Load pre-trained checkpoint
    ckpt_dir = "/work/piyush/pretrained_checkpoints/SoundOfWater"
    ckpt_path = os.path.join(
        ckpt_dir,
        "dsr9mf13_ep100_step12423_real_finetuned_with_cosupervision.pth",
    )
    assert os.path.exists(ckpt_path), \
        f"Checkpoint not found at {ckpt_path}."
    print("Loading checkpoint from: ", ckpt_path)
    ckpt = torch.load(ckpt_path, map_location="cpu")
    msg = model.load_state_dict(ckpt)
    print(msg)

    # Check forward pass
    x_random = torch.randn(2, 1, 1, 16000)
    t_random = torch.tensor([[[0, 1]], [[2, 3]]])
    y_pred = model(x_random, t_random)
    print("Input: ", x_random.shape)
    for key in y_pred:
        print(key, y_pred[key].shape)

    # Plot features with the trained backbone and save as tsne_trained.png
    z = model.backbone(x, t)
    z_flat = einops.rearrange(z, "B T F D -> (B T F) D")
    su.visualize.show_temporal_tsne(z_flat.detach().numpy(), show=False)
    plt.savefig("./media_assets/tsne_trained.png")
    plt.close()
sound_of_water/cosupervision/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ In this folder, we store the code for co-supervising the audio pitch detection network
2
+ with the visual height detection network.
sound_of_water/data/__pycache__/audio_loader.cpython-39.pyc ADDED
Binary file (12.8 kB). View file
 
sound_of_water/data/__pycache__/audio_transforms.cpython-39.pyc ADDED
Binary file (5.45 kB). View file
 
sound_of_water/data/__pycache__/csv_loader.cpython-39.pyc ADDED
Binary file (3.32 kB). View file
 
sound_of_water/data/audio_loader.py ADDED
@@ -0,0 +1,646 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """Audio loading utils."""
3
+ import os
4
+ import numpy as np
5
+ import torch
6
+ import torchaudio
7
+ import decord
8
+ import librosa
9
+ import einops
10
+ import PIL
11
+ import matplotlib.pyplot as plt
12
+ # Add serif font
13
+ plt.rcParams['font.family'] = 'serif'
14
+ from PIL import Image, ImageOps
15
+ import librosa.display
16
+
17
+ import shared.utils as su
18
+
19
+
20
def read_info(path):
    """
    Probes the given media file and returns its stream descriptions.

    Args:
        path (str): path to the audio file

    Returns:
        dict: keys "video" and "audio", each holding the first stream
            of that codec type reported by `ffmpeg.probe` (None if the
            file has no such stream)
    """
    import ffmpeg

    streams = ffmpeg.probe(path)['streams']

    def first_stream(codec_type):
        # First stream of the requested type, or None when absent
        for stream in streams:
            if stream['codec_type'] == codec_type:
                return stream
        return None

    audio_info = first_stream('audio')
    video_info = first_stream('video')
    return dict(video=video_info, audio=audio_info)
38
+
39
+
40
def load_audio_clips(
    audio_path,
    clips,
    sr,
    clip_len,
    backend='decord',
    load_entire=False,
    cut_to_clip_len=True,
):
    """
    Loads audio clips from the given audio file.

    Args:
        audio_path (str): path to the audio file
        clips (np.ndarray): sized [T, 2], where T is the number of clips
            and each row is a pair of start and end times of the clip
            (ignored when load_entire=True)
        sr (int): sample rate
        clip_len (float): length of the audio clip in seconds; clips are
            zero-padded at the end up to this length. If None, clips are
            returned as loaded (no padding, no cutting).
        backend (str): backend to use for loading audio clips
            ('torchaudio' or 'decord')
        load_entire (bool): whether to load the entire audio file
        cut_to_clip_len (bool): whether to cut the audio clip to clip_len

    Returns:
        list: torch tensors, one per clip (a single element holding the
            whole audio when load_entire=True)

    Raises:
        ValueError: if `backend` is not supported.
    """

    if backend == 'torchaudio':
        audio_info = read_info(audio_path)["audio"]
        true_sr = int(audio_info["sample_rate"])
        true_nf = audio_info["duration_ts"]
        audio_duration = true_nf / true_sr
        # Built once and shared by all clips
        resampler = torchaudio.transforms.Resample(true_sr, sr)
    elif backend == "decord":
        ar = decord.AudioReader(audio_path, sample_rate=sr, mono=True)
        # Mono=False gives NaNs in inputs.
        # This (https://gist.github.com/nateraw/fcc2bdb9c8738224957c8617c3360445) might
        # be a related issue. Ignoring for now. Need to use torchaudio for now.
        true_nf = ar.shape[1]
        audio_duration = ar.shape[1] / sr
    else:
        raise ValueError(f"Unknown backend: {backend}")

    if load_entire:
        # Load the entire audio as a single clip and return
        if backend == 'torchaudio':
            y, _ = torchaudio.load(audio_path)
            if y.shape[0] > 1:
                # Convert to a single channel
                y = y.mean(dim=0, keepdim=True)
            audio = resampler(y)
        else:
            audio = ar.get_batch(np.arange(true_nf)).asnumpy()
            audio = torch.from_numpy(audio)
        return [audio]

    # Clip the clips to avoid going out of bounds
    clips = np.clip(clips, 0, audio_duration)

    # Target n.o. samples per clip (None disables padding/cutting)
    nf_reqd = None if clip_len is None else int(clip_len * sr)

    audio_clips = []
    for st, et in clips:

        if backend == 'torchaudio':
            # Load audio within the given time range
            sf = max(int(true_sr * st), 0)
            ef = min(int(true_sr * et), true_nf)
            nf = ef - sf
            y, _ = torchaudio.load(audio_path, frame_offset=sf, num_frames=nf)

            # Stereo to mono
            if y.shape[0] > 1:
                y = y.mean(dim=0, keepdim=True)

            # Resample to the given sample rate
            audio = resampler(y)

        else:
            # decord: load audio within the given time range.
            # No need to convert to mono since we are using mono=True
            # No need to resample since we are using sample_rate=sr
            sf = max(int(st * sr), 0)
            ef = min(int(et * sr), true_nf)
            audio = ar.get_batch(np.arange(sf, ef)).asnumpy()
            audio = torch.from_numpy(audio)

        if nf_reqd is not None:
            nf_curr = audio.shape[1]
            if nf_curr < nf_reqd:
                # Pad the end of the clip with zeros up to clip_len
                audio = torch.nn.functional.pad(audio, (0, nf_reqd - nf_curr))
            elif (nf_curr > nf_reqd) and cut_to_clip_len:
                audio = audio[:, :nf_reqd]

        audio_clips.append(audio)
    return audio_clips
150
+
151
+
152
def show_audio_clips_waveform(
    audio_clips, clips, title=None, show=True, figsize=(10, 2),
):
    """
    Visualizes the given audio clips.

    Args:
        audio_clips (list): list of audio clips
        clips (np.ndarray): sized [T, 2], start and end time of each clip
        title (str): title of the plot (optional)
        show (bool): whether to show the clips; if False, the figure is
            saved to 'audio_clips_waveform.png' instead
        figsize (tuple): figure size
    """
    clip_centers = (clips[:, 0] + clips[:, 1]) / 2

    fig, ax = plt.subplots(1, len(audio_clips), figsize=figsize)
    if len(audio_clips) == 1:
        ax = [ax]
    for i, audio in enumerate(audio_clips):
        # The samples of a clip span its start..end interval (not
        # centre +/- the full duration, which is twice as wide).
        timestamps = np.linspace(clips[i, 0], clips[i, 1], audio.shape[-1])
        ax[i].plot(timestamps, audio.squeeze().numpy(), alpha=0.5)
        ax[i].set_title(f'$t=$ {clip_centers[i]:.2f}')
        ax[i].grid(alpha=0.4)
    if title is not None:
        fig.suptitle(title)
    plt.tight_layout()
    if show:
        plt.show()
    else:
        plt.savefig('audio_clips_waveform.png')
185
+
186
+
187
+ # TODO: preprocess audio clips (e.g., wav-to-spectrogram, etc.)
188
+ # Note that this is different from transforms applied as augmentation
189
+ # during training. This is more like a preprocessing step that is applied
190
+ # to the entire audio before sampling the clips.
191
+ import torchaudio.functional as TAF
192
+ import torchaudio.transforms as TAT
193
+
194
+
195
def load_audio(path, sr=16000, **kwargs):
    """Load an audio file with torchaudio as a mono waveform at rate `sr`.

    Args:
        path (str): path to the audio file
        sr (int): target sample rate
        **kwargs: forwarded to `torchaudio.load`

    Returns:
        tuple: (waveform with the channels averaged into one, sr)
    """
    waveform, native_sr = torchaudio.load(path, **kwargs)
    mono = waveform.mean(dim=0, keepdim=True)
    to_target_rate = torchaudio.transforms.Resample(native_sr, sr)
    return to_target_rate(mono), sr
201
+
202
+
203
def load_audio_librosa(path, sr=16000, **kwargs):
    """Load an audio file with librosa at rate `sr` as a torch tensor.

    Args:
        path (str): path to the audio file
        sr (int): target sample rate (passed to `librosa.load`)
        **kwargs: forwarded to `librosa.load`

    Returns:
        tuple: (samples as a tensor with a new leading axis, sr)
    """
    samples, _ = librosa.load(path, sr=sr, **kwargs)
    return torch.from_numpy(samples).unsqueeze(0), sr
207
+
208
+
209
def librosa_harmonic_spectrogram_db(
    y, sr=16000, n_fft=512, hop_length=256, margin=16., n_mels=64,
):
    """Compute a dB spectrogram of the harmonic component of `y`.

    Args:
        y (torch.Tensor or np.ndarray): waveform; a 2D input is averaged
            over its first axis
        sr (int): sample rate (used for the mel filterbank)
        n_fft (int): number of FFT points
        hop_length (int): hop length
        margin (float): margin for harmonic-percussive source separation
        n_mels (int): number of mel bands; if None, a dB spectrogram
            referenced to its maximum is returned instead

    Returns:
        np.ndarray: the (mel-scaled) spectrogram
    """
    if isinstance(y, torch.Tensor):
        y = y.numpy()
    if len(y.shape) == 2:
        y = y.mean(axis=0)

    # center=True outputs 1 more frame than center=False
    # Currently, using just center=False
    stft = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, center=False)

    # Keep only the harmonic part of the decomposition
    harmonic, _ = librosa.decompose.hpss(stft, margin=margin)
    magnitude = np.sqrt(2) * np.abs(harmonic)

    if n_mels is not None:
        # Mel-scaled dB spectrogram
        # NOTE(review): the mel filterbank is applied to dB values here;
        # confirm this ordering is intended.
        in_db = librosa.amplitude_to_db(magnitude)
        return librosa.feature.melspectrogram(S=in_db, n_mels=n_mels, sr=sr)

    # Usual dB spectrogram
    return librosa.amplitude_to_db(magnitude, ref=np.max)
229
+
230
+
231
def show_logmelspectrogram(
    S,
    sr,
    n_fft=512,
    hop_length=256,
    figsize=(10, 3),
    ax=None,
    show=True,
    title="LogMelSpectrogram",
    xlabel="Time (s)",
    ylabel="Mel bins (Hz)",
    return_as_image=False,
):
    """Plot a spectrogram on a mel frequency axis.

    Args:
        S (np.ndarray): spectrogram to display
        sr (int): sample rate
        n_fft (int): number of FFT points
        hop_length (int): hop length
        figsize (tuple): figure size (used only when `ax` is None)
        ax (matplotlib axes): axes to draw on; created if None
        show (bool): whether to call `plt.show()`
        title (str): axes title
        xlabel (str): x-axis label
        ylabel (str): y-axis label
        return_as_image (bool): if True, render the figure, close it and
            return it as a PIL image instead of showing it

    Returns:
        PIL.Image.Image if `return_as_image` is True, otherwise None.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize)
    else:
        # Needed below when rendering into an image with a caller's axes
        fig = ax.figure
    librosa.display.specshow(
        S,
        sr=sr,
        hop_length=hop_length,
        n_fft=n_fft,
        y_axis='mel',
        x_axis='time',
        ax=ax,
        auto_aspect=True,
    )
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    if return_as_image:
        fig.canvas.draw()
        image = PIL.Image.frombytes(
            'RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb(),
        )
        plt.close(fig)
        return image

    if show:
        plt.show()
270
+
271
+
272
def show_logspectrogram(
    S, sr, n_fft=512, hop_length=256, figsize=(10, 3), ax=None, show=True,
):
    """Plot a spectrogram on a linear frequency axis.

    Args:
        S (np.ndarray): spectrogram to display
        sr (int): sample rate
        n_fft (int): number of FFT points
        hop_length (int): hop length
        figsize (tuple): figure size (used only when `ax` is None)
        ax (matplotlib axes): axes to draw on; created if None
        show (bool): whether to call `plt.show()`
    """
    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=figsize)

    display_args = dict(
        sr=sr,
        hop_length=hop_length,
        n_fft=n_fft,
        y_axis='linear',
        x_axis='time',
        ax=ax,
    )
    librosa.display.specshow(S, **display_args)
    ax.set_title("LogSpectrogram")

    if show:
        plt.show()
289
+
290
+
291
def audio_clips_wav_to_spec(
    audio_clips, n_fft=512, hop_length=256, margin=16., n_mels=None,
):
    """
    Turns every audio clip into a harmonic spectrogram tensor.

    Args:
        audio_clips (list): list of audio clips
        n_fft (int): number of FFT points
        hop_length (int): hop length
        margin (float): margin for harmonic-percussive source separation
        n_mels (int): number of mel bands (optional, if None, then dB spectrogram is returned)

    Returns:
        list: one tensor per clip, with a new leading axis added to the
            spectrogram
    """
    spec_args = dict(
        n_fft=n_fft,
        hop_length=hop_length,
        margin=margin,
        n_mels=n_mels,
    )
    return [
        torch.from_numpy(
            librosa_harmonic_spectrogram_db(clip, **spec_args)
        ).unsqueeze(0)
        for clip in audio_clips
    ]
316
+
317
+
318
def show_audio_clips_spec(
    audio_specs,
    clips,
    sr,
    n_fft=512,
    hop_length=256,
    margin=16.,
    cmap='magma',
    n_mels=None,
    show=True,
):
    """
    Plots the spectrogram of each audio clip side by side.

    Args:
        audio_specs (list): list of audio spectrograms
        clips (np.ndarray): sized [T, 2], where T is the number of clips
            and each row is a pair of start and end times of the clip
        sr (int): sample rate
        n_fft (int): number of FFT points
        hop_length (int): hop length
        margin (float): not used by this function
        cmap (str): colormap
        n_mels (int): if None, a linear frequency axis is used,
            otherwise a mel axis
        show (bool): whether to show the clips; if False, the figure is
            saved to 'audio_clips_spec.png' instead
    """
    clip_centers = (clips[:, 0] + clips[:, 1]) / 2

    _, axes = plt.subplots(1, len(audio_specs), figsize=(10, 4))
    if len(audio_specs) == 1:
        axes = [axes]

    y_axis = "linear" if n_mels is None else "mel"
    for idx, spec in enumerate(audio_specs):
        clip_start = clips[idx][0]
        if isinstance(spec, torch.Tensor):
            spec = spec.numpy()
        if len(spec.shape) == 3:
            # Drop the leading axis
            spec = spec[0]

        panel = axes[idx]
        librosa.display.specshow(
            data=spec,
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            ax=panel,
            x_axis="time",
            cmap=cmap,
            y_axis=y_axis,
        )

        # Relabel the time ticks so they are offset by the clip start
        shifted_ticks = panel.get_xticks() + clip_start
        panel.set_xticklabels([f'{x:.1f}' for x in shifted_ticks])
        panel.set_title(f'$t=$ {clip_centers[idx]:.2f}')

    plt.tight_layout()
    if show:
        plt.show()
    else:
        plt.savefig('audio_clips_spec.png')
375
+
376
+
377
def basic_pipeline_audio_clips(
    audio_clips,
    spec_args=None,
    audio_transform=None,
    stack=True,
):
    """Apply the waveform -> (spectrogram) -> transform pipeline to clips.

    Args:
        audio_clips (list): list of audio clips
        spec_args (dict): keyword arguments for `audio_clips_wav_to_spec`;
            if None, clips are kept as waveforms
        audio_transform (dict): optional callables under the keys 'wave'
            (applied to the list of clips first) and 'spec' (applied
            last); None means no transforms
        stack (bool): whether to stack the clips into a single tensor

    Returns:
        torch.Tensor if `stack` is True, otherwise the list of clips.
    """
    # The default (None) means "no transforms"
    if audio_transform is None:
        audio_transform = {}

    wave_transform = audio_transform.get('wave', None)
    spec_transform = audio_transform.get('spec', None)

    # Apply transforms to raw waveforms
    if wave_transform is not None:
        audio_clips = wave_transform(audio_clips)

    if spec_args is not None:
        # Convert waveforms to spectrograms
        audio_clips = audio_clips_wav_to_spec(audio_clips, **spec_args)

    # Apply transforms to spectrograms
    if spec_transform is not None:
        audio_clips = spec_transform(audio_clips)

    if stack:
        audio_clips = torch.stack(audio_clips)

    return audio_clips
403
+
404
+
405
def load_and_process_audio(
    audio_path,
    clips,
    cut_to_clip_len=True,
    load_entire=False,
    audio_transform=None,
    aload_args=dict(),
    apipe_args=dict(),
):
    """Loads audio clips from a file and runs them through the pipeline.

    Args:
        audio_path (str): path to the audio file
        clips (np.ndarray): start and end times of the clips
        cut_to_clip_len (bool): forwarded to `load_audio_clips`
        load_entire (bool): forwarded to `load_audio_clips`
        audio_transform: forwarded to `basic_pipeline_audio_clips`
        aload_args (dict): extra keyword arguments for `load_audio_clips`
        apipe_args (dict): extra keyword arguments for
            `basic_pipeline_audio_clips`

    Returns:
        The output of `basic_pipeline_audio_clips`.
    """

    # Step 1: load the raw audio clips
    raw_clips = load_audio_clips(
        audio_path=audio_path,
        clips=clips,
        load_entire=load_entire,
        cut_to_clip_len=cut_to_clip_len,
        **aload_args,
    )

    # Step 2: pipeline [Preprocessing -> Transform]
    return basic_pipeline_audio_clips(
        audio_clips=raw_clips,
        audio_transform=audio_transform,
        **apipe_args,
    )
433
+
434
+
435
def crop_height(image, height):
    """Centre-crops the image vertically to the desired height.

    Raises:
        ValueError: if the image is shorter than `height`.
    """
    width, curr_height = image.size
    excess = curr_height - height
    if excess < 0:
        raise ValueError(f"Height of the image is less than {height}")
    # Drop (roughly) half of the surplus rows from the top
    top = excess // 2
    box = (0, top, width, top + height)
    return image.crop(box)
443
+
444
+
445
def pad_to_height(image, height):
    """Adds black strips above and below the image to reach `height`.

    Raises:
        ValueError: if the image is already taller than `height`.
    """
    _, curr_height = image.size
    missing = height - curr_height
    if missing < 0:
        raise ValueError(f"Height of the image is already greater than {height}")
    # Any odd leftover row goes to the bottom strip
    top = missing // 2
    bottom = missing - top
    border = (0, top, 0, bottom)
    return ImageOps.expand(image, border, fill="black")
453
+
454
+
455
def crop_width(image, width):
    """Centre-crops the image horizontally to the desired width.

    Raises:
        ValueError: if the image is narrower than `width`.
    """
    curr_width, height = image.size
    excess = curr_width - width
    if excess < 0:
        raise ValueError(f"Width of the image is less than {width}")
    # Drop (roughly) half of the surplus columns from the left
    left = excess // 2
    box = (left, 0, left + width, height)
    return image.crop(box)
463
+
464
+
465
def crop_or_pad_height(image, height):
    """Brings the image to the desired height by cropping or padding.

    The image is returned unchanged when it already has that height.
    """
    _, curr_height = image.size
    if curr_height == height:
        return image
    if curr_height < height:
        return pad_to_height(image, height)
    return crop_height(image, height)
473
+
474
+
475
def crop_or_pad_width(image, width):
    """Brings the image to the desired width by cropping or padding.

    The image is returned unchanged when it already has that width.
    """
    curr_width, _ = image.size
    if curr_width == width:
        return image
    if curr_width < width:
        return pad_to_width(image, width)
    return crop_width(image, width)
483
+
484
+
485
def pad_to_width(image, width):
    """Adds black strips left and right of the image to reach `width`.

    Raises:
        ValueError: if the image is already wider than `width`.
    """
    curr_width, _ = image.size
    missing = width - curr_width
    if missing < 0:
        raise ValueError(f"Width of the image is already greater than {width}")
    # Any odd leftover column goes to the right strip
    left = missing // 2
    right = missing - left
    border = (left, 0, right, 0)
    return ImageOps.expand(image, border, fill="black")
493
+
494
+
495
def crop_or_pad_to_size(image, size=(270, 480)):
    """Crops or pads the image to `size`, given as (width, height).

    The height is adjusted first, then the width.
    """
    target_width, target_height = size[0], size[1]
    resized = crop_or_pad_height(image, target_height)
    return crop_or_pad_width(resized, target_width)
500
+
501
+
502
if __name__ == "__main__":
    # Demo: load a sample video's audio, run the model input pipeline
    # on it and save a frame + log-mel-spectrogram visualisation.
    import sound_of_water.data.audio_transforms as at

    # Testing on a sample file
    file_path = "media_assets/ayNzH0uygFw_9.0_21.0.mp4"
    assert os.path.exists(file_path), f"File not found: {file_path}"

    # Define audio transforms
    cfg_transform = {
        "audio": {
            "wave": [
                {
                    "name": "AddNoise",
                    "args": {
                        "noise_level": 0.001
                    },
                    "augmentation": True,
                },
                {
                    "name": "ChangeVolume",
                    "args": {
                        "volume_factor": [0.8, 1.2]
                    },
                    "augmentation": True,
                },
                {
                    "name": "Wav2Vec2WaveformProcessor",
                    "args": {
                        "model_name": "facebook/wav2vec2-base-960h",
                        "sr": 16000
                    }
                }
            ],
            "spec": None,
        }
    }
    audio_transform = at.define_audio_transforms(
        cfg_transform, augment=False,
    )

    # Define audio load arguments
    aload_args = {
        "sr": 16000,
        "clip_len": None,
        "backend": "decord",
    }

    # Define audio pipeline arguments
    apipe_args = {
        "spec_args": None,
        "stack": True,
    }

    # Run the pipeline (this is used to pass to the model)
    audio = load_and_process_audio(
        audio_path=file_path,
        clips=None,
        load_entire=True,
        cut_to_clip_len=False,
        audio_transform=audio_transform,
        aload_args=aload_args,
        apipe_args=apipe_args,
    )[0]

    # This will be used to visualise
    visualise_args = {
        "sr": 16000,
        "n_fft": 400,
        "hop_length": 320,
        "n_mels": 64,
        "margin": 16.,
        "C": 340 * 100.,
        "audio_output_fps": 49.,
    }
    y = load_audio_clips(
        audio_path=file_path,
        clips=None,
        load_entire=True,
        cut_to_clip_len=False,
        **aload_args,
    )[0]
    S = librosa_harmonic_spectrogram_db(
        y,
        sr=visualise_args["sr"],
        n_fft=visualise_args["n_fft"],
        hop_length=visualise_args["hop_length"],
        n_mels=visualise_args['n_mels'],
    )

    # Load the first video frame and bring it to the default size
    vr = decord.VideoReader(file_path, num_threads=1)
    frame = PIL.Image.fromarray(vr[0].asnumpy())
    frame = crop_or_pad_to_size(frame)

    # Visualise
    fig, axes = plt.subplots(
        1, 2, figsize=(13, 4), width_ratios=[0.25, 0.75],
    )
    ax = axes[0]
    ax.imshow(frame, aspect="auto")
    ax.set_title("Example frame")
    ax.set_xticks([])
    ax.set_yticks([])
    ax = axes[1]
    show_logmelspectrogram(
        S=S,
        ax=ax,
        show=False,
        sr=visualise_args["sr"],
        n_fft=visualise_args["n_fft"],
        hop_length=visualise_args["hop_length"],
    )
    plt.savefig("./media_assets/audio_visualisation.png", bbox_inches="tight")
    plt.close()
sound_of_water/data/audio_transforms.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Audio transforms."""
2
+ import torchaudio
3
+ import torchvision
4
+ from torchvision.transforms import Compose, ToTensor
5
+ import torchaudio.transforms as T
6
+ import imgaug.augmenters as iaa
7
+ import numpy as np
8
+ import torch
9
+
10
+
11
class AddNoise(object):
    """Waveform augmentation that adds scaled standard-normal noise.

    Args:
        noise_level (float): multiplier applied to the sampled noise
            before it is added to the waveform.
    """

    def __init__(self, noise_level=0.1):
        self.noise_level = noise_level

    def __call__(self, waveform):
        # `randn_like` samples noise with the same shape as the input.
        perturbation = self.noise_level * torch.randn_like(waveform)
        return waveform + perturbation

    def __repr__(self):
        return f"{self.__class__.__name__}(noise_level={self.noise_level})"
22
+
23
+
24
class ChangeVolume(object):
    """Change the volume of the waveform.

    Each call multiplies the waveform by a gain drawn uniformly from
    ``[low, high]``.

    Args:
        volume_factor (sequence of 2 floats): ``(low, high)`` bounds of
            the gain range.
    """

    def __init__(self, volume_factor=(0.6, 1.2)):
        # The default is an immutable tuple and the value is copied into a
        # fresh list, so neither the default nor a caller-owned sequence is
        # shared between instances (the original mutable list default was).
        self.volume_factor = list(volume_factor)

    def __call__(self, waveform):
        gain = np.random.uniform(*self.volume_factor)
        return waveform * gain

    def __repr__(self):
        return f"{self.__class__.__name__}(volume_factor={self.volume_factor})"
34
+
35
+
36
def configure_transforms(cfg):
    """
    Given a transform config (List[dict]), return a Compose object that
    applies the transforms in order.

    Args:
        cfg (List[dict]): one entry per transform. Each entry needs a
            "name" (an expression naming a callable visible in this module,
            e.g. "AddNoise" or "T.TimeMasking") and may carry "args", a dict
            of keyword arguments. A missing or None "args" means the
            transform is built with no arguments.

    Returns:
        Compose: the instantiated transforms chained in config order.
    """
    transform = []
    for a in cfg:
        # NOTE(security): `eval` executes whatever string the config holds.
        # Only use this with trusted, project-controlled configs.
        factory = eval(a["name"])
        kwargs = a["args"] if "args" in a else None
        if kwargs is None:
            kwargs = {}
        transform.append(factory(**kwargs))
    return Compose(transform)
45
+
46
+
47
class AudioClipsTransform:
    def __init__(self, audio_transform):
        """Wraps a per-clip audio transform so it can be applied to a list of clips."""
        self.audio_transform = audio_transform

    def __call__(self, audio_clips):
        """
        Args:
            audio_clips (list): list of audio clips, each tensor [1, M]
                where M is number of samples in each clip

        Returns:
            list: the transformed clips, in the same order as the input.
        """
        return list(map(self.audio_transform, audio_clips))

    def __repr__(self):
        # Delegate to the wrapped transform so printing shows the pipeline.
        wrapped = self.audio_transform
        return wrapped.__repr__()
67
+
68
+
69
class NumpyToTensor:
    """Converts a NumPy array into a float (float32) torch tensor."""

    def __call__(self, x):
        tensor = torch.from_numpy(x)
        return tensor.float()

    def __repr__(self):
        return f"{self.__class__.__name__}()"
74
+
75
+
76
+ # TODO: Might have to introduce normalisation
77
+ # to have a consistent pipeline.
78
+
79
+
80
class Wav2Vec2WaveformProcessor:
    """Runs a waveform through a pretrained Hugging Face Wav2Vec2Processor.

    Args:
        model_name (str): identifier passed to
            `Wav2Vec2Processor.from_pretrained`.
        sr (int): sampling rate reported to the processor on every call.
    """

    def __init__(self, model_name="facebook/wav2vec2-base-960h", sr=16000):
        # Imported lazily so `transformers` is only needed when this
        # transform is actually used.
        from transformers import Wav2Vec2Processor
        self.model_name = model_name
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.sr = sr

    def __call__(self, x):
        processed = self.processor(
            x, sampling_rate=self.sr, return_tensors="pt",
        )
        return processed.input_values

    def __repr__(self):
        # Matches the other transforms in this file so that a printed
        # pipeline shows the configuration rather than a memory address.
        return (
            self.__class__.__name__
            + f"(model_name={self.model_name!r}, sr={self.sr})"
        )
91
+
92
+
93
def define_audio_transforms(cfg_transform, augment=False):
    """Builds the audio transforms described by a transform config.

    Args:
        cfg_transform (dict): config whose ["audio"]["wave"] entry is a
            list of transform specs. A spec carrying an "augmentation" key
            is treated as an augmentation.
        augment (bool): if True, augmentation specs whose "augmentation"
            value is truthy are included; otherwise every spec that has an
            "augmentation" key is dropped.

    Returns:
        dict: {"wave": AudioClipsTransform} applying the selected waveform
            transforms. Only waveform-level transforms are configured here.
    """

    def _is_selected(spec):
        # Specs without the marker are always applied.
        if "augmentation" not in spec:
            return True
        return bool(augment and spec["augmentation"])

    selected_specs = [
        spec for spec in cfg_transform["audio"]["wave"] if _is_selected(spec)
    ]
    per_clip_transform = configure_transforms(selected_specs)
    return {"wave": AudioClipsTransform(per_clip_transform)}
123
+
124
+
125
if __name__ == "__main__":
    # Testing it out
    # Manual demo of `configure_transforms` on three kinds of config.
    # It writes two PNG files into the current working directory.

    # Raw waveform transform
    cfg = [
        {
            "name": "AddNoise",
            "args": {"noise_level": 0.1},
        },
        {
            "name": "ChangeVolume",
            "args": {"volume_factor": [0.6, 1.2]},
        },
    ]
    transform = configure_transforms(cfg)

    # Random [1, 16000] tensor used as a stand-in waveform.
    x = torch.randn([1, 16000])
    z = transform(x)
    print(x.shape, z.shape)

    # Plot input (top) and transformed output (bottom).
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(2, 1, figsize=(8, 4))
    ax[0].plot(x[0].numpy())
    ax[1].plot(z[0].numpy())
    plt.savefig("waveform_transform.png")

    # Wav2Vec2 transform
    # Building this transform calls `Wav2Vec2Processor.from_pretrained`.
    cfg = [
        {
            "name": "Wav2Vec2WaveformProcessor",
            "args": {"model_name": "facebook/wav2vec2-base-960h", "sr": 16000},
        },
    ]
    transform = configure_transforms(cfg)
    x = torch.randn([4, 16000])
    z = transform(x)
    print(x.shape, z.shape)


    # Spectrogram transform
    # The "T." prefix refers to `torchaudio.transforms`, imported as `T`.
    cfg = [
        {
            "name": "T.FrequencyMasking",
            "args": {"freq_mask_param": 8},
        },
        {
            "name": "T.TimeMasking",
            "args": {"time_mask_param": 16},
        },
    ]
    transform = configure_transforms(cfg)
    # Random [1, 64, 251] tensor used as a stand-in spectrogram.
    x = torch.randn([1, 64, 251])
    z = transform(x)
    print(x.shape, z.shape)

    # Show input (top) and masked output (bottom) as images.
    fig, ax = plt.subplots(2, 1, figsize=(8, 4))
    ax[0].imshow(x[0].numpy())
    ax[1].imshow(z[0].numpy())
    plt.savefig("spectrogram_transform.png")
sound_of_water/data/csv_loader.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utils to load CSV file of audio datasets."""
2
+ import os
3
+
4
+ import pandas as pd
5
+ import shared.utils as su
6
+
7
+
8
def configure_paths_sound_of_water(
    data_root="/work/piyush/from_nfs2/datasets/SoundOfWater",
):
    """Returns the directory layout of the dataset as a dict of paths.

    Args:
        data_root (str): root directory of the dataset.

    Returns:
        dict: "data_dir" (the root itself) plus "video_clip_dir",
            "audio_clip_dir", "annot_dir" and "split_dir", each joined
            onto the root. Video and audio clips share the "videos" folder.
    """
    subfolders = (
        ("video_clip_dir", "videos"),
        ("audio_clip_dir", "videos"),
        ("annot_dir", "annotations"),
        ("split_dir", "splits"),
    )
    paths = {"data_dir": data_root}
    for key, folder in subfolders:
        paths[key] = os.path.join(data_root, folder)
    return paths
19
+
20
+
21
def load_csv_sound_of_water(
    paths: dict,
    csv_filters=None,
    csv_name="localisation.csv",
    ds_name="SoundOfWater",
    split=None,
    check_first_frame_annots=True,
):
    """Loads CSV containing metadata of the dataset.

    Args:
        paths (dict): must contain "video_clip_dir", "audio_clip_dir" and
            "annot_dir"; "split_dir" is also required when `split` is used.
        csv_filters (dict, optional): filters handed to
            `su.pd_utils.apply_filters`. The dict is copied, so neither the
            caller's dict nor a shared default is modified.
        csv_name (str): CSV file name inside the annotation directory.
        ds_name (str): dataset name used in the log message.
        split (str, optional): name of a split file inside the split
            directory. Its item ids become an "item_id" filter unless
            `csv_filters` already has one.
        check_first_frame_annots (bool): if True, add "box_path" and
            "mask_path" columns and keep only rows whose files exist.

    Returns:
        pd.DataFrame: metadata rows whose clip files (and, optionally,
            annotation files) exist on disk, after filtering.
    """
    # Work on a private copy: the split logic below writes an "item_id"
    # entry, which previously leaked into the shared default dict (and into
    # the caller's dict), so a later call with another split reused it.
    csv_filters = dict(csv_filters) if csv_filters else {}

    su.log.print_update(
        f" [:::] Loading {ds_name}.",
        pos="left",
        fillchar=".",
    )

    # Configure paths
    video_clip_dir = paths["video_clip_dir"]
    audio_clip_dir = paths["audio_clip_dir"]

    # Load main CSV
    path = os.path.join(
        paths["annot_dir"], csv_name,
    )
    assert os.path.exists(path), \
        f"CSV file not found at {path}."
    print(" [:::] CSV path:", path)
    df = pd.read_csv(path)

    # Load side information: containers
    container_path = os.path.join(
        paths['annot_dir'], "containers.yaml",
    )
    assert os.path.exists(container_path)
    containers = su.io.load_yml(container_path)

    # Update CSV with container information (optional)
    update_with_container_info = True
    if update_with_container_info:
        rows = []
        for row in df.iterrows():
            row = row[1].to_dict()
            # Merge the container's metadata into this row.
            row.update(containers[row["container_id"]])
            rows.append(row)
        df = pd.DataFrame(rows)
    print(" [:::] Shape of CSV: ", df.shape)

    # 1. Update item_id
    df["item_id"] = df.apply(
        lambda d: f"{d['video_id']}_{d['start_time']:.1f}_{d['end_time']:.1f}",
        axis=1,
    )

    # 2. Update video_clip_path
    df["video_clip_path"] = df["item_id"].apply(
        lambda d: os.path.join(
            video_clip_dir, f"{d}.mp4"
        )
    )
    df = df[df["video_clip_path"].apply(os.path.exists)]
    print(" [:::] Shape of CSV with available video: ", df.shape)

    # 3. Update audio_clip_path
    df["audio_clip_path"] = df["item_id"].apply(
        lambda d: os.path.join(
            audio_clip_dir, f"{d}.mp4"
        )
    )
    df = df[df["audio_clip_path"].apply(os.path.exists)]
    print(" [:::] Shape of CSV with available audio: ", df.shape)

    # Add first frame annotation paths
    if check_first_frame_annots:
        frame_annot_dir = os.path.join(paths["annot_dir"], "container_bboxes")
        df["box_path"] = df["video_id"].apply(
            lambda d: os.path.join(frame_annot_dir, f"{d}_box.npy"),
        )
        df["mask_path"] = df["video_id"].apply(
            lambda d: os.path.join(frame_annot_dir, f"{d}_mask.npy"),
        )
        df = df[df["box_path"].apply(os.path.exists)]
        df = df[df["mask_path"].apply(os.path.exists)]
        print(" [:::] Shape of CSV with first frame annotations: ", df.shape)

    # Add split filter
    if split is not None and ("item_id" not in csv_filters):
        assert "split_dir" in paths
        split_path = os.path.join(paths["split_dir"], f"{split}")
        assert os.path.exists(split_path), \
            f"Split file not found at {split_path}."
        item_ids = su.io.load_txt(split_path)
        print(" [:::] Number of item_ids in split:", len(item_ids))
        csv_filters["item_id"] = item_ids

    # Apply filter to the CSV
    if len(csv_filters) > 0:
        df = su.pd_utils.apply_filters(df, csv_filters)
        print(" [:::] Shape of CSV after filtering: ", df.shape)

    return df
131
+
132
+
133
if __name__ == "__main__":
    # Smoke test: load the metadata using the default dataset root and
    # print the first row as a dict.
    paths = configure_paths_sound_of_water()
    df = load_csv_sound_of_water(paths)
    row = df.iloc[0].to_dict()
    su.log.json_print(row)
sound_of_water/data/video_loader.py ADDED
File without changes
sound_of_water/data/video_transforms.py ADDED
File without changes
sound_of_water/video_height/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ In this folder, we will store the code to train and evaluate models for liquid height detection from video.