Spaces:
Running
on
Zero
Running
on
Zero
asigalov61
committed on
Upload 6 files
Browse files- config.py +7 -0
- inference.py +171 -0
- models.py +353 -0
- piano_vad.py +130 -0
- pytorch_utils.py +66 -0
- utilities.py +564 -0
config.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Audio settings
sample_rate = 16000
frames_per_second = 100

# Piano range
begin_note = 21     # MIDI note of A0, the lowest note of a piano.
classes_num = 88    # Number of notes of piano

# Segmentation
segment_seconds = 10.0  # Training segment duration
hop_seconds = 1.0       # presumably the hop between training segments, in seconds -- TODO confirm

# presumably the divisor used to normalize MIDI velocities -- TODO confirm
velocity_scale = 128
|
inference.py
ADDED
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import time
|
4 |
+
import librosa
|
5 |
+
from pathlib import Path
|
6 |
+
|
7 |
+
import torch
|
8 |
+
|
9 |
+
from .utilities import (create_folder, get_filename, RegressionPostProcessor,
|
10 |
+
write_events_to_midi)
|
11 |
+
from .models import Regress_onset_offset_frame_velocity_CRNN, Note_pedal
|
12 |
+
from .pytorch_utils import move_data_to_device, forward
|
13 |
+
from . import config
|
14 |
+
|
15 |
+
|
16 |
+
class PianoTranscription(object):
    """Transcribe piano solo recordings into note and pedal events."""

    def __init__(self, model_type='Note_pedal', checkpoint_path=None,
        segment_samples=16000*10, device=torch.device('cuda')):
        """Build the model, download the checkpoint if needed, load weights.

        Args:
          model_type: str, 'Note_pedal' or
            'Regress_onset_offset_frame_velocity_CRNN'
          checkpoint_path: str | None. When empty, a default location under
            the user's home directory is used.
          segment_samples: int, number of samples per inference segment
          device: 'cuda' | 'cpu'

        Raises:
          ValueError: if model_type is not one of the supported names.
          subprocess.CalledProcessError: if the checkpoint download fails.
        """
        # Imported locally so the block does not rely on a new file-level
        # import.
        import subprocess

        if not checkpoint_path:
            checkpoint_path = '{}/piano_transcription_inference_data/note_F1=0.9677_pedal_F1=0.9186.pth'.format(str(Path.home()))
        print('Checkpoint path: {}'.format(checkpoint_path))

        # A missing or truncated file (< ~160 MB) triggers a (re-)download.
        if not os.path.exists(checkpoint_path) or os.path.getsize(checkpoint_path) < 1.6e8:
            checkpoint_dir = os.path.dirname(checkpoint_path)
            if checkpoint_dir:
                # dirname is '' for a bare file name; nothing to create then.
                create_folder(checkpoint_dir)
            print('Total size: ~165 MB')
            zenodo_path = 'https://zenodo.org/record/4034264/files/CRNN_note_F1%3D0.9677_pedal_F1%3D0.9186.pth?download=1'
            # Argument list instead of a shell string: the path is caller
            # supplied and must not be interpreted by a shell.
            subprocess.run(['wget', '-O', checkpoint_path, zenodo_path],
                check=True)

        print('Using {} for inference.'.format(device))

        self.segment_samples = segment_samples
        self.frames_per_second = config.frames_per_second
        self.classes_num = config.classes_num
        self.onset_threshold = 0.3
        # NOTE: attribute name is misspelled ("threshod") but kept because it
        # is publicly visible on instances.
        self.offset_threshod = 0.3
        self.frame_threshold = 0.1
        self.pedal_offset_threshold = 0.2

        # Build model. Explicit lookup instead of eval() on a caller string.
        model_classes = {
            'Regress_onset_offset_frame_velocity_CRNN':
                Regress_onset_offset_frame_velocity_CRNN,
            'Note_pedal': Note_pedal}
        if model_type not in model_classes:
            raise ValueError('Unknown model_type: {!r}. Expected one of {}.'
                .format(model_type, sorted(model_classes.keys())))
        Model = model_classes[model_type]
        self.model = Model(frames_per_second=self.frames_per_second,
            classes_num=self.classes_num)

        # Load model
        checkpoint = torch.load(checkpoint_path, map_location=device)
        self.model.load_state_dict(checkpoint['model'], strict=False)

        # Parallel
        if 'cuda' in str(device):
            self.model.to(device)
            print('GPU number: {}'.format(torch.cuda.device_count()))
            self.model = torch.nn.DataParallel(self.model)
        else:
            print('Using CPU.')

    def transcribe(self, audio, midi_path):
        """Transcribe an audio recording.

        Args:
          audio: (audio_samples,)
          midi_path: str, path to write out the transcribed MIDI. Nothing is
            written when it is empty / None.

        Returns:
          transcribed_dict, dict: {'output_dict':, ..., 'est_note_events': ...,
            'est_pedal_events': ...}

        Raises:
          ValueError: if audio contains no samples.
        """
        audio = audio[None, :]  # (1, audio_samples)

        audio_len = audio.shape[1]
        if audio_len == 0:
            # Without this, enframe() fails on an empty list with an opaque
            # numpy error.
            raise ValueError('audio is empty; nothing to transcribe.')

        # Pad audio to be evenly divided by segment_samples
        pad_len = int(np.ceil(audio_len / self.segment_samples))\
            * self.segment_samples - audio_len

        audio = np.concatenate((audio, np.zeros((1, pad_len))), axis=1)

        # Enframe to segments, (N, segment_samples)
        segments = self.enframe(audio, self.segment_samples)

        # Forward, {'reg_onset_output': (N, segment_frames, classes_num), ...}
        output_dict = forward(self.model, segments, batch_size=1)

        # Deframe: every entry becomes (audio_frames, classes_num).
        # NOTE(review): audio_len counts samples while the deframed arrays are
        # indexed by frames, so for typical inputs this slice leaves the
        # arrays unchanged -- confirm whether a frame count was intended.
        for key in output_dict.keys():
            output_dict[key] = self.deframe(output_dict[key])[0 : audio_len]

        # Post processor
        post_processor = RegressionPostProcessor(self.frames_per_second,
            classes_num=self.classes_num, onset_threshold=self.onset_threshold,
            offset_threshold=self.offset_threshod,
            frame_threshold=self.frame_threshold,
            pedal_offset_threshold=self.pedal_offset_threshold)

        # Post process output_dict to MIDI events
        (est_note_events, est_pedal_events) = \
            post_processor.output_dict_to_midi_events(output_dict)

        # Write MIDI events to file
        if midi_path:
            write_events_to_midi(start_time=0, note_events=est_note_events,
                pedal_events=est_pedal_events, midi_path=midi_path)
            print('Write out to {}'.format(midi_path))

        transcribed_dict = {
            'output_dict': output_dict,
            'est_note_events': est_note_events,
            'est_pedal_events': est_pedal_events}

        return transcribed_dict

    def enframe(self, x, segment_samples):
        """Enframe long sequence to short segments with 50% overlap.

        Args:
          x: (1, audio_samples)
          segment_samples: int

        Returns:
          batch: (N, segment_samples)
        """
        assert x.shape[1] % segment_samples == 0
        batch = []

        pointer = 0
        while pointer + segment_samples <= x.shape[1]:
            batch.append(x[:, pointer : pointer + segment_samples])
            # Advance by half a segment so consecutive segments overlap.
            pointer += segment_samples // 2

        batch = np.concatenate(batch, axis=0)
        return batch

    def deframe(self, x):
        """Deframe predicted segments to original sequence.

        The first segment contributes its first three quarters, the last
        segment its last three quarters, and every segment in between its
        middle half, so the overlapping halves are stitched without
        duplication.

        Args:
          x: (N, segment_frames, classes_num)

        Returns:
          y: (audio_frames, classes_num)
        """
        if x.shape[0] == 1:
            return x[0]

        # Remove an extra frame in the end of each segment caused by the
        # 'center=True' argument when calculating spectrogram.
        x = x[:, 0 : -1, :]
        (N, segment_frames, classes_num) = x.shape
        assert segment_frames % 4 == 0

        # Exact because segment_frames is a multiple of 4.
        quarter = segment_frames // 4
        three_quarters = quarter * 3

        y = [x[0, 0 : three_quarters]]
        for i in range(1, N - 1):
            y.append(x[i, quarter : three_quarters])
        y.append(x[-1, quarter :])
        y = np.concatenate(y, axis=0)
        return y
|
models.py
ADDED
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import math
|
4 |
+
import time
|
5 |
+
import numpy as np
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
|
8 |
+
import torch
|
9 |
+
import torch.nn as nn
|
10 |
+
import torch.nn.functional as F
|
11 |
+
|
12 |
+
from torchlibrosa.stft import Spectrogram, LogmelFilterBank
|
13 |
+
from .pytorch_utils import move_data_to_device
|
14 |
+
|
15 |
+
|
16 |
+
def init_layer(layer):
    """Initialize a Linear or Convolutional layer.

    Weights are drawn with Xavier-uniform initialization; the bias, when the
    layer has one, is set to zero.
    """
    nn.init.xavier_uniform_(layer.weight)

    bias = getattr(layer, 'bias', None)
    if bias is not None:
        bias.data.fill_(0.)
|
23 |
+
|
24 |
+
|
25 |
+
def init_bn(bn):
    """Initialize a Batchnorm layer: scale set to one, shift set to zero."""
    bn.weight.data.fill_(1.)
    bn.bias.data.fill_(0.)
|
29 |
+
|
30 |
+
|
31 |
+
def init_gru(rnn):
    """Initialize a GRU layer.

    For every layer (and for both directions when the GRU is bidirectional):
      * input-to-hidden weights: the three gate blocks are each drawn
        uniformly in +/- sqrt(3 / fan_in);
      * hidden-to-hidden weights: the first two gate blocks are uniform as
        above, the third block is orthogonal;
      * all biases are set to zero.

    Args:
      rnn: a torch.nn.GRU-like module exposing `num_layers` and
        `weight_ih_l{k}` / `weight_hh_l{k}` / `bias_ih_l{k}` / `bias_hh_l{k}`
        (plus the `_reverse` variants when bidirectional).
    """

    def _concat_init(tensor, init_funcs):
        # Split the stacked gate matrix into equal row blocks and initialize
        # each block with its own function.
        (length, _) = tensor.shape
        block_rows = length // len(init_funcs)

        for (i, init_func) in enumerate(init_funcs):
            init_func(tensor[i * block_rows : (i + 1) * block_rows, :])

    def _inner_uniform(tensor):
        # Relies on a private torch helper, kept from the original code.
        fan_in = nn.init._calculate_correct_fan(tensor, 'fan_in')
        nn.init.uniform_(tensor, -math.sqrt(3 / fan_in), math.sqrt(3 / fan_in))

    # The backward direction of a bidirectional GRU stores its parameters
    # under the same names with a '_reverse' suffix; they must be initialized
    # as well, otherwise they silently keep PyTorch's default initialization.
    suffixes = ['']
    if getattr(rnn, 'bidirectional', False):
        suffixes.append('_reverse')

    for i in range(rnn.num_layers):
        for suffix in suffixes:
            _concat_init(
                getattr(rnn, 'weight_ih_l{}{}'.format(i, suffix)),
                [_inner_uniform, _inner_uniform, _inner_uniform]
            )
            torch.nn.init.constant_(
                getattr(rnn, 'bias_ih_l{}{}'.format(i, suffix)), 0)

            _concat_init(
                getattr(rnn, 'weight_hh_l{}{}'.format(i, suffix)),
                [_inner_uniform, _inner_uniform, nn.init.orthogonal_]
            )
            torch.nn.init.constant_(
                getattr(rnn, 'bias_hh_l{}{}'.format(i, suffix)), 0)
|
57 |
+
|
58 |
+
|
59 |
+
class ConvBlock(nn.Module):
    """Two 3x3 conv + batch-norm + ReLU stages followed by optional pooling."""

    def __init__(self, in_channels, out_channels, momentum):
        """
        Args:
          in_channels: int, channels of the input feature map
          out_channels: int, channels produced by both convolutions
          momentum: float, forwarded to the batch-norm layers (see note below)
        """
        super(ConvBlock, self).__init__()

        conv_kwargs = dict(kernel_size=(3, 3), stride=(1, 1),
            padding=(1, 1), bias=False)

        self.conv1 = nn.Conv2d(in_channels=in_channels,
            out_channels=out_channels, **conv_kwargs)

        self.conv2 = nn.Conv2d(in_channels=out_channels,
            out_channels=out_channels, **conv_kwargs)

        # NOTE: the second positional argument of nn.BatchNorm2d is `eps`, so
        # `momentum` lands in the eps slot here. Left untouched on purpose:
        # changing it would alter the numerics of existing checkpoints.
        self.bn1 = nn.BatchNorm2d(out_channels, momentum)
        self.bn2 = nn.BatchNorm2d(out_channels, momentum)

        self.init_weight()

    def init_weight(self):
        """Initialize both convolutions, then both batch-norm layers."""
        for conv in (self.conv1, self.conv2):
            init_layer(conv)
        for bn in (self.bn1, self.bn2):
            init_bn(bn)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """
        Args:
          input: (batch_size, in_channels, time_steps, freq_bins)
          pool_size: kernel size handed to average pooling
          pool_type: only 'avg' triggers pooling; any other value returns the
            un-pooled activations

        Outputs:
          output: (batch_size, out_channels, time_steps, freq_bins), with the
            last two axes reduced by pool_size when pool_type is 'avg'
        """
        hidden = self.bn1(self.conv1(input))
        hidden = F.relu_(hidden)
        hidden = self.bn2(self.conv2(hidden))
        hidden = F.relu_(hidden)

        if pool_type != 'avg':
            return hidden

        return F.avg_pool2d(hidden, kernel_size=pool_size)
|
101 |
+
|
102 |
+
|
103 |
+
class AcousticModelCRnn8Dropout(nn.Module):
    """Four conv blocks, a dense layer, a 2-layer bidirectional GRU and a
    sigmoid output layer, with dropout between the stages."""

    def __init__(self, classes_num, midfeat, momentum):
        """
        Args:
          classes_num: int, size of the per-frame output
          midfeat: int, flattened (channels * freq_bins) size that reaches
            fc5 after the four conv blocks
          momentum: float, forwarded to the conv blocks and to bn5
        """
        super(AcousticModelCRnn8Dropout, self).__init__()

        # Submodules are created in the same order as before so parameter
        # registration and random initialization stay unchanged.
        self.conv_block1 = ConvBlock(in_channels=1, out_channels=48, momentum=momentum)
        self.conv_block2 = ConvBlock(in_channels=48, out_channels=64, momentum=momentum)
        self.conv_block3 = ConvBlock(in_channels=64, out_channels=96, momentum=momentum)
        self.conv_block4 = ConvBlock(in_channels=96, out_channels=128, momentum=momentum)

        self.fc5 = nn.Linear(midfeat, 768, bias=False)
        self.bn5 = nn.BatchNorm1d(768, momentum=momentum)

        self.gru = nn.GRU(input_size=768, hidden_size=256, num_layers=2,
            bias=True, batch_first=True, dropout=0., bidirectional=True)

        # 512 = 2 directions * 256 hidden units.
        self.fc = nn.Linear(512, classes_num, bias=True)

        self.init_weight()

    def init_weight(self):
        """Initialize the layers owned directly by this module."""
        init_layer(self.fc5)
        init_bn(self.bn5)
        init_gru(self.gru)
        init_layer(self.fc)

    def forward(self, input):
        """
        Args:
          input: (batch_size, channels_num, time_steps, freq_bins)

        Outputs:
          output: (batch_size, time_steps, classes_num)
        """
        conv_blocks = (self.conv_block1, self.conv_block2,
            self.conv_block3, self.conv_block4)

        # Each block halves the frequency axis only; time is preserved.
        x = input
        for conv_block in conv_blocks:
            x = conv_block(x, pool_size=(1, 2), pool_type='avg')
            x = F.dropout(x, p=0.2, training=self.training)

        # (batch, channels, time, freq) -> (batch, time, channels * freq)
        x = x.transpose(1, 2).flatten(2)

        # BatchNorm1d normalizes over axis 1, hence the transposes around it.
        x = self.fc5(x).transpose(1, 2)
        x = self.bn5(x).transpose(1, 2)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training, inplace=True)

        (x, _) = self.gru(x)
        x = F.dropout(x, p=0.5, training=self.training, inplace=False)

        return torch.sigmoid(self.fc(x))
|
153 |
+
|
154 |
+
|
155 |
+
class Regress_onset_offset_frame_velocity_CRNN(nn.Module):
    """Note model: predicts onset / offset regression, frame activation and
    velocity for every frame and note class from a raw waveform."""

    def __init__(self, frames_per_second, classes_num):
        """
        Args:
          frames_per_second: int, determines the STFT hop size
            (16000 // frames_per_second samples)
          classes_num: int, number of note classes predicted per frame
        """
        super(Regress_onset_offset_frame_velocity_CRNN, self).__init__()

        sample_rate = 16000
        window_size = 2048
        hop_size = sample_rate // frames_per_second
        mel_bins = 229
        fmin = 30
        fmax = sample_rate // 2

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        midfeat = 1792
        momentum = 0.01

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size,
            hop_length=hop_size, win_length=window_size, window=window,
            center=center, pad_mode=pad_mode, freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate,
            n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref,
            amin=amin, top_db=top_db, freeze_parameters=True)

        # NOTE: the second positional argument of nn.BatchNorm2d is `eps`, so
        # `momentum` lands in the eps slot here. Left untouched on purpose:
        # changing it would alter the numerics of existing checkpoints.
        self.bn0 = nn.BatchNorm2d(mel_bins, momentum)

        self.frame_model = AcousticModelCRnn8Dropout(classes_num, midfeat, momentum)
        self.reg_onset_model = AcousticModelCRnn8Dropout(classes_num, midfeat, momentum)
        self.reg_offset_model = AcousticModelCRnn8Dropout(classes_num, midfeat, momentum)
        self.velocity_model = AcousticModelCRnn8Dropout(classes_num, midfeat, momentum)

        # The GRU inputs are concatenations of per-class outputs, so their
        # width follows classes_num (previously hard-coded as 88 * 2 / 88 * 3,
        # which is identical for classes_num == 88).
        self.reg_onset_gru = nn.GRU(input_size=classes_num * 2, hidden_size=256, num_layers=1,
            bias=True, batch_first=True, dropout=0., bidirectional=True)
        self.reg_onset_fc = nn.Linear(512, classes_num, bias=True)

        self.frame_gru = nn.GRU(input_size=classes_num * 3, hidden_size=256, num_layers=1,
            bias=True, batch_first=True, dropout=0., bidirectional=True)
        self.frame_fc = nn.Linear(512, classes_num, bias=True)

        self.init_weight()

    def init_weight(self):
        """Initialize the layers owned directly by this module."""
        init_bn(self.bn0)
        init_gru(self.reg_onset_gru)
        init_gru(self.frame_gru)
        init_layer(self.reg_onset_fc)
        init_layer(self.frame_fc)

    def forward(self, input):
        """
        Args:
          input: (batch_size, data_length)

        Outputs:
          output_dict: dict, {
            'reg_onset_output': (batch_size, time_steps, classes_num),
            'reg_offset_output': (batch_size, time_steps, classes_num),
            'frame_output': (batch_size, time_steps, classes_num),
            'velocity_output': (batch_size, time_steps, classes_num)
          }
        """

        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)

        # Move the mel axis to the channel position so bn0 normalizes per
        # mel bin, then move it back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        frame_output = self.frame_model(x)  # (batch_size, time_steps, classes_num)
        reg_onset_output = self.reg_onset_model(x)  # (batch_size, time_steps, classes_num)
        reg_offset_output = self.reg_offset_model(x)    # (batch_size, time_steps, classes_num)
        velocity_output = self.velocity_model(x)    # (batch_size, time_steps, classes_num)

        # Use velocities to condition onset regression. The velocity branch is
        # detached so this path does not back-propagate into it.
        x = torch.cat((reg_onset_output, (reg_onset_output ** 0.5) * velocity_output.detach()), dim=2)
        (x, _) = self.reg_onset_gru(x)
        x = F.dropout(x, p=0.5, training=self.training, inplace=False)
        reg_onset_output = torch.sigmoid(self.reg_onset_fc(x))  # (batch_size, time_steps, classes_num)

        # Use onsets and offsets to condition frame-wise classification
        x = torch.cat((frame_output, reg_onset_output.detach(), reg_offset_output.detach()), dim=2)
        (x, _) = self.frame_gru(x)
        x = F.dropout(x, p=0.5, training=self.training, inplace=False)
        frame_output = torch.sigmoid(self.frame_fc(x))  # (batch_size, time_steps, classes_num)

        output_dict = {
            'reg_onset_output': reg_onset_output,
            'reg_offset_output': reg_offset_output,
            'frame_output': frame_output,
            'velocity_output': velocity_output}

        return output_dict
|
256 |
+
|
257 |
+
|
258 |
+
class Regress_pedal_CRNN(nn.Module):
    """Pedal model: predicts pedal onset / offset regression and pedal frame
    activation for every frame from a raw waveform."""

    def __init__(self, frames_per_second, classes_num):
        """
        Args:
          frames_per_second: int, determines the STFT hop size
            (16000 // frames_per_second samples)
          classes_num: int, accepted for signature parity with the note
            model; not used here (each sub-model has a single output class)
        """
        super(Regress_pedal_CRNN, self).__init__()

        sample_rate = 16000
        window_size = 2048
        mel_bins = 229
        midfeat = 1792
        momentum = 0.01

        stft_settings = dict(
            n_fft=window_size,
            hop_length=sample_rate // frames_per_second,
            win_length=window_size,
            window='hann',
            center=True,
            pad_mode='reflect',
            freeze_parameters=True)

        logmel_settings = dict(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=30,
            fmax=sample_rate // 2,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True)

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(**stft_settings)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(**logmel_settings)

        # NOTE: the second positional argument of nn.BatchNorm2d is `eps`, so
        # `momentum` lands in the eps slot here. Left untouched on purpose:
        # changing it would alter the numerics of existing checkpoints.
        self.bn0 = nn.BatchNorm2d(mel_bins, momentum)

        self.reg_pedal_onset_model = AcousticModelCRnn8Dropout(1, midfeat, momentum)
        self.reg_pedal_offset_model = AcousticModelCRnn8Dropout(1, midfeat, momentum)
        self.reg_pedal_frame_model = AcousticModelCRnn8Dropout(1, midfeat, momentum)

        self.init_weight()

    def init_weight(self):
        """Initialize the only layer owned directly by this module."""
        init_bn(self.bn0)

    def forward(self, input):
        """
        Args:
          input: (batch_size, data_length)

        Outputs:
          output_dict: dict, {
            'reg_pedal_onset_output': (batch_size, time_steps, 1),
            'reg_pedal_offset_output': (batch_size, time_steps, 1),
            'pedal_frame_output': (batch_size, time_steps, 1)
          }
        """
        spectrogram = self.spectrogram_extractor(input)  # (batch_size, 1, time_steps, freq_bins)
        logmel = self.logmel_extractor(spectrogram)  # (batch_size, 1, time_steps, mel_bins)

        # bn0 normalizes per mel bin, so the mel axis is swapped into the
        # channel position and back again.
        features = self.bn0(logmel.transpose(1, 3)).transpose(1, 3)

        # Dict literals evaluate in order: onset, offset, then frame model.
        return {
            'reg_pedal_onset_output': self.reg_pedal_onset_model(features),
            'reg_pedal_offset_output': self.reg_pedal_offset_model(features),
            'pedal_frame_output': self.reg_pedal_frame_model(features)}
|
330 |
+
|
331 |
+
|
332 |
+
# This model is not trained, but is combined from the trained note and pedal models.
|
333 |
+
class Note_pedal(nn.Module):
    """Wrapper that runs the note model and the pedal model on the same input
    and merges their outputs."""

    def __init__(self, frames_per_second, classes_num):
        """The combination of note and pedal model.

        Args:
          frames_per_second: int, forwarded to both sub-models
          classes_num: int, forwarded to both sub-models
        """
        super(Note_pedal, self).__init__()

        self.note_model = Regress_onset_offset_frame_velocity_CRNN(frames_per_second, classes_num)
        self.pedal_model = Regress_pedal_CRNN(frames_per_second, classes_num)

    def load_state_dict(self, m, strict=False):
        """Load the two sub-models from a combined checkpoint.

        Args:
          m: dict with keys 'note_model' and 'pedal_model', each holding the
            state dict of the corresponding sub-model
          strict: bool, forwarded to both sub-model loads

        Note: unlike nn.Module.load_state_dict, nothing is returned.
        """
        for (attribute, key) in (('note_model', 'note_model'),
                ('pedal_model', 'pedal_model')):
            getattr(self, attribute).load_state_dict(m[key], strict=strict)

    def forward(self, input):
        """Return the union of the note and pedal output dicts.

        The note model runs first; pedal entries are merged in afterwards.
        """
        merged = dict(self.note_model(input))
        merged.update(self.pedal_model(input))
        return merged
|
piano_vad.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
|
3 |
+
|
4 |
+
def note_detection_with_onset_offset_regress(frame_output, onset_output,
    onset_shift_output, offset_output, offset_shift_output, velocity_output,
    frame_threshold):
    """Process prediction matrices to note events information.
    First, detect onsets with onset outputs. Then, detect offsets
    with frame and offset outputs.

    Args:
      frame_output: (frames_num,)
      onset_output: (frames_num,)
      onset_shift_output: (frames_num,)
      offset_output: (frames_num,)
      offset_shift_output: (frames_num,)
      velocity_output: (frames_num,)
      frame_threshold: float

    Returns:
      output_tuples: list of [bgn, fin, onset_shift, offset_shift, normalized_velocity],
      e.g., [
        [1821, 1909, 0.47498, 0.3048533, 0.72119445],
        [1909, 1947, 0.30730522, -0.45764327, 0.64200014],
        ...]
    """
    output_tuples = []
    bgn = None
    frame_disappear = None
    offset_occur = None

    frames_num = onset_output.shape[0]

    # All state checks use `is None` rather than truthiness: frame index 0 is
    # a valid onset position and must not be mistaken for "no onset".
    for i in range(frames_num):
        if onset_output[i] == 1:
            # Onset detected
            if bgn is not None:
                # Consecutive onsets. E.g., pedal is not released, but two
                # consecutive notes being played.
                fin = max(i - 1, 0)
                output_tuples.append([bgn, fin, onset_shift_output[bgn],
                    0, velocity_output[bgn]])
                frame_disappear, offset_occur = None, None
            bgn = i

        if bgn is not None and i > bgn:
            # If onset found, then search offset
            if frame_output[i] <= frame_threshold and frame_disappear is None:
                # Frame disappear detected
                frame_disappear = i

            if offset_output[i] == 1 and offset_occur is None:
                # Offset detected
                offset_occur = i

            if frame_disappear is not None:
                if offset_occur is not None and \
                    offset_occur - bgn > frame_disappear - offset_occur:
                    # bgn --------- offset_occur --- frame_disappear
                    fin = offset_occur
                else:
                    # bgn --- offset_occur --------- frame_disappear
                    fin = frame_disappear
                output_tuples.append([bgn, fin, onset_shift_output[bgn],
                    offset_shift_output[fin], velocity_output[bgn]])
                bgn, frame_disappear, offset_occur = None, None, None

            # Offset not detected: close the note after 600 frames or at the
            # last frame of the sequence.
            if bgn is not None and (i - bgn >= 600 or i == frames_num - 1):
                fin = i
                output_tuples.append([bgn, fin, onset_shift_output[bgn],
                    offset_shift_output[fin], velocity_output[bgn]])
                bgn, frame_disappear, offset_occur = None, None, None

    # Sort pairs by onsets
    output_tuples.sort(key=lambda pair: pair[0])

    return output_tuples
|
75 |
+
|
76 |
+
|
77 |
+
def pedal_detection_with_onset_offset_regress(frame_output, offset_output,
    offset_shift_output, frame_threshold):
    """Turn per-frame pedal predictions into a list of pedal events.

    A pedal starts at the first frame whose activation is at or above the
    threshold and rising. It ends at the first detected offset, or, when no
    offset shows up, at the frame where the activation dropped to the
    threshold or below once 10 further frames have passed.

    Args:
      frame_output: (frames_num,)
      offset_output: (frames_num,)
      offset_shift_output: (frames_num,)
      frame_threshold: float

    Returns:
      output_tuples: list of [bgn, fin, onset_shift, offset_shift],
      e.g., [
        [1821, 1909, 0.4749851, 0.3048533],
        [1909, 1947, 0.30730522, -0.45764327],
        ...]
    """
    events = []
    start = None
    vanished_at = None
    released_at = None

    # Frame 0 is skipped because an onset needs a previous frame to compare
    # against; all stored indices are therefore >= 1.
    for i in range(1, frame_output.shape[0]):
        is_onset = (frame_output[i] >= frame_threshold
            and frame_output[i] > frame_output[i - 1])
        if is_onset and start is None:
            # Pedal onset detected
            start = i

        # Offsets are only searched strictly after the onset frame.
        if start is None or i <= start:
            continue

        if vanished_at is None and frame_output[i] <= frame_threshold:
            # Frame disappear detected
            vanished_at = i

        if released_at is None and offset_output[i] == 1:
            # Offset detected
            released_at = i

        if released_at is not None:
            events.append([start, released_at, 0., offset_shift_output[released_at]])
            start = vanished_at = released_at = None

        if vanished_at is not None and i - vanished_at >= 10:
            # Offset not detected but frame disappear
            events.append([start, vanished_at, 0., offset_shift_output[vanished_at]])
            start = vanished_at = released_at = None

    # Sort pairs by onsets
    return sorted(events, key=lambda pair: pair[0])
|
pytorch_utils.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import time
|
4 |
+
import torch
|
5 |
+
|
6 |
+
from .utilities import pad_truncate_sequence
|
7 |
+
|
8 |
+
|
9 |
+
def move_data_to_device(x, device):
    """Convert array-like data to a torch tensor and move it to `device`.

    Float data becomes a torch.Tensor, integer data a torch.LongTensor; data
    of any other dtype is returned untouched (and not moved).

    Args:
      x: object exposing a `dtype` attribute
      device: target device accepted by Tensor.to()
    """
    dtype_name = str(x.dtype)

    if 'float' in dtype_name:
        tensor = torch.Tensor(x)
    elif 'int' in dtype_name:
        tensor = torch.LongTensor(x)
    else:
        return x

    return tensor.to(device)
|
18 |
+
|
19 |
+
|
20 |
+
def append_to_dict(dict, key, value):
    """Append `value` to the list stored under `key`, creating the list on
    first use. The mapping is modified in place; nothing is returned."""
    dict.setdefault(key, []).append(value)
|
25 |
+
|
26 |
+
|
27 |
+
def forward(model, x, batch_size):
    """Forward data to model in mini-batch.

    Args:
      model: object
      x: (N, segment_samples)
      batch_size: int, must be >= 1

    Returns:
      output_dict: dict, e.g. {
        'frame_output': (segments_num, frames_num, classes_num),
        'onset_output': (segments_num, frames_num, classes_num),
        ...}
      An empty dict is returned when x holds no segments.

    Raises:
      ValueError: if batch_size is smaller than 1 (the loop would otherwise
        never advance).
    """
    if batch_size < 1:
        raise ValueError('batch_size must be >= 1, got {}'.format(batch_size))

    output_dict = {}
    device = next(model.parameters()).device

    total_segments = len(x)

    # Inference only: switch to eval mode once instead of on every batch.
    model.eval()

    for pointer in range(0, total_segments, batch_size):
        # Both numbers count segments, so the progress line is consistent for
        # any batch size.
        print('Segment {} / {}'.format(pointer, total_segments))

        batch_waveform = move_data_to_device(x[pointer : pointer + batch_size], device)

        with torch.no_grad():
            batch_output_dict = model(batch_waveform)

        for key in batch_output_dict.keys():
            append_to_dict(output_dict, key, batch_output_dict[key].data.cpu().numpy())

    # Stitch the per-batch arrays back together along the segment axis.
    for key in output_dict.keys():
        output_dict[key] = np.concatenate(output_dict[key], axis=0)

    return output_dict
|
utilities.py
ADDED
@@ -0,0 +1,564 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import audioread
|
4 |
+
import librosa
|
5 |
+
from mido import MidiFile
|
6 |
+
|
7 |
+
from .piano_vad import (note_detection_with_onset_offset_regress,
|
8 |
+
pedal_detection_with_onset_offset_regress)
|
9 |
+
from . import config
|
10 |
+
|
11 |
+
|
12 |
+
def create_folder(fd):
    """Create directory `fd`, including missing parents, if it does not exist.

    Args:
      fd: str, path of the folder to create

    Raises:
      FileExistsError: if `fd` exists but is not a directory
    """
    # exist_ok=True removes the race between testing os.path.exists() and
    # calling os.makedirs(): another process may create the folder in between.
    os.makedirs(fd, exist_ok=True)
|
15 |
+
|
16 |
+
|
17 |
+
def get_filename(path):
    """Return the file name of `path` without its directory and last extension.

    Args:
      path: str, e.g. '/data/audio/song.wav'

    Returns:
      na: str, e.g. 'song'
    """
    # os.path.basename honours the platform separator; splitting on '/' would
    # return the whole path where realpath() yields backslash separators.
    na_ext = os.path.basename(os.path.realpath(path))
    na = os.path.splitext(na_ext)[0]
    return na
|
22 |
+
|
23 |
+
|
24 |
+
def note_to_freq(piano_note):
    """Map a piano note index to a frequency on a 12-tone equal-tempered scale.

    Index 39 maps to 440 and every 12 indices double the value.
    NOTE(review): with config.begin_note = 21, index 39 would be MIDI note 60
    rather than A4 (MIDI note 69) - confirm the intended reference index.

    Args:
      piano_note: int or float, piano note index

    Returns:
      float, 2 ** ((piano_note - 39) / 12) * 440
    """
    octaves_from_reference = (piano_note - 39) / 12
    return 2 ** octaves_from_reference * 440
|
26 |
+
|
27 |
+
|
28 |
+
def float32_to_int16(x):
    """Convert floating point samples in [-1, 1] to int16 samples.

    Args:
      x: ndarray of float, every value must satisfy |value| <= 1

    Returns:
      ndarray of np.int16, `x` scaled by 32767 and cast to int16

    Raises:
      AssertionError: if any value lies outside [-1, 1]
    """
    # np.max() raises ValueError on zero-size input, so only range-check
    # non-empty arrays; an empty input converts to an empty int16 array.
    if np.size(x) > 0:
        assert np.max(np.abs(x)) <= 1.
    return (x * 32767.).astype(np.int16)
|
31 |
+
|
32 |
+
|
33 |
+
def int16_to_float32(x):
    """Scale integer samples by 1 / 32767 and return them as float32.

    Args:
      x: ndarray of integer samples (int16 according to the function name)

    Returns:
      ndarray of np.float32
    """
    scaled = x / 32767.
    return scaled.astype(np.float32)
|
35 |
+
|
36 |
+
|
37 |
+
def pad_truncate_sequence(x, max_len):
    """Force a 1-D sequence to length `max_len`.

    Shorter inputs are zero-padded at the end; longer (or equal-length) inputs
    are cut to their first `max_len` items. The padding comes from np.zeros()
    (float64), so the dtype of a padded result follows NumPy type promotion.

    Args:
      x: ndarray, (n,)
      max_len: int

    Returns:
      ndarray, (max_len,)
    """
    missing = max_len - len(x)
    if missing <= 0:
        return x[:max_len]

    padding = np.zeros(missing)
    return np.concatenate((x, padding))
|
42 |
+
|
43 |
+
|
44 |
+
def read_midi(midi_path):
    """Parse a two-track MIDI file into event strings and event times.

    The tempo is read from the first message of the first track and is used for
    the whole file; the events are read from the second track.

    Args:
      midi_path: str

    Returns:
      midi_dict: dict, e.g. {
        'midi_event': [
          'program_change channel=0 program=0 time=0',
          'control_change channel=0 control=64 value=127 time=0',
          'control_change channel=0 control=64 value=63 time=236',
          ...],
        'midi_event_time': [0., 0, 0.98307292, ...]}
    """
    midi = MidiFile(midi_path)

    # The first track contains tempo, time signature. The second track
    # contains piano events.
    assert len(midi.tracks) == 2
    tempo_track = midi.tracks[0]
    piano_track = midi.tracks[1]

    # Tempo is stored as microseconds per beat.
    beats_per_second = 1e6 / tempo_track[0].tempo
    ticks_per_second = midi.ticks_per_beat * beats_per_second

    events = [str(msg) for msg in piano_track]

    # Message times are deltas in ticks; accumulate them to absolute seconds.
    event_times = []
    elapsed_ticks = 0
    for msg in piano_track:
        elapsed_ticks += msg.time
        event_times.append(elapsed_ticks / ticks_per_second)

    return {
        'midi_event': np.array(events),
        'midi_event_time': np.array(event_times)}
|
86 |
+
|
87 |
+
|
88 |
+
def write_events_to_midi(start_time, note_events, pedal_events, midi_path):
    """Write note and pedal events to a two-track MIDI file.

    Args:
      start_time: float, time in seconds that maps to tick 0; events that fall
        before it are not written
      note_events: list of dict, e.g. [
        {'midi_note': 51, 'onset_time': 696.63544, 'offset_time': 696.9948, 'velocity': 44},
        {'midi_note': 58, 'onset_time': 696.99585, 'offset_time': 697.18646, 'velocity': 50}
        ...]
      pedal_events: list of dict with 'onset_time' and 'offset_time' keys;
        skipped when empty or None
      midi_path: str
    """
    from mido import Message, MidiFile, MidiTrack, MetaMessage

    # This configuration is the same as MIDIs in MAESTRO dataset
    ticks_per_beat = 384
    beats_per_second = 2
    ticks_per_second = ticks_per_beat * beats_per_second
    microseconds_per_beat = int(1e6 // beats_per_second)

    midi_file = MidiFile()
    midi_file.ticks_per_beat = ticks_per_beat

    # Track 0: tempo and time signature
    meta_track = MidiTrack()
    meta_track.append(MetaMessage('set_tempo', tempo=microseconds_per_beat, time=0))
    meta_track.append(MetaMessage('time_signature', numerator=4, denominator=4, time=0))
    meta_track.append(MetaMessage('end_of_track', time=1))
    midi_file.tracks.append(meta_track)

    # Collect every onset/offset as a timed entry; an offset is written as a
    # note_on with velocity 0.
    timeline = []
    for note in note_events:
        timeline.extend((
            {'time': note['onset_time'],
             'midi_note': note['midi_note'],
             'velocity': note['velocity']},
            {'time': note['offset_time'],
             'midi_note': note['midi_note'],
             'velocity': 0}))

    if pedal_events:
        for pedal in pedal_events:
            timeline.append({'time': pedal['onset_time'], 'control_change': 64, 'value': 127})
            timeline.append({'time': pedal['offset_time'], 'control_change': 64, 'value': 0})

    # Sort MIDI messages by time (stable, so ties keep insertion order)
    timeline.sort(key=lambda entry: entry['time'])

    # Track 1: notes and pedals, with delta times in ticks
    piano_track = MidiTrack()
    last_ticks = 0
    for entry in timeline:
        ticks = int((entry['time'] - start_time) * ticks_per_second)
        if ticks < 0:
            continue

        delta_ticks = ticks - last_ticks
        last_ticks = ticks
        if 'midi_note' in entry:
            piano_track.append(Message('note_on', note=entry['midi_note'], velocity=entry['velocity'], time=delta_ticks))
        elif 'control_change' in entry:
            piano_track.append(Message('control_change', channel=0, control=entry['control_change'], value=entry['value'], time=delta_ticks))
    piano_track.append(MetaMessage('end_of_track', time=1))
    midi_file.tracks.append(piano_track)

    midi_file.save(midi_path)
|
158 |
+
|
159 |
+
|
160 |
+
class RegressionPostProcessor(object):
    def __init__(self, frames_per_second, classes_num, onset_threshold,
        offset_threshold, frame_threshold, pedal_offset_threshold):
        """Postprocess the output probabilities of a transcription model to MIDI
        events.

        Args:
          frames_per_second: int
          classes_num: int
          onset_threshold: float
          offset_threshold: float
          frame_threshold: float
          pedal_offset_threshold: float
        """
        self.frames_per_second = frames_per_second
        self.classes_num = classes_num
        self.onset_threshold = onset_threshold
        self.offset_threshold = offset_threshold
        self.frame_threshold = frame_threshold
        self.pedal_offset_threshold = pedal_offset_threshold
        self.begin_note = config.begin_note
        self.velocity_scale = config.velocity_scale

    def output_dict_to_midi_events(self, output_dict):
        """Main function. Post process model outputs to MIDI events.

        Note: `output_dict` is modified in place (binarized outputs are added).

        Args:
          output_dict: {
            'reg_onset_output': (segment_frames, classes_num),
            'reg_offset_output': (segment_frames, classes_num),
            'frame_output': (segment_frames, classes_num),
            'velocity_output': (segment_frames, classes_num),
            'reg_pedal_onset_output': (segment_frames, 1),
            'reg_pedal_offset_output': (segment_frames, 1),
            'pedal_frame_output': (segment_frames, 1)}

        Outputs:
          est_note_events: list of dict, e.g. [
            {'onset_time': 39.74, 'offset_time': 39.87, 'midi_note': 27, 'velocity': 83},
            {'onset_time': 11.98, 'offset_time': 12.11, 'midi_note': 33, 'velocity': 88}]

          est_pedal_events: list of dict, or None when the model has no pedal
            outputs, e.g. [
            {'onset_time': 0.17, 'offset_time': 0.96},
            {'onset_time': 1.17, 'offset_time': 2.65}]
        """

        # Post process piano note outputs to piano note and pedal events information
        (est_on_off_note_vels, est_pedal_on_offs) = \
            self.output_dict_to_note_pedal_arrays(output_dict)
        # est_on_off_note_vels: (events_num, 4), the four columns are:
        #   [onset_time, offset_time, piano_note, velocity]
        # est_pedal_on_offs: (pedal_events_num, 2), the two columns are:
        #   [onset_time, offset_time]

        # Reformat notes to MIDI events
        est_note_events = self.detected_notes_to_events(est_on_off_note_vels)

        if est_pedal_on_offs is None:
            est_pedal_events = None
        else:
            est_pedal_events = self.detected_pedals_to_events(est_pedal_on_offs)

        return est_note_events, est_pedal_events

    def output_dict_to_note_pedal_arrays(self, output_dict):
        """Postprocess the output probabilities of a transcription model to
        note and pedal arrays.

        The binarized onset / offset outputs and their shifts are stored back
        into `output_dict` under the keys 'onset_output', 'onset_shift_output',
        'offset_output', 'offset_shift_output' and, when pedal offsets are
        available, 'pedal_offset_output' and 'pedal_offset_shift_output'.

        Args:
          output_dict: dict, {
            'reg_onset_output': (frames_num, classes_num),
            'reg_offset_output': (frames_num, classes_num),
            'frame_output': (frames_num, classes_num),
            'velocity_output': (frames_num, classes_num),
            ...}

        Returns:
          est_on_off_note_vels: (events_num, 4), the 4 columns are onset_time,
            offset_time, piano_note and velocity. E.g. [
             [39.74, 39.87, 27, 0.65],
             [11.98, 12.11, 33, 0.69],
             ...]

          est_pedal_on_offs: (pedal_events_num, 2), the 2 columns are onset_time
            and offset_time, or None when 'reg_pedal_onset_output' is absent.
            E.g. [
             [0.17, 0.96],
             [1.17, 2.65],
             ...]
        """

        # ------ 1. Process regression outputs to binarized outputs ------
        # For example, onset or offset of [0., 0., 0.15, 0.30, 0.40, 0.35, 0.20, 0.05, 0., 0.]
        # will be processed to [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]

        # Calculate binarized onset output from regression output
        (onset_output, onset_shift_output) = \
            self.get_binarized_output_from_regression(
                reg_output=output_dict['reg_onset_output'],
                threshold=self.onset_threshold, neighbour=2)

        output_dict['onset_output'] = onset_output  # Values are 0 or 1
        output_dict['onset_shift_output'] = onset_shift_output

        # Calculate binarized offset output from regression output
        (offset_output, offset_shift_output) = \
            self.get_binarized_output_from_regression(
                reg_output=output_dict['reg_offset_output'],
                threshold=self.offset_threshold, neighbour=4)

        output_dict['offset_output'] = offset_output  # Values are 0 or 1
        output_dict['offset_shift_output'] = offset_shift_output

        # Pedal onsets ('reg_pedal_onset_output') are not used in inference.
        # Instead, frame-wise pedal predictions are used to detect onsets. We
        # empirically found this is more accurate to detect pedal onsets.

        if 'reg_pedal_offset_output' in output_dict:
            # Calculate binarized pedal offset output from regression output
            (pedal_offset_output, pedal_offset_shift_output) = \
                self.get_binarized_output_from_regression(
                    reg_output=output_dict['reg_pedal_offset_output'],
                    threshold=self.pedal_offset_threshold, neighbour=4)

            output_dict['pedal_offset_output'] = pedal_offset_output  # Values are 0 or 1
            output_dict['pedal_offset_shift_output'] = pedal_offset_shift_output

        # ------ 2. Process matrices results to event results ------
        # Detect piano notes from output_dict
        est_on_off_note_vels = self.output_dict_to_detected_notes(output_dict)

        if 'reg_pedal_onset_output' in output_dict:
            # Detect piano pedals from output_dict
            est_pedal_on_offs = self.output_dict_to_detected_pedals(output_dict)
        else:
            est_pedal_on_offs = None

        return est_on_off_note_vels, est_pedal_on_offs

    def get_binarized_output_from_regression(self, reg_output, threshold, neighbour):
        """Calculate binarized output and shifts of onsets or offsets from the
        regression results.

        A frame is marked when its value exceeds `threshold` and the values are
        monotonic over `neighbour` frames on both sides of it. The shift is the
        sub-frame position of the peak; a flat peak (equal values at n - 1, n
        and n + 1) gets shift 0 instead of the NaN that 0 / 0 would produce.

        Args:
          reg_output: (frames_num, classes_num)
          threshold: float
          neighbour: int

        Returns:
          binary_output: (frames_num, classes_num)
          shift_output: (frames_num, classes_num)
        """
        binary_output = np.zeros_like(reg_output)
        shift_output = np.zeros_like(reg_output)
        (frames_num, classes_num) = reg_output.shape

        for k in range(classes_num):
            x = reg_output[:, k]
            for n in range(neighbour, frames_num - neighbour):
                if x[n] > threshold and self.is_monotonic_neighbour(x, n, neighbour):
                    binary_output[n, k] = 1

                    # See Section III-D in [1] for deduction.
                    # [1] Q. Kong, et al., High-resolution Piano Transcription
                    # with Pedals by Regressing Onsets and Offsets Times, 2020.
                    if x[n - 1] > x[n + 1]:
                        denominator = x[n] - x[n + 1]
                    else:
                        denominator = x[n] - x[n - 1]

                    if denominator == 0:
                        # Flat peak: both neighbours equal x[n], so the peak is
                        # symmetric and no sub-frame shift applies.
                        shift = 0.
                    else:
                        shift = (x[n + 1] - x[n - 1]) / denominator / 2
                    shift_output[n, k] = shift

        return binary_output, shift_output

    def is_monotonic_neighbour(self, x, n, neighbour):
        """Detect if values are monotonic in both side of x[n], i.e. they do
        not increase when moving away from x[n] over `neighbour` frames.

        Args:
          x: (frames_num,)
          n: int
          neighbour: int

        Returns:
          monotonic: bool
        """
        for i in range(neighbour):
            if x[n - i] < x[n - i - 1] or x[n + i] < x[n + i + 1]:
                return False

        return True

    def output_dict_to_detected_notes(self, output_dict):
        """Postprocess output_dict to piano notes.

        Args:
          output_dict: dict, e.g. {
            'onset_output': (frames_num, classes_num),
            'onset_shift_output': (frames_num, classes_num),
            'offset_output': (frames_num, classes_num),
            'offset_shift_output': (frames_num, classes_num),
            'frame_output': (frames_num, classes_num),
            'velocity_output': (frames_num, classes_num),
            ...}

        Returns:
          est_on_off_note_vels: (notes, 4), the four columns are onsets, offsets,
          MIDI notes and velocities, or an empty array of shape (0,) when no
          note is detected. E.g.,
            [[39.7375, 39.7500, 27., 0.6638],
             [11.9824, 12.5000, 33., 0.6892],
             ...]
        """
        est_tuples = []
        est_midi_notes = []
        classes_num = output_dict['frame_output'].shape[-1]

        for piano_note in range(classes_num):
            # Detect piano notes
            est_tuples_per_note = note_detection_with_onset_offset_regress(
                frame_output=output_dict['frame_output'][:, piano_note],
                onset_output=output_dict['onset_output'][:, piano_note],
                onset_shift_output=output_dict['onset_shift_output'][:, piano_note],
                offset_output=output_dict['offset_output'][:, piano_note],
                offset_shift_output=output_dict['offset_shift_output'][:, piano_note],
                velocity_output=output_dict['velocity_output'][:, piano_note],
                frame_threshold=self.frame_threshold)

            est_tuples += est_tuples_per_note
            est_midi_notes += [piano_note + self.begin_note] * len(est_tuples_per_note)

        # (notes, 5), the five columns are onset, offset, onset_shift,
        # offset_shift and normalized_velocity
        est_tuples = np.array(est_tuples)

        est_midi_notes = np.array(est_midi_notes)  # (notes,)

        if len(est_tuples) == 0:
            return np.array([])

        else:
            onset_times = (est_tuples[:, 0] + est_tuples[:, 2]) / self.frames_per_second
            offset_times = (est_tuples[:, 1] + est_tuples[:, 3]) / self.frames_per_second
            velocities = est_tuples[:, 4]

            # (notes, 4), the four columns are onset_times, offset_times,
            # MIDI notes and velocities.
            est_on_off_note_vels = np.stack((onset_times, offset_times, est_midi_notes, velocities), axis=-1)

            est_on_off_note_vels = est_on_off_note_vels.astype(np.float32)

            return est_on_off_note_vels

    def output_dict_to_detected_pedals(self, output_dict):
        """Postprocess output_dict to piano pedals.

        Args:
          output_dict: dict, e.g. {
            'pedal_frame_output': (frames_num, 1),
            'pedal_offset_output': (frames_num, 1),
            'pedal_offset_shift_output': (frames_num, 1),
            ...}

        Returns:
          est_on_off: (notes, 2), the two columns are pedal onsets and pedal
          offsets, or an empty array of shape (0,) when no pedal is detected.
          E.g.,
            [[0.1800, 0.9669],
             [1.1400, 2.6458],
             ...]
        """
        est_tuples = pedal_detection_with_onset_offset_regress(
            frame_output=output_dict['pedal_frame_output'][:, 0],
            offset_output=output_dict['pedal_offset_output'][:, 0],
            offset_shift_output=output_dict['pedal_offset_shift_output'][:, 0],
            frame_threshold=0.5)

        # Columns 0..3 are used below as onset, offset, onset_shift and
        # offset_shift - presumably the same layout as the note tuples;
        # verify against piano_vad.
        est_tuples = np.array(est_tuples)

        if len(est_tuples) == 0:
            return np.array([])

        else:
            onset_times = (est_tuples[:, 0] + est_tuples[:, 2]) / self.frames_per_second
            offset_times = (est_tuples[:, 1] + est_tuples[:, 3]) / self.frames_per_second
            est_on_off = np.stack((onset_times, offset_times), axis=-1)
            est_on_off = est_on_off.astype(np.float32)
            return est_on_off

    def detected_notes_to_events(self, est_on_off_note_vels):
        """Reformat detected notes to midi events.

        Args:
          est_on_off_note_vels: (notes, 4), the four columns are onset_times,
          offset_times, MIDI notes and normalized velocities. E.g.
            [[32.8376, 35.7700, 51., 0.7932],
             [37.3712, 39.9300, 58., 0.8058],
             ...]

        Returns:
          midi_events, list, e.g.,
            [{'onset_time': 39.7376, 'offset_time': 39.75, 'midi_note': 27, 'velocity': 84},
             {'onset_time': 11.9824, 'offset_time': 12.50, 'midi_note': 33, 'velocity': 88},
             ...]
        """
        midi_events = []
        for onset_time, offset_time, midi_note, velocity in est_on_off_note_vels:
            midi_events.append({
                'onset_time': onset_time,
                'offset_time': offset_time,
                'midi_note': int(midi_note),
                # A normalized velocity of 1.0 scales to velocity_scale (128),
                # one above the largest valid MIDI velocity, so cap at 127.
                'velocity': min(int(velocity * self.velocity_scale), 127)})

        return midi_events

    def detected_pedals_to_events(self, pedal_on_offs):
        """Reformat detected pedal onset and offsets to events.

        Args:
          pedal_on_offs: (notes, 2), the two columns are pedal onsets and pedal
          offsets. E.g.,
            [[0.1800, 0.9669],
             [1.1400, 2.6458],
             ...]

        Returns:
          pedal_events: list of dict, e.g.,
            [{'onset_time': 0.1800, 'offset_time': 0.9669},
             {'onset_time': 1.1400, 'offset_time': 2.6458},
             ...]
        """
        pedal_events = []
        for i in range(len(pedal_on_offs)):
            pedal_events.append({
                'onset_time': pedal_on_offs[i, 0],
                'offset_time': pedal_on_offs[i, 1]})

        return pedal_events
|
499 |
+
|
500 |
+
|
501 |
+
def load_audio(path, sr=22050, mono=True, offset=0.0, duration=None,
    dtype=np.float32, res_type='kaiser_best',
    backends=[audioread.ffdec.FFmpegAudioFile]):
    """Load audio. Copied from librosa.core.load() except that ffmpeg backend is
    always used in this function.

    Args:
      path: str, audio file to read
      sr: int or None, target sample rate; None keeps the native sample rate
      mono: bool, mix multi-channel audio down to a single channel
      offset: float, start reading after this time (seconds)
      duration: float or None, only read this much audio (seconds)
      dtype: numeric type of the returned samples
      res_type: str, resampling type passed to librosa.resample()
      backends: list of audioread backends passed to audioread.audio_open()

    Returns:
      (y, sr): the audio samples and their sample rate
    """
    y = []
    with audioread.audio_open(os.path.realpath(path), backends=backends) as input_file:
        sr_native = input_file.samplerate
        n_channels = input_file.channels

        # Positions below count interleaved samples, hence the channel factor.
        s_start = int(np.round(sr_native * offset)) * n_channels

        if duration is None:
            s_end = np.inf
        else:
            s_end = s_start + (int(np.round(sr_native * duration))
                               * n_channels)

        n_read = 0

        for buf in input_file:
            samples = librosa.util.buf_to_float(buf, n_bytes=2, dtype=dtype)
            chunk_start = n_read
            n_read = n_read + len(samples)

            if n_read < s_start:
                # offset is after the current chunk, keep reading
                continue

            if s_end < chunk_start:
                # we're off the end. stop reading
                break

            if s_end < n_read:
                # the end is in this chunk. crop.
                samples = samples[:s_end - chunk_start]

            if chunk_start <= s_start <= n_read:
                # beginning is in this chunk
                samples = samples[(s_start - chunk_start):]

            # tack on the current chunk
            y.append(samples)

    if y:
        y = np.concatenate(y)

        if n_channels > 1:
            # De-interleave to (n_channels, n_samples)
            y = y.reshape((-1, n_channels)).T
            if mono:
                y = librosa.to_mono(y)

        if sr is not None:
            y = librosa.resample(y, orig_sr=sr_native, target_sr=sr, res_type=res_type)

        else:
            sr = sr_native

    # Final cleanup for dtype and contiguity
    y = np.ascontiguousarray(y, dtype=dtype)

    return (y, sr)
|