# musc/pitch_estimator.py
from torch import nn
import torch
import torchaudio
from typing import List, Optional, Tuple
import pathlib
from scipy.signal import medfilt
import numpy as np
import librosa
from librosa.sequence import viterbi_discriminative
from scipy.ndimage import gaussian_filter1d
from musc.postprocessing import spotify_create_notes
class PitchEstimator(nn.Module):
"""
This is the base class that everything else inherits from. The hierarchy is:
PitchEstimator -> Transcriber -> Synchronizer -> AutonomousAgent -> The n-Head Music Performance Analysis Models
PitchEstimator can read the audio, predict all the features, and
estimate a single frame-level f0 with Viterbi decoding. It also handles
MIDI pitch bend creation for the predicted note events when used inside a Transcriber, and
score-informed f0 estimation when used inside a Synchronizer.
"""
def __init__(self, labeling, instrument='Violin', sr=16000, window_size=1024, hop_length=160):
super().__init__()
self.labeling = labeling
self.sr = sr
self.window_size = window_size
self.hop_length = hop_length
self.instrument = instrument
self.f0_bins_per_semitone = int(np.round(100/self.labeling.f0_granularity_c))
def read_audio(self, audio):
"""
Read and resample an audio file, convert it to mono, and unfold it into representation frames.
The time array marks the center of each small frame (5.8 ms hop length). These are different from the
chunk-level frames: a chunk-level frame is the entire sequence the model sees at once, whereas
predictions are produced at the small-frame interval (5.8 ms).
:param audio: str, pathlib.Path, np.ndarray, or torch.Tensor
:return: frames: (n_big_frames, frame_length), times: (n_small_frames,)
"""
if isinstance(audio, (str, pathlib.Path)):
audio, sample_rate = torchaudio.load(audio, normalize=True)
audio = audio.mean(axis=0) # convert to mono
if sample_rate != self.sr:
audio = torchaudio.functional.resample(audio, sample_rate, self.sr)
elif isinstance(audio, np.ndarray):
audio = torch.from_numpy(audio)
else:
assert isinstance(audio, torch.Tensor)
len_audio = audio.shape[-1]
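# number of chunk-level frames needed to cover the padded signal; each chunk advances by
# hop_length * chunk_size samples, and frame_overlap adds the model's context on both sides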
n_frames = int(np.ceil((len_audio + sum(self.frame_overlap)) / (self.hop_length * self.chunk_size)))
audio = nn.functional.pad(audio, (self.frame_overlap[0],
self.frame_overlap[1] + (n_frames * self.hop_length * self.chunk_size) - len_audio))
frames = audio.unfold(0, self.max_window_size, self.hop_length*self.chunk_size)
times = np.arange(0, len_audio, self.hop_length) / self.sr # not tensor, we don't compute anything with it
return frames, times
def predict(self, audio, batch_size):
frames, times = self.read_audio(audio)
performance = {'f0': [], 'note': [], 'onset': [], 'offset': []}
self.eval()
device = self.main.conv0.conv2d.weight.device
with torch.no_grad():
for i in range(0, len(frames), batch_size):
f = frames[i:min(i + batch_size, len(frames))].to(device)
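# standardize each chunk to zero mean and unit variance; silent chunks divide by ~0 and
# propagate NaNs, which are zeroed after the sigmoid below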
f -= (torch.mean(f, axis=1).unsqueeze(-1))
f /= (torch.std(f, axis=1).unsqueeze(-1))
out = self.forward(f)
for key, value in out.items():
value = torch.sigmoid(value)
value = torch.nan_to_num(value) # the model outputs nan when the frame is silent (this is an expected behavior due to normalization)
value = value.view(-1, value.shape[-1])
value = value.detach().cpu().numpy()
performance[key].append(value)
performance = {key: np.concatenate(value, axis=0)[:len(times)] for key, value in performance.items()}
performance['time'] = times
return performance
def estimate_pitch(self, audio, batch_size, viterbi=False):
out = self.predict(audio, batch_size)
f0_hz = self.out2f0(out, viterbi)
return out['time'], f0_hz
def out2f0(self, out, viterbi=False):
"""
Monophonic f0 estimation from the model output. The Viterbi postprocessing
(viterbi=True or viterbi='constrained') is specialized for the violin family.
"""
salience = out['f0']
if viterbi == 'constrained':
assert hasattr(self, 'out2note')
notes = spotify_create_notes(out["note"], out["onset"], note_low=self.labeling.midi_centers[0],
note_high=self.labeling.midi_centers[-1], onset_thresh=0.5, frame_thresh=0.3,
infer_onsets=True, melodia_trick=True,
min_note_len=int(np.round(127.70 / 1000 * (self.sr / self.hop_length))))
note_cents = self.get_pitch_bends(salience, notes, to_midi=False, timing_refinement_range=0)
cents = np.zeros_like(out['time'])
cents[note_cents[:,0].astype(int)] = note_cents[:,1]
elif viterbi:
# transition probabilities inducing continuous pitch
# big changes are penalized by one order of magnitude
transition = gaussian_filter1d(np.eye(self.labeling.f0_n_bins), 30) + 99 * gaussian_filter1d(
np.eye(self.labeling.f0_n_bins), 2)
transition = transition / np.sum(transition, axis=1)[:, None]
p = salience / salience.sum(axis=1)[:, None]
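# rows whose salience is all-zero (or NaN) get a uniform distribution so Viterbi can pass through them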
p[np.isnan(p.sum(axis=1)), :] = np.ones(self.labeling.f0_n_bins) * 1 / self.labeling.f0_n_bins
path = viterbi_discriminative(p.T, transition)
cents = np.array([self.labeling.f0_label2c(salience[i, :], path[i]) for i in range(len(path))])
else:
cents = self.labeling.f0_label2c(salience, center=None) # use argmax for center
f0_hz = self.labeling.f0_c2hz(cents)
f0_hz[np.isnan(f0_hz)] = 0
return f0_hz
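# A short reference for the three decoding modes handled above (illustrative call sites):
#     estimate_pitch(audio, batch_size, viterbi=False)          # framewise argmax of the f0 salience
#     estimate_pitch(audio, batch_size, viterbi=True)           # Viterbi over the full salience matrix
#     estimate_pitch(audio, batch_size, viterbi='constrained')  # note-constrained, uses the note/onset heads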
def get_pitch_bends(
self,
contours: np.ndarray, note_events: List[Tuple[int, int, int, float]],
timing_refinement_range: int = 0, to_midi: bool = True,
) -> List[Tuple[int, int, int, float, Optional[List[int]]]]:
"""Modified version of an excellent script from Spotify/basic_pitch!! Thank you!!!!
Given note events and contours, estimate pitch bends per note.
Pitch bends are represented as a sequence of evenly spaced midi pitch bend control units.
The time stamps of each pitch bend can be inferred by computing an evenly spaced grid between
the start and end times of each note event.
Args:
contours: Matrix of estimated pitch contours
note_events: note event tuple
timing_refinement_range: if > 0, refine onset/offset boundaries with f0 confidence
to_midi: whether to convert pitch bends to midi pitch bends. If False, return pitch estimates in the format
[time (index), pitch (cents), confidence in range [0, 1]].
Returns:
note events with pitch bends
"""
f0_matrix = []  # [time (index), pitch (cents), confidence in range [0, 1]]
note_events_with_pitch_bends = []
for start_idx, end_idx, pitch_midi, amplitude in note_events:
if timing_refinement_range:
start_idx = np.max([0, start_idx - timing_refinement_range])
end_idx = np.min([contours.shape[0], end_idx + timing_refinement_range])
freq_idx = int(np.round(self.midi_pitch_to_contour_bin(pitch_midi)))
freq_start_idx = np.max([freq_idx - self.labeling.f0_tolerance_bins, 0])
freq_end_idx = np.min([self.labeling.f0_n_bins, freq_idx + self.labeling.f0_tolerance_bins + 1])
trans_start_idx = np.max([0, self.labeling.f0_tolerance_bins - freq_idx])
trans_end_idx = (2 * self.labeling.f0_tolerance_bins + 1) - \
np.max([0, freq_idx - (self.labeling.f0_n_bins - self.labeling.f0_tolerance_bins - 1)])
# apply regional viterbi to estimate the intonation
# observation probabilities come from the f0_roll matrix
observation = contours[start_idx:end_idx, freq_start_idx:freq_end_idx]
observation = observation / observation.sum(axis=1)[:, None]
observation[np.isnan(observation.sum(axis=1)), :] = np.ones(freq_end_idx - freq_start_idx) * 1 / (
freq_end_idx - freq_start_idx)
# transition probabilities assure continuity
transition = self.labeling.f0_transition_matrix[trans_start_idx:trans_end_idx,
trans_start_idx:trans_end_idx] + 1e-6
transition = transition / np.sum(transition, axis=1)[:, None]
path = viterbi_discriminative(observation.T / observation.sum(axis=1), transition) + freq_start_idx
cents = np.array([self.labeling.f0_label2c(contours[i + start_idx, :], path[i]) for i in range(len(path))])
bends = cents - self.labeling.midi_centers_c[pitch_midi - self.labeling.midi_centers[0]]
if to_midi:
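# cents -> 14-bit MIDI pitch-bend units, assuming the common +/-2 semitone bend range
# (4096 units per semitone), clipped to [-8192, 8191]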
bends = (bends * 4096 / 100).astype(int)
bends[bends > 8191] = 8191
bends[bends < -8192] = -8192
if timing_refinement_range:
confidences = np.array([contours[i + start_idx, path[i]] for i in range(len(path))])
threshold = np.median(confidences)
threshold = (np.median(confidences > threshold) + threshold) / 2 # some magic
median_kernel = 2 * (timing_refinement_range // 2) + 1 # some more magic
confidences = medfilt(confidences, kernel_size=median_kernel)
conf_bool = confidences > threshold
onset_idx = np.argmax(conf_bool)
offset_idx = len(confidences) - np.argmax(conf_bool[::-1])
bends = bends[onset_idx:offset_idx]
end_idx = start_idx + offset_idx  # compute the refined offset before start_idx is shifted
start_idx = start_idx + onset_idx
note_events_with_pitch_bends.append((start_idx, end_idx, pitch_midi, amplitude, bends))
else:
confidences = np.array([contours[i + start_idx, path[i]] for i in range(len(path))])
time_idx = np.arange(len(path)) + start_idx
# f0_hz = self.labeling.f0_c2hz(cents)
possible_f0s = np.array([time_idx, cents, confidences]).T
f0_matrix.append(possible_f0s[np.abs(bends)<100]) # keep only estimates within a semitone (100 cents) of the note center
if not to_midi:
return np.vstack(f0_matrix)
else:
return note_events_with_pitch_bends
def midi_pitch_to_contour_bin(self, pitch_midi: int) -> int:
"""Convert midi pitch to corresponding index in contour matrix
Args:
pitch_midi: pitch in midi
Returns:
index in contour matrix
"""
pitch_hz = librosa.midi_to_hz(pitch_midi)
return np.argmin(np.abs(self.labeling.f0_centers_hz - pitch_hz))
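# Hedged example of the note-event format consumed by get_pitch_bends (as produced by
# spotify_create_notes): tuples of (start_frame, end_frame, midi_pitch, amplitude).
#
#     out = model.predict('solo.wav', batch_size=32)           # model: hypothetical PitchEstimator subclass
#     notes = [(120, 210, 69, 0.8)]                            # illustrative values
#     events = model.get_pitch_bends(out['f0'], notes)         # -> same tuples with a bends array appended
#     time_pitch_conf = model.get_pitch_bends(out['f0'], notes, to_midi=False)  # [frame_idx, cents, confidence] rows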