import pathlib
from typing import List, Optional, Tuple

import librosa
import numpy as np
import torch
import torchaudio
from librosa.sequence import viterbi_discriminative
from scipy.ndimage import gaussian_filter1d
from scipy.signal import medfilt
from torch import nn

from musc.postprocessing import spotify_create_notes

class PitchEstimator(nn.Module):
    """
    Base class that everything else inherits from. The hierarchy is:
    PitchEstimator -> Transcriber -> Synchronizer -> AutonomousAgent -> the n-Head Music Performance Analysis Models.
    PitchEstimator can read the audio, predict all the framewise features,
    estimate a single frame-level f0 track with Viterbi decoding,
    create MIDI pitch bends for the predicted note events when used inside a Transcriber, and
    perform score-informed f0 estimation when used inside a Synchronizer.
    """

    def __init__(self, labeling, instrument='Violin', sr=16000, window_size=1024, hop_length=160):
        super().__init__()
        self.labeling = labeling
        self.sr = sr
        self.window_size = window_size
        self.hop_length = hop_length
        self.instrument = instrument
        self.f0_bins_per_semitone = int(np.round(100 / self.labeling.f0_granularity_c))

    def read_audio(self, audio):
        """
        Read and resample an audio file, convert it to mono, and unfold it into overlapping frames.
        The returned time array marks the center of each small frame (5.8 ms hop length). These are
        different from the chunk-level frames: a chunk-level frame is the entire sequence the model
        sees at once, whereas predictions are made at the small-frame (hop) resolution.
        :param audio: str, pathlib.Path, np.ndarray, or torch.Tensor
        :return: frames: (n_big_frames, frame_length), times: (n_small_frames,)
        """
        if isinstance(audio, (str, pathlib.Path)):
            audio, sample_rate = torchaudio.load(audio, normalize=True)
            audio = audio.mean(axis=0)  # convert to mono
            if sample_rate != self.sr:
                audio = torchaudio.functional.resample(audio, sample_rate, self.sr)
        elif isinstance(audio, np.ndarray):
            audio = torch.from_numpy(audio)
        else:
            assert isinstance(audio, torch.Tensor)
        len_audio = audio.shape[-1]
        # frame_overlap, chunk_size, and max_window_size are expected to be set by the concrete model
        n_frames = int(np.ceil((len_audio + sum(self.frame_overlap)) / (self.hop_length * self.chunk_size)))
        audio = nn.functional.pad(audio, (self.frame_overlap[0],
                                          self.frame_overlap[1] + (n_frames * self.hop_length * self.chunk_size) - len_audio))
        frames = audio.unfold(0, self.max_window_size, self.hop_length * self.chunk_size)
        times = np.arange(0, len_audio, self.hop_length) / self.sr  # not a tensor, we don't compute anything with it
        return frames, times

    def predict(self, audio, batch_size):
        """
        Run the model over the audio in batches of chunks and return a dictionary with the framewise
        activations under 'f0', 'note', 'onset', 'offset' and the frame times under 'time'.
        """
        frames, times = self.read_audio(audio)
        performance = {'f0': [], 'note': [], 'onset': [], 'offset': []}
        self.eval()
        device = self.main.conv0.conv2d.weight.device  # device of the (subclass-defined) convolutional front end
        with torch.no_grad():
            for i in range(0, len(frames), batch_size):
                f = frames[i:i + batch_size].to(device)
                # standardize each frame; avoid in-place ops since `frames` is a view of the padded audio
                f = f - torch.mean(f, axis=1).unsqueeze(-1)
                f = f / torch.std(f, axis=1).unsqueeze(-1)
                out = self.forward(f)
                for key, value in out.items():
                    value = torch.sigmoid(value)
                    value = torch.nan_to_num(value)  # silent frames yield NaNs (expected, due to the standardization above)
                    value = value.view(-1, value.shape[-1])
                    value = value.detach().cpu().numpy()
                    performance[key].append(value)
        performance = {key: np.concatenate(value, axis=0)[:len(times)] for key, value in performance.items()}
        performance['time'] = times
        return performance

    def estimate_pitch(self, audio, batch_size, viterbi=False):
        out = self.predict(audio, batch_size)
        f0_hz = self.out2f0(out, viterbi)
        return out['time'], f0_hz

    def out2f0(self, out, viterbi=False):
        """
        Monophonic f0 estimation from the model output. The Viterbi postprocessing is specialized
        for the violin family.
        """
        salience = out['f0']
        if viterbi == 'constrained':
            assert hasattr(self, 'out2note')
            notes = spotify_create_notes(out["note"], out["onset"], note_low=self.labeling.midi_centers[0],
                                         note_high=self.labeling.midi_centers[-1], onset_thresh=0.5, frame_thresh=0.3,
                                         infer_onsets=True, melodia_trick=True,
                                         min_note_len=int(np.round(127.70 / 1000 * (self.sr / self.hop_length))))  # ~127.7 ms in frames
            note_cents = self.get_pitch_bends(salience, notes, to_midi=False, timing_refinement_range=0)
            cents = np.zeros_like(out['time'])
            cents[note_cents[:, 0].astype(int)] = note_cents[:, 1]
        elif viterbi:
            # transition probabilities induce a continuous pitch track: large jumps are heavily penalized
            transition = gaussian_filter1d(np.eye(self.labeling.f0_n_bins), 30) + 99 * gaussian_filter1d(
                np.eye(self.labeling.f0_n_bins), 2)
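            # The wide kernel (sigma=30 bins) keeps distant jumps possible with small probability, while the
            # 99x-weighted narrow kernel (sigma=2 bins) strongly favors staying near the current bin, which is
            # what keeps the decoded f0 track smooth.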
            transition = transition / np.sum(transition, axis=1)[:, None]
            p = salience / salience.sum(axis=1)[:, None]
            # frames whose salience sums to zero (silence) become NaN rows; replace them with a uniform distribution
            p[np.isnan(p.sum(axis=1)), :] = np.ones(self.labeling.f0_n_bins) * 1 / self.labeling.f0_n_bins
            path = viterbi_discriminative(p.T, transition)
            cents = np.array([self.labeling.f0_label2c(salience[i, :], path[i]) for i in range(len(path))])
        else:
            cents = self.labeling.f0_label2c(salience, center=None)  # use argmax for the center
        f0_hz = self.labeling.f0_c2hz(cents)
        f0_hz[np.isnan(f0_hz)] = 0
        return f0_hz

    def get_pitch_bends(
            self,
            contours: np.ndarray, note_events: List[Tuple[int, int, int, float]],
            timing_refinement_range: int = 0, to_midi: bool = True,
    ) -> List[Tuple[int, int, int, float, Optional[List[int]]]]:
        """Modified version of an excellent script from Spotify's basic_pitch (thank you!).
        Given note events and f0 contours, estimate pitch bends per note.
        Pitch bends are represented as a sequence of evenly spaced MIDI pitch bend control units.
        The time stamps of each pitch bend can be inferred by computing an evenly spaced grid between
        the start and end times of each note event.
        Args:
            contours: matrix of estimated pitch contours, shape (n_frames, n_f0_bins)
            note_events: note event tuples (start_idx, end_idx, midi_pitch, amplitude)
            timing_refinement_range: if > 0, refine onset/offset boundaries using the f0 confidence
            to_midi: whether to convert pitch bends to MIDI pitch bend units. If False, return pitch
                estimates as rows of [time (frame index), pitch (cents), confidence in range [0, 1]].
        Returns:
            note events with pitch bends, or the stacked pitch estimate matrix when to_midi is False
        """
        f0_matrix = []  # rows of [time (frame index), pitch (cents), confidence in range [0, 1]]
        note_events_with_pitch_bends = []
        for start_idx, end_idx, pitch_midi, amplitude in note_events:
            if timing_refinement_range:
                start_idx = np.max([0, start_idx - timing_refinement_range])
                end_idx = np.min([contours.shape[0], end_idx + timing_refinement_range])
            freq_idx = int(np.round(self.midi_pitch_to_contour_bin(pitch_midi)))
            freq_start_idx = np.max([freq_idx - self.labeling.f0_tolerance_bins, 0])
            freq_end_idx = np.min([self.labeling.f0_n_bins, freq_idx + self.labeling.f0_tolerance_bins + 1])
            trans_start_idx = np.max([0, self.labeling.f0_tolerance_bins - freq_idx])
            trans_end_idx = (2 * self.labeling.f0_tolerance_bins + 1) - \
                np.max([0, freq_idx - (self.labeling.f0_n_bins - self.labeling.f0_tolerance_bins - 1)])
            # apply a regional Viterbi decoding around the note's nominal pitch to estimate the intonation;
            # observation probabilities come from the f0 contour matrix
            observation = contours[start_idx:end_idx, freq_start_idx:freq_end_idx]
            observation = observation / observation.sum(axis=1)[:, None]
            observation[np.isnan(observation.sum(axis=1)), :] = np.ones(freq_end_idx - freq_start_idx) * 1 / (
                freq_end_idx - freq_start_idx)
            # transition probabilities assure continuity within the note
            transition = self.labeling.f0_transition_matrix[trans_start_idx:trans_end_idx,
                                                            trans_start_idx:trans_end_idx] + 1e-6
            transition = transition / np.sum(transition, axis=1)[:, None]
            path = viterbi_discriminative(observation.T / observation.sum(axis=1), transition) + freq_start_idx
            cents = np.array([self.labeling.f0_label2c(contours[i + start_idx, :], path[i]) for i in range(len(path))])
            bends = cents - self.labeling.midi_centers_c[pitch_midi - self.labeling.midi_centers[0]]
            if to_midi:
                bends = (bends * 4096 / 100).astype(int)  # 1 semitone = 4096 units under the +/-2 semitone bend range
                bends[bends > 8191] = 8191
                bends[bends < -8192] = -8192
                if timing_refinement_range:
                    confidences = np.array([contours[i + start_idx, path[i]] for i in range(len(path))])
                    threshold = np.median(confidences)
                    threshold = (np.median(confidences > threshold) + threshold) / 2  # some magic
                    median_kernel = 2 * (timing_refinement_range // 2) + 1  # some more magic
                    confidences = medfilt(confidences, kernel_size=median_kernel)
                    conf_bool = confidences > threshold
                    onset_idx = np.argmax(conf_bool)
                    offset_idx = len(confidences) - np.argmax(conf_bool[::-1])
                    bends = bends[onset_idx:offset_idx]
                    # trim the note to the confident region (compute end_idx before moving start_idx)
                    end_idx = start_idx + offset_idx
                    start_idx = start_idx + onset_idx
                note_events_with_pitch_bends.append((start_idx, end_idx, pitch_midi, amplitude, bends))
            else:
                confidences = np.array([contours[i + start_idx, path[i]] for i in range(len(path))])
                time_idx = np.arange(len(path)) + start_idx
                # f0_hz = self.labeling.f0_c2hz(cents)
                possible_f0s = np.array([time_idx, cents, confidences]).T
                f0_matrix.append(possible_f0s[np.abs(bends) < 100])  # discard frames deviating by a semitone or more
        if not to_midi:
            return np.vstack(f0_matrix)
        else:
            return note_events_with_pitch_bends

    def midi_pitch_to_contour_bin(self, pitch_midi: int) -> int:
        """Convert a MIDI pitch to the corresponding index in the contour matrix.
        Args:
            pitch_midi: pitch as a MIDI note number
        Returns:
            index of the closest f0 bin in the contour matrix
        """
        pitch_hz = librosa.midi_to_hz(pitch_midi)
        return np.argmin(np.abs(self.labeling.f0_centers_hz - pitch_hz))
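    # Example (illustrative): MIDI pitch 69 corresponds to 440 Hz, so the returned index is the f0 bin
    # whose center in `labeling.f0_centers_hz` lies closest to 440 Hz.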