|
import numpy as np |
|
import torch |
|
import torchaudio |
|
import pyloudnorm as pyln |
|
from speechbrain.pretrained import EncoderClassifier |
|
|
|
from IMSToucan.Preprocessing.AudioPreprocessor import AudioPreprocessor |
|
|
|
# Speaker-embedding types accepted by DemoSpeakerEmbeddings. The class picks its
# encoders by substring match on this value, so 'ecapa+xvector' loads both the
# ECAPA and the x-vector encoder.
VALID_VEC_TYPES = {'xvector', 'ecapa', 'ecapa+xvector'}
|
|
|
|
|
class DemoSpeakerEmbeddings:
    """Extract speaker embeddings from raw audio with pretrained SpeechBrain encoders.

    Depending on ``vec_type`` an ECAPA encoder, an x-vector encoder, or both are
    loaded. When both are loaded, their embeddings are concatenated (ECAPA first).
    """

    # Input sample rate the default preprocessor (``self.ap``) is configured for.
    DEFAULT_INPUT_SR = 48000
    # Sample rate handed to the encoders (the preprocessor's ``output_sr``).
    TARGET_SR = 16000

    def __init__(self, vec_type='xvector', device=torch.device('cpu')):
        """Load the encoders selected by ``vec_type`` and build the default preprocessor.

        Args:
            vec_type: One of ``VALID_VEC_TYPES``.
            device: Device passed to the encoders via ``run_opts`` and used for
                tensors in ``_normalize_wave``.

        Raises:
            ValueError: If ``vec_type`` is not one of ``VALID_VEC_TYPES``.
        """
        # Validate with an explicit exception: ``assert`` is stripped under ``-O``.
        if vec_type not in VALID_VEC_TYPES:
            raise ValueError(f'Invalid vec_type {vec_type}, must be one of {VALID_VEC_TYPES}')
        self.vec_type = vec_type
        self.device = device

        # Substring checks so that 'ecapa+xvector' selects both encoders.
        self.encoders = []
        if 'ecapa' in self.vec_type:
            self.encoders.append(self._load_encoder('spkrec-ecapa-voxceleb'))
        if 'xvector' in self.vec_type:
            self.encoders.append(self._load_encoder('spkrec-xvect-voxceleb'))

        self.ap = self._build_preprocessor(self.DEFAULT_INPUT_SR)
        # Preprocessors for input rates other than DEFAULT_INPUT_SR, built lazily.
        self._preprocessors = {}

    def _load_encoder(self, model_name):
        """Load the pretrained SpeechBrain encoder ``speechbrain/<model_name>``."""
        return EncoderClassifier.from_hparams(source=f'speechbrain/{model_name}',
                                              savedir=f'models/speaker_embeddings/{model_name}',
                                              run_opts={'device': self.device})

    def _build_preprocessor(self, input_sr):
        """Create an AudioPreprocessor for ``input_sr`` with this class's fixed settings."""
        return AudioPreprocessor(input_sr=input_sr, output_sr=self.TARGET_SR, melspec_buckets=80,
                                 hop_length=256, n_fft=1024, cut_silence=False)

    def _preprocessor_for(self, sr):
        """Return the preprocessor matching input rate ``sr`` (``self.ap`` for the default)."""
        if sr is None or sr == self.DEFAULT_INPUT_SR:
            return self.ap
        key = int(sr)
        if key not in self._preprocessors:
            self._preprocessors[key] = self._build_preprocessor(key)
        return self._preprocessors[key]

    def extract_vector_from_audio(self, wave, sr):
        """Compute the speaker embedding of one utterance.

        Args:
            wave: Audio samples, passed unchanged to the preprocessor's
                ``audio_to_wave_tensor``.
            sr: Sample rate of ``wave``. A preprocessor configured for this rate
                is used, so audio that is not at 48 kHz is no longer treated as
                if it were. ``None`` falls back to the default preprocessor.

        Returns:
            The squeezed encoder output; with two encoders, both outputs
            concatenated along dim 0.
        """
        preprocessor = self._preprocessor_for(sr)
        # NOTE(review): presumably this loudness-normalizes and resamples to
        # TARGET_SR; the result must be a CPU tensor for ``.numpy()`` below.
        norm_wave = preprocessor.audio_to_wave_tensor(normalize=True, audio=wave)
        # Drop leading and trailing zero samples.
        norm_wave = torch.tensor(np.trim_zeros(norm_wave.numpy()))

        batch = norm_wave.unsqueeze(0)  # add the batch dimension for encode_batch
        spk_embs = [encoder.encode_batch(wavs=batch).squeeze() for encoder in self.encoders]
        if len(spk_embs) == 1:
            return spk_embs[0]
        return torch.cat(spk_embs, dim=0)

    def _normalize_wave(self, wave, sr):
        """Loudness- and peak-normalize ``wave`` and resample it to TARGET_SR.

        Args:
            wave: Audio samples (array-like or tensor); singleton dimensions are
                squeezed away before processing.
            sr: Sample rate of ``wave``.

        Returns:
            A float32 CPU tensor at TARGET_SR. Silent or empty input is returned
            without loudness/peak normalization instead of producing NaNs.
        """
        # Squeeze BEFORE measuring the duration so a (1, N) input is not
        # mistaken for a one-sample signal.
        samples = np.atleast_1d(torch.as_tensor(wave).detach().squeeze().cpu().numpy())

        # All-zero audio has -inf loudness and a zero peak; normalizing it would
        # yield NaNs, so it is passed through untouched.
        if np.any(samples):
            dur = samples.shape[0] / sr
            # The meter's block must be shorter than the audio, so shrink the
            # 0.4 s default for very short clips.
            block_size = min(dur - 0.0001, abs(dur - 0.1)) if dur < 0.4 else 0.4
            meter = pyln.Meter(sr, block_size=block_size)
            loudness = meter.integrated_loudness(samples)
            loud_normed = pyln.normalize.loudness(samples, loudness, -30.0)
            peak = np.amax(np.abs(loud_normed))
            samples = np.divide(loud_normed, peak)

        # torch.tensor copies, so the result never aliases the caller's buffer.
        wave = torch.tensor(samples, dtype=torch.float32).to(self.device)

        if sr != self.TARGET_SR:
            wave = torchaudio.transforms.Resample(orig_freq=sr, new_freq=self.TARGET_SR).to(self.device)(wave)

        return wave.cpu()
|
|