|
import os |
|
import gradio as gr |
|
import numpy as np |
|
import torch |
|
from pathlib import Path |
|
|
|
# Reinstall Gradio pinned to version 3.2 at startup.
# NOTE(review): `gradio` is already imported near the top of this file, before
# this reinstall runs, so the module loaded in this process may still be the
# previously installed version -- confirm this ordering is intended.
import subprocess
import sys

for _pip_args in (("uninstall", "-y", "gradio"), ("install", "gradio==3.2")):
    # Invoke pip through the running interpreter so the package is changed in
    # the environment this script actually uses (a bare `pip` on PATH may
    # belong to a different Python installation).
    _pip_status = subprocess.run(
        [sys.executable, "-m", "pip", *_pip_args], check=False
    ).returncode
    if _pip_status != 0:
        # Stay best-effort (startup continues), but do not hide the failure.
        print(
            "pip " + " ".join(_pip_args) + " exited with status " + str(_pip_status),
            file=sys.stderr,
        )
|
|
|
from demo_inference.demo_tts import DemoTTS |
|
from demo_inference.demo_asr import DemoASR |
|
from demo_inference.demo_anonymization import DemoAnonymizer |
|
|
|
|
|
def pcm2float(sig, dtype='float32'):
    """Convert integer PCM samples to floating point in the range [-1.0, 1.0).

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Args:
        sig: Array-like of signed or unsigned integer samples.
        dtype: Floating point dtype of the result (default ``'float32'``).

    Returns:
        Array of ``dtype`` in which the integer range of the input type is
        mapped onto [-1.0, 1.0).

    Raises:
        TypeError: If ``sig`` is not an integer array or ``dtype`` is not a
            floating point type.
    """
    samples = np.asarray(sig)
    if samples.dtype.kind not in ('i', 'u'):
        raise TypeError("'sig' must be an array of integers")

    target = np.dtype(dtype)
    if target.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    info = np.iinfo(samples.dtype)
    # Half of the integer type's value range, e.g. 32768 for int16.
    half_range = 1 << (info.bits - 1)
    # Midpoint of the integer range: 0 for signed types, half_range for unsigned.
    midpoint = info.min + half_range

    centered = samples.astype(target) - midpoint
    return centered / half_range
|
|
|
|
|
def float2pcm(sig, dtype='int16'):
    """Convert floating point samples to integer PCM.

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Args:
        sig: Array-like of floating point samples; [-1.0, 1.0) maps onto the
            full range of the target integer type.
        dtype: Integer dtype of the result (default ``'int16'``).

    Returns:
        Array of ``dtype``. Values that fall outside the representable range
        after scaling are clipped to the type's minimum/maximum; the final
        cast drops the fractional part.

    Raises:
        TypeError: If ``sig`` is not a float array or ``dtype`` is not an
            integer type.
    """
    samples = np.asarray(sig)
    if samples.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")

    target = np.dtype(dtype)
    if target.kind not in ('i', 'u'):
        raise TypeError("'dtype' must be an integer type")

    info = np.iinfo(target)
    # Half of the integer type's value range, e.g. 32768 for int16.
    half_range = 1 << (info.bits - 1)
    # Midpoint of the integer range: 0 for signed types, half_range for unsigned.
    midpoint = info.min + half_range

    scaled = samples * half_range + midpoint
    # Clip before casting so out-of-range values saturate instead of wrapping.
    return scaled.clip(info.min, info.max).astype(target)
|
|
|
|
|
class VPInterface:
    """Glue between the Gradio UI and the ASR, anonymization and TTS demo models.

    One instance holds the three models; ``read`` is the callback that turns a
    recorded utterance into synthesized speech with an anonymized speaker
    embedding.
    """

    # Sample rate reported alongside the synthesized audio.
    # NOTE(review): presumably the output rate of the TTS model -- confirm
    # against DemoTTS.
    OUTPUT_SAMPLE_RATE = 48000

    def __init__(self):
        """Pick the compute device and construct the TTS, ASR and anonymizer models."""
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.path_to_tts_models = Path('models', 'tts')
        self.path_to_asr_model = Path('models', 'asr')
        self.path_to_anon_model = Path('models', 'anonymization')

        self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models, device=self.device)
        self.asr_model = DemoASR(model_path=self.path_to_asr_model, device=self.device)
        # 'gan' is the initial anonymizer; `_check_models` swaps it on demand.
        self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model, model_tag='gan', device=self.device)

    def read(self, recording, anon_model_tag):
        """Anonymize a recording and return it as synthesized speech.

        Args:
            recording: ``(sample_rate, samples)`` pair as delivered by the
                audio input; ``samples`` must be an integer PCM array because
                it is converted with ``pcm2float``.
            anon_model_tag: Tag of the anonymization model to use; a different
                tag than the currently loaded one triggers a model swap.

        Returns:
            ``(OUTPUT_SAMPLE_RATE, samples)`` where ``samples`` is the
            synthesized audio converted to integer PCM by ``float2pcm``.

        Raises:
            ValueError: If ``recording`` is ``None`` (nothing was recorded).
            TypeError: Propagated from ``pcm2float`` if ``samples`` is not an
                integer array.
        """
        if recording is None:
            # Without this guard the tuple unpacking below fails with an
            # opaque "cannot unpack non-iterable NoneType" error.
            raise ValueError('No audio received; please record a sentence before submitting.')

        sr, audio = recording
        audio = pcm2float(audio)

        self._check_models(anon_model_tag)

        # The recognized text is handed to the synthesizer flagged as phonemes.
        text = self.asr_model.recognize_speech(audio, sr)
        speaker_embedding = self.anon_model.anonymize_embedding(audio, sr)
        syn_audio = self.synthesis_model.read_text(transcription=text, speaker_embedding=speaker_embedding,
                                                   text_is_phonemes=True)

        # NOTE(review): `.cpu().numpy()` suggests a torch tensor -- confirm.
        return self.OUTPUT_SAMPLE_RATE, float2pcm(syn_audio.cpu().numpy())

    def _check_models(self, anon_model_tag):
        """Replace ``self.anon_model`` if ``anon_model_tag`` differs from the loaded tag."""
        if anon_model_tag != self.anon_model.model_tag:
            self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model, model_tag=anon_model_tag,
                                             device=self.device)
|
|
|
|
|
# Build the interface object once at import time; its constructor instantiates
# the TTS, ASR and anonymization demo models.
model = VPInterface()
|
|
|
# Markdown-formatted text passed to gr.Interface as the `article` argument.
article = """
This demo allows you to anonymize your input speech by defining different anonymization models. If
you want to know more about each model, please read the paper linked above. Every time you click the *submit* button,
you should receive a new voice.

Note that for *pool* anonymization in this demo, we are using a different scaling approach (
sklearn.preprocessing.StandardScaler instead of sklearn.preprocessing.MinMaxScaler) because we are processing only
one sample at a time and would otherwise always end up with the same voice.

This demo is still work in progress, so please be lenient with possible low quality and errors. Also, be aware that
this Huggingface space runs on CPU which makes the demo quite slow.

For more information about this system, visit our Github page: [https://github.com/DigitalPhonetics/speaker-anonymization](https://github.com/DigitalPhonetics/speaker-anonymization/tree/gan_embeddings)
"""
|
|
|
# Markdown heading with the paper link, passed to gr.Interface as `description`.
description = """
## Test demo corresponding to the models in our paper [Anonymizing Speech with Generative Adversarial Networks to Preserve Speaker Privacy](https://arxiv.org/abs/2210.07002)
"""
|
|
|
# Stylesheet that colors the primary button green. CSS declarations inside a
# rule must be separated by ';' (a ',' makes the whole declaration invalid).
# NOTE(review): `css` is not passed to gr.Interface in the visible code, so it
# currently has no effect -- confirm whether it should be wired in.
css = """
.gr-button-primary {background-color: green !important; border-color: green}
"""
|
|
|
# Input widgets, in the order `model.read` receives them: the microphone
# recording first, then the tag of the anonymization model to apply.
recording_input = gr.inputs.Audio(source='microphone', type='numpy', label='Say a sentence in English.')
anonymizer_choice = gr.inputs.Dropdown(['gan', 'pool', 'random'], type='value', default='gan',
                                       label='Anonymization')

# Output widget for the (sample_rate, samples) pair returned by `model.read`.
synthesized_output = gr.outputs.Audio(type='numpy', label=None)

iface = gr.Interface(
    fn=model.read,
    inputs=[recording_input, anonymizer_choice],
    outputs=synthesized_output,
    layout='vertical',
    title='IMS Speaker Anonymization',
    description=description,
    theme='default',
    allow_flagging='never',
    article=article,
    allow_screenshot=False,
)

# Start the web app with request queueing enabled.
iface.launch(enable_queue=True)
|
|