In [1]:
import librosa
import numpy as np
import torch
from IPython.display import Audio
from transformers import VitsModel, VitsTokenizer, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the first CUDA GPU with half precision when one is available;
# otherwise fall back to the CPU with full precision.
device, torch_dtype = (
    ("cuda:0", torch.float16) if torch.cuda.is_available() else ("cpu", torch.float32)
)
print(f"Using {device} with dtype {torch_dtype}")

# Text-to-speech checkpoint and its tokenizer. No device or dtype is passed
# here, so they are loaded with the library defaults.
model = VitsModel.from_pretrained("facebook/mms-tts-zlm")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-zlm")

# Speech-recognition pipeline, placed on the device/dtype selected above.
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device=device,
    torch_dtype=torch_dtype,
)

Using cuda:0 with dtype torch.float16


Some weights of the model checkpoint at facebook/mms-tts-zlm were not used when initializing VitsModel: ['flow.flows.0.wavenet.in_layers.0.weight_g', 'flow.flows.0.wavenet.in_layers.0.weight_v', 'flow.flows.0.wavenet.in_layers.1.weight_g', 'flow.flows.0.wavenet.in_layers.1.weight_v', 'flow.flows.0.wavenet.in_layers.2.weight_g', 'flow.flows.0.wavenet.in_layers.2.weight_v', 'flow.flows.0.wavenet.in_layers.3.weight_g', 'flow.flows.0.wavenet.in_layers.3.weight_v', 'flow.flows.0.wavenet.res_skip_layers.0.weight_g', 'flow.flows.0.wavenet.res_skip_layers.0.weight_v', 'flow.flows.0.wavenet.res_skip_layers.1.weight_g', 'flow.flows.0.wavenet.res_skip_layers.1.weight_v', 'flow.flows.0.wavenet.res_skip_layers.2.weight_g', 'flow.flows.0.wavenet.res_skip_layers.2.weight_v', 'flow.flows.0.wavenet.res_skip_layers.3.weight_g', 'flow.flows.0.wavenet.res_skip_layers.3.weight_v', 'flow.flows.1.wavenet.in_layers.0.weight_g', 'flow.flows.1.wavenet.in_layers.0.weight_v', 'flow.flows.1.wavenet.in_layers.1.wei

In [21]:
def synthesise(text):
    """Generate speech for ``text`` with the notebook's TTS model.

    Args:
        text: Text to speak. It is encoded by the global ``tokenizer`` with
            ``return_tensors="pt"``.

    Returns:
        The ``"waveform"`` entry of the global ``model``'s output for the
        encoded ``input_ids``.
        NOTE(review): presumably a float tensor shaped (batch, samples) --
        confirm against the VitsModel documentation.
    """
    encoded = tokenizer(text=text, return_tensors="pt")

    # Pure inference: skip gradient tracking for the forward pass.
    with torch.no_grad():
        return model(encoded["input_ids"])["waveform"]

def translate(audio):
    """Run ``audio`` through the global ``asr_pipe`` and return its text.

    The pipeline is called with ``max_new_tokens=256`` and generation options
    ``task="transcribe"`` / ``language="ms"``.
    NOTE(review): despite the function name, the task requested is
    "transcribe" with a forced language rather than "translate"; this looks
    like a deliberate way to obtain text in that language -- confirm intent.

    Args:
        audio: Input forwarded unchanged to ``asr_pipe``.

    Returns:
        The ``"text"`` entry of the pipeline result.
    """
    generation_options = {"task": "transcribe", "language": "ms"}
    result = asr_pipe(audio, max_new_tokens=256, generate_kwargs=generation_options)
    return result["text"]

def speech_to_speech_translation(audio):
    """Convert input speech into synthesised speech of the recognised text.

    The audio is passed to ``translate`` to obtain text, the text is passed to
    ``synthesise`` to obtain a waveform, and the waveform is converted to
    16-bit PCM.

    Args:
        audio: Input forwarded unchanged to ``translate``.

    Returns:
        A ``(sampling_rate, samples)`` tuple where ``sampling_rate`` is the
        constant ``16000`` and ``samples`` is an ``int16`` NumPy array with
        the same shape as the waveform returned by ``synthesise``.
        NOTE(review): 16000 is assumed to match the TTS model's output rate --
        confirm against ``model.config.sampling_rate``.
    """
    translated_text = translate(audio)
    waveform = synthesise(translated_text).numpy()

    # Clamp to [-1, 1] before scaling: a sample outside that range would
    # exceed the int16 range after the multiplication and corrupt the audio.
    pcm = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)
    return 16000, pcm

In [11]:
# Load the input recording, asking librosa for a 16 kHz sample rate
# (sr=16_000); `x` receives the samples and `sr` the rate.
x, sr = librosa.load("audio.wav", sr=16_000)

In [12]:
# Play back the loaded input clip at its sample rate.
Audio(x, rate=sr)

In [13]:
# Step-by-step run of the two stages: speech -> text, then text -> speech.
text = translate(x)
print(text)
# `tts` is kept as a notebook-level variable; later cells inspect it.
tts = synthesise(text)

# Listen to the raw synthesiser output at 16 kHz.
Audio(tts, rate=16_000)

 Pada bab 16. Saya mungkin telah memberitahu anda tentang permulaan penyelidikan ini dalam beberapa lirik. Tetapi saya mahu anda melihat setiap langkah dengan mana kami datang. Saya juga setuju dengan apa-apa pun yang Marguerite mahukan.


In [19]:
# Preview the 16-bit PCM conversion: scale the synthesised waveform by 32767
# and cast to int16. (numpy is imported in the notebook's first cell.)
(tts.numpy() * 32767).astype(np.int16)

array([[-8, -8, -5, ..., -4, -2, -3]], dtype=int16)

In [20]:
# Check that the int16-converted waveform still plays correctly at 16 kHz.
Audio((tts.numpy() * 32767).astype(np.int16), rate=16_000)

In [22]:
# Run the full pipeline on the input clip. The result is bound to new names so
# that `x`/`sr` keep holding the original recording and this cell can be
# re-run without feeding its own output back in.
translated_sr, translated_audio = speech_to_speech_translation(x)
Audio(translated_audio, rate=translated_sr)