import spaces import gradio as gr import torch from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer from string import punctuation import re import os from parler_tts import ParlerTTSForConditionalGeneration from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed device = "cuda:0" if torch.cuda.is_available() else "cpu" auth_token = os.environ.get("AUTH_TOKEN") auth_username = os.environ.get("AUTH_USERNAME") auth_password = os.environ.get("AUTH_PASSWORD") repo_id = "davidmeikle/german_parler_tts_mini_v0.1-alpha2" model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id, token=auth_token).to(device) tokenizer = AutoTokenizer.from_pretrained(repo_id, token=auth_token) feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id, token=auth_token) SAMPLE_RATE = feature_extractor.sampling_rate SEED = 42 default_text = "Eigene lage, wenn er Sullas absichten richtig beurteilte, eine so bedrohte, daß dergleichen Rücksichten kaum mehr in betracht kamen" default_description = "A male voice speaks in a monotone voice with a slightly slow delivery in a quiet room with barely any echo." examples = [ [ "Eigene lage, wenn er Sullas absichten richtig beurteilte, eine so bedrohte, daß dergleichen Rücksichten kaum mehr in betracht kamen", "A male voice speaks in a monotone voice with a slightly slow delivery in a quiet room with barely any echo.", None, ], [ "Die menschliche Stimme ist vor allem ein Musikinstrument.", "A male voice speaks in a monotone voice with a slightly slow delivery in a quiet room with barely any echo.", None, ], ] number_normalizer = EnglishNumberNormalizer() def preprocess(text): text = number_normalizer(text).strip() text = text.replace("-", " ") if text[-1] not in punctuation: text = f"{text}." abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b' def separate_abb(chunk): chunk = chunk.replace(".","") print(chunk) return " ".join(chunk) abbreviations = re.findall(abbreviations_pattern, text) for abv in abbreviations: if abv in text: text = text.replace(abv, separate_abb(abv)) return text @spaces.GPU def gen_tts(text, description): inputs = tokenizer(description.strip(), return_tensors="pt").to(device) prompt = tokenizer(preprocess(text), return_tensors="pt").to(device) set_seed(SEED) generation = model.generate( input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0 ) audio_arr = generation.cpu().numpy().squeeze() return SAMPLE_RATE, audio_arr css = """ #share-btn-container { display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem; margin-top: 10px; margin-left: auto; flex: unset !important; } #share-btn { all: initial; color: #ffffff; font-weight: 600; cursor: pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important; right:0; } #share-btn * { all: unset !important; } #share-btn-container div:nth-child(-n+2){ width: auto !important; min-height: 0px !important; } #share-btn-container .wrap { display: none !important; } """ with gr.Blocks(css=css) as block: gr.HTML( """
This model is a version of Parler-TTS Mini V1 trained for the German language.
The model has five voices to choose from: Ana (female), Brenda (female), Christof (male), Elena (female), and Hans (male)
Due to limitations on the volume of recordings in the dataset, this model may underperform for female voices at present.
By default, Parler-TTS generates random voice characteristics. To ensure speaker consistency across generations, aim to use consistent descriptions in your prompts.
Please Note: this model has not been trained to work in English, it will generate incoherent output.
""" ) with gr.Row(): with gr.Column(): input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text") description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description") run_button = gr.Button("Generate Audio", variant="primary") with gr.Column(): audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out") inputs = [input_text, description] outputs = [audio_out] run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True) gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True) gr.HTML( """Tips for ensuring good generation: