Spaces:
Running
on
Zero
Running
on
Zero
File size: 6,553 Bytes
01d378b 7e0e946 80b5649 01d378b ca0426e 01d378b ca0426e 01d378b bf1019f bb1b982 bf1019f 01d378b 7e0e946 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
import spaces
import gradio as gr
import torch
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
from string import punctuation
import re
import os
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
# Run on the first CUDA device when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Values read from the environment; each is None when its variable is unset.
# auth_token is handed to every from_pretrained() call below, while the
# username/password pair is passed to the app's launch(auth=...) call.
auth_token = os.getenv("AUTH_TOKEN")
auth_username = os.getenv("AUTH_USERNAME")
auth_password = os.getenv("AUTH_PASSWORD")

# Repository the model, tokenizer and feature extractor are all loaded from.
repo_id = "davidmeikle/german_parler_tts_mini_v0.1-alpha2"

model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id, token=auth_token)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id, token=auth_token)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id, token=auth_token)

# Sampling rate returned alongside every generated waveform.
SAMPLE_RATE = feature_extractor.sampling_rate
# Fixed seed applied before each generation.
SEED = 42
# Values pre-filled in the "Input Text" and "Description" text boxes.
default_text = "Eigene lage, wenn er Sullas absichten richtig beurteilte, eine so bedrohte, daß dergleichen Rücksichten kaum mehr in betracht kamen"
default_description = "A male voice speaks in a monotone voice with a slightly slow delivery in a quiet room with barely any echo."

# Rows offered by the examples widget: [input text, description, None].
# Both rows use the default description; the first also uses the default text.
examples = [
    [default_text, default_description, None],
    [
        "Die menschliche Stimme ist vor allem ein Musikinstrument.",
        default_description,
        None,
    ],
]
# NOTE(review): this is the *English* number normalizer although the demo is
# for German text -- confirm that this is intended.
number_normalizer = EnglishNumberNormalizer()

# A capital letter followed by one or more capitals/dots, delimited by word
# boundaries (e.g. "USA", "U.S.A"); such runs are treated as abbreviations.
_ABBREVIATION_RE = re.compile(r'\b[A-Z][A-Z\.]+\b')


def _spell_out_abbreviation(match):
    """Return the matched abbreviation without dots, letters separated by spaces."""
    return " ".join(match.group().replace(".", ""))


def preprocess(text):
    """Prepare raw input text for the TTS prompt tokenizer.

    Steps, in order:
      1. pass the text through ``number_normalizer`` and strip outer whitespace;
      2. replace every hyphen with a space;
      3. append a full stop when the text does not already end with a
         punctuation character;
      4. spell out abbreviations letter by letter ("U.S.A" -> "U S A").

    Args:
        text: the text typed by the user.

    Returns:
        The cleaned-up text. Text that is empty after step 1 is returned as an
        empty string instead of raising ``IndexError``.
    """
    text = number_normalizer(text).strip()
    text = text.replace("-", " ")
    # Guard the index: an empty string has no last character to inspect.
    if text and text[-1] not in punctuation:
        text = f"{text}."
    # Substitute at the matched positions only, so a short abbreviation can
    # never rewrite part of a longer one (e.g. "US" inside "USA").
    return _ABBREVIATION_RE.sub(_spell_out_abbreviation, text)
@spaces.GPU
def gen_tts(text, description):
    """Synthesize speech for ``text`` conditioned on ``description``.

    Args:
        text: the text to speak; it is run through ``preprocess`` first.
        description: free-text description of the voice; stripped of outer
            whitespace before tokenization.

    Returns:
        A ``(SAMPLE_RATE, audio)`` tuple, where ``audio`` is the output of
        ``model.generate`` moved to the CPU, converted to NumPy and squeezed.
    """
    description_tokens = tokenizer(description.strip(), return_tensors="pt").to(device)
    prompt_tokens = tokenizer(preprocess(text), return_tensors="pt").to(device)

    # Seed right before sampling so every call starts from the same RNG state.
    set_seed(SEED)
    generate_kwargs = {
        "input_ids": description_tokens.input_ids,
        "prompt_input_ids": prompt_tokens.input_ids,
        "attention_mask": description_tokens.attention_mask,
        "prompt_attention_mask": prompt_tokens.attention_mask,
        "do_sample": True,
        "temperature": 1.0,
    }
    waveform = model.generate(**generate_kwargs).cpu().numpy().squeeze()
    return SAMPLE_RATE, waveform
# Custom stylesheet handed to gr.Blocks(css=css). All rules are scoped to the
# element ids "share-btn-container" and "share-btn".
# NOTE(review): no component visible in this file sets either id -- confirm
# whether these rules are still needed.
css = """
#share-btn-container {
display: flex;
padding-left: 0.5rem !important;
padding-right: 0.5rem !important;
background-color: #000000;
justify-content: center;
align-items: center;
border-radius: 9999px !important;
width: 13rem;
margin-top: 10px;
margin-left: auto;
flex: unset !important;
}
#share-btn {
all: initial;
color: #ffffff;
font-weight: 600;
cursor: pointer;
font-family: 'IBM Plex Sans', sans-serif;
margin-left: 0.5rem !important;
padding-top: 0.25rem !important;
padding-bottom: 0.25rem !important;
right:0;
}
#share-btn * {
all: unset !important;
}
#share-btn-container div:nth-child(-n+2){
width: auto !important;
min-height: 0px !important;
}
#share-btn-container .wrap {
display: none !important;
}
"""
# Build the demo page: title, model notes, a two-column input/output row,
# cached examples and usage tips. The HTML string bodies are kept flush-left
# so their contents stay exactly as they were.
with gr.Blocks(css=css) as block:
    # Page title.
    gr.HTML(
        """
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="
display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
"
>
<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
German Parler-TTS 🗣️
</h1>
</div>
</div>
"""
    )
    # Model description and caveats (plain string: it has no placeholders).
    gr.HTML(
        """
<p>This model is a version of <strong><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> Mini V1</strong> trained for the German language.</p>
<p>The model has five voices to choose from: Ana (female), Brenda (female), Christof (male), Elena (female), and Hans (male)</p>
<p>Due to limitations on the volume of recordings in the dataset, this model may underperform for female voices at present.</p>
<p>By default, Parler-TTS generates random voice characteristics. To ensure speaker consistency across generations, aim to use consistent descriptions in your prompts.</p>
<p><b>Please Note:</b> this model has not been trained to work in English, it will generate incoherent output. </p>
"""
    )
    with gr.Row():
        # Left column: the two text inputs and the trigger button.
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
            description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
            run_button = gr.Button("Generate Audio", variant="primary")
        # Right column: the generated audio.
        with gr.Column():
            audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")

    inputs = [input_text, description]
    outputs = [audio_out]
    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
    # NOTE(review): each example row holds three values but only two input
    # components are listed here -- confirm the trailing None is intentional.
    gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
    gr.HTML(
        """
<p>Tips for ensuring good generation:
<ul>
<li>Include the term "very clear audio" to generate the highest quality audio, and "very noisy audio" for high levels of background noise</li>
<li>Punctuation can be used to control the prosody of the generations, e.g. use commas to add small breaks in speech</li>
<li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
</ul>
</p>
"""
    )

block.queue()
# NOTE(review): auth_username / auth_password are None when their environment
# variables are unset -- confirm how launch() should behave in that case.
block.launch(share=True, auth=(auth_username, auth_password))