File size: 6,553 Bytes
01d378b
 
 
 
 
 
 
 
 
 
 
 
 
 
7e0e946
 
80b5649
01d378b
 
 
 
 
 
 
 
 
ca0426e
01d378b
 
ca0426e
 
 
 
 
 
 
 
 
 
 
01d378b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf1019f
bb1b982
bf1019f
 
 
01d378b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e0e946
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import spaces
import gradio as gr
import torch
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
from string import punctuation
import re
import os

from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Credentials are read from the environment; each is None when its variable
# is not set.
auth_token = os.getenv("AUTH_TOKEN")
auth_username = os.getenv("AUTH_USERNAME")
auth_password = os.getenv("AUTH_PASSWORD")
repo_id = "davidmeikle/german_parler_tts_mini_v0.1-alpha2"

# Model, tokenizer and feature extractor are all loaded from the same
# repository, using the same access token.
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id, token=auth_token)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id, token=auth_token)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id, token=auth_token)

# Sampling rate reported alongside every generated waveform.
SAMPLE_RATE = feature_extractor.sampling_rate
# Fixed seed applied before each generation.
SEED = 42

# Values the two input text boxes are pre-filled with.
default_text = "Eigene lage, wenn er Sullas absichten richtig beurteilte, eine so bedrohte, daß dergleichen Rücksichten kaum mehr in betracht kamen"
default_description = "A male voice speaks in a monotone voice with a slightly slow delivery in a quiet room with barely any echo."

# Example rows offered in the UI; each row is [text, description, None].
examples = [
    [default_text, default_description, None],
    [
        "Die menschliche Stimme ist vor allem ein Musikinstrument.",
        default_description,
        None,
    ],
]

# Text normaliser applied to the prompt in `preprocess`.
number_normalizer = EnglishNumberNormalizer()

def preprocess(text):
    """Normalise raw input text before it is tokenised as the TTS prompt.

    The steps run in this order:

    1. Pass the text through the module-level ``number_normalizer`` and strip
       surrounding whitespace.
    2. Replace hyphens with spaces.
    3. Append a full stop when the text does not already end in punctuation.
    4. Spell out upper-case abbreviations letter by letter, dropping any dots
       inside them (``"U.S.A"`` becomes ``"U S A"``).

    Args:
        text: The text to be spoken, as entered by the user.

    Returns:
        The normalised text. Input that is empty after normalisation is
        returned as an empty string instead of raising ``IndexError``.
    """
    text = number_normalizer(text).strip()
    text = text.replace("-", " ")
    if not text:
        # Nothing to punctuate or expand; indexing text[-1] would fail here.
        return text
    if text[-1] not in punctuation:
        text = f"{text}."

    def separate_abb(match):
        # "U.S.A" -> "USA" -> "U S A"
        return " ".join(match.group(0).replace(".", ""))

    # Substituting match by match keeps the rewrite confined to the matched
    # token. A plain str.replace of each found abbreviation would also rewrite
    # the same letters inside longer tokens (e.g. "AB" inside "ABC").
    return re.sub(r'\b[A-Z][A-Z\.]+\b', separate_abb, text)

@spaces.GPU
def gen_tts(text, description):
    """Generate speech for ``text`` in the voice described by ``description``.

    Args:
        text: The text to be spoken; it is run through ``preprocess`` before
            being tokenised.
        description: Free-text description of the desired voice; surrounding
            whitespace is stripped before tokenising.

    Returns:
        A ``(SAMPLE_RATE, audio_arr)`` tuple, where ``audio_arr`` is the
        generated output moved to the CPU, converted to a NumPy array and
        squeezed.
    """
    description_tokens = tokenizer(description.strip(), return_tensors="pt").to(device)
    prompt_tokens = tokenizer(preprocess(text), return_tensors="pt").to(device)

    generate_kwargs = {
        "input_ids": description_tokens.input_ids,
        "prompt_input_ids": prompt_tokens.input_ids,
        "attention_mask": description_tokens.attention_mask,
        "prompt_attention_mask": prompt_tokens.attention_mask,
        "do_sample": True,
        "temperature": 1.0,
    }

    # Re-seed right before sampling so every call starts from the same seed.
    set_seed(SEED)
    waveform = model.generate(**generate_kwargs)

    return SAMPLE_RATE, waveform.cpu().numpy().squeeze()


# Custom stylesheet passed to gr.Blocks below. All rules target the
# #share-btn-container / #share-btn element ids.
css = """
        #share-btn-container {
            display: flex;
            padding-left: 0.5rem !important;
            padding-right: 0.5rem !important;
            background-color: #000000;
            justify-content: center;
            align-items: center;
            border-radius: 9999px !important; 
            width: 13rem;
            margin-top: 10px;
            margin-left: auto;
            flex: unset !important;
        }
        #share-btn {
            all: initial;
            color: #ffffff;
            font-weight: 600;
            cursor: pointer;
            font-family: 'IBM Plex Sans', sans-serif;
            margin-left: 0.5rem !important;
            padding-top: 0.25rem !important;
            padding-bottom: 0.25rem !important;
            right:0;
        }
        #share-btn * {
            all: unset !important;
        }
        #share-btn-container div:nth-child(-n+2){
            width: auto !important;
            min-height: 0px !important;
        }
        #share-btn-container .wrap {
            display: none !important;
        }
"""
# Assemble the demo page. Components are created inside the nested
# Blocks / Row / Column context managers, so the statement order below is
# significant and should not be rearranged casually.
with gr.Blocks(css=css) as block:
    # Page title.
    gr.HTML(
        """
            <div style="text-align: center; max-width: 700px; margin: 0 auto;">
              <div
                style="
                  display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
                "
              >
                <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
                  German Parler-TTS 🗣️
                </h1>
              </div>
            </div>
        """
    )
    # Short description of the model, its voices and its known limitations.
    # NOTE(review): this literal is an f-string but contains no placeholders.
    gr.HTML(
        f"""
        <p>This model is a version of <strong><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> Mini V1</strong> trained for the German language.</p>
        <p>The model has five voices to choose from: Ana (female), Brenda (female), Christof (male), Elena (female), and Hans (male)</p>
        <p>Due to limitations on the volume of recordings in the dataset, this model may underperform for female voices at present.</p>
        <p>By default, Parler-TTS generates random voice characteristics. To ensure speaker consistency across generations, aim to use consistent descriptions in your prompts.</p>
        <p><b>Please Note:</b> this model has not been trained to work in English, it will generate incoherent output. </p>
        """
    )
    # One row with two columns: the text inputs and the trigger button in the
    # first, the generated audio in the second.
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
            description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")

    # The same inputs and output are wired to both the button and the examples.
    inputs = [input_text, description]
    outputs = [audio_out]
    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
    # NOTE(review): each row of `examples` holds three values while only two
    # input components are listed here - confirm the trailing None is unused.
    gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
    # Prompting tips rendered after the examples.
    gr.HTML(
        """
        <p>Tips for ensuring good generation:
        <ul>
            <li>Include the term "very clear audio" to generate the highest quality audio, and "very noisy audio" for high levels of background noise</li>
            <li>Punctuation can be used to control the prosody of the generations, e.g. use commas to add small breaks in speech</li>
            <li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
        </ul>
        </p>
        """
    )


# Enable the request queue, then start the app with a share link and
# username/password login taken from the environment-derived credentials.
block.queue().launch(share=True, auth=(auth_username, auth_password))