Spaces:
Sleeping
Sleeping
Commit
·
d86bc7f
1
Parent(s):
496bf8a
new demo
Browse files- app.py +37 -183
- requirements.txt +2 -0
app.py
CHANGED
@@ -13,19 +13,18 @@ from parler_tts import ParlerTTSForConditionalGeneration
|
|
13 |
from pydub import AudioSegment
|
14 |
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
|
15 |
from transformers.generation.streamers import BaseStreamer
|
|
|
16 |
|
17 |
device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
|
18 |
torch_dtype = torch.float16 if device != "cpu" else torch.float32
|
19 |
|
20 |
repo_id = "parler-tts/parler_tts_mini_v0.1"
|
21 |
-
jenny_repo_id = "ylacombe/parler-tts-mini-jenny-30H"
|
22 |
|
23 |
model = ParlerTTSForConditionalGeneration.from_pretrained(
|
24 |
repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
|
25 |
).to(device)
|
26 |
-
|
27 |
-
|
28 |
-
).to(device)
|
29 |
|
30 |
tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
31 |
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
|
@@ -33,53 +32,6 @@ feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
|
|
33 |
SAMPLE_RATE = feature_extractor.sampling_rate
|
34 |
SEED = 42
|
35 |
|
36 |
-
default_text = "Please surprise me and speak in whatever voice you enjoy."
|
37 |
-
examples = [
|
38 |
-
[
|
39 |
-
"Remember - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
|
40 |
-
"A male speaker with a low-pitched voice delivering his words at a fast pace in a small, confined space with a very clear audio and an animated tone.",
|
41 |
-
3.0,
|
42 |
-
],
|
43 |
-
[
|
44 |
-
"'This is the best time of my life, Bartley,' she said happily.",
|
45 |
-
"A female speaker with a slightly low-pitched, quite monotone voice delivers her words at a slightly faster-than-average pace in a confined space with very clear audio.",
|
46 |
-
3.0,
|
47 |
-
],
|
48 |
-
[
|
49 |
-
"Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
|
50 |
-
"A male speaker with a slightly high-pitched voice delivering his words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
|
51 |
-
3.0,
|
52 |
-
],
|
53 |
-
[
|
54 |
-
"Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
|
55 |
-
"A male speaker with a low-pitched voice delivers his words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
|
56 |
-
3.0,
|
57 |
-
],
|
58 |
-
]
|
59 |
-
|
60 |
-
jenny_examples = [
|
61 |
-
[
|
62 |
-
"Remember, this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
|
63 |
-
"Jenny speaks at an average pace with a slightly animated delivery in a very confined sounding environment with clear audio quality.",
|
64 |
-
3.0,
|
65 |
-
],
|
66 |
-
[
|
67 |
-
"'This is the best time of my life, Bartley,' she said happily.",
|
68 |
-
"Jenny speaks in quite a monotone voice at a slightly faster-than-average pace in a confined space with very clear audio.",
|
69 |
-
3.0,
|
70 |
-
],
|
71 |
-
[
|
72 |
-
"Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
|
73 |
-
"Jenny delivers her words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
|
74 |
-
3.0,
|
75 |
-
],
|
76 |
-
[
|
77 |
-
"Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
|
78 |
-
"Jenny delivers her words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
|
79 |
-
3.0,
|
80 |
-
],
|
81 |
-
]
|
82 |
-
|
83 |
|
84 |
class ParlerTTSStreamer(BaseStreamer):
|
85 |
def __init__(
|
@@ -238,13 +190,28 @@ def numpy_to_mp3(audio_array, sampling_rate):
|
|
238 |
sampling_rate = model.audio_encoder.config.sampling_rate
|
239 |
frame_rate = model.audio_encoder.config.frame_rate
|
240 |
|
|
|
|
|
241 |
@spaces.GPU
|
242 |
-
def generate_base(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
243 |
play_steps = int(frame_rate * play_steps_in_s)
|
244 |
streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
|
245 |
|
|
|
246 |
inputs = tokenizer(description, return_tensors="pt").to(device)
|
247 |
-
prompt = tokenizer(
|
248 |
|
249 |
generation_kwargs = dict(
|
250 |
input_ids=inputs.input_ids,
|
@@ -259,145 +226,32 @@ def generate_base(text, description, play_steps_in_s=2.0):
|
|
259 |
thread = Thread(target=model.generate, kwargs=generation_kwargs)
|
260 |
thread.start()
|
261 |
|
|
|
|
|
|
|
262 |
for new_audio in streamer:
|
263 |
print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
|
264 |
-
yield numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
|
265 |
|
266 |
-
@spaces.GPU
|
267 |
-
def generate_jenny(text, description, play_steps_in_s=2.0):
|
268 |
-
play_steps = int(frame_rate * play_steps_in_s)
|
269 |
-
streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
|
270 |
|
271 |
-
inputs = tokenizer(description, return_tensors="pt").to(device)
|
272 |
-
prompt = tokenizer(text, return_tensors="pt").to(device)
|
273 |
-
|
274 |
-
generation_kwargs = dict(
|
275 |
-
input_ids=inputs.input_ids,
|
276 |
-
prompt_input_ids=prompt.input_ids,
|
277 |
-
streamer=streamer,
|
278 |
-
do_sample=True,
|
279 |
-
temperature=1.0,
|
280 |
-
min_new_tokens=10,
|
281 |
-
)
|
282 |
-
|
283 |
-
set_seed(SEED)
|
284 |
-
thread = Thread(target=jenny_model.generate, kwargs=generation_kwargs)
|
285 |
-
thread.start()
|
286 |
-
|
287 |
-
for new_audio in streamer:
|
288 |
-
print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
|
289 |
-
yield sampling_rate, new_audio
|
290 |
-
|
291 |
-
|
292 |
-
css = """
|
293 |
-
#share-btn-container {
|
294 |
-
display: flex;
|
295 |
-
padding-left: 0.5rem !important;
|
296 |
-
padding-right: 0.5rem !important;
|
297 |
-
background-color: #000000;
|
298 |
-
justify-content: center;
|
299 |
-
align-items: center;
|
300 |
-
border-radius: 9999px !important;
|
301 |
-
width: 13rem;
|
302 |
-
margin-top: 10px;
|
303 |
-
margin-left: auto;
|
304 |
-
flex: unset !important;
|
305 |
-
}
|
306 |
-
#share-btn {
|
307 |
-
all: initial;
|
308 |
-
color: #ffffff;
|
309 |
-
font-weight: 600;
|
310 |
-
cursor: pointer;
|
311 |
-
font-family: 'IBM Plex Sans', sans-serif;
|
312 |
-
margin-left: 0.5rem !important;
|
313 |
-
padding-top: 0.25rem !important;
|
314 |
-
padding-bottom: 0.25rem !important;
|
315 |
-
right:0;
|
316 |
-
}
|
317 |
-
#share-btn * {
|
318 |
-
all: unset !important;
|
319 |
-
}
|
320 |
-
#share-btn-container div:nth-child(-n+2){
|
321 |
-
width: auto !important;
|
322 |
-
min-height: 0px !important;
|
323 |
-
}
|
324 |
-
#share-btn-container .wrap {
|
325 |
-
display: none !important;
|
326 |
-
}
|
327 |
-
"""
|
328 |
with gr.Blocks(css=css) as block:
|
329 |
-
gr.HTML(
|
330 |
-
"""
|
331 |
-
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
|
332 |
-
<div
|
333 |
-
style="
|
334 |
-
display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
|
335 |
-
"
|
336 |
-
>
|
337 |
-
<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
|
338 |
-
Parler-TTS 🗣️
|
339 |
-
</h1>
|
340 |
-
</div>
|
341 |
-
</div>
|
342 |
-
"""
|
343 |
-
)
|
344 |
gr.HTML(
|
345 |
f"""
|
346 |
-
<
|
347 |
-
|
348 |
-
is the first iteration model trained using 10k hours of narrated audiobooks, and <a href="https://huggingface.co/ylacombe/parler-tts-mini-jenny-30H"> Parler-TTS Jenny</a>,
|
349 |
-
a model fine-tuned on the <a href="https://huggingface.co/datasets/reach-vb/jenny_tts_dataset"> Jenny dataset</a>.
|
350 |
-
Both models generates high-quality speech with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation).</p>
|
351 |
-
|
352 |
-
<p>Tips for ensuring good generation:
|
353 |
-
<ul>
|
354 |
-
<li>Include the term <b>"very clear audio"</b> to generate the highest quality audio, and "very noisy audio" for high levels of background noise</li>
|
355 |
-
<li>When using the fine-tuned model, include the term <b>"Jenny"</b> to pick out her voice</li>
|
356 |
-
<li>Punctuation can be used to control the prosody of the generations, e.g. use commas to add small breaks in speech</li>
|
357 |
-
<li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
|
358 |
-
</ul>
|
359 |
-
</p>
|
360 |
-
"""
|
361 |
-
)
|
362 |
-
with gr.Tab("Base"):
|
363 |
-
with gr.Row():
|
364 |
-
with gr.Column():
|
365 |
-
input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
|
366 |
-
description = gr.Textbox(label="Description", lines=2, value="", elem_id="input_description")
|
367 |
-
play_seconds = gr.Slider(3.0, 7.0, value=3.0, step=2, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps")
|
368 |
-
run_button = gr.Button("Generate Audio", variant="primary")
|
369 |
-
with gr.Column():
|
370 |
-
audio_out = gr.Audio(label="Parler-TTS generation", format="mp3", elem_id="audio_out", streaming=True, autoplay=True)
|
371 |
-
|
372 |
-
inputs = [input_text, description, play_seconds]
|
373 |
-
outputs = [audio_out]
|
374 |
-
gr.Examples(examples=examples, fn=generate_base, inputs=inputs, outputs=outputs, cache_examples=False)
|
375 |
-
run_button.click(fn=generate_base, inputs=inputs, outputs=outputs, queue=True)
|
376 |
-
|
377 |
-
with gr.Tab("Jenny"):
|
378 |
-
with gr.Row():
|
379 |
-
with gr.Column():
|
380 |
-
input_text = gr.Textbox(label="Input Text", lines=2, value=jenny_examples[0][0], elem_id="input_text")
|
381 |
-
description = gr.Textbox(label="Description", lines=2, value=jenny_examples[0][1], elem_id="input_description")
|
382 |
-
play_seconds = gr.Slider(3.0, 7.0, value=jenny_examples[0][2], step=2, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps")
|
383 |
-
run_button = gr.Button("Generate Audio", variant="primary")
|
384 |
-
with gr.Column():
|
385 |
-
audio_out = gr.Audio(label="Parler-TTS generation", format="mp3", elem_id="audio_out", streaming=True, autoplay=True)
|
386 |
-
|
387 |
-
inputs = [input_text, description, play_seconds]
|
388 |
-
outputs = [audio_out]
|
389 |
-
gr.Examples(examples=jenny_examples, fn=generate_jenny, inputs=inputs, outputs=outputs, cache_examples=False)
|
390 |
-
run_button.click(fn=generate_jenny, inputs=inputs, outputs=outputs, queue=True)
|
391 |
-
|
392 |
-
gr.HTML(
|
393 |
-
"""
|
394 |
-
<p>To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data to 50k hours of speech.
|
395 |
-
The v1 release of the model will be trained on this data, as well as inference optimisations, such as flash attention
|
396 |
-
and torch compile, that will improve the latency by 2-4x. If you want to find out more about how this model was trained and even fine-tune it yourself, check-out the
|
397 |
-
<a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a> repository on GitHub. The Parler-TTS codebase and its
|
398 |
-
associated checkpoints are licensed under <a href='https://github.com/huggingface/parler-tts?tab=Apache-2.0-1-ov-file#readme'> Apache 2.0</a>.</p>
|
399 |
"""
|
400 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
401 |
|
402 |
block.queue()
|
403 |
block.launch(share=True)
|
|
|
13 |
from pydub import AudioSegment
|
14 |
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
|
15 |
from transformers.generation.streamers import BaseStreamer
|
16 |
+
from huggingface_hub import InferenceClient
|
17 |
|
18 |
device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
|
19 |
torch_dtype = torch.float16 if device != "cpu" else torch.float32
|
20 |
|
21 |
repo_id = "parler-tts/parler_tts_mini_v0.1"
|
|
|
22 |
|
23 |
model = ParlerTTSForConditionalGeneration.from_pretrained(
|
24 |
repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
|
25 |
).to(device)
|
26 |
+
|
27 |
+
client = InferenceClient()
|
|
|
28 |
|
29 |
tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
30 |
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
|
|
|
32 |
SAMPLE_RATE = feature_extractor.sampling_rate
|
33 |
SEED = 42
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
class ParlerTTSStreamer(BaseStreamer):
|
37 |
def __init__(
|
|
|
190 |
sampling_rate = model.audio_encoder.config.sampling_rate
|
191 |
frame_rate = model.audio_encoder.config.frame_rate
|
192 |
|
193 |
+
import random
|
194 |
+
|
195 |
@spaces.GPU
|
196 |
+
def generate_base(subject, setting, ):
|
197 |
+
|
198 |
+
messages = [{"role": "system", "content": ("You are an award-winning children's bedtime story author lauded for your inventive stories. "
|
199 |
+
"You want to write a bed time story for your child. They will give you the subject and setting "
|
200 |
+
"and you will write the entire story. It should be targeted at children 5 and younger and take about "
|
201 |
+
"a minute to read")},
|
202 |
+
{"role": "user", "content": f"Please tell me a story about a {subject} in {setting}"}]
|
203 |
+
gr.Info("Generating story", duration=3)
|
204 |
+
response = client.chat_completion(messages, max_tokens=2048, seed=random.randint(1, 5000))
|
205 |
+
gr.Info("Story Generated", duration=3)
|
206 |
+
story = response.choices[0].message.content
|
207 |
+
|
208 |
+
play_steps_in_s = 2.0
|
209 |
play_steps = int(frame_rate * play_steps_in_s)
|
210 |
streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
|
211 |
|
212 |
+
description = "A female speaker with a calm, warm, monotone voice delivers her words at a normal pace in a confined space with very clear audio."
|
213 |
inputs = tokenizer(description, return_tensors="pt").to(device)
|
214 |
+
prompt = tokenizer(story, return_tensors="pt").to(device)
|
215 |
|
216 |
generation_kwargs = dict(
|
217 |
input_ids=inputs.input_ids,
|
|
|
226 |
thread = Thread(target=model.generate, kwargs=generation_kwargs)
|
227 |
thread.start()
|
228 |
|
229 |
+
yield story, None
|
230 |
+
|
231 |
+
gr.Info("Reading story", duration=3)
|
232 |
for new_audio in streamer:
|
233 |
print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
|
234 |
+
yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
|
235 |
|
|
|
|
|
|
|
|
|
236 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
237 |
with gr.Blocks(css=css) as block:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
gr.HTML(
|
239 |
f"""
|
240 |
+
<h1> Bedtime Story Reader 😴🔊 </h1>
|
241 |
+
<p> Powered by <a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
242 |
"""
|
243 |
)
|
244 |
+
with gr.Row():
|
245 |
+
subject = gr.Dropdown(value="Princess", choices=["Prince", "Princess", "Dog", "Cat"])
|
246 |
+
setting = gr.Dropdown(value="Forest", choices=["Forest", "Kingdom", "Jungle", "Underwater"])
|
247 |
+
with gr.Row():
|
248 |
+
with gr.Group():
|
249 |
+
audio_out = gr.Audio(label="Bed time story", streaming=True, autoplay=True)
|
250 |
+
story = gr.Textbox(label="Story")
|
251 |
+
|
252 |
+
inputs = [subject, setting]
|
253 |
+
outputs = [story, audio_out]
|
254 |
+
run_button.click(fn=generate_base, inputs=inputs, outputs=outputs)
|
255 |
|
256 |
block.queue()
|
257 |
block.launch(share=True)
|
requirements.txt
CHANGED
@@ -1,2 +1,4 @@
|
|
|
|
|
|
1 |
git+https://github.com/huggingface/parler-tts.git
|
2 |
accelerate
|
|
|
1 |
+
"gradio-client @ git+https://github.com/gradio-app/gradio@bed454c3d22cfacedc047eb3b0ba987b485ac3fd#subdirectory=client/python"
|
2 |
+
https://gradio-builds.s3.amazonaws.com/bed454c3d22cfacedc047eb3b0ba987b485ac3fd/gradio-4.40.0-py3-none-any.whl
|
3 |
git+https://github.com/huggingface/parler-tts.git
|
4 |
accelerate
|