Respair committed on
Commit
4028449
·
verified ·
1 Parent(s): 1a8b14f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -21
app.py CHANGED
@@ -1,32 +1,25 @@
1
  INTROTXT = """# StyleTTS 2
 
2
 
3
- [Paper](https://arxiv.org/abs/2306.07691) - [Samples](https://styletts2.github.io/) - [Code](https://github.com/yl4579/StyleTTS2) - [Discord](https://discord.gg/ha8sxdG2K4)
4
-
5
- A free demo of StyleTTS 2. **I am not affiliated with the StyleTTS 2 Authors.**
6
-
7
- **Before using this demo, you agree to inform the listeners that the speech samples are synthesized by the pre-trained models, unless you have the permission to use the voice you synthesize. That is, you agree to only use voices whose speakers grant the permission to have their voice cloned, either directly or by license before making synthesized voices public, or you have to publicly announce that these voices are synthesized if you do not have the permission to use these voices.**
8
-
9
- Is there a long queue on this space? Duplicate it and add a more powerful GPU to skip the wait! **Note: Thank you to Hugging Face for their generous GPU grant program!**
10
-
11
- **NOTE: StyleTTS 2 does better on longer texts.** For example, making it say "hi" will produce a lower-quality result than making it say a longer phrase.
12
-
13
- **NOTE: StyleTTS 2 is _currently_ English-only. Join the Discord for updates on multilingual training.**
14
  """
15
  import gradio as gr
16
  import styletts2importable
17
- import ljspeechimportable
18
  import torch
19
  import os
20
  from txtsplit import txtsplit
21
  import numpy as np
22
  import pickle
23
- theme = gr.themes.Base(
24
  font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
25
  )
26
- voicelist = ['f-us-1', 'f-us-2', 'f-us-3', 'f-us-4', 'm-us-1', 'm-us-2', 'm-us-3', 'm-us-4']
27
  voices = {}
28
- import phonemizer
29
- global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
30
  # todo: cache computed style, load using pickle
31
  # if os.path.exists('voices.pkl'):
32
  # with open('voices.pkl', 'rb') as f:
@@ -108,7 +101,7 @@ def ljsynthesize(text, steps, progress=gr.Progress()):
108
  # if len(text) > 400:
109
  # raise gr.Error("Text must be under 400 characters")
110
  noise = torch.randn(1,1,256).to('cuda' if torch.cuda.is_available() else 'cpu')
111
- # return (24000, ljspeechimportable.inference(text, noise, diffusion_steps=7, embedding_scale=1))
112
  if text.strip() == "":
113
  raise gr.Error("You must enter some text")
114
  if len(text) > 150000:
@@ -119,7 +112,7 @@ def ljsynthesize(text, steps, progress=gr.Progress()):
119
  texts = txtsplit(text)
120
  audios = []
121
  for t in progress.tqdm(texts):
122
- audios.append(ljspeechimportable.inference(t, noise, diffusion_steps=steps, embedding_scale=1))
123
  return (24000, np.concatenate(audios))
124
 
125
 
@@ -128,7 +121,7 @@ with gr.Blocks() as vctk:
128
  with gr.Column(scale=1):
129
  inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
130
  voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
131
- multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
132
  # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
133
  with gr.Column(scale=1):
134
  btn = gr.Button("Synthesize", variant="primary")
@@ -170,8 +163,8 @@ with gr.Blocks() as lj:
170
  with gr.Blocks(title="StyleTTS 2", css="footer{display:none !important}", theme=theme) as demo:
171
  gr.Markdown(INTROTXT)
172
  gr.DuplicateButton("Duplicate Space")
173
- # gr.TabbedInterface([vctk, clone, lj, longText], ['Multi-Voice', 'Voice Cloning', 'LJSpeech', 'Long Text [Beta]'])
174
- gr.TabbedInterface([vctk, clone, lj], ['Multi-Voice', 'Voice Cloning', 'LJSpeech', 'Long Text [Beta]'])
175
  gr.Markdown("""
176
  Demo by [mrfakename](https://twitter.com/realmrfakename). I am not affiliated with the StyleTTS 2 authors.
177
 
 
1
  INTROTXT = """# StyleTTS 2
2
+ kudos to mrfakename for the base gradio code I'm borrowing here.
3
 
4
+ 日本語用
5
+ The Text-guided inference may or may not work. you can only do inference max 512 tokens.
6
+ **
 
 
 
 
 
 
 
 
7
  """
8
  import gradio as gr
9
  import styletts2importable
10
+ import Text-guided Inferenceimportable
11
  import torch
12
  import os
13
  from txtsplit import txtsplit
14
  import numpy as np
15
  import pickle
16
+ theme = gr.themes.Base(theme="NoCrypt/miku",
17
  font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
18
  )
19
+ voicelist = ['1','2','3']
20
  voices = {}
21
+ # import phonemizer
22
+ # global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
23
  # todo: cache computed style, load using pickle
24
  # if os.path.exists('voices.pkl'):
25
  # with open('voices.pkl', 'rb') as f:
 
101
  # if len(text) > 400:
102
  # raise gr.Error("Text must be under 400 characters")
103
  noise = torch.randn(1,1,256).to('cuda' if torch.cuda.is_available() else 'cpu')
104
+ # return (24000, Text-guided Inferenceimportable.inference(text, noise, diffusion_steps=7, embedding_scale=1))
105
  if text.strip() == "":
106
  raise gr.Error("You must enter some text")
107
  if len(text) > 150000:
 
112
  texts = txtsplit(text)
113
  audios = []
114
  for t in progress.tqdm(texts):
115
+ audios.append(Text-guided Inferenceimportable.inference(t, noise, diffusion_steps=steps, embedding_scale=1))
116
  return (24000, np.concatenate(audios))
117
 
118
 
 
121
  with gr.Column(scale=1):
122
  inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
123
  voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
124
+ multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="Higher gives you more diverse results but not necessarily higher quality - これを増えたらもっとエモーショナルな結果になりますが、クオリティーのいい結果になるとは限らない。", interactive=True)
125
  # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
126
  with gr.Column(scale=1):
127
  btn = gr.Button("Synthesize", variant="primary")
 
163
  with gr.Blocks(title="StyleTTS 2", css="footer{display:none !important}", theme=theme) as demo:
164
  gr.Markdown(INTROTXT)
165
  gr.DuplicateButton("Duplicate Space")
166
+ # gr.TabbedInterface([vctk, clone, lj, longText], ['Multi-Voice', 'Voice Cloning', 'Text-guided Inference', 'Long Text [Beta]'])
167
+ gr.TabbedInterface([vctk, clone, lj], ['Multi-Voice', 'Voice Cloning (don't use this,このオプションはこのモデルに利きません。', 'Text-guided Inference', 'Long Text [Beta]'])
168
  gr.Markdown("""
169
  Demo by [mrfakename](https://twitter.com/realmrfakename). I am not affiliated with the StyleTTS 2 authors.
170