del unused vits / --soundscape: omitted=None, bare flag='wind fjord'
Browse files

- Modules/vits/README.md +0 -58
- Modules/vits/losses.py +0 -61
- Modules/vits/models.py +4 -216
- Modules/vits/monotonic_align/__init__.py +0 -19
- Modules/vits/monotonic_align/core.pyx +0 -42
- Modules/vits/monotonic_align/setup.py +0 -9
- Modules/vits/preprocess.py +0 -25
- tts.py +3 -3
Modules/vits/README.md
DELETED
@@ -1,58 +0,0 @@
-# VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech
-
-### Jaehyeon Kim, Jungil Kong, and Juhee Son
-
-In our recent [paper](https://arxiv.org/abs/2106.06103), we propose VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech.
-
-Several recent end-to-end text-to-speech (TTS) models enabling single-stage training and parallel sampling have been proposed, but their sample quality does not match that of two-stage TTS systems. In this work, we present a parallel end-to-end TTS method that generates more natural-sounding audio than current two-stage models. Our method adopts variational inference augmented with normalizing flows and an adversarial training process, which improves the expressive power of generative modeling. We also propose a stochastic duration predictor to synthesize speech with diverse rhythms from input text. With the uncertainty modeling over latent variables and the stochastic duration predictor, our method expresses the natural one-to-many relationship in which a text input can be spoken in multiple ways with different pitches and rhythms. A subjective human evaluation (mean opinion score, or MOS) on LJ Speech, a single-speaker dataset, shows that our method outperforms the best publicly available TTS systems and achieves a MOS comparable to ground truth.
-
-Visit our [demo](https://jaywalnut310.github.io/vits-demo/index.html) for audio samples.
-
-We also provide the [pretrained models](https://drive.google.com/drive/folders/1ksarh-cJf3F5eKJjLVWY0X1j1qsQqiS2?usp=sharing).
-
-** Update note: Thanks to [Rishikesh (ऋषिकेश)](https://github.com/jaywalnut310/vits/issues/1), our interactive TTS demo is now available on [Colab Notebook](https://colab.research.google.com/drive/1CO61pZizDj7en71NQG_aqqKdGaA_SaBf?usp=sharing).
-
-<table style="width:100%">
-  <tr>
-    <th>VITS at training</th>
-    <th>VITS at inference</th>
-  </tr>
-  <tr>
-    <td><img src="resources/fig_1a.png" alt="VITS at training" height="400"></td>
-    <td><img src="resources/fig_1b.png" alt="VITS at inference" height="400"></td>
-  </tr>
-</table>
-
-
-## Pre-requisites
-0. Python >= 3.6
-0. Clone this repository
-0. Install python requirements. Please refer to [requirements.txt](requirements.txt)
-    1. You may need to install espeak first: `apt-get install espeak`
-0. Download datasets
-    1. Download and extract the LJ Speech dataset, then rename or create a link to the dataset folder: `ln -s /path/to/LJSpeech-1.1/wavs DUMMY1`
-    1. For the multi-speaker setting, download and extract the VCTK dataset, and downsample the wav files to 22050 Hz. Then rename or create a link to the dataset folder: `ln -s /path/to/VCTK-Corpus/downsampled_wavs DUMMY2`
-0. Build Monotonic Alignment Search and run preprocessing if you use your own datasets.
-```sh
-# Cython-version Monotonic Alignment Search
-cd monotonic_align
-python setup.py build_ext --inplace
-
-# Preprocessing (g2p) for your own datasets. Preprocessed phonemes for LJ Speech and VCTK have already been provided.
-# python preprocess.py --text_index 1 --filelists filelists/ljs_audio_text_train_filelist.txt filelists/ljs_audio_text_val_filelist.txt filelists/ljs_audio_text_test_filelist.txt
-# python preprocess.py --text_index 2 --filelists filelists/vctk_audio_sid_text_train_filelist.txt filelists/vctk_audio_sid_text_val_filelist.txt filelists/vctk_audio_sid_text_test_filelist.txt
-```
-
-
-## Training Example
-```sh
-# LJ Speech
-python train.py -c configs/ljs_base.json -m ljs_base
-
-# VCTK
-python train_ms.py -c configs/vctk_base.json -m vctk_base
-```
-
-
-## Inference Example
-See [inference.ipynb](inference.ipynb)
Modules/vits/losses.py
DELETED
@@ -1,61 +0,0 @@
-import torch
-from torch.nn import functional as F
-
-import commons
-
-
-def feature_loss(fmap_r, fmap_g):
-  loss = 0
-  for dr, dg in zip(fmap_r, fmap_g):
-    for rl, gl in zip(dr, dg):
-      rl = rl.float().detach()
-      gl = gl.float()
-      loss += torch.mean(torch.abs(rl - gl))
-
-  return loss * 2
-
-
-def discriminator_loss(disc_real_outputs, disc_generated_outputs):
-  loss = 0
-  r_losses = []
-  g_losses = []
-  for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
-    dr = dr.float()
-    dg = dg.float()
-    r_loss = torch.mean((1-dr)**2)
-    g_loss = torch.mean(dg**2)
-    loss += (r_loss + g_loss)
-    r_losses.append(r_loss.item())
-    g_losses.append(g_loss.item())
-
-  return loss, r_losses, g_losses
-
-
-def generator_loss(disc_outputs):
-  loss = 0
-  gen_losses = []
-  for dg in disc_outputs:
-    dg = dg.float()
-    l = torch.mean((1-dg)**2)
-    gen_losses.append(l)
-    loss += l
-
-  return loss, gen_losses
-
-
-def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
-  """
-  z_p, logs_q: [b, h, t_t]
-  m_p, logs_p: [b, h, t_t]
-  """
-  z_p = z_p.float()
-  logs_q = logs_q.float()
-  m_p = m_p.float()
-  logs_p = logs_p.float()
-  z_mask = z_mask.float()
-
-  kl = logs_p - logs_q - 0.5
-  kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. * logs_p)
-  kl = torch.sum(kl * z_mask)
-  l = kl / torch.sum(z_mask)
-  return l
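For context on what is being removed: these helpers implement the standard VITS training objectives (LSGAN discriminator/generator losses, a feature-matching loss, and the masked Gaussian KL between posterior and prior). A minimal sketch of how they are typically wired into a training step follows; the tensor shapes come from the docstring above, while the batch size, channel counts, dummy discriminator outputs, and the flat `from losses import ...` import are illustrative assumptions, not part of this commit.

```python
import torch
from losses import feature_loss, discriminator_loss, generator_loss, kl_loss  # Modules/vits/losses.py

# Dummy discriminator outputs: one score tensor per sub-discriminator.
disc_real = [torch.rand(2, 100) for _ in range(3)]
disc_fake = [torch.rand(2, 100) for _ in range(3)]
loss_disc, real_losses, fake_losses = discriminator_loss(disc_real, disc_fake)

# Generator-side adversarial loss and feature-matching loss over per-layer activations.
loss_gen, per_disc_losses = generator_loss(disc_fake)
fmap_real = [[torch.rand(2, 8, 50)] for _ in range(3)]
fmap_fake = [[torch.rand(2, 8, 50)] for _ in range(3)]
loss_fm = feature_loss(fmap_real, fmap_fake)

# Masked KL between the flow-mapped posterior (z_p, logs_q) and the text prior (m_p, logs_p).
b, h, t_t = 2, 192, 50
z_p, logs_q, m_p, logs_p = (torch.randn(b, h, t_t) for _ in range(4))
z_mask = torch.ones(b, 1, t_t)  # 1 where spectrogram frames are valid
loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask)
```

Since only `SynthesizerTrn.infer` survives this commit (see models.py below), none of these terms are needed at inference time, which is why the whole file can go.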
Modules/vits/models.py
CHANGED
@@ -7,7 +7,6 @@ from torch.nn import functional as F
 import commons
 import modules
 import attentions
-import monotonic_align
 
 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
@@ -94,44 +93,6 @@ class StochasticDurationPredictor(nn.Module):
       logw = z0
       return logw
 
-
-class DurationPredictor(nn.Module):
-  def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
-    super().__init__()
-
-    self.in_channels = in_channels
-    self.filter_channels = filter_channels
-    self.kernel_size = kernel_size
-    self.p_dropout = p_dropout
-    self.gin_channels = gin_channels
-
-    self.drop = nn.Dropout(p_dropout)
-    self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2)
-    self.norm_1 = modules.LayerNorm(filter_channels)
-    self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2)
-    self.norm_2 = modules.LayerNorm(filter_channels)
-    self.proj = nn.Conv1d(filter_channels, 1, 1)
-
-    if gin_channels != 0:
-      self.cond = nn.Conv1d(gin_channels, in_channels, 1)
-
-  def forward(self, x, x_mask, g=None):
-    x = torch.detach(x)
-    if g is not None:
-      g = torch.detach(g)
-      x = x + self.cond(g)
-    x = self.conv_1(x * x_mask)
-    x = torch.relu(x)
-    x = self.norm_1(x)
-    x = self.drop(x)
-    x = self.conv_2(x * x_mask)
-    x = torch.relu(x)
-    x = self.norm_2(x)
-    x = self.drop(x)
-    x = self.proj(x * x_mask)
-    return x * x_mask
-
-
 class TextEncoder(nn.Module):
   def __init__(self,
       n_vocab,
@@ -208,39 +169,6 @@ class ResidualCouplingBlock(nn.Module):
         x = flow(x, x_mask, g=g, reverse=reverse)
     return x
 
-
-class PosteriorEncoder(nn.Module):
-  def __init__(self,
-      in_channels,
-      out_channels,
-      hidden_channels,
-      kernel_size,
-      dilation_rate,
-      n_layers,
-      gin_channels=0):
-    super().__init__()
-    self.in_channels = in_channels
-    self.out_channels = out_channels
-    self.hidden_channels = hidden_channels
-    self.kernel_size = kernel_size
-    self.dilation_rate = dilation_rate
-    self.n_layers = n_layers
-    self.gin_channels = gin_channels
-
-    self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
-    self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
-    self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-  def forward(self, x, x_lengths, g=None):
-    x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
-    x = self.pre(x) * x_mask
-    x = self.enc(x, x_mask, g=g)
-    stats = self.proj(x) * x_mask
-    m, logs = torch.split(stats, self.out_channels, dim=1)
-    z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-    return z, m, logs, x_mask
-
-
 class Generator(torch.nn.Module):
   def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
     super(Generator, self).__init__()
@@ -296,97 +224,6 @@ class Generator(torch.nn.Module):
       l.remove_weight_norm()
 
 
-class DiscriminatorP(torch.nn.Module):
-  def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
-    super(DiscriminatorP, self).__init__()
-    self.period = period
-    self.use_spectral_norm = use_spectral_norm
-    norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-    self.convs = nn.ModuleList([
-      norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-      norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-      norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-      norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-      norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
-    ])
-    self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
-
-  def forward(self, x):
-    fmap = []
-
-    # 1d to 2d
-    b, c, t = x.shape
-    if t % self.period != 0: # pad first
-      n_pad = self.period - (t % self.period)
-      x = F.pad(x, (0, n_pad), "reflect")
-      t = t + n_pad
-    x = x.view(b, c, t // self.period, self.period)
-
-    for l in self.convs:
-      x = l(x)
-      x = F.leaky_relu(x, modules.LRELU_SLOPE)
-      fmap.append(x)
-    x = self.conv_post(x)
-    fmap.append(x)
-    x = torch.flatten(x, 1, -1)
-
-    return x, fmap
-
-
-class DiscriminatorS(torch.nn.Module):
-  def __init__(self, use_spectral_norm=False):
-    super(DiscriminatorS, self).__init__()
-    norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-    self.convs = nn.ModuleList([
-      norm_f(Conv1d(1, 16, 15, 1, padding=7)),
-      norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
-      norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
-      norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
-      norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
-      norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
-    ])
-    self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
-
-  def forward(self, x):
-    fmap = []
-
-    for l in self.convs:
-      x = l(x)
-      x = F.leaky_relu(x, modules.LRELU_SLOPE)
-      fmap.append(x)
-    x = self.conv_post(x)
-    fmap.append(x)
-    x = torch.flatten(x, 1, -1)
-
-    return x, fmap
-
-
-class MultiPeriodDiscriminator(torch.nn.Module):
-  def __init__(self, use_spectral_norm=False):
-    super(MultiPeriodDiscriminator, self).__init__()
-    periods = [2,3,5,7,11]
-
-    discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
-    discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
-    self.discriminators = nn.ModuleList(discs)
-
-  def forward(self, y, y_hat):
-    y_d_rs = []
-    y_d_gs = []
-    fmap_rs = []
-    fmap_gs = []
-    for i, d in enumerate(self.discriminators):
-      y_d_r, fmap_r = d(y)
-      y_d_g, fmap_g = d(y_hat)
-      y_d_rs.append(y_d_r)
-      y_d_gs.append(y_d_g)
-      fmap_rs.append(fmap_r)
-      fmap_gs.append(fmap_g)
-
-    return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
-
 class SynthesizerTrn(nn.Module):
   """
   Synthesizer for Training
@@ -445,57 +282,19 @@ class SynthesizerTrn(nn.Module):
         kernel_size,
         p_dropout)
     self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
-
+
     self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
 
     if use_sdp:
+      # raise ValueError
       self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
     else:
-      self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
+      raise ValueError
+      # self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
 
     if n_speakers > 1:
       self.emb_g = nn.Embedding(n_speakers, gin_channels)
 
-  def forward(self, x, x_lengths, y, y_lengths, sid=None):
-
-    x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
-    if self.n_speakers > 0:
-      g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
-    else:
-      g = None
-
-    z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
-    z_p = self.flow(z, y_mask, g=g)
-
-    with torch.no_grad():
-      # negative cross-entropy
-      s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t]
-      neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True) # [b, 1, t_s]
-      neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
-      neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
-      neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s]
-      neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
-
-      attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
-      attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach()
-
-    w = attn.sum(2)
-    if self.use_sdp:
-      l_length = self.dp(x, x_mask, w, g=g)
-      l_length = l_length / torch.sum(x_mask)
-    else:
-      logw_ = torch.log(w + 1e-6) * x_mask
-      logw = self.dp(x, x_mask, g=g)
-      l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging
-
-    # expand prior
-    m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
-    logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)
-
-    z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size)
-    o = self.dec(z_slice, g=g)
-    return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
-
   def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
     x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
     if self.n_speakers > 0:
@@ -521,14 +320,3 @@ class SynthesizerTrn(nn.Module):
     z = self.flow(z_p, y_mask, g=g, reverse=True)
     o = self.dec((z * y_mask)[:,:,:max_len], g=g)
     return o, attn, y_mask, (z, z_p, m_p, logs_p)
-
-  def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
-    assert self.n_speakers > 0, "n_speakers have to be larger than 0."
-    g_src = self.emb_g(sid_src).unsqueeze(-1)
-    g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
-    z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
-    z_p = self.flow(z, y_mask, g=g_src)
-    z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
-    o_hat = self.dec(z_hat * y_mask, g=g_tgt)
-    return o_hat, y_mask, (z, z_p, z_hat)
-
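The net effect of the models.py change is that only the inference path remains: the training-time `forward` (which needed `monotonic_align.maximum_path` to align text to spectrogram frames), the `PosteriorEncoder`, the deterministic `DurationPredictor`, the HiFi-GAN-style discriminators, and `voice_conversion` are all gone, which is why the `monotonic_align` extension and `losses.py` can be deleted alongside it. A minimal sketch of the call that survives, assuming `net_g` is a constructed `SynthesizerTrn` and `x` is a batch of phoneme IDs (both assumptions for illustration, not part of this commit):

```python
import torch

# x: LongTensor of phoneme IDs, shape [1, T_text]; net_g: a SynthesizerTrn in eval mode.
x_lengths = torch.LongTensor([x.size(1)])
with torch.no_grad():
    audio, attn, y_mask, _ = net_g.infer(
        x, x_lengths,
        noise_scale=0.667,    # temperature for sampling the prior
        noise_scale_w=0.8,    # temperature for the stochastic duration predictor
        length_scale=1.0,     # >1.0 slows speech down, <1.0 speeds it up
    )
audio = audio[0, 0].cpu().numpy()  # [T_wav] waveform at the model's sampling rate
```

Note that the edited constructor now raises `ValueError` when `use_sdp` is false, so configs must keep the stochastic duration predictor enabled.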
Modules/vits/monotonic_align/__init__.py
DELETED
@@ -1,19 +0,0 @@
-import numpy as np
-import torch
-from .monotonic_align.core import maximum_path_c
-
-
-def maximum_path(neg_cent, mask):
-  """ Cython optimized version.
-  neg_cent: [b, t_t, t_s]
-  mask: [b, t_t, t_s]
-  """
-  device = neg_cent.device
-  dtype = neg_cent.dtype
-  neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
-  path = np.zeros(neg_cent.shape, dtype=np.int32)
-
-  t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
-  t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
-  maximum_path_c(path, neg_cent, t_t_max, t_s_max)
-  return torch.from_numpy(path).to(device=device, dtype=dtype)
Modules/vits/monotonic_align/core.pyx
DELETED
@@ -1,42 +0,0 @@
-cimport cython
-from cython.parallel import prange
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
-  cdef int x
-  cdef int y
-  cdef float v_prev
-  cdef float v_cur
-  cdef float tmp
-  cdef int index = t_x - 1
-
-  for y in range(t_y):
-    for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
-      if x == y:
-        v_cur = max_neg_val
-      else:
-        v_cur = value[y-1, x]
-      if x == 0:
-        if y == 0:
-          v_prev = 0.
-        else:
-          v_prev = max_neg_val
-      else:
-        v_prev = value[y-1, x-1]
-      value[y, x] += max(v_prev, v_cur)
-
-  for y in range(t_y - 1, -1, -1):
-    path[y, index] = 1
-    if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
-      index = index - 1
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
-  cdef int b = paths.shape[0]
-  cdef int i
-  for i in prange(b, nogil=True):
-    maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i])
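The deleted Cython kernel is a Viterbi-style monotonic alignment search: the forward pass accumulates, in place, the best cumulative score reachable at spectrogram frame `y` and text position `x` under a monotonic, non-skipping alignment, and the backward pass marks the chosen path with ones. A rough NumPy transcription of the per-example routine (illustrative only, and far slower than the compiled, `prange`-parallelised extension) might look like:

```python
import numpy as np

def maximum_path_numpy(value, t_y, t_x, max_neg_val=-1e9):
    """value: [t_y, t_x] float scores, modified in place; returns a 0/1 path of the same shape."""
    path = np.zeros_like(value, dtype=np.int32)
    # Forward pass: best monotonic cumulative score ending at (y, x).
    for y in range(t_y):
        for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
            v_cur = max_neg_val if x == y else value[y - 1, x]  # stay on the same text position
            if x == 0:
                v_prev = 0.0 if y == 0 else max_neg_val
            else:
                v_prev = value[y - 1, x - 1]                     # advance by one text position
            value[y, x] += max(v_prev, v_cur)
    # Backtracking: from the last text position, step left whenever that scored better.
    index = t_x - 1
    for y in range(t_y - 1, -1, -1):
        path[y, index] = 1
        if index != 0 and (index == y or value[y - 1, index] < value[y - 1, index - 1]):
            index -= 1
    return path
```

Since the surviving `infer` path samples durations from the stochastic duration predictor instead of searching for an alignment, nothing in this repository calls `maximum_path` anymore.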
Modules/vits/monotonic_align/setup.py
DELETED
@@ -1,9 +0,0 @@
-from distutils.core import setup
-from Cython.Build import cythonize
-import numpy
-
-setup(
-  name = 'monotonic_align',
-  ext_modules = cythonize("core.pyx"),
-  include_dirs=[numpy.get_include()]
-)
Modules/vits/preprocess.py
DELETED
@@ -1,25 +0,0 @@
-import argparse
-import text
-from utils import load_filepaths_and_text
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument("--out_extension", default="cleaned")
-  parser.add_argument("--text_index", default=1, type=int)
-  parser.add_argument("--filelists", nargs="+", default=["filelists/ljs_audio_text_val_filelist.txt", "filelists/ljs_audio_text_test_filelist.txt"])
-  parser.add_argument("--text_cleaners", nargs="+", default=["english_cleaners2"])
-
-  args = parser.parse_args()
-
-
-  for filelist in args.filelists:
-    print("START:", filelist)
-    filepaths_and_text = load_filepaths_and_text(filelist)
-    for i in range(len(filepaths_and_text)):
-      original_text = filepaths_and_text[i][args.text_index]
-      cleaned_text = text._clean_text(original_text, args.text_cleaners)
-      filepaths_and_text[i][args.text_index] = cleaned_text
-
-    new_filelist = filelist + "." + args.out_extension
-    with open(new_filelist, "w", encoding="utf-8") as f:
-      f.writelines(["|".join(x) + "\n" for x in filepaths_and_text])
tts.py
CHANGED
@@ -45,10 +45,10 @@ def command_line_args():
     parser.add_argument(
         '--soundscape',
         help='soundscape - MUST BE IN BRACKETS: \"forest\"',
-        default='wind fjord',
+        default=None,  # 'wind fjord'
         nargs='?',
         type=str,
-        const=
+        const='wind fjord',
     )
     parser.add_argument(
         '--native',
@@ -175,4 +175,4 @@ if __name__ == '__main__':
     cli()
 
 # assume also video and text for video we have to write some classes for video for audiocraft
-# then call tts.py on this video with nonempty labels - thus calls audiocraft
+# then call tts.py on this video with nonempty labels - thus calls audiocraft
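The practical effect of the `--soundscape` change follows from how argparse treats `nargs='?'`: with the flag omitted the value is now `None` (so, presumably, no soundscape is generated), with the bare flag it falls back to the `const` value `'wind fjord'`, and an explicit value still wins. A small self-contained check of that behaviour, using the same argument definition:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--soundscape', nargs='?', type=str,
                    default=None, const='wind fjord')

print(parser.parse_args([]).soundscape)                          # None        (flag omitted -> default)
print(parser.parse_args(['--soundscape']).soundscape)            # wind fjord  (bare flag -> const)
print(parser.parse_args(['--soundscape', 'forest']).soundscape)  # forest      (explicit value)
```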