Dionyssos committed
Commit ac6157a · 1 Parent(s): 21b15e5

del nltk - oversplits No. 47

Files changed (4)
  1. Modules/diffusion/modules.py +0 -366
  2. api.py +9 -21
  3. models.py +8 -98
  4. msinference.py +9 -11
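
The commit title refers to NLTK's punkt model splitting abbreviations such as "No. 47" into separate sentences before synthesis. A minimal reproduction sketch follows; it assumes nltk plus the punkt data (exactly what this commit removes from the pipeline) and uses sent_tokenize as a stand-in, since the actual splitting call site is not shown in this diff.

# Sketch of the over-splitting named in the commit title (assumes nltk + punkt are installed).
import nltk
nltk.download('punkt')                      # the download call removed from api.py below
from nltk.tokenize import sent_tokenize

text = "Please read paragraph No. 47 before continuing. Thank you."
print(sent_tokenize(text))
# Per the commit message, the period in "No." is treated as a sentence boundary,
# so "No. 47" ends up split across fragments that are then synthesized separately.
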
Modules/diffusion/modules.py DELETED
@@ -1,366 +0,0 @@
-from math import floor, log, pi
-import torch.nn.functional as F
-import torch
-import torch.nn as nn
-from einops import rearrange, reduce, repeat
-from einops.layers.torch import Rearrange
-from einops_exts import rearrange_many
-from torch import Tensor, einsum
-
-
-def default(val, d):
-    if val is not None: #exists(val):
-        return val
-    return d # d() if isfunction(d) else d
-
-class AdaLayerNorm(nn.Module):
-    def __init__(self, style_dim, channels, eps=1e-5):
-        super().__init__()
-        self.channels = channels
-        self.eps = eps
-
-        self.fc = nn.Linear(style_dim, channels*2)
-
-    def forward(self, x, s):
-        x = x.transpose(-1, -2)
-        x = x.transpose(1, -1)
-
-        h = self.fc(s)
-        h = h.view(h.size(0), h.size(1), 1)
-        gamma, beta = torch.chunk(h, chunks=2, dim=1)
-        gamma, beta = gamma.transpose(1, -1), beta.transpose(1, -1)
-
-
-        x = F.layer_norm(x, (self.channels,), eps=self.eps)
-        x = (1 + gamma) * x + beta
-        return x.transpose(1, -1).transpose(-1, -2)
-
-class StyleTransformer1d(nn.Module):
-
-    # artificial_stylets / models.py
-
-    def __init__(
-        self,
-        num_layers: int,
-        channels: int,
-        num_heads: int,
-        head_features: int,
-        multiplier: int,
-        use_context_time: bool = True,
-        use_rel_pos: bool = False,
-        context_features_multiplier: int = 1,
-        # rel_pos_num_buckets: Optional[int] = None,
-        # rel_pos_max_distance: Optional[int] = None,
-        context_features=None,
-        context_embedding_features=None,
-        embedding_max_length=512,
-    ):
-        super().__init__()
-
-        self.blocks = nn.ModuleList(
-            [
-                StyleTransformerBlock(
-                    features=channels + context_embedding_features,
-                    head_features=head_features,
-                    num_heads=num_heads,
-                    multiplier=multiplier,
-                    style_dim=context_features,
-                    use_rel_pos=use_rel_pos,
-                    # rel_pos_num_buckets=rel_pos_num_buckets,
-                    # rel_pos_max_distance=rel_pos_max_distance,
-                )
-                for i in range(num_layers)
-            ]
-        )
-
-        self.to_out = nn.Sequential(
-            Rearrange("b t c -> b c t"),
-            nn.Conv1d(
-                in_channels=channels + context_embedding_features,
-                out_channels=channels,
-                kernel_size=1,
-            ),
-        )
-
-        use_context_features = context_features is not None
-        self.use_context_features = use_context_features
-        self.use_context_time = use_context_time
-
-        if use_context_time or use_context_features:
-            # print(f'{use_context_time=} {use_context_features=}ooooooooooooooooooooooooooooooooooo')
-            # raise ValueError
-            # True True both context
-            context_mapping_features = channels + context_embedding_features
-
-            self.to_mapping = nn.Sequential(
-                nn.Linear(context_mapping_features, context_mapping_features),
-                nn.GELU(),
-                nn.Linear(context_mapping_features, context_mapping_features),
-                nn.GELU(),
-            )
-
-        if use_context_time:
-
-            self.to_time = nn.Sequential(
-                TimePositionalEmbedding(
-                    dim=channels, out_features=context_mapping_features
-                ),
-                nn.GELU(),
-            )
-
-        if use_context_features:
-
-            self.to_features = nn.Sequential(
-                nn.Linear(
-                    in_features=context_features, out_features=context_mapping_features
-                ),
-                nn.GELU(),
-            )
-
-        # self.fixed_embedding = FixedEmbedding(
-        #     max_length=embedding_max_length, features=context_embedding_features
-        # ) # Non speker-aware LookUp: EMbedding looks just the time-frame-index [0,1,2...,num-asr-time-frames]
-
-    def get_mapping(
-        self,
-        time=None,
-        features=None):
-        """Combines context time features and features into mapping"""
-        items, mapping = [], None
-        # Compute time features
-        if self.use_context_time:
-
-            items += [self.to_time(time)]
-        # Compute features
-        if self.use_context_features:
-
-            items += [self.to_features(features)]
-
-        # Compute joint mapping
-        if self.use_context_time or self.use_context_features:
-            # raise ValueError
-            mapping = reduce(torch.stack(items), "n b m -> b m", "sum")
-            mapping = self.to_mapping(mapping)
-
-        return mapping
-
-    def forward(self,
-                x,
-                time,
-                embedding= None,
-                features = None):
-
-        # --
-        # called by forward()
-
-        mapping = self.get_mapping(time, features)
-        x = torch.cat([x.expand(-1, embedding.size(1), -1), embedding], axis=-1)
-        mapping = mapping.unsqueeze(1).expand(-1, embedding.size(1), -1)
-        for block in self.blocks:
-            x = x + mapping
-            x = block(x, features)
-        x = x.mean(axis=1).unsqueeze(1)
-        x = self.to_out(x)
-        x = x.transpose(-1, -2)
-        return x
-
-
-class StyleTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        features: int,
-        num_heads: int,
-        head_features: int,
-        style_dim: int,
-        multiplier: int,
-        use_rel_pos: bool,
-        # rel_pos_num_buckets: Optional[int] = None,
-        # rel_pos_max_distance: Optional[int] = None,
-        context_features = None,
-    ):
-        super().__init__()
-
-        self.use_cross_attention = (context_features is not None) and (context_features > 0)
-        # print(f'{rel_pos_num_buckets=} {rel_pos_max_distance=}') # None None
-        # raise ValueError
-        self.attention = StyleAttention(
-            features=features,
-            style_dim=style_dim,
-            num_heads=num_heads,
-            head_features=head_features
-        )
-
-        if self.use_cross_attention:
-            raise ValueError
-
-        self.feed_forward = FeedForward(features=features, multiplier=multiplier)
-
-    def forward(self, x: Tensor, s: Tensor, *, context = None) -> Tensor:
-        x = self.attention(x, s) + x
-        if self.use_cross_attention:
-            raise ValueError
-            # x = self.cross_attention(x, s, context=context) + x
-        x = self.feed_forward(x) + x
-        return x
-
-class StyleAttention(nn.Module):
-    def __init__(
-        self,
-        features: int,
-        *,
-        style_dim: int,
-        head_features: int,
-        num_heads: int,
-        context_features = None,
-        # use_rel_pos: bool,
-        # rel_pos_num_buckets: Optional[int] = None,
-        # rel_pos_max_distance: Optional[int] = None,
-    ):
-        super().__init__()
-        self.context_features = context_features
-        mid_features = head_features * num_heads
-        context_features = default(context_features, features)
-
-        self.norm = AdaLayerNorm(style_dim, features)
-        self.norm_context = AdaLayerNorm(style_dim, context_features)
-        self.to_q = nn.Linear(
-            in_features=features, out_features=mid_features, bias=False
-        )
-        self.to_kv = nn.Linear(
-            in_features=context_features, out_features=mid_features * 2, bias=False
-        )
-        self.attention = AttentionBase(
-            features,
-            num_heads=num_heads,
-            head_features=head_features
-        )
-
-    def forward(self, x, s, *, context = None):
-
-        if context is not None:
-            raise ValueError
-        context = default(context, x)
-
-
-        x, context = self.norm(x, s), self.norm_context(context, s)
-
-        q, k, v = (self.to_q(x), *torch.chunk(self.to_kv(context), chunks=2, dim=-1))
-
-        return self.attention(q, k, v)
-
-
-def FeedForward(features,
-                multiplier):
-    mid_features = features * multiplier
-    return nn.Sequential(
-        nn.Linear(in_features=features, out_features=mid_features),
-        nn.GELU(),
-        nn.Linear(in_features=mid_features, out_features=features),
-    )
-
-
-class AttentionBase(nn.Module):
-    def __init__(
-        self,
-        features,
-        *,
-        head_features,
-        num_heads):
-        super().__init__()
-        self.scale = head_features ** -0.5
-        self.num_heads = num_heads
-        mid_features = head_features * num_heads
-        self.to_out = nn.Linear(in_features=mid_features,
-                                out_features=features)
-
-    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
-        # Split heads
-        q, k, v = rearrange_many((q, k, v), "b n (h d) -> b h n d", h=self.num_heads)
-        # Compute similarity matrix
-        sim = einsum("... n d, ... m d -> ... n m", q, k)
-
-        # _____THERE_IS_NO_rel_po
-        # sim = (sim + self.rel_pos(*sim.shape[-2:])) if self.use_rel_pos else sim
-        # print(self.rel_pos)
-
-        sim = sim * self.scale
-        # Get attention matrix with softmax
-        attn = sim.softmax(dim=-1)
-        # Compute values
-        out = einsum("... n m, ... m d -> ... n d", attn, v)
-        out = rearrange(out, "b h n d -> b n (h d)")
-        return self.to_out(out)
-
-
-class Attention(nn.Module):
-    def __init__(
-        self,
-        features,
-        *,
-        head_features,
-        num_heads,
-        out_features=None,
-        context_features=None,
-        # use_rel_pos,
-        # rel_pos_num_buckets: Optional[int] = None,
-        # rel_pos_max_distance: Optional[int] = None,
-    ):
-        super().__init__()
-        self.context_features = context_features
-        mid_features = head_features * num_heads
-        context_features = default(context_features, features)
-
-        self.norm = nn.LayerNorm(features)
-        self.norm_context = nn.LayerNorm(context_features)
-        self.to_q = nn.Linear(
-            in_features=features, out_features=mid_features, bias=False
-        )
-        self.to_kv = nn.Linear(
-            in_features=context_features, out_features=mid_features * 2, bias=False
-        )
-
-        self.attention = AttentionBase(
-            features,
-            out_features=out_features,
-            num_heads=num_heads,
-            head_features=head_features,
-            # use_rel_pos=use_rel_pos,
-            # rel_pos_num_buckets=rel_pos_num_buckets,
-            # rel_pos_max_distance=rel_pos_max_distance,
-        )
-
-    def forward(self, x: Tensor, *, context = None) -> Tensor:
-        # assert_message = "You must provide a context when using context_features"
-        # assert not self.context_features or exists(context), assert_message
-        # Use context if provided
-        context = default(context, x)
-        # Normalize then compute q from input and k,v from context
-        x, context = self.norm(x), self.norm_context(context)
-        q, k, v = (self.to_q(x), *torch.chunk(self.to_kv(context), chunks=2, dim=-1))
-        # Compute and return attention
-        return self.attention(q, k, v)
-
-
-class LearnedPositionalEmbedding(nn.Module):
-    """Used for continuous time"""
-
-    def __init__(self, dim: int):
-        super().__init__()
-        assert (dim % 2) == 0
-        half_dim = dim // 2
-        self.weights = nn.Parameter(torch.randn(half_dim))
-
-    def forward(self, x: Tensor) -> Tensor:
-        x = rearrange(x, "b -> b 1")
-        freqs = x * rearrange(self.weights, "d -> 1 d") * 2 * pi
-        fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1)
-        fouriered = torch.cat((x, fouriered), dim=-1)
-        return fouriered
-
-
-def TimePositionalEmbedding(dim: int, out_features: int) -> nn.Module:
-    return nn.Sequential(
-        LearnedPositionalEmbedding(dim),
-        nn.Linear(in_features=dim + 1, out_features=out_features),
-    )
-
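
For reference, the deleted AdaLayerNorm boils down to a style vector producing a per-channel scale and shift applied after layer normalization. A minimal sketch of that operation; the dimensions are illustrative assumptions, not values from any config in this repo.

# Minimal sketch of the style-conditioned normalization inside the deleted AdaLayerNorm.
# All sizes here are illustrative assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

style_dim, channels = 128, 512
fc = nn.Linear(style_dim, channels * 2)      # mirrors self.fc above

x = torch.randn(2, 100, channels)            # (batch, frames, channels)
s = torch.randn(2, style_dim)                # one style vector per utterance

gamma, beta = fc(s).chunk(2, dim=-1)         # style -> per-channel scale / shift
y = F.layer_norm(x, (channels,))
y = (1 + gamma.unsqueeze(1)) * y + beta.unsqueeze(1)
print(y.shape)                               # torch.Size([2, 100, 512])
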
api.py CHANGED
@@ -10,7 +10,6 @@ import srt
 import subprocess
 import cv2
 import markdown
-import json
 from pathlib import Path
 from types import SimpleNamespace
 from flask import Flask, request, send_from_directory
@@ -25,8 +24,7 @@ sound_generator = AudioGen(duration=4.74, device='cuda:0').to('cuda:0').eval()
 
 Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
 
-import nltk
-nltk.download('punkt')
+
 
 # SSH AGENT
 # eval $(ssh-agent -s)
@@ -150,8 +148,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
                        text=None,
                        voice=None,
                        soundscape=None,
-                       speed=None,
-                       diffusion_steps=7):
+                       speed=None):
     '''create 24kHZ np.array with tts
 
     precomputed_style_vector : required if en_US or en_UK in voice, so
@@ -168,10 +165,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
         x = []
         for _sentence in text:
             x.append(msinference.inference(_sentence,
-                                           precomputed_style_vector,
-                                           alpha=0.3,
-                                           beta=0.7,
-                                           diffusion_steps=diffusion_steps)
+                                           precomputed_style_vector)
                      )
         x = np.concatenate(x)
 
@@ -270,7 +264,6 @@ def serve_wav():
     # ====STYLE VECTOR====
 
     precomputed_style_vector = None
-    diffusion_steps = 7 # 7=native / 5=non-native
 
    if args.native: # Voice Cloning
        try:
@@ -307,7 +300,7 @@ def serve_wav():
                 '/', '_').replace('#', '_').replace(
                 'cmu-arctic', 'cmu_arctic').replace(
                 '_low', '') + '.wav')
-            diffusion_steps = 5 # non-native
+
 
             # Foreign Lang - MMS/TTS
         else:
@@ -448,8 +441,7 @@ def serve_wav():
                                 precomputed_style_vector=precomputed_style_vector,
                                 voice=args.voice,
                                 soundscape=args.soundscape,
-                                speed=args.speed,
-                                diffusion_steps=diffusion_steps)
+                                speed=args.speed)
                 )
             total = np.concatenate(pieces, 0)
             # x = audresample.resample(x.astype(np.float32), 24000, 22050) # reshapes (64,) -> (1,64)
@@ -470,8 +462,7 @@ def serve_wav():
                                 precomputed_style_vector=precomputed_style_vector,
                                 voice=args.voice,
                                 soundscape=args.soundscape,
-                                speed=args.speed,
-                                diffusion_steps=diffusion_steps)
+                                speed=args.speed)
             soundfile.write(AUDIO_TRACK, x, 24000)
 
     # IMAGE 2 SPEECH
@@ -490,8 +481,7 @@ def serve_wav():
                                 precomputed_style_vector=precomputed_style_vector,
                                 voice=args.voice,
                                 soundscape=args.soundscape,
-                                speed=args.speed,
-                                diffusion_steps=diffusion_steps
+                                speed=args.speed
                                 )
         soundfile.write(AUDIO_TRACK, x, 24000)
     if args.video or args.image:
@@ -520,8 +510,7 @@ def serve_wav():
                                 precomputed_style_vector=precomputed_style_vector,
                                 voice=args.voice,
                                 soundscape=args.soundscape,
-                                speed=args.speed,
-                                diffusion_steps=diffusion_steps)
+                                speed=args.speed)
         OUT_FILE = 'tmp.wav'
         soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
 
@@ -529,8 +518,7 @@
 
 
     # audios = [msinference.inference(text,
-    #                                 msinference.compute_style(f'voices/{voice}.wav'),
-    #                                 alpha=0.3, beta=0.7, diffusion_steps=7)]
+    #                                 msinference.compute_style(f'voices/{voice}.wav'))]
    # # for t in [text]:
    # output_buffer = io.BytesIO()
    # write(output_buffer, 24000, np.concatenate(audios))
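
With diffusion_steps removed, speed is the only remaining tuning argument of tts_multi_sentence. A hypothetical call against the new signature, from inside api.py where the helper is defined; the reference wav path and voice id are placeholders, not values from this repo.

# Hypothetical call of the trimmed helper (placeholder paths and voice id).
import soundfile
import msinference

style = msinference.compute_style('voices/some_reference.wav')   # placeholder reference wav
wav = tts_multi_sentence(precomputed_style_vector=style,
                         text=['First sentence.', 'Second sentence.'],
                         voice='en_US',                           # placeholder voice id
                         soundscape=None,
                         speed=None)
soundfile.write('out.wav', wav, 24000)   # the helper builds a 24 kHz numpy array
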
models.py CHANGED
@@ -1,96 +1,15 @@
 #coding:utf-8
 
 import os
-import os.path as osp
-import copy
 import math
-import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from torch.nn.utils import weight_norm, spectral_norm
 from Utils.ASR.models import ASRCNN
 from Utils.JDC.model import JDCNet
-
-from Modules.diffusion.modules import StyleTransformer1d
-
 from munch import Munch
 import yaml
-from math import pi
-from random import randint
-
-import torch
-from einops import rearrange
-from torch import Tensor, nn
-from tqdm import tqdm
-
-
-
-
-
-def get_default_model_kwargs():
-    return dict(
-        channels=128,
-        patch_size=16,
-        multipliers=[1, 2, 4, 4, 4, 4, 4],
-        factors=[4, 4, 4, 2, 2, 2],
-        num_blocks=[2, 2, 2, 2, 2, 2],
-        attentions=[0, 0, 0, 1, 1, 1, 1],
-        attention_heads=8,
-        attention_features=64,
-        attention_multiplier=2,
-        attention_use_rel_pos=False,
-        diffusion_type="v",
-        diffusion_sigma_distribution=UniformDistribution(),
-    )
-
-
-def get_default_sampling_kwargs():
-    return dict(sigma_schedule=LinearSchedule(), sampler=VSampler(), clamp=True)
-
-class AudioDiffusionConditional(nn.Module):
-    def __init__(
-        self,
-        embedding_features: int,
-        embedding_max_length: int,
-        embedding_mask_proba: float = 0.1,
-        **kwargs,
-    ):
-        self.unet = None
-        self.embedding_mask_proba = embedding_mask_proba
-        # default_kwargs = dict(
-        #     **get_default_model_kwargs(),
-        #     unet_type="cfg",
-        #     context_embedding_features=embedding_features,
-        #     context_embedding_max_length=embedding_max_length,
-        # )
-        super().__init__()
-
-    def forward(self, *args, **kwargs):
-        default_kwargs = dict(embedding_mask_proba=self.embedding_mask_proba)
-        # here embedding_scale = 1.0 is passed to DiffusionSampler() - del no-op if scale = 1.0
-        return self.diffusion(*args, **{**default_kwargs, **kwargs})
-
-    # def sample(self, *args, **kwargs):
-    #     default_kwargs = dict(
-    #         **get_default_sampling_kwargs(),
-    #         embedding_scale=5.0,
-    #     )
-    #     return super().sample(*args, **{**default_kwargs, **kwargs})
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 
 
 class LearnedDownSample(nn.Module):
@@ -106,10 +25,11 @@ class LearnedDownSample(nn.Module):
             self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1))
         else:
             raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-
+
     def forward(self, x):
         return self.conv(x)
 
+
 class DownSample(nn.Module):
     def __init__(self, layer_type):
         super().__init__()
@@ -187,6 +107,7 @@ class ResBlk(nn.Module):
         x = self._shortcut(x) + self._residual(x)
         return x / math.sqrt(2) # unit variance
 
+
 class StyleEncoder(nn.Module):
     def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384):
         super().__init__()
@@ -211,9 +132,9 @@ class StyleEncoder(nn.Module):
         h = self.shared(x)
         h = h.view(h.size(0), -1)
         s = self.unshared(h)
-
         return s
 
+
 class LinearNorm(torch.nn.Module):
     def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
         super(LinearNorm, self).__init__()
@@ -226,6 +147,7 @@ class LinearNorm(torch.nn.Module):
     def forward(self, x):
         return self.linear_layer(x)
 
+
 class ResBlk1d(nn.Module):
     def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
                  normalize=False, downsample='none', dropout_p=0.2):
@@ -286,6 +208,7 @@ class ResBlk1d(nn.Module):
         x = self._shortcut(x) + self._residual(x)
         return x / math.sqrt(2) # unit variance
 
+
 class LayerNorm(nn.Module):
     def __init__(self, channels, eps=1e-5):
         super().__init__()
@@ -299,7 +222,7 @@ class LayerNorm(nn.Module):
         x = x.transpose(1, -1)
         x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
         return x.transpose(1, -1)
-
+
 class TextEncoder(nn.Module):
     def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
         super().__init__()
@@ -612,19 +535,6 @@ def build_model(args, text_aligner, pitch_extractor, bert):
 
     style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim) # acoustic style encoder
     predictor_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim) # prosodic style encoder
-
-    # define diffusion model
-    if args.multispeaker:
-        transformer = StyleTransformer1d(channels=args.style_dim*2,
-                                         context_embedding_features=bert.config.hidden_size,
-                                         context_features=args.style_dim*2,
-                                         **args.diffusion.transformer)
-    else:
-        raise NotImplementedError
-
-
-
-
     nets = Munch(
         bert=bert,
         bert_encoder=nn.Linear(bert.config.hidden_size, args.hidden_dim),
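
build_model now wires the two StyleEncoder instances straight into the nets, with no diffusion transformer behind them. A shape sketch of what such an encoder returns; the mel input layout is an assumption in the usual StyleTTS arrangement and is not spelled out in this diff, while the constructor defaults come from models.py.

# Shape sketch for the retained style encoders; the (batch, 1, n_mels, frames) layout is an
# assumption, the defaults dim_in=48, style_dim=48, max_conv_dim=384 come from models.py.
import torch
from models import StyleEncoder   # requires the repo (Utils/ etc.) on the import path

enc = StyleEncoder().eval()
mel = torch.randn(1, 1, 80, 300)  # assumed mel spectrogram input
with torch.no_grad():
    s = enc(mel)
print(s.shape)                    # expected torch.Size([1, 48]): one style vector per utterance
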
msinference.py CHANGED
@@ -1,25 +1,20 @@
 import torch
 from cached_path import cached_path
-import nltk
+# import nltk
 import audresample
 # nltk.download('punkt')
 import numpy as np
-np.random.seed(0)
-import time
 import yaml
-import torch.nn.functional as F
-import copy
 import torchaudio
 import librosa
 from models import *
 from munch import Munch
-from torch import nn
 from nltk.tokenize import word_tokenize
 
 torch.manual_seed(0)
 # torch.backends.cudnn.benchmark = False
 # torch.backends.cudnn.deterministic = True
-
+np.random.seed(0)
 
 # IPA Phonemizer: https://github.com/bootphon/phonemizer
 
@@ -164,11 +159,12 @@ _ = [model[key].eval() for key in model]
 
 def inference(text,
               ref_s,
-              alpha = 0.3,
-              beta = 0.7,
-              diffusion_steps=7, # 7 if voice is native English else 5 for non-native
               use_gruut=False):
+    # Ignore .,; AT end of sentence; or just [-50:]
+
+
     text = text.strip()
+
     ps = global_phonemizer.phonemize([text])
     # print(f'PHONEMIZER: {ps=}\n\n') #PHONEMIZER: ps=['ɐbˈɛbæbləm ']
     ps = word_tokenize(ps[0])
@@ -245,7 +241,7 @@ def inference(text,
                          F0_pred, N_pred, ref.squeeze().unsqueeze(0))
 
 
-    x = x.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model
+    x = x.squeeze().cpu().numpy()[..., :-74] # weird pulse at the end of the model
 
     x /= np.abs(x).max() + 1e-7
 
@@ -476,3 +472,5 @@ def foreign(text=None, # list of text
 
     # x = synthesize(text=_t, lang=LANG, speed=1.14)
     # audiofile.write('_r.wav', x, 16000) # mms-tts = 16,000
+
+
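
After this commit msinference.inference takes just the text, a reference style vector, and use_gruut. A hypothetical call with the trimmed signature; the reference wav path is a placeholder.

# Hypothetical usage of the simplified signature (no alpha/beta/diffusion_steps).
import soundfile
import msinference

ref_s = msinference.compute_style('voices/reference_speaker.wav')   # placeholder reference audio
wav = msinference.inference('Paragraph No. 47 is next.', ref_s)
soundfile.write('no47.wav', wav, 24000)                             # output is 24 kHz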