Spaces:
Runtime error
Runtime error
Upload synthesizer/synthesizer_dataset.py with huggingface_hub
Browse files
synthesizer/synthesizer_dataset.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch.utils.data import Dataset
|
3 |
+
import numpy as np
|
4 |
+
from pathlib import Path
|
5 |
+
from synthesizer.utils.text import text_to_sequence
|
6 |
+
|
7 |
+
|
8 |
+
class SynthesizerDataset(Dataset):
    """Dataset yielding (text, mel spectrogram, speaker embedding, index) samples.

    The metadata file is read line by line and each line is split on ``|``.
    From each line this class uses field 1 (mel file name), field 2 (embed
    file name), field 4 (an integer; lines where it is zero are dropped) and
    field 5 (the text, stripped of surrounding whitespace).
    """

    def __init__(self, metadata_fpath: Path, mel_dir: Path, embed_dir: Path, hparams):
        """Index the usable samples listed in the metadata file.

        Args:
            metadata_fpath: Path of the ``|``-separated metadata file.
            mel_dir: Directory that the mel file names are resolved against.
            embed_dir: Directory that the embed file names are resolved against.
            hparams: Hyperparameter object; ``tts_cleaner_names`` is read from
                it when a sample is fetched.
        """
        print("Using inputs from:\n\t%s\n\t%s\n\t%s" % (metadata_fpath, mel_dir, embed_dir))

        with metadata_fpath.open("r") as metadata_file:
            metadata = [line.split("|") for line in metadata_file]

        # Filter once: only lines whose fifth field is a non-zero integer are kept.
        usable = [fields for fields in metadata if int(fields[4])]

        self.samples_fpaths = [
            (mel_dir.joinpath(fields[1]), embed_dir.joinpath(fields[2]))
            for fields in usable
        ]
        self.samples_texts = [fields[5].strip() for fields in usable]
        # The unfiltered metadata is kept as well, one list of fields per line.
        self.metadata = metadata
        self.hparams = hparams

        print("Found %d samples" % len(self.samples_fpaths))

    def __getitem__(self, index):
        """Load one sample from disk.

        Args:
            index: Position of the sample. A list is also accepted, in which
                case only its first element is used.

        Returns:
            A tuple ``(text, mel, embed, index)`` where ``text`` is an int32
            array built from ``text_to_sequence``, ``mel`` is the transposed
            stored spectrogram as float32, ``embed`` is the stored embedding
            as float32 and ``index`` is the integer index that was used.
        """
        # Sometimes index may be a list of 2 (not sure why this happens).
        # If that is the case, return a single item corresponding to the first
        # element. `index is list` compared against the type object itself and
        # was never true, so the type has to be checked with isinstance.
        if isinstance(index, list):
            index = index[0]

        mel_path, embed_path = self.samples_fpaths[index]
        mel = np.load(mel_path).T.astype(np.float32)

        # Load the embed
        embed = np.load(embed_path)

        # Get the text and clean it
        text = text_to_sequence(self.samples_texts[index], self.hparams.tts_cleaner_names)

        # Convert the list returned by text_to_sequence to a numpy array
        text = np.asarray(text).astype(np.int32)

        # mel is already float32 at this point, so no second conversion is needed.
        return text, mel, embed.astype(np.float32), index

    def __len__(self):
        """Return the number of usable samples."""
        return len(self.samples_fpaths)
|
48 |
+
|
49 |
+
|
50 |
+
def collate_synthesizer(batch, r, hparams):
    """Collate samples into padded batch tensors.

    Args:
        batch: Non-empty sequence of ``(text, mel, embed, index)`` samples.
            Each ``mel`` is padded along its last axis only.
        r: Positive integer; the padded spectrogram length is rounded up to a
            multiple of it (presumably the decoder reduction factor - confirm
            with the caller).
        hparams: Object exposing ``symmetric_mels`` and, when that is truthy,
            ``max_abs_value``.

    Returns:
        A tuple ``(chars, mel, embeds, indices)``: ``chars`` is a long tensor
        of the zero-padded texts, ``mel`` a tensor of the padded spectrograms,
        ``embeds`` a tensor of the stacked embeddings and ``indices`` a plain
        list of the sample indices.
    """
    # Text
    x_lens = [len(x[0]) for x in batch]
    max_x_len = max(x_lens)

    chars = np.stack([pad1d(x[0], max_x_len) for x in batch])

    # Mel spectrogram: always leave at least one padding frame, then round the
    # length up to the next multiple of r.
    spec_lens = [x[1].shape[-1] for x in batch]
    max_spec_len = max(spec_lens) + 1
    if max_spec_len % r != 0:
        max_spec_len += r - max_spec_len % r

    # WaveRNN mel spectrograms are normalized to [0, 1] so zero padding adds silence
    # By default, SV2TTS uses symmetric mels, where -1*max_abs_value is silence.
    if hparams.symmetric_mels:
        mel_pad_value = -1 * hparams.max_abs_value
    else:
        mel_pad_value = 0

    mel = np.stack([pad2d(x[1], max_spec_len, pad_value=mel_pad_value) for x in batch])

    # Speaker embedding (SV2TTS). Stack into one ndarray first: handing
    # torch.tensor a Python list of ndarrays is very slow and emits a warning.
    embeds = np.stack([x[2] for x in batch])

    # Index (for vocoder preprocessing)
    indices = [x[3] for x in batch]

    # Convert all to tensor
    chars = torch.tensor(chars).long()
    mel = torch.tensor(mel)
    embeds = torch.tensor(embeds)

    return chars, mel, embeds, indices
|
87 |
+
|
88 |
+
def pad1d(x, max_len, pad_value=0):
    """Right-pad the 1-D sequence ``x`` with ``pad_value`` up to ``max_len`` items."""
    missing = max_len - len(x)
    widths = (0, missing)  # nothing in front, `missing` items at the end
    return np.pad(x, widths, mode="constant", constant_values=pad_value)
|
90 |
+
|
91 |
+
def pad2d(x, max_len, pad_value=0):
    """Right-pad the last axis of the 2-D array ``x`` with ``pad_value`` up to ``max_len``."""
    missing = max_len - x.shape[-1]
    # The first axis is left untouched; only the end of the last axis grows.
    widths = ((0, 0), (0, missing))
    return np.pad(x, widths, mode="constant", constant_values=pad_value)
|