Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +2 -0
- StyleTTS_Accelerate/Configs/config.yml +80 -0
- StyleTTS_Accelerate/Configs/config_44.1khz.yml +80 -0
- StyleTTS_Accelerate/Data/ani_train.csv +0 -0
- StyleTTS_Accelerate/Data/ani_train_only_longs.csv +0 -0
- StyleTTS_Accelerate/Data/train_list.txt +0 -0
- StyleTTS_Accelerate/Data/train_list_libritts.txt +3 -0
- StyleTTS_Accelerate/Data/val_list.txt +100 -0
- StyleTTS_Accelerate/Data/val_list_libritts.txt +195 -0
- StyleTTS_Accelerate/Demo/Inference_LJSpeech.ipynb +417 -0
- StyleTTS_Accelerate/Demo/Inference_LibriTTS.ipynb +529 -0
- StyleTTS_Accelerate/Demo/hifi-gan/__pycache__/vocoder.cpython-310.pyc +0 -0
- StyleTTS_Accelerate/Demo/hifi-gan/__pycache__/vocoder.cpython-39.pyc +0 -0
- StyleTTS_Accelerate/Demo/hifi-gan/__pycache__/vocoder_utils.cpython-310.pyc +0 -0
- StyleTTS_Accelerate/Demo/hifi-gan/__pycache__/vocoder_utils.cpython-39.pyc +0 -0
- StyleTTS_Accelerate/Demo/hifi-gan/vocoder.py +283 -0
- StyleTTS_Accelerate/Demo/hifi-gan/vocoder_utils.py +58 -0
- StyleTTS_Accelerate/LICENSE +21 -0
- StyleTTS_Accelerate/LICENSE copy +21 -0
- StyleTTS_Accelerate/Models/Anispeech/config.yml +80 -0
- StyleTTS_Accelerate/Models/Anispeech/epoch_1st_00020.pth +3 -0
- StyleTTS_Accelerate/Models/Anispeech/epoch_2nd_00015.pth +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735697608.khodaya-basse-dige.344916.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735697814.khodaya-basse-dige.346056.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735698320.khodaya-basse-dige.347680.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735698764.khodaya-basse-dige.349633.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735698917.khodaya-basse-dige.350828.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735721417.khodaya-basse-dige.404215.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735721458.khodaya-basse-dige.404475.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735723135.khodaya-basse-dige.409798.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735736169.khodaya-basse-dige.8849.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735753783.khodaya-basse-dige.55757.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735753897.khodaya-basse-dige.56741.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735753979.khodaya-basse-dige.58472.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735754151.khodaya-basse-dige.59652.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735754204.khodaya-basse-dige.60572.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755068.khodaya-basse-dige.62584.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755116.khodaya-basse-dige.63449.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755175.khodaya-basse-dige.64734.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755213.khodaya-basse-dige.65681.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755246.khodaya-basse-dige.66573.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755299.khodaya-basse-dige.67690.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech/train.log +0 -0
- StyleTTS_Accelerate/Models/Anispeech_with_DIFF/config.yml +80 -0
- StyleTTS_Accelerate/Models/Anispeech_with_DIFF/epoch_1st_00040.pth +3 -0
- StyleTTS_Accelerate/Models/Anispeech_with_DIFF/epoch_2nd_00014.pth +3 -0
- StyleTTS_Accelerate/Models/Anispeech_with_DIFF/tensorboard/events.out.tfevents.1735755378.khodaya-basse-dige.68815.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech_with_DIFF/tensorboard/events.out.tfevents.1735758983.khodaya-basse-dige.79079.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech_with_DIFF/tensorboard/events.out.tfevents.1735759171.khodaya-basse-dige.80201.0 +3 -0
- StyleTTS_Accelerate/Models/Anispeech_with_DIFF/tensorboard/events.out.tfevents.1735759231.khodaya-basse-dige.81123.0 +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
StyleTTS_Accelerate/Data/train_list_libritts.txt filter=lfs diff=lfs merge=lfs -text
|
37 |
+
StyleTTS_Accelerate/Utils/JDC/bst.t7 filter=lfs diff=lfs merge=lfs -text
|
StyleTTS_Accelerate/Configs/config.yml
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
log_dir: "Models/Anispeech_with_DIFF"
|
2 |
+
first_stage_path: "/home/ubuntu/StyleTTS_Accelerate_44khz/Models/Anispeech/epoch_1st_00020.pth"
|
3 |
+
save_freq: 1
|
4 |
+
log_interval: 10
|
5 |
+
device: "cuda"
|
6 |
+
multigpu: false
|
7 |
+
epochs_1st: 200 # number of epochs for first stage training
|
8 |
+
epochs_2nd: 100 # number of epochs for second stage training
|
9 |
+
batch_size: 32
|
10 |
+
pretrained_model: ""
|
11 |
+
second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
|
12 |
+
load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
|
13 |
+
|
14 |
+
diff_epoch: 5
|
15 |
+
|
16 |
+
train_data: "/home/ubuntu/StyleTTS_Accelerate_44khz/Data/ani_train_only_longs.csv"
|
17 |
+
val_data: "/home/ubuntu/StyleTTS_Accelerate_44khz/Data/val_list_libritts.txt"
|
18 |
+
|
19 |
+
F0_path: "Utils/JDC/bst.t7"
|
20 |
+
ASR_config: "Utils/ASR/config.yml"
|
21 |
+
ASR_path: "Utils/ASR/epoch_00080.pth"
|
22 |
+
|
23 |
+
preprocess_params:
|
24 |
+
sr: 24000
|
25 |
+
spect_params:
|
26 |
+
n_fft: 2048
|
27 |
+
win_length: 1200
|
28 |
+
hop_length: 300
|
29 |
+
|
30 |
+
model_params:
|
31 |
+
hidden_dim: 512
|
32 |
+
n_token: 178
|
33 |
+
style_dim: 128
|
34 |
+
n_layer: 3
|
35 |
+
dim_in: 64
|
36 |
+
max_conv_dim: 512
|
37 |
+
n_mels: 80
|
38 |
+
dropout: 0.2
|
39 |
+
|
40 |
+
|
41 |
+
diffusion:
|
42 |
+
embedding_mask_proba: 0.1
|
43 |
+
# transformer config
|
44 |
+
transformer:
|
45 |
+
num_layers: 3
|
46 |
+
num_heads: 8
|
47 |
+
head_features: 64
|
48 |
+
multiplier: 2
|
49 |
+
|
50 |
+
# diffusion distribution config
|
51 |
+
dist:
|
52 |
+
sigma_data: 0.2 # placeholder value, used when estimate_sigma_data is set to false
|
53 |
+
estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
|
54 |
+
mean: -3.0
|
55 |
+
std: 1.0
|
56 |
+
|
57 |
+
|
58 |
+
loss_params:
|
59 |
+
lambda_mel: 10. # mel reconstruction loss (1st & 2nd stage)
|
60 |
+
lambda_adv: 1. # adversarial loss (1st & 2nd stage)
|
61 |
+
lambda_reg: 1. # adversarial regularization loss (1st & 2nd stage)
|
62 |
+
lambda_fm: 0.1 # feature matching loss (1st & 2nd stage)
|
63 |
+
|
64 |
+
lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
|
65 |
+
lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
|
66 |
+
TMA_epoch: 2 # TMA starting epoch (1st stage)
|
67 |
+
|
68 |
+
# https://github.com/yl4579/StyleTTS/issues/7
|
69 |
+
TMA_CEloss: false # whether to use cross-entropy (CE) loss for TMA
|
70 |
+
|
71 |
+
lambda_F0: 1. # F0 reconstruction loss (2nd stage)
|
72 |
+
lambda_norm: 1. # norm reconstruction loss (2nd stage)
|
73 |
+
lambda_dur: 1. # duration loss (2nd stage)
|
74 |
+
lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
|
75 |
+
|
76 |
+
lambda_sty: 1. # style reconstruction loss (2nd stage)
|
77 |
+
lambda_diff: 1. # score matching loss (2nd stage)
|
78 |
+
|
79 |
+
optimizer_params:
|
80 |
+
lr: 0.0001
|
StyleTTS_Accelerate/Configs/config_44.1khz.yml
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
log_dir: "Models/LJSpeech"
|
2 |
+
first_stage_path: "/home/ubuntu/StyleTTS_Accelerate/Models/LJSpeech/epoch_1st_00040.pth"
|
3 |
+
save_freq: 1
|
4 |
+
log_interval: 10
|
5 |
+
device: "cuda"
|
6 |
+
multigpu: false
|
7 |
+
epochs_1st: 200 # number of epochs for first stage training
|
8 |
+
epochs_2nd: 100 # number of epochs for second stage training
|
9 |
+
batch_size: 32
|
10 |
+
pretrained_model: "/home/ubuntu/StyleTTS_Accelerate/Models/LJSpeech/epoch_1st_00004.pth"
|
11 |
+
second_stage_load_pretrained: false # set to true if the pre-trained model is for 2nd stage
|
12 |
+
load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
|
13 |
+
|
14 |
+
diff_epoch: 5
|
15 |
+
|
16 |
+
train_data: "Data/train_list.txt"
|
17 |
+
val_data: "Data/val_list.txt"
|
18 |
+
|
19 |
+
F0_path: "Utils/JDC/bst.t7"
|
20 |
+
ASR_config: "Utils/ASR/config.yml"
|
21 |
+
ASR_path: "Utils/ASR/epoch_00080.pth"
|
22 |
+
|
23 |
+
preprocess_params:
|
24 |
+
sr: 44_100
|
25 |
+
spect_params:
|
26 |
+
n_fft: 2048
|
27 |
+
win_length: 2048
|
28 |
+
hop_length: 512
|
29 |
+
|
30 |
+
model_params:
|
31 |
+
hidden_dim: 512
|
32 |
+
n_token: 178
|
33 |
+
style_dim: 128
|
34 |
+
n_layer: 3
|
35 |
+
dim_in: 64
|
36 |
+
max_conv_dim: 512
|
37 |
+
n_mels: 128
|
38 |
+
dropout: 0.2
|
39 |
+
|
40 |
+
|
41 |
+
diffusion:
|
42 |
+
embedding_mask_proba: 0.1
|
43 |
+
# transformer config
|
44 |
+
transformer:
|
45 |
+
num_layers: 3
|
46 |
+
num_heads: 8
|
47 |
+
head_features: 64
|
48 |
+
multiplier: 2
|
49 |
+
|
50 |
+
# diffusion distribution config
|
51 |
+
dist:
|
52 |
+
sigma_data: 0.2 # placeholder value, used when estimate_sigma_data is set to false
|
53 |
+
estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
|
54 |
+
mean: -3.0
|
55 |
+
std: 1.0
|
56 |
+
|
57 |
+
|
58 |
+
loss_params:
|
59 |
+
lambda_mel: 10. # mel reconstruction loss (1st & 2nd stage)
|
60 |
+
lambda_adv: 1. # adversarial loss (1st & 2nd stage)
|
61 |
+
lambda_reg: 1. # adversarial regularization loss (1st & 2nd stage)
|
62 |
+
lambda_fm: 0.1 # feature matching loss (1st & 2nd stage)
|
63 |
+
|
64 |
+
lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
|
65 |
+
lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
|
66 |
+
TMA_epoch: 2 # TMA starting epoch (1st stage)
|
67 |
+
|
68 |
+
# https://github.com/yl4579/StyleTTS/issues/7
|
69 |
+
TMA_CEloss: false # whether to use cross-entropy (CE) loss for TMA
|
70 |
+
|
71 |
+
lambda_F0: 1. # F0 reconstruction loss (2nd stage)
|
72 |
+
lambda_norm: 1. # norm reconstruction loss (2nd stage)
|
73 |
+
lambda_dur: 1. # duration loss (2nd stage)
|
74 |
+
lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
|
75 |
+
|
76 |
+
lambda_sty: 1. # style reconstruction loss (2nd stage)
|
77 |
+
lambda_diff: 1. # score matching loss (2nd stage)
|
78 |
+
|
79 |
+
optimizer_params:
|
80 |
+
lr: 0.0001
|
StyleTTS_Accelerate/Data/ani_train.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
StyleTTS_Accelerate/Data/ani_train_only_longs.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
StyleTTS_Accelerate/Data/train_list.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
StyleTTS_Accelerate/Data/train_list_libritts.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:07ced2d20dc0163f3a25d52c8544f63ffba4e9608664762325832f26376c402f
|
3 |
+
size 31691428
|
StyleTTS_Accelerate/Data/val_list.txt
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
LJSpeech-1.1/wavs/LJ022-0023.wav|ðɪ ˌoʊvɚwˈɛlmɪŋ mədʒˈɔːɹᵻɾi ʌv pˈiːpəl ɪn ðɪs kˈʌntɹi nˈoʊ hˌaʊ tə sˈɪft ðə wˈiːt fɹʌmðə tʃˈæf ɪn wʌt ðeɪ hˈɪɹ ænd wʌt ðeɪ ɹˈiːd .|0
|
2 |
+
LJSpeech-1.1/wavs/LJ043-0030.wav|ɪf sˈʌmbɑːdi dˈɪd ðˈæt tə mˌiː , ɐ lˈaʊsi tɹˈɪk lˈaɪk ðˈæt , tə tˈeɪk maɪ wˈaɪf ɐwˈeɪ , ænd ˈɔːl ðə fˈɜːnɪtʃɚ , aɪ wʊd biː mˈæd æz hˈɛl , tˈuː .|0
|
3 |
+
LJSpeech-1.1/wavs/LJ005-0201.wav|ˌæzˌɪz ʃˈoʊn baɪ ðə ɹᵻpˈoːɹt ʌvðə kəmˈɪʃənɚz tʊ ɪŋkwˈaɪɚɹ ˌɪntʊ ðə stˈeɪt ʌvðə mjuːnˈɪsɪpəl kˌɔːɹpɚɹˈeɪʃənz ɪn ˈeɪtiːn θˈɜːɾi fˈaɪv .|0
|
4 |
+
LJSpeech-1.1/wavs/LJ001-0110.wav|ˈiːvən ðə kˈæslɑːn tˈaɪp wɛn ɛnlˈɑːɹdʒd ʃˈoʊz ɡɹˈeɪt ʃˈɔːɹtkʌmɪŋz ɪn ðɪs ɹᵻspˈɛkt :|0
|
5 |
+
LJSpeech-1.1/wavs/LJ003-0345.wav|ˈɔːl ðə kəmˈɪɾi kʊd dˈuː ɪn ðɪs ɹᵻspˈɛkt wʌz tə θɹˈoʊ ðə ɹᵻspˌɑːnsəbˈɪlɪɾi ˌɔn ˈʌðɚz .|0
|
6 |
+
LJSpeech-1.1/wavs/LJ007-0154.wav|ðiːz pˈʌndʒənt ænd wˈɛl ɡɹˈaʊndᵻd stɹˈɪktʃɚz ɐplˈaɪd wɪð stˈɪl ɡɹˈeɪɾɚ fˈoːɹs tə ðɪ ʌŋkənvˈɪktᵻd pɹˈɪzənɚ , ðə mˈæn hˌuː kˈeɪm tə ðə pɹˈɪzən ˈɪnəsənt , ænd stˈɪl ʌŋkəntˈæmᵻnˌeɪɾᵻd ,|0
|
7 |
+
LJSpeech-1.1/wavs/LJ018-0098.wav|ænd ɹˈɛkəɡnˌaɪzd æz wˈʌn ʌvðə fɹˈiːkwɛntɚz ʌvðə bˈoʊɡəs lˈɔː stˈeɪʃənɚz . hɪz ɚɹˈɛst lˈɛd tə ðæt ʌv ˈʌðɚz .|0
|
8 |
+
LJSpeech-1.1/wavs/LJ047-0044.wav|ˈɑːswəld wʌz , haʊˈɛvɚ , wˈɪlɪŋ tə dɪskˈʌs hɪz kˈɑːntækts wɪð sˈoʊviət ɐθˈɔːɹɪɾiz . hiː dᵻnˈaɪd hˌævɪŋ ˌɛni ɪnvˈɑːlvmənt wɪð sˈoʊviət ɪntˈɛlɪdʒəns ˈeɪdʒənsiz|0
|
9 |
+
LJSpeech-1.1/wavs/LJ031-0038.wav|ðə fˈɜːst fɪzˈɪʃən tə sˈiː ðə pɹˈɛzɪdənt æt pˈɑːɹklənd hˈɑːspɪɾəl wʌz dˈɑːktɚ . tʃˈɑːɹlz dʒˈeɪ . kˈæɹɪkˌoʊ , ɐ ɹˈɛzᵻdənt ɪn dʒˈɛnɚɹəl sˈɜːdʒɚɹi .|0
|
10 |
+
LJSpeech-1.1/wavs/LJ048-0194.wav|dˈʊɹɹɪŋ ðə mˈɔːɹnɪŋ ʌv noʊvˈɛmbɚ twˈɛnti tˈuː pɹˈaɪɚ tə ðə mˈoʊɾɚkˌeɪd .|0
|
11 |
+
LJSpeech-1.1/wavs/LJ049-0026.wav|ˌɔn əkˈeɪʒən ðə sˈiːkɹᵻt sˈɜːvɪs hɐzbɪn pɚmˈɪɾᵻd tə hæv ɐn ˈeɪdʒənt ɹˈaɪdɪŋ ɪnðə pˈæsɪndʒɚ kəmpˈɑːɹtmənt wɪððə pɹˈɛzɪdənt .|0
|
12 |
+
LJSpeech-1.1/wavs/LJ004-0152.wav|ɔːlðˈoʊ æt mˈɪstɚ . bˈʌkstənz vˈɪzɪt ɐ nˈuː dʒˈeɪl wʌz ɪn pɹˈɑːsɛs ʌv ɪɹˈɛkʃən , ðə fˈɜːst stˈɛp təwˈɔːɹdz ɹᵻfˈɔːɹm sˈɪns hˈaʊɚdz vˌɪzɪtˈeɪʃən ɪn sˈɛvəntˌiːn sˈɛvənti fˈoːɹ .|0
|
13 |
+
LJSpeech-1.1/wavs/LJ008-0278.wav|ɔːɹ ðˈɛɹz mˌaɪt biː wˈʌn ʌv mˈɛni , ænd ɪt mˌaɪt biː kənsˈɪdɚd nˈɛsᵻsɚɹi tə dˈɑːlɚ mˌeɪk ɐn ɛɡzˈæmpəl.dˈɑːlɚ|0
|
14 |
+
LJSpeech-1.1/wavs/LJ043-0002.wav|ðə wˈɔːɹəŋ kəmˈɪʃən ɹᵻpˈoːɹt . baɪ ðə pɹˈɛzɪdənts kəmˈɪʃən ɔnðɪ ɐsˌæsᵻnˈeɪʃən ʌv pɹˈɛzɪdənt kˈɛnədi . tʃˈæptɚ sˈɛvən . lˈiː hˈɑːɹvi ˈɑːswəld :|0
|
15 |
+
LJSpeech-1.1/wavs/LJ009-0114.wav|mˈɪstɚ . wˈeɪkfiːld wˈaɪndz ˈʌp hɪz ɡɹˈæfɪk bˌʌt sˈʌmwʌt sɛnsˈeɪʃənəl ɐkˈaʊnt baɪ dᵻskɹˈaɪbɪŋ ɐnˈʌðɚ ɹᵻlˈɪdʒəs sˈɜːvɪs , wˌɪtʃ mˈeɪ ɐpɹˈoʊpɹɪˌeɪtli biː ɪnsˈɜːɾᵻd hˈɪɹ .|0
|
16 |
+
LJSpeech-1.1/wavs/LJ028-0506.wav|ɐ mˈɑːdɚn ˈɑːɹɾɪst wʊdhɐv dˈɪfɪkˌʌlti ɪn dˌuːɪŋ sˈʌtʃ ˈækjʊɹət wˈɜːk .|0
|
17 |
+
LJSpeech-1.1/wavs/LJ050-0168.wav|wɪððə pɚtˈɪkjʊlɚ pˈɜːpəsᵻz ʌvðɪ ˈeɪdʒənsi ɪnvˈɑːlvd . ðə kəmˈɪʃən ɹˈɛkəɡnˌaɪzᵻz ðæt ðɪs ɪz ɐ kˌɑːntɹəvˈɜːʃəl ˈɛɹiə|0
|
18 |
+
LJSpeech-1.1/wavs/LJ039-0223.wav|ˈɑːswəldz mɚɹˈiːn tɹˈeɪnɪŋ ɪn mˈɑːɹksmənʃˌɪp , hɪz ˈʌðɚ ɹˈaɪfəl ɛkspˈiəɹɪəns ænd hɪz ɪstˈæblɪʃt fəmˌɪliˈæɹɪɾi wɪð ðɪs pɚtˈɪkjʊlɚ wˈɛpən|0
|
19 |
+
LJSpeech-1.1/wavs/LJ029-0032.wav|ɐkˈoːɹdɪŋ tʊ oʊdˈɑːnəl , kwˈoʊt , wiː hæd ɐ mˈoʊɾɚkˌeɪd wɛɹˈɛvɚ kplˈʌsplʌs wˌɪtʃ hɐdbɪn bˌɪn hˈeɪstili sˈʌmənd fɚðə ðə pˈɜːpəs wiː wˈɛnt , ˈɛnd kwˈoʊt .|0
|
20 |
+
LJSpeech-1.1/wavs/LJ031-0070.wav|dˈɑːktɚ . klˈɑːɹk , hˌuː mˈoʊst klˈoʊsli əbzˈɜːvd ðə hˈɛd wˈuːnd ,|0
|
21 |
+
LJSpeech-1.1/wavs/LJ034-0198.wav|jˈuːɪnz , hˌuː wʌz ɔnðə saʊθwˈɛst kˈɔːɹnɚɹ ʌv ˈɛlm ænd hjˈuːstən stɹˈiːts tˈɛstᵻfˌaɪd ðæt hiː kʊd nˌɑːt dᵻskɹˈaɪb ðə mˈæn hiː sˈɔː ɪnðə wˈɪndoʊ .|0
|
22 |
+
LJSpeech-1.1/wavs/LJ026-0068.wav|ˈɛnɚdʒi ˈɛntɚz ðə plˈænt , tʊ ɐ smˈɔːl ɛkstˈɛnt ,|0
|
23 |
+
LJSpeech-1.1/wavs/LJ039-0075.wav|wˈʌns juː nˈoʊ ðæt juː mˈʌst pˌʊt ðə kɹˈɔshɛɹz ɔnðə tˈɑːɹɡɪt ænd ðæt ɪz ˈɔːl ðæt ɪz nˈɛsᵻsɚɹi .|0
|
24 |
+
LJSpeech-1.1/wavs/LJ004-0096.wav|ðə fˈeɪɾəl kˈɑːnsɪkwənsᵻz wˈɛɹɑːf mˌaɪt biː pɹɪvˈɛntᵻd ɪf ðə dʒˈʌstɪsᵻz ʌvðə pˈiːs wɜː djˈuːli ˈɔːθɚɹˌaɪzd|0
|
25 |
+
LJSpeech-1.1/wavs/LJ005-0014.wav|spˈiːkɪŋ ˌɔn ɐ dᵻbˈeɪt ˌɔn pɹˈɪzən mˈæɾɚz , hiː dᵻklˈɛɹd ðˈæt|0
|
26 |
+
LJSpeech-1.1/wavs/LJ012-0161.wav|hiː wʌz ɹᵻpˈoːɹɾᵻd tə hæv fˈɔːlən ɐwˈeɪ tʊ ɐ ʃˈædoʊ .|0
|
27 |
+
LJSpeech-1.1/wavs/LJ018-0239.wav|hɪz dˌɪsɐpˈɪɹəns ɡˈeɪv kˈʌlɚ ænd sˈʌbstəns tʊ ˈiːvəl ɹᵻpˈoːɹts ɔːlɹˌɛdi ɪn sˌɜːkjʊlˈeɪʃən ðætðə wɪl ænd kənvˈeɪəns əbˌʌv ɹᵻfˈɜːd tuː|0
|
28 |
+
LJSpeech-1.1/wavs/LJ019-0257.wav|hˈɪɹ ðə tɹˈɛd wˈiːl wʌz ɪn jˈuːs , ðɛɹ sˈɛljʊlɚ kɹˈæŋks , ɔːɹ hˈɑːɹd lˈeɪbɚ məʃˈiːnz .|0
|
29 |
+
LJSpeech-1.1/wavs/LJ028-0008.wav|juː tˈæp dʒˈɛntli wɪð jʊɹ hˈiːl əpˌɑːn ðə ʃˈoʊldɚɹ ʌvðə dɹˈoʊmdɚɹi tʊ ˈɜːdʒ hɜːɹ ˈɔn .|0
|
30 |
+
LJSpeech-1.1/wavs/LJ024-0083.wav|ðɪs plˈæn ʌv mˈaɪn ɪz nˈoʊ ɐtˈæk ɔnðə kˈoːɹt ;|0
|
31 |
+
LJSpeech-1.1/wavs/LJ042-0129.wav|nˈoʊ nˈaɪt klˈʌbz ɔːɹ bˈoʊlɪŋ ˈælɪz , nˈoʊ plˈeɪsᵻz ʌv ɹˌɛkɹiːˈeɪʃən ɛksˈɛpt ðə tɹˈeɪd jˈuːniən dˈænsᵻz . aɪ hæv hæd ɪnˈʌf .|0
|
32 |
+
LJSpeech-1.1/wavs/LJ036-0103.wav|ðə pəlˈiːs ˈæskt hˌɪm wˈɛðɚ hiː kʊd pˈɪk ˈaʊt hɪz pˈæsɪndʒɚ fɹʌmðə lˈaɪnʌp .|0
|
33 |
+
LJSpeech-1.1/wavs/LJ046-0058.wav|dˈʊɹɹɪŋ hɪz pɹˈɛzɪdənsi , fɹˈæŋklɪn dˈiː . ɹˈoʊzəvˌɛlt mˌeɪd ˈɔːlmoʊst fˈoːɹ hˈʌndɹɪd dʒˈɜːniz ænd tɹˈævəld mˈoːɹ ðɐn θɹˈiː hˈʌndɹɪd fˈɪfti θˈaʊzənd mˈaɪlz .|0
|
34 |
+
LJSpeech-1.1/wavs/LJ014-0076.wav|hiː wʌz sˈiːn ˈæftɚwɚdz smˈoʊkɪŋ ænd tˈɔːkɪŋ wɪð hɪz hˈoʊsts ɪn ðɛɹ bˈæk pˈɑːɹlɚ , ænd nˈɛvɚ sˈiːn ɐɡˈɛn ɐlˈaɪv .|0
|
35 |
+
LJSpeech-1.1/wavs/LJ002-0043.wav|lˈɔŋ nˈæɹoʊ ɹˈuːmz wˈʌn θˈɜːɾi sˈɪks fˈiːt , sˈɪks twˈɛnti θɹˈiː fˈiːt , ænd ðɪ ˈeɪtθ ˈeɪtiːn ,|0
|
36 |
+
LJSpeech-1.1/wavs/LJ009-0076.wav|wiː kˈʌm tə ðə sˈɜːmən .|0
|
37 |
+
LJSpeech-1.1/wavs/LJ017-0131.wav|ˈiːvən wɛn ðə hˈaɪ ʃˈɛɹɪf hæd tˈoʊld hˌɪm ðɛɹwˌʌz nˈoʊ pˌɑːsəbˈɪlɪɾi əvɚ ɹᵻpɹˈiːv , ænd wɪðˌɪn ɐ fjˈuː ˈaʊɚz ʌv ˌɛksɪkjˈuːʃən .|0
|
38 |
+
LJSpeech-1.1/wavs/LJ046-0184.wav|bˌʌt ðɛɹ ɪz ɐ sˈɪstəm fɚðɪ ɪmˈiːdɪət nˌoʊɾɪfɪkˈeɪʃən ʌvðə sˈiːkɹᵻt sˈɜːvɪs baɪ ðə kənfˈaɪnɪŋ ˌɪnstɪtˈuːʃən wɛn ɐ sˈʌbdʒɛkt ɪz ɹᵻlˈiːst ɔːɹ ɛskˈeɪps .|0
|
39 |
+
LJSpeech-1.1/wavs/LJ014-0263.wav|wˌɛn ˈʌðɚ plˈɛʒɚz pˈɔːld hiː tˈʊk ɐ θˈiəɾɚ , ænd pˈoʊzd æz ɐ mjuːnˈɪfɪsənt pˈeɪtɹən ʌvðə dɹəmˈæɾɪk ˈɑːɹt .|0
|
40 |
+
LJSpeech-1.1/wavs/LJ042-0096.wav|ˈoʊld ɛkstʃˈeɪndʒ ɹˈeɪt ɪn ɐdˈɪʃən tə hɪz fˈæktɚɹi sˈælɚɹi ʌv ɐpɹˈɑːksɪmətli ˈiːkwəl ɐmˈaʊnt|0
|
41 |
+
LJSpeech-1.1/wavs/LJ049-0050.wav|hˈɪl hæd bˈoʊθ fˈiːt ɔnðə kˈɑːɹ ænd wʌz klˈaɪmɪŋ ɐbˈoːɹd tʊ ɐsˈɪst pɹˈɛzɪdənt ænd mˈɪsɪz . kˈɛnədi .|0
|
42 |
+
LJSpeech-1.1/wavs/LJ019-0186.wav|sˈiːɪŋ ðæt sˈɪns ðɪ ɪstˈæblɪʃmənt ʌvðə sˈɛntɹəl kɹˈɪmɪnəl kˈoːɹt , nˈuːɡeɪt ɹᵻsˈiːvd pɹˈɪzənɚz fɔːɹ tɹˈaɪəl fɹʌm sˈɛvɹəl kˈaʊntiz ,|0
|
43 |
+
LJSpeech-1.1/wavs/LJ028-0307.wav|ðˈɛn lˈɛt twˈɛnti dˈeɪz pˈæs , ænd æt ðɪ ˈɛnd ʌv ðæt tˈaɪm stˈeɪʃən nˌɪɹ ðə tʃˈældæsəŋ ɡˈeɪts ɐ bˈɑːdi ʌv fˈoːɹ θˈaʊzənd .|0
|
44 |
+
LJSpeech-1.1/wavs/LJ012-0235.wav|wˌaɪl ðeɪ wɜːɹ ɪn ɐ stˈeɪt ʌv ɪnsˌɛnsəbˈɪlɪɾi ðə mˈɜːdɚ wʌz kəmˈɪɾᵻd .|0
|
45 |
+
LJSpeech-1.1/wavs/LJ034-0053.wav|ɹˈiːtʃt ðə sˈeɪm kəŋklˈuːʒən æz lætˈoʊnə ðætðə pɹˈɪnts fˈaʊnd ɔnðə kˈɑːɹtənz wɜː ðoʊz ʌv lˈiː hˈɑːɹvi ˈɑːswəld .|0
|
46 |
+
LJSpeech-1.1/wavs/LJ014-0030.wav|ðiːz wɜː dˈæmnətˌoːɹi fˈækts wˌɪtʃ wˈɛl səpˈoːɹɾᵻd ðə pɹˌɑːsɪkjˈuːʃən .|0
|
47 |
+
LJSpeech-1.1/wavs/LJ015-0203.wav|bˌʌt wɜː ðə pɹɪkˈɔːʃənz tˈuː mˈɪnɪt , ðə vˈɪdʒɪləns tˈuː klˈoʊs təbi ᵻlˈuːdᵻd ɔːɹ ˌoʊvɚkˈʌm ?|0
|
48 |
+
LJSpeech-1.1/wavs/LJ028-0093.wav|bˌʌt hɪz skɹˈaɪb ɹˈoʊt ɪɾ ɪnðə mˈænɚ kˈʌstəmˌɛɹi fɚðə skɹˈaɪbz ʌv ðoʊz dˈeɪz tə ɹˈaɪt ʌv ðɛɹ ɹˈɔɪəl mˈæstɚz .|0
|
49 |
+
LJSpeech-1.1/wavs/LJ002-0018.wav|ðɪ ɪnˈædɪkwəsi ʌvðə dʒˈeɪl wʌz nˈoʊɾɪst ænd ɹᵻpˈoːɹɾᵻd əpˌɑːn ɐɡˈɛn ænd ɐɡˈɛn baɪ ðə ɡɹˈænd dʒˈʊɹɹiz ʌvðə sˈɪɾi ʌv lˈʌndən ,|0
|
50 |
+
LJSpeech-1.1/wavs/LJ028-0275.wav|æt lˈæst , ɪnðə twˈɛntiəθ mˈʌnθ ,|0
|
51 |
+
LJSpeech-1.1/wavs/LJ012-0042.wav|wˌɪtʃ hiː kˈɛpt kənsˈiːld ɪn ɐ hˈaɪdɪŋ plˈeɪs wɪð ɐ tɹˈæp dˈoːɹ dʒˈʌst ˌʌndɚ hɪz bˈɛd .|0
|
52 |
+
LJSpeech-1.1/wavs/LJ011-0096.wav|hiː mˈæɹid ɐ lˈeɪdi ˈɔːlsoʊ bᵻlˈɔŋɪŋ tə ðə səsˈaɪəɾi ʌv fɹˈɛndz , hˌuː bɹˈɔːt hˌɪm ɐ lˈɑːɹdʒ fˈɔːɹtʃʊn , wˈɪtʃ , ænd hɪz ˈoʊn mˈʌni , hiː pˌʊt ˌɪntʊ ɐ sˈɪɾi fˈɜːm ,|0
|
53 |
+
LJSpeech-1.1/wavs/LJ036-0077.wav|ɹˈɑːdʒɚ dˈiː . kɹˈeɪɡ , ɐ dˈɛpjuːɾi ʃˈɛɹɪf ʌv dˈæləs kˈaʊnti ,|0
|
54 |
+
LJSpeech-1.1/wavs/LJ016-0318.wav|ˈʌðɚɹ əfˈɪʃəlz , ɡɹˈeɪt lˈɔɪɚz , ɡˈʌvɚnɚz ʌv pɹˈɪzənz , ænd tʃˈæplɪnz səpˈoːɹɾᵻd ðɪs vjˈuː .|0
|
55 |
+
LJSpeech-1.1/wavs/LJ013-0164.wav|hˌuː kˈeɪm fɹʌm hɪz ɹˈuːm ɹˈɛdi dɹˈɛst , ɐ səspˈɪʃəs sˈɜːkəmstˌæns , æz hiː wʌz ˈɔːlweɪz lˈeɪt ɪnðə mˈɔːɹnɪŋ .|0
|
56 |
+
LJSpeech-1.1/wavs/LJ027-0141.wav|ɪz klˈoʊsli ɹᵻpɹədˈuːst ɪnðə lˈaɪf hˈɪstɚɹi ʌv ɛɡzˈɪstɪŋ dˈɪɹ . ɔːɹ , ɪn ˈʌðɚ wˈɜːdz ,|0
|
57 |
+
LJSpeech-1.1/wavs/LJ028-0335.wav|ɐkˈoːɹdɪŋli ðeɪ kəmˈɪɾᵻd tə hˌɪm ðə kəmˈænd ʌv ðɛɹ hˈoʊl ˈɑːɹmi , ænd pˌʊt ðə kˈiːz ʌv ðɛɹ sˈɪɾi ˌɪntʊ hɪz hˈændz .|0
|
58 |
+
LJSpeech-1.1/wavs/LJ031-0202.wav|mˈɪsɪz . kˈɛnədi tʃˈoʊz ðə hˈɑːspɪɾəl ɪn bəθˈɛzdə fɚðɪ ˈɔːtɑːpsi bɪkˈʌz ðə pɹˈɛzɪdənt hæd sˈɜːvd ɪnðə nˈeɪvi .|0
|
59 |
+
LJSpeech-1.1/wavs/LJ021-0145.wav|fɹʌm ðoʊz wˈɪlɪŋ tə dʒˈɔɪn ɪn ɪstˈæblɪʃɪŋ ðɪs hˈoʊpt fɔːɹ pˈiəɹɪəd ʌv pˈiːs ,|0
|
60 |
+
LJSpeech-1.1/wavs/LJ016-0288.wav|dˈɑːlɚ mˈuːlɚ , mˈuːlɚ , hiːz ðə mˈæn , dˈɑːlɚ tˈɪl ɐ daɪvˈɜːʒən wʌz kɹiːˈeɪɾᵻd baɪ ðɪ ɐpˈɪɹəns ʌvðə ɡˈæloʊz , wˌɪtʃ wʌz ɹᵻsˈiːvd wɪð kəntˈɪnjuːəs jˈɛlz .|0
|
61 |
+
LJSpeech-1.1/wavs/LJ028-0081.wav|jˈɪɹz lˈeɪɾɚ , wˌɛn ðɪ ˌɑːɹkiːˈɑːlədʒˌɪsts kʊd ɹˈɛdili dɪstˈɪŋɡwɪʃ ðə fˈɔls fɹʌmðə tɹˈuː ,|0
|
62 |
+
LJSpeech-1.1/wavs/LJ018-0081.wav|hɪz dᵻfˈɛns bˌiːɪŋ ðæt hiː hæd ɪntˈɛndᵻd tə kəmˈɪt sˈuːɪsˌaɪd , bˌʌt ðˈæt , ɔnðɪ ɐpˈɪɹəns ʌv ðɪs ˈɑːfɪsɚ hˌuː hæd ɹˈɔŋd hˌɪm ,|0
|
63 |
+
LJSpeech-1.1/wavs/LJ021-0066.wav|təɡˌɛðɚ wɪð ɐ ɡɹˈeɪt ˈɪŋkɹiːs ɪnðə pˈeɪɹoʊlz , ðɛɹ hɐz kˈʌm ɐ səbstˈænʃəl ɹˈaɪz ɪnðə tˈoʊɾəl ʌv ɪndˈʌstɹɪəl pɹˈɑːfɪts|0
|
64 |
+
LJSpeech-1.1/wavs/LJ009-0238.wav|ˈæftɚ ðɪs ðə ʃˈɛɹɪfs sˈɛnt fɔːɹ ɐnˈʌðɚ ɹˈoʊp , bˌʌt ðə spɛktˈeɪɾɚz ˌɪntəfˈɪɹd , ænd ðə mˈæn wʌz kˈæɹid bˈæk tə dʒˈeɪl .|0
|
65 |
+
LJSpeech-1.1/wavs/LJ005-0079.wav|ænd ɪmpɹˈuːv ðə mˈɔːɹəlz ʌvðə pɹˈɪzənɚz , ænd ʃˌæl ɪnʃˈʊɹ ðə pɹˈɑːpɚ mˈɛʒɚɹ ʌv pˈʌnɪʃmənt tə kənvˈɪktᵻd əfˈɛndɚz .|0
|
66 |
+
LJSpeech-1.1/wavs/LJ035-0019.wav|dɹˈoʊv tə ðə nɔːɹθwˈɛst kˈɔːɹnɚɹ ʌv ˈɛlm ænd hjˈuːstən , ænd pˈɑːɹkt ɐpɹˈɑːksɪmətli tˈɛn fˈiːt fɹʌmðə tɹˈæfɪk sˈɪɡnəl .|0
|
67 |
+
LJSpeech-1.1/wavs/LJ036-0174.wav|ðɪs ɪz ðɪ ɐpɹˈɑːksɪmət tˈaɪm hiː ˈɛntɚd ðə ɹˈuːmɪŋhˌaʊs , ɐkˈoːɹdɪŋ tʊ ˈɜːliːn ɹˈɑːbɚts , ðə hˈaʊskiːpɚ ðˈɛɹ .|0
|
68 |
+
LJSpeech-1.1/wavs/LJ046-0146.wav|ðə kɹaɪtˈiəɹɪə ɪn ɪfˈɛkt pɹˈaɪɚ tə noʊvˈɛmbɚ twˈɛnti tˈuː , nˈaɪntiːn sˈɪksti θɹˈiː , fɔːɹ dɪtˈɜːmɪnɪŋ wˈɛðɚ tʊ ɐksˈɛpt mətˈɪɹiəl fɚðə pˌiːˌɑːɹɹˈɛs dʒˈɛnɚɹəl fˈaɪlz|0
|
69 |
+
LJSpeech-1.1/wavs/LJ017-0044.wav|ænd ðə dˈiːpɪst æŋzˈaɪəɾi wʌz fˈɛlt ðætðə kɹˈaɪm , ɪf kɹˈaɪm ðˈɛɹ hɐdbɪn , ʃˌʊd biː bɹˈɔːt hˈoʊm tʊ ɪts pˈɜːpɪtɹˌeɪɾɚ .|0
|
70 |
+
LJSpeech-1.1/wavs/LJ017-0070.wav|bˌʌt hɪz spˈoːɹɾɪŋ ˌɑːpɚɹˈeɪʃənz dɪdnˌɑːt pɹˈɑːspɚ , ænd hiː bɪkˌeɪm ɐ nˈiːdi mˈæn , ˈɔːlweɪz dɹˈɪvən tə dˈɛspɚɹət stɹˈeɪts fɔːɹ kˈæʃ .|0
|
71 |
+
LJSpeech-1.1/wavs/LJ014-0020.wav|hiː wʌz sˈuːn ˈæftɚwɚdz ɚɹˈɛstᵻd ˌɔn səspˈɪʃən , ænd ɐ sˈɜːtʃ ʌv hɪz lˈɑːdʒɪŋz bɹˈɔːt tə lˈaɪt sˈɛvɹəl ɡˈɑːɹmənts sˈætʃɚɹˌeɪɾᵻd wɪð blˈʌd ;|0
|
72 |
+
LJSpeech-1.1/wavs/LJ016-0020.wav|hiː nˈɛvɚ ɹˈiːtʃt ðə sˈɪstɚn , bˌʌt fˈɛl bˈæk ˌɪntʊ ðə jˈɑːɹd , ˈɪndʒɚɹɪŋ hɪz lˈɛɡz sᵻvˈɪɹli .|0
|
73 |
+
LJSpeech-1.1/wavs/LJ045-0230.wav|wˌɛn hiː wʌz fˈaɪnəli ˌæpɹihˈɛndᵻd ɪnðə tˈɛksəs θˈiəɾɚ . ɔːlðˈoʊ ɪɾ ɪz nˌɑːt fˈʊli kɚɹˈɑːbɚɹˌeɪɾᵻd baɪ ˈʌðɚz hˌuː wɜː pɹˈɛzənt ,|0
|
74 |
+
LJSpeech-1.1/wavs/LJ035-0129.wav|ænd ʃiː mˈʌstɐv ɹˈʌn dˌaʊn ðə stˈɛɹz ɐhˈɛd ʌv ˈɑːswəld ænd wʊd pɹˈɑːbəbli hæv sˈiːn ɔːɹ hˈɜːd hˌɪm .|0
|
75 |
+
LJSpeech-1.1/wavs/LJ008-0307.wav|ˈæftɚwɚdz ɛkspɹˈɛs ɐ wˈɪʃ tə mˈɜːdɚ ðə ɹᵻkˈoːɹdɚ fɔːɹ hˌævɪŋ kˈɛpt ðˌɛm sˌoʊ lˈɔŋ ɪn səspˈɛns .|0
|
76 |
+
LJSpeech-1.1/wavs/LJ008-0294.wav|nˌɪɹli ɪndˈɛfɪnətli dᵻfˈɜːd .|0
|
77 |
+
LJSpeech-1.1/wavs/LJ047-0148.wav|ˌɔn ɑːktˈoʊbɚ twˈɛnti fˈaɪv ,|0
|
78 |
+
LJSpeech-1.1/wavs/LJ008-0111.wav|ðeɪ ˈɛntɚd ɐ dˈɑːlɚ stˈoʊŋ kˈoʊld ɹˈuːm , dˈɑːlɚɹ ænd wɜː pɹˈɛzəntli dʒˈɔɪnd baɪ ðə pɹˈɪzənɚ .|0
|
79 |
+
LJSpeech-1.1/wavs/LJ034-0042.wav|ðæt hiː kʊd ˈoʊnli tˈɛstᵻfˌaɪ wɪð sˈɜːtənti ðætðə pɹˈɪnt wʌz lˈɛs ðɐn θɹˈiː dˈeɪz ˈoʊld .|0
|
80 |
+
LJSpeech-1.1/wavs/LJ037-0234.wav|mˈɪsɪz . mˈɛɹi bɹˈɑːk , ðə wˈaɪf əvə mɪkˈænɪk hˌuː wˈɜːkt æt ðə stˈeɪʃən , wʌz ðɛɹ æt ðə tˈaɪm ænd ʃiː sˈɔː ɐ wˈaɪt mˈeɪl ,|0
|
81 |
+
LJSpeech-1.1/wavs/LJ040-0002.wav|tʃˈæptɚ sˈɛvən . lˈiː hˈɑːɹvi ˈɑːswəld : bˈækɡɹaʊnd ænd pˈɑːsᵻbəl mˈoʊɾɪvz , pˈɑːɹt wˌʌn .|0
|
82 |
+
LJSpeech-1.1/wavs/LJ045-0140.wav|ðɪ ˈɑːɹɡjuːmənts hiː jˈuːzd tə dʒˈʌstᵻfˌaɪ hɪz jˈuːs ʌvðɪ ˈeɪliəs sədʒˈɛst ðæt ˈɑːswəld mˌeɪhɐv kˈʌm tə θˈɪŋk ðætðə hˈoʊl wˈɜːld wʌz bᵻkˈʌmɪŋ ɪnvˈɑːlvd|0
|
83 |
+
LJSpeech-1.1/wavs/LJ012-0035.wav|ðə nˈʌmbɚ ænd nˈeɪmz ˌɔn wˈɑːtʃᵻz , wɜː kˈɛɹfəli ɹᵻmˈuːvd ɔːɹ əblˈɪɾɚɹˌeɪɾᵻd ˈæftɚ ðə ɡˈʊdz pˈæst ˌaʊɾəv hɪz hˈændz .|0
|
84 |
+
LJSpeech-1.1/wavs/LJ012-0250.wav|ɔnðə sˈɛvənθ dʒuːlˈaɪ , ˈeɪtiːn θˈɜːɾi sˈɛvən ,|0
|
85 |
+
LJSpeech-1.1/wavs/LJ016-0179.wav|kəntɹˈæktᵻd wɪð ʃˈɛɹɪfs ænd kənvˈiːnɚz tə wˈɜːk baɪ ðə dʒˈɑːb .|0
|
86 |
+
LJSpeech-1.1/wavs/LJ016-0138.wav|æɾə dˈɪstəns fɹʌmðə pɹˈɪzən .|0
|
87 |
+
LJSpeech-1.1/wavs/LJ027-0052.wav|ðiːz pɹˈɪnsɪpəlz ʌv həmˈɑːlədʒi ɑːɹ ᵻsˈɛnʃəl tʊ ɐ kɚɹˈɛkt ɪntˌɜːpɹɪtˈeɪʃən ʌvðə fˈækts ʌv mɔːɹfˈɑːlədʒi .|0
|
88 |
+
LJSpeech-1.1/wavs/LJ031-0134.wav|ˌɔn wˈʌn əkˈeɪʒən mˈɪsɪz . dʒˈɑːnsən , ɐkˈʌmpənid baɪ tˈuː sˈiːkɹᵻt sˈɜːvɪs ˈeɪdʒənts , lˈɛft ðə ɹˈuːm tə sˈiː mˈɪsɪz . kˈɛnədi ænd mˈɪsɪz . kˈɑːnæli .|0
|
89 |
+
LJSpeech-1.1/wavs/LJ019-0273.wav|wˌɪtʃ sˌɜː dʒˈɑːʃjuːə dʒˈɛb tˈoʊld ðə kəmˈɪɾi hiː kənsˈɪdɚd ðə pɹˈɑːpɚɹ ˈɛlɪmənts ʌv pˈiːnəl dˈɪsɪplˌɪn .|0
|
90 |
+
LJSpeech-1.1/wavs/LJ014-0110.wav|æt ðə fˈɜːst ðə bˈɑːksᵻz wɜːɹ ɪmpˈaʊndᵻd , ˈoʊpənd , ænd fˈaʊnd tə kəntˈeɪn mˈɛnɪəv oʊkˈɑːnɚz ɪfˈɛkts .|0
|
91 |
+
LJSpeech-1.1/wavs/LJ034-0160.wav|ˌɔn bɹˈɛnənz sˈʌbsᵻkwənt sˈɜːʔn̩ aɪdˈɛntɪfɪkˈeɪʃən ʌv lˈiː hˈɑːɹvi ˈɑːswəld æz ðə mˈæn hiː sˈɔː fˈaɪɚ ðə ɹˈaɪfəl .|0
|
92 |
+
LJSpeech-1.1/wavs/LJ038-0199.wav|ᵻlˈɛvən . ɪf aɪɐm ɐlˈaɪv ænd tˈeɪkən pɹˈɪzənɚ ,|0
|
93 |
+
LJSpeech-1.1/wavs/LJ014-0010.wav|jˈɛt hiː kʊd nˌɑːt ˌoʊvɚkˈʌm ðə stɹˈeɪndʒ fˌæsᵻnˈeɪʃən ɪt hˈæd fɔːɹ hˌɪm , ænd ɹᵻmˈeɪnd baɪ ðə sˈaɪd ʌvðə kˈɔːɹps tˈɪl ðə stɹˈɛtʃɚ kˈeɪm .|0
|
94 |
+
LJSpeech-1.1/wavs/LJ033-0047.wav|aɪ nˈoʊɾɪst wɛn aɪ wɛnt ˈaʊt ðætðə lˈaɪt wʌz ˈɔn , ˈɛnd kwˈoʊt ,|0
|
95 |
+
LJSpeech-1.1/wavs/LJ040-0027.wav|hiː wʌz nˈɛvɚ sˈæɾɪsfˌaɪd wɪð ˈɛnɪθˌɪŋ .|0
|
96 |
+
LJSpeech-1.1/wavs/LJ048-0228.wav|ænd ˈʌðɚz hˌuː wɜː pɹˈɛzənt sˈeɪ ðæt nˈoʊ ˈeɪdʒənt wʌz ɪnˈiːbɹɪˌeɪɾᵻd ɔːɹ ˈæktᵻd ɪmpɹˈɑːpɚli .|0
|
97 |
+
LJSpeech-1.1/wavs/LJ003-0111.wav|hiː wʌz ɪŋ kˈɑːnsɪkwəns pˌʊt ˌaʊɾəv ðə pɹətˈɛkʃən ʌv ðɛɹ ɪntˈɜːnəl lˈɔː , ˈɛnd kwˈoʊt . ðɛɹ kˈoʊd wʌzɐ sˈʌbdʒɛkt ʌv sˌʌm kjˌʊɹɹɪˈɔsɪɾi .|0
|
98 |
+
LJSpeech-1.1/wavs/LJ008-0258.wav|lˈɛt mˌiː ɹᵻtɹˈeɪs maɪ stˈɛps , ænd spˈiːk mˈoːɹ ɪn diːtˈeɪl ʌvðə tɹˈiːtmənt ʌvðə kəndˈɛmd ɪn ðoʊz blˈʌdθɜːsti ænd bɹˈuːɾəli ɪndˈɪfɹənt dˈeɪz ,|0
|
99 |
+
LJSpeech-1.1/wavs/LJ029-0022.wav|ðɪ ɚɹˈɪdʒɪnəl plˈæŋ kˈɔːld fɚðə pɹˈɛzɪdənt tə spˈɛnd ˈoʊnli wˈʌn dˈeɪ ɪnðə stˈeɪt , mˌeɪkɪŋ wˈɜːlwɪnd vˈɪzɪts tə dˈæləs , fˈɔːɹt wˈɜːθ , sˌæn æntˈoʊnɪˌoʊ , ænd hjˈuːstən .|0
|
100 |
+
LJSpeech-1.1/wavs/LJ004-0045.wav|mˈɪstɚ . stˈɜːdʒᵻz bˈoːɹn , sˌɜː dʒˈeɪmz mˈækɪntˌɑːʃ , sˌɜː dʒˈeɪmz skˈɑːɹlɪt , ænd wˈɪljəm wˈɪlbɚfˌoːɹs .|0
|
StyleTTS_Accelerate/Data/val_list_libritts.txt
ADDED
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_19794.wav|aɪ nˈuː aɪ wʌz ɡˌɛɾɪŋ ˌɪntʊ tɹˈʌbəl kˈʌmɪŋ hˈɪɹ , bˌʌt θˈæŋkfəli aɪv ɡˈɑːt juː , dˈɑːktɚ . aɪ wˌʊdəntəv fˈɪɡɚd ˈaʊt hˌaʊ tə ɹᵻzˈɑːlv sˈʌtʃ ɐ kˈɑːmplᵻkˌeɪɾᵻd kˈɑːnflɪkt baɪ maɪsˈɛlf . ˈɔːl ɹˈaɪt , tˈaɪm tə pˈæk ˌʌp maɪ ɡˈʌn . mˈɪʃən kəmplˈiːt , dˈɑːktɚ . wiː ɐtʃˈiːvd vˈɛɹi ...|56
|
2 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12789.wav|stɹˈaɪk ðˌɛm ! pˈɪɹs ˈɛvɹɪθˌɪŋ . bᵻhˈoʊld ! jʊɹ dˈɛd ! jʊɹ dˈɛθ ! ɪts tˈaɪm , kˈɑːnsəntɹˌeɪɾᵻd . dˈaɪ ! ðæts ðə lˈæst tˈaɪm . juː θˈɪŋk ʌv mˌiː . kˈʌm ˈɔn ! ðæts dʒˈʌst ðə tˈaɪm .|240
|
3 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_1753.wav|nˈaɪs mˈuːv . kənsˈɪdɚɹɪŋ sˈʌmwʌnz fˈɪzɪkəl kəndˈɪʃən .|122
|
4 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14206.wav|ˈoʊ , ðeɪɚ dɹˈeɪnɪŋ jʊɹ mˈɛntəl pˈaʊɚ !|247
|
5 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12264.wav|tˈʌkɪn kjuːzˈuːzi ! ðæt sˈʌkt . dˈoʊnt wˈʌɹi , juː dˈoʊnt hæv təbi skˈɛɹd . dʒˈʌst klˈoʊz jʊɹ ˈaɪz ænd biː tˈeɪkən tə hˈɛvən . aɪl sˈɛnd juː tə hˈɛl ! aɪv wˈʌn ! nˈaʊ , lˈɛts dˈaɪ .|239
|
6 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_4661.wav|nˈaɪts mˈeɪ biː ˈeɪbəl tə klˈaɪm bˈæk ˌʌp ˈæftɚ ðeɪ fˈɔːl , bˌʌt ðɪ ɪnfˈɛktᵻd dˈoʊnt hæv sˈʌtʃ kˈaɪndhˈɑːɹɾᵻd ˈɑːpʃənz ɐvˈeɪləbəl tə ðˌɛm .|162
|
7 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12369.wav|juː pɹˈɛs ðɪs swˈɪtʃ tuː ... nˈoʊ , nˈoʊ , nˈoʊ ! ðæts ðə sˈɛlfdᵻstɹˈʌkt bˈʌʔn̩ ! ˈɑː ! ɹˈʌn !|24
|
8 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10319.wav|wɪɹ wˈɪnɪŋ ! ˈoʊ , nˈaɪs dʒˈɑːb ! ˈoʊ , ðeɪɚ ɡˈʊd !|235
|
9 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10192.wav|mˈɪtsɚɹˌuː kæn biː tɹˈʌbəl tə fˈaɪt , bˌʌt aɪ nˈoʊ hɜː wˈɛl .|234
|
10 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7133.wav|hˈeɪ dˈɑːktɚ , ˈɑːɹknaɪts .|195
|
11 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3889.wav|aɪ wɪl ˈæsk mˈɪstɚ . ɹˈæbɪt tə pɹɪpˈɛɹ wˈʌn mˈoːɹ kˈeɪk .|154
|
12 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10529.wav|juː kæn fˈɪnɪʃ ðˌɛm ˈɔf !|235
|
13 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10588.wav|sˈʌmtaɪmz ɪts ɐbˌaʊt lˈʌk . nˈɛkst ɹˈaʊnd !|235
|
14 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_5410.wav|aɪ ɐɡɹˈiː təbi ɐ fˈaɪt kˌoːɹɪˈɑːɡɹəfɚ fɔːɹ nˈiːnz mˈuːvi .|172
|
15 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14881.wav|kˈʌm ɐɡˈɛn tə mˌeɪk aʊɚsˈɛlvz nˈoʊn tə ðoʊz hˌuː dᵻfˈaɪ ˌʌs . wˈɛl dˈʌn . ðæt ˈɛndz ˈɛvɹɪθˌɪŋ .|251
|
16 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18092.wav|tɹˈaɪ sˈʌmθɪŋ , fˈaɪɚ ! wˌɛɹ kʊd juː bˈiː ?|34
|
17 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_2474.wav|sˈʌmtaɪmz juː dʒˈʌst ɡˈɑːɾə tˈeɪk ˈɔf ðə mˈæsk ænd kˈætʃ ɐ bɹˈiːðɚ .|135
|
18 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16172.wav|ɪf juː wˈɪn , aɪl ɡˈɪv juː fˈɛðɚ pˈɪŋks ˈɔːɾəɡɹˌæf !|256
|
19 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7299.wav|ɪt sˈiːmz juː ɑːɹ ɹˈæðɚ lˈæks ɪn meɪntˈeɪnɪŋ jʊɹ ˈoʊn hˈɛlθ , dˈɑːktɚ .|198
|
20 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_17994.wav|dʒˈʌst bˌiːɪŋ ˈeɪbəl tə hˈoʊld ðə kˈɪɾɪz lˈaɪk ðˈɪs .|33
|
21 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_2538.wav|ɪts tˈaɪm fɔːɹ juː tə lˈiːv .|136
|
22 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_4663.wav|maɪ wˈɛpən sˈiːmz təbi ˈæktɪŋ ˈʌp . wˌɪtʃ wˈeɪ ɪz ðə wˈɜːkʃɑːp ? ðæt wˈeɪ ? θˈæŋk juː .|162
|
23 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16288.wav|wɪɹ ˈoʊnli dˌaʊn baɪ ɐ lˈɪɾəl . lˈɛts kˈætʃ ˈʌp !|256
|
24 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_13598.wav|bˈɑːndɪŋ ˈækt ɔːlɹˈɛdi ? dˈoʊnt wˈʌɹi . aɪl kˈɪl ˈɛvɹɪwˌʌn ɐtwˈʌns . ðiːz bˈɑːndz juː bᵻlˈiːv ɪn ɑːɹ kwˈaɪt fɹˈeɪl . jʊɹ pˈaʊɚləs ʌnlˈɛs juː ˈækt æz ɐ ɡɹˈuːp , kɚɹˈɛkt ?|244
|
25 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7444.wav|blˈiːdɪŋ mˈɔːɹɡən . ʃiː mˌaɪt sˈiːm lˈaɪk ɐ lˈeɪɐbˌaʊt mˈoʊstli , bˌʌt ʃiːz ˌaʊɚ tæktˈɪʃən wɛn pˈʊʃ kˈʌmz tə ʃˈʌv . ɡˈɛts ɐ dʒˈɑːb dˈʌn . ɡˈɛts ˌɔn maɪ nˈɜːvz ˈʌðɚwˌaɪz , ðˌoʊ . ænd ʃiː lˈʌvz ɪt .|2
|
26 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8344.wav|aɪ dˈoʊnt wˈɔnt tə lˈuːz maɪ mˈaɪnd , ʌv kˈoːɹs .|210
|
27 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21352.wav|nˈɛkst tˈaɪm . dˈɑːktɚ .|73
|
28 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_605.wav|hiː wʌz stˈɪl wˈɪlɪŋ tə flˈʌf ɪɾ ˌɪntʊ tˈaɪni lˈɪɾəl stˈoːɹiz , dʒˈʌst tə mˌeɪk mˌiː smˈaɪl .|107
|
29 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16547.wav|ɑːɹ juː ʃˈʊɹ juː dˈoʊnt ɹˈiəli sˈiː ðˌɛm æz tˈuːlz ? ˈɛvɹi lˈæst wˈʌn ʌv juː ɪz ɪn maɪ wˈeɪ . aɪm fˈaɪn baɪ maɪsˈɛlf , sˌoʊ ɡɛt lˈɔst ! hˈɑː hˈɑː hˈɑː ! ænd ˈaɪdəl ? aɪ bˈɛt juː dʒˈʌst lˈʌv bˌiːɪŋ pˈæmpɚd . ɪt fˈiːlz ɡɹˈeɪt tə sˈiː juː .|256
|
30 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3213.wav|ɪt mˌaɪt biː ɐ lˈɪɾəl tʃˈæləndʒˌɪŋ fɔːɹ ðɪs jˈuːnɪt ɹˈaɪt nˈaʊ .|146
|
31 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_20665.wav|jʊɹ hˈɪɹ tə pɹˈeɪ ? sˈɑːɹi , aɪm bˈɪzi ɹˈaɪt nˈaʊ .|66
|
32 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_17916.wav|ˈɔːl ɹˈaɪt , wˈʌns mˈoːɹ . ɡɛt ɐwˈeɪ fɹʌm mˌiː , plˈiːz !|31
|
33 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10945.wav|ɑːhˈɑː ! ðə pˈɪtʃɚɹ ɪz kˈɜːɹəntli ɐfɹˈeɪd ʌvðə bˈæɾɚ !|236
|
34 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_1495.wav|ˈoʊ , woʊnt juː wˈɔnt mˌiː lˈiːdɪŋ dˈɑːktɚ ?|12
|
35 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8645.wav|lˈɛts nˌɑːt fˈiːl dˈaʊn , dˈɑːktɚ . juː wɪl fˈaɪnd ɐ səlˈuːʃən tə bɹˈeɪk ðɪs dˈɛdlɑːk .|214
|
36 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21677.wav|ˈɪntɹɛstɪŋ fˈeɪsᵻz ɑːɹ lˈaɪk stˈoːɹiz ɪn ðɛɹ ˈoʊn ɹˈaɪt .|76
|
37 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7735.wav|jʊɹ nˌɑːt jˈuːzd tə mˌiː tˈɔːkɪŋ ðɪs wˈeɪ , bˌʌt aɪm ɐn ˈɑːnɪsttəɡˈʊdnəs nˈaɪt . ʃˈʊɹli aɪ kæn ˈɪmpɹəvˌaɪz ɐ spˈiːtʃ ɔːɹ tˈuː .|203
|
38 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12417.wav|dˈoʊnt tˈeɪk ˌʌs lˈaɪtli . ðeɪɚ stˈɪl kˈʌmɪŋ . ðə bˈæɾəlz stˈɪl ɡˈoʊɪŋ .|240
|
39 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_4109.wav|fˈɜːst ˈeɪd hˈɪɹ , tˈeɪk ɪt . aɪl biː ɹˈaɪt ðˈɛɹ !|157
|
40 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18232.wav|bˌʌt wʊd juː hˈæpən tə nˈoʊ wˌɛɹ pɹɑːvˈɑ̃s ænd ˈɛvɹɪwˌʌn ˈɛls wˈɛnt ?|37
|
41 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_20440.wav|ðiːz klˈoʊðz hæv jʊɹ ɐtˈɛnʃən , dˈɑːktɚ ? ðeɪ kənfjˈuːz mˌiː dʒˈʌst æz mˈʌtʃ wɛn aɪ fˈɜːst pˌʊt ðˌɛm ˈɔn . ðeɪ hæv ɐ stˈɑːɹɾəlɪŋ dᵻfˈɛnsɪv kəpˈæsᵻɾi , dᵻspˈaɪt ðɛɹ ˈɑːbviəs dɪzˈaɪn .|62
|
42 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_2160.wav|ˈɑː , sˈuːzən , dˈoʊnt tɹˈuː mˌiː , aɪl tˈoʊld mˌiː . aɪ dˈoʊnt tə θˈɪŋk tə ðə tˈaɪm .|130
|
43 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9888.wav|biː ɛmbˈæɹəst ɐbˌaʊt hˌaʊ ɪt tˈɜːnd ˈaʊt . aɪ wˈʌn ðɪs fˈaɪt , bˌʌt ðɛɹz nˈoʊ nˈoʊɪŋ hˌaʊ ˌɪɾəl ɡˌoʊ nˈɛkst tˈaɪm .|234
|
44 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15842.wav|dˈoʊnt wˈʌɹi , dʒˈʌst wˈɑːtʃ ðˌɛm !|255
|
45 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_17659.wav|ɪts fˈɪlθi . ˈʌɡ . wˌʌt ɑːɹ juː lˈʊkɪŋ æt mˌiː fɔːɹ ? juː wˈɔnt mˌiː tə tʃˈeɪs ðˌɛm ?|28
|
46 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15844.wav|mˈeɪbiː aɪ kæn ɡɛt sˌʌm ˈænsɚz ɪf aɪ ɡˌoʊ ðˈɛɹ . sˈoʊ , ɪts ˈoʊnli nˈætʃɚɹəl ðæt ðɛɹˌɑːɹ fˈeɪks wˈɔndɚɹɪŋ ɚɹˈaʊnd ɪn hˈɪɹ tˈuː .|255
|
47 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_5427.wav|ɪɾ ɪz ɪmpˈɑːsᵻbəl tə sˈiː ˈɔːl ðiːz sˈaɪts .|172
|
48 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21460.wav|jˈɛs , ðə ɹˈiːdz ðæt wʊd bˈɜːn ɐwˈeɪ .|75
|
49 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_5961.wav|ɪf juː wˈɪʃ mˌiː tʊ ɐtˈɛnd juː , ðæt hˈæpənz təbi maɪ spˈɛʃəlɾi .|18
|
50 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21354.wav|wˌaɪ dˈoʊnt juː kˈʌm pɹˈæktɪs ðə blˈeɪd wɪð mˌiː ? dˈɑːktɚ , ɹᵻmˈɛmbɚ tə pɹˈæktɪs jʊɹ kˈʌŋ fˈuː .|73
|
51 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12956.wav|ˈoʊ ɹˈaɪt ! nˈaʊ ! juː kæn bˈiːt ðˌɛm !|241
|
52 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10951.wav|ðɪs sˈiːmz tə kˌɑːntɹədˈɪkt jʊɹ dɪzˈaɪɚ fɔːɹ sˈɑːlɪtˌuːd .|236
|
53 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21021.wav|aɪ kˈænt ɹᵻmˈɛmbɚ hɜː fˈeɪs , bˌʌt ðeɪ klˈɪɹli ɹᵻkˈɔːld ðə ɹˈaɪm ʃiː sˈæŋ mˌiː .|70
|
54 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18479.wav|ðə tˈaɪd stˈɪl kˈʌmz ˈɪn . wiː kˈænt lˈɛt ˌaʊɚ ɡˈɑːɹd dˌaʊn jˈɛt .|4
|
55 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10072.wav|ðɪs hˈæpənz sˈʌmtaɪmz . dˈoʊnt wˈʌɹi ɐbˈaʊt ɪt .|234
|
56 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16293.wav|nˈaɪs pˈeɪs , dˌuːɪŋ ɡˈʊd !|256
|
57 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8232.wav|wˌaɪ dˈʌz ˈɛvɹɪwˌʌn pˈʊl ðiːz wˈɪɹd fˈeɪsᵻz wɛn ðeɪ fˈɪnɪʃ ?|209
|
58 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_6296.wav|ˈoʊ , sˈɑːɹi .|183
|
59 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_20989.wav|wɛnˌɛvɚ ðə tɹˈaɪb ɪz ɔnðə mˈuːv , ˈɛvɹɪwˌʌn wˈʌɹiz wˈɛðɚ wiːl fˈaɪnd ɐ ɡˈʊd plˈeɪs tə sˈɛɾəl dˈaʊn .|7
|
60 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7065.wav|θˈæŋks . θˈɪŋz ɑːɹ ɔnðə ɹˈaɪt tɹˈæk nˈaʊ .|194
|
61 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15376.wav|dˈɪd juː plˈæn ðˈæt ? aɪ wˈɪʃ ðɛɹd biː ɐn ˈʌpsɛt .|253
|
62 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16663.wav|ðɪs wɪl fˈaɪnəli ˈɛnd θˈɪŋz . ɡˈʊd lˈʌk tə juː .|257
|
63 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16019.wav|aɪ wʌz sˌoʊ klˈoʊs , aɪm sˈɑːɹi . vˈɪktɚɹi ɪz aʊɚz !|255
|
64 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14790.wav|tə ɡˌoʊ wˈaɪld sˈʌmtaɪmz . aɪl ˌɪntɹədˈuːs juː tə ðˌɛm nˈɛkst tˈaɪm . ɹˈaʊnd ? wˈʌn , tˈuː , θɹˈiː , fˈoːɹ , fˈaɪv , sˈɪks , sˈɛvən , ˈeɪt , ˈeɪt , nˈaɪn , tˈɛn , tˈɛn .|250
|
65 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12131.wav|huːˈɛvɚ blˈɪŋks fˈɜːst wɪl lˈuːz !|239
|
66 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_1143.wav|wiː ɡˈeɪn pəzˈɛʃənz , stˈæɾəs ænd ɐ dˈiːsənt lˈaɪf .|115
|
67 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10829.wav|juː mˈʌst kˈaʊntɚɹɐtˌæk !|236
|
68 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_17786.wav|ˈɔːl juː nˈiːd tə nˈoʊ ɪz ðæt aɪm nˌɑːt ˈæftɚ jʊɹ lˈaɪf fɔːɹ nˈaʊ , dˈɑːktɚ . ɑːɹ juː ʃˈʊɹ aɪ nˈiːd tə wˈɑːtʃ ðˈiːz ? bˈæk ɪn kˈæsdɛl , juː jˈuːzd tə lˈiːd maɪ pˈiːpəl . hˈʌ , ɡɹˈeɪt .|3
|
69 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_11064.wav|aɪ sˈiː , nˈaʊ ðɪs ɪz kwˈaɪt ˈɪntɹɛstɪŋ . aɪm sˈɑːɹi , bˌʌt aɪm stˈɑːɹɾɪŋ tə bɪkˌʌm ɹˈæðɚ bˈoːɹd . ɪf aɪ kəntˈɪnjuː klˈaɪmɪŋ ðɪs tˈaʊɚ , wɪl ɐ hˈɑːɹtθɹˈɑːbɪŋ ɹˌiːjˈuːniən wɪð ðæt jˈʌŋ mˈæn fɹʌm ˈɜːlɪɚ biː ɐwˈeɪɾɪŋ mˌiː ? ˈoʊ mˈaɪ , aɪ hæd ɡˈɑːʔn̩ kəmplˈiːtli bˈoːɹd .|236
|
70 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3913.wav|aɪm hˈʌŋɡɹi . ˈɛvɹɪwˌʌn wɪl lˈʌv ðə blˈuː bˈʌbəl ɡˈɑːɹdən .|154
|
71 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18559.wav|aɪm nˌɑːt ðæt ɡˈʊd æt smˈɔːl tˈɔːk .|41
|
72 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_1869.wav|aɪl tɹˈaɪ sˈʌmhaʊ . aɪ wˈʊdənt mˌeɪk ɐ ɡˈʊd ɛksplˈoːɹɚ .|125
|
73 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8106.wav|aɪ mˈʌst ɹᵻbˈɪld ɪt fɚðə tʃˈɪldɹən . æz lˈɔŋ æz ðeɪ kæn hæv ɐ ʃˈɛltɚ tə kˈɔːl hˈoʊm , ˌɛni plˈeɪs wɪl dˈuː .|208
|
74 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_5399.wav|aɪ dɪdnˌɑːt sˈiː juː æt ðɪs mˈɔːɹnɪŋz tɹˈeɪnɪŋ sˈɛʃən .|172
|
75 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9998.wav|ʃiː dˈʌzənt sˈiːm jˈuːzd tə bˈæɾəl jˈɛt . fˈɪnɪʃ hɜː kwˈɪkli .|234
|
76 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8594.wav|baɪ mˈaɪlz . bˈoʊθ ɹˈeɪθɪən ˈɑːɹ ænd dˈiː pɹˈɑːdʒɛkts ænd wˈɜːkɪŋ təɡˌɛðɚ wɪð juː hɐvbɪn dᵻlˈaɪtfəl ɛkspˈiəɹɪənsᵻz .|214
|
77 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_5889.wav|aɪ ɹˈiəli ɛndʒˈɔɪ plˈeɪɪŋ wɪð kˈɪdz . bˈæk ɪn maɪ hˈoʊmtaʊn , aɪ tˈʊk maɪ bɹˈʌðɚz ænd sˈɪstɚz slˈɛdɪŋ ˈɛvɹi jˈɪɹ .|178
|
78 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12846.wav|ðɪs ɪz kwˈaɪt ɐ fˈaɪt kˈɑːɹd . kˌoʊdɐmˈɑːɾoʊ ɪz ɛksˈaɪɾᵻd , tˈuː .|240
|
79 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10326.wav|ˈoʊ ! ˈoʊ ! ðæt wʌz kˈuːl !|235
|
80 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8878.wav|lˈʊkɪŋ ɡˈʊd . wɪɹ ˌɔn ɐ ɹˈoʊl , hˈʌ ? ðæt ˈiːzi , hˈʌ ? aɪ kæn tˈɛl baɪ jʊɹ fˈeɪs . ˈɑː , ðæt wʌz fˈæst .|220
|
81 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_11613.wav|dˈæm , ˈiːvən aɪm bɹˈeɪkɪŋ ˈaʊt ɪnðə swˈɛt . dʒˈʌst ɡˌɛɾɪŋ wˈʌn mˈoːɹ hˈɪt ænd jʊɹ dˈʌn ! dˈuː ɪt !|238
|
82 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_23554.wav|həm . ɪf mˈɛmɚɹi s��ɜːvz .|99
|
83 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9291.wav|wiː nˈiːd tə tˈeɪk ɐdvˈæntɪdʒ ʌv ˌaʊɚ stɹˈɛŋθs . ðeɪ fˈaʊnd ˌʌs . wiːv bˌɪn ɛkspˈoʊzd . wˈɑːtʃ ˈaʊt ! ðeɪ spˈɑːɾᵻd ˌʌs ? wiːv bˌɪn spˈɑːɾᵻd . ðeɪ fˈaʊnd ˌʌs .|229
|
84 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3666.wav|aɪd lˈaɪk tə pˈɪtʃ ɪn sˌʌm ʌv maɪ ˈoʊn stɹˈɛŋθs nˈaʊ .|151
|
85 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_421.wav|aɪ dˈoʊnt nˈoʊ hˌaʊ aɪ ˈɛndᵻd ˌʌp ˌɔn ðɪs tˈiːm ˈiːðɚ . ˈɔːl aɪ wˈɔntᵻd wʌz ɐn ˈɔːɹdɪnˌɛɹi lˈaɪf . wˌaɪ dˈɪd aɪ ˈɛnd ˌʌp ɪn tʃˈɑːɹdʒ ʌv ðiːz pɹˈɑːbləm tʃˈɪldɹən ?|104
|
86 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7705.wav|juː wˈɔnt mˌiː tə lˈiːd ?|202
|
87 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16124.wav|ðeɪɚɹ ɐ pˈaʊɚfəl əpˈoʊnənt . lˈɛts stˈeɪ fˈoʊkəst .|255
|
88 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3668.wav|juː θˈɪŋk ðæts wˈɪɹd ? juː lˈɜːn ɐ lˈɑːt wɛn jʊɹ ɪn ɹˈoʊdz ˈaɪləndz ˌɛndʒɪnˈɪɹɪŋ dᵻpˈɑːɹtmənt .|151
|
89 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_20155.wav|dʒˈʌst ˈɛvɹi wˈʌns ɪn ɐ wˈaɪl , aɪ fˈiːl ɹˈiəli hˈæpi wɛn aɪ ɡɛt kˈɑːmplɪmənts fɹʌm juː .|6
|
90 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15030.wav|ɪf juː lˈuːz , juːl biː ɐ lˈæfɪŋ stˈɑːk . ðɪs ɹˈaʊnd wɪl biː ɐ sˈɪntʃ fɔːɹ juː , ɹˈaɪt ?|253
|
91 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18378.wav|ɪz hɑːɹmˈoʊniə stˈɪl nˌɑːt ɡˌɛɾɪŋ ˌɛni bˈɛɾɚ ?|38
|
92 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21171.wav|jˈɛs , dˈuː aɪ kənsˈɪdɚ hɜːɹ ɐn ˈɛnəmi ?|72
|
93 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_11005.wav|ˈɔːl ðɛɹ ɪz nˈaʊ ɪz tə pɹɪpˈɛɹ joːɹsˈɛlf .|236
|
94 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7076.wav|həm , oʊkˈeɪ . maɪ bˈɑːdili flˈuːɪdz ˌɑːɹnt ðɪ ˈoʊnli θˈɪŋ ðæts tʃˈɪlɪŋ .|194
|
95 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_22119.wav|ænd ðeɪ wɪl nˈɛvɚ kəmplˈeɪn . hˈeɪ , aɪd sˈeɪ , lˈɛts plˈeɪ ðæt ɹˈɛkɚd , ðə wˈʌn wiː ˈɔːl lˈaɪk .|82
|
96 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9677.wav|æm aɪ bˌiːɪŋ mˈæspɹədˈuːst ? ðæt kˈænt bˈiː ! jʊɹ əpˈoʊnənt ɪz dˈuːmd . ˈeɪ .|233
|
97 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_1674.wav|aɪ stˈɪl hˈævənt fˈɪɡɚd ˈaʊt wʌt dʒˈʌstɪs ɹˈiəli ɪz .|121
|
98 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_23608.wav|ɪts ɐ fˈʌni lˈɪɾəl θˈɪŋ ðæt aɪ ɡɛt tə fˈaɪt ɐlˈɔŋsaɪd ðɪs vˈɜːʒən ʌv juː .|99
|
99 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18551.wav|hˈeɪ , lˈɛts ɡˌoʊ fɚɹə dɹˈɪŋk . ɪts ˈɔːlweɪz ˈæʃli hˌuː dɹˈɪŋks wɪð mˌiː ænd ðæts dʒˈʌst dˈʌl . dˈɑːktɚ ! kwˈɛstʃən ! kæn aɪ tˈeɪk ɐ lˈʊk æɾ ˈɑːpɚɹˌeɪɾɚ nˈɜːlz bˈæɾəl ɹˈɛkɚdz ?|41
|
100 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10123.wav|aɪ wʌz hˈɛlpɪŋ tˈuː . dˈæm ɪt !|234
|
101 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_19422.wav|sˌoʊ aɪ mˈʌst ˈæsk .|51
|
102 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9579.wav|həm , juː kʊd biː bˈædli ˈɪndʒɚd ɪf juː duːnˌɑːt tˈeɪk kˈɛɹ .|233
|
103 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_11443.wav|aɪ swˈɛɹ aɪ dʒˈʌst fˈɛlt ɐ hjˈuːdʒ tʃˈɪl dˌaʊn maɪ spˈaɪn . wˈeɪt ɐ sˈɛk hˈɪɹ . aɪ θˈɪŋk aɪ bɹˈɔːt ɐ bˈæt tʊ ɐn ˈæks fˈaɪt . jˈaɪks ! ðə fˈeɪks ˈɛvɹi bˈɪt æz ɪntˈɛns æz ðə ɹˈiːəl wˌʌn . ˈʌ , ɪts hˈɑːɹd tə tˈeɪk juː sˈɪɹiəsli lˈʊkɪŋ lˈaɪk ðˈæt . tˈɑːdˈɑː ! ɐ tˈoʊɾəl vˈɪktɚɹi fɔːɹ mˌiː !|238
|
104 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_2542.wav|wˌʌt ɪnðə wˈɜːld ɪz ɪt ðæt aɪ dˈuː ?|136
|
105 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_917.wav|ˈoʊ .|110
|
106 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18636.wav|bɹˈɑːvoʊ , dˈɑːktɚ . aɪ wʌz ɹˈaɪt tə pˈɑːɹtnɚ wɪð juː .|42
|
107 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_5373.wav|ʌnstˈɛdi pˈɔstʃɚ , wˈeɪvɚɹɪŋ ɪntˈɛnt , mˈʌtʃ tə lˈɜːn jˈɛt .|172
|
108 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3676.wav|pˈɪloʊ , lˈɛts dˈuː ˌaʊɚ bˈɛst .|151
|
109 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_23396.wav|juː ʃˌæl nˌɑːt pˈæs .|97
|
110 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3717.wav|aɪd bˈɛɾɚ stˈænd wˈɑːtʃ kwˈaɪətli . nˈaɪs tə mˈiːt juː , dˈɑːktɚ . kˈɔːl mˌiː kˈɔːɹkiz . ɪts ðə fˈɜːst tˈaɪm wiːv mˈɛt , bˌʌt ðɪ ˈɪntɛl ˌɔn jʊɹ dˈɛsk ɐbˌaʊt nˈɔːɹðɚn vɪktˈoːɹiə ɪz stɹˈeɪt fɹʌm mˌiː .|152
|
111 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_13973.wav|ðɪ əpˈoʊnənt ɪz dˈɛspɚɹət tuː . bˈɛnd ðˌɛm ˈɔf .|247
|
112 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15286.wav|tˈuː sˈoːɹdz ænd ɐ pɜːsˈoʊnə . biː pˈɪɾifəl ɪf hiː lˈɔst , hˈʌ ?|253
|
113 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8931.wav|hˈɛlp mˌiː ! ᵻlˈɪmᵻnˌeɪɾɪŋ θɹˈɛts . tˈuː lˈeɪt fɔːɹ ɹᵻɡɹˈɛts . ðɪs ɪz nˈʌθɪŋ ! nˈaʊ juːv dˈʌn ɪt ! aɪ nˈiːd hˈiːlɪŋ ! ðə fˈaɪɚ ! aɪ kˈænt mˈuːv lˈaɪk ðˈɪs ! ᵻlˈɛktɹᵻfˌaɪd ! ɡˌɛɾɪŋ dˈɪzi . maɪ hˈɛd ! aɪ fˈiːl . nˈoʊ fˈɪɹ . maɪ pɜːsˈoʊnə hˈæzbiːn sˈiːld !|221
|
114 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_6952.wav|wˈɛl , tˈaɪni sˈɪlvɛstɚz nˈaɪtklʌb lˈiːdɚ , plˈeɪnteɪl naɪsˈoʊnə , kəntˈɪnjuːɪŋ ðə stɹˈaɪd .|192
|
115 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9147.wav|ðæts nˌɑːt wˈaɪz . ɹᵻtɹˈiːɾɪŋ ! dˈoʊnt fˈɔːl bᵻhˈaɪnd ! nˈoʊ , nˌɑːt jˈɛt ! nˈoʊ ! aɪ woʊnt lˈɛt juː ! dʒˈoʊkɚ ! bˈæk ɪnðə fˈaɪt ! hˈæŋ ɪn ðˈɛɹ ! kˈʌm ˈɔn ! ðɪs mˌaɪt stˈɪŋ ! aɪ kæn ɡɛt ɐ hˈɪt ˈɪn ! nˈaɪs flˈoʊ ! lˈɛt mˌiː hˈɛlp ! oʊkˈeɪ ? aɪl θɹˈæʃ !|227
|
116 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8550.wav|pˈeɪ fɔːɹ ðˌaɪ sˈɪnz wɪð ðˌaɪ blˈʌd . ðə lˈɔː ?|213
|
117 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15957.wav|ɔːlɹˈaɪt , wiː ɡɑːt ðə fˈɜːst hˈɪt . ðɪs ɪz ɐ wˈʌnsˈaɪdᵻd fˈaɪt .|255
|
118 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7865.wav|aɪ ʃˌʊd biː ˈeɪbəl tə dˈuː ɪt . slˈoʊli ænd kˈɛɹfəli .|204
|
119 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_11888.wav|wˈoʊ , ðeɪɚ sˈɪɹiəs nˈaʊ !|239
|
120 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_1897.wav|aɪ pɹˈɑːmɪst hɜːɹ aɪd pɹətˈɛkt juː . ɪz ðˈɪs ?|125
|
121 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_428.wav|ɪt wʊd biː bˈɛɾɚ tə hæv mˈoːɹ mˈɪʃənz .|105
|
122 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_4062.wav|ðeɪ ʃˌæl pˈeɪ fɔːɹ ðˈɪs .|156
|
123 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21292.wav|bˌʌt ðə hˈɑːɹd pˈɑːɹt ɪz , juː mˈʌst fˈɜːst fˈeɪs ðæt wˌɪtʃ lˈaɪz ɪn jʊɹ hˈɑːɹt bᵻfˌoːɹ juː kæn ɹˈiəli lˈɛt ɪt ɡˈoʊ . aɪ hæv bˈoʊθ hˈeɪtɹɪd ænd ɡɹˈæɾɪtˌuːd təwˈɔːɹdz ðə ɡɹˈændmæstɚ .|73
|
124 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16018.wav|ˈoʊ ! wˈaɪ ?|255
|
125 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_6380.wav|wˈɛl , wˌʌt ˈɛls mˌaɪt ðɪs dɪzˈiːz dˈuː tə mˌiː ?|185
|
126 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_22784.wav|ɑːɹ juː tɹˈaɪɪŋ tə ɡɛt jʊɹ hˈænd bˈɜːnt ? dˈɑːktɚ , kʊd juː ɛksplˈeɪn ðɪs θˈiəɹi tə mˌiː ? aɪl nˈiːd ... twˈɛlv mˈɪnɪts ʌv jʊɹ tˈaɪm . ðæts ˈɔːl .|89
|
127 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_17893.wav|juːv bˌɪn lˈʊkɪŋ ˈæftɚ maɪ ɛmplˈɔɪiːz fɔːɹ mˌiː . ðeɪ hˈævənt mˌeɪd tɹˈʌbəl fɔːɹ juː , hˈæv ðeɪ ?|31
|
128 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3501.wav|ðeɪl biː mˈʌtʃ mˈoːɹ dˈʌn ɔnðə pɹˈɑːbləmz ðɪ ɪnfˈɛktᵻd fˈeɪs ɪnðə lˈʌŋmɛn slˈʌmz .|15
|
129 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16077.wav|ɪts tˈaɪm tə ʃˈoʊ ˈɔf jʊɹ mˈænlinəs , kˈændʒi . kˈɑːndʒi , hˈɪt hˌɪm wɪð ᵻlɛktɹˈɪsᵻɾi tə ɡɛt ðɪ ɐdvˈæntɪdʒ .|255
|
130 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18376.wav|ˈɑː , aɪ kæn stˈɪl smˈɛl ˈɜːbz ˌɔn maɪ klˈoʊðz .|38
|
131 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9357.wav|fˈiːl lˈaɪk jʊɹ ɡˌənə fˈeɪnt .|23
|
132 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3971.wav|maɪ fɹˈɛndz , æz lˈɔŋ æz wiː stˈænd təɡˈɛðɚ , ðɛɹ wɪl biː nˈoʊ ˈiːvəl wiː kˈænt ˌoʊvɚkˈʌm .|155
|
133 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14197.wav|juː kˈænt sˈʌmən jʊɹ pɜːsˈoʊnə . hˈoʊld ˈaʊt fɔːɹ nˈaʊ !|247
|
134 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14050.wav|ðæt kˈɔːzd ˌʌs . jʊɹ sˈɪɹiəsli hˈɜːt . juːv tˈeɪkən kəntɹˈoʊl ʌvðə bˈæɾəl .|247
|
135 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_20480.wav|kˈʌmfi ænd wˈɔːɹm , ɹˈaɪt ? aɪ tˈoʊld nˈɑːnə ˈɔːl ɐbˈaʊt juː , ænd ʃiː kˈɔːld juː ɐ ɡˈʊd ˈɛɡ .|63
|
136 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14360.wav|ʃiː hɐz ðɪ ɐdvˈæntɪdʒ ʌv klˈoʊs . biː vˈɪdʒɪlənt .|247
|
137 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10119.wav|ˈɑː , stˈɑːɹɾɪŋ təmˈɑːɹoʊ , aɪm t��ˈeɪnɪŋ juː tə ɡɛt ɹᵻvˈɛndʒ .|234
|
138 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_6517.wav|ðeɪ woʊnt biː ˈeɪbəl tə blˈɑːk ðˈɪswˌʌn . juːv ɡˈɑːt ðˌɛm .|187
|
139 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_20709.wav|mˈeɪ ðə hˈoʊli mˈaʊnt kˈɑːɹlɪn biː wɪð ˌʌs .|66
|
140 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12099.wav|hˈeɪ , jʊɹ pɜːsˈoʊnə sˈiːld ! jʊɹ kənfjˈuːzd ! lˈɛft ænd ɹˈaɪt ɑːɹ ɹᵻvˈɜːst !|239
|
141 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_5076.wav|wˈiːpɪŋ ænd θɹˈoʊɪŋ ˈʌp . skˈɛɹd aɪ dˈɪdnt dˈuː wˈɛl ɪnˈʌf ðæt aɪ wʊd stˈɑːɹv tə dˈɛθ ɪn sˌʌm fɚɡˈɑːʔn̩ kˈɔːɹnɚɹ ʌnnˈoʊn tʊ ˈɛnɪwˌʌn .|168
|
142 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9670.wav|nˈoʊ , lˈɛts fˈaɪt fˈɛɹ . ɡɛt klˈoʊs ænd bˈiːt ðˌɛm tʊ ɐ pˈoʊp , tʃˈaɪsˌæn . biː kˈɔːʃəs ʌv klˈoʊs kˈɑːmbæt wɪð hɜː . tɹˈaɪ θɹˈoʊɪŋ ðoʊz fˈænz wˈʌn ˈæftɚ ðɪ ˈʌðɚ . wiː mˈʌst ɛŋɡˈeɪdʒ ɪn klˈoʊs kˈɑːmbæt . ˌoʊvɚwˈɛlm jʊɹ ˈɛnəmi .|233
|
143 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_19940.wav|dˈaʊn , pˈɔːz .|58
|
144 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9188.wav|mˈoʊst plˈeɪsᵻz ˌɑːɹnt bˈɪɡ ɪnˈʌf , ðˌoʊ . jʊɹ ʃˈoʊldɚz mˈʌst biː stˈɪf , bˌʌt ɪt fˈɪts juː wˈɛl .|228
|
145 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14902.wav|bɹˈɪŋ ɪɾ ˈɔn ! aɪ woʊnt lˈɛt juː tˈeɪk ðə ɹˈoʊl ʌv ɐdˈoːɹəbəl mˈæskɑːt . ðɪs ˈeɪdʒ nˈiːdz ɐ bˈɛɹ lˈaɪk mˌiː , nˌɑːt dʒˈʌst ɐ plˈeɪn ˈoʊld dˈɑːɡi . juː hæv ɐ lˈɑːt tə lˈɜːn bᵻfˌoːɹ juː kæn wˈɪn ɐɡˈɛnst mˌiː . aɪ ɐksˈɛpt ɹᵻmˈætʃᵻz twˈɛnti fˈoːɹ ˈaʊɚz ɐ dˈeɪ . ɐ bɚɹˈɑːʒ .|251
|
146 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9089.wav|aɪ ɛndʒˈɔɪ maɪ tˈaɪm ɪn ðɪs sˈuːt æz wˈɛl . wˌʌt ɪz mˌeɪkɪŋ ðoʊz lˈaɪnz ɡlˈoʊ ? jʊɹ ˌæbsəlˈuːtli ɹˈaɪt . aɪ θˈɪŋk wɪɹ dˈuː fɚɹə tˈiː bɹˈeɪk . ɹˈiəli ? aɪl biː ʃˈʊɹ tə tɹˈaɪ ðˈæt . sˈʌmθɪŋ stɹˈɔŋ .|225
|
147 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_17226.wav|ðæts ðə mˈaɪnd əvən ˈɑːɹɾɪst , hˈʌ ? ˈʌ , nˈoʊ ? wˌʌt kˈaɪnd ʌv mˈaɪnd juː ɡˈɑːt , mˈæn ? aɪ nˈoʊ , ɹˈaɪt ? aɪ dˈoʊnt nˈoʊ wˌaɪ pˈiːpəl wˈeɪst mˈʌni ˌɔn fˈænsi wˈɔːɾɚ . ˈɑː , sˈaɪklɪŋ , hˈʌ ? aɪm kˈaɪnd ʌv ˌɪntʊ ɪt , tˈuː , bˌʌt , ˈʌ ... ðæt ʃˈɪt ɡˈɛts ɛkspˈɛnsɪv .|260
|
148 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_6629.wav|ɐ nˈɑːk ɔnðə dˈoːɹ wɪl biː ɐpɹˈiːʃɪˌeɪɾᵻd . mˈɪs .|189
|
149 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_6906.wav|ˈoʊ , ɹˈoʊd ˈaɪləndz bˈeɪs ɪz ˈaʊtfɪɾᵻd sˈoʊ mˌʌtʃ bˈɛɾɚ ðɐn maɪ lˈɪɾəl ˈɑːfɪs .|191
|
150 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15594.wav|juː . ɡˈʊd kˈɔːl . ˈɔːl ɹˈaɪt , juː dˈɑːdʒd ɐ bˈʊlɪt ðˈɛɹ . wˌʌt ðə hˈɛl ? dˈæm ɪt , ðæt ˈeɪnt fˈɛɹ . ðeɪ wˈɔːɹmd ðɛɹ wˈeɪ ˌaʊɾəv ɪt . dˈæm ɪt , juːl hæv tə tɹˈaɪ ðæt ɐɡˈɛn . nˈʌθɪŋ ˈɛls juː kˌʊdɐv dˈʌn ðˈɛɹ . hˈɑː ! skɹˈuː juː ! ˈoʊ kɹˈæp ! ˈɔːl ɹˈaɪt , nˈaʊz|254
|
151 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_852.wav|bˌʌt æt tˈaɪmz , tˈuː ɡɹˈeɪt ɐ fɪksˈeɪʃən kæn biː ˌaʊɚɹ ʌndˈuːɪŋ .|110
|
152 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_22220.wav|ɪz ðɛɹ ˈɛnɪθˌɪŋ juːd lˈaɪk tə dɪskˈʌs ?|84
|
153 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_2440.wav|stˈeɪ kˈɑːm .|134
|
154 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14045.wav|ðæt wˈɜːkt nˈaɪsli .|247
|
155 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_4402.wav|fɚðə mˈɑːɹtʃɪŋ , ɐ wˈɔːɹ sˈɔŋ , fɚðə fˈɔːlən .|16
|
156 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10269.wav|lˈɛts ɡɛt ðɪs sˈɛɾəld . hˈeɪ ! wˌʌts ɐ dˈiːl ? aɪl ʃˈoʊ juː ðə ɹɪzˈʌlts ʌv maɪ tɹˈeɪnɪŋ . aɪ kæn hˈoʊld maɪ ˈoʊn , juː nˈoʊ . lˈʊk ˈaʊt bɪkˈʌz aɪm nˌɑːt hˈoʊldɪŋ bˈæk . kˈʌm ænd ɡˈɛt mˌiː !|235
|
157 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12288.wav|ʃˈoʊ ˌʌs ðə pˈaʊɚɹ ʌv mˈiːt , tʃˈaɪ sˈɛmpaɪ !|239
|
158 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21080.wav|ˌʌndɚ ðɪs hˈɛdɪŋ , juː hæd ðə θˈɜːd ɹˈoʊ ɔnðə lˈɛft hˈɪɹ ɹˈɔŋ .|71
|
159 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16944.wav|ðæt wʌzɐ klˈoʊs bˈæɾəl . vˈɪktɚɹi ˈɔːlweɪz fˈiːlz ɡɹˈeɪt !|257
|
160 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18021.wav|ðɪs ɪz ɐ fˈeɪvɚɹ aɪl nˈɛvɚ fɚɡˈɛt . pɹəmˈoʊʃən ? ˈoʊ , aɪm nˌɑːt pɹɪpˈɛɹd fɔːɹ ðˈɪs .|33
|
161 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8312.wav|aɪ wˈʌzn̩t ˈiːvən ɡˈɪvɪŋ maɪ ˈɔːl .|21
|
162 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_4736.wav|ɪt tˈoːɹ maɪ fˈæmɪli ɐpˈɑːɹt ænd ðiːz slˈʌmbɚfˌʊts ɑːɹ ðɪ ˈoʊnli wˈʌnz lˈɛft tə ɹˈoʊm wɪð mˌiː .|164
|
163 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16142.wav|ˈoʊ , wiː lˈɔst , bˌʌt dˈoʊnt pˈænɪk . stˈeɪ kˈɑːm .|256
|
164 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_22670.wav|jʊɹ tʃˈiːk sˈuːðd baɪ ðə kɹˈeɪdlɪŋ wˈeɪvz .|88
|
165 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_19981.wav|sˈʌtʃ ɐz wiːv ɐɡɹˈiːd . ˈɛnɪθˌɪŋ ðæt sˈiːmz təbi ɡˈɪvən fɔːɹ fɹˈiː wɪl ᵻvˈɛntʃuːəli dᵻmˈænd sˌʌm pɹˈaɪs ʌv juː .|59
|
166 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_17764.wav|ðoʊz ɡɹˈeɪ ænd wˈaɪt tɹˈiːz wˈɪðɚɹ ɪnðə fɹˈɪdʒɪd wˈɪnd , mˈɛn stɹˈʌɡəl tə ɹᵻɡˈeɪn ðɛɹ vaɪtˈælᵻɾi ðə fˈɑːloʊɪŋ jˈɪɹ .|3
|
167 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7475.wav|ɪf ðeɪɚ ɡˈɔn , wɪl aɪ stˈɪl biː ʌv ˌɛni jˈuːs ?|20
|
168 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14092.wav|jʊɹ hˈoʊldɪŋ ðˌɛm bˈæk , bˌʌt nˌɑːt baɪ mˈʌtʃ . ðeɪɚɹ ɐhˈɛd ʌv juː . kˈiːp ˈʌp wɪð ðˌɛm .|247
|
169 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_6175.wav|aɪ sˈɪmpli wˈɪʃ tə lˈɪv æz ɐ ɹˈɛɡjʊlɚɹ ɪnfˈɛktᵻd .|182
|
170 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18284.wav|nˈaʊ aɪm ʃˈʊɹ ðæt ˈɛvɹɪθˌɪŋ ðæt aɪv dˈʌn ɪz tɹˈuːli mˈiːnɪŋfəl .|37
|
171 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8079.wav|hæv juː ɹˈɛstᵻd wˈɛl , dˈɑːktɚ ?|207
|
172 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18629.wav|maɪ fˈæmɪli dˈʌzənt ɛɡzˈɪst ˌɛnɪmˈoːɹ . bˌʌt ðæt wʌzðə pɹˈaɪs wiː pˈeɪd tə bˈaɪ tʃˈeɪndʒ ɪn sˈɪɹɐkjˌuːsə .|42
|
173 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_4165.wav|ˈɪŋkʌmˌɪŋ , ɹˈɛdi ˈɔːl mˈɛmbɚz .|158
|
174 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_6452.wav|ˈɛnəmi spˈɑːɾᵻd . pɹɪpˈɛɹ tə fˈaɪt . ɹˈɛdi fɔːɹ ˈækʃən .|186
|
175 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21644.wav|ðə nˈaɪt ɪz kwˈaɪət ænd pˈiːsfəl .|75
|
176 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3584.wav|aɪl mˌeɪk hˌɪm dˌɪsɐpˈɪɹ .|150
|
177 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_11995.wav|dˈoʊnt ˈiːvən θˈɪŋk ʌv hˈoʊldɪŋ bˈæk nˈaʊ ! kˈiːp ðɪs ˌʌp ænd bˈiːt hˌɪm , oʊkˈeɪ ?|239
|
178 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15084.wav|həm , lˈʊk ˌɔn ðɛɹ fˈeɪsᵻz tʃˈeɪndʒd .|253
|
179 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_19183.wav|ˈoʊ , ɪts hɜː . aɪ nˈuː ðæt ɡˈɜːl jˈɪɹz ɐɡˈoʊ .|49
|
180 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_20224.wav|plˈiːz sˈeɪv jʊɹ stɹˈɛŋθ . wiː wɪl ɡɛt juː ˈaʊt .|60
|
181 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_594.wav|aɪ dˈoʊnt hæv æz mˈʌtʃ mˈʌni ˌɔn hˈænd ðiːz dˈeɪz .|107
|
182 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18179.wav|ɪf kˈɜːsʔn̩ dˈɪdnt hæv ðə tˈælənt ænd lˈiːdɚʃˌɪp , ɹˈaɪn lˈæb wʊdhɐv nˈɛvɚ bˌɪn fˈaʊndᵻd .|36
|
183 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_11662.wav|lˈɛts sˈɛnd ðə nˈɛkst wˈʌn dʒˈʌst lˈaɪk ðˈɪs ! aɪm kˈaʊntɪŋ ˈɔn juː !|238
|
184 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15455.wav|dˈʌn . hˈɪɹ juː ɡˈoʊ . jʊɹ tˈuː slˈoʊ . jʊɹ ɡˈɑːɹd ɪz wˈiːk . jˈɛh , wʌtˈɛvɚ . hˈɪɹz ɐ lˈɪɾəl ˈɛkstɹə fɔːɹ juː . ðæts ɪnˈʌf ʌv ðˈæt . juː lˈɪɾəl . jʊɹ ɡˌoʊɪŋ dˈaʊn . juː woʊnt hˈɪt mˌiː . wˌʌt ɐ hˈæsəl . kˈʌm hˈɪɹ . aɪl tˈeɪk juː ˈɔn . aɪm nˌɑːt dˈʌn jˈɛt . ɡˈɛs aɪ kæn biː sˈɪɹiəs fɚɹə bˈɪt .|253
|
185 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3719.wav|aɪ wˈɪʃ aɪd lˈɜːnd ˈɔːl ðɪs ɐ lˈɪɾəl ˈɜːlɪɚ .|152
|
186 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_17821.wav|dˈoʊnt θˈɪŋk tˈuː mʌtʃ ɐbˈaʊt ɪt . ˈɛnɪwˌeɪ , aɪm ɡɹˈævəl , ɐ nˈaɪt ʌv kˈæzɪmˌɪɹz .|30
|
187 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12402.wav|ðɪs ɪz ɪt , ðə fˈaɪnəl ɹˈaʊnd . hˈæŋ ɪn ðˈɛɹ .|240
|
188 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_11695.wav|ɪt hˈɜːts , bˌʌt ɪts ɐ ɡˈʊd kˈaɪnd ʌv hˈɜːt , juː nˈoʊ ? juː pˈænɪkt æt ðɪ ˈɛnd . dˈoʊnt ɡɛt ˌoʊvɚkˈɑːnfɪdənt .|238
|
189 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10622.wav|ˈoʊ , aɪ θˈɔːt wiː kʊd dˈuː ɪt !|235
|
190 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_13056.wav|ˈoʊ , aɪ hˈoʊp ɪɾ ˈɛndz lˈaɪk ðˈɪs !|241
|
191 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3652.wav|wˈɛl , aɪ bˈɛɾɚ tˈɜːn ðɪs məʃˈiːn dˌaʊn ɐ bˈɪt .|151
|
192 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14252.wav|həm , aɪm stˈɑːɹɾɪŋ tə sˈiː wʌts ɡˌoʊɪŋ ˈɔn .|247
|
193 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_2569.wav|wˌʌt aɪm ɐbˌaʊt tə ʃˈɛɹ wɪð juː ɪz nˌɑːt æz wˈʌndɚfəl æz juː mˈeɪ bᵻlˈiːv .|136
|
194 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9314.wav|ænd ɪts ˈoʊnli ɐ mˈæɾɚɹ ʌv tˈaɪm ʌntˈɪl jʊɹ mˈeɪd .|23
|
195 |
+
/home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10435.wav|tə fˈaɪt ðə ɹˈiːəl fˈɛðɚ pˈɪŋk ! juː nˈoʊ maɪ mˈæstɚ !|235
|
StyleTTS_Accelerate/Demo/Inference_LJSpeech.ipynb
ADDED
@@ -0,0 +1,417 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "9adb7bd1",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# StyleTTS Demo (LJSpeech)\n"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "markdown",
|
13 |
+
"id": "6108384d",
|
14 |
+
"metadata": {},
|
15 |
+
"source": [
|
16 |
+
"### Utils"
|
17 |
+
]
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"cell_type": "code",
|
21 |
+
"execution_count": null,
|
22 |
+
"id": "da84c60f",
|
23 |
+
"metadata": {},
|
24 |
+
"outputs": [],
|
25 |
+
"source": [
|
26 |
+
"%cd .."
|
27 |
+
]
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"cell_type": "code",
|
31 |
+
"execution_count": null,
|
32 |
+
"id": "5a3ddcc8",
|
33 |
+
"metadata": {},
|
34 |
+
"outputs": [],
|
35 |
+
"source": [
|
36 |
+
"# load packages\n",
|
37 |
+
"import random\n",
|
38 |
+
"import yaml\n",
|
39 |
+
"from munch import Munch\n",
|
40 |
+
"import numpy as np\n",
|
41 |
+
"import torch\n",
|
42 |
+
"from torch import nn\n",
|
43 |
+
"import torch.nn.functional as F\n",
|
44 |
+
"import torchaudio\n",
|
45 |
+
"import librosa\n",
|
46 |
+
"from nltk.tokenize import word_tokenize\n",
|
47 |
+
"\n",
|
48 |
+
"from models import *\n",
|
49 |
+
"from utils import *\n",
|
50 |
+
"\n",
|
51 |
+
"%matplotlib inline"
|
52 |
+
]
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"cell_type": "code",
|
56 |
+
"execution_count": null,
|
57 |
+
"id": "bbdc04c0",
|
58 |
+
"metadata": {},
|
59 |
+
"outputs": [],
|
60 |
+
"source": [
|
61 |
+
"device = 'cuda' if torch.cuda.is_available() else 'cpu'"
|
62 |
+
]
|
63 |
+
},
|
64 |
+
{
|
65 |
+
"cell_type": "code",
|
66 |
+
"execution_count": null,
|
67 |
+
"id": "0a173af4",
|
68 |
+
"metadata": {},
|
69 |
+
"outputs": [],
|
70 |
+
"source": [
|
71 |
+
"_pad = \"$\"\n",
|
72 |
+
"_punctuation = ';:,.!?¡¿—…\"«»“” '\n",
|
73 |
+
"_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'\n",
|
74 |
+
"_letters_ipa = \"ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ\"\n",
|
75 |
+
"\n",
|
76 |
+
"\n",
|
77 |
+
"# Export all symbols:\n",
|
78 |
+
"symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)\n",
|
79 |
+
"\n",
|
80 |
+
"dicts = {}\n",
|
81 |
+
"for i in range(len((symbols))):\n",
|
82 |
+
" dicts[symbols[i]] = i\n",
|
83 |
+
"\n",
|
84 |
+
"class TextCleaner:\n",
|
85 |
+
" def __init__(self, dummy=None):\n",
|
86 |
+
" self.word_index_dictionary = dicts\n",
|
87 |
+
" def __call__(self, text):\n",
|
88 |
+
" indexes = []\n",
|
89 |
+
" for char in text:\n",
|
90 |
+
" try:\n",
|
91 |
+
" indexes.append(self.word_index_dictionary[char])\n",
|
92 |
+
" except KeyError:\n",
|
93 |
+
" print(char)\n",
|
94 |
+
" return indexes\n",
|
95 |
+
"\n",
|
96 |
+
"textclenaer = TextCleaner()"
|
97 |
+
]
|
98 |
+
},
|
99 |
+
{
|
100 |
+
"cell_type": "code",
|
101 |
+
"execution_count": null,
|
102 |
+
"id": "00ee05e1",
|
103 |
+
"metadata": {},
|
104 |
+
"outputs": [],
|
105 |
+
"source": [
|
106 |
+
"to_mel = torchaudio.transforms.MelSpectrogram(\n",
|
107 |
+
" n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
|
108 |
+
"mean, std = -4, 4\n",
|
109 |
+
"\n",
|
110 |
+
"def length_to_mask(lengths):\n",
|
111 |
+
" mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
|
112 |
+
" mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
|
113 |
+
" return mask\n",
|
114 |
+
"\n",
|
115 |
+
"def preprocess(wave):\n",
|
116 |
+
" wave_tensor = torch.from_numpy(wave).float()\n",
|
117 |
+
" mel_tensor = to_mel(wave_tensor)\n",
|
118 |
+
" mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
|
119 |
+
" return mel_tensor\n",
|
120 |
+
"\n",
|
121 |
+
"def compute_style(ref_dicts):\n",
|
122 |
+
" reference_embeddings = {}\n",
|
123 |
+
" for key, path in ref_dicts.items():\n",
|
124 |
+
" wave, sr = librosa.load(path, sr=24000)\n",
|
125 |
+
" audio, index = librosa.effects.trim(wave, top_db=30)\n",
|
126 |
+
" if sr != 24000:\n",
|
127 |
+
" audio = librosa.resample(audio, sr, 24000)\n",
|
128 |
+
" mel_tensor = preprocess(audio).to(device)\n",
|
129 |
+
"\n",
|
130 |
+
" with torch.no_grad():\n",
|
131 |
+
" ref = model.style_encoder(mel_tensor.unsqueeze(1))\n",
|
132 |
+
" reference_embeddings[key] = (ref.squeeze(1), audio)\n",
|
133 |
+
" \n",
|
134 |
+
" return reference_embeddings"
|
135 |
+
]
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"cell_type": "markdown",
|
139 |
+
"id": "7b9cecbe",
|
140 |
+
"metadata": {},
|
141 |
+
"source": [
|
142 |
+
"### Load models"
|
143 |
+
]
|
144 |
+
},
|
145 |
+
{
|
146 |
+
"cell_type": "code",
|
147 |
+
"execution_count": null,
|
148 |
+
"id": "64fc4c0f",
|
149 |
+
"metadata": {},
|
150 |
+
"outputs": [],
|
151 |
+
"source": [
|
152 |
+
"# load phonemizer\n",
|
153 |
+
"import phonemizer\n",
|
154 |
+
"global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)"
|
155 |
+
]
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"cell_type": "code",
|
159 |
+
"execution_count": null,
|
160 |
+
"id": "54cfbe48",
|
161 |
+
"metadata": {},
|
162 |
+
"outputs": [],
|
163 |
+
"source": [
|
164 |
+
"# load hifi-gan\n",
|
165 |
+
"\n",
|
166 |
+
"import sys\n",
|
167 |
+
"sys.path.insert(0, \"./Demo/hifi-gan\")\n",
|
168 |
+
"\n",
|
169 |
+
"import glob\n",
|
170 |
+
"import os\n",
|
171 |
+
"import argparse\n",
|
172 |
+
"import json\n",
|
173 |
+
"import torch\n",
|
174 |
+
"from scipy.io.wavfile import write\n",
|
175 |
+
"from attrdict import AttrDict\n",
|
176 |
+
"from vocoder import Generator\n",
|
177 |
+
"import librosa\n",
|
178 |
+
"import numpy as np\n",
|
179 |
+
"import torchaudio\n",
|
180 |
+
"\n",
|
181 |
+
"h = None\n",
|
182 |
+
"\n",
|
183 |
+
"def load_checkpoint(filepath, device):\n",
|
184 |
+
" assert os.path.isfile(filepath)\n",
|
185 |
+
" print(\"Loading '{}'\".format(filepath))\n",
|
186 |
+
" checkpoint_dict = torch.load(filepath, map_location=device)\n",
|
187 |
+
" print(\"Complete.\")\n",
|
188 |
+
" return checkpoint_dict\n",
|
189 |
+
"\n",
|
190 |
+
"def scan_checkpoint(cp_dir, prefix):\n",
|
191 |
+
" pattern = os.path.join(cp_dir, prefix + '*')\n",
|
192 |
+
" cp_list = glob.glob(pattern)\n",
|
193 |
+
" if len(cp_list) == 0:\n",
|
194 |
+
" return ''\n",
|
195 |
+
" return sorted(cp_list)[-1]\n",
|
196 |
+
"\n",
|
197 |
+
"cp_g = scan_checkpoint(\"Vocoder/\", 'g_')\n",
|
198 |
+
"\n",
|
199 |
+
"config_file = os.path.join(os.path.split(cp_g)[0], 'config.json')\n",
|
200 |
+
"with open(config_file) as f:\n",
|
201 |
+
" data = f.read()\n",
|
202 |
+
"json_config = json.loads(data)\n",
|
203 |
+
"h = AttrDict(json_config)\n",
|
204 |
+
"\n",
|
205 |
+
"device = torch.device(device)\n",
|
206 |
+
"generator = Generator(h).to(device)\n",
|
207 |
+
"\n",
|
208 |
+
"state_dict_g = load_checkpoint(cp_g, device)\n",
|
209 |
+
"generator.load_state_dict(state_dict_g['generator'])\n",
|
210 |
+
"generator.eval()\n",
|
211 |
+
"generator.remove_weight_norm()"
|
212 |
+
]
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"cell_type": "code",
|
216 |
+
"execution_count": null,
|
217 |
+
"id": "02fb18a6",
|
218 |
+
"metadata": {},
|
219 |
+
"outputs": [],
|
220 |
+
"source": [
|
221 |
+
"# load StyleTTS\n",
|
222 |
+
"model_path = \"./Models/LJSpeech/epoch_2nd_00180.pth\"\n",
|
223 |
+
"model_config_path = \"./Models/LJSpeech/config.yml\"\n",
|
224 |
+
"\n",
|
225 |
+
"config = yaml.safe_load(open(model_config_path))\n",
|
226 |
+
"\n",
|
227 |
+
"# load pretrained ASR model\n",
|
228 |
+
"ASR_config = config.get('ASR_config', False)\n",
|
229 |
+
"ASR_path = config.get('ASR_path', False)\n",
|
230 |
+
"text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
|
231 |
+
"\n",
|
232 |
+
"# load pretrained F0 model\n",
|
233 |
+
"F0_path = config.get('F0_path', False)\n",
|
234 |
+
"pitch_extractor = load_F0_models(F0_path)\n",
|
235 |
+
"\n",
|
236 |
+
"model = build_model(Munch(config['model_params']), text_aligner, pitch_extractor)\n",
|
237 |
+
"\n",
|
238 |
+
"params = torch.load(model_path, map_location='cpu')\n",
|
239 |
+
"params = params['net']\n",
|
240 |
+
"for key in model:\n",
|
241 |
+
" if key in params:\n",
|
242 |
+
" if not \"discriminator\" in key:\n",
|
243 |
+
" print('%s loaded' % key)\n",
|
244 |
+
" model[key].load_state_dict(params[key])\n",
|
245 |
+
"_ = [model[key].eval() for key in model]\n",
|
246 |
+
"_ = [model[key].to(device) for key in model]"
|
247 |
+
]
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"cell_type": "markdown",
|
251 |
+
"id": "b803110e",
|
252 |
+
"metadata": {},
|
253 |
+
"source": [
|
254 |
+
"### Synthesize speech"
|
255 |
+
]
|
256 |
+
},
|
257 |
+
{
|
258 |
+
"cell_type": "code",
|
259 |
+
"execution_count": null,
|
260 |
+
"id": "30e8ff2c",
|
261 |
+
"metadata": {},
|
262 |
+
"outputs": [],
|
263 |
+
"source": [
|
264 |
+
"# get the first 3 training samples as references\n",
|
265 |
+
"\n",
|
266 |
+
"train_path = config.get('train_data', None)\n",
|
267 |
+
"val_path = config.get('val_data', None)\n",
|
268 |
+
"train_list, val_list = get_data_path_list(train_path, val_path)\n",
|
269 |
+
"\n",
|
270 |
+
"ref_dicts = {}\n",
|
271 |
+
"for j in range(3):\n",
|
272 |
+
" filename = train_list[j].split('|')[0]\n",
|
273 |
+
" name = filename.split('/')[-1].replace('.wav', '')\n",
|
274 |
+
" ref_dicts[name] = filename\n",
|
275 |
+
" \n",
|
276 |
+
"reference_embeddings = compute_style(ref_dicts)"
|
277 |
+
]
|
278 |
+
},
|
279 |
+
{
|
280 |
+
"cell_type": "code",
|
281 |
+
"execution_count": null,
|
282 |
+
"id": "24655f46",
|
283 |
+
"metadata": {},
|
284 |
+
"outputs": [],
|
285 |
+
"source": [
|
286 |
+
"# synthesize a text\n",
|
287 |
+
"text = ''' StyleTTS is a style-based generative model for parallel TTS that can synthesize diverse speech with natural prosody from a reference speech utterance. '''"
|
288 |
+
]
|
289 |
+
},
|
290 |
+
{
|
291 |
+
"cell_type": "code",
|
292 |
+
"execution_count": null,
|
293 |
+
"id": "43e9f635",
|
294 |
+
"metadata": {},
|
295 |
+
"outputs": [],
|
296 |
+
"source": [
|
297 |
+
"# tokenize\n",
|
298 |
+
"ps = global_phonemizer.phonemize([text])\n",
|
299 |
+
"ps = word_tokenize(ps[0])\n",
|
300 |
+
"ps = ' '.join(ps)\n",
|
301 |
+
"tokens = textclenaer(ps)\n",
|
302 |
+
"tokens.insert(0, 0)\n",
|
303 |
+
"tokens.append(0)\n",
|
304 |
+
"tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)"
|
305 |
+
]
|
306 |
+
},
|
307 |
+
{
|
308 |
+
"cell_type": "code",
|
309 |
+
"execution_count": null,
|
310 |
+
"id": "ca57469c",
|
311 |
+
"metadata": {},
|
312 |
+
"outputs": [],
|
313 |
+
"source": [
|
314 |
+
"converted_samples = {}\n",
|
315 |
+
"\n",
|
316 |
+
"with torch.no_grad():\n",
|
317 |
+
" input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
|
318 |
+
" m = length_to_mask(input_lengths).to(device)\n",
|
319 |
+
" t_en = model.text_encoder(tokens, input_lengths, m)\n",
|
320 |
+
" \n",
|
321 |
+
" for key, (ref, _) in reference_embeddings.items():\n",
|
322 |
+
" \n",
|
323 |
+
" s = ref.squeeze(1)\n",
|
324 |
+
" style = s\n",
|
325 |
+
" \n",
|
326 |
+
" d = model.predictor.text_encoder(t_en, style, input_lengths, m)\n",
|
327 |
+
"\n",
|
328 |
+
" x, _ = model.predictor.lstm(d)\n",
|
329 |
+
" duration = model.predictor.duration_proj(x)\n",
|
330 |
+
" pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
|
331 |
+
" \n",
|
332 |
+
" pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
|
333 |
+
" c_frame = 0\n",
|
334 |
+
" for i in range(pred_aln_trg.size(0)):\n",
|
335 |
+
" pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
|
336 |
+
" c_frame += int(pred_dur[i].data)\n",
|
337 |
+
"\n",
|
338 |
+
" # encode prosody\n",
|
339 |
+
" en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
|
340 |
+
" style = s.expand(en.shape[0], en.shape[1], -1)\n",
|
341 |
+
"\n",
|
342 |
+
" F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
|
343 |
+
"\n",
|
344 |
+
" out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)), \n",
|
345 |
+
" F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
|
346 |
+
"\n",
|
347 |
+
"\n",
|
348 |
+
" c = out.squeeze()\n",
|
349 |
+
" y_g_hat = generator(c.unsqueeze(0))\n",
|
350 |
+
" y_out = y_g_hat.squeeze().cpu().numpy()\n",
|
351 |
+
"\n",
|
352 |
+
" c = out.squeeze()\n",
|
353 |
+
" y_g_hat = generator(c.unsqueeze(0))\n",
|
354 |
+
" y_out = y_g_hat.squeeze()\n",
|
355 |
+
" \n",
|
356 |
+
" converted_samples[key] = y_out.cpu().numpy()"
|
357 |
+
]
|
358 |
+
},
|
359 |
+
{
|
360 |
+
"cell_type": "code",
|
361 |
+
"execution_count": null,
|
362 |
+
"id": "d3d7f7d5",
|
363 |
+
"metadata": {
|
364 |
+
"scrolled": true
|
365 |
+
},
|
366 |
+
"outputs": [],
|
367 |
+
"source": [
|
368 |
+
"import IPython.display as ipd\n",
|
369 |
+
"for key, wave in converted_samples.items():\n",
|
370 |
+
" print('Synthesized: %s' % key)\n",
|
371 |
+
" display(ipd.Audio(wave, rate=24000))\n",
|
372 |
+
" try:\n",
|
373 |
+
" print('Reference: %s' % key)\n",
|
374 |
+
" display(ipd.Audio(reference_embeddings[key][-1], rate=24000))\n",
|
375 |
+
" except:\n",
|
376 |
+
" continue"
|
377 |
+
]
|
378 |
+
},
|
379 |
+
{
|
380 |
+
"cell_type": "code",
|
381 |
+
"execution_count": null,
|
382 |
+
"id": "74fe14d9",
|
383 |
+
"metadata": {},
|
384 |
+
"outputs": [],
|
385 |
+
"source": []
|
386 |
+
},
|
387 |
+
{
|
388 |
+
"cell_type": "code",
|
389 |
+
"execution_count": null,
|
390 |
+
"id": "a97c5e82",
|
391 |
+
"metadata": {},
|
392 |
+
"outputs": [],
|
393 |
+
"source": []
|
394 |
+
}
|
395 |
+
],
|
396 |
+
"metadata": {
|
397 |
+
"kernelspec": {
|
398 |
+
"display_name": "python3",
|
399 |
+
"language": "python",
|
400 |
+
"name": "python3"
|
401 |
+
},
|
402 |
+
"language_info": {
|
403 |
+
"codemirror_mode": {
|
404 |
+
"name": "ipython",
|
405 |
+
"version": 3
|
406 |
+
},
|
407 |
+
"file_extension": ".py",
|
408 |
+
"mimetype": "text/x-python",
|
409 |
+
"name": "python",
|
410 |
+
"nbconvert_exporter": "python",
|
411 |
+
"pygments_lexer": "ipython3",
|
412 |
+
"version": "3.9.7"
|
413 |
+
}
|
414 |
+
},
|
415 |
+
"nbformat": 4,
|
416 |
+
"nbformat_minor": 5
|
417 |
+
}
|
StyleTTS_Accelerate/Demo/Inference_LibriTTS.ipynb
ADDED
@@ -0,0 +1,529 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "9adb7bd1",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# StyleTTS Demo (LibriTTS)\n"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "markdown",
|
13 |
+
"id": "6108384d",
|
14 |
+
"metadata": {},
|
15 |
+
"source": [
|
16 |
+
"### Utils"
|
17 |
+
]
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"cell_type": "code",
|
21 |
+
"execution_count": null,
|
22 |
+
"id": "da84c60f",
|
23 |
+
"metadata": {},
|
24 |
+
"outputs": [],
|
25 |
+
"source": [
|
26 |
+
"%cd .."
|
27 |
+
]
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"cell_type": "code",
|
31 |
+
"execution_count": null,
|
32 |
+
"id": "5a3ddcc8",
|
33 |
+
"metadata": {},
|
34 |
+
"outputs": [],
|
35 |
+
"source": [
|
36 |
+
"# load packages\n",
|
37 |
+
"import random\n",
|
38 |
+
"import yaml\n",
|
39 |
+
"from munch import Munch\n",
|
40 |
+
"import numpy as np\n",
|
41 |
+
"import torch\n",
|
42 |
+
"from torch import nn\n",
|
43 |
+
"import torch.nn.functional as F\n",
|
44 |
+
"import torchaudio\n",
|
45 |
+
"import librosa\n",
|
46 |
+
"\n",
|
47 |
+
"from models import *\n",
|
48 |
+
"from utils import *\n",
|
49 |
+
"\n",
|
50 |
+
"%matplotlib inline"
|
51 |
+
]
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"cell_type": "code",
|
55 |
+
"execution_count": null,
|
56 |
+
"id": "bbdc04c0",
|
57 |
+
"metadata": {},
|
58 |
+
"outputs": [],
|
59 |
+
"source": [
|
60 |
+
"device = 'cuda' if torch.cuda.is_available() else 'cpu'"
|
61 |
+
]
|
62 |
+
},
|
63 |
+
{
|
64 |
+
"cell_type": "code",
|
65 |
+
"execution_count": null,
|
66 |
+
"id": "0a173af4",
|
67 |
+
"metadata": {},
|
68 |
+
"outputs": [],
|
69 |
+
"source": [
|
70 |
+
"_pad = \"$\"\n",
|
71 |
+
"_punctuation = ';:,.!?¡¿—…\"«»“” '\n",
|
72 |
+
"_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'\n",
|
73 |
+
"_letters_ipa = \"ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ\"\n",
|
74 |
+
"\n",
|
75 |
+
"\n",
|
76 |
+
"# Export all symbols:\n",
|
77 |
+
"symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)\n",
|
78 |
+
"\n",
|
79 |
+
"dicts = {}\n",
|
80 |
+
"for i in range(len((symbols))):\n",
|
81 |
+
" dicts[symbols[i]] = i\n",
|
82 |
+
"\n",
|
83 |
+
"class TextCleaner:\n",
|
84 |
+
" def __init__(self, dummy=None):\n",
|
85 |
+
" self.word_index_dictionary = dicts\n",
|
86 |
+
" def __call__(self, text):\n",
|
87 |
+
" indexes = []\n",
|
88 |
+
" for char in text:\n",
|
89 |
+
" try:\n",
|
90 |
+
" indexes.append(self.word_index_dictionary[char])\n",
|
91 |
+
" except KeyError:\n",
|
92 |
+
" print(char)\n",
|
93 |
+
" return indexes\n",
|
94 |
+
"\n",
|
95 |
+
"textclenaer = TextCleaner()"
|
96 |
+
]
|
97 |
+
},
|
98 |
+
{
|
99 |
+
"cell_type": "code",
|
100 |
+
"execution_count": null,
|
101 |
+
"id": "00ee05e1",
|
102 |
+
"metadata": {},
|
103 |
+
"outputs": [],
|
104 |
+
"source": [
|
105 |
+
"to_mel = torchaudio.transforms.MelSpectrogram(\n",
|
106 |
+
" n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
|
107 |
+
"mean, std = -4, 4\n",
|
108 |
+
"\n",
|
109 |
+
"def length_to_mask(lengths):\n",
|
110 |
+
" mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
|
111 |
+
" mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
|
112 |
+
" return mask\n",
|
113 |
+
"\n",
|
114 |
+
"def preprocess(wave):\n",
|
115 |
+
" wave_tensor = torch.from_numpy(wave).float()\n",
|
116 |
+
" mel_tensor = to_mel(wave_tensor)\n",
|
117 |
+
" mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
|
118 |
+
" return mel_tensor\n",
|
119 |
+
"\n",
|
120 |
+
"def compute_style(ref_dicts):\n",
|
121 |
+
" reference_embeddings = {}\n",
|
122 |
+
" for key, path in ref_dicts.items():\n",
|
123 |
+
" wave, sr = librosa.load(path, sr=24000)\n",
|
124 |
+
" audio, index = librosa.effects.trim(wave, top_db=30)\n",
|
125 |
+
" if sr != 24000:\n",
|
126 |
+
" audio = librosa.resample(audio, sr, 24000)\n",
|
127 |
+
" mel_tensor = preprocess(audio).to(device)\n",
|
128 |
+
" try:\n",
|
129 |
+
" with torch.no_grad():\n",
|
130 |
+
" ref = model.style_encoder(mel_tensor.unsqueeze(1))\n",
|
131 |
+
" reference_embeddings[key] = (ref.squeeze(1), audio)\n",
|
132 |
+
" except:\n",
|
133 |
+
" continue\n",
|
134 |
+
" \n",
|
135 |
+
" return reference_embeddings"
|
136 |
+
]
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"cell_type": "markdown",
|
140 |
+
"id": "7b9cecbe",
|
141 |
+
"metadata": {},
|
142 |
+
"source": [
|
143 |
+
"### Load models"
|
144 |
+
]
|
145 |
+
},
|
146 |
+
{
|
147 |
+
"cell_type": "code",
|
148 |
+
"execution_count": null,
|
149 |
+
"id": "64fc4c0f",
|
150 |
+
"metadata": {},
|
151 |
+
"outputs": [],
|
152 |
+
"source": [
|
153 |
+
"# load phonemizer\n",
|
154 |
+
"import phonemizer\n",
|
155 |
+
"global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)"
|
156 |
+
]
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"cell_type": "code",
|
160 |
+
"execution_count": null,
|
161 |
+
"id": "54cfbe48",
|
162 |
+
"metadata": {},
|
163 |
+
"outputs": [],
|
164 |
+
"source": [
|
165 |
+
"# load hifi-gan\n",
|
166 |
+
"\n",
|
167 |
+
"import sys\n",
|
168 |
+
"sys.path.insert(0, \"./Demo/hifi-gan\")\n",
|
169 |
+
"\n",
|
170 |
+
"import glob\n",
|
171 |
+
"import os\n",
|
172 |
+
"import argparse\n",
|
173 |
+
"import json\n",
|
174 |
+
"import torch\n",
|
175 |
+
"from scipy.io.wavfile import write\n",
|
176 |
+
"from attrdict import AttrDict\n",
|
177 |
+
"from vocoder import Generator\n",
|
178 |
+
"import librosa\n",
|
179 |
+
"import numpy as np\n",
|
180 |
+
"import torchaudio\n",
|
181 |
+
"\n",
|
182 |
+
"h = None\n",
|
183 |
+
"\n",
|
184 |
+
"def load_checkpoint(filepath, device):\n",
|
185 |
+
" assert os.path.isfile(filepath)\n",
|
186 |
+
" print(\"Loading '{}'\".format(filepath))\n",
|
187 |
+
" checkpoint_dict = torch.load(filepath, map_location=device)\n",
|
188 |
+
" print(\"Complete.\")\n",
|
189 |
+
" return checkpoint_dict\n",
|
190 |
+
"\n",
|
191 |
+
"def scan_checkpoint(cp_dir, prefix):\n",
|
192 |
+
" pattern = os.path.join(cp_dir, prefix + '*')\n",
|
193 |
+
" cp_list = glob.glob(pattern)\n",
|
194 |
+
" if len(cp_list) == 0:\n",
|
195 |
+
" return ''\n",
|
196 |
+
" return sorted(cp_list)[-1]\n",
|
197 |
+
"\n",
|
198 |
+
"cp_g = scan_checkpoint(\"Vocoder/LibriTTS/\", 'g_')\n",
|
199 |
+
"\n",
|
200 |
+
"config_file = os.path.join(os.path.split(cp_g)[0], 'config.json')\n",
|
201 |
+
"with open(config_file) as f:\n",
|
202 |
+
" data = f.read()\n",
|
203 |
+
"json_config = json.loads(data)\n",
|
204 |
+
"h = AttrDict(json_config)\n",
|
205 |
+
"\n",
|
206 |
+
"device = torch.device(device)\n",
|
207 |
+
"generator = Generator(h).to(device)\n",
|
208 |
+
"\n",
|
209 |
+
"state_dict_g = load_checkpoint(cp_g, device)\n",
|
210 |
+
"generator.load_state_dict(state_dict_g['generator'])\n",
|
211 |
+
"generator.eval()\n",
|
212 |
+
"generator.remove_weight_norm()"
|
213 |
+
]
|
214 |
+
},
|
215 |
+
{
|
216 |
+
"cell_type": "code",
|
217 |
+
"execution_count": null,
|
218 |
+
"id": "02fb18a6",
|
219 |
+
"metadata": {},
|
220 |
+
"outputs": [],
|
221 |
+
"source": [
|
222 |
+
"# load StyleTTS\n",
|
223 |
+
"model_path = \"./Models/LibriTTS/epoch_2nd_00050.pth\"\n",
|
224 |
+
"model_config_path = \"./Models/LibriTTS/config.yml\"\n",
|
225 |
+
"\n",
|
226 |
+
"config = yaml.safe_load(open(model_config_path))\n",
|
227 |
+
"\n",
|
228 |
+
"# load pretrained ASR model\n",
|
229 |
+
"ASR_config = config.get('ASR_config', False)\n",
|
230 |
+
"ASR_path = config.get('ASR_path', False)\n",
|
231 |
+
"text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
|
232 |
+
"\n",
|
233 |
+
"# load pretrained F0 model\n",
|
234 |
+
"F0_path = config.get('F0_path', False)\n",
|
235 |
+
"pitch_extractor = load_F0_models(F0_path)\n",
|
236 |
+
"\n",
|
237 |
+
"model = build_model(Munch(config['model_params']), text_aligner, pitch_extractor)\n",
|
238 |
+
"\n",
|
239 |
+
"params = torch.load(model_path, map_location='cpu')\n",
|
240 |
+
"params = params['net']\n",
|
241 |
+
"for key in model:\n",
|
242 |
+
" if key in params:\n",
|
243 |
+
" if not \"discriminator\" in key:\n",
|
244 |
+
" print('%s loaded' % key)\n",
|
245 |
+
" model[key].load_state_dict(params[key])\n",
|
246 |
+
"_ = [model[key].eval() for key in model]\n",
|
247 |
+
"_ = [model[key].to(device) for key in model]"
|
248 |
+
]
|
249 |
+
},
|
250 |
+
{
|
251 |
+
"cell_type": "markdown",
|
252 |
+
"id": "b803110e",
|
253 |
+
"metadata": {},
|
254 |
+
"source": [
|
255 |
+
"### Synthesize speech (seen speakers, LibriTTS train-clean-100)"
|
256 |
+
]
|
257 |
+
},
|
258 |
+
{
|
259 |
+
"cell_type": "code",
|
260 |
+
"execution_count": null,
|
261 |
+
"id": "30e8ff2c",
|
262 |
+
"metadata": {},
|
263 |
+
"outputs": [],
|
264 |
+
"source": [
|
265 |
+
"# get the first 3 training samples as references\n",
|
266 |
+
"\n",
|
267 |
+
"train_path = config.get('train_data', None)\n",
|
268 |
+
"val_path = config.get('val_data', None)\n",
|
269 |
+
"train_list, val_list = get_data_path_list(train_path, val_path)\n",
|
270 |
+
"\n",
|
271 |
+
"ref_dicts = {}\n",
|
272 |
+
"for j in range(3):\n",
|
273 |
+
" filename = train_list[j].split('|')[0]\n",
|
274 |
+
" name = filename.split('/')[-1].replace('.wav', '')\n",
|
275 |
+
" ref_dicts[name] = filename\n",
|
276 |
+
" \n",
|
277 |
+
"reference_embeddings = compute_style(ref_dicts)"
|
278 |
+
]
|
279 |
+
},
|
280 |
+
{
|
281 |
+
"cell_type": "code",
|
282 |
+
"execution_count": null,
|
283 |
+
"id": "24655f46",
|
284 |
+
"metadata": {},
|
285 |
+
"outputs": [],
|
286 |
+
"source": [
|
287 |
+
"# synthesize a text\n",
|
288 |
+
"text = ''' StyleTTS is a style based generative model that can synthesize diverse speech with natural prosody from a reference speech utterance. '''"
|
289 |
+
]
|
290 |
+
},
|
291 |
+
{
|
292 |
+
"cell_type": "code",
|
293 |
+
"execution_count": null,
|
294 |
+
"id": "43e9f635",
|
295 |
+
"metadata": {},
|
296 |
+
"outputs": [],
|
297 |
+
"source": [
|
298 |
+
"# tokenize\n",
|
299 |
+
"ps = global_phonemizer.phonemize([text])\n",
|
300 |
+
"tokens = textclenaer(ps[0])\n",
|
301 |
+
"tokens.insert(0, 0)\n",
|
302 |
+
"tokens.append(0)\n",
|
303 |
+
"tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)"
|
304 |
+
]
|
305 |
+
},
|
306 |
+
{
|
307 |
+
"cell_type": "code",
|
308 |
+
"execution_count": null,
|
309 |
+
"id": "ca57469c",
|
310 |
+
"metadata": {},
|
311 |
+
"outputs": [],
|
312 |
+
"source": [
|
313 |
+
"converted_samples = {}\n",
|
314 |
+
"\n",
|
315 |
+
"with torch.no_grad():\n",
|
316 |
+
" input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
|
317 |
+
" m = length_to_mask(input_lengths).to(device)\n",
|
318 |
+
" t_en = model.text_encoder(tokens, input_lengths, m)\n",
|
319 |
+
" \n",
|
320 |
+
" for key, (ref, _) in reference_embeddings.items():\n",
|
321 |
+
" \n",
|
322 |
+
" s = ref.squeeze(1)\n",
|
323 |
+
" style = s\n",
|
324 |
+
" \n",
|
325 |
+
" d = model.predictor.text_encoder(t_en, style, input_lengths, m)\n",
|
326 |
+
"\n",
|
327 |
+
" x, _ = model.predictor.lstm(d)\n",
|
328 |
+
" duration = model.predictor.duration_proj(x)\n",
|
329 |
+
" pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
|
330 |
+
" \n",
|
331 |
+
" pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
|
332 |
+
" c_frame = 0\n",
|
333 |
+
" for i in range(pred_aln_trg.size(0)):\n",
|
334 |
+
" pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
|
335 |
+
" c_frame += int(pred_dur[i].data)\n",
|
336 |
+
"\n",
|
337 |
+
" # encode prosody\n",
|
338 |
+
" en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
|
339 |
+
" style = s.expand(en.shape[0], en.shape[1], -1)\n",
|
340 |
+
"\n",
|
341 |
+
" F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
|
342 |
+
"\n",
|
343 |
+
" out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)), \n",
|
344 |
+
" F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
|
345 |
+
"\n",
|
346 |
+
"\n",
|
347 |
+
" c = out.squeeze()\n",
|
348 |
+
" y_g_hat = generator(c.unsqueeze(0))\n",
|
349 |
+
" y_out = y_g_hat.squeeze().cpu().numpy()\n",
|
350 |
+
"\n",
|
351 |
+
" c = out.squeeze()\n",
|
352 |
+
" y_g_hat = generator(c.unsqueeze(0))\n",
|
353 |
+
" y_out = y_g_hat.squeeze()\n",
|
354 |
+
" \n",
|
355 |
+
" converted_samples[key] = y_out.cpu().numpy()"
|
356 |
+
]
|
357 |
+
},
|
358 |
+
{
|
359 |
+
"cell_type": "code",
|
360 |
+
"execution_count": null,
|
361 |
+
"id": "086c25a7",
|
362 |
+
"metadata": {
|
363 |
+
"scrolled": true
|
364 |
+
},
|
365 |
+
"outputs": [],
|
366 |
+
"source": [
|
367 |
+
"import IPython.display as ipd\n",
|
368 |
+
"for key, wave in converted_samples.items():\n",
|
369 |
+
" print('Synthesized: %s' % key)\n",
|
370 |
+
" display(ipd.Audio(wave, rate=24000))\n",
|
371 |
+
" try:\n",
|
372 |
+
" print('Reference: %s' % key)\n",
|
373 |
+
" display(ipd.Audio(reference_embeddings[key][-1], rate=24000))\n",
|
374 |
+
" except:\n",
|
375 |
+
" continue"
|
376 |
+
]
|
377 |
+
},
|
378 |
+
{
|
379 |
+
"cell_type": "markdown",
|
380 |
+
"id": "41d721cd",
|
381 |
+
"metadata": {},
|
382 |
+
"source": [
|
383 |
+
"### Zero-shot TTS (unseen speakers, LibriTTS test-clean)"
|
384 |
+
]
|
385 |
+
},
|
386 |
+
{
|
387 |
+
"cell_type": "code",
|
388 |
+
"execution_count": null,
|
389 |
+
"id": "5b75a5dd",
|
390 |
+
"metadata": {},
|
391 |
+
"outputs": [],
|
392 |
+
"source": [
|
393 |
+
"test_clean_path = '/share/naplab/users/yl4579/data/LibriTTS/test-clean/'\n",
|
394 |
+
"\n",
|
395 |
+
"ref_dicts = {}\n",
|
396 |
+
"# pick first 3 speakers from test-clean\n",
|
397 |
+
"spks = [ f.path for f in os.scandir(test_clean_path) if f.is_dir() ]\n",
|
398 |
+
"spks = spks[:3]\n",
|
399 |
+
"for spk in spks:\n",
|
400 |
+
" spk_path = spk\n",
|
401 |
+
" spk = spk.split('/')[-1]\n",
|
402 |
+
" spk_path = spk_path + \"/\" + (np.random.choice(os.listdir(spk_path), size=1)[0])\n",
|
403 |
+
" for f in os.listdir(spk_path):\n",
|
404 |
+
" if f.endswith('.wav'):\n",
|
405 |
+
" ref_dicts[spk] = spk_path + \"/\" + f\n",
|
406 |
+
"reference_embeddings = compute_style(ref_dicts)"
|
407 |
+
]
|
408 |
+
},
|
409 |
+
{
|
410 |
+
"cell_type": "code",
|
411 |
+
"execution_count": null,
|
412 |
+
"id": "b8c204d0",
|
413 |
+
"metadata": {},
|
414 |
+
"outputs": [],
|
415 |
+
"source": [
|
416 |
+
"# synthesize a text\n",
|
417 |
+
"text = ''' StyleTTS is a style based generative model that can synthesize diverse speech with natural prosody from a reference speech utterance. '''"
|
418 |
+
]
|
419 |
+
},
|
420 |
+
{
|
421 |
+
"cell_type": "code",
|
422 |
+
"execution_count": null,
|
423 |
+
"id": "a0078aa4",
|
424 |
+
"metadata": {},
|
425 |
+
"outputs": [],
|
426 |
+
"source": [
|
427 |
+
"# tokenize\n",
|
428 |
+
"ps = global_phonemizer.phonemize([text])\n",
|
429 |
+
"tokens = textclenaer(ps[0])\n",
|
430 |
+
"tokens.insert(0, 0)\n",
|
431 |
+
"tokens.append(0)\n",
|
432 |
+
"tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)"
|
433 |
+
]
|
434 |
+
},
|
435 |
+
{
|
436 |
+
"cell_type": "code",
|
437 |
+
"execution_count": null,
|
438 |
+
"id": "f02958cc",
|
439 |
+
"metadata": {},
|
440 |
+
"outputs": [],
|
441 |
+
"source": [
|
442 |
+
"converted_samples = {}\n",
|
443 |
+
"\n",
|
444 |
+
"with torch.no_grad():\n",
|
445 |
+
" input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
|
446 |
+
" m = length_to_mask(input_lengths).to(device)\n",
|
447 |
+
" t_en = model.text_encoder(tokens, input_lengths, m)\n",
|
448 |
+
" \n",
|
449 |
+
" for key, (ref, _) in reference_embeddings.items():\n",
|
450 |
+
" \n",
|
451 |
+
" s = ref.squeeze(1)\n",
|
452 |
+
" style = s\n",
|
453 |
+
" \n",
|
454 |
+
" d = model.predictor.text_encoder(t_en, style, input_lengths, m)\n",
|
455 |
+
"\n",
|
456 |
+
" x, _ = model.predictor.lstm(d)\n",
|
457 |
+
" duration = model.predictor.duration_proj(x)\n",
|
458 |
+
" pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
|
459 |
+
" \n",
|
460 |
+
" pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
|
461 |
+
" c_frame = 0\n",
|
462 |
+
" for i in range(pred_aln_trg.size(0)):\n",
|
463 |
+
" pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
|
464 |
+
" c_frame += int(pred_dur[i].data)\n",
|
465 |
+
"\n",
|
466 |
+
" # encode prosody\n",
|
467 |
+
" en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
|
468 |
+
" style = s.expand(en.shape[0], en.shape[1], -1)\n",
|
469 |
+
"\n",
|
470 |
+
" F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
|
471 |
+
"\n",
|
472 |
+
" out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)), \n",
|
473 |
+
" F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
|
474 |
+
"\n",
|
475 |
+
"\n",
|
476 |
+
" c = out.squeeze()\n",
|
477 |
+
" y_g_hat = generator(c.unsqueeze(0))\n",
|
478 |
+
" y_out = y_g_hat.squeeze().cpu().numpy()\n",
|
479 |
+
"\n",
|
480 |
+
" c = out.squeeze()\n",
|
481 |
+
" y_g_hat = generator(c.unsqueeze(0))\n",
|
482 |
+
" y_out = y_g_hat.squeeze()\n",
|
483 |
+
" \n",
|
484 |
+
" converted_samples[key] = y_out.cpu().numpy()"
|
485 |
+
]
|
486 |
+
},
|
487 |
+
{
|
488 |
+
"cell_type": "code",
|
489 |
+
"execution_count": null,
|
490 |
+
"id": "b2e931ac",
|
491 |
+
"metadata": {
|
492 |
+
"scrolled": true
|
493 |
+
},
|
494 |
+
"outputs": [],
|
495 |
+
"source": [
|
496 |
+
"import IPython.display as ipd\n",
|
497 |
+
"for key, wave in converted_samples.items():\n",
|
498 |
+
" print('Synthesized: %s' % key)\n",
|
499 |
+
" display(ipd.Audio(wave, rate=24000))\n",
|
500 |
+
" try:\n",
|
501 |
+
" print('Reference: %s' % key)\n",
|
502 |
+
" display(ipd.Audio(reference_embeddings[key][-1], rate=24000))\n",
|
503 |
+
" except:\n",
|
504 |
+
" continue"
|
505 |
+
]
|
506 |
+
}
|
507 |
+
],
|
508 |
+
"metadata": {
|
509 |
+
"kernelspec": {
|
510 |
+
"display_name": "python3",
|
511 |
+
"language": "python",
|
512 |
+
"name": "python3"
|
513 |
+
},
|
514 |
+
"language_info": {
|
515 |
+
"codemirror_mode": {
|
516 |
+
"name": "ipython",
|
517 |
+
"version": 3
|
518 |
+
},
|
519 |
+
"file_extension": ".py",
|
520 |
+
"mimetype": "text/x-python",
|
521 |
+
"name": "python",
|
522 |
+
"nbconvert_exporter": "python",
|
523 |
+
"pygments_lexer": "ipython3",
|
524 |
+
"version": "3.7.11"
|
525 |
+
}
|
526 |
+
},
|
527 |
+
"nbformat": 4,
|
528 |
+
"nbformat_minor": 5
|
529 |
+
}
|
StyleTTS_Accelerate/Demo/hifi-gan/__pycache__/vocoder.cpython-310.pyc
ADDED
Binary file (8.71 kB). View file
|
|
StyleTTS_Accelerate/Demo/hifi-gan/__pycache__/vocoder.cpython-39.pyc
ADDED
Binary file (8.76 kB). View file
|
|
StyleTTS_Accelerate/Demo/hifi-gan/__pycache__/vocoder_utils.cpython-310.pyc
ADDED
Binary file (2.04 kB). View file
|
|
StyleTTS_Accelerate/Demo/hifi-gan/__pycache__/vocoder_utils.cpython-39.pyc
ADDED
Binary file (2.02 kB). View file
|
|
StyleTTS_Accelerate/Demo/hifi-gan/vocoder.py
ADDED
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn.functional as F
|
3 |
+
import torch.nn as nn
|
4 |
+
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
5 |
+
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
6 |
+
from vocoder_utils import init_weights, get_padding
|
7 |
+
|
8 |
+
LRELU_SLOPE = 0.1
|
9 |
+
|
10 |
+
|
11 |
+
class ResBlock1(torch.nn.Module):
|
12 |
+
def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
|
13 |
+
super(ResBlock1, self).__init__()
|
14 |
+
self.h = h
|
15 |
+
self.convs1 = nn.ModuleList([
|
16 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
|
17 |
+
padding=get_padding(kernel_size, dilation[0]))),
|
18 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
|
19 |
+
padding=get_padding(kernel_size, dilation[1]))),
|
20 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
|
21 |
+
padding=get_padding(kernel_size, dilation[2])))
|
22 |
+
])
|
23 |
+
self.convs1.apply(init_weights)
|
24 |
+
|
25 |
+
self.convs2 = nn.ModuleList([
|
26 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
|
27 |
+
padding=get_padding(kernel_size, 1))),
|
28 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
|
29 |
+
padding=get_padding(kernel_size, 1))),
|
30 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
|
31 |
+
padding=get_padding(kernel_size, 1)))
|
32 |
+
])
|
33 |
+
self.convs2.apply(init_weights)
|
34 |
+
|
35 |
+
def forward(self, x):
|
36 |
+
for c1, c2 in zip(self.convs1, self.convs2):
|
37 |
+
xt = F.leaky_relu(x, LRELU_SLOPE)
|
38 |
+
xt = c1(xt)
|
39 |
+
xt = F.leaky_relu(xt, LRELU_SLOPE)
|
40 |
+
xt = c2(xt)
|
41 |
+
x = xt + x
|
42 |
+
return x
|
43 |
+
|
44 |
+
def remove_weight_norm(self):
|
45 |
+
for l in self.convs1:
|
46 |
+
remove_weight_norm(l)
|
47 |
+
for l in self.convs2:
|
48 |
+
remove_weight_norm(l)
|
49 |
+
|
50 |
+
|
51 |
+
class ResBlock2(torch.nn.Module):
|
52 |
+
def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
|
53 |
+
super(ResBlock2, self).__init__()
|
54 |
+
self.h = h
|
55 |
+
self.convs = nn.ModuleList([
|
56 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
|
57 |
+
padding=get_padding(kernel_size, dilation[0]))),
|
58 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
|
59 |
+
padding=get_padding(kernel_size, dilation[1])))
|
60 |
+
])
|
61 |
+
self.convs.apply(init_weights)
|
62 |
+
|
63 |
+
def forward(self, x):
|
64 |
+
for c in self.convs:
|
65 |
+
xt = F.leaky_relu(x, LRELU_SLOPE)
|
66 |
+
xt = c(xt)
|
67 |
+
x = xt + x
|
68 |
+
return x
|
69 |
+
|
70 |
+
def remove_weight_norm(self):
|
71 |
+
for l in self.convs:
|
72 |
+
remove_weight_norm(l)
|
73 |
+
|
74 |
+
|
75 |
+
class Generator(torch.nn.Module):
|
76 |
+
def __init__(self, h):
|
77 |
+
super(Generator, self).__init__()
|
78 |
+
self.h = h
|
79 |
+
self.num_kernels = len(h.resblock_kernel_sizes)
|
80 |
+
self.num_upsamples = len(h.upsample_rates)
|
81 |
+
self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3))
|
82 |
+
resblock = ResBlock1 if h.resblock == '1' else ResBlock2
|
83 |
+
|
84 |
+
self.ups = nn.ModuleList()
|
85 |
+
for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
|
86 |
+
self.ups.append(weight_norm(ConvTranspose1d(h.upsample_initial_channel//(2**i),
|
87 |
+
h.upsample_initial_channel//(2**(i+1)),
|
88 |
+
k, u, padding=(u//2 + u%2), output_padding=u%2)))
|
89 |
+
|
90 |
+
self.resblocks = nn.ModuleList()
|
91 |
+
for i in range(len(self.ups)):
|
92 |
+
ch = h.upsample_initial_channel//(2**(i+1))
|
93 |
+
for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
|
94 |
+
self.resblocks.append(resblock(h, ch, k, d))
|
95 |
+
|
96 |
+
self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
|
97 |
+
self.ups.apply(init_weights)
|
98 |
+
self.conv_post.apply(init_weights)
|
99 |
+
|
100 |
+
def forward(self, x):
|
101 |
+
x = self.conv_pre(x)
|
102 |
+
for i in range(self.num_upsamples):
|
103 |
+
x = F.leaky_relu(x, LRELU_SLOPE)
|
104 |
+
x = self.ups[i](x)
|
105 |
+
xs = None
|
106 |
+
for j in range(self.num_kernels):
|
107 |
+
if xs is None:
|
108 |
+
xs = self.resblocks[i*self.num_kernels+j](x)
|
109 |
+
else:
|
110 |
+
xs += self.resblocks[i*self.num_kernels+j](x)
|
111 |
+
x = xs / self.num_kernels
|
112 |
+
x = F.leaky_relu(x)
|
113 |
+
x = self.conv_post(x)
|
114 |
+
x = torch.tanh(x)
|
115 |
+
|
116 |
+
return x
|
117 |
+
|
118 |
+
def remove_weight_norm(self):
|
119 |
+
print('Removing weight norm...')
|
120 |
+
for l in self.ups:
|
121 |
+
remove_weight_norm(l)
|
122 |
+
for l in self.resblocks:
|
123 |
+
l.remove_weight_norm()
|
124 |
+
remove_weight_norm(self.conv_pre)
|
125 |
+
remove_weight_norm(self.conv_post)
|
126 |
+
|
127 |
+
|
128 |
+
class DiscriminatorP(torch.nn.Module):
|
129 |
+
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
|
130 |
+
super(DiscriminatorP, self).__init__()
|
131 |
+
self.period = period
|
132 |
+
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
133 |
+
self.convs = nn.ModuleList([
|
134 |
+
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
135 |
+
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
136 |
+
norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
137 |
+
norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
138 |
+
norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
|
139 |
+
])
|
140 |
+
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
|
141 |
+
|
142 |
+
def forward(self, x):
|
143 |
+
fmap = []
|
144 |
+
|
145 |
+
# 1d to 2d
|
146 |
+
b, c, t = x.shape
|
147 |
+
if t % self.period != 0: # pad first
|
148 |
+
n_pad = self.period - (t % self.period)
|
149 |
+
x = F.pad(x, (0, n_pad), "reflect")
|
150 |
+
t = t + n_pad
|
151 |
+
x = x.view(b, c, t // self.period, self.period)
|
152 |
+
|
153 |
+
for l in self.convs:
|
154 |
+
x = l(x)
|
155 |
+
x = F.leaky_relu(x, LRELU_SLOPE)
|
156 |
+
fmap.append(x)
|
157 |
+
x = self.conv_post(x)
|
158 |
+
fmap.append(x)
|
159 |
+
x = torch.flatten(x, 1, -1)
|
160 |
+
|
161 |
+
return x, fmap
|
162 |
+
|
163 |
+
|
164 |
+
class MultiPeriodDiscriminator(torch.nn.Module):
|
165 |
+
def __init__(self):
|
166 |
+
super(MultiPeriodDiscriminator, self).__init__()
|
167 |
+
self.discriminators = nn.ModuleList([
|
168 |
+
DiscriminatorP(2),
|
169 |
+
DiscriminatorP(3),
|
170 |
+
DiscriminatorP(5),
|
171 |
+
DiscriminatorP(7),
|
172 |
+
DiscriminatorP(11),
|
173 |
+
])
|
174 |
+
|
175 |
+
def forward(self, y, y_hat):
|
176 |
+
y_d_rs = []
|
177 |
+
y_d_gs = []
|
178 |
+
fmap_rs = []
|
179 |
+
fmap_gs = []
|
180 |
+
for i, d in enumerate(self.discriminators):
|
181 |
+
y_d_r, fmap_r = d(y)
|
182 |
+
y_d_g, fmap_g = d(y_hat)
|
183 |
+
y_d_rs.append(y_d_r)
|
184 |
+
fmap_rs.append(fmap_r)
|
185 |
+
y_d_gs.append(y_d_g)
|
186 |
+
fmap_gs.append(fmap_g)
|
187 |
+
|
188 |
+
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
189 |
+
|
190 |
+
|
191 |
+
class DiscriminatorS(torch.nn.Module):
|
192 |
+
def __init__(self, use_spectral_norm=False):
|
193 |
+
super(DiscriminatorS, self).__init__()
|
194 |
+
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
195 |
+
self.convs = nn.ModuleList([
|
196 |
+
norm_f(Conv1d(1, 128, 15, 1, padding=7)),
|
197 |
+
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
|
198 |
+
norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
|
199 |
+
norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
|
200 |
+
norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
|
201 |
+
norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
|
202 |
+
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
|
203 |
+
])
|
204 |
+
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
|
205 |
+
|
206 |
+
def forward(self, x):
|
207 |
+
fmap = []
|
208 |
+
for l in self.convs:
|
209 |
+
x = l(x)
|
210 |
+
x = F.leaky_relu(x, LRELU_SLOPE)
|
211 |
+
fmap.append(x)
|
212 |
+
x = self.conv_post(x)
|
213 |
+
fmap.append(x)
|
214 |
+
x = torch.flatten(x, 1, -1)
|
215 |
+
|
216 |
+
return x, fmap
|
217 |
+
|
218 |
+
|
219 |
+
class MultiScaleDiscriminator(torch.nn.Module):
|
220 |
+
def __init__(self):
|
221 |
+
super(MultiScaleDiscriminator, self).__init__()
|
222 |
+
self.discriminators = nn.ModuleList([
|
223 |
+
DiscriminatorS(use_spectral_norm=True),
|
224 |
+
DiscriminatorS(),
|
225 |
+
DiscriminatorS(),
|
226 |
+
])
|
227 |
+
self.meanpools = nn.ModuleList([
|
228 |
+
AvgPool1d(4, 2, padding=2),
|
229 |
+
AvgPool1d(4, 2, padding=2)
|
230 |
+
])
|
231 |
+
|
232 |
+
def forward(self, y, y_hat):
|
233 |
+
y_d_rs = []
|
234 |
+
y_d_gs = []
|
235 |
+
fmap_rs = []
|
236 |
+
fmap_gs = []
|
237 |
+
for i, d in enumerate(self.discriminators):
|
238 |
+
if i != 0:
|
239 |
+
y = self.meanpools[i-1](y)
|
240 |
+
y_hat = self.meanpools[i-1](y_hat)
|
241 |
+
y_d_r, fmap_r = d(y)
|
242 |
+
y_d_g, fmap_g = d(y_hat)
|
243 |
+
y_d_rs.append(y_d_r)
|
244 |
+
fmap_rs.append(fmap_r)
|
245 |
+
y_d_gs.append(y_d_g)
|
246 |
+
fmap_gs.append(fmap_g)
|
247 |
+
|
248 |
+
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
249 |
+
|
250 |
+
|
251 |
+
def feature_loss(fmap_r, fmap_g):
|
252 |
+
loss = 0
|
253 |
+
for dr, dg in zip(fmap_r, fmap_g):
|
254 |
+
for rl, gl in zip(dr, dg):
|
255 |
+
loss += torch.mean(torch.abs(rl - gl))
|
256 |
+
|
257 |
+
return loss*2
|
258 |
+
|
259 |
+
|
260 |
+
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
|
261 |
+
loss = 0
|
262 |
+
r_losses = []
|
263 |
+
g_losses = []
|
264 |
+
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
|
265 |
+
r_loss = torch.mean((1-dr)**2)
|
266 |
+
g_loss = torch.mean(dg**2)
|
267 |
+
loss += (r_loss + g_loss)
|
268 |
+
r_losses.append(r_loss.item())
|
269 |
+
g_losses.append(g_loss.item())
|
270 |
+
|
271 |
+
return loss, r_losses, g_losses
|
272 |
+
|
273 |
+
|
274 |
+
def generator_loss(disc_outputs):
|
275 |
+
loss = 0
|
276 |
+
gen_losses = []
|
277 |
+
for dg in disc_outputs:
|
278 |
+
l = torch.mean((1-dg)**2)
|
279 |
+
gen_losses.append(l)
|
280 |
+
loss += l
|
281 |
+
|
282 |
+
return loss, gen_losses
|
283 |
+
|
StyleTTS_Accelerate/Demo/hifi-gan/vocoder_utils.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import glob
|
2 |
+
import os
|
3 |
+
import matplotlib
|
4 |
+
import torch
|
5 |
+
from torch.nn.utils import weight_norm
|
6 |
+
matplotlib.use("Agg")
|
7 |
+
import matplotlib.pylab as plt
|
8 |
+
|
9 |
+
|
10 |
+
def plot_spectrogram(spectrogram):
|
11 |
+
fig, ax = plt.subplots(figsize=(10, 2))
|
12 |
+
im = ax.imshow(spectrogram, aspect="auto", origin="lower",
|
13 |
+
interpolation='none')
|
14 |
+
plt.colorbar(im, ax=ax)
|
15 |
+
|
16 |
+
fig.canvas.draw()
|
17 |
+
plt.close()
|
18 |
+
|
19 |
+
return fig
|
20 |
+
|
21 |
+
|
22 |
+
def init_weights(m, mean=0.0, std=0.01):
|
23 |
+
classname = m.__class__.__name__
|
24 |
+
if classname.find("Conv") != -1:
|
25 |
+
m.weight.data.normal_(mean, std)
|
26 |
+
|
27 |
+
|
28 |
+
def apply_weight_norm(m):
|
29 |
+
classname = m.__class__.__name__
|
30 |
+
if classname.find("Conv") != -1:
|
31 |
+
weight_norm(m)
|
32 |
+
|
33 |
+
|
34 |
+
def get_padding(kernel_size, dilation=1):
|
35 |
+
return int((kernel_size*dilation - dilation)/2)
|
36 |
+
|
37 |
+
|
38 |
+
def load_checkpoint(filepath, device):
|
39 |
+
assert os.path.isfile(filepath)
|
40 |
+
print("Loading '{}'".format(filepath))
|
41 |
+
checkpoint_dict = torch.load(filepath, map_location=device)
|
42 |
+
print("Complete.")
|
43 |
+
return checkpoint_dict
|
44 |
+
|
45 |
+
|
46 |
+
def save_checkpoint(filepath, obj):
|
47 |
+
print("Saving checkpoint to {}".format(filepath))
|
48 |
+
torch.save(obj, filepath)
|
49 |
+
print("Complete.")
|
50 |
+
|
51 |
+
|
52 |
+
def scan_checkpoint(cp_dir, prefix):
|
53 |
+
pattern = os.path.join(cp_dir, prefix + '????????')
|
54 |
+
cp_list = glob.glob(pattern)
|
55 |
+
if len(cp_list) == 0:
|
56 |
+
return None
|
57 |
+
return sorted(cp_list)[-1]
|
58 |
+
|
StyleTTS_Accelerate/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2022 Aaron (Yinghao) Li
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
StyleTTS_Accelerate/LICENSE copy
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2022 Aaron (Yinghao) Li
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
StyleTTS_Accelerate/Models/Anispeech/config.yml
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
log_dir: "Models/Anispeech"
|
2 |
+
first_stage_path: "/home/ubuntu/StyleTTS_Accelerate_44khz/Models/Anispeech/epoch_1st_00020.pth"
|
3 |
+
save_freq: 1
|
4 |
+
log_interval: 10
|
5 |
+
device: "cuda"
|
6 |
+
multigpu: false
|
7 |
+
epochs_1st: 200 # number of epochs for first stage training
|
8 |
+
epochs_2nd: 100 # number of epochs for second stage training
|
9 |
+
batch_size: 16
|
10 |
+
pretrained_model: "/home/ubuntu/StyleTTS_Accelerate_44khz/Models/Anispeech/epoch_2nd_00015.pth"
|
11 |
+
second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
|
12 |
+
load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
|
13 |
+
|
14 |
+
diff_epoch: 5
|
15 |
+
|
16 |
+
train_data: "/home/ubuntu/StyleTTS_Accelerate_44khz/Data/ani_train_only_longs.csv"
|
17 |
+
val_data: "/home/ubuntu/StyleTTS_Accelerate_44khz/Data/val_list_libritts.txt"
|
18 |
+
|
19 |
+
F0_path: "Utils/JDC/bst.t7"
|
20 |
+
ASR_config: "Utils/ASR/config.yml"
|
21 |
+
ASR_path: "Utils/ASR/epoch_00080.pth"
|
22 |
+
|
23 |
+
preprocess_params:
|
24 |
+
sr: 24000
|
25 |
+
spect_params:
|
26 |
+
n_fft: 2048
|
27 |
+
win_length: 1200
|
28 |
+
hop_length: 300
|
29 |
+
|
30 |
+
model_params:
|
31 |
+
hidden_dim: 512
|
32 |
+
n_token: 178
|
33 |
+
style_dim: 128
|
34 |
+
n_layer: 3
|
35 |
+
dim_in: 64
|
36 |
+
max_conv_dim: 512
|
37 |
+
n_mels: 80
|
38 |
+
dropout: 0.2
|
39 |
+
|
40 |
+
|
41 |
+
diffusion:
|
42 |
+
embedding_mask_proba: 0.1
|
43 |
+
# transformer config
|
44 |
+
transformer:
|
45 |
+
num_layers: 3
|
46 |
+
num_heads: 8
|
47 |
+
head_features: 64
|
48 |
+
multiplier: 2
|
49 |
+
|
50 |
+
# diffusion distribution config
|
51 |
+
dist:
|
52 |
+
sigma_data: 0.2 # placeholder value, used when estimate_sigma_data is set to false
|
53 |
+
estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
|
54 |
+
mean: -3.0
|
55 |
+
std: 1.0
|
56 |
+
|
57 |
+
|
58 |
+
loss_params:
|
59 |
+
lambda_mel: 10. # mel reconstruction loss (1st & 2nd stage)
|
60 |
+
lambda_adv: 1. # adversarial loss (1st & 2nd stage)
|
61 |
+
lambda_reg: 1. # adversarial regularization loss (1st & 2nd stage)
|
62 |
+
lambda_fm: 0.1 # feature matching loss (1st & 2nd stage)
|
63 |
+
|
64 |
+
lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
|
65 |
+
lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
|
66 |
+
TMA_epoch: 2 # TMA starting epoch (1st stage)
|
67 |
+
|
68 |
+
# https://github.com/yl4579/StyleTTS/issues/7
|
69 |
+
TMA_CEloss: false # whether to use cross-entropy (CE) loss for TMA
|
70 |
+
|
71 |
+
lambda_F0: 1. # F0 reconstruction loss (2nd stage)
|
72 |
+
lambda_norm: 1. # norm reconstruction loss (2nd stage)
|
73 |
+
lambda_dur: 1. # duration loss (2nd stage)
|
74 |
+
lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
|
75 |
+
|
76 |
+
lambda_sty: 1. # style reconstruction loss (2nd stage)
|
77 |
+
lambda_diff: 1. # score matching loss (2nd stage)
|
78 |
+
|
79 |
+
optimizer_params:
|
80 |
+
lr: 0.0001
|
StyleTTS_Accelerate/Models/Anispeech/epoch_1st_00020.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:686beb07eebe47a05efbf8f522e35d3000e8eb56e3c3a64fe0c136cd7d8d784d
|
3 |
+
size 1322367412
|
StyleTTS_Accelerate/Models/Anispeech/epoch_2nd_00015.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:96f0c34bacfecec841845b92287a553d4c4263d28a4f24111b6223f9cdcaba76
|
3 |
+
size 1072227551
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735697608.khodaya-basse-dige.344916.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:285145307d354169a4f365a200060e8835925dd5ec6b15e343d4f5904d8d6840
|
3 |
+
size 88
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735697814.khodaya-basse-dige.346056.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4c6d38ee3d2509a09e245e3e17b2d741863d352b9b93c0caf36aedf0870d9f05
|
3 |
+
size 88
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735698320.khodaya-basse-dige.347680.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:61d9e5048dda9c655b4c5a226759c691efccc2557993df16a3751608005ef6ac
|
3 |
+
size 7420
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735698764.khodaya-basse-dige.349633.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:42a6ca2f240bb8e02ca5c416fb3b48e8f05d553fc96dce7067a5c9701e456538
|
3 |
+
size 2678
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735698917.khodaya-basse-dige.350828.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:245e41619fc9245442901624a4f56bdc692b5406bc693fec844efc58931fa314
|
3 |
+
size 3006826
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735721417.khodaya-basse-dige.404215.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:178e0cb87908e6a4822482aca1dfe1c289c137c67ffd3cf8115214cd5c12eff4
|
3 |
+
size 88
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735721458.khodaya-basse-dige.404475.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7fb5701c1192707e8ce5502bcaa1e5c0b23493ca52179857e7e7ca07c005f93d
|
3 |
+
size 19924
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735723135.khodaya-basse-dige.409798.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eed71cd3253e86c83a89b290143491939736c50b13e9d4677d9b4aeb4da7bfad
|
3 |
+
size 124082
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735736169.khodaya-basse-dige.8849.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cccfbe4590399b86a822d6b053509d7a899d6e0ce4b9ee1df1110f5ba0e04474
|
3 |
+
size 278128
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735753783.khodaya-basse-dige.55757.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0d76c1a47afb464e5f1abedc14512d3786027064a9384a5e7c26126ff57ec1eb
|
3 |
+
size 88
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735753897.khodaya-basse-dige.56741.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:31682364e9cd764c6e199af5c2f1ee87a131c4bed8e0bb37536c33e42813b115
|
3 |
+
size 88
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735753979.khodaya-basse-dige.58472.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eac6f91079a4c53ebf8b44888512b2af1202a9e4fff41e9c0404adef1c940bb4
|
3 |
+
size 88
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735754151.khodaya-basse-dige.59652.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5ba702daecc82006aad5078b95a71d8c7ae497c15e0c599148f8163b1e845869
|
3 |
+
size 88
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735754204.khodaya-basse-dige.60572.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:90c9ac4835b0122b4688e388cb8aad6881e5d35095c06ae40ee21a6855935eaf
|
3 |
+
size 88
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755068.khodaya-basse-dige.62584.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8f203dd3b75cb53fac9fe52bb0b5f87ffdd0515b75fa95667853772ff6b4b56b
|
3 |
+
size 88
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755116.khodaya-basse-dige.63449.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:146856775335bf311b3131a67c83bd394947681454436b8f24ea5870c296b809
|
3 |
+
size 88
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755175.khodaya-basse-dige.64734.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:29031b8874f604abfc131ca92e7e9c3f35da1b065165010922ee872d5e349fff
|
3 |
+
size 88
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755213.khodaya-basse-dige.65681.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:30bbc2f8786eeac413bd53068eda56aebcc6d639579dc4160c09e82fe0fbe542
|
3 |
+
size 88
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755246.khodaya-basse-dige.66573.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5c99617590fc1ca468c5f653147d8b9178392e0c4ef662a5aa5593f6dba60e39
|
3 |
+
size 88
|
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755299.khodaya-basse-dige.67690.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3a5b293997ed32692a4e9c7741d758712ca26426229b5e9a4558f4d2c861a06a
|
3 |
+
size 1038
|
StyleTTS_Accelerate/Models/Anispeech/train.log
ADDED
The diff for this file is too large to render.
See raw diff
|
|
StyleTTS_Accelerate/Models/Anispeech_with_DIFF/config.yml
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
log_dir: "Models/Anispeech_with_DIFF"
|
2 |
+
first_stage_path: "/home/ubuntu/StyleTTS_Accelerate_44khz/Models/Anispeech/epoch_1st_00020.pth"
|
3 |
+
save_freq: 1
|
4 |
+
log_interval: 10
|
5 |
+
device: "cuda"
|
6 |
+
multigpu: false
|
7 |
+
epochs_1st: 200 # number of epochs for first stage training
|
8 |
+
epochs_2nd: 100 # number of epochs for second stage training
|
9 |
+
batch_size: 32
|
10 |
+
pretrained_model: ""
|
11 |
+
second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
|
12 |
+
load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
|
13 |
+
|
14 |
+
diff_epoch: 5
|
15 |
+
|
16 |
+
train_data: "/home/ubuntu/StyleTTS_Accelerate_44khz/Data/ani_train_only_longs.csv"
|
17 |
+
val_data: "/home/ubuntu/StyleTTS_Accelerate_44khz/Data/val_list_libritts.txt"
|
18 |
+
|
19 |
+
F0_path: "Utils/JDC/bst.t7"
|
20 |
+
ASR_config: "Utils/ASR/config.yml"
|
21 |
+
ASR_path: "Utils/ASR/epoch_00080.pth"
|
22 |
+
|
23 |
+
preprocess_params:
|
24 |
+
sr: 24000
|
25 |
+
spect_params:
|
26 |
+
n_fft: 2048
|
27 |
+
win_length: 1200
|
28 |
+
hop_length: 300
|
29 |
+
|
30 |
+
model_params:
|
31 |
+
hidden_dim: 512
|
32 |
+
n_token: 178
|
33 |
+
style_dim: 128
|
34 |
+
n_layer: 3
|
35 |
+
dim_in: 64
|
36 |
+
max_conv_dim: 512
|
37 |
+
n_mels: 80
|
38 |
+
dropout: 0.2
|
39 |
+
|
40 |
+
|
41 |
+
diffusion:
|
42 |
+
embedding_mask_proba: 0.1
|
43 |
+
# transformer config
|
44 |
+
transformer:
|
45 |
+
num_layers: 3
|
46 |
+
num_heads: 8
|
47 |
+
head_features: 64
|
48 |
+
multiplier: 2
|
49 |
+
|
50 |
+
# diffusion distribution config
|
51 |
+
dist:
|
52 |
+
sigma_data: 0.2 # placeholder value, used when estimate_sigma_data is set to false
|
53 |
+
estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
|
54 |
+
mean: -3.0
|
55 |
+
std: 1.0
|
56 |
+
|
57 |
+
|
58 |
+
loss_params:
|
59 |
+
lambda_mel: 10. # mel reconstruction loss (1st & 2nd stage)
|
60 |
+
lambda_adv: 1. # adversarial loss (1st & 2nd stage)
|
61 |
+
lambda_reg: 1. # adversarial regularization loss (1st & 2nd stage)
|
62 |
+
lambda_fm: 0.1 # feature matching loss (1st & 2nd stage)
|
63 |
+
|
64 |
+
lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
|
65 |
+
lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
|
66 |
+
TMA_epoch: 2 # TMA starting epoch (1st stage)
|
67 |
+
|
68 |
+
# https://github.com/yl4579/StyleTTS/issues/7
|
69 |
+
TMA_CEloss: false # whether to use cross-entropy (CE) loss for TMA
|
70 |
+
|
71 |
+
lambda_F0: 1. # F0 reconstruction loss (2nd stage)
|
72 |
+
lambda_norm: 1. # norm reconstruction loss (2nd stage)
|
73 |
+
lambda_dur: 1. # duration loss (2nd stage)
|
74 |
+
lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
|
75 |
+
|
76 |
+
lambda_sty: 1. # style reconstruction loss (2nd stage)
|
77 |
+
lambda_diff: 1. # score matching loss (2nd stage)
|
78 |
+
|
79 |
+
optimizer_params:
|
80 |
+
lr: 0.0001
|
StyleTTS_Accelerate/Models/Anispeech_with_DIFF/epoch_1st_00040.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7e6ddda16cbcd18677f94582b0c60014429ec717ec6ba3ef3819ead0b626a054
|
3 |
+
size 1292081189
|
StyleTTS_Accelerate/Models/Anispeech_with_DIFF/epoch_2nd_00014.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:590b105355609e73ad32a08c138d3d164981b5abaeb548ed0950c404715fca48
|
3 |
+
size 1322367412
|
StyleTTS_Accelerate/Models/Anispeech_with_DIFF/tensorboard/events.out.tfevents.1735755378.khodaya-basse-dige.68815.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5a24f333c91e20df18fb006bf9958430e27eca32efbbfd70194e464bbb217ef0
|
3 |
+
size 80357
|
StyleTTS_Accelerate/Models/Anispeech_with_DIFF/tensorboard/events.out.tfevents.1735758983.khodaya-basse-dige.79079.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5567fe7eae9933eafa645b60506c564020381a3cf6ad1522c978715d5aa979be
|
3 |
+
size 1486
|
StyleTTS_Accelerate/Models/Anispeech_with_DIFF/tensorboard/events.out.tfevents.1735759171.khodaya-basse-dige.80201.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:701829cea16b04487744c10ae1463b8559585ba84fbc9c49dc25cafe00ea1f48
|
3 |
+
size 563
|
StyleTTS_Accelerate/Models/Anispeech_with_DIFF/tensorboard/events.out.tfevents.1735759231.khodaya-basse-dige.81123.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8f9de06fe3125dc403d9cd0275c4a7878ca8d3a5fc18d2fdc653c38d366debd9
|
3 |
+
size 429931
|