Respair committed on
Commit
9d7032c
·
verified ·
1 Parent(s): a168453

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. StyleTTS_Accelerate/Configs/config.yml +80 -0
  3. StyleTTS_Accelerate/Configs/config_44.1khz.yml +80 -0
  4. StyleTTS_Accelerate/Data/ani_train.csv +0 -0
  5. StyleTTS_Accelerate/Data/ani_train_only_longs.csv +0 -0
  6. StyleTTS_Accelerate/Data/train_list.txt +0 -0
  7. StyleTTS_Accelerate/Data/train_list_libritts.txt +3 -0
  8. StyleTTS_Accelerate/Data/val_list.txt +100 -0
  9. StyleTTS_Accelerate/Data/val_list_libritts.txt +195 -0
  10. StyleTTS_Accelerate/Demo/Inference_LJSpeech.ipynb +417 -0
  11. StyleTTS_Accelerate/Demo/Inference_LibriTTS.ipynb +529 -0
  12. StyleTTS_Accelerate/Demo/hifi-gan/__pycache__/vocoder.cpython-310.pyc +0 -0
  13. StyleTTS_Accelerate/Demo/hifi-gan/__pycache__/vocoder.cpython-39.pyc +0 -0
  14. StyleTTS_Accelerate/Demo/hifi-gan/__pycache__/vocoder_utils.cpython-310.pyc +0 -0
  15. StyleTTS_Accelerate/Demo/hifi-gan/__pycache__/vocoder_utils.cpython-39.pyc +0 -0
  16. StyleTTS_Accelerate/Demo/hifi-gan/vocoder.py +283 -0
  17. StyleTTS_Accelerate/Demo/hifi-gan/vocoder_utils.py +58 -0
  18. StyleTTS_Accelerate/LICENSE +21 -0
  19. StyleTTS_Accelerate/LICENSE copy +21 -0
  20. StyleTTS_Accelerate/Models/Anispeech/config.yml +80 -0
  21. StyleTTS_Accelerate/Models/Anispeech/epoch_1st_00020.pth +3 -0
  22. StyleTTS_Accelerate/Models/Anispeech/epoch_2nd_00015.pth +3 -0
  23. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735697608.khodaya-basse-dige.344916.0 +3 -0
  24. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735697814.khodaya-basse-dige.346056.0 +3 -0
  25. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735698320.khodaya-basse-dige.347680.0 +3 -0
  26. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735698764.khodaya-basse-dige.349633.0 +3 -0
  27. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735698917.khodaya-basse-dige.350828.0 +3 -0
  28. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735721417.khodaya-basse-dige.404215.0 +3 -0
  29. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735721458.khodaya-basse-dige.404475.0 +3 -0
  30. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735723135.khodaya-basse-dige.409798.0 +3 -0
  31. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735736169.khodaya-basse-dige.8849.0 +3 -0
  32. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735753783.khodaya-basse-dige.55757.0 +3 -0
  33. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735753897.khodaya-basse-dige.56741.0 +3 -0
  34. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735753979.khodaya-basse-dige.58472.0 +3 -0
  35. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735754151.khodaya-basse-dige.59652.0 +3 -0
  36. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735754204.khodaya-basse-dige.60572.0 +3 -0
  37. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755068.khodaya-basse-dige.62584.0 +3 -0
  38. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755116.khodaya-basse-dige.63449.0 +3 -0
  39. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755175.khodaya-basse-dige.64734.0 +3 -0
  40. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755213.khodaya-basse-dige.65681.0 +3 -0
  41. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755246.khodaya-basse-dige.66573.0 +3 -0
  42. StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755299.khodaya-basse-dige.67690.0 +3 -0
  43. StyleTTS_Accelerate/Models/Anispeech/train.log +0 -0
  44. StyleTTS_Accelerate/Models/Anispeech_with_DIFF/config.yml +80 -0
  45. StyleTTS_Accelerate/Models/Anispeech_with_DIFF/epoch_1st_00040.pth +3 -0
  46. StyleTTS_Accelerate/Models/Anispeech_with_DIFF/epoch_2nd_00014.pth +3 -0
  47. StyleTTS_Accelerate/Models/Anispeech_with_DIFF/tensorboard/events.out.tfevents.1735755378.khodaya-basse-dige.68815.0 +3 -0
  48. StyleTTS_Accelerate/Models/Anispeech_with_DIFF/tensorboard/events.out.tfevents.1735758983.khodaya-basse-dige.79079.0 +3 -0
  49. StyleTTS_Accelerate/Models/Anispeech_with_DIFF/tensorboard/events.out.tfevents.1735759171.khodaya-basse-dige.80201.0 +3 -0
  50. StyleTTS_Accelerate/Models/Anispeech_with_DIFF/tensorboard/events.out.tfevents.1735759231.khodaya-basse-dige.81123.0 +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ StyleTTS_Accelerate/Data/train_list_libritts.txt filter=lfs diff=lfs merge=lfs -text
37
+ StyleTTS_Accelerate/Utils/JDC/bst.t7 filter=lfs diff=lfs merge=lfs -text
StyleTTS_Accelerate/Configs/config.yml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_dir: "Models/Anispeech_with_DIFF"
2
+ first_stage_path: "/home/ubuntu/StyleTTS_Accelerate_44khz/Models/Anispeech/epoch_1st_00020.pth"
3
+ save_freq: 1
4
+ log_interval: 10
5
+ device: "cuda"
6
+ multigpu: false
7
+ epochs_1st: 200 # number of epochs for first stage training
8
+ epochs_2nd: 100 # number of epochs for second stage training
9
+ batch_size: 32
10
+ pretrained_model: ""
11
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
12
+ load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
13
+
14
+ diff_epoch: 5
15
+
16
+ train_data: "/home/ubuntu/StyleTTS_Accelerate_44khz/Data/ani_train_only_longs.csv"
17
+ val_data: "/home/ubuntu/StyleTTS_Accelerate_44khz/Data/val_list_libritts.txt"
18
+
19
+ F0_path: "Utils/JDC/bst.t7"
20
+ ASR_config: "Utils/ASR/config.yml"
21
+ ASR_path: "Utils/ASR/epoch_00080.pth"
22
+
23
+ preprocess_params:
24
+ sr: 24000
25
+ spect_params:
26
+ n_fft: 2048
27
+ win_length: 1200
28
+ hop_length: 300
29
+
30
+ model_params:
31
+ hidden_dim: 512
32
+ n_token: 178
33
+ style_dim: 128
34
+ n_layer: 3
35
+ dim_in: 64
36
+ max_conv_dim: 512
37
+ n_mels: 80
38
+ dropout: 0.2
39
+
40
+
41
+ diffusion:
42
+ embedding_mask_proba: 0.1
43
+ # transformer config
44
+ transformer:
45
+ num_layers: 3
46
+ num_heads: 8
47
+ head_features: 64
48
+ multiplier: 2
49
+
50
+ # diffusion distribution config
51
+ dist:
52
+ sigma_data: 0.2 # placeholder value, used only when estimate_sigma_data is set to false
53
+ estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
54
+ mean: -3.0
55
+ std: 1.0
56
+
57
+
58
+ loss_params:
59
+ lambda_mel: 10. # mel reconstruction loss (1st & 2nd stage)
60
+ lambda_adv: 1. # adversarial loss (1st & 2nd stage)
61
+ lambda_reg: 1. # adversarial regularization loss (1st & 2nd stage)
62
+ lambda_fm: 0.1 # feature matching loss (1st & 2nd stage)
63
+
64
+ lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
65
+ lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
66
+ TMA_epoch: 2 # TMA starting epoch (1st stage)
67
+
68
+ # https://github.com/yl4579/StyleTTS/issues/7
69
+ TMA_CEloss: false # whether to use cross-entropy (CE) loss for TMA
70
+
71
+ lambda_F0: 1. # F0 reconstruction loss (2nd stage)
72
+ lambda_norm: 1. # norm reconstruction loss (2nd stage)
73
+ lambda_dur: 1. # duration loss (2nd stage)
74
+ lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
75
+
76
+ lambda_sty: 1. # style reconstruction loss (2nd stage)
77
+ lambda_diff: 1. # score matching loss (2nd stage)
78
+
79
+ optimizer_params:
80
+ lr: 0.0001
StyleTTS_Accelerate/Configs/config_44.1khz.yml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_dir: "Models/LJSpeech"
2
+ first_stage_path: "/home/ubuntu/StyleTTS_Accelerate/Models/LJSpeech/epoch_1st_00040.pth"
3
+ save_freq: 1
4
+ log_interval: 10
5
+ device: "cuda"
6
+ multigpu: false
7
+ epochs_1st: 200 # number of epochs for first stage training
8
+ epochs_2nd: 100 # number of epochs for second stage training
9
+ batch_size: 32
10
+ pretrained_model: "/home/ubuntu/StyleTTS_Accelerate/Models/LJSpeech/epoch_1st_00004.pth"
11
+ second_stage_load_pretrained: false # set to true if the pre-trained model is for 2nd stage
12
+ load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
13
+
14
+ diff_epoch: 5
15
+
16
+ train_data: "Data/train_list.txt"
17
+ val_data: "Data/val_list.txt"
18
+
19
+ F0_path: "Utils/JDC/bst.t7"
20
+ ASR_config: "Utils/ASR/config.yml"
21
+ ASR_path: "Utils/ASR/epoch_00080.pth"
22
+
23
+ preprocess_params:
24
+ sr: 44_100
25
+ spect_params:
26
+ n_fft: 2048
27
+ win_length: 2048
28
+ hop_length: 512
29
+
30
+ model_params:
31
+ hidden_dim: 512
32
+ n_token: 178
33
+ style_dim: 128
34
+ n_layer: 3
35
+ dim_in: 64
36
+ max_conv_dim: 512
37
+ n_mels: 128
38
+ dropout: 0.2
39
+
40
+
41
+ diffusion:
42
+ embedding_mask_proba: 0.1
43
+ # transformer config
44
+ transformer:
45
+ num_layers: 3
46
+ num_heads: 8
47
+ head_features: 64
48
+ multiplier: 2
49
+
50
+ # diffusion distribution config
51
+ dist:
52
+ sigma_data: 0.2 # placeholder value, used only when estimate_sigma_data is set to false
53
+ estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
54
+ mean: -3.0
55
+ std: 1.0
56
+
57
+
58
+ loss_params:
59
+ lambda_mel: 10. # mel reconstruction loss (1st & 2nd stage)
60
+ lambda_adv: 1. # adversarial loss (1st & 2nd stage)
61
+ lambda_reg: 1. # adversarial regularization loss (1st & 2nd stage)
62
+ lambda_fm: 0.1 # feature matching loss (1st & 2nd stage)
63
+
64
+ lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
65
+ lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
66
+ TMA_epoch: 2 # TMA starting epoch (1st stage)
67
+
68
+ # https://github.com/yl4579/StyleTTS/issues/7
69
+ TMA_CEloss: false # whether to use cross-entropy (CE) loss for TMA
70
+
71
+ lambda_F0: 1. # F0 reconstruction loss (2nd stage)
72
+ lambda_norm: 1. # norm reconstruction loss (2nd stage)
73
+ lambda_dur: 1. # duration loss (2nd stage)
74
+ lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
75
+
76
+ lambda_sty: 1. # style reconstruction loss (2nd stage)
77
+ lambda_diff: 1. # score matching loss (2nd stage)
78
+
79
+ optimizer_params:
80
+ lr: 0.0001
StyleTTS_Accelerate/Data/ani_train.csv ADDED
The diff for this file is too large to render. See raw diff
 
StyleTTS_Accelerate/Data/ani_train_only_longs.csv ADDED
The diff for this file is too large to render. See raw diff
 
StyleTTS_Accelerate/Data/train_list.txt ADDED
The diff for this file is too large to render. See raw diff
 
StyleTTS_Accelerate/Data/train_list_libritts.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07ced2d20dc0163f3a25d52c8544f63ffba4e9608664762325832f26376c402f
3
+ size 31691428
StyleTTS_Accelerate/Data/val_list.txt ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LJSpeech-1.1/wavs/LJ022-0023.wav|ðɪ ˌoʊvɚwˈɛlmɪŋ mədʒˈɔːɹᵻɾi ʌv pˈiːpəl ɪn ðɪs kˈʌntɹi nˈoʊ hˌaʊ tə sˈɪft ðə wˈiːt fɹʌmðə tʃˈæf ɪn wʌt ðeɪ hˈɪɹ ænd wʌt ðeɪ ɹˈiːd .|0
2
+ LJSpeech-1.1/wavs/LJ043-0030.wav|ɪf sˈʌmbɑːdi dˈɪd ðˈæt tə mˌiː , ɐ lˈaʊsi tɹˈɪk lˈaɪk ðˈæt , tə tˈeɪk maɪ wˈaɪf ɐwˈeɪ , ænd ˈɔːl ðə fˈɜːnɪtʃɚ , aɪ wʊd biː mˈæd æz hˈɛl , tˈuː .|0
3
+ LJSpeech-1.1/wavs/LJ005-0201.wav|ˌæzˌɪz ʃˈoʊn baɪ ðə ɹᵻpˈoːɹt ʌvðə kəmˈɪʃənɚz tʊ ɪŋkwˈaɪɚɹ ˌɪntʊ ðə stˈeɪt ʌvðə mjuːnˈɪsɪpəl kˌɔːɹpɚɹˈeɪʃənz ɪn ˈeɪtiːn θˈɜːɾi fˈaɪv .|0
4
+ LJSpeech-1.1/wavs/LJ001-0110.wav|ˈiːvən ðə kˈæslɑːn tˈaɪp wɛn ɛnlˈɑːɹdʒd ʃˈoʊz ɡɹˈeɪt ʃˈɔːɹtkʌmɪŋz ɪn ðɪs ɹᵻspˈɛkt :|0
5
+ LJSpeech-1.1/wavs/LJ003-0345.wav|ˈɔːl ðə kəmˈɪɾi kʊd dˈuː ɪn ðɪs ɹᵻspˈɛkt wʌz tə θɹˈoʊ ðə ɹᵻspˌɑːnsəbˈɪlɪɾi ˌɔn ˈʌðɚz .|0
6
+ LJSpeech-1.1/wavs/LJ007-0154.wav|ðiːz pˈʌndʒənt ænd wˈɛl ɡɹˈaʊndᵻd stɹˈɪktʃɚz ɐplˈaɪd wɪð stˈɪl ɡɹˈeɪɾɚ fˈoːɹs tə ðɪ ʌŋkənvˈɪktᵻd pɹˈɪzənɚ , ðə mˈæn hˌuː kˈeɪm tə ðə pɹˈɪzən ˈɪnəsənt , ænd stˈɪl ʌŋkəntˈæmᵻnˌeɪɾᵻd ,|0
7
+ LJSpeech-1.1/wavs/LJ018-0098.wav|ænd ɹˈɛkəɡnˌaɪzd æz wˈʌn ʌvðə fɹˈiːkwɛntɚz ʌvðə bˈoʊɡəs lˈɔː stˈeɪʃənɚz . hɪz ɚɹˈɛst lˈɛd tə ðæt ʌv ˈʌðɚz .|0
8
+ LJSpeech-1.1/wavs/LJ047-0044.wav|ˈɑːswəld wʌz , haʊˈɛvɚ , wˈɪlɪŋ tə dɪskˈʌs hɪz kˈɑːntækts wɪð sˈoʊviət ɐθˈɔːɹɪɾiz . hiː dᵻnˈaɪd hˌævɪŋ ˌɛni ɪnvˈɑːlvmənt wɪð sˈoʊviət ɪntˈɛlɪdʒəns ˈeɪdʒənsiz|0
9
+ LJSpeech-1.1/wavs/LJ031-0038.wav|ðə fˈɜːst fɪzˈɪʃən tə sˈiː ðə pɹˈɛzɪdənt æt pˈɑːɹklənd hˈɑːspɪɾəl wʌz dˈɑːktɚ . tʃˈɑːɹlz dʒˈeɪ . kˈæɹɪkˌoʊ , ɐ ɹˈɛzᵻdənt ɪn dʒˈɛnɚɹəl sˈɜːdʒɚɹi .|0
10
+ LJSpeech-1.1/wavs/LJ048-0194.wav|dˈʊɹɹɪŋ ðə mˈɔːɹnɪŋ ʌv noʊvˈɛmbɚ twˈɛnti tˈuː pɹˈaɪɚ tə ðə mˈoʊɾɚkˌeɪd .|0
11
+ LJSpeech-1.1/wavs/LJ049-0026.wav|ˌɔn əkˈeɪʒən ðə sˈiːkɹᵻt sˈɜːvɪs hɐzbɪn pɚmˈɪɾᵻd tə hæv ɐn ˈeɪdʒənt ɹˈaɪdɪŋ ɪnðə pˈæsɪndʒɚ kəmpˈɑːɹtmənt wɪððə pɹˈɛzɪdənt .|0
12
+ LJSpeech-1.1/wavs/LJ004-0152.wav|ɔːlðˈoʊ æt mˈɪstɚ . bˈʌkstənz vˈɪzɪt ɐ nˈuː dʒˈeɪl wʌz ɪn pɹˈɑːsɛs ʌv ɪɹˈɛkʃən , ðə fˈɜːst stˈɛp təwˈɔːɹdz ɹᵻfˈɔːɹm sˈɪns hˈaʊɚdz vˌɪzɪtˈeɪʃən ɪn sˈɛvəntˌiːn sˈɛvənti fˈoːɹ .|0
13
+ LJSpeech-1.1/wavs/LJ008-0278.wav|ɔːɹ ðˈɛɹz mˌaɪt biː wˈʌn ʌv mˈɛni , ænd ɪt mˌaɪt biː kənsˈɪdɚd nˈɛsᵻsɚɹi tə dˈɑːlɚ mˌeɪk ɐn ɛɡzˈæmpəl.dˈɑːlɚ|0
14
+ LJSpeech-1.1/wavs/LJ043-0002.wav|ðə wˈɔːɹəŋ kəmˈɪʃən ɹᵻpˈoːɹt . baɪ ðə pɹˈɛzɪdənts kəmˈɪʃən ɔnðɪ ɐsˌæsᵻnˈeɪʃən ʌv pɹˈɛzɪdənt kˈɛnədi . tʃˈæptɚ sˈɛvən . lˈiː hˈɑːɹvi ˈɑːswəld :|0
15
+ LJSpeech-1.1/wavs/LJ009-0114.wav|mˈɪstɚ . wˈeɪkfiːld wˈaɪndz ˈʌp hɪz ɡɹˈæfɪk bˌʌt sˈʌmwʌt sɛnsˈeɪʃənəl ɐkˈaʊnt baɪ dᵻskɹˈaɪbɪŋ ɐnˈʌðɚ ɹᵻlˈɪdʒəs sˈɜːvɪs , wˌɪtʃ mˈeɪ ɐpɹˈoʊpɹɪˌeɪtli biː ɪnsˈɜːɾᵻd hˈɪɹ .|0
16
+ LJSpeech-1.1/wavs/LJ028-0506.wav|ɐ mˈɑːdɚn ˈɑːɹɾɪst wʊdhɐv dˈɪfɪkˌʌlti ɪn dˌuːɪŋ sˈʌtʃ ˈækjʊɹət wˈɜːk .|0
17
+ LJSpeech-1.1/wavs/LJ050-0168.wav|wɪððə pɚtˈɪkjʊlɚ pˈɜːpəsᵻz ʌvðɪ ˈeɪdʒənsi ɪnvˈɑːlvd . ðə kəmˈɪʃən ɹˈɛkəɡnˌaɪzᵻz ðæt ðɪs ɪz ɐ kˌɑːntɹəvˈɜːʃəl ˈɛɹiə|0
18
+ LJSpeech-1.1/wavs/LJ039-0223.wav|ˈɑːswəldz mɚɹˈiːn tɹˈeɪnɪŋ ɪn mˈɑːɹksmənʃˌɪp , hɪz ˈʌðɚ ɹˈaɪfəl ɛkspˈiəɹɪəns ænd hɪz ɪstˈæblɪʃt fəmˌɪliˈæɹɪɾi wɪð ðɪs pɚtˈɪkjʊlɚ wˈɛpən|0
19
+ LJSpeech-1.1/wavs/LJ029-0032.wav|ɐkˈoːɹdɪŋ tʊ oʊdˈɑːnəl , kwˈoʊt , wiː hæd ɐ mˈoʊɾɚkˌeɪd wɛɹˈɛvɚ kplˈʌsplʌs wˌɪtʃ hɐdbɪn bˌɪn hˈeɪstili sˈʌmənd fɚðə ðə pˈɜːpəs wiː wˈɛnt , ˈɛnd kwˈoʊt .|0
20
+ LJSpeech-1.1/wavs/LJ031-0070.wav|dˈɑːktɚ . klˈɑːɹk , hˌuː mˈoʊst klˈoʊsli əbzˈɜːvd ðə hˈɛd wˈuːnd ,|0
21
+ LJSpeech-1.1/wavs/LJ034-0198.wav|jˈuːɪnz , hˌuː wʌz ɔnðə saʊθwˈɛst kˈɔːɹnɚɹ ʌv ˈɛlm ænd hjˈuːstən stɹˈiːts tˈɛstᵻfˌaɪd ðæt hiː kʊd nˌɑːt dᵻskɹˈaɪb ðə mˈæn hiː sˈɔː ɪnðə wˈɪndoʊ .|0
22
+ LJSpeech-1.1/wavs/LJ026-0068.wav|ˈɛnɚdʒi ˈɛntɚz ðə plˈænt , tʊ ɐ smˈɔːl ɛkstˈɛnt ,|0
23
+ LJSpeech-1.1/wavs/LJ039-0075.wav|wˈʌns juː nˈoʊ ðæt juː mˈʌst pˌʊt ðə kɹˈɔshɛɹz ɔnðə tˈɑːɹɡɪt ænd ðæt ɪz ˈɔːl ðæt ɪz nˈɛsᵻsɚɹi .|0
24
+ LJSpeech-1.1/wavs/LJ004-0096.wav|ðə fˈeɪɾəl kˈɑːnsɪkwənsᵻz wˈɛɹɑːf mˌaɪt biː pɹɪvˈɛntᵻd ɪf ðə dʒˈʌstɪsᵻz ʌvðə pˈiːs wɜː djˈuːli ˈɔːθɚɹˌaɪzd|0
25
+ LJSpeech-1.1/wavs/LJ005-0014.wav|spˈiːkɪŋ ˌɔn ɐ dᵻbˈeɪt ˌɔn pɹˈɪzən mˈæɾɚz , hiː dᵻklˈɛɹd ðˈæt|0
26
+ LJSpeech-1.1/wavs/LJ012-0161.wav|hiː wʌz ɹᵻpˈoːɹɾᵻd tə hæv fˈɔːlən ɐwˈeɪ tʊ ɐ ʃˈædoʊ .|0
27
+ LJSpeech-1.1/wavs/LJ018-0239.wav|hɪz dˌɪsɐpˈɪɹəns ɡˈeɪv kˈʌlɚ ænd sˈʌbstəns tʊ ˈiːvəl ɹᵻpˈoːɹts ɔːlɹˌɛdi ɪn sˌɜːkjʊlˈeɪʃən ðætðə wɪl ænd kənvˈeɪəns əbˌʌv ɹᵻfˈɜːd tuː|0
28
+ LJSpeech-1.1/wavs/LJ019-0257.wav|hˈɪɹ ðə tɹˈɛd wˈiːl wʌz ɪn jˈuːs , ðɛɹ sˈɛljʊlɚ kɹˈæŋks , ɔːɹ hˈɑːɹd lˈeɪbɚ məʃˈiːnz .|0
29
+ LJSpeech-1.1/wavs/LJ028-0008.wav|juː tˈæp dʒˈɛntli wɪð jʊɹ hˈiːl əpˌɑːn ðə ʃˈoʊldɚɹ ʌvðə dɹˈoʊmdɚɹi tʊ ˈɜːdʒ hɜːɹ ˈɔn .|0
30
+ LJSpeech-1.1/wavs/LJ024-0083.wav|ðɪs plˈæn ʌv mˈaɪn ɪz nˈoʊ ɐtˈæk ɔnðə kˈoːɹt ;|0
31
+ LJSpeech-1.1/wavs/LJ042-0129.wav|nˈoʊ nˈaɪt klˈʌbz ɔːɹ bˈoʊlɪŋ ˈælɪz , nˈoʊ plˈeɪsᵻz ʌv ɹˌɛkɹiːˈeɪʃən ɛksˈɛpt ðə tɹˈeɪd jˈuːniən dˈænsᵻz . aɪ hæv hæd ɪnˈʌf .|0
32
+ LJSpeech-1.1/wavs/LJ036-0103.wav|ðə pəlˈiːs ˈæskt hˌɪm wˈɛðɚ hiː kʊd pˈɪk ˈaʊt hɪz pˈæsɪndʒɚ fɹʌmðə lˈaɪnʌp .|0
33
+ LJSpeech-1.1/wavs/LJ046-0058.wav|dˈʊɹɹɪŋ hɪz pɹˈɛzɪdənsi , fɹˈæŋklɪn dˈiː . ɹˈoʊzəvˌɛlt mˌeɪd ˈɔːlmoʊst fˈoːɹ hˈʌndɹɪd dʒˈɜːniz ænd tɹˈævəld mˈoːɹ ðɐn θɹˈiː hˈʌndɹɪd fˈɪfti θˈaʊzənd mˈaɪlz .|0
34
+ LJSpeech-1.1/wavs/LJ014-0076.wav|hiː wʌz sˈiːn ˈæftɚwɚdz smˈoʊkɪŋ ænd tˈɔːkɪŋ wɪð hɪz hˈoʊsts ɪn ðɛɹ bˈæk pˈɑːɹlɚ , ænd nˈɛvɚ sˈiːn ɐɡˈɛn ɐlˈaɪv .|0
35
+ LJSpeech-1.1/wavs/LJ002-0043.wav|lˈɔŋ nˈæɹoʊ ɹˈuːmz wˈʌn θˈɜːɾi sˈɪks fˈiːt , sˈɪks twˈɛnti θɹˈiː fˈiːt , ænd ðɪ ˈeɪtθ ˈeɪtiːn ,|0
36
+ LJSpeech-1.1/wavs/LJ009-0076.wav|wiː kˈʌm tə ðə sˈɜːmən .|0
37
+ LJSpeech-1.1/wavs/LJ017-0131.wav|ˈiːvən wɛn ðə hˈaɪ ʃˈɛɹɪf hæd tˈoʊld hˌɪm ðɛɹwˌʌz nˈoʊ pˌɑːsəbˈɪlɪɾi əvɚ ɹᵻpɹˈiːv , ænd wɪðˌɪn ɐ fjˈuː ˈaʊɚz ʌv ˌɛksɪkjˈuːʃən .|0
38
+ LJSpeech-1.1/wavs/LJ046-0184.wav|bˌʌt ðɛɹ ɪz ɐ sˈɪstəm fɚðɪ ɪmˈiːdɪət nˌoʊɾɪfɪkˈeɪʃən ʌvðə sˈiːkɹᵻt sˈɜːvɪs baɪ ðə kənfˈaɪnɪŋ ˌɪnstɪtˈuːʃən wɛn ɐ sˈʌbdʒɛkt ɪz ɹᵻlˈiːst ɔːɹ ɛskˈeɪps .|0
39
+ LJSpeech-1.1/wavs/LJ014-0263.wav|wˌɛn ˈʌðɚ plˈɛʒɚz pˈɔːld hiː tˈʊk ɐ θˈiəɾɚ , ænd pˈoʊzd æz ɐ mjuːnˈɪfɪsənt pˈeɪtɹən ʌvðə dɹəmˈæɾɪk ˈɑːɹt .|0
40
+ LJSpeech-1.1/wavs/LJ042-0096.wav|ˈoʊld ɛkstʃˈeɪndʒ ɹˈeɪt ɪn ɐdˈɪʃən tə hɪz fˈæktɚɹi sˈælɚɹi ʌv ɐpɹˈɑːksɪmətli ˈiːkwəl ɐmˈaʊnt|0
41
+ LJSpeech-1.1/wavs/LJ049-0050.wav|hˈɪl hæd bˈoʊθ fˈiːt ɔnðə kˈɑːɹ ænd wʌz klˈaɪmɪŋ ɐbˈoːɹd tʊ ɐsˈɪst pɹˈɛzɪdənt ænd mˈɪsɪz . kˈɛnədi .|0
42
+ LJSpeech-1.1/wavs/LJ019-0186.wav|sˈiːɪŋ ðæt sˈɪns ðɪ ɪstˈæblɪʃmənt ʌvðə sˈɛntɹəl kɹˈɪmɪnəl kˈoːɹt , nˈuːɡeɪt ɹᵻsˈiːvd pɹˈɪzənɚz fɔːɹ tɹˈaɪəl fɹʌm sˈɛvɹəl kˈaʊntiz ,|0
43
+ LJSpeech-1.1/wavs/LJ028-0307.wav|ðˈɛn lˈɛt twˈɛnti dˈeɪz pˈæs , ænd æt ðɪ ˈɛnd ʌv ðæt tˈaɪm stˈeɪʃən nˌɪɹ ðə tʃˈældæsəŋ ɡˈeɪts ɐ bˈɑːdi ʌv fˈoːɹ θˈaʊzənd .|0
44
+ LJSpeech-1.1/wavs/LJ012-0235.wav|wˌaɪl ðeɪ wɜːɹ ɪn ɐ stˈeɪt ʌv ɪnsˌɛnsəbˈɪlɪɾi ðə mˈɜːdɚ wʌz kəmˈɪɾᵻd .|0
45
+ LJSpeech-1.1/wavs/LJ034-0053.wav|ɹˈiːtʃt ðə sˈeɪm kəŋklˈuːʒən æz lætˈoʊnə ðætðə pɹˈɪnts fˈaʊnd ɔnðə kˈɑːɹtənz wɜː ðoʊz ʌv lˈiː hˈɑːɹvi ˈɑːswəld .|0
46
+ LJSpeech-1.1/wavs/LJ014-0030.wav|ðiːz wɜː dˈæmnətˌoːɹi fˈækts wˌɪtʃ wˈɛl səpˈoːɹɾᵻd ðə pɹˌɑːsɪkjˈuːʃən .|0
47
+ LJSpeech-1.1/wavs/LJ015-0203.wav|bˌʌt wɜː ðə pɹɪkˈɔːʃənz tˈuː mˈɪnɪt , ðə vˈɪdʒɪləns tˈuː klˈoʊs təbi ᵻlˈuːdᵻd ɔːɹ ˌoʊvɚkˈʌm ?|0
48
+ LJSpeech-1.1/wavs/LJ028-0093.wav|bˌʌt hɪz skɹˈaɪb ɹˈoʊt ɪɾ ɪnðə mˈænɚ kˈʌstəmˌɛɹi fɚðə skɹˈaɪbz ʌv ðoʊz dˈeɪz tə ɹˈaɪt ʌv ðɛɹ ɹˈɔɪəl mˈæstɚz .|0
49
+ LJSpeech-1.1/wavs/LJ002-0018.wav|ðɪ ɪnˈædɪkwəsi ʌvðə dʒˈeɪl wʌz nˈoʊɾɪst ænd ɹᵻpˈoːɹɾᵻd əpˌɑːn ɐɡˈɛn ænd ɐɡˈɛn baɪ ðə ɡɹˈænd dʒˈʊɹɹiz ʌvðə sˈɪɾi ʌv lˈʌndən ,|0
50
+ LJSpeech-1.1/wavs/LJ028-0275.wav|æt lˈæst , ɪnðə twˈɛntiəθ mˈʌnθ ,|0
51
+ LJSpeech-1.1/wavs/LJ012-0042.wav|wˌɪtʃ hiː kˈɛpt kənsˈiːld ɪn ɐ hˈaɪdɪŋ plˈeɪs wɪð ɐ tɹˈæp dˈoːɹ dʒˈʌst ˌʌndɚ hɪz bˈɛd .|0
52
+ LJSpeech-1.1/wavs/LJ011-0096.wav|hiː mˈæɹid ɐ lˈeɪdi ˈɔːlsoʊ bᵻlˈɔŋɪŋ tə ðə səsˈaɪəɾi ʌv fɹˈɛndz , hˌuː bɹˈɔːt hˌɪm ɐ lˈɑːɹdʒ fˈɔːɹtʃʊn , wˈɪtʃ , ænd hɪz ˈoʊn mˈʌni , hiː pˌʊt ˌɪntʊ ɐ sˈɪɾi fˈɜːm ,|0
53
+ LJSpeech-1.1/wavs/LJ036-0077.wav|ɹˈɑːdʒɚ dˈiː . kɹˈeɪɡ , ɐ dˈɛpjuːɾi ʃˈɛɹɪf ʌv dˈæləs kˈaʊnti ,|0
54
+ LJSpeech-1.1/wavs/LJ016-0318.wav|ˈʌðɚɹ əfˈɪʃəlz , ɡɹˈeɪt lˈɔɪɚz , ɡˈʌvɚnɚz ʌv pɹˈɪzənz , ænd tʃˈæplɪnz səpˈoːɹɾᵻd ðɪs vjˈuː .|0
55
+ LJSpeech-1.1/wavs/LJ013-0164.wav|hˌuː kˈeɪm fɹʌm hɪz ɹˈuːm ɹˈɛdi dɹˈɛst , ɐ səspˈɪʃəs sˈɜːkəmstˌæns , æz hiː wʌz ˈɔːlweɪz lˈeɪt ɪnðə mˈɔːɹnɪŋ .|0
56
+ LJSpeech-1.1/wavs/LJ027-0141.wav|ɪz klˈoʊsli ɹᵻpɹədˈuːst ɪnðə lˈaɪf hˈɪstɚɹi ʌv ɛɡzˈɪstɪŋ dˈɪɹ . ɔːɹ , ɪn ˈʌðɚ wˈɜːdz ,|0
57
+ LJSpeech-1.1/wavs/LJ028-0335.wav|ɐkˈoːɹdɪŋli ðeɪ kəmˈɪɾᵻd tə hˌɪm ðə kəmˈænd ʌv ðɛɹ hˈoʊl ˈɑːɹmi , ænd pˌʊt ðə kˈiːz ʌv ðɛɹ sˈɪɾi ˌɪntʊ hɪz hˈændz .|0
58
+ LJSpeech-1.1/wavs/LJ031-0202.wav|mˈɪsɪz . kˈɛnədi tʃˈoʊz ðə hˈɑːspɪɾəl ɪn bəθˈɛzdə fɚðɪ ˈɔːtɑːpsi bɪkˈʌz ðə pɹˈɛzɪdənt hæd sˈɜːvd ɪnðə nˈeɪvi .|0
59
+ LJSpeech-1.1/wavs/LJ021-0145.wav|fɹʌm ðoʊz wˈɪlɪŋ tə dʒˈɔɪn ɪn ɪstˈæblɪʃɪŋ ðɪs hˈoʊpt fɔːɹ pˈiəɹɪəd ʌv pˈiːs ,|0
60
+ LJSpeech-1.1/wavs/LJ016-0288.wav|dˈɑːlɚ mˈuːlɚ , mˈuːlɚ , hiːz ðə mˈæn , dˈɑːlɚ tˈɪl ɐ daɪvˈɜːʒən wʌz kɹiːˈeɪɾᵻd baɪ ðɪ ɐpˈɪɹəns ʌvðə ɡˈæloʊz , wˌɪtʃ wʌz ɹᵻsˈiːvd wɪð kəntˈɪnjuːəs jˈɛlz .|0
61
+ LJSpeech-1.1/wavs/LJ028-0081.wav|jˈɪɹz lˈeɪɾɚ , wˌɛn ðɪ ˌɑːɹkiːˈɑːlədʒˌɪsts kʊd ɹˈɛdili dɪstˈɪŋɡwɪʃ ðə fˈɔls fɹʌmðə tɹˈuː ,|0
62
+ LJSpeech-1.1/wavs/LJ018-0081.wav|hɪz dᵻfˈɛns bˌiːɪŋ ðæt hiː hæd ɪntˈɛndᵻd tə kəmˈɪt sˈuːɪsˌaɪd , bˌʌt ðˈæt , ɔnðɪ ɐpˈɪɹəns ʌv ðɪs ˈɑːfɪsɚ hˌuː hæd ɹˈɔŋd hˌɪm ,|0
63
+ LJSpeech-1.1/wavs/LJ021-0066.wav|təɡˌɛðɚ wɪð ɐ ɡɹˈeɪt ˈɪŋkɹiːs ɪnðə pˈeɪɹoʊlz , ðɛɹ hɐz kˈʌm ɐ səbstˈænʃəl ɹˈaɪz ɪnðə tˈoʊɾəl ʌv ɪndˈʌstɹɪəl pɹˈɑːfɪts|0
64
+ LJSpeech-1.1/wavs/LJ009-0238.wav|ˈæftɚ ðɪs ðə ʃˈɛɹɪfs sˈɛnt fɔːɹ ɐnˈʌðɚ ɹˈoʊp , bˌʌt ðə spɛktˈeɪɾɚz ˌɪntəfˈɪɹd , ænd ðə mˈæn wʌz kˈæɹid bˈæk tə dʒˈeɪl .|0
65
+ LJSpeech-1.1/wavs/LJ005-0079.wav|ænd ɪmpɹˈuːv ðə mˈɔːɹəlz ʌvðə pɹˈɪzənɚz , ænd ʃˌæl ɪnʃˈʊɹ ðə pɹˈɑːpɚ mˈɛʒɚɹ ʌv pˈʌnɪʃmənt tə kənvˈɪktᵻd əfˈɛndɚz .|0
66
+ LJSpeech-1.1/wavs/LJ035-0019.wav|dɹˈoʊv tə ðə nɔːɹθwˈɛst kˈɔːɹnɚɹ ʌv ˈɛlm ænd hjˈuːstən , ænd pˈɑːɹkt ɐpɹˈɑːksɪmətli tˈɛn fˈiːt fɹʌmðə tɹˈæfɪk sˈɪɡnəl .|0
67
+ LJSpeech-1.1/wavs/LJ036-0174.wav|ðɪs ɪz ðɪ ɐpɹˈɑːksɪmət tˈaɪm hiː ˈɛntɚd ðə ɹˈuːmɪŋhˌaʊs , ɐkˈoːɹdɪŋ tʊ ˈɜːliːn ɹˈɑːbɚts , ðə hˈaʊskiːpɚ ðˈɛɹ .|0
68
+ LJSpeech-1.1/wavs/LJ046-0146.wav|ðə kɹaɪtˈiəɹɪə ɪn ɪfˈɛkt pɹˈaɪɚ tə noʊvˈɛmbɚ twˈɛnti tˈuː , nˈaɪntiːn sˈɪksti θɹˈiː , fɔːɹ dɪtˈɜːmɪnɪŋ wˈɛðɚ tʊ ɐksˈɛpt mətˈɪɹiəl fɚðə pˌiːˌɑːɹɹˈɛs dʒˈɛnɚɹəl fˈaɪlz|0
69
+ LJSpeech-1.1/wavs/LJ017-0044.wav|ænd ðə dˈiːpɪst æŋzˈaɪəɾi wʌz fˈɛlt ðætðə kɹˈaɪm , ɪf kɹˈaɪm ðˈɛɹ hɐdbɪn , ʃˌʊd biː bɹˈɔːt hˈoʊm tʊ ɪts pˈɜːpɪtɹˌeɪɾɚ .|0
70
+ LJSpeech-1.1/wavs/LJ017-0070.wav|bˌʌt hɪz spˈoːɹɾɪŋ ˌɑːpɚɹˈeɪʃənz dɪdnˌɑːt pɹˈɑːspɚ , ænd hiː bɪkˌeɪm ɐ nˈiːdi mˈæn , ˈɔːlweɪz dɹˈɪvən tə dˈɛspɚɹət stɹˈeɪts fɔːɹ kˈæʃ .|0
71
+ LJSpeech-1.1/wavs/LJ014-0020.wav|hiː wʌz sˈuːn ˈæftɚwɚdz ɚɹˈɛstᵻd ˌɔn səspˈɪʃən , ænd ɐ sˈɜːtʃ ʌv hɪz lˈɑːdʒɪŋz bɹˈɔːt tə lˈaɪt sˈɛvɹəl ɡˈɑːɹmənts sˈætʃɚɹˌeɪɾᵻd wɪð blˈʌd ;|0
72
+ LJSpeech-1.1/wavs/LJ016-0020.wav|hiː nˈɛvɚ ɹˈiːtʃt ðə sˈɪstɚn , bˌʌt fˈɛl bˈæk ˌɪntʊ ðə jˈɑːɹd , ˈɪndʒɚɹɪŋ hɪz lˈɛɡz sᵻvˈɪɹli .|0
73
+ LJSpeech-1.1/wavs/LJ045-0230.wav|wˌɛn hiː wʌz fˈaɪnəli ˌæpɹihˈɛndᵻd ɪnðə tˈɛksəs θˈiəɾɚ . ɔːlðˈoʊ ɪɾ ɪz nˌɑːt fˈʊli kɚɹˈɑːbɚɹˌeɪɾᵻd baɪ ˈʌðɚz hˌuː wɜː pɹˈɛzənt ,|0
74
+ LJSpeech-1.1/wavs/LJ035-0129.wav|ænd ʃiː mˈʌstɐv ɹˈʌn dˌaʊn ðə stˈɛɹz ɐhˈɛd ʌv ˈɑːswəld ænd wʊd pɹˈɑːbəbli hæv sˈiːn ɔːɹ hˈɜːd hˌɪm .|0
75
+ LJSpeech-1.1/wavs/LJ008-0307.wav|ˈæftɚwɚdz ɛkspɹˈɛs ɐ wˈɪʃ tə mˈɜːdɚ ðə ɹᵻkˈoːɹdɚ fɔːɹ hˌævɪŋ kˈɛpt ðˌɛm sˌoʊ lˈɔŋ ɪn səspˈɛns .|0
76
+ LJSpeech-1.1/wavs/LJ008-0294.wav|nˌɪɹli ɪndˈɛfɪnətli dᵻfˈɜːd .|0
77
+ LJSpeech-1.1/wavs/LJ047-0148.wav|ˌɔn ɑːktˈoʊbɚ twˈɛnti fˈaɪv ,|0
78
+ LJSpeech-1.1/wavs/LJ008-0111.wav|ðeɪ ˈɛntɚd ɐ dˈɑːlɚ stˈoʊŋ kˈoʊld ɹˈuːm , dˈɑːlɚɹ ænd wɜː pɹˈɛzəntli dʒˈɔɪnd baɪ ðə pɹˈɪzənɚ .|0
79
+ LJSpeech-1.1/wavs/LJ034-0042.wav|ðæt hiː kʊd ˈoʊnli tˈɛstᵻfˌaɪ wɪð sˈɜːtənti ðætðə pɹˈɪnt wʌz lˈɛs ðɐn θɹˈiː dˈeɪz ˈoʊld .|0
80
+ LJSpeech-1.1/wavs/LJ037-0234.wav|mˈɪsɪz . mˈɛɹi bɹˈɑːk , ðə wˈaɪf əvə mɪkˈænɪk hˌuː wˈɜːkt æt ðə stˈeɪʃən , wʌz ðɛɹ æt ðə tˈaɪm ænd ʃiː sˈɔː ɐ wˈaɪt mˈeɪl ,|0
81
+ LJSpeech-1.1/wavs/LJ040-0002.wav|tʃˈæptɚ sˈɛvən . lˈiː hˈɑːɹvi ˈɑːswəld : bˈækɡɹaʊnd ænd pˈɑːsᵻbəl mˈoʊɾɪvz , pˈɑːɹt wˌʌn .|0
82
+ LJSpeech-1.1/wavs/LJ045-0140.wav|ðɪ ˈɑːɹɡjuːmənts hiː jˈuːzd tə dʒˈʌstᵻfˌaɪ hɪz jˈuːs ʌvðɪ ˈeɪliəs sədʒˈɛst ðæt ˈɑːswəld mˌeɪhɐv kˈʌm tə θˈɪŋk ðætðə hˈoʊl wˈɜːld wʌz bᵻkˈʌmɪŋ ɪnvˈɑːlvd|0
83
+ LJSpeech-1.1/wavs/LJ012-0035.wav|ðə nˈʌmbɚ ænd nˈeɪmz ˌɔn wˈɑːtʃᵻz , wɜː kˈɛɹfəli ɹᵻmˈuːvd ɔːɹ əblˈɪɾɚɹˌeɪɾᵻd ˈæftɚ ðə ɡˈʊdz pˈæst ˌaʊɾəv hɪz hˈændz .|0
84
+ LJSpeech-1.1/wavs/LJ012-0250.wav|ɔnðə sˈɛvənθ dʒuːlˈaɪ , ˈeɪtiːn θˈɜːɾi sˈɛvən ,|0
85
+ LJSpeech-1.1/wavs/LJ016-0179.wav|kəntɹˈæktᵻd wɪð ʃˈɛɹɪfs ænd kənvˈiːnɚz tə wˈɜːk baɪ ðə dʒˈɑːb .|0
86
+ LJSpeech-1.1/wavs/LJ016-0138.wav|æɾə dˈɪstəns fɹʌmðə pɹˈɪzən .|0
87
+ LJSpeech-1.1/wavs/LJ027-0052.wav|ðiːz pɹˈɪnsɪpəlz ʌv həmˈɑːlədʒi ɑːɹ ᵻsˈɛnʃəl tʊ ɐ kɚɹˈɛkt ɪntˌɜːpɹɪtˈeɪʃən ʌvðə fˈækts ʌv mɔːɹfˈɑːlədʒi .|0
88
+ LJSpeech-1.1/wavs/LJ031-0134.wav|ˌɔn wˈʌn əkˈeɪʒən mˈɪsɪz . dʒˈɑːnsən , ɐkˈʌmpənid baɪ tˈuː sˈiːkɹᵻt sˈɜːvɪs ˈeɪdʒənts , lˈɛft ðə ɹˈuːm tə sˈiː mˈɪsɪz . kˈɛnədi ænd mˈɪsɪz . kˈɑːnæli .|0
89
+ LJSpeech-1.1/wavs/LJ019-0273.wav|wˌɪtʃ sˌɜː dʒˈɑːʃjuːə dʒˈɛb tˈoʊld ðə kəmˈɪɾi hiː kənsˈɪdɚd ðə pɹˈɑːpɚɹ ˈɛlɪmənts ʌv pˈiːnəl dˈɪsɪplˌɪn .|0
90
+ LJSpeech-1.1/wavs/LJ014-0110.wav|æt ðə fˈɜːst ðə bˈɑːksᵻz wɜːɹ ɪmpˈaʊndᵻd , ˈoʊpənd , ænd fˈaʊnd tə kəntˈeɪn mˈɛnɪəv oʊkˈɑːnɚz ɪfˈɛkts .|0
91
+ LJSpeech-1.1/wavs/LJ034-0160.wav|ˌɔn bɹˈɛnənz sˈʌbsᵻkwənt sˈɜːʔn̩ aɪdˈɛntɪfɪkˈeɪʃən ʌv lˈiː hˈɑːɹvi ˈɑːswəld æz ðə mˈæn hiː sˈɔː fˈaɪɚ ðə ɹˈaɪfəl .|0
92
+ LJSpeech-1.1/wavs/LJ038-0199.wav|ᵻlˈɛvən . ɪf aɪɐm ɐlˈaɪv ænd tˈeɪkən pɹˈɪzənɚ ,|0
93
+ LJSpeech-1.1/wavs/LJ014-0010.wav|jˈɛt hiː kʊd nˌɑːt ˌoʊvɚkˈʌm ðə stɹˈeɪndʒ fˌæsᵻnˈeɪʃən ɪt hˈæd fɔːɹ hˌɪm , ænd ɹᵻmˈeɪnd baɪ ðə sˈaɪd ʌvðə kˈɔːɹps tˈɪl ðə stɹˈɛtʃɚ kˈeɪm .|0
94
+ LJSpeech-1.1/wavs/LJ033-0047.wav|aɪ nˈoʊɾɪst wɛn aɪ wɛnt ˈaʊt ðætðə lˈaɪt wʌz ˈɔn , ˈɛnd kwˈoʊt ,|0
95
+ LJSpeech-1.1/wavs/LJ040-0027.wav|hiː wʌz nˈɛvɚ sˈæɾɪsfˌaɪd wɪð ˈɛnɪθˌɪŋ .|0
96
+ LJSpeech-1.1/wavs/LJ048-0228.wav|ænd ˈʌðɚz hˌuː wɜː pɹˈɛzənt sˈeɪ ðæt nˈoʊ ˈeɪdʒənt wʌz ɪnˈiːbɹɪˌeɪɾᵻd ɔːɹ ˈæktᵻd ɪmpɹˈɑːpɚli .|0
97
+ LJSpeech-1.1/wavs/LJ003-0111.wav|hiː wʌz ɪŋ kˈɑːnsɪkwəns pˌʊt ˌaʊɾəv ðə pɹətˈɛkʃən ʌv ðɛɹ ɪntˈɜːnəl lˈɔː , ˈɛnd kwˈoʊt . ðɛɹ kˈoʊd wʌzɐ sˈʌbdʒɛkt ʌv sˌʌm kjˌʊɹɹɪˈɔsɪɾi .|0
98
+ LJSpeech-1.1/wavs/LJ008-0258.wav|lˈɛt mˌiː ɹᵻtɹˈeɪs maɪ stˈɛps , ænd spˈiːk mˈoːɹ ɪn diːtˈeɪl ʌvðə tɹˈiːtmənt ʌvðə kəndˈɛmd ɪn ðoʊz blˈʌdθɜːsti ænd bɹˈuːɾəli ɪndˈɪfɹənt dˈeɪz ,|0
99
+ LJSpeech-1.1/wavs/LJ029-0022.wav|ðɪ ɚɹˈɪdʒɪnəl plˈæŋ kˈɔːld fɚðə pɹˈɛzɪdənt tə spˈɛnd ˈoʊnli wˈʌn dˈeɪ ɪnðə stˈeɪt , mˌeɪkɪŋ wˈɜːlwɪnd vˈɪzɪts tə dˈæləs , fˈɔːɹt wˈɜːθ , sˌæn æntˈoʊnɪˌoʊ , ænd hjˈuːstən .|0
100
+ LJSpeech-1.1/wavs/LJ004-0045.wav|mˈɪstɚ . stˈɜːdʒᵻz bˈoːɹn , sˌɜː dʒˈeɪmz mˈækɪntˌɑːʃ , sˌɜː dʒˈeɪmz skˈɑːɹlɪt , ænd wˈɪljəm wˈɪlbɚfˌoːɹs .|0
StyleTTS_Accelerate/Data/val_list_libritts.txt ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_19794.wav|aɪ nˈuː aɪ wʌz ɡˌɛɾɪŋ ˌɪntʊ tɹˈʌbəl kˈʌmɪŋ hˈɪɹ , bˌʌt θˈæŋkfəli aɪv ɡˈɑːt juː , dˈɑːktɚ . aɪ wˌʊdəntəv fˈɪɡɚd ˈaʊt hˌaʊ tə ɹᵻzˈɑːlv sˈʌtʃ ɐ kˈɑːmplᵻkˌeɪɾᵻd kˈɑːnflɪkt baɪ maɪsˈɛlf . ˈɔːl ɹˈaɪt , tˈaɪm tə pˈæk ˌʌp maɪ ɡˈʌn . mˈɪʃən kəmplˈiːt , dˈɑːktɚ . wiː ɐtʃˈiːvd vˈɛɹi ...|56
2
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12789.wav|stɹˈaɪk ðˌɛm ! pˈɪɹs ˈɛvɹɪθˌɪŋ . bᵻhˈoʊld ! jʊɹ dˈɛd ! jʊɹ dˈɛθ ! ɪts tˈaɪm , kˈɑːnsəntɹˌeɪɾᵻd . dˈaɪ ! ðæts ðə lˈæst tˈaɪm . juː θˈɪŋk ʌv mˌiː . kˈʌm ˈɔn ! ðæts dʒˈʌst ðə tˈaɪm .|240
3
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_1753.wav|nˈaɪs mˈuːv . kənsˈɪdɚɹɪŋ sˈʌmwʌnz fˈɪzɪkəl kəndˈɪʃən .|122
4
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14206.wav|ˈoʊ , ðeɪɚ dɹˈeɪnɪŋ jʊɹ mˈɛntəl pˈaʊɚ !|247
5
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12264.wav|tˈʌkɪn kjuːzˈuːzi ! ðæt sˈʌkt . dˈoʊnt wˈʌɹi , juː dˈoʊnt hæv təbi skˈɛɹd . dʒˈʌst klˈoʊz jʊɹ ˈaɪz ænd biː tˈeɪkən tə hˈɛvən . aɪl sˈɛnd juː tə hˈɛl ! aɪv wˈʌn ! nˈaʊ , lˈɛts dˈaɪ .|239
6
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_4661.wav|nˈaɪts mˈeɪ biː ˈeɪbəl tə klˈaɪm bˈæk ˌʌp ˈæftɚ ðeɪ fˈɔːl , bˌʌt ðɪ ɪnfˈɛktᵻd dˈoʊnt hæv sˈʌtʃ kˈaɪndhˈɑːɹɾᵻd ˈɑːpʃənz ɐvˈeɪləbəl tə ðˌɛm .|162
7
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12369.wav|juː pɹˈɛs ðɪs swˈɪtʃ tuː ... nˈoʊ , nˈoʊ , nˈoʊ ! ðæts ðə sˈɛlfdᵻstɹˈʌkt bˈʌʔn̩ ! ˈɑː ! ɹˈʌn !|24
8
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10319.wav|wɪɹ wˈɪnɪŋ ! ˈoʊ , nˈaɪs dʒˈɑːb ! ˈoʊ , ðeɪɚ ɡˈʊd !|235
9
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10192.wav|mˈɪtsɚɹˌuː kæn biː tɹˈʌbəl tə fˈaɪt , bˌʌt aɪ nˈoʊ hɜː wˈɛl .|234
10
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7133.wav|hˈeɪ dˈɑːktɚ , ˈɑːɹknaɪts .|195
11
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3889.wav|aɪ wɪl ˈæsk mˈɪstɚ . ɹˈæbɪt tə pɹɪpˈɛɹ wˈʌn mˈoːɹ kˈeɪk .|154
12
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10529.wav|juː kæn fˈɪnɪʃ ðˌɛm ˈɔf !|235
13
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10588.wav|sˈʌmtaɪmz ɪts ɐbˌaʊt lˈʌk . nˈɛkst ɹˈaʊnd !|235
14
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_5410.wav|aɪ ɐɡɹˈiː təbi ɐ fˈaɪt kˌoːɹɪˈɑːɡɹəfɚ fɔːɹ nˈiːnz mˈuːvi .|172
15
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14881.wav|kˈʌm ɐɡˈɛn tə mˌeɪk aʊɚsˈɛlvz nˈoʊn tə ðoʊz hˌuː dᵻfˈaɪ ˌʌs . wˈɛl dˈʌn . ðæt ˈɛndz ˈɛvɹɪθˌɪŋ .|251
16
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18092.wav|tɹˈaɪ sˈʌmθɪŋ , fˈaɪɚ ! wˌɛɹ kʊd juː bˈiː ?|34
17
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_2474.wav|sˈʌmtaɪmz juː dʒˈʌst ɡˈɑːɾə tˈeɪk ˈɔf ðə mˈæsk ænd kˈætʃ ɐ bɹˈiːðɚ .|135
18
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16172.wav|ɪf juː wˈɪn , aɪl ɡˈɪv juː fˈɛðɚ pˈɪŋks ˈɔːɾəɡɹˌæf !|256
19
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7299.wav|ɪt sˈiːmz juː ɑːɹ ɹˈæðɚ lˈæks ɪn meɪntˈeɪnɪŋ jʊɹ ˈoʊn hˈɛlθ , dˈɑːktɚ .|198
20
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_17994.wav|dʒˈʌst bˌiːɪŋ ˈeɪbəl tə hˈoʊld ðə kˈɪɾɪz lˈaɪk ðˈɪs .|33
21
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_2538.wav|ɪts tˈaɪm fɔːɹ juː tə lˈiːv .|136
22
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_4663.wav|maɪ wˈɛpən sˈiːmz təbi ˈæktɪŋ ˈʌp . wˌɪtʃ wˈeɪ ɪz ðə wˈɜːkʃɑːp ? ðæt wˈeɪ ? θˈæŋk juː .|162
23
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16288.wav|wɪɹ ˈoʊnli dˌaʊn baɪ ɐ lˈɪɾəl . lˈɛts kˈætʃ ˈʌp !|256
24
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_13598.wav|bˈɑːndɪŋ ˈækt ɔːlɹˈɛdi ? dˈoʊnt wˈʌɹi . aɪl kˈɪl ˈɛvɹɪwˌʌn ɐtwˈʌns . ðiːz bˈɑːndz juː bᵻlˈiːv ɪn ɑːɹ kwˈaɪt fɹˈeɪl . jʊɹ pˈaʊɚləs ʌnlˈɛs juː ˈækt æz ɐ ɡɹˈuːp , kɚɹˈɛkt ?|244
25
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7444.wav|blˈiːdɪŋ mˈɔːɹɡən . ʃiː mˌaɪt sˈiːm lˈaɪk ɐ lˈeɪɐbˌaʊt mˈoʊstli , bˌʌt ʃiːz ˌaʊɚ tæktˈɪʃən wɛn pˈʊʃ kˈʌmz tə ʃˈʌv . ɡˈɛts ɐ dʒˈɑːb dˈʌn . ɡˈɛts ˌɔn maɪ nˈɜːvz ˈʌðɚwˌaɪz , ðˌoʊ . ænd ʃiː lˈʌvz ɪt .|2
26
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8344.wav|aɪ dˈoʊnt wˈɔnt tə lˈuːz maɪ mˈaɪnd , ʌv kˈoːɹs .|210
27
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21352.wav|nˈɛkst tˈaɪm . dˈɑːktɚ .|73
28
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_605.wav|hiː wʌz stˈɪl wˈɪlɪŋ tə flˈʌf ɪɾ ˌɪntʊ tˈaɪni lˈɪɾəl stˈoːɹiz , dʒˈʌst tə mˌeɪk mˌiː smˈaɪl .|107
29
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16547.wav|ɑːɹ juː ʃˈʊɹ juː dˈoʊnt ɹˈiəli sˈiː ðˌɛm æz tˈuːlz ? ˈɛvɹi lˈæst wˈʌn ʌv juː ɪz ɪn maɪ wˈeɪ . aɪm fˈaɪn baɪ maɪsˈɛlf , sˌoʊ ɡɛt lˈɔst ! hˈɑː hˈɑː hˈɑː ! ænd ˈaɪdəl ? aɪ bˈɛt juː dʒˈʌst lˈʌv bˌiːɪŋ pˈæmpɚd . ɪt fˈiːlz ɡɹˈeɪt tə sˈiː juː .|256
30
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3213.wav|ɪt mˌaɪt biː ɐ lˈɪɾəl tʃˈæləndʒˌɪŋ fɔːɹ ðɪs jˈuːnɪt ɹˈaɪt nˈaʊ .|146
31
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_20665.wav|jʊɹ hˈɪɹ tə pɹˈeɪ ? sˈɑːɹi , aɪm bˈɪzi ɹˈaɪt nˈaʊ .|66
32
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_17916.wav|ˈɔːl ɹˈaɪt , wˈʌns mˈoːɹ . ɡɛt ɐwˈeɪ fɹʌm mˌiː , plˈiːz !|31
33
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10945.wav|ɑːhˈɑː ! ðə pˈɪtʃɚɹ ɪz kˈɜːɹəntli ɐfɹˈeɪd ʌvðə bˈæɾɚ !|236
34
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_1495.wav|ˈoʊ , woʊnt juː wˈɔnt mˌiː lˈiːdɪŋ dˈɑːktɚ ?|12
35
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8645.wav|lˈɛts nˌɑːt fˈiːl dˈaʊn , dˈɑːktɚ . juː wɪl fˈaɪnd ɐ səlˈuːʃən tə bɹˈeɪk ðɪs dˈɛdlɑːk .|214
36
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21677.wav|ˈɪntɹɛstɪŋ fˈeɪsᵻz ɑːɹ lˈaɪk stˈoːɹiz ɪn ðɛɹ ˈoʊn ɹˈaɪt .|76
37
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7735.wav|jʊɹ nˌɑːt jˈuːzd tə mˌiː tˈɔːkɪŋ ðɪs wˈeɪ , bˌʌt aɪm ɐn ˈɑːnɪsttəɡˈʊdnəs nˈaɪt . ʃˈʊɹli aɪ kæn ˈɪmpɹəvˌaɪz ɐ spˈiːtʃ ɔːɹ tˈuː .|203
38
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12417.wav|dˈoʊnt tˈeɪk ˌʌs lˈaɪtli . ðeɪɚ stˈɪl kˈʌmɪŋ . ðə bˈæɾəlz stˈɪl ɡˈoʊɪŋ .|240
39
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_4109.wav|fˈɜːst ˈeɪd hˈɪɹ , tˈeɪk ɪt . aɪl biː ɹˈaɪt ðˈɛɹ !|157
40
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18232.wav|bˌʌt wʊd juː hˈæpən tə nˈoʊ wˌɛɹ pɹɑːvˈɑ̃s ænd ˈɛvɹɪwˌʌn ˈɛls wˈɛnt ?|37
41
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_20440.wav|ðiːz klˈoʊðz hæv jʊɹ ɐtˈɛnʃən , dˈɑːktɚ ? ðeɪ kənfjˈuːz mˌiː dʒˈʌst æz mˈʌtʃ wɛn aɪ fˈɜːst pˌʊt ðˌɛm ˈɔn . ðeɪ hæv ɐ stˈɑːɹɾəlɪŋ dᵻfˈɛnsɪv kəpˈæsᵻɾi , dᵻspˈaɪt ðɛɹ ˈɑːbviəs dɪzˈaɪn .|62
42
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_2160.wav|ˈɑː , sˈuːzən , dˈoʊnt tɹˈuː mˌiː , aɪl tˈoʊld mˌiː . aɪ dˈoʊnt tə θˈɪŋk tə ðə tˈaɪm .|130
43
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9888.wav|biː ɛmbˈæɹəst ɐbˌaʊt hˌaʊ ɪt tˈɜːnd ˈaʊt . aɪ wˈʌn ðɪs fˈaɪt , bˌʌt ðɛɹz nˈoʊ nˈoʊɪŋ hˌaʊ ˌɪɾəl ɡˌoʊ nˈɛkst tˈaɪm .|234
44
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15842.wav|dˈoʊnt wˈʌɹi , dʒˈʌst wˈɑːtʃ ðˌɛm !|255
45
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_17659.wav|ɪts fˈɪlθi . ˈʌɡ . wˌʌt ɑːɹ juː lˈʊkɪŋ æt mˌiː fɔːɹ ? juː wˈɔnt mˌiː tə tʃˈeɪs ðˌɛm ?|28
46
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15844.wav|mˈeɪbiː aɪ kæn ɡɛt sˌʌm ˈænsɚz ɪf aɪ ɡˌoʊ ðˈɛɹ . sˈoʊ , ɪts ˈoʊnli nˈætʃɚɹəl ðæt ðɛɹˌɑːɹ fˈeɪks wˈɔndɚɹɪŋ ɚɹˈaʊnd ɪn hˈɪɹ tˈuː .|255
47
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_5427.wav|ɪɾ ɪz ɪmpˈɑːsᵻbəl tə sˈiː ˈɔːl ðiːz sˈaɪts .|172
48
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21460.wav|jˈɛs , ðə ɹˈiːdz ðæt wʊd bˈɜːn ɐwˈeɪ .|75
49
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_5961.wav|ɪf juː wˈɪʃ mˌiː tʊ ɐtˈɛnd juː , ðæt hˈæpənz təbi maɪ spˈɛʃəlɾi .|18
50
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21354.wav|wˌaɪ dˈoʊnt juː kˈʌm pɹˈæktɪs ðə blˈeɪd wɪð mˌiː ? dˈɑːktɚ , ɹᵻmˈɛmbɚ tə pɹˈæktɪs jʊɹ kˈʌŋ fˈuː .|73
51
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12956.wav|ˈoʊ ɹˈaɪt ! nˈaʊ ! juː kæn bˈiːt ðˌɛm !|241
52
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10951.wav|ðɪs sˈiːmz tə kˌɑːntɹədˈɪkt jʊɹ dɪzˈaɪɚ fɔːɹ sˈɑːlɪtˌuːd .|236
53
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21021.wav|aɪ kˈænt ɹᵻmˈɛmbɚ hɜː fˈeɪs , bˌʌt ðeɪ klˈɪɹli ɹᵻkˈɔːld ðə ɹˈaɪm ʃiː sˈæŋ mˌiː .|70
54
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18479.wav|ðə tˈaɪd stˈɪl kˈʌmz ˈɪn . wiː kˈænt lˈɛt ˌaʊɚ ɡˈɑːɹd dˌaʊn jˈɛt .|4
55
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10072.wav|ðɪs hˈæpənz sˈʌmtaɪmz . dˈoʊnt wˈʌɹi ɐbˈaʊt ɪt .|234
56
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16293.wav|nˈaɪs pˈeɪs , dˌuːɪŋ ɡˈʊd !|256
57
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8232.wav|wˌaɪ dˈʌz ˈɛvɹɪwˌʌn pˈʊl ðiːz wˈɪɹd fˈeɪsᵻz wɛn ðeɪ fˈɪnɪʃ ?|209
58
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_6296.wav|ˈoʊ , sˈɑːɹi .|183
59
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_20989.wav|wɛnˌɛvɚ ðə tɹˈaɪb ɪz ɔnðə mˈuːv , ˈɛvɹɪwˌʌn wˈʌɹiz wˈɛðɚ wiːl fˈaɪnd ɐ ɡˈʊd plˈeɪs tə sˈɛɾəl dˈaʊn .|7
60
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7065.wav|θˈæŋks . θˈɪŋz ɑːɹ ɔnðə ɹˈaɪt tɹˈæk nˈaʊ .|194
61
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15376.wav|dˈɪd juː plˈæn ðˈæt ? aɪ wˈɪʃ ðɛɹd biː ɐn ˈʌpsɛt .|253
62
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16663.wav|ðɪs wɪl fˈaɪnəli ˈɛnd θˈɪŋz . ɡˈʊd lˈʌk tə juː .|257
63
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16019.wav|aɪ wʌz sˌoʊ klˈoʊs , aɪm sˈɑːɹi . vˈɪktɚɹi ɪz aʊɚz !|255
64
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14790.wav|tə ɡˌoʊ wˈaɪld sˈʌmtaɪmz . aɪl ˌɪntɹədˈuːs juː tə ðˌɛm nˈɛkst tˈaɪm . ɹˈaʊnd ? wˈʌn , tˈuː , θɹˈiː , fˈoːɹ , fˈaɪv , sˈɪks , sˈɛvən , ˈeɪt , ˈeɪt , nˈaɪn , tˈɛn , tˈɛn .|250
65
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12131.wav|huːˈɛvɚ blˈɪŋks fˈɜːst wɪl lˈuːz !|239
66
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_1143.wav|wiː ɡˈeɪn pəzˈɛʃənz , stˈæɾəs ænd ɐ dˈiːsənt lˈaɪf .|115
67
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10829.wav|juː mˈʌst kˈaʊntɚɹɐtˌæk !|236
68
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_17786.wav|ˈɔːl juː nˈiːd tə nˈoʊ ɪz ðæt aɪm nˌɑːt ˈæftɚ jʊɹ lˈaɪf fɔːɹ nˈaʊ , dˈɑːktɚ . ɑːɹ juː ʃˈʊɹ aɪ nˈiːd tə wˈɑːtʃ ðˈiːz ? bˈæk ɪn kˈæsdɛl , juː jˈuːzd tə lˈiːd maɪ pˈiːpəl . hˈʌ , ɡɹˈeɪt .|3
69
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_11064.wav|aɪ sˈiː , nˈaʊ ðɪs ɪz kwˈaɪt ˈɪntɹɛstɪŋ . aɪm sˈɑːɹi , bˌʌt aɪm stˈɑːɹɾɪŋ tə bɪkˌʌm ɹˈæðɚ bˈoːɹd . ɪf aɪ kəntˈɪnjuː klˈaɪmɪŋ ðɪs tˈaʊɚ , wɪl ɐ hˈɑːɹtθɹˈɑːbɪŋ ɹˌiːjˈuːniən wɪð ðæt jˈʌŋ mˈæn fɹʌm ˈɜːlɪɚ biː ɐwˈeɪɾɪŋ mˌiː ? ˈoʊ mˈaɪ , aɪ hæd ɡˈɑːʔn̩ kəmplˈiːtli bˈoːɹd .|236
70
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3913.wav|aɪm hˈʌŋɡɹi . ˈɛvɹɪwˌʌn wɪl lˈʌv ðə blˈuː bˈʌbəl ɡˈɑːɹdən .|154
71
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18559.wav|aɪm nˌɑːt ðæt ɡˈʊd æt smˈɔːl tˈɔːk .|41
72
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_1869.wav|aɪl tɹˈaɪ sˈʌmhaʊ . aɪ wˈʊdənt mˌeɪk ɐ ɡˈʊd ɛksplˈoːɹɚ .|125
73
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8106.wav|aɪ mˈʌst ɹᵻbˈɪld ɪt fɚðə tʃˈɪldɹən . æz lˈɔŋ æz ðeɪ kæn hæv ɐ ʃˈɛltɚ tə kˈɔːl hˈoʊm , ˌɛni plˈeɪs wɪl dˈuː .|208
74
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_5399.wav|aɪ dɪdnˌɑːt sˈiː juː æt ðɪs mˈɔːɹnɪŋz tɹˈeɪnɪŋ sˈɛʃən .|172
75
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9998.wav|ʃiː dˈʌzənt sˈiːm jˈuːzd tə bˈæɾəl jˈɛt . fˈɪnɪʃ hɜː kwˈɪkli .|234
76
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8594.wav|baɪ mˈaɪlz . bˈoʊθ ɹˈeɪθɪən ˈɑːɹ ænd dˈiː pɹˈɑːdʒɛkts ænd wˈɜːkɪŋ təɡˌɛðɚ wɪð juː hɐvbɪn dᵻlˈaɪtfəl ɛkspˈiəɹɪənsᵻz .|214
77
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_5889.wav|aɪ ɹˈiəli ɛndʒˈɔɪ plˈeɪɪŋ wɪð kˈɪdz . bˈæk ɪn maɪ hˈoʊmtaʊn , aɪ tˈʊk maɪ bɹˈʌðɚz ænd sˈɪstɚz slˈɛdɪŋ ˈɛvɹi jˈɪɹ .|178
78
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12846.wav|ðɪs ɪz kwˈaɪt ɐ fˈaɪt kˈɑːɹd . kˌoʊdɐmˈɑːɾoʊ ɪz ɛksˈaɪɾᵻd , tˈuː .|240
79
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10326.wav|ˈoʊ ! ˈoʊ ! ðæt wʌz kˈuːl !|235
80
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8878.wav|lˈʊkɪŋ ɡˈʊd . wɪɹ ˌɔn ɐ ɹˈoʊl , hˈʌ ? ðæt ˈiːzi , hˈʌ ? aɪ kæn tˈɛl baɪ jʊɹ fˈeɪs . ˈɑː , ðæt wʌz fˈæst .|220
81
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_11613.wav|dˈæm , ˈiːvən aɪm bɹˈeɪkɪŋ ˈaʊt ɪnðə swˈɛt . dʒˈʌst ɡˌɛɾɪŋ wˈʌn mˈoːɹ hˈɪt ænd jʊɹ dˈʌn ! dˈuː ɪt !|238
82
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_23554.wav|həm . ɪf mˈɛmɚɹi sˈɜːvz .|99
83
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9291.wav|wiː nˈiːd tə tˈeɪk ɐdvˈæntɪdʒ ʌv ˌaʊɚ stɹˈɛŋθs . ðeɪ fˈaʊnd ˌʌs . wiːv bˌɪn ɛkspˈoʊzd . wˈɑːtʃ ˈaʊt ! ðeɪ spˈɑːɾᵻd ˌʌs ? wiːv bˌɪn spˈɑːɾᵻd . ðeɪ fˈaʊnd ˌʌs .|229
84
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3666.wav|aɪd lˈaɪk tə pˈɪtʃ ɪn sˌʌm ʌv maɪ ˈoʊn stɹˈɛŋθs nˈaʊ .|151
85
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_421.wav|aɪ dˈoʊnt nˈoʊ hˌaʊ aɪ ˈɛndᵻd ˌʌp ˌɔn ðɪs tˈiːm ˈiːðɚ . ˈɔːl aɪ wˈɔntᵻd wʌz ɐn ˈɔːɹdɪnˌɛɹi lˈaɪf . wˌaɪ dˈɪd aɪ ˈɛnd ˌʌp ɪn tʃˈɑːɹdʒ ʌv ðiːz pɹˈɑːbləm tʃˈɪldɹən ?|104
86
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7705.wav|juː wˈɔnt mˌiː tə lˈiːd ?|202
87
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16124.wav|ðeɪɚɹ ɐ pˈaʊɚfəl əpˈoʊnənt . lˈɛts stˈeɪ fˈoʊkəst .|255
88
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3668.wav|juː θˈɪŋk ðæts wˈɪɹd ? juː lˈɜːn ɐ lˈɑːt wɛn jʊɹ ɪn ɹˈoʊdz ˈaɪləndz ˌɛndʒɪnˈɪɹɪŋ dᵻpˈɑːɹtmənt .|151
89
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_20155.wav|dʒˈʌst ˈɛvɹi wˈʌns ɪn ɐ wˈaɪl , aɪ fˈiːl ɹˈiəli hˈæpi wɛn aɪ ɡɛt kˈɑːmplɪmənts fɹʌm juː .|6
90
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15030.wav|ɪf juː lˈuːz , juːl biː ɐ lˈæfɪŋ stˈɑːk . ðɪs ɹˈaʊnd wɪl biː ɐ sˈɪntʃ fɔːɹ juː , ɹˈaɪt ?|253
91
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18378.wav|ɪz hɑːɹmˈoʊniə stˈɪl nˌɑːt ɡˌɛɾɪŋ ˌɛni bˈɛɾɚ ?|38
92
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21171.wav|jˈɛs , dˈuː aɪ kənsˈɪdɚ hɜːɹ ɐn ˈɛnəmi ?|72
93
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_11005.wav|ˈɔːl ðɛɹ ɪz nˈaʊ ɪz tə pɹɪpˈɛɹ joːɹsˈɛlf .|236
94
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7076.wav|həm , oʊkˈeɪ . maɪ bˈɑːdili flˈuːɪdz ˌɑːɹnt ðɪ ˈoʊnli θˈɪŋ ðæts tʃˈɪlɪŋ .|194
95
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_22119.wav|ænd ðeɪ wɪl nˈɛvɚ kəmplˈeɪn . hˈeɪ , aɪd sˈeɪ , lˈɛts plˈeɪ ðæt ɹˈɛkɚd , ðə wˈʌn wiː ˈɔːl lˈaɪk .|82
96
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9677.wav|æm aɪ bˌiːɪŋ mˈæspɹədˈuːst ? ðæt kˈænt bˈiː ! jʊɹ əpˈoʊnənt ɪz dˈuːmd . ˈeɪ .|233
97
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_1674.wav|aɪ stˈɪl hˈævənt fˈɪɡɚd ˈaʊt wʌt dʒˈʌstɪs ɹˈiəli ɪz .|121
98
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_23608.wav|ɪts ɐ fˈʌni lˈɪɾəl θˈɪŋ ðæt aɪ ɡɛt tə fˈaɪt ɐlˈɔŋsaɪd ðɪs vˈɜːʒən ʌv juː .|99
99
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18551.wav|hˈeɪ , lˈɛts ɡˌoʊ fɚɹə dɹˈɪŋk . ɪts ˈɔːlweɪz ˈæʃli hˌuː dɹˈɪŋks wɪð mˌiː ænd ðæts dʒˈʌst dˈʌl . dˈɑːktɚ ! kwˈɛstʃən ! kæn aɪ tˈeɪk ɐ lˈʊk æɾ ˈɑːpɚɹˌeɪɾɚ nˈɜːlz bˈæɾəl ɹˈɛkɚdz ?|41
100
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10123.wav|aɪ wʌz hˈɛlpɪŋ tˈuː . dˈæm ɪt !|234
101
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_19422.wav|sˌoʊ aɪ mˈʌst ˈæsk .|51
102
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9579.wav|həm , juː kʊd biː bˈædli ˈɪndʒɚd ɪf juː duːnˌɑːt tˈeɪk kˈɛɹ .|233
103
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_11443.wav|aɪ swˈɛɹ aɪ dʒˈʌst fˈɛlt ɐ hjˈuːdʒ tʃˈɪl dˌaʊn maɪ spˈaɪn . wˈeɪt ɐ sˈɛk hˈɪɹ . aɪ θˈɪŋk aɪ bɹˈɔːt ɐ bˈæt tʊ ɐn ˈæks fˈaɪt . jˈaɪks ! ðə fˈeɪks ˈɛvɹi bˈɪt æz ɪntˈɛns æz ðə ɹˈiːəl wˌʌn . ˈʌ , ɪts hˈɑːɹd tə tˈeɪk juː sˈɪɹiəsli lˈʊkɪŋ lˈaɪk ðˈæt . tˈɑːdˈɑː ! ɐ tˈoʊɾəl vˈɪktɚɹi fɔːɹ mˌiː !|238
104
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_2542.wav|wˌʌt ɪnðə wˈɜːld ɪz ɪt ðæt aɪ dˈuː ?|136
105
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_917.wav|ˈoʊ .|110
106
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18636.wav|bɹˈɑːvoʊ , dˈɑːktɚ . aɪ wʌz ɹˈaɪt tə pˈɑːɹtnɚ wɪð juː .|42
107
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_5373.wav|ʌnstˈɛdi pˈɔstʃɚ , wˈeɪvɚɹɪŋ ɪntˈɛnt , mˈʌtʃ tə lˈɜːn jˈɛt .|172
108
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3676.wav|pˈɪloʊ , lˈɛts dˈuː ˌaʊɚ bˈɛst .|151
109
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_23396.wav|juː ʃˌæl nˌɑːt pˈæs .|97
110
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3717.wav|aɪd bˈɛɾɚ stˈænd wˈɑːtʃ kwˈaɪətli . nˈaɪs tə mˈiːt juː , dˈɑːktɚ . kˈɔːl mˌiː kˈɔːɹkiz . ɪts ðə fˈɜːst tˈaɪm wiːv mˈɛt , bˌʌt ðɪ ˈɪntɛl ˌɔn jʊɹ dˈɛsk ɐbˌaʊt nˈɔːɹðɚn vɪktˈoːɹiə ɪz stɹˈeɪt fɹʌm mˌiː .|152
111
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_13973.wav|ðɪ əpˈoʊnənt ɪz dˈɛspɚɹət tuː . bˈɛnd ðˌɛm ˈɔf .|247
112
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15286.wav|tˈuː sˈoːɹdz ænd ɐ pɜːsˈoʊnə . biː pˈɪɾifəl ɪf hiː lˈɔst , hˈʌ ?|253
113
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8931.wav|hˈɛlp mˌiː ! ᵻlˈɪmᵻnˌeɪɾɪŋ θɹˈɛts . tˈuː lˈeɪt fɔːɹ ɹᵻɡɹˈɛts . ðɪs ɪz nˈʌθɪŋ ! nˈaʊ juːv dˈʌn ɪt ! aɪ nˈiːd hˈiːlɪŋ ! ðə fˈaɪɚ ! aɪ kˈænt mˈuːv lˈaɪk ðˈɪs ! ᵻlˈɛktɹᵻfˌaɪd ! ɡˌɛɾɪŋ dˈɪzi . maɪ hˈɛd ! aɪ fˈiːl . nˈoʊ fˈɪɹ . maɪ pɜːsˈoʊnə hˈæzbiːn sˈiːld !|221
114
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_6952.wav|wˈɛl , tˈaɪni sˈɪlvɛstɚz nˈaɪtklʌb lˈiːdɚ , plˈeɪnteɪl naɪsˈoʊnə , kəntˈɪnjuːɪŋ ðə stɹˈaɪd .|192
115
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9147.wav|ðæts nˌɑːt wˈaɪz . ɹᵻtɹˈiːɾɪŋ ! dˈoʊnt fˈɔːl bᵻhˈaɪnd ! nˈoʊ , nˌɑːt jˈɛt ! nˈoʊ ! aɪ woʊnt lˈɛt juː ! dʒˈoʊkɚ ! bˈæk ɪnðə fˈaɪt ! hˈæŋ ɪn ðˈɛɹ ! kˈʌm ˈɔn ! ðɪs mˌaɪt stˈɪŋ ! aɪ kæn ɡɛt ɐ hˈɪt ˈɪn ! nˈaɪs flˈoʊ ! lˈɛt mˌiː hˈɛlp ! oʊkˈeɪ ? aɪl θɹˈæʃ !|227
116
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8550.wav|pˈeɪ fɔːɹ ðˌaɪ sˈɪnz wɪð ðˌaɪ blˈʌd . ðə lˈɔː ?|213
117
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15957.wav|ɔːlɹˈaɪt , wiː ɡɑːt ðə fˈɜːst hˈɪt . ðɪs ɪz ɐ wˈʌnsˈaɪdᵻd fˈaɪt .|255
118
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7865.wav|aɪ ʃˌʊd biː ˈeɪbəl tə dˈuː ɪt . slˈoʊli ænd kˈɛɹfəli .|204
119
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_11888.wav|wˈoʊ , ðeɪɚ sˈɪɹiəs nˈaʊ !|239
120
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_1897.wav|aɪ pɹˈɑːmɪst hɜːɹ aɪd pɹətˈɛkt juː . ɪz ðˈɪs ?|125
121
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_428.wav|ɪt wʊd biː bˈɛɾɚ tə hæv mˈoːɹ mˈɪʃənz .|105
122
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_4062.wav|ðeɪ ʃˌæl pˈeɪ fɔːɹ ðˈɪs .|156
123
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21292.wav|bˌʌt ðə hˈɑːɹd pˈɑːɹt ɪz , juː mˈʌst fˈɜːst fˈeɪs ðæt wˌɪtʃ lˈaɪz ɪn jʊɹ hˈɑːɹt bᵻfˌoːɹ juː kæn ɹˈiəli lˈɛt ɪt ɡˈoʊ . aɪ hæv bˈoʊθ hˈeɪtɹɪd ænd ɡɹˈæɾɪtˌuːd təwˈɔːɹdz ðə ɡɹˈændmæstɚ .|73
124
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16018.wav|ˈoʊ ! wˈaɪ ?|255
125
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_6380.wav|wˈɛl , wˌʌt ˈɛls mˌaɪt ðɪs dɪzˈiːz dˈuː tə mˌiː ?|185
126
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_22784.wav|ɑːɹ juː tɹˈaɪɪŋ tə ɡɛt jʊɹ hˈænd bˈɜːnt ? dˈɑːktɚ , kʊd juː ɛksplˈeɪn ðɪs θˈiəɹi tə mˌiː ? aɪl nˈiːd ... twˈɛlv mˈɪnɪts ʌv jʊɹ tˈaɪm . ðæts ˈɔːl .|89
127
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_17893.wav|juːv bˌɪn lˈʊkɪŋ ˈæftɚ maɪ ɛmplˈɔɪiːz fɔːɹ mˌiː . ðeɪ hˈævənt mˌeɪd tɹˈʌbəl fɔːɹ juː , hˈæv ðeɪ ?|31
128
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3501.wav|ðeɪl biː mˈʌtʃ mˈoːɹ dˈʌn ɔnðə pɹˈɑːbləmz ðɪ ɪnfˈɛktᵻd fˈeɪs ɪnðə lˈʌŋmɛn slˈʌmz .|15
129
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16077.wav|ɪts tˈaɪm tə ʃˈoʊ ˈɔf jʊɹ mˈænlinəs , kˈændʒi . kˈɑːndʒi , hˈɪt hˌɪm wɪð ᵻlɛktɹˈɪsᵻɾi tə ɡɛt ðɪ ɐdvˈæntɪdʒ .|255
130
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18376.wav|ˈɑː , aɪ kæn stˈɪl smˈɛl ˈɜːbz ˌɔn maɪ klˈoʊðz .|38
131
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9357.wav|fˈiːl lˈaɪk jʊɹ ɡˌənə fˈeɪnt .|23
132
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3971.wav|maɪ fɹˈɛndz , æz lˈɔŋ æz wiː stˈænd təɡˈɛðɚ , ðɛɹ wɪl biː nˈoʊ ˈiːvəl wiː kˈænt ˌoʊvɚkˈʌm .|155
133
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14197.wav|juː kˈænt sˈʌmən jʊɹ pɜːsˈoʊnə . hˈoʊld ˈaʊt fɔːɹ nˈaʊ !|247
134
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14050.wav|ðæt kˈɔːzd ˌʌs . jʊɹ sˈɪɹiəsli hˈɜːt . juːv tˈeɪkən kəntɹˈoʊl ʌvðə bˈæɾəl .|247
135
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_20480.wav|kˈʌmfi ænd wˈɔːɹm , ɹˈaɪt ? aɪ tˈoʊld nˈɑːnə ˈɔːl ɐbˈaʊt juː , ænd ʃiː kˈɔːld juː ɐ ɡˈʊd ˈɛɡ .|63
136
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14360.wav|ʃiː hɐz ðɪ ɐdvˈæntɪdʒ ʌv klˈoʊs . biː vˈɪdʒɪlənt .|247
137
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10119.wav|ˈɑː , stˈɑːɹɾɪŋ təmˈɑːɹoʊ , aɪm tɹˈeɪnɪŋ juː tə ɡɛt ɹᵻvˈɛndʒ .|234
138
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_6517.wav|ðeɪ woʊnt biː ˈeɪbəl tə blˈɑːk ðˈɪswˌʌn . juːv ɡˈɑːt ðˌɛm .|187
139
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_20709.wav|mˈeɪ ðə hˈoʊli mˈaʊnt kˈɑːɹlɪn biː wɪð ˌʌs .|66
140
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12099.wav|hˈeɪ , jʊɹ pɜːsˈoʊnə sˈiːld ! jʊɹ kənfjˈuːzd ! lˈɛft ænd ɹˈaɪt ɑːɹ ɹᵻvˈɜːst !|239
141
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_5076.wav|wˈiːpɪŋ ænd θɹˈoʊɪŋ ˈʌp . skˈɛɹd aɪ dˈɪdnt dˈuː wˈɛl ɪnˈʌf ðæt aɪ wʊd stˈɑːɹv tə dˈɛθ ɪn sˌʌm fɚɡˈɑːʔn̩ kˈɔːɹnɚɹ ʌnnˈoʊn tʊ ˈɛnɪwˌʌn .|168
142
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9670.wav|nˈoʊ , lˈɛts fˈaɪt fˈɛɹ . ɡɛt klˈoʊs ænd bˈiːt ðˌɛm tʊ ɐ pˈoʊp , tʃˈaɪsˌæn . biː kˈɔːʃəs ʌv klˈoʊs kˈɑːmbæt wɪð hɜː . tɹˈaɪ θɹˈoʊɪŋ ðoʊz fˈænz wˈʌn ˈæftɚ ðɪ ˈʌðɚ . wiː mˈʌst ɛŋɡˈeɪdʒ ɪn klˈoʊs kˈɑːmbæt . ˌoʊvɚwˈɛlm jʊɹ ˈɛnəmi .|233
143
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_19940.wav|dˈaʊn , pˈɔːz .|58
144
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9188.wav|mˈoʊst plˈeɪsᵻz ˌɑːɹnt bˈɪɡ ɪnˈʌf , ðˌoʊ . jʊɹ ʃˈoʊldɚz mˈʌst biː stˈɪf , bˌʌt ɪt fˈɪts juː wˈɛl .|228
145
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14902.wav|bɹˈɪŋ ɪɾ ˈɔn ! aɪ woʊnt lˈɛt juː tˈeɪk ðə ɹˈoʊl ʌv ɐdˈoːɹəbəl mˈæskɑːt . ðɪs ˈeɪdʒ nˈiːdz ɐ bˈɛɹ lˈaɪk mˌiː , nˌɑːt dʒˈʌst ɐ plˈeɪn ˈoʊld dˈɑːɡi . juː hæv ɐ lˈɑːt tə lˈɜːn bᵻfˌoːɹ juː kæn wˈɪn ɐɡˈɛnst mˌiː . aɪ ɐksˈɛpt ɹᵻmˈætʃᵻz twˈɛnti fˈoːɹ ˈaʊɚz ɐ dˈeɪ . ɐ bɚɹˈɑːʒ .|251
146
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9089.wav|aɪ ɛndʒˈɔɪ maɪ tˈaɪm ɪn ðɪs sˈuːt æz wˈɛl . wˌʌt ɪz mˌeɪkɪŋ ðoʊz lˈaɪnz ɡlˈoʊ ? jʊɹ ˌæbsəlˈuːtli ɹˈaɪt . aɪ θˈɪŋk wɪɹ dˈuː fɚɹə tˈiː bɹˈeɪk . ɹˈiəli ? aɪl biː ʃˈʊɹ tə tɹˈaɪ ðˈæt . sˈʌmθɪŋ stɹˈɔŋ .|225
147
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_17226.wav|ðæts ðə mˈaɪnd əvən ˈɑːɹɾɪst , hˈʌ ? ˈʌ , nˈoʊ ? wˌʌt kˈaɪnd ʌv mˈaɪnd juː ɡˈɑːt , mˈæn ? aɪ nˈoʊ , ɹˈaɪt ? aɪ dˈoʊnt nˈoʊ wˌaɪ pˈiːpəl wˈeɪst mˈʌni ˌɔn fˈænsi wˈɔːɾɚ . ˈɑː , sˈaɪklɪŋ , hˈʌ ? aɪm kˈaɪnd ʌv ˌɪntʊ ɪt , tˈuː , bˌʌt , ˈʌ ... ðæt ʃˈɪt ɡˈɛts ɛkspˈɛnsɪv .|260
148
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_6629.wav|ɐ nˈɑːk ɔnðə dˈoːɹ wɪl biː ɐpɹˈiːʃɪˌeɪɾᵻd . mˈɪs .|189
149
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_6906.wav|ˈoʊ , ɹˈoʊd ˈaɪləndz bˈeɪs ɪz ˈaʊtfɪɾᵻd sˈoʊ mˌʌtʃ bˈɛɾɚ ðɐn maɪ lˈɪɾəl ˈɑːfɪs .|191
150
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15594.wav|juː . ɡˈʊd kˈɔːl . ˈɔːl ɹˈaɪt , juː dˈɑːdʒd ɐ bˈʊlɪt ðˈɛɹ . wˌʌt ðə hˈɛl ? dˈæm ɪt , ðæt ˈeɪnt fˈɛɹ . ðeɪ wˈɔːɹmd ðɛɹ wˈeɪ ˌaʊɾəv ɪt . dˈæm ɪt , juːl hæv tə tɹˈaɪ ðæt ɐɡˈɛn . nˈʌθɪŋ ˈɛls juː kˌʊdɐv dˈʌn ðˈɛɹ . hˈɑː ! skɹˈuː juː ! ˈoʊ kɹˈæp ! ˈɔːl ɹˈaɪt , nˈaʊz|254
151
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_852.wav|bˌʌt æt tˈaɪmz , tˈuː ɡɹˈeɪt ɐ fɪksˈeɪʃən kæn biː ˌaʊɚɹ ʌndˈuːɪŋ .|110
152
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_22220.wav|ɪz ðɛɹ ˈɛnɪθˌɪŋ juːd lˈaɪk tə dɪskˈʌs ?|84
153
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_2440.wav|stˈeɪ kˈɑːm .|134
154
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14045.wav|ðæt wˈɜːkt nˈaɪsli .|247
155
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_4402.wav|fɚðə mˈɑːɹtʃɪŋ , ɐ wˈɔːɹ sˈɔŋ , fɚðə fˈɔːlən .|16
156
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10269.wav|lˈɛts ɡɛt ðɪs sˈɛɾəld . hˈeɪ ! wˌʌts ɐ dˈiːl ? aɪl ʃˈoʊ juː ðə ɹɪzˈʌlts ʌv maɪ tɹˈeɪnɪŋ . aɪ kæn hˈoʊld maɪ ˈoʊn , juː nˈoʊ . lˈʊk ˈaʊt bɪkˈʌz aɪm nˌɑːt hˈoʊldɪŋ bˈæk . kˈʌm ænd ɡˈɛt mˌiː !|235
157
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12288.wav|ʃˈoʊ ˌʌs ðə pˈaʊɚɹ ʌv mˈiːt , tʃˈaɪ sˈɛmpaɪ !|239
158
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21080.wav|ˌʌndɚ ðɪs hˈɛdɪŋ , juː hæd ðə θˈɜːd ɹˈoʊ ɔnðə lˈɛft hˈɪɹ ɹˈɔŋ .|71
159
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16944.wav|ðæt wʌzɐ klˈoʊs bˈæɾəl . vˈɪktɚɹi ˈɔːlweɪz fˈiːlz ɡɹˈeɪt !|257
160
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18021.wav|ðɪs ɪz ɐ fˈeɪvɚɹ aɪl nˈɛvɚ fɚɡˈɛt . pɹəmˈoʊʃən ? ˈoʊ , aɪm nˌɑːt pɹɪpˈɛɹd fɔːɹ ðˈɪs .|33
161
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8312.wav|aɪ wˈʌzn̩t ˈiːvən ɡˈɪvɪŋ maɪ ˈɔːl .|21
162
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_4736.wav|ɪt tˈoːɹ maɪ fˈæmɪli ɐpˈɑːɹt ænd ðiːz slˈʌmbɚfˌʊts ɑːɹ ðɪ ˈoʊnli wˈʌnz lˈɛft tə ɹˈoʊm wɪð mˌiː .|164
163
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_16142.wav|ˈoʊ , wiː lˈɔst , bˌʌt dˈoʊnt pˈænɪk . stˈeɪ kˈɑːm .|256
164
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_22670.wav|jʊɹ tʃˈiːk sˈuːðd baɪ ðə kɹˈeɪdlɪŋ wˈeɪvz .|88
165
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_19981.wav|sˈʌtʃ ɐz wiːv ɐɡɹˈiːd . ˈɛnɪθˌɪŋ ðæt sˈiːmz təbi ɡˈɪvən fɔːɹ fɹˈiː wɪl ᵻvˈɛntʃuːəli dᵻmˈænd sˌʌm pɹˈaɪs ʌv juː .|59
166
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_17764.wav|ðoʊz ɡɹˈeɪ ænd wˈaɪt tɹˈiːz wˈɪðɚɹ ɪnðə fɹˈɪdʒɪd wˈɪnd , mˈɛn stɹˈʌɡəl tə ɹᵻɡˈeɪn ðɛɹ vaɪtˈælᵻɾi ðə fˈɑːloʊɪŋ jˈɪɹ .|3
167
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_7475.wav|ɪf ðeɪɚ ɡˈɔn , wɪl aɪ stˈɪl biː ʌv ˌɛni jˈuːs ?|20
168
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14092.wav|jʊɹ hˈoʊldɪŋ ðˌɛm bˈæk , bˌʌt nˌɑːt baɪ mˈʌtʃ . ðeɪɚɹ ɐhˈɛd ʌv juː . kˈiːp ˈʌp wɪð ðˌɛm .|247
169
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_6175.wav|aɪ sˈɪmpli wˈɪʃ tə lˈɪv æz ɐ ɹˈɛɡjʊlɚɹ ɪnfˈɛktᵻd .|182
170
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18284.wav|nˈaʊ aɪm ʃˈʊɹ ðæt ˈɛvɹɪθˌɪŋ ðæt aɪv dˈʌn ɪz tɹˈuːli mˈiːnɪŋfəl .|37
171
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_8079.wav|hæv juː ɹˈɛstᵻd wˈɛl , dˈɑːktɚ ?|207
172
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18629.wav|maɪ fˈæmɪli dˈʌzənt ɛɡzˈɪst ˌɛnɪmˈoːɹ . bˌʌt ðæt wʌzðə pɹˈaɪs wiː pˈeɪd tə bˈaɪ tʃˈeɪndʒ ɪn sˈɪɹɐkjˌuːsə .|42
173
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_4165.wav|ˈɪŋkʌmˌɪŋ , ɹˈɛdi ˈɔːl mˈɛmbɚz .|158
174
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_6452.wav|ˈɛnəmi spˈɑːɾᵻd . pɹɪpˈɛɹ tə fˈaɪt . ɹˈɛdi fɔːɹ ˈækʃən .|186
175
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_21644.wav|ðə nˈaɪt ɪz kwˈaɪət ænd pˈiːsfəl .|75
176
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3584.wav|aɪl mˌeɪk hˌɪm dˌɪsɐpˈɪɹ .|150
177
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_11995.wav|dˈoʊnt ˈiːvən θˈɪŋk ʌv hˈoʊldɪŋ bˈæk nˈaʊ ! kˈiːp ðɪs ˌʌp ænd bˈiːt hˌɪm , oʊkˈeɪ ?|239
178
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15084.wav|həm , lˈʊk ˌɔn ðɛɹ fˈeɪsᵻz tʃˈeɪndʒd .|253
179
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_19183.wav|ˈoʊ , ɪts hɜː . aɪ nˈuː ðæt ɡˈɜːl jˈɪɹz ɐɡˈoʊ .|49
180
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_20224.wav|plˈiːz sˈeɪv jʊɹ stɹˈɛŋθ . wiː wɪl ɡɛt juː ˈaʊt .|60
181
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_594.wav|aɪ dˈoʊnt hæv æz mˈʌtʃ mˈʌni ˌɔn hˈænd ðiːz dˈeɪz .|107
182
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_18179.wav|ɪf kˈɜːsʔn̩ dˈɪdnt hæv ðə tˈælənt ænd lˈiːdɚʃˌɪp , ɹˈaɪn lˈæb wʊdhɐv nˈɛvɚ bˌɪn fˈaʊndᵻd .|36
183
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_11662.wav|lˈɛts sˈɛnd ðə nˈɛkst wˈʌn dʒˈʌst lˈaɪk ðˈɪs ! aɪm kˈaʊntɪŋ ˈɔn juː !|238
184
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_15455.wav|dˈʌn . hˈɪɹ juː ɡˈoʊ . jʊɹ tˈuː slˈoʊ . jʊɹ ɡˈɑːɹd ɪz wˈiːk . jˈɛh , wʌtˈɛvɚ . hˈɪɹz ɐ lˈɪɾəl ˈɛkstɹə fɔːɹ juː . ðæts ɪnˈʌf ʌv ðˈæt . juː lˈɪɾəl . jʊɹ ɡˌoʊɪŋ dˈaʊn . juː woʊnt hˈɪt mˌiː . wˌʌt ɐ hˈæsəl . kˈʌm hˈɪɹ . aɪl tˈeɪk juː ˈɔn . aɪm nˌɑːt dˈʌn jˈɛt . ɡˈɛs aɪ kæn biː sˈɪɹiəs fɚɹə bˈɪt .|253
185
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3719.wav|aɪ wˈɪʃ aɪd lˈɜːnd ˈɔːl ðɪs ɐ lˈɪɾəl ˈɜːlɪɚ .|152
186
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_17821.wav|dˈoʊnt θˈɪŋk tˈuː mʌtʃ ɐbˈaʊt ɪt . ˈɛnɪwˌeɪ , aɪm ɡɹˈævəl , ɐ nˈaɪt ʌv kˈæzɪmˌɪɹz .|30
187
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_12402.wav|ðɪs ɪz ɪt , ðə fˈaɪnəl ɹˈaʊnd . hˈæŋ ɪn ðˈɛɹ .|240
188
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_11695.wav|ɪt hˈɜːts , bˌʌt ɪts ɐ ɡˈʊd kˈaɪnd ʌv hˈɜːt , juː nˈoʊ ? juː pˈænɪkt æt ðɪ ˈɛnd . dˈoʊnt ɡɛt ˌoʊvɚkˈɑːnfɪdənt .|238
189
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10622.wav|ˈoʊ , aɪ θˈɔːt wiː kʊd dˈuː ɪt !|235
190
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_13056.wav|ˈoʊ , aɪ hˈoʊp ɪɾ ˈɛndz lˈaɪk ðˈɪs !|241
191
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_3652.wav|wˈɛl , aɪ bˈɛɾɚ tˈɜːn ðɪs məʃˈiːn dˌaʊn ɐ bˈɪt .|151
192
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_14252.wav|həm , aɪm stˈɑːɹɾɪŋ tə sˈiː wʌts ɡˌoʊɪŋ ˈɔn .|247
193
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_2569.wav|wˌʌt aɪm ɐbˌaʊt tə ʃˈɛɹ wɪð juː ɪz nˌɑːt æz wˈʌndɚfəl æz juː mˈeɪ bᵻlˈiːv .|136
194
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_9314.wav|ænd ɪts ˈoʊnli ɐ mˈæɾɚɹ ʌv tˈaɪm ʌntˈɪl jʊɹ mˈeɪd .|23
195
+ /home/ubuntu/StyleTTS_Accelerate_44khz/anispeech/anispeech_10435.wav|tə fˈaɪt ðə ɹˈiːəl fˈɛðɚ pˈɪŋk ! juː nˈoʊ maɪ mˈæstɚ !|235
StyleTTS_Accelerate/Demo/Inference_LJSpeech.ipynb ADDED
@@ -0,0 +1,417 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "9adb7bd1",
6
+ "metadata": {},
7
+ "source": [
8
+ "# StyleTTS Demo (LJSpeech)\n"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "6108384d",
14
+ "metadata": {},
15
+ "source": [
16
+ "### Utils"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "id": "da84c60f",
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "%cd .."
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "id": "5a3ddcc8",
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "# load packages\n",
37
+ "import random\n",
38
+ "import yaml\n",
39
+ "from munch import Munch\n",
40
+ "import numpy as np\n",
41
+ "import torch\n",
42
+ "from torch import nn\n",
43
+ "import torch.nn.functional as F\n",
44
+ "import torchaudio\n",
45
+ "import librosa\n",
46
+ "from nltk.tokenize import word_tokenize\n",
47
+ "\n",
48
+ "from models import *\n",
49
+ "from utils import *\n",
50
+ "\n",
51
+ "%matplotlib inline"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": null,
57
+ "id": "bbdc04c0",
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": [
61
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": null,
67
+ "id": "0a173af4",
68
+ "metadata": {},
69
+ "outputs": [],
70
+ "source": [
71
+ "_pad = \"$\"\n",
72
+ "_punctuation = ';:,.!?¡¿—…\"«»“” '\n",
73
+ "_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'\n",
74
+ "_letters_ipa = \"ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ\"\n",
75
+ "\n",
76
+ "\n",
77
+ "# Export all symbols:\n",
78
+ "symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)\n",
79
+ "\n",
80
+ "dicts = {}\n",
81
+ "for i in range(len((symbols))):\n",
82
+ " dicts[symbols[i]] = i\n",
83
+ "\n",
84
+ "class TextCleaner:\n",
85
+ " def __init__(self, dummy=None):\n",
86
+ " self.word_index_dictionary = dicts\n",
87
+ " def __call__(self, text):\n",
88
+ " indexes = []\n",
89
+ " for char in text:\n",
90
+ " try:\n",
91
+ " indexes.append(self.word_index_dictionary[char])\n",
92
+ " except KeyError:\n",
93
+ " print(char)\n",
94
+ " return indexes\n",
95
+ "\n",
96
+ "textclenaer = TextCleaner()"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": null,
102
+ "id": "00ee05e1",
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": [
106
+ "to_mel = torchaudio.transforms.MelSpectrogram(\n",
107
+ " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
108
+ "mean, std = -4, 4\n",
109
+ "\n",
110
+ "def length_to_mask(lengths):\n",
111
+ " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
112
+ " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
113
+ " return mask\n",
114
+ "\n",
115
+ "def preprocess(wave):\n",
116
+ " wave_tensor = torch.from_numpy(wave).float()\n",
117
+ " mel_tensor = to_mel(wave_tensor)\n",
118
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
119
+ " return mel_tensor\n",
120
+ "\n",
121
+ "def compute_style(ref_dicts):\n",
122
+ " reference_embeddings = {}\n",
123
+ " for key, path in ref_dicts.items():\n",
124
+ " wave, sr = librosa.load(path, sr=24000)\n",
125
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
126
+ " if sr != 24000:\n",
127
+ " audio = librosa.resample(audio, sr, 24000)\n",
128
+ " mel_tensor = preprocess(audio).to(device)\n",
129
+ "\n",
130
+ " with torch.no_grad():\n",
131
+ " ref = model.style_encoder(mel_tensor.unsqueeze(1))\n",
132
+ " reference_embeddings[key] = (ref.squeeze(1), audio)\n",
133
+ " \n",
134
+ " return reference_embeddings"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "markdown",
139
+ "id": "7b9cecbe",
140
+ "metadata": {},
141
+ "source": [
142
+ "### Load models"
143
+ ]
144
+ },
145
+ {
146
+ "cell_type": "code",
147
+ "execution_count": null,
148
+ "id": "64fc4c0f",
149
+ "metadata": {},
150
+ "outputs": [],
151
+ "source": [
152
+ "# load phonemizer\n",
153
+ "import phonemizer\n",
154
+ "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": null,
160
+ "id": "54cfbe48",
161
+ "metadata": {},
162
+ "outputs": [],
163
+ "source": [
164
+ "# load hifi-gan\n",
165
+ "\n",
166
+ "import sys\n",
167
+ "sys.path.insert(0, \"./Demo/hifi-gan\")\n",
168
+ "\n",
169
+ "import glob\n",
170
+ "import os\n",
171
+ "import argparse\n",
172
+ "import json\n",
173
+ "import torch\n",
174
+ "from scipy.io.wavfile import write\n",
175
+ "from attrdict import AttrDict\n",
176
+ "from vocoder import Generator\n",
177
+ "import librosa\n",
178
+ "import numpy as np\n",
179
+ "import torchaudio\n",
180
+ "\n",
181
+ "h = None\n",
182
+ "\n",
183
+ "def load_checkpoint(filepath, device):\n",
184
+ " assert os.path.isfile(filepath)\n",
185
+ " print(\"Loading '{}'\".format(filepath))\n",
186
+ " checkpoint_dict = torch.load(filepath, map_location=device)\n",
187
+ " print(\"Complete.\")\n",
188
+ " return checkpoint_dict\n",
189
+ "\n",
190
+ "def scan_checkpoint(cp_dir, prefix):\n",
191
+ " pattern = os.path.join(cp_dir, prefix + '*')\n",
192
+ " cp_list = glob.glob(pattern)\n",
193
+ " if len(cp_list) == 0:\n",
194
+ " return ''\n",
195
+ " return sorted(cp_list)[-1]\n",
196
+ "\n",
197
+ "cp_g = scan_checkpoint(\"Vocoder/\", 'g_')\n",
198
+ "\n",
199
+ "config_file = os.path.join(os.path.split(cp_g)[0], 'config.json')\n",
200
+ "with open(config_file) as f:\n",
201
+ " data = f.read()\n",
202
+ "json_config = json.loads(data)\n",
203
+ "h = AttrDict(json_config)\n",
204
+ "\n",
205
+ "device = torch.device(device)\n",
206
+ "generator = Generator(h).to(device)\n",
207
+ "\n",
208
+ "state_dict_g = load_checkpoint(cp_g, device)\n",
209
+ "generator.load_state_dict(state_dict_g['generator'])\n",
210
+ "generator.eval()\n",
211
+ "generator.remove_weight_norm()"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": null,
217
+ "id": "02fb18a6",
218
+ "metadata": {},
219
+ "outputs": [],
220
+ "source": [
221
+ "# load StyleTTS\n",
222
+ "model_path = \"./Models/LJSpeech/epoch_2nd_00180.pth\"\n",
223
+ "model_config_path = \"./Models/LJSpeech/config.yml\"\n",
224
+ "\n",
225
+ "config = yaml.safe_load(open(model_config_path))\n",
226
+ "\n",
227
+ "# load pretrained ASR model\n",
228
+ "ASR_config = config.get('ASR_config', False)\n",
229
+ "ASR_path = config.get('ASR_path', False)\n",
230
+ "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
231
+ "\n",
232
+ "# load pretrained F0 model\n",
233
+ "F0_path = config.get('F0_path', False)\n",
234
+ "pitch_extractor = load_F0_models(F0_path)\n",
235
+ "\n",
236
+ "model = build_model(Munch(config['model_params']), text_aligner, pitch_extractor)\n",
237
+ "\n",
238
+ "params = torch.load(model_path, map_location='cpu')\n",
239
+ "params = params['net']\n",
240
+ "for key in model:\n",
241
+ " if key in params:\n",
242
+ " if not \"discriminator\" in key:\n",
243
+ " print('%s loaded' % key)\n",
244
+ " model[key].load_state_dict(params[key])\n",
245
+ "_ = [model[key].eval() for key in model]\n",
246
+ "_ = [model[key].to(device) for key in model]"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "markdown",
251
+ "id": "b803110e",
252
+ "metadata": {},
253
+ "source": [
254
+ "### Synthesize speech"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "code",
259
+ "execution_count": null,
260
+ "id": "30e8ff2c",
261
+ "metadata": {},
262
+ "outputs": [],
263
+ "source": [
264
+ "# get the first 3 training samples as references\n",
265
+ "\n",
266
+ "train_path = config.get('train_data', None)\n",
267
+ "val_path = config.get('val_data', None)\n",
268
+ "train_list, val_list = get_data_path_list(train_path, val_path)\n",
269
+ "\n",
270
+ "ref_dicts = {}\n",
271
+ "for j in range(3):\n",
272
+ " filename = train_list[j].split('|')[0]\n",
273
+ " name = filename.split('/')[-1].replace('.wav', '')\n",
274
+ " ref_dicts[name] = filename\n",
275
+ " \n",
276
+ "reference_embeddings = compute_style(ref_dicts)"
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "code",
281
+ "execution_count": null,
282
+ "id": "24655f46",
283
+ "metadata": {},
284
+ "outputs": [],
285
+ "source": [
286
+ "# synthesize a text\n",
287
+ "text = ''' StyleTTS is a style-based generative model for parallel TTS that can synthesize diverse speech with natural prosody from a reference speech utterance. '''"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": null,
293
+ "id": "43e9f635",
294
+ "metadata": {},
295
+ "outputs": [],
296
+ "source": [
297
+ "# tokenize\n",
298
+ "ps = global_phonemizer.phonemize([text])\n",
299
+ "ps = word_tokenize(ps[0])\n",
300
+ "ps = ' '.join(ps)\n",
301
+ "tokens = textclenaer(ps)\n",
302
+ "tokens.insert(0, 0)\n",
303
+ "tokens.append(0)\n",
304
+ "tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)"
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "code",
309
+ "execution_count": null,
310
+ "id": "ca57469c",
311
+ "metadata": {},
312
+ "outputs": [],
313
+ "source": [
314
+ "converted_samples = {}\n",
315
+ "\n",
316
+ "with torch.no_grad():\n",
317
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
318
+ " m = length_to_mask(input_lengths).to(device)\n",
319
+ " t_en = model.text_encoder(tokens, input_lengths, m)\n",
320
+ " \n",
321
+ " for key, (ref, _) in reference_embeddings.items():\n",
322
+ " \n",
323
+ " s = ref.squeeze(1)\n",
324
+ " style = s\n",
325
+ " \n",
326
+ " d = model.predictor.text_encoder(t_en, style, input_lengths, m)\n",
327
+ "\n",
328
+ " x, _ = model.predictor.lstm(d)\n",
329
+ " duration = model.predictor.duration_proj(x)\n",
330
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
331
+ " \n",
332
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
333
+ " c_frame = 0\n",
334
+ " for i in range(pred_aln_trg.size(0)):\n",
335
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
336
+ " c_frame += int(pred_dur[i].data)\n",
337
+ "\n",
338
+ " # encode prosody\n",
339
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
340
+ " style = s.expand(en.shape[0], en.shape[1], -1)\n",
341
+ "\n",
342
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
343
+ "\n",
344
+ " out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)), \n",
345
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
346
+ "\n",
347
+ "\n",
348
+ " c = out.squeeze()\n",
349
+ " y_g_hat = generator(c.unsqueeze(0))\n",
350
+ " y_out = y_g_hat.squeeze().cpu().numpy()\n",
351
+ "\n",
352
+ " c = out.squeeze()\n",
353
+ " y_g_hat = generator(c.unsqueeze(0))\n",
354
+ " y_out = y_g_hat.squeeze()\n",
355
+ " \n",
356
+ " converted_samples[key] = y_out.cpu().numpy()"
357
+ ]
358
+ },
359
+ {
360
+ "cell_type": "code",
361
+ "execution_count": null,
362
+ "id": "d3d7f7d5",
363
+ "metadata": {
364
+ "scrolled": true
365
+ },
366
+ "outputs": [],
367
+ "source": [
368
+ "import IPython.display as ipd\n",
369
+ "for key, wave in converted_samples.items():\n",
370
+ " print('Synthesized: %s' % key)\n",
371
+ " display(ipd.Audio(wave, rate=24000))\n",
372
+ " try:\n",
373
+ " print('Reference: %s' % key)\n",
374
+ " display(ipd.Audio(reference_embeddings[key][-1], rate=24000))\n",
375
+ " except:\n",
376
+ " continue"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "execution_count": null,
382
+ "id": "74fe14d9",
383
+ "metadata": {},
384
+ "outputs": [],
385
+ "source": []
386
+ },
387
+ {
388
+ "cell_type": "code",
389
+ "execution_count": null,
390
+ "id": "a97c5e82",
391
+ "metadata": {},
392
+ "outputs": [],
393
+ "source": []
394
+ }
395
+ ],
396
+ "metadata": {
397
+ "kernelspec": {
398
+ "display_name": "python3",
399
+ "language": "python",
400
+ "name": "python3"
401
+ },
402
+ "language_info": {
403
+ "codemirror_mode": {
404
+ "name": "ipython",
405
+ "version": 3
406
+ },
407
+ "file_extension": ".py",
408
+ "mimetype": "text/x-python",
409
+ "name": "python",
410
+ "nbconvert_exporter": "python",
411
+ "pygments_lexer": "ipython3",
412
+ "version": "3.9.7"
413
+ }
414
+ },
415
+ "nbformat": 4,
416
+ "nbformat_minor": 5
417
+ }
StyleTTS_Accelerate/Demo/Inference_LibriTTS.ipynb ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "9adb7bd1",
6
+ "metadata": {},
7
+ "source": [
8
+ "# StyleTTS Demo (LibriTTS)\n"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "6108384d",
14
+ "metadata": {},
15
+ "source": [
16
+ "### Utils"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "id": "da84c60f",
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "%cd .."
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "id": "5a3ddcc8",
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "# load packages\n",
37
+ "import random\n",
38
+ "import yaml\n",
39
+ "from munch import Munch\n",
40
+ "import numpy as np\n",
41
+ "import torch\n",
42
+ "from torch import nn\n",
43
+ "import torch.nn.functional as F\n",
44
+ "import torchaudio\n",
45
+ "import librosa\n",
46
+ "\n",
47
+ "from models import *\n",
48
+ "from utils import *\n",
49
+ "\n",
50
+ "%matplotlib inline"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": null,
56
+ "id": "bbdc04c0",
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": null,
66
+ "id": "0a173af4",
67
+ "metadata": {},
68
+ "outputs": [],
69
+ "source": [
70
+ "_pad = \"$\"\n",
71
+ "_punctuation = ';:,.!?¡¿—…\"«»“” '\n",
72
+ "_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'\n",
73
+ "_letters_ipa = \"ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ\"\n",
74
+ "\n",
75
+ "\n",
76
+ "# Export all symbols:\n",
77
+ "symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)\n",
78
+ "\n",
79
+ "dicts = {}\n",
80
+ "for i in range(len((symbols))):\n",
81
+ " dicts[symbols[i]] = i\n",
82
+ "\n",
83
+ "class TextCleaner:\n",
84
+ " def __init__(self, dummy=None):\n",
85
+ " self.word_index_dictionary = dicts\n",
86
+ " def __call__(self, text):\n",
87
+ " indexes = []\n",
88
+ " for char in text:\n",
89
+ " try:\n",
90
+ " indexes.append(self.word_index_dictionary[char])\n",
91
+ " except KeyError:\n",
92
+ " print(char)\n",
93
+ " return indexes\n",
94
+ "\n",
95
+ "textclenaer = TextCleaner()"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "id": "00ee05e1",
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": [
105
+ "to_mel = torchaudio.transforms.MelSpectrogram(\n",
106
+ " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
107
+ "mean, std = -4, 4\n",
108
+ "\n",
109
+ "def length_to_mask(lengths):\n",
110
+ " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
111
+ " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
112
+ " return mask\n",
113
+ "\n",
114
+ "def preprocess(wave):\n",
115
+ " wave_tensor = torch.from_numpy(wave).float()\n",
116
+ " mel_tensor = to_mel(wave_tensor)\n",
117
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
118
+ " return mel_tensor\n",
119
+ "\n",
120
+ "def compute_style(ref_dicts):\n",
121
+ " reference_embeddings = {}\n",
122
+ " for key, path in ref_dicts.items():\n",
123
+ " wave, sr = librosa.load(path, sr=24000)\n",
124
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
125
+ " if sr != 24000:\n",
126
+ " audio = librosa.resample(audio, sr, 24000)\n",
127
+ " mel_tensor = preprocess(audio).to(device)\n",
128
+ " try:\n",
129
+ " with torch.no_grad():\n",
130
+ " ref = model.style_encoder(mel_tensor.unsqueeze(1))\n",
131
+ " reference_embeddings[key] = (ref.squeeze(1), audio)\n",
132
+ " except:\n",
133
+ " continue\n",
134
+ " \n",
135
+ " return reference_embeddings"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "markdown",
140
+ "id": "7b9cecbe",
141
+ "metadata": {},
142
+ "source": [
143
+ "### Load models"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": null,
149
+ "id": "64fc4c0f",
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": [
153
+ "# load phonemizer\n",
154
+ "import phonemizer\n",
155
+ "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": null,
161
+ "id": "54cfbe48",
162
+ "metadata": {},
163
+ "outputs": [],
164
+ "source": [
165
+ "# load hifi-gan\n",
166
+ "\n",
167
+ "import sys\n",
168
+ "sys.path.insert(0, \"./Demo/hifi-gan\")\n",
169
+ "\n",
170
+ "import glob\n",
171
+ "import os\n",
172
+ "import argparse\n",
173
+ "import json\n",
174
+ "import torch\n",
175
+ "from scipy.io.wavfile import write\n",
176
+ "from attrdict import AttrDict\n",
177
+ "from vocoder import Generator\n",
178
+ "import librosa\n",
179
+ "import numpy as np\n",
180
+ "import torchaudio\n",
181
+ "\n",
182
+ "h = None\n",
183
+ "\n",
184
+ "def load_checkpoint(filepath, device):\n",
185
+ " assert os.path.isfile(filepath)\n",
186
+ " print(\"Loading '{}'\".format(filepath))\n",
187
+ " checkpoint_dict = torch.load(filepath, map_location=device)\n",
188
+ " print(\"Complete.\")\n",
189
+ " return checkpoint_dict\n",
190
+ "\n",
191
+ "def scan_checkpoint(cp_dir, prefix):\n",
192
+ " pattern = os.path.join(cp_dir, prefix + '*')\n",
193
+ " cp_list = glob.glob(pattern)\n",
194
+ " if len(cp_list) == 0:\n",
195
+ " return ''\n",
196
+ " return sorted(cp_list)[-1]\n",
197
+ "\n",
198
+ "cp_g = scan_checkpoint(\"Vocoder/LibriTTS/\", 'g_')\n",
199
+ "\n",
200
+ "config_file = os.path.join(os.path.split(cp_g)[0], 'config.json')\n",
201
+ "with open(config_file) as f:\n",
202
+ " data = f.read()\n",
203
+ "json_config = json.loads(data)\n",
204
+ "h = AttrDict(json_config)\n",
205
+ "\n",
206
+ "device = torch.device(device)\n",
207
+ "generator = Generator(h).to(device)\n",
208
+ "\n",
209
+ "state_dict_g = load_checkpoint(cp_g, device)\n",
210
+ "generator.load_state_dict(state_dict_g['generator'])\n",
211
+ "generator.eval()\n",
212
+ "generator.remove_weight_norm()"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": null,
218
+ "id": "02fb18a6",
219
+ "metadata": {},
220
+ "outputs": [],
221
+ "source": [
222
+ "# load StyleTTS\n",
223
+ "model_path = \"./Models/LibriTTS/epoch_2nd_00050.pth\"\n",
224
+ "model_config_path = \"./Models/LibriTTS/config.yml\"\n",
225
+ "\n",
226
+ "config = yaml.safe_load(open(model_config_path))\n",
227
+ "\n",
228
+ "# load pretrained ASR model\n",
229
+ "ASR_config = config.get('ASR_config', False)\n",
230
+ "ASR_path = config.get('ASR_path', False)\n",
231
+ "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
232
+ "\n",
233
+ "# load pretrained F0 model\n",
234
+ "F0_path = config.get('F0_path', False)\n",
235
+ "pitch_extractor = load_F0_models(F0_path)\n",
236
+ "\n",
237
+ "model = build_model(Munch(config['model_params']), text_aligner, pitch_extractor)\n",
238
+ "\n",
239
+ "params = torch.load(model_path, map_location='cpu')\n",
240
+ "params = params['net']\n",
241
+ "for key in model:\n",
242
+ " if key in params:\n",
243
+ " if not \"discriminator\" in key:\n",
244
+ " print('%s loaded' % key)\n",
245
+ " model[key].load_state_dict(params[key])\n",
246
+ "_ = [model[key].eval() for key in model]\n",
247
+ "_ = [model[key].to(device) for key in model]"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "markdown",
252
+ "id": "b803110e",
253
+ "metadata": {},
254
+ "source": [
255
+ "### Synthesize speech (seen speakers, LibriTTS train-clean-100)"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "execution_count": null,
261
+ "id": "30e8ff2c",
262
+ "metadata": {},
263
+ "outputs": [],
264
+ "source": [
265
+ "# get the first 3 training samples as references\n",
266
+ "\n",
267
+ "train_path = config.get('train_data', None)\n",
268
+ "val_path = config.get('val_data', None)\n",
269
+ "train_list, val_list = get_data_path_list(train_path, val_path)\n",
270
+ "\n",
271
+ "ref_dicts = {}\n",
272
+ "for j in range(3):\n",
273
+ " filename = train_list[j].split('|')[0]\n",
274
+ " name = filename.split('/')[-1].replace('.wav', '')\n",
275
+ " ref_dicts[name] = filename\n",
276
+ " \n",
277
+ "reference_embeddings = compute_style(ref_dicts)"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": null,
283
+ "id": "24655f46",
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": [
287
+ "# synthesize a text\n",
288
+ "text = ''' StyleTTS is a style based generative model that can synthesize diverse speech with natural prosody from a reference speech utterance. '''"
289
+ ]
290
+ },
291
+ {
292
+ "cell_type": "code",
293
+ "execution_count": null,
294
+ "id": "43e9f635",
295
+ "metadata": {},
296
+ "outputs": [],
297
+ "source": [
298
+ "# tokenize\n",
299
+ "ps = global_phonemizer.phonemize([text])\n",
300
+ "tokens = textclenaer(ps[0])\n",
301
+ "tokens.insert(0, 0)\n",
302
+ "tokens.append(0)\n",
303
+ "tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "code",
308
+ "execution_count": null,
309
+ "id": "ca57469c",
310
+ "metadata": {},
311
+ "outputs": [],
312
+ "source": [
313
+ "converted_samples = {}\n",
314
+ "\n",
315
+ "with torch.no_grad():\n",
316
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
317
+ " m = length_to_mask(input_lengths).to(device)\n",
318
+ " t_en = model.text_encoder(tokens, input_lengths, m)\n",
319
+ " \n",
320
+ " for key, (ref, _) in reference_embeddings.items():\n",
321
+ " \n",
322
+ " s = ref.squeeze(1)\n",
323
+ " style = s\n",
324
+ " \n",
325
+ " d = model.predictor.text_encoder(t_en, style, input_lengths, m)\n",
326
+ "\n",
327
+ " x, _ = model.predictor.lstm(d)\n",
328
+ " duration = model.predictor.duration_proj(x)\n",
329
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
330
+ " \n",
331
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
332
+ " c_frame = 0\n",
333
+ " for i in range(pred_aln_trg.size(0)):\n",
334
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
335
+ " c_frame += int(pred_dur[i].data)\n",
336
+ "\n",
337
+ " # encode prosody\n",
338
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
339
+ " style = s.expand(en.shape[0], en.shape[1], -1)\n",
340
+ "\n",
341
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
342
+ "\n",
343
+ " out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)), \n",
344
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
345
+ "\n",
346
+ "\n",
347
+ " c = out.squeeze()\n",
348
+ " y_g_hat = generator(c.unsqueeze(0))\n",
349
+ " y_out = y_g_hat.squeeze().cpu().numpy()\n",
350
+ "\n",
351
+ " c = out.squeeze()\n",
352
+ " y_g_hat = generator(c.unsqueeze(0))\n",
353
+ " y_out = y_g_hat.squeeze()\n",
354
+ " \n",
355
+ " converted_samples[key] = y_out.cpu().numpy()"
356
+ ]
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": null,
361
+ "id": "086c25a7",
362
+ "metadata": {
363
+ "scrolled": true
364
+ },
365
+ "outputs": [],
366
+ "source": [
367
+ "import IPython.display as ipd\n",
368
+ "for key, wave in converted_samples.items():\n",
369
+ " print('Synthesized: %s' % key)\n",
370
+ " display(ipd.Audio(wave, rate=24000))\n",
371
+ " try:\n",
372
+ " print('Reference: %s' % key)\n",
373
+ " display(ipd.Audio(reference_embeddings[key][-1], rate=24000))\n",
374
+ " except:\n",
375
+ " continue"
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "markdown",
380
+ "id": "41d721cd",
381
+ "metadata": {},
382
+ "source": [
383
+ "### Zero-shot TTS (unseen speakers, LibriTTS test-clean)"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": null,
389
+ "id": "5b75a5dd",
390
+ "metadata": {},
391
+ "outputs": [],
392
+ "source": [
393
+ "test_clean_path = '/share/naplab/users/yl4579/data/LibriTTS/test-clean/'\n",
394
+ "\n",
395
+ "ref_dicts = {}\n",
396
+ "# pick first 3 speakers from test-clean\n",
397
+ "spks = [ f.path for f in os.scandir(test_clean_path) if f.is_dir() ]\n",
398
+ "spks = spks[:3]\n",
399
+ "for spk in spks:\n",
400
+ " spk_path = spk\n",
401
+ " spk = spk.split('/')[-1]\n",
402
+ " spk_path = spk_path + \"/\" + (np.random.choice(os.listdir(spk_path), size=1)[0])\n",
403
+ " for f in os.listdir(spk_path):\n",
404
+ " if f.endswith('.wav'):\n",
405
+ " ref_dicts[spk] = spk_path + \"/\" + f\n",
406
+ "reference_embeddings = compute_style(ref_dicts)"
407
+ ]
408
+ },
409
+ {
410
+ "cell_type": "code",
411
+ "execution_count": null,
412
+ "id": "b8c204d0",
413
+ "metadata": {},
414
+ "outputs": [],
415
+ "source": [
416
+ "# synthesize a text\n",
417
+ "text = ''' StyleTTS is a style based generative model that can synthesize diverse speech with natural prosody from a reference speech utterance. '''"
418
+ ]
419
+ },
420
+ {
421
+ "cell_type": "code",
422
+ "execution_count": null,
423
+ "id": "a0078aa4",
424
+ "metadata": {},
425
+ "outputs": [],
426
+ "source": [
427
+ "# tokenize\n",
428
+ "ps = global_phonemizer.phonemize([text])\n",
429
+ "tokens = textclenaer(ps[0])\n",
430
+ "tokens.insert(0, 0)\n",
431
+ "tokens.append(0)\n",
432
+ "tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)"
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "code",
437
+ "execution_count": null,
438
+ "id": "f02958cc",
439
+ "metadata": {},
440
+ "outputs": [],
441
+ "source": [
442
+ "converted_samples = {}\n",
443
+ "\n",
444
+ "with torch.no_grad():\n",
445
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
446
+ " m = length_to_mask(input_lengths).to(device)\n",
447
+ " t_en = model.text_encoder(tokens, input_lengths, m)\n",
448
+ " \n",
449
+ " for key, (ref, _) in reference_embeddings.items():\n",
450
+ " \n",
451
+ " s = ref.squeeze(1)\n",
452
+ " style = s\n",
453
+ " \n",
454
+ " d = model.predictor.text_encoder(t_en, style, input_lengths, m)\n",
455
+ "\n",
456
+ " x, _ = model.predictor.lstm(d)\n",
457
+ " duration = model.predictor.duration_proj(x)\n",
458
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
459
+ " \n",
460
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
461
+ " c_frame = 0\n",
462
+ " for i in range(pred_aln_trg.size(0)):\n",
463
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
464
+ " c_frame += int(pred_dur[i].data)\n",
465
+ "\n",
466
+ " # encode prosody\n",
467
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
468
+ " style = s.expand(en.shape[0], en.shape[1], -1)\n",
469
+ "\n",
470
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
471
+ "\n",
472
+ " out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)), \n",
473
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
474
+ "\n",
475
+ "\n",
476
+ " c = out.squeeze()\n",
477
+ " y_g_hat = generator(c.unsqueeze(0))\n",
478
+ " y_out = y_g_hat.squeeze().cpu().numpy()\n",
479
+ "\n",
480
+ " c = out.squeeze()\n",
481
+ " y_g_hat = generator(c.unsqueeze(0))\n",
482
+ " y_out = y_g_hat.squeeze()\n",
483
+ " \n",
484
+ " converted_samples[key] = y_out.cpu().numpy()"
485
+ ]
486
+ },
487
+ {
488
+ "cell_type": "code",
489
+ "execution_count": null,
490
+ "id": "b2e931ac",
491
+ "metadata": {
492
+ "scrolled": true
493
+ },
494
+ "outputs": [],
495
+ "source": [
496
+ "import IPython.display as ipd\n",
497
+ "for key, wave in converted_samples.items():\n",
498
+ " print('Synthesized: %s' % key)\n",
499
+ " display(ipd.Audio(wave, rate=24000))\n",
500
+ " try:\n",
501
+ " print('Reference: %s' % key)\n",
502
+ " display(ipd.Audio(reference_embeddings[key][-1], rate=24000))\n",
503
+ " except:\n",
504
+ " continue"
505
+ ]
506
+ }
507
+ ],
508
+ "metadata": {
509
+ "kernelspec": {
510
+ "display_name": "python3",
511
+ "language": "python",
512
+ "name": "python3"
513
+ },
514
+ "language_info": {
515
+ "codemirror_mode": {
516
+ "name": "ipython",
517
+ "version": 3
518
+ },
519
+ "file_extension": ".py",
520
+ "mimetype": "text/x-python",
521
+ "name": "python",
522
+ "nbconvert_exporter": "python",
523
+ "pygments_lexer": "ipython3",
524
+ "version": "3.7.11"
525
+ }
526
+ },
527
+ "nbformat": 4,
528
+ "nbformat_minor": 5
529
+ }
StyleTTS_Accelerate/Demo/hifi-gan/__pycache__/vocoder.cpython-310.pyc ADDED
Binary file (8.71 kB). View file
 
StyleTTS_Accelerate/Demo/hifi-gan/__pycache__/vocoder.cpython-39.pyc ADDED
Binary file (8.76 kB). View file
 
StyleTTS_Accelerate/Demo/hifi-gan/__pycache__/vocoder_utils.cpython-310.pyc ADDED
Binary file (2.04 kB). View file
 
StyleTTS_Accelerate/Demo/hifi-gan/__pycache__/vocoder_utils.cpython-39.pyc ADDED
Binary file (2.02 kB). View file
 
StyleTTS_Accelerate/Demo/hifi-gan/vocoder.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.nn as nn
4
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
5
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
6
+ from vocoder_utils import init_weights, get_padding
7
+
8
+ LRELU_SLOPE = 0.1
9
+
10
+
11
class ResBlock1(torch.nn.Module):
    """Residual block built from three (dilated conv, plain conv) pairs.

    Each pair applies leaky-ReLU -> dilated conv -> leaky-ReLU -> conv with
    dilation 1, and the result is added back onto the pair's input.

    Args:
        h: hyper-parameter object; only stored on the instance here.
        channels: number of input and output channels of every convolution.
        kernel_size: kernel size shared by all six convolutions.
        dilation: sequence whose first three entries are the dilation rates
            of the first convolution in each pair.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()
        self.h = h

        def _wn_conv(rate):
            # Weight-normalised stride-1 Conv1d padded via get_padding.
            return weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate,
                                      padding=get_padding(kernel_size, rate)))

        # First conv of each pair: dilated.
        self.convs1 = nn.ModuleList([_wn_conv(dilation[idx]) for idx in range(3)])
        self.convs1.apply(init_weights)

        # Second conv of each pair: always dilation 1.
        self.convs2 = nn.ModuleList([_wn_conv(1) for _ in range(3)])
        self.convs2.apply(init_weights)

    def forward(self, x):
        """Run the three residual pairs in sequence and return the result."""
        for dilated, plain in zip(self.convs1, self.convs2):
            residual = x
            branch = dilated(F.leaky_relu(x, LRELU_SLOPE))
            branch = plain(F.leaky_relu(branch, LRELU_SLOPE))
            x = branch + residual
        return x

    def remove_weight_norm(self):
        """Strip the weight-norm reparametrisation from every convolution."""
        for group in (self.convs1, self.convs2):
            for layer in group:
                remove_weight_norm(layer)
49
+
50
+
51
class ResBlock2(torch.nn.Module):
    """Lighter residual block: two dilated convolutions, each with a skip.

    Args:
        h: hyper-parameter object; only stored on the instance here.
        channels: number of input and output channels of both convolutions.
        kernel_size: kernel size shared by both convolutions.
        dilation: sequence whose first two entries are the dilation rates.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()
        self.h = h

        layers = []
        for idx in range(2):
            rate = dilation[idx]
            layers.append(
                weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate,
                                   padding=get_padding(kernel_size, rate))))
        self.convs = nn.ModuleList(layers)
        self.convs.apply(init_weights)

    def forward(self, x):
        """Apply leaky-ReLU -> conv -> residual add for each convolution."""
        for conv in self.convs:
            x = conv(F.leaky_relu(x, LRELU_SLOPE)) + x
        return x

    def remove_weight_norm(self):
        """Strip the weight-norm reparametrisation from both convolutions."""
        for layer in self.convs:
            remove_weight_norm(layer)
73
+
74
+
75
class Generator(torch.nn.Module):
    """Upsampling generator: conv_pre -> (upsample + resblock bank)* -> conv_post -> tanh.

    Args:
        h: hyper-parameter object. The attributes read here are
            ``num_mels``, ``upsample_initial_channel``, ``upsample_rates``,
            ``upsample_kernel_sizes``, ``resblock`` (``'1'`` selects
            ResBlock1, anything else ResBlock2), ``resblock_kernel_sizes``
            and ``resblock_dilation_sizes``.
    """

    def __init__(self, h):
        super().__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3))
        block_cls = ResBlock1 if h.resblock == '1' else ResBlock2

        # One transposed convolution per upsampling stage; the channel count
        # halves at every stage.
        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            in_ch = h.upsample_initial_channel // (2 ** stage)
            out_ch = h.upsample_initial_channel // (2 ** (stage + 1))
            self.ups.append(weight_norm(ConvTranspose1d(in_ch, out_ch, kernel, rate,
                                                        padding=(rate // 2 + rate % 2),
                                                        output_padding=rate % 2)))

        # For every stage, one residual block per (kernel size, dilations)
        # configuration, stored flat in stage-major order.
        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = h.upsample_initial_channel // (2 ** (stage + 1))
            for kernel, dilations in zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes):
                self.resblocks.append(block_cls(h, ch, kernel, dilations))

        # `ch` is the channel count left by the last upsampling stage.
        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def forward(self, x):
        """Map the input features to a single-channel signal in [-1, 1]."""
        x = self.conv_pre(x)
        for stage in range(self.num_upsamples):
            x = self.ups[stage](F.leaky_relu(x, LRELU_SLOPE))
            first = stage * self.num_kernels
            # Average the outputs of this stage's residual blocks.
            xs = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[first + offset](x)
                if xs is None:
                    xs = branch
                else:
                    xs += branch
            x = xs / self.num_kernels
        # NOTE: this activation uses leaky_relu's default slope, not LRELU_SLOPE.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight norm from every layer of the generator."""
        print('Removing weight norm...')
        for layer in self.ups:
            remove_weight_norm(layer)
        for block in self.resblocks:
            block.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
126
+
127
+
128
class DiscriminatorP(torch.nn.Module):
    """Period discriminator.

    Folds a (batch, channels, time) signal into a 2-D grid whose last axis
    has length ``period`` and scores it with a stack of (k, 1) convolutions.

    Args:
        period: length of the last axis after folding.
        kernel_size: height ``k`` of the (k, 1) kernels in ``self.convs``.
        stride: stride along the folded time axis for the first four
            convolutions (the fifth uses stride 1).
        use_spectral_norm: when equal to ``False`` (default) layers are
            wrapped in weight norm, otherwise in spectral norm.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super().__init__()
        self.period = period
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm

        # Padding follows kernel_size instead of being hard-coded for k=5;
        # for the default kernel_size this is the same (2, 0) as before.
        pad = (get_padding(kernel_size, 1), 0)
        widths = (1, 32, 128, 512, 1024)
        layers = [
            norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1), padding=pad))
            for c_in, c_out in zip(widths[:-1], widths[1:])
        ]
        layers.append(norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=pad)))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(scores, fmap)``.

        ``scores`` is the final conv output flattened from dim 1 onward;
        ``fmap`` lists the activation after every layer, including the
        final one.
        """
        fmap = []

        # 1d to 2d: right-pad (reflect) so the time axis divides by period.
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for layer in self.convs:
            x = F.leaky_relu(layer(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
162
+
163
+
164
class MultiPeriodDiscriminator(torch.nn.Module):
    """Bank of DiscriminatorP instances with periods 2, 3, 5, 7 and 11."""

    def __init__(self):
        super().__init__()
        self.discriminators = nn.ModuleList(
            [DiscriminatorP(period) for period in (2, 3, 5, 7, 11)]
        )

    def forward(self, y, y_hat):
        """Score ``y`` and ``y_hat`` with every sub-discriminator.

        Returns four lists, one entry per sub-discriminator: scores for
        ``y``, scores for ``y_hat``, feature maps for ``y``, feature maps
        for ``y_hat``.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            score_r, feats_r = disc(y)
            score_g, feats_g = disc(y_hat)
            y_d_rs.append(score_r)
            fmap_rs.append(feats_r)
            y_d_gs.append(score_g)
            fmap_gs.append(feats_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
189
+
190
+
191
class DiscriminatorS(torch.nn.Module):
    """Scale discriminator: a stack of strided, grouped 1-D convolutions.

    Args:
        use_spectral_norm: when equal to ``False`` (default) layers are
            wrapped in weight norm, otherwise in spectral norm.
    """

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        if use_spectral_norm == False:
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        first = norm_f(Conv1d(1, 128, 15, 1, padding=7))
        # (in_channels, out_channels, stride, groups) of the kernel-41 layers.
        grouped_specs = (
            (128, 128, 2, 4),
            (128, 256, 2, 16),
            (256, 512, 4, 16),
            (512, 1024, 4, 16),
            (1024, 1024, 1, 16),
        )
        grouped = [
            norm_f(Conv1d(c_in, c_out, 41, step, groups=n_groups, padding=20))
            for c_in, c_out, step, n_groups in grouped_specs
        ]
        last = norm_f(Conv1d(1024, 1024, 5, 1, padding=2))
        self.convs = nn.ModuleList([first] + grouped + [last])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(scores, fmap)``.

        ``scores`` is the final conv output flattened from dim 1 onward;
        ``fmap`` lists the activation after every layer, including the
        final one.
        """
        fmap = []
        for layer in self.convs:
            x = F.leaky_relu(layer(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
217
+
218
+
219
class MultiScaleDiscriminator(torch.nn.Module):
    """Three DiscriminatorS instances, each fed a further average-pooled input.

    The first sub-discriminator uses spectral norm and sees the input as
    given; each following one sees the previous input passed through
    ``AvgPool1d(4, 2, padding=2)``.
    """

    def __init__(self):
        super().__init__()
        self.discriminators = nn.ModuleList([
            DiscriminatorS(use_spectral_norm=True),
            DiscriminatorS(),
            DiscriminatorS(),
        ])
        self.meanpools = nn.ModuleList([AvgPool1d(4, 2, padding=2) for _ in range(2)])

    def forward(self, y, y_hat):
        """Score ``y`` and ``y_hat`` at every scale.

        Returns four lists, one entry per sub-discriminator: scores for
        ``y``, scores for ``y_hat``, feature maps for ``y``, feature maps
        for ``y_hat``.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for idx, disc in enumerate(self.discriminators):
            if idx > 0:
                # Pooling is cumulative: the pooled signals carry over.
                pool = self.meanpools[idx - 1]
                y = pool(y)
                y_hat = pool(y_hat)
            score_r, feats_r = disc(y)
            score_g, feats_g = disc(y_hat)
            y_d_rs.append(score_r)
            fmap_rs.append(feats_r)
            y_d_gs.append(score_g)
            fmap_gs.append(feats_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
249
+
250
+
251
def feature_loss(fmap_r, fmap_g):
    """Feature-matching loss.

    Sums the mean absolute difference between corresponding feature maps
    and returns twice that sum.

    Args:
        fmap_r: iterable (per discriminator) of iterables of tensors.
        fmap_g: same structure as ``fmap_r``.

    Returns:
        Twice the accumulated loss; the plain integer 0 when there are no
        feature maps.
    """
    total = 0
    for real_maps, fake_maps in zip(fmap_r, fmap_g):
        for real, fake in zip(real_maps, fake_maps):
            total = total + torch.mean(torch.abs(real - fake))

    return total * 2
258
+
259
+
260
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Least-squares discriminator loss.

    For each pair of outputs, adds ``mean((1 - real)^2) + mean(fake^2)``
    to the total.

    Returns:
        ``(loss, r_losses, g_losses)``: the summed loss, plus the per-pair
        real and generated terms as Python floats.
    """
    total = 0
    r_losses, g_losses = [], []
    for real_out, fake_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = torch.mean((1 - real_out) ** 2)
        fake_term = torch.mean(fake_out ** 2)
        pair_term = real_term + fake_term
        total = total + pair_term
        r_losses.append(real_term.item())
        g_losses.append(fake_term.item())

    return total, r_losses, g_losses
272
+
273
+
274
def generator_loss(disc_outputs):
    """Least-squares generator loss.

    Adds ``mean((1 - d)^2)`` for every discriminator output ``d``.

    Returns:
        ``(loss, gen_losses)``: the summed loss and the list of per-output
        terms (kept as tensors, not converted to floats).
    """
    total = 0
    gen_losses = []
    for fake_out in disc_outputs:
        term = torch.mean((1 - fake_out) ** 2)
        gen_losses.append(term)
        total = total + term

    return total, gen_losses
283
+
StyleTTS_Accelerate/Demo/hifi-gan/vocoder_utils.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import os
3
+ import matplotlib
4
+ import torch
5
+ from torch.nn.utils import weight_norm
6
+ matplotlib.use("Agg")
7
+ import matplotlib.pylab as plt
8
+
9
+
10
def plot_spectrogram(spectrogram):
    """Render a 2-D array as an image with a colour bar and return the figure.

    Args:
        spectrogram: array-like accepted by ``Axes.imshow``; drawn with the
            origin at the bottom and no interpolation.

    Returns:
        The matplotlib figure, already drawn and closed in pyplot.
    """
    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                   interpolation='none')
    plt.colorbar(im, ax=ax)

    fig.canvas.draw()
    # Close this specific figure rather than whichever one is "current".
    plt.close(fig)

    return fig
20
+
21
+
22
def init_weights(m, mean=0.0, std=0.01):
    """Re-initialise ``m.weight`` from N(mean, std) when ``m`` is a conv layer.

    A module counts as a conv layer when its class name contains "Conv";
    any other module is left untouched.
    """
    if "Conv" in type(m).__name__:
        m.weight.data.normal_(mean, std)
26
+
27
+
28
def apply_weight_norm(m):
    """Wrap ``m`` with weight norm when its class name contains "Conv".

    Written in the shape expected by ``Module.apply``: it takes one module
    and returns nothing.
    """
    if "Conv" in type(m).__name__:
        weight_norm(m)
32
+
33
+
34
def get_padding(kernel_size, dilation=1):
    """Return ``(kernel_size * dilation - dilation) / 2`` truncated to an int."""
    span = kernel_size * dilation - dilation
    return int(span / 2)
36
+
37
+
38
def load_checkpoint(filepath, device):
    """Load a checkpoint with ``torch.load`` and return the loaded object.

    Args:
        filepath: path of the checkpoint; asserted to be an existing file.
        device: forwarded to ``torch.load`` as ``map_location``.
    """
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    # NOTE(review): torch.load may unpickle data; only use trusted files.
    loaded = torch.load(filepath, map_location=device)
    print("Complete.")
    return loaded
44
+
45
+
46
def save_checkpoint(filepath, obj):
    """Serialise ``obj`` to ``filepath`` with ``torch.save``, logging progress."""
    print("Saving checkpoint to {}".format(filepath))
    torch.save(obj, filepath)
    print("Complete.")
50
+
51
+
52
def scan_checkpoint(cp_dir, prefix):
    """Return the lexicographically last checkpoint path, or ``None``.

    Candidates are the entries of ``cp_dir`` whose name is ``prefix``
    followed by exactly eight characters.
    """
    matches = glob.glob(os.path.join(cp_dir, prefix + '????????'))
    if not matches:
        return None
    return max(matches)
58
+
StyleTTS_Accelerate/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Aaron (Yinghao) Li
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
StyleTTS_Accelerate/LICENSE copy ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Aaron (Yinghao) Li
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
StyleTTS_Accelerate/Models/Anispeech/config.yml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_dir: "Models/Anispeech"
2
+ first_stage_path: "/home/ubuntu/StyleTTS_Accelerate_44khz/Models/Anispeech/epoch_1st_00020.pth"
3
+ save_freq: 1
4
+ log_interval: 10
5
+ device: "cuda"
6
+ multigpu: false
7
+ epochs_1st: 200 # number of epochs for first stage training
8
+ epochs_2nd: 100 # number of epochs for second stage training
9
+ batch_size: 16
10
+ pretrained_model: "/home/ubuntu/StyleTTS_Accelerate_44khz/Models/Anispeech/epoch_2nd_00015.pth"
11
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
12
+ load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
13
+
14
+ diff_epoch: 5
15
+
16
+ train_data: "/home/ubuntu/StyleTTS_Accelerate_44khz/Data/ani_train_only_longs.csv"
17
+ val_data: "/home/ubuntu/StyleTTS_Accelerate_44khz/Data/val_list_libritts.txt"
18
+
19
+ F0_path: "Utils/JDC/bst.t7"
20
+ ASR_config: "Utils/ASR/config.yml"
21
+ ASR_path: "Utils/ASR/epoch_00080.pth"
22
+
23
+ preprocess_params:
24
+ sr: 24000
25
+ spect_params:
26
+ n_fft: 2048
27
+ win_length: 1200
28
+ hop_length: 300
29
+
30
+ model_params:
31
+ hidden_dim: 512
32
+ n_token: 178
33
+ style_dim: 128
34
+ n_layer: 3
35
+ dim_in: 64
36
+ max_conv_dim: 512
37
+ n_mels: 80
38
+ dropout: 0.2
39
+
40
+
41
+ diffusion:
42
+ embedding_mask_proba: 0.1
43
+ # transformer config
44
+ transformer:
45
+ num_layers: 3
46
+ num_heads: 8
47
+ head_features: 64
48
+ multiplier: 2
49
+
50
+ # diffusion distribution config
51
+ dist:
52
+ sigma_data: 0.2 # placeholder used when estimate_sigma_data is set to false
53
+ estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
54
+ mean: -3.0
55
+ std: 1.0
56
+
57
+
58
+ loss_params:
59
+ lambda_mel: 10. # mel reconstruction loss (1st & 2nd stage)
60
+ lambda_adv: 1. # adversarial loss (1st & 2nd stage)
61
+ lambda_reg: 1. # adversarial regularization loss (1st & 2nd stage)
62
+ lambda_fm: 0.1 # feature matching loss (1st & 2nd stage)
63
+
64
+ lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
65
+ lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
66
+ TMA_epoch: 2 # TMA starting epoch (1st stage)
67
+
68
+ # https://github.com/yl4579/StyleTTS/issues/7
69
+ TMA_CEloss: false # whether to use cross-entropy (CE) loss for TMA
70
+
71
+ lambda_F0: 1. # F0 reconstruction loss (2nd stage)
72
+ lambda_norm: 1. # norm reconstruction loss (2nd stage)
73
+ lambda_dur: 1. # duration loss (2nd stage)
74
+ lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
75
+
76
+ lambda_sty: 1. # style reconstruction loss (2nd stage)
77
+ lambda_diff: 1. # score matching loss (2nd stage)
78
+
79
+ optimizer_params:
80
+ lr: 0.0001
StyleTTS_Accelerate/Models/Anispeech/epoch_1st_00020.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:686beb07eebe47a05efbf8f522e35d3000e8eb56e3c3a64fe0c136cd7d8d784d
3
+ size 1322367412
StyleTTS_Accelerate/Models/Anispeech/epoch_2nd_00015.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96f0c34bacfecec841845b92287a553d4c4263d28a4f24111b6223f9cdcaba76
3
+ size 1072227551
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735697608.khodaya-basse-dige.344916.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:285145307d354169a4f365a200060e8835925dd5ec6b15e343d4f5904d8d6840
3
+ size 88
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735697814.khodaya-basse-dige.346056.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c6d38ee3d2509a09e245e3e17b2d741863d352b9b93c0caf36aedf0870d9f05
3
+ size 88
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735698320.khodaya-basse-dige.347680.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61d9e5048dda9c655b4c5a226759c691efccc2557993df16a3751608005ef6ac
3
+ size 7420
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735698764.khodaya-basse-dige.349633.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42a6ca2f240bb8e02ca5c416fb3b48e8f05d553fc96dce7067a5c9701e456538
3
+ size 2678
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735698917.khodaya-basse-dige.350828.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:245e41619fc9245442901624a4f56bdc692b5406bc693fec844efc58931fa314
3
+ size 3006826
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735721417.khodaya-basse-dige.404215.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:178e0cb87908e6a4822482aca1dfe1c289c137c67ffd3cf8115214cd5c12eff4
3
+ size 88
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735721458.khodaya-basse-dige.404475.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fb5701c1192707e8ce5502bcaa1e5c0b23493ca52179857e7e7ca07c005f93d
3
+ size 19924
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735723135.khodaya-basse-dige.409798.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eed71cd3253e86c83a89b290143491939736c50b13e9d4677d9b4aeb4da7bfad
3
+ size 124082
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735736169.khodaya-basse-dige.8849.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cccfbe4590399b86a822d6b053509d7a899d6e0ce4b9ee1df1110f5ba0e04474
3
+ size 278128
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735753783.khodaya-basse-dige.55757.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d76c1a47afb464e5f1abedc14512d3786027064a9384a5e7c26126ff57ec1eb
3
+ size 88
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735753897.khodaya-basse-dige.56741.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31682364e9cd764c6e199af5c2f1ee87a131c4bed8e0bb37536c33e42813b115
3
+ size 88
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735753979.khodaya-basse-dige.58472.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eac6f91079a4c53ebf8b44888512b2af1202a9e4fff41e9c0404adef1c940bb4
3
+ size 88
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735754151.khodaya-basse-dige.59652.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ba702daecc82006aad5078b95a71d8c7ae497c15e0c599148f8163b1e845869
3
+ size 88
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735754204.khodaya-basse-dige.60572.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90c9ac4835b0122b4688e388cb8aad6881e5d35095c06ae40ee21a6855935eaf
3
+ size 88
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755068.khodaya-basse-dige.62584.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f203dd3b75cb53fac9fe52bb0b5f87ffdd0515b75fa95667853772ff6b4b56b
3
+ size 88
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755116.khodaya-basse-dige.63449.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:146856775335bf311b3131a67c83bd394947681454436b8f24ea5870c296b809
3
+ size 88
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755175.khodaya-basse-dige.64734.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29031b8874f604abfc131ca92e7e9c3f35da1b065165010922ee872d5e349fff
3
+ size 88
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755213.khodaya-basse-dige.65681.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30bbc2f8786eeac413bd53068eda56aebcc6d639579dc4160c09e82fe0fbe542
3
+ size 88
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755246.khodaya-basse-dige.66573.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c99617590fc1ca468c5f653147d8b9178392e0c4ef662a5aa5593f6dba60e39
3
+ size 88
StyleTTS_Accelerate/Models/Anispeech/tensorboard/events.out.tfevents.1735755299.khodaya-basse-dige.67690.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a5b293997ed32692a4e9c7741d758712ca26426229b5e9a4558f4d2c861a06a
3
+ size 1038
StyleTTS_Accelerate/Models/Anispeech/train.log ADDED
The diff for this file is too large to render. See raw diff
 
StyleTTS_Accelerate/Models/Anispeech_with_DIFF/config.yml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_dir: "Models/Anispeech_with_DIFF"
2
+ first_stage_path: "/home/ubuntu/StyleTTS_Accelerate_44khz/Models/Anispeech/epoch_1st_00020.pth"
3
+ save_freq: 1
4
+ log_interval: 10
5
+ device: "cuda"
6
+ multigpu: false
7
+ epochs_1st: 200 # number of epochs for first stage training
8
+ epochs_2nd: 100 # number of epochs for second stage training
9
+ batch_size: 32
10
+ pretrained_model: ""
11
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
12
+ load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
13
+
14
+ diff_epoch: 5
15
+
16
+ train_data: "/home/ubuntu/StyleTTS_Accelerate_44khz/Data/ani_train_only_longs.csv"
17
+ val_data: "/home/ubuntu/StyleTTS_Accelerate_44khz/Data/val_list_libritts.txt"
18
+
19
+ F0_path: "Utils/JDC/bst.t7"
20
+ ASR_config: "Utils/ASR/config.yml"
21
+ ASR_path: "Utils/ASR/epoch_00080.pth"
22
+
23
+ preprocess_params:
24
+ sr: 24000
25
+ spect_params:
26
+ n_fft: 2048
27
+ win_length: 1200
28
+ hop_length: 300
29
+
30
+ model_params:
31
+ hidden_dim: 512
32
+ n_token: 178
33
+ style_dim: 128
34
+ n_layer: 3
35
+ dim_in: 64
36
+ max_conv_dim: 512
37
+ n_mels: 80
38
+ dropout: 0.2
39
+
40
+
41
+ diffusion:
42
+ embedding_mask_proba: 0.1
43
+ # transformer config
44
+ transformer:
45
+ num_layers: 3
46
+ num_heads: 8
47
+ head_features: 64
48
+ multiplier: 2
49
+
50
+ # diffusion distribution config
51
+ dist:
52
+ sigma_data: 0.2 # placeholder used when estimate_sigma_data is set to false
53
+ estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
54
+ mean: -3.0
55
+ std: 1.0
56
+
57
+
58
+ loss_params:
59
+ lambda_mel: 10. # mel reconstruction loss (1st & 2nd stage)
60
+ lambda_adv: 1. # adversarial loss (1st & 2nd stage)
61
+ lambda_reg: 1. # adversarial regularization loss (1st & 2nd stage)
62
+ lambda_fm: 0.1 # feature matching loss (1st & 2nd stage)
63
+
64
+ lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
65
+ lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
66
+ TMA_epoch: 2 # TMA starting epoch (1st stage)
67
+
68
+ # https://github.com/yl4579/StyleTTS/issues/7
69
+ TMA_CEloss: false # whether to use cross-entropy (CE) loss for TMA
70
+
71
+ lambda_F0: 1. # F0 reconstruction loss (2nd stage)
72
+ lambda_norm: 1. # norm reconstruction loss (2nd stage)
73
+ lambda_dur: 1. # duration loss (2nd stage)
74
+ lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
75
+
76
+ lambda_sty: 1. # style reconstruction loss (2nd stage)
77
+ lambda_diff: 1. # score matching loss (2nd stage)
78
+
79
+ optimizer_params:
80
+ lr: 0.0001
StyleTTS_Accelerate/Models/Anispeech_with_DIFF/epoch_1st_00040.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e6ddda16cbcd18677f94582b0c60014429ec717ec6ba3ef3819ead0b626a054
3
+ size 1292081189
StyleTTS_Accelerate/Models/Anispeech_with_DIFF/epoch_2nd_00014.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:590b105355609e73ad32a08c138d3d164981b5abaeb548ed0950c404715fca48
3
+ size 1322367412
StyleTTS_Accelerate/Models/Anispeech_with_DIFF/tensorboard/events.out.tfevents.1735755378.khodaya-basse-dige.68815.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a24f333c91e20df18fb006bf9958430e27eca32efbbfd70194e464bbb217ef0
3
+ size 80357
StyleTTS_Accelerate/Models/Anispeech_with_DIFF/tensorboard/events.out.tfevents.1735758983.khodaya-basse-dige.79079.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5567fe7eae9933eafa645b60506c564020381a3cf6ad1522c978715d5aa979be
3
+ size 1486
StyleTTS_Accelerate/Models/Anispeech_with_DIFF/tensorboard/events.out.tfevents.1735759171.khodaya-basse-dige.80201.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:701829cea16b04487744c10ae1463b8559585ba84fbc9c49dc25cafe00ea1f48
3
+ size 563
StyleTTS_Accelerate/Models/Anispeech_with_DIFF/tensorboard/events.out.tfevents.1735759231.khodaya-basse-dige.81123.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f9de06fe3125dc403d9cd0275c4a7878ca8d3a5fc18d2fdc653c38d366debd9
3
+ size 429931