Billpai committed on
Commit f196feb · 1 Parent(s): 507c407
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. egs/svc/MultipleContentsSVC/README.md +153 -0
  2. egs/svc/MultipleContentsSVC/exp_config.json +126 -0
  3. egs/svc/MultipleContentsSVC/run.sh +1 -0
  4. egs/svc/README.md +34 -0
  5. egs/svc/_template/run.sh +150 -0
  6. egs/vocoder/README.md +23 -0
  7. egs/vocoder/diffusion/README.md +0 -0
  8. egs/vocoder/diffusion/exp_config_base.json +0 -0
  9. egs/vocoder/gan/README.md +224 -0
  10. egs/vocoder/gan/_template/run.sh +143 -0
  11. egs/vocoder/gan/apnet/exp_config.json +45 -0
  12. egs/vocoder/gan/apnet/run.sh +143 -0
  13. egs/vocoder/gan/bigvgan/exp_config.json +66 -0
  14. egs/vocoder/gan/bigvgan/run.sh +143 -0
  15. egs/vocoder/gan/bigvgan_large/exp_config.json +70 -0
  16. egs/vocoder/gan/bigvgan_large/run.sh +143 -0
  17. egs/vocoder/gan/exp_config_base.json +111 -0
  18. egs/vocoder/gan/hifigan/exp_config.json +59 -0
  19. egs/vocoder/gan/hifigan/run.sh +143 -0
  20. egs/vocoder/gan/melgan/exp_config.json +34 -0
  21. egs/vocoder/gan/melgan/run.sh +143 -0
  22. egs/vocoder/gan/nsfhifigan/exp_config.json +83 -0
  23. egs/vocoder/gan/nsfhifigan/run.sh +143 -0
  24. egs/vocoder/gan/tfr_enhanced_hifigan/README.md +185 -0
  25. egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json +118 -0
  26. egs/vocoder/gan/tfr_enhanced_hifigan/run.sh +145 -0
  27. examples/chinese_female_recordings.wav +3 -0
  28. examples/chinese_male_seperated.wav +3 -0
  29. examples/english_female_seperated.wav +3 -0
  30. examples/english_male_recordings.wav +3 -0
  31. examples/output/.DS_Store +0 -0
  32. examples/output/chinese_female_recordings_vocalist_l1_JohnMayer.wav +3 -0
  33. examples/output/chinese_male_seperated_vocalist_l1_TaylorSwift.wav +3 -0
  34. examples/output/english_female_seperated_vocalist_l1_汪峰.wav +3 -0
  35. examples/output/english_male_recordings_vocalist_l1_石倚洁.wav +3 -0
  36. models/__init__.py +0 -0
  37. models/base/__init__.py +7 -0
  38. models/base/base_dataset.py +350 -0
  39. models/base/base_inference.py +220 -0
  40. models/base/base_sampler.py +136 -0
  41. models/base/base_trainer.py +348 -0
  42. models/base/new_dataset.py +50 -0
  43. models/base/new_inference.py +249 -0
  44. models/base/new_trainer.py +722 -0
  45. models/svc/__init__.py +0 -0
  46. models/svc/base/__init__.py +7 -0
  47. models/svc/base/svc_dataset.py +425 -0
  48. models/svc/base/svc_inference.py +15 -0
  49. models/svc/base/svc_trainer.py +111 -0
  50. models/svc/comosvc/__init__.py +4 -0
egs/svc/MultipleContentsSVC/README.md ADDED
@@ -0,0 +1,153 @@
1
+ # Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion
2
+
3
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160)
4
+ [![demo](https://img.shields.io/badge/SVC-Demo-red)](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html)
5
+
6
+ <br>
7
+ <div align="center">
8
+ <img src="../../../imgs/svc/MultipleContentsSVC.png" width="85%">
9
+ </div>
10
+ <br>
11
+
12
+ This is the official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Specifically,
13
+
14
+ - The multiple content features are from [Whisper](https://github.com/openai/whisper) and [ContentVec](https://github.com/auspicious3000/contentvec).
15
+ - The acoustic model is based on Bidirectional Non-Causal Dilated CNN (called `DiffWaveNetSVC` in Amphion), which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
16
+ - The vocoder uses the [BigVGAN](https://github.com/NVIDIA/BigVGAN) architecture and was fine-tuned on over 120 hours of singing voice data.
17
+
18
+ There are four stages in total:
19
+
20
+ 1. Data preparation
21
+ 2. Features extraction
22
+ 3. Training
23
+ 4. Inference/conversion
24
+
25
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
26
+ > ```bash
27
+ > cd Amphion
28
+ > ```
29
+
30
+ ## 1. Data Preparation
31
+
32
+ ### Dataset Download
33
+
34
+ By default, we utilize five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
35
+
36
+ ### Configuration
37
+
38
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
39
+
40
+ ```json
41
+ "dataset": [
42
+ "m4singer",
43
+ "opencpop",
44
+ "opensinger",
45
+ "svcc",
46
+ "vctk"
47
+ ],
48
+ "dataset_path": {
49
+ // TODO: Fill in your dataset path
50
+ "m4singer": "[M4Singer dataset path]",
51
+ "opencpop": "[Opencpop dataset path]",
52
+ "opensinger": "[OpenSinger dataset path]",
53
+ "svcc": "[SVCC dataset path]",
54
+ "vctk": "[VCTK dataset path]"
55
+ },
56
+ ```
57
+
58
+ ## 2. Features Extraction
59
+
60
+ ### Content-based Pretrained Models Download
61
+
62
+ By default, we utilize Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
63
+
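+ As a quick sanity check, the default checkpoint locations referenced by this recipe's `exp_config.json` are listed below (adjust them if you store the checkpoints elsewhere):
+
+ ```bash
+ ls pretrained/contentvec/checkpoint_best_legacy_500.pt   # ContentVec
+ ls pretrained/whisper/medium.pt                          # Whisper "medium"
+ # WeNet is disabled by default ("extract_wenet_feature": false); if enabled, it expects:
+ # pretrained/wenet/20220506_u2pp_conformer_exp/final.pt
+ # pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml
+ ```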
64
+ ### Configuration
65
+
66
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
67
+
68
+ ```json
69
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
70
+ "log_dir": "ckpts/svc",
71
+ "preprocess": {
72
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
73
+ "processed_dir": "data",
74
+ ...
75
+ },
76
+ ```
77
+
78
+ ### Run
79
+
80
+ Run `run.sh` as the preprocessing stage (set `--stage 1`).
81
+
82
+ ```bash
83
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 1
84
+ ```
85
+
86
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "1"`.
87
+
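+ For example, a run that preprocesses on GPU 1 instead of the default GPU 0 would be:
+
+ ```bash
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 1 --gpu "1"
+ ```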
88
+ ## 3. Training
89
+
90
+ ### Configuration
91
+
92
+ We provide the default hyperparameters in `exp_config.json`. They work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
93
+
94
+ ```json
95
+ "train": {
96
+ "batch_size": 32,
97
+ ...
98
+ "adamw": {
99
+ "lr": 2.0e-4
100
+ },
101
+ ...
102
+ }
103
+ ```
104
+
105
+ ### Run
106
+
107
+ Run `run.sh` as the training stage (set `--stage 2`), specifying an experiment name for the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
108
+
109
+ ```bash
110
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName]
111
+ ```
112
+
113
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "0,1,2,3"`.
114
+
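+ The recipe's `run.sh` (see `egs/svc/_template/run.sh`) also exposes resume options. As a sketch, you can resume the latest checkpoint of an existing experiment, or load only the model weights from a specific checkpoint (the checkpoint path below is a placeholder):
+
+ ```bash
+ # Resume the latest checkpoint of an existing experiment
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName] --resume true
+
+ # Load only the model weights from a specific checkpoint ("finetune" mode)
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName] \
+ 	--resume_from_ckpt_path [Your checkpoint path] \
+ 	--resume_type finetune
+ ```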
115
+ ## 4. Inference/Conversion
116
+
117
+ ### Pretrained Vocoder Download
118
+
119
+ We fine-tune the official BigVGAN pretrained model on over 120 hours of singing voice data. The benefits of fine-tuning are investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
120
+
121
+ ### Run
122
+
123
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
124
+
125
+ | Parameters | Description | Example |
126
+ | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
127
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
128
+ | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
129
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
130
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
131
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
132
+
133
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
134
+
135
+ ```bash
136
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \
137
+ --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
138
+ --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
139
+ --infer_source_audio_dir [Your Audios Folder] \
140
+ --infer_target_speaker "opencpop_female1" \
141
+ --infer_key_shift "autoshift"
142
+ ```
143
+
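+ Alternatively, here is a sketch of converting the test set of a preprocessed dataset by passing `--infer_source_file` instead of an audio folder (the JSON path follows the default `processed_dir` layout):
+
+ ```bash
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \
+ 	--infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
+ 	--infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
+ 	--infer_source_file Amphion/data/[YourDataset]/test.json \
+ 	--infer_target_speaker "opencpop_female1" \
+ 	--infer_key_shift "autoshift"
+ ```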
144
+ ## Citations
145
+
146
+ ```bibtex
147
+ @article{zhang2023leveraging,
148
+ title={Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion},
149
+ author={Zhang, Xueyao and Gu, Yicheng and Chen, Haopeng and Fang, Zihao and Zou, Lexiao and Xue, Liumeng and Wu, Zhizheng},
150
+ journal={Machine Learning for Audio Workshop, NeurIPS 2023},
151
+ year={2023}
152
+ }
153
+ ```
egs/svc/MultipleContentsSVC/exp_config.json ADDED
@@ -0,0 +1,126 @@
1
+ {
2
+ "base_config": "config/diffusion.json",
3
+ "model_type": "DiffWaveNetSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
20
+ "log_dir": "ckpts/svc",
21
+ "preprocess": {
22
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
23
+ "processed_dir": "data",
24
+ // Config for features extraction
25
+ "extract_mel": true,
26
+ "extract_pitch": true,
27
+ "extract_energy": true,
28
+ "extract_whisper_feature": true,
29
+ "extract_contentvec_feature": true,
30
+ "extract_wenet_feature": false,
31
+ "whisper_batch_size": 30, // decrease it if your GPU is out of memory
32
+ "contentvec_batch_size": 1,
33
+ // Fill in the content-based pretrained model's path
34
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
35
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
36
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
37
+ "whisper_model": "medium",
38
+ "whisper_model_path": "pretrained/whisper/medium.pt",
39
+ // Config for features usage
40
+ "use_mel": true,
41
+ "use_min_max_norm_mel": true,
42
+ "use_frame_pitch": true,
43
+ "use_frame_energy": true,
44
+ "use_spkid": true,
45
+ "use_whisper": true,
46
+ "use_contentvec": true,
47
+ "use_wenet": false,
48
+ "n_mel": 100,
49
+ "sample_rate": 24000
50
+ },
51
+ "model": {
52
+ "condition_encoder": {
53
+ // Config for features usage
54
+ "use_whisper": true,
55
+ "use_contentvec": true,
56
+ "use_wenet": false,
57
+ "whisper_dim": 1024,
58
+ "contentvec_dim": 256,
59
+ "wenet_dim": 512,
60
+ "use_singer_encoder": false,
61
+ "pitch_min": 50,
62
+ "pitch_max": 1100
63
+ },
64
+ "diffusion": {
65
+ "scheduler": "ddpm",
66
+ "scheduler_settings": {
67
+ "num_train_timesteps": 1000,
68
+ "beta_start": 1.0e-4,
69
+ "beta_end": 0.02,
70
+ "beta_schedule": "linear"
71
+ },
72
+ // Diffusion steps encoder
73
+ "step_encoder": {
74
+ "dim_raw_embedding": 128,
75
+ "dim_hidden_layer": 512,
76
+ "activation": "SiLU",
77
+ "num_layer": 2,
78
+ "max_period": 10000
79
+ },
80
+ // Diffusion decoder
81
+ "model_type": "bidilconv",
82
+ // bidilconv, unet2d, TODO: unet1d
83
+ "bidilconv": {
84
+ "base_channel": 512,
85
+ "n_res_block": 40,
86
+ "conv_kernel_size": 3,
87
+ "dilation_cycle_length": 4,
88
+ // specially, 1 means no dilation
89
+ "conditioner_size": 384
90
+ }
91
+ }
92
+ },
93
+ "train": {
94
+ "batch_size": 32,
95
+ "gradient_accumulation_step": 1,
96
+ "max_epoch": -1, // -1 means no limit
97
+ "save_checkpoint_stride": [
98
+ 3,
99
+ 50
100
+ ],
101
+ "keep_last": [
102
+ 3,
103
+ 2
104
+ ],
105
+ "run_eval": [
106
+ true,
107
+ true
108
+ ],
109
+ "adamw": {
110
+ "lr": 2.0e-4
111
+ },
112
+ "reducelronplateau": {
113
+ "factor": 0.8,
114
+ "patience": 30,
115
+ "min_lr": 1.0e-4
116
+ },
117
+ "dataloader": {
118
+ "num_worker": 8,
119
+ "pin_memory": true
120
+ },
121
+ "sampler": {
122
+ "holistic_shuffle": false,
123
+ "drop_last": true
124
+ }
125
+ }
126
+ }
egs/svc/MultipleContentsSVC/run.sh ADDED
@@ -0,0 +1 @@
1
+ ../_template/run.sh
egs/svc/README.md ADDED
@@ -0,0 +1,34 @@
1
+ # Amphion Singing Voice Conversion (SVC) Recipe
2
+
3
+ ## Quick Start
4
+
5
+ We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting-edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html).
6
+
7
+ ## Supported Model Architectures
8
+
9
+ The main idea of SVC is to first disentangle the speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target, usually with an acoustic decoder and a subsequent waveform synthesizer (vocoder):
10
+
11
+ <br>
12
+ <div align="center">
13
+ <img src="../../imgs/svc/pipeline.png" width="70%">
14
+ </div>
15
+ <br>
16
+
17
+ Until now, Amphion SVC has supported the following features and models:
18
+
19
+ - **Speaker-agnostic Representations**:
20
+ - Content Features: Sourcing from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec).
21
+ - Prosody Features: F0 and energy.
22
+ - **Speaker Embeddings**:
23
+ - Speaker Look-Up Table.
24
+ - Reference Encoder (👨‍💻 developing): It can be used for zero-shot SVC.
25
+ - **Acoustic Decoders**:
26
+ - Diffusion-based models:
27
+ - **[DiffWaveNetSVC](MultipleContentsSVC)**: The encoder is based on Bidirectional Non-Causal Dilated CNN, which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
28
+ - **[DiffComoSVC](DiffComoSVC)** (👨‍💻 developing): The diffusion framework is based on [Consistency Model](https://proceedings.mlr.press/v202/song23a.html). It can significantly accelerate the inference process of the diffusion model.
29
+ - Transformer-based models:
30
+ - **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture.
31
+ - VAE- and Flow-based models:
32
+ - **[VitsSVC]()** (👨‍💻 developing): It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
33
+ - **Waveform Synthesizers (Vocoders)**:
34
+ - The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md).
egs/svc/_template/run.sh ADDED
@@ -0,0 +1,150 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
37
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
38
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
39
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
40
+ # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
41
+ --infer_source_file) shift; infer_source_file=$1 ; shift ;;
42
+ --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
43
+ # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1".
44
+ --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
45
+ # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift".
46
+ --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
47
+ # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
48
+ --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;
49
+
50
+ --) shift ; break ;;
51
+ *) echo "Invalid option: $1"; exit 1 ;;
52
+ esac
53
+ done
54
+
55
+
56
+ ### Value check ###
57
+ if [ -z "$running_stage" ]; then
58
+ echo "[Error] Please specify the running stage"
59
+ exit 1
60
+ fi
61
+
62
+ if [ -z "$exp_config" ]; then
63
+ exp_config="${exp_dir}"/exp_config.json
64
+ fi
65
+ echo "Experimental Configuration File: $exp_config"
66
+
67
+ if [ -z "$gpu" ]; then
68
+ gpu="0"
69
+ fi
70
+
71
+ ######## Features Extraction ###########
72
+ if [ $running_stage -eq 1 ]; then
73
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
74
+ --config $exp_config \
75
+ --num_workers 4
76
+ fi
77
+
78
+ ######## Training ###########
79
+ if [ $running_stage -eq 2 ]; then
80
+ if [ -z "$exp_name" ]; then
81
+ echo "[Error] Please specify the experiments name"
82
+ exit 1
83
+ fi
84
+ echo "Experimental Name: $exp_name"
85
+
86
+ if [ "$resume" = true ]; then
87
+ echo "Automatically resume from the experimental dir..."
88
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
89
+ --config "$exp_config" \
90
+ --exp_name "$exp_name" \
91
+ --log_level info \
92
+ --resume
93
+ else
94
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \
95
+ --config "$exp_config" \
96
+ --exp_name "$exp_name" \
97
+ --log_level info \
98
+ --resume_from_ckpt_path "$resume_from_ckpt_path" \
99
+ --resume_type "$resume_type"
100
+ fi
101
+ fi
102
+
103
+ ######## Inference/Conversion ###########
104
+ if [ $running_stage -eq 3 ]; then
105
+ if [ -z "$infer_expt_dir" ]; then
106
+ echo "[Error] Please specify the experiment directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
107
+ exit 1
108
+ fi
109
+
110
+ if [ -z "$infer_output_dir" ]; then
111
+ infer_output_dir="$infer_expt_dir/result"
112
+ fi
113
+
114
+ if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
115
+ echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
116
+ exit 1
117
+ fi
118
+
119
+ if [ -z "$infer_source_file" ]; then
120
+ infer_source=$infer_source_audio_dir
121
+ fi
122
+
123
+ if [ -z "$infer_source_audio_dir" ]; then
124
+ infer_source=$infer_source_file
125
+ fi
126
+
127
+ if [ -z "$infer_target_speaker" ]; then
128
+ echo "[Error] Please specify the target speaker. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1""
129
+ exit 1
130
+ fi
131
+
132
+ if [ -z "$infer_key_shift" ]; then
133
+ infer_key_shift="autoshift"
134
+ fi
135
+
136
+ if [ -z "$infer_vocoder_dir" ]; then
137
+ infer_vocoder_dir="$work_dir"/pretrained/bigvgan
138
+ echo "[Warning] You did not specify infer_vocoder_dir. It is set to $infer_vocoder_dir by default. Make sure that you have followed Amphion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
139
+ fi
140
+
141
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
142
+ --config $exp_config \
143
+ --acoustics_dir $infer_expt_dir \
144
+ --vocoder_dir $infer_vocoder_dir \
145
+ --target_singer $infer_target_speaker \
146
+ --trans_key $infer_key_shift \
147
+ --source $infer_source \
148
+ --output_dir $infer_output_dir \
149
+ --log_level debug
150
+ fi
egs/vocoder/README.md ADDED
@@ -0,0 +1,23 @@
1
+ # Amphion Vocoder Recipe
2
+
3
+ ## Quick Start
4
+
5
+ We provide a [**beginner recipe**](gan/tfr_enhanced_hifigan/README.md) to demonstrate how to train a high-quality HiFi-GAN speech vocoder. Specifically, it is also an official implementation of our paper "[Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder](https://arxiv.org/abs/2311.14957)". Some demos can be seen [here](https://vocodexelysium.github.io/MS-SB-CQTD/).
6
+
7
+ ## Supported Models
8
+
9
+ A neural vocoder generates audible waveforms from acoustic representations and is one of the key components of current audio generation systems. Until now, Amphion has supported various widely-used vocoders of different types, including:
10
+
11
+ - **GAN-based vocoders**, which we have provided [**a unified recipe**](gan/README.md) :
12
+ - [MelGAN](https://arxiv.org/abs/1910.06711)
13
+ - [HiFi-GAN](https://arxiv.org/abs/2010.05646)
14
+ - [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts)
15
+ - [BigVGAN](https://arxiv.org/abs/2206.04658)
16
+ - [APNet](https://arxiv.org/abs/2305.07952)
17
+ - **Flow-based vocoders** (👨‍💻 developing):
18
+ - [WaveGlow](https://arxiv.org/abs/1811.00002)
19
+ - **Diffusion-based vocoders** (👨‍💻 developing):
20
+ - [Diffwave](https://arxiv.org/abs/2009.09761)
21
+ - **Auto-regressive based vocoders** (👨‍💻 developing):
22
+ - [WaveNet](https://arxiv.org/abs/1609.03499)
23
+ - [WaveRNN](https://arxiv.org/abs/1802.08435v1)
egs/vocoder/diffusion/README.md ADDED
File without changes
egs/vocoder/diffusion/exp_config_base.json ADDED
File without changes
egs/vocoder/gan/README.md ADDED
@@ -0,0 +1,224 @@
1
+ # Amphion GAN-based Vocoder Recipe
2
+
3
+ ## Supported Model Architectures
4
+
5
+ A GAN-based vocoder consists of a generator and multiple discriminators, as illustrated below:
6
+
7
+ <br>
8
+ <div align="center">
9
+ <img src="../../../imgs/vocoder/gan/pipeline.png" width="40%">
10
+ </div>
11
+ <br>
12
+
13
+ Until now, Amphion GAN-based Vocoder has supported the following generators and discriminators.
14
+
15
+ - **Generators**
16
+ - [MelGAN](https://arxiv.org/abs/1910.06711)
17
+ - [HiFi-GAN](https://arxiv.org/abs/2010.05646)
18
+ - [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts)
19
+ - [BigVGAN](https://arxiv.org/abs/2206.04658)
20
+ - [APNet](https://arxiv.org/abs/2305.07952)
21
+ - **Discriminators**
22
+ - [Multi-Scale Discriminator](https://arxiv.org/abs/2010.05646)
23
+ - [Multi-Period Discriminator](https://arxiv.org/abs/2010.05646)
24
+ - [Multi-Resolution Discriminator](https://arxiv.org/abs/2011.09631)
25
+ - [Multi-Scale Short-Time Fourier Transform Discriminator](https://arxiv.org/abs/2210.13438)
26
+ - [**Multi-Scale Constant-Q Transform Discriminator (ours)**](https://arxiv.org/abs/2311.14957)
27
+
28
+ You can use any vocoder architecture with any dataset you want. There are four steps in total:
29
+
30
+ 1. Data preparation
31
+ 2. Feature extraction
32
+ 3. Training
33
+ 4. Inference
34
+
35
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
36
+ > ```bash
37
+ > cd Amphion
38
+ > ```
39
+
40
+ ## 1. Data Preparation
41
+
42
+ You can train the vocoder with any dataset. Amphion's supported open-source datasets are detailed [here](../../../datasets/README.md).
43
+
44
+ ### Configuration
45
+
46
+ Specify the dataset path in `exp_config_base.json`. Note that you can change the `dataset` list to use your preferred datasets.
47
+
48
+ ```json
49
+ "dataset": [
50
+ "csd",
51
+ "kising",
52
+ "m4singer",
53
+ "nus48e",
54
+ "opencpop",
55
+ "opensinger",
56
+ "opera",
57
+ "pjs",
58
+ "popbutfy",
59
+ "popcs",
60
+ "ljspeech",
61
+ "vctk",
62
+ "libritts",
63
+ ],
64
+ "dataset_path": {
65
+ // TODO: Fill in your dataset path
66
+ "csd": "[dataset path]",
67
+ "kising": "[dataset path]",
68
+ "m4singer": "[dataset path]",
69
+ "nus48e": "[dataset path]",
70
+ "opencpop": "[dataset path]",
71
+ "opensinger": "[dataset path]",
72
+ "opera": "[dataset path]",
73
+ "pjs": "[dataset path]",
74
+ "popbutfy": "[dataset path]",
75
+ "popcs": "[dataset path]",
76
+ "ljspeech": "[dataset path]",
77
+ "vctk": "[dataset path]",
78
+ "libritts": "[dataset path]",
79
+ },
80
+ ```
81
+
82
+ ## 2. Feature Extraction
83
+
84
+ The needed features are specified in each vocoder's directory, so this step requires no extra modification.
85
+
86
+ ### Configuration
87
+
88
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config_base.json`:
89
+
90
+ ```json
91
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
92
+ "log_dir": "ckpts/vocoder",
93
+ "preprocess": {
94
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
95
+ "processed_dir": "data",
96
+ ...
97
+ },
98
+ ```
99
+
100
+ ### Run
101
+
102
+ Run `run.sh` as the preprocessing stage (set `--stage 1`).
103
+
104
+ ```bash
105
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 1
106
+ ```
107
+
108
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "1"`.
109
+
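+ For instance, preprocessing for the HiFi-GAN recipe on GPU 1 (substitute any supported vocoder directory for `hifigan`) would look like:
+
+ ```bash
+ sh egs/vocoder/gan/hifigan/run.sh --stage 1 --gpu "1"
+ ```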
110
+ ## 3. Training
111
+
112
+ ### Configuration
113
+
114
+ We provide the default hyperparameters in `exp_config_base.json`. They work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
115
+
116
+ ```json
117
+ "train": {
118
+ "batch_size": 16,
119
+ "max_epoch": 1000000,
120
+ "save_checkpoint_stride": [20],
121
+ "adamw": {
122
+ "lr": 2.0e-4,
123
+ "adam_b1": 0.8,
124
+ "adam_b2": 0.99
125
+ },
126
+ "exponential_lr": {
127
+ "lr_decay": 0.999
128
+ },
129
+ }
130
+ ```
131
+
132
+ You can also choose any number of preferred discriminators for training in `exp_config_base.json`.
133
+
134
+ ```json
135
+ "discriminators": [
136
+ "msd",
137
+ "mpd",
138
+ "msstftd",
139
+ "mssbcqtd",
140
+ ],
141
+ ```
142
+
143
+ ### Run
144
+
145
+ Run `run.sh` as the training stage (set `--stage 2`), specifying an experiment name for the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`.
146
+
147
+ ```bash
148
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 2 --name [YourExptName]
149
+ ```
150
+
151
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "0,1,2,3"`.
152
+
153
+
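+ The template `run.sh` also accepts resume options (`--resume`, `--checkpoint`, `--resume_type`). A sketch of resuming a vocoder experiment from its latest checkpoint:
+
+ ```bash
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 2 --name [YourExptName] --resume true
+ ```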
154
+ ## 4. Inference
155
+
156
+ ### Run
157
+
158
+ Run `run.sh` as the inference stage (set `--stage 3`). We provide three inference modes: `infer_from_dataset`, `infer_from_feature`, and `infer_from_audio`.
159
+
160
+ ```bash
161
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
162
+ --infer_mode [Your chosen inference mode] \
163
+ --infer_datasets [Datasets you want to inference, needed when infer_from_dataset] \
164
+ --infer_feature_dir [Your path to your predicted acoustic features, needed when infer_from_feature] \
165
+ --infer_audio_dir [Your path to your audio files, needed when infer_from_audio] \
166
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
167
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
168
+ ```
169
+
170
+ #### a. Inference from Dataset
171
+
172
+ Run `run.sh` with the specified datasets. Here is an example.
173
+
174
+ ```bash
175
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
176
+ --infer_mode infer_from_dataset \
177
+ --infer_datasets "libritts vctk ljspeech" \
178
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
179
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
180
+ ```
181
+
182
+ #### b. Inference from Features
183
+
184
+ If you want to run inference from your generated acoustic features, you should first organize them into the following structure:
185
+
186
+ ```plaintext
187
+ ┣ {infer_feature_dir}
188
+ ┃ ┣ mels
189
+ ┃ ┃ ┣ sample1.npy
190
+ ┃ ┃ ┣ sample2.npy
191
+ ┃ ┣ f0s (required if you use NSF-HiFiGAN)
192
+ ┃ ┃ ┣ sample1.npy
193
+ ┃ ┃ ┣ sample2.npy
194
+ ```
195
+
196
+ Then run `run.sh` with the specified folder path. Here is an example.
197
+
198
+ ```bash
199
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
200
+ --infer_mode infer_from_feature \
201
+ --infer_feature_dir [Your path to your predicted acoustic features] \
202
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
203
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
204
+ ```
205
+
206
+ #### c. Inference from Audios
207
+
208
+ If you want to run inference from audio files for quick analysis-synthesis, you should first organize them into the following structure:
209
+
210
+ ```plaintext
211
+ ┣ audios
212
+ ┃ ┣ sample1.wav
213
+ ┃ ┣ sample2.wav
214
+ ```
215
+
216
+ Then run `run.sh` with the specified folder path. Here is an example.
217
+
218
+ ```bash
219
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
220
+ --infer_mode infer_from_audio \
221
+ --infer_audio_dir [Your path to your audio files] \
222
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
223
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
224
+ ```
egs/vocoder/gan/_template/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Commond ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Experimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Experimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experiment directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/apnet/exp_config.json ADDED
@@ -0,0 +1,45 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+ "extract_amplitude_phase": true,
8
+
9
+ // Features used for model training
10
+ "use_mel": true,
11
+ "use_audio": true,
12
+ "use_amplitude_phase": true
13
+ },
14
+ "model": {
15
+ "generator": "apnet",
16
+ "apnet": {
17
+ "ASP_channel": 512,
18
+ "ASP_resblock_kernel_sizes": [3,7,11],
19
+ "ASP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
20
+ "ASP_input_conv_kernel_size": 7,
21
+ "ASP_output_conv_kernel_size": 7,
22
+
23
+ "PSP_channel": 512,
24
+ "PSP_resblock_kernel_sizes": [3,7,11],
25
+ "PSP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26
+ "PSP_input_conv_kernel_size": 7,
27
+ "PSP_output_R_conv_kernel_size": 7,
28
+ "PSP_output_I_conv_kernel_size": 7,
29
+ }
30
+ },
31
+ "train": {
32
+ "criterions": [
33
+ "feature",
34
+ "discriminator",
35
+ "generator",
36
+ "mel",
37
+ "phase",
38
+ "amplitude",
39
+ "consistency"
40
+ ]
41
+ },
42
+ "inference": {
43
+ "batch_size": 1,
44
+ }
45
+ }
egs/vocoder/gan/apnet/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Commond ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Experimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Experimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experiment directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/bigvgan/exp_config.json ADDED
@@ -0,0 +1,66 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+
8
+ // Features used for model training
9
+ "use_mel": true,
10
+ "use_audio": true
11
+ },
12
+ "model": {
13
+ "generator": "bigvgan",
14
+ "bigvgan": {
15
+ "resblock": "1",
16
+ "activation": "snakebeta",
17
+ "snake_logscale": true,
18
+ "upsample_rates": [
19
+ 8,
20
+ 8,
21
+ 2,
22
+ 2,
23
+ ],
24
+ "upsample_kernel_sizes": [
25
+ 16,
26
+ 16,
27
+ 4,
28
+ 4
29
+ ],
30
+ "upsample_initial_channel": 512,
31
+ "resblock_kernel_sizes": [
32
+ 3,
33
+ 7,
34
+ 11
35
+ ],
36
+ "resblock_dilation_sizes": [
37
+ [
38
+ 1,
39
+ 3,
40
+ 5
41
+ ],
42
+ [
43
+ 1,
44
+ 3,
45
+ 5
46
+ ],
47
+ [
48
+ 1,
49
+ 3,
50
+ 5
51
+ ]
52
+ ]
53
+ }
54
+ },
55
+ "train": {
56
+ "criterions": [
57
+ "feature",
58
+ "discriminator",
59
+ "generator",
60
+ "mel",
61
+ ]
62
+ },
63
+ "inference": {
64
+ "batch_size": 1,
65
+ }
66
+ }
egs/vocoder/gan/bigvgan/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Commond ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; cehckpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1" exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Exprimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Exprimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/bigvgan_large/exp_config.json ADDED
@@ -0,0 +1,70 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+
8
+ // Features used for model training
9
+ "use_mel": true,
10
+ "use_audio": true
11
+ },
12
+ "model": {
13
+ "generator": "bigvgan",
14
+ "bigvgan": {
15
+ "resblock": "1",
16
+ "activation": "snakebeta",
17
+ "snake_logscale": true,
18
+ "upsample_rates": [
19
+ 4,
20
+ 4,
21
+ 2,
22
+ 2,
23
+ 2,
24
+ 2
25
+ ],
26
+ "upsample_kernel_sizes": [
27
+ 8,
28
+ 8,
29
+ 4,
30
+ 4,
31
+ 4,
32
+ 4
33
+ ],
34
+ "upsample_initial_channel": 1536,
35
+ "resblock_kernel_sizes": [
36
+ 3,
37
+ 7,
38
+ 11
39
+ ],
40
+ "resblock_dilation_sizes": [
41
+ [
42
+ 1,
43
+ 3,
44
+ 5
45
+ ],
46
+ [
47
+ 1,
48
+ 3,
49
+ 5
50
+ ],
51
+ [
52
+ 1,
53
+ 3,
54
+ 5
55
+ ]
56
+ ]
57
+ },
58
+ },
59
+ "train": {
60
+ "criterions": [
61
+ "feature",
62
+ "discriminator",
63
+ "generator",
64
+ "mel",
65
+ ]
66
+ },
67
+ "inference": {
68
+ "batch_size": 1,
69
+ }
70
+ }
egs/vocoder/gan/bigvgan_large/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Commond ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Exprimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Exprimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/exp_config_base.json ADDED
@@ -0,0 +1,111 @@
1
+ {
2
+ "base_config": "config/vocoder.json",
3
+ "model_type": "GANVocoder",
4
+ // TODO: Choose your needed datasets
5
+ "dataset": [
6
+ "csd",
7
+ "kising",
8
+ "m4singer",
9
+ "nus48e",
10
+ "opencpop",
11
+ "opensinger",
12
+ "opera",
13
+ "pjs",
14
+ "popbutfy",
15
+ "popcs",
16
+ "ljspeech",
17
+ "vctk",
18
+ "libritts",
19
+ ],
20
+ "dataset_path": {
21
+ // TODO: Fill in your dataset path
22
+ "csd": "[dataset path]",
23
+ "kising": "[dataset path]",
24
+ "m4singer": "[dataset path]",
25
+ "nus48e": "[dataset path]",
26
+ "opencpop": "[dataset path]",
27
+ "opensinger": "[dataset path]",
28
+ "opera": "[dataset path]",
29
+ "pjs": "[dataset path]",
30
+ "popbutfy": "[dataset path]",
31
+ "popcs": "[dataset path]",
32
+ "ljspeech": "[dataset path]",
33
+ "vctk": "[dataset path]",
34
+ "libritts": "[dataset path]",
35
+ },
36
+ // TODO: Fill in the output log path
37
+ "log_dir": "ckpts/vocoder",
38
+ "preprocess": {
39
+ // Acoustic features
40
+ "extract_mel": true,
41
+ "extract_audio": true,
42
+ "extract_pitch": false,
43
+ "extract_uv": false,
44
+ "pitch_extractor": "parselmouth",
45
+
46
+ // Features used for model training
47
+ "use_mel": true,
48
+ "use_frame_pitch": false,
49
+ "use_uv": false,
50
+ "use_audio": true,
51
+
52
+ // TODO: Fill in the output data path
53
+ "processed_dir": "data/",
54
+ "n_mel": 100,
55
+ "sample_rate": 24000
56
+ },
57
+ "model": {
58
+ // TODO: Choose your needed discriminators
59
+ "discriminators": [
60
+ "msd",
61
+ "mpd",
62
+ "msstftd",
63
+ "mssbcqtd",
64
+ ],
65
+ "mpd": {
66
+ "mpd_reshapes": [
67
+ 2,
68
+ 3,
69
+ 5,
70
+ 7,
71
+ 11
72
+ ],
73
+ "use_spectral_norm": false,
74
+ "discriminator_channel_mult_factor": 1
75
+ },
76
+ "mrd": {
77
+ "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
78
+ "use_spectral_norm": false,
79
+ "discriminator_channel_mult_factor": 1,
80
+ "mrd_override": false
81
+ },
82
+ "msstftd": {
83
+ "filters": 32
84
+ },
85
+ "mssbcqtd": {
86
+ "hop_lengths": [512, 256, 256],
87
+ "filters": 32,
88
+ "max_filters": 1024,
89
+ "filters_scale": 1,
90
+ "dilations": [1, 2, 4],
91
+ "in_channels": 1,
92
+ "out_channels": 1,
93
+ "n_octaves": [9, 9, 9],
94
+ "bins_per_octaves": [24, 36, 48]
95
+ },
96
+ },
97
+ "train": {
98
+ // TODO: Choose a suitable batch size, training epoch, and save stride
99
+ "batch_size": 32,
100
+ "max_epoch": 1000000,
101
+ "save_checkpoint_stride": [20],
102
+ "adamw": {
103
+ "lr": 2.0e-4,
104
+ "adam_b1": 0.8,
105
+ "adam_b2": 0.99
106
+ },
107
+ "exponential_lr": {
108
+ "lr_decay": 0.999
109
+ },
110
+ }
111
+ }
egs/vocoder/gan/hifigan/exp_config.json ADDED
@@ -0,0 +1,59 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+
8
+ // Features used for model training
9
+ "use_mel": true,
10
+ "use_audio": true
11
+ },
12
+ "model": {
13
+ "generator": "hifigan",
14
+ "hifigan": {
15
+ "resblock": "2",
16
+ "upsample_rates": [
17
+ 8,
18
+ 8,
19
+ 4
20
+ ],
21
+ "upsample_kernel_sizes": [
22
+ 16,
23
+ 16,
24
+ 8
25
+ ],
26
+ "upsample_initial_channel": 256,
27
+ "resblock_kernel_sizes": [
28
+ 3,
29
+ 5,
30
+ 7
31
+ ],
32
+ "resblock_dilation_sizes": [
33
+ [
34
+ 1,
35
+ 2
36
+ ],
37
+ [
38
+ 2,
39
+ 6
40
+ ],
41
+ [
42
+ 3,
43
+ 12
44
+ ]
45
+ ]
46
+ }
47
+ },
48
+ "train": {
49
+ "criterions": [
50
+ "feature",
51
+ "discriminator",
52
+ "generator",
53
+ "mel",
54
+ ]
55
+ },
56
+ "inference": {
57
+ "batch_size": 1,
58
+ }
59
+ }
egs/vocoder/gan/hifigan/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The datasets to run inference on
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Exprimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Exprimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/melgan/exp_config.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+
8
+ // Features used for model training
9
+ "use_mel": true,
10
+ "use_audio": true
11
+ },
12
+ "model": {
13
+ "generator": "melgan",
14
+ "melgan": {
15
+ "ratios": [8, 8, 2, 2],
16
+ "ngf": 32,
17
+ "n_residual_layers": 3,
18
+ "num_D": 3,
19
+ "ndf": 16,
20
+ "n_layers": 4,
21
+ "downsampling_factor": 4
22
+ },
23
+ },
24
+ "train": {
25
+ "criterions": [
26
+ "feature",
27
+ "discriminator",
28
+ "generator",
29
+ ]
30
+ },
31
+ "inference": {
32
+ "batch_size": 1,
33
+ }
34
+ }
egs/vocoder/gan/melgan/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The datasets to run inference on
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Exprimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Exprimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/nsfhifigan/exp_config.json ADDED
@@ -0,0 +1,83 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+ "extract_pitch": true,
8
+
9
+ // Features used for model training
10
+ "use_mel": true,
11
+ "use_audio": true,
12
+ "use_frame_pitch": true
13
+ },
14
+ "model": {
15
+ "generator": "nsfhifigan",
16
+ "nsfhifigan": {
17
+ "resblock": "1",
18
+ "harmonic_num": 8,
19
+ "upsample_rates": [
20
+ 8,
21
+ 4,
22
+ 2,
23
+ 2,
24
+ 2
25
+ ],
26
+ "upsample_kernel_sizes": [
27
+ 16,
28
+ 8,
29
+ 4,
30
+ 4,
31
+ 4
32
+ ],
33
+ "upsample_initial_channel": 768,
34
+ "resblock_kernel_sizes": [
35
+ 3,
36
+ 7,
37
+ 11
38
+ ],
39
+ "resblock_dilation_sizes": [
40
+ [
41
+ 1,
42
+ 3,
43
+ 5
44
+ ],
45
+ [
46
+ 1,
47
+ 3,
48
+ 5
49
+ ],
50
+ [
51
+ 1,
52
+ 3,
53
+ 5
54
+ ]
55
+ ]
56
+ },
57
+ "mpd": {
58
+ "mpd_reshapes": [
59
+ 2,
60
+ 3,
61
+ 5,
62
+ 7,
63
+ 11,
64
+ 17,
65
+ 23,
66
+ 37
67
+ ],
68
+ "use_spectral_norm": false,
69
+ "discriminator_channel_multi": 1
70
+ }
71
+ },
72
+ "train": {
73
+ "criterions": [
74
+ "feature",
75
+ "discriminator",
76
+ "generator",
77
+ "mel",
78
+ ]
79
+ },
80
+ "inference": {
81
+ "batch_size": 1,
82
+ }
83
+ }
egs/vocoder/gan/nsfhifigan/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The datasets to run inference on
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Exprimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Exprimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/tfr_enhanced_hifigan/README.md ADDED
@@ -0,0 +1,185 @@
1
+ # Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder
2
+
3
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2311.14957)
4
+ [![demo](https://img.shields.io/badge/Vocoder-Demo-red)](https://vocodexelysium.github.io/MS-SB-CQTD/)
5
+
6
+ <br>
7
+ <div align="center">
8
+ <img src="../../../../imgs/vocoder/gan/MSSBCQTD.png" width="80%">
9
+ </div>
10
+ <br>
11
+
12
+ This is the official implementation of the paper "[Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder](https://arxiv.org/abs/2311.14957)". In this recipe, we will illustrate how to train a high-quality HiFi-GAN on LibriTTS, VCTK, and LJSpeech by utilizing multiple Time-Frequency-Representation-based discriminators.
13
+
14
+ There are four stages in total:
15
+
16
+ 1. Data preparation
17
+ 2. Feature extraction
18
+ 3. Training
19
+ 4. Inference
20
+
21
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
22
+ > ```bash
23
+ > cd Amphion
24
+ > ```
25
+
26
+ ## 1. Data Preparation
27
+
28
+ ### Dataset Download
29
+
30
+ By default, we utilize three datasets for training: LibriTTS, VCTK, and LJSpeech. How to download them is detailed [here](../../../datasets/README.md).
31
+
32
+ ### Configuration
33
+
34
+ Specify the dataset path in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
35
+
36
+ ```json
37
+ "dataset": [
38
+ "ljspeech",
39
+ "vctk",
40
+ "libritts",
41
+ ],
42
+ "dataset_path": {
43
+ // TODO: Fill in your dataset path
44
+ "ljspeech": "[LJSpeech dataset path]",
45
+ "vctk": "[VCTK dataset path]",
46
+ "libritts": "[LibriTTS dataset path]",
47
+ },
48
+ ```
49
+
50
+ ## 2. Features Extraction
51
+
52
+ For HiFi-GAN, only the mel-spectrogram and the output audio are needed for training.
53
+
54
+ ### Configuration
55
+
56
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
57
+
58
+ ```json
59
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
60
+ "log_dir": "ckpts/vocoder",
61
+ "preprocess": {
62
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
63
+ "processed_dir": "data",
64
+ ...
65
+ },
66
+ ```
67
+
68
+ ### Run
69
+
70
+ Run `run.sh` as the preprocessing stage (set `--stage 1`).
71
+
72
+ ```bash
73
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 1
74
+ ```
75
+
76
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "1"`.
77
+
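+
+ For reference, the extracted mel-spectrograms follow the `n_mel: 100` and `sample_rate: 24000` settings in `exp_config.json`. The actual extraction is performed by `bins/vocoder/preprocess.py`; the snippet below is only an illustrative sketch of an equivalent computation, and the STFT parameters (`n_fft`, `hop_length`) are assumed values rather than the recipe's own.
+
+ ```python
+ # Illustrative sketch only; the recipe's real extractor is bins/vocoder/preprocess.py.
+ # n_mels and the sampling rate match exp_config.json; n_fft/hop_length are assumptions.
+ import librosa
+ import numpy as np
+
+ y, sr = librosa.load("example.wav", sr=24000)  # resample to the config's 24 kHz
+ mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=256, n_mels=100)
+ log_mel = np.log(np.clip(mel, a_min=1e-5, a_max=None))  # log-mel, shape [n_mels, T]
+ print(log_mel.shape)
+ ```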
78
+ ## 3. Training
79
+
80
+ ### Configuration
81
+
82
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.
83
+
84
+ ```json
85
+ "train": {
86
+ "batch_size": 32,
87
+ ...
88
+ }
89
+ ```
90
+
91
+ ### Run
92
+
93
+ Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`.
94
+
95
+ ```bash
96
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 2 --name [YourExptName]
97
+ ```
98
+
99
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "0,1,2,3"`.
100
+
101
+ ## 4. Inference
102
+
103
+ ### Pretrained Vocoder Download
104
+
105
+ We trained a HiFi-GAN checkpoint on around 685 hours of speech data. The final pretrained checkpoint is released [here](../../../../pretrained/hifigan/README.md).
106
+
107
+ ### Run
108
+
109
+ Run `run.sh` as the inference stage (set `--stage 3`). We provide three different inference modes: `infer_from_dataset`, `infer_from_feature`, and `infer_from_audio`.
110
+
111
+ ```bash
112
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
113
+ --infer_mode [Your chosen inference mode] \
114
+ --infer_datasets [Datasets you want to inference, needed when infer_from_dataset] \
115
+ --infer_feature_dir [Your path to your predicted acoustic features, needed when infer_from_feature] \
116
+ --infer_audio_dir [Your path to your audio files, needed when infer_from_audio] \
117
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
118
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
119
+ ```
120
+
121
+ #### a. Inference from Dataset
122
+
123
+ Run `run.sh` with the specified datasets; here is an example.
124
+
125
+ ```bash
126
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
127
+ --infer_mode infer_from_dataset \
128
+ --infer_datasets "libritts vctk ljspeech" \
129
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
130
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
131
+ ```
132
+
133
+ #### b. Inference from Features
134
+
135
+ If you want to run inference from your generated acoustic features, you should first organize them into the following structure:
136
+
137
+ ```plaintext
138
+ ┣ {infer_feature_dir}
139
+ ┃ ┣ mels
140
+ ┃ ┃ ┣ sample1.npy
141
+ ┃ ┃ ┣ sample2.npy
142
+ ```
143
+
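+
+ Each `.npy` file is expected to hold one mel-spectrogram with `n_mel` (100) bins, consistent with the preprocessing configuration. The sketch below shows one way to dump predicted features into this layout; the `[n_mels, T]` orientation and the folder name are assumptions to verify against your own preprocessed features rather than guaranteed behavior of the inference script.
+
+ ```python
+ # Minimal sketch: save predicted mel-spectrograms into {infer_feature_dir}/mels/*.npy.
+ # The [n_mels, T] orientation is an assumption based on the preprocessed features.
+ import os
+ import numpy as np
+
+ infer_feature_dir = "predicted_features"  # hypothetical folder passed to --infer_feature_dir
+ os.makedirs(os.path.join(infer_feature_dir, "mels"), exist_ok=True)
+
+ predicted_mel = np.random.randn(100, 320).astype(np.float32)  # dummy [n_mels, T] array
+ np.save(os.path.join(infer_feature_dir, "mels", "sample1.npy"), predicted_mel)
+ ```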
144
+ Then run `run.sh` with the specified folder path; here is an example.
145
+
146
+ ```bash
147
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
148
+ --infer_mode infer_from_feature \
149
+ --infer_feature_dir [Your path to your predicted acoustic features] \
150
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
151
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
152
+ ```
153
+
154
+ #### c. Inference from Audios
155
+
156
+ If you want to run inference from audios for a quick analysis-synthesis check, you should first organize your audios into the following structure:
157
+
158
+ ```plaintext
159
+ ┣ audios
160
+ ┃ ┣ sample1.wav
161
+ ┃ ┣ sample2.wav
162
+ ```
163
+
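+
+ Since this recipe is configured for 24 kHz audio, it may help to resample your recordings before running `infer_from_audio`. The snippet below is only a convenience sketch (whether the inference script resamples internally is not covered here), and the folder names are placeholders:
+
+ ```python
+ # Convenience sketch (assumption): resample input recordings to 24 kHz and
+ # place them in the folder that will be passed via --infer_audio_dir.
+ import os
+ import librosa
+ import soundfile as sf
+
+ src_dir = "my_recordings"  # hypothetical source folder
+ dst_dir = "audios"         # folder passed via --infer_audio_dir
+ os.makedirs(dst_dir, exist_ok=True)
+
+ for name in os.listdir(src_dir):
+     if name.endswith(".wav"):
+         y, _ = librosa.load(os.path.join(src_dir, name), sr=24000)  # load and resample
+         sf.write(os.path.join(dst_dir, name), y, 24000)
+ ```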
164
+ Then run `run.sh` with the specified folder path; here is an example.
165
+
166
+ ```bash
167
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
168
+ --infer_mode infer_from_audio \
169
+ --infer_audio_dir [Your path to your audio files] \
170
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
171
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
172
+ ```
173
+
174
+ ## Citations
175
+
176
+ ```bibtex
177
+ @misc{gu2023cqt,
178
+ title={Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder},
179
+ author={Yicheng Gu and Xueyao Zhang and Liumeng Xue and Zhizheng Wu},
180
+ year={2023},
181
+ eprint={2311.14957},
182
+ archivePrefix={arXiv},
183
+ primaryClass={cs.SD}
184
+ }
185
+ ```
egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json ADDED
@@ -0,0 +1,118 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "model_type": "GANVocoder",
4
+ "dataset": [
5
+ "ljspeech",
6
+ "vctk",
7
+ "libritts",
8
+ ],
9
+ "dataset_path": {
10
+ // TODO: Fill in your dataset path
11
+ "ljspeech": "[dataset path]",
12
+ "vctk": "[dataset path]",
13
+ "libritts": "[dataset path]",
14
+ },
15
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
16
+ "log_dir": "ckpts/vocoder",
17
+ "preprocess": {
18
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
19
+ "processed_dir": "data",
20
+ // acoustic features
21
+ "extract_mel": true,
22
+ "extract_audio": true,
23
+ "extract_pitch": false,
24
+ "extract_uv": false,
25
+ "extract_amplitude_phase": false,
26
+ "pitch_extractor": "parselmouth",
27
+ // Features used for model training
28
+ "use_mel": true,
29
+ "use_frame_pitch": false,
30
+ "use_uv": false,
31
+ "use_audio": true,
32
+ "n_mel": 100,
33
+ "sample_rate": 24000
34
+ },
35
+ "model": {
36
+ "generator": "hifigan",
37
+ "discriminators": [
38
+ "msd",
39
+ "mpd",
40
+ "mssbcqtd",
41
+ "msstftd",
42
+ ],
43
+ "hifigan": {
44
+ "resblock": "1",
45
+ "upsample_rates": [
46
+ 8,
47
+ 4,
48
+ 2,
49
+ 2,
50
+ 2
51
+ ],
52
+ "upsample_kernel_sizes": [
53
+ 16,
54
+ 8,
55
+ 4,
56
+ 4,
57
+ 4
58
+ ],
59
+ "upsample_initial_channel": 768,
60
+ "resblock_kernel_sizes": [
61
+ 3,
62
+ 5,
63
+ 7
64
+ ],
65
+ "resblock_dilation_sizes": [
66
+ [
67
+ 1,
68
+ 3,
69
+ 5
70
+ ],
71
+ [
72
+ 1,
73
+ 3,
74
+ 5
75
+ ],
76
+ [
77
+ 1,
78
+ 3,
79
+ 5
80
+ ]
81
+ ]
82
+ },
83
+ "mpd": {
84
+ "mpd_reshapes": [
85
+ 2,
86
+ 3,
87
+ 5,
88
+ 7,
89
+ 11,
90
+ 17,
91
+ 23,
92
+ 37
93
+ ],
94
+ "use_spectral_norm": false,
95
+ "discriminator_channel_multi": 1
96
+ }
97
+ },
98
+ "train": {
99
+ "batch_size": 16,
100
+ "adamw": {
101
+ "lr": 2.0e-4,
102
+ "adam_b1": 0.8,
103
+ "adam_b2": 0.99
104
+ },
105
+ "exponential_lr": {
106
+ "lr_decay": 0.999
107
+ },
108
+ "criterions": [
109
+ "feature",
110
+ "discriminator",
111
+ "generator",
112
+ "mel",
113
+ ]
114
+ },
115
+ "inference": {
116
+ "batch_size": 1,
117
+ }
118
+ }
egs/vocoder/gan/tfr_enhanced_hifigan/run.sh ADDED
@@ -0,0 +1,145 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The datasets to run inference on
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Exprimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Exprimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ echo $infer_datasets
114
+
115
+ if [ $infer_mode = "infer_from_dataset" ]; then
116
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
117
+ --config $exp_config \
118
+ --infer_mode $infer_mode \
119
+ --infer_datasets $infer_datasets \
120
+ --vocoder_dir $infer_expt_dir \
121
+ --output_dir $infer_output_dir \
122
+ --log_level debug
123
+ fi
124
+
125
+ if [ $infer_mode = "infer_from_feature" ]; then
126
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
127
+ --config $exp_config \
128
+ --infer_mode $infer_mode \
129
+ --feature_folder $infer_feature_dir \
130
+ --vocoder_dir $infer_expt_dir \
131
+ --output_dir $infer_output_dir \
132
+ --log_level debug
133
+ fi
134
+
135
+ if [ $infer_mode = "infer_from_audio" ]; then
136
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
137
+ --config $exp_config \
138
+ --infer_mode $infer_mode \
139
+ --audio_folder $infer_audio_dir \
140
+ --vocoder_dir $infer_expt_dir \
141
+ --output_dir $infer_output_dir \
142
+ --log_level debug
143
+ fi
144
+
145
+ fi
examples/chinese_female_recordings.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f710270fe3857211c55aaa1f813e310e68855ff9eabaf5b249537a2d4277cc30
3
+ size 448928
examples/chinese_male_seperated.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:009077a677b23bff3154078930e6c624d218eb0acbe78990bec88f6bf5a6e5de
3
+ size 480044
examples/english_female_seperated.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87e75863ffb4e597467a825d019217e73d64dce1e9635de60a32559ffcb97cf4
3
+ size 1509584
examples/english_male_recordings.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e14ebf1c554ebb25e5169b4bcda36a685538e94c531f303339bad91ff93a2288
3
+ size 251948
examples/output/.DS_Store ADDED
Binary file (6.15 kB). View file
 
examples/output/chinese_female_recordings_vocalist_l1_JohnMayer.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf6d6ef89ba2234fbc64c0ee48f81528cf49717a23a919aa8d0767ada2437113
3
+ size 244268
examples/output/chinese_male_seperated_vocalist_l1_TaylorSwift.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e682abb072246f412133bfa313c6edf863f1d6a6db63022749f74c2c7ef01c7
3
+ size 479788
examples/output/english_female_seperated_vocalist_l1_汪峰.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a03755cfc9aef4d26bda6370d9335625482f22f2c1f3c918dbbec3246213cee2
3
+ size 410668
examples/output/english_male_recordings_vocalist_l1_石倚洁.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e850a0e02f2741185c3d3b642a9c292a3a297cdf262e92333b63adf98af7d450
3
+ size 251948
models/__init__.py ADDED
File without changes
models/base/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .new_trainer import BaseTrainer
7
+ from .new_inference import BaseInference
models/base/base_dataset.py ADDED
@@ -0,0 +1,350 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import numpy as np
8
+ import torch.utils.data
9
+ from torch.nn.utils.rnn import pad_sequence
10
+ from utils.data_utils import *
11
+ from processors.acoustic_extractor import cal_normalized_mel
12
+ from text import text_to_sequence
13
+ from text.text_token_collation import phoneIDCollation
14
+
15
+
16
+ class BaseDataset(torch.utils.data.Dataset):
17
+ def __init__(self, cfg, dataset, is_valid=False):
18
+ """
19
+ Args:
20
+ cfg: config
21
+ dataset: dataset name
22
+ is_valid: whether to use train or valid dataset
23
+ """
24
+
25
+ assert isinstance(dataset, str)
26
+
27
+ # self.data_root = processed_data_dir
28
+ self.cfg = cfg
29
+
30
+ processed_data_dir = os.path.join(cfg.preprocess.processed_dir, dataset)
31
+ meta_file = cfg.preprocess.valid_file if is_valid else cfg.preprocess.train_file
32
+ self.metafile_path = os.path.join(processed_data_dir, meta_file)
33
+ self.metadata = self.get_metadata()
34
+
35
+
36
+
37
+ '''
38
+ load spk2id and utt2spk from json file
39
+ spk2id: {spk1: 0, spk2: 1, ...}
40
+ utt2spk: {dataset_uid: spk1, ...}
41
+ '''
42
+ if cfg.preprocess.use_spkid:
43
+ spk2id_path = os.path.join(processed_data_dir, cfg.preprocess.spk2id)
44
+ with open(spk2id_path, "r") as f:
45
+ self.spk2id = json.load(f)
46
+
47
+ utt2spk_path = os.path.join(processed_data_dir, cfg.preprocess.utt2spk)
48
+ self.utt2spk = dict()
49
+ with open(utt2spk_path, "r") as f:
50
+ for line in f.readlines():
51
+ utt, spk = line.strip().split('\t')
52
+ self.utt2spk[utt] = spk
53
+
54
+
55
+ if cfg.preprocess.use_uv:
56
+ self.utt2uv_path = {}
57
+ for utt_info in self.metadata:
58
+ dataset = utt_info["Dataset"]
59
+ uid = utt_info["Uid"]
60
+ utt = "{}_{}".format(dataset, uid)
61
+ self.utt2uv_path[utt] = os.path.join(
62
+ cfg.preprocess.processed_dir,
63
+ dataset,
64
+ cfg.preprocess.uv_dir,
65
+ uid + ".npy",
66
+ )
67
+
68
+ if cfg.preprocess.use_frame_pitch:
69
+ self.utt2frame_pitch_path = {}
70
+ for utt_info in self.metadata:
71
+ dataset = utt_info["Dataset"]
72
+ uid = utt_info["Uid"]
73
+ utt = "{}_{}".format(dataset, uid)
74
+
75
+ self.utt2frame_pitch_path[utt] = os.path.join(
76
+ cfg.preprocess.processed_dir,
77
+ dataset,
78
+ cfg.preprocess.pitch_dir,
79
+ uid + ".npy",
80
+ )
81
+
82
+ if cfg.preprocess.use_frame_energy:
83
+ self.utt2frame_energy_path = {}
84
+ for utt_info in self.metadata:
85
+ dataset = utt_info["Dataset"]
86
+ uid = utt_info["Uid"]
87
+ utt = "{}_{}".format(dataset, uid)
88
+
89
+ self.utt2frame_energy_path[utt] = os.path.join(
90
+ cfg.preprocess.processed_dir,
91
+ dataset,
92
+ cfg.preprocess.energy_dir,
93
+ uid + ".npy",
94
+ )
95
+
96
+ if cfg.preprocess.use_mel:
97
+ self.utt2mel_path = {}
98
+ for utt_info in self.metadata:
99
+ dataset = utt_info["Dataset"]
100
+ uid = utt_info["Uid"]
101
+ utt = "{}_{}".format(dataset, uid)
102
+
103
+ self.utt2mel_path[utt] = os.path.join(
104
+ cfg.preprocess.processed_dir,
105
+ dataset,
106
+ cfg.preprocess.mel_dir,
107
+ uid + ".npy",
108
+ )
109
+
110
+ if cfg.preprocess.use_linear:
111
+ self.utt2linear_path = {}
112
+ for utt_info in self.metadata:
113
+ dataset = utt_info["Dataset"]
114
+ uid = utt_info["Uid"]
115
+ utt = "{}_{}".format(dataset, uid)
116
+
117
+ self.utt2linear_path[utt] = os.path.join(
118
+ cfg.preprocess.processed_dir,
119
+ dataset,
120
+ cfg.preprocess.linear_dir,
121
+ uid + ".npy",
122
+ )
123
+
124
+ if cfg.preprocess.use_audio:
125
+ self.utt2audio_path = {}
126
+ for utt_info in self.metadata:
127
+ dataset = utt_info["Dataset"]
128
+ uid = utt_info["Uid"]
129
+ utt = "{}_{}".format(dataset, uid)
130
+
131
+ self.utt2audio_path[utt] = os.path.join(
132
+ cfg.preprocess.processed_dir,
133
+ dataset,
134
+ cfg.preprocess.audio_dir,
135
+ uid + ".npy",
136
+ )
137
+ elif cfg.preprocess.use_label:
138
+ self.utt2label_path = {}
139
+ for utt_info in self.metadata:
140
+ dataset = utt_info["Dataset"]
141
+ uid = utt_info["Uid"]
142
+ utt = "{}_{}".format(dataset, uid)
143
+
144
+ self.utt2label_path[utt] = os.path.join(
145
+ cfg.preprocess.processed_dir,
146
+ dataset,
147
+ cfg.preprocess.label_dir,
148
+ uid + ".npy",
149
+ )
150
+ elif cfg.preprocess.use_one_hot:
151
+ self.utt2one_hot_path = {}
152
+ for utt_info in self.metadata:
153
+ dataset = utt_info["Dataset"]
154
+ uid = utt_info["Uid"]
155
+ utt = "{}_{}".format(dataset, uid)
156
+
157
+ self.utt2one_hot_path[utt] = os.path.join(
158
+ cfg.preprocess.processed_dir,
159
+ dataset,
160
+ cfg.preprocess.one_hot_dir,
161
+ uid + ".npy",
162
+ )
163
+
164
+ if cfg.preprocess.use_text or cfg.preprocess.use_phone:
165
+ self.utt2seq = {}
166
+ for utt_info in self.metadata:
167
+ dataset = utt_info["Dataset"]
168
+ uid = utt_info["Uid"]
169
+ utt = "{}_{}".format(dataset, uid)
170
+
171
+ if cfg.preprocess.use_text:
172
+ text = utt_info["Text"]
173
+ sequence = text_to_sequence(text, cfg.preprocess.text_cleaners)
174
+ elif cfg.preprocess.use_phone:
175
+ # load the phoneme sequence from the phone file
176
+ phone_path = os.path.join(processed_data_dir,
177
+ cfg.preprocess.phone_dir,
178
+ uid+'.phone'
179
+ )
180
+ with open(phone_path, 'r') as fin:
181
+ phones = fin.readlines()
182
+ assert len(phones) == 1
183
+ phones = phones[0].strip()
184
+ phones_seq = phones.split(' ')
185
+
186
+ phon_id_collator = phoneIDCollation(cfg, dataset=dataset)
187
+ sequence = phon_id_collator.get_phone_id_sequence(cfg, phones_seq)
188
+
189
+ self.utt2seq[utt] = sequence
190
+
191
+
192
+ def get_metadata(self):
193
+ with open(self.metafile_path, "r", encoding="utf-8") as f:
194
+ metadata = json.load(f)
195
+
196
+ return metadata
197
+
198
+ def get_dataset_name(self):
199
+ return self.metadata[0]["Dataset"]
200
+
201
+ def __getitem__(self, index):
202
+ utt_info = self.metadata[index]
203
+
204
+ dataset = utt_info["Dataset"]
205
+ uid = utt_info["Uid"]
206
+ utt = "{}_{}".format(dataset, uid)
207
+
208
+ single_feature = dict()
209
+
210
+ if self.cfg.preprocess.use_spkid:
211
+ single_feature["spk_id"] = np.array(
212
+ [self.spk2id[self.utt2spk[utt]]], dtype=np.int32
213
+ )
214
+
215
+ if self.cfg.preprocess.use_mel:
216
+ mel = np.load(self.utt2mel_path[utt])
217
+ assert mel.shape[0] == self.cfg.preprocess.n_mel # [n_mels, T]
218
+ if self.cfg.preprocess.use_min_max_norm_mel:
219
+ # do mel norm
220
+ mel = cal_normalized_mel(mel, utt_info["Dataset"], self.cfg.preprocess)
221
+
222
+ if "target_len" not in single_feature.keys():
223
+ single_feature["target_len"] = mel.shape[1]
224
+ single_feature["mel"] = mel.T # [T, n_mels]
225
+
226
+ if self.cfg.preprocess.use_linear:
227
+ linear = np.load(self.utt2linear_path[utt])
228
+ if "target_len" not in single_feature.keys():
229
+ single_feature["target_len"] = linear.shape[1]
230
+ single_feature["linear"] = linear.T # [T, n_linear]
231
+
232
+ if self.cfg.preprocess.use_frame_pitch:
233
+ frame_pitch_path = self.utt2frame_pitch_path[utt]
234
+ frame_pitch = np.load(frame_pitch_path)
235
+ if "target_len" not in single_feature.keys():
236
+ single_feature["target_len"] = len(frame_pitch)
237
+ aligned_frame_pitch = align_length(
238
+ frame_pitch, single_feature["target_len"]
239
+ )
240
+ single_feature["frame_pitch"] = aligned_frame_pitch
241
+
242
+ if self.cfg.preprocess.use_uv:
243
+ frame_uv_path = self.utt2uv_path[utt]
244
+ frame_uv = np.load(frame_uv_path)
245
+ aligned_frame_uv = align_length(frame_uv, single_feature["target_len"])
246
+ aligned_frame_uv = [
247
+ 0 if frame_uv else 1 for frame_uv in aligned_frame_uv
248
+ ]
249
+ aligned_frame_uv = np.array(aligned_frame_uv)
250
+ single_feature["frame_uv"] = aligned_frame_uv
251
+
252
+ if self.cfg.preprocess.use_frame_energy:
253
+ frame_energy_path = self.utt2frame_energy_path[utt]
254
+ frame_energy = np.load(frame_energy_path)
255
+ if "target_len" not in single_feature.keys():
256
+ single_feature["target_len"] = len(frame_energy)
257
+ aligned_frame_energy = align_length(
258
+ frame_energy, single_feature["target_len"]
259
+ )
260
+ single_feature["frame_energy"] = aligned_frame_energy
261
+
262
+ if self.cfg.preprocess.use_audio:
263
+ audio = np.load(self.utt2audio_path[utt])
264
+ single_feature["audio"] = audio
265
+ single_feature["audio_len"] = audio.shape[0]
266
+
267
+ if self.cfg.preprocess.use_phone or self.cfg.preprocess.use_text:
268
+ single_feature["phone_seq"] = np.array(self.utt2seq[utt])
269
+ single_feature["phone_len"] = len(self.utt2seq[utt])
270
+
271
+ return single_feature
272
+
273
+ def __len__(self):
274
+ return len(self.metadata)
275
+
276
+
277
+ class BaseCollator(object):
278
+ """Zero-pads model inputs and targets based on number of frames per step"""
279
+
280
+ def __init__(self, cfg):
281
+ self.cfg = cfg
282
+
283
+ def __call__(self, batch):
284
+ packed_batch_features = dict()
285
+
286
+ # mel: [b, T, n_mels]
287
+ # frame_pitch, frame_energy: [1, T]
288
+ # target_len: [1]
289
+ # spk_id: [b, 1]
290
+ # mask: [b, T, 1]
291
+
292
+ for key in batch[0].keys():
293
+ if key == "target_len":
294
+ packed_batch_features["target_len"] = torch.LongTensor(
295
+ [b["target_len"] for b in batch]
296
+ )
297
+ masks = [
298
+ torch.ones((b["target_len"], 1), dtype=torch.long) for b in batch
299
+ ]
300
+ packed_batch_features["mask"] = pad_sequence(
301
+ masks, batch_first=True, padding_value=0
302
+ )
303
+ elif key == "phone_len":
304
+ packed_batch_features["phone_len"] = torch.LongTensor(
305
+ [b["phone_len"] for b in batch]
306
+ )
307
+ masks = [
308
+ torch.ones((b["phone_len"], 1), dtype=torch.long) for b in batch
309
+ ]
310
+ packed_batch_features["phn_mask"] = pad_sequence(
311
+ masks, batch_first=True, padding_value=0
312
+ )
313
+ elif key == "audio_len":
314
+ packed_batch_features["audio_len"] = torch.LongTensor(
315
+ [b["audio_len"] for b in batch]
316
+ )
317
+ masks = [
318
+ torch.ones((b["audio_len"], 1), dtype=torch.long) for b in batch
319
+ ]
320
+ else:
321
+ values = [torch.from_numpy(b[key]) for b in batch]
322
+ packed_batch_features[key] = pad_sequence(
323
+ values, batch_first=True, padding_value=0
324
+ )
325
+ return packed_batch_features
326
+
327
+
328
+ class BaseTestDataset(torch.utils.data.Dataset):
329
+ def __init__(self, cfg, args):
330
+ raise NotImplementedError
331
+
332
+
333
+ def get_metadata(self):
334
+ raise NotImplementedError
335
+
336
+ def __getitem__(self, index):
337
+ raise NotImplementedError
338
+
339
+ def __len__(self):
340
+ return len(self.metadata)
341
+
342
+
343
+ class BaseTestCollator(object):
344
+ """Zero-pads model inputs and targets based on number of frames per step"""
345
+
346
+ def __init__(self, cfg):
347
+ raise NotImplementedError
348
+
349
+ def __call__(self, batch):
350
+ raise NotImplementedError
models/base/base_inference.py ADDED
@@ -0,0 +1,220 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import os
8
+ import re
9
+ import time
10
+ from pathlib import Path
11
+
12
+ import torch
13
+ from torch.utils.data import DataLoader
14
+ from tqdm import tqdm
15
+
16
+ from models.vocoders.vocoder_inference import synthesis
18
+ from utils.util import set_all_random_seed
19
+ from utils.util import load_config
20
+
21
+
22
+ def parse_vocoder(vocoder_dir):
23
+ r"""Parse vocoder config"""
24
+ vocoder_dir = os.path.abspath(vocoder_dir)
25
+ ckpt_list = [ckpt for ckpt in Path(vocoder_dir).glob("*.pt")]
26
+ ckpt_list.sort(key=lambda x: int(x.stem), reverse=True)
27
+ ckpt_path = str(ckpt_list[0])
28
+ vocoder_cfg = load_config(os.path.join(vocoder_dir, "args.json"), lowercase=True)
29
+ vocoder_cfg.model.bigvgan = vocoder_cfg.vocoder
30
+ return vocoder_cfg, ckpt_path
31
+
32
+
33
+ class BaseInference(object):
34
+ def __init__(self, cfg, args):
35
+ self.cfg = cfg
36
+ self.args = args
37
+ self.model_type = cfg.model_type
38
+ self.avg_rtf = list()
39
+ set_all_random_seed(10086)
40
+ os.makedirs(args.output_dir, exist_ok=True)
41
+
42
+ if torch.cuda.is_available():
43
+ self.device = torch.device("cuda")
44
+ else:
45
+ self.device = torch.device("cpu")
46
+ torch.set_num_threads(10)  # limit the number of CPU threads used for inference
47
+
48
+ # Load acoustic model
49
+ self.model = self.create_model().to(self.device)
50
+ state_dict = self.load_state_dict()
51
+ self.load_model(state_dict)
52
+ self.model.eval()
53
+
54
+ # Load vocoder model if necessary
55
+ if self.args.checkpoint_dir_vocoder is not None:
56
+ self.get_vocoder_info()
57
+
58
+ def create_model(self):
59
+ raise NotImplementedError
60
+
61
+ def load_state_dict(self):
62
+ self.checkpoint_file = self.args.checkpoint_file
63
+ if self.checkpoint_file is None:
64
+ assert self.args.checkpoint_dir is not None
65
+ checkpoint_path = os.path.join(self.args.checkpoint_dir, "checkpoint")
66
+ checkpoint_filename = open(checkpoint_path).readlines()[-1].strip()
67
+ self.checkpoint_file = os.path.join(
68
+ self.args.checkpoint_dir, checkpoint_filename
69
+ )
70
+
71
+ self.checkpoint_dir = os.path.split(self.checkpoint_file)[0]
72
+
73
+ print("Restore acoustic model from {}".format(self.checkpoint_file))
74
+ raw_state_dict = torch.load(self.checkpoint_file, map_location=self.device)
75
+ self.am_restore_step = re.findall(r"step-(.+?)_loss", self.checkpoint_file)[0]
76
+
77
+ return raw_state_dict
78
+
79
+ def load_model(self, model):
80
+ raise NotImplementedError
81
+
82
+ def get_vocoder_info(self):
83
+ self.checkpoint_dir_vocoder = self.args.checkpoint_dir_vocoder
84
+ self.vocoder_cfg = os.path.join(
85
+ os.path.dirname(self.checkpoint_dir_vocoder), "args.json"
86
+ )
87
+ self.cfg.vocoder = load_config(self.vocoder_cfg, lowercase=True)
88
+ self.vocoder_tag = self.checkpoint_dir_vocoder.split("/")[-2].split(":")[-1]
89
+ self.vocoder_steps = self.checkpoint_dir_vocoder.split("/")[-1].split(".")[0]
90
+
91
+ def build_test_utt_data(self):
92
+ raise NotImplementedError
93
+
94
+ def build_testdata_loader(self, args, target_speaker=None):
95
+ datasets, collate = self.build_test_dataset()
96
+ self.test_dataset = datasets(self.cfg, args, target_speaker)
97
+ self.test_collate = collate(self.cfg)
98
+ self.test_batch_size = min(
99
+ self.cfg.train.batch_size, len(self.test_dataset.metadata)
100
+ )
101
+ test_loader = DataLoader(
102
+ self.test_dataset,
103
+ collate_fn=self.test_collate,
104
+ num_workers=self.args.num_workers,
105
+ batch_size=self.test_batch_size,
106
+ shuffle=False,
107
+ )
108
+ return test_loader
109
+
110
+ def inference_each_batch(self, batch_data):
111
+ raise NotImplementedError
112
+
113
+ def inference_for_batches(self, args, target_speaker=None):
114
+ ###### Construct test_batch ######
115
+ loader = self.build_testdata_loader(args, target_speaker)
116
+
117
+ n_batch = len(loader)
118
+ now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
119
+ print(
120
+ "Model eval time: {}, batch_size = {}, n_batch = {}".format(
121
+ now, self.test_batch_size, n_batch
122
+ )
123
+ )
124
+ self.model.eval()
125
+
126
+ ###### Inference for each batch ######
127
+ pred_res = []
128
+ with torch.no_grad():
129
+ for i, batch_data in enumerate(loader if n_batch == 1 else tqdm(loader)):
130
+ # Put the data to device
131
+ for k, v in batch_data.items():
132
+ batch_data[k] = batch_data[k].to(self.device)
133
+
134
+ y_pred, stats = self.inference_each_batch(batch_data)
135
+
136
+ pred_res += y_pred
137
+
138
+ return pred_res
139
+
140
+ def inference(self, feature):
141
+ raise NotImplementedError
142
+
143
+ def synthesis_by_vocoder(self, pred):
144
+ audios_pred = synthesis(
145
+ self.vocoder_cfg,
146
+ self.checkpoint_dir_vocoder,
147
+ len(pred),
148
+ pred,
149
+ )
150
+ return audios_pred
151
+
152
+ def __call__(self, utt):
153
+ feature = self.build_test_utt_data(utt)
154
+ start_time = time.time()
155
+ with torch.no_grad():
156
+ outputs = self.inference(feature)[0]
157
+ time_used = time.time() - start_time
158
+ rtf = time_used / (
159
+ outputs.shape[1]
160
+ * self.cfg.preprocess.hop_size
161
+ / self.cfg.preprocess.sample_rate
162
+ )
163
+ print("Time used: {:.3f}, RTF: {:.4f}".format(time_used, rtf))
164
+ self.avg_rtf.append(rtf)
165
+ audios = outputs.cpu().squeeze().numpy().reshape(-1, 1)
166
+ return audios
167
+
168
+
169
+ def base_parser():
170
+ parser = argparse.ArgumentParser()
171
+ parser.add_argument(
172
+ "--config", default="config.json", help="json files for configurations."
173
+ )
174
+ parser.add_argument("--use_ddp_inference", default=False)
175
+ parser.add_argument("--n_workers", default=1, type=int)
176
+ parser.add_argument("--local_rank", default=-1, type=int)
177
+ parser.add_argument(
178
+ "--batch_size", default=1, type=int, help="Batch size for inference"
179
+ )
180
+ parser.add_argument(
181
+ "--num_workers",
182
+ default=1,
183
+ type=int,
184
+ help="Worker number for inference dataloader",
185
+ )
186
+ parser.add_argument(
187
+ "--checkpoint_dir",
188
+ type=str,
189
+ default=None,
190
+ help="Checkpoint dir including model file and configuration",
191
+ )
192
+ parser.add_argument(
193
+ "--checkpoint_file", help="checkpoint file", type=str, default=None
194
+ )
195
+ parser.add_argument(
196
+ "--test_list", help="test utterance list for testing", type=str, default=None
197
+ )
198
+ parser.add_argument(
199
+ "--checkpoint_dir_vocoder",
200
+ help="Vocoder's checkpoint dir including model file and configuration",
201
+ type=str,
202
+ default=None,
203
+ )
204
+ parser.add_argument(
205
+ "--output_dir",
206
+ type=str,
207
+ default=None,
208
+ help="Output dir for saving generated results",
209
+ )
210
+ return parser
211
+
212
+
213
+ if __name__ == "__main__":
214
+ parser = base_parser()
215
+ args = parser.parse_args()
216
+ cfg = load_config(args.config)
217
+
218
+ # Build inference
219
+ inference = BaseInference(cfg, args)
220
+ inference()
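The real-time factor (RTF) printed by `BaseInference.__call__` above is the inference wall-clock time divided by the duration of the generated audio. A hedged arithmetic sketch, with illustrative values for `hop_size` and `sample_rate` (they come from `cfg.preprocess` in the actual code):

```python
# Illustrative values only; hop_size/sample_rate are assumptions, not the repo's defaults.
frames = 1000                                    # generated mel frames (outputs.shape[1])
hop_size, sample_rate = 256, 24000
audio_seconds = frames * hop_size / sample_rate  # ~10.67 s of synthesized audio
time_used = 2.0                                  # seconds spent in model inference
rtf = time_used / audio_seconds                  # ~0.19, i.e. ~5x faster than real time
print("RTF: {:.4f}".format(rtf))
```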
models/base/base_sampler.py ADDED
@@ -0,0 +1,136 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+ import random
8
+
9
+ from torch.utils.data import ConcatDataset, Dataset
10
+ from torch.utils.data.sampler import (
11
+ BatchSampler,
12
+ RandomSampler,
13
+ Sampler,
14
+ SequentialSampler,
15
+ )
16
+
17
+
18
+ class ScheduledSampler(Sampler):
19
+ """A sampler that samples data from a given concat-dataset.
20
+
21
+ Args:
22
+ concat_dataset (ConcatDataset): a concatenated dataset consisting of all datasets
23
+ batch_size (int): batch size
24
+ holistic_shuffle (bool): whether to shuffle the whole dataset or not
25
+ logger (logging.Logger): logger to print warning message
26
+
27
+ Usage:
28
+ For cfg.train.batch_size = 3, cfg.train.holistic_shuffle = False, cfg.train.drop_last = True:
29
+ >>> list(ScheduledSampler(ConcatDataset([[0, 1, 2], [3, 4, 5], [6, 7, 8]])))
30
+ [3, 4, 5, 0, 1, 2, 6, 7, 8]
31
+ """
32
+
33
+ def __init__(
34
+ self,
35
+ concat_dataset,
36
+ batch_size,
37
+ holistic_shuffle,
38
+ logger=None,
39
+ loader_type="train",
40
+ ):
41
+ if not isinstance(concat_dataset, ConcatDataset):
42
+ raise ValueError(
43
+ "concat_dataset must be an instance of ConcatDataset, but got {}".format(
44
+ type(concat_dataset)
45
+ )
46
+ )
47
+ if not isinstance(batch_size, int):
48
+ raise ValueError(
49
+ "batch_size must be an integer, but got {}".format(type(batch_size))
50
+ )
51
+ if not isinstance(holistic_shuffle, bool):
52
+ raise ValueError(
53
+ "holistic_shuffle must be a boolean, but got {}".format(
54
+ type(holistic_shuffle)
55
+ )
56
+ )
57
+
58
+ self.concat_dataset = concat_dataset
59
+ self.batch_size = batch_size
60
+ self.holistic_shuffle = holistic_shuffle
61
+
62
+ affected_dataset_name = []
63
+ affected_dataset_len = []
64
+ for dataset in concat_dataset.datasets:
65
+ dataset_len = len(dataset)
66
+ dataset_name = dataset.get_dataset_name()
67
+ if dataset_len < batch_size:
68
+ affected_dataset_name.append(dataset_name)
69
+ affected_dataset_len.append(dataset_len)
70
+
71
+ self.type = loader_type
72
+ for dataset_name, dataset_len in zip(
73
+ affected_dataset_name, affected_dataset_len
74
+ ):
75
+ if not loader_type == "valid":
76
+ logger.warning(
77
+ "The {} dataset {} has a length of {}, which is smaller than the batch size {}. This may cause unexpected behavior.".format(
78
+ loader_type, dataset_name, dataset_len, batch_size
79
+ )
80
+ )
81
+
82
+ def __len__(self):
83
+ # the number of batches with drop last
84
+ num_of_batches = sum(
85
+ [
86
+ math.floor(len(dataset) / self.batch_size)
87
+ for dataset in self.concat_dataset.datasets
88
+ ]
89
+ )
90
+ # if samples are not enough for one batch, we don't drop last
91
+ if self.type == "valid" and num_of_batches < 1:
92
+ return len(self.concat_dataset)
93
+ return num_of_batches * self.batch_size
94
+
95
+ def __iter__(self):
96
+ iters = []
97
+ for dataset in self.concat_dataset.datasets:
98
+ iters.append(
99
+ SequentialSampler(dataset).__iter__()
100
+ if not self.holistic_shuffle
101
+ else RandomSampler(dataset).__iter__()
102
+ )
103
+ # e.g. [0, 200, 400]
104
+ init_indices = [0] + self.concat_dataset.cumulative_sizes[:-1]
105
+ output_batches = []
106
+ for dataset_idx in range(len(self.concat_dataset.datasets)):
107
+ cur_batch = []
108
+ for idx in iters[dataset_idx]:
109
+ cur_batch.append(idx + init_indices[dataset_idx])
110
+ if len(cur_batch) == self.batch_size:
111
+ output_batches.append(cur_batch)
112
+ cur_batch = []
113
+ # if loader_type is valid, we don't need to drop last
114
+ if self.type == "valid" and len(cur_batch) > 0:
115
+ output_batches.append(cur_batch)
116
+
117
+ # force drop last in training
118
+ random.shuffle(output_batches)
119
+ output_indices = [item for sublist in output_batches for item in sublist]
120
+ return iter(output_indices)
121
+
122
+
123
+ def build_samplers(concat_dataset: Dataset, cfg, logger, loader_type):
124
+ sampler = ScheduledSampler(
125
+ concat_dataset,
126
+ cfg.train.batch_size,
127
+ cfg.train.sampler.holistic_shuffle,
128
+ logger,
129
+ loader_type,
130
+ )
131
+ batch_sampler = BatchSampler(
132
+ sampler,
133
+ cfg.train.batch_size,
134
+ cfg.train.sampler.drop_last if not loader_type == "valid" else False,
135
+ )
136
+ return sampler, batch_sampler
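A runnable toy example (not part of this commit) of how `ScheduledSampler` batches a `ConcatDataset`: each sub-dataset is batched on its own, then the batch order is shuffled. The `ToyDataset` class is hypothetical; only the sampler and its module path come from the code above.

```python
import logging

from torch.utils.data import ConcatDataset, Dataset

from models.base.base_sampler import ScheduledSampler  # module path added in this commit


class ToyDataset(Dataset):
    """Hypothetical dataset exposing the get_dataset_name() hook the sampler expects."""

    def __init__(self, name, items):
        self.name, self.items = name, items

    def get_dataset_name(self):
        return self.name

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        return self.items[idx]


concat = ConcatDataset(
    [ToyDataset("a", ["a0", "a1", "a2"]), ToyDataset("b", ["b0", "b1", "b2"])]
)
sampler = ScheduledSampler(
    concat, batch_size=3, holistic_shuffle=False, logger=logging.getLogger(__name__)
)
print(list(sampler))  # e.g. [3, 4, 5, 0, 1, 2]: per-dataset batches, shuffled batch order
```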
models/base/base_trainer.py ADDED
@@ -0,0 +1,348 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import collections
7
+ import json
8
+ import os
9
+ import sys
10
+ import time
11
+
12
+ import torch
13
+ import torch.distributed as dist
14
+ from torch.nn.parallel import DistributedDataParallel
15
+ from torch.utils.data import ConcatDataset, DataLoader
16
+ from torch.utils.tensorboard import SummaryWriter
17
+
18
+ from models.base.base_sampler import BatchSampler
19
+ from utils.util import (
20
+ Logger,
21
+ remove_older_ckpt,
22
+ save_config,
23
+ set_all_random_seed,
24
+ ValueWindow,
25
+ )
26
+
27
+
28
+ class BaseTrainer(object):
29
+ def __init__(self, args, cfg):
30
+ self.args = args
31
+ self.log_dir = args.log_dir
32
+ self.cfg = cfg
33
+
34
+ self.checkpoint_dir = os.path.join(args.log_dir, "checkpoints")
35
+ os.makedirs(self.checkpoint_dir, exist_ok=True)
36
+ if not cfg.train.ddp or args.local_rank == 0:
37
+ self.sw = SummaryWriter(os.path.join(args.log_dir, "events"))
38
+ self.logger = self.build_logger()
39
+ self.time_window = ValueWindow(50)
40
+
41
+ self.step = 0
42
+ self.epoch = -1
43
+ self.max_epochs = self.cfg.train.epochs
44
+ self.max_steps = self.cfg.train.max_steps
45
+
46
+ # set random seed & init distributed training
47
+ set_all_random_seed(self.cfg.train.random_seed)
48
+ if cfg.train.ddp:
49
+ dist.init_process_group(backend="nccl")
50
+
51
+ if cfg.model_type not in ["AutoencoderKL", "AudioLDM"]:
52
+ self.singers = self.build_singers_lut()
53
+
54
+ # setup data_loader
55
+ self.data_loader = self.build_data_loader()
56
+
57
+ # setup model & enable distributed training
58
+ self.model = self.build_model()
59
+ print(self.model)
60
+
61
+ if isinstance(self.model, dict):
62
+ for key, value in self.model.items():
63
+ value.cuda(self.args.local_rank)
64
+ if key == "PQMF":
65
+ continue
66
+ if cfg.train.ddp:
67
+ self.model[key] = DistributedDataParallel(
68
+ value, device_ids=[self.args.local_rank]
69
+ )
70
+ else:
71
+ self.model.cuda(self.args.local_rank)
72
+ if cfg.train.ddp:
73
+ self.model = DistributedDataParallel(
74
+ self.model, device_ids=[self.args.local_rank]
75
+ )
76
+
77
+ # create criterion
78
+ self.criterion = self.build_criterion()
79
+ if isinstance(self.criterion, dict):
80
+ for key, value in self.criterion.items():
81
+ self.criterion[key].cuda(args.local_rank)
82
+ else:
83
+ self.criterion.cuda(self.args.local_rank)
84
+
85
+ # optimizer
86
+ self.optimizer = self.build_optimizer()
87
+ self.scheduler = self.build_scheduler()
88
+
89
+ # save config file
90
+ self.config_save_path = os.path.join(self.checkpoint_dir, "args.json")
91
+
92
+ def build_logger(self):
93
+ log_file = os.path.join(self.checkpoint_dir, "train.log")
94
+ logger = Logger(log_file, level=self.args.log_level).logger
95
+
96
+ return logger
97
+
98
+ def build_dataset(self):
99
+ raise NotImplementedError
100
+
101
+ def build_data_loader(self):
102
+ Dataset, Collator = self.build_dataset()
103
+ # build dataset instance for each dataset and combine them by ConcatDataset
104
+ datasets_list = []
105
+ for dataset in self.cfg.dataset:
106
+ subdataset = Dataset(self.cfg, dataset, is_valid=False)
107
+ datasets_list.append(subdataset)
108
+ train_dataset = ConcatDataset(datasets_list)
109
+
110
+ train_collate = Collator(self.cfg)
111
+ # TODO: multi-GPU training
112
+ if self.cfg.train.ddp:
113
+ raise NotImplementedError("DDP is not supported yet.")
114
+
115
+ # sampler will provide indices to batch_sampler, which will perform batching and yield batch indices
116
+ batch_sampler = BatchSampler(
117
+ cfg=self.cfg, concat_dataset=train_dataset, dataset_list=datasets_list
118
+ )
119
+
120
+ # use batch_sampler argument instead of (sampler, shuffle, drop_last, batch_size)
121
+ train_loader = DataLoader(
122
+ train_dataset,
123
+ collate_fn=train_collate,
124
+ num_workers=self.args.num_workers,
125
+ batch_sampler=batch_sampler,
126
+ pin_memory=False,
127
+ )
128
+ if not self.cfg.train.ddp or self.args.local_rank == 0:
129
+ datasets_list = []
130
+ for dataset in self.cfg.dataset:
131
+ subdataset = Dataset(self.cfg, dataset, is_valid=True)
132
+ datasets_list.append(subdataset)
133
+ valid_dataset = ConcatDataset(datasets_list)
134
+ valid_collate = Collator(self.cfg)
135
+ batch_sampler = BatchSampler(
136
+ cfg=self.cfg, concat_dataset=valid_dataset, dataset_list=datasets_list
137
+ )
138
+ valid_loader = DataLoader(
139
+ valid_dataset,
140
+ collate_fn=valid_collate,
141
+ num_workers=1,
142
+ batch_sampler=batch_sampler,
143
+ )
144
+ else:
145
+ raise NotImplementedError("DDP is not supported yet.")
146
+ # valid_loader = None
147
+ data_loader = {"train": train_loader, "valid": valid_loader}
148
+ return data_loader
149
+
150
+ def build_singers_lut(self):
151
+ # combine singers
152
+ if not os.path.exists(os.path.join(self.log_dir, self.cfg.preprocess.spk2id)):
153
+ singers = collections.OrderedDict()
154
+ else:
155
+ with open(
156
+ os.path.join(self.log_dir, self.cfg.preprocess.spk2id), "r"
157
+ ) as singer_file:
158
+ singers = json.load(singer_file)
159
+ singer_count = len(singers)
160
+ for dataset in self.cfg.dataset:
161
+ singer_lut_path = os.path.join(
162
+ self.cfg.preprocess.processed_dir, dataset, self.cfg.preprocess.spk2id
163
+ )
164
+ with open(singer_lut_path, "r") as singer_lut_path:
165
+ singer_lut = json.load(singer_lut_path)
166
+ for singer in singer_lut.keys():
167
+ if singer not in singers:
168
+ singers[singer] = singer_count
169
+ singer_count += 1
170
+ with open(
171
+ os.path.join(self.log_dir, self.cfg.preprocess.spk2id), "w"
172
+ ) as singer_file:
173
+ json.dump(singers, singer_file, indent=4, ensure_ascii=False)
174
+ print(
175
+ "singers have been dumped to {}".format(
176
+ os.path.join(self.log_dir, self.cfg.preprocess.spk2id)
177
+ )
178
+ )
179
+ return singers
180
+
181
+ def build_model(self):
182
+ raise NotImplementedError()
183
+
184
+ def build_optimizer(self):
185
+ raise NotImplementedError
186
+
187
+ def build_scheduler(self):
188
+ raise NotImplementedError()
189
+
190
+ def build_criterion(self):
191
+ raise NotImplementedError
192
+
193
+ def get_state_dict(self):
194
+ raise NotImplementedError
195
+
196
+ def save_config_file(self):
197
+ save_config(self.config_save_path, self.cfg)
198
+
199
+ # TODO, save without module.
200
+ def save_checkpoint(self, state_dict, saved_model_path):
201
+ torch.save(state_dict, saved_model_path)
202
+
203
+ def load_checkpoint(self):
204
+ checkpoint_path = os.path.join(self.checkpoint_dir, "checkpoint")
205
+ assert os.path.exists(checkpoint_path)
206
+ checkpoint_filename = open(checkpoint_path).readlines()[-1].strip()
207
+ model_path = os.path.join(self.checkpoint_dir, checkpoint_filename)
208
+ assert os.path.exists(model_path)
209
+ if not self.cfg.train.ddp or self.args.local_rank == 0:
210
+ self.logger.info(f"Re(store) from {model_path}")
211
+ checkpoint = torch.load(model_path, map_location="cpu")
212
+ return checkpoint
213
+
214
+ def load_model(self, checkpoint):
215
+ raise NotImplementedError
216
+
217
+ def restore(self):
218
+ checkpoint = self.load_checkpoint()
219
+ self.load_model(checkpoint)
220
+
221
+ def train_step(self, data):
222
+ raise NotImplementedError(
223
+ f"Need to implement function {sys._getframe().f_code.co_name} in "
224
+ f"your sub-class of {self.__class__.__name__}. "
225
+ )
226
+
227
+ @torch.no_grad()
228
+ def eval_step(self):
229
+ raise NotImplementedError(
230
+ f"Need to implement function {sys._getframe().f_code.co_name} in "
231
+ f"your sub-class of {self.__class__.__name__}. "
232
+ )
233
+
234
+ def write_summary(self, losses, stats):
235
+ raise NotImplementedError(
236
+ f"Need to implement function {sys._getframe().f_code.co_name} in "
237
+ f"your sub-class of {self.__class__.__name__}. "
238
+ )
239
+
240
+ def write_valid_summary(self, losses, stats):
241
+ raise NotImplementedError(
242
+ f"Need to implement function {sys._getframe().f_code.co_name} in "
243
+ f"your sub-class of {self.__class__.__name__}. "
244
+ )
245
+
246
+ def echo_log(self, losses, mode="Training"):
247
+ message = [
248
+ "{} - Epoch {} Step {}: [{:.3f} s/step]".format(
249
+ mode, self.epoch + 1, self.step, self.time_window.average
250
+ )
251
+ ]
252
+
253
+ for key in sorted(losses.keys()):
254
+ if isinstance(losses[key], dict):
255
+ for k, v in losses[key].items():
256
+ message.append(
257
+ str(k).split("/")[-1] + "=" + str(round(float(v), 5))
258
+ )
259
+ else:
260
+ message.append(
261
+ str(key).split("/")[-1] + "=" + str(round(float(losses[key]), 5))
262
+ )
263
+ self.logger.info(", ".join(message))
264
+
265
+ def eval_epoch(self):
266
+ self.logger.info("Validation...")
267
+ valid_losses = {}
268
+ for i, batch_data in enumerate(self.data_loader["valid"]):
269
+ for k, v in batch_data.items():
270
+ if isinstance(v, torch.Tensor):
271
+ batch_data[k] = v.cuda()
272
+ valid_loss, valid_stats, total_valid_loss = self.eval_step(batch_data, i)
273
+ for key in valid_loss:
274
+ if key not in valid_losses:
275
+ valid_losses[key] = 0
276
+ valid_losses[key] += valid_loss[key]
277
+
278
+ # Add mel and audio to the Tensorboard
279
+ # Average loss
280
+ for key in valid_losses:
281
+ valid_losses[key] /= i + 1
282
+ self.echo_log(valid_losses, "Valid")
283
+ return valid_losses, valid_stats
284
+
285
+ def train_epoch(self):
286
+ for i, batch_data in enumerate(self.data_loader["train"]):
287
+ start_time = time.time()
288
+ # Put the data to cuda device
289
+ for k, v in batch_data.items():
290
+ if isinstance(v, torch.Tensor):
291
+ batch_data[k] = v.cuda(self.args.local_rank)
292
+
293
+ # Training step
294
+ train_losses, train_stats, total_loss = self.train_step(batch_data)
295
+ self.time_window.append(time.time() - start_time)
296
+
297
+ if self.args.local_rank == 0 or not self.cfg.train.ddp:
298
+ if self.step % self.args.stdout_interval == 0:
299
+ self.echo_log(train_losses, "Training")
300
+
301
+ if self.step % self.cfg.train.save_summary_steps == 0:
302
+ self.logger.info(f"Save summary as step {self.step}")
303
+ self.write_summary(train_losses, train_stats)
304
+
305
+ if (
306
+ self.step % self.cfg.train.save_checkpoints_steps == 0
307
+ and self.step != 0
308
+ ):
309
+ saved_model_name = "step-{:07d}_loss-{:.4f}.pt".format(
310
+ self.step, total_loss
311
+ )
312
+ saved_model_path = os.path.join(
313
+ self.checkpoint_dir, saved_model_name
314
+ )
315
+ saved_state_dict = self.get_state_dict()
316
+ self.save_checkpoint(saved_state_dict, saved_model_path)
317
+ self.save_config_file()
318
+ # keep max n models
319
+ remove_older_ckpt(
320
+ saved_model_name,
321
+ self.checkpoint_dir,
322
+ max_to_keep=self.cfg.train.keep_checkpoint_max,
323
+ )
324
+
325
+ if self.step != 0 and self.step % self.cfg.train.valid_interval == 0:
326
+ if isinstance(self.model, dict):
327
+ for key in self.model.keys():
328
+ self.model[key].eval()
329
+ else:
330
+ self.model.eval()
331
+ # Evaluate one epoch and get average loss
332
+ valid_losses, valid_stats = self.eval_epoch()
333
+ if isinstance(self.model, dict):
334
+ for key in self.model.keys():
335
+ self.model[key].train()
336
+ else:
337
+ self.model.train()
338
+ # Write validation losses to summary.
339
+ self.write_valid_summary(valid_losses, valid_stats)
340
+ self.step += 1
341
+
342
+ def train(self):
343
+ for epoch in range(max(0, self.epoch), self.max_epochs):
344
+ self.train_epoch()
345
+ self.epoch += 1
346
+ if self.step > self.max_steps:
347
+ self.logger.info("Training finished!")
348
+ break
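Two details of `BaseTrainer` above fit together with `BaseInference` earlier in this diff: `train_epoch` names checkpoints as `step-{:07d}_loss-{:.4f}.pt`, and `BaseInference.load_state_dict` recovers the step with the regex `step-(.+?)_loss`. A small sketch with illustrative numbers:

```python
# Illustrative values only; the format string and regex are taken from the code above.
import re

saved_model_name = "step-{:07d}_loss-{:.4f}.pt".format(100000, 0.1234)
print(saved_model_name)                                     # step-0100000_loss-0.1234.pt
print(re.findall(r"step-(.+?)_loss", saved_model_name)[0])  # 0100000
```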
models/base/new_dataset.py ADDED
@@ -0,0 +1,50 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+ from abc import abstractmethod
9
+ from pathlib import Path
10
+
11
+ import json5
12
+ import torch
13
+ import yaml
14
+
15
+
16
+ # TODO: for training and validating
17
+ class BaseDataset(torch.utils.data.Dataset):
18
+ r"""Base dataset for training and validating."""
19
+
20
+ def __init__(self, args, cfg, is_valid=False):
21
+ pass
22
+
23
+
24
+ class BaseTestDataset(torch.utils.data.Dataset):
25
+ r"""Test dataset for inference."""
26
+
27
+ def __init__(self, args=None, cfg=None, infer_type="from_dataset"):
28
+ assert infer_type in ["from_dataset", "from_file"]
29
+
30
+ self.args = args
31
+ self.cfg = cfg
32
+ self.infer_type = infer_type
33
+
34
+ @abstractmethod
35
+ def __getitem__(self, index):
36
+ pass
37
+
38
+ def __len__(self):
39
+ return len(self.metadata)
40
+
41
+ def get_metadata(self):
42
+ path = Path(self.args.source)
43
+ if path.suffix == ".json" or path.suffix == ".jsonc":
44
+ metadata = json5.load(open(self.args.source, "r"))
45
+ elif path.suffix == ".yaml" or path.suffix == ".yml":
46
+ metadata = yaml.full_load(open(self.args.source, "r"))
47
+ else:
48
+ raise ValueError(f"Unsupported file type: {path.suffix}")
49
+
50
+ return metadata
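`BaseTestDataset.get_metadata` above only fixes the container format (json/json5 or yaml); the entry schema is defined by the concrete datasets. The snippet below is a hedged sketch of what a JSON source could look like: only the `Uid` key is grounded (the inferencer added next indexes metadata entries by `"Uid"`), everything else about the file is hypothetical.

```python
import json5

# Hypothetical --source content: a list of entries, each carrying at least a "Uid".
metadata = json5.loads(
    """
    [
        {"Uid": "utt_0001"},
        {"Uid": "utt_0002"}
    ]
    """
)
print(metadata[0]["Uid"])  # utt_0001
```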
models/base/new_inference.py ADDED
@@ -0,0 +1,249 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import random
8
+ import re
9
+ import time
10
+ from abc import abstractmethod
11
+ from pathlib import Path
12
+
13
+ import accelerate
14
+ import json5
15
+ import numpy as np
16
+ import torch
17
+ from accelerate.logging import get_logger
18
+ from torch.utils.data import DataLoader
19
+
20
+ from models.vocoders.vocoder_inference import synthesis
21
+ from utils.io import save_audio
22
+ from utils.util import load_config
23
+ from utils.audio_slicer import is_silence
24
+
25
+ EPS = 1.0e-12
26
+
27
+
28
+ class BaseInference(object):
29
+ def __init__(self, args=None, cfg=None, infer_type="from_dataset"):
30
+ super().__init__()
31
+
32
+ start = time.monotonic_ns()
33
+ self.args = args
34
+ self.cfg = cfg
35
+
36
+ assert infer_type in ["from_dataset", "from_file"]
37
+ self.infer_type = infer_type
38
+
39
+ # init with accelerate
40
+ self.accelerator = accelerate.Accelerator()
41
+ self.accelerator.wait_for_everyone()
42
+
43
+ # Use accelerate logger for distributed inference
44
+ with self.accelerator.main_process_first():
45
+ self.logger = get_logger("inference", log_level=args.log_level)
46
+
47
+ # Log some info
48
+ self.logger.info("=" * 56)
49
+ self.logger.info("||\t\t" + "New inference process started." + "\t\t||")
50
+ self.logger.info("=" * 56)
51
+ self.logger.info("\n")
52
+ self.logger.debug(f"Using {args.log_level.upper()} logging level.")
53
+
54
+ self.acoustics_dir = args.acoustics_dir
55
+ self.logger.debug(f"Acoustic dir: {args.acoustics_dir}")
56
+ self.vocoder_dir = args.vocoder_dir
57
+ self.logger.debug(f"Vocoder dir: {args.vocoder_dir}")
58
+ # should be in svc inferencer
59
+ # self.target_singer = args.target_singer
60
+ # self.logger.info(f"Target singers: {args.target_singer}")
61
+ # self.trans_key = args.trans_key
62
+ # self.logger.info(f"Trans key: {args.trans_key}")
63
+
64
+ os.makedirs(args.output_dir, exist_ok=True)
65
+
66
+ # set random seed
67
+ with self.accelerator.main_process_first():
68
+ start = time.monotonic_ns()
69
+ self._set_random_seed(self.cfg.train.random_seed)
70
+ end = time.monotonic_ns()
71
+ self.logger.debug(
72
+ f"Setting random seed done in {(end - start) / 1e6:.2f}ms"
73
+ )
74
+ self.logger.debug(f"Random seed: {self.cfg.train.random_seed}")
75
+
76
+ # setup data_loader
77
+ with self.accelerator.main_process_first():
78
+ self.logger.info("Building dataset...")
79
+ start = time.monotonic_ns()
80
+ self.test_dataloader = self._build_dataloader()
81
+ end = time.monotonic_ns()
82
+ self.logger.info(f"Building dataset done in {(end - start) / 1e6:.2f}ms")
83
+
84
+ # setup model
85
+ with self.accelerator.main_process_first():
86
+ self.logger.info("Building model...")
87
+ start = time.monotonic_ns()
88
+ self.model = self._build_model()
89
+ end = time.monotonic_ns()
90
+ # self.logger.debug(self.model)
91
+ self.logger.info(f"Building model done in {(end - start) / 1e6:.3f}ms")
92
+
93
+ # init with accelerate
94
+ self.logger.info("Initializing accelerate...")
95
+ start = time.monotonic_ns()
96
+ self.accelerator = accelerate.Accelerator()
97
+ self.model = self.accelerator.prepare(self.model)
98
+ end = time.monotonic_ns()
99
+ self.accelerator.wait_for_everyone()
100
+ self.logger.info(f"Initializing accelerate done in {(end - start) / 1e6:.3f}ms")
101
+
102
+ with self.accelerator.main_process_first():
103
+ self.logger.info("Loading checkpoint...")
104
+ start = time.monotonic_ns()
105
+ # TODO: Also, suppose only use latest one yet
106
+ self.__load_model(os.path.join(args.acoustics_dir, "checkpoint"))
107
+ end = time.monotonic_ns()
108
+ self.logger.info(f"Loading checkpoint done in {(end - start) / 1e6:.3f}ms")
109
+
110
+ self.model.eval()
111
+ self.accelerator.wait_for_everyone()
112
+
113
+ ### Abstract methods ###
114
+ @abstractmethod
115
+ def _build_test_dataset(self):
116
+ pass
117
+
118
+ @abstractmethod
119
+ def _build_model(self):
120
+ pass
121
+
122
+ @abstractmethod
123
+ @torch.inference_mode()
124
+ def _inference_each_batch(self, batch_data):
125
+ pass
126
+
127
+ ### Abstract methods end ###
128
+
129
+ @torch.inference_mode()
130
+ def inference(self):
131
+ for i, batch in enumerate(self.test_dataloader):
132
+ y_pred = self._inference_each_batch(batch).cpu()
133
+ mel_min, mel_max = self.test_dataset.target_mel_extrema
134
+ y_pred = (y_pred + 1.0) / 2.0 * (mel_max - mel_min + EPS) + mel_min
135
+ y_ls = y_pred.chunk(self.test_batch_size)
136
+ tgt_ls = batch["target_len"].cpu().chunk(self.test_batch_size)
137
+ j = 0
138
+ for it, l in zip(y_ls, tgt_ls):
139
+ l = l.item()
140
+ it = it.squeeze(0)[:l]
141
+ uid = self.test_dataset.metadata[i * self.test_batch_size + j]["Uid"]
142
+ torch.save(it, os.path.join(self.args.output_dir, f"{uid}.pt"))
143
+ j += 1
144
+
145
+ vocoder_cfg, vocoder_ckpt = self._parse_vocoder(self.args.vocoder_dir)
146
+
147
+ res = synthesis(
148
+ cfg=vocoder_cfg,
149
+ vocoder_weight_file=vocoder_ckpt,
150
+ n_samples=None,
151
+ pred=[
152
+ torch.load(
153
+ os.path.join(self.args.output_dir, "{}.pt".format(i["Uid"]))
154
+ ).numpy(force=True)
155
+ for i in self.test_dataset.metadata
156
+ ],
157
+ )
158
+
159
+ output_audio_files = []
160
+ for it, wav in zip(self.test_dataset.metadata, res):
161
+ uid = it["Uid"]
162
+ file = os.path.join(self.args.output_dir, f"{uid}.wav")
163
+ output_audio_files.append(file)
164
+
165
+ wav = wav.numpy(force=True)
166
+ save_audio(
167
+ file,
168
+ wav,
169
+ self.cfg.preprocess.sample_rate,
170
+ add_silence=False,
171
+ turn_up=not is_silence(wav, self.cfg.preprocess.sample_rate),
172
+ )
173
+ os.remove(os.path.join(self.args.output_dir, f"{uid}.pt"))
174
+
175
+ return sorted(output_audio_files)
176
+
177
+ # TODO: LEGACY CODE
178
+ def _build_dataloader(self):
179
+ datasets, collate = self._build_test_dataset()
180
+ self.test_dataset = datasets(self.args, self.cfg, self.infer_type)
181
+ self.test_collate = collate(self.cfg)
182
+ self.test_batch_size = min(
183
+ self.cfg.train.batch_size, len(self.test_dataset.metadata)
184
+ )
185
+ test_dataloader = DataLoader(
186
+ self.test_dataset,
187
+ collate_fn=self.test_collate,
188
+ num_workers=1,
189
+ batch_size=self.test_batch_size,
190
+ shuffle=False,
191
+ )
192
+ return test_dataloader
193
+
194
+ def __load_model(self, checkpoint_dir: str = None, checkpoint_path: str = None):
195
+ r"""Load model from checkpoint. If checkpoint_path is None, it will
196
+ load the latest checkpoint in checkpoint_dir. If checkpoint_path is not
197
+ None, it will load the checkpoint specified by checkpoint_path. **Only use this
198
+ method after** ``accelerator.prepare()``.
199
+ """
200
+ if checkpoint_path is None:
201
+ ls = []
202
+ for i in Path(checkpoint_dir).iterdir():
203
+ if re.match(r"epoch-\d+_step-\d+_loss-[\d.]+", str(i.stem)):
204
+ ls.append(i)
205
+ ls.sort(
206
+ key=lambda x: int(x.stem.split("_")[-3].split("-")[-1]), reverse=True
207
+ )
208
+ checkpoint_path = ls[0]
209
+ else:
210
+ checkpoint_path = Path(checkpoint_path)
211
+ self.accelerator.load_state(str(checkpoint_path))
212
+ # set epoch and step
213
+ self.epoch = int(checkpoint_path.stem.split("_")[-3].split("-")[-1])
214
+ self.step = int(checkpoint_path.stem.split("_")[-2].split("-")[-1])
215
+ return str(checkpoint_path)
216
+
217
+ @staticmethod
218
+ def _set_random_seed(seed):
219
+ r"""Set random seed for all possible random modules."""
220
+ random.seed(seed)
221
+ np.random.seed(seed)
222
+ torch.random.manual_seed(seed)
223
+
224
+ @staticmethod
225
+ def _parse_vocoder(vocoder_dir):
226
+ r"""Parse vocoder config"""
227
+ vocoder_dir = os.path.abspath(vocoder_dir)
228
+ ckpt_list = [ckpt for ckpt in Path(vocoder_dir).glob("*.pt")]
229
+ ckpt_list.sort(key=lambda x: int(x.stem), reverse=True)
230
+ ckpt_path = str(ckpt_list[0])
231
+ vocoder_cfg = load_config(
232
+ os.path.join(vocoder_dir, "args.json"), lowercase=True
233
+ )
234
+ return vocoder_cfg, ckpt_path
235
+
236
+ @staticmethod
237
+ def __count_parameters(model):
238
+ return sum(p.numel() for p in model.parameters())
239
+
240
+ def __dump_cfg(self, path):
241
+ os.makedirs(os.path.dirname(path), exist_ok=True)
242
+ json5.dump(
243
+ self.cfg,
244
+ open(path, "w"),
245
+ indent=4,
246
+ sort_keys=True,
247
+ ensure_ascii=False,
248
+ quote_keys=True,
249
+ )
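The accelerate-based trainer added next saves each checkpoint as a directory named `epoch-{:04d}_step-{:07d}_loss-{:.6f}`, and `__load_model` above restores the epoch and step by splitting that stem. A sketch with illustrative numbers:

```python
# Illustrative values; the naming pattern and the split logic mirror the code in this diff.
stem = "epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(12, 34000, 0.123456)
print(stem)                                       # epoch-0012_step-0034000_loss-0.123456
epoch = int(stem.split("_")[-3].split("-")[-1])   # 12
step = int(stem.split("_")[-2].split("-")[-1])    # 34000
```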
models/base/new_trainer.py ADDED
@@ -0,0 +1,722 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+ import random
9
+ import shutil
10
+ import time
11
+ from abc import abstractmethod
12
+ from pathlib import Path
13
+
14
+ import accelerate
15
+ import json5
16
+ import numpy as np
17
+ import torch
18
+ from accelerate.logging import get_logger
19
+ from accelerate.utils import ProjectConfiguration
20
+ from torch.utils.data import ConcatDataset, DataLoader
21
+ from tqdm import tqdm
22
+
23
+ from models.base.base_sampler import build_samplers
24
+ from optimizer.optimizers import NoamLR
25
+
26
+
27
+ class BaseTrainer(object):
28
+ r"""The base trainer for all tasks. Any trainer should inherit from this class."""
29
+
30
+ def __init__(self, args=None, cfg=None):
31
+ super().__init__()
32
+
33
+ self.args = args
34
+ self.cfg = cfg
35
+
36
+ cfg.exp_name = args.exp_name
37
+
38
+ # init with accelerate
39
+ self._init_accelerator()
40
+ self.accelerator.wait_for_everyone()
41
+
42
+ # Use accelerate logger for distributed training
43
+ with self.accelerator.main_process_first():
44
+ self.logger = get_logger(args.exp_name, log_level=args.log_level)
45
+
46
+ # Log some info
47
+ self.logger.info("=" * 56)
48
+ self.logger.info("||\t\t" + "New training process started." + "\t\t||")
49
+ self.logger.info("=" * 56)
50
+ self.logger.info("\n")
51
+ self.logger.debug(f"Using {args.log_level.upper()} logging level.")
52
+ self.logger.info(f"Experiment name: {args.exp_name}")
53
+ self.logger.info(f"Experiment directory: {self.exp_dir}")
54
+ self.checkpoint_dir = os.path.join(self.exp_dir, "checkpoint")
55
+ if self.accelerator.is_main_process:
56
+ os.makedirs(self.checkpoint_dir, exist_ok=True)
57
+ self.logger.debug(f"Checkpoint directory: {self.checkpoint_dir}")
58
+
59
+ # init counts
60
+ self.batch_count: int = 0
61
+ self.step: int = 0
62
+ self.epoch: int = 0
63
+ self.max_epoch = (
64
+ self.cfg.train.max_epoch if self.cfg.train.max_epoch > 0 else float("inf")
65
+ )
66
+ self.logger.info(
67
+ "Max epoch: {}".format(
68
+ self.max_epoch if self.max_epoch < float("inf") else "Unlimited"
69
+ )
70
+ )
71
+
72
+ # Check values
73
+ if self.accelerator.is_main_process:
74
+ self.__check_basic_configs()
75
+ # Set runtime configs
76
+ self.save_checkpoint_stride = self.cfg.train.save_checkpoint_stride
77
+ self.checkpoints_path = [
78
+ [] for _ in range(len(self.save_checkpoint_stride))
79
+ ]
80
+ self.keep_last = [
81
+ i if i > 0 else float("inf") for i in self.cfg.train.keep_last
82
+ ]
83
+ self.run_eval = self.cfg.train.run_eval
84
+
85
+ # set random seed
86
+ with self.accelerator.main_process_first():
87
+ start = time.monotonic_ns()
88
+ self._set_random_seed(self.cfg.train.random_seed)
89
+ end = time.monotonic_ns()
90
+ self.logger.debug(
91
+ f"Setting random seed done in {(end - start) / 1e6:.2f}ms"
92
+ )
93
+ self.logger.debug(f"Random seed: {self.cfg.train.random_seed}")
94
+
95
+ # setup data_loader
96
+ with self.accelerator.main_process_first():
97
+ self.logger.info("Building dataset...")
98
+ start = time.monotonic_ns()
99
+ self.train_dataloader, self.valid_dataloader = self._build_dataloader()
100
+ end = time.monotonic_ns()
101
+ self.logger.info(f"Building dataset done in {(end - start) / 1e6:.2f}ms")
102
+
103
+ # setup model
104
+ with self.accelerator.main_process_first():
105
+ self.logger.info("Building model...")
106
+ start = time.monotonic_ns()
107
+ self.model = self._build_model()
108
+ end = time.monotonic_ns()
109
+ self.logger.debug(self.model)
110
+ self.logger.info(f"Building model done in {(end - start) / 1e6:.2f}ms")
111
+ self.logger.info(
112
+ f"Model parameters: {self.__count_parameters(self.model)/1e6:.2f}M"
113
+ )
114
+ # optimizer & scheduler
115
+ with self.accelerator.main_process_first():
116
+ self.logger.info("Building optimizer and scheduler...")
117
+ start = time.monotonic_ns()
118
+ self.optimizer = self.__build_optimizer()
119
+ self.scheduler = self.__build_scheduler()
120
+ end = time.monotonic_ns()
121
+ self.logger.info(
122
+ f"Building optimizer and scheduler done in {(end - start) / 1e6:.2f}ms"
123
+ )
124
+
125
+ # accelerate prepare
126
+ self.logger.info("Initializing accelerate...")
127
+ start = time.monotonic_ns()
128
+ (
129
+ self.train_dataloader,
130
+ self.valid_dataloader,
131
+ self.model,
132
+ self.optimizer,
133
+ self.scheduler,
134
+ ) = self.accelerator.prepare(
135
+ self.train_dataloader,
136
+ self.valid_dataloader,
137
+ self.model,
138
+ self.optimizer,
139
+ self.scheduler,
140
+ )
141
+ end = time.monotonic_ns()
142
+ self.logger.info(f"Initializing accelerate done in {(end - start) / 1e6:.2f}ms")
143
+
144
+ # create criterion
145
+ with self.accelerator.main_process_first():
146
+ self.logger.info("Building criterion...")
147
+ start = time.monotonic_ns()
148
+ self.criterion = self._build_criterion()
149
+ end = time.monotonic_ns()
150
+ self.logger.info(f"Building criterion done in {(end - start) / 1e6:.2f}ms")
151
+
152
+ # Resume or Finetune
153
+ with self.accelerator.main_process_first():
154
+ if args.resume:
155
+ ## Automatically resume according to the current experimental name
156
+ self.logger.info("Resuming from {}...".format(self.checkpoint_dir))
157
+ start = time.monotonic_ns()
158
+ ckpt_path = self.__load_model(
159
+ checkpoint_dir=self.checkpoint_dir, resume_type=args.resume_type
160
+ )
161
+ end = time.monotonic_ns()
162
+ self.logger.info(
163
+ f"Resuming from checkpoint done in {(end - start) / 1e6:.2f}ms"
164
+ )
165
+ self.checkpoints_path = json.load(
166
+ open(os.path.join(ckpt_path, "ckpts.json"), "r")
167
+ )
168
+ elif args.resume_from_ckpt_path and args.resume_from_ckpt_path != "":
169
+ ## Resume from the given checkpoint path
170
+ if not os.path.exists(args.resume_from_ckpt_path):
171
+ raise ValueError(
172
+ "[Error] The resumed checkpoint path {} don't exist.".format(
173
+ args.resume_from_ckpt_path
174
+ )
175
+ )
176
+
177
+ self.logger.info(
178
+ "Resuming from {}...".format(args.resume_from_ckpt_path)
179
+ )
180
+ start = time.monotonic_ns()
181
+ ckpt_path = self.__load_model(
182
+ checkpoint_path=args.resume_from_ckpt_path,
183
+ resume_type=args.resume_type,
184
+ )
185
+ end = time.monotonic_ns()
186
+ self.logger.info(
187
+ f"Resuming from checkpoint done in {(end - start) / 1e6:.2f}ms"
188
+ )
189
+
190
+ # save config file path
191
+ self.config_save_path = os.path.join(self.exp_dir, "args.json")
192
+
193
+ ### Following are abstract methods that should be implemented in child classes ###
194
+ @abstractmethod
195
+ def _build_dataset(self):
196
+ r"""Build dataset for model training/validating/evaluating."""
197
+ pass
198
+
199
+ @staticmethod
200
+ @abstractmethod
201
+ def _build_criterion():
202
+ r"""Build criterion function for model loss calculation."""
203
+ pass
204
+
205
+ @abstractmethod
206
+ def _build_model(self):
207
+ r"""Build model for training/validating/evaluating."""
208
+ pass
209
+
210
+ @abstractmethod
211
+ def _forward_step(self, batch):
212
+ r"""One forward step of the neural network. This abstract method is trying to
213
+ unify ``_train_step`` and ``_valid_step`` and avoid redundant implementation.
214
+ However, for special case that using different forward step pattern for
215
+ training and validating, you could just override this method with ``pass`` and
216
+ implement ``_train_step`` and ``_valid_step`` separately.
217
+ """
218
+ pass
219
+
220
+ @abstractmethod
221
+ def _save_auxiliary_states(self):
222
+ r"""To save some auxiliary states when saving model's ckpt"""
223
+ pass
224
+
225
+ ### Abstract methods end ###
226
+
227
+ ### THIS IS MAIN ENTRY ###
228
+ def train_loop(self):
229
+ r"""Training loop. The public entry of training process."""
230
+ # Wait everyone to prepare before we move on
231
+ self.accelerator.wait_for_everyone()
232
+ # dump config file
233
+ if self.accelerator.is_main_process:
234
+ self.__dump_cfg(self.config_save_path)
235
+ self.model.train()
236
+ self.optimizer.zero_grad()
237
+ # Wait to ensure good to go
238
+ self.accelerator.wait_for_everyone()
239
+ while self.epoch < self.max_epoch:
240
+ self.logger.info("\n")
241
+ self.logger.info("-" * 32)
242
+ self.logger.info("Epoch {}: ".format(self.epoch))
243
+
244
+ ### TODO: change the return values of _train_epoch() to a loss dict, or (total_loss, loss_dict)
245
+ ### It's inconvenient for the model with multiple losses
246
+ # Do training & validating epoch
247
+ train_loss = self._train_epoch()
248
+ self.logger.info(" |- Train/Loss: {:.6f}".format(train_loss))
249
+ valid_loss = self._valid_epoch()
250
+ self.logger.info(" |- Valid/Loss: {:.6f}".format(valid_loss))
251
+ self.accelerator.log(
252
+ {"Epoch/Train Loss": train_loss, "Epoch/Valid Loss": valid_loss},
253
+ step=self.epoch,
254
+ )
255
+
256
+ self.accelerator.wait_for_everyone()
257
+ # TODO: what is scheduler?
258
+ self.scheduler.step(valid_loss) # FIXME: use epoch track correct?
259
+
260
+ # Check if hit save_checkpoint_stride and run_eval
261
+ run_eval = False
262
+ if self.accelerator.is_main_process:
263
+ save_checkpoint = False
264
+ hit_dix = []
265
+ for i, num in enumerate(self.save_checkpoint_stride):
266
+ if self.epoch % num == 0:
267
+ save_checkpoint = True
268
+ hit_dix.append(i)
269
+ run_eval |= self.run_eval[i]
270
+
271
+ self.accelerator.wait_for_everyone()
272
+ if self.accelerator.is_main_process and save_checkpoint:
273
+ path = os.path.join(
274
+ self.checkpoint_dir,
275
+ "epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
276
+ self.epoch, self.step, train_loss
277
+ ),
278
+ )
279
+ self.tmp_checkpoint_save_path = path
280
+ self.accelerator.save_state(path)
281
+ print(f"save checkpoint in {path}")
282
+ json.dump(
283
+ self.checkpoints_path,
284
+ open(os.path.join(path, "ckpts.json"), "w"),
285
+ ensure_ascii=False,
286
+ indent=4,
287
+ )
288
+ self._save_auxiliary_states()
289
+
290
+ # Remove old checkpoints
291
+ to_remove = []
292
+ for idx in hit_dix:
293
+ self.checkpoints_path[idx].append(path)
294
+ while len(self.checkpoints_path[idx]) > self.keep_last[idx]:
295
+ to_remove.append((idx, self.checkpoints_path[idx].pop(0)))
296
+
297
+ # Search conflicts
298
+ total = set()
299
+ for i in self.checkpoints_path:
300
+ total |= set(i)
301
+ do_remove = set()
302
+ for idx, path in to_remove[::-1]:
303
+ if path in total:
304
+ self.checkpoints_path[idx].insert(0, path)
305
+ else:
306
+ do_remove.add(path)
307
+
308
+ # Remove old checkpoints
309
+ for path in do_remove:
310
+ shutil.rmtree(path, ignore_errors=True)
311
+ self.logger.debug(f"Remove old checkpoint: {path}")
312
+
313
+ self.accelerator.wait_for_everyone()
314
+ if run_eval:
315
+ # TODO: run evaluation
316
+ pass
317
+
318
+ # Update info for each epoch
319
+ self.epoch += 1
320
+
321
+ # Finish training and save final checkpoint
322
+ self.accelerator.wait_for_everyone()
323
+ if self.accelerator.is_main_process:
324
+ self.accelerator.save_state(
325
+ os.path.join(
326
+ self.checkpoint_dir,
327
+ "final_epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
328
+ self.epoch, self.step, valid_loss
329
+ ),
330
+ )
331
+ )
332
+ self._save_auxiliary_states()
333
+
334
+ self.accelerator.end_training()
335
+
336
+ ### Following are methods that can be used directly in child classes ###
337
+ def _train_epoch(self):
338
+ r"""Training epoch. Should return average loss of a batch (sample) over
339
+ one epoch. See ``train_loop`` for usage.
340
+ """
341
+ self.model.train()
342
+ epoch_sum_loss: float = 0.0
343
+ epoch_step: int = 0
344
+ for batch in tqdm(
345
+ self.train_dataloader,
346
+ desc=f"Training Epoch {self.epoch}",
347
+ unit="batch",
348
+ colour="GREEN",
349
+ leave=False,
350
+ dynamic_ncols=True,
351
+ smoothing=0.04,
352
+ disable=not self.accelerator.is_main_process,
353
+ ):
354
+ # Do training step and BP
355
+ with self.accelerator.accumulate(self.model):
356
+ loss = self._train_step(batch)
357
+ self.accelerator.backward(loss)
358
+ self.optimizer.step()
359
+ self.optimizer.zero_grad()
360
+ self.batch_count += 1
361
+
362
+ # Update info for each step
363
+ # TODO: step means BP counts or batch counts?
364
+ if self.batch_count % self.cfg.train.gradient_accumulation_step == 0:
365
+ epoch_sum_loss += loss
366
+ self.accelerator.log(
367
+ {
368
+ "Step/Train Loss": loss,
369
+ "Step/Learning Rate": self.optimizer.param_groups[0]["lr"],
370
+ },
371
+ step=self.step,
372
+ )
373
+ self.step += 1
374
+ epoch_step += 1
375
+
376
+ self.accelerator.wait_for_everyone()
377
+ return (
378
+ epoch_sum_loss
379
+ / len(self.train_dataloader)
380
+ * self.cfg.train.gradient_accumulation_step
381
+ )
382
+
383
+ @torch.inference_mode()
384
+ def _valid_epoch(self):
385
+ r"""Testing epoch. Should return average loss of a batch (sample) over
386
+ one epoch. See ``train_loop`` for usage.
387
+ """
388
+ self.model.eval()
389
+ epoch_sum_loss = 0.0
390
+ for batch in tqdm(
391
+ self.valid_dataloader,
392
+ desc=f"Validating Epoch {self.epoch}",
393
+ unit="batch",
394
+ colour="GREEN",
395
+ leave=False,
396
+ dynamic_ncols=True,
397
+ smoothing=0.04,
398
+ disable=not self.accelerator.is_main_process,
399
+ ):
400
+ batch_loss = self._valid_step(batch)
401
+ epoch_sum_loss += batch_loss.item()
402
+
403
+ self.accelerator.wait_for_everyone()
404
+ return epoch_sum_loss / len(self.valid_dataloader)
405
+
406
+ def _train_step(self, batch):
407
+ r"""Training forward step. Should return average loss of a sample over
408
+ one batch. Provoke ``_forward_step`` is recommended except for special case.
409
+ See ``_train_epoch`` for usage.
410
+ """
411
+ return self._forward_step(batch)
412
+
413
+ @torch.inference_mode()
414
+ def _valid_step(self, batch):
415
+ r"""Testing forward step. Should return average loss of a sample over
416
+ one batch. Provoke ``_forward_step`` is recommended except for special case.
417
+ See ``_test_epoch`` for usage.
418
+ """
419
+ return self._forward_step(batch)
420
+
421
+ def __load_model(
422
+ self,
423
+ checkpoint_dir: str = None,
424
+ checkpoint_path: str = None,
425
+ resume_type: str = "",
426
+ ):
427
+ r"""Load model from checkpoint. If checkpoint_path is None, it will
428
+ load the latest checkpoint in checkpoint_dir. If checkpoint_path is not
429
+ None, it will load the checkpoint specified by checkpoint_path. **Only use this
430
+ method after** ``accelerator.prepare()``.
431
+ """
432
+ if checkpoint_path is None:
433
+ ls = [str(i) for i in Path(checkpoint_dir).glob("*")]
434
+ ls.sort(key=lambda x: int(x.split("_")[-3].split("-")[-1]), reverse=True)
435
+ checkpoint_path = ls[0]
436
+ self.logger.info("Resume from {}...".format(checkpoint_path))
437
+
438
+ if resume_type in ["resume", ""]:
439
+ # Load all the things, including model weights, optimizer, scheduler, and random states.
440
+ self.accelerator.load_state(input_dir=checkpoint_path)
441
+
442
+ # set epoch and step
443
+ self.epoch = int(checkpoint_path.split("_")[-3].split("-")[-1]) + 1
444
+ self.step = int(checkpoint_path.split("_")[-2].split("-")[-1]) + 1
445
+
446
+ elif resume_type == "finetune":
447
+ # Load only the model weights
448
+ accelerate.load_checkpoint_and_dispatch(
449
+ self.accelerator.unwrap_model(self.model),
450
+ os.path.join(checkpoint_path, "pytorch_model.bin"),
451
+ )
452
+ self.logger.info("Load model weights for finetune...")
453
+
454
+ else:
455
+ raise ValueError("Resume_type must be `resume` or `finetune`.")
456
+
457
+ return checkpoint_path
458
+
459
+ # TODO: LEGACY CODE
460
+ def _build_dataloader(self):
461
+ Dataset, Collator = self._build_dataset()
462
+
463
+ # build dataset instance for each dataset and combine them by ConcatDataset
464
+ datasets_list = []
465
+ for dataset in self.cfg.dataset:
466
+ subdataset = Dataset(self.cfg, dataset, is_valid=False)
467
+ datasets_list.append(subdataset)
468
+ train_dataset = ConcatDataset(datasets_list)
469
+ train_collate = Collator(self.cfg)
470
+ _, batch_sampler = build_samplers(train_dataset, self.cfg, self.logger, "train")
471
+ self.logger.debug(f"train batch_sampler: {list(batch_sampler)}")
472
+ self.logger.debug(f"length: {train_dataset.cumulative_sizes}")
473
+ # TODO: use config instead of (sampler, shuffle, drop_last, batch_size)
474
+ train_loader = DataLoader(
475
+ train_dataset,
476
+ collate_fn=train_collate,
477
+ batch_sampler=batch_sampler,
478
+ num_workers=self.cfg.train.dataloader.num_worker,
479
+ pin_memory=self.cfg.train.dataloader.pin_memory,
480
+ )
481
+
482
+ # Build valid dataloader
483
+ datasets_list = []
484
+ for dataset in self.cfg.dataset:
485
+ subdataset = Dataset(self.cfg, dataset, is_valid=True)
486
+ datasets_list.append(subdataset)
487
+ valid_dataset = ConcatDataset(datasets_list)
488
+ valid_collate = Collator(self.cfg)
489
+ _, batch_sampler = build_samplers(valid_dataset, self.cfg, self.logger, "valid")
490
+ self.logger.debug(f"valid batch_sampler: {list(batch_sampler)}")
491
+ self.logger.debug(f"length: {valid_dataset.cumulative_sizes}")
492
+ valid_loader = DataLoader(
493
+ valid_dataset,
494
+ collate_fn=valid_collate,
495
+ batch_sampler=batch_sampler,
496
+ num_workers=self.cfg.train.dataloader.num_worker,
497
+ pin_memory=self.cfg.train.dataloader.pin_memory,
498
+ )
499
+ return train_loader, valid_loader
500
+
501
+ @staticmethod
502
+ def _set_random_seed(seed):
503
+ r"""Set random seed for all possible random modules."""
504
+ random.seed(seed)
505
+ np.random.seed(seed)
506
+ torch.random.manual_seed(seed)
507
+
508
+ def _check_nan(self, loss, y_pred, y_gt):
509
+ if torch.any(torch.isnan(loss)):
510
+ self.logger.fatal("Fatal Error: Training is down since loss has Nan!")
511
+ self.logger.error("loss = {:.6f}".format(loss.item()), in_order=True)
512
+ if torch.any(torch.isnan(y_pred)):
513
+ self.logger.error(
514
+ f"y_pred has Nan: {torch.any(torch.isnan(y_pred))}", in_order=True
515
+ )
516
+ else:
517
+ self.logger.debug(
518
+ f"y_pred has Nan: {torch.any(torch.isnan(y_pred))}", in_order=True
519
+ )
520
+ if torch.any(torch.isnan(y_gt)):
521
+ self.logger.error(
522
+ f"y_gt has Nan: {torch.any(torch.isnan(y_gt))}", in_order=True
523
+ )
524
+ else:
525
+ self.logger.debug(
526
+ f"y_gt has nan: {torch.any(torch.isnan(y_gt))}", in_order=True
527
+ )
528
+ if torch.any(torch.isnan(y_pred)):
529
+ self.logger.error(f"y_pred: {y_pred}", in_order=True)
530
+ else:
531
+ self.logger.debug(f"y_pred: {y_pred}", in_order=True)
532
+ if torch.any(torch.isnan(y_gt)):
533
+ self.logger.error(f"y_gt: {y_gt}", in_order=True)
534
+ else:
535
+ self.logger.debug(f"y_gt: {y_gt}", in_order=True)
536
+
537
+ # TODO: still OK to save tracking?
538
+ self.accelerator.end_training()
539
+ raise RuntimeError("Loss has Nan! See log for more info.")
540
+
541
+ ### Protected methods end ###
542
+
543
+ ## Following are private methods ##
544
+ ## !!! These are inconvenient for GAN-based model training. It'd be better to move these to svc_trainer.py if needed.
545
+ def __build_optimizer(self):
546
+ r"""Build optimizer for model."""
547
+ # Make case-insensitive matching
548
+ if self.cfg.train.optimizer.lower() == "adadelta":
549
+ optimizer = torch.optim.Adadelta(
550
+ self.model.parameters(), **self.cfg.train.adadelta
551
+ )
552
+ self.logger.info("Using Adadelta optimizer.")
553
+ elif self.cfg.train.optimizer.lower() == "adagrad":
554
+ optimizer = torch.optim.Adagrad(
555
+ self.model.parameters(), **self.cfg.train.adagrad
556
+ )
557
+ self.logger.info("Using Adagrad optimizer.")
558
+ elif self.cfg.train.optimizer.lower() == "adam":
559
+ optimizer = torch.optim.Adam(self.model.parameters(), **self.cfg.train.adam)
560
+ self.logger.info("Using Adam optimizer.")
561
+ elif self.cfg.train.optimizer.lower() == "adamw":
562
+ optimizer = torch.optim.AdamW(
563
+ self.model.parameters(), **self.cfg.train.adamw
564
+ )
565
+ elif self.cfg.train.optimizer.lower() == "sparseadam":
566
+ optimizer = torch.optim.SparseAdam(
567
+ self.model.parameters(), **self.cfg.train.sparseadam
568
+ )
569
+ elif self.cfg.train.optimizer.lower() == "adamax":
570
+ optimizer = torch.optim.Adamax(
571
+ self.model.parameters(), **self.cfg.train.adamax
572
+ )
573
+ elif self.cfg.train.optimizer.lower() == "asgd":
574
+ optimizer = torch.optim.ASGD(self.model.parameters(), **self.cfg.train.asgd)
575
+ elif self.cfg.train.optimizer.lower() == "lbfgs":
576
+ optimizer = torch.optim.LBFGS(
577
+ self.model.parameters(), **self.cfg.train.lbfgs
578
+ )
579
+ elif self.cfg.train.optimizer.lower() == "nadam":
580
+ optimizer = torch.optim.NAdam(
581
+ self.model.parameters(), **self.cfg.train.nadam
582
+ )
583
+ elif self.cfg.train.optimizer.lower() == "radam":
584
+ optimizer = torch.optim.RAdam(
585
+ self.model.parameters(), **self.cfg.train.radam
586
+ )
587
+ elif self.cfg.train.optimizer.lower() == "rmsprop":
588
+ optimizer = torch.optim.RMSprop(
589
+ self.model.parameters(), **self.cfg.train.rmsprop
590
+ )
591
+ elif self.cfg.train.optimizer.lower() == "rprop":
592
+ optimizer = torch.optim.Rprop(
593
+ self.model.parameters(), **self.cfg.train.rprop
594
+ )
595
+ elif self.cfg.train.optimizer.lower() == "sgd":
596
+ optimizer = torch.optim.SGD(self.model.parameters(), **self.cfg.train.sgd)
597
+ else:
598
+ raise NotImplementedError(
599
+ f"Optimizer {self.cfg.train.optimizer} not supported yet!"
600
+ )
601
+ return optimizer
602
+
603
+ def __build_scheduler(self):
604
+ r"""Build scheduler for optimizer."""
605
+ # Make case-insensitive matching
606
+ if self.cfg.train.scheduler.lower() == "lambdalr":
607
+ scheduler = torch.optim.lr_scheduler.LambdaLR(
608
+ self.optimizer, **self.cfg.train.lambdalr
609
+ )
610
+ elif self.cfg.train.scheduler.lower() == "multiplicativelr":
611
+ scheduler = torch.optim.lr_scheduler.MultiplicativeLR(
612
+ self.optimizer, **self.cfg.train.multiplicativelr
613
+ )
614
+ elif self.cfg.train.scheduler.lower() == "steplr":
615
+ scheduler = torch.optim.lr_scheduler.StepLR(
616
+ self.optimizer, **self.cfg.train.steplr
617
+ )
618
+ elif self.cfg.train.scheduler.lower() == "multisteplr":
619
+ scheduler = torch.optim.lr_scheduler.MultiStepLR(
620
+ self.optimizer, **self.cfg.train.multisteplr
621
+ )
622
+ elif self.cfg.train.scheduler.lower() == "constantlr":
623
+ scheduler = torch.optim.lr_scheduler.ConstantLR(
624
+ self.optimizer, **self.cfg.train.constantlr
625
+ )
626
+ elif self.cfg.train.scheduler.lower() == "linearlr":
627
+ scheduler = torch.optim.lr_scheduler.LinearLR(
628
+ self.optimizer, **self.cfg.train.linearlr
629
+ )
630
+ elif self.cfg.train.scheduler.lower() == "exponentiallr":
631
+ scheduler = torch.optim.lr_scheduler.ExponentialLR(
632
+ self.optimizer, **self.cfg.train.exponentiallr
633
+ )
634
+ elif self.cfg.train.scheduler.lower() == "polynomiallr":
635
+ scheduler = torch.optim.lr_scheduler.PolynomialLR(
636
+ self.optimizer, **self.cfg.train.polynomiallr
637
+ )
638
+ elif self.cfg.train.scheduler.lower() == "cosineannealinglr":
639
+ scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
640
+ self.optimizer, **self.cfg.train.cosineannealinglr
641
+ )
642
+ elif self.cfg.train.scheduler.lower() == "sequentiallr":
643
+ scheduler = torch.optim.lr_scheduler.SequentialLR(
644
+ self.optimizer, **self.cfg.train.sequentiallr
645
+ )
646
+ elif self.cfg.train.scheduler.lower() == "reducelronplateau":
647
+ scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
648
+ self.optimizer, **self.cfg.train.reducelronplateau
649
+ )
650
+ elif self.cfg.train.scheduler.lower() == "cycliclr":
651
+ scheduler = torch.optim.lr_scheduler.CyclicLR(
652
+ self.optimizer, **self.cfg.train.cycliclr
653
+ )
654
+ elif self.cfg.train.scheduler.lower() == "onecyclelr":
655
+ scheduler = torch.optim.lr_scheduler.OneCycleLR(
656
+ self.optimizer, **self.cfg.train.onecyclelr
657
+ )
658
+ elif self.cfg.train.scheduler.lower() == "cosineannearingwarmrestarts":
659
+ scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
660
+ self.optimizer, **self.cfg.train.cosineannearingwarmrestarts
661
+ )
662
+ elif self.cfg.train.scheduler.lower() == "noamlr":
663
+ scheduler = NoamLR(self.optimizer, **self.cfg.train.lr_scheduler)
664
+ else:
665
+ raise NotImplementedError(
666
+ f"Scheduler {self.cfg.train.scheduler} not supported yet!"
667
+ )
668
+ return scheduler
669
+
670
+ def _init_accelerator(self):
671
+ self.exp_dir = os.path.join(
672
+ os.path.abspath(self.cfg.log_dir), self.args.exp_name
673
+ )
674
+ project_config = ProjectConfiguration(
675
+ project_dir=self.exp_dir,
676
+ logging_dir=os.path.join(self.exp_dir, "log"),
677
+ )
678
+ self.accelerator = accelerate.Accelerator(
679
+ gradient_accumulation_steps=self.cfg.train.gradient_accumulation_step,
680
+ log_with=self.cfg.train.tracker,
681
+ project_config=project_config,
682
+ )
683
+ if self.accelerator.is_main_process:
684
+ os.makedirs(project_config.project_dir, exist_ok=True)
685
+ os.makedirs(project_config.logging_dir, exist_ok=True)
686
+ with self.accelerator.main_process_first():
687
+ self.accelerator.init_trackers(self.args.exp_name)
688
+
689
+ def __check_basic_configs(self):
690
+ if self.cfg.train.gradient_accumulation_step <= 0:
691
+ self.logger.fatal("Invalid gradient_accumulation_step value!")
692
+ self.logger.error(
693
+ f"Invalid gradient_accumulation_step value: {self.cfg.train.gradient_accumulation_step}. It should be positive."
694
+ )
695
+ self.accelerator.end_training()
696
+ raise ValueError(
697
+ f"Invalid gradient_accumulation_step value: {self.cfg.train.gradient_accumulation_step}. It should be positive."
698
+ )
699
+ # TODO: check other values
700
+
701
+ @staticmethod
702
+ def __count_parameters(model):
703
+ model_param = 0.0
704
+ if isinstance(model, dict):
705
+ for key, value in model.items():
706
+ model_param += sum(p.numel() for p in model[key].parameters())
707
+ else:
708
+ model_param = sum(p.numel() for p in model.parameters())
709
+ return model_param
710
+
711
+ def __dump_cfg(self, path):
712
+ os.makedirs(os.path.dirname(path), exist_ok=True)
713
+ json5.dump(
714
+ self.cfg,
715
+ open(path, "w"),
716
+ indent=4,
717
+ sort_keys=True,
718
+ ensure_ascii=False,
719
+ quote_keys=True,
720
+ )
721
+
722
+ ### Private methods end ###
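For readers skimming the trainer above: `__build_optimizer` and `__build_scheduler` simply lower-case the `cfg.train.optimizer` / `cfg.train.scheduler` strings and unpack the matching sub-config as keyword arguments to the corresponding `torch.optim` class. A minimal sketch of that dispatch pattern, with illustrative (not shipped) config values:

```python
import torch
from types import SimpleNamespace

# Illustrative config fragment (assumed values): the "adamw" / "reducelronplateau"
# entries hold the kwargs forwarded verbatim to the torch classes.
train_cfg = SimpleNamespace(
    optimizer="AdamW",
    adamw={"lr": 4.0e-4, "betas": (0.9, 0.98)},
    scheduler="ReduceLROnPlateau",
    reducelronplateau={"factor": 0.8, "patience": 10},
)

model = torch.nn.Linear(8, 8)  # stand-in model

# Case-insensitive dispatch, mirroring __build_optimizer / __build_scheduler above.
assert train_cfg.optimizer.lower() == "adamw"
optimizer = torch.optim.AdamW(model.parameters(), **train_cfg.adamw)

assert train_cfg.scheduler.lower() == "reducelronplateau"
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, **train_cfg.reducelronplateau)
```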
models/svc/__init__.py ADDED
File without changes
models/svc/base/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .svc_inference import SVCInference
7
+ from .svc_trainer import SVCTrainer
models/svc/base/svc_dataset.py ADDED
@@ -0,0 +1,425 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import random
7
+ import torch
8
+ from torch.nn.utils.rnn import pad_sequence
9
+ import json
10
+ import os
11
+ import numpy as np
12
+ from utils.data_utils import *
13
+ from processors.acoustic_extractor import cal_normalized_mel, load_mel_extrema
14
+ from processors.content_extractor import (
15
+ ContentvecExtractor,
16
+ WhisperExtractor,
17
+ WenetExtractor,
18
+ )
19
+ from models.base.base_dataset import (
20
+ BaseCollator,
21
+ BaseDataset,
22
+ )
23
+ from models.base.new_dataset import BaseTestDataset
24
+
25
+ EPS = 1.0e-12
26
+
27
+
28
+ class SVCDataset(BaseDataset):
29
+ def __init__(self, cfg, dataset, is_valid=False):
30
+ BaseDataset.__init__(self, cfg, dataset, is_valid=is_valid)
31
+
32
+ cfg = self.cfg
33
+
34
+ if cfg.model.condition_encoder.use_whisper:
35
+ self.whisper_aligner = WhisperExtractor(self.cfg)
36
+ self.utt2whisper_path = load_content_feature_path(
37
+ self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.whisper_dir
38
+ )
39
+
40
+ if cfg.model.condition_encoder.use_contentvec:
41
+ self.contentvec_aligner = ContentvecExtractor(self.cfg)
42
+ self.utt2contentVec_path = load_content_feature_path(
43
+ self.metadata,
44
+ cfg.preprocess.processed_dir,
45
+ cfg.preprocess.contentvec_dir,
46
+ )
47
+
48
+ if cfg.model.condition_encoder.use_mert:
49
+ self.utt2mert_path = load_content_feature_path(
50
+ self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.mert_dir
51
+ )
52
+ if cfg.model.condition_encoder.use_wenet:
53
+ self.wenet_aligner = WenetExtractor(self.cfg)
54
+ self.utt2wenet_path = load_content_feature_path(
55
+ self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.wenet_dir
56
+ )
57
+
58
+ def __getitem__(self, index):
59
+ single_feature = BaseDataset.__getitem__(self, index)
60
+
61
+ utt_info = self.metadata[index]
62
+ dataset = utt_info["Dataset"]
63
+ uid = utt_info["Uid"]
64
+ utt = "{}_{}".format(dataset, uid)
65
+
66
+ if self.cfg.model.condition_encoder.use_whisper:
67
+ assert "target_len" in single_feature.keys()
68
+ aligned_whisper_feat = self.whisper_aligner.offline_align(
69
+ np.load(self.utt2whisper_path[utt]), single_feature["target_len"]
70
+ )
71
+ single_feature["whisper_feat"] = aligned_whisper_feat
72
+
73
+ if self.cfg.model.condition_encoder.use_contentvec:
74
+ assert "target_len" in single_feature.keys()
75
+ aligned_contentvec = self.contentvec_aligner.offline_align(
76
+ np.load(self.utt2contentVec_path[utt]), single_feature["target_len"]
77
+ )
78
+ single_feature["contentvec_feat"] = aligned_contentvec
79
+
80
+ if self.cfg.model.condition_encoder.use_mert:
81
+ assert "target_len" in single_feature.keys()
82
+ aligned_mert_feat = align_content_feature_length(
83
+ np.load(self.utt2mert_path[utt]),
84
+ single_feature["target_len"],
85
+ source_hop=self.cfg.preprocess.mert_hop_size,
86
+ )
87
+ single_feature["mert_feat"] = aligned_mert_feat
88
+
89
+ if self.cfg.model.condition_encoder.use_wenet:
90
+ assert "target_len" in single_feature.keys()
91
+ aligned_wenet_feat = self.wenet_aligner.offline_align(
92
+ np.load(self.utt2wenet_path[utt]), single_feature["target_len"]
93
+ )
94
+ single_feature["wenet_feat"] = aligned_wenet_feat
95
+
96
+ # print(single_feature.keys())
97
+ # for k, v in single_feature.items():
98
+ # if type(v) in [torch.Tensor, np.ndarray]:
99
+ # print(k, v.shape)
100
+ # else:
101
+ # print(k, v)
102
+ # exit()
103
+
104
+ return self.clip_if_too_long(single_feature)
105
+
106
+ def __len__(self):
107
+ return len(self.metadata)
108
+
109
+ def random_select(self, feature_seq_len, max_seq_len, ending_ts=2812):
110
+ """
111
+ ending_ts: to avoid invalid whisper features for audios longer than 30s
112
+ 2812 = 30 * 24000 // 256 (30 s at a 24 kHz sampling rate with a 256-sample hop)
113
+ """
114
+ ts = max(feature_seq_len - max_seq_len, 0)
115
+ ts = min(ts, ending_ts - max_seq_len)
116
+
117
+ start = random.randint(0, ts)
118
+ end = start + max_seq_len
119
+ return start, end
120
+
121
+ def clip_if_too_long(self, sample, max_seq_len=512):
122
+ """
123
+ sample :
124
+ {
125
+ 'spk_id': (1,),
126
+ 'target_len': int
127
+ 'mel': (seq_len, dim),
128
+ 'frame_pitch': (seq_len,)
129
+ 'frame_energy': (seq_len,)
130
+ 'contentvec_feat': (seq_len, dim)
131
+ }
132
+ """
133
+ if sample["target_len"] <= max_seq_len:
134
+ return sample
135
+
136
+ start, end = self.random_select(sample["target_len"], max_seq_len)
137
+ sample["target_len"] = end - start
138
+
139
+ for k in sample.keys():
140
+ if k not in ["spk_id", "target_len"]:
141
+ sample[k] = sample[k][start:end]
142
+
143
+ return sample
144
+
145
+
146
+ class SVCCollator(BaseCollator):
147
+ """Zero-pads model inputs and targets based on number of frames per step"""
148
+
149
+ def __init__(self, cfg):
150
+ BaseCollator.__init__(self, cfg)
151
+
152
+ def __call__(self, batch):
153
+ parsed_batch_features = BaseCollator.__call__(self, batch)
154
+ return parsed_batch_features
155
+
156
+
157
+ class SVCTestDataset(BaseTestDataset):
158
+ def __init__(self, args, cfg, infer_type):
159
+ BaseTestDataset.__init__(self, args, cfg, infer_type)
160
+ self.metadata = self.get_metadata()
161
+
162
+ target_singer = args.target_singer
163
+ self.cfg = cfg
164
+ self.trans_key = args.trans_key
165
+ assert type(target_singer) == str
166
+
167
+ self.target_singer = target_singer.split("_")[-1]
168
+ self.target_dataset = target_singer.replace(
169
+ "_{}".format(self.target_singer), ""
170
+ )
171
+
172
+ self.target_mel_extrema = load_mel_extrema(cfg.preprocess, self.target_dataset)
173
+ self.target_mel_extrema = torch.as_tensor(
174
+ self.target_mel_extrema[0]
175
+ ), torch.as_tensor(self.target_mel_extrema[1])
176
+
177
+ ######### Load source acoustic features #########
178
+ if cfg.preprocess.use_spkid:
179
+ spk2id_path = os.path.join(args.acoustics_dir, cfg.preprocess.spk2id)
180
+ # utt2sp_path = os.path.join(self.data_root, cfg.preprocess.utt2spk)
181
+
182
+ with open(spk2id_path, "r") as f:
183
+ self.spk2id = json.load(f)
184
+ # print("self.spk2id", self.spk2id)
185
+
186
+ if cfg.preprocess.use_uv:
187
+ self.utt2uv_path = {
188
+ f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join(
189
+ cfg.preprocess.processed_dir,
190
+ utt_info["Dataset"],
191
+ cfg.preprocess.uv_dir,
192
+ utt_info["Uid"] + ".npy",
193
+ )
194
+ for utt_info in self.metadata
195
+ }
196
+
197
+ if cfg.preprocess.use_frame_pitch:
198
+ self.utt2frame_pitch_path = {
199
+ f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join(
200
+ cfg.preprocess.processed_dir,
201
+ utt_info["Dataset"],
202
+ cfg.preprocess.pitch_dir,
203
+ utt_info["Uid"] + ".npy",
204
+ )
205
+ for utt_info in self.metadata
206
+ }
207
+
208
+ # Target F0 median
209
+ target_f0_statistics_path = os.path.join(
210
+ cfg.preprocess.processed_dir,
211
+ self.target_dataset,
212
+ cfg.preprocess.pitch_dir,
213
+ "statistics.json",
214
+ )
215
+ self.target_pitch_median = json.load(open(target_f0_statistics_path, "r"))[
216
+ f"{self.target_dataset}_{self.target_singer}"
217
+ ]["voiced_positions"]["median"]
218
+
219
+ # Source F0 median (if infer from file)
220
+ if infer_type == "from_file":
221
+ source_audio_name = cfg.inference.source_audio_name
222
+ source_f0_statistics_path = os.path.join(
223
+ cfg.preprocess.processed_dir,
224
+ source_audio_name,
225
+ cfg.preprocess.pitch_dir,
226
+ "statistics.json",
227
+ )
228
+ self.source_pitch_median = json.load(
229
+ open(source_f0_statistics_path, "r")
230
+ )[f"{source_audio_name}_{source_audio_name}"]["voiced_positions"][
231
+ "median"
232
+ ]
233
+ else:
234
+ self.source_pitch_median = None
235
+
236
+ if cfg.preprocess.use_frame_energy:
237
+ self.utt2frame_energy_path = {
238
+ f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join(
239
+ cfg.preprocess.processed_dir,
240
+ utt_info["Dataset"],
241
+ cfg.preprocess.energy_dir,
242
+ utt_info["Uid"] + ".npy",
243
+ )
244
+ for utt_info in self.metadata
245
+ }
246
+
247
+ if cfg.preprocess.use_mel:
248
+ self.utt2mel_path = {
249
+ f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join(
250
+ cfg.preprocess.processed_dir,
251
+ utt_info["Dataset"],
252
+ cfg.preprocess.mel_dir,
253
+ utt_info["Uid"] + ".npy",
254
+ )
255
+ for utt_info in self.metadata
256
+ }
257
+
258
+ ######### Load source content features' path #########
259
+ if cfg.model.condition_encoder.use_whisper:
260
+ self.whisper_aligner = WhisperExtractor(cfg)
261
+ self.utt2whisper_path = load_content_feature_path(
262
+ self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.whisper_dir
263
+ )
264
+
265
+ if cfg.model.condition_encoder.use_contentvec:
266
+ self.contentvec_aligner = ContentvecExtractor(cfg)
267
+ self.utt2contentVec_path = load_content_feature_path(
268
+ self.metadata,
269
+ cfg.preprocess.processed_dir,
270
+ cfg.preprocess.contentvec_dir,
271
+ )
272
+
273
+ if cfg.model.condition_encoder.use_mert:
274
+ self.utt2mert_path = load_content_feature_path(
275
+ self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.mert_dir
276
+ )
277
+ if cfg.model.condition_encoder.use_wenet:
278
+ self.wenet_aligner = WenetExtractor(cfg)
279
+ self.utt2wenet_path = load_content_feature_path(
280
+ self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.wenet_dir
281
+ )
282
+
283
+ def __getitem__(self, index):
284
+ single_feature = {}
285
+
286
+ utt_info = self.metadata[index]
287
+ dataset = utt_info["Dataset"]
288
+ uid = utt_info["Uid"]
289
+ utt = "{}_{}".format(dataset, uid)
290
+
291
+ source_dataset = self.metadata[index]["Dataset"]
292
+
293
+ if self.cfg.preprocess.use_spkid:
294
+ single_feature["spk_id"] = np.array(
295
+ [self.spk2id[f"{self.target_dataset}_{self.target_singer}"]],
296
+ dtype=np.int32,
297
+ )
298
+
299
+ ######### Get Acoustic Features Item #########
300
+ if self.cfg.preprocess.use_mel:
301
+ mel = np.load(self.utt2mel_path[utt])
302
+ assert mel.shape[0] == self.cfg.preprocess.n_mel # [n_mels, T]
303
+ if self.cfg.preprocess.use_min_max_norm_mel:
304
+ # mel norm
305
+ mel = cal_normalized_mel(mel, source_dataset, self.cfg.preprocess)
306
+
307
+ if "target_len" not in single_feature.keys():
308
+ single_feature["target_len"] = mel.shape[1]
309
+ single_feature["mel"] = mel.T # [T, n_mels]
310
+
311
+ if self.cfg.preprocess.use_frame_pitch:
312
+ frame_pitch_path = self.utt2frame_pitch_path[utt]
313
+ frame_pitch = np.load(frame_pitch_path)
314
+
315
+ if self.trans_key:
316
+ try:
317
+ self.trans_key = int(self.trans_key)
318
+ except (TypeError, ValueError):
319
+ pass
320
+ if type(self.trans_key) == int:
321
+ frame_pitch = transpose_key(frame_pitch, self.trans_key)
322
+ elif self.trans_key:
323
+ assert self.target_singer
324
+
325
+ frame_pitch = pitch_shift_to_target(
326
+ frame_pitch, self.target_pitch_median, self.source_pitch_median
327
+ )
328
+
329
+ if "target_len" not in single_feature.keys():
330
+ single_feature["target_len"] = len(frame_pitch)
331
+ aligned_frame_pitch = align_length(
332
+ frame_pitch, single_feature["target_len"]
333
+ )
334
+ single_feature["frame_pitch"] = aligned_frame_pitch
335
+
336
+ if self.cfg.preprocess.use_uv:
337
+ frame_uv_path = self.utt2uv_path[utt]
338
+ frame_uv = np.load(frame_uv_path)
339
+ aligned_frame_uv = align_length(frame_uv, single_feature["target_len"])
340
+ aligned_frame_uv = [
341
+ 0 if uv else 1 for uv in aligned_frame_uv
342
+ ]
343
+ aligned_frame_uv = np.array(aligned_frame_uv)
344
+ single_feature["frame_uv"] = aligned_frame_uv
345
+
346
+ if self.cfg.preprocess.use_frame_energy:
347
+ frame_energy_path = self.utt2frame_energy_path[utt]
348
+ frame_energy = np.load(frame_energy_path)
349
+ if "target_len" not in single_feature.keys():
350
+ single_feature["target_len"] = len(frame_energy)
351
+ aligned_frame_energy = align_length(
352
+ frame_energy, single_feature["target_len"]
353
+ )
354
+ single_feature["frame_energy"] = aligned_frame_energy
355
+
356
+ ######### Get Content Features Item #########
357
+ if self.cfg.model.condition_encoder.use_whisper:
358
+ assert "target_len" in single_feature.keys()
359
+ aligned_whisper_feat = self.whisper_aligner.offline_align(
360
+ np.load(self.utt2whisper_path[utt]), single_feature["target_len"]
361
+ )
362
+ single_feature["whisper_feat"] = aligned_whisper_feat
363
+
364
+ if self.cfg.model.condition_encoder.use_contentvec:
365
+ assert "target_len" in single_feature.keys()
366
+ aligned_contentvec = self.contentvec_aligner.offline_align(
367
+ np.load(self.utt2contentVec_path[utt]), single_feature["target_len"]
368
+ )
369
+ single_feature["contentvec_feat"] = aligned_contentvec
370
+
371
+ if self.cfg.model.condition_encoder.use_mert:
372
+ assert "target_len" in single_feature.keys()
373
+ aligned_mert_feat = align_content_feature_length(
374
+ np.load(self.utt2mert_path[utt]),
375
+ single_feature["target_len"],
376
+ source_hop=self.cfg.preprocess.mert_hop_size,
377
+ )
378
+ single_feature["mert_feat"] = aligned_mert_feat
379
+
380
+ if self.cfg.model.condition_encoder.use_wenet:
381
+ assert "target_len" in single_feature.keys()
382
+ aligned_wenet_feat = self.wenet_aligner.offline_align(
383
+ np.load(self.utt2wenet_path[utt]), single_feature["target_len"]
384
+ )
385
+ single_feature["wenet_feat"] = aligned_wenet_feat
386
+
387
+ return single_feature
388
+
389
+ def __len__(self):
390
+ return len(self.metadata)
391
+
392
+
393
+ class SVCTestCollator:
394
+ """Zero-pads model inputs and targets based on number of frames per step"""
395
+
396
+ def __init__(self, cfg):
397
+ self.cfg = cfg
398
+
399
+ def __call__(self, batch):
400
+ packed_batch_features = dict()
401
+
402
+ # mel: [b, T, n_mels]
403
+ # frame_pitch, frame_energy: [1, T]
404
+ # target_len: [1]
405
+ # spk_id: [b, 1]
406
+ # mask: [b, T, 1]
407
+
408
+ for key in batch[0].keys():
409
+ if key == "target_len":
410
+ packed_batch_features["target_len"] = torch.LongTensor(
411
+ [b["target_len"] for b in batch]
412
+ )
413
+ masks = [
414
+ torch.ones((b["target_len"], 1), dtype=torch.long) for b in batch
415
+ ]
416
+ packed_batch_features["mask"] = pad_sequence(
417
+ masks, batch_first=True, padding_value=0
418
+ )
419
+ else:
420
+ values = [torch.from_numpy(b[key]) for b in batch]
421
+ packed_batch_features[key] = pad_sequence(
422
+ values, batch_first=True, padding_value=0
423
+ )
424
+
425
+ return packed_batch_features
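`SVCTestCollator` above pads each variable-length feature to the longest utterance in the batch and derives a frame-level mask from `target_len`. A self-contained sketch of that padding/masking step with toy tensors (shapes are illustrative assumptions):

```python
import torch
from torch.nn.utils.rnn import pad_sequence

# Two toy utterances with 4 and 2 frames, feature dim 3.
batch = [
    {"target_len": 4, "mel": torch.randn(4, 3).numpy()},
    {"target_len": 2, "mel": torch.randn(2, 3).numpy()},
]

target_len = torch.LongTensor([b["target_len"] for b in batch])             # tensor([4, 2])
masks = [torch.ones((b["target_len"], 1), dtype=torch.long) for b in batch]
mask = pad_sequence(masks, batch_first=True, padding_value=0)               # (2, 4, 1)
mel = pad_sequence(
    [torch.from_numpy(b["mel"]) for b in batch], batch_first=True, padding_value=0
)                                                                           # (2, 4, 3)
```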
models/svc/base/svc_inference.py ADDED
@@ -0,0 +1,15 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from models.base.new_inference import BaseInference
7
+ from models.svc.base.svc_dataset import SVCTestCollator, SVCTestDataset
8
+
9
+
10
+ class SVCInference(BaseInference):
11
+ def __init__(self, args=None, cfg=None, infer_type="from_dataset"):
12
+ BaseInference.__init__(self, args, cfg, infer_type)
13
+
14
+ def _build_test_dataset(self):
15
+ return SVCTestDataset, SVCTestCollator
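`SVCInference` only tells the shared inference pipeline which dataset/collator pair to instantiate. A minimal, self-contained sketch of that plug-in pattern (all names below are hypothetical stand-ins, not Amphion classes):

```python
from torch.utils.data import DataLoader, Dataset


class ToyTestDataset(Dataset):
    """Hypothetical stand-in for a test dataset such as SVCTestDataset."""

    def __init__(self, n=4):
        self.n = n

    def __getitem__(self, index):
        return {"uid": index}

    def __len__(self):
        return self.n


def toy_collator(batch):
    """Hypothetical stand-in for a test collator: just gathers uids."""
    return {"uid": [b["uid"] for b in batch]}


def build_test_dataloader(dataset_cls, collate_fn):
    # Mirrors how a base inference class could consume the (dataset, collator) pair.
    return DataLoader(dataset_cls(), batch_size=2, collate_fn=collate_fn)


loader = build_test_dataloader(ToyTestDataset, toy_collator)
for batch in loader:
    print(batch)  # {'uid': [0, 1]} then {'uid': [2, 3]}
```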
models/svc/base/svc_trainer.py ADDED
@@ -0,0 +1,111 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+
12
+ from models.base.new_trainer import BaseTrainer
13
+ from models.svc.base.svc_dataset import SVCCollator, SVCDataset
14
+
15
+
16
+ class SVCTrainer(BaseTrainer):
17
+ r"""The base trainer for all SVC models. It inherits from BaseTrainer and implements
18
+ ``_build_criterion``, ``_build_dataset`` and ``_build_singer_lut`` methods. You can inherit from this
19
+ class, and implement ``_build_model``, ``_forward_step``.
20
+ """
21
+
22
+ def __init__(self, args=None, cfg=None):
23
+ self.args = args
24
+ self.cfg = cfg
25
+
26
+ self._init_accelerator()
27
+
28
+ # Only for SVC tasks
29
+ with self.accelerator.main_process_first():
30
+ self.singers = self._build_singer_lut()
31
+
32
+ # Super init
33
+ BaseTrainer.__init__(self, args, cfg)
34
+
35
+ # Only for SVC tasks
36
+ self.task_type = "SVC"
37
+ self.logger.info("Task type: {}".format(self.task_type))
38
+
39
+ ### Following are methods only for SVC tasks ###
40
+ # TODO: LEGACY CODE, NEED TO BE REFACTORED
41
+ def _build_dataset(self):
42
+ return SVCDataset, SVCCollator
43
+
44
+ @staticmethod
45
+ def _build_criterion():
46
+ criterion = nn.MSELoss(reduction="none")
47
+ return criterion
48
+
49
+ @staticmethod
50
+ def _compute_loss(criterion, y_pred, y_gt, loss_mask):
51
+ """
52
+ Args:
53
+ criterion: MSELoss(reduction='none')
54
+ y_pred, y_gt: (bs, seq_len, D)
55
+ loss_mask: (bs, seq_len, 1)
56
+ Returns:
57
+ loss: Tensor of shape []
58
+ """
59
+
60
+ # (bs, seq_len, D)
61
+ loss = criterion(y_pred, y_gt)
62
+ # expand loss_mask to (bs, seq_len, D)
63
+ loss_mask = loss_mask.repeat(1, 1, loss.shape[-1])
64
+
65
+ loss = torch.sum(loss * loss_mask) / torch.sum(loss_mask)
66
+ return loss
67
+
68
+ def _save_auxiliary_states(self):
69
+ """
70
+ To save the singer's look-up table in the checkpoint saving path
71
+ """
72
+ with open(
73
+ os.path.join(self.tmp_checkpoint_save_path, self.cfg.preprocess.spk2id), "w"
74
+ ) as f:
75
+ json.dump(self.singers, f, indent=4, ensure_ascii=False)
76
+
77
+ def _build_singer_lut(self):
78
+ resumed_singer_path = None
79
+ if self.args.resume_from_ckpt_path and self.args.resume_from_ckpt_path != "":
80
+ resumed_singer_path = os.path.join(
81
+ self.args.resume_from_ckpt_path, self.cfg.preprocess.spk2id
82
+ )
83
+ if os.path.exists(os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)):
84
+ resumed_singer_path = os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)
85
+
86
+ if resumed_singer_path:
87
+ with open(resumed_singer_path, "r") as f:
88
+ singers = json.load(f)
89
+ else:
90
+ singers = dict()
91
+
92
+ for dataset in self.cfg.dataset:
93
+ singer_lut_path = os.path.join(
94
+ self.cfg.preprocess.processed_dir, dataset, self.cfg.preprocess.spk2id
95
+ )
96
+ with open(singer_lut_path, "r") as singer_lut_file:
97
+ singer_lut = json.load(singer_lut_file)
98
+ for singer in singer_lut.keys():
99
+ if singer not in singers:
100
+ singers[singer] = len(singers)
101
+
102
+ with open(
103
+ os.path.join(self.exp_dir, self.cfg.preprocess.spk2id), "w"
104
+ ) as singer_file:
105
+ json.dump(singers, singer_file, indent=4, ensure_ascii=False)
106
+ print(
107
+ "singers have been dumped to {}".format(
108
+ os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)
109
+ )
110
+ )
111
+ return singers
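To make the masking in `_compute_loss` concrete, here is a tiny worked example (values are illustrative): the element-wise MSE is averaged only over frames where the mask is 1, so padded frames do not affect the loss.

```python
import torch
import torch.nn as nn

criterion = nn.MSELoss(reduction="none")

y_pred = torch.tensor([[[1.0, 1.0], [2.0, 2.0], [0.0, 0.0]]])  # (bs=1, seq_len=3, D=2)
y_gt = torch.tensor([[[1.0, 0.0], [2.0, 2.0], [9.0, 9.0]]])    # last frame is padding
mask = torch.tensor([[[1.0], [1.0], [0.0]]])                   # (bs, seq_len, 1)

loss = criterion(y_pred, y_gt)            # (1, 3, 2) element-wise squared errors
mask = mask.repeat(1, 1, loss.shape[-1])  # expand mask over the feature dimension
loss = torch.sum(loss * mask) / torch.sum(mask)
print(loss)  # tensor(0.2500): only the two unmasked frames contribute
```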
models/svc/comosvc/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.