Billpai committed
Commit f03cd94 · 1 Parent(s): 2781499
This view is limited to 50 files because it contains too many changes.

Files changed (50)
  1. .gitattributes +1 -0
  2. .gitignore +21 -0
  3. README.md +8 -4
  4. app.py +194 -0
  5. ckpts/svc/vocalist_l1_contentvec+whisper/args.json +257 -0
  6. ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/optimizer.bin +3 -0
  7. ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/pytorch_model.bin +3 -0
  8. ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/random_states_0.pkl +3 -0
  9. ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/singers.json +17 -0
  10. ckpts/svc/vocalist_l1_contentvec+whisper/log/vocalist_l1_contentvec+whisper/events.out.tfevents.1696052302.mmnewyardnodesz63219.120.0 +3 -0
  11. ckpts/svc/vocalist_l1_contentvec+whisper/log/vocalist_l1_contentvec+whisper/events.out.tfevents.1696052302.mmnewyardnodesz63219.120.1 +3 -0
  12. ckpts/svc/vocalist_l1_contentvec+whisper/singers.json +17 -0
  13. config/audioldm.json +92 -0
  14. config/autoencoderkl.json +69 -0
  15. config/base.json +220 -0
  16. config/comosvc.json +216 -0
  17. config/diffusion.json +227 -0
  18. config/fs2.json +117 -0
  19. config/transformer.json +180 -0
  20. config/tts.json +23 -0
  21. config/valle.json +52 -0
  22. config/vits.json +101 -0
  23. config/vocoder.json +84 -0
  24. egs/svc/MultipleContentsSVC/README.md +153 -0
  25. egs/svc/MultipleContentsSVC/exp_config.json +126 -0
  26. egs/svc/MultipleContentsSVC/run.sh +1 -0
  27. egs/svc/README.md +34 -0
  28. egs/svc/_template/run.sh +150 -0
  29. egs/vocoder/README.md +23 -0
  30. egs/vocoder/diffusion/README.md +0 -0
  31. egs/vocoder/diffusion/exp_config_base.json +0 -0
  32. egs/vocoder/gan/README.md +224 -0
  33. egs/vocoder/gan/_template/run.sh +143 -0
  34. egs/vocoder/gan/apnet/exp_config.json +45 -0
  35. egs/vocoder/gan/apnet/run.sh +143 -0
  36. egs/vocoder/gan/bigvgan/exp_config.json +66 -0
  37. egs/vocoder/gan/bigvgan/run.sh +143 -0
  38. egs/vocoder/gan/bigvgan_large/exp_config.json +70 -0
  39. egs/vocoder/gan/bigvgan_large/run.sh +143 -0
  40. egs/vocoder/gan/exp_config_base.json +111 -0
  41. egs/vocoder/gan/hifigan/exp_config.json +59 -0
  42. egs/vocoder/gan/hifigan/run.sh +143 -0
  43. egs/vocoder/gan/melgan/exp_config.json +34 -0
  44. egs/vocoder/gan/melgan/run.sh +143 -0
  45. egs/vocoder/gan/nsfhifigan/exp_config.json +83 -0
  46. egs/vocoder/gan/nsfhifigan/run.sh +143 -0
  47. egs/vocoder/gan/tfr_enhanced_hifigan/README.md +185 -0
  48. egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json +118 -0
  49. egs/vocoder/gan/tfr_enhanced_hifigan/run.sh +145 -0
  50. examples/chinese_female_recordings.wav +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,21 @@
+ __pycache__
+ flagged
+ result
+ source_audios
+ ckpts/svc/vocalist_l1_contentvec+whisper/data
+ !ckpts/svc/vocalist_l1_contentvec+whisper/data/vocalist_l1
+
+ # Developing mode
+ _*.sh
+ _*.json
+ *.lst
+ yard*
+ *.out
+ evaluation/evalset_selection
+ mfa
+ egs/svc/*wavmark
+ egs/svc/custom
+ egs/svc/*/dev*
+ egs/svc/dev_exp_config.json
+ bins/svc/demo*
+ bins/svc/preprocess_custom.py
README.md CHANGED
@@ -1,11 +1,15 @@
  ---
- title: Test2
- emoji: 📈
+ title: Singing Voice Conversion
+ emoji: 🎼
  colorFrom: indigo
- colorTo: red
+ colorTo: blue
  sdk: gradio
- sdk_version: 4.28.3
+ sdk_version: 4.8.0
+ python_version: 3.9.15
  app_file: app.py
+ models:
+ - amphion/singing_voice_conversion
+ - amphion/vocoder
  pinned: false
  license: mit
  ---
app.py ADDED
@@ -0,0 +1,194 @@
+ # Copyright (c) 2023 Amphion.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import gradio as gr
+ import os
+ import inference
+
+ SUPPORTED_TARGET_SINGERS = {
+     "Adele": "vocalist_l1_Adele",
+     "Beyonce": "vocalist_l1_Beyonce",
+     "Bruno Mars": "vocalist_l1_BrunoMars",
+     "John Mayer": "vocalist_l1_JohnMayer",
+     "Michael Jackson": "vocalist_l1_MichaelJackson",
+     "Taylor Swift": "vocalist_l1_TaylorSwift",
+     "Jacky Cheung 张学友": "vocalist_l1_张学友",
+     "Jian Li 李健": "vocalist_l1_李健",
+     "Feng Wang 汪峰": "vocalist_l1_汪峰",
+     "Faye Wong 王菲": "vocalist_l1_王菲",
+     "Yijie Shi 石倚洁": "vocalist_l1_石倚洁",
+     "Tsai Chin 蔡琴": "vocalist_l1_蔡琴",
+     "Ying Na 那英": "vocalist_l1_那英",
+     "Eason Chan 陈奕迅": "vocalist_l1_陈奕迅",
+     "David Tao 陶喆": "vocalist_l1_陶喆",
+ }
+
+
+ def svc_inference(
+     source_audio_path,
+     target_singer,
+     key_shift_mode="Auto Shift",
+     key_shift_num=0,
+     diffusion_steps=1000,
+ ):
+     #### Prepare source audio file ####
+     print("source_audio_path: {}".format(source_audio_path))
+     audio_file = source_audio_path.split("/")[-1]
+     audio_name = audio_file.split(".")[0]
+     source_audio_dir = source_audio_path.replace(audio_file, "")
+
+     ### Target Singer ###
+     target_singer = SUPPORTED_TARGET_SINGERS[target_singer]
+
+     ### Inference ###
+     if key_shift_mode == "Auto Shift":
+         key_shift = "autoshift"
+     else:
+         key_shift = key_shift_num
+
+     args_list = ["--config", "ckpts/svc/vocalist_l1_contentvec+whisper/args.json"]
+     args_list += ["--acoustics_dir", "ckpts/svc/vocalist_l1_contentvec+whisper"]
+     args_list += ["--vocoder_dir", "pretrained/bigvgan"]
+     args_list += ["--target_singer", target_singer]
+     args_list += ["--trans_key", str(key_shift)]
+     args_list += ["--diffusion_inference_steps", str(diffusion_steps)]
+     args_list += ["--source", source_audio_dir]
+     args_list += ["--output_dir", "result"]
+     args_list += ["--log_level", "debug"]
+
+     os.environ["WORK_DIR"] = "./"
+     inference.main(args_list)
+
+     ### Display ###
+     result_file = os.path.join(
+         "result/{}/{}_{}.wav".format(audio_name, audio_name, target_singer)
+     )
+     return result_file
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         # Amphion Singing Voice Conversion: *DiffWaveNetSVC*
+
+         [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160)
+
+         This demo provides an Amphion [DiffWaveNetSVC](https://github.com/open-mmlab/Amphion/tree/main/egs/svc/MultipleContentsSVC) pretrained model for you to play. The training data has been detailed [here](https://huggingface.co/amphion/singing_voice_conversion).
+         """
+     )
+
+     gr.Markdown(
+         """
+         ## Source Audio
+         **Hint**: We recommend using dry vocals (e.g., studio recordings or source-separated voices from music) as the input for this demo. At the bottom of this page, we provide some examples for your reference.
+         """
+     )
+     source_audio_input = gr.Audio(
+         sources=["upload", "microphone"],
+         label="Source Audio",
+         type="filepath",
+     )
+
+     with gr.Row():
+         with gr.Column():
+             config_target_singer = gr.Radio(
+                 choices=list(SUPPORTED_TARGET_SINGERS.keys()),
+                 label="Target Singer",
+                 value="Jian Li 李健",
+             )
+             config_keyshift_choice = gr.Radio(
+                 choices=["Auto Shift", "Key Shift"],
+                 value="Auto Shift",
+                 label="Pitch Shift Control",
+                 info='If you want to control the specific pitch shift value, you need to choose "Key Shift"',
+             )
+
+         # gr.Markdown("## Conversion Configurations")
+         with gr.Column():
+             config_keyshift_value = gr.Slider(
+                 -6,
+                 6,
+                 value=0,
+                 step=1,
+                 label="Key Shift Values",
+                 info='How many semitones you want to transpose. This parameter will work only if you choose "Key Shift"',
+             )
+             config_diff_infer_steps = gr.Slider(
+                 1,
+                 1000,
+                 value=1000,
+                 step=1,
+                 label="Diffusion Inference Steps",
+                 info="As the step number increases, the synthesis quality will be better while the inference speed will be lower",
+             )
+     btn = gr.ClearButton(
+         components=[
+             config_target_singer,
+             config_keyshift_choice,
+             config_keyshift_value,
+             config_diff_infer_steps,
+         ]
+     )
+     btn = gr.Button(value="Submit", variant="primary")
+
+     gr.Markdown("## Conversion Result")
+     demo_outputs = gr.Audio(label="Conversion Result")
+
+     btn.click(
+         fn=svc_inference,
+         inputs=[
+             source_audio_input,
+             config_target_singer,
+             config_keyshift_choice,
+             config_keyshift_value,
+             config_diff_infer_steps,
+         ],
+         outputs=demo_outputs,
+     )
+
+     gr.Markdown("## Examples")
+     gr.Examples(
+         examples=[
+             [
+                 "examples/chinese_female_recordings.wav",
+                 "John Mayer",
+                 "Auto Shift",
+                 1000,
+                 "examples/output/chinese_female_recordings_vocalist_l1_JohnMayer.wav",
+             ],
+             [
+                 "examples/chinese_male_seperated.wav",
+                 "Taylor Swift",
+                 "Auto Shift",
+                 1000,
+                 "examples/output/chinese_male_seperated_vocalist_l1_TaylorSwift.wav",
+             ],
+             [
+                 "examples/english_female_seperated.wav",
+                 "Feng Wang 汪峰",
+                 "Auto Shift",
+                 1000,
+                 "examples/output/english_female_seperated_vocalist_l1_汪峰.wav",
+             ],
+             [
+                 "examples/english_male_recordings.wav",
+                 "Yijie Shi 石倚洁",
+                 "Auto Shift",
+                 1000,
+                 "examples/output/english_male_recordings_vocalist_l1_石倚洁.wav",
+             ],
+         ],
+         inputs=[
+             source_audio_input,
+             config_target_singer,
+             config_keyshift_choice,
+             config_diff_infer_steps,
+             demo_outputs,
+         ],
+     )
+
+
+ if __name__ == "__main__":
+     demo.launch()
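
For reference, the same conversion can also be run outside the Gradio UI by reusing the argument list that `svc_inference()` assembles above. The sketch below is illustrative only: it assumes the checkpoint layout and the argv-style parsing of `inference.main()` exactly as shown in this commit, and the target-singer ID and source directory are placeholder values.

```python
# Minimal sketch (not part of the commit): call the inference entry point
# directly with the same flags that app.py builds in svc_inference().
# Paths, the target-singer ID, and the source directory are illustrative;
# how inference.main() parses this argument list is assumed from app.py above.
import os
import inference

args_list = [
    "--config", "ckpts/svc/vocalist_l1_contentvec+whisper/args.json",
    "--acoustics_dir", "ckpts/svc/vocalist_l1_contentvec+whisper",
    "--vocoder_dir", "pretrained/bigvgan",
    "--target_singer", "vocalist_l1_JohnMayer",  # any ID listed in singers.json
    "--trans_key", "autoshift",                  # or an integer semitone shift
    "--diffusion_inference_steps", "1000",
    "--source", "source_audios",                 # directory containing the input .wav
    "--output_dir", "result",
    "--log_level", "debug",
]

os.environ["WORK_DIR"] = "./"
inference.main(args_list)
# Output is written to result/<audio_name>/<audio_name>_<target_singer>.wav,
# which is the path svc_inference() returns to the Gradio Audio component.
```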
ckpts/svc/vocalist_l1_contentvec+whisper/args.json ADDED
@@ -0,0 +1,257 @@
1
+ {
2
+ "task_type": "svc",
3
+ "dataset": [
4
+ "vocalist_l1",
5
+ ],
6
+ "exp_name": "vocalist_l1_contentvec+whisper",
7
+ "inference": {
8
+ "diffusion": {
9
+ "scheduler": "pndm",
10
+ "scheduler_settings": {
11
+ "num_inference_timesteps": 1000,
12
+ },
13
+ },
14
+ },
15
+ "model": {
16
+ "condition_encoder": {
17
+ "content_encoder_dim": 384,
18
+ "contentvec_dim": 256,
19
+ "f0_max": 1100,
20
+ "f0_min": 50,
21
+ "input_loudness_dim": 1,
22
+ "input_melody_dim": 1,
23
+ "merge_mode": "add",
24
+ "mert_dim": 256,
25
+ "n_bins_loudness": 256,
26
+ "n_bins_melody": 256,
27
+ "output_content_dim": 384,
28
+ "output_loudness_dim": 384,
29
+ "output_melody_dim": 384,
30
+ "output_singer_dim": 384,
31
+ "pitch_max": 1100,
32
+ "pitch_min": 50,
33
+ "singer_table_size": 512,
34
+ "use_conformer_for_content_features": false,
35
+ "use_contentvec": true,
36
+ "use_log_f0": true,
37
+ "use_log_loudness": true,
38
+ "use_mert": false,
39
+ "use_singer_encoder": true,
40
+ "use_spkid": true,
41
+ "use_wenet": false,
42
+ "use_whisper": true,
43
+ "wenet_dim": 512,
44
+ "whisper_dim": 1024,
45
+ },
46
+ "diffusion": {
47
+ "bidilconv": {
48
+ "base_channel": 384,
49
+ "conditioner_size": 384,
50
+ "conv_kernel_size": 3,
51
+ "dilation_cycle_length": 4,
52
+ "n_res_block": 20,
53
+ },
54
+ "model_type": "bidilconv",
55
+ "scheduler": "ddpm",
56
+ "scheduler_settings": {
57
+ "beta_end": 0.02,
58
+ "beta_schedule": "linear",
59
+ "beta_start": 0.0001,
60
+ "num_train_timesteps": 1000,
61
+ },
62
+ "step_encoder": {
63
+ "activation": "SiLU",
64
+ "dim_hidden_layer": 512,
65
+ "dim_raw_embedding": 128,
66
+ "max_period": 10000,
67
+ "num_layer": 2,
68
+ },
69
+ "unet2d": {
70
+ "down_block_types": [
71
+ "CrossAttnDownBlock2D",
72
+ "CrossAttnDownBlock2D",
73
+ "CrossAttnDownBlock2D",
74
+ "DownBlock2D",
75
+ ],
76
+ "in_channels": 1,
77
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
78
+ "only_cross_attention": false,
79
+ "out_channels": 1,
80
+ "up_block_types": [
81
+ "UpBlock2D",
82
+ "CrossAttnUpBlock2D",
83
+ "CrossAttnUpBlock2D",
84
+ "CrossAttnUpBlock2D",
85
+ ],
86
+ },
87
+ },
88
+ },
89
+ "model_type": "DiffWaveNetSVC",
90
+ "preprocess": {
91
+ "audio_dir": "audios",
92
+ "bits": 8,
93
+ "content_feature_batch_size": 16,
94
+ "contentvec_batch_size": 1,
95
+ "contentvec_dir": "contentvec",
96
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
97
+ "contentvec_frameshift": 0.02,
98
+ "contentvec_sample_rate": 16000,
99
+ "dur_dir": "durs",
100
+ "duration_dir": "duration",
101
+ "emo2id": "emo2id.json",
102
+ "energy_dir": "energys",
103
+ "extract_audio": false,
104
+ "extract_contentvec_feature": true,
105
+ "extract_energy": true,
106
+ "extract_label": false,
107
+ "extract_mcep": false,
108
+ "extract_mel": true,
109
+ "extract_mert_feature": false,
110
+ "extract_pitch": true,
111
+ "extract_uv": true,
112
+ "extract_wenet_feature": false,
113
+ "extract_whisper_feature": true,
114
+ "f0_max": 1100,
115
+ "f0_min": 50,
116
+ "file_lst": "file.lst",
117
+ "fmax": 12000,
118
+ "fmin": 0,
119
+ "hop_size": 256,
120
+ "is_label": true,
121
+ "is_mu_law": true,
122
+ "lab_dir": "labs",
123
+ "label_dir": "labels",
124
+ "mcep_dir": "mcep",
125
+ "mel_dir": "mels",
126
+ "mel_min_max_norm": true,
127
+ "mel_min_max_stats_dir": "mel_min_max_stats",
128
+ "mert_dir": "mert",
129
+ "mert_feature_layer": -1,
130
+ "mert_frameshit": 0.01333,
131
+ "mert_hop_size": 320,
132
+ "mert_model": "m-a-p/MERT-v1-330M",
133
+ "min_level_db": -115,
134
+ "mu_law_norm": false,
135
+ "n_fft": 1024,
136
+ "n_mel": 100,
137
+ "num_silent_frames": 8,
138
+ "num_workers": 8,
139
+ "phone_seq_file": "phone_seq_file",
140
+ "pin_memory": true,
141
+ "pitch_bin": 256,
142
+ "pitch_dir": "pitches",
143
+ "pitch_extractor": "crepe", // "parselmouth"
144
+ "pitch_max": 1100.0,
145
+ "pitch_min": 50.0,
146
+ "processed_dir": "ckpts/svc/vocalist_l1_contentvec+whisper/data",
147
+ "ref_level_db": 20,
148
+ "sample_rate": 24000,
149
+ "spk2id": "singers.json",
150
+ "train_file": "train.json",
151
+ "trim_fft_size": 512,
152
+ "trim_hop_size": 128,
153
+ "trim_silence": false,
154
+ "trim_top_db": 30,
155
+ "trimmed_wav_dir": "trimmed_wavs",
156
+ "use_audio": false,
157
+ "use_contentvec": true,
158
+ "use_dur": false,
159
+ "use_emoid": false,
160
+ "use_frame_duration": false,
161
+ "use_frame_energy": true,
162
+ "use_frame_pitch": true,
163
+ "use_lab": false,
164
+ "use_label": false,
165
+ "use_log_scale_energy": false,
166
+ "use_log_scale_pitch": false,
167
+ "use_mel": true,
168
+ "use_mert": false,
169
+ "use_min_max_norm_mel": true,
170
+ "use_one_hot": false,
171
+ "use_phn_seq": false,
172
+ "use_phone_duration": false,
173
+ "use_phone_energy": false,
174
+ "use_phone_pitch": false,
175
+ "use_spkid": true,
176
+ "use_uv": true,
177
+ "use_wav": false,
178
+ "use_wenet": false,
179
+ "use_whisper": true,
180
+ "utt2emo": "utt2emo",
181
+ "utt2spk": "utt2singer",
182
+ "uv_dir": "uvs",
183
+ "valid_file": "test.json",
184
+ "wav_dir": "wavs",
185
+ "wenet_batch_size": 1,
186
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
187
+ "wenet_dir": "wenet",
188
+ "wenet_downsample_rate": 4,
189
+ "wenet_frameshift": 0.01,
190
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
191
+ "wenet_sample_rate": 16000,
192
+ "whisper_batch_size": 30,
193
+ "whisper_dir": "whisper",
194
+ "whisper_downsample_rate": 2,
195
+ "whisper_frameshift": 0.01,
196
+ "whisper_model": "medium",
197
+ "whisper_model_path": "pretrained/whisper/medium.pt",
198
+ "whisper_sample_rate": 16000,
199
+ "win_size": 1024,
200
+ },
201
+ "supported_model_type": [
202
+ "Fastspeech2",
203
+ "DiffSVC",
204
+ "Transformer",
205
+ "EDM",
206
+ "CD",
207
+ ],
208
+ "train": {
209
+ "adamw": {
210
+ "lr": 0.0004,
211
+ },
212
+ "batch_size": 32,
213
+ "dataloader": {
214
+ "num_worker": 8,
215
+ "pin_memory": true,
216
+ },
217
+ "ddp": true,
218
+ "epochs": 50000,
219
+ "gradient_accumulation_step": 1,
220
+ "keep_checkpoint_max": 5,
221
+ "keep_last": [
222
+ 5,
223
+ -1,
224
+ ],
225
+ "max_epoch": -1,
226
+ "max_steps": 1000000,
227
+ "multi_speaker_training": false,
228
+ "optimizer": "AdamW",
229
+ "random_seed": 10086,
230
+ "reducelronplateau": {
231
+ "factor": 0.8,
232
+ "min_lr": 0.0001,
233
+ "patience": 10,
234
+ },
235
+ "run_eval": [
236
+ false,
237
+ true,
238
+ ],
239
+ "sampler": {
240
+ "drop_last": true,
241
+ "holistic_shuffle": false,
242
+ },
243
+ "save_checkpoint_stride": [
244
+ 3,
245
+ 10,
246
+ ],
247
+ "save_checkpoints_steps": 10000,
248
+ "save_summary_steps": 500,
249
+ "scheduler": "ReduceLROnPlateau",
250
+ "total_training_steps": 50000,
251
+ "tracker": [
252
+ "tensorboard",
253
+ ],
254
+ "valid_interval": 10000,
255
+ },
256
+ "use_custom_dataset": true,
257
+ }
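
Note that `args.json` above (like the other configuration files added in this commit) is not strict JSON: it contains `//` comments and trailing commas. A minimal sketch for reading such a file, assuming the third-party `json5` package (the repository's own config loader is not part of this diff):

```python
# Minimal sketch (assumption: the `json5` package is installed, e.g. `pip install json5`).
# Reads the comment-and-trailing-comma style config shown above.
import json5

with open("ckpts/svc/vocalist_l1_contentvec+whisper/args.json") as f:
    cfg = json5.load(f)

print(cfg["model_type"])                            # "DiffWaveNetSVC"
print(cfg["preprocess"]["sample_rate"])             # 24000
print(cfg["inference"]["diffusion"]["scheduler"])   # "pndm"
```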
ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/optimizer.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:836af10b834c7aec9209eb19ce43559e6ef1e3a59bd6468e90cadbc9a18749ef
+ size 249512389
ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d54eed12bef331095fc367f196d07c5061d5cb72dd6fe0e1e4453b997bf1d68d
+ size 124755137
ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6798ddffadcd7d5405a77e667c674c474e4fef0cba817fdd300c7c985c1e82fe
+ size 14599
ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/singers.json ADDED
@@ -0,0 +1,17 @@
+ {
+     "vocalist_l1_Adele": 0,
+     "vocalist_l1_Beyonce": 1,
+     "vocalist_l1_BrunoMars": 2,
+     "vocalist_l1_JohnMayer": 3,
+     "vocalist_l1_MichaelJackson": 4,
+     "vocalist_l1_TaylorSwift": 5,
+     "vocalist_l1_张学友": 6,
+     "vocalist_l1_李健": 7,
+     "vocalist_l1_汪峰": 8,
+     "vocalist_l1_王菲": 9,
+     "vocalist_l1_石倚洁": 10,
+     "vocalist_l1_蔡琴": 11,
+     "vocalist_l1_那英": 12,
+     "vocalist_l1_陈奕迅": 13,
+     "vocalist_l1_陶喆": 14
+ }
ckpts/svc/vocalist_l1_contentvec+whisper/log/vocalist_l1_contentvec+whisper/events.out.tfevents.1696052302.mmnewyardnodesz63219.120.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d7f490fd0c97876e24bfc44413365ded7ff5d22c1c79f0dac0b754f3b32df76f
+ size 88
ckpts/svc/vocalist_l1_contentvec+whisper/log/vocalist_l1_contentvec+whisper/events.out.tfevents.1696052302.mmnewyardnodesz63219.120.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e01bcf2fa621ba563b70568c18fe0742d0f48cafae83a6e8beb0bb6d1f6d146d
+ size 77413046
ckpts/svc/vocalist_l1_contentvec+whisper/singers.json ADDED
@@ -0,0 +1,17 @@
+ {
+     "vocalist_l1_Adele": 0,
+     "vocalist_l1_Beyonce": 1,
+     "vocalist_l1_BrunoMars": 2,
+     "vocalist_l1_JohnMayer": 3,
+     "vocalist_l1_MichaelJackson": 4,
+     "vocalist_l1_TaylorSwift": 5,
+     "vocalist_l1_张学友": 6,
+     "vocalist_l1_李健": 7,
+     "vocalist_l1_汪峰": 8,
+     "vocalist_l1_王菲": 9,
+     "vocalist_l1_石倚洁": 10,
+     "vocalist_l1_蔡琴": 11,
+     "vocalist_l1_那英": 12,
+     "vocalist_l1_陈奕迅": 13,
+     "vocalist_l1_陶喆": 14
+ }
config/audioldm.json ADDED
@@ -0,0 +1,92 @@
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "AudioLDM",
4
+ "task_type": "tta",
5
+ "dataset": [
6
+ "AudioCaps"
7
+ ],
8
+ "preprocess": {
9
+ // feature used for model training
10
+ "use_spkid": false,
11
+ "use_uv": false,
12
+ "use_frame_pitch": false,
13
+ "use_phone_pitch": false,
14
+ "use_frame_energy": false,
15
+ "use_phone_energy": false,
16
+ "use_mel": false,
17
+ "use_audio": false,
18
+ "use_label": false,
19
+ "use_one_hot": false,
20
+ "cond_mask_prob": 0.1
21
+ },
22
+ // model
23
+ "model": {
24
+ "audioldm": {
25
+ "image_size": 32,
26
+ "in_channels": 4,
27
+ "out_channels": 4,
28
+ "model_channels": 256,
29
+ "attention_resolutions": [
30
+ 4,
31
+ 2,
32
+ 1
33
+ ],
34
+ "num_res_blocks": 2,
35
+ "channel_mult": [
36
+ 1,
37
+ 2,
38
+ 4
39
+ ],
40
+ "num_heads": 8,
41
+ "use_spatial_transformer": true,
42
+ "transformer_depth": 1,
43
+ "context_dim": 768,
44
+ "use_checkpoint": true,
45
+ "legacy": false
46
+ },
47
+ "autoencoderkl": {
48
+ "ch": 128,
49
+ "ch_mult": [
50
+ 1,
51
+ 1,
52
+ 2,
53
+ 2,
54
+ 4
55
+ ],
56
+ "num_res_blocks": 2,
57
+ "in_channels": 1,
58
+ "z_channels": 4,
59
+ "out_ch": 1,
60
+ "double_z": true
61
+ },
62
+ "noise_scheduler": {
63
+ "num_train_timesteps": 1000,
64
+ "beta_start": 0.00085,
65
+ "beta_end": 0.012,
66
+ "beta_schedule": "scaled_linear",
67
+ "clip_sample": false,
68
+ "steps_offset": 1,
69
+ "set_alpha_to_one": false,
70
+ "skip_prk_steps": true,
71
+ "prediction_type": "epsilon"
72
+ }
73
+ },
74
+ // train
75
+ "train": {
76
+ "lronPlateau": {
77
+ "factor": 0.9,
78
+ "patience": 100,
79
+ "min_lr": 4.0e-5,
80
+ "verbose": true
81
+ },
82
+ "adam": {
83
+ "lr": 5.0e-5,
84
+ "betas": [
85
+ 0.9,
86
+ 0.999
87
+ ],
88
+ "weight_decay": 1.0e-2,
89
+ "eps": 1.0e-8
90
+ }
91
+ }
92
+ }
config/autoencoderkl.json ADDED
@@ -0,0 +1,69 @@
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "AutoencoderKL",
4
+ "task_type": "tta",
5
+ "dataset": [
6
+ "AudioCaps"
7
+ ],
8
+ "preprocess": {
9
+ // feature used for model training
10
+ "use_spkid": false,
11
+ "use_uv": false,
12
+ "use_frame_pitch": false,
13
+ "use_phone_pitch": false,
14
+ "use_frame_energy": false,
15
+ "use_phone_energy": false,
16
+ "use_mel": false,
17
+ "use_audio": false,
18
+ "use_label": false,
19
+ "use_one_hot": false
20
+ },
21
+ // model
22
+ "model": {
23
+ "autoencoderkl": {
24
+ "ch": 128,
25
+ "ch_mult": [
26
+ 1,
27
+ 1,
28
+ 2,
29
+ 2,
30
+ 4
31
+ ],
32
+ "num_res_blocks": 2,
33
+ "in_channels": 1,
34
+ "z_channels": 4,
35
+ "out_ch": 1,
36
+ "double_z": true
37
+ },
38
+ "loss": {
39
+ "kl_weight": 1e-8,
40
+ "disc_weight": 0.5,
41
+ "disc_factor": 1.0,
42
+ "logvar_init": 0.0,
43
+ "min_adapt_d_weight": 0.0,
44
+ "max_adapt_d_weight": 10.0,
45
+ "disc_start": 50001,
46
+ "disc_in_channels": 1,
47
+ "disc_num_layers": 3,
48
+ "use_actnorm": false
49
+ }
50
+ },
51
+ // train
52
+ "train": {
53
+ "lronPlateau": {
54
+ "factor": 0.9,
55
+ "patience": 100,
56
+ "min_lr": 4.0e-5,
57
+ "verbose": true
58
+ },
59
+ "adam": {
60
+ "lr": 4.0e-4,
61
+ "betas": [
62
+ 0.9,
63
+ 0.999
64
+ ],
65
+ "weight_decay": 1.0e-2,
66
+ "eps": 1.0e-8
67
+ }
68
+ }
69
+ }
config/base.json ADDED
@@ -0,0 +1,220 @@
1
+ {
2
+ "supported_model_type": [
3
+ "GANVocoder",
4
+ "Fastspeech2",
5
+ "DiffSVC",
6
+ "Transformer",
7
+ "EDM",
8
+ "CD"
9
+ ],
10
+ "task_type": "",
11
+ "dataset": [],
12
+ "use_custom_dataset": false,
13
+ "preprocess": {
14
+ "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon"
15
+ // trim audio silence
16
+ "data_augment": false,
17
+ "trim_silence": false,
18
+ "num_silent_frames": 8,
19
+ "trim_fft_size": 512, // fft size used in trimming
20
+ "trim_hop_size": 128, // hop size used in trimming
21
+ "trim_top_db": 30, // top db used in trimming sensitive to each dataset
22
+ // acoustic features
23
+ "extract_mel": false,
24
+ "mel_extract_mode": "",
25
+ "extract_linear_spec": false,
26
+ "extract_mcep": false,
27
+ "extract_pitch": false,
28
+ "extract_acoustic_token": false,
29
+ "pitch_remove_outlier": false,
30
+ "extract_uv": false,
31
+ "pitch_norm": false,
32
+ "extract_audio": false,
33
+ "extract_label": false,
34
+ "pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform)
35
+ "extract_energy": false,
36
+ "energy_remove_outlier": false,
37
+ "energy_norm": false,
38
+ "energy_extract_mode": "from_mel",
39
+ "extract_duration": false,
40
+ "extract_amplitude_phase": false,
41
+ "mel_min_max_norm": false,
42
+ // lingusitic features
43
+ "extract_phone": false,
44
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
45
+ // content features
46
+ "extract_whisper_feature": false,
47
+ "extract_contentvec_feature": false,
48
+ "extract_mert_feature": false,
49
+ "extract_wenet_feature": false,
50
+ // Settings for data preprocessing
51
+ "n_mel": 80,
52
+ "win_size": 480,
53
+ "hop_size": 120,
54
+ "sample_rate": 24000,
55
+ "n_fft": 1024,
56
+ "fmin": 0,
57
+ "fmax": 12000,
58
+ "min_level_db": -115,
59
+ "ref_level_db": 20,
60
+ "bits": 8,
61
+ // Directory names of processed data or extracted features
62
+ "processed_dir": "processed_data",
63
+ "trimmed_wav_dir": "trimmed_wavs", // directory name of silence trimed wav
64
+ "raw_data": "raw_data",
65
+ "phone_dir": "phones",
66
+ "wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform)
67
+ "audio_dir": "audios",
68
+ "log_amplitude_dir": "log_amplitudes",
69
+ "phase_dir": "phases",
70
+ "real_dir": "reals",
71
+ "imaginary_dir": "imaginarys",
72
+ "label_dir": "labels",
73
+ "linear_dir": "linears",
74
+ "mel_dir": "mels", // directory name of extraced mel features
75
+ "mcep_dir": "mcep", // directory name of extraced mcep features
76
+ "dur_dir": "durs",
77
+ "symbols_dict": "symbols.dict",
78
+ "lab_dir": "labs", // directory name of extraced label features
79
+ "wenet_dir": "wenet", // directory name of extraced wenet features
80
+ "contentvec_dir": "contentvec", // directory name of extraced wenet features
81
+ "pitch_dir": "pitches", // directory name of extraced pitch features
82
+ "energy_dir": "energys", // directory name of extracted energy features
83
+ "phone_pitch_dir": "phone_pitches", // directory name of extraced pitch features
84
+ "phone_energy_dir": "phone_energys", // directory name of extracted energy features
85
+ "uv_dir": "uvs", // directory name of extracted unvoiced features
86
+ "duration_dir": "duration", // ground-truth duration file
87
+ "phone_seq_file": "phone_seq_file", // phoneme sequence file
88
+ "file_lst": "file.lst",
89
+ "train_file": "train.json", // training set, the json file contains detailed information about the dataset, including dataset name, utterance id, duration of the utterance
90
+ "valid_file": "valid.json", // validattion set
91
+ "spk2id": "spk2id.json", // used for multi-speaker dataset
92
+ "utt2spk": "utt2spk", // used for multi-speaker dataset
93
+ "emo2id": "emo2id.json", // used for multi-emotion dataset
94
+ "utt2emo": "utt2emo", // used for multi-emotion dataset
95
+ // Features used for model training
96
+ "use_text": false,
97
+ "use_phone": false,
98
+ "use_phn_seq": false,
99
+ "use_lab": false,
100
+ "use_linear": false,
101
+ "use_mel": false,
102
+ "use_min_max_norm_mel": false,
103
+ "use_wav": false,
104
+ "use_phone_pitch": false,
105
+ "use_log_scale_pitch": false,
106
+ "use_phone_energy": false,
107
+ "use_phone_duration": false,
108
+ "use_log_scale_energy": false,
109
+ "use_wenet": false,
110
+ "use_dur": false,
111
+ "use_spkid": false, // True: use speaker id for multi-speaker dataset
112
+ "use_emoid": false, // True: use emotion id for multi-emotion dataset
113
+ "use_frame_pitch": false,
114
+ "use_uv": false,
115
+ "use_frame_energy": false,
116
+ "use_frame_duration": false,
117
+ "use_audio": false,
118
+ "use_label": false,
119
+ "use_one_hot": false,
120
+ "use_amplitude_phase": false,
121
+ "data_augment": false,
122
+ "align_mel_duration": false
123
+ },
124
+ "train": {
125
+ "ddp": true,
126
+ "random_seed": 970227,
127
+ "batch_size": 16,
128
+ "max_steps": 1000000,
129
+ // Trackers
130
+ "tracker": [
131
+ "tensorboard"
132
+ // "wandb",
133
+ // "cometml",
134
+ // "mlflow",
135
+ ],
136
+ "max_epoch": -1,
137
+ // -1 means no limit
138
+ "save_checkpoint_stride": [
139
+ 5,
140
+ 20
141
+ ],
142
+ // unit is epoch
143
+ "keep_last": [
144
+ 3,
145
+ -1
146
+ ],
147
+ // -1 means infinite, if one number will broadcast
148
+ "run_eval": [
149
+ false,
150
+ true
151
+ ],
152
+ // if one number will broadcast
153
+ // Fix the random seed
154
+ "random_seed": 10086,
155
+ // Optimizer
156
+ "optimizer": "AdamW",
157
+ "adamw": {
158
+ "lr": 4.0e-4
159
+ // nn model lr
160
+ },
161
+ // LR Scheduler
162
+ "scheduler": "ReduceLROnPlateau",
163
+ "reducelronplateau": {
164
+ "factor": 0.8,
165
+ "patience": 10,
166
+ // unit is epoch
167
+ "min_lr": 1.0e-4
168
+ },
169
+ // Batchsampler
170
+ "sampler": {
171
+ "holistic_shuffle": true,
172
+ "drop_last": true
173
+ },
174
+ // Dataloader
175
+ "dataloader": {
176
+ "num_worker": 32,
177
+ "pin_memory": true
178
+ },
179
+ "gradient_accumulation_step": 1,
180
+ "total_training_steps": 50000,
181
+ "save_summary_steps": 500,
182
+ "save_checkpoints_steps": 10000,
183
+ "valid_interval": 10000,
184
+ "keep_checkpoint_max": 5,
185
+ "multi_speaker_training": false, // True: train multi-speaker model; False: training single-speaker model;
186
+ "max_epoch": -1,
187
+ // -1 means no limit
188
+ "save_checkpoint_stride": [
189
+ 5,
190
+ 20
191
+ ],
192
+ // unit is epoch
193
+ "keep_last": [
194
+ 3,
195
+ -1
196
+ ],
197
+ // -1 means infinite, if one number will broadcast
198
+ "run_eval": [
199
+ false,
200
+ true
201
+ ],
202
+ // Batchsampler
203
+ "sampler": {
204
+ "holistic_shuffle": true,
205
+ "drop_last": true
206
+ },
207
+ // Dataloader
208
+ "dataloader": {
209
+ "num_worker": 32,
210
+ "pin_memory": true
211
+ },
212
+ // Trackers
213
+ "tracker": [
214
+ "tensorboard"
215
+ // "wandb",
216
+ // "cometml",
217
+ // "mlflow",
218
+ ],
219
+ },
220
+ }
config/comosvc.json ADDED
@@ -0,0 +1,216 @@
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "DiffComoSVC",
4
+ "task_type": "svc",
5
+ "use_custom_dataset": false,
6
+ "preprocess": {
7
+ // data augmentations
8
+ "use_pitch_shift": false,
9
+ "use_formant_shift": false,
10
+ "use_time_stretch": false,
11
+ "use_equalizer": false,
12
+ // acoustic features
13
+ "extract_mel": true,
14
+ "mel_min_max_norm": true,
15
+ "extract_pitch": true,
16
+ "pitch_extractor": "parselmouth",
17
+ "extract_uv": true,
18
+ "extract_energy": true,
19
+ // content features
20
+ "extract_whisper_feature": false,
21
+ "whisper_sample_rate": 16000,
22
+ "extract_contentvec_feature": false,
23
+ "contentvec_sample_rate": 16000,
24
+ "extract_wenet_feature": false,
25
+ "wenet_sample_rate": 16000,
26
+ "extract_mert_feature": false,
27
+ "mert_sample_rate": 16000,
28
+ // Default config for whisper
29
+ "whisper_frameshift": 0.01,
30
+ "whisper_downsample_rate": 2,
31
+ // Default config for content vector
32
+ "contentvec_frameshift": 0.02,
33
+ // Default config for mert
34
+ "mert_model": "m-a-p/MERT-v1-330M",
35
+ "mert_feature_layer": -1,
36
+ "mert_hop_size": 320,
37
+ // 24k
38
+ "mert_frameshit": 0.01333,
39
+ // 10ms
40
+ "wenet_frameshift": 0.01,
41
+ // wenetspeech is 4, gigaspeech is 6
42
+ "wenet_downsample_rate": 4,
43
+ // Default config
44
+ "n_mel": 100,
45
+ "win_size": 1024,
46
+ // todo
47
+ "hop_size": 256,
48
+ "sample_rate": 24000,
49
+ "n_fft": 1024,
50
+ // todo
51
+ "fmin": 0,
52
+ "fmax": 12000,
53
+ // todo
54
+ "f0_min": 50,
55
+ // ~C2
56
+ "f0_max": 1100,
57
+ //1100, // ~C6(1100), ~G5(800)
58
+ "pitch_bin": 256,
59
+ "pitch_max": 1100.0,
60
+ "pitch_min": 50.0,
61
+ "is_label": true,
62
+ "is_mu_law": true,
63
+ "bits": 8,
64
+ "mel_min_max_stats_dir": "mel_min_max_stats",
65
+ "whisper_dir": "whisper",
66
+ "contentvec_dir": "contentvec",
67
+ "wenet_dir": "wenet",
68
+ "mert_dir": "mert",
69
+ // Extract content features using dataloader
70
+ "pin_memory": true,
71
+ "num_workers": 8,
72
+ "content_feature_batch_size": 16,
73
+ // Features used for model training
74
+ "use_mel": true,
75
+ "use_min_max_norm_mel": true,
76
+ "use_frame_pitch": true,
77
+ "use_uv": true,
78
+ "use_frame_energy": true,
79
+ "use_log_scale_pitch": false,
80
+ "use_log_scale_energy": false,
81
+ "use_spkid": true,
82
+ // Meta file
83
+ "train_file": "train.json",
84
+ "valid_file": "test.json",
85
+ "spk2id": "singers.json",
86
+ "utt2spk": "utt2singer"
87
+ },
88
+ "model": {
89
+ "teacher_model_path": "[Your Teacher Model Path].bin",
90
+ "condition_encoder": {
91
+ "merge_mode": "add",
92
+ "input_melody_dim": 1,
93
+ "use_log_f0": true,
94
+ "n_bins_melody": 256,
95
+ //# Quantization (0 for not quantization)
96
+ "output_melody_dim": 384,
97
+ "input_loudness_dim": 1,
98
+ "use_log_loudness": true,
99
+ "n_bins_loudness": 256,
100
+ "output_loudness_dim": 384,
101
+ "use_whisper": false,
102
+ "use_contentvec": false,
103
+ "use_wenet": false,
104
+ "use_mert": false,
105
+ "whisper_dim": 1024,
106
+ "contentvec_dim": 256,
107
+ "mert_dim": 256,
108
+ "wenet_dim": 512,
109
+ "content_encoder_dim": 384,
110
+ "output_singer_dim": 384,
111
+ "singer_table_size": 512,
112
+ "output_content_dim": 384,
113
+ "use_spkid": true
114
+ },
115
+ "comosvc": {
116
+ "distill": false,
117
+ // conformer encoder
118
+ "input_dim": 384,
119
+ "output_dim": 100,
120
+ "n_heads": 2,
121
+ "n_layers": 6,
122
+ "filter_channels": 512,
123
+ "dropout": 0.1,
124
+ // karras diffusion
125
+ "P_mean": -1.2,
126
+ "P_std": 1.2,
127
+ "sigma_data": 0.5,
128
+ "sigma_min": 0.002,
129
+ "sigma_max": 80,
130
+ "rho": 7,
131
+ "n_timesteps": 40,
132
+ },
133
+ "diffusion": {
134
+ // Diffusion steps encoder
135
+ "step_encoder": {
136
+ "dim_raw_embedding": 128,
137
+ "dim_hidden_layer": 512,
138
+ "activation": "SiLU",
139
+ "num_layer": 2,
140
+ "max_period": 10000
141
+ },
142
+ // Diffusion decoder
143
+ "model_type": "bidilconv",
144
+ // bidilconv, unet2d, TODO: unet1d
145
+ "bidilconv": {
146
+ "base_channel": 384,
147
+ "n_res_block": 20,
148
+ "conv_kernel_size": 3,
149
+ "dilation_cycle_length": 4,
150
+ // specially, 1 means no dilation
151
+ "conditioner_size": 100
152
+ }
153
+ },
154
+ },
155
+ "train": {
156
+ // Basic settings
157
+ "fast_steps": 0,
158
+ "batch_size": 32,
159
+ "gradient_accumulation_step": 1,
160
+ "max_epoch": -1,
161
+ // -1 means no limit
162
+ "save_checkpoint_stride": [
163
+ 10,
164
+ 100
165
+ ],
166
+ // unit is epoch
167
+ "keep_last": [
168
+ 3,
169
+ -1
170
+ ],
171
+ // -1 means infinite, if one number will broadcast
172
+ "run_eval": [
173
+ false,
174
+ true
175
+ ],
176
+ // if one number will broadcast
177
+ // Fix the random seed
178
+ "random_seed": 10086,
179
+ // Batchsampler
180
+ "sampler": {
181
+ "holistic_shuffle": true,
182
+ "drop_last": true
183
+ },
184
+ // Dataloader
185
+ "dataloader": {
186
+ "num_worker": 32,
187
+ "pin_memory": true
188
+ },
189
+ // Trackers
190
+ "tracker": [
191
+ "tensorboard"
192
+ // "wandb",
193
+ // "cometml",
194
+ // "mlflow",
195
+ ],
196
+ // Optimizer
197
+ "optimizer": "AdamW",
198
+ "adamw": {
199
+ "lr": 4.0e-4
200
+ // nn model lr
201
+ },
202
+ // LR Scheduler
203
+ "scheduler": "ReduceLROnPlateau",
204
+ "reducelronplateau": {
205
+ "factor": 0.8,
206
+ "patience": 10,
207
+ // unit is epoch
208
+ "min_lr": 1.0e-4
209
+ }
210
+ },
211
+ "inference": {
212
+ "comosvc": {
213
+ "inference_steps": 40
214
+ }
215
+ }
216
+ }
config/diffusion.json ADDED
@@ -0,0 +1,227 @@
1
+ {
2
+ // FIXME: THESE ARE LEGACY
3
+ "base_config": "config/base.json",
4
+ "model_type": "diffusion",
5
+ "task_type": "svc",
6
+ "use_custom_dataset": false,
7
+ "preprocess": {
8
+ // data augmentations
9
+ "use_pitch_shift": false,
10
+ "use_formant_shift": false,
11
+ "use_time_stretch": false,
12
+ "use_equalizer": false,
13
+ // acoustic features
14
+ "extract_mel": true,
15
+ "mel_min_max_norm": true,
16
+ "extract_pitch": true,
17
+ "pitch_extractor": "parselmouth",
18
+ "extract_uv": true,
19
+ "extract_energy": true,
20
+ // content features
21
+ "extract_whisper_feature": false,
22
+ "whisper_sample_rate": 16000,
23
+ "extract_contentvec_feature": false,
24
+ "contentvec_sample_rate": 16000,
25
+ "extract_wenet_feature": false,
26
+ "wenet_sample_rate": 16000,
27
+ "extract_mert_feature": false,
28
+ "mert_sample_rate": 16000,
29
+ // Default config for whisper
30
+ "whisper_frameshift": 0.01,
31
+ "whisper_downsample_rate": 2,
32
+ // Default config for content vector
33
+ "contentvec_frameshift": 0.02,
34
+ // Default config for mert
35
+ "mert_model": "m-a-p/MERT-v1-330M",
36
+ "mert_feature_layer": -1,
37
+ "mert_hop_size": 320,
38
+ // 24k
39
+ "mert_frameshit": 0.01333,
40
+ // 10ms
41
+ "wenet_frameshift": 0.01,
42
+ // wenetspeech is 4, gigaspeech is 6
43
+ "wenet_downsample_rate": 4,
44
+ // Default config
45
+ "n_mel": 100,
46
+ "win_size": 1024,
47
+ // todo
48
+ "hop_size": 256,
49
+ "sample_rate": 24000,
50
+ "n_fft": 1024,
51
+ // todo
52
+ "fmin": 0,
53
+ "fmax": 12000,
54
+ // todo
55
+ "f0_min": 50,
56
+ // ~C2
57
+ "f0_max": 1100,
58
+ //1100, // ~C6(1100), ~G5(800)
59
+ "pitch_bin": 256,
60
+ "pitch_max": 1100.0,
61
+ "pitch_min": 50.0,
62
+ "is_label": true,
63
+ "is_mu_law": true,
64
+ "bits": 8,
65
+ "mel_min_max_stats_dir": "mel_min_max_stats",
66
+ "whisper_dir": "whisper",
67
+ "contentvec_dir": "contentvec",
68
+ "wenet_dir": "wenet",
69
+ "mert_dir": "mert",
70
+ // Extract content features using dataloader
71
+ "pin_memory": true,
72
+ "num_workers": 8,
73
+ "content_feature_batch_size": 16,
74
+ // Features used for model training
75
+ "use_mel": true,
76
+ "use_min_max_norm_mel": true,
77
+ "use_frame_pitch": true,
78
+ "use_uv": true,
79
+ "use_frame_energy": true,
80
+ "use_log_scale_pitch": false,
81
+ "use_log_scale_energy": false,
82
+ "use_spkid": true,
83
+ // Meta file
84
+ "train_file": "train.json",
85
+ "valid_file": "test.json",
86
+ "spk2id": "singers.json",
87
+ "utt2spk": "utt2singer"
88
+ },
89
+ "model": {
90
+ "condition_encoder": {
91
+ "merge_mode": "add",
92
+ "input_melody_dim": 1,
93
+ "use_log_f0": true,
94
+ "n_bins_melody": 256,
95
+ //# Quantization (0 for not quantization)
96
+ "output_melody_dim": 384,
97
+ "input_loudness_dim": 1,
98
+ "use_log_loudness": true,
99
+ "n_bins_loudness": 256,
100
+ "output_loudness_dim": 384,
101
+ "use_whisper": false,
102
+ "use_contentvec": false,
103
+ "use_wenet": false,
104
+ "use_mert": false,
105
+ "whisper_dim": 1024,
106
+ "contentvec_dim": 256,
107
+ "mert_dim": 256,
108
+ "wenet_dim": 512,
109
+ "content_encoder_dim": 384,
110
+ "output_singer_dim": 384,
111
+ "singer_table_size": 512,
112
+ "output_content_dim": 384,
113
+ "use_spkid": true
114
+ },
115
+ // FIXME: FOLLOWING ARE NEW!!
116
+ "diffusion": {
117
+ "scheduler": "ddpm",
118
+ "scheduler_settings": {
119
+ "num_train_timesteps": 1000,
120
+ "beta_start": 1.0e-4,
121
+ "beta_end": 0.02,
122
+ "beta_schedule": "linear"
123
+ },
124
+ // Diffusion steps encoder
125
+ "step_encoder": {
126
+ "dim_raw_embedding": 128,
127
+ "dim_hidden_layer": 512,
128
+ "activation": "SiLU",
129
+ "num_layer": 2,
130
+ "max_period": 10000
131
+ },
132
+ // Diffusion decoder
133
+ "model_type": "bidilconv",
134
+ // bidilconv, unet2d, TODO: unet1d
135
+ "bidilconv": {
136
+ "base_channel": 384,
137
+ "n_res_block": 20,
138
+ "conv_kernel_size": 3,
139
+ "dilation_cycle_length": 4,
140
+ // specially, 1 means no dilation
141
+ "conditioner_size": 384
142
+ },
143
+ "unet2d": {
144
+ "in_channels": 1,
145
+ "out_channels": 1,
146
+ "down_block_types": [
147
+ "CrossAttnDownBlock2D",
148
+ "CrossAttnDownBlock2D",
149
+ "CrossAttnDownBlock2D",
150
+ "DownBlock2D"
151
+ ],
152
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
153
+ "up_block_types": [
154
+ "UpBlock2D",
155
+ "CrossAttnUpBlock2D",
156
+ "CrossAttnUpBlock2D",
157
+ "CrossAttnUpBlock2D"
158
+ ],
159
+ "only_cross_attention": false
160
+ }
161
+ }
162
+ },
163
+ // FIXME: FOLLOWING ARE NEW!!
164
+ "train": {
165
+ // Basic settings
166
+ "batch_size": 64,
167
+ "gradient_accumulation_step": 1,
168
+ "max_epoch": -1,
169
+ // -1 means no limit
170
+ "save_checkpoint_stride": [
171
+ 5,
172
+ 20
173
+ ],
174
+ // unit is epoch
175
+ "keep_last": [
176
+ 3,
177
+ -1
178
+ ],
179
+ // -1 means infinite, if one number will broadcast
180
+ "run_eval": [
181
+ false,
182
+ true
183
+ ],
184
+ // if one number will broadcast
185
+ // Fix the random seed
186
+ "random_seed": 10086,
187
+ // Batchsampler
188
+ "sampler": {
189
+ "holistic_shuffle": true,
190
+ "drop_last": true
191
+ },
192
+ // Dataloader
193
+ "dataloader": {
194
+ "num_worker": 32,
195
+ "pin_memory": true
196
+ },
197
+ // Trackers
198
+ "tracker": [
199
+ "tensorboard"
200
+ // "wandb",
201
+ // "cometml",
202
+ // "mlflow",
203
+ ],
204
+ // Optimizer
205
+ "optimizer": "AdamW",
206
+ "adamw": {
207
+ "lr": 4.0e-4
208
+ // nn model lr
209
+ },
210
+ // LR Scheduler
211
+ "scheduler": "ReduceLROnPlateau",
212
+ "reducelronplateau": {
213
+ "factor": 0.8,
214
+ "patience": 10,
215
+ // unit is epoch
216
+ "min_lr": 1.0e-4
217
+ }
218
+ },
219
+ "inference": {
220
+ "diffusion": {
221
+ "scheduler": "pndm",
222
+ "scheduler_settings": {
223
+ "num_inference_timesteps": 1000
224
+ }
225
+ }
226
+ }
227
+ }
config/fs2.json ADDED
@@ -0,0 +1,117 @@
1
+ {
2
+ "base_config": "config/tts.json",
3
+ "model_type": "FastSpeech2",
4
+ "task_type": "tts",
5
+ "dataset": ["LJSpeech"],
6
+ "preprocess": {
7
+ // acoustic features
8
+ "extract_audio": true,
9
+ "extract_mel": true,
10
+ "mel_extract_mode": "taco",
11
+ "mel_min_max_norm": false,
12
+ "extract_pitch": true,
13
+ "extract_uv": false,
14
+ "pitch_extractor": "dio",
15
+ "extract_energy": true,
16
+ "energy_extract_mode": "from_tacotron_stft",
17
+ "extract_duration": true,
18
+ "use_phone": true,
19
+ "pitch_norm": true,
20
+ "energy_norm": true,
21
+ "pitch_remove_outlier": true,
22
+ "energy_remove_outlier": true,
23
+
24
+ // Default config
25
+ "n_mel": 80,
26
+ "win_size": 1024, // todo
27
+ "hop_size": 256,
28
+ "sample_rate": 22050,
29
+ "n_fft": 1024, // todo
30
+ "fmin": 0,
31
+ "fmax": 8000, // todo
32
+ "raw_data": "raw_data",
33
+ "text_cleaners": ["english_cleaners"],
34
+ "f0_min": 71, // ~C2
35
+ "f0_max": 800, //1100, // ~C6(1100), ~G5(800)
36
+ "pitch_bin": 256,
37
+ "pitch_max": 1100.0,
38
+ "pitch_min": 50.0,
39
+ "is_label": true,
40
+ "is_mu_law": true,
41
+ "bits": 8,
42
+
43
+ "mel_min_max_stats_dir": "mel_min_max_stats",
44
+ "whisper_dir": "whisper",
45
+ "content_vector_dir": "content_vector",
46
+ "wenet_dir": "wenet",
47
+ "mert_dir": "mert",
48
+ "spk2id":"spk2id.json",
49
+ "utt2spk":"utt2spk",
50
+
51
+ // Features used for model training
52
+ "use_mel": true,
53
+ "use_min_max_norm_mel": false,
54
+ "use_frame_pitch": false,
55
+ "use_frame_energy": false,
56
+ "use_phone_pitch": true,
57
+ "use_phone_energy": true,
58
+ "use_log_scale_pitch": false,
59
+ "use_log_scale_energy": false,
60
+ "use_spkid": false,
61
+ "align_mel_duration": true,
62
+ "text_cleaners": ["english_cleaners"]
63
+ },
64
+ "model": {
65
+ // Settings for transformer
66
+ "transformer": {
67
+ "encoder_layer": 4,
68
+ "encoder_head": 2,
69
+ "encoder_hidden": 256,
70
+ "decoder_layer": 6,
71
+ "decoder_head": 2,
72
+ "decoder_hidden": 256,
73
+ "conv_filter_size": 1024,
74
+ "conv_kernel_size": [9, 1],
75
+ "encoder_dropout": 0.2,
76
+ "decoder_dropout": 0.2
77
+ },
78
+
79
+ // Settings for variance_predictor
80
+ "variance_predictor":{
81
+ "filter_size": 256,
82
+ "kernel_size": 3,
83
+ "dropout": 0.5
84
+ },
85
+ "variance_embedding":{
86
+ "pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
87
+ "energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
88
+ "n_bins": 256
89
+ },
90
+ "max_seq_len": 1000
91
+ },
92
+ "train":{
93
+ "batch_size": 16,
94
+ "sort_sample": true,
95
+ "drop_last": true,
96
+ "group_size": 4,
97
+ "grad_clip_thresh": 1.0,
98
+ "dataloader": {
99
+ "num_worker": 8,
100
+ "pin_memory": true
101
+ },
102
+ "lr_scheduler":{
103
+ "num_warmup": 4000
104
+ },
105
+ // LR Scheduler
106
+ "scheduler": "NoamLR",
107
+ // Optimizer
108
+ "optimizer": "Adam",
109
+ "adam": {
110
+ "lr": 0.0625,
111
+ "betas": [0.9, 0.98],
112
+ "eps": 0.000000001,
113
+ "weight_decay": 0.0
114
+ },
115
+ }
116
+
117
+ }
config/transformer.json ADDED
@@ -0,0 +1,180 @@
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "Transformer",
4
+ "task_type": "svc",
5
+ "use_custom_dataset": false,
6
+ "preprocess": {
7
+ // data augmentations
8
+ "use_pitch_shift": false,
9
+ "use_formant_shift": false,
10
+ "use_time_stretch": false,
11
+ "use_equalizer": false,
12
+ // acoustic features
13
+ "extract_mel": true,
14
+ "mel_min_max_norm": true,
15
+ "extract_pitch": true,
16
+ "pitch_extractor": "parselmouth",
17
+ "extract_uv": true,
18
+ "extract_energy": true,
19
+ // content features
20
+ "extract_whisper_feature": false,
21
+ "whisper_sample_rate": 16000,
22
+ "extract_contentvec_feature": false,
23
+ "contentvec_sample_rate": 16000,
24
+ "extract_wenet_feature": false,
25
+ "wenet_sample_rate": 16000,
26
+ "extract_mert_feature": false,
27
+ "mert_sample_rate": 16000,
28
+ // Default config for whisper
29
+ "whisper_frameshift": 0.01,
30
+ "whisper_downsample_rate": 2,
31
+ // Default config for content vector
32
+ "contentvec_frameshift": 0.02,
33
+ // Default config for mert
34
+ "mert_model": "m-a-p/MERT-v1-330M",
35
+ "mert_feature_layer": -1,
36
+ "mert_hop_size": 320,
37
+ // 24k
38
+ "mert_frameshit": 0.01333,
39
+ // 10ms
40
+ "wenet_frameshift": 0.01,
41
+ // wenetspeech is 4, gigaspeech is 6
42
+ "wenet_downsample_rate": 4,
43
+ // Default config
44
+ "n_mel": 100,
45
+ "win_size": 1024,
46
+ // todo
47
+ "hop_size": 256,
48
+ "sample_rate": 24000,
49
+ "n_fft": 1024,
50
+ // todo
51
+ "fmin": 0,
52
+ "fmax": 12000,
53
+ // todo
54
+ "f0_min": 50,
55
+ // ~C2
56
+ "f0_max": 1100,
57
+ //1100, // ~C6(1100), ~G5(800)
58
+ "pitch_bin": 256,
59
+ "pitch_max": 1100.0,
60
+ "pitch_min": 50.0,
61
+ "is_label": true,
62
+ "is_mu_law": true,
63
+ "bits": 8,
64
+ "mel_min_max_stats_dir": "mel_min_max_stats",
65
+ "whisper_dir": "whisper",
66
+ "contentvec_dir": "contentvec",
67
+ "wenet_dir": "wenet",
68
+ "mert_dir": "mert",
69
+ // Extract content features using dataloader
70
+ "pin_memory": true,
71
+ "num_workers": 8,
72
+ "content_feature_batch_size": 16,
73
+ // Features used for model training
74
+ "use_mel": true,
75
+ "use_min_max_norm_mel": true,
76
+ "use_frame_pitch": true,
77
+ "use_uv": true,
78
+ "use_frame_energy": true,
79
+ "use_log_scale_pitch": false,
80
+ "use_log_scale_energy": false,
81
+ "use_spkid": true,
82
+ // Meta file
83
+ "train_file": "train.json",
84
+ "valid_file": "test.json",
85
+ "spk2id": "singers.json",
86
+ "utt2spk": "utt2singer"
87
+ },
88
+ "model": {
89
+ "condition_encoder": {
90
+ "merge_mode": "add",
91
+ "input_melody_dim": 1,
92
+ "use_log_f0": true,
93
+ "n_bins_melody": 256,
94
+ //# Quantization (0 for not quantization)
95
+ "output_melody_dim": 384,
96
+ "input_loudness_dim": 1,
97
+ "use_log_loudness": true,
98
+ "n_bins_loudness": 256,
99
+ "output_loudness_dim": 384,
100
+ "use_whisper": false,
101
+ "use_contentvec": true,
102
+ "use_wenet": false,
103
+ "use_mert": false,
104
+ "whisper_dim": 1024,
105
+ "contentvec_dim": 256,
106
+ "mert_dim": 256,
107
+ "wenet_dim": 512,
108
+ "content_encoder_dim": 384,
109
+ "output_singer_dim": 384,
110
+ "singer_table_size": 512,
111
+ "output_content_dim": 384,
112
+ "use_spkid": true
113
+ },
114
+ "transformer": {
115
+ "type": "conformer",
116
+ // 'conformer' or 'transformer'
117
+ "input_dim": 384,
118
+ "output_dim": 100,
119
+ "n_heads": 2,
120
+ "n_layers": 6,
121
+ "filter_channels": 512,
122
+ "dropout": 0.1,
123
+ }
124
+ },
125
+ "train": {
126
+ // Basic settings
127
+ "batch_size": 64,
128
+ "gradient_accumulation_step": 1,
129
+ "max_epoch": -1,
130
+ // -1 means no limit
131
+ "save_checkpoint_stride": [
132
+ 10,
133
+ 100
134
+ ],
135
+ // unit is epoch
136
+ "keep_last": [
137
+ 3,
138
+ -1
139
+ ],
140
+ // -1 means infinite, if one number will broadcast
141
+ "run_eval": [
142
+ false,
143
+ true
144
+ ],
145
+ // if one number will broadcast
146
+ // Fix the random seed
147
+ "random_seed": 10086,
148
+ // Batchsampler
149
+ "sampler": {
150
+ "holistic_shuffle": true,
151
+ "drop_last": true
152
+ },
153
+ // Dataloader
154
+ "dataloader": {
155
+ "num_worker": 32,
156
+ "pin_memory": true
157
+ },
158
+ // Trackers
159
+ "tracker": [
160
+ "tensorboard"
161
+ // "wandb",
162
+ // "cometml",
163
+ // "mlflow",
164
+ ],
165
+ // Optimizer
166
+ "optimizer": "AdamW",
167
+ "adamw": {
168
+ "lr": 4.0e-4
169
+ // nn model lr
170
+ },
171
+ // LR Scheduler
172
+ "scheduler": "ReduceLROnPlateau",
173
+ "reducelronplateau": {
174
+ "factor": 0.8,
175
+ "patience": 10,
176
+ // unit is epoch
177
+ "min_lr": 1.0e-4
178
+ }
179
+ }
180
+ }
config/tts.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "base_config": "config/base.json",
+     "supported_model_type": [
+         "Fastspeech2",
+         "VITS",
+         "VALLE",
+     ],
+     "task_type": "tts",
+     "preprocess": {
+         "language": "en-us",
+         // linguistic features
+         "extract_phone": true,
+         "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
+         "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
+         // Directory names of processed data or extracted features
+         "phone_dir": "phones",
+         "use_phone": true,
+     },
+     "model": {
+         "text_token_num": 512,
+     }
+
+ }
config/valle.json ADDED
@@ -0,0 +1,52 @@
1
+ {
2
+ "base_config": "config/tts.json",
3
+ "model_type": "VALLE",
4
+ "task_type": "tts",
5
+ "dataset": [
6
+ "libritts"
7
+ ],
8
+ "preprocess": {
9
+ "extract_phone": true,
10
+ "phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals or lexicon
11
+ "extract_acoustic_token": true,
12
+ "acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac(todo)
13
+ "acoustic_token_dir": "acoutic_tokens",
14
+ "use_text": false,
15
+ "use_phone": true,
16
+ "use_acoustic_token": true,
17
+ "symbols_dict": "symbols.dict",
18
+ "min_duration": 0.5, // the duration lowerbound to filter the audio with duration < min_duration
19
+ "max_duration": 14, // the duration uperbound to filter the audio with duration > max_duration.
20
+ "sampling_rate": 24000,
21
+ },
22
+ "model": {
23
+ "text_token_num": 512,
24
+ "audio_token_num": 1024,
25
+ "decoder_dim": 1024, // embedding dimension of the decoder model
26
+ "nhead": 16, // number of attention heads in the decoder layers
27
+ "num_decoder_layers": 12, // number of decoder layers
28
+ "norm_first": true, // pre or post Normalization.
29
+ "add_prenet": false, // whether add PreNet after Inputs
30
+ "prefix_mode": 0, // mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
31
+ "share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
32
+ "nar_scale_factor": 1, // model scale factor which will be assigned different meanings in different models
33
+ "prepend_bos": false, // whether prepend <BOS> to the acoustic tokens -> AR Decoder inputs
34
+ "num_quantizers": 8, // numbert of the audio quantization layers
35
+ // "scaling_xformers": false, // Apply Reworked Conformer scaling on Transformers
36
+ },
37
+ "train": {
38
+ "ddp": false,
39
+ "train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s)
40
+ "max_epoch": 20,
41
+ "optimizer": "ScaledAdam",
42
+ "scheduler": "Eden",
43
+ "warmup_steps": 200, // number of steps that affects how rapidly the learning rate decreases
44
+ "base_lr": 0.05, // base learning rate."
45
+ "valid_interval": 1000,
46
+ "log_epoch_step": 1000,
47
+ "save_checkpoint_stride": [
48
+ 1,
49
+ 1
50
+ ]
51
+ }
52
+ }
config/vits.json ADDED
@@ -0,0 +1,101 @@
1
+ {
2
+ "base_config": "config/tts.json",
3
+ "model_type": "VITS",
4
+ "task_type": "tts",
5
+ "preprocess": {
6
+ "extract_phone": true,
7
+ "extract_mel": true,
8
+ "n_mel": 80,
9
+ "fmin": 0,
10
+ "fmax": null,
11
+ "extract_linear_spec": true,
12
+ "extract_audio": true,
13
+ "use_linear": true,
14
+ "use_mel": true,
15
+ "use_audio": true,
16
+ "use_text": false,
17
+ "use_phone": true,
18
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
19
+ "n_fft": 1024,
20
+ "win_size": 1024,
21
+ "hop_size": 256,
22
+ "segment_size": 8192,
23
+ "text_cleaners": [
24
+ "english_cleaners"
25
+ ]
26
+ },
27
+ "model": {
28
+ "text_token_num": 512,
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0.1,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [
38
+ 3,
39
+ 7,
40
+ 11
41
+ ],
42
+ "resblock_dilation_sizes": [
43
+ [
44
+ 1,
45
+ 3,
46
+ 5
47
+ ],
48
+ [
49
+ 1,
50
+ 3,
51
+ 5
52
+ ],
53
+ [
54
+ 1,
55
+ 3,
56
+ 5
57
+ ]
58
+ ],
59
+ "upsample_rates": [
60
+ 8,
61
+ 8,
62
+ 2,
63
+ 2
64
+ ],
65
+ "upsample_initial_channel": 512,
66
+ "upsample_kernel_sizes": [
67
+ 16,
68
+ 16,
69
+ 4,
70
+ 4
71
+ ],
72
+ "n_layers_q": 3,
73
+ "use_spectral_norm": false,
74
+ "n_speakers": 0, // number of speakers; will be automatically set if n_speakers is 0 and multi_speaker_training is true
75
+ "gin_channels": 256,
76
+ "use_sdp": true
77
+ },
78
+ "train": {
79
+ "fp16_run": true,
80
+ "learning_rate": 2e-4,
81
+ "betas": [
82
+ 0.8,
83
+ 0.99
84
+ ],
85
+ "eps": 1e-9,
86
+ "batch_size": 16,
87
+ "lr_decay": 0.999875,
88
+ // "segment_size": 8192,
89
+ "init_lr_ratio": 1,
90
+ "warmup_epochs": 0,
91
+ "c_mel": 45,
92
+ "c_kl": 1.0,
93
+ "AdamW": {
94
+ "betas": [
95
+ 0.8,
96
+ 0.99
97
+ ],
98
+ "eps": 1e-9,
99
+ }
100
+ }
101
+ }
config/vocoder.json ADDED
@@ -0,0 +1,84 @@
1
+ {
2
+ "base_config": "config/base.json",
3
+ "dataset": [
4
+ "LJSpeech",
5
+ "LibriTTS",
6
+ "opencpop",
7
+ "m4singer",
8
+ "svcc",
9
+ "svcceval",
10
+ "pjs",
11
+ "opensinger",
12
+ "popbutfy",
13
+ "nus48e",
14
+ "popcs",
15
+ "kising",
16
+ "csd",
17
+ "opera",
18
+ "vctk",
19
+ "lijian",
20
+ "cdmusiceval"
21
+ ],
22
+ "task_type": "vocoder",
23
+ "preprocess": {
24
+ // acoustic features
25
+ "extract_mel": true,
26
+ "extract_pitch": false,
27
+ "extract_uv": false,
28
+ "extract_audio": true,
29
+ "extract_label": false,
30
+ "extract_one_hot": false,
31
+ "extract_amplitude_phase": false,
32
+ "pitch_extractor": "parselmouth",
33
+ // Settings for data preprocessing
34
+ "n_mel": 100,
35
+ "win_size": 1024,
36
+ "hop_size": 256,
37
+ "sample_rate": 24000,
38
+ "n_fft": 1024,
39
+ "fmin": 0,
40
+ "fmax": 12000,
41
+ "f0_min": 50,
42
+ "f0_max": 1100,
43
+ "pitch_bin": 256,
44
+ "pitch_max": 1100.0,
45
+ "pitch_min": 50.0,
46
+ "is_mu_law": false,
47
+ "bits": 8,
48
+ "cut_mel_frame": 32,
49
+ // Directory names of processed data or extracted features
50
+ "spk2id": "singers.json",
51
+ // Features used for model training
52
+ "use_mel": true,
53
+ "use_frame_pitch": false,
54
+ "use_uv": false,
55
+ "use_audio": true,
56
+ "use_label": false,
57
+ "use_one_hot": false,
58
+ "train_file": "train.json",
59
+ "valid_file": "test.json"
60
+ },
61
+ "train": {
62
+ "random_seed": 114514,
63
+ "batch_size": 64,
64
+ "gradient_accumulation_step": 1,
65
+ "max_epoch": 1000000,
66
+ "save_checkpoint_stride": [
67
+ 20
68
+ ],
69
+ "run_eval": [
70
+ true
71
+ ],
72
+ "sampler": {
73
+ "holistic_shuffle": true,
74
+ "drop_last": true
75
+ },
76
+ "dataloader": {
77
+ "num_worker": 4,
78
+ "pin_memory": true
79
+ },
80
+ "tracker": [
81
+ "tensorboard"
82
+ ],
83
+ }
84
+ }
egs/svc/MultipleContentsSVC/README.md ADDED
@@ -0,0 +1,153 @@
1
+ # Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion
2
+
3
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160)
4
+ [![demo](https://img.shields.io/badge/SVC-Demo-red)](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html)
5
+
6
+ <br>
7
+ <div align="center">
8
+ <img src="../../../imgs/svc/MultipleContentsSVC.png" width="85%">
9
+ </div>
10
+ <br>
11
+
12
+ This is the official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Specifically,
13
+
14
+ - The multiple content features are from [Whisper](https://github.com/openai/whisper) and [ContentVec](https://github.com/auspicious3000/contentvec).
15
+ - The acoustic model is based on Bidirectional Non-Causal Dilated CNN (called `DiffWaveNetSVC` in Amphion), which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219). A minimal sketch of one such dilated residual block is given after this list.
16
+ - The vocoder uses the [BigVGAN](https://github.com/NVIDIA/BigVGAN) architecture, and we fine-tuned it on over 120 hours of singing voice data.
17
+
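As a rough illustration of the `DiffWaveNetSVC` backbone mentioned above, the sketch below shows one non-causal dilated residual block in the WaveNet/DiffWave style, using the `bidilconv` settings from `exp_config.json` (base channel 512, kernel size 3, conditioner size 384). It is a simplified assumption-based sketch, not Amphion's actual module.

```python
# Minimal sketch (not Amphion's implementation) of one bidirectional,
# non-causal dilated residual block as configured by `bidilconv`.
import torch
import torch.nn as nn

class DilatedResBlock(nn.Module):
    def __init__(self, channels=512, kernel_size=3, dilation=1, cond_dim=384):
        super().__init__()
        padding = (kernel_size - 1) // 2 * dilation   # "same" padding -> non-causal
        self.dilated_conv = nn.Conv1d(channels, 2 * channels, kernel_size,
                                      padding=padding, dilation=dilation)
        self.cond_proj = nn.Conv1d(cond_dim, 2 * channels, 1)  # conditioner (content/F0/speaker)
        self.out_proj = nn.Conv1d(channels, 2 * channels, 1)

    def forward(self, x, cond):
        # x: [B, channels, T], cond: [B, cond_dim, T]
        h = self.dilated_conv(x) + self.cond_proj(cond)
        gate, filt = h.chunk(2, dim=1)
        h = torch.sigmoid(gate) * torch.tanh(filt)    # gated activation, as in WaveNet/DiffWave
        residual, skip = self.out_proj(h).chunk(2, dim=1)
        return (x + residual) / (2 ** 0.5), skip

# The full decoder stacks n_res_block=40 such blocks, with the dilation
# cycling over e.g. 1, 2, 4, 8 (dilation_cycle_length=4).
```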
18
+ There are four stages in total:
19
+
20
+ 1. Data preparation
21
+ 2. Features extraction
22
+ 3. Training
23
+ 4. Inference/conversion
24
+
25
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
26
+ > ```bash
27
+ > cd Amphion
28
+ > ```
29
+
30
+ ## 1. Data Preparation
31
+
32
+ ### Dataset Download
33
+
34
+ By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
35
+
36
+ ### Configuration
37
+
38
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
39
+
40
+ ```json
41
+ "dataset": [
42
+ "m4singer",
43
+ "opencpop",
44
+ "opensinger",
45
+ "svcc",
46
+ "vctk"
47
+ ],
48
+ "dataset_path": {
49
+ // TODO: Fill in your dataset path
50
+ "m4singer": "[M4Singer dataset path]",
51
+ "opencpop": "[Opencpop dataset path]",
52
+ "opensinger": "[OpenSinger dataset path]",
53
+ "svcc": "[SVCC dataset path]",
54
+ "vctk": "[VCTK dataset path]"
55
+ },
56
+ ```
57
+
58
+ ## 2. Features Extraction
59
+
60
+ ### Content-based Pretrained Models Download
61
+
62
+ By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
63
+
64
+ ### Configuration
65
+
66
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
67
+
68
+ ```json
69
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
70
+ "log_dir": "ckpts/svc",
71
+ "preprocess": {
72
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
73
+ "processed_dir": "data",
74
+ ...
75
+ },
76
+ ```
77
+
78
+ ### Run
79
+
80
+ Run the `run.sh` as the preprocessing stage (set `--stage 1`).
81
+
82
+ ```bash
83
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 1
84
+ ```
85
+
86
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
87
+
88
+ ## 3. Training
89
+
90
+ ### Configuration
91
+
92
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
93
+
94
+ ```json
95
+ "train": {
96
+ "batch_size": 32,
97
+ ...
98
+ "adamw": {
99
+ "lr": 2.0e-4
100
+ },
101
+ ...
102
+ }
103
+ ```
104
+
105
+ ### Run
106
+
107
+ Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
108
+
109
+ ```bash
110
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName]
111
+ ```
112
+
113
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
114
+
115
+ ## 4. Inference/Conversion
116
+
117
+ ### Pretrained Vocoder Download
118
+
119
+ We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
120
+
121
+ ### Run
122
+
123
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
124
+
125
+ | Parameters | Description | Example |
126
+ | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
127
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
128
+ | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
129
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
130
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
131
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
132
+
133
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
134
+
135
+ ```bash
136
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \
137
+ --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
138
+ --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
139
+ --infer_source_audio_dir [Your Audios Folder] \
140
+ --infer_target_speaker "opencpop_female1" \
141
+ --infer_key_shift "autoshift"
142
+ ```
143
+
144
+ ## Citations
145
+
146
+ ```bibtex
147
+ @article{zhang2023leveraging,
148
+ title={Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion},
149
+ author={Zhang, Xueyao and Gu, Yicheng and Chen, Haopeng and Fang, Zihao and Zou, Lexiao and Xue, Liumeng and Wu, Zhizheng},
150
+ journal={Machine Learning for Audio Workshop, NeurIPS 2023},
151
+ year={2023}
152
+ }
153
+ ```
egs/svc/MultipleContentsSVC/exp_config.json ADDED
@@ -0,0 +1,126 @@
1
+ {
2
+ "base_config": "config/diffusion.json",
3
+ "model_type": "DiffWaveNetSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
20
+ "log_dir": "ckpts/svc",
21
+ "preprocess": {
22
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
23
+ "processed_dir": "data",
24
+ // Config for features extraction
25
+ "extract_mel": true,
26
+ "extract_pitch": true,
27
+ "extract_energy": true,
28
+ "extract_whisper_feature": true,
29
+ "extract_contentvec_feature": true,
30
+ "extract_wenet_feature": false,
31
+ "whisper_batch_size": 30, // decrease it if your GPU is out of memory
32
+ "contentvec_batch_size": 1,
33
+ // Fill in the content-based pretrained model's path
34
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
35
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
36
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
37
+ "whisper_model": "medium",
38
+ "whisper_model_path": "pretrained/whisper/medium.pt",
39
+ // Config for features usage
40
+ "use_mel": true,
41
+ "use_min_max_norm_mel": true,
42
+ "use_frame_pitch": true,
43
+ "use_frame_energy": true,
44
+ "use_spkid": true,
45
+ "use_whisper": true,
46
+ "use_contentvec": true,
47
+ "use_wenet": false,
48
+ "n_mel": 100,
49
+ "sample_rate": 24000
50
+ },
51
+ "model": {
52
+ "condition_encoder": {
53
+ // Config for features usage
54
+ "use_whisper": true,
55
+ "use_contentvec": true,
56
+ "use_wenet": false,
57
+ "whisper_dim": 1024,
58
+ "contentvec_dim": 256,
59
+ "wenet_dim": 512,
60
+ "use_singer_encoder": false,
61
+ "pitch_min": 50,
62
+ "pitch_max": 1100
63
+ },
64
+ "diffusion": {
65
+ "scheduler": "ddpm",
66
+ "scheduler_settings": {
67
+ "num_train_timesteps": 1000,
68
+ "beta_start": 1.0e-4,
69
+ "beta_end": 0.02,
70
+ "beta_schedule": "linear"
71
+ },
72
+ // Diffusion steps encoder
73
+ "step_encoder": {
74
+ "dim_raw_embedding": 128,
75
+ "dim_hidden_layer": 512,
76
+ "activation": "SiLU",
77
+ "num_layer": 2,
78
+ "max_period": 10000
79
+ },
80
+ // Diffusion decoder
81
+ "model_type": "bidilconv",
82
+ // bidilconv, unet2d, TODO: unet1d
83
+ "bidilconv": {
84
+ "base_channel": 512,
85
+ "n_res_block": 40,
86
+ "conv_kernel_size": 3,
87
+ "dilation_cycle_length": 4,
88
+ // specifically, a value of 1 means no dilation
89
+ "conditioner_size": 384
90
+ }
91
+ }
92
+ },
93
+ "train": {
94
+ "batch_size": 32,
95
+ "gradient_accumulation_step": 1,
96
+ "max_epoch": -1, // -1 means no limit
97
+ "save_checkpoint_stride": [
98
+ 3,
99
+ 50
100
+ ],
101
+ "keep_last": [
102
+ 3,
103
+ 2
104
+ ],
105
+ "run_eval": [
106
+ true,
107
+ true
108
+ ],
109
+ "adamw": {
110
+ "lr": 2.0e-4
111
+ },
112
+ "reducelronplateau": {
113
+ "factor": 0.8,
114
+ "patience": 30,
115
+ "min_lr": 1.0e-4
116
+ },
117
+ "dataloader": {
118
+ "num_worker": 8,
119
+ "pin_memory": true
120
+ },
121
+ "sampler": {
122
+ "holistic_shuffle": false,
123
+ "drop_last": true
124
+ }
125
+ }
126
+ }
egs/svc/MultipleContentsSVC/run.sh ADDED
@@ -0,0 +1 @@
1
+ ../_template/run.sh
egs/svc/README.md ADDED
@@ -0,0 +1,34 @@
1
+ # Amphion Singing Voice Conversion (SVC) Recipe
2
+
3
+ ## Quick Start
4
+
5
+ We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting-edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html).
6
+
7
+ ## Supported Model Architectures
8
+
9
+ The main idea of SVC is to first disentangle the speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target, which usually utilizes an acoustic decoder and a subsequent waveform synthesizer (vocoder):
10
+
11
+ <br>
12
+ <div align="center">
13
+ <img src="../../imgs/svc/pipeline.png" width="70%">
14
+ </div>
15
+ <br>
16
+
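The figure above can be read as the following conceptual pseudocode (hypothetical function names, not Amphion's actual API):

```python
# Conceptual sketch of the SVC pipeline shown above (hypothetical names).
def convert(source_audio, target_speaker_id, models):
    # 1. Speaker-agnostic representations from the source audio
    content = models.content_extractor(source_audio)      # e.g., Whisper / ContentVec features
    f0, energy = models.prosody_extractor(source_audio)   # prosody features

    # 2. Desired speaker information
    spk_emb = models.speaker_lut[target_speaker_id]        # speaker look-up table

    # 3. Acoustic decoder predicts the mel-spectrogram
    mel = models.acoustic_decoder(content, f0, energy, spk_emb)

    # 4. Waveform synthesizer (vocoder) generates the audio
    return models.vocoder(mel)
```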
17
+ Until now, Amphion SVC has supported the following features and models:
18
+
19
+ - **Speaker-agnostic Representations**:
20
+ - Content Features: Sourcing from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec).
21
+ - Prosody Features: F0 and energy.
22
+ - **Speaker Embeddings**:
23
+ - Speaker Look-Up Table.
24
+ - Reference Encoder (👨‍💻 developing): It can be used for zero-shot SVC.
25
+ - **Acoustic Decoders**:
26
+ - Diffusion-based models:
27
+ - **[DiffWaveNetSVC](MultipleContentsSVC)**: The encoder is based on Bidirectional Non-Causal Dilated CNN, which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
28
+ - **[DiffComoSVC](DiffComoSVC)** (👨‍💻 developing): The diffusion framework is based on [Consistency Model](https://proceedings.mlr.press/v202/song23a.html). It can significantly accelerate the inference process of the diffusion model.
29
+ - Transformer-based models:
30
+ - **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture.
31
+ - VAE- and Flow-based models:
32
+ - **[VitsSVC]()** (👨‍💻 developing): It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
33
+ - **Waveform Synthesizers (Vocoders)**:
34
+ - The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md).
egs/svc/_template/run.sh ADDED
@@ -0,0 +1,150 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
37
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
38
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
39
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
40
+ # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
41
+ --infer_source_file) shift; infer_source_file=$1 ; shift ;;
42
+ --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
43
+ # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1".
44
+ --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
45
+ # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift".
46
+ --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
47
+ # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
48
+ --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;
49
+
50
+ --) shift ; break ;;
51
+ *) echo "Invalid option: $1"; exit 1 ;;
52
+ esac
53
+ done
54
+
55
+
56
+ ### Value check ###
57
+ if [ -z "$running_stage" ]; then
58
+ echo "[Error] Please specify the running stage"
59
+ exit 1
60
+ fi
61
+
62
+ if [ -z "$exp_config" ]; then
63
+ exp_config="${exp_dir}"/exp_config.json
64
+ fi
65
+ echo "Experimental Configuration File: $exp_config"
66
+
67
+ if [ -z "$gpu" ]; then
68
+ gpu="0"
69
+ fi
70
+
71
+ ######## Features Extraction ###########
72
+ if [ $running_stage -eq 1 ]; then
73
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
74
+ --config $exp_config \
75
+ --num_workers 4
76
+ fi
77
+
78
+ ######## Training ###########
79
+ if [ $running_stage -eq 2 ]; then
80
+ if [ -z "$exp_name" ]; then
81
+ echo "[Error] Please specify the experiments name"
82
+ exit 1
83
+ fi
84
+ echo "Experimental Name: $exp_name"
85
+
86
+ if [ "$resume" = true ]; then
87
+ echo "Automatically resume from the experimental dir..."
88
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
89
+ --config "$exp_config" \
90
+ --exp_name "$exp_name" \
91
+ --log_level info \
92
+ --resume
93
+ else
94
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \
95
+ --config "$exp_config" \
96
+ --exp_name "$exp_name" \
97
+ --log_level info \
98
+ --resume_from_ckpt_path "$resume_from_ckpt_path" \
99
+ --resume_type "$resume_type"
100
+ fi
101
+ fi
102
+
103
+ ######## Inference/Conversion ###########
104
+ if [ $running_stage -eq 3 ]; then
105
+ if [ -z "$infer_expt_dir" ]; then
106
+ echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
107
+ exit 1
108
+ fi
109
+
110
+ if [ -z "$infer_output_dir" ]; then
111
+ infer_output_dir="$infer_expt_dir/result"
112
+ fi
113
+
114
+ if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
115
+ echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
116
+ exit 1
117
+ fi
118
+
119
+ if [ -z "$infer_source_file" ]; then
120
+ infer_source=$infer_source_audio_dir
121
+ fi
122
+
123
+ if [ -z "$infer_source_audio_dir" ]; then
124
+ infer_source=$infer_source_file
125
+ fi
126
+
127
+ if [ -z "$infer_target_speaker" ]; then
128
+ echo "[Error] Please specify the target speaker. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1""
129
+ exit 1
130
+ fi
131
+
132
+ if [ -z "$infer_key_shift" ]; then
133
+ infer_key_shift="autoshift"
134
+ fi
135
+
136
+ if [ -z "$infer_vocoder_dir" ]; then
137
+ infer_vocoder_dir="$work_dir"/pretrained/bigvgan
138
+ echo "[Warning] You did not specify infer_vocoder_dir. It is set to $infer_vocoder_dir by default. Make sure that you have followed Amphion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
139
+ fi
140
+
141
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
142
+ --config $exp_config \
143
+ --acoustics_dir $infer_expt_dir \
144
+ --vocoder_dir $infer_vocoder_dir \
145
+ --target_singer $infer_target_speaker \
146
+ --trans_key $infer_key_shift \
147
+ --source $infer_source \
148
+ --output_dir $infer_output_dir \
149
+ --log_level debug
150
+ fi
egs/vocoder/README.md ADDED
@@ -0,0 +1,23 @@
1
+ # Amphion Vocoder Recipe
2
+
3
+ ## Quick Start
4
+
5
+ We provide a [**beginner recipe**](gan/tfr_enhanced_hifigan/README.md) to demonstrate how to train a high-quality HiFi-GAN speech vocoder. Specifically, it is also an official implementation of our paper "[Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder](https://arxiv.org/abs/2311.14957)". Some demos can be seen [here](https://vocodexelysium.github.io/MS-SB-CQTD/).
6
+
7
+ ## Supported Models
8
+
9
+ A neural vocoder generates audible waveforms from acoustic representations and is one of the key components of current audio generation systems. Until now, Amphion has supported various widely-used vocoders of different types, including:
10
+
11
+ - **GAN-based vocoders**, which we have provided [**a unified recipe**](gan/README.md) :
12
+ - [MelGAN](https://arxiv.org/abs/1910.06711)
13
+ - [HiFi-GAN](https://arxiv.org/abs/2010.05646)
14
+ - [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts)
15
+ - [BigVGAN](https://arxiv.org/abs/2206.04658)
16
+ - [APNet](https://arxiv.org/abs/2305.07952)
17
+ - **Flow-based vocoders** (👨‍💻 developing):
18
+ - [WaveGlow](https://arxiv.org/abs/1811.00002)
19
+ - **Diffusion-based vocoders** (👨‍💻 developing):
20
+ - [Diffwave](https://arxiv.org/abs/2009.09761)
21
+ - **Auto-regressive based vocoders** (👨‍💻 developing):
22
+ - [WaveNet](https://arxiv.org/abs/1609.03499)
23
+ - [WaveRNN](https://arxiv.org/abs/1802.08435v1)
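As a concrete example of the "acoustic representations" mentioned above, the sketch below computes a log mel-spectrogram with librosa using the default settings from `config/vocoder.json` (24 kHz, 1024-point FFT, hop size 256, 100 mel bins). Amphion's own preprocessing code performs this step during the recipes; this snippet is only a hedged approximation for illustration, and the input file name is hypothetical.

```python
# Rough sketch (using librosa, not Amphion's preprocessing) of the
# mel-spectrogram input that these vocoders consume.
import librosa
import numpy as np

y, sr = librosa.load("example.wav", sr=24000)
mel = librosa.feature.melspectrogram(
    y=y, sr=sr, n_fft=1024, hop_length=256, win_length=1024,
    n_mels=100, fmin=0, fmax=12000, power=1.0,
)
log_mel = np.log(np.clip(mel, a_min=1e-5, a_max=None))  # log compression
print(log_mel.shape)  # (100, n_frames): one 100-dim frame every 256 samples
```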
egs/vocoder/diffusion/README.md ADDED
File without changes
egs/vocoder/diffusion/exp_config_base.json ADDED
File without changes
egs/vocoder/gan/README.md ADDED
@@ -0,0 +1,224 @@
1
+ # Amphion GAN-based Vocoder Recipe
2
+
3
+ ## Supported Model Architectures
4
+
5
+ GAN-based Vocoder consists of a generator and multiple discriminators, as illustrated below:
6
+
7
+ <br>
8
+ <div align="center">
9
+ <img src="../../../imgs/vocoder/gan/pipeline.png" width="40%">
10
+ </div>
11
+ <br>
12
+
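At a high level, every training step alternates between updating the discriminators and updating the generator with adversarial, feature-matching, and mel-reconstruction terms (the exact set is selected by the `criterions` list in each recipe's config). The following is a rough, hedged sketch with hypothetical module interfaces, not Amphion's actual trainer:

```python
# Rough sketch of one GAN-vocoder training step (hypothetical modules).
# `generator` maps mels to waveforms; each discriminator returns
# (score, intermediate_features) in this simplified interface.
import torch
import torch.nn.functional as F

def train_step(mel, real_wav, generator, discriminators, opt_g, opt_d, mel_fn):
    fake_wav = generator(mel)

    # --- Discriminator update: real -> 1, fake -> 0 (least-squares GAN) ---
    opt_d.zero_grad()
    d_loss = 0.0
    for d in discriminators:
        real_out, _ = d(real_wav)
        fake_out, _ = d(fake_wav.detach())
        d_loss = d_loss + torch.mean((real_out - 1) ** 2) + torch.mean(fake_out ** 2)
    d_loss.backward()
    opt_d.step()

    # --- Generator update: adversarial + feature matching + mel loss ---
    opt_g.zero_grad()
    g_loss = F.l1_loss(mel_fn(fake_wav), mel) * 45.0         # mel-reconstruction term
    for d in discriminators:
        _, real_feats = d(real_wav)
        fake_out, fake_feats = d(fake_wav)
        g_loss = g_loss + torch.mean((fake_out - 1) ** 2)     # adversarial term
        for rf, ff in zip(real_feats, fake_feats):             # feature-matching term
            g_loss = g_loss + 2.0 * F.l1_loss(ff, rf)
    g_loss.backward()
    opt_g.step()
    return d_loss.item(), g_loss.item()
```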
13
+ Until now, Amphion GAN-based Vocoder has supported the following generators and discriminators.
14
+
15
+ - **Generators**
16
+ - [MelGAN](https://arxiv.org/abs/1910.06711)
17
+ - [HiFi-GAN](https://arxiv.org/abs/2010.05646)
18
+ - [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts)
19
+ - [BigVGAN](https://arxiv.org/abs/2206.04658)
20
+ - [APNet](https://arxiv.org/abs/2305.07952)
21
+ - **Discriminators**
22
+ - [Multi-Scale Discriminator](https://arxiv.org/abs/2010.05646)
23
+ - [Multi-Period Discriminator](https://arxiv.org/abs/2010.05646)
24
+ - [Multi-Resolution Discriminator](https://arxiv.org/abs/2011.09631)
25
+ - [Multi-Scale Short-Time Fourier Transform Discriminator](https://arxiv.org/abs/2210.13438)
26
+ - [**Multi-Scale Constant-Q Transform Discriminator (ours)**](https://arxiv.org/abs/2311.14957)
27
+
28
+ You can use any vocoder architecture with any dataset you want. There are four steps in total:
29
+
30
+ 1. Data preparation
31
+ 2. Feature extraction
32
+ 3. Training
33
+ 4. Inference
34
+
35
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
36
+ > ```bash
37
+ > cd Amphion
38
+ > ```
39
+
40
+ ## 1. Data Preparation
41
+
42
+ You can train the vocoder with any datasets. Amphion's supported open-source datasets are detailed [here](../../../datasets/README.md).
43
+
44
+ ### Configuration
45
+
46
+ Specify the dataset path in `exp_config_base.json`. Note that you can change the `dataset` list to use your preferred datasets.
47
+
48
+ ```json
49
+ "dataset": [
50
+ "csd",
51
+ "kising",
52
+ "m4singer",
53
+ "nus48e",
54
+ "opencpop",
55
+ "opensinger",
56
+ "opera",
57
+ "pjs",
58
+ "popbutfy",
59
+ "popcs",
60
+ "ljspeech",
61
+ "vctk",
62
+ "libritts",
63
+ ],
64
+ "dataset_path": {
65
+ // TODO: Fill in your dataset path
66
+ "csd": "[dataset path]",
67
+ "kising": "[dataset path]",
68
+ "m4singer": "[dataset path]",
69
+ "nus48e": "[dataset path]",
70
+ "opencpop": "[dataset path]",
71
+ "opensinger": "[dataset path]",
72
+ "opera": "[dataset path]",
73
+ "pjs": "[dataset path]",
74
+ "popbutfy": "[dataset path]",
75
+ "popcs": "[dataset path]",
76
+ "ljspeech": "[dataset path]",
77
+ "vctk": "[dataset path]",
78
+ "libritts": "[dataset path]",
79
+ },
80
+ ```
81
+
82
+ ## 2. Feature Extraction
83
+
84
+ The needed features are specified in each vocoder's own directory, so this step does not require any modification.
85
+
86
+ ### Configuration
87
+
88
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config_base.json`:
89
+
90
+ ```json
91
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
92
+ "log_dir": "ckpts/vocoder",
93
+ "preprocess": {
94
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
95
+ "processed_dir": "data",
96
+ ...
97
+ },
98
+ ```
99
+
100
+ ### Run
101
+
102
+ Run the `run.sh` as the preprocessing stage (set `--stage 1`).
103
+
104
+ ```bash
105
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 1
106
+ ```
107
+
108
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
109
+
110
+ ## 3. Training
111
+
112
+ ### Configuration
113
+
114
+ We provide the default hyperparameters in `exp_config_base.json`. They can work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
115
+
116
+ ```json
117
+ "train": {
118
+ "batch_size": 16,
119
+ "max_epoch": 1000000,
120
+ "save_checkpoint_stride": [20],
121
+ "adamw": {
122
+ "lr": 2.0e-4,
123
+ "adam_b1": 0.8,
124
+ "adam_b2": 0.99
125
+ },
126
+ "exponential_lr": {
127
+ "lr_decay": 0.999
128
+ },
129
+ }
130
+ ```
131
+
132
+ You can also choose any number of preferred discriminators for training in `exp_config_base.json`.
133
+
134
+ ```json
135
+ "discriminators": [
136
+ "msd",
137
+ "mpd",
138
+ "msstftd",
139
+ "mssbcqtd",
140
+ ],
141
+ ```
142
+
143
+ ### Run
144
+
145
+ Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`.
146
+
147
+ ```bash
148
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 2 --name [YourExptName]
149
+ ```
150
+
151
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
152
+
153
+
154
+ ## 4. Inference
155
+
156
+ ### Run
157
+
158
+ Run the `run.sh` as the inference stage (set `--stage 3`). We provide three different inference modes: `infer_from_dataset`, `infer_from_feature`, and `infer_from_audio`.
159
+
160
+ ```bash
161
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
162
+ --infer_mode [Your chosen inference mode] \
163
+ --infer_datasets [Datasets you want to inference, needed when infer_from_dataset] \
164
+ --infer_feature_dir [Your path to your predicted acoustic features, needed when infer_from_feature] \
165
+ --infer_audio_dir [Your path to your audio files, needed when infer_from_audio] \
166
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
167
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
168
+ ```
169
+
170
+ #### a. Inference from Dataset
171
+
172
+ Run the `run.sh` with the specified datasets. Here is an example.
173
+
174
+ ```bash
175
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
176
+ --infer_mode infer_from_dataset \
177
+ --infer_datasets "libritts vctk ljspeech" \
178
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
179
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
180
+ ```
181
+
182
+ #### b. Inference from Features
183
+
184
+ If you want to run inference from your generated acoustic features, you should first prepare them in the following structure:
185
+
186
+ ```plaintext
187
+ ┣ {infer_feature_dir}
188
+ ┃ ┣ mels
189
+ ┃ ┃ ┣ sample1.npy
190
+ ┃ ┃ ┣ sample2.npy
191
+ ┃ ┣ f0s (required if you use NSF-HiFiGAN)
192
+ ┃ ┃ ┣ sample1.npy
193
+ ┃ ┃ ┣ sample2.npy
194
+ ```
195
+
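If the features come from your own acoustic model, a minimal sketch like the one below can produce this layout. It uses plain NumPy; the array shapes and normalization must match the vocoder checkpoint's preprocessing, so treat it only as an illustration.

```python
# Minimal sketch for exporting predicted features into the layout above.
import os
import numpy as np

def export_features(infer_feature_dir, mels, f0s=None):
    # mels: dict mapping sample name -> np.ndarray mel-spectrogram
    # f0s:  optional dict mapping sample name -> np.ndarray F0 contour (for NSF-HiFiGAN)
    os.makedirs(os.path.join(infer_feature_dir, "mels"), exist_ok=True)
    for name, mel in mels.items():
        np.save(os.path.join(infer_feature_dir, "mels", f"{name}.npy"), mel)
    if f0s is not None:
        os.makedirs(os.path.join(infer_feature_dir, "f0s"), exist_ok=True)
        for name, f0 in f0s.items():
            np.save(os.path.join(infer_feature_dir, "f0s", f"{name}.npy"), f0)
```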
196
+ Then run the `run.sh` with the specified folder path. Here is an example.
197
+
198
+ ```bash
199
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
200
+ --infer_mode infer_from_feature \
201
+ --infer_feature_dir [Your path to your predicted acoustic features] \
202
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
203
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
204
+ ```
205
+
206
+ #### c. Inference from Audios
207
+
208
+ If you want to run inference from audio files for a quick analysis-synthesis check, you should first organize them in the following structure:
209
+
210
+ ```plaintext
211
+ ┣ audios
212
+ ┃ ┣ sample1.wav
213
+ ┃ ┣ sample2.wav
214
+ ```
215
+
216
+ Then run the `run.sh` with the specified folder path. Here is an example.
217
+
218
+ ```bash
219
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
220
+ --infer_mode infer_from_audio \
221
+ --infer_audio_dir [Your path to your audio files] \
222
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
223
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
224
+ ```
egs/vocoder/gan/_template/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Experimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Experimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/apnet/exp_config.json ADDED
@@ -0,0 +1,45 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+ "extract_amplitude_phase": true,
8
+
9
+ // Features used for model training
10
+ "use_mel": true,
11
+ "use_audio": true,
12
+ "use_amplitude_phase": true
13
+ },
14
+ "model": {
15
+ "generator": "apnet",
16
+ "apnet": {
17
+ "ASP_channel": 512,
18
+ "ASP_resblock_kernel_sizes": [3,7,11],
19
+ "ASP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
20
+ "ASP_input_conv_kernel_size": 7,
21
+ "ASP_output_conv_kernel_size": 7,
22
+
23
+ "PSP_channel": 512,
24
+ "PSP_resblock_kernel_sizes": [3,7,11],
25
+ "PSP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26
+ "PSP_input_conv_kernel_size": 7,
27
+ "PSP_output_R_conv_kernel_size": 7,
28
+ "PSP_output_I_conv_kernel_size": 7,
29
+ }
30
+ },
31
+ "train": {
32
+ "criterions": [
33
+ "feature",
34
+ "discriminator",
35
+ "generator",
36
+ "mel",
37
+ "phase",
38
+ "amplitude",
39
+ "consistency"
40
+ ]
41
+ },
42
+ "inference": {
43
+ "batch_size": 1,
44
+ }
45
+ }
egs/vocoder/gan/apnet/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Experimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Experimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/bigvgan/exp_config.json ADDED
@@ -0,0 +1,66 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+
8
+ // Features used for model training
9
+ "use_mel": true,
10
+ "use_audio": true
11
+ },
12
+ "model": {
13
+ "generator": "bigvgan",
14
+ "bigvgan": {
15
+ "resblock": "1",
16
+ "activation": "snakebeta",
17
+ "snake_logscale": true,
18
+ "upsample_rates": [
19
+ 8,
20
+ 8,
21
+ 2,
22
+ 2,
23
+ ],
24
+ "upsample_kernel_sizes": [
25
+ 16,
26
+ 16,
27
+ 4,
28
+ 4
29
+ ],
30
+ "upsample_initial_channel": 512,
31
+ "resblock_kernel_sizes": [
32
+ 3,
33
+ 7,
34
+ 11
35
+ ],
36
+ "resblock_dilation_sizes": [
37
+ [
38
+ 1,
39
+ 3,
40
+ 5
41
+ ],
42
+ [
43
+ 1,
44
+ 3,
45
+ 5
46
+ ],
47
+ [
48
+ 1,
49
+ 3,
50
+ 5
51
+ ]
52
+ ]
53
+ }
54
+ },
55
+ "train": {
56
+ "criterions": [
57
+ "feature",
58
+ "discriminator",
59
+ "generator",
60
+ "mel",
61
+ ]
62
+ },
63
+ "inference": {
64
+ "batch_size": 1,
65
+ }
66
+ }
egs/vocoder/gan/bigvgan/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Experimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Experimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/bigvgan_large/exp_config.json ADDED
@@ -0,0 +1,70 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+
8
+ // Features used for model training
9
+ "use_mel": true,
10
+ "use_audio": true
11
+ },
12
+ "model": {
13
+ "generator": "bigvgan",
14
+ "bigvgan": {
15
+ "resblock": "1",
16
+ "activation": "snakebeta",
17
+ "snake_logscale": true,
18
+ "upsample_rates": [
19
+ 4,
20
+ 4,
21
+ 2,
22
+ 2,
23
+ 2,
24
+ 2
25
+ ],
26
+ "upsample_kernel_sizes": [
27
+ 8,
28
+ 8,
29
+ 4,
30
+ 4,
31
+ 4,
32
+ 4
33
+ ],
34
+ "upsample_initial_channel": 1536,
35
+ "resblock_kernel_sizes": [
36
+ 3,
37
+ 7,
38
+ 11
39
+ ],
40
+ "resblock_dilation_sizes": [
41
+ [
42
+ 1,
43
+ 3,
44
+ 5
45
+ ],
46
+ [
47
+ 1,
48
+ 3,
49
+ 5
50
+ ],
51
+ [
52
+ 1,
53
+ 3,
54
+ 5
55
+ ]
56
+ ]
57
+ },
58
+ },
59
+ "train": {
60
+ "criterions": [
61
+ "feature",
62
+ "discriminator",
63
+ "generator",
64
+ "mel",
65
+ ]
66
+ },
67
+ "inference": {
68
+ "batch_size": 1,
69
+ }
70
+ }
egs/vocoder/gan/bigvgan_large/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Experimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiment name"
81
+ exit 1
82
+ fi
83
+ echo "Experimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/exp_config_base.json ADDED
@@ -0,0 +1,111 @@
1
+ {
2
+ "base_config": "config/vocoder.json",
3
+ "model_type": "GANVocoder",
4
+ // TODO: Choose your needed datasets
5
+ "dataset": [
6
+ "csd",
7
+ "kising",
8
+ "m4singer",
9
+ "nus48e",
10
+ "opencpop",
11
+ "opensinger",
12
+ "opera",
13
+ "pjs",
14
+ "popbutfy",
15
+ "popcs",
16
+ "ljspeech",
17
+ "vctk",
18
+ "libritts",
19
+ ],
20
+ "dataset_path": {
21
+ // TODO: Fill in your dataset path
22
+ "csd": "[dataset path]",
23
+ "kising": "[dataset path]",
24
+ "m4singer": "[dataset path]",
25
+ "nus48e": "[dataset path]",
26
+ "opencpop": "[dataset path]",
27
+ "opensinger": "[dataset path]",
28
+ "opera": "[dataset path]",
29
+ "pjs": "[dataset path]",
30
+ "popbutfy": "[dataset path]",
31
+ "popcs": "[dataset path]",
32
+ "ljspeech": "[dataset path]",
33
+ "vctk": "[dataset path]",
34
+ "libritts": "[dataset path]",
35
+ },
36
+ // TODO: Fill in the output log path
37
+ "log_dir": "ckpts/vocoder",
38
+ "preprocess": {
39
+ // Acoustic features
40
+ "extract_mel": true,
41
+ "extract_audio": true,
42
+ "extract_pitch": false,
43
+ "extract_uv": false,
44
+ "pitch_extractor": "parselmouth",
45
+
46
+ // Features used for model training
47
+ "use_mel": true,
48
+ "use_frame_pitch": false,
49
+ "use_uv": false,
50
+ "use_audio": true,
51
+
52
+ // TODO: Fill in the output data path
53
+ "processed_dir": "data/",
54
+ "n_mel": 100,
55
+ "sample_rate": 24000
56
+ },
57
+ "model": {
58
+ // TODO: Choose your needed discriminators
59
+ "discriminators": [
60
+ "msd",
61
+ "mpd",
62
+ "msstftd",
63
+ "mssbcqtd",
64
+ ],
65
+ "mpd": {
66
+ "mpd_reshapes": [
67
+ 2,
68
+ 3,
69
+ 5,
70
+ 7,
71
+ 11
72
+ ],
73
+ "use_spectral_norm": false,
74
+ "discriminator_channel_mult_factor": 1
75
+ },
76
+ "mrd": {
77
+ "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
78
+ "use_spectral_norm": false,
79
+ "discriminator_channel_mult_factor": 1,
80
+ "mrd_override": false
81
+ },
82
+ "msstftd": {
83
+ "filters": 32
84
+ },
85
+ "mssbcqtd": {
86
+ "hop_lengths": [512, 256, 256],
87
+ "filters": 32,
88
+ "max_filters": 1024,
89
+ "filters_scale": 1,
90
+ "dilations": [1, 2, 4],
91
+ "in_channels": 1,
92
+ "out_channels": 1,
93
+ "n_octaves": [9, 9, 9],
94
+ "bins_per_octaves": [24, 36, 48]
95
+ },
96
+ },
97
+ "train": {
98
+ // TODO: Choose a suitable batch size, training epoch, and save stride
99
+ "batch_size": 32,
100
+ "max_epoch": 1000000,
101
+ "save_checkpoint_stride": [20],
102
+ "adamw": {
103
+ "lr": 2.0e-4,
104
+ "adam_b1": 0.8,
105
+ "adam_b2": 0.99
106
+ },
107
+ "exponential_lr": {
108
+ "lr_decay": 0.999
109
+ },
110
+ }
111
+ }
egs/vocoder/gan/hifigan/exp_config.json ADDED
@@ -0,0 +1,59 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+
8
+ // Features used for model training
9
+ "use_mel": true,
10
+ "use_audio": true
11
+ },
12
+ "model": {
13
+ "generator": "hifigan",
14
+ "hifigan": {
15
+ "resblock": "2",
16
+ "upsample_rates": [
17
+ 8,
18
+ 8,
19
+ 4
20
+ ],
21
+ "upsample_kernel_sizes": [
22
+ 16,
23
+ 16,
24
+ 8
25
+ ],
26
+ "upsample_initial_channel": 256,
27
+ "resblock_kernel_sizes": [
28
+ 3,
29
+ 5,
30
+ 7
31
+ ],
32
+ "resblock_dilation_sizes": [
33
+ [
34
+ 1,
35
+ 2
36
+ ],
37
+ [
38
+ 2,
39
+ 6
40
+ ],
41
+ [
42
+ 3,
43
+ 12
44
+ ]
45
+ ]
46
+ }
47
+ },
48
+ "train": {
49
+ "criterions": [
50
+ "feature",
51
+ "discriminator",
52
+ "generator",
53
+ "mel",
54
+ ]
55
+ },
56
+ "inference": {
57
+ "batch_size": 1,
58
+ }
59
+ }
egs/vocoder/gan/hifigan/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Experimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiment name"
81
+ exit 1
82
+ fi
83
+ echo "Experimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/melgan/exp_config.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+
8
+ // Features used for model training
9
+ "use_mel": true,
10
+ "use_audio": true
11
+ },
12
+ "model": {
13
+ "generator": "melgan",
14
+ "melgan": {
15
+ "ratios": [8, 8, 2, 2],
16
+ "ngf": 32,
17
+ "n_residual_layers": 3,
18
+ "num_D": 3,
19
+ "ndf": 16,
20
+ "n_layers": 4,
21
+ "downsampling_factor": 4
22
+ },
23
+ },
24
+ "train": {
25
+ "criterions": [
26
+ "feature",
27
+ "discriminator",
28
+ "generator",
29
+ ]
30
+ },
31
+ "inference": {
32
+ "batch_size": 1,
33
+ }
34
+ }
egs/vocoder/gan/melgan/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Experimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiment name"
81
+ exit 1
82
+ fi
83
+ echo "Experimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/nsfhifigan/exp_config.json ADDED
@@ -0,0 +1,83 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+ "extract_pitch": true,
8
+
9
+ // Features used for model training
10
+ "use_mel": true,
11
+ "use_audio": true,
12
+ "use_frame_pitch": true
13
+ },
14
+ "model": {
15
+ "generator": "nsfhifigan",
16
+ "nsfhifigan": {
17
+ "resblock": "1",
18
+ "harmonic_num": 8,
19
+ "upsample_rates": [
20
+ 8,
21
+ 4,
22
+ 2,
23
+ 2,
24
+ 2
25
+ ],
26
+ "upsample_kernel_sizes": [
27
+ 16,
28
+ 8,
29
+ 4,
30
+ 4,
31
+ 4
32
+ ],
33
+ "upsample_initial_channel": 768,
34
+ "resblock_kernel_sizes": [
35
+ 3,
36
+ 7,
37
+ 11
38
+ ],
39
+ "resblock_dilation_sizes": [
40
+ [
41
+ 1,
42
+ 3,
43
+ 5
44
+ ],
45
+ [
46
+ 1,
47
+ 3,
48
+ 5
49
+ ],
50
+ [
51
+ 1,
52
+ 3,
53
+ 5
54
+ ]
55
+ ]
56
+ },
57
+ "mpd": {
58
+ "mpd_reshapes": [
59
+ 2,
60
+ 3,
61
+ 5,
62
+ 7,
63
+ 11,
64
+ 17,
65
+ 23,
66
+ 37
67
+ ],
68
+ "use_spectral_norm": false,
69
+ "discriminator_channel_multi": 1
70
+ }
71
+ },
72
+ "train": {
73
+ "criterions": [
74
+ "feature",
75
+ "discriminator",
76
+ "generator",
77
+ "mel",
78
+ ]
79
+ },
80
+ "inference": {
81
+ "batch_size": 1,
82
+ }
83
+ }
egs/vocoder/gan/nsfhifigan/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Experimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiment name"
81
+ exit 1
82
+ fi
83
+ echo "Experimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/tfr_enhanced_hifigan/README.md ADDED
@@ -0,0 +1,185 @@
1
+ # Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder
2
+
3
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2311.14957)
4
+ [![demo](https://img.shields.io/badge/Vocoder-Demo-red)](https://vocodexelysium.github.io/MS-SB-CQTD/)
5
+
6
+ <br>
7
+ <div align="center">
8
+ <img src="../../../../imgs/vocoder/gan/MSSBCQTD.png" width="80%">
9
+ </div>
10
+ <br>
11
+
12
+ This is the official implementation of the paper "[Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder](https://arxiv.org/abs/2311.14957)". In this recipe, we will illustrate how to train a high-quality HiFi-GAN on LibriTTS, VCTK, and LJSpeech by utilizing multiple Time-Frequency-Representation-based discriminators.
13
+
14
+ There are four stages in total:
15
+
16
+ 1. Data preparation
17
+ 2. Feature extraction
18
+ 3. Training
19
+ 4. Inference
20
+
21
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
22
+ > ```bash
23
+ > cd Amphion
24
+ > ```
25
+
26
+ ## 1. Data Preparation
27
+
28
+ ### Dataset Download
29
+
30
+ By default, we utilize three datasets for training: LibriTTS, VCTK, and LJSpeech. How to download them is detailed [here](../../../datasets/README.md).
31
+
32
+ ### Configuration
33
+
34
+ Specify the dataset path in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
35
+
36
+ ```json
37
+ "dataset": [
38
+ "ljspeech",
39
+ "vctk",
40
+ "libritts",
41
+ ],
42
+ "dataset_path": {
43
+ // TODO: Fill in your dataset path
44
+ "ljspeech": "[LJSpeech dataset path]",
45
+ "vctk": "[VCTK dataset path]",
46
+ "libritts": "[LibriTTS dataset path]",
47
+ },
48
+ ```
49
+
50
+ ## 2. Features Extraction
51
+
52
+ For HiFiGAN, only the mel-spectrogram and the corresponding output audio are needed for training.
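+
+ As a quick sanity check, the features here are ordinary log-mel spectrograms. Below is a minimal sketch of how such a feature could be computed with `librosa`; the 100 mel bins and 24 kHz sample rate follow `exp_config_base.json`, while the FFT size, hop length, and log clipping are illustrative assumptions rather than Amphion's exact extractor settings.
+
+ ```python
+ # Hedged sketch: compute a log-mel spectrogram roughly matching this recipe's
+ # settings (n_mel = 100, sample_rate = 24000). The n_fft/hop_length values are
+ # assumptions for illustration, not Amphion's exact extractor parameters.
+ import librosa
+ import numpy as np
+
+ y, sr = librosa.load("sample.wav", sr=24000)  # resample to 24 kHz
+ mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=256, n_mels=100)
+ log_mel = np.log(np.clip(mel, a_min=1e-5, a_max=None))  # shape: [n_mels, n_frames]
+ print(log_mel.shape)
+ ```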
53
+
54
+ ### Configuration
55
+
56
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
57
+
58
+ ```json
59
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
60
+ "log_dir": "ckpts/vocoder",
61
+ "preprocess": {
62
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
63
+ "processed_dir": "data",
64
+ ...
65
+ },
66
+ ```
67
+
68
+ ### Run
69
+
70
+ Run the `run.sh` as the preprocessing stage (set `--stage 1`).
71
+
72
+ ```bash
73
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 1
74
+ ```
75
+
76
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
77
+
78
+ ## 3. Training
79
+
80
+ ### Configuration
81
+
82
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
83
+
84
+ ```json
85
+ "train": {
86
+ "batch_size": 32,
87
+ ...
88
+ }
89
+ ```
90
+
91
+ ### Run
92
+
93
+ Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`.
94
+
95
+ ```bash
96
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 2 --name [YourExptName]
97
+ ```
98
+
99
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
100
+
101
+ ## 4. Inference
102
+
103
+ ### Pretrained Vocoder Download
104
+
105
+ We trained a HiFiGAN checkpoint with around 685 hours of speech data. The final pretrained checkpoint is released [here](../../../../pretrained/hifigan/README.md).
106
+
107
+ ### Run
108
+
109
+ Run the `run.sh` as the inference stage (set `--stage 3`). We provide three different inference modes: `infer_from_dataset`, `infer_from_feature`, and `infer_from_audio`.
110
+
111
+ ```bash
112
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
113
+ --infer_mode [Your chosen inference mode] \
114
+ --infer_datasets [Datasets you want to inference, needed when infer_from_dataset] \
115
+ --infer_feature_dir [Your path to your predicted acoustic features, needed when infer_from_feature] \
116
+ --infer_audio_dir [Your path to your audio files, needed when infer_from_audio] \
117
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
118
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
119
+ ```
120
+
121
+ #### a. Inference from Dataset
122
+
123
+ Run the `run.sh` with the specified datasets. Here is an example.
124
+
125
+ ```bash
126
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
127
+ --infer_mode infer_from_dataset \
128
+ --infer_datasets "libritts vctk ljspeech" \
129
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
130
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
131
+ ```
132
+
133
+ #### b. Inference from Features
134
+
135
+ If you want to run inference from your generated acoustic features, you should first organize them into the following structure:
136
+
137
+ ```plaintext
138
+ ┣ {infer_feature_dir}
139
+ ┃ ┣ mels
140
+ ┃ ┃ ┣ sample1.npy
141
+ ┃ ┃ ┣ sample2.npy
142
+ ```
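+
+ For reference, a minimal sketch of dumping predicted mel spectrograms into this layout is shown below; the `mels/` sub-folder and `.npy` format follow the structure above, while the directory path, utterance ids, and feature shape are illustrative assumptions.
+
+ ```python
+ # Hedged sketch: save predicted mel spectrograms as .npy files under
+ # {infer_feature_dir}/mels/ so that infer_from_feature can find them.
+ import os
+ import numpy as np
+
+ infer_feature_dir = "path/to/infer_feature_dir"  # assumed placeholder path
+ mel_dir = os.path.join(infer_feature_dir, "mels")
+ os.makedirs(mel_dir, exist_ok=True)
+
+ # `predicted_mels` is assumed to map utterance ids to [n_mels, n_frames] arrays.
+ predicted_mels = {"sample1": np.zeros((100, 500), dtype=np.float32)}
+ for utt_id, mel in predicted_mels.items():
+     np.save(os.path.join(mel_dir, f"{utt_id}.npy"), mel)
+ ```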
143
+
144
+ Then run the `run.sh` with the specified folder path. Here is an example.
145
+
146
+ ```bash
147
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
148
+ --infer_mode infer_from_feature \
149
+ --infer_feature_dir [Your path to your predicted acoustic features] \
150
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
151
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
152
+ ```
153
+
154
+ #### c. Inference from Audios
155
+
156
+ If you want to run inference from audio files for quick analysis-synthesis, you should first organize them into the following structure:
157
+
158
+ ```plaintext
159
+ ┣ audios
160
+ ┃ ┣ sample1.wav
161
+ ┃ ┣ sample2.wav
162
+ ```
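+
+ If your recordings are not already at the vocoder's sample rate (24 kHz in this recipe), a minimal resampling sketch such as the following may help; the use of `librosa` and `soundfile` and the folder names are illustrative assumptions, not part of the recipe.
+
+ ```python
+ # Hedged sketch: resample input .wav files to 24 kHz before analysis-synthesis.
+ import os
+ import librosa
+ import soundfile as sf
+
+ src_dir, dst_dir = "raw_audios", "audios"  # assumed folder names
+ os.makedirs(dst_dir, exist_ok=True)
+ for name in os.listdir(src_dir):
+     if name.endswith(".wav"):
+         y, _ = librosa.load(os.path.join(src_dir, name), sr=24000)
+         sf.write(os.path.join(dst_dir, name), y, 24000)
+ ```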
163
+
164
+ Then run the `run.sh` with the specified folder path. Here is an example.
165
+
166
+ ```bash
167
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
168
+ --infer_mode infer_from_audio \
169
+ --infer_audio_dir [Your path to your audio files] \
170
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
171
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
172
+ ```
173
+
174
+ ## Citations
175
+
176
+ ```bibtex
177
+ @misc{gu2023cqt,
178
+ title={Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder},
179
+ author={Yicheng Gu and Xueyao Zhang and Liumeng Xue and Zhizheng Wu},
180
+ year={2023},
181
+ eprint={2311.14957},
182
+ archivePrefix={arXiv},
183
+ primaryClass={cs.SD}
184
+ }
185
+ ```
egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json ADDED
@@ -0,0 +1,118 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "model_type": "GANVocoder",
4
+ "dataset": [
5
+ "ljspeech",
6
+ "vctk",
7
+ "libritts",
8
+ ],
9
+ "dataset_path": {
10
+ // TODO: Fill in your dataset path
11
+ "ljspeech": "[dataset path]",
12
+ "vctk": "[dataset path]",
13
+ "libritts": "[dataset path]",
14
+ },
15
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
16
+ "log_dir": "ckpts/vocoder",
17
+ "preprocess": {
18
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
19
+ "processed_dir": "data",
20
+ // acoustic features
21
+ "extract_mel": true,
22
+ "extract_audio": true,
23
+ "extract_pitch": false,
24
+ "extract_uv": false,
25
+ "extract_amplitude_phase": false,
26
+ "pitch_extractor": "parselmouth",
27
+ // Features used for model training
28
+ "use_mel": true,
29
+ "use_frame_pitch": false,
30
+ "use_uv": false,
31
+ "use_audio": true,
32
+ "n_mel": 100,
33
+ "sample_rate": 24000
34
+ },
35
+ "model": {
36
+ "generator": "hifigan",
37
+ "discriminators": [
38
+ "msd",
39
+ "mpd",
40
+ "mssbcqtd",
41
+ "msstftd",
42
+ ],
43
+ "hifigan": {
44
+ "resblock": "1",
45
+ "upsample_rates": [
46
+ 8,
47
+ 4,
48
+ 2,
49
+ 2,
50
+ 2
51
+ ],
52
+ "upsample_kernel_sizes": [
53
+ 16,
54
+ 8,
55
+ 4,
56
+ 4,
57
+ 4
58
+ ],
59
+ "upsample_initial_channel": 768,
60
+ "resblock_kernel_sizes": [
61
+ 3,
62
+ 5,
63
+ 7
64
+ ],
65
+ "resblock_dilation_sizes": [
66
+ [
67
+ 1,
68
+ 3,
69
+ 5
70
+ ],
71
+ [
72
+ 1,
73
+ 3,
74
+ 5
75
+ ],
76
+ [
77
+ 1,
78
+ 3,
79
+ 5
80
+ ]
81
+ ]
82
+ },
83
+ "mpd": {
84
+ "mpd_reshapes": [
85
+ 2,
86
+ 3,
87
+ 5,
88
+ 7,
89
+ 11,
90
+ 17,
91
+ 23,
92
+ 37
93
+ ],
94
+ "use_spectral_norm": false,
95
+ "discriminator_channel_multi": 1
96
+ }
97
+ },
98
+ "train": {
99
+ "batch_size": 16,
100
+ "adamw": {
101
+ "lr": 2.0e-4,
102
+ "adam_b1": 0.8,
103
+ "adam_b2": 0.99
104
+ },
105
+ "exponential_lr": {
106
+ "lr_decay": 0.999
107
+ },
108
+ "criterions": [
109
+ "feature",
110
+ "discriminator",
111
+ "generator",
112
+ "mel",
113
+ ]
114
+ },
115
+ "inference": {
116
+ "batch_size": 1,
117
+ }
118
+ }
egs/vocoder/gan/tfr_enhanced_hifigan/run.sh ADDED
@@ -0,0 +1,145 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Experimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiment name"
81
+ exit 1
82
+ fi
83
+ echo "Experimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ echo $infer_datasets
114
+
115
+ if [ $infer_mode = "infer_from_dataset" ]; then
116
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
117
+ --config $exp_config \
118
+ --infer_mode $infer_mode \
119
+ --infer_datasets $infer_datasets \
120
+ --vocoder_dir $infer_expt_dir \
121
+ --output_dir $infer_output_dir \
122
+ --log_level debug
123
+ fi
124
+
125
+ if [ $infer_mode = "infer_from_feature" ]; then
126
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
127
+ --config $exp_config \
128
+ --infer_mode $infer_mode \
129
+ --feature_folder $infer_feature_dir \
130
+ --vocoder_dir $infer_expt_dir \
131
+ --output_dir $infer_output_dir \
132
+ --log_level debug
133
+ fi
134
+
135
+ if [ $infer_mode = "infer_from_audio" ]; then
136
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
137
+ --config $exp_config \
138
+ --infer_mode $infer_mode \
139
+ --audio_folder $infer_audio_dir \
140
+ --vocoder_dir $infer_expt_dir \
141
+ --output_dir $infer_output_dir \
142
+ --log_level debug
143
+ fi
144
+
145
+ fi
examples/chinese_female_recordings.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f710270fe3857211c55aaa1f813e310e68855ff9eabaf5b249537a2d4277cc30
3
+ size 448928