xJuuzouYTx commited on
Commit
20d05ae
·
1 Parent(s): 63e4976

[REMOVE] binary files

Browse files
.gitignore ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .DS_Store
2
+ __pycache__
3
+ /TEMP
4
+ /audios/
5
+ /audio-outputs/
6
+ /LOGS
7
+ /RUNTIME
8
+ *.pyd
9
+ hubert_base.pt
10
+ /logs
11
+ /env
12
+ .venv
13
+ alexforkINSTALL.bat
14
+ Changelog_CN.md
15
+ Changelog_EN.md
16
+ Changelog_KO.md
17
+ difdep.py
18
+ EasierGUI.py
19
+ envfilescheck.bat
20
+ export_onnx.py
21
+ export_onnx_old.py
22
+ ffmpeg.exe
23
+ ffprobe.exe
24
+ Fixes/Launch_Tensorboard.bat
25
+ Fixes/LOCAL_CREPE_FIX.bat
26
+ Fixes/local_fixes.py
27
+ Fixes/tensor-launch.py
28
+ gui.py
29
+ infer-web — backup.py
30
+ infer-webbackup.py
31
+ install_easy_dependencies.py
32
+ install_easyGUI.bat
33
+ installstft.bat
34
+ Launch_Tensorboard.bat
35
+ listdepend.bat
36
+ LOCAL_CREPE_FIX.bat
37
+ local_fixes.py
38
+ oldinfer.py
39
+ onnx_inference_demo.py
40
+ Praat.exe
41
+ requirementsNEW.txt
42
+ rmvpe.pt
43
+ run_easiergui.bat
44
+ tensor-launch.py
45
+ values1.json
46
+ 使用需遵守的协议-LICENSE.txt
47
+ trainset_preprocess_pipeline_print.py
48
+ .env
49
+ firebase_secrets.json
LICENSE ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 liujing04
4
+ Copyright (c) 2023 源文雨
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
README copy.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [![Licence](https://img.shields.io/github/license/liujing04/Retrieval-based-Voice-Conversion-WebUI?style=for-the-badge)](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/%E4%BD%BF%E7%94%A8%E9%9C%80%E9%81%B5%E5%AE%88%E7%9A%84%E5%8D%8F%E8%AE%AE-LICENSE.txt)
2
+
3
+ [![Huggingface](https://img.shields.io/badge/🤗%20-Spaces-yellow.svg?style=for-the-badge)](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/)
4
+
5
+ [![Discord](https://img.shields.io/badge/RVC%20Developers-Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white)]()
6
+
7
+ [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/drive/1iWOLYE9znqT6XE5Rw2iETE19ZlqpziLx?usp=sharing)
8
+
9
+ # Instalación de dependencias 🖥️
10
+ Usando pip (python3.9.8 es recomendado)
11
+ ```bash
12
+ python -m venv env
13
+ pip install -r requirements.txt
14
+ ```
15
+
16
+ ## Uso local
17
+
18
+ Aquí está el listado de los archivos necesarios para correr el programa:
19
+ Puedes descargar los dos primeros desde [Huggingface space](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/).
20
+
21
+ ```bash
22
+ hubert_base.pt
23
+
24
+ rmvpe.pt
25
+ #Si estás usando Windows, necesitas estos archivos; omítelos si ffmpeg y ffprobe ya están instalados. Los usuarios de Ubuntu/Debian pueden instalar ambos a través de apt install ffmpeg
26
+
27
+ ./ffmpeg
28
+
29
+ ./ffprobe
30
+ ```
31
+
32
+ ## Créditos
33
+ + [ContentVec](https://github.com/auspicious3000/contentvec/)
34
+ + [VITS](https://github.com/jaywalnut310/vits)
35
+ + [HIFIGAN](https://github.com/jik876/hifi-gan)
36
+ + [Gradio](https://github.com/gradio-app/gradio)
37
+ + [FFmpeg](https://github.com/FFmpeg/FFmpeg)
38
+ + [Ultimate Vocal Remover](https://github.com/Anjok07/ultimatevocalremovergui)
39
+ + [audio-slicer](https://github.com/openvpi/audio-slicer)
40
+ + [Mangio FORK](https://github.com/Mangio621/Mangio-RVC-Fork)
41
+
app.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from inference import Inference
3
+ import os
4
+
5
def infer(model, f0_method, audio_file):
    """Run one voice conversion and return values for the two UI outputs.

    Args:
        model: Forwarded to ``Inference`` as ``model_name``.
        f0_method: Forwarded to ``Inference`` as ``f0_method``.
        audio_file: Path of the source audio, or a falsy value when no audio
            was provided.

    Returns:
        A 2-tuple ``(status, audio_path)``: one value per output component
        wired to this callback. ``audio_path`` is ``None`` whenever no
        converted file is available.
    """
    if not audio_file:
        # os.path.basename(None) raises TypeError, so bail out before
        # building the output path.
        return "No se proporcionó un archivo de audio.", None

    print("****", audio_file)
    inference = Inference(
        model_name=model,
        f0_method=f0_method,
        source_audio_path=audio_file,
        output_file_name=os.path.join("./audio-outputs", os.path.basename(audio_file)),
    )
    output = inference.run()
    if "success" in output and output["success"]:
        return output, output["file"]

    # Keep the arity stable: the callback feeds two outputs, so the failure
    # path also returns a pair (the raw result for display, no audio).
    return output, None
18
+
19
# Build the single-page UI. Components are declared in display order.
# NOTE(review): indentation was reconstructed from a flattened diff view;
# confirm the nesting of the button and the final launch call.
with gr.Blocks() as app:
    gr.HTML("<h1> Simple RVC Inference - by Juuxn 💻 </h1>")
    # Inputs: this textbox value is passed to infer() as `model`.
    model_url = gr.Textbox(placeholder="https://huggingface.co/AIVER-SE/BillieEilish/resolve/main/BillieEilish.zip", label="Url del modelo", show_label=True)
    # type="filepath" makes the callback receive a path rather than raw samples.
    audio_path = gr.Audio(label="Archivo de audio", show_label=True, type="filepath", )
    f0_method = gr.Dropdown(choices=["harvest", "pm", "crepe", "crepe-tiny", "mangio-crepe", "mangio-crepe-tiny", "rmvpe"],
                            value="harvest",
                            label="Algoritmo", show_label=True)
    # Output
    with gr.Row():
        vc_output1 = gr.Textbox(label="Salida")
        vc_output2 = gr.Audio(label="Audio de salida")

    btn = gr.Button(value="Convertir")
    # infer() must return one value per entry in `outputs`.
    btn.click(infer, inputs=[model_url, f0_method, audio_path], outputs=[vc_output1, vc_output2])

# NOTE(review): share=True presumably requests a public Gradio link; confirm
# this and the queue sizes are intended for the deployment target.
app.queue(concurrency_count=511, max_size=1022).launch(share=True)
config.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import sys
3
+ import torch
4
+ import json
5
+ from multiprocessing import cpu_count
6
+
7
# NOTE: a `global` statement at module scope has no effect; the assignment
# below already creates a module-level name.
global usefp16
# Module-level default. use_fp32_config() declares no `global`, so it never
# updates this variable; it works on a local of the same name.
usefp16 = False
9
+
10
+
11
def _write_fp16_run(enabled, config_files=("32k.json", "40k.json", "48k.json")):
    """Store ``enabled`` under ``train.fp16_run`` in each ``configs/<file>``.

    Every listed file is read, updated only if it already has the
    ``train.fp16_run`` key, and always rewritten with ``indent=4``.
    """
    label = "true" if enabled else "false"
    for config_file in config_files:
        path = f"configs/{config_file}"
        with open(path, "r", encoding="utf-8") as src:
            data = json.load(src)

        if "train" in data and "fp16_run" in data["train"]:
            data["train"]["fp16_run"] = enabled

        with open(path, "w", encoding="utf-8") as dst:
            json.dump(data, dst, indent=4)

        print(f"Set fp16_run to {label} in {config_file}")


def use_fp32_config():
    """Sync ``fp16_run`` in the training configs with the GPU's capability.

    When CUDA is available, the major compute capability of ``cuda:0`` is
    read; fp16 is enabled when it is >= 7 and disabled otherwise, and the
    32k/40k/48k config files are rewritten accordingly. Without CUDA only a
    message is printed and no file is touched.

    Returns:
        ``(usefp16, device_capability)``: the fp16 decision and the major
        compute capability (``(False, 0)`` when CUDA is unavailable).
    """
    usefp16 = False
    device_capability = 0
    if not torch.cuda.is_available():
        print(
            "CUDA is not available. Make sure you have an NVIDIA GPU and CUDA installed."
        )
        return (usefp16, device_capability)

    device = torch.device("cuda:0")  # Assuming you have only one GPU (index 0).
    device_capability = torch.cuda.get_device_capability(device)[0]
    usefp16 = device_capability >= 7
    _write_fp16_run(usefp16)
    return (usefp16, device_capability)
48
+
49
+
50
class Config:
    """Select the compute device and precision, and derive inference windows.

    After construction the instance exposes ``device``, ``is_half``,
    ``n_cpu``, ``gpu_name``, ``gpu_mem`` (whole GB, or ``None`` without CUDA)
    and the four tuning values ``x_pad``, ``x_query``, ``x_center``,
    ``x_max`` returned by :meth:`device_config`.
    """

    def __init__(self):
        self.device = "cuda:0"
        self.is_half = True
        self.n_cpu = 0
        self.gpu_name = None
        self.gpu_mem = None
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    # has_mps is only available in nightly pytorch (for now) and macOS 12.3+.
    # Look the backend up with `getattr` so older torch builds that lack
    # `torch.backends.mps` report "no MPS" instead of raising AttributeError.
    @staticmethod
    def has_mps() -> bool:
        """Return True only if an MPS device exists and accepts a tensor."""
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is None or not mps_backend.is_available():
            return False
        try:
            torch.zeros(1).to(torch.device("mps"))
            return True
        except Exception:
            return False

    def device_config(self) -> tuple:
        """Pick device/precision and return ``(x_pad, x_query, x_center, x_max)``.

        Side effects: updates ``device``, ``is_half``, ``gpu_name``,
        ``gpu_mem`` and ``n_cpu`` on the instance, and calls
        ``use_fp32_config()`` on every path except the forced-fp32 GPU branch.
        """
        if torch.cuda.is_available():
            i_device = int(self.device.split(":")[-1])
            self.gpu_name = torch.cuda.get_device_name(i_device)
            # Name-based list of GPUs that are forced to fp32.
            if (
                ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
                or "P40" in self.gpu_name.upper()
                or "1060" in self.gpu_name
                or "1070" in self.gpu_name
                or "1080" in self.gpu_name
            ):
                print("Found GPU", self.gpu_name, ", force to fp32")
                self.is_half = False
            else:
                print("Found GPU", self.gpu_name)
                use_fp32_config()
            # Total memory in GB; the +0.4 rounds values slightly under a
            # whole GB up before truncation.
            self.gpu_mem = int(
                torch.cuda.get_device_properties(i_device).total_memory
                / 1024
                / 1024
                / 1024
                + 0.4
            )
        elif self.has_mps():
            print("No supported Nvidia GPU found, use MPS instead")
            self.device = "mps"
            self.is_half = False
            use_fp32_config()
        else:
            print("No supported Nvidia GPU found, use CPU instead")
            self.device = "cpu"
            self.is_half = False
            use_fp32_config()

        if self.n_cpu == 0:
            self.n_cpu = cpu_count()

        if self.is_half:
            # 6 GB VRAM configuration
            x_pad = 3
            x_query = 10
            x_center = 60
            x_max = 65
        else:
            # 5 GB VRAM configuration
            x_pad = 1
            x_query = 6
            x_center = 38
            x_max = 41

        # Low-memory GPUs (<= 4 GB) override either profile above.
        if self.gpu_mem is not None and self.gpu_mem <= 4:
            x_pad = 1
            x_query = 5
            x_center = 30
            x_max = 32

        return x_pad, x_query, x_center, x_max
configs/32k.json ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 0.0001,
7
+ "betas": [
8
+ 0.8,
9
+ 0.99
10
+ ],
11
+ "eps": 1e-09,
12
+ "batch_size": 4,
13
+ "fp16_run": true,
14
+ "lr_decay": 0.999875,
15
+ "segment_size": 12800,
16
+ "init_lr_ratio": 1,
17
+ "warmup_epochs": 0,
18
+ "c_mel": 45,
19
+ "c_kl": 1.0
20
+ },
21
+ "data": {
22
+ "max_wav_value": 32768.0,
23
+ "sampling_rate": 32000,
24
+ "filter_length": 1024,
25
+ "hop_length": 320,
26
+ "win_length": 1024,
27
+ "n_mel_channels": 80,
28
+ "mel_fmin": 0.0,
29
+ "mel_fmax": null
30
+ },
31
+ "model": {
32
+ "inter_channels": 192,
33
+ "hidden_channels": 192,
34
+ "filter_channels": 768,
35
+ "n_heads": 2,
36
+ "n_layers": 6,
37
+ "kernel_size": 3,
38
+ "p_dropout": 0,
39
+ "resblock": "1",
40
+ "resblock_kernel_sizes": [
41
+ 3,
42
+ 7,
43
+ 11
44
+ ],
45
+ "resblock_dilation_sizes": [
46
+ [
47
+ 1,
48
+ 3,
49
+ 5
50
+ ],
51
+ [
52
+ 1,
53
+ 3,
54
+ 5
55
+ ],
56
+ [
57
+ 1,
58
+ 3,
59
+ 5
60
+ ]
61
+ ],
62
+ "upsample_rates": [
63
+ 10,
64
+ 4,
65
+ 2,
66
+ 2,
67
+ 2
68
+ ],
69
+ "upsample_initial_channel": 512,
70
+ "upsample_kernel_sizes": [
71
+ 16,
72
+ 16,
73
+ 4,
74
+ 4,
75
+ 4
76
+ ],
77
+ "use_spectral_norm": false,
78
+ "gin_channels": 256,
79
+ "spk_embed_dim": 109
80
+ }
81
+ }
configs/32k_v2.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 32000,
21
+ "filter_length": 1024,
22
+ "hop_length": 320,
23
+ "win_length": 1024,
24
+ "n_mel_channels": 80,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,8,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [20,16,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
configs/40k.json ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 0.0001,
7
+ "betas": [
8
+ 0.8,
9
+ 0.99
10
+ ],
11
+ "eps": 1e-09,
12
+ "batch_size": 4,
13
+ "fp16_run": true,
14
+ "lr_decay": 0.999875,
15
+ "segment_size": 12800,
16
+ "init_lr_ratio": 1,
17
+ "warmup_epochs": 0,
18
+ "c_mel": 45,
19
+ "c_kl": 1.0
20
+ },
21
+ "data": {
22
+ "max_wav_value": 32768.0,
23
+ "sampling_rate": 40000,
24
+ "filter_length": 2048,
25
+ "hop_length": 400,
26
+ "win_length": 2048,
27
+ "n_mel_channels": 125,
28
+ "mel_fmin": 0.0,
29
+ "mel_fmax": null
30
+ },
31
+ "model": {
32
+ "inter_channels": 192,
33
+ "hidden_channels": 192,
34
+ "filter_channels": 768,
35
+ "n_heads": 2,
36
+ "n_layers": 6,
37
+ "kernel_size": 3,
38
+ "p_dropout": 0,
39
+ "resblock": "1",
40
+ "resblock_kernel_sizes": [
41
+ 3,
42
+ 7,
43
+ 11
44
+ ],
45
+ "resblock_dilation_sizes": [
46
+ [
47
+ 1,
48
+ 3,
49
+ 5
50
+ ],
51
+ [
52
+ 1,
53
+ 3,
54
+ 5
55
+ ],
56
+ [
57
+ 1,
58
+ 3,
59
+ 5
60
+ ]
61
+ ],
62
+ "upsample_rates": [
63
+ 10,
64
+ 10,
65
+ 2,
66
+ 2
67
+ ],
68
+ "upsample_initial_channel": 512,
69
+ "upsample_kernel_sizes": [
70
+ 16,
71
+ 16,
72
+ 4,
73
+ 4
74
+ ],
75
+ "use_spectral_norm": false,
76
+ "gin_channels": 256,
77
+ "spk_embed_dim": 109
78
+ }
79
+ }
configs/48k.json ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 0.0001,
7
+ "betas": [
8
+ 0.8,
9
+ 0.99
10
+ ],
11
+ "eps": 1e-09,
12
+ "batch_size": 4,
13
+ "fp16_run": true,
14
+ "lr_decay": 0.999875,
15
+ "segment_size": 11520,
16
+ "init_lr_ratio": 1,
17
+ "warmup_epochs": 0,
18
+ "c_mel": 45,
19
+ "c_kl": 1.0
20
+ },
21
+ "data": {
22
+ "max_wav_value": 32768.0,
23
+ "sampling_rate": 48000,
24
+ "filter_length": 2048,
25
+ "hop_length": 480,
26
+ "win_length": 2048,
27
+ "n_mel_channels": 128,
28
+ "mel_fmin": 0.0,
29
+ "mel_fmax": null
30
+ },
31
+ "model": {
32
+ "inter_channels": 192,
33
+ "hidden_channels": 192,
34
+ "filter_channels": 768,
35
+ "n_heads": 2,
36
+ "n_layers": 6,
37
+ "kernel_size": 3,
38
+ "p_dropout": 0,
39
+ "resblock": "1",
40
+ "resblock_kernel_sizes": [
41
+ 3,
42
+ 7,
43
+ 11
44
+ ],
45
+ "resblock_dilation_sizes": [
46
+ [
47
+ 1,
48
+ 3,
49
+ 5
50
+ ],
51
+ [
52
+ 1,
53
+ 3,
54
+ 5
55
+ ],
56
+ [
57
+ 1,
58
+ 3,
59
+ 5
60
+ ]
61
+ ],
62
+ "upsample_rates": [
63
+ 10,
64
+ 6,
65
+ 2,
66
+ 2,
67
+ 2
68
+ ],
69
+ "upsample_initial_channel": 512,
70
+ "upsample_kernel_sizes": [
71
+ 16,
72
+ 16,
73
+ 4,
74
+ 4,
75
+ 4
76
+ ],
77
+ "use_spectral_norm": false,
78
+ "gin_channels": 256,
79
+ "spk_embed_dim": 109
80
+ }
81
+ }
configs/48k_v2.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 17280,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 48000,
21
+ "filter_length": 2048,
22
+ "hop_length": 480,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 128,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [12,10,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [24,20,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
infer_pack/attentions.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ from infer_pack import commons
7
+ from infer_pack.modules import LayerNorm
8
+
9
class Encoder(nn.Module):
    """Stack of ``n_layers`` attention + feed-forward blocks.

    Each block applies windowed self-attention, dropout and a residual
    LayerNorm, then an FFN, dropout and a second residual LayerNorm.
    Extra keyword arguments are accepted and ignored.
    """

    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size=1,
        p_dropout=0.0,
        window_size=10,
        **kwargs
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(self.n_layers):
            # Sub-modules are created in the same order per layer so that
            # seeded initialisation stays reproducible.
            attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                window_size=window_size,
            )
            self.attn_layers.append(attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """Run all blocks over ``x``; masked positions are zeroed in and out.

        Presumably ``x`` is [b, hidden, t] and ``x_mask`` is [b, 1, t];
        confirm against callers.
        """
        # Pairwise mask: a position pair is valid only if both ends are valid.
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        blocks = zip(
            self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
        )
        for attend, norm_attn, feed_forward, norm_ffn in blocks:
            attended = self.drop(attend(x, x, attn_mask))
            x = norm_attn(x + attended)

            transformed = self.drop(feed_forward(x, x_mask))
            x = norm_ffn(x + transformed)
        return x * x_mask
70
+
71
+
72
class Decoder(nn.Module):
    """Stack of ``n_layers`` decoder blocks.

    Each block runs masked self-attention, encoder-decoder attention and a
    causal FFN; every sub-layer is followed by dropout and a residual
    LayerNorm. Extra keyword arguments are accepted and ignored.
    """

    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size=1,
        p_dropout=0.0,
        proximal_bias=False,
        proximal_init=True,
        **kwargs
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(self.n_layers):
            # Creation order per layer is kept so seeded init is reproducible.
            self_attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                proximal_bias=proximal_bias,
                proximal_init=proximal_init,
            )
            self.self_attn_layers.append(self_attention)
            self.norm_layers_0.append(LayerNorm(hidden_channels))

            cross_attention = MultiHeadAttention(
                hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
            )
            self.encdec_attn_layers.append(cross_attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))

            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
                causal=True,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """Decode ``x`` while attending to the encoder output ``h``.

        x: decoder input
        h: encoder output
        """
        target_len = x_mask.size(2)
        self_attn_mask = commons.subsequent_mask(target_len).to(
            device=x.device, dtype=x.dtype
        )
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        blocks = zip(
            self.self_attn_layers,
            self.norm_layers_0,
            self.encdec_attn_layers,
            self.norm_layers_1,
            self.ffn_layers,
            self.norm_layers_2,
        )
        for self_attn, norm_self, cross_attn, norm_cross, ffn, norm_ffn in blocks:
            residual = self.drop(self_attn(x, x, self_attn_mask))
            x = norm_self(x + residual)

            residual = self.drop(cross_attn(x, h, encdec_attn_mask))
            x = norm_cross(x + residual)

            residual = self.drop(ffn(x, x_mask))
            x = norm_ffn(x + residual)
        return x * x_mask
156
+
157
+
158
class MultiHeadAttention(nn.Module):
    """Multi-head attention built from 1x1 ``Conv1d`` projections.

    Optional features, each enabled by a constructor argument:
    learned relative-position embeddings (``window_size``), a distance-based
    additive bias (``proximal_bias``), a banded local mask (``block_length``)
    and copying the query projection into the key projection at init
    (``proximal_init``). The last attention map is kept in ``self.attn``.
    """

    def __init__(
        self,
        channels,
        out_channels,
        n_heads,
        p_dropout=0.0,
        window_size=None,
        heads_share=True,
        block_length=None,
        proximal_bias=False,
        proximal_init=False,
    ):
        super().__init__()
        # Channels are split evenly across heads.
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        # Filled by forward() with the most recent attention weights.
        self.attn = None

        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            # One embedding table shared by all heads, or one per head.
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            # 2 * window_size + 1 rows: offsets -window_size .. +window_size.
            self.emb_rel_k = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )
            self.emb_rel_v = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )

        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            # Start with identical query and key projections.
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys/values).

        Stores the attention weights in ``self.attn`` and returns the output
        after the ``conv_o`` projection.
        """
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        """Scaled dot-product attention; returns ``(output, p_attn)``.

        ``output`` is [b, d, t_t] and ``p_attn`` is [b, n_h, t_t, t_s].
        Positions where ``mask == 0`` get a score of -1e4 before softmax.
        """
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        # t_s is the key/value length, t_t the query length.
        b, d, t_s, t_t = (*key.size(), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
        if self.window_size is not None:
            assert (
                t_s == t_t
            ), "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(
                query / math.sqrt(self.k_channels), key_relative_embeddings
            )
            # Re-index logits from relative offsets to absolute key positions.
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(
                device=scores.device, dtype=scores.dtype
            )
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert (
                    t_s == t_t
                ), "Local attention is only available for self-attention."
                # Band mask: keep only keys within block_length of the query.
                block_mask = (
                    torch.ones_like(scores)
                    .triu(-self.block_length)
                    .tril(self.block_length)
                )
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            # Add the relative-position value term, indexed by offset.
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(
                self.emb_rel_v, t_s
            )
            output = output + self._matmul_with_relative_values(
                relative_weights, value_relative_embeddings
            )
        output = (
            output.transpose(2, 3).contiguous().view(b, d, t_t)
        )  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Return the ``2 * length - 1`` embedding rows for a sequence of ``length``.

        The table is zero-padded on both sides when ``length`` exceeds
        ``window_size + 1``; otherwise it is sliced around its centre.
        """
        # NOTE(review): this local is never read in this method.
        max_relative_position = 2 * self.window_size + 1
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
            )
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[
            :, slice_start_position:slice_end_position
        ]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = F.pad(
            x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
        )

        # Reshape and slice out the padded elements.
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
            :, :, :length, length - 1 :
        ]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # pad along column
        x = F.pad(
            x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
        )
        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
        # add 0's in the beginning that will skew the elements after reshape
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
        Returns:
          a Tensor with shape [1, 1, length, length]
        """
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        # -log(1 + |i - j|): zero on the diagonal, more negative with distance.
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
356
+
357
+
358
class FFN(nn.Module):
    """Two-layer 1-D convolutional feed-forward block with masking.

    ``activation == "gelu"`` selects ``x * sigmoid(1.702 * x)``; anything
    else uses ReLU. ``causal`` pads only on the left, otherwise padding is
    split around the kernel.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        filter_channels,
        kernel_size,
        p_dropout=0.0,
        activation=None,
        causal=False,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        # Padding strategy is fixed at construction time.
        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Apply conv -> activation -> dropout -> conv, masking at each stage."""
        hidden = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)
        projected = self.conv_2(self.padding(hidden * x_mask))
        return projected * x_mask

    def _pad_last_dim(self, x, left, right):
        """Zero-pad the last dimension of a 3-D tensor by (left, right)."""
        if self.kernel_size == 1:
            # A width-1 kernel needs no padding; return the input untouched.
            return x
        # Flat F.pad spec, last dimension first; the other two stay unpadded.
        return F.pad(x, [left, right, 0, 0, 0, 0])

    def _causal_padding(self, x):
        """Pad only on the left so no output position sees later inputs."""
        return self._pad_last_dim(x, self.kernel_size - 1, 0)

    def _same_padding(self, x):
        """Pad both sides so the convolution keeps the input length."""
        return self._pad_last_dim(
            x, (self.kernel_size - 1) // 2, self.kernel_size // 2
        )
infer_pack/commons.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch.nn import functional as F
4
+
5
+
6
def init_weights(m, mean=0.0, std=0.01):
    """Re-initialise the weights of convolution-like modules in place.

    A module is affected only if its class name contains ``"Conv"``; its
    ``weight`` is then filled from a normal distribution with the given
    ``mean`` and ``std``. Other modules are left untouched.
    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
10
+
11
+
12
def get_padding(kernel_size, dilation=1):
    """Return ``int((kernel_size * dilation - dilation) / 2)``.

    This is the per-side padding amount derived from a kernel size and a
    dilation; the division result is truncated by ``int``.
    """
    span = kernel_size * dilation - dilation
    return int(span / 2)
14
+
15
+
16
def convert_pad_shape(pad_shape):
    """Flatten per-axis ``[before, after]`` pairs into one list, last axis first.

    The pairs are visited in reverse order and concatenated, e.g.
    ``[[0, 0], [1, 2], [3, 4]]`` becomes ``[3, 4, 1, 2, 0, 0]``.
    """
    flat = []
    for pair in pad_shape[::-1]:
        flat.extend(pair)
    return flat
20
+
21
+
22
def kl_divergence(m_p, logs_p, m_q, logs_q):
    """Element-wise KL(P||Q) from means (``m_*``) and log-scales (``logs_*``).

    Computes ``(logs_q - logs_p) - 0.5
    + 0.5 * (exp(2 * logs_p) + (m_p - m_q) ** 2) * exp(-2 * logs_q)``.
    """
    kl = (logs_q - logs_p) - 0.5
    variance_p = torch.exp(2.0 * logs_p)
    squared_mean_gap = (m_p - m_q) ** 2
    inverse_variance_q = torch.exp(-2.0 * logs_q)
    kl += 0.5 * (variance_p + squared_mean_gap) * inverse_variance_q
    return kl
29
+
30
+
31
def rand_gumbel(shape):
    """Draw Gumbel samples of the given shape.

    The uniform draws are rescaled into ``[0.00001, 0.99999)`` before the
    double logarithm so that neither ``log`` receives 0.
    """
    u = torch.rand(shape) * 0.99998 + 0.00001
    neg_log_u = -torch.log(u)
    return -torch.log(neg_log_u)
35
+
36
+
37
def rand_gumbel_like(x):
    """Draw Gumbel samples with the size, dtype and device of ``x``."""
    samples = rand_gumbel(x.size())
    return samples.to(dtype=x.dtype, device=x.device)
40
+
41
+
42
def slice_segments(x, ids_str, segment_size=4):
    """Cut one fixed-length window along the last axis of each batch item.

    Args:
        x: 3-D tensor indexed as ``x[i, :, start:end]``.
        ids_str: per-item start indices along the last axis.
        segment_size: window length.

    Returns:
        A tensor shaped like ``x[:, :, :segment_size]`` whose i-th entry is
        ``x[i, :, ids_str[i] : ids_str[i] + segment_size]``.
    """
    out = torch.zeros_like(x[:, :, :segment_size])
    for item in range(x.size(0)):
        start = ids_str[item]
        out[item] = x[item, :, start : start + segment_size]
    return out
49
+
50
+
51
def slice_segments2(x, ids_str, segment_size=4):
    """Cut one fixed-length window along axis 1 of each batch item.

    Same as ``slice_segments`` but the window is taken on the second axis:
    the i-th output entry is ``x[i, ids_str[i] : ids_str[i] + segment_size]``.
    """
    out = torch.zeros_like(x[:, :segment_size])
    for item in range(x.size(0)):
        start = ids_str[item]
        out[item] = x[item, start : start + segment_size]
    return out
58
+
59
+
60
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Cut a randomly positioned window of ``segment_size`` from each item.

    Args:
        x: 3-D tensor; windows are taken along the last axis.
        x_lengths: usable length per item; defaults to the full last-axis size.
        segment_size: window length.

    Returns:
        ``(segments, ids_str)`` where ``ids_str`` holds the integer start
        index that was drawn for every batch item.
    """
    batch, _, frames = x.size()
    if x_lengths is None:
        x_lengths = frames
    # Largest admissible start (exclusive bound for the uniform draw).
    max_start = x_lengths - segment_size + 1
    uniform = torch.rand([batch]).to(device=x.device)
    ids_str = (uniform * max_start).to(dtype=torch.long)
    segments = slice_segments(x, ids_str, segment_size)
    return segments, ids_str
68
+
69
+
70
def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Build a sinusoidal timing signal of shape ``[1, channels, length]``.

    Half of the channels carry sines and the other half cosines of the
    position, at geometrically spaced timescales between ``min_timescale``
    and ``max_timescale``. When ``channels`` is odd, one zero channel is
    appended.

    Args:
        length: number of positions.
        channels: number of output channels.
        min_timescale: smallest timescale.
        max_timescale: largest timescale.

    Returns:
        A float tensor of shape ``[1, channels, length]``.
    """
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    # With a single timescale (channels of 2 or 3) ``num_timescales - 1`` is 0;
    # clamp the divisor so this no longer raises ZeroDivisionError. The
    # increment is irrelevant in that case because it is multiplied by index 0.
    log_timescale_increment = math.log(
        float(max_timescale) / float(min_timescale)
    ) / max(num_timescales - 1, 1)
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
    )
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    # Add one zero row when ``channels`` is odd.
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    return signal.view(1, channels, length)
84
+
85
+
86
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Return ``x`` plus a timing signal matching its channel and length axes.

    ``x`` must be 3-D; the signal is cast to the dtype and device of ``x``
    before the addition.
    """
    _, channels, length = x.size()
    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return x + timing
90
+
91
+
92
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    """Concatenate a timing signal onto ``x`` along ``axis``.

    ``x`` must be 3-D; the signal has the same channel count and length as
    ``x`` and is cast to its dtype and device first.
    """
    _, channels, length = x.size()
    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return torch.cat([x, timing], axis)
96
+
97
+
98
def subsequent_mask(length):
    """Return a lower-triangular mask of ones shaped ``[1, 1, length, length]``."""
    lower = torch.tril(torch.ones(length, length))
    return lower.unsqueeze(0).unsqueeze(0)
101
+
102
+
103
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Gated activation: ``tanh`` of the first channels times ``sigmoid`` of the rest.

    The two inputs are summed; the first ``n_channels[0]`` channels of the
    sum go through ``tanh`` and the remaining channels through ``sigmoid``,
    and the two halves are multiplied element-wise.
    """
    split = n_channels[0]
    summed = input_a + input_b
    tanh_part = torch.tanh(summed[:, :split, :])
    sigmoid_part = torch.sigmoid(summed[:, split:, :])
    return tanh_part * sigmoid_part
111
+
112
+
113
def convert_pad_shape(pad_shape):
    """Flatten per-axis ``[before, after]`` pairs into one list, last axis first.

    NOTE: a function with this same name and body is also defined earlier in
    this module; being the later definition, this is the one bound to the
    name once the module has been imported.
    """
    return [amount for pair in pad_shape[::-1] for amount in pair]
117
+
118
+
119
def shift_1d(x):
    """Shift a 3-D tensor one step along its last axis.

    A zero is inserted at the front of the last axis and the final element is
    dropped, so the output has the same shape as the input.
    """
    pad_spec = convert_pad_shape([[0, 0], [0, 0], [1, 0]])
    padded = F.pad(x, pad_spec)
    return padded[:, :, :-1]
122
+
123
+
124
def sequence_mask(length, max_length=None):
    """Build a boolean mask that is true for positions below each length.

    Args:
        length: tensor of lengths.
        max_length: number of positions; defaults to ``length.max()``.

    Returns:
        A boolean tensor where entry ``[i, j]`` is ``j < length[i]``.
    """
    limit = length.max() if max_length is None else max_length
    positions = torch.arange(limit, dtype=length.dtype, device=length.device)
    return positions.unsqueeze(0) < length.unsqueeze(1)
129
+
130
+
131
def generate_path(duration, mask):
    """Expand per-step durations into an alignment path.

    Args:
        duration: [b, 1, t_x]
        mask: [b, 1, t_y, t_x]

    Returns:
        A tensor shaped like ``mask`` in which, for each ``t_x`` step, the
        ``t_y`` positions between the previous and the current cumulative
        duration are set, multiplied by ``mask``.
    """
    batch, _, t_y, t_x = mask.shape
    cum_duration_flat = torch.cumsum(duration, -1).view(batch * t_x)

    # Row k is true up to the cumulative duration of step k ...
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
    path = path.view(batch, t_x, t_y)
    # ... and subtracting the previous row leaves only step k's own span.
    previous = F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = path - previous
    return path.unsqueeze(1).transpose(2, 3) * mask
147
+
148
+
149
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients element-wise in place and return their total norm.

    The norm is measured on each gradient *before* it is clamped.

    Args:
        parameters: a tensor or an iterable of tensors; entries whose
            ``grad`` is ``None`` are ignored.
        clip_value: gradients are clamped to ``[-clip_value, clip_value]``;
            ``None`` disables clamping (the norm is still returned).
        norm_type: order of the norm. ``float("inf")`` yields the largest
            absolute gradient entry.

    Returns:
        The total gradient norm as a Python float (0.0 when no parameter has
        a gradient).
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    with_grad = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    # ``x ** inf`` followed by ``** (1 / inf)`` does not produce the max-norm,
    # so the infinity norm needs its own accumulation rule.
    use_max_norm = norm_type == math.inf

    total_norm = 0.0
    for p in with_grad:
        param_norm = p.grad.data.norm(norm_type).item()
        if use_max_norm:
            total_norm = max(total_norm, param_norm)
        else:
            total_norm += param_norm**norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)

    if not use_max_norm:
        total_norm = total_norm ** (1.0 / norm_type)
    return total_norm
infer_pack/models.py ADDED
@@ -0,0 +1,1139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+ from infer_pack import modules
6
+ from infer_pack import attentions
7
+ from infer_pack import commons
8
+ from infer_pack.commons import init_weights, get_padding
9
+ from torch.nn import Conv1d, ConvTranspose1d, Conv2d
10
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
11
+ from infer_pack.commons import init_weights
12
+ import numpy as np
13
+
14
class TextEncoder256(nn.Module):
    """Encoder for 256-dimensional phone features with optional pitch embedding.

    Phone features are projected linearly to ``hidden_channels``, optionally
    summed with an embedding of the pitch index, scaled, activated, passed
    through ``attentions.Encoder`` and finally projected to mean / log-scale
    statistics.

    Args:
        out_channels: channel count of each of the two returned statistics.
        hidden_channels: width of the internal representation.
        filter_channels, n_heads, n_layers, kernel_size, p_dropout:
            forwarded to ``attentions.Encoder``.
        f0: when truthy, a 256-entry pitch embedding is created.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Encode phone (and optionally pitch) features.

        Args:
            phone: phone features whose last axis has size 256.
            pitch: pitch indices for ``emb_pitch``, or ``None`` to skip the
                pitch embedding.
            lengths: valid length of every batch item, used to build the mask.

        Returns:
            ``(m, logs, x_mask)``: the two halves of the projected statistics
            and the mask that was applied.
        """
        # Identity check: ``pitch`` is normally a tensor, so ``== None`` would
        # go through the tensor comparison machinery for no benefit.
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
59
+
60
+
61
class TextEncoder768(nn.Module):
    """Encoder for 768-dimensional phone features with optional pitch embedding.

    Phone features are projected linearly to ``hidden_channels``, optionally
    summed with an embedding of the pitch index, scaled, activated, passed
    through ``attentions.Encoder`` and finally projected to mean / log-scale
    statistics.

    Args:
        out_channels: channel count of each of the two returned statistics.
        hidden_channels: width of the internal representation.
        filter_channels, n_heads, n_layers, kernel_size, p_dropout:
            forwarded to ``attentions.Encoder``.
        f0: when truthy, a 256-entry pitch embedding is created.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(768, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Encode phone (and optionally pitch) features.

        Args:
            phone: phone features whose last axis has size 768.
            pitch: pitch indices for ``emb_pitch``, or ``None`` to skip the
                pitch embedding.
            lengths: valid length of every batch item, used to build the mask.

        Returns:
            ``(m, logs, x_mask)``: the two halves of the projected statistics
            and the mask that was applied.
        """
        # Identity check: ``pitch`` is normally a tensor, so ``== None`` would
        # go through the tensor comparison machinery for no benefit.
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
106
+
107
+
108
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` residual coupling layers, each followed by a flip.

    ``self.flows`` therefore alternates ``modules.ResidualCouplingLayer``
    (even indices) and ``modules.Flip`` (odd indices).
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                dilation_rate,
                n_layers,
                gin_channels=gin_channels,
                mean_only=True,
            )
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the flows in order, or in reversed order when ``reverse`` is set."""
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x
        for flow in self.flows:
            # In the forward direction every flow returns a pair; only the
            # first element is carried on.
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        """Call ``remove_weight_norm`` on every coupling layer (even indices)."""
        for index in range(0, 2 * self.n_flows, 2):
            self.flows[index].remove_weight_norm()
155
+
156
+
157
class PosteriorEncoder(nn.Module):
    """Encode an input into a sampled latent plus its mean and log-scale.

    Pipeline: 1x1 convolution, ``modules.WN`` stack, 1x1 projection to
    ``2 * out_channels`` statistics, then a reparameterised sample.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return ``(z, m, logs, x_mask)`` for input ``x`` with valid lengths ``x_lengths``.

        ``g`` is passed through to the WN stack as conditioning.
        """
        frame_mask = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(frame_mask, 1).to(x.dtype)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)

        # Reparameterised sample: mean + noise * exp(log-scale), then masked.
        noise = torch.randn_like(m)
        z = (m + noise * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        """Remove weight normalisation from the WN stack."""
        self.enc.remove_weight_norm()
200
+
201
+
202
class Generator(torch.nn.Module):
    """Upsampling decoder built from transposed convolutions and residual blocks.

    Each upsampling stage halves the channel count and is followed by
    ``num_kernels`` residual blocks whose outputs are averaged. The final
    convolution produces a single channel passed through ``tanh``.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # "1" selects ResBlock1; anything else selects ResBlock2.
        resblock_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes)
        ):
            in_ch = upsample_initial_channel // (2**stage)
            out_ch = upsample_initial_channel // (2 ** (stage + 1))
            upsampler = ConvTranspose1d(
                in_ch, out_ch, kernel, rate, padding=(kernel - rate) // 2
            )
            self.ups.append(weight_norm(upsampler))

        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            for kernel, dilation in zip(
                resblock_kernel_sizes, resblock_dilation_sizes
            ):
                self.resblocks.append(resblock_cls(ch, kernel, dilation))

        # ``ch`` is the channel count of the last upsampling stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Decode ``x``; ``g`` (if given) is projected by ``cond`` and added after ``conv_pre``."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for stage in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[stage](x)
            first_block = stage * self.num_kernels
            xs = None
            for j in range(self.num_kernels):
                branch = self.resblocks[first_block + j](x)
                if xs is None:
                    xs = branch
                else:
                    xs += branch
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight normalisation from every upsampler and residual block."""
        for upsampler in self.ups:
            remove_weight_norm(upsampler)
        for block in self.resblocks:
            block.remove_weight_norm()
276
+
277
+
278
class SineGen(torch.nn.Module):
    """Sine-wave generator driven by an F0 contour.

    SineGen(samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003,
            voiced_threshold=0, flag_for_pulse=False)

    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of the sine waveform (default 0.1)
    noise_std: std of the Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: accepted by the constructor but not used by this
        implementation (default False)
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # One channel for the fundamental plus one per overtone.
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        """Return 1 where ``f0`` exceeds the voiced threshold and 0 elsewhere."""
        voiced = f0 > self.voiced_threshold
        return torch.ones_like(f0) * voiced

    def forward(self, f0, upp):
        """sine_waves, uv, noise = forward(f0, upp)

        f0: F0 contour; a trailing axis of size 1 is added internally.
            F0 of unvoiced steps should be 0.
        upp: integer upsampling factor applied along the time axis.

        Returns the sine waves (one channel per harmonic, time axis
        upsampled by ``upp``), the upsampled voiced/unvoiced flag and the
        noise that was added.
        """
        with torch.no_grad():
            f0 = f0[:, None].transpose(1, 2)
            harmonics = torch.zeros(
                f0.shape[0], f0.shape[1], self.dim, device=f0.device
            )
            # Fundamental component.
            harmonics[:, :, 0] = f0[:, :, 0]
            for k in range(self.harmonic_num):
                # k + 2: the (k+1)-th overtone, i.e. the (k+2)-th harmonic.
                harmonics[:, :, k + 1] = harmonics[:, :, 0] * (k + 2)

            # Per-step phase increment in cycles. The "% 1" here means the
            # harmonic multiplication cannot be optimised in post-processing.
            rad_values = (harmonics / self.sampling_rate) % 1

            # Random initial phase for every overtone; the fundamental gets 0.
            phase_init = torch.rand(
                harmonics.shape[0], harmonics.shape[2], device=harmonics.device
            )
            phase_init[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + phase_init

            # Accumulated phase; a "% 1" at this point would prevent the
            # later cumsum from being optimised.
            wrapped = torch.cumsum(rad_values, 1)
            wrapped *= upp
            wrapped = F.interpolate(
                wrapped.transpose(2, 1),
                scale_factor=upp,
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            rad_values = F.interpolate(
                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            wrapped %= 1

            # Wherever the wrapped phase decreases, subtract one full cycle
            # from the increment.
            wrap_points = (wrapped[:, 1:, :] - wrapped[:, :-1, :]) < 0
            cycle_shift = torch.zeros_like(rad_values)
            cycle_shift[:, 1:, :] = wrap_points * -1.0

            phase = torch.cumsum(rad_values + cycle_shift, dim=1)
            sine_waves = torch.sin(phase * 2 * np.pi)
            sine_waves = sine_waves * self.sine_amp

            uv = self._f02uv(f0)
            uv = F.interpolate(
                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)

            # Voiced steps get noise_std; unvoiced steps get sine_amp / 3.
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
368
+
369
+
370
class SourceModuleHnNSF(torch.nn.Module):
    """Source module for hn-NSF.

    SourceModuleHnNSF(sampling_rate, harmonic_num=0, sine_amp=0.1,
                      add_noise_std=0.003, voiced_threshod=0, is_half=True)

    sampling_rate: sampling rate in Hz
    harmonic_num: number of harmonics above F0 (default: 0)
    sine_amp: amplitude of the sine source signal (default: 0.1)
    add_noise_std: std of the additive Gaussian noise (default: 0.003);
        note that the noise amplitude in unvoiced regions is decided by
        sine_amp
    voiced_threshod: threshold to set U/V given F0 (default: 0)
    is_half: when true, the sine waves are cast to half precision before
        they are merged

    The harmonics produced by ``SineGen`` are merged into one excitation
    channel by a linear layer followed by ``tanh``.
    """

    def __init__(
        self,
        sampling_rate,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
        is_half=True,
    ):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half

        # Produces the sine waveforms.
        self.l_sin_gen = SineGen(
            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )

        # Merges the source harmonics into a single excitation.
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x, upp=None):
        """Return ``(sine_merge, None, None)`` for the F0 input ``x``.

        ``upp`` is forwarded to ``SineGen``. The second and third slots
        (noise, uv) are always ``None``.
        """
        sine_wavs, _uv, _noise = self.l_sin_gen(x, upp)
        if self.is_half:
            sine_wavs = sine_wavs.half()
        merged = self.l_linear(sine_wavs)
        sine_merge = self.l_tanh(merged)
        return sine_merge, None, None  # noise, uv
417
+
418
+
419
class GeneratorNSF(torch.nn.Module):
    """Upsampling decoder that injects an F0-driven excitation at every stage.

    Same structure as ``Generator``, plus a ``SourceModuleHnNSF`` whose
    output is brought to each stage's resolution by ``noise_convs`` and
    added after the transposed convolution.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels,
        sr,
        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)

        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # "1" selects ResBlock1; anything else selects ResBlock2.
        resblock_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        last_stage = len(upsample_rates) - 1
        for stage, (rate, kernel) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes)
        ):
            in_ch = upsample_initial_channel // (2**stage)
            c_cur = upsample_initial_channel // (2 ** (stage + 1))
            upsampler = ConvTranspose1d(
                in_ch, c_cur, kernel, rate, padding=(kernel - rate) // 2
            )
            self.ups.append(weight_norm(upsampler))

            if stage < last_stage:
                # Stride equals the product of the remaining upsampling rates.
                stride_f0 = np.prod(upsample_rates[stage + 1 :])
                source_conv = Conv1d(
                    1,
                    c_cur,
                    kernel_size=stride_f0 * 2,
                    stride=stride_f0,
                    padding=stride_f0 // 2,
                )
            else:
                source_conv = Conv1d(1, c_cur, kernel_size=1)
            self.noise_convs.append(source_conv)

        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            for kernel, dilation in zip(
                resblock_kernel_sizes, resblock_dilation_sizes
            ):
                self.resblocks.append(resblock_cls(ch, kernel, dilation))

        # ``ch`` is the channel count of the last upsampling stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

        # Total upsampling factor, handed to the source module in forward().
        self.upp = np.prod(upsample_rates)

    def forward(self, x, f0, g=None):
        """Decode ``x`` using the excitation derived from ``f0``.

        ``g`` (if given) is projected by ``cond`` and added after ``conv_pre``.
        """
        har_source, _noi_source, _uv = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for stage in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[stage](x)
            x = x + self.noise_convs[stage](har_source)
            first_block = stage * self.num_kernels
            xs = None
            for j in range(self.num_kernels):
                branch = self.resblocks[first_block + j](x)
                if xs is None:
                    xs = branch
                else:
                    xs += branch
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight normalisation from every upsampler and residual block."""
        for upsampler in self.ups:
            remove_weight_norm(upsampler)
        for block in self.resblocks:
            block.remove_weight_norm()
520
+
521
+
522
# Maps a sample-rate label to its value in Hz.
sr2sr = {"32k": 32000, "40k": 40000, "48k": 48000}
527
+
528
+
529
class SynthesizerTrnMs256NSFsid(nn.Module):
    """Synthesizer with a 256-dim phone encoder, pitch input and NSF decoder.

    Components: ``enc_p`` (``TextEncoder256``), ``dec`` (``GeneratorNSF``),
    ``enc_q`` (``PosteriorEncoder``), ``flow`` (``ResidualCouplingBlock``)
    and ``emb_g`` (speaker embedding).

    ``sr`` may be an integer sample rate or one of the string keys of
    ``sr2sr``. ``kwargs`` must contain ``is_half``, which is forwarded to the
    decoder.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        **kwargs
    ):
        super().__init__()
        # isinstance instead of comparing type objects, so str subclasses
        # are translated as well.
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Remove weight normalisation from the decoder, flow and posterior encoder."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):  # ds is the speaker id, [bs, 1]
        """Training pass.

        Encodes the phone/pitch prior and the posterior of ``y``, pushes the
        posterior sample through the flow, and decodes a random slice of the
        latent together with the matching slice of ``pitchf``.

        Returns:
            ``(o, ids_slice, x_mask, y_mask,
            (z, z_p, m_p, logs_p, m_q, logs_q))``.
        """
        # [b, 256, 1]; the trailing 1 is the time axis and is broadcast.
        g = self.emb_g(ds).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        # Slice the pitch contour at the same positions as the latent.
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
        """Inference pass.

        Samples from the prior (noise scaled by 0.66666), optionally keeps
        only the trailing ``rate`` fraction of the frames, inverts the flow
        and decodes with the pitch contour ``nsff0``.

        Returns:
            ``(o, x_mask, (z, z_p, m_p, logs_p))``.
        """
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        if rate:
            # Keep only the last ``head`` frames of every time-aligned tensor.
            head = int(z_p.shape[2] * rate)
            z_p = z_p[:, :, -head:]
            x_mask = x_mask[:, :, -head:]
            nsff0 = nsff0[:, -head:]
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec(z * x_mask, nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
643
+
644
+
645
class SynthesizerTrnMs768NSFsid(nn.Module):
    """Synthesizer with a 768-dim phone encoder, pitch input and NSF decoder.

    Components: ``enc_p`` (``TextEncoder768``), ``dec`` (``GeneratorNSF``),
    ``enc_q`` (``PosteriorEncoder``), ``flow`` (``ResidualCouplingBlock``)
    and ``emb_g`` (speaker embedding).

    ``sr`` may be an integer sample rate or one of the string keys of
    ``sr2sr``. ``kwargs`` must contain ``is_half``, which is forwarded to the
    decoder.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        **kwargs
    ):
        super().__init__()
        # isinstance instead of comparing type objects, so str subclasses
        # are translated as well.
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder768(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Remove weight normalisation from the decoder, flow and posterior encoder."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):  # ds is the speaker id, [bs, 1]
        """Training pass.

        Encodes the phone/pitch prior and the posterior of ``y``, pushes the
        posterior sample through the flow, and decodes a random slice of the
        latent together with the matching slice of ``pitchf``.

        Returns:
            ``(o, ids_slice, x_mask, y_mask,
            (z, z_p, m_p, logs_p, m_q, logs_q))``.
        """
        # [b, 256, 1]; the trailing 1 is the time axis and is broadcast.
        g = self.emb_g(ds).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        # Slice the pitch contour at the same positions as the latent.
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
        """Inference pass.

        Samples from the prior (noise scaled by 0.66666), optionally keeps
        only the trailing ``rate`` fraction of the frames, inverts the flow
        and decodes with the pitch contour ``nsff0``.

        Returns:
            ``(o, x_mask, (z, z_p, m_p, logs_p))``.
        """
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        if rate:
            # Keep only the last ``head`` frames of every time-aligned tensor.
            head = int(z_p.shape[2] * rate)
            z_p = z_p[:, :, -head:]
            x_mask = x_mask[:, :, -head:]
            nsff0 = nsff0[:, -head:]
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec(z * x_mask, nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
759
+
760
+
761
class SynthesizerTrnMs256NSFsid_nono(nn.Module):
    """Synthesizer whose ``TextEncoder256`` is built with ``f0=False``.

    ``forward`` and ``infer`` pass ``None`` as the pitch argument of
    ``enc_p`` and call the ``Generator`` decoder without a pitch input.
    ``sr`` and ``**kwargs`` are accepted but not used by this class.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr=None,
        **kwargs
    ):
        super().__init__()

        # Hyper-parameters kept on the instance.
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim

        # Sub-modules: the creation order fixes the parameter registration
        # order, so it must stay enc_p, dec, enc_q, flow, emb_g.
        encoder_args = (
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.enc_p = TextEncoder256(*encoder_args, f0=False)

        decoder_args = (
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
        )
        self.dec = Generator(*decoder_args, gin_channels=gin_channels)

        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Call ``remove_weight_norm()`` on ``dec``, ``flow`` and ``enc_q``, in that order."""
        for submodule in (self.dec, self.flow, self.enc_q):
            submodule.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):
        """Forward pass through the prior encoder, posterior encoder, flow and decoder.

        ``ds`` is the speaker id (per the original author's note).

        Returns:
            ``(o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q))``.
        """
        # Speaker conditioning with a trailing singleton axis (broadcast over time).
        g = self.emb_g(ds).unsqueeze(-1)
        prior_mean, prior_logs, x_mask = self.enc_p(phone, None, phone_lengths)
        z, post_mean, post_logs, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        o = self.dec(z_slice, g=g)
        latents = (z, z_p, prior_mean, prior_logs, post_mean, post_logs)
        return o, ids_slice, x_mask, y_mask, latents

    def infer(self, phone, phone_lengths, sid, rate=None):
        """Sample from the prior, invert the flow and decode.

        When ``rate`` is truthy only the last ``int(T * rate)`` frames of the
        sampled latent and of the mask are kept.

        Returns:
            ``(o, x_mask, (z, z_p, m_p, logs_p))``.
        """
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        noise = torch.randn_like(m_p)
        z_p = (m_p + torch.exp(logs_p) * noise * 0.66666) * x_mask
        if rate:
            keep = int(z_p.shape[2] * rate)
            tail = slice(-keep, None)
            z_p = z_p[:, :, tail]
            x_mask = x_mask[:, :, tail]
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec(z * x_mask, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
865
+
866
+
867
class SynthesizerTrnMs768NSFsid_nono(nn.Module):
    """Synthesizer whose ``TextEncoder768`` is built with ``f0=False``.

    ``forward`` and ``infer`` pass ``None`` as the pitch argument of
    ``enc_p`` and call the ``Generator`` decoder without a pitch input.
    ``sr`` and ``**kwargs`` are accepted but not used by this class.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr=None,
        **kwargs
    ):
        super().__init__()

        # Hyper-parameters kept on the instance.
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim

        # Sub-modules: the creation order fixes the parameter registration
        # order, so it must stay enc_p, dec, enc_q, flow, emb_g.
        encoder_args = (
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.enc_p = TextEncoder768(*encoder_args, f0=False)

        decoder_args = (
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
        )
        self.dec = Generator(*decoder_args, gin_channels=gin_channels)

        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Call ``remove_weight_norm()`` on ``dec``, ``flow`` and ``enc_q``, in that order."""
        for submodule in (self.dec, self.flow, self.enc_q):
            submodule.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):
        """Forward pass through the prior encoder, posterior encoder, flow and decoder.

        ``ds`` is the speaker id (per the original author's note).

        Returns:
            ``(o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q))``.
        """
        # Speaker conditioning with a trailing singleton axis (broadcast over time).
        g = self.emb_g(ds).unsqueeze(-1)
        prior_mean, prior_logs, x_mask = self.enc_p(phone, None, phone_lengths)
        z, post_mean, post_logs, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        o = self.dec(z_slice, g=g)
        latents = (z, z_p, prior_mean, prior_logs, post_mean, post_logs)
        return o, ids_slice, x_mask, y_mask, latents

    def infer(self, phone, phone_lengths, sid, rate=None):
        """Sample from the prior, invert the flow and decode.

        When ``rate`` is truthy only the last ``int(T * rate)`` frames of the
        sampled latent and of the mask are kept.

        Returns:
            ``(o, x_mask, (z, z_p, m_p, logs_p))``.
        """
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        noise = torch.randn_like(m_p)
        z_p = (m_p + torch.exp(logs_p) * noise * 0.66666) * x_mask
        if rate:
            keep = int(z_p.shape[2] * rate)
            tail = slice(-keep, None)
            z_p = z_p[:, :, tail]
            x_mask = x_mask[:, :, tail]
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec(z * x_mask, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
971
+
972
+
973
class MultiPeriodDiscriminator(torch.nn.Module):
    """One ``DiscriminatorS`` followed by a ``DiscriminatorP`` per period in [2, 3, 5, 7, 11, 17]."""

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        periods = [2, 3, 5, 7, 11, 17]

        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        for period in periods:
            members.append(DiscriminatorP(period, use_spectral_norm=use_spectral_norm))
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Run every sub-discriminator on ``y`` and then on ``y_hat``.

        Returns:
            Four lists with one entry per sub-discriminator: outputs for
            ``y``, outputs for ``y_hat``, feature maps for ``y`` and feature
            maps for ``y_hat``.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            score_r, feats_r = disc(y)
            score_g, feats_g = disc(y_hat)
            y_d_rs.append(score_r)
            y_d_gs.append(score_g)
            fmap_rs.append(feats_r)
            fmap_gs.append(feats_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1001
+
1002
+
1003
class MultiPeriodDiscriminatorV2(torch.nn.Module):
    """One ``DiscriminatorS`` followed by a ``DiscriminatorP`` per period in [2, 3, 5, 7, 11, 17, 23, 37]."""

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        periods = [2, 3, 5, 7, 11, 17, 23, 37]

        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        for period in periods:
            members.append(DiscriminatorP(period, use_spectral_norm=use_spectral_norm))
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Run every sub-discriminator on ``y`` and then on ``y_hat``.

        Returns:
            Four lists with one entry per sub-discriminator: outputs for
            ``y``, outputs for ``y_hat``, feature maps for ``y`` and feature
            maps for ``y_hat``.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            score_r, feats_r = disc(y)
            score_g, feats_g = disc(y_hat)
            y_d_rs.append(score_r)
            y_d_gs.append(score_g)
            fmap_rs.append(feats_r)
            fmap_gs.append(feats_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1031
+
1032
+
1033
class DiscriminatorS(torch.nn.Module):
    """Stack of six normalized ``Conv1d`` layers plus a one-channel output conv.

    Args:
        use_spectral_norm: when truthy the convolutions are wrapped with
            ``spectral_norm``; otherwise (including ``None``) with
            ``weight_norm``.
    """

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        # Truthiness test instead of ``== False`` so that a falsy value such
        # as None selects weight_norm rather than spectral_norm.
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(1, 16, 15, 1, padding=7)),
                norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
                norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
                norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
                norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened_output, fmap)``.

        ``fmap`` holds the activation after every conv layer (post
        leaky-ReLU) followed by the raw ``conv_post`` output.
        """
        fmap = []

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
1061
+
1062
+
1063
class DiscriminatorP(torch.nn.Module):
    """Discriminator that folds the 1-D input into ``period`` columns and applies ``Conv2d`` layers.

    Args:
        period: width of the 2-D view; the input is reflect-padded on the
            right until its length is a multiple of ``period``.
        kernel_size: kernel height of the five stacked convolutions.
        stride: stride along the folded time axis of the first four layers.
        use_spectral_norm: when truthy the convolutions are wrapped with
            ``spectral_norm``; otherwise (including ``None``) with
            ``weight_norm``.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super().__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        # Truthiness test instead of ``== False`` so that a falsy value such
        # as None selects weight_norm rather than spectral_norm.
        norm_f = spectral_norm if use_spectral_norm else weight_norm

        pad = (get_padding(kernel_size, 1), 0)
        channels = (1, 32, 128, 512, 1024)
        # Four strided layers walking the channel table ...
        layers = [
            norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1), padding=pad))
            for c_in, c_out in zip(channels, channels[1:])
        ]
        # ... followed by one stride-1 layer at 1024 channels.
        layers.append(norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=pad)))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened_output, fmap)``.

        ``fmap`` holds the activation after every conv layer (post
        leaky-ReLU) followed by the raw ``conv_post`` output.
        """
        fmap = []

        # 1d to 2d: pad to a multiple of the period, then fold.
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
infer_pack/modules.py ADDED
@@ -0,0 +1,518 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ from torch.nn import Conv1d
7
+ from torch.nn.utils import weight_norm, remove_weight_norm
8
+
9
+ from infer_pack import commons
10
+ from infer_pack.commons import init_weights, get_padding
11
+ from infer_pack.transforms import piecewise_rational_quadratic_transform
12
+
13
+ LRELU_SLOPE = 0.1
14
+
15
+
16
class LayerNorm(nn.Module):
    """Layer normalization applied over axis 1 of the input.

    The input is transposed so axis 1 becomes the last axis, normalized
    with ``F.layer_norm`` using the learnable ``gamma``/``beta``, and
    transposed back.
    """

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Return ``x`` normalized over axis 1; the output has the shape of ``x``."""
        channels_last = x.transpose(1, -1)
        normed = F.layer_norm(
            channels_last, (self.channels,), self.gamma, self.beta, self.eps
        )
        return normed.transpose(1, -1)
29
+
30
+
31
class ConvReluNorm(nn.Module):
    """``n_layers`` rounds of Conv1d -> LayerNorm -> ReLU/Dropout with a residual 1x1 projection.

    The projection's weight and bias start at zero, so a freshly built
    module returns ``x * x_mask``.

    Args:
        in_channels: channels of the input (and of the residual branch).
        hidden_channels: channels of every convolution's output.
        out_channels: channels produced by the final 1x1 projection.
        kernel_size: kernel size of the convolutions (padding is ``kernel_size // 2``).
        n_layers: number of conv/norm rounds; must be greater than 1.
        p_dropout: dropout probability applied after each ReLU.

    Raises:
        AssertionError: if ``n_layers`` is not greater than 1.
    """

    def __init__(
        self,
        in_channels,
        hidden_channels,
        out_channels,
        kernel_size,
        n_layers,
        p_dropout,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        # The message states the bound the condition actually enforces.
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
        for layer_idx in range(n_layers):
            # Only the first convolution sees the input channel count.
            conv_in = in_channels if layer_idx == 0 else hidden_channels
            self.conv_layers.append(
                nn.Conv1d(
                    conv_in, hidden_channels, kernel_size, padding=kernel_size // 2
                )
            )
            self.norm_layers.append(LayerNorm(hidden_channels))

        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        """Return ``(x + proj(stack(x))) * x_mask``; the mask is re-applied before every conv."""
        x_org = x
        for conv, norm in zip(self.conv_layers, self.norm_layers):
            x = conv(x * x_mask)
            x = norm(x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask
81
+
82
+
83
class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution
    """

    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for layer_idx in range(n_layers):
            # Dilation grows as kernel_size ** layer index; the padding is
            # derived from the kernel size and that dilation.
            dilation = kernel_size**layer_idx
            padding = dilation * (kernel_size - 1) // 2
            depthwise = nn.Conv1d(
                channels,
                channels,
                kernel_size,
                groups=channels,
                dilation=dilation,
                padding=padding,
            )
            self.convs_sep.append(depthwise)
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        """Apply the residual layers; ``g`` (if given) is added to ``x`` first.

        Returns ``x * x_mask`` after all layers.
        """
        if g is not None:
            x = x + g
        layers = zip(self.convs_sep, self.norms_1, self.convs_1x1, self.norms_2)
        for conv_sep, norm_1, conv_1x1, norm_2 in layers:
            y = F.gelu(norm_1(conv_sep(x * x_mask)))
            y = F.gelu(norm_2(conv_1x1(y)))
            x = x + self.drop(y)
        return x * x_mask
130
+
131
+
132
class WN(torch.nn.Module):
    """Stack of weight-normalized dilated convolutions with gated activations.

    Each layer feeds ``commons.fused_add_tanh_sigmoid_multiply`` and a 1x1
    "res/skip" convolution. When ``gin_channels`` is non-zero a 1x1
    ``cond_layer`` maps the conditioning input ``g`` to per-layer slices.
    """

    def __init__(
        self,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
        p_dropout=0,
    ):
        super().__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        # NOTE: stored as a 1-tuple, not as the bare integer.
        self.kernel_size = (kernel_size,)
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            cond_conv = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
            self.cond_layer = torch.nn.utils.weight_norm(cond_conv, name="weight")

        last_idx = n_layers - 1
        for layer_idx in range(n_layers):
            dilation = dilation_rate**layer_idx
            padding = int(dilation * (kernel_size - 1) / 2)
            gated_conv = torch.nn.Conv1d(
                hidden_channels,
                2 * hidden_channels,
                kernel_size,
                dilation=dilation,
                padding=padding,
            )
            self.in_layers.append(
                torch.nn.utils.weight_norm(gated_conv, name="weight")
            )

            # The last layer has no residual half, only the skip half.
            if layer_idx < last_idx:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels
            res_skip_conv = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            self.res_skip_layers.append(
                torch.nn.utils.weight_norm(res_skip_conv, name="weight")
            )

    def forward(self, x, x_mask, g=None, **kwargs):
        """Return the masked sum of the skip outputs of all layers.

        ``g`` is optional conditioning; when given it is passed through
        ``cond_layer`` and a ``2 * hidden_channels`` slice is used per layer.
        """
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])
        width = 2 * self.hidden_channels

        if g is not None:
            g = self.cond_layer(g)

        last_idx = self.n_layers - 1
        for idx, (in_layer, res_skip_layer) in enumerate(
            zip(self.in_layers, self.res_skip_layers)
        ):
            x_in = in_layer(x)
            if g is None:
                g_l = torch.zeros_like(x_in)
            else:
                start = idx * width
                g_l = g[:, start : start + width, :]

            acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = res_skip_layer(acts)
            if idx < last_idx:
                # First half updates the running input, second half is the skip.
                x = (x + res_skip_acts[:, : self.hidden_channels, :]) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels :, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        """Remove weight norm from ``cond_layer`` (if present) and every layer."""
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for layer in self.in_layers:
            torch.nn.utils.remove_weight_norm(layer)
        for layer in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(layer)
218
+
219
+
220
class ResBlock1(torch.nn.Module):
    """Three residual units, each a dilated conv followed by a dilation-1 conv.

    The first conv of unit ``i`` uses ``dilation[i]``; every conv is
    weight-normalized and initialised with ``init_weights``.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()
        self.convs1 = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[idx],
                        padding=get_padding(kernel_size, dilation[idx]),
                    )
                )
                for idx in range(3)
            ]
        )
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1),
                    )
                )
                for _ in range(3)
            ]
        )
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply the residual units; when ``x_mask`` is given it is applied before each conv and to the result."""
        masked = x_mask is not None
        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if masked:
                xt = xt * x_mask
            xt = F.leaky_relu(dilated_conv(xt), LRELU_SLOPE)
            if masked:
                xt = xt * x_mask
            x = plain_conv(xt) + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Remove weight norm from every convolution of the block."""
        for conv in self.convs1:
            remove_weight_norm(conv)
        for conv in self.convs2:
            remove_weight_norm(conv)
315
+
316
+
317
class ResBlock2(torch.nn.Module):
    """Two residual units, each a single weight-normalized dilated conv.

    Unit ``i`` uses ``dilation[i]``; the convs are initialised with
    ``init_weights``.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()
        self.convs = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[idx],
                        padding=get_padding(kernel_size, dilation[idx]),
                    )
                )
                for idx in range(2)
            ]
        )
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply the residual units; when ``x_mask`` is given it is applied before each conv and to the result."""
        masked = x_mask is not None
        for conv in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if masked:
                xt = xt * x_mask
            x = conv(xt) + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Remove weight norm from every convolution of the block."""
        for conv in self.convs:
            remove_weight_norm(conv)
360
+
361
+
362
class Log(nn.Module):
    """Elementwise log transform (forward) and exp transform (reverse)."""

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Return ``(y, logdet)`` in the forward direction, or just ``x`` in reverse.

        Forward clamps ``x`` to at least 1e-5 before the log; ``logdet`` is
        the sum of ``-y`` over axes 1 and 2.
        """
        if reverse:
            return torch.exp(x) * x_mask

        y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
        return y, torch.sum(-y, [1, 2])
371
+
372
+
373
class Flip(nn.Module):
    """Reverse the order of axis 1; the forward direction also returns a zero log-determinant."""

    def forward(self, x, *args, reverse=False, **kwargs):
        """Return ``(flipped, logdet)`` when ``reverse`` is false, else just ``flipped``.

        ``logdet`` is a zero vector with one entry per batch item, on the
        dtype and device of ``x``.
        """
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped
        logdet = x.new_zeros(x.size(0))
        return flipped, logdet
381
+
382
+
383
class ElementwiseAffine(nn.Module):
    """Per-channel affine transform ``m + exp(logs) * x`` with learnable ``m`` and ``logs``."""

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Return ``(y, logdet)`` in the forward direction, or the inverted input in reverse.

        ``logdet`` is ``logs * x_mask`` summed over axes 1 and 2.
        """
        if reverse:
            return (x - self.m) * torch.exp(-self.logs) * x_mask

        y = (self.m + torch.exp(self.logs) * x) * x_mask
        logdet = torch.sum(self.logs * x_mask, [1, 2])
        return y, logdet
399
+
400
+
401
class ResidualCouplingLayer(nn.Module):
    """Coupling layer: the first half of the channels parameterizes an affine map of the second half.

    ``pre`` -> ``WN`` -> ``post`` turns the first half into a mean (and,
    unless ``mean_only``, a log-scale). ``post`` starts with zero weight
    and bias.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        p_dropout=0,
        gin_channels=0,
        mean_only=False,
    ):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            p_dropout=p_dropout,
            gin_channels=gin_channels,
        )
        # One output group (mean) when mean_only, otherwise two (mean, log-scale).
        stats_channels = self.half_channels * (2 - mean_only)
        self.post = nn.Conv1d(hidden_channels, stats_channels, 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Return ``(x, logdet)`` in the forward direction, or just ``x`` in reverse."""
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        hidden = self.pre(x0) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.post(hidden) * x_mask

        if self.mean_only:
            m = stats
            logs = torch.zeros_like(m)
        else:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            return torch.cat([x0, x1], 1)

        x1 = m + x1 * torch.exp(logs) * x_mask
        x = torch.cat([x0, x1], 1)
        logdet = torch.sum(logs, [1, 2])
        return x, logdet

    def remove_weight_norm(self):
        """Remove weight norm from the inner ``WN`` encoder."""
        self.enc.remove_weight_norm()
459
+
460
+
461
class ConvFlow(nn.Module):
    """Coupling layer that applies ``piecewise_rational_quadratic_transform`` to the second channel half.

    The first half goes through ``pre`` -> ``DDSConv`` -> ``proj`` to
    produce ``num_bins * 3 - 1`` transform parameters per channel and
    frame. ``proj`` starts with zero weight and bias.
    """

    def __init__(
        self,
        in_channels,
        filter_channels,
        kernel_size,
        n_layers,
        num_bins=10,
        tail_bound=5.0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.num_bins = num_bins
        self.tail_bound = tail_bound
        self.half_channels = in_channels // 2

        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
        params_per_channel = num_bins * 3 - 1
        self.proj = nn.Conv1d(
            filter_channels, self.half_channels * params_per_channel, 1
        )
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Return ``(x, logdet)`` in the forward direction, or just ``x`` in reverse."""
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0)
        h = self.convs(h, x_mask, g=g)
        h = self.proj(h) * x_mask

        # [b, c * params, t] -> [b, c, t, params]
        b, c, t = x0.shape
        h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2)

        scale = math.sqrt(self.filter_channels)
        bins = self.num_bins
        unnormalized_widths = h[..., :bins] / scale
        unnormalized_heights = h[..., bins : 2 * bins] / scale
        unnormalized_derivatives = h[..., 2 * bins :]

        x1, logabsdet = piecewise_rational_quadratic_transform(
            x1,
            unnormalized_widths,
            unnormalized_heights,
            unnormalized_derivatives,
            inverse=reverse,
            tails="linear",
            tail_bound=self.tail_bound,
        )

        x = torch.cat([x0, x1], 1) * x_mask
        logdet = torch.sum(logabsdet * x_mask, [1, 2])
        if reverse:
            return x
        return x, logdet
infer_pack/modules/F0Predictor/DioF0Predictor.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
+ import pyworld
3
+ import numpy as np
4
+
5
+
6
class DioF0Predictor(F0Predictor):
    """F0 predictor built on ``pyworld.dio`` refined by ``pyworld.stonemask``."""

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """Fill non-positive F0 frames and return ``(filled_f0, vuv)``.

        ``vuv`` is 1.0 where the input frame is positive and 0.0 elsewhere.
        A gap between two positive frames is filled linearly; a gap at the
        start takes the next positive value; a gap with no later positive
        frame takes the last positive value seen (0.0 if there was none).
        The result is written into a reshaped alias of ``f0``, which may be
        a view of the caller's array.
        """
        data = np.reshape(f0, (f0.size, 1))

        vuv_vector = (data > 0.0).astype(np.float32)

        # Alias, not a copy: writes to ip_data are visible through data.
        ip_data = data

        frame_number = data.size
        last_value = 0.0
        for i in range(frame_number):
            if data[i] > 0.0:
                ip_data[i] = data[i]  # possibly an unnecessary copy
                last_value = data[i]
                continue

            # Find the next positive frame; j stays at the last index
            # visited when there is none.
            j = i + 1
            for j in range(i + 1, frame_number):
                if data[j] > 0.0:
                    break

            if j >= frame_number - 1:
                ip_data[i:frame_number] = last_value
            elif last_value > 0.0:
                step = (data[j] - data[i - 1]) / float(j - i)
                for k in range(i, j):
                    ip_data[k] = data[i - 1] + step * (k - i + 1)
            else:
                ip_data[i:j] = data[j]

        return ip_data[:, 0], vuv_vector[:, 0]

    def resize_f0(self, x, target_len):
        """Resample ``x`` to ``target_len`` points with ``np.interp``.

        Values below 0.001 are replaced by NaN before interpolation and NaNs
        in the result are mapped back with ``np.nan_to_num``.
        """
        source = np.array(x)
        source[source < 0.001] = np.nan
        n_source = len(source)
        query = np.arange(0, n_source * target_len, n_source) / target_len
        target = np.interp(query, np.arange(0, n_source), source)
        return np.nan_to_num(target)

    def _dio_f0(self, wav, p_len):
        """Run dio + stonemask, round to one decimal and resize to ``p_len`` frames."""
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
        signal = wav.astype(np.double)
        f0, t = pyworld.dio(
            signal,
            fs=self.sampling_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            frame_period=1000 * self.hop_length / self.sampling_rate,
        )
        f0 = pyworld.stonemask(signal, f0, t, self.sampling_rate)
        for index, pitch in enumerate(f0):
            f0[index] = round(pitch, 1)
        return self.resize_f0(f0, p_len)

    def compute_f0(self, wav, p_len=None):
        """Return the interpolated F0 track; ``p_len`` defaults to ``len(wav) // hop_length``."""
        return self.interpolate_f0(self._dio_f0(wav, p_len))[0]

    def compute_f0_uv(self, wav, p_len=None):
        """Return ``(f0, vuv)``; ``p_len`` defaults to ``len(wav) // hop_length``."""
        return self.interpolate_f0(self._dio_f0(wav, p_len))
infer_pack/modules/F0Predictor/F0Predictor.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class F0Predictor:
    """Base interface for F0 predictors; both methods are placeholders that return ``None``."""

    def compute_f0(self, wav, p_len):
        """
        input: wav:[signal_length]
               p_len:int
        output: f0:[signal_length//hop_length]
        """
        return None

    def compute_f0_uv(self, wav, p_len):
        """
        input: wav:[signal_length]
               p_len:int
        output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
        """
        return None
infer_pack/modules/F0Predictor/HarvestF0Predictor.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
+ import pyworld
3
+ import numpy as np
4
+
5
+
6
class HarvestF0Predictor(F0Predictor):
    """F0 predictor built on ``pyworld.harvest`` refined by ``pyworld.stonemask``.

    Args:
        hop_length: hop size in samples; sets the analysis frame period and
            the default number of output frames.
        f0_min: lower F0 bound handed to ``pyworld.harvest`` (``f0_floor``).
        f0_max: upper F0 bound handed to ``pyworld.harvest`` (``f0_ceil``).
        sampling_rate: sampling rate of the waveforms passed to ``compute_*``.
    """

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """Fill non-positive F0 frames and return ``(filled_f0, vuv)``.

        ``vuv`` is 1.0 where the input frame is positive and 0.0 elsewhere.
        A gap between two positive frames is filled linearly; a gap at the
        start takes the next positive value; a gap with no later positive
        frame takes the last positive value seen (0.0 if there was none).
        The result is written into a reshaped alias of ``f0``, which may be
        a view of the caller's array.
        """
        data = np.reshape(f0, (f0.size, 1))

        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
        vuv_vector[data > 0.0] = 1.0
        vuv_vector[data <= 0.0] = 0.0

        # Alias, not a copy: writes to ip_data are visible through data.
        ip_data = data

        frame_number = data.size
        last_value = 0.0
        for i in range(frame_number):
            if data[i] <= 0.0:
                # Find the next positive frame; j stays at the last index
                # visited when there is none.
                j = i + 1
                for j in range(i + 1, frame_number):
                    if data[j] > 0.0:
                        break
                if j < frame_number - 1:
                    if last_value > 0.0:
                        step = (data[j] - data[i - 1]) / float(j - i)
                        for k in range(i, j):
                            ip_data[k] = data[i - 1] + step * (k - i + 1)
                    else:
                        for k in range(i, j):
                            ip_data[k] = data[j]
                else:
                    for k in range(i, frame_number):
                        ip_data[k] = last_value
            else:
                ip_data[i] = data[i]  # possibly an unnecessary copy
                last_value = data[i]

        return ip_data[:, 0], vuv_vector[:, 0]

    def resize_f0(self, x, target_len):
        """Resample ``x`` to ``target_len`` points with ``np.interp``.

        Values below 0.001 are replaced by NaN before interpolation and NaNs
        in the result are mapped back with ``np.nan_to_num``.
        """
        source = np.array(x)
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * target_len, len(source)) / target_len,
            np.arange(0, len(source)),
            source,
        )
        res = np.nan_to_num(target)
        return res

    def _harvest_f0(self, wav, p_len):
        """Run harvest + stonemask at ``sampling_rate`` and resize to ``p_len`` frames."""
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
        signal = wav.astype(np.double)
        f0, t = pyworld.harvest(
            signal,
            fs=self.sampling_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            frame_period=1000 * self.hop_length / self.sampling_rate,
        )
        f0 = pyworld.stonemask(signal, f0, t, self.sampling_rate)
        return self.resize_f0(f0, p_len)

    def compute_f0(self, wav, p_len=None):
        """Return the interpolated F0 track.

        The analysis runs at ``self.sampling_rate``, the same rate used by
        ``compute_f0_uv``; ``p_len`` defaults to ``len(wav) // hop_length``.
        """
        return self.interpolate_f0(self._harvest_f0(wav, p_len))[0]

    def compute_f0_uv(self, wav, p_len=None):
        """Return ``(f0, vuv)``; ``p_len`` defaults to ``len(wav) // hop_length``."""
        return self.interpolate_f0(self._harvest_f0(wav, p_len))
infer_pack/modules/F0Predictor/PMF0Predictor.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
+ import parselmouth
3
+ import numpy as np
4
+
5
+
6
class PMF0Predictor(F0Predictor):
    """F0 predictor that uses parselmouth's ``Sound.to_pitch_ac``."""

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        """Store the analysis settings.

        Args:
            hop_length: Number of samples between consecutive F0 frames.
            f0_min: Value passed to parselmouth as ``pitch_floor``.
            f0_max: Value passed to parselmouth as ``pitch_ceiling``.
            sampling_rate: Sampling rate of the waveforms handed to this object.
        """
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """Fill the unvoiced (``<= 0``) frames of an F0 contour.

        Leading unvoiced frames take the first voiced value, trailing ones
        take the last voiced value, and gaps in between are ramped from the
        voiced value before the gap to the one after it. The input array is
        not modified.

        Args:
            f0: Array of F0 values; any shape, it is flattened.

        Returns:
            ``(filled_f0, vuv)``: the filled 1-D contour and a float32 1-D
            vector holding 1.0 where the input was ``> 0`` and 0.0 elsewhere.
        """
        # Work on a private copy so the caller's array is left untouched.
        data = np.array(f0).reshape(-1)
        vuv_vector = (data > 0.0).astype(np.float32)

        frame_number = data.size
        last_value = 0.0
        for i in range(frame_number):
            if data[i] <= 0.0:
                # for/else separates "a voiced frame was found" (possibly at
                # the very last index) from "nothing voiced remains"; the
                # previous `j < frame_number - 1` test treated a voiced final
                # frame as missing and overwrote it.
                for j in range(i + 1, frame_number):
                    if data[j] > 0.0:
                        break
                else:
                    data[i:] = last_value
                    break

                if last_value > 0.0:
                    # NOTE(review): dividing by (j - i) makes the ramp reach
                    # data[j] one frame early; kept unchanged on purpose.
                    step = (data[j] - data[i - 1]) / float(j - i)
                    for k in range(i, j):
                        data[k] = data[i - 1] + step * (k - i + 1)
                else:
                    # No voiced frame seen yet: back-fill with the first one.
                    data[i:j] = data[j]
            else:
                last_value = data[i]

        return data, vuv_vector

    def _padded_pitch(self, wav, p_len):
        """Track pitch on ``wav`` and zero-pad the contour up to ``p_len`` frames."""
        x = wav
        if p_len is None:
            p_len = x.shape[0] // self.hop_length
        else:
            assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
        time_step = self.hop_length / self.sampling_rate * 1000
        f0 = (
            parselmouth.Sound(x, self.sampling_rate)
            .to_pitch_ac(
                time_step=time_step / 1000,
                voicing_threshold=0.6,
                pitch_floor=self.f0_min,
                pitch_ceiling=self.f0_max,
            )
            .selected_array["frequency"]
        )

        # Split the missing frames between both ends, the extra one in front.
        pad_size = (p_len - len(f0) + 1) // 2
        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
            f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
        return f0

    def compute_f0(self, wav, p_len=None):
        """Return the interpolated F0 contour of ``wav``.

        Args:
            wav: 1-D waveform array.
            p_len: Expected number of frames. Defaults to
                ``wav.shape[0] // self.hop_length``.

        Returns:
            The filled F0 contour (first element of ``compute_f0_uv``).

        Raises:
            AssertionError: If ``p_len`` is given and differs from
                ``wav.shape[0] // self.hop_length`` by 4 or more.
        """
        return self.compute_f0_uv(wav, p_len)[0]

    def compute_f0_uv(self, wav, p_len=None):
        """Return the interpolated F0 contour of ``wav`` and its voicing vector.

        Args:
            wav: 1-D waveform array.
            p_len: Expected number of frames. Defaults to
                ``wav.shape[0] // self.hop_length``.

        Returns:
            ``(f0, uv)`` as produced by ``interpolate_f0``.

        Raises:
            AssertionError: If ``p_len`` is given and differs from
                ``wav.shape[0] // self.hop_length`` by 4 or more.
        """
        return self.interpolate_f0(self._padded_pitch(wav, p_len))
infer_pack/modules/F0Predictor/__init__.py ADDED
File without changes
infer_pack/transforms.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.nn import functional as F
3
+
4
+ import numpy as np
5
+
6
+ DEFAULT_MIN_BIN_WIDTH = 1e-3
7
+ DEFAULT_MIN_BIN_HEIGHT = 1e-3
8
+ DEFAULT_MIN_DERIVATIVE = 1e-3
9
+
10
+
11
def piecewise_rational_quadratic_transform(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails=None,
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Dispatch to the bounded or the tailed rational-quadratic spline.

    With ``tails=None`` the call goes to ``rational_quadratic_spline``;
    otherwise it goes to ``unconstrained_rational_quadratic_spline`` with
    ``tails`` and ``tail_bound`` forwarded.

    Returns:
        ``(outputs, logabsdet)`` as returned by the selected spline function.
    """
    common_kwargs = {
        "inputs": inputs,
        "unnormalized_widths": unnormalized_widths,
        "unnormalized_heights": unnormalized_heights,
        "unnormalized_derivatives": unnormalized_derivatives,
        "inverse": inverse,
        "min_bin_width": min_bin_width,
        "min_bin_height": min_bin_height,
        "min_derivative": min_derivative,
    }

    if tails is None:
        outputs, logabsdet = rational_quadratic_spline(**common_kwargs)
    else:
        outputs, logabsdet = unconstrained_rational_quadratic_spline(
            tails=tails, tail_bound=tail_bound, **common_kwargs
        )
    return outputs, logabsdet
42
+
43
+
44
def searchsorted(bin_locations, inputs, eps=1e-6):
    """Return, for each input, the index of the bin it falls into.

    The index is the number of bin edges that are ``<=`` the input, minus
    one. The last edge is compared after adding ``eps`` to it, so an input
    exactly equal to the last edge is assigned to the last bin.

    Args:
        bin_locations: Tensor of bin edges along the last dimension.
        inputs: Tensor of values to locate.
        eps: Amount added to the last edge for the comparison.

    Returns:
        Integer tensor of bin indices.
    """
    # Compare against a copy: the original added eps to the caller's tensor
    # in place, silently changing the knot positions it had passed in.
    edges = bin_locations.clone()
    edges[..., -1] += eps
    return torch.sum(inputs[..., None] >= edges, dim=-1) - 1
47
+
48
+
49
def unconstrained_rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails="linear",
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Apply a rational-quadratic spline on ``[-tail_bound, tail_bound]`` with identity tails.

    Inputs outside the interval are passed through unchanged with a
    log-determinant of 0; inputs inside it are transformed by
    ``rational_quadratic_spline``.

    Args:
        inputs: Tensor of values to transform.
        unnormalized_widths: Per-input bin-width parameters (last dim = bins).
        unnormalized_heights: Per-input bin-height parameters (last dim = bins).
        unnormalized_derivatives: Per-input interior knot derivative
            parameters; padded here by one entry on each side.
        inverse: Whether to apply the inverse transform.
        tails: Tail type; only ``"linear"`` is implemented.
        tail_bound: Half-width of the interval handled by the spline.
        min_bin_width: Forwarded to ``rational_quadratic_spline``.
        min_bin_height: Forwarded to ``rational_quadratic_spline``.
        min_derivative: Forwarded to ``rational_quadratic_spline``.

    Returns:
        ``(outputs, logabsdet)``, both shaped like ``inputs``.

    Raises:
        RuntimeError: If ``tails`` is not ``"linear"``.
    """
    if tails != "linear":
        raise RuntimeError("{} tails are not implemented.".format(tails))

    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
    outside_interval_mask = ~inside_interval_mask

    outputs = torch.zeros_like(inputs)
    logabsdet = torch.zeros_like(inputs)

    # Boundary derivative parameters are fixed so that
    # min_derivative + softplus(constant) == 1.
    unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
    constant = np.log(np.exp(1 - min_derivative) - 1)
    unnormalized_derivatives[..., 0] = constant
    unnormalized_derivatives[..., -1] = constant

    # Identity outside the interval; logabsdet is already zero there.
    outputs[outside_interval_mask] = inputs[outside_interval_mask]

    # Skip the spline when nothing falls inside the interval: it takes
    # torch.min/torch.max of its inputs, which fails on an empty tensor.
    if inside_interval_mask.any():
        (
            outputs[inside_interval_mask],
            logabsdet[inside_interval_mask],
        ) = rational_quadratic_spline(
            inputs=inputs[inside_interval_mask],
            unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
            unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
            unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
            inverse=inverse,
            left=-tail_bound,
            right=tail_bound,
            bottom=-tail_bound,
            top=tail_bound,
            min_bin_width=min_bin_width,
            min_bin_height=min_bin_height,
            min_derivative=min_derivative,
        )

    return outputs, logabsdet
97
+
98
+
99
def rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    left=0.0,
    right=1.0,
    bottom=0.0,
    top=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Apply, or invert, a piecewise rational-quadratic spline.

    Bin widths and heights come from a softmax over the unnormalised
    parameters (each bin at least ``min_bin_width`` / ``min_bin_height``),
    scaled to ``[left, right]`` and ``[bottom, top]``. Knot derivatives are
    ``min_derivative + softplus(unnormalized_derivatives)``.

    Args:
        inputs: Values to transform; must lie in ``[left, right]``.
        unnormalized_widths: Bin-width parameters (last dim = bins).
        unnormalized_heights: Bin-height parameters (last dim = bins).
        unnormalized_derivatives: Knot-derivative parameters (last dim = bins + 1).
        inverse: If true, solve for the spline input that maps to ``inputs``.
        left, right: Horizontal extent of the spline.
        bottom, top: Vertical extent of the spline.
        min_bin_width, min_bin_height, min_derivative: Lower bounds described above.

    Returns:
        ``(outputs, logabsdet)``; in inverse mode the log-derivative is negated.

    Raises:
        ValueError: If an input lies outside ``[left, right]`` or a minimum
            bin size times the number of bins exceeds 1.
    """
    if torch.min(inputs) < left or torch.max(inputs) > right:
        raise ValueError("Input to a transform is not within its domain")

    num_bins = unnormalized_widths.shape[-1]

    if min_bin_width * num_bins > 1.0:
        raise ValueError("Minimal bin width too large for the number of bins")
    if min_bin_height * num_bins > 1.0:
        raise ValueError("Minimal bin height too large for the number of bins")

    def knots_and_sizes(unnormalized, min_size, lower, upper):
        # Softmax -> bounded bin sizes -> cumulative knot positions pinned to
        # [lower, upper]; bin sizes are then re-derived from the knots.
        sizes = F.softmax(unnormalized, dim=-1)
        sizes = min_size + (1 - min_size * num_bins) * sizes
        knots = torch.cumsum(sizes, dim=-1)
        knots = F.pad(knots, pad=(1, 0), mode="constant", value=0.0)
        knots = (upper - lower) * knots + lower
        knots[..., 0] = lower
        knots[..., -1] = upper
        return knots, knots[..., 1:] - knots[..., :-1]

    cumwidths, widths = knots_and_sizes(unnormalized_widths, min_bin_width, left, right)
    derivatives = min_derivative + F.softplus(unnormalized_derivatives)
    cumheights, heights = knots_and_sizes(unnormalized_heights, min_bin_height, bottom, top)

    # Inverse mode locates the bin along the height axis, forward mode along
    # the width axis.
    search_axis = cumheights if inverse else cumwidths
    bin_idx = searchsorted(search_axis, inputs)[..., None]

    def at_bin(per_bin):
        # Select each input's own bin entry along the last dimension.
        return per_bin.gather(-1, bin_idx)[..., 0]

    input_cumwidths = at_bin(cumwidths)
    input_bin_widths = at_bin(widths)

    input_cumheights = at_bin(cumheights)
    delta = heights / widths
    input_delta = at_bin(delta)

    input_derivatives = at_bin(derivatives)
    input_derivatives_plus_one = at_bin(derivatives[..., 1:])

    input_heights = at_bin(heights)

    def slope_terms(theta):
        # Quantities shared by both directions, evaluated at the in-bin
        # position theta.
        theta_one_minus_theta = theta * (1 - theta)
        denominator = input_delta + (
            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
            * theta_one_minus_theta
        )
        derivative_numerator = input_delta.pow(2) * (
            input_derivatives_plus_one * theta.pow(2)
            + 2 * input_delta * theta_one_minus_theta
            + input_derivatives * (1 - theta).pow(2)
        )
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
        return theta_one_minus_theta, denominator, logabsdet

    if inverse:
        # Solve a*root^2 + b*root + c = 0 for the in-bin position.
        a = (inputs - input_cumheights) * (
            input_derivatives + input_derivatives_plus_one - 2 * input_delta
        ) + input_heights * (input_delta - input_derivatives)
        b = input_heights * input_derivatives - (inputs - input_cumheights) * (
            input_derivatives + input_derivatives_plus_one - 2 * input_delta
        )
        c = -input_delta * (inputs - input_cumheights)

        discriminant = b.pow(2) - 4 * a * c
        assert (discriminant >= 0).all()

        root = (2 * c) / (-b - torch.sqrt(discriminant))
        outputs = root * input_bin_widths + input_cumwidths

        _, _, logabsdet = slope_terms(root)
        return outputs, -logabsdet

    theta = (inputs - input_cumwidths) / input_bin_widths
    theta_one_minus_theta, denominator, logabsdet = slope_terms(theta)

    numerator = input_heights * (
        input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
    )
    outputs = input_cumheights + numerator / denominator

    return outputs, logabsdet
infer_web.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from vc_infer_pipeline import VC
2
+ from myutils import Audio
3
+ from infer_pack.models import (
4
+ SynthesizerTrnMs256NSFsid,
5
+ SynthesizerTrnMs256NSFsid_nono,
6
+ SynthesizerTrnMs768NSFsid,
7
+ SynthesizerTrnMs768NSFsid_nono,
8
+ )
9
+ from fairseq import checkpoint_utils
10
+ from config import Config
11
+ import torch
12
+ import numpy as np
13
+ import traceback
14
+ import os
15
+ import sys
16
+ import warnings
17
+
18
# Make the working directory importable and create the folders used for
# input audio, converted audio and model weights.
now_dir = os.getcwd()
sys.path.append(now_dir)
for _folder in ("audios", "audio-outputs", "weights"):
    os.makedirs(os.path.join(now_dir, _folder), exist_ok=True)
del _folder
warnings.filterwarnings("ignore")
torch.manual_seed(114514)

config = Config()

# Loaded lazily by load_hubert().
hubert_model = None
# Folder (relative to the working directory) that holds model checkpoints.
weight_root = "weights"
30
+
31
def load_hubert():
    """Load ``hubert_base.pt`` into the global ``hubert_model``.

    The model is moved to ``config.device``, cast to half or full precision
    according to ``config.is_half`` and put in eval mode.
    """
    global hubert_model
    ensemble, _, _ = checkpoint_utils.load_model_ensemble_and_task(
        ["hubert_base.pt"],
        suffix="",
    )
    hubert_model = ensemble[0]
    hubert_model = hubert_model.to(config.device)
    hubert_model = hubert_model.half() if config.is_half else hubert_model.float()
    hubert_model.eval()
45
+
46
def vc_single(
    sid,
    input_audio_path0,
    input_audio_path1,
    f0_up_key,
    f0_file,
    f0_method,
    file_index,
    file_index2,
    # file_big_npy,
    index_rate,
    filter_radius,
    resample_sr,
    rms_mix_rate,
    protect,
    crepe_hop_length,
):
    """Convert one audio file with the currently loaded voice model.

    The audio is read from ``input_audio_path0``, or from
    ``input_audio_path1`` when the former is an empty string, peak-limited,
    and passed to ``vc.pipeline`` together with the remaining settings.

    Args:
        sid: Speaker id forwarded to the pipeline.
        input_audio_path0: Primary audio path.
        input_audio_path1: Fallback audio path; also forwarded to the pipeline.
        f0_up_key: Pitch shift; converted with ``int()``.
        f0_file: Optional F0 file forwarded to the pipeline.
        f0_method: Name of the F0 extraction method.
        file_index: Index path typed by the user; cleaned up before use.
        file_index2: Index path used when ``file_index`` is empty.
        index_rate, filter_radius, resample_sr, rms_mix_rate, protect,
        crepe_hop_length: Forwarded to the pipeline unchanged.

    Returns:
        ``(message, (sample_rate, audio))`` on success,
        ``("You need to upload an audio", None)`` when no audio path was
        given, or ``(traceback_text, (None, None))`` when conversion fails.
    """
    global tgt_sr, net_g, vc, hubert_model, version
    # The original tested input_audio_path0 twice; the second operand now
    # covers the fallback path that is used when path0 is empty.
    if input_audio_path0 is None or (
        input_audio_path0 == "" and input_audio_path1 is None
    ):
        return "You need to upload an audio", None
    f0_up_key = int(f0_up_key)
    try:
        if input_audio_path0 == "":
            audio = Audio.load_audio(input_audio_path1, 16000)
        else:
            audio = Audio.load_audio(input_audio_path0, 16000)

        # Scale down only when the peak exceeds 0.95.
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio /= audio_max
        times = [0, 0, 0]
        if hubert_model is None:
            load_hubert()
        if_f0 = cpt.get("f0", 1)
        file_index = (
            (
                file_index.strip(" ")
                .strip('"')
                .strip("\n")
                .strip('"')
                .strip(" ")
                .replace("trained", "added")
            )
            if file_index != ""
            else file_index2
        )

        audio_opt = vc.pipeline(
            hubert_model,
            net_g,
            sid,
            audio,
            input_audio_path1,
            times,
            f0_up_key,
            f0_method,
            file_index,
            # file_big_npy,
            index_rate,
            if_f0,
            filter_radius,
            tgt_sr,
            resample_sr,
            rms_mix_rate,
            version,
            protect,
            crepe_hop_length,
            f0_file=f0_file,
        )
        # Report the resampled rate through a local: overwriting the global
        # tgt_sr would hand the wrong model rate to every later call.
        if tgt_sr != resample_sr >= 16000:
            output_sr = resample_sr
        else:
            output_sr = tgt_sr
        index_info = (
            "Using index:%s." % file_index
            if os.path.exists(file_index)
            else "Index not used."
        )
        print(index_info)
        return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
            index_info,
            times[0],
            times[1],
            times[2],
        ), (output_sr, audio_opt)
    except Exception:
        # Report failures to the caller as text; KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        info = traceback.format_exc()
        print(info)
        return info, (None, None)
133
+
134
def _build_synthesizer(checkpoint):
    """Instantiate the synthesizer matching a checkpoint's version and F0 flag.

    Returns ``None`` when the checkpoint's version is neither "v1" nor "v2".
    """
    if_f0 = checkpoint.get("f0", 1)
    model_version = checkpoint.get("version", "v1")
    if model_version == "v1":
        if if_f0 == 1:
            return SynthesizerTrnMs256NSFsid(
                *checkpoint["config"], is_half=config.is_half
            )
        return SynthesizerTrnMs256NSFsid_nono(*checkpoint["config"])
    if model_version == "v2":
        if if_f0 == 1:
            return SynthesizerTrnMs768NSFsid(
                *checkpoint["config"], is_half=config.is_half
            )
        return SynthesizerTrnMs768NSFsid_nono(*checkpoint["config"])
    return None


def get_vc(model_name):
    """Load the voice model ``model_name`` from ``weight_root``, or unload everything.

    With an empty ``model_name`` (``None``, ``""`` or ``[]``) the cached
    HuBERT model, synthesizer, pipeline and checkpoint are released. Otherwise
    the checkpoint is loaded and the globals ``cpt``, ``tgt_sr``, ``version``,
    ``net_g`` and ``vc`` are set for ``vc_single``.

    Args:
        model_name: Checkpoint file name inside ``weight_root``.

    Returns:
        ``{"success": False, "message": ...}`` when no model name was given,
        ``None`` after a model has been loaded.

    Raises:
        ValueError: If the checkpoint's version is neither "v1" nor "v2".
    """
    global tgt_sr, net_g, vc, cpt, version, hubert_model

    # Check whether a model name was given at all.
    if model_name is None or model_name == "" or model_name == []:
        # Only clean up when something was loaded before (the caller may
        # poll, so this can be hit repeatedly without a model).
        if hubert_model is not None:
            print("Limpiar caché")
            # Rebinding to None drops the references without raising
            # NameError for globals that were never assigned.
            hubert_model = net_g = vc = tgt_sr = None

            # Release GPU memory if a GPU is available.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # Upstream note: without rebuilding and dropping the synthesizer
            # once more the cleanup is incomplete.
            stale_cpt = globals().get("cpt")
            if stale_cpt is not None:
                version = stale_cpt.get("version", "v1")
                net_g = _build_synthesizer(stale_cpt)
                net_g = None
                stale_cpt = None
            cpt = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        return {"success": False, "message": "No se proporcionó un sid"}

    person = "%s/%s" % (weight_root, model_name)
    print("Cargando %s" % person)
    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # from sources you trust.
    cpt = torch.load(person, map_location="cpu")
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
    version = cpt.get("version", "v1")

    net_g = _build_synthesizer(cpt)
    if net_g is None:
        # Previously an unknown version fell through and reused whatever
        # synthesizer happened to be loaded before.
        raise ValueError("Unsupported model version: %r" % (version,))
    del net_g.enc_q

    print(net_g.load_state_dict(cpt["weight"], strict=False))
    net_g.eval().to(config.device)
    if config.is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()
    vc = VC(tgt_sr, config)
inference.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import infer_web
2
+ import wget
3
+ import os
4
+ import scipy.io.wavfile as wavfile
5
+ from utils import model
6
+ import validators
7
+ from myutils import delete_files
8
+
9
class Inference:
    """One voice-conversion job: model, source audio and conversion settings."""

    # Number of Inference objects created so far; also used to assign ids.
    inference_cont = 0

    def __init__(
        self,
        model_name=None,
        source_audio_path=None,
        output_file_name=None,
        feature_index_path="",
        f0_file=None,
        speaker_id=0,
        transposition=-2,
        f0_method="harvest",
        crepe_hop_length=160,
        harvest_median_filter=3,
        resample=0,
        mix=1,
        feature_ratio=0.78,
        protection_amnt=0.33,
        protect1=False
    ):
        """Store the job settings and make sure ``hubert_base.pt`` is present.

        Args:
            model_name: Model identifier handed to ``model.model_downloader``.
            source_audio_path: Path of the audio to convert.
            output_file_name: Name of the result file.
            feature_index_path: Stored as given; not read by ``run``.
            f0_file: Optional F0 file forwarded to the conversion.
            speaker_id: Speaker id forwarded to the conversion.
            transposition: Pitch shift forwarded to the conversion.
            f0_method: Name of the F0 extraction method.
            crepe_hop_length: Hop length forwarded to the conversion.
            harvest_median_filter: Forwarded as the conversion's filter radius.
            resample: Output resample rate forwarded to the conversion.
            mix: Forwarded as the conversion's ``rms_mix_rate``.
            feature_ratio: Forwarded as the conversion's ``index_rate``.
            protection_amnt: Forwarded as the conversion's ``protect``.
            protect1: Stored as given; not read by ``run``.
        """
        Inference.inference_cont += 1
        self._model_name = model_name
        self._source_audio_path = source_audio_path
        self._output_file_name = output_file_name
        self._feature_index_path = feature_index_path
        self._f0_file = f0_file
        self._speaker_id = speaker_id
        self._transposition = transposition
        self._f0_method = f0_method
        self._crepe_hop_length = crepe_hop_length
        self._harvest_median_filter = harvest_median_filter
        self._resample = resample
        self._mix = mix
        self._feature_ratio = feature_ratio
        self._protection_amnt = protection_amnt
        self._protect1 = protect1
        self._id = Inference.inference_cont
        # Backing fields of the `audio` / `audio_file` properties; they were
        # never initialised before, so reading either property raised.
        self._audio = None
        self._audio_file = None

        if not os.path.exists("./hubert_base.pt"):
            wget.download(
                "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt", out="./hubert_base.pt")

    @property
    def id(self):
        return self._id

    @id.setter
    def id(self, value):
        self._id = value

    @property
    def audio(self):
        return self._audio

    @audio.setter
    def audio(self, audio):
        self._audio = audio

    @property
    def audio_file(self):
        return self._audio_file

    @audio_file.setter
    def audio_file(self, audio):
        self._audio_file = audio

    @property
    def model_name(self):
        return self._model_name

    @model_name.setter
    def model_name(self, model_name):
        self._model_name = model_name

    @property
    def source_audio_path(self):
        return self._source_audio_path

    @source_audio_path.setter
    def source_audio_path(self, source_audio_path):
        # Derive a default output name from the source when none is set yet.
        if not self._output_file_name:
            self._output_file_name = os.path.join("./audio-outputs", os.path.basename(source_audio_path))
        self._source_audio_path = source_audio_path

    @property
    def output_file_name(self):
        return self._output_file_name

    @output_file_name.setter
    def output_file_name(self, output_file_name):
        self._output_file_name = output_file_name

    @property
    def feature_index_path(self):
        return self._feature_index_path

    @feature_index_path.setter
    def feature_index_path(self, feature_index_path):
        self._feature_index_path = feature_index_path

    @property
    def f0_file(self):
        return self._f0_file

    @f0_file.setter
    def f0_file(self, f0_file):
        self._f0_file = f0_file

    @property
    def speaker_id(self):
        return self._speaker_id

    @speaker_id.setter
    def speaker_id(self, speaker_id):
        self._speaker_id = speaker_id

    @property
    def transposition(self):
        return self._transposition

    @transposition.setter
    def transposition(self, transposition):
        self._transposition = transposition

    @property
    def f0_method(self):
        return self._f0_method

    @f0_method.setter
    def f0_method(self, f0_method):
        self._f0_method = f0_method

    @property
    def crepe_hop_length(self):
        return self._crepe_hop_length

    @crepe_hop_length.setter
    def crepe_hop_length(self, crepe_hop_length):
        self._crepe_hop_length = crepe_hop_length

    @property
    def harvest_median_filter(self):
        return self._harvest_median_filter

    # This setter was decorated with @crepe_hop_length.setter, which rebound
    # `harvest_median_filter` to a property using the crepe_hop_length getter.
    @harvest_median_filter.setter
    def harvest_median_filter(self, harvest_median_filter):
        self._harvest_median_filter = harvest_median_filter

    @property
    def resample(self):
        return self._resample

    @resample.setter
    def resample(self, resample):
        self._resample = resample

    @property
    def mix(self):
        return self._mix

    @mix.setter
    def mix(self, mix):
        self._mix = mix

    @property
    def feature_ratio(self):
        return self._feature_ratio

    @feature_ratio.setter
    def feature_ratio(self, feature_ratio):
        self._feature_ratio = feature_ratio

    @property
    def protection_amnt(self):
        return self._protection_amnt

    @protection_amnt.setter
    def protection_amnt(self, protection_amnt):
        self._protection_amnt = protection_amnt

    @property
    def protect1(self):
        return self._protect1

    @protect1.setter
    def protect1(self, protect1):
        self._protect1 = protect1

    def run(self):
        """Fetch the model, convert the source audio and write the result.

        The converted audio is written to
        ``audio-outputs/<basename of output_file_name>``.

        Returns:
            ``{"success": bool, "file": output_file_name}``.
        """
        # Same default as the source_audio_path setter, for jobs whose source
        # was passed to __init__ without an output name.
        if not self._output_file_name and self._source_audio_path:
            self._output_file_name = os.path.join(
                "./audio-outputs", os.path.basename(self._source_audio_path)
            )

        current_dir = os.getcwd()
        modelname = model.model_downloader(self._model_name, "./zips/", "./weights/")

        model_info = model.get_model(os.path.join(current_dir, 'weights') , modelname)
        index = model_info.get('index', '')
        pth = model_info.get('pth', None)

        infer_web.get_vc(pth)

        conversion_data = infer_web.vc_single(
            self.speaker_id,
            self.source_audio_path,
            self.source_audio_path,
            self.transposition,
            self.f0_file,
            self.f0_method,
            index,
            index,
            self.feature_ratio,
            self.harvest_median_filter,
            self.resample,
            self.mix,
            self.protection_amnt,
            self.crepe_hop_length,
        )

        succeeded = "Success." in conversion_data[0]
        if succeeded:
            # conversion_data[1] is (sample_rate, samples).
            wavfile.write(
                "%s/%s" % ("audio-outputs",os.path.basename(self._output_file_name)),
                conversion_data[1][0],
                conversion_data[1][1],
            )
        return {
            "success": succeeded,
            "file": self._output_file_name
        }
myutils.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import os
3
+ import ffmpeg
4
+ import numpy as np
5
+ import random
6
+ import shutil
7
+ import torchaudio
8
+ from pydub import AudioSegment
9
+ import tempfile
10
+
11
class Audio:
    """A named audio resource plus helpers to download and decode audio files."""

    # Folder where audio files are stored.
    audio_path = "./audios"

    def __init__(self, name, url):
        """Remember ``name`` and ``url`` and make sure ``audio_path`` exists."""
        self._name = name
        self._url = url

        os.makedirs(Audio.audio_path, exist_ok=True)

    @property
    def name(self):
        return self._name

    @name.setter
    def name(self, name):
        self._name = name

    @property
    def url(self):
        return self._url

    @url.setter
    def url(self, url):
        self._url = url

    def __str__(self):
        return f'Audio: {self._name} {self._url}'

    @classmethod
    def load_audio(cls, file, sr):
        """Decode an audio file to a mono float32 NumPy array at rate ``sr``.

        Files whose name does not end in ``.wav`` are first converted to
        ``<file>.wav`` with ffmpeg. Only a conversion file created by this
        call is removed afterwards; the input file and any ``<file>.wav``
        that already existed are left in place.

        Args:
            file: Path of the audio file; surrounding spaces, double quotes
                and newlines are stripped.
            sr: Sample rate requested from ffmpeg.

        Returns:
            1-D ``np.float32`` array of samples.

        Raises:
            RuntimeError: If the file cannot be converted or decoded.
        """
        created_wav = None
        try:
            file = file.strip(' "\n')  # drop spaces and quotes around the name
            # Convert to WAV unless the file already is one.
            if not file.endswith(".wav"):
                file_formanted = f"{file}.wav"
                if not os.path.isfile(file_formanted):
                    # Mark before converting so a partial file left by a
                    # failed conversion is cleaned up as well.
                    created_wav = file_formanted
                    (
                        ffmpeg.input(file)
                        .output(file_formanted, format="wav")
                        .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
                    )
            else:
                file_formanted = file

            # Decode to raw float32 PCM on stdout.
            out, _ = (
                ffmpeg.input(file_formanted)
                .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
                .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
            )
        except Exception as e:
            raise RuntimeError(f"Failed to load audio: {e}") from e
        finally:
            # Best-effort removal of the intermediate file made above.
            if created_wav is not None and os.path.isfile(created_wav):
                try:
                    os.remove(created_wav)
                except OSError:
                    pass

        return np.frombuffer(out, np.float32).flatten()

    @classmethod
    def dowload_from_url(cls, url = None, output = "./audios/file.wav"):
        """
        Download an audio file from a URL.
        Args:
            url: Address to download from
            output: Path of the file to write
        Returns:
            return: the path of the downloaded audio
        Raises:
            requests.HTTPError: if the server answers with an error status
        """
        request = requests.get(url, allow_redirects=True)
        # Do not save an HTTP error page under an audio file name.
        request.raise_for_status()
        with open(output, 'wb') as target:
            target.write(request.content)

        return output
84
+
85
+
86
def delete_files(paths):
    """Remove every existing file or directory tree listed in ``paths``.

    Paths that do not exist are skipped. Directories are removed recursively
    with errors ignored; files are removed with ``os.remove``.
    """
    for target in paths:
        if not os.path.exists(target):
            continue
        if os.path.isdir(target):
            shutil.rmtree(target, ignore_errors=True)
            continue
        if os.path.isfile(target):
            os.remove(target)
requirements.txt ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==1.4.0
2
+ aiofiles==23.1.0
3
+ aiohttp==3.8.3
4
+ aiosignal==1.3.1
5
+ altair==4.2.0
6
+ antlr4-python3-runtime==4.8
7
+ anyio==3.6.2
8
+ appdirs==1.4.4
9
+ asttokens==2.2.1
10
+ async-timeout==4.0.2
11
+ attrs==21.4.0
12
+ audioread==2.1.9
13
+ backcall==0.2.0
14
+ beautifulsoup4==4.12.2
15
+ bitarray==2.5.1
16
+ bs4==0.0.1
17
+ cachetools==5.3.0
18
+ certifi==2022.12.7
19
+ cffi==1.15.1
20
+ charset-normalizer==2.1.1
21
+ click==8.1.3
22
+ colorama==0.4.6
23
+ coloredlogs==15.0.1
24
+ contourpy==1.0.6
25
+ cycler==0.11.0
26
+ Cython==0.29.30
27
+ decorator==5.1.1
28
+ discord.py==2.3.2
29
+ edge-tts==6.1.5
30
+ elevenlabs==0.2.21
31
+ entrypoints==0.4
32
+ exceptiongroup==1.1.3
33
+ executing==1.2.0
34
+ fairseq==0.12.2
35
+ faiss-cpu==1.7.2
36
+ fastapi==0.88.0
37
+ ffmpeg-python==0.2.0
38
+ ffmpy==0.3.1
39
+ filelock==3.10.0
40
+ flatbuffers==23.5.9
41
+ fonttools==4.38.0
42
+ frozenlist==1.3.3
43
+ fsspec==2022.11.0
44
+ future==0.18.2
45
+ google-auth==2.16.2
46
+ google-auth-oauthlib==1.0.0
47
+ gradio==3.34.0
48
+ gradio_client==0.2.10
49
+ grpcio==1.51.3
50
+ gspread==5.10.0
51
+ gTTS==2.3.2
52
+ h11==0.12.0
53
+ httpcore==0.15.0
54
+ httpx==0.23.0
55
+ huggingface-hub==0.16.4
56
+ humanfriendly==10.0
57
+ hydra-core==1.0.7
58
+ idna==3.4
59
+ importlib-metadata==6.0.0
60
+ importlib-resources==6.0.0
61
+ ipython
62
+ jedi==0.18.2
63
+ Jinja2==3.1.2
64
+ joblib==1.2.0
65
+ json5==0.9.14
66
+ jsonschema==4.17.3
67
+ kiwisolver==1.4.4
68
+ librosa==0.9.1
69
+ linkify-it-py==1.0.3
70
+ llvmlite==0.39.1
71
+ lxml==4.9.3
72
+ Markdown==3.4.3
73
+ markdown-it-py==2.2.0
74
+ MarkupSafe==2.1.1
75
+ matplotlib==3.6.2
76
+ matplotlib-inline==0.1.6
77
+ mdit-py-plugins==0.3.3
78
+ mdurl==0.1.1
79
+ mega.py==1.0.8
80
+ mpmath==1.2.1
81
+ multidict==6.0.3
82
+ networkx==2.8.8
83
+ noisereduce==2.0.1
84
+ numba==0.56.4
85
+ numpy==1.23.5
86
+ oauthlib==3.2.2
87
+ omegaconf==2.0.6
88
+ onnx==1.14.0
89
+ onnxconverter-common==1.13.0
90
+ onnxruntime-gpu==1.14.1
91
+ orjson==3.8.3
92
+ packaging==22.0
93
+ pandas==1.5.2
94
+ parso==0.8.3
95
+ pathlib==1.0.1
96
+ pickleshare==0.7.5
97
+ Pillow==9.3.0
98
+ pooch==1.6.0
99
+ portalocker==2.6.0
100
+ praat-parselmouth==0.4.2
101
+ praatio==6.0.1
102
+ prompt-toolkit==3.0.39
103
+ protobuf==4.22.3
104
+ pure-eval==0.2.2
105
+ pyasn1==0.4.8
106
+ pyasn1-modules==0.2.8
107
+ pycparser==2.21
108
+ pycryptodome==3.16.0
109
+ pydantic==1.10.2
110
+ pydub==0.25.1
111
+ Pygments==2.15.1
112
+ pyparsing==3.0.9
113
+ pyreadline3==3.4.1
114
+ pyrsistent==0.19.2
115
+ PySimpleGUI==4.60.4
116
+ python-dateutil==2.8.2
117
+ python-dotenv==1.0.0
118
+ python-multipart==0.0.5
119
+ pytz==2022.6
120
+ pyworld==0.3.2
121
+ PyYAML==6.0
122
+ regex==2022.10.31
123
+ requests==2.28.1
124
+ requests-oauthlib==1.3.1
125
+ resampy==0.4.2
126
+ rfc3986==1.5.0
127
+ rich==13.3.5
128
+ rsa==4.9
129
+ sacrebleu==2.3.1
130
+ scikit-learn==1.2.0
131
+ scipy==1.9.3
132
+ semantic-version==2.10.0
133
+ six==1.16.0
134
+ skl2onnx==1.14.1
135
+ sniffio==1.3.0
136
+ sounddevice==0.4.6
137
+ soundfile==0.11.0
138
+ soupsieve==2.4.1
139
+ stack-data==0.6.2
140
+ starlette==0.22.0
141
+ sympy==1.11.1
142
+ tabulate==0.9.0
143
+ tenacity==5.1.5
144
+ tensorboard==2.13.0
145
+ tensorboard-data-server==0.7.0
146
+ tensorboard-plugin-wit==1.8.1
147
+ tensorboardX==2.6.1
148
+ threadpoolctl==3.1.0
149
+ toolz==0.12.0
150
+ torch==2.0.0
151
+ torchaudio==2.0.1
152
+ torchcrepe==0.0.19
153
+ torchgen==0.0.1
154
+ tornado==6.3.2
155
+ tqdm==4.64.1
156
+ traitlets==5.9.0
157
+ typing_extensions==4.4.0
158
+ uc-micro-py==1.0.1
159
+ urllib3==1.26.13
160
+ uvicorn==0.20.0
161
+ wcwidth==0.2.6
162
+ websockets==10.4
163
+ Werkzeug==2.2.3
164
+ wget==3.2
165
+ yarl==1.8.2
166
+ zipp==3.15.0
167
+ firebase
168
+ firebase_admin
169
+ gdown
170
+ validators
rmvpe.py ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys, torch, numpy as np, traceback, pdb
2
+ import torch.nn as nn
3
+ from time import time as ttime
4
+ import torch.nn.functional as F
5
+
6
+
7
class BiGRU(nn.Module):
    """Bidirectional, batch-first GRU that exposes only the output sequence.

    The final hidden state returned by ``nn.GRU`` is discarded.
    """

    def __init__(self, input_features, hidden_features, num_layers):
        super().__init__()
        gru_options = dict(
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        self.gru = nn.GRU(input_features, hidden_features, **gru_options)

    def forward(self, x):
        """Run the GRU over ``x`` and return its per-step outputs."""
        outputs, _final_hidden = self.gru(x)
        return outputs
20
+
21
+
22
class ConvBlockRes(nn.Module):
    """Residual block: two 3x3 conv + BatchNorm + ReLU stages plus a skip path.

    When the channel count changes, the skip path is a 1x1 convolution
    (``self.shortcut``); otherwise the input is added back unchanged.
    """

    def __init__(self, in_channels, out_channels, momentum=0.01):
        super().__init__()

        def conv_bn_relu(stage_in):
            # One 3x3, stride-1, padding-1 stage; spatial size is unchanged.
            return [
                nn.Conv2d(
                    in_channels=stage_in,
                    out_channels=out_channels,
                    kernel_size=(3, 3),
                    stride=(1, 1),
                    padding=(1, 1),
                    bias=False,
                ),
                nn.BatchNorm2d(out_channels, momentum=momentum),
                nn.ReLU(),
            ]

        # Sub-module order (and therefore state_dict keys conv.0 .. conv.5)
        # must stay as-is so existing checkpoints keep loading.
        self.conv = nn.Sequential(
            *conv_bn_relu(in_channels),
            *conv_bn_relu(out_channels),
        )
        self.is_shortcut = in_channels != out_channels
        if self.is_shortcut:
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))

    def forward(self, x):
        """Return ``conv(x)`` plus the (optionally projected) input."""
        transformed = self.conv(x)
        if not self.is_shortcut:
            return transformed + x
        return transformed + self.shortcut(x)
58
+
59
+
60
class Encoder(nn.Module):
    """Stack of ``ResEncoderBlock`` stages preceded by an input BatchNorm.

    Each stage doubles the channel count and halves the tracked ``in_size``.
    ``latent_channels`` records ``[channels, size]`` per stage; ``out_size``
    and ``out_channel`` hold the values *after* the last doubling/halving.
    """

    def __init__(
        self,
        in_channels,
        in_size,
        n_encoders,
        kernel_size,
        n_blocks,
        out_channels=16,
        momentum=0.01,
    ):
        super().__init__()
        self.n_encoders = n_encoders
        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
        self.layers = nn.ModuleList()
        self.latent_channels = []

        stage_in, stage_out, stage_size = in_channels, out_channels, in_size
        for _ in range(self.n_encoders):
            self.layers.append(
                ResEncoderBlock(
                    stage_in, stage_out, kernel_size, n_blocks, momentum=momentum
                )
            )
            self.latent_channels.append([stage_out, stage_size])
            stage_in = stage_out
            stage_out = stage_out * 2
            stage_size = stage_size // 2

        self.out_size = stage_size
        self.out_channel = stage_out

    def forward(self, x):
        """Return the final pooled tensor and the list of pre-pool tensors."""
        x = self.bn(x)
        concat_tensors = []
        for stage in range(self.n_encoders):
            # Each block yields (pre-pool features, pooled features).
            skip, x = self.layers[stage](x)
            concat_tensors.append(skip)
        return x, concat_tensors
96
+
97
+
98
class ResEncoderBlock(nn.Module):
    """``n_blocks`` chained ``ConvBlockRes`` units with optional average pooling.

    With ``kernel_size`` set, ``forward`` returns ``(features, pooled)``;
    with ``kernel_size=None`` it returns the features tensor alone.
    """

    def __init__(
        self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
    ):
        super().__init__()
        self.n_blocks = n_blocks
        # The first unit adapts the channel count; the rest keep it.
        units = [ConvBlockRes(in_channels, out_channels, momentum)]
        for _ in range(n_blocks - 1):
            units.append(ConvBlockRes(out_channels, out_channels, momentum))
        self.conv = nn.ModuleList(units)
        self.kernel_size = kernel_size
        if self.kernel_size is not None:
            self.pool = nn.AvgPool2d(kernel_size=kernel_size)

    def forward(self, x):
        """Apply the conv units; also return the pooled result when pooling."""
        for unit_index in range(self.n_blocks):
            x = self.conv[unit_index](x)
        if self.kernel_size is None:
            return x
        return x, self.pool(x)
119
+
120
+
121
class Intermediate(nn.Module):
    """Bottleneck: ``n_inters`` pooling-free ``ResEncoderBlock`` stages in series."""

    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
        super().__init__()
        self.n_inters = n_inters
        # The first stage changes the channel count; later stages preserve it.
        stages = [ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)]
        for _ in range(self.n_inters - 1):
            stages.append(
                ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
            )
        self.layers = nn.ModuleList(stages)

    def forward(self, x):
        """Feed ``x`` through the first ``n_inters`` stages in order."""
        for stage in range(self.n_inters):
            x = self.layers[stage](x)
        return x
138
+
139
+
140
class ResDecoderBlock(nn.Module):
    """Transposed-conv upsampling followed by skip fusion and residual units.

    ``forward`` upsamples ``x``, concatenates ``concat_tensor`` on the channel
    axis (dim 1) and runs ``n_blocks`` ``ConvBlockRes`` units over the result.
    """

    def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
        super().__init__()
        # Output padding is chosen per stride so the upsampled size lines up
        # with the skip tensor.
        if stride == (1, 2):
            out_padding = (0, 1)
        else:
            out_padding = (1, 1)
        self.n_blocks = n_blocks

        upsample = nn.ConvTranspose2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(3, 3),
            stride=stride,
            padding=(1, 1),
            output_padding=out_padding,
            bias=False,
        )
        self.conv1 = nn.Sequential(
            upsample,
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        )

        # The first unit takes upsampled + skip channels (hence ``* 2``).
        fuse_units = [ConvBlockRes(out_channels * 2, out_channels, momentum)]
        for _ in range(n_blocks - 1):
            fuse_units.append(ConvBlockRes(out_channels, out_channels, momentum))
        self.conv2 = nn.ModuleList(fuse_units)

    def forward(self, x, concat_tensor):
        """Upsample ``x``, merge it with ``concat_tensor`` and refine."""
        merged = torch.cat((self.conv1(x), concat_tensor), dim=1)
        for unit_index in range(self.n_blocks):
            merged = self.conv2[unit_index](merged)
        return merged
169
+
170
+
171
class Decoder(nn.Module):
    """Chain of ``ResDecoderBlock`` stages, each halving the channel count."""

    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
        super().__init__()
        self.layers = nn.ModuleList()
        self.n_decoders = n_decoders
        channels = in_channels
        for _ in range(self.n_decoders):
            halved = channels // 2
            self.layers.append(
                ResDecoderBlock(channels, halved, stride, n_blocks, momentum)
            )
            channels = halved

    def forward(self, x, concat_tensors):
        """Decode ``x``, consuming ``concat_tensors`` from last to first."""
        for depth in range(self.n_decoders):
            skip = concat_tensors[-1 - depth]
            x = self.layers[depth](x, skip)
        return x
187
+
188
+
189
class DeepUnet(nn.Module):
    """U-Net built from ``Encoder`` -> ``Intermediate`` -> ``Decoder``.

    The encoder is created with an ``in_size`` of 128; ``kernel_size`` is used
    both as the encoder pooling kernel and as the decoder stride.
    """

    def __init__(
        self,
        kernel_size,
        n_blocks,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super().__init__()
        self.encoder = Encoder(
            in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
        )
        # ``out_channel`` is already doubled past the last encoder stage, so
        # the bottleneck input is half of it.
        bottleneck_channels = self.encoder.out_channel
        self.intermediate = Intermediate(
            bottleneck_channels // 2,
            bottleneck_channels,
            inter_layers,
            n_blocks,
        )
        self.decoder = Decoder(
            bottleneck_channels, en_de_layers, kernel_size, n_blocks
        )

    def forward(self, x):
        """Encode, pass through the bottleneck, then decode with the skips."""
        encoded, skips = self.encoder(x)
        bottleneck = self.intermediate(encoded)
        return self.decoder(bottleneck, skips)
218
+
219
+
220
class E2E(nn.Module):
    """End-to-end pitch salience network: ``DeepUnet`` -> 3-channel conv -> head.

    The head is ``BiGRU -> Linear -> Dropout -> Sigmoid`` when ``n_gru`` is
    truthy, otherwise ``Linear -> Dropout -> Sigmoid``. Both variants emit
    360 values per frame from 3 * 128 flattened features.
    """

    def __init__(
        self,
        n_blocks,
        n_gru,
        kernel_size,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super().__init__()
        # The original no-GRU branch referenced undefined module constants
        # N_MELS / N_CLASS and raised NameError. These are the same values the
        # GRU branch already hard-coded (3 * 128 inputs, 360 outputs).
        n_mels = 128
        n_class = 360

        self.unet = DeepUnet(
            kernel_size,
            n_blocks,
            en_de_layers,
            inter_layers,
            in_channels,
            en_out_channels,
        )
        self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
        if n_gru:
            self.fc = nn.Sequential(
                BiGRU(3 * n_mels, 256, n_gru),
                nn.Linear(512, n_class),  # 512 = 2 directions * 256 hidden
                nn.Dropout(0.25),
                nn.Sigmoid(),
            )
        else:
            self.fc = nn.Sequential(
                nn.Linear(3 * n_mels, n_class), nn.Dropout(0.25), nn.Sigmoid()
            )

    def forward(self, mel):
        """Map a mel spectrogram to per-frame salience.

        The last two axes of ``mel`` are swapped and a channel axis is inserted
        at dim 1 before the U-Net.
        NOTE(review): presumably ``mel`` is (batch, n_mels, frames) - confirm
        against callers.
        """
        mel = mel.transpose(-1, -2).unsqueeze(1)
        x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
        x = self.fc(x)
        return x
258
+
259
+
260
+ from librosa.filters import mel
261
+
262
+
263
class MelSpectrogram(torch.nn.Module):
    """Log-mel spectrogram extractor with optional key shift and speed change.

    The mel filterbank is built once with ``librosa.filters.mel`` (HTK scale)
    and stored as the ``mel_basis`` buffer. Hann windows are cached per
    ``(keyshift, device)`` in the plain dict ``self.hann_window``.
    """

    def __init__(
        self,
        is_half,
        n_mel_channels,
        sampling_rate,
        win_length,
        hop_length,
        n_fft=None,
        mel_fmin=0,
        mel_fmax=None,
        clamp=1e-5,
    ):
        super().__init__()
        # The FFT size defaults to the window length.
        if n_fft is None:
            n_fft = win_length
        self.hann_window = {}
        filterbank = mel(
            sr=sampling_rate,
            n_fft=n_fft,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
            htk=True,
        )
        self.register_buffer("mel_basis", torch.from_numpy(filterbank).float())
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels
        self.clamp = clamp
        self.is_half = is_half

    def forward(self, audio, keyshift=0, speed=1, center=True):
        """Return ``log(clamp(mel_basis @ |STFT(audio)|, min=self.clamp))``.

        ``keyshift`` (in semitones) scales the FFT and window sizes by
        ``2 ** (keyshift / 12)``; ``speed`` scales the hop length.
        """
        pitch_factor = 2 ** (keyshift / 12)
        shifted_n_fft = int(np.round(self.n_fft * pitch_factor))
        shifted_win = int(np.round(self.win_length * pitch_factor))
        scaled_hop = int(np.round(self.hop_length * speed))

        cache_key = f"{keyshift}_{audio.device}"
        if cache_key not in self.hann_window:
            self.hann_window[cache_key] = torch.hann_window(shifted_win).to(
                audio.device
            )

        spectrum = torch.stft(
            audio,
            n_fft=shifted_n_fft,
            hop_length=scaled_hop,
            win_length=shifted_win,
            window=self.hann_window[cache_key],
            center=center,
            return_complex=True,
        )
        magnitude = torch.sqrt(spectrum.real.pow(2) + spectrum.imag.pow(2))

        if keyshift != 0:
            # Bring the frequency axis back to the bin count mel_basis expects
            # (zero-pad if too short, crop if too long), then rescale for the
            # changed window length.
            expected_bins = self.n_fft // 2 + 1
            actual_bins = magnitude.size(1)
            if actual_bins < expected_bins:
                magnitude = F.pad(magnitude, (0, 0, 0, expected_bins - actual_bins))
            magnitude = magnitude[:, :expected_bins, :] * self.win_length / shifted_win

        mel_output = torch.matmul(self.mel_basis, magnitude)
        if self.is_half == True:  # noqa: E712 - kept as an explicit comparison
            mel_output = mel_output.half()
        return torch.log(torch.clamp(mel_output, min=self.clamp))
328
+
329
+
330
class RMVPE:
    """F0 (pitch) estimator that wraps an ``E2E`` model and a mel front end.

    Pipeline: waveform -> ``MelSpectrogram`` -> ``E2E`` salience (360 bins per
    frame) -> salience-weighted local average in cents -> frequency.
    """

    def __init__(self, model_path, is_half, device=None):
        """Load the checkpoint at ``model_path`` and move everything to ``device``.

        ``device`` defaults to ``"cuda"`` when available, else ``"cpu"``.
        """
        model = E2E(4, 1, (2, 2))
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        ckpt = torch.load(model_path, map_location="cpu")
        model.load_state_dict(ckpt)
        model.eval()
        if is_half == True:  # noqa: E712 - kept as an explicit comparison
            model = model.half()
        self.resample_kernel = {}
        self.is_half = is_half
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.mel_extractor = MelSpectrogram(
            is_half, 128, 16000, 1024, 160, None, 30, 8000
        ).to(device)
        self.model = model.to(device)
        # One cents value per salience bin (20-cent steps), zero-padded by 4 on
        # each side so a 9-bin window around any peak stays in range -> 368.
        cents_mapping = 20 * np.arange(360) + 1997.3794084376191
        self.cents_mapping = np.pad(cents_mapping, (4, 4))

    def mel2hidden(self, mel):
        """Run the model on ``mel`` and return salience for the original frames.

        The last axis is reflect-padded up to a multiple of 32 before the
        model and the padding is cropped from the output.
        NOTE(review): reflect padding needs the pad to be smaller than the
        frame count, so very short inputs may be rejected - confirm.
        """
        with torch.no_grad():
            n_frames = mel.shape[-1]
            mel = F.pad(
                mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
            )
            hidden = self.model(mel)
            return hidden[:, :n_frames]

    def decode(self, hidden, thred=0.03):
        """Convert salience to f0; frames at or below ``thred`` become 0."""
        cents_pred = self.to_local_average_cents(hidden, thred=thred)
        f0 = 10 * (2 ** (cents_pred / 1200))
        # cents == 0 marks "unvoiced" and maps to exactly 10; zero those frames.
        f0[f0 == 10] = 0
        return f0

    def infer_from_audio(self, audio, thred=0.03):
        """Estimate f0 for a NumPy waveform; returns one value per mel frame.

        NOTE(review): the mel extractor is configured for 16000 Hz, so
        ``audio`` is presumably mono 16 kHz - confirm against callers.
        """
        audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
        mel = self.mel_extractor(audio, center=True)
        hidden = self.mel2hidden(mel)
        hidden = hidden.squeeze(0).cpu().numpy()
        if self.is_half == True:  # noqa: E712 - kept as an explicit comparison
            hidden = hidden.astype("float32")
        return self.decode(hidden, thred=thred)

    def to_local_average_cents(self, salience, thred=0.05):
        """Return the salience-weighted mean cents around each frame's peak.

        ``salience`` is indexed as (frames, bins). For every frame a 9-bin
        window centred on the arg-max bin is averaged; frames whose peak
        salience is ``<= thred`` are set to 0.

        The window gather is vectorised: the original sliced every row on each
        loop iteration (quadratic in the frame count) and raised on an input
        with zero frames.
        """
        center = np.argmax(salience, axis=1) + 4  # peak index in padded coords
        salience = np.pad(salience, ((0, 0), (4, 4)))  # (frames, bins + 8)
        window = center[:, None] + np.arange(-4, 5)  # (frames, 9)
        todo_salience = np.take_along_axis(salience, window, axis=1)
        todo_cents_mapping = self.cents_mapping[window]
        product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
        weight_sum = np.sum(todo_salience, 1)
        # An all-zero window yields 0/0 = NaN; silence the warning because such
        # frames are overwritten by the threshold mask below.
        with np.errstate(divide="ignore", invalid="ignore"):
            devided = product_sum / weight_sum
        maxx = np.max(salience, axis=1)
        devided[maxx <= thred] = 0
        return devided
412
+
413
+
414
+ # if __name__ == '__main__':
415
+ # audio, sampling_rate = sf.read("卢本伟语录~1.wav")
416
+ # if len(audio.shape) > 1:
417
+ # audio = librosa.to_mono(audio.transpose(1, 0))
418
+ # audio_bak = audio.copy()
419
+ # if sampling_rate != 16000:
420
+ # audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
421
+ # model_path = "/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/test-RMVPE/weights/rmvpe_llc_half.pt"
422
+ # thred = 0.03 # 0.01
423
+ # device = 'cuda' if torch.cuda.is_available() else 'cpu'
424
+ # rmvpe = RMVPE(model_path,is_half=False, device=device)
425
+ # t0=ttime()
426
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
427
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
428
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
429
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
430
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
431
+ # t1=ttime()
432
+ # print(f0.shape,t1-t0)
utils/__init__.py ADDED
File without changes
utils/dependencies.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import csv
3
+ import shutil
4
+ import tarfile
5
+ import subprocess
6
+ from pathlib import Path
7
+
8
+
9
def install_packages_but_jank_af():
    """Install the required apt packages one by one, then upgrade pip packages.

    Shells out to ``apt-get`` and ``pip``; a non-zero exit status from either
    raises ``subprocess.CalledProcessError``.
    """
    apt_packages = ('build-essential', 'python3-dev', 'ffmpeg', 'aria2')
    pip_packages = [
        'pip', 'setuptools', 'wheel', 'httpx==0.23.0', 'faiss-gpu', 'fairseq',
        'gradio==3.34.0', 'ffmpeg', 'ffmpeg-python', 'praat-parselmouth',
        'pyworld', 'numpy==1.23.5', 'numba==0.56.4', 'librosa==0.9.2',
        'mega.py', 'gdown', 'onnxruntime', 'pyngrok==4.1.12', 'gTTS',
        'elevenlabs', 'wget', 'tensorboardX', 'unidecode', 'huggingface-hub',
        'yt-dlp', 'pedalboard', 'pathvalidate', 'nltk', 'edge-tts',
        'git+https://github.com/suno-ai/bark.git', 'python-dotenv', 'av',
    ]

    print("Updating and installing system packages...")
    for apt_package in apt_packages:
        print(f"Installing {apt_package}...")
        apt_command = ['apt-get', 'install', '-qq', '-y', apt_package]
        subprocess.check_call(apt_command)

    print("Updating and installing pip packages...")
    # A single pip invocation so the resolver sees every pin at once.
    pip_command = ['pip', 'install', '--upgrade', *pip_packages]
    subprocess.check_call(pip_command)

    print('Packages up to date.')
26
+
27
+
28
def setup_environment(ForceUpdateDependencies, ForceTemporaryStorage):
    """Restore or build the cached dependency archive on a Colab runtime.

    If ``CachedRVC.tar.gz`` is available (on Google Drive, or downloaded from
    the GitHub backup), it is extracted over ``/``. Otherwise the packages are
    installed, every file under ``/usr/`` that was added or changed is packed
    into a new archive, and - unless ``ForceTemporaryStorage`` is set - the
    archive is copied to Google Drive.

    Args:
        ForceUpdateDependencies: when truthy, run the package installation
            even after a cached archive was extracted.
        ForceTemporaryStorage: when truthy, skip Google Drive entirely and
            keep the archive under ``/content``.
    """
    drive_cache_dir = '/content/drive/MyDrive/RVC_Cached'
    content_file_path = '/content/CachedRVC.tar.gz'
    extract_path = '/'

    # Mounting Google Drive
    if not ForceTemporaryStorage:
        from google.colab import drive

        if not os.path.exists('/content/drive'):
            drive.mount('/content/drive')
        else:
            print('Drive is already mounted. Proceeding...')

    # Same package list and commands as the module-level installer; reuse it
    # instead of keeping a second copy in sync.
    install_packages = install_packages_but_jank_af

    def scan_and_write(base_path, output_file):
        """Write ``path, mtime`` CSV rows for every file under ``base_path``."""
        with open(output_file, 'w', newline='') as f:
            writer = csv.writer(f)
            for dirpath, dirs, files in os.walk(base_path):
                for filename in files:
                    fname = os.path.join(dirpath, filename)
                    try:
                        mtime = os.path.getmtime(fname)
                        writer.writerow([fname, mtime])
                    except Exception as e:
                        # Best effort: files can vanish or be unreadable.
                        print(
                            f'Skipping irrelevant nonexistent file {fname}: {str(e)}')
        print(f'Finished recording filesystem timestamps to {output_file}.')

    def read_index(index_file):
        """Load a ``scan_and_write`` CSV as ``{path: mtime_string}``."""
        with open(index_file, 'r') as f:
            # Skip blank or short rows rather than failing with IndexError.
            return {row[0]: row[1] for row in csv.reader(f) if len(row) >= 2}

    def compare_files(old_file, new_file):
        """Return the paths added or modified between two index files."""
        old_files = read_index(old_file)
        new_files = read_index(new_file)

        removed_files = old_files.keys() - new_files.keys()
        added_files = new_files.keys() - old_files.keys()
        unchanged_files = old_files.keys() & new_files.keys()

        changed_files = {
            f for f in unchanged_files if old_files[f] != new_files[f]}

        for file in removed_files:
            print(f'File has been removed: {file}')

        for file in changed_files:
            print(f'File has been updated: {file}')

        return list(added_files) + list(changed_files)

    # Check if CachedRVC.tar.gz exists
    if ForceTemporaryStorage:
        file_path = content_file_path
    else:
        file_path = os.path.join(drive_cache_dir, 'CachedRVC.tar.gz')

    if not os.path.exists(file_path):
        folder_path = os.path.dirname(file_path)
        os.makedirs(folder_path, exist_ok=True)
        print('No cached dependency install found. Attempting to download GitHub backup..')

        download_url = "https://github.com/kalomaze/QuickMangioFixes/releases/download/release3/CachedRVC.tar.gz"
        try:
            # check=True is required: without it a failing wget raises nothing,
            # the empty file left by ``-O`` is mistaken for a valid cache, and
            # tarfile.open crashes on it further down.
            subprocess.run(["wget", "-O", file_path, download_url], check=True)
            print('Download completed successfully!')
        except (subprocess.CalledProcessError, OSError) as e:
            print('Download failed:', str(e))

            # Delete the failed download file
            if os.path.exists(file_path):
                os.remove(file_path)
            print('Failed download file deleted. Continuing manual backup..')

    if Path(file_path).exists():
        if ForceTemporaryStorage:
            print('Finished downloading CachedRVC.tar.gz.')
        else:
            print(
                'CachedRVC.tar.gz found on Google Drive. Proceeding to copy and extract...')
            # In temporary-storage mode the archive is already at
            # content_file_path, so copying is only needed for Drive.
            shutil.copy(file_path, content_file_path)

        print('Beginning backup copy operation...')

        # NOTE(security): the archive is extracted over '/', so it is trusted
        # completely. Only use archives from a source you control.
        with tarfile.open(content_file_path, 'r:gz') as tar:
            for member in tar.getmembers():
                try:
                    tar.extract(member, extract_path)
                except Exception as e:
                    # Deliberately best effort: one bad member should not abort
                    # the restore; a forced reinstall repairs the gap.
                    print(
                        'Failed to extract a file (this isn\'t normal)... forcing an update to compensate')
                    print(f'  {member.name}: {e}')
                    ForceUpdateDependencies = True
        print(
            f'Extraction of {content_file_path} to {extract_path} completed.')

        if ForceUpdateDependencies:
            install_packages()
    else:
        print('CachedRVC.tar.gz not found. Proceeding to create an index of all current files...')
        scan_and_write('/usr/', '/content/usr_files.csv')

        install_packages()

        scan_and_write('/usr/', '/content/usr_files_new.csv')
        changed_files = compare_files(
            '/content/usr_files.csv', '/content/usr_files_new.csv')

        with tarfile.open(content_file_path, 'w:gz') as new_tar:
            for file in changed_files:
                new_tar.add(file)
                print(f'Added to tar: {file}')

        # Without a mounted Drive, creating /content/drive/... would fabricate
        # a fake mount point that a later run reports as "already mounted".
        if not ForceTemporaryStorage:
            os.makedirs(drive_cache_dir, exist_ok=True)
            shutil.copy(content_file_path,
                        os.path.join(drive_cache_dir, 'CachedRVC.tar.gz'))
            print('Updated CachedRVC.tar.gz copied to Google Drive.')
        print('Dependencies fully up to date; future runs should be faster.')
utils/model.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from mega import Mega
4
+ import gdown
5
+ import re
6
+ import wget
7
+ import sys
8
+ import uuid
9
+ import zipfile
10
+
11
+
12
class InvalidDriveId(Exception):
    """Exception carrying a ``message`` attribute (default: "Error de la url").

    NOTE(review): the name suggests an unusable Google Drive id/URL; nothing
    in the visible code raises it - confirm the intended use.
    """

    def __init__(self, message="Error de la url"):
        super().__init__(message)
        self.message = message
16
+
17
+
18
def model_downloader(url, zip_path, dest_path):
    """Download a zipped model from Google Drive, Mega or a plain URL and unzip it.

    The archive is saved in ``zip_path`` under a random ``<uuid>.zip`` name,
    extracted into ``dest_path/<uuid>`` and then deleted.

    Args:
        url: source URL; the host decides which downloader is used.
        zip_path: folder for the temporary archive (created if missing).
        dest_path: folder under which the model directory is created.

    Returns:
        The model directory name (the uuid), or ``None`` when the download or
        the extraction failed.

    Note:
        A pixeldrain URL terminates the process via ``sys.exit()``.
    """

    def new_zip_name():
        return f"{uuid.uuid4()}.zip"

    def drive_download(url, dest_folder):
        """Fetch from Google Drive; return the archive basename or None."""
        print("Descargando desde drive...")
        try:
            saved_path = gdown.download(
                url, os.path.join(dest_folder, new_zip_name()), fuzzy=True)
        except Exception:
            print("El intento de descargar con drive no funcionó")
            return None
        # gdown returns the full output path; the other downloaders return a
        # bare file name, and the caller joins the result onto zip_path again.
        return os.path.basename(saved_path) if saved_path else None

    def mega_download(url, dest_folder):
        """Fetch from Mega; return the archive basename or None."""
        try:
            # Only used to recognise a supported Mega link shape.
            if "#!" in url:
                file_id = url.split("#!")[1].split("!")[0]
            elif "file/" in url:
                file_id = url.split("file/")[1].split("/")[0]
            else:
                file_id = None

            print("Descargando desde mega...")
            if not file_id:
                return None

            session = Mega().login()
            saved_path = session.download_url(
                url, dest_path=dest_folder, dest_filename=new_zip_name())
            return os.path.basename(saved_path)
        except Exception as e:
            print("Ocurrio un error**")
            print(e)
            return None

    def download(url, dest_folder):
        """Fetch from a generic URL; return the archive basename or None."""
        try:
            print("Descargando desde url generica...")
            saved_path = wget.download(
                url=url, out=os.path.join(dest_folder, new_zip_name()))
            return os.path.basename(saved_path)
        except Exception as e:
            print(f"Error al descargar el archivo: {str(e)}")
            return None

    # makedirs also handles nested folders and an already-existing folder.
    os.makedirs(zip_path, exist_ok=True)

    if not url:
        return None

    if 'drive.google.com' in url:
        filename = drive_download(url, zip_path)
    elif 'mega.nz' in url:
        filename = mega_download(url, zip_path)
    elif 'pixeldrain' in url:
        print("No se puede descargar de pixeldrain")
        sys.exit()
    else:
        filename = download(url, zip_path)

    if not filename:
        return None

    print(f"Descomprimiendo {filename}...")
    modelname = str(filename).replace(".zip", "")
    zip_file_path = os.path.join(zip_path, filename)
    target_dir = os.path.join(dest_path, modelname)

    extracted = False
    try:
        try:
            shutil.unpack_archive(zip_file_path, target_dir)
            extracted = True
        except Exception:
            # Fallback: open the file as a zip directly. Extract into the same
            # model folder so the returned name always points at the files.
            try:
                with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                    zip_ref.extractall(target_dir)
                extracted = True
            except zipfile.BadZipFile as e:
                print(f"Error: El archivo ZIP no es válido - {e}")
            except Exception as e:
                print(f"Error inesperado: {e}")
    finally:
        # Always remove the temporary archive, whatever happened above.
        if os.path.exists(zip_file_path):
            os.remove(zip_file_path)

    return modelname if extracted else None
102
+
103
+
104
def get_models(weight_path):
    """Return the names of the sub-directories directly inside ``weight_path``."""

    def is_model_dir(entry):
        return os.path.isdir(os.path.join(weight_path, entry))

    # List every entry in the folder, then keep only the directories.
    return list(filter(is_model_dir, os.listdir(weight_path)))
109
+
110
+
111
def get_model(weight_path, modelname):
    """Locate the ``.index`` and ``.pth`` files of a model folder.

    Walks ``weight_path/modelname`` recursively. The result may contain:

    * ``'index'`` - path relative to the current working directory
    * ``'pth'`` - path relative to ``weight_path``

    If several files match, the last one visited wins. Missing kinds are
    simply absent from the returned dict.
    """
    resources = {}
    model_root = os.path.join(weight_path, modelname)
    for folder, _subdirs, filenames in os.walk(model_root):
        for filename in filenames:
            full_path = os.path.join(folder, filename)
            if filename.endswith('.index'):
                resources['index'] = os.path.relpath(full_path)
            if filename.endswith('.pth'):
                resources['pth'] = os.path.relpath(full_path, start=weight_path)
    return resources
120
+
121
+
122
def get_audios(audios_path):
    """Return the names of the ``.mp3`` / ``.wav`` files directly in ``audios_path``.

    Directories are skipped even if their name ends with an audio suffix.
    """
    audio_suffixes = ('.mp3', '.wav')
    audio_files = []
    for entry in os.listdir(audios_path):
        full_path = os.path.join(audios_path, entry)
        # Keep regular entries only; sub-directories are ignored.
        if os.path.isdir(full_path):
            continue
        if full_path.endswith(audio_suffixes):
            audio_files.append(entry)
    return audio_files
vc_infer_pipeline.py ADDED
@@ -0,0 +1,656 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np, parselmouth, torch, pdb, sys, os
2
+ from time import time as ttime
3
+ import torch.nn.functional as F
4
+ import torchcrepe # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
5
+ from torch import Tensor
6
+ import scipy.signal as signal
7
+ import pyworld, os, traceback, faiss, librosa, torchcrepe
8
+ from scipy import signal
9
+ from functools import lru_cache
10
+
11
# Make modules in the current working directory importable.
now_dir = os.getcwd()
sys.path.append(now_dir)

# 5th-order Butterworth high-pass filter coefficients: 48 Hz cutoff at a
# 16 kHz sample rate.
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)

# Audio path -> waveform lookup read by cache_harvest_f0, whose lru_cache
# arguments must be hashable and therefore take the path, not the array.
input_audio_path2wav = {}
17
+
18
+
19
@lru_cache
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
    """Memoised Harvest + StoneMask f0 extraction for a registered waveform.

    The waveform is looked up in ``input_audio_path2wav`` by
    ``input_audio_path``; results are cached on the full argument tuple, so a
    later change of the waveform stored under the same path is not picked up.
    """
    waveform = input_audio_path2wav[input_audio_path]
    harvest_options = dict(
        fs=fs,
        f0_ceil=f0max,
        f0_floor=f0min,
        frame_period=frame_period,
    )
    coarse_f0, time_axis = pyworld.harvest(waveform, **harvest_options)
    # StoneMask refines the coarse Harvest estimate.
    return pyworld.stonemask(waveform, coarse_f0, time_axis, fs)
31
+
32
+
33
def change_rms(data1, sr1, data2, sr2, rate):
    """Blend the loudness envelope of ``data1`` into ``data2`` (in place).

    ``data1`` is the input audio and ``data2`` the output audio; ``rate`` is
    the share of ``data2``'s own envelope that is kept. ``data2`` is scaled by
    ``rms1 ** (1 - rate) * rms2 ** (rate - 1)`` and returned; the array passed
    in is modified.
    """
    n_out = data2.shape[0]

    def envelope(samples, sample_rate):
        # One RMS value per half second, linearly stretched to len(data2).
        half_second = sample_rate // 2
        rms = librosa.feature.rms(
            y=samples, frame_length=half_second * 2, hop_length=half_second
        )
        rms = torch.from_numpy(rms)
        return F.interpolate(rms.unsqueeze(0), size=n_out, mode="linear").squeeze()

    rms1 = envelope(data1, sr1)
    rms2 = envelope(data2, sr2)
    # Floor the output envelope so the negative power below stays finite.
    rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
    gain = torch.pow(rms1, torch.tensor(1 - rate)) * torch.pow(
        rms2, torch.tensor(rate - 1)
    )
    data2 *= gain.numpy()
    return data2
53
+
54
+
55
+ class VC(object):
56
+ def __init__(self, tgt_sr, config):
57
+ self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
58
+ config.x_pad,
59
+ config.x_query,
60
+ config.x_center,
61
+ config.x_max,
62
+ config.is_half,
63
+ )
64
+ self.sr = 16000 # hubert输入采样率
65
+ self.window = 160 # 每帧点数
66
+ self.t_pad = self.sr * self.x_pad # 每条前后pad时间
67
+ self.t_pad_tgt = tgt_sr * self.x_pad
68
+ self.t_pad2 = self.t_pad * 2
69
+ self.t_query = self.sr * self.x_query # 查询切点前后查询时间
70
+ self.t_center = self.sr * self.x_center # 查询切点位置
71
+ self.t_max = self.sr * self.x_max # 免查询时长阈值
72
+ self.device = config.device
73
+
74
# Fork feature: pick the torch device used by f0 algorithms that need one.
def get_optimal_torch_device(self, index: int = 0) -> torch.device:
    """Return a CUDA device if available, else MPS if available, else CPU.

    ``index`` is wrapped modulo the number of CUDA devices, so any
    non-negative value selects a valid GPU.
    """
    if torch.cuda.is_available():
        cuda_slot = index % torch.cuda.device_count()
        return torch.device(f"cuda:{cuda_slot}")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    # TODO: "xla" devices could be handled here (needs torch_xla).
    return torch.device("cpu")
86
+
87
# Fork feature: compute f0 with the crepe method.
def get_f0_crepe_computation(
    self,
    x,
    f0_min,
    f0_max,
    p_len,
    hop_length=160,  # 512 before; smaller hops track pitch more closely but take longer
    model="full",  # "tiny" (crepe-tiny) or "full" (crepe)
):
    """Estimate f0 with torchcrepe and resample it to ``p_len`` frames.

    ``x`` is copied to float32 and normalised by its 99.9th-percentile
    absolute value before prediction. If ``p_len`` is falsy it defaults to
    ``len(x) // hop_length``. Pitch values below 0.001 are treated as
    missing during interpolation and come out as 0.
    """
    # float32 is required: a double input triggers an F.conv2d exception.
    x = x.astype(np.float32)
    x /= np.quantile(np.abs(x), 0.999)

    torch_device = self.get_optimal_torch_device()
    audio = torch.from_numpy(x).to(torch_device, copy=True)
    audio = torch.unsqueeze(audio, dim=0)
    if audio.ndim == 2 and audio.shape[0] > 1:
        # Down-mix multi-row input to a single row.
        audio = torch.mean(audio, dim=0, keepdim=True).detach()
    audio = audio.detach()

    print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
    pitch = torchcrepe.predict(
        audio,
        self.sr,
        hop_length,
        f0_min,
        f0_max,
        model,
        batch_size=hop_length * 2,
        device=torch_device,
        pad=True,
    )

    p_len = p_len or x.shape[0] // hop_length

    # Resize the pitch track to p_len frames for the final f0.
    source = np.array(pitch.squeeze(0).cpu().float().numpy())
    source[source < 0.001] = np.nan
    n_source = len(source)
    query_points = np.arange(0, n_source * p_len, n_source) / p_len
    target = np.interp(query_points, np.arange(0, n_source), source)
    return np.nan_to_num(target)
130
+
131
def get_f0_official_crepe_computation(
    self,
    x,
    f0_min,
    f0_max,
    model="full",
):
    """Estimate f0 with torchcrepe on ``self.device`` using periodicity gating.

    Periodicity is median-filtered and pitch mean-filtered (both with a
    window of 3); frames whose periodicity is below 0.1 are set to 0.
    Returns the first batch row as a NumPy array.
    """
    # Pick a batch size that doesn't cause memory errors on your GPU.
    batch_size = 512
    audio = torch.tensor(np.copy(x))[None].float()
    pitch, periodicity = torchcrepe.predict(
        audio,
        self.sr,
        self.window,
        f0_min,
        f0_max,
        model,
        batch_size=batch_size,
        device=self.device,
        return_periodicity=True,
    )
    periodicity = torchcrepe.filter.median(periodicity, 3)
    pitch = torchcrepe.filter.mean(pitch, 3)
    pitch[periodicity < 0.1] = 0
    return pitch[0].cpu().numpy()
158
+
159
+ # Fork Feature: Compute pYIN f0 method
160
def get_f0_pyin_computation(self, x, f0_min, f0_max):
    """Estimate f0 for ``x`` with librosa's pYIN.

    The previous implementation ignored ``x`` and analysed a hard-coded WAV
    file instead; this version analyses the audio it is given.

    Args:
        x: Audio samples as a NumPy array, sampled at ``self.sr``.
        f0_min: Minimum frequency searched by pYIN.
        f0_max: Maximum frequency searched by pYIN.

    Returns:
        A 1-D NumPy array of f0 values with the first frame dropped and
        unvoiced frames (NaN in pYIN's output) set to 0, matching the
        "0 means unvoiced" convention of the other estimators in this file.
    """
    samples = np.asarray(x, dtype=np.float32)
    # Use the pipeline's analysis hop so the frame rate matches the other
    # estimators rather than pYIN's default hop.
    f0, _, _ = librosa.pyin(
        samples,
        sr=self.sr,
        fmin=f0_min,
        fmax=f0_max,
        hop_length=self.window,
    )
    f0 = f0[1:]  # Get rid of extra first frame
    return np.nan_to_num(f0)
165
+
166
+ # Fork Feature: Acquire median hybrid f0 estimation calculation
167
def get_f0_hybrid_computation(
    self,
    methods_str,
    input_audio_path,
    x,
    f0_min,
    f0_max,
    p_len,
    filter_radius,
    crepe_hop_length,
    time_step,
):
    """Combine several f0 estimators by taking their per-frame median.

    Args:
        methods_str: Spec such as ``"hybrid[pm+harvest+crepe]"``; everything
            after ``"hybrid"`` is split on ``"+"`` once brackets are removed.
        input_audio_path: Key handed to ``cache_harvest_f0`` for "harvest".
        x: Audio samples as a NumPy array (normalised on a float32 copy).
        f0_min: Lower pitch bound given to every estimator.
        f0_max: Upper pitch bound given to every estimator.
        p_len: Target frame count used by "pm" and the mangio-crepe variants.
        filter_radius: "harvest" output is median-filtered when this is > 2.
        crepe_hop_length: Hop length for the mangio-crepe variants.
        time_step: Analysis step in milliseconds for "pm".

    Returns:
        The single estimate when one method is requested, otherwise the
        NaN-ignoring median across all estimates.

    Raises:
        ValueError: If an unsupported method name is requested. Previously a
            ``None`` was pushed on the stack and failed later with an
            unrelated ``TypeError``.
    """
    # "hybrid[pm+harvest]" -> ["pm", "harvest"]
    spec = methods_str.split("hybrid")[1]
    spec = spec.replace("[", "").replace("]", "")
    methods = [name.strip() for name in spec.split("+")]
    f0_computation_stack = []

    print("Calculating f0 pitch estimations for methods: %s" % str(methods))
    x = x.astype(np.float32)
    x /= np.quantile(np.abs(x), 0.999)

    # Get f0 calculations for all methods specified.
    for method in methods:
        if method == "pm":
            f0 = (
                parselmouth.Sound(x, self.sr)
                .to_pitch_ac(
                    time_step=time_step / 1000,
                    voicing_threshold=0.6,
                    pitch_floor=f0_min,
                    pitch_ceiling=f0_max,
                )
                .selected_array["frequency"]
            )
            # Centre-pad the track up to p_len frames.
            pad_size = (p_len - len(f0) + 1) // 2
            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
                f0 = np.pad(
                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
                )
        elif method == "crepe":
            f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
            f0 = f0[1:]  # Get rid of extra first frame
        elif method == "crepe-tiny":
            f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
            f0 = f0[1:]  # Get rid of extra first frame
        elif method == "mangio-crepe":
            f0 = self.get_f0_crepe_computation(
                x, f0_min, f0_max, p_len, crepe_hop_length
            )
        elif method == "mangio-crepe-tiny":
            f0 = self.get_f0_crepe_computation(
                x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
            )
        elif method == "harvest":
            f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
            if filter_radius > 2:
                f0 = signal.medfilt(f0, 3)
            f0 = f0[1:]  # Get rid of first frame.
        elif method == "rmvpe":
            if not hasattr(self, "model_rmvpe"):
                from rmvpe import RMVPE

                print("loading rmvpe model")
                self.model_rmvpe = RMVPE(
                    "rmvpe.pt", is_half=self.is_half, device=self.device
                )
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
            f0 = f0[1:]  # Get rid of first frame.
        elif method == "dio":  # Potentially buggy?
            f0, t = pyworld.dio(
                x.astype(np.double),
                fs=self.sr,
                f0_ceil=f0_max,
                f0_floor=f0_min,
                frame_period=10,
            )
            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
            f0 = signal.medfilt(f0, 3)
            f0 = f0[1:]
        else:
            # "pyin" is intentionally not offered here yet.
            raise ValueError(
                "Unsupported f0 method in hybrid spec %r: %r" % (methods_str, method)
            )
        # Push method to the stack
        f0_computation_stack.append(f0)

    print("Calculating hybrid median f0 from the stack of: %s" % str(methods))
    if len(f0_computation_stack) == 1:
        return f0_computation_stack[0]

    # The median needs a rectangular stack; if the estimators disagree on the
    # frame count, trim every track to the shortest one instead of crashing.
    lengths = [len(track) for track in f0_computation_stack]
    common_len = min(lengths)
    if max(lengths) != common_len:
        print("f0 estimates differ in length %s; trimming to %d" % (lengths, common_len))
    trimmed = [track[:common_len] for track in f0_computation_stack]
    return np.nanmedian(trimmed, axis=0)
263
+
264
def get_f0(
    self,
    input_audio_path,
    x,
    p_len,
    f0_up_key,
    f0_method,
    filter_radius,
    crepe_hop_length,
    inp_f0=None,
):
    """Estimate, transpose and quantise the pitch track of ``x``.

    Args:
        input_audio_path: Key under which the audio is registered in
            ``input_audio_path2wav`` for the cached harvest estimator.
        x: Audio samples as a NumPy array, sampled at ``self.sr``.
        p_len: Target number of frames (used by "pm" and mangio-crepe).
        f0_up_key: Transposition in semitones; f0 is scaled by ``2**(key/12)``.
        f0_method: One of "pm", "harvest", "dio", "crepe", "crepe-tiny",
            "mangio-crepe", "mangio-crepe-tiny", "rmvpe", or a spec containing
            "hybrid" such as ``"hybrid[pm+harvest]"``.
        filter_radius: "harvest" output is median-filtered when this is > 2.
        crepe_hop_length: Hop length for the mangio-crepe variants.
        inp_f0: Optional 2-column array (time, f0) that overrides the
            estimated track starting at frame ``self.x_pad * tf0``.

    Returns:
        ``(f0_coarse, f0bak)``: the mel-scaled track quantised to integers in
        1..255, and the float track after transposition and overrides.

    Raises:
        ValueError: If ``f0_method`` is not recognised (previously this
            surfaced as an ``UnboundLocalError``).
    """
    time_step = self.window / self.sr * 1000
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    if f0_method == "pm":
        f0 = (
            parselmouth.Sound(x, self.sr)
            .to_pitch_ac(
                time_step=time_step / 1000,
                voicing_threshold=0.6,
                pitch_floor=f0_min,
                pitch_ceiling=f0_max,
            )
            .selected_array["frequency"]
        )
        # Centre-pad the track up to p_len frames.
        pad_size = (p_len - len(f0) + 1) // 2
        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
            f0 = np.pad(
                f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
            )
    elif f0_method == "harvest":
        input_audio_path2wav[input_audio_path] = x.astype(np.double)
        f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
        if filter_radius > 2:
            f0 = signal.medfilt(f0, 3)
    elif f0_method == "dio":  # Potentially buggy?
        f0, t = pyworld.dio(
            x.astype(np.double),
            fs=self.sr,
            f0_ceil=f0_max,
            f0_floor=f0_min,
            frame_period=10,
        )
        f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
        f0 = signal.medfilt(f0, 3)
    elif f0_method == "crepe":
        f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
    elif f0_method == "crepe-tiny":
        f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
    elif f0_method == "mangio-crepe":
        f0 = self.get_f0_crepe_computation(
            x, f0_min, f0_max, p_len, crepe_hop_length
        )
    elif f0_method == "mangio-crepe-tiny":
        f0 = self.get_f0_crepe_computation(
            x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
        )
    elif f0_method == "rmvpe":
        if not hasattr(self, "model_rmvpe"):
            from rmvpe import RMVPE

            print("loading rmvpe model")
            self.model_rmvpe = RMVPE(
                "rmvpe.pt", is_half=self.is_half, device=self.device
            )
        f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
    elif "hybrid" in f0_method:
        # Perform hybrid median pitch estimation
        input_audio_path2wav[input_audio_path] = x.astype(np.double)
        f0 = self.get_f0_hybrid_computation(
            f0_method,
            input_audio_path,
            x,
            f0_min,
            f0_max,
            p_len,
            filter_radius,
            crepe_hop_length,
            time_step,
        )
    else:
        raise ValueError("Unknown f0 method: %r" % (f0_method,))

    # Transpose out-of-place: the estimator may hand back an array it still
    # owns (e.g. a cached result), and scaling in place would compound the
    # shift on every later call.
    f0 = f0 * pow(2, f0_up_key / 12)

    tf0 = self.sr // self.window  # f0 points per second
    if inp_f0 is not None:
        # Plain int avoids the int16 wrap-around the old cast had for long
        # override files.
        delta_t = int(
            np.round((inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1)
        )
        # NOTE(review): the time axis is scaled by a literal 100 while the
        # frame count above uses tf0; they agree only when tf0 == 100.
        replace_f0 = np.interp(
            np.arange(delta_t), inp_f0[:, 0] * 100, inp_f0[:, 1]
        )
        start = self.x_pad * tf0
        room = f0[start : start + len(replace_f0)].shape[0]
        f0[start : start + len(replace_f0)] = replace_f0[:room]

    f0bak = f0.copy()
    f0_mel = 1127 * np.log(1 + f0 / 700)
    voiced = f0_mel > 0
    f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * 254 / (
        f0_mel_max - f0_mel_min
    ) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    # np.int was only an alias of the builtin int and was removed in NumPy 1.24.
    f0_coarse = np.rint(f0_mel).astype(int)

    return f0_coarse, f0bak  # 1-0
374
+
375
def vc(
    self,
    model,
    net_g,
    sid,
    audio0,
    pitch,
    pitchf,
    times,
    index,
    big_npy,
    index_rate,
    version,
    protect,
):  # ,file_index,file_big_npy
    """Convert one audio segment and return the synthesised samples.

    Args:
        model: Feature extractor exposing ``extract_features`` (and
            ``final_proj`` for version "v1").
        net_g: Synthesiser exposing ``infer``.
        sid: Speaker id tensor forwarded to ``net_g.infer``.
        audio0: NumPy audio segment; a 2-D input is averaged over its last
            axis before use.
        pitch: Coarse pitch tensor, or ``None`` for models without f0.
        pitchf: Float pitch tensor, or ``None`` for models without f0.
        times: Mutable sequence of timers; ``times[0]`` gets the feature
            extraction time and ``times[2]`` the synthesis time.
        index: Search index with a ``search(npy, k=...)`` method, or ``None``.
        big_npy: Array of feature vectors addressed by the index results,
            or ``None``.
        index_rate: Blend weight of retrieved features (0 disables retrieval).
        version: ``"v1"`` uses output layer 9 plus ``final_proj``; anything
            else uses layer 12.
        protect: When below 0.5 and pitch is available, frames with
            ``pitchf < 1`` keep ``1 - protect`` of their pre-retrieval
            features.

    Returns:
        The converted audio as a float32 NumPy array.
    """
    feats = torch.from_numpy(audio0)
    feats = feats.half() if self.is_half else feats.float()
    if feats.dim() == 2:  # double channels
        feats = feats.mean(-1)
    assert feats.dim() == 1, feats.dim()
    feats = feats.view(1, -1)
    padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)

    inputs = {
        "source": feats.to(self.device),
        "padding_mask": padding_mask,
        "output_layer": 9 if version == "v1" else 12,
    }
    t0 = ttime()
    with torch.no_grad():
        logits = model.extract_features(**inputs)
        feats = model.final_proj(logits[0]) if version == "v1" else logits[0]

    # Identity checks instead of `!= None`, which dispatches to tensor
    # comparison operators.
    has_pitch = pitch is not None and pitchf is not None
    use_protect = protect < 0.5 and has_pitch
    if use_protect:
        feats0 = feats.clone()

    if index is not None and big_npy is not None and index_rate != 0:
        npy = feats[0].cpu().numpy()
        if self.is_half:
            npy = npy.astype("float32")

        score, ix = index.search(npy, k=8)
        # An exact hit has a score of 0; 1/0 would give inf weights that
        # normalise to NaN and poison the features. Substitute a tiny value
        # so that neighbour simply dominates the blend.
        score = np.where(score == 0, 1e-9, score)
        weight = np.square(1 / score)
        weight /= weight.sum(axis=1, keepdims=True)
        npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

        if self.is_half:
            npy = npy.astype("float16")
        feats = (
            torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
            + (1 - index_rate) * feats
        )

    # Double the frame rate of the features.
    feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
    if use_protect:
        feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
            0, 2, 1
        )
    t1 = ttime()

    p_len = audio0.shape[0] // self.window
    if feats.shape[1] < p_len:
        p_len = feats.shape[1]
        if has_pitch:
            pitch = pitch[:, :p_len]
            pitchf = pitchf[:, :p_len]

    if use_protect:
        # Per-frame blend: 1 where pitchf > 0, `protect` where pitchf < 1.
        pitchff = pitchf.clone()
        pitchff[pitchf > 0] = 1
        pitchff[pitchf < 1] = protect
        pitchff = pitchff.unsqueeze(-1)
        feats = feats * pitchff + feats0 * (1 - pitchff)
        feats = feats.to(feats0.dtype)

    p_len = torch.tensor([p_len], device=self.device).long()
    with torch.no_grad():
        if has_pitch:
            output = net_g.infer(feats, p_len, pitch, pitchf, sid)
        else:
            output = net_g.infer(feats, p_len, sid)
        audio1 = output[0][0, 0].data.cpu().float().numpy()

    # Drop the large tensors before asking CUDA to release cached memory.
    del feats, p_len, padding_mask
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    t2 = ttime()
    times[0] += t1 - t0
    times[2] += t2 - t1
    return audio1
476
+
477
def pipeline(
    self,
    model,
    net_g,
    sid,
    audio,
    input_audio_path,
    times,
    f0_up_key,
    f0_method,
    file_index,
    # file_big_npy,
    index_rate,
    if_f0,
    filter_radius,
    tgt_sr,
    resample_sr,
    rms_mix_rate,
    version,
    protect,
    crepe_hop_length,
    f0_file=None,
):
    """Run the full conversion of ``audio`` and return int16 samples.

    Steps: load the optional feature index, filter the audio with the
    module-level ``bh``/``ah`` coefficients, pick low-energy split points for
    long inputs, optionally compute pitch, convert each segment with
    ``self.vc``, then apply RMS mixing, optional resampling and int16 scaling.

    Args:
        model, net_g, version, protect, index_rate: Forwarded to ``self.vc``.
        sid: Speaker id; wrapped into a tensor on ``self.device``.
        audio: Input samples as a 1-D NumPy array.
        input_audio_path, f0_up_key, f0_method, filter_radius,
            crepe_hop_length: Forwarded to ``self.get_f0``.
        times: Mutable timers; ``times[1]`` receives the f0 stage duration
            (``self.vc`` updates the others).
        file_index: Path of an index file; ignored when empty, missing or
            when ``index_rate`` is 0.
        if_f0: ``1`` when the model consumes pitch.
        tgt_sr: Sample rate of the synthesiser output.
        resample_sr: Output is resampled to this rate when it is >= 16000
            and differs from ``tgt_sr``.
        rms_mix_rate: ``change_rms`` is applied unless this equals 1.
        f0_file: Optional object with a ``name`` attribute pointing to a text
            file of comma-separated float rows used as an f0 override.

    Returns:
        The converted audio as an int16 NumPy array.
    """
    index = big_npy = None
    if file_index != "" and os.path.exists(file_index) and index_rate != 0:
        try:
            index = faiss.read_index(file_index)
            big_npy = index.reconstruct_n(0, index.ntotal)
        except Exception:
            # Best effort: fall back to conversion without retrieval.
            traceback.print_exc()
            index = big_npy = None

    audio = signal.filtfilt(bh, ah, audio)

    # For long inputs, choose split points near each t_center multiple where
    # the window-summed signal is closest to zero.
    opt_ts = []
    audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
    if audio_pad.shape[0] > self.t_max:
        audio_sum = np.zeros_like(audio)
        for i in range(self.window):
            audio_sum += audio_pad[i : i - self.window]
        for t in range(self.t_center, audio.shape[0], self.t_center):
            neighbourhood = np.abs(audio_sum[t - self.t_query : t + self.t_query])
            opt_ts.append(t - self.t_query + int(np.argmin(neighbourhood)))

    t1 = ttime()
    audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
    p_len = audio_pad.shape[0] // self.window

    inp_f0 = None
    if hasattr(f0_file, "name"):
        try:
            with open(f0_file.name, "r") as f:
                lines = f.read().strip("\n").split("\n")
            rows = [[float(value) for value in line.split(",")] for line in lines]
            # Assign only after a complete parse so a malformed file cannot
            # leave a half-built list behind for get_f0.
            inp_f0 = np.array(rows, dtype="float32")
        except Exception:
            traceback.print_exc()
            inp_f0 = None

    sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
    use_f0 = if_f0 == 1
    pitch, pitchf = None, None
    if use_f0:
        pitch, pitchf = self.get_f0(
            input_audio_path,
            audio_pad,
            p_len,
            f0_up_key,
            f0_method,
            filter_radius,
            crepe_hop_length,
            inp_f0,
        )
        pitch = pitch[:p_len]
        pitchf = pitchf[:p_len]
        # Compare via str() so torch.device objects and "mps:0" also match;
        # presumably needed because MPS cannot hold float64 tensors.
        if str(self.device).startswith("mps"):
            pitchf = pitchf.astype(np.float32)
        pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
        pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
    t2 = ttime()
    times[1] += t2 - t1

    def _convert(segment, segment_pitch, segment_pitchf):
        """Convert one padded segment and strip the padding from the result."""
        converted = self.vc(
            model,
            net_g,
            sid,
            segment,
            segment_pitch,
            segment_pitchf,
            times,
            index,
            big_npy,
            index_rate,
            version,
            protect,
        )
        return converted[self.t_pad_tgt : -self.t_pad_tgt]

    audio_opt = []
    s = 0  # start of the segment not yet converted (window-aligned)
    for t in opt_ts:
        t = t // self.window * self.window
        seg_pitch = seg_pitchf = None
        if use_f0:
            frames = slice(s // self.window, (t + self.t_pad2) // self.window)
            seg_pitch = pitch[:, frames]
            seg_pitchf = pitchf[:, frames]
        audio_opt.append(
            _convert(
                audio_pad[s : t + self.t_pad2 + self.window], seg_pitch, seg_pitchf
            )
        )
        s = t

    # Remainder after the last split point (the whole input when there were
    # no splits, since s is still 0).
    tail_pitch = tail_pitchf = None
    if use_f0:
        tail_pitch = pitch[:, s // self.window :]
        tail_pitchf = pitchf[:, s // self.window :]
    audio_opt.append(_convert(audio_pad[s:], tail_pitch, tail_pitchf))

    audio_opt = np.concatenate(audio_opt)
    if rms_mix_rate != 1:
        audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
    if resample_sr >= 16000 and tgt_sr != resample_sr:
        audio_opt = librosa.resample(
            audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
        )

    # Scale to int16, shrinking the gain if the peak would exceed 0.99.
    audio_max = np.abs(audio_opt).max() / 0.99
    max_int16 = 32768
    if audio_max > 1:
        max_int16 /= audio_max
    audio_opt = (audio_opt * max_int16).astype(np.int16)

    # Release the pitch tensors before asking CUDA to free cached memory.
    del pitch, pitchf, tail_pitch, tail_pitchf
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return audio_opt