oItsMineZ committed
Commit 319e348 · verified · 1 Parent(s): a986973

Upload 72 files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .env +7 -0
  2. .gitattributes +4 -0
  3. LazyImport.py +13 -0
  4. README.md +8 -9
  5. assets/hubert/.gitignore +2 -0
  6. assets/rmvpe/.gitignore +2 -0
  7. assets/weights/.gitignore +2 -0
  8. audios/.gitignore +0 -0
  9. configs/32k.json +50 -0
  10. configs/32k_v2.json +50 -0
  11. configs/40k.json +50 -0
  12. configs/48k.json +50 -0
  13. configs/48k_v2.json +50 -0
  14. configs/config.json +15 -0
  15. configs/config.py +265 -0
  16. configs/v1/32k.json +46 -0
  17. configs/v1/40k.json +46 -0
  18. configs/v1/48k.json +46 -0
  19. configs/v2/32k.json +46 -0
  20. configs/v2/48k.json +46 -0
  21. csvdb/formanting.csv +0 -0
  22. csvdb/stop.csv +0 -0
  23. easy_infer.py +638 -0
  24. formantshiftcfg/Put your formantshift presets here as a txt file +0 -0
  25. formantshiftcfg/f2m.txt +2 -0
  26. formantshiftcfg/m2f.txt +2 -0
  27. formantshiftcfg/random.txt +2 -0
  28. infer/lib/audio.py +197 -0
  29. infer/lib/csvutil.py +41 -0
  30. infer/lib/infer_pack/attentions.py +417 -0
  31. infer/lib/infer_pack/commons.py +167 -0
  32. infer/lib/infer_pack/models.py +1174 -0
  33. infer/lib/infer_pack/models_onnx.py +824 -0
  34. infer/lib/infer_pack/modules.py +521 -0
  35. infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py +91 -0
  36. infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py +16 -0
  37. infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py +87 -0
  38. infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py +98 -0
  39. infer/lib/infer_pack/modules/F0Predictor/__init__.py +0 -0
  40. infer/lib/infer_pack/onnx_inference.py +149 -0
  41. infer/lib/infer_pack/transforms.py +207 -0
  42. infer/lib/rmvpe.py +717 -0
  43. infer/modules/vc/__init__.py +0 -0
  44. infer/modules/vc/modules.py +526 -0
  45. infer/modules/vc/pipeline.py +655 -0
  46. infer/modules/vc/utils.py +42 -0
  47. lib/globals/globals.py +5 -0
  48. lib/infer_pack/attentions.py +417 -0
  49. lib/infer_pack/commons.py +166 -0
  50. lib/infer_pack/models.py +1144 -0
.env ADDED
@@ -0,0 +1,7 @@
+ OPENBLAS_NUM_THREADS = 1
+ no_proxy = localhost, 127.0.0.1, ::1
+
+ # You can change the location of the model, etc. by changing here
+ weight_root = weights
+ index_root = logs
+ rmvpe_root = assets/rmvpe
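These variables are picked up at startup through python-dotenv (easy_infer.py below calls load_dotenv() and then os.getenv("weight_root") and friends), so relocating the model folders is just an edit to this file. A minimal sketch of the lookup, assuming python-dotenv is installed and the process starts in the repo root:

import os
from dotenv import load_dotenv

# load_dotenv() copies the key=value pairs from .env into the process
# environment; os.getenv then resolves the configured folders.
load_dotenv()
print(os.getenv("weight_root"))  # "weights"
print(os.getenv("index_root"))   # "logs"
print(os.getenv("rmvpe_root"))   # "assets/rmvpe"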
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ logs/DaengGuitar/added_IVF473_Flat_nprobe_1_daengguitar_v2.index filter=lfs diff=lfs merge=lfs -text
+ logs/TAEEXZENFIRE/added_IVF340_Flat_nprobe_1_taeexzenfire_v2.index filter=lfs diff=lfs merge=lfs -text
+ logs/ท่านศาสดา/added_IVF109_Flat_nprobe_1_sadsada_v2.index filter=lfs diff=lfs merge=lfs -text
+ stftpitchshift filter=lfs diff=lfs merge=lfs -text
LazyImport.py ADDED
@@ -0,0 +1,13 @@
+ from importlib.util import find_spec, LazyLoader, module_from_spec
+ from sys import modules
+
+ def lazyload(name):
+     if name in modules:
+         return modules[name]
+     else:
+         spec = find_spec(name)
+         loader = LazyLoader(spec.loader)
+         module = module_from_spec(spec)
+         modules[name] = module
+         loader.exec_module(module)
+         return module
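lazyload registers the module immediately but defers executing its body until the first attribute access (that is what importlib.util.LazyLoader provides). A quick usage sketch; numpy here just stands in for any heavy dependency:

from LazyImport import lazyload

np = lazyload("numpy")   # returns instantly; numpy's module body has not run yet
print(np.sqrt(16.0))     # first attribute access triggers the real import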
README.md CHANGED
@@ -1,12 +1,11 @@
  ---
- title: RVC V2 WebUI
- emoji: 📈
- colorFrom: indigo
- colorTo: indigo
+ title: oItsMinez's RVC v2 WebUI
+ emoji: 🎙️
+ colorFrom: red
+ colorTo: purple
  sdk: gradio
- sdk_version: 4.26.0
+ sdk_version: 3.43.2
  app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ pinned: true
+ short_description: Use oItsMineZ's RVC v2 Model with WebUI (For Vocal to Vocal)
+ ---
 
assets/hubert/.gitignore ADDED
@@ -0,0 +1,2 @@
+ *
+ !.gitignore
assets/rmvpe/.gitignore ADDED
@@ -0,0 +1,2 @@
+ *
+ !.gitignore
assets/weights/.gitignore ADDED
@@ -0,0 +1,2 @@
+ *
+ !.gitignore
audios/.gitignore ADDED
File without changes
configs/32k.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 1e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "fp16_run": false,
+     "lr_decay": 0.999875,
+     "segment_size": 12800,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "max_wav_value": 32768.0,
+     "sampling_rate": 32000,
+     "filter_length": 1024,
+     "hop_length": 320,
+     "win_length": 1024,
+     "n_mel_channels": 80,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3, 7, 11],
+     "resblock_dilation_sizes": [
+       [1, 3, 5],
+       [1, 3, 5],
+       [1, 3, 5]
+     ],
+     "upsample_rates": [10, 4, 2, 2, 2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16, 16, 4, 4, 4],
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "spk_embed_dim": 109
+   }
+ }
configs/32k_v2.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 1e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "fp16_run": true,
+     "lr_decay": 0.999875,
+     "segment_size": 12800,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "max_wav_value": 32768.0,
+     "sampling_rate": 32000,
+     "filter_length": 1024,
+     "hop_length": 320,
+     "win_length": 1024,
+     "n_mel_channels": 80,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3, 7, 11],
+     "resblock_dilation_sizes": [
+       [1, 3, 5],
+       [1, 3, 5],
+       [1, 3, 5]
+     ],
+     "upsample_rates": [10, 8, 2, 2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [20, 16, 4, 4],
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "spk_embed_dim": 109
+   }
+ }
configs/40k.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 1e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "fp16_run": false,
+     "lr_decay": 0.999875,
+     "segment_size": 12800,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "max_wav_value": 32768.0,
+     "sampling_rate": 40000,
+     "filter_length": 2048,
+     "hop_length": 400,
+     "win_length": 2048,
+     "n_mel_channels": 125,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3, 7, 11],
+     "resblock_dilation_sizes": [
+       [1, 3, 5],
+       [1, 3, 5],
+       [1, 3, 5]
+     ],
+     "upsample_rates": [10, 10, 2, 2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16, 16, 4, 4],
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "spk_embed_dim": 109
+   }
+ }
configs/48k.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 1e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "fp16_run": false,
+     "lr_decay": 0.999875,
+     "segment_size": 11520,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "max_wav_value": 32768.0,
+     "sampling_rate": 48000,
+     "filter_length": 2048,
+     "hop_length": 480,
+     "win_length": 2048,
+     "n_mel_channels": 128,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3, 7, 11],
+     "resblock_dilation_sizes": [
+       [1, 3, 5],
+       [1, 3, 5],
+       [1, 3, 5]
+     ],
+     "upsample_rates": [10, 6, 2, 2, 2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16, 16, 4, 4, 4],
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "spk_embed_dim": 109
+   }
+ }
configs/48k_v2.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 1e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "fp16_run": true,
+     "lr_decay": 0.999875,
+     "segment_size": 17280,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "max_wav_value": 32768.0,
+     "sampling_rate": 48000,
+     "filter_length": 2048,
+     "hop_length": 480,
+     "win_length": 2048,
+     "n_mel_channels": 128,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3, 7, 11],
+     "resblock_dilation_sizes": [
+       [1, 3, 5],
+       [1, 3, 5],
+       [1, 3, 5]
+     ],
+     "upsample_rates": [12, 10, 2, 2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [24, 20, 4, 4],
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "spk_embed_dim": 109
+   }
+ }
configs/config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "pth_path": "assets/weights/kikiV1.pth",
+   "index_path": "logs/kikiV1.index",
+   "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)",
+   "sg_output_device": "VoiceMeeter Aux Input (VB-Audio (MME)",
+   "threhold": -45.0,
+   "pitch": 12.0,
+   "index_rate": 0.0,
+   "rms_mix_rate": 0.0,
+   "block_time": 0.25,
+   "crossfade_length": 0.04,
+   "extra_time": 2.0,
+   "n_cpu": 6.0,
+   "f0method": "rmvpe"
+ }
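These keys (audio devices, block time, crossfade length) appear to be saved defaults for RVC's realtime voice-changer GUI; note that "threhold" is spelled that way in the codebase, so it should not be "corrected" when editing the file by hand. A minimal read, assuming the file stays at configs/config.json:

import json

# Load the saved realtime-GUI settings; key names (including the
# historical "threhold" spelling) must match what the app writes.
with open("configs/config.json", "r", encoding="utf-8") as f:
    gui_cfg = json.load(f)
print(gui_cfg["f0method"], gui_cfg["threhold"])  # rmvpe -45.0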
configs/config.py ADDED
@@ -0,0 +1,265 @@
+ import argparse
+ import os
+ import sys
+ import json
+ from multiprocessing import cpu_count
+
+ import torch
+
+ try:
+     import intel_extension_for_pytorch as ipex  # pylint: disable=import-error, unused-import
+     if torch.xpu.is_available():
+         from infer.modules.ipex import ipex_init
+         ipex_init()
+ except Exception:
+     pass
+
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+
+ version_config_list = [
+     "v1/32k.json",
+     "v1/40k.json",
+     "v1/48k.json",
+     "v2/48k.json",
+     "v2/32k.json",
+ ]
+
+
+ def singleton_variable(func):
+     def wrapper(*args, **kwargs):
+         if not wrapper.instance:
+             wrapper.instance = func(*args, **kwargs)
+         return wrapper.instance
+
+     wrapper.instance = None
+     return wrapper
+
+
+ @singleton_variable
+ class Config:
+     def __init__(self):
+         self.device = "cuda:0"
+         self.is_half = True
+         self.n_cpu = 0
+         self.gpu_name = None
+         self.json_config = self.load_config_json()
+         self.gpu_mem = None
+         (
+             self.python_cmd,
+             self.listen_port,
+             self.iscolab,
+             self.noparallel,
+             self.noautoopen,
+             self.paperspace,
+             self.is_cli,
+             self.grtheme,
+             self.dml,
+         ) = self.arg_parse()
+         self.instead = ""
+         self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
+
+     @staticmethod
+     def load_config_json() -> dict:
+         d = {}
+         for config_file in version_config_list:
+             with open(f"configs/{config_file}", "r") as f:
+                 d[config_file] = json.load(f)
+         return d
+
+     @staticmethod
+     def arg_parse() -> tuple:
+         exe = sys.executable or "python"
+         parser = argparse.ArgumentParser()
+         parser.add_argument("--port", type=int, default=7865, help="Listen port")
+         parser.add_argument("--pycmd", type=str, default=exe, help="Python command")
+         parser.add_argument("--colab", action="store_true", help="Launch in colab")
+         parser.add_argument(
+             "--noparallel", action="store_true", help="Disable parallel processing"
+         )
+         parser.add_argument(
+             "--noautoopen",
+             action="store_true",
+             help="Do not open in browser automatically",
+         )
+         parser.add_argument(
+             "--paperspace",
+             action="store_true",
+             help="Note that this argument just shares a gradio link for the web UI. Thus can be used on other non-local CLI systems.",
+         )
+         parser.add_argument(
+             "--is_cli",
+             action="store_true",
+             help="Use the CLI instead of setting up a gradio UI. This flag will launch an RVC text interface where you can execute functions from infer-web.py!",
+         )
+
+         parser.add_argument(
+             "-t",
+             "--theme",
+             help = "Theme for Gradio. Format - `JohnSmith9982/small_and_pretty` (no backticks)",
+             default = "JohnSmith9982/small_and_pretty",
+             type = str
+         )
+
+         parser.add_argument(
+             "--dml",
+             action="store_true",
+             help="Use DirectML backend instead of CUDA."
+         )
+
+         cmd_opts = parser.parse_args()
+
+         cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865
+
+         return (
+             cmd_opts.pycmd,
+             cmd_opts.port,
+             cmd_opts.colab,
+             cmd_opts.noparallel,
+             cmd_opts.noautoopen,
+             cmd_opts.paperspace,
+             cmd_opts.is_cli,
+             cmd_opts.theme,
+             cmd_opts.dml,
+         )
+
+     # has_mps is only available in nightly pytorch (for now) and macOS 12.3+.
+     # check `getattr` and try it for compatibility
+     @staticmethod
+     def has_mps() -> bool:
+         if not torch.backends.mps.is_available():
+             return False
+         try:
+             torch.zeros(1).to(torch.device("mps"))
+             return True
+         except Exception:
+             return False
+
+     @staticmethod
+     def has_xpu() -> bool:
+         if hasattr(torch, "xpu") and torch.xpu.is_available():
+             return True
+         else:
+             return False
+
+     def use_fp32_config(self):
+         for config_file in version_config_list:
+             self.json_config[config_file]["train"]["fp16_run"] = False
+
+     def device_config(self) -> tuple:
+         if torch.cuda.is_available():
+             if self.has_xpu():
+                 self.device = self.instead = "xpu:0"
+                 self.is_half = True
+             i_device = int(self.device.split(":")[-1])
+             self.gpu_name = torch.cuda.get_device_name(i_device)
+             if (
+                 ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
+                 or "P40" in self.gpu_name.upper()
+                 or "P10" in self.gpu_name.upper()
+                 or "1060" in self.gpu_name
+                 or "1070" in self.gpu_name
+                 or "1080" in self.gpu_name
+             ):
+                 logger.info("Found GPU %s, force to fp32", self.gpu_name)
+                 self.is_half = False
+                 self.use_fp32_config()
+             else:
+                 logger.info("Found GPU %s", self.gpu_name)
+             self.gpu_mem = int(
+                 torch.cuda.get_device_properties(i_device).total_memory
+                 / 1024
+                 / 1024
+                 / 1024
+                 + 0.4
+             )
+             if self.gpu_mem <= 4:
+                 with open("infer/modules/train/preprocess.py", "r") as f:
+                     strr = f.read().replace("3.7", "3.0")
+                 with open("infer/modules/train/preprocess.py", "w") as f:
+                     f.write(strr)
+         elif self.has_mps():
+             logger.info("No supported Nvidia GPU found")
+             self.device = self.instead = "mps"
+             self.is_half = False
+             self.use_fp32_config()
+         else:
+             logger.info("No supported Nvidia GPU found")
+             self.device = self.instead = "cpu"
+             self.is_half = False
+             self.use_fp32_config()
+
+         if self.n_cpu == 0:
+             self.n_cpu = cpu_count()
+
+         if self.is_half:
+             # Settings for 6 GB of GPU memory
+             x_pad = 3
+             x_query = 10
+             x_center = 60
+             x_max = 65
+         else:
+             # Settings for 5 GB of GPU memory
+             x_pad = 1
+             x_query = 6
+             x_center = 38
+             x_max = 41
+
+         if self.gpu_mem is not None and self.gpu_mem <= 4:
+             x_pad = 1
+             x_query = 5
+             x_center = 30
+             x_max = 32
+         if self.dml:
+             logger.info("Use DirectML instead")
+             if (
+                 os.path.exists(
+                     "runtime\Lib\site-packages\onnxruntime\capi\DirectML.dll"
+                 )
+                 == False
+             ):
+                 try:
+                     os.rename(
+                         "runtime\Lib\site-packages\onnxruntime",
+                         "runtime\Lib\site-packages\onnxruntime-cuda",
+                     )
+                 except:
+                     pass
+                 try:
+                     os.rename(
+                         "runtime\Lib\site-packages\onnxruntime-dml",
+                         "runtime\Lib\site-packages\onnxruntime",
+                     )
+                 except:
+                     pass
+             # if self.device != "cpu":
+             import torch_directml
+
+             self.device = torch_directml.device(torch_directml.default_device())
+             self.is_half = False
+         else:
+             if self.instead:
+                 logger.info(f"Use {self.instead} instead")
+             if (
+                 os.path.exists(
+                     "runtime\Lib\site-packages\onnxruntime\capi\onnxruntime_providers_cuda.dll"
+                 )
+                 == False
+             ):
+                 try:
+                     os.rename(
+                         "runtime\Lib\site-packages\onnxruntime",
+                         "runtime\Lib\site-packages\onnxruntime-dml",
+                     )
+                 except:
+                     pass
+                 try:
+                     os.rename(
+                         "runtime\Lib\site-packages\onnxruntime-cuda",
+                         "runtime\Lib\site-packages\onnxruntime",
+                     )
+                 except:
+                     pass
+         return x_pad, x_query, x_center, x_max
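The singleton_variable decorator above caches the first construction on the wrapper, so every Config() call in the app shares one instance. A standalone sketch of the behavior (not tied to this repo):

def singleton_variable(func):
    # Cache the first result on the wrapper and hand it back ever after.
    def wrapper(*args, **kwargs):
        if not wrapper.instance:
            wrapper.instance = func(*args, **kwargs)
        return wrapper.instance

    wrapper.instance = None
    return wrapper

@singleton_variable
class Settings:
    def __init__(self):
        self.loaded = True

assert Settings() is Settings()  # the second "call" returns the cached object

Note the truthiness check: a falsy result would be rebuilt on the next call, which is harmless for a Config object but worth knowing about.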
configs/v1/32k.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 1e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "fp16_run": true,
+     "lr_decay": 0.999875,
+     "segment_size": 12800,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "max_wav_value": 32768.0,
+     "sampling_rate": 32000,
+     "filter_length": 1024,
+     "hop_length": 320,
+     "win_length": 1024,
+     "n_mel_channels": 80,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+     "upsample_rates": [10,4,2,2,2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16,16,4,4,4],
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "spk_embed_dim": 109
+   }
+ }
configs/v1/40k.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 1e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "fp16_run": true,
+     "lr_decay": 0.999875,
+     "segment_size": 12800,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "max_wav_value": 32768.0,
+     "sampling_rate": 40000,
+     "filter_length": 2048,
+     "hop_length": 400,
+     "win_length": 2048,
+     "n_mel_channels": 125,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+     "upsample_rates": [10,10,2,2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16,16,4,4],
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "spk_embed_dim": 109
+   }
+ }
configs/v1/48k.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 1e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "fp16_run": true,
+     "lr_decay": 0.999875,
+     "segment_size": 11520,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "max_wav_value": 32768.0,
+     "sampling_rate": 48000,
+     "filter_length": 2048,
+     "hop_length": 480,
+     "win_length": 2048,
+     "n_mel_channels": 128,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+     "upsample_rates": [10,6,2,2,2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16,16,4,4,4],
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "spk_embed_dim": 109
+   }
+ }
configs/v2/32k.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 1e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "fp16_run": true,
+     "lr_decay": 0.999875,
+     "segment_size": 12800,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "max_wav_value": 32768.0,
+     "sampling_rate": 32000,
+     "filter_length": 1024,
+     "hop_length": 320,
+     "win_length": 1024,
+     "n_mel_channels": 80,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+     "upsample_rates": [10,8,2,2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [20,16,4,4],
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "spk_embed_dim": 109
+   }
+ }
configs/v2/48k.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 1e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "fp16_run": true,
+     "lr_decay": 0.999875,
+     "segment_size": 17280,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "max_wav_value": 32768.0,
+     "sampling_rate": 48000,
+     "filter_length": 2048,
+     "hop_length": 480,
+     "win_length": 2048,
+     "n_mel_channels": 128,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+     "upsample_rates": [12,10,2,2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [24,20,4,4],
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "spk_embed_dim": 109
+   }
+ }
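Across all ten presets above, the generator's upsample_rates multiply out to exactly the STFT hop_length (e.g. 10·4·2·2·2 = 320 for 32k, 12·10·2·2 = 480 for 48k v2), which is what lets one feature frame produce one hop of waveform. A quick consistency check, assuming it is run from the repo root:

import json
from math import prod

# prod(upsample_rates) must equal hop_length, or the vocoder's output
# length would disagree with the feature frame rate.
for path in ["configs/32k.json", "configs/32k_v2.json", "configs/40k.json",
             "configs/48k.json", "configs/48k_v2.json",
             "configs/v1/32k.json", "configs/v1/40k.json", "configs/v1/48k.json",
             "configs/v2/32k.json", "configs/v2/48k.json"]:
    with open(path) as f:
        cfg = json.load(f)
    assert prod(cfg["model"]["upsample_rates"]) == cfg["data"]["hop_length"], path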
csvdb/formanting.csv ADDED
File without changes
csvdb/stop.csv ADDED
File without changes
easy_infer.py ADDED
@@ -0,0 +1,638 @@
+ import subprocess
+ import os
+ import sys
+ import errno
+ import shutil
+ from mega import Mega
+ import datetime
+ import unicodedata
+ import torch
+ import glob
+ import gradio as gr
+ import gdown
+ import zipfile
+ import traceback
+ import json
+ import requests
+ import wget
+ import ffmpeg
+ import hashlib
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+ from unidecode import unidecode
+ import re
+ import time
+ from infer.modules.vc.pipeline import Pipeline
+ VC = Pipeline
+ from lib.infer_pack.models import (
+     SynthesizerTrnMs256NSFsid,
+     SynthesizerTrnMs256NSFsid_nono,
+     SynthesizerTrnMs768NSFsid,
+     SynthesizerTrnMs768NSFsid_nono,
+ )
+ from configs.config import Config
+ from huggingface_hub import HfApi, list_models
+ from huggingface_hub import login
+ from bs4 import BeautifulSoup
+ from sklearn.cluster import MiniBatchKMeans
+ from dotenv import load_dotenv
+ load_dotenv()
+ config = Config()
+ tmp = os.path.join(now_dir, "TEMP")
+ shutil.rmtree(tmp, ignore_errors=True)
+ os.environ["TEMP"] = tmp
+ weight_root = os.getenv("weight_root")
+ index_root = os.getenv("index_root")
+ audio_root = "audios"
+ names = []
+ for name in os.listdir(weight_root):
+     if name.endswith(".pth"):
+         names.append(name)
+ index_paths = []
+
+ global indexes_list
+ indexes_list = []
+
+ audio_paths = []
+
+ for root, dirs, files in os.walk(index_root, topdown=False):
+     for name in files:
+         if name.endswith(".index") and "trained" not in name:
+             index_paths.append("%s\\%s" % (root, name))
+
+ for root, dirs, files in os.walk(audio_root, topdown=False):
+     for name in files:
+         audio_paths.append("%s/%s" % (root, name))
+
+ def calculate_md5(file_path):
+     hash_md5 = hashlib.md5()
+     with open(file_path, "rb") as f:
+         for chunk in iter(lambda: f.read(4096), b""):
+             hash_md5.update(chunk)
+     return hash_md5.hexdigest()
+
+ def format_title(title):
+     formatted_title = re.sub(r'[^\w\s-]', '', title)
+     formatted_title = formatted_title.replace(" ", "_")
+     return formatted_title
+
+ def silentremove(filename):
+     try:
+         os.remove(filename)
+     except OSError as e:
+         if e.errno != errno.ENOENT:
+             raise
+ def get_md5(temp_folder):
+     for root, subfolders, files in os.walk(temp_folder):
+         for file in files:
+             if not file.startswith("G_") and not file.startswith("D_") and file.endswith(".pth") and not "_G_" in file and not "_D_" in file:
+                 md5_hash = calculate_md5(os.path.join(root, file))
+                 return md5_hash
+
+     return None
+
+ def find_parent(search_dir, file_name):
+     for dirpath, dirnames, filenames in os.walk(search_dir):
+         if file_name in filenames:
+             return os.path.abspath(dirpath)
+     return None
+
+ def find_folder_parent(search_dir, folder_name):
+     for dirpath, dirnames, filenames in os.walk(search_dir):
+         if folder_name in dirnames:
+             return os.path.abspath(dirpath)
+     return None
+
+ def delete_large_files(directory_path, max_size_megabytes):
+     for filename in os.listdir(directory_path):
+         file_path = os.path.join(directory_path, filename)
+         if os.path.isfile(file_path):
+             size_in_bytes = os.path.getsize(file_path)
+             size_in_megabytes = size_in_bytes / (1024 * 1024)  # Convert bytes to megabytes
+
+             if size_in_megabytes > max_size_megabytes:
+                 print("###################################")
+                 print(f"Deleting s*** {filename} (Size: {size_in_megabytes:.2f} MB)")
+                 os.remove(file_path)
+                 print("###################################")
+
+ def download_from_url(url):
+     parent_path = find_folder_parent(".", "pretrained_v2")
+     zips_path = os.path.join(parent_path, 'zips')
+     print(f"Limit download size in MB {os.getenv('MAX_DOWNLOAD_SIZE')}, duplicate the space for modify the limit")
+
+     if url != '':
+         print("Downloading the file: " + f"{url}")
+         if "drive.google.com" in url:
+             if "file/d/" in url:
+                 file_id = url.split("file/d/")[1].split("/")[0]
+             elif "id=" in url:
+                 file_id = url.split("id=")[1].split("&")[0]
+             else:
+                 return None
+
+             if file_id:
+                 os.chdir('./zips')
+                 result = subprocess.run(["gdown", f"https://drive.google.com/uc?id={file_id}", "--fuzzy"], capture_output=True, text=True, encoding='utf-8')
+                 if "Too many users have viewed or downloaded this file recently" in str(result.stderr):
+                     return "too much use"
+                 if "Cannot retrieve the public link of the file." in str(result.stderr):
+                     return "private link"
+                 print(result.stderr)
+
+         elif "/blob/" in url:
+             os.chdir('./zips')
+             url = url.replace("blob", "resolve")
+             response = requests.get(url)
+             if response.status_code == 200:
+                 file_name = url.split('/')[-1]
+                 with open(os.path.join(zips_path, file_name), "wb") as newfile:
+                     newfile.write(response.content)
+             else:
+                 os.chdir(parent_path)
+         elif "mega.nz" in url:
+             if "#!" in url:
+                 file_id = url.split("#!")[1].split("!")[0]
+             elif "file/" in url:
+                 file_id = url.split("file/")[1].split("/")[0]
+             else:
+                 return None
+             if file_id:
+                 m = Mega()
+                 m.download_url(url, zips_path)
+         elif "/tree/main" in url:
+             response = requests.get(url)
+             soup = BeautifulSoup(response.content, 'html.parser')
+             temp_url = ''
+             for link in soup.find_all('a', href=True):
+                 if link['href'].endswith('.zip'):
+                     temp_url = link['href']
+                     break
+             if temp_url:
+                 url = temp_url
+                 url = url.replace("blob", "resolve")
+                 if "huggingface.co" not in url:
+                     url = "https://huggingface.co" + url
+
+                 wget.download(url)
+             else:
+                 print("No .zip file found on the page.")
+         elif "cdn.discordapp.com" in url:
+             file = requests.get(url)
+             if file.status_code == 200:
+                 name = url.split('/')
+                 with open(os.path.join(zips_path, name[len(name)-1]), "wb") as newfile:
+                     newfile.write(file.content)
+             else:
+                 return None
+         elif "pixeldrain.com" in url:
+             try:
+                 file_id = url.split("pixeldrain.com/u/")[1]
+                 os.chdir('./zips')
+                 print(file_id)
+                 response = requests.get(f"https://pixeldrain.com/api/file/{file_id}")
+                 if response.status_code == 200:
+                     file_name = response.headers.get("Content-Disposition").split('filename=')[-1].strip('";')
+                     if not os.path.exists(zips_path):
+                         os.makedirs(zips_path)
+                     with open(os.path.join(zips_path, file_name), "wb") as newfile:
+                         newfile.write(response.content)
+                     os.chdir(parent_path)
+                     return "downloaded"
+                 else:
+                     os.chdir(parent_path)
+                     return None
+             except Exception as e:
+                 print(e)
+                 os.chdir(parent_path)
+                 return None
+         else:
+             os.chdir('./zips')
+             wget.download(url)
+         delete_large_files(zips_path, int(os.getenv("MAX_DOWNLOAD_SIZE")))
+         os.chdir(parent_path)
+         print("Full download")
+         return "downloaded"
+     else:
+         return None
+
+ class error_message(Exception):
+     def __init__(self, mensaje):
+         self.mensaje = mensaje
+         super().__init__(mensaje)
+
+ def get_vc(sid, to_return_protect0, to_return_protect1):
+     global n_spk, tgt_sr, net_g, vc, cpt, version
+     if sid == "" or sid == []:
+         global hubert_model
+         if hubert_model is not None:
+             print("clean_empty_cache")
+             del net_g, n_spk, vc, hubert_model, tgt_sr
+             hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             if_f0 = cpt.get("f0", 1)
+             version = cpt.get("version", "v1")
+             if version == "v1":
+                 if if_f0 == 1:
+                     net_g = SynthesizerTrnMs256NSFsid(
+                         *cpt["config"], is_half=config.is_half
+                     )
+                 else:
+                     net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+             elif version == "v2":
+                 if if_f0 == 1:
+                     net_g = SynthesizerTrnMs768NSFsid(
+                         *cpt["config"], is_half=config.is_half
+                     )
+                 else:
+                     net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+             del net_g, cpt
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             cpt = None
+         return (
+             {"visible": False, "__type__": "update"},
+             {"visible": False, "__type__": "update"},
+             {"visible": False, "__type__": "update"},
+         )
+     person = "%s/%s" % (weight_root, sid)
+     print("loading %s" % person)
+     cpt = torch.load(person, map_location="cpu")
+     tgt_sr = cpt["config"][-1]
+     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
+     if_f0 = cpt.get("f0", 1)
+     if if_f0 == 0:
+         to_return_protect0 = to_return_protect1 = {
+             "visible": False,
+             "value": 0.5,
+             "__type__": "update",
+         }
+     else:
+         to_return_protect0 = {
+             "visible": True,
+             "value": to_return_protect0,
+             "__type__": "update",
+         }
+         to_return_protect1 = {
+             "visible": True,
+             "value": to_return_protect1,
+             "__type__": "update",
+         }
+     version = cpt.get("version", "v1")
+     if version == "v1":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+     elif version == "v2":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+     del net_g.enc_q
+     print(net_g.load_state_dict(cpt["weight"], strict=False))
+     net_g.eval().to(config.device)
+     if config.is_half:
+         net_g = net_g.half()
+     else:
+         net_g = net_g.float()
+     vc = VC(tgt_sr, config)
+     n_spk = cpt["config"][-3]
+     return (
+         {"visible": True, "maximum": n_spk, "__type__": "update"},
+         to_return_protect0,
+         to_return_protect1,
+     )
+
+ def load_downloaded_model(url):
+     parent_path = find_folder_parent(".", "pretrained_v2")
+     try:
+         infos = []
+         logs_folders = ['0_gt_wavs','1_16k_wavs','2a_f0','2b-f0nsf','3_feature256','3_feature768']
+         zips_path = os.path.join(parent_path, 'zips')
+         unzips_path = os.path.join(parent_path, 'unzips')
+         weights_path = os.path.join(parent_path, 'weights')
+         logs_dir = ""
+
+         if os.path.exists(zips_path):
+             shutil.rmtree(zips_path)
+         if os.path.exists(unzips_path):
+             shutil.rmtree(unzips_path)
+
+         os.mkdir(zips_path)
+         os.mkdir(unzips_path)
+
+         download_file = download_from_url(url)
+         if not download_file:
+             print("The file could not be downloaded.")
+             infos.append("The file could not be downloaded.")
+             yield "\n".join(infos)
+         elif download_file == "downloaded":
+             print("It has been downloaded successfully.")
+             infos.append("It has been downloaded successfully.")
+             yield "\n".join(infos)
+         elif download_file == "too much use":
+             raise Exception("Too many users have recently viewed or downloaded this file")
+         elif download_file == "private link":
+             raise Exception("Cannot get file from this private link")
+
+         for filename in os.listdir(zips_path):
+             if filename.endswith(".zip"):
+                 zipfile_path = os.path.join(zips_path,filename)
+                 print("Proceeding with the extraction...")
+                 infos.append("Proceeding with the extraction...")
+                 shutil.unpack_archive(zipfile_path, unzips_path, 'zip')
+                 model_name = os.path.basename(zipfile_path)
+                 logs_dir = os.path.join(parent_path,'logs', os.path.normpath(str(model_name).replace(".zip","")))
+                 yield "\n".join(infos)
+             else:
+                 print("Unzip error.")
+                 infos.append("Unzip error.")
+                 yield "\n".join(infos)
+
+         index_file = False
+         model_file = False
+         D_file = False
+         G_file = False
+
+         for path, subdirs, files in os.walk(unzips_path):
+             for item in files:
+                 item_path = os.path.join(path, item)
+                 if not 'G_' in item and not 'D_' in item and item.endswith('.pth'):
+                     model_file = True
+                     model_name = item.replace(".pth","")
+                     logs_dir = os.path.join(parent_path,'logs', model_name)
+                     if os.path.exists(logs_dir):
+                         shutil.rmtree(logs_dir)
+                     os.mkdir(logs_dir)
+                     if not os.path.exists(weights_path):
+                         os.mkdir(weights_path)
+                     if os.path.exists(os.path.join(weights_path, item)):
+                         os.remove(os.path.join(weights_path, item))
+                     if os.path.exists(item_path):
+                         shutil.move(item_path, weights_path)
+
+         if not model_file and not os.path.exists(logs_dir):
+             os.mkdir(logs_dir)
+         for path, subdirs, files in os.walk(unzips_path):
+             for item in files:
+                 item_path = os.path.join(path, item)
+                 if item.startswith('added_') and item.endswith('.index'):
+                     index_file = True
+                     if os.path.exists(item_path):
+                         if os.path.exists(os.path.join(logs_dir, item)):
+                             os.remove(os.path.join(logs_dir, item))
+                         shutil.move(item_path, logs_dir)
+                 if item.startswith('total_fea.npy') or item.startswith('events.'):
+                     if os.path.exists(item_path):
+                         if os.path.exists(os.path.join(logs_dir, item)):
+                             os.remove(os.path.join(logs_dir, item))
+                         shutil.move(item_path, logs_dir)
+
+
+         result = ""
+         if model_file:
+             if index_file:
+                 print("The model works for inference, and has the .index file.")
+                 infos.append("\n" + "The model works for inference, and has the .index file.")
+                 yield "\n".join(infos)
+             else:
+                 print("The model works for inference, but it doesn't have the .index file.")
+                 infos.append("\n" + "The model works for inference, but it doesn't have the .index file.")
+                 yield "\n".join(infos)
+
+         if not index_file and not model_file:
+             print("No relevant file was found to upload.")
+             infos.append("No relevant file was found to upload.")
+             yield "\n".join(infos)
+
+         if os.path.exists(zips_path):
+             shutil.rmtree(zips_path)
+         if os.path.exists(unzips_path):
+             shutil.rmtree(unzips_path)
+         os.chdir(parent_path)
+         return result
+     except Exception as e:
+         os.chdir(parent_path)
+         if "too much use" in str(e):
+             print("Too many users have recently viewed or downloaded this file")
+             yield "Too many users have recently viewed or downloaded this file"
+         elif "private link" in str(e):
+             print("Cannot get file from this private link")
+             yield "Cannot get file from this private link"
+         else:
+             print(e)
+             yield "An error occurred downloading"
+     finally:
+         os.chdir(parent_path)
+
+ def save_to_wav(record_button):
+     if record_button is None:
+         pass
+     else:
+         path_to_file=record_button
+         new_name = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+'.wav'
+         new_path='./audios/'+new_name
+         shutil.move(path_to_file,new_path)
+         return new_name
+
+ def change_choices2():
+     audio_paths=[]
+     for filename in os.listdir("./audios"):
+         if filename.endswith(('wav', 'mp3', 'flac', 'ogg', 'opus',
+                               'm4a', 'mp4', 'aac', 'alac', 'wma',
+                               'aiff', 'webm', 'ac3')):
+             audio_paths.append(os.path.join('./audios',filename).replace('\\', '/'))
+     return {"choices": sorted(audio_paths), "__type__": "update"}, {"__type__": "update"}
+
+ sup_audioext = {'wav', 'mp3', 'flac', 'ogg', 'opus',
+                 'm4a', 'mp4', 'aac', 'alac', 'wma',
+                 'aiff', 'webm', 'ac3'}
+
+ def load_downloaded_audio(url):
+     parent_path = find_folder_parent(".", "pretrained_v2")
+     try:
+         infos = []
+         audios_path = os.path.join(parent_path, 'audios')
+         zips_path = os.path.join(parent_path, 'zips')
+
+         if not os.path.exists(audios_path):
+             os.mkdir(audios_path)
+
+         download_file = download_from_url(url)
+         if not download_file:
+             print("The file could not be downloaded.")
+             infos.append("The file could not be downloaded.")
+             yield "\n".join(infos)
+         elif download_file == "downloaded":
+             print("It has been downloaded successfully.")
+             infos.append("It has been downloaded successfully.")
+             yield "\n".join(infos)
+         elif download_file == "too much use":
+             raise Exception("Too many users have recently viewed or downloaded this file")
+         elif download_file == "private link":
+             raise Exception("Cannot get file from this private link")
+
+         for filename in os.listdir(zips_path):
+             item_path = os.path.join(zips_path, filename)
+             if item_path.split('.')[-1] in sup_audioext:
+                 if os.path.exists(item_path):
+                     shutil.move(item_path, audios_path)
+
+         result = ""
+         print("Audio files have been moved to the 'audios' folder.")
+         infos.append("Audio files have been moved to the 'audios' folder.")
+         yield "\n".join(infos)
+
+         os.chdir(parent_path)
+         return result
+     except Exception as e:
+         os.chdir(parent_path)
+         if "too much use" in str(e):
+             print("Too many users have recently viewed or downloaded this file")
+             yield "Too many users have recently viewed or downloaded this file"
+         elif "private link" in str(e):
+             print("Cannot get file from this private link")
+             yield "Cannot get file from this private link"
+         else:
+             print(e)
+             yield "An error occurred downloading"
+     finally:
+         os.chdir(parent_path)
+
+
+ class error_message(Exception):
+     def __init__(self, mensaje):
+         self.mensaje = mensaje
+         super().__init__(mensaje)
+
+ def get_vc(sid, to_return_protect0, to_return_protect1):
+     global n_spk, tgt_sr, net_g, vc, cpt, version
+     if sid == "" or sid == []:
+         global hubert_model
+         if hubert_model is not None:
+             print("clean_empty_cache")
+             del net_g, n_spk, vc, hubert_model, tgt_sr
+             hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             if_f0 = cpt.get("f0", 1)
+             version = cpt.get("version", "v1")
+             if version == "v1":
+                 if if_f0 == 1:
+                     net_g = SynthesizerTrnMs256NSFsid(
+                         *cpt["config"], is_half=config.is_half
+                     )
+                 else:
+                     net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+             elif version == "v2":
+                 if if_f0 == 1:
+                     net_g = SynthesizerTrnMs768NSFsid(
+                         *cpt["config"], is_half=config.is_half
+                     )
+                 else:
+                     net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+             del net_g, cpt
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             cpt = None
+         return (
+             {"visible": False, "__type__": "update"},
+             {"visible": False, "__type__": "update"},
+             {"visible": False, "__type__": "update"},
+         )
+     person = "%s/%s" % (weight_root, sid)
+     print("loading %s" % person)
+     cpt = torch.load(person, map_location="cpu")
+     tgt_sr = cpt["config"][-1]
+     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
+     if_f0 = cpt.get("f0", 1)
+     if if_f0 == 0:
+         to_return_protect0 = to_return_protect1 = {
+             "visible": False,
+             "value": 0.5,
+             "__type__": "update",
+         }
+     else:
+         to_return_protect0 = {
+             "visible": True,
+             "value": to_return_protect0,
+             "__type__": "update",
+         }
+         to_return_protect1 = {
+             "visible": True,
+             "value": to_return_protect1,
+             "__type__": "update",
+         }
+     version = cpt.get("version", "v1")
+     if version == "v1":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+     elif version == "v2":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+     del net_g.enc_q
+     print(net_g.load_state_dict(cpt["weight"], strict=False))
+     net_g.eval().to(config.device)
+     if config.is_half:
+         net_g = net_g.half()
+     else:
+         net_g = net_g.float()
+     vc = VC(tgt_sr, config)
+     n_spk = cpt["config"][-3]
+     return (
+         {"visible": True, "maximum": n_spk, "__type__": "update"},
+         to_return_protect0,
+         to_return_protect1,
+     )
+
+ def_download = "https://huggingface.co/Kuma6/Satoru-Gojo/resolve/main/Gojo.zip"
+
+ def download_model():
+     gr.Markdown(value="# " + "Download Model")
+     gr.Markdown(value="It is used to download your inference models.")
+     with gr.Row():
+         model_url=gr.Textbox(label="Url:", value=def_download)
+     with gr.Row():
+         download_model_status_bar=gr.Textbox(label="Status:")
+     with gr.Row():
+         download_button=gr.Button("Download")
+         download_button.click(fn=load_downloaded_model, inputs=[model_url], outputs=[download_model_status_bar])
+
+ def download_audio():
+     gr.Markdown(value="# " + "Download Audio")
+     gr.Markdown(value="Download audios of any format for use in inference (Recommended for Mobile Users).")
+     with gr.Row():
+         audio_url=gr.Textbox(label="Url:")
+     with gr.Row():
+         download_audio_status_bar=gr.Textbox(label="Status:")
+     with gr.Row():
+         download_button2=gr.Button("Download")
+         download_button2.click(fn=load_downloaded_audio, inputs=[audio_url], outputs=[download_audio_status_bar])
+
+ def get_edge_voice():
+     completed_process = subprocess.run(['edge-tts',"-l"], capture_output=True, text=True)
+     lines = completed_process.stdout.strip().split("\n")
+     data = []
+     current_entry = {}
+     for line in lines:
+         if line.startswith("Name: "):
+             if current_entry:
+                 data.append(current_entry)
+             current_entry = {"Name": line.split(": ")[1]}
+         elif line.startswith("Gender: "):
+             current_entry["Gender"] = line.split(": ")[1]
+     if current_entry:
+         data.append(current_entry)
+     tts_voice = []
+     for entry in data:
+         name = entry["Name"]
+         gender = entry["Gender"]
+         formatted_entry = f'{name}-{gender}'
+         tts_voice.append(formatted_entry)
+     return tts_voice
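For a feel of the small helpers above, here is a self-contained restatement of format_title (kept verbatim in logic) with an illustrative input; importing easy_infer directly would trigger its module-level setup, so the sketch inlines the function instead:

import re

def format_title(title):
    # Strip everything except word chars, whitespace, and hyphens,
    # then swap spaces for underscores (mirrors easy_infer.format_title).
    return re.sub(r"[^\w\s-]", "", title).replace(" ", "_")

print(format_title("My Model! (v2)"))  # -> My_Model_v2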
formantshiftcfg/Put your formantshift presets here as a txt file ADDED
File without changes
formantshiftcfg/f2m.txt ADDED
@@ -0,0 +1,2 @@
+ 1.0
+ 0.8
formantshiftcfg/m2f.txt ADDED
@@ -0,0 +1,2 @@
+ 1.0
+ 1.2
formantshiftcfg/random.txt ADDED
@@ -0,0 +1,2 @@
+ 32.0
+ 9.8
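Each preset is just two numbers; given how load_audio in infer/lib/audio.py below feeds a quefrency (-q) and a timbre (-t) value to stftpitchshift, these two lines presumably map to those parameters, in that order. A sketch of reading one preset under that assumption:

# Read a formant-shift preset; the two lines are assumed to be the
# quefrency and timbre values passed to stftpitchshift as -q and -t.
with open("formantshiftcfg/f2m.txt") as f:
    quefrency, timbre = (float(line) for line in f.read().split())
print(quefrency, timbre)  # 1.0 0.8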
infer/lib/audio.py ADDED
@@ -0,0 +1,197 @@
+ import librosa
+ import numpy as np
+ import av
+ from io import BytesIO
+ import ffmpeg
+ import os
+ import sys
+
+ import random
+ from infer.lib.csvutil import CSVutil
+ # import csv
+
+ platform_stft_mapping = {
+     'linux': 'stftpitchshift',
+     'darwin': 'stftpitchshift',
+     'win32': 'stftpitchshift.exe',
+ }
+
+ stft = platform_stft_mapping.get(sys.platform)
+
+ def wav2(i, o, format):
+     inp = av.open(i, 'rb')
+     if format == "m4a": format = "mp4"
+     out = av.open(o, 'wb', format=format)
+     if format == "ogg": format = "libvorbis"
+     if format == "mp4": format = "aac"
+
+     ostream = out.add_stream(format)
+
+     for frame in inp.decode(audio=0):
+         for p in ostream.encode(frame): out.mux(p)
+
+     for p in ostream.encode(None): out.mux(p)
+
+     out.close()
+     inp.close()
+
+ def audio2(i, o, format, sr):
+     inp = av.open(i, 'rb')
+     out = av.open(o, 'wb', format=format)
+     if format == "ogg": format = "libvorbis"
+     if format == "f32le": format = "pcm_f32le"
+
+     ostream = out.add_stream(format, channels=1)
+     ostream.sample_rate = sr
+
+     for frame in inp.decode(audio=0):
+         for p in ostream.encode(frame): out.mux(p)
+
+     out.close()
+     inp.close()
+
+ def load_audion(file, sr):
+     try:
+         file = (
+             file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+         )  # guard against pasted paths carrying stray spaces, quotes, or newlines
+         with open(file, "rb") as f:
+             with BytesIO() as out:
+                 audio2(f, out, "f32le", sr)
+                 return np.frombuffer(out.getvalue(), np.float32).flatten()
+
+     except AttributeError:
+         audio = file[1] / 32768.0
+         if len(audio.shape) == 2:
+             audio = np.mean(audio, -1)
+         return librosa.resample(audio, orig_sr=file[0], target_sr=16000)
+
+     except Exception as e:
+         raise RuntimeError(f"Failed to load audio: {e}")
+
+
+
+
+ def load_audio(file, sr, DoFormant=False, Quefrency=1.0, Timbre=1.0):
+     converted = False
+     DoFormant, Quefrency, Timbre = CSVutil("csvdb/formanting.csv", "r", "formanting")
+     try:
+         # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
+         # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+         # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+         file = (
+             file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+         )  # guard against pasted paths carrying stray spaces, quotes, or newlines
+         file_formanted = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+
+         # print(f"dofor={bool(DoFormant)} timbr={Timbre} quef={Quefrency}\n")
+
+         if (
+             lambda DoFormant: True
+             if DoFormant.lower() == "true"
+             else (False if DoFormant.lower() == "false" else DoFormant)
+         )(DoFormant):
+             numerator = round(random.uniform(1, 4), 4)
+             # os.system(f"stftpitchshift -i {file} -q {Quefrency} -t {Timbre} -o {file_formanted}")
+             # print('stftpitchshift -i "%s" -p 1.0 --rms -w 128 -v 8 -q %s -t %s -o "%s"' % (file, Quefrency, Timbre, file_formanted))
+
+             if not file.endswith(".wav"):
+                 if not os.path.isfile(f"{file_formanted}.wav"):
+                     converted = True
+                     # print(f"\nfile = {file}\n")
+                     # print(f"\nfile_formanted = {file_formanted}\n")
+                     converting = (
+                         ffmpeg.input(file_formanted, threads=0)
+                         .output(f"{file_formanted}.wav")
+                         .run(
+                             cmd=["ffmpeg", "-nostdin"],
+                             capture_stdout=True,
+                             capture_stderr=True,
+                         )
+                     )
+                 else:
+                     pass
+
+             file_formanted = (
+                 f"{file_formanted}.wav"
+                 if not file_formanted.endswith(".wav")
+                 else file_formanted
+             )
+
+             print(f" · Formanting {file_formanted}...\n")
+
+             os.system(
+                 '%s -i "%s" -q "%s" -t "%s" -o "%sFORMANTED_%s.wav"'
+                 % (
+                     stft,
+                     file_formanted,
+                     Quefrency,
+                     Timbre,
+                     file_formanted,
+                     str(numerator),
+                 )
+             )
+
+             print(f" · Formanted {file_formanted}!\n")
+
+             # filepraat = (os.path.abspath(os.getcwd()) + '\\' + file).replace('/','\\')
+             # file_formantedpraat = ('"' + os.path.abspath(os.getcwd()) + '/' + 'formanted'.join(file_formanted) + '"').replace('/','\\')
+             # print("%sFORMANTED_%s.wav" % (file_formanted, str(numerator)))
+
+             out, _ = (
+                 ffmpeg.input(
+                     "%sFORMANTED_%s.wav" % (file_formanted, str(numerator)), threads=0
+                 )
+                 .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
+                 .run(
+                     cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True
+                 )
+             )
+
+             try:
+                 os.remove("%sFORMANTED_%s.wav" % (file_formanted, str(numerator)))
+             except Exception:
+                 pass
+                 print("couldn't remove formanted type of file")
+
+         else:
+             out, _ = (
+                 ffmpeg.input(file, threads=0)
+                 .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
+                 .run(
+                     cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True
+                 )
+             )
+     except Exception as e:
+         raise RuntimeError(f"Failed to load audio: {e}")
+
+     if converted:
+         try:
+             os.remove(file_formanted)
+         except Exception:
+             pass
+             print("couldn't remove converted type of file")
+         converted = False
+
+     return np.frombuffer(out, np.float32).flatten()
+
+
+ def check_audio_duration(file):
+     try:
+         file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+
+         probe = ffmpeg.probe(file)
+
+         duration = float(probe['streams'][0]['duration'])
+
+         if duration < 0.76:
+             print(
+                 f"\n------------\n"
+                 f"Audio file, {file.split('/')[-1]}, under ~0.76s detected - file is too short. Target at least 1-2s for best results."
+                 f"\n------------\n\n"
+             )
+             return False
+
+         return True
+     except Exception as e:
+         raise RuntimeError(f"Failed to check audio duration: {e}")
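load_audio always returns a mono float32 array at the requested rate (that is the f32le / ac=1 / ar=sr decode), and it reads the formanting flag from csvdb/formanting.csv before doing anything else. A hedged usage sketch with a placeholder file name, run from the repo root:

from infer.lib.csvutil import CSVutil
from infer.lib.audio import load_audio

# The UI normally writes csvdb/formanting.csv; seed it here so the
# read at the top of load_audio succeeds (formanting disabled).
CSVutil("csvdb/formanting.csv", "w", "formanting", False, 1.0, 1.0)

wav = load_audio("audios/example.wav", 16000)  # placeholder path
print(wav.dtype, len(wav) / 16000)             # float32, duration in seconds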
infer/lib/csvutil.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import numpy as np
3
+
4
+ # import praatio
5
+ # import praatio.praat_scripts
6
+ import os
7
+ import sys
8
+
9
+ import random
10
+
11
+ import csv
12
+
13
+ # praatEXE = join('.',os.path.abspath(os.getcwd()) + r"\Praat.exe")
14
+
15
+
16
+ def CSVutil(file, rw, type, *args):
17
+ if type == "formanting":
18
+ if rw == "r":
19
+ with open(file) as fileCSVread:
20
+ csv_reader = list(csv.reader(fileCSVread))
21
+ return (
22
+ (csv_reader[0][0], csv_reader[0][1], csv_reader[0][2])
23
+ if csv_reader is not None
24
+ else (lambda: exec('raise ValueError("No data")'))()
25
+ )
26
+ else:
27
+ if args:
28
+ doformnt = args[0]
29
+ else:
30
+ doformnt = False
31
+ qfr = args[1] if len(args) > 1 else 1.0
32
+ tmb = args[2] if len(args) > 2 else 1.0
33
+ with open(file, rw, newline="") as fileCSVwrite:
34
+ csv_writer = csv.writer(fileCSVwrite, delimiter=",")
35
+ csv_writer.writerow([doformnt, qfr, tmb])
36
+ elif type == "stop":
37
+ stop = args[0] if args else False
38
+ with open(file, rw, newline="") as fileCSVwrite:
39
+ csv_writer = csv.writer(fileCSVwrite, delimiter=",")
40
+ csv_writer.writerow([stop])
41
+
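
A quick usage sketch for `CSVutil`; the concrete paths are illustrative, though the repo does keep these one-row state files under `csvdb/`. Note that values read back come out of the CSV as strings ("True", "8.0", "1.2"), so callers must convert them:

```python
from infer.lib.csvutil import CSVutil

# Write the formant-shift state: enabled flag, quefrency, timbre.
CSVutil("csvdb/formanting.csv", "w+", "formanting", True, 8.0, 1.2)

# Read it back; each field is returned as a string.
DoFormant, Quefrency, Timbre = CSVutil("csvdb/formanting.csv", "r", "formanting")

# The stop flag uses the same helper with type="stop".
CSVutil("csvdb/stop.csv", "w+", "stop", False)
```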
infer/lib/infer_pack/attentions.py ADDED
@@ -0,0 +1,417 @@
1
+ import copy
2
+ import math
3
+
4
+ import numpy as np
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+ from infer.lib.infer_pack import commons, modules
10
+ from infer.lib.infer_pack.modules import LayerNorm
11
+
12
+
13
+ class Encoder(nn.Module):
14
+ def __init__(
15
+ self,
16
+ hidden_channels,
17
+ filter_channels,
18
+ n_heads,
19
+ n_layers,
20
+ kernel_size=1,
21
+ p_dropout=0.0,
22
+ window_size=10,
23
+ **kwargs
24
+ ):
25
+ super().__init__()
26
+ self.hidden_channels = hidden_channels
27
+ self.filter_channels = filter_channels
28
+ self.n_heads = n_heads
29
+ self.n_layers = n_layers
30
+ self.kernel_size = kernel_size
31
+ self.p_dropout = p_dropout
32
+ self.window_size = window_size
33
+
34
+ self.drop = nn.Dropout(p_dropout)
35
+ self.attn_layers = nn.ModuleList()
36
+ self.norm_layers_1 = nn.ModuleList()
37
+ self.ffn_layers = nn.ModuleList()
38
+ self.norm_layers_2 = nn.ModuleList()
39
+ for i in range(self.n_layers):
40
+ self.attn_layers.append(
41
+ MultiHeadAttention(
42
+ hidden_channels,
43
+ hidden_channels,
44
+ n_heads,
45
+ p_dropout=p_dropout,
46
+ window_size=window_size,
47
+ )
48
+ )
49
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
50
+ self.ffn_layers.append(
51
+ FFN(
52
+ hidden_channels,
53
+ hidden_channels,
54
+ filter_channels,
55
+ kernel_size,
56
+ p_dropout=p_dropout,
57
+ )
58
+ )
59
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
60
+
61
+ def forward(self, x, x_mask):
62
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
63
+ x = x * x_mask
64
+ for i in range(self.n_layers):
65
+ y = self.attn_layers[i](x, x, attn_mask)
66
+ y = self.drop(y)
67
+ x = self.norm_layers_1[i](x + y)
68
+
69
+ y = self.ffn_layers[i](x, x_mask)
70
+ y = self.drop(y)
71
+ x = self.norm_layers_2[i](x + y)
72
+ x = x * x_mask
73
+ return x
74
+
75
+
76
+ class Decoder(nn.Module):
77
+ def __init__(
78
+ self,
79
+ hidden_channels,
80
+ filter_channels,
81
+ n_heads,
82
+ n_layers,
83
+ kernel_size=1,
84
+ p_dropout=0.0,
85
+ proximal_bias=False,
86
+ proximal_init=True,
87
+ **kwargs
88
+ ):
89
+ super().__init__()
90
+ self.hidden_channels = hidden_channels
91
+ self.filter_channels = filter_channels
92
+ self.n_heads = n_heads
93
+ self.n_layers = n_layers
94
+ self.kernel_size = kernel_size
95
+ self.p_dropout = p_dropout
96
+ self.proximal_bias = proximal_bias
97
+ self.proximal_init = proximal_init
98
+
99
+ self.drop = nn.Dropout(p_dropout)
100
+ self.self_attn_layers = nn.ModuleList()
101
+ self.norm_layers_0 = nn.ModuleList()
102
+ self.encdec_attn_layers = nn.ModuleList()
103
+ self.norm_layers_1 = nn.ModuleList()
104
+ self.ffn_layers = nn.ModuleList()
105
+ self.norm_layers_2 = nn.ModuleList()
106
+ for i in range(self.n_layers):
107
+ self.self_attn_layers.append(
108
+ MultiHeadAttention(
109
+ hidden_channels,
110
+ hidden_channels,
111
+ n_heads,
112
+ p_dropout=p_dropout,
113
+ proximal_bias=proximal_bias,
114
+ proximal_init=proximal_init,
115
+ )
116
+ )
117
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
118
+ self.encdec_attn_layers.append(
119
+ MultiHeadAttention(
120
+ hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
121
+ )
122
+ )
123
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
124
+ self.ffn_layers.append(
125
+ FFN(
126
+ hidden_channels,
127
+ hidden_channels,
128
+ filter_channels,
129
+ kernel_size,
130
+ p_dropout=p_dropout,
131
+ causal=True,
132
+ )
133
+ )
134
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
135
+
136
+ def forward(self, x, x_mask, h, h_mask):
137
+ """
138
+ x: decoder input
139
+ h: encoder output
140
+ """
141
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
142
+ device=x.device, dtype=x.dtype
143
+ )
144
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
145
+ x = x * x_mask
146
+ for i in range(self.n_layers):
147
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
148
+ y = self.drop(y)
149
+ x = self.norm_layers_0[i](x + y)
150
+
151
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
152
+ y = self.drop(y)
153
+ x = self.norm_layers_1[i](x + y)
154
+
155
+ y = self.ffn_layers[i](x, x_mask)
156
+ y = self.drop(y)
157
+ x = self.norm_layers_2[i](x + y)
158
+ x = x * x_mask
159
+ return x
160
+
161
+
162
+ class MultiHeadAttention(nn.Module):
163
+ def __init__(
164
+ self,
165
+ channels,
166
+ out_channels,
167
+ n_heads,
168
+ p_dropout=0.0,
169
+ window_size=None,
170
+ heads_share=True,
171
+ block_length=None,
172
+ proximal_bias=False,
173
+ proximal_init=False,
174
+ ):
175
+ super().__init__()
176
+ assert channels % n_heads == 0
177
+
178
+ self.channels = channels
179
+ self.out_channels = out_channels
180
+ self.n_heads = n_heads
181
+ self.p_dropout = p_dropout
182
+ self.window_size = window_size
183
+ self.heads_share = heads_share
184
+ self.block_length = block_length
185
+ self.proximal_bias = proximal_bias
186
+ self.proximal_init = proximal_init
187
+ self.attn = None
188
+
189
+ self.k_channels = channels // n_heads
190
+ self.conv_q = nn.Conv1d(channels, channels, 1)
191
+ self.conv_k = nn.Conv1d(channels, channels, 1)
192
+ self.conv_v = nn.Conv1d(channels, channels, 1)
193
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
194
+ self.drop = nn.Dropout(p_dropout)
195
+
196
+ if window_size is not None:
197
+ n_heads_rel = 1 if heads_share else n_heads
198
+ rel_stddev = self.k_channels**-0.5
199
+ self.emb_rel_k = nn.Parameter(
200
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
201
+ * rel_stddev
202
+ )
203
+ self.emb_rel_v = nn.Parameter(
204
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
205
+ * rel_stddev
206
+ )
207
+
208
+ nn.init.xavier_uniform_(self.conv_q.weight)
209
+ nn.init.xavier_uniform_(self.conv_k.weight)
210
+ nn.init.xavier_uniform_(self.conv_v.weight)
211
+ if proximal_init:
212
+ with torch.no_grad():
213
+ self.conv_k.weight.copy_(self.conv_q.weight)
214
+ self.conv_k.bias.copy_(self.conv_q.bias)
215
+
216
+ def forward(self, x, c, attn_mask=None):
217
+ q = self.conv_q(x)
218
+ k = self.conv_k(c)
219
+ v = self.conv_v(c)
220
+
221
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
222
+
223
+ x = self.conv_o(x)
224
+ return x
225
+
226
+ def attention(self, query, key, value, mask=None):
227
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
228
+ b, d, t_s, t_t = (*key.size(), query.size(2))
229
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
230
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
231
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
232
+
233
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
234
+ if self.window_size is not None:
235
+ assert (
236
+ t_s == t_t
237
+ ), "Relative attention is only available for self-attention."
238
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
239
+ rel_logits = self._matmul_with_relative_keys(
240
+ query / math.sqrt(self.k_channels), key_relative_embeddings
241
+ )
242
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
243
+ scores = scores + scores_local
244
+ if self.proximal_bias:
245
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
246
+ scores = scores + self._attention_bias_proximal(t_s).to(
247
+ device=scores.device, dtype=scores.dtype
248
+ )
249
+ if mask is not None:
250
+ scores = scores.masked_fill(mask == 0, -1e4)
251
+ if self.block_length is not None:
252
+ assert (
253
+ t_s == t_t
254
+ ), "Local attention is only available for self-attention."
255
+ block_mask = (
256
+ torch.ones_like(scores)
257
+ .triu(-self.block_length)
258
+ .tril(self.block_length)
259
+ )
260
+ scores = scores.masked_fill(block_mask == 0, -1e4)
261
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
262
+ p_attn = self.drop(p_attn)
263
+ output = torch.matmul(p_attn, value)
264
+ if self.window_size is not None:
265
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
266
+ value_relative_embeddings = self._get_relative_embeddings(
267
+ self.emb_rel_v, t_s
268
+ )
269
+ output = output + self._matmul_with_relative_values(
270
+ relative_weights, value_relative_embeddings
271
+ )
272
+ output = (
273
+ output.transpose(2, 3).contiguous().view(b, d, t_t)
274
+ ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
275
+ return output, p_attn
276
+
277
+ def _matmul_with_relative_values(self, x, y):
278
+ """
279
+ x: [b, h, l, m]
280
+ y: [h or 1, m, d]
281
+ ret: [b, h, l, d]
282
+ """
283
+ ret = torch.matmul(x, y.unsqueeze(0))
284
+ return ret
285
+
286
+ def _matmul_with_relative_keys(self, x, y):
287
+ """
288
+ x: [b, h, l, d]
289
+ y: [h or 1, m, d]
290
+ ret: [b, h, l, m]
291
+ """
292
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
293
+ return ret
294
+
295
+ def _get_relative_embeddings(self, relative_embeddings, length):
296
+ max_relative_position = 2 * self.window_size + 1
297
+ # Pad first before slice to avoid using cond ops.
298
+ pad_length = max(length - (self.window_size + 1), 0)
299
+ slice_start_position = max((self.window_size + 1) - length, 0)
300
+ slice_end_position = slice_start_position + 2 * length - 1
301
+ if pad_length > 0:
302
+ padded_relative_embeddings = F.pad(
303
+ relative_embeddings,
304
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
305
+ )
306
+ else:
307
+ padded_relative_embeddings = relative_embeddings
308
+ used_relative_embeddings = padded_relative_embeddings[
309
+ :, slice_start_position:slice_end_position
310
+ ]
311
+ return used_relative_embeddings
312
+
313
+ def _relative_position_to_absolute_position(self, x):
314
+ """
315
+ x: [b, h, l, 2*l-1]
316
+ ret: [b, h, l, l]
317
+ """
318
+ batch, heads, length, _ = x.size()
319
+ # Concat columns of pad to shift from relative to absolute indexing.
320
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
321
+
322
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
323
+ x_flat = x.view([batch, heads, length * 2 * length])
324
+ x_flat = F.pad(
325
+ x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
326
+ )
327
+
328
+ # Reshape and slice out the padded elements.
329
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
330
+ :, :, :length, length - 1 :
331
+ ]
332
+ return x_final
333
+
334
+ def _absolute_position_to_relative_position(self, x):
335
+ """
336
+ x: [b, h, l, l]
337
+ ret: [b, h, l, 2*l-1]
338
+ """
339
+ batch, heads, length, _ = x.size()
340
+ # padd along column
341
+ x = F.pad(
342
+ x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
343
+ )
344
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
345
+ # add 0's in the beginning that will skew the elements after reshape
346
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
347
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
348
+ return x_final
349
+
350
+ def _attention_bias_proximal(self, length):
351
+ """Bias for self-attention to encourage attention to close positions.
352
+ Args:
353
+ length: an integer scalar.
354
+ Returns:
355
+ a Tensor with shape [1, 1, length, length]
356
+ """
357
+ r = torch.arange(length, dtype=torch.float32)
358
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
359
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
360
+
361
+
362
+ class FFN(nn.Module):
363
+ def __init__(
364
+ self,
365
+ in_channels,
366
+ out_channels,
367
+ filter_channels,
368
+ kernel_size,
369
+ p_dropout=0.0,
370
+ activation=None,
371
+ causal=False,
372
+ ):
373
+ super().__init__()
374
+ self.in_channels = in_channels
375
+ self.out_channels = out_channels
376
+ self.filter_channels = filter_channels
377
+ self.kernel_size = kernel_size
378
+ self.p_dropout = p_dropout
379
+ self.activation = activation
380
+ self.causal = causal
381
+
382
+ if causal:
383
+ self.padding = self._causal_padding
384
+ else:
385
+ self.padding = self._same_padding
386
+
387
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
388
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
389
+ self.drop = nn.Dropout(p_dropout)
390
+
391
+ def forward(self, x, x_mask):
392
+ x = self.conv_1(self.padding(x * x_mask))
393
+ if self.activation == "gelu":
394
+ x = x * torch.sigmoid(1.702 * x)
395
+ else:
396
+ x = torch.relu(x)
397
+ x = self.drop(x)
398
+ x = self.conv_2(self.padding(x * x_mask))
399
+ return x * x_mask
400
+
401
+ def _causal_padding(self, x):
402
+ if self.kernel_size == 1:
403
+ return x
404
+ pad_l = self.kernel_size - 1
405
+ pad_r = 0
406
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
407
+ x = F.pad(x, commons.convert_pad_shape(padding))
408
+ return x
409
+
410
+ def _same_padding(self, x):
411
+ if self.kernel_size == 1:
412
+ return x
413
+ pad_l = (self.kernel_size - 1) // 2
414
+ pad_r = self.kernel_size // 2
415
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
416
+ x = F.pad(x, commons.convert_pad_shape(padding))
417
+ return x
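
To make the tensor conventions in this file concrete, here is a small, hypothetical driver for `Encoder`; the hyperparameters are illustrative values loosely following the v2 configs, and `commons.sequence_mask` comes from the sibling module shown next:

```python
import torch
from infer.lib.infer_pack.attentions import Encoder
from infer.lib.infer_pack import commons

enc = Encoder(
    hidden_channels=192, filter_channels=768,
    n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.1,
)
x = torch.randn(2, 192, 100)               # [b, hidden, t]
lengths = torch.tensor([100, 80])          # real frame counts per item
x_mask = torch.unsqueeze(commons.sequence_mask(lengths, 100), 1).float()  # [b, 1, t]
y = enc(x, x_mask)                         # [b, hidden, t]; padded frames stay zeroed
```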
infer/lib/infer_pack/commons.py ADDED
@@ -0,0 +1,167 @@
1
+ import math
2
+
3
+ import numpy as np
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+
9
+ def init_weights(m, mean=0.0, std=0.01):
10
+ classname = m.__class__.__name__
11
+ if classname.find("Conv") != -1:
12
+ m.weight.data.normal_(mean, std)
13
+
14
+
15
+ def get_padding(kernel_size, dilation=1):
16
+ return int((kernel_size * dilation - dilation) / 2)
17
+
18
+
19
+ def convert_pad_shape(pad_shape):
20
+ l = pad_shape[::-1]
21
+ pad_shape = [item for sublist in l for item in sublist]
22
+ return pad_shape
23
+
24
+
25
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
26
+ """KL(P||Q)"""
27
+ kl = (logs_q - logs_p) - 0.5
28
+ kl += (
29
+ 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
30
+ )
31
+ return kl
32
+
33
+
34
+ def rand_gumbel(shape):
35
+ """Sample from the Gumbel distribution, protect from overflows."""
36
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
37
+ return -torch.log(-torch.log(uniform_samples))
38
+
39
+
40
+ def rand_gumbel_like(x):
41
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
42
+ return g
43
+
44
+
45
+ def slice_segments(x, ids_str, segment_size=4):
46
+ ret = torch.zeros_like(x[:, :, :segment_size])
47
+ for i in range(x.size(0)):
48
+ idx_str = ids_str[i]
49
+ idx_end = idx_str + segment_size
50
+ ret[i] = x[i, :, idx_str:idx_end]
51
+ return ret
52
+
53
+
54
+ def slice_segments2(x, ids_str, segment_size=4):
55
+ ret = torch.zeros_like(x[:, :segment_size])
56
+ for i in range(x.size(0)):
57
+ idx_str = ids_str[i]
58
+ idx_end = idx_str + segment_size
59
+ ret[i] = x[i, idx_str:idx_end]
60
+ return ret
61
+
62
+
63
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
64
+ b, d, t = x.size()
65
+ if x_lengths is None:
66
+ x_lengths = t
67
+ ids_str_max = x_lengths - segment_size + 1
68
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
69
+ ret = slice_segments(x, ids_str, segment_size)
70
+ return ret, ids_str
71
+
72
+
73
+ def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
74
+ position = torch.arange(length, dtype=torch.float)
75
+ num_timescales = channels // 2
76
+ log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
77
+ num_timescales - 1
78
+ )
79
+ inv_timescales = min_timescale * torch.exp(
80
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
81
+ )
82
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
83
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
84
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
85
+ signal = signal.view(1, channels, length)
86
+ return signal
87
+
88
+
89
+ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
90
+ b, channels, length = x.size()
91
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
92
+ return x + signal.to(dtype=x.dtype, device=x.device)
93
+
94
+
95
+ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
96
+ b, channels, length = x.size()
97
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
98
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
99
+
100
+
101
+ def subsequent_mask(length):
102
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
103
+ return mask
104
+
105
+
106
+ @torch.jit.script
107
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
108
+ n_channels_int = n_channels[0]
109
+ in_act = input_a + input_b
110
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
111
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
112
+ acts = t_act * s_act
113
+ return acts
114
+
115
+
116
+ def convert_pad_shape(pad_shape):
117
+ l = pad_shape[::-1]
118
+ pad_shape = [item for sublist in l for item in sublist]
119
+ return pad_shape
120
+
121
+
122
+ def shift_1d(x):
123
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
124
+ return x
125
+
126
+
127
+ def sequence_mask(length, max_length=None):
128
+ if max_length is None:
129
+ max_length = length.max()
130
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
131
+ return x.unsqueeze(0) < length.unsqueeze(1)
132
+
133
+
134
+ def generate_path(duration, mask):
135
+ """
136
+ duration: [b, 1, t_x]
137
+ mask: [b, 1, t_y, t_x]
138
+ """
139
+ device = duration.device
140
+
141
+ b, _, t_y, t_x = mask.shape
142
+ cum_duration = torch.cumsum(duration, -1)
143
+
144
+ cum_duration_flat = cum_duration.view(b * t_x)
145
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
146
+ path = path.view(b, t_x, t_y)
147
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
148
+ path = path.unsqueeze(1).transpose(2, 3) * mask
149
+ return path
150
+
151
+
152
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
153
+ if isinstance(parameters, torch.Tensor):
154
+ parameters = [parameters]
155
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
156
+ norm_type = float(norm_type)
157
+ if clip_value is not None:
158
+ clip_value = float(clip_value)
159
+
160
+ total_norm = 0
161
+ for p in parameters:
162
+ param_norm = p.grad.data.norm(norm_type)
163
+ total_norm += param_norm.item() ** norm_type
164
+ if clip_value is not None:
165
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
166
+ total_norm = total_norm ** (1.0 / norm_type)
167
+ return total_norm
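
As a sanity check on the slicing helpers above, this hedged snippet shows the shapes `rand_slice_segments` produces during training (the sizes are arbitrary example values):

```python
import torch
from infer.lib.infer_pack import commons

x = torch.randn(4, 192, 400)                  # [b, d, t] latent frames
lengths = torch.tensor([400, 350, 300, 250])  # valid frames per batch item
seg, ids = commons.rand_slice_segments(x, lengths, segment_size=32)
print(seg.shape)  # torch.Size([4, 192, 32])
print(ids)        # one random start frame per item, each < length - 32 + 1
```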
infer/lib/infer_pack/models.py ADDED
@@ -0,0 +1,1174 @@
1
+ import math
2
+ import logging
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+ import numpy as np
7
+ import torch
8
+ from torch import nn
9
+ from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
10
+ from torch.nn import functional as F
11
+ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
12
+
13
+ from infer.lib.infer_pack import attentions, commons, modules
14
+ from infer.lib.infer_pack.commons import get_padding, init_weights
15
+ has_xpu = bool(hasattr(torch, "xpu") and torch.xpu.is_available())
16
+
17
+ class TextEncoder256(nn.Module):
18
+ def __init__(
19
+ self,
20
+ out_channels,
21
+ hidden_channels,
22
+ filter_channels,
23
+ n_heads,
24
+ n_layers,
25
+ kernel_size,
26
+ p_dropout,
27
+ f0=True,
28
+ ):
29
+ super().__init__()
30
+ self.out_channels = out_channels
31
+ self.hidden_channels = hidden_channels
32
+ self.filter_channels = filter_channels
33
+ self.n_heads = n_heads
34
+ self.n_layers = n_layers
35
+ self.kernel_size = kernel_size
36
+ self.p_dropout = p_dropout
37
+ self.emb_phone = nn.Linear(256, hidden_channels)
38
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39
+ if f0:
40
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41
+ self.encoder = attentions.Encoder(
42
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43
+ )
44
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45
+
46
+ def forward(self, phone, pitch, lengths):
47
+ if pitch is None:
48
+ x = self.emb_phone(phone)
49
+ else:
50
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
51
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52
+ x = self.lrelu(x)
53
+ x = torch.transpose(x, 1, -1) # [b, h, t]
54
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55
+ x.dtype
56
+ )
57
+ x = self.encoder(x * x_mask, x_mask)
58
+ stats = self.proj(x) * x_mask
59
+
60
+ m, logs = torch.split(stats, self.out_channels, dim=1)
61
+ return m, logs, x_mask
62
+
63
+
64
+ class TextEncoder768(nn.Module):
65
+ def __init__(
66
+ self,
67
+ out_channels,
68
+ hidden_channels,
69
+ filter_channels,
70
+ n_heads,
71
+ n_layers,
72
+ kernel_size,
73
+ p_dropout,
74
+ f0=True,
75
+ ):
76
+ super().__init__()
77
+ self.out_channels = out_channels
78
+ self.hidden_channels = hidden_channels
79
+ self.filter_channels = filter_channels
80
+ self.n_heads = n_heads
81
+ self.n_layers = n_layers
82
+ self.kernel_size = kernel_size
83
+ self.p_dropout = p_dropout
84
+ self.emb_phone = nn.Linear(768, hidden_channels)
85
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
+ if f0:
87
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
+ self.encoder = attentions.Encoder(
89
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
+ )
91
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92
+
93
+ def forward(self, phone, pitch, lengths):
94
+ if pitch is None:
95
+ x = self.emb_phone(phone)
96
+ else:
97
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
98
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99
+ x = self.lrelu(x)
100
+ x = torch.transpose(x, 1, -1) # [b, h, t]
101
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102
+ x.dtype
103
+ )
104
+ x = self.encoder(x * x_mask, x_mask)
105
+ stats = self.proj(x) * x_mask
106
+
107
+ m, logs = torch.split(stats, self.out_channels, dim=1)
108
+ return m, logs, x_mask
109
+
110
+
111
+ class ResidualCouplingBlock(nn.Module):
112
+ def __init__(
113
+ self,
114
+ channels,
115
+ hidden_channels,
116
+ kernel_size,
117
+ dilation_rate,
118
+ n_layers,
119
+ n_flows=4,
120
+ gin_channels=0,
121
+ ):
122
+ super().__init__()
123
+ self.channels = channels
124
+ self.hidden_channels = hidden_channels
125
+ self.kernel_size = kernel_size
126
+ self.dilation_rate = dilation_rate
127
+ self.n_layers = n_layers
128
+ self.n_flows = n_flows
129
+ self.gin_channels = gin_channels
130
+
131
+ self.flows = nn.ModuleList()
132
+ for i in range(n_flows):
133
+ self.flows.append(
134
+ modules.ResidualCouplingLayer(
135
+ channels,
136
+ hidden_channels,
137
+ kernel_size,
138
+ dilation_rate,
139
+ n_layers,
140
+ gin_channels=gin_channels,
141
+ mean_only=True,
142
+ )
143
+ )
144
+ self.flows.append(modules.Flip())
145
+
146
+ def forward(self, x, x_mask, g=None, reverse=False):
147
+ if not reverse:
148
+ for flow in self.flows:
149
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
150
+ else:
151
+ for flow in reversed(self.flows):
152
+ x = flow(x, x_mask, g=g, reverse=reverse)
153
+ return x
154
+
155
+ def remove_weight_norm(self):
156
+ for i in range(self.n_flows):
157
+ self.flows[i * 2].remove_weight_norm()
158
+
159
+
160
+ class PosteriorEncoder(nn.Module):
161
+ def __init__(
162
+ self,
163
+ in_channels,
164
+ out_channels,
165
+ hidden_channels,
166
+ kernel_size,
167
+ dilation_rate,
168
+ n_layers,
169
+ gin_channels=0,
170
+ ):
171
+ super().__init__()
172
+ self.in_channels = in_channels
173
+ self.out_channels = out_channels
174
+ self.hidden_channels = hidden_channels
175
+ self.kernel_size = kernel_size
176
+ self.dilation_rate = dilation_rate
177
+ self.n_layers = n_layers
178
+ self.gin_channels = gin_channels
179
+
180
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
181
+ self.enc = modules.WN(
182
+ hidden_channels,
183
+ kernel_size,
184
+ dilation_rate,
185
+ n_layers,
186
+ gin_channels=gin_channels,
187
+ )
188
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
189
+
190
+ def forward(self, x, x_lengths, g=None):
191
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
192
+ x.dtype
193
+ )
194
+ x = self.pre(x) * x_mask
195
+ x = self.enc(x, x_mask, g=g)
196
+ stats = self.proj(x) * x_mask
197
+ m, logs = torch.split(stats, self.out_channels, dim=1)
198
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
199
+ return z, m, logs, x_mask
200
+
201
+ def remove_weight_norm(self):
202
+ self.enc.remove_weight_norm()
203
+
204
+
205
+ class Generator(torch.nn.Module):
206
+ def __init__(
207
+ self,
208
+ initial_channel,
209
+ resblock,
210
+ resblock_kernel_sizes,
211
+ resblock_dilation_sizes,
212
+ upsample_rates,
213
+ upsample_initial_channel,
214
+ upsample_kernel_sizes,
215
+ gin_channels=0,
216
+ ):
217
+ super(Generator, self).__init__()
218
+ self.num_kernels = len(resblock_kernel_sizes)
219
+ self.num_upsamples = len(upsample_rates)
220
+ self.conv_pre = Conv1d(
221
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
222
+ )
223
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
224
+
225
+ self.ups = nn.ModuleList()
226
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
227
+ self.ups.append(
228
+ weight_norm(
229
+ ConvTranspose1d(
230
+ upsample_initial_channel // (2**i),
231
+ upsample_initial_channel // (2 ** (i + 1)),
232
+ k,
233
+ u,
234
+ padding=(k - u) // 2,
235
+ )
236
+ )
237
+ )
238
+
239
+ self.resblocks = nn.ModuleList()
240
+ for i in range(len(self.ups)):
241
+ ch = upsample_initial_channel // (2 ** (i + 1))
242
+ for j, (k, d) in enumerate(
243
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
244
+ ):
245
+ self.resblocks.append(resblock(ch, k, d))
246
+
247
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
248
+ self.ups.apply(init_weights)
249
+
250
+ if gin_channels != 0:
251
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
252
+
253
+ def forward(self, x, g=None):
254
+ x = self.conv_pre(x)
255
+ if g is not None:
256
+ x = x + self.cond(g)
257
+
258
+ for i in range(self.num_upsamples):
259
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
260
+ x = self.ups[i](x)
261
+ xs = None
262
+ for j in range(self.num_kernels):
263
+ if xs is None:
264
+ xs = self.resblocks[i * self.num_kernels + j](x)
265
+ else:
266
+ xs += self.resblocks[i * self.num_kernels + j](x)
267
+ x = xs / self.num_kernels
268
+ x = F.leaky_relu(x)
269
+ x = self.conv_post(x)
270
+ x = torch.tanh(x)
271
+
272
+ return x
273
+
274
+ def remove_weight_norm(self):
275
+ for l in self.ups:
276
+ remove_weight_norm(l)
277
+ for l in self.resblocks:
278
+ l.remove_weight_norm()
279
+
280
+
281
+ class SineGen(torch.nn.Module):
282
+ """Definition of sine generator
283
+ SineGen(samp_rate, harmonic_num = 0,
284
+ sine_amp = 0.1, noise_std = 0.003,
285
+ voiced_threshold = 0,
286
+ flag_for_pulse=False)
287
+ samp_rate: sampling rate in Hz
288
+ harmonic_num: number of harmonic overtones (default 0)
289
+ sine_amp: amplitude of sine waveform (default 0.1)
290
+ noise_std: std of Gaussian noise (default 0.003)
291
+ voiced_threshold: F0 threshold for U/V classification (default 0)
292
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
293
+ Note: when flag_for_pulse is True, the first time step of a voiced
294
+ segment is always sin(np.pi) or cos(0)
295
+ """
296
+
297
+ def __init__(
298
+ self,
299
+ samp_rate,
300
+ harmonic_num=0,
301
+ sine_amp=0.1,
302
+ noise_std=0.003,
303
+ voiced_threshold=0,
304
+ flag_for_pulse=False,
305
+ ):
306
+ super(SineGen, self).__init__()
307
+ self.sine_amp = sine_amp
308
+ self.noise_std = noise_std
309
+ self.harmonic_num = harmonic_num
310
+ self.dim = self.harmonic_num + 1
311
+ self.sampling_rate = samp_rate
312
+ self.voiced_threshold = voiced_threshold
313
+
314
+ def _f02uv(self, f0):
315
+ # generate uv signal
316
+ uv = torch.ones_like(f0)
317
+ uv = uv * (f0 > self.voiced_threshold)
318
+ if uv.device.type == "privateuseone": # for DirectML
319
+ uv = uv.float()
320
+ return uv
321
+
322
+ def forward(self, f0, upp):
323
+ """sine_tensor, uv = forward(f0)
324
+ input F0: tensor(batchsize=1, length, dim=1)
325
+ f0 for unvoiced steps should be 0
326
+ output sine_tensor: tensor(batchsize=1, length, dim)
327
+ output uv: tensor(batchsize=1, length, 1)
328
+ """
329
+ with torch.no_grad():
330
+ f0 = f0[:, None].transpose(1, 2)
331
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
332
+ # fundamental component
333
+ f0_buf[:, :, 0] = f0[:, :, 0]
334
+ for idx in np.arange(self.harmonic_num):
335
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
336
+ idx + 2
337
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
338
+ rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 means the n_har products cannot be optimized away in post-processing
339
+ rand_ini = torch.rand(
340
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
341
+ )
342
+ rand_ini[:, 0] = 0
343
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
344
+ tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  (applying % 1 here would prevent the later cumsum from being optimized)
345
+ tmp_over_one *= upp
346
+ tmp_over_one = F.interpolate(
347
+ tmp_over_one.transpose(2, 1),
348
+ scale_factor=upp,
349
+ mode="linear",
350
+ align_corners=True,
351
+ ).transpose(2, 1)
352
+ rad_values = F.interpolate(
353
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
354
+ ).transpose(
355
+ 2, 1
356
+ ) #######
357
+ tmp_over_one %= 1
358
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
359
+ cumsum_shift = torch.zeros_like(rad_values)
360
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
361
+ sine_waves = torch.sin(
362
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
363
+ )
364
+ sine_waves = sine_waves * self.sine_amp
365
+ uv = self._f02uv(f0)
366
+ uv = F.interpolate(
367
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
368
+ ).transpose(2, 1)
369
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
370
+ noise = noise_amp * torch.randn_like(sine_waves)
371
+ sine_waves = sine_waves * uv + noise
372
+ return sine_waves, uv, noise
373
+
374
+
375
+ class SourceModuleHnNSF(torch.nn.Module):
376
+ """SourceModule for hn-nsf
377
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
378
+ add_noise_std=0.003, voiced_threshod=0)
379
+ sampling_rate: sampling_rate in Hz
380
+ harmonic_num: number of harmonic above F0 (default: 0)
381
+ sine_amp: amplitude of sine source signal (default: 0.1)
382
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
383
+ note that amplitude of noise in unvoiced is decided
384
+ by sine_amp
385
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
386
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
387
+ F0_sampled (batchsize, length, 1)
388
+ Sine_source (batchsize, length, 1)
389
+ noise_source (batchsize, length 1)
390
+ uv (batchsize, length, 1)
391
+ """
392
+
393
+ def __init__(
394
+ self,
395
+ sampling_rate,
396
+ harmonic_num=0,
397
+ sine_amp=0.1,
398
+ add_noise_std=0.003,
399
+ voiced_threshod=0,
400
+ is_half=True,
401
+ ):
402
+ super(SourceModuleHnNSF, self).__init__()
403
+
404
+ self.sine_amp = sine_amp
405
+ self.noise_std = add_noise_std
406
+ self.is_half = is_half
407
+ # to produce sine waveforms
408
+ self.l_sin_gen = SineGen(
409
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
410
+ )
411
+
412
+ # to merge source harmonics into a single excitation
413
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
414
+ self.l_tanh = torch.nn.Tanh()
415
+
416
+ def forward(self, x, upp=None):
417
+ if hasattr(self, "ddtype") == False:
418
+ self.ddtype = self.l_linear.weight.dtype
419
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
420
+ # print(x.dtype,sine_wavs.dtype,self.l_linear.weight.dtype)
421
+ # if self.is_half:
422
+ # sine_wavs = sine_wavs.half()
423
+ # sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(x)))
424
+ # print(sine_wavs.dtype,self.ddtype)
425
+ if sine_wavs.dtype != self.ddtype:
426
+ sine_wavs = sine_wavs.to(self.ddtype)
427
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
428
+ return sine_merge, None, None # noise, uv
429
+
430
+
431
+ class GeneratorNSF(torch.nn.Module):
432
+ def __init__(
433
+ self,
434
+ initial_channel,
435
+ resblock,
436
+ resblock_kernel_sizes,
437
+ resblock_dilation_sizes,
438
+ upsample_rates,
439
+ upsample_initial_channel,
440
+ upsample_kernel_sizes,
441
+ gin_channels,
442
+ sr,
443
+ is_half=False,
444
+ ):
445
+ super(GeneratorNSF, self).__init__()
446
+ self.num_kernels = len(resblock_kernel_sizes)
447
+ self.num_upsamples = len(upsample_rates)
448
+
449
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
450
+ self.m_source = SourceModuleHnNSF(
451
+ sampling_rate=sr, harmonic_num=0, is_half=is_half
452
+ )
453
+ self.noise_convs = nn.ModuleList()
454
+ self.conv_pre = Conv1d(
455
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
456
+ )
457
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
458
+
459
+ self.ups = nn.ModuleList()
460
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
461
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
462
+ self.ups.append(
463
+ weight_norm(
464
+ ConvTranspose1d(
465
+ upsample_initial_channel // (2**i),
466
+ upsample_initial_channel // (2 ** (i + 1)),
467
+ k,
468
+ u,
469
+ padding=(k - u) // 2,
470
+ )
471
+ )
472
+ )
473
+ if i + 1 < len(upsample_rates):
474
+ stride_f0 = np.prod(upsample_rates[i + 1 :])
475
+ self.noise_convs.append(
476
+ Conv1d(
477
+ 1,
478
+ c_cur,
479
+ kernel_size=stride_f0 * 2,
480
+ stride=stride_f0,
481
+ padding=stride_f0 // 2,
482
+ )
483
+ )
484
+ else:
485
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
486
+
487
+ self.resblocks = nn.ModuleList()
488
+ for i in range(len(self.ups)):
489
+ ch = upsample_initial_channel // (2 ** (i + 1))
490
+ for j, (k, d) in enumerate(
491
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
492
+ ):
493
+ self.resblocks.append(resblock(ch, k, d))
494
+
495
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
496
+ self.ups.apply(init_weights)
497
+
498
+ if gin_channels != 0:
499
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
500
+
501
+ self.upp = np.prod(upsample_rates)
502
+
503
+ def forward(self, x, f0, g=None):
504
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
505
+ har_source = har_source.transpose(1, 2)
506
+ x = self.conv_pre(x)
507
+ if g is not None:
508
+ x = x + self.cond(g)
509
+
510
+ for i in range(self.num_upsamples):
511
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
512
+ x = self.ups[i](x)
513
+ x_source = self.noise_convs[i](har_source)
514
+ x = x + x_source
515
+ xs = None
516
+ for j in range(self.num_kernels):
517
+ if xs is None:
518
+ xs = self.resblocks[i * self.num_kernels + j](x)
519
+ else:
520
+ xs += self.resblocks[i * self.num_kernels + j](x)
521
+ x = xs / self.num_kernels
522
+ x = F.leaky_relu(x)
523
+ x = self.conv_post(x)
524
+ x = torch.tanh(x)
525
+ return x
526
+
527
+ def remove_weight_norm(self):
528
+ for l in self.ups:
529
+ remove_weight_norm(l)
530
+ for l in self.resblocks:
531
+ l.remove_weight_norm()
532
+
533
+
534
+ sr2sr = {
535
+ "32k": 32000,
536
+ "40k": 40000,
537
+ "48k": 48000,
538
+ }
539
+
540
+
541
+ class SynthesizerTrnMs256NSFsid(nn.Module):
542
+ def __init__(
543
+ self,
544
+ spec_channels,
545
+ segment_size,
546
+ inter_channels,
547
+ hidden_channels,
548
+ filter_channels,
549
+ n_heads,
550
+ n_layers,
551
+ kernel_size,
552
+ p_dropout,
553
+ resblock,
554
+ resblock_kernel_sizes,
555
+ resblock_dilation_sizes,
556
+ upsample_rates,
557
+ upsample_initial_channel,
558
+ upsample_kernel_sizes,
559
+ spk_embed_dim,
560
+ gin_channels,
561
+ sr,
562
+ **kwargs
563
+ ):
564
+ super().__init__()
565
+ if type(sr) == type("strr"):
566
+ sr = sr2sr[sr]
567
+ self.spec_channels = spec_channels
568
+ self.inter_channels = inter_channels
569
+ self.hidden_channels = hidden_channels
570
+ self.filter_channels = filter_channels
571
+ self.n_heads = n_heads
572
+ self.n_layers = n_layers
573
+ self.kernel_size = kernel_size
574
+ self.p_dropout = p_dropout
575
+ self.resblock = resblock
576
+ self.resblock_kernel_sizes = resblock_kernel_sizes
577
+ self.resblock_dilation_sizes = resblock_dilation_sizes
578
+ self.upsample_rates = upsample_rates
579
+ self.upsample_initial_channel = upsample_initial_channel
580
+ self.upsample_kernel_sizes = upsample_kernel_sizes
581
+ self.segment_size = segment_size
582
+ self.gin_channels = gin_channels
583
+ # self.hop_length = hop_length#
584
+ self.spk_embed_dim = spk_embed_dim
585
+ self.enc_p = TextEncoder256(
586
+ inter_channels,
587
+ hidden_channels,
588
+ filter_channels,
589
+ n_heads,
590
+ n_layers,
591
+ kernel_size,
592
+ p_dropout,
593
+ )
594
+ self.dec = GeneratorNSF(
595
+ inter_channels,
596
+ resblock,
597
+ resblock_kernel_sizes,
598
+ resblock_dilation_sizes,
599
+ upsample_rates,
600
+ upsample_initial_channel,
601
+ upsample_kernel_sizes,
602
+ gin_channels=gin_channels,
603
+ sr=sr,
604
+ is_half=kwargs["is_half"],
605
+ )
606
+ self.enc_q = PosteriorEncoder(
607
+ spec_channels,
608
+ inter_channels,
609
+ hidden_channels,
610
+ 5,
611
+ 1,
612
+ 16,
613
+ gin_channels=gin_channels,
614
+ )
615
+ self.flow = ResidualCouplingBlock(
616
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
617
+ )
618
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
619
+ logger.debug(
620
+ "gin_channels: "
621
+ + str(gin_channels)
622
+ + ", self.spk_embed_dim: "
623
+ + str(self.spk_embed_dim)
624
+ )
625
+
626
+ def remove_weight_norm(self):
627
+ self.dec.remove_weight_norm()
628
+ self.flow.remove_weight_norm()
629
+ self.enc_q.remove_weight_norm()
630
+
631
+ def forward(
632
+ self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
633
+ ): # ds here is the speaker id, shape [bs, 1]
634
+ # print(1,pitch.shape)#[bs,t]
635
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the t axis, broadcast later
636
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
637
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
638
+ z_p = self.flow(z, y_mask, g=g)
639
+ z_slice, ids_slice = commons.rand_slice_segments(
640
+ z, y_lengths, self.segment_size
641
+ )
642
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
643
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
644
+ # print(-2,pitchf.shape,z_slice.shape)
645
+ o = self.dec(z_slice, pitchf, g=g)
646
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
647
+
648
+ def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
649
+ g = self.emb_g(sid).unsqueeze(-1)
650
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
651
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
652
+ if rate:
653
+ head = int(z_p.shape[2] * rate)
654
+ z_p = z_p[:, :, -head:]
655
+ x_mask = x_mask[:, :, -head:]
656
+ nsff0 = nsff0[:, -head:]
657
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
658
+ o = self.dec(z * x_mask, nsff0, g=g)
659
+ return o, x_mask, (z, z_p, m_p, logs_p)
660
+
661
+
662
+ class SynthesizerTrnMs768NSFsid(nn.Module):
663
+ def __init__(
664
+ self,
665
+ spec_channels,
666
+ segment_size,
667
+ inter_channels,
668
+ hidden_channels,
669
+ filter_channels,
670
+ n_heads,
671
+ n_layers,
672
+ kernel_size,
673
+ p_dropout,
674
+ resblock,
675
+ resblock_kernel_sizes,
676
+ resblock_dilation_sizes,
677
+ upsample_rates,
678
+ upsample_initial_channel,
679
+ upsample_kernel_sizes,
680
+ spk_embed_dim,
681
+ gin_channels,
682
+ sr,
683
+ **kwargs
684
+ ):
685
+ super().__init__()
686
+ if type(sr) == type("strr"):
687
+ sr = sr2sr[sr]
688
+ self.spec_channels = spec_channels
689
+ self.inter_channels = inter_channels
690
+ self.hidden_channels = hidden_channels
691
+ self.filter_channels = filter_channels
692
+ self.n_heads = n_heads
693
+ self.n_layers = n_layers
694
+ self.kernel_size = kernel_size
695
+ self.p_dropout = p_dropout
696
+ self.resblock = resblock
697
+ self.resblock_kernel_sizes = resblock_kernel_sizes
698
+ self.resblock_dilation_sizes = resblock_dilation_sizes
699
+ self.upsample_rates = upsample_rates
700
+ self.upsample_initial_channel = upsample_initial_channel
701
+ self.upsample_kernel_sizes = upsample_kernel_sizes
702
+ self.segment_size = segment_size
703
+ self.gin_channels = gin_channels
704
+ # self.hop_length = hop_length#
705
+ self.spk_embed_dim = spk_embed_dim
706
+ self.enc_p = TextEncoder768(
707
+ inter_channels,
708
+ hidden_channels,
709
+ filter_channels,
710
+ n_heads,
711
+ n_layers,
712
+ kernel_size,
713
+ p_dropout,
714
+ )
715
+ self.dec = GeneratorNSF(
716
+ inter_channels,
717
+ resblock,
718
+ resblock_kernel_sizes,
719
+ resblock_dilation_sizes,
720
+ upsample_rates,
721
+ upsample_initial_channel,
722
+ upsample_kernel_sizes,
723
+ gin_channels=gin_channels,
724
+ sr=sr,
725
+ is_half=kwargs["is_half"],
726
+ )
727
+ self.enc_q = PosteriorEncoder(
728
+ spec_channels,
729
+ inter_channels,
730
+ hidden_channels,
731
+ 5,
732
+ 1,
733
+ 16,
734
+ gin_channels=gin_channels,
735
+ )
736
+ self.flow = ResidualCouplingBlock(
737
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
738
+ )
739
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
740
+ logger.debug(
741
+ "gin_channels: "
742
+ + str(gin_channels)
743
+ + ", self.spk_embed_dim: "
744
+ + str(self.spk_embed_dim)
745
+ )
746
+
747
+ def remove_weight_norm(self):
748
+ self.dec.remove_weight_norm()
749
+ self.flow.remove_weight_norm()
750
+ self.enc_q.remove_weight_norm()
751
+
752
+ def forward(
753
+ self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
754
+ ): # ds here is the speaker id, shape [bs, 1]
755
+ # print(1,pitch.shape)#[bs,t]
756
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the t axis, broadcast later
757
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
758
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
759
+ z_p = self.flow(z, y_mask, g=g)
760
+ z_slice, ids_slice = commons.rand_slice_segments(
761
+ z, y_lengths, self.segment_size
762
+ )
763
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
764
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
765
+ # print(-2,pitchf.shape,z_slice.shape)
766
+ o = self.dec(z_slice, pitchf, g=g)
767
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
768
+
769
+ def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
770
+ g = self.emb_g(sid).unsqueeze(-1)
771
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
772
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
773
+ if rate:
774
+ head = int(z_p.shape[2] * rate)
775
+ z_p = z_p[:, :, -head:]
776
+ x_mask = x_mask[:, :, -head:]
777
+ nsff0 = nsff0[:, -head:]
778
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
779
+ o = self.dec(z * x_mask, nsff0, g=g)
780
+ return o, x_mask, (z, z_p, m_p, logs_p)
781
+
782
+
783
+ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
784
+ def __init__(
785
+ self,
786
+ spec_channels,
787
+ segment_size,
788
+ inter_channels,
789
+ hidden_channels,
790
+ filter_channels,
791
+ n_heads,
792
+ n_layers,
793
+ kernel_size,
794
+ p_dropout,
795
+ resblock,
796
+ resblock_kernel_sizes,
797
+ resblock_dilation_sizes,
798
+ upsample_rates,
799
+ upsample_initial_channel,
800
+ upsample_kernel_sizes,
801
+ spk_embed_dim,
802
+ gin_channels,
803
+ sr=None,
804
+ **kwargs
805
+ ):
806
+ super().__init__()
807
+ self.spec_channels = spec_channels
808
+ self.inter_channels = inter_channels
809
+ self.hidden_channels = hidden_channels
810
+ self.filter_channels = filter_channels
811
+ self.n_heads = n_heads
812
+ self.n_layers = n_layers
813
+ self.kernel_size = kernel_size
814
+ self.p_dropout = p_dropout
815
+ self.resblock = resblock
816
+ self.resblock_kernel_sizes = resblock_kernel_sizes
817
+ self.resblock_dilation_sizes = resblock_dilation_sizes
818
+ self.upsample_rates = upsample_rates
819
+ self.upsample_initial_channel = upsample_initial_channel
820
+ self.upsample_kernel_sizes = upsample_kernel_sizes
821
+ self.segment_size = segment_size
822
+ self.gin_channels = gin_channels
823
+ # self.hop_length = hop_length#
824
+ self.spk_embed_dim = spk_embed_dim
825
+ self.enc_p = TextEncoder256(
826
+ inter_channels,
827
+ hidden_channels,
828
+ filter_channels,
829
+ n_heads,
830
+ n_layers,
831
+ kernel_size,
832
+ p_dropout,
833
+ f0=False,
834
+ )
835
+ self.dec = Generator(
836
+ inter_channels,
837
+ resblock,
838
+ resblock_kernel_sizes,
839
+ resblock_dilation_sizes,
840
+ upsample_rates,
841
+ upsample_initial_channel,
842
+ upsample_kernel_sizes,
843
+ gin_channels=gin_channels,
844
+ )
845
+ self.enc_q = PosteriorEncoder(
846
+ spec_channels,
847
+ inter_channels,
848
+ hidden_channels,
849
+ 5,
850
+ 1,
851
+ 16,
852
+ gin_channels=gin_channels,
853
+ )
854
+ self.flow = ResidualCouplingBlock(
855
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
856
+ )
857
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
858
+ logger.debug(
859
+ "gin_channels: "
860
+ + str(gin_channels)
861
+ + ", self.spk_embed_dim: "
862
+ + str(self.spk_embed_dim)
863
+ )
864
+
865
+ def remove_weight_norm(self):
866
+ self.dec.remove_weight_norm()
867
+ self.flow.remove_weight_norm()
868
+ self.enc_q.remove_weight_norm()
869
+
870
+ def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds here is the speaker id, shape [bs, 1]
871
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the t axis, broadcast later
872
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
873
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
874
+ z_p = self.flow(z, y_mask, g=g)
875
+ z_slice, ids_slice = commons.rand_slice_segments(
876
+ z, y_lengths, self.segment_size
877
+ )
878
+ o = self.dec(z_slice, g=g)
879
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
880
+
881
+ def infer(self, phone, phone_lengths, sid, rate=None):
882
+ g = self.emb_g(sid).unsqueeze(-1)
883
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
884
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
885
+ if rate:
886
+ head = int(z_p.shape[2] * rate)
887
+ z_p = z_p[:, :, -head:]
888
+ x_mask = x_mask[:, :, -head:]
889
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
890
+ o = self.dec(z * x_mask, g=g)
891
+ return o, x_mask, (z, z_p, m_p, logs_p)
892
+
893
+
894
+ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
895
+ def __init__(
896
+ self,
897
+ spec_channels,
898
+ segment_size,
899
+ inter_channels,
900
+ hidden_channels,
901
+ filter_channels,
902
+ n_heads,
903
+ n_layers,
904
+ kernel_size,
905
+ p_dropout,
906
+ resblock,
907
+ resblock_kernel_sizes,
908
+ resblock_dilation_sizes,
909
+ upsample_rates,
910
+ upsample_initial_channel,
911
+ upsample_kernel_sizes,
912
+ spk_embed_dim,
913
+ gin_channels,
914
+ sr=None,
915
+ **kwargs
916
+ ):
917
+ super().__init__()
918
+ self.spec_channels = spec_channels
919
+ self.inter_channels = inter_channels
920
+ self.hidden_channels = hidden_channels
921
+ self.filter_channels = filter_channels
922
+ self.n_heads = n_heads
923
+ self.n_layers = n_layers
924
+ self.kernel_size = kernel_size
925
+ self.p_dropout = p_dropout
926
+ self.resblock = resblock
927
+ self.resblock_kernel_sizes = resblock_kernel_sizes
928
+ self.resblock_dilation_sizes = resblock_dilation_sizes
929
+ self.upsample_rates = upsample_rates
930
+ self.upsample_initial_channel = upsample_initial_channel
931
+ self.upsample_kernel_sizes = upsample_kernel_sizes
932
+ self.segment_size = segment_size
933
+ self.gin_channels = gin_channels
934
+ # self.hop_length = hop_length#
935
+ self.spk_embed_dim = spk_embed_dim
936
+ self.enc_p = TextEncoder768(
937
+ inter_channels,
938
+ hidden_channels,
939
+ filter_channels,
940
+ n_heads,
941
+ n_layers,
942
+ kernel_size,
943
+ p_dropout,
944
+ f0=False,
945
+ )
946
+ self.dec = Generator(
947
+ inter_channels,
948
+ resblock,
949
+ resblock_kernel_sizes,
950
+ resblock_dilation_sizes,
951
+ upsample_rates,
952
+ upsample_initial_channel,
953
+ upsample_kernel_sizes,
954
+ gin_channels=gin_channels,
955
+ )
956
+ self.enc_q = PosteriorEncoder(
957
+ spec_channels,
958
+ inter_channels,
959
+ hidden_channels,
960
+ 5,
961
+ 1,
962
+ 16,
963
+ gin_channels=gin_channels,
964
+ )
965
+ self.flow = ResidualCouplingBlock(
966
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
967
+ )
968
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
969
+ logger.debug(
970
+ "gin_channels: "
971
+ + str(gin_channels)
972
+ + ", self.spk_embed_dim: "
973
+ + str(self.spk_embed_dim)
974
+ )
975
+
976
+ def remove_weight_norm(self):
977
+ self.dec.remove_weight_norm()
978
+ self.flow.remove_weight_norm()
979
+ self.enc_q.remove_weight_norm()
980
+
981
+ def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds here is the speaker id, shape [bs, 1]
982
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the t axis, broadcast later
983
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
984
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
985
+ z_p = self.flow(z, y_mask, g=g)
986
+ z_slice, ids_slice = commons.rand_slice_segments(
987
+ z, y_lengths, self.segment_size
988
+ )
989
+ o = self.dec(z_slice, g=g)
990
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
991
+
992
+ def infer(self, phone, phone_lengths, sid, rate=None):
993
+ g = self.emb_g(sid).unsqueeze(-1)
994
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
995
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
996
+ if rate:
997
+ head = int(z_p.shape[2] * rate)
998
+ z_p = z_p[:, :, -head:]
999
+ x_mask = x_mask[:, :, -head:]
1000
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
1001
+ o = self.dec(z * x_mask, g=g)
1002
+ return o, x_mask, (z, z_p, m_p, logs_p)
1003
+
1004
+
1005
+ class MultiPeriodDiscriminator(torch.nn.Module):
1006
+ def __init__(self, use_spectral_norm=False):
1007
+ super(MultiPeriodDiscriminator, self).__init__()
1008
+ periods = [2, 3, 5, 7, 11, 17]
1009
+ # periods = [3, 5, 7, 11, 17, 23, 37]
1010
+
1011
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
1012
+ discs = discs + [
1013
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
1014
+ ]
1015
+ self.discriminators = nn.ModuleList(discs)
1016
+
1017
+ def forward(self, y, y_hat):
1018
+ y_d_rs = [] #
1019
+ y_d_gs = []
1020
+ fmap_rs = []
1021
+ fmap_gs = []
1022
+ for i, d in enumerate(self.discriminators):
1023
+ y_d_r, fmap_r = d(y)
1024
+ y_d_g, fmap_g = d(y_hat)
1025
+ # for j in range(len(fmap_r)):
1026
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
1027
+ y_d_rs.append(y_d_r)
1028
+ y_d_gs.append(y_d_g)
1029
+ fmap_rs.append(fmap_r)
1030
+ fmap_gs.append(fmap_g)
1031
+
1032
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1033
+
1034
+
1035
+ class MultiPeriodDiscriminatorV2(torch.nn.Module):
1036
+ def __init__(self, use_spectral_norm=False):
1037
+ super(MultiPeriodDiscriminatorV2, self).__init__()
1038
+ # periods = [2, 3, 5, 7, 11, 17]
1039
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
1040
+
1041
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
1042
+ discs = discs + [
1043
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
1044
+ ]
1045
+ self.discriminators = nn.ModuleList(discs)
1046
+
1047
+ def forward(self, y, y_hat):
1048
+ y_d_rs = [] #
1049
+ y_d_gs = []
1050
+ fmap_rs = []
1051
+ fmap_gs = []
1052
+ for i, d in enumerate(self.discriminators):
1053
+ y_d_r, fmap_r = d(y)
1054
+ y_d_g, fmap_g = d(y_hat)
1055
+ # for j in range(len(fmap_r)):
1056
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
1057
+ y_d_rs.append(y_d_r)
1058
+ y_d_gs.append(y_d_g)
1059
+ fmap_rs.append(fmap_r)
1060
+ fmap_gs.append(fmap_g)
1061
+
1062
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1063
+
1064
+
1065
+ class DiscriminatorS(torch.nn.Module):
1066
+ def __init__(self, use_spectral_norm=False):
1067
+ super(DiscriminatorS, self).__init__()
1068
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
1069
+ self.convs = nn.ModuleList(
1070
+ [
1071
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
1072
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
1073
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
1074
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
1075
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
1076
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
1077
+ ]
1078
+ )
1079
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
1080
+
1081
+ def forward(self, x):
1082
+ fmap = []
1083
+
1084
+ for l in self.convs:
1085
+ x = l(x)
1086
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
1087
+ fmap.append(x)
1088
+ x = self.conv_post(x)
1089
+ fmap.append(x)
1090
+ x = torch.flatten(x, 1, -1)
1091
+
1092
+ return x, fmap
1093
+
1094
+
1095
+ class DiscriminatorP(torch.nn.Module):
1096
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
1097
+ super(DiscriminatorP, self).__init__()
1098
+ self.period = period
1099
+ self.use_spectral_norm = use_spectral_norm
1100
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
1101
+ self.convs = nn.ModuleList(
1102
+ [
1103
+ norm_f(
1104
+ Conv2d(
1105
+ 1,
1106
+ 32,
1107
+ (kernel_size, 1),
1108
+ (stride, 1),
1109
+ padding=(get_padding(kernel_size, 1), 0),
1110
+ )
1111
+ ),
1112
+ norm_f(
1113
+ Conv2d(
1114
+ 32,
1115
+ 128,
1116
+ (kernel_size, 1),
1117
+ (stride, 1),
1118
+ padding=(get_padding(kernel_size, 1), 0),
1119
+ )
1120
+ ),
1121
+ norm_f(
1122
+ Conv2d(
1123
+ 128,
1124
+ 512,
1125
+ (kernel_size, 1),
1126
+ (stride, 1),
1127
+ padding=(get_padding(kernel_size, 1), 0),
1128
+ )
1129
+ ),
1130
+ norm_f(
1131
+ Conv2d(
1132
+ 512,
1133
+ 1024,
1134
+ (kernel_size, 1),
1135
+ (stride, 1),
1136
+ padding=(get_padding(kernel_size, 1), 0),
1137
+ )
1138
+ ),
1139
+ norm_f(
1140
+ Conv2d(
1141
+ 1024,
1142
+ 1024,
1143
+ (kernel_size, 1),
1144
+ 1,
1145
+ padding=(get_padding(kernel_size, 1), 0),
1146
+ )
1147
+ ),
1148
+ ]
1149
+ )
1150
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
1151
+
1152
+ def forward(self, x):
1153
+ fmap = []
1154
+
1155
+ # 1d to 2d
1156
+ b, c, t = x.shape
1157
+ if t % self.period != 0: # pad first
1158
+ n_pad = self.period - (t % self.period)
1159
+ if has_xpu and x.dtype == torch.bfloat16:
1160
+ x = F.pad(x.to(dtype=torch.float16), (0, n_pad), "reflect").to(dtype=torch.bfloat16)
1161
+ else:
1162
+ x = F.pad(x, (0, n_pad), "reflect")
1163
+ t = t + n_pad
1164
+ x = x.view(b, c, t // self.period, self.period)
1165
+
1166
+ for l in self.convs:
1167
+ x = l(x)
1168
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
1169
+ fmap.append(x)
1170
+ x = self.conv_post(x)
1171
+ fmap.append(x)
1172
+ x = torch.flatten(x, 1, -1)
1173
+
1174
+ return x, fmap
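The two discriminator stacks above differ only in their period list (v1 uses 2, 3, 5, 7, 11, 17; v2 adds 23 and 37). A smoke-test sketch, assuming the classes are importable from infer.lib.infer_pack.models and random tensors stand in for real and generated audio:

import torch
from infer.lib.infer_pack.models import MultiPeriodDiscriminator

mpd = MultiPeriodDiscriminator()
y = torch.randn(2, 1, 8192)      # "real" waveform batch
y_hat = torch.randn(2, 1, 8192)  # "generated" waveform batch
y_d_rs, y_d_gs, fmap_rs, fmap_gs = mpd(y, y_hat)
assert len(y_d_rs) == 1 + 6      # one DiscriminatorS plus one DiscriminatorP per period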
infer/lib/infer_pack/models_onnx.py ADDED
@@ -0,0 +1,824 @@
1
+ import math
2
+ import logging
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+ import numpy as np
7
+ import torch
8
+ from torch import nn
9
+ from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
10
+ from torch.nn import functional as F
11
+ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
12
+
13
+ from infer.lib.infer_pack import attentions, commons, modules
14
+ from infer.lib.infer_pack.commons import get_padding, init_weights
15
+
16
+
17
+ class TextEncoder256(nn.Module):
18
+ def __init__(
19
+ self,
20
+ out_channels,
21
+ hidden_channels,
22
+ filter_channels,
23
+ n_heads,
24
+ n_layers,
25
+ kernel_size,
26
+ p_dropout,
27
+ f0=True,
28
+ ):
29
+ super().__init__()
30
+ self.out_channels = out_channels
31
+ self.hidden_channels = hidden_channels
32
+ self.filter_channels = filter_channels
33
+ self.n_heads = n_heads
34
+ self.n_layers = n_layers
35
+ self.kernel_size = kernel_size
36
+ self.p_dropout = p_dropout
37
+ self.emb_phone = nn.Linear(256, hidden_channels)
38
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39
+ if f0 == True:
40
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41
+ self.encoder = attentions.Encoder(
42
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43
+ )
44
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45
+
46
+ def forward(self, phone, pitch, lengths):
47
+ if pitch is None:
48
+ x = self.emb_phone(phone)
49
+ else:
50
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
51
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52
+ x = self.lrelu(x)
53
+ x = torch.transpose(x, 1, -1) # [b, h, t]
54
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55
+ x.dtype
56
+ )
57
+ x = self.encoder(x * x_mask, x_mask)
58
+ stats = self.proj(x) * x_mask
59
+
60
+ m, logs = torch.split(stats, self.out_channels, dim=1)
61
+ return m, logs, x_mask
62
+
63
+
64
+ class TextEncoder768(nn.Module):
65
+ def __init__(
66
+ self,
67
+ out_channels,
68
+ hidden_channels,
69
+ filter_channels,
70
+ n_heads,
71
+ n_layers,
72
+ kernel_size,
73
+ p_dropout,
74
+ f0=True,
75
+ ):
76
+ super().__init__()
77
+ self.out_channels = out_channels
78
+ self.hidden_channels = hidden_channels
79
+ self.filter_channels = filter_channels
80
+ self.n_heads = n_heads
81
+ self.n_layers = n_layers
82
+ self.kernel_size = kernel_size
83
+ self.p_dropout = p_dropout
84
+ self.emb_phone = nn.Linear(768, hidden_channels)
85
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
+ if f0 == True:
87
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
+ self.encoder = attentions.Encoder(
89
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
+ )
91
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92
+
93
+ def forward(self, phone, pitch, lengths):
94
+ if pitch is None:
95
+ x = self.emb_phone(phone)
96
+ else:
97
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
98
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99
+ x = self.lrelu(x)
100
+ x = torch.transpose(x, 1, -1) # [b, h, t]
101
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102
+ x.dtype
103
+ )
104
+ x = self.encoder(x * x_mask, x_mask)
105
+ stats = self.proj(x) * x_mask
106
+
107
+ m, logs = torch.split(stats, self.out_channels, dim=1)
108
+ return m, logs, x_mask
109
+
110
+
111
+ class ResidualCouplingBlock(nn.Module):
112
+ def __init__(
113
+ self,
114
+ channels,
115
+ hidden_channels,
116
+ kernel_size,
117
+ dilation_rate,
118
+ n_layers,
119
+ n_flows=4,
120
+ gin_channels=0,
121
+ ):
122
+ super().__init__()
123
+ self.channels = channels
124
+ self.hidden_channels = hidden_channels
125
+ self.kernel_size = kernel_size
126
+ self.dilation_rate = dilation_rate
127
+ self.n_layers = n_layers
128
+ self.n_flows = n_flows
129
+ self.gin_channels = gin_channels
130
+
131
+ self.flows = nn.ModuleList()
132
+ for i in range(n_flows):
133
+ self.flows.append(
134
+ modules.ResidualCouplingLayer(
135
+ channels,
136
+ hidden_channels,
137
+ kernel_size,
138
+ dilation_rate,
139
+ n_layers,
140
+ gin_channels=gin_channels,
141
+ mean_only=True,
142
+ )
143
+ )
144
+ self.flows.append(modules.Flip())
145
+
146
+ def forward(self, x, x_mask, g=None, reverse=False):
147
+ if not reverse:
148
+ for flow in self.flows:
149
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
150
+ else:
151
+ for flow in reversed(self.flows):
152
+ x = flow(x, x_mask, g=g, reverse=reverse)
153
+ return x
154
+
155
+ def remove_weight_norm(self):
156
+ for i in range(self.n_flows):
157
+ self.flows[i * 2].remove_weight_norm()
158
+
159
+
160
+ class PosteriorEncoder(nn.Module):
161
+ def __init__(
162
+ self,
163
+ in_channels,
164
+ out_channels,
165
+ hidden_channels,
166
+ kernel_size,
167
+ dilation_rate,
168
+ n_layers,
169
+ gin_channels=0,
170
+ ):
171
+ super().__init__()
172
+ self.in_channels = in_channels
173
+ self.out_channels = out_channels
174
+ self.hidden_channels = hidden_channels
175
+ self.kernel_size = kernel_size
176
+ self.dilation_rate = dilation_rate
177
+ self.n_layers = n_layers
178
+ self.gin_channels = gin_channels
179
+
180
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
181
+ self.enc = modules.WN(
182
+ hidden_channels,
183
+ kernel_size,
184
+ dilation_rate,
185
+ n_layers,
186
+ gin_channels=gin_channels,
187
+ )
188
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
189
+
190
+ def forward(self, x, x_lengths, g=None):
191
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
192
+ x.dtype
193
+ )
194
+ x = self.pre(x) * x_mask
195
+ x = self.enc(x, x_mask, g=g)
196
+ stats = self.proj(x) * x_mask
197
+ m, logs = torch.split(stats, self.out_channels, dim=1)
198
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
199
+ return z, m, logs, x_mask
200
+
201
+ def remove_weight_norm(self):
202
+ self.enc.remove_weight_norm()
203
+
204
+
205
+ class Generator(torch.nn.Module):
206
+ def __init__(
207
+ self,
208
+ initial_channel,
209
+ resblock,
210
+ resblock_kernel_sizes,
211
+ resblock_dilation_sizes,
212
+ upsample_rates,
213
+ upsample_initial_channel,
214
+ upsample_kernel_sizes,
215
+ gin_channels=0,
216
+ ):
217
+ super(Generator, self).__init__()
218
+ self.num_kernels = len(resblock_kernel_sizes)
219
+ self.num_upsamples = len(upsample_rates)
220
+ self.conv_pre = Conv1d(
221
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
222
+ )
223
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
224
+
225
+ self.ups = nn.ModuleList()
226
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
227
+ self.ups.append(
228
+ weight_norm(
229
+ ConvTranspose1d(
230
+ upsample_initial_channel // (2**i),
231
+ upsample_initial_channel // (2 ** (i + 1)),
232
+ k,
233
+ u,
234
+ padding=(k - u) // 2,
235
+ )
236
+ )
237
+ )
238
+
239
+ self.resblocks = nn.ModuleList()
240
+ for i in range(len(self.ups)):
241
+ ch = upsample_initial_channel // (2 ** (i + 1))
242
+ for j, (k, d) in enumerate(
243
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
244
+ ):
245
+ self.resblocks.append(resblock(ch, k, d))
246
+
247
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
248
+ self.ups.apply(init_weights)
249
+
250
+ if gin_channels != 0:
251
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
252
+
253
+ def forward(self, x, g=None):
254
+ x = self.conv_pre(x)
255
+ if g is not None:
256
+ x = x + self.cond(g)
257
+
258
+ for i in range(self.num_upsamples):
259
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
260
+ x = self.ups[i](x)
261
+ xs = None
262
+ for j in range(self.num_kernels):
263
+ if xs is None:
264
+ xs = self.resblocks[i * self.num_kernels + j](x)
265
+ else:
266
+ xs += self.resblocks[i * self.num_kernels + j](x)
267
+ x = xs / self.num_kernels
268
+ x = F.leaky_relu(x)
269
+ x = self.conv_post(x)
270
+ x = torch.tanh(x)
271
+
272
+ return x
273
+
274
+ def remove_weight_norm(self):
275
+ for l in self.ups:
276
+ remove_weight_norm(l)
277
+ for l in self.resblocks:
278
+ l.remove_weight_norm()
279
+
280
+
281
+ class SineGen(torch.nn.Module):
282
+ """Definition of sine generator
283
+ SineGen(samp_rate, harmonic_num = 0,
284
+ sine_amp = 0.1, noise_std = 0.003,
285
+ voiced_threshold = 0,
286
+ flag_for_pulse=False)
287
+ samp_rate: sampling rate in Hz
288
+ harmonic_num: number of harmonic overtones (default 0)
289
+ sine_amp: amplitude of sine-wavefrom (default 0.1)
290
+ noise_std: std of Gaussian noise (default 0.003)
291
+ voiced_thoreshold: F0 threshold for U/V classification (default 0)
292
+ flag_for_pulse: this SinGen is used inside PulseGen (default False)
293
+ Note: when flag_for_pulse is True, the first time step of a voiced
294
+ segment is always sin(np.pi) or cos(0)
295
+ """
296
+
297
+ def __init__(
298
+ self,
299
+ samp_rate,
300
+ harmonic_num=0,
301
+ sine_amp=0.1,
302
+ noise_std=0.003,
303
+ voiced_threshold=0,
304
+ flag_for_pulse=False,
305
+ ):
306
+ super(SineGen, self).__init__()
307
+ self.sine_amp = sine_amp
308
+ self.noise_std = noise_std
309
+ self.harmonic_num = harmonic_num
310
+ self.dim = self.harmonic_num + 1
311
+ self.sampling_rate = samp_rate
312
+ self.voiced_threshold = voiced_threshold
313
+
314
+ def _f02uv(self, f0):
315
+ # generate uv signal
316
+ uv = torch.ones_like(f0)
317
+ uv = uv * (f0 > self.voiced_threshold)
318
+ return uv
319
+
320
+ def forward(self, f0, upp):
321
+ """sine_tensor, uv = forward(f0)
322
+ input F0: tensor(batchsize=1, length, dim=1)
323
+ f0 for unvoiced steps should be 0
324
+ output sine_tensor: tensor(batchsize=1, length, dim)
325
+ output uv: tensor(batchsize=1, length, 1)
326
+ """
327
+ with torch.no_grad():
328
+ f0 = f0[:, None].transpose(1, 2)
329
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
330
+ # fundamental component
331
+ f0_buf[:, :, 0] = f0[:, :, 0]
332
+ for idx in np.arange(self.harmonic_num):
333
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
334
+ idx + 2
335
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
336
+ rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 means the per-harmonic products cannot be optimized in post-processing
337
+ rand_ini = torch.rand(
338
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
339
+ )
340
+ rand_ini[:, 0] = 0
341
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
342
+ tmp_over_one = torch.cumsum(rad_values, 1)  # a % 1 here would prevent the cumsum below from being optimized
343
+ tmp_over_one *= upp
344
+ tmp_over_one = F.interpolate(
345
+ tmp_over_one.transpose(2, 1),
346
+ scale_factor=upp,
347
+ mode="linear",
348
+ align_corners=True,
349
+ ).transpose(2, 1)
350
+ rad_values = F.interpolate(
351
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
352
+ ).transpose(
353
+ 2, 1
354
+ ) #######
355
+ tmp_over_one %= 1
356
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
357
+ cumsum_shift = torch.zeros_like(rad_values)
358
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
359
+ sine_waves = torch.sin(
360
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
361
+ )
362
+ sine_waves = sine_waves * self.sine_amp
363
+ uv = self._f02uv(f0)
364
+ uv = F.interpolate(
365
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
366
+ ).transpose(2, 1)
367
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
368
+ noise = noise_amp * torch.randn_like(sine_waves)
369
+ sine_waves = sine_waves * uv + noise
370
+ return sine_waves, uv, noise
371
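The phase bookkeeping in forward (per-sample normalized frequency, cumulative sum, and a corrective shift wherever the wrapped phase overflows) reduces to ordinary phase accumulation. A standalone sketch for a constant tone, using the same 0.1 sine_amp default:

import numpy as np

sr, f = 16000, 220.0
rad = np.full(1600, f / sr)             # normalized frequency per sample
phase = np.cumsum(rad)                  # unwrapped phase, in turns
sine = 0.1 * np.sin(2 * np.pi * phase)  # 0.1 s of a clean 220 Hz sine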
+
372
+
373
+ class SourceModuleHnNSF(torch.nn.Module):
374
+ """SourceModule for hn-nsf
375
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
376
+ add_noise_std=0.003, voiced_threshod=0)
377
+ sampling_rate: sampling_rate in Hz
378
+ harmonic_num: number of harmonic above F0 (default: 0)
379
+ sine_amp: amplitude of sine source signal (default: 0.1)
380
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
381
+ note that amplitude of noise in unvoiced is decided
382
+ by sine_amp
383
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
384
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
385
+ F0_sampled (batchsize, length, 1)
386
+ Sine_source (batchsize, length, 1)
387
+ noise_source (batchsize, length 1)
388
+ uv (batchsize, length, 1)
389
+ """
390
+
391
+ def __init__(
392
+ self,
393
+ sampling_rate,
394
+ harmonic_num=0,
395
+ sine_amp=0.1,
396
+ add_noise_std=0.003,
397
+ voiced_threshold=0,
398
+ is_half=True,
399
+ ):
400
+ super(SourceModuleHnNSF, self).__init__()
401
+
402
+ self.sine_amp = sine_amp
403
+ self.noise_std = add_noise_std
404
+ self.is_half = is_half
405
+ # to produce sine waveforms
406
+ self.l_sin_gen = SineGen(
407
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold
408
+ )
409
+
410
+ # to merge source harmonics into a single excitation
411
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
412
+ self.l_tanh = torch.nn.Tanh()
413
+
414
+ def forward(self, x, upp=None):
415
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
416
+ if self.is_half:
417
+ sine_wavs = sine_wavs.half()
418
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
419
+ return sine_merge, None, None # noise, uv
420
+
421
+
422
+ class GeneratorNSF(torch.nn.Module):
423
+ def __init__(
424
+ self,
425
+ initial_channel,
426
+ resblock,
427
+ resblock_kernel_sizes,
428
+ resblock_dilation_sizes,
429
+ upsample_rates,
430
+ upsample_initial_channel,
431
+ upsample_kernel_sizes,
432
+ gin_channels,
433
+ sr,
434
+ is_half=False,
435
+ ):
436
+ super(GeneratorNSF, self).__init__()
437
+ self.num_kernels = len(resblock_kernel_sizes)
438
+ self.num_upsamples = len(upsample_rates)
439
+
440
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
441
+ self.m_source = SourceModuleHnNSF(
442
+ sampling_rate=sr, harmonic_num=0, is_half=is_half
443
+ )
444
+ self.noise_convs = nn.ModuleList()
445
+ self.conv_pre = Conv1d(
446
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
447
+ )
448
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
449
+
450
+ self.ups = nn.ModuleList()
451
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
452
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
453
+ self.ups.append(
454
+ weight_norm(
455
+ ConvTranspose1d(
456
+ upsample_initial_channel // (2**i),
457
+ upsample_initial_channel // (2 ** (i + 1)),
458
+ k,
459
+ u,
460
+ padding=(k - u) // 2,
461
+ )
462
+ )
463
+ )
464
+ if i + 1 < len(upsample_rates):
465
+ stride_f0 = np.prod(upsample_rates[i + 1 :])
466
+ self.noise_convs.append(
467
+ Conv1d(
468
+ 1,
469
+ c_cur,
470
+ kernel_size=stride_f0 * 2,
471
+ stride=stride_f0,
472
+ padding=stride_f0 // 2,
473
+ )
474
+ )
475
+ else:
476
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
477
+
478
+ self.resblocks = nn.ModuleList()
479
+ for i in range(len(self.ups)):
480
+ ch = upsample_initial_channel // (2 ** (i + 1))
481
+ for j, (k, d) in enumerate(
482
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
483
+ ):
484
+ self.resblocks.append(resblock(ch, k, d))
485
+
486
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
487
+ self.ups.apply(init_weights)
488
+
489
+ if gin_channels != 0:
490
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
491
+
492
+ self.upp = np.prod(upsample_rates)
493
+
494
+ def forward(self, x, f0, g=None):
495
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
496
+ har_source = har_source.transpose(1, 2)
497
+ x = self.conv_pre(x)
498
+ if g is not None:
499
+ x = x + self.cond(g)
500
+
501
+ for i in range(self.num_upsamples):
502
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
503
+ x = self.ups[i](x)
504
+ x_source = self.noise_convs[i](har_source)
505
+ x = x + x_source
506
+ xs = None
507
+ for j in range(self.num_kernels):
508
+ if xs is None:
509
+ xs = self.resblocks[i * self.num_kernels + j](x)
510
+ else:
511
+ xs += self.resblocks[i * self.num_kernels + j](x)
512
+ x = xs / self.num_kernels
513
+ x = F.leaky_relu(x)
514
+ x = self.conv_post(x)
515
+ x = torch.tanh(x)
516
+ return x
517
+
518
+ def remove_weight_norm(self):
519
+ for l in self.ups:
520
+ remove_weight_norm(l)
521
+ for l in self.resblocks:
522
+ l.remove_weight_norm()
523
+
524
+
525
+ sr2sr = {
526
+ "32k": 32000,
527
+ "40k": 40000,
528
+ "48k": 48000,
529
+ }
530
+
531
+
532
+ class SynthesizerTrnMsNSFsidM(nn.Module):
533
+ def __init__(
534
+ self,
535
+ spec_channels,
536
+ segment_size,
537
+ inter_channels,
538
+ hidden_channels,
539
+ filter_channels,
540
+ n_heads,
541
+ n_layers,
542
+ kernel_size,
543
+ p_dropout,
544
+ resblock,
545
+ resblock_kernel_sizes,
546
+ resblock_dilation_sizes,
547
+ upsample_rates,
548
+ upsample_initial_channel,
549
+ upsample_kernel_sizes,
550
+ spk_embed_dim,
551
+ gin_channels,
552
+ sr,
553
+ version,
554
+ **kwargs
555
+ ):
556
+ super().__init__()
557
+ if type(sr) == type("strr"):
558
+ sr = sr2sr[sr]
559
+ self.spec_channels = spec_channels
560
+ self.inter_channels = inter_channels
561
+ self.hidden_channels = hidden_channels
562
+ self.filter_channels = filter_channels
563
+ self.n_heads = n_heads
564
+ self.n_layers = n_layers
565
+ self.kernel_size = kernel_size
566
+ self.p_dropout = p_dropout
567
+ self.resblock = resblock
568
+ self.resblock_kernel_sizes = resblock_kernel_sizes
569
+ self.resblock_dilation_sizes = resblock_dilation_sizes
570
+ self.upsample_rates = upsample_rates
571
+ self.upsample_initial_channel = upsample_initial_channel
572
+ self.upsample_kernel_sizes = upsample_kernel_sizes
573
+ self.segment_size = segment_size
574
+ self.gin_channels = gin_channels
575
+ # self.hop_length = hop_length#
576
+ self.spk_embed_dim = spk_embed_dim
577
+ if version == "v1":
578
+ self.enc_p = TextEncoder256(
579
+ inter_channels,
580
+ hidden_channels,
581
+ filter_channels,
582
+ n_heads,
583
+ n_layers,
584
+ kernel_size,
585
+ p_dropout,
586
+ )
587
+ else:
588
+ self.enc_p = TextEncoder768(
589
+ inter_channels,
590
+ hidden_channels,
591
+ filter_channels,
592
+ n_heads,
593
+ n_layers,
594
+ kernel_size,
595
+ p_dropout,
596
+ )
597
+ self.dec = GeneratorNSF(
598
+ inter_channels,
599
+ resblock,
600
+ resblock_kernel_sizes,
601
+ resblock_dilation_sizes,
602
+ upsample_rates,
603
+ upsample_initial_channel,
604
+ upsample_kernel_sizes,
605
+ gin_channels=gin_channels,
606
+ sr=sr,
607
+ is_half=kwargs["is_half"],
608
+ )
609
+ self.enc_q = PosteriorEncoder(
610
+ spec_channels,
611
+ inter_channels,
612
+ hidden_channels,
613
+ 5,
614
+ 1,
615
+ 16,
616
+ gin_channels=gin_channels,
617
+ )
618
+ self.flow = ResidualCouplingBlock(
619
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
620
+ )
621
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
622
+ self.speaker_map = None
623
+ logger.debug(
624
+ "gin_channels: %s, "
625
+ "self.spk_embed_dim: %s",
626
+ gin_channels,
627
+ self.spk_embed_dim,
628
+ )
629
+
630
+ def remove_weight_norm(self):
631
+ self.dec.remove_weight_norm()
632
+ self.flow.remove_weight_norm()
633
+ self.enc_q.remove_weight_norm()
634
+
635
+ def construct_spkmixmap(self, n_speaker):
636
+ self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels))
637
+ for i in range(n_speaker):
638
+ self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
639
+ self.speaker_map = self.speaker_map.unsqueeze(0)
640
+
641
+ def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
642
+ if self.speaker_map is not None: # [N, S] * [S, B, 1, H]
643
+ g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]
644
+ g = g * self.speaker_map # [N, S, B, 1, H]
645
+ g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
646
+ g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
647
+ else:
648
+ g = g.unsqueeze(0)
649
+ g = self.emb_g(g).transpose(1, 2)
650
+
651
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
652
+ z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
653
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
654
+ o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
655
+ return o
656
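When speaker_map is set, forward mixes the n_speaker embeddings with per-frame weights and reshapes the result to [batch, gin_channels, frames]. A shape-only sketch of that reduction (the sizes are illustrative):

import torch

n_speaker, hidden, frames = 4, 256, 100
speaker_map = torch.randn(1, n_speaker, 1, 1, hidden)  # as after unsqueeze(0)
g = torch.rand(frames, n_speaker)                      # per-frame mixing weights
g = g.reshape(frames, n_speaker, 1, 1, 1)
g = (g * speaker_map).sum(dim=1)                       # [frames, 1, 1, hidden]
g = g.transpose(0, -1).transpose(0, -2).squeeze(0)
assert g.shape == (1, hidden, frames)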
+
657
+
658
+ class MultiPeriodDiscriminator(torch.nn.Module):
659
+ def __init__(self, use_spectral_norm=False):
660
+ super(MultiPeriodDiscriminator, self).__init__()
661
+ periods = [2, 3, 5, 7, 11, 17]
662
+ # periods = [3, 5, 7, 11, 17, 23, 37]
663
+
664
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
665
+ discs = discs + [
666
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
667
+ ]
668
+ self.discriminators = nn.ModuleList(discs)
669
+
670
+ def forward(self, y, y_hat):
671
+ y_d_rs = [] #
672
+ y_d_gs = []
673
+ fmap_rs = []
674
+ fmap_gs = []
675
+ for i, d in enumerate(self.discriminators):
676
+ y_d_r, fmap_r = d(y)
677
+ y_d_g, fmap_g = d(y_hat)
678
+ # for j in range(len(fmap_r)):
679
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
680
+ y_d_rs.append(y_d_r)
681
+ y_d_gs.append(y_d_g)
682
+ fmap_rs.append(fmap_r)
683
+ fmap_gs.append(fmap_g)
684
+
685
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
686
+
687
+
688
+ class MultiPeriodDiscriminatorV2(torch.nn.Module):
689
+ def __init__(self, use_spectral_norm=False):
690
+ super(MultiPeriodDiscriminatorV2, self).__init__()
691
+ # periods = [2, 3, 5, 7, 11, 17]
692
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
693
+
694
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
695
+ discs = discs + [
696
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
697
+ ]
698
+ self.discriminators = nn.ModuleList(discs)
699
+
700
+ def forward(self, y, y_hat):
701
+ y_d_rs = [] #
702
+ y_d_gs = []
703
+ fmap_rs = []
704
+ fmap_gs = []
705
+ for i, d in enumerate(self.discriminators):
706
+ y_d_r, fmap_r = d(y)
707
+ y_d_g, fmap_g = d(y_hat)
708
+ # for j in range(len(fmap_r)):
709
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
710
+ y_d_rs.append(y_d_r)
711
+ y_d_gs.append(y_d_g)
712
+ fmap_rs.append(fmap_r)
713
+ fmap_gs.append(fmap_g)
714
+
715
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
716
+
717
+
718
+ class DiscriminatorS(torch.nn.Module):
719
+ def __init__(self, use_spectral_norm=False):
720
+ super(DiscriminatorS, self).__init__()
721
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
722
+ self.convs = nn.ModuleList(
723
+ [
724
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
725
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
726
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
727
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
728
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
729
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
730
+ ]
731
+ )
732
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
733
+
734
+ def forward(self, x):
735
+ fmap = []
736
+
737
+ for l in self.convs:
738
+ x = l(x)
739
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
740
+ fmap.append(x)
741
+ x = self.conv_post(x)
742
+ fmap.append(x)
743
+ x = torch.flatten(x, 1, -1)
744
+
745
+ return x, fmap
746
+
747
+
748
+ class DiscriminatorP(torch.nn.Module):
749
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
750
+ super(DiscriminatorP, self).__init__()
751
+ self.period = period
752
+ self.use_spectral_norm = use_spectral_norm
753
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
754
+ self.convs = nn.ModuleList(
755
+ [
756
+ norm_f(
757
+ Conv2d(
758
+ 1,
759
+ 32,
760
+ (kernel_size, 1),
761
+ (stride, 1),
762
+ padding=(get_padding(kernel_size, 1), 0),
763
+ )
764
+ ),
765
+ norm_f(
766
+ Conv2d(
767
+ 32,
768
+ 128,
769
+ (kernel_size, 1),
770
+ (stride, 1),
771
+ padding=(get_padding(kernel_size, 1), 0),
772
+ )
773
+ ),
774
+ norm_f(
775
+ Conv2d(
776
+ 128,
777
+ 512,
778
+ (kernel_size, 1),
779
+ (stride, 1),
780
+ padding=(get_padding(kernel_size, 1), 0),
781
+ )
782
+ ),
783
+ norm_f(
784
+ Conv2d(
785
+ 512,
786
+ 1024,
787
+ (kernel_size, 1),
788
+ (stride, 1),
789
+ padding=(get_padding(kernel_size, 1), 0),
790
+ )
791
+ ),
792
+ norm_f(
793
+ Conv2d(
794
+ 1024,
795
+ 1024,
796
+ (kernel_size, 1),
797
+ 1,
798
+ padding=(get_padding(kernel_size, 1), 0),
799
+ )
800
+ ),
801
+ ]
802
+ )
803
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
804
+
805
+ def forward(self, x):
806
+ fmap = []
807
+
808
+ # 1d to 2d
809
+ b, c, t = x.shape
810
+ if t % self.period != 0: # pad first
811
+ n_pad = self.period - (t % self.period)
812
+ x = F.pad(x, (0, n_pad), "reflect")
813
+ t = t + n_pad
814
+ x = x.view(b, c, t // self.period, self.period)
815
+
816
+ for l in self.convs:
817
+ x = l(x)
818
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
819
+ fmap.append(x)
820
+ x = self.conv_post(x)
821
+ fmap.append(x)
822
+ x = torch.flatten(x, 1, -1)
823
+
824
+ return x, fmap
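What makes this file exportable is that forward takes the prior noise rnd as an explicit input, so tracing is deterministic. A hedged export sketch; the 768-dim v2 feature shape and the dynamic axes are assumptions, and net_g stands for an already constructed and loaded SynthesizerTrnMsNSFsidM:

import torch

def export_rvc_onnx(net_g, path="model.onnx", T=200):
    # dummy inputs matching SynthesizerTrnMsNSFsidM.forward above
    phone = torch.randn(1, T, 768)         # v2 ContentVec features
    phone_lengths = torch.tensor([T])
    pitch = torch.randint(1, 255, (1, T))  # coarse mel-scale pitch codes
    pitchf = torch.rand(1, T) * 400        # F0 in Hz
    ds = torch.tensor([0])                 # speaker id
    rnd = torch.randn(1, 192, T)           # explicit prior noise
    torch.onnx.export(
        net_g, (phone, phone_lengths, pitch, pitchf, ds, rnd), path,
        input_names=["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"],
        output_names=["audio"],
        dynamic_axes={"phone": {1: "t"}, "pitch": {1: "t"},
                      "pitchf": {1: "t"}, "rnd": {2: "t"}},
    )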
infer/lib/infer_pack/modules.py ADDED
@@ -0,0 +1,521 @@
1
+ import copy
2
+ import math
3
+
4
+ import numpy as np
5
+ import scipy
6
+ import torch
7
+ from torch import nn
8
+ from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
9
+ from torch.nn import functional as F
10
+ from torch.nn.utils import remove_weight_norm, weight_norm
11
+
12
+ from infer.lib.infer_pack import commons
13
+ from infer.lib.infer_pack.commons import get_padding, init_weights
14
+ from infer.lib.infer_pack.transforms import piecewise_rational_quadratic_transform
15
+
16
+ LRELU_SLOPE = 0.1
17
+
18
+
19
+ class LayerNorm(nn.Module):
20
+ def __init__(self, channels, eps=1e-5):
21
+ super().__init__()
22
+ self.channels = channels
23
+ self.eps = eps
24
+
25
+ self.gamma = nn.Parameter(torch.ones(channels))
26
+ self.beta = nn.Parameter(torch.zeros(channels))
27
+
28
+ def forward(self, x):
29
+ x = x.transpose(1, -1)
30
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
31
+ return x.transpose(1, -1)
32
+
33
+
34
+ class ConvReluNorm(nn.Module):
35
+ def __init__(
36
+ self,
37
+ in_channels,
38
+ hidden_channels,
39
+ out_channels,
40
+ kernel_size,
41
+ n_layers,
42
+ p_dropout,
43
+ ):
44
+ super().__init__()
45
+ self.in_channels = in_channels
46
+ self.hidden_channels = hidden_channels
47
+ self.out_channels = out_channels
48
+ self.kernel_size = kernel_size
49
+ self.n_layers = n_layers
50
+ self.p_dropout = p_dropout
51
+ assert n_layers > 1, "Number of layers should be larger than 1."
52
+
53
+ self.conv_layers = nn.ModuleList()
54
+ self.norm_layers = nn.ModuleList()
55
+ self.conv_layers.append(
56
+ nn.Conv1d(
57
+ in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
58
+ )
59
+ )
60
+ self.norm_layers.append(LayerNorm(hidden_channels))
61
+ self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
62
+ for _ in range(n_layers - 1):
63
+ self.conv_layers.append(
64
+ nn.Conv1d(
65
+ hidden_channels,
66
+ hidden_channels,
67
+ kernel_size,
68
+ padding=kernel_size // 2,
69
+ )
70
+ )
71
+ self.norm_layers.append(LayerNorm(hidden_channels))
72
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
73
+ self.proj.weight.data.zero_()
74
+ self.proj.bias.data.zero_()
75
+
76
+ def forward(self, x, x_mask):
77
+ x_org = x
78
+ for i in range(self.n_layers):
79
+ x = self.conv_layers[i](x * x_mask)
80
+ x = self.norm_layers[i](x)
81
+ x = self.relu_drop(x)
82
+ x = x_org + self.proj(x)
83
+ return x * x_mask
84
+
85
+
86
+ class DDSConv(nn.Module):
87
+ """
88
+ Dilated and Depth-Separable Convolution
89
+ """
90
+
91
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
92
+ super().__init__()
93
+ self.channels = channels
94
+ self.kernel_size = kernel_size
95
+ self.n_layers = n_layers
96
+ self.p_dropout = p_dropout
97
+
98
+ self.drop = nn.Dropout(p_dropout)
99
+ self.convs_sep = nn.ModuleList()
100
+ self.convs_1x1 = nn.ModuleList()
101
+ self.norms_1 = nn.ModuleList()
102
+ self.norms_2 = nn.ModuleList()
103
+ for i in range(n_layers):
104
+ dilation = kernel_size**i
105
+ padding = (kernel_size * dilation - dilation) // 2
106
+ self.convs_sep.append(
107
+ nn.Conv1d(
108
+ channels,
109
+ channels,
110
+ kernel_size,
111
+ groups=channels,
112
+ dilation=dilation,
113
+ padding=padding,
114
+ )
115
+ )
116
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
117
+ self.norms_1.append(LayerNorm(channels))
118
+ self.norms_2.append(LayerNorm(channels))
119
+
120
+ def forward(self, x, x_mask, g=None):
121
+ if g is not None:
122
+ x = x + g
123
+ for i in range(self.n_layers):
124
+ y = self.convs_sep[i](x * x_mask)
125
+ y = self.norms_1[i](y)
126
+ y = F.gelu(y)
127
+ y = self.convs_1x1[i](y)
128
+ y = self.norms_2[i](y)
129
+ y = F.gelu(y)
130
+ y = self.drop(y)
131
+ x = x + y
132
+ return x * x_mask
133
+
134
+
135
+ class WN(torch.nn.Module):
136
+ def __init__(
137
+ self,
138
+ hidden_channels,
139
+ kernel_size,
140
+ dilation_rate,
141
+ n_layers,
142
+ gin_channels=0,
143
+ p_dropout=0,
144
+ ):
145
+ super(WN, self).__init__()
146
+ assert kernel_size % 2 == 1
147
+ self.hidden_channels = hidden_channels
148
+ self.kernel_size = kernel_size
149
+ self.dilation_rate = dilation_rate
150
+ self.n_layers = n_layers
151
+ self.gin_channels = gin_channels
152
+ self.p_dropout = p_dropout
153
+
154
+ self.in_layers = torch.nn.ModuleList()
155
+ self.res_skip_layers = torch.nn.ModuleList()
156
+ self.drop = nn.Dropout(p_dropout)
157
+
158
+ if gin_channels != 0:
159
+ cond_layer = torch.nn.Conv1d(
160
+ gin_channels, 2 * hidden_channels * n_layers, 1
161
+ )
162
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
163
+
164
+ for i in range(n_layers):
165
+ dilation = dilation_rate**i
166
+ padding = int((kernel_size * dilation - dilation) / 2)
167
+ in_layer = torch.nn.Conv1d(
168
+ hidden_channels,
169
+ 2 * hidden_channels,
170
+ kernel_size,
171
+ dilation=dilation,
172
+ padding=padding,
173
+ )
174
+ in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
175
+ self.in_layers.append(in_layer)
176
+
177
+ # last one is not necessary
178
+ if i < n_layers - 1:
179
+ res_skip_channels = 2 * hidden_channels
180
+ else:
181
+ res_skip_channels = hidden_channels
182
+
183
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
184
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
185
+ self.res_skip_layers.append(res_skip_layer)
186
+
187
+ def forward(self, x, x_mask, g=None, **kwargs):
188
+ output = torch.zeros_like(x)
189
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
190
+
191
+ if g is not None:
192
+ g = self.cond_layer(g)
193
+
194
+ for i in range(self.n_layers):
195
+ x_in = self.in_layers[i](x)
196
+ if g is not None:
197
+ cond_offset = i * 2 * self.hidden_channels
198
+ g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
199
+ else:
200
+ g_l = torch.zeros_like(x_in)
201
+
202
+ acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
203
+ acts = self.drop(acts)
204
+
205
+ res_skip_acts = self.res_skip_layers[i](acts)
206
+ if i < self.n_layers - 1:
207
+ res_acts = res_skip_acts[:, : self.hidden_channels, :]
208
+ x = (x + res_acts) * x_mask
209
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
210
+ else:
211
+ output = output + res_skip_acts
212
+ return output * x_mask
213
+
214
+ def remove_weight_norm(self):
215
+ if self.gin_channels != 0:
216
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
217
+ for l in self.in_layers:
218
+ torch.nn.utils.remove_weight_norm(l)
219
+ for l in self.res_skip_layers:
220
+ torch.nn.utils.remove_weight_norm(l)
221
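commons.fused_add_tanh_sigmoid_multiply is not part of this diff; in WaveNet-style WN blocks that name conventionally denotes the gate below (a sketch of the convention, not a quote of commons.py):

import torch

def gated_activation(x_in, g_l, n_channels):
    # first n_channels go through tanh, the rest through sigmoid, then multiply
    z = x_in + g_l
    return torch.tanh(z[:, :n_channels, :]) * torch.sigmoid(z[:, n_channels:, :])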
+
222
+
223
+ class ResBlock1(torch.nn.Module):
224
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
225
+ super(ResBlock1, self).__init__()
226
+ self.convs1 = nn.ModuleList(
227
+ [
228
+ weight_norm(
229
+ Conv1d(
230
+ channels,
231
+ channels,
232
+ kernel_size,
233
+ 1,
234
+ dilation=dilation[0],
235
+ padding=get_padding(kernel_size, dilation[0]),
236
+ )
237
+ ),
238
+ weight_norm(
239
+ Conv1d(
240
+ channels,
241
+ channels,
242
+ kernel_size,
243
+ 1,
244
+ dilation=dilation[1],
245
+ padding=get_padding(kernel_size, dilation[1]),
246
+ )
247
+ ),
248
+ weight_norm(
249
+ Conv1d(
250
+ channels,
251
+ channels,
252
+ kernel_size,
253
+ 1,
254
+ dilation=dilation[2],
255
+ padding=get_padding(kernel_size, dilation[2]),
256
+ )
257
+ ),
258
+ ]
259
+ )
260
+ self.convs1.apply(init_weights)
261
+
262
+ self.convs2 = nn.ModuleList(
263
+ [
264
+ weight_norm(
265
+ Conv1d(
266
+ channels,
267
+ channels,
268
+ kernel_size,
269
+ 1,
270
+ dilation=1,
271
+ padding=get_padding(kernel_size, 1),
272
+ )
273
+ ),
274
+ weight_norm(
275
+ Conv1d(
276
+ channels,
277
+ channels,
278
+ kernel_size,
279
+ 1,
280
+ dilation=1,
281
+ padding=get_padding(kernel_size, 1),
282
+ )
283
+ ),
284
+ weight_norm(
285
+ Conv1d(
286
+ channels,
287
+ channels,
288
+ kernel_size,
289
+ 1,
290
+ dilation=1,
291
+ padding=get_padding(kernel_size, 1),
292
+ )
293
+ ),
294
+ ]
295
+ )
296
+ self.convs2.apply(init_weights)
297
+
298
+ def forward(self, x, x_mask=None):
299
+ for c1, c2 in zip(self.convs1, self.convs2):
300
+ xt = F.leaky_relu(x, LRELU_SLOPE)
301
+ if x_mask is not None:
302
+ xt = xt * x_mask
303
+ xt = c1(xt)
304
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
305
+ if x_mask is not None:
306
+ xt = xt * x_mask
307
+ xt = c2(xt)
308
+ x = xt + x
309
+ if x_mask is not None:
310
+ x = x * x_mask
311
+ return x
312
+
313
+ def remove_weight_norm(self):
314
+ for l in self.convs1:
315
+ remove_weight_norm(l)
316
+ for l in self.convs2:
317
+ remove_weight_norm(l)
318
+
319
+
320
+ class ResBlock2(torch.nn.Module):
321
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
322
+ super(ResBlock2, self).__init__()
323
+ self.convs = nn.ModuleList(
324
+ [
325
+ weight_norm(
326
+ Conv1d(
327
+ channels,
328
+ channels,
329
+ kernel_size,
330
+ 1,
331
+ dilation=dilation[0],
332
+ padding=get_padding(kernel_size, dilation[0]),
333
+ )
334
+ ),
335
+ weight_norm(
336
+ Conv1d(
337
+ channels,
338
+ channels,
339
+ kernel_size,
340
+ 1,
341
+ dilation=dilation[1],
342
+ padding=get_padding(kernel_size, dilation[1]),
343
+ )
344
+ ),
345
+ ]
346
+ )
347
+ self.convs.apply(init_weights)
348
+
349
+ def forward(self, x, x_mask=None):
350
+ for c in self.convs:
351
+ xt = F.leaky_relu(x, LRELU_SLOPE)
352
+ if x_mask is not None:
353
+ xt = xt * x_mask
354
+ xt = c(xt)
355
+ x = xt + x
356
+ if x_mask is not None:
357
+ x = x * x_mask
358
+ return x
359
+
360
+ def remove_weight_norm(self):
361
+ for l in self.convs:
362
+ remove_weight_norm(l)
363
+
364
+
365
+ class Log(nn.Module):
366
+ def forward(self, x, x_mask, reverse=False, **kwargs):
367
+ if not reverse:
368
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
369
+ logdet = torch.sum(-y, [1, 2])
370
+ return y, logdet
371
+ else:
372
+ x = torch.exp(x) * x_mask
373
+ return x
374
+
375
+
376
+ class Flip(nn.Module):
377
+ def forward(self, x, *args, reverse=False, **kwargs):
378
+ x = torch.flip(x, [1])
379
+ if not reverse:
380
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
381
+ return x, logdet
382
+ else:
383
+ return x
384
+
385
+
386
+ class ElementwiseAffine(nn.Module):
387
+ def __init__(self, channels):
388
+ super().__init__()
389
+ self.channels = channels
390
+ self.m = nn.Parameter(torch.zeros(channels, 1))
391
+ self.logs = nn.Parameter(torch.zeros(channels, 1))
392
+
393
+ def forward(self, x, x_mask, reverse=False, **kwargs):
394
+ if not reverse:
395
+ y = self.m + torch.exp(self.logs) * x
396
+ y = y * x_mask
397
+ logdet = torch.sum(self.logs * x_mask, [1, 2])
398
+ return y, logdet
399
+ else:
400
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
401
+ return x
402
+
403
+
404
+ class ResidualCouplingLayer(nn.Module):
405
+ def __init__(
406
+ self,
407
+ channels,
408
+ hidden_channels,
409
+ kernel_size,
410
+ dilation_rate,
411
+ n_layers,
412
+ p_dropout=0,
413
+ gin_channels=0,
414
+ mean_only=False,
415
+ ):
416
+ assert channels % 2 == 0, "channels should be divisible by 2"
417
+ super().__init__()
418
+ self.channels = channels
419
+ self.hidden_channels = hidden_channels
420
+ self.kernel_size = kernel_size
421
+ self.dilation_rate = dilation_rate
422
+ self.n_layers = n_layers
423
+ self.half_channels = channels // 2
424
+ self.mean_only = mean_only
425
+
426
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
427
+ self.enc = WN(
428
+ hidden_channels,
429
+ kernel_size,
430
+ dilation_rate,
431
+ n_layers,
432
+ p_dropout=p_dropout,
433
+ gin_channels=gin_channels,
434
+ )
435
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
436
+ self.post.weight.data.zero_()
437
+ self.post.bias.data.zero_()
438
+
439
+ def forward(self, x, x_mask, g=None, reverse=False):
440
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
441
+ h = self.pre(x0) * x_mask
442
+ h = self.enc(h, x_mask, g=g)
443
+ stats = self.post(h) * x_mask
444
+ if not self.mean_only:
445
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
446
+ else:
447
+ m = stats
448
+ logs = torch.zeros_like(m)
449
+
450
+ if not reverse:
451
+ x1 = m + x1 * torch.exp(logs) * x_mask
452
+ x = torch.cat([x0, x1], 1)
453
+ logdet = torch.sum(logs, [1, 2])
454
+ return x, logdet
455
+ else:
456
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
457
+ x = torch.cat([x0, x1], 1)
458
+ return x
459
+
460
+ def remove_weight_norm(self):
461
+ self.enc.remove_weight_norm()
462
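Because x0 passes through unchanged and x1 only receives an affine map conditioned on x0, reverse=True undoes forward exactly (up to float error). A standalone numerical check of that property:

import torch

x1 = torch.randn(1, 2, 10)
m, logs = torch.randn_like(x1), 0.1 * torch.randn_like(x1)

y1 = m + x1 * torch.exp(logs)         # forward half of the coupling
x1_rec = (y1 - m) * torch.exp(-logs)  # reverse half
assert torch.allclose(x1, x1_rec, atol=1e-5)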
+
463
+
464
+ class ConvFlow(nn.Module):
465
+ def __init__(
466
+ self,
467
+ in_channels,
468
+ filter_channels,
469
+ kernel_size,
470
+ n_layers,
471
+ num_bins=10,
472
+ tail_bound=5.0,
473
+ ):
474
+ super().__init__()
475
+ self.in_channels = in_channels
476
+ self.filter_channels = filter_channels
477
+ self.kernel_size = kernel_size
478
+ self.n_layers = n_layers
479
+ self.num_bins = num_bins
480
+ self.tail_bound = tail_bound
481
+ self.half_channels = in_channels // 2
482
+
483
+ self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
484
+ self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
485
+ self.proj = nn.Conv1d(
486
+ filter_channels, self.half_channels * (num_bins * 3 - 1), 1
487
+ )
488
+ self.proj.weight.data.zero_()
489
+ self.proj.bias.data.zero_()
490
+
491
+ def forward(self, x, x_mask, g=None, reverse=False):
492
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
493
+ h = self.pre(x0)
494
+ h = self.convs(h, x_mask, g=g)
495
+ h = self.proj(h) * x_mask
496
+
497
+ b, c, t = x0.shape
498
+ h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
499
+
500
+ unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
501
+ unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
502
+ self.filter_channels
503
+ )
504
+ unnormalized_derivatives = h[..., 2 * self.num_bins :]
505
+
506
+ x1, logabsdet = piecewise_rational_quadratic_transform(
507
+ x1,
508
+ unnormalized_widths,
509
+ unnormalized_heights,
510
+ unnormalized_derivatives,
511
+ inverse=reverse,
512
+ tails="linear",
513
+ tail_bound=self.tail_bound,
514
+ )
515
+
516
+ x = torch.cat([x0, x1], 1) * x_mask
517
+ logdet = torch.sum(logabsdet * x_mask, [1, 2])
518
+ if not reverse:
519
+ return x, logdet
520
+ else:
521
+ return x
infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py ADDED
@@ -0,0 +1,91 @@
1
+ import numpy as np
2
+ import pyworld
3
+
4
+ from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
5
+
6
+
7
+ class DioF0Predictor(F0Predictor):
8
+ def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
9
+ self.hop_length = hop_length
10
+ self.f0_min = f0_min
11
+ self.f0_max = f0_max
12
+ self.sampling_rate = sampling_rate
13
+
14
+ def interpolate_f0(self, f0):
15
+ """
16
+ Interpolate the F0 contour across unvoiced frames
17
+ """
18
+
19
+ data = np.reshape(f0, (f0.size, 1))
20
+
21
+ vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
22
+ vuv_vector[data > 0.0] = 1.0
23
+ vuv_vector[data <= 0.0] = 0.0
24
+
25
+ ip_data = data
26
+
27
+ frame_number = data.size
28
+ last_value = 0.0
29
+ for i in range(frame_number):
30
+ if data[i] <= 0.0:
31
+ j = i + 1
32
+ for j in range(i + 1, frame_number):
33
+ if data[j] > 0.0:
34
+ break
35
+ if j < frame_number - 1:
36
+ if last_value > 0.0:
37
+ step = (data[j] - data[i - 1]) / float(j - i)
38
+ for k in range(i, j):
39
+ ip_data[k] = data[i - 1] + step * (k - i + 1)
40
+ else:
41
+ for k in range(i, j):
42
+ ip_data[k] = data[j]
43
+ else:
44
+ for k in range(i, frame_number):
45
+ ip_data[k] = last_value
46
+ else:
47
+ ip_data[i] = data[i] # this copy may be unnecessary
48
+ last_value = data[i]
49
+
50
+ return ip_data[:, 0], vuv_vector[:, 0]
51
+
52
+ def resize_f0(self, x, target_len):
53
+ source = np.array(x)
54
+ source[source < 0.001] = np.nan
55
+ target = np.interp(
56
+ np.arange(0, len(source) * target_len, len(source)) / target_len,
57
+ np.arange(0, len(source)),
58
+ source,
59
+ )
60
+ res = np.nan_to_num(target)
61
+ return res
62
+
63
+ def compute_f0(self, wav, p_len=None):
64
+ if p_len is None:
65
+ p_len = wav.shape[0] // self.hop_length
66
+ f0, t = pyworld.dio(
67
+ wav.astype(np.double),
68
+ fs=self.sampling_rate,
69
+ f0_floor=self.f0_min,
70
+ f0_ceil=self.f0_max,
71
+ frame_period=1000 * self.hop_length / self.sampling_rate,
72
+ )
73
+ f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
74
+ for index, pitch in enumerate(f0):
75
+ f0[index] = round(pitch, 1)
76
+ return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
77
+
78
+ def compute_f0_uv(self, wav, p_len=None):
79
+ if p_len is None:
80
+ p_len = wav.shape[0] // self.hop_length
81
+ f0, t = pyworld.dio(
82
+ wav.astype(np.double),
83
+ fs=self.sampling_rate,
84
+ f0_floor=self.f0_min,
85
+ f0_ceil=self.f0_max,
86
+ frame_period=1000 * self.hop_length / self.sampling_rate,
87
+ )
88
+ f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
89
+ for index, pitch in enumerate(f0):
90
+ f0[index] = round(pitch, 1)
91
+ return self.interpolate_f0(self.resize_f0(f0, p_len))
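A minimal usage sketch for the predictor above (the 16 kHz rate, hop of 160, and synthetic test tone are illustrative; pyworld must be installed):

import numpy as np
from infer.lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor

sr = 16000
t = np.arange(sr) / sr
wav = 0.5 * np.sin(2 * np.pi * 220 * t)        # 1 s test tone at 220 Hz
predictor = DioF0Predictor(hop_length=160, sampling_rate=sr)
f0 = predictor.compute_f0(wav)                 # ~100 frames
print(f0.shape, float(np.median(f0[f0 > 0])))  # median should sit near 220 Hz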
infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py ADDED
@@ -0,0 +1,16 @@
1
+ class F0Predictor(object):
2
+ def compute_f0(self, wav, p_len):
3
+ """
4
+ input: wav:[signal_length]
5
+ p_len:int
6
+ output: f0:[signal_length//hop_length]
7
+ """
8
+ pass
9
+
10
+ def compute_f0_uv(self, wav, p_len):
11
+ """
12
+ input: wav:[signal_length]
13
+ p_len:int
14
+ output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
15
+ """
16
+ pass
infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py ADDED
@@ -0,0 +1,87 @@
1
+ import numpy as np
2
+ import pyworld
3
+
4
+ from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
5
+
6
+
7
+ class HarvestF0Predictor(F0Predictor):
8
+ def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
9
+ self.hop_length = hop_length
10
+ self.f0_min = f0_min
11
+ self.f0_max = f0_max
12
+ self.sampling_rate = sampling_rate
13
+
14
+ def interpolate_f0(self, f0):
15
+ """
16
+ Interpolate the F0 contour across unvoiced frames
17
+ """
18
+
19
+ data = np.reshape(f0, (f0.size, 1))
20
+
21
+ vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
22
+ vuv_vector[data > 0.0] = 1.0
23
+ vuv_vector[data <= 0.0] = 0.0
24
+
25
+ ip_data = data
26
+
27
+ frame_number = data.size
28
+ last_value = 0.0
29
+ for i in range(frame_number):
30
+ if data[i] <= 0.0:
31
+ j = i + 1
32
+ for j in range(i + 1, frame_number):
33
+ if data[j] > 0.0:
34
+ break
35
+ if j < frame_number - 1:
36
+ if last_value > 0.0:
37
+ step = (data[j] - data[i - 1]) / float(j - i)
38
+ for k in range(i, j):
39
+ ip_data[k] = data[i - 1] + step * (k - i + 1)
40
+ else:
41
+ for k in range(i, j):
42
+ ip_data[k] = data[j]
43
+ else:
44
+ for k in range(i, frame_number):
45
+ ip_data[k] = last_value
46
+ else:
47
+ ip_data[i] = data[i] # this copy may be unnecessary
48
+ last_value = data[i]
49
+
50
+ return ip_data[:, 0], vuv_vector[:, 0]
51
+
52
+ def resize_f0(self, x, target_len):
53
+ source = np.array(x)
54
+ source[source < 0.001] = np.nan
55
+ target = np.interp(
56
+ np.arange(0, len(source) * target_len, len(source)) / target_len,
57
+ np.arange(0, len(source)),
58
+ source,
59
+ )
60
+ res = np.nan_to_num(target)
61
+ return res
62
+
63
+ def compute_f0(self, wav, p_len=None):
64
+ if p_len is None:
65
+ p_len = wav.shape[0] // self.hop_length
66
+ f0, t = pyworld.harvest(
67
+ wav.astype(np.double),
68
+ fs=self.sampling_rate,
69
+ f0_ceil=self.f0_max,
70
+ f0_floor=self.f0_min,
71
+ frame_period=1000 * self.hop_length / self.sampling_rate,
72
+ )
73
+ f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
74
+ return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
75
+
76
+ def compute_f0_uv(self, wav, p_len=None):
77
+ if p_len is None:
78
+ p_len = wav.shape[0] // self.hop_length
79
+ f0, t = pyworld.harvest(
80
+ wav.astype(np.double),
81
+ fs=self.sampling_rate,
82
+ f0_floor=self.f0_min,
83
+ f0_ceil=self.f0_max,
84
+ frame_period=1000 * self.hop_length / self.sampling_rate,
85
+ )
86
+ f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
87
+ return self.interpolate_f0(self.resize_f0(f0, p_len))
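resize_f0, shared by the Dio and Harvest predictors, resamples the F0 track to target_len frames by linear interpolation over fractional indices; near-zero (unvoiced) values are masked as NaN first so they come back as 0 rather than dragging the interpolation down. An equivalent standalone rewrite:

import numpy as np

def resize_f0(x, target_len):
    source = np.array(x, dtype=float)
    source[source < 0.001] = np.nan            # mask unvoiced frames
    idx = np.arange(target_len) * len(source) / target_len
    return np.nan_to_num(np.interp(idx, np.arange(len(source)), source))

print(resize_f0([100.0, 110.0, 0.0, 220.0], 8))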
infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py ADDED
@@ -0,0 +1,98 @@
1
+ import numpy as np
2
+ import parselmouth
3
+
4
+ from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
5
+
6
+
7
+ class PMF0Predictor(F0Predictor):
8
+ def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
9
+ self.hop_length = hop_length
10
+ self.f0_min = f0_min
11
+ self.f0_max = f0_max
12
+ self.sampling_rate = sampling_rate
13
+
14
+ def interpolate_f0(self, f0):
15
+ """
16
+ Interpolate the F0 contour across unvoiced frames
17
+ """
18
+
19
+ data = np.reshape(f0, (f0.size, 1))
20
+
21
+ vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
22
+ vuv_vector[data > 0.0] = 1.0
23
+ vuv_vector[data <= 0.0] = 0.0
24
+
25
+ ip_data = data
26
+
27
+ frame_number = data.size
28
+ last_value = 0.0
29
+ for i in range(frame_number):
30
+ if data[i] <= 0.0:
31
+ j = i + 1
32
+ for j in range(i + 1, frame_number):
33
+ if data[j] > 0.0:
34
+ break
35
+ if j < frame_number - 1:
36
+ if last_value > 0.0:
37
+ step = (data[j] - data[i - 1]) / float(j - i)
38
+ for k in range(i, j):
39
+ ip_data[k] = data[i - 1] + step * (k - i + 1)
40
+ else:
41
+ for k in range(i, j):
42
+ ip_data[k] = data[j]
43
+ else:
44
+ for k in range(i, frame_number):
45
+ ip_data[k] = last_value
46
+ else:
47
+ ip_data[i] = data[i] # this copy may be unnecessary
48
+ last_value = data[i]
49
+
50
+ return ip_data[:, 0], vuv_vector[:, 0]
51
+
52
+ def compute_f0(self, wav, p_len=None):
53
+ x = wav
54
+ if p_len is None:
55
+ p_len = x.shape[0] // self.hop_length
56
+ else:
57
+ assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
58
+ time_step = self.hop_length / self.sampling_rate * 1000
59
+ f0 = (
60
+ parselmouth.Sound(x, self.sampling_rate)
61
+ .to_pitch_ac(
62
+ time_step=time_step / 1000,
63
+ voicing_threshold=0.6,
64
+ pitch_floor=self.f0_min,
65
+ pitch_ceiling=self.f0_max,
66
+ )
67
+ .selected_array["frequency"]
68
+ )
69
+
70
+ pad_size = (p_len - len(f0) + 1) // 2
71
+ if pad_size > 0 or p_len - len(f0) - pad_size > 0:
72
+ f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
73
+ f0, uv = self.interpolate_f0(f0)
74
+ return f0
75
+
76
+ def compute_f0_uv(self, wav, p_len=None):
77
+ x = wav
78
+ if p_len is None:
79
+ p_len = x.shape[0] // self.hop_length
80
+ else:
81
+ assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
82
+ time_step = self.hop_length / self.sampling_rate * 1000
83
+ f0 = (
84
+ parselmouth.Sound(x, self.sampling_rate)
85
+ .to_pitch_ac(
86
+ time_step=time_step / 1000,
87
+ voicing_threshold=0.6,
88
+ pitch_floor=self.f0_min,
89
+ pitch_ceiling=self.f0_max,
90
+ )
91
+ .selected_array["frequency"]
92
+ )
93
+
94
+ pad_size = (p_len - len(f0) + 1) // 2
95
+ if pad_size > 0 or p_len - len(f0) - pad_size > 0:
96
+ f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
97
+ f0, uv = self.interpolate_f0(f0)
98
+ return f0, uv
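The symmetric pad at the end reconciles parselmouth's frame count with p_len: half the deficit (rounded up) goes in front, the remainder at the back. A sketch of the arithmetic:

import numpy as np

p_len, f0 = 100, np.ones(96)            # predictor came back 4 frames short
pad_size = (p_len - len(f0) + 1) // 2   # 2 frames in front
tail = p_len - len(f0) - pad_size       # 2 frames at the back
f0 = np.pad(f0, [[pad_size, tail]], mode="constant")
assert len(f0) == p_len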
infer/lib/infer_pack/modules/F0Predictor/__init__.py ADDED
File without changes
infer/lib/infer_pack/onnx_inference.py ADDED
@@ -0,0 +1,149 @@
1
+ import librosa
2
+ import numpy as np
3
+ import onnxruntime
4
+ import soundfile
5
+
6
+ import logging
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ class ContentVec:
12
+ def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None):
13
+ logger.info("Load model(s) from {}".format(vec_path))
14
+ if device == "cpu" or device is None:
15
+ providers = ["CPUExecutionProvider"]
16
+ elif device == "cuda":
17
+ providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
18
+ elif device == "dml":
19
+ providers = ["DmlExecutionProvider"]
20
+ else:
21
+ raise RuntimeError("Unsportted Device")
22
+ self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
23
+
24
+ def __call__(self, wav):
25
+ return self.forward(wav)
26
+
27
+ def forward(self, wav):
28
+ feats = wav
29
+ if feats.ndim == 2: # double channels
30
+ feats = feats.mean(-1)
31
+ assert feats.ndim == 1, feats.ndim
32
+ feats = np.expand_dims(np.expand_dims(feats, 0), 0)
33
+ onnx_input = {self.model.get_inputs()[0].name: feats}
34
+ logits = self.model.run(None, onnx_input)[0]
35
+ return logits.transpose(0, 2, 1)
36
+
37
+
38
+ def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kwargs):
39
+ if f0_predictor == "pm":
40
+ from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor
41
+
42
+ f0_predictor_object = PMF0Predictor(
43
+ hop_length=hop_length, sampling_rate=sampling_rate
44
+ )
45
+ elif f0_predictor == "harvest":
46
+ from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import (
47
+ HarvestF0Predictor,
48
+ )
49
+
50
+ f0_predictor_object = HarvestF0Predictor(
51
+ hop_length=hop_length, sampling_rate=sampling_rate
52
+ )
53
+ elif f0_predictor == "dio":
54
+ from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor
55
+
56
+ f0_predictor_object = DioF0Predictor(
57
+ hop_length=hop_length, sampling_rate=sampling_rate
58
+ )
59
+ else:
60
+ raise Exception("Unknown f0 predictor")
61
+ return f0_predictor_object
62
+
63
+
64
+ class OnnxRVC:
65
+ def __init__(
66
+ self,
67
+ model_path,
68
+ sr=40000,
69
+ hop_size=512,
70
+ vec_path="vec-768-layer-12",
71
+ device="cpu",
72
+ ):
73
+ vec_path = f"pretrained/{vec_path}.onnx"
74
+ self.vec_model = ContentVec(vec_path, device)
75
+ if device == "cpu" or device is None:
76
+ providers = ["CPUExecutionProvider"]
77
+ elif device == "cuda":
78
+ providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
79
+ elif device == "dml":
80
+ providers = ["DmlExecutionProvider"]
81
+ else:
82
+ raise RuntimeError("Unsportted Device")
83
+ self.model = onnxruntime.InferenceSession(model_path, providers=providers)
84
+ self.sampling_rate = sr
85
+ self.hop_size = hop_size
86
+
87
+ def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd):
88
+ onnx_input = {
89
+ self.model.get_inputs()[0].name: hubert,
90
+ self.model.get_inputs()[1].name: hubert_length,
91
+ self.model.get_inputs()[2].name: pitch,
92
+ self.model.get_inputs()[3].name: pitchf,
93
+ self.model.get_inputs()[4].name: ds,
94
+ self.model.get_inputs()[5].name: rnd,
95
+ }
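+ # Run the ONNX synthesizer and convert the float waveform to 16-bit PCM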
96
+ return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16)
97
+
98
+ def inference(
99
+ self,
100
+ raw_path,
101
+ sid,
102
+ f0_method="dio",
103
+ f0_up_key=0,
104
+ pad_time=0.5,
105
+ cr_threshold=0.02,
106
+ ):
107
+ f0_min = 50
108
+ f0_max = 1100
109
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
110
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
111
+ f0_predictor = get_f0_predictor(
112
+ f0_method,
113
+ hop_length=self.hop_size,
114
+ sampling_rate=self.sampling_rate,
115
+ threshold=cr_threshold,
116
+ )
117
+ wav, sr = librosa.load(raw_path, sr=self.sampling_rate)
118
+ org_length = len(wav)
119
+ if org_length / sr > 50.0:
120
+ raise RuntimeError("Input audio exceeds the 50 second limit")
121
+
122
+ wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000)
124
+
125
+ hubert = self.vec_model(wav16k)
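+ # Duplicate each ContentVec frame along the time axis to double the feature rate before aligning with f0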
126
+ hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32)
127
+ hubert_length = hubert.shape[1]
128
+
129
+ pitchf = f0_predictor.compute_f0(wav, hubert_length)
130
+ pitchf = pitchf * 2 ** (f0_up_key / 12)
131
+ pitch = pitchf.copy()
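+ # Map f0 to the mel scale and quantize to coarse bins in [1, 255]; unvoiced frames fall into bin 1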
132
+ f0_mel = 1127 * np.log(1 + pitch / 700)
133
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
134
+ f0_mel_max - f0_mel_min
135
+ ) + 1
136
+ f0_mel[f0_mel <= 1] = 1
137
+ f0_mel[f0_mel > 255] = 255
138
+ pitch = np.rint(f0_mel).astype(np.int64)
139
+
140
+ pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32)
141
+ pitch = pitch.reshape(1, len(pitch))
142
+ ds = np.array([sid]).astype(np.int64)
143
+
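+ # Random noise input shaped (1, 192, frames), fed as the model's sixth ONNX input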
144
+ rnd = np.random.randn(1, 192, hubert_length).astype(np.float32)
145
+ hubert_length = np.array([hubert_length]).astype(np.int64)
146
+
147
+ out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze()
148
+ out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant")
149
+ return out_wav[0:org_length]
infer/lib/infer_pack/transforms.py ADDED
@@ -0,0 +1,207 @@
1
+ import numpy as np
2
+ import torch
3
+ from torch.nn import functional as F
4
+
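+ # Piecewise rational-quadratic spline transforms, as used in VITS-style flows (cf. Durkan et al., "Neural Spline Flows", 2019)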
5
+ DEFAULT_MIN_BIN_WIDTH = 1e-3
6
+ DEFAULT_MIN_BIN_HEIGHT = 1e-3
7
+ DEFAULT_MIN_DERIVATIVE = 1e-3
8
+
9
+
10
+ def piecewise_rational_quadratic_transform(
11
+ inputs,
12
+ unnormalized_widths,
13
+ unnormalized_heights,
14
+ unnormalized_derivatives,
15
+ inverse=False,
16
+ tails=None,
17
+ tail_bound=1.0,
18
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
19
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
20
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
21
+ ):
22
+ if tails is None:
23
+ spline_fn = rational_quadratic_spline
24
+ spline_kwargs = {}
25
+ else:
26
+ spline_fn = unconstrained_rational_quadratic_spline
27
+ spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
28
+
29
+ outputs, logabsdet = spline_fn(
30
+ inputs=inputs,
31
+ unnormalized_widths=unnormalized_widths,
32
+ unnormalized_heights=unnormalized_heights,
33
+ unnormalized_derivatives=unnormalized_derivatives,
34
+ inverse=inverse,
35
+ min_bin_width=min_bin_width,
36
+ min_bin_height=min_bin_height,
37
+ min_derivative=min_derivative,
38
+ **spline_kwargs
39
+ )
40
+ return outputs, logabsdet
41
+
42
+
43
+ def searchsorted(bin_locations, inputs, eps=1e-6):
44
+ bin_locations[..., -1] += eps
45
+ return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
46
+
47
+
48
+ def unconstrained_rational_quadratic_spline(
49
+ inputs,
50
+ unnormalized_widths,
51
+ unnormalized_heights,
52
+ unnormalized_derivatives,
53
+ inverse=False,
54
+ tails="linear",
55
+ tail_bound=1.0,
56
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
57
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
58
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
59
+ ):
60
+ inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
61
+ outside_interval_mask = ~inside_interval_mask
62
+
63
+ outputs = torch.zeros_like(inputs)
64
+ logabsdet = torch.zeros_like(inputs)
65
+
66
+ if tails == "linear":
67
+ unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
68
+ constant = np.log(np.exp(1 - min_derivative) - 1)
69
+ unnormalized_derivatives[..., 0] = constant
70
+ unnormalized_derivatives[..., -1] = constant
71
+
72
+ outputs[outside_interval_mask] = inputs[outside_interval_mask]
73
+ logabsdet[outside_interval_mask] = 0
74
+ else:
75
+ raise RuntimeError("{} tails are not implemented.".format(tails))
76
+
77
+ (
78
+ outputs[inside_interval_mask],
79
+ logabsdet[inside_interval_mask],
80
+ ) = rational_quadratic_spline(
81
+ inputs=inputs[inside_interval_mask],
82
+ unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
83
+ unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
84
+ unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
85
+ inverse=inverse,
86
+ left=-tail_bound,
87
+ right=tail_bound,
88
+ bottom=-tail_bound,
89
+ top=tail_bound,
90
+ min_bin_width=min_bin_width,
91
+ min_bin_height=min_bin_height,
92
+ min_derivative=min_derivative,
93
+ )
94
+
95
+ return outputs, logabsdet
96
+
97
+
98
+ def rational_quadratic_spline(
99
+ inputs,
100
+ unnormalized_widths,
101
+ unnormalized_heights,
102
+ unnormalized_derivatives,
103
+ inverse=False,
104
+ left=0.0,
105
+ right=1.0,
106
+ bottom=0.0,
107
+ top=1.0,
108
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
109
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
110
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
111
+ ):
112
+ if torch.min(inputs) < left or torch.max(inputs) > right:
113
+ raise ValueError("Input to a transform is not within its domain")
114
+
115
+ num_bins = unnormalized_widths.shape[-1]
116
+
117
+ if min_bin_width * num_bins > 1.0:
118
+ raise ValueError("Minimal bin width too large for the number of bins")
119
+ if min_bin_height * num_bins > 1.0:
120
+ raise ValueError("Minimal bin height too large for the number of bins")
121
+
122
+ widths = F.softmax(unnormalized_widths, dim=-1)
123
+ widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
124
+ cumwidths = torch.cumsum(widths, dim=-1)
125
+ cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
126
+ cumwidths = (right - left) * cumwidths + left
127
+ cumwidths[..., 0] = left
128
+ cumwidths[..., -1] = right
129
+ widths = cumwidths[..., 1:] - cumwidths[..., :-1]
130
+
131
+ derivatives = min_derivative + F.softplus(unnormalized_derivatives)
132
+
133
+ heights = F.softmax(unnormalized_heights, dim=-1)
134
+ heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
135
+ cumheights = torch.cumsum(heights, dim=-1)
136
+ cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
137
+ cumheights = (top - bottom) * cumheights + bottom
138
+ cumheights[..., 0] = bottom
139
+ cumheights[..., -1] = top
140
+ heights = cumheights[..., 1:] - cumheights[..., :-1]
141
+
142
+ if inverse:
143
+ bin_idx = searchsorted(cumheights, inputs)[..., None]
144
+ else:
145
+ bin_idx = searchsorted(cumwidths, inputs)[..., None]
146
+
147
+ input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
148
+ input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
149
+
150
+ input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
151
+ delta = heights / widths
152
+ input_delta = delta.gather(-1, bin_idx)[..., 0]
153
+
154
+ input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
155
+ input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
156
+
157
+ input_heights = heights.gather(-1, bin_idx)[..., 0]
158
+
159
+ if inverse:
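+ # Inverting the rational-quadratic map reduces to a quadratic a*root**2 + b*root + c = 0;
+ # the numerically stable root 2c / (-b - sqrt(discriminant)) is taken below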
160
+ a = (inputs - input_cumheights) * (
161
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
162
+ ) + input_heights * (input_delta - input_derivatives)
163
+ b = input_heights * input_derivatives - (inputs - input_cumheights) * (
164
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
165
+ )
166
+ c = -input_delta * (inputs - input_cumheights)
167
+
168
+ discriminant = b.pow(2) - 4 * a * c
169
+ assert (discriminant >= 0).all()
170
+
171
+ root = (2 * c) / (-b - torch.sqrt(discriminant))
172
+ outputs = root * input_bin_widths + input_cumwidths
173
+
174
+ theta_one_minus_theta = root * (1 - root)
175
+ denominator = input_delta + (
176
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
177
+ * theta_one_minus_theta
178
+ )
179
+ derivative_numerator = input_delta.pow(2) * (
180
+ input_derivatives_plus_one * root.pow(2)
181
+ + 2 * input_delta * theta_one_minus_theta
182
+ + input_derivatives * (1 - root).pow(2)
183
+ )
184
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
185
+
186
+ return outputs, -logabsdet
187
+ else:
188
+ theta = (inputs - input_cumwidths) / input_bin_widths
189
+ theta_one_minus_theta = theta * (1 - theta)
190
+
191
+ numerator = input_heights * (
192
+ input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
193
+ )
194
+ denominator = input_delta + (
195
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
196
+ * theta_one_minus_theta
197
+ )
198
+ outputs = input_cumheights + numerator / denominator
199
+
200
+ derivative_numerator = input_delta.pow(2) * (
201
+ input_derivatives_plus_one * theta.pow(2)
202
+ + 2 * input_delta * theta_one_minus_theta
203
+ + input_derivatives * (1 - theta).pow(2)
204
+ )
205
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
206
+
207
+ return outputs, logabsdet
infer/lib/rmvpe.py ADDED
@@ -0,0 +1,717 @@
1
+ import pdb, os
2
+
3
+ import numpy as np
4
+ import torch
5
+ try:
6
+ #Fix "Torch not compiled with CUDA enabled"
7
+ import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
8
+ if torch.xpu.is_available():
9
+ from infer.modules.ipex import ipex_init
10
+ ipex_init()
11
+ except Exception:
12
+ pass
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ from librosa.util import normalize, pad_center, tiny
16
+ from scipy.signal import get_window
17
+
18
+ import logging
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ ### STFT code adapted from https://github.com/pseeth/torch-stft/blob/master/torch_stft/util.py
24
+ def window_sumsquare(
25
+ window,
26
+ n_frames,
27
+ hop_length=200,
28
+ win_length=800,
29
+ n_fft=800,
30
+ dtype=np.float32,
31
+ norm=None,
32
+ ):
33
+ """
34
+ # from librosa 0.6
35
+ Compute the sum-square envelope of a window function at a given hop length.
36
+ This is used to estimate modulation effects induced by windowing
37
+ observations in short-time fourier transforms.
38
+ Parameters
39
+ ----------
40
+ window : string, tuple, number, callable, or list-like
41
+ Window specification, as in `get_window`
42
+ n_frames : int > 0
43
+ The number of analysis frames
44
+ hop_length : int > 0
45
+ The number of samples to advance between frames
46
+ win_length : [optional]
47
+ The length of the window function. By default, this matches `n_fft`.
48
+ n_fft : int > 0
49
+ The length of each analysis frame.
50
+ dtype : np.dtype
51
+ The data type of the output
52
+ Returns
53
+ -------
54
+ wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
55
+ The sum-squared envelope of the window function
56
+ """
57
+ if win_length is None:
58
+ win_length = n_fft
59
+
60
+ n = n_fft + hop_length * (n_frames - 1)
61
+ x = np.zeros(n, dtype=dtype)
62
+
63
+ # Compute the squared window at the desired length
64
+ win_sq = get_window(window, win_length, fftbins=True)
65
+ win_sq = normalize(win_sq, norm=norm) ** 2
66
+ win_sq = pad_center(win_sq, size=n_fft)
67
+
68
+ # Fill the envelope
69
+ for i in range(n_frames):
70
+ sample = i * hop_length
71
+ x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
72
+ return x
73
+
74
+
75
+ class STFT(torch.nn.Module):
76
+ def __init__(
77
+ self, filter_length=1024, hop_length=512, win_length=None, window="hann"
78
+ ):
79
+ """
80
+ This module implements an STFT using 1D convolution and 1D transpose convolutions.
81
+ This is a bit tricky so there are some cases that probably won't work as working
82
+ out the same sizes before and after in all overlap add setups is tough. Right now,
83
+ this code should work with hop lengths that are half the filter length (50% overlap
84
+ between frames).
85
+
86
+ Keyword Arguments:
87
+ filter_length {int} -- Length of filters used (default: {1024})
88
+ hop_length {int} -- Hop length of STFT (restrict to 50% overlap between frames) (default: {512})
89
+ win_length {[type]} -- Length of the window function applied to each frame (if not specified, it
90
+ equals the filter length). (default: {None})
91
+ window {str} -- Type of window to use (options are bartlett, hann, hamming, blackman, blackmanharris)
92
+ (default: {'hann'})
93
+ """
94
+ super(STFT, self).__init__()
95
+ self.filter_length = filter_length
96
+ self.hop_length = hop_length
97
+ self.win_length = win_length if win_length else filter_length
98
+ self.window = window
99
+ self.forward_transform = None
100
+ self.pad_amount = int(self.filter_length / 2)
101
+ scale = self.filter_length / self.hop_length
102
+ fourier_basis = np.fft.fft(np.eye(self.filter_length))
103
+
104
+ cutoff = int((self.filter_length / 2 + 1))
105
+ fourier_basis = np.vstack(
106
+ [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
107
+ )
108
+ forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
109
+ inverse_basis = torch.FloatTensor(
110
+ np.linalg.pinv(scale * fourier_basis).T[:, None, :]
111
+ )
112
+
113
+ assert filter_length >= self.win_length
114
+ # get window and zero center pad it to filter_length
115
+ fft_window = get_window(window, self.win_length, fftbins=True)
116
+ fft_window = pad_center(fft_window, size=filter_length)
117
+ fft_window = torch.from_numpy(fft_window).float()
118
+
119
+ # window the bases
120
+ forward_basis *= fft_window
121
+ inverse_basis *= fft_window
122
+
123
+ self.register_buffer("forward_basis", forward_basis.float())
124
+ self.register_buffer("inverse_basis", inverse_basis.float())
125
+
126
+ def transform(self, input_data):
127
+ """Take input data (audio) to STFT domain.
128
+
129
+ Arguments:
130
+ input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples)
131
+
132
+ Returns:
133
+ magnitude {tensor} -- Magnitude of STFT with shape (num_batch,
134
+ num_frequencies, num_frames)
135
+ phase {tensor} -- Phase of STFT with shape (num_batch,
136
+ num_frequencies, num_frames)
137
+ """
138
+ num_batches = input_data.shape[0]
139
+ num_samples = input_data.shape[-1]
140
+
141
+ self.num_samples = num_samples
142
+
143
+ # similar to librosa, reflect-pad the input
144
+ input_data = input_data.view(num_batches, 1, num_samples)
145
+ # print(1234,input_data.shape)
146
+ input_data = F.pad(
147
+ input_data.unsqueeze(1),
148
+ (self.pad_amount, self.pad_amount, 0, 0, 0, 0),
149
+ mode="reflect",
150
+ ).squeeze(1)
151
+ # print(2333,input_data.shape,self.forward_basis.shape,self.hop_length)
152
+ # pdb.set_trace()
153
+ forward_transform = F.conv1d(
154
+ input_data, self.forward_basis, stride=self.hop_length, padding=0
155
+ )
156
+
157
+ cutoff = int((self.filter_length / 2) + 1)
158
+ real_part = forward_transform[:, :cutoff, :]
159
+ imag_part = forward_transform[:, cutoff:, :]
160
+
161
+ magnitude = torch.sqrt(real_part**2 + imag_part**2)
162
+ # phase = torch.atan2(imag_part.data, real_part.data)
163
+
164
+ return magnitude # , phase
165
+
166
+ def inverse(self, magnitude, phase):
167
+ """Call the inverse STFT (iSTFT), given magnitude and phase tensors produced
168
+ by the ```transform``` function.
169
+
170
+ Arguments:
171
+ magnitude {tensor} -- Magnitude of STFT with shape (num_batch,
172
+ num_frequencies, num_frames)
173
+ phase {tensor} -- Phase of STFT with shape (num_batch,
174
+ num_frequencies, num_frames)
175
+
176
+ Returns:
177
+ inverse_transform {tensor} -- Reconstructed audio given magnitude and phase. Of
178
+ shape (num_batch, num_samples)
179
+ """
180
+ recombine_magnitude_phase = torch.cat(
181
+ [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
182
+ )
183
+
184
+ inverse_transform = F.conv_transpose1d(
185
+ recombine_magnitude_phase,
186
+ self.inverse_basis,
187
+ stride=self.hop_length,
188
+ padding=0,
189
+ )
190
+
191
+ if self.window is not None:
192
+ window_sum = window_sumsquare(
193
+ self.window,
194
+ magnitude.size(-1),
195
+ hop_length=self.hop_length,
196
+ win_length=self.win_length,
197
+ n_fft=self.filter_length,
198
+ dtype=np.float32,
199
+ )
200
+ # remove modulation effects
201
+ approx_nonzero_indices = torch.from_numpy(
202
+ np.where(window_sum > tiny(window_sum))[0]
203
+ )
204
+ window_sum = torch.from_numpy(window_sum).to(inverse_transform.device)
205
+ inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
206
+ approx_nonzero_indices
207
+ ]
208
+
209
+ # scale by hop ratio
210
+ inverse_transform *= float(self.filter_length) / self.hop_length
211
+
212
+ inverse_transform = inverse_transform[..., self.pad_amount :]
213
+ inverse_transform = inverse_transform[..., : self.num_samples]
214
+ inverse_transform = inverse_transform.squeeze(1)
215
+
216
+ return inverse_transform
217
+
218
+ def forward(self, input_data):
219
+ """Take input data (audio) to STFT domain and then back to audio.
220
+
221
+ Arguments:
222
+ input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples)
223
+
224
+ Returns:
225
+ reconstruction {tensor} -- Reconstructed audio given magnitude and phase. Of
226
+ shape (num_batch, num_samples)
227
+ """
228
+ self.magnitude, self.phase = self.transform(input_data)
229
+ reconstruction = self.inverse(self.magnitude, self.phase)
230
+ return reconstruction
231
+
232
+
233
+ from time import time as ttime
234
+
235
+
236
+ class BiGRU(nn.Module):
237
+ def __init__(self, input_features, hidden_features, num_layers):
238
+ super(BiGRU, self).__init__()
239
+ self.gru = nn.GRU(
240
+ input_features,
241
+ hidden_features,
242
+ num_layers=num_layers,
243
+ batch_first=True,
244
+ bidirectional=True,
245
+ )
246
+
247
+ def forward(self, x):
248
+ return self.gru(x)[0]
249
+
250
+
251
+ class ConvBlockRes(nn.Module):
252
+ def __init__(self, in_channels, out_channels, momentum=0.01):
253
+ super(ConvBlockRes, self).__init__()
254
+ self.conv = nn.Sequential(
255
+ nn.Conv2d(
256
+ in_channels=in_channels,
257
+ out_channels=out_channels,
258
+ kernel_size=(3, 3),
259
+ stride=(1, 1),
260
+ padding=(1, 1),
261
+ bias=False,
262
+ ),
263
+ nn.BatchNorm2d(out_channels, momentum=momentum),
264
+ nn.ReLU(),
265
+ nn.Conv2d(
266
+ in_channels=out_channels,
267
+ out_channels=out_channels,
268
+ kernel_size=(3, 3),
269
+ stride=(1, 1),
270
+ padding=(1, 1),
271
+ bias=False,
272
+ ),
273
+ nn.BatchNorm2d(out_channels, momentum=momentum),
274
+ nn.ReLU(),
275
+ )
276
+ if in_channels != out_channels:
277
+ self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
278
+ self.is_shortcut = True
279
+ else:
280
+ self.is_shortcut = False
281
+
282
+ def forward(self, x):
283
+ if self.is_shortcut:
284
+ return self.conv(x) + self.shortcut(x)
285
+ else:
286
+ return self.conv(x) + x
287
+
288
+
289
+ class Encoder(nn.Module):
290
+ def __init__(
291
+ self,
292
+ in_channels,
293
+ in_size,
294
+ n_encoders,
295
+ kernel_size,
296
+ n_blocks,
297
+ out_channels=16,
298
+ momentum=0.01,
299
+ ):
300
+ super(Encoder, self).__init__()
301
+ self.n_encoders = n_encoders
302
+ self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
303
+ self.layers = nn.ModuleList()
304
+ self.latent_channels = []
305
+ for i in range(self.n_encoders):
306
+ self.layers.append(
307
+ ResEncoderBlock(
308
+ in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
309
+ )
310
+ )
311
+ self.latent_channels.append([out_channels, in_size])
312
+ in_channels = out_channels
313
+ out_channels *= 2
314
+ in_size //= 2
315
+ self.out_size = in_size
316
+ self.out_channel = out_channels
317
+
318
+ def forward(self, x):
319
+ concat_tensors = []
320
+ x = self.bn(x)
321
+ for i in range(self.n_encoders):
322
+ _, x = self.layers[i](x)
323
+ concat_tensors.append(_)
324
+ return x, concat_tensors
325
+
326
+
327
+ class ResEncoderBlock(nn.Module):
328
+ def __init__(
329
+ self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
330
+ ):
331
+ super(ResEncoderBlock, self).__init__()
332
+ self.n_blocks = n_blocks
333
+ self.conv = nn.ModuleList()
334
+ self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
335
+ for i in range(n_blocks - 1):
336
+ self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
337
+ self.kernel_size = kernel_size
338
+ if self.kernel_size is not None:
339
+ self.pool = nn.AvgPool2d(kernel_size=kernel_size)
340
+
341
+ def forward(self, x):
342
+ for i in range(self.n_blocks):
343
+ x = self.conv[i](x)
344
+ if self.kernel_size is not None:
345
+ return x, self.pool(x)
346
+ else:
347
+ return x
348
+
349
+
350
+ class Intermediate(nn.Module): #
351
+ def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
352
+ super(Intermediate, self).__init__()
353
+ self.n_inters = n_inters
354
+ self.layers = nn.ModuleList()
355
+ self.layers.append(
356
+ ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
357
+ )
358
+ for i in range(self.n_inters - 1):
359
+ self.layers.append(
360
+ ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
361
+ )
362
+
363
+ def forward(self, x):
364
+ for i in range(self.n_inters):
365
+ x = self.layers[i](x)
366
+ return x
367
+
368
+
369
+ class ResDecoderBlock(nn.Module):
370
+ def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
371
+ super(ResDecoderBlock, self).__init__()
372
+ out_padding = (0, 1) if stride == (1, 2) else (1, 1)
373
+ self.n_blocks = n_blocks
374
+ self.conv1 = nn.Sequential(
375
+ nn.ConvTranspose2d(
376
+ in_channels=in_channels,
377
+ out_channels=out_channels,
378
+ kernel_size=(3, 3),
379
+ stride=stride,
380
+ padding=(1, 1),
381
+ output_padding=out_padding,
382
+ bias=False,
383
+ ),
384
+ nn.BatchNorm2d(out_channels, momentum=momentum),
385
+ nn.ReLU(),
386
+ )
387
+ self.conv2 = nn.ModuleList()
388
+ self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
389
+ for i in range(n_blocks - 1):
390
+ self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
391
+
392
+ def forward(self, x, concat_tensor):
393
+ x = self.conv1(x)
394
+ x = torch.cat((x, concat_tensor), dim=1)
395
+ for i in range(self.n_blocks):
396
+ x = self.conv2[i](x)
397
+ return x
398
+
399
+
400
+ class Decoder(nn.Module):
401
+ def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
402
+ super(Decoder, self).__init__()
403
+ self.layers = nn.ModuleList()
404
+ self.n_decoders = n_decoders
405
+ for i in range(self.n_decoders):
406
+ out_channels = in_channels // 2
407
+ self.layers.append(
408
+ ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
409
+ )
410
+ in_channels = out_channels
411
+
412
+ def forward(self, x, concat_tensors):
413
+ for i in range(self.n_decoders):
414
+ x = self.layers[i](x, concat_tensors[-1 - i])
415
+ return x
416
+
417
+
418
+ class DeepUnet(nn.Module):
419
+ def __init__(
420
+ self,
421
+ kernel_size,
422
+ n_blocks,
423
+ en_de_layers=5,
424
+ inter_layers=4,
425
+ in_channels=1,
426
+ en_out_channels=16,
427
+ ):
428
+ super(DeepUnet, self).__init__()
429
+ self.encoder = Encoder(
430
+ in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
431
+ )
432
+ self.intermediate = Intermediate(
433
+ self.encoder.out_channel // 2,
434
+ self.encoder.out_channel,
435
+ inter_layers,
436
+ n_blocks,
437
+ )
438
+ self.decoder = Decoder(
439
+ self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
440
+ )
441
+
442
+ def forward(self, x):
443
+ x, concat_tensors = self.encoder(x)
444
+ x = self.intermediate(x)
445
+ x = self.decoder(x, concat_tensors)
446
+ return x
447
+
448
+
449
+ class E2E(nn.Module):
450
+ def __init__(
451
+ self,
452
+ n_blocks,
453
+ n_gru,
454
+ kernel_size,
455
+ en_de_layers=5,
456
+ inter_layers=4,
457
+ in_channels=1,
458
+ en_out_channels=16,
459
+ ):
460
+ super(E2E, self).__init__()
461
+ self.unet = DeepUnet(
462
+ kernel_size,
463
+ n_blocks,
464
+ en_de_layers,
465
+ inter_layers,
466
+ in_channels,
467
+ en_out_channels,
468
+ )
469
+ self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
470
+ if n_gru:
471
+ self.fc = nn.Sequential(
472
+ BiGRU(3 * 128, 256, n_gru),
473
+ nn.Linear(512, 360),
474
+ nn.Dropout(0.25),
475
+ nn.Sigmoid(),
476
+ )
477
+ else:
478
+ self.fc = nn.Sequential(
479
+ nn.Linear(3 * 128, 360), nn.Dropout(0.25), nn.Sigmoid()  # torch.nn has no N_MELS/N_CLASS; 128 mel bins, 360 pitch classes
480
+ )
481
+
482
+ def forward(self, mel):
483
+ # print(mel.shape)
484
+ mel = mel.transpose(-1, -2).unsqueeze(1)
485
+ x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
486
+ x = self.fc(x)
487
+ # print(x.shape)
488
+ return x
489
+
490
+
491
+ from librosa.filters import mel
492
+
493
+
494
+ class MelSpectrogram(torch.nn.Module):
495
+ def __init__(
496
+ self,
497
+ is_half,
498
+ n_mel_channels,
499
+ sampling_rate,
500
+ win_length,
501
+ hop_length,
502
+ n_fft=None,
503
+ mel_fmin=0,
504
+ mel_fmax=None,
505
+ clamp=1e-5,
506
+ ):
507
+ super().__init__()
508
+ n_fft = win_length if n_fft is None else n_fft
509
+ self.hann_window = {}
510
+ mel_basis = mel(
511
+ sr=sampling_rate,
512
+ n_fft=n_fft,
513
+ n_mels=n_mel_channels,
514
+ fmin=mel_fmin,
515
+ fmax=mel_fmax,
516
+ htk=True,
517
+ )
518
+ mel_basis = torch.from_numpy(mel_basis).float()
519
+ self.register_buffer("mel_basis", mel_basis)
520
+ self.n_fft = n_fft  # already resolved above
521
+ self.hop_length = hop_length
522
+ self.win_length = win_length
523
+ self.sampling_rate = sampling_rate
524
+ self.n_mel_channels = n_mel_channels
525
+ self.clamp = clamp
526
+ self.is_half = is_half
527
+
528
+ def forward(self, audio, keyshift=0, speed=1, center=True):
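+ # A key shift of k semitones scales the FFT and window sizes by 2**(k/12) so the mel analysis tracks the shifted pitch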
529
+ factor = 2 ** (keyshift / 12)
530
+ n_fft_new = int(np.round(self.n_fft * factor))
531
+ win_length_new = int(np.round(self.win_length * factor))
532
+ hop_length_new = int(np.round(self.hop_length * speed))
533
+ keyshift_key = str(keyshift) + "_" + str(audio.device)
534
+ if keyshift_key not in self.hann_window:
535
+ self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
536
+ # "cpu"if(audio.device.type=="privateuseone") else audio.device
537
+ audio.device
538
+ )
539
+ # fft = torch.stft(#doesn't support pytorch_dml
540
+ # # audio.cpu() if(audio.device.type=="privateuseone")else audio,
541
+ # audio,
542
+ # n_fft=n_fft_new,
543
+ # hop_length=hop_length_new,
544
+ # win_length=win_length_new,
545
+ # window=self.hann_window[keyshift_key],
546
+ # center=center,
547
+ # return_complex=True,
548
+ # )
549
+ # magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
550
+ # print(1111111111)
551
+ # print(222222222222222,audio.device,self.is_half)
552
+ if not hasattr(self, "stft"):
553
+ # print(n_fft_new,hop_length_new,win_length_new,audio.shape)
554
+ self.stft = STFT(
555
+ filter_length=n_fft_new,
556
+ hop_length=hop_length_new,
557
+ win_length=win_length_new,
558
+ window="hann",
559
+ ).to(audio.device)
560
+ magnitude = self.stft.transform(audio) # phase
561
+ # if (audio.device.type == "privateuseone"):
562
+ # magnitude=magnitude.to(audio.device)
563
+ if keyshift != 0:
564
+ size = self.n_fft // 2 + 1
565
+ resize = magnitude.size(1)
566
+ if resize < size:
567
+ magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
568
+ magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
569
+ mel_output = torch.matmul(self.mel_basis, magnitude)
570
+ if self.is_half:
571
+ mel_output = mel_output.half()
572
+ log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
573
+ # print(log_mel_spec.device.type)
574
+ return log_mel_spec
575
+
576
+
577
+ class RMVPE:
578
+ def __init__(self, model_path, is_half, device=None):
579
+ self.resample_kernel = {}
580
+ self.resample_kernel = {}
581
+ self.is_half = is_half
582
+ if device is None:
583
+ device = "cuda" if torch.cuda.is_available() else "cpu"
584
+ self.device = device
585
+ self.mel_extractor = MelSpectrogram(
586
+ is_half, 128, 16000, 1024, 160, None, 30, 8000
587
+ ).to(device)
588
+ if "privateuseone" in str(device):
589
+ import onnxruntime as ort
590
+
591
+ ort_session = ort.InferenceSession(
592
+ "%s/rmvpe.onnx" % os.environ["rmvpe_root"],
593
+ providers=["DmlExecutionProvider"],
594
+ )
595
+ self.model = ort_session
596
+ else:
597
+ model = E2E(4, 1, (2, 2))
598
+ ckpt = torch.load(model_path, map_location="cpu")
599
+ model.load_state_dict(ckpt)
600
+ model.eval()
601
+ if is_half:
602
+ model = model.half()
603
+ self.model = model
604
+ self.model = self.model.to(device)
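+ # 360 pitch bins spaced 20 cents apart; decode() maps cents back to Hz via f0 = 10 * 2**(cents / 1200)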
605
+ cents_mapping = 20 * np.arange(360) + 1997.3794084376191
606
+ self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368
607
+
608
+ def mel2hidden(self, mel):
609
+ with torch.no_grad():
610
+ n_frames = mel.shape[-1]
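+ # Pad the frame axis up to a multiple of 32 so the U-Net's five 2x downsamplings divide evenly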
611
+ mel = F.pad(
612
+ mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="constant"
613
+ )
614
+ if "privateuseone" in str(self.device):
615
+ onnx_input_name = self.model.get_inputs()[0].name
616
+ onnx_outputs_names = self.model.get_outputs()[0].name
617
+ hidden = self.model.run(
618
+ [onnx_outputs_names],
619
+ input_feed={onnx_input_name: mel.cpu().numpy()},
620
+ )[0]
621
+ else:
622
+ hidden = self.model(mel)
623
+ return hidden[:, :n_frames]
624
+
625
+ def decode(self, hidden, thred=0.03):
626
+ cents_pred = self.to_local_average_cents(hidden, thred=thred)
627
+ f0 = 10 * (2 ** (cents_pred / 1200))
628
+ f0[f0 == 10] = 0
629
+ # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred])
630
+ return f0
631
+
632
+ def infer_from_audio(self, audio, thred=0.03):
633
+ # torch.cuda.synchronize()
634
+ t0 = ttime()
635
+ mel = self.mel_extractor(
636
+ torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True
637
+ )
638
+ # print(123123123,mel.device.type)
639
+ # torch.cuda.synchronize()
640
+ t1 = ttime()
641
+ hidden = self.mel2hidden(mel)
642
+ # torch.cuda.synchronize()
643
+ t2 = ttime()
644
+ # print(234234,hidden.device.type)
645
+ if "privateuseone" not in str(self.device):
646
+ hidden = hidden.squeeze(0).cpu().numpy()
647
+ else:
648
+ hidden = hidden[0]
649
+ if self.is_half:
650
+ hidden = hidden.astype("float32")
651
+
652
+ f0 = self.decode(hidden, thred=thred)
653
+ # torch.cuda.synchronize()
654
+ t3 = ttime()
655
+ # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0))
656
+ return f0
657
+
658
+ def infer_from_audio_with_pitch(self, audio, thred=0.03, f0_min=50, f0_max=1100):
659
+ audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
660
+ mel = self.mel_extractor(audio, center=True)
661
+ hidden = self.mel2hidden(mel)
662
+ hidden = hidden.squeeze(0).cpu().numpy()
663
+ if self.is_half:
664
+ hidden = hidden.astype("float32")
665
+ f0 = self.decode(hidden, thred=thred)
666
+ f0[(f0 < f0_min) | (f0 > f0_max)] = 0
667
+ return f0
668
+
669
+ def to_local_average_cents(self, salience, thred=0.05):
670
+ # t0 = ttime()
671
+ center = np.argmax(salience, axis=1)  # (n_frames,) index of the peak bin per frame
672
+ salience = np.pad(salience, ((0, 0), (4, 4)))  # (n_frames, 368)
673
+ # t1 = ttime()
674
+ center += 4
675
+ todo_salience = []
676
+ todo_cents_mapping = []
677
+ starts = center - 4
678
+ ends = center + 5
679
+ for idx in range(salience.shape[0]):
680
+ todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
681
+ todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
682
+ # t2 = ttime()
683
+ todo_salience = np.array(todo_salience)  # (n_frames, 9)
684
+ todo_cents_mapping = np.array(todo_cents_mapping)  # (n_frames, 9)
685
+ product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
686
+ weight_sum = np.sum(todo_salience, 1)  # (n_frames,)
687
+ divided = product_sum / weight_sum  # (n_frames,) salience-weighted average in cents
688
+ # t3 = ttime()
689
+ maxx = np.max(salience, axis=1)  # (n_frames,)
690
+ divided[maxx <= thred] = 0  # zero out frames whose peak salience is below the threshold
691
+ # t4 = ttime()
692
+ # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
693
+ return divided
694
+
695
+
696
+ if __name__ == "__main__":
697
+ import librosa
698
+ import soundfile as sf
699
+
700
+ audio, sampling_rate = sf.read(r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav")
701
+ if len(audio.shape) > 1:
702
+ audio = librosa.to_mono(audio.transpose(1, 0))
703
+ audio_bak = audio.copy()
704
+ if sampling_rate != 16000:
705
+ audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
706
+ model_path = r"D:\BaiduNetdiskDownload\RVC-beta-v2-0727AMD_realtime\rmvpe.pt"
707
+ thred = 0.03 # 0.01
708
+ device = "cuda" if torch.cuda.is_available() else "cpu"
709
+ rmvpe = RMVPE(model_path, is_half=False, device=device)
710
+ t0 = ttime()
711
+ f0 = rmvpe.infer_from_audio(audio, thred=thred)
712
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
713
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
714
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
715
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
716
+ t1 = ttime()
717
+ logger.info("%s %.2f", f0.shape, t1 - t0)
infer/modules/vc/__init__.py ADDED
File without changes
infer/modules/vc/modules.py ADDED
@@ -0,0 +1,526 @@
1
+ import os, sys
2
+ import traceback
3
+ import logging
4
+ now_dir = os.getcwd()
5
+ sys.path.append(now_dir)
6
+ logger = logging.getLogger(__name__)
7
+ import lib.globals.globals as rvc_globals
8
+ import numpy as np
9
+ import soundfile as sf
10
+ import torch
11
+ from io import BytesIO
12
+ from infer.lib.audio import load_audio
13
+ from infer.lib.audio import wav2
14
+ from infer.lib.infer_pack.models import (
15
+ SynthesizerTrnMs256NSFsid,
16
+ SynthesizerTrnMs256NSFsid_nono,
17
+ SynthesizerTrnMs768NSFsid,
18
+ SynthesizerTrnMs768NSFsid_nono,
19
+ )
20
+ from infer.modules.vc.pipeline import Pipeline
21
+ from infer.modules.vc.utils import *
22
+ import time
23
+ import scipy.io.wavfile as wavfile
24
+
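+ # Convert a note name such as "A4" (440.0 Hz) or "C#3" (~138.59 Hz) to its frequency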
25
+ def note_to_hz(note_name):
26
+ SEMITONES = {'C': -9, 'C#': -8, 'D': -7, 'D#': -6, 'E': -5, 'F': -4, 'F#': -3, 'G': -2, 'G#': -1, 'A': 0, 'A#': 1, 'B': 2}
27
+ pitch_class, octave = note_name[:-1], int(note_name[-1])
28
+ semitone = SEMITONES[pitch_class]
29
+ note_number = 12 * (octave - 4) + semitone
30
+ frequency = 440.0 * (2.0 ** (1.0/12)) ** note_number
31
+ return frequency
32
+
33
+ class VC:
34
+ def __init__(self, config):
35
+ self.n_spk = None
36
+ self.tgt_sr = None
37
+ self.net_g = None
38
+ self.pipeline = None
39
+ self.cpt = None
40
+ self.version = None
41
+ self.if_f0 = None
42
+ self.version = None
43
+ self.hubert_model = None
44
+
45
+ self.config = config
46
+
47
+ def get_vc(self, sid, *to_return_protect):
48
+ logger.info("Get sid: " + sid)
49
+
50
+ to_return_protect0 = {
51
+ "visible": self.if_f0 != 0,
52
+ "value": to_return_protect[0]
53
+ if self.if_f0 != 0 and to_return_protect
54
+ else 0.5,
55
+ "__type__": "update",
56
+ }
57
+ to_return_protect1 = {
58
+ "visible": self.if_f0 != 0,
59
+ "value": to_return_protect[1]
60
+ if self.if_f0 != 0 and to_return_protect
61
+ else 0.33,
62
+ "__type__": "update",
63
+ }
64
+
65
+ if not sid:
66
+ if self.hubert_model is not None:  # polling may switch sid from a loaded model to none; clean up the cache in that case
67
+ logger.info("Clean model cache")
68
+ del (
69
+ self.net_g,
70
+ self.n_spk,
71
+ self.vc,
72
+ self.hubert_model,
73
+ self.tgt_sr,
74
+ ) # ,cpt
75
+ self.hubert_model = (
76
+ self.net_g
77
+ ) = self.n_spk = self.vc = self.hubert_model = self.tgt_sr = None
78
+ if torch.cuda.is_available():
79
+ torch.cuda.empty_cache()
80
+ ### the convoluted teardown below is needed to fully release the cached model memory
81
+ self.if_f0 = self.cpt.get("f0", 1)
82
+ self.version = self.cpt.get("version", "v1")
83
+ if self.version == "v1":
84
+ if self.if_f0 == 1:
85
+ self.net_g = SynthesizerTrnMs256NSFsid(
86
+ *self.cpt["config"], is_half=self.config.is_half
87
+ )
88
+ else:
89
+ self.net_g = SynthesizerTrnMs256NSFsid_nono(*self.cpt["config"])
90
+ elif self.version == "v2":
91
+ if self.if_f0 == 1:
92
+ self.net_g = SynthesizerTrnMs768NSFsid(
93
+ *self.cpt["config"], is_half=self.config.is_half
94
+ )
95
+ else:
96
+ self.net_g = SynthesizerTrnMs768NSFsid_nono(*self.cpt["config"])
97
+ del self.net_g, self.cpt
98
+ if torch.cuda.is_available():
99
+ torch.cuda.empty_cache()
100
+ return (
101
+ {"visible": False, "__type__": "update"},
102
+ {
103
+ "visible": True,
104
+ "value": to_return_protect0,
105
+ "__type__": "update",
106
+ },
107
+ {
108
+ "visible": True,
109
+ "value": to_return_protect1,
110
+ "__type__": "update",
111
+ },
112
+ "",
113
+ "",
114
+ )
115
+ #person = f'{os.getenv("weight_root")}/{sid}'
116
+ person = f'{sid}'
117
+ #logger.info(f"Loading: {person}")
118
+ logger.info("Loading...")
119
+ self.cpt = torch.load(person, map_location="cpu")
120
+ self.tgt_sr = self.cpt["config"][-1]
121
+ self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0] # n_spk
122
+ self.if_f0 = self.cpt.get("f0", 1)
123
+ self.version = self.cpt.get("version", "v1")
124
+
125
+ synthesizer_class = {
126
+ ("v1", 1): SynthesizerTrnMs256NSFsid,
127
+ ("v1", 0): SynthesizerTrnMs256NSFsid_nono,
128
+ ("v2", 1): SynthesizerTrnMs768NSFsid,
129
+ ("v2", 0): SynthesizerTrnMs768NSFsid_nono,
130
+ }
131
+
132
+ self.net_g = synthesizer_class.get(
133
+ (self.version, self.if_f0), SynthesizerTrnMs256NSFsid
134
+ )(*self.cpt["config"], is_half=self.config.is_half)
135
+
136
+ del self.net_g.enc_q
137
+
138
+ self.net_g.load_state_dict(self.cpt["weight"], strict=False)
139
+ self.net_g.eval().to(self.config.device)
140
+ if self.config.is_half:
141
+ self.net_g = self.net_g.half()
142
+ else:
143
+ self.net_g = self.net_g.float()
144
+
145
+ self.pipeline = Pipeline(self.tgt_sr, self.config)
146
+ n_spk = self.cpt["config"][-3]
147
+ index = {"value": get_index_path_from_model(sid), "__type__": "update"}
148
+ logger.info("Select index: " + index["value"])
149
+
150
+ return (
151
+ (
152
+ {"visible": False, "maximum": n_spk, "__type__": "update"},
153
+ to_return_protect0,
154
+ to_return_protect1
155
+ )
156
+ if to_return_protect
157
+ else {"visible": False, "maximum": n_spk, "__type__": "update"}
158
+ )
159
+
160
+
161
+ def vc_single(
162
+ self,
163
+ sid,
164
+ input_audio_path0,
165
+ input_audio_path1,
166
+ f0_up_key,
167
+ f0_file,
168
+ f0_method,
169
+ file_index,
170
+ file_index2,
171
+ index_rate,
172
+ filter_radius,
173
+ resample_sr,
174
+ rms_mix_rate,
175
+ protect,
176
+ crepe_hop_length,
177
+ f0_min,
178
+ note_min,
179
+ f0_max,
180
+ note_max,
181
+ f0_autotune,
182
+ ):
183
+ global total_time
184
+ total_time = 0
185
+ start_time = time.time()
186
+ if not input_audio_path0 and not input_audio_path1:
187
+ return "You need to upload an audio file", None
188
+
189
+ if (not os.path.exists(input_audio_path0)) and (not os.path.exists(os.path.join(now_dir, input_audio_path0))):
190
+ return "Audio was not properly selected or doesn't exist", None
191
+
192
+ input_audio_path1 = input_audio_path1 or input_audio_path0
193
+ print(f"\nStarting inference for '{os.path.basename(input_audio_path1)}'")
194
+ print("-------------------")
195
+ f0_up_key = int(f0_up_key)
196
+ if rvc_globals.NotesOrHertz and f0_method != 'rmvpe':
197
+ f0_min = note_to_hz(note_min) if note_min else 50
198
+ f0_max = note_to_hz(note_max) if note_max else 1100
199
+ print(f"Converted Min pitch: freq - {f0_min}\n"
200
+ f"Converted Max pitch: freq - {f0_max}")
201
+ else:
202
+ f0_min = f0_min or 50
203
+ f0_max = f0_max or 1100
204
+ try:
205
+ input_audio_path1 = input_audio_path1 or input_audio_path0
206
+ print(f"Attempting to load {input_audio_path1}....")
207
+ audio = load_audio(file=input_audio_path1,
208
+ sr=16000,
209
+ DoFormant=rvc_globals.DoFormant,
210
+ Quefrency=rvc_globals.Quefrency,
211
+ Timbre=rvc_globals.Timbre)
212
+
213
+ audio_max = np.abs(audio).max() / 0.95
214
+ if audio_max > 1:
215
+ audio /= audio_max
216
+ times = [0, 0, 0]
217
+
218
+ if self.hubert_model is None:
219
+ self.hubert_model = load_hubert(self.config)
220
+
221
+ try:
222
+ self.if_f0 = self.cpt.get("f0", 1)
223
+ except NameError:
224
+ message = "Model was not properly selected"
225
+ print(message)
226
+ return message, None
227
+
228
+ file_index = (
229
+ (
230
+ file_index.strip(" ")
231
+ .strip('"')
232
+ .strip("\n")
233
+ .strip('"')
234
+ .strip(" ")
235
+ .replace("trained", "added")
236
+ )
237
+ if file_index != ""
238
+ else file_index2
239
+ )  # guard against user error: automatically swap a "trained" index for the matching "added" index
240
+
241
+ try:
242
+ audio_opt = self.pipeline.pipeline(
243
+ self.hubert_model,
244
+ self.net_g,
245
+ sid,
246
+ audio,
247
+ input_audio_path1,
248
+ times,
249
+ f0_up_key,
250
+ f0_method,
251
+ file_index,
252
+ index_rate,
253
+ self.if_f0,
254
+ filter_radius,
255
+ self.tgt_sr,
256
+ resample_sr,
257
+ rms_mix_rate,
258
+ self.version,
259
+ protect,
260
+ crepe_hop_length,
261
+ f0_autotune,
262
+ f0_file=f0_file,
263
+ f0_min=f0_min,
264
+ f0_max=f0_max
265
+ )
266
+ except AssertionError:
267
+ message = "Mismatching index version detected (v1 with v2, or v2 with v1)."
268
+ print(message)
269
+ return message, None
270
+ except NameError:
271
+ message = "RVC libraries are still loading. Please try again in a few seconds."
272
+ print(message)
273
+ return message, None
274
+
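+ # Chained comparison: true when resample_sr >= 16000 and it differs from the model's target rate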
275
+ if self.tgt_sr != resample_sr >= 16000:
276
+ self.tgt_sr = resample_sr
277
+ index_info = (
278
+ "Index:\n%s." % file_index
279
+ if os.path.exists(file_index)
280
+ else "Index not used."
281
+ )
282
+ end_time = time.time()
283
+ total_time = end_time - start_time
284
+
285
+ output_folder = "audio-outputs"
286
+ os.makedirs(output_folder, exist_ok=True)
287
+ output_filename = "generated_audio_{}.wav"
288
+ output_count = 1
289
+ while True:
290
+ current_output_path = os.path.join(output_folder, output_filename.format(output_count))
291
+ if not os.path.exists(current_output_path):
292
+ break
293
+ output_count += 1
294
+
295
+ wavfile.write(current_output_path, self.tgt_sr, audio_opt)
296
+ print(f"Generated audio saved to: {current_output_path}")
297
+ return f"Success.\n {index_info}\nTime:\n npy:{times[0]}, f0:{times[1]}, infer:{times[2]}\nTotal Time: {total_time} seconds", (self.tgt_sr, audio_opt)
298
+ except Exception:
299
+ info = traceback.format_exc()
300
+ logger.warning(info)
301
+ return info, (None, None)
302
+
303
+ def vc_single_dont_save(
304
+ self,
305
+ sid,
306
+ input_audio_path0,
307
+ input_audio_path1,
308
+ f0_up_key,
309
+ f0_file,
310
+ f0_method,
311
+ file_index,
312
+ file_index2,
313
+ index_rate,
314
+ filter_radius,
315
+ resample_sr,
316
+ rms_mix_rate,
317
+ protect,
318
+ crepe_hop_length,
319
+ f0_min,
320
+ note_min,
321
+ f0_max,
322
+ note_max,
323
+ f0_autotune,
324
+ ):
325
+ global total_time
326
+ total_time = 0
327
+ start_time = time.time()
328
+ if not input_audio_path0 and not input_audio_path1:
329
+ return "You need to upload an audio file", None
330
+
331
+ if (not os.path.exists(input_audio_path0)) and (not os.path.exists(os.path.join(now_dir, input_audio_path0))):
332
+ return "Audio was not properly selected or doesn't exist", None
333
+
334
+ input_audio_path1 = input_audio_path1 or input_audio_path0
335
+ print(f"\nStarting inference for '{os.path.basename(input_audio_path1)}'")
336
+ print("-------------------")
337
+ f0_up_key = int(f0_up_key)
338
+ if rvc_globals.NotesOrHertz and f0_method != 'rmvpe':
339
+ f0_min = note_to_hz(note_min) if note_min else 50
340
+ f0_max = note_to_hz(note_max) if note_max else 1100
341
+ print(f"Converted Min pitch: freq - {f0_min}\n"
342
+ f"Converted Max pitch: freq - {f0_max}")
343
+ else:
344
+ f0_min = f0_min or 50
345
+ f0_max = f0_max or 1100
346
+ try:
347
+ input_audio_path1 = input_audio_path1 or input_audio_path0
348
+ print(f"Attempting to load {input_audio_path1}....")
349
+ audio = load_audio(file=input_audio_path1,
350
+ sr=16000,
351
+ DoFormant=rvc_globals.DoFormant,
352
+ Quefrency=rvc_globals.Quefrency,
353
+ Timbre=rvc_globals.Timbre)
354
+
355
+ audio_max = np.abs(audio).max() / 0.95
356
+ if audio_max > 1:
357
+ audio /= audio_max
358
+ times = [0, 0, 0]
359
+
360
+ if self.hubert_model is None:
361
+ self.hubert_model = load_hubert(self.config)
362
+
363
+ try:
364
+ self.if_f0 = self.cpt.get("f0", 1)
365
+ except NameError:
366
+ message = "Model was not properly selected"
367
+ print(message)
368
+ return message, None
369
+
370
+ file_index = (
371
+ (
372
+ file_index.strip(" ")
373
+ .strip('"')
374
+ .strip("\n")
375
+ .strip('"')
376
+ .strip(" ")
377
+ .replace("trained", "added")
378
+ )
379
+ if file_index != ""
380
+ else file_index2
381
+ )  # guard against user error: automatically swap a "trained" index for the matching "added" index
382
+
383
+ try:
384
+ audio_opt = self.pipeline.pipeline(
385
+ self.hubert_model,
386
+ self.net_g,
387
+ sid,
388
+ audio,
389
+ input_audio_path1,
390
+ times,
391
+ f0_up_key,
392
+ f0_method,
393
+ file_index,
394
+ index_rate,
395
+ self.if_f0,
396
+ filter_radius,
397
+ self.tgt_sr,
398
+ resample_sr,
399
+ rms_mix_rate,
400
+ self.version,
401
+ protect,
402
+ crepe_hop_length,
403
+ f0_autotune,
404
+ f0_file=f0_file,
405
+ f0_min=f0_min,
406
+ f0_max=f0_max
407
+ )
408
+ except AssertionError:
409
+ message = "Mismatching index version detected (v1 with v2, or v2 with v1)."
410
+ print(message)
411
+ return message, None
412
+ except NameError:
413
+ message = "RVC libraries are still loading. Please try again in a few seconds."
414
+ print(message)
415
+ return message, None
416
+
417
+ if self.tgt_sr != resample_sr >= 16000:
418
+ self.tgt_sr = resample_sr
419
+ index_info = (
420
+ "Index:\n%s." % file_index
421
+ if os.path.exists(file_index)
422
+ else "Index not used."
423
+ )
424
+ end_time = time.time()
425
+ total_time = end_time - start_time
426
+
427
+ return f"Success.\n {index_info}\nTime:\n npy:{times[0]}, f0:{times[1]}, infer:{times[2]}\nTotal Time: {total_time} seconds", (self.tgt_sr, audio_opt)
428
+ except Exception:
429
+ info = traceback.format_exc()
430
+ logger.warning(info)
431
+ return info, (None, None)
432
+
433
+
434
+ def vc_multi(
435
+ self,
436
+ sid,
437
+ dir_path,
438
+ opt_root,
439
+ paths,
440
+ f0_up_key,
441
+ f0_method,
442
+ file_index,
443
+ file_index2,
444
+ index_rate,
445
+ filter_radius,
446
+ resample_sr,
447
+ rms_mix_rate,
448
+ protect,
449
+ format1,
450
+ crepe_hop_length,
451
+ f0_min,
452
+ note_min,
453
+ f0_max,
454
+ note_max,
455
+ f0_autotune,
456
+ ):
457
+ if rvc_globals.NotesOrHertz and f0_method != 'rmvpe':
458
+ f0_min = note_to_hz(note_min) if note_min else 50
459
+ f0_max = note_to_hz(note_max) if note_max else 1100
460
+ print(f"Converted Min pitch: freq - {f0_min}\n"
461
+ f"Converted Max pitch: freq - {f0_max}")
462
+ else:
463
+ f0_min = f0_min or 50
464
+ f0_max = f0_max or 1100
465
+ try:
466
+ dir_path = (
467
+ dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
468
+ )  # strip stray spaces, quotes, and newlines that users often paste around paths
469
+ opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
470
+ os.makedirs(opt_root, exist_ok=True)
471
+ try:
472
+ if dir_path != "":
473
+ paths = [
474
+ os.path.join(dir_path, name) for name in os.listdir(dir_path)
475
+ ]
476
+ else:
477
+ paths = [path.name for path in paths]
478
+ except Exception:
479
+ traceback.print_exc()
480
+ paths = [path.name for path in paths]
481
+ infos = []
482
+ for path in paths:
483
+ info, opt = self.vc_single(
484
+ sid,
+ path,
+ path,
+ f0_up_key,
+ None,
+ f0_method,
+ file_index,
+ file_index2,
+ # file_big_npy,
+ index_rate,
+ filter_radius,
+ resample_sr,
+ rms_mix_rate,
+ protect,
+ crepe_hop_length,
+ f0_min,
+ note_min,
+ f0_max,
+ note_max,
+ f0_autotune,
+ )
498
+ if "Success" in info:
499
+ try:
500
+ tgt_sr, audio_opt = opt
501
+ if format1 in ["wav", "flac"]:
502
+ sf.write(
503
+ "%s/%s.%s"
504
+ % (opt_root, os.path.basename(path), format1),
505
+ audio_opt,
506
+ tgt_sr,
507
+ )
508
+ else:
509
+ path = "%s/%s.%s" % (opt_root, os.path.basename(path), format1)
510
+ with BytesIO() as wavf:
511
+ sf.write(
512
+ wavf,
513
+ audio_opt,
514
+ tgt_sr,
515
+ format="wav"
516
+ )
517
+ wavf.seek(0, 0)
518
+ with open(path, "wb") as outf:
519
+ wav2(wavf, outf, format1)
520
+ except Exception:
521
+ info += traceback.format_exc()
522
+ infos.append("%s->%s" % (os.path.basename(path), info))
523
+ yield "\n".join(infos)
524
+ yield "\n".join(infos)
525
+ except Exception:
526
+ yield traceback.format_exc()
infer/modules/vc/pipeline.py ADDED
@@ -0,0 +1,655 @@
1
+ import os
2
+ import sys
3
+ import traceback
4
+ import logging
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ from functools import lru_cache
9
+ from time import time as ttime
10
+ from torch import Tensor
11
+ import faiss
12
+ import librosa
13
+ import numpy as np
14
+ import parselmouth
15
+ import pyworld
16
+ import torch
17
+ import torch.nn.functional as F
18
+ import torchcrepe
19
+ from scipy import signal
20
+ from tqdm import tqdm
21
+
22
+ import random
23
+ now_dir = os.getcwd()
24
+ sys.path.append(now_dir)
25
+ import re
26
+ from functools import partial
27
+ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
28
+
29
+ input_audio_path2wav = {}
30
+ from LazyImport import lazyload
31
+ torchcrepe = lazyload("torchcrepe") # Fork Feature. Crepe algo for training and preprocess
32
+ torch = lazyload("torch")
33
+ from infer.lib.rmvpe import RMVPE
34
+
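+ # lru_cache keys on the hashable arguments; the waveform itself is passed out-of-band via input_audio_path2wav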
35
+ @lru_cache
36
+ def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
37
+ audio = input_audio_path2wav[input_audio_path]
38
+ f0, t = pyworld.harvest(
39
+ audio,
40
+ fs=fs,
41
+ f0_ceil=f0max,
42
+ f0_floor=f0min,
43
+ frame_period=frame_period,
44
+ )
45
+ f0 = pyworld.stonemask(audio, f0, t, fs)
46
+ return f0
47
+
48
+
49
+ def change_rms(data1, sr1, data2, sr2, rate):  # data1: input audio, data2: output audio; rate = weight of data2's envelope
50
+ # print(data1.max(),data2.max())
51
+ rms1 = librosa.feature.rms(
52
+ y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
53
+ )  # one RMS point every half second
54
+ rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
55
+ rms1 = torch.from_numpy(rms1)
56
+ rms1 = F.interpolate(
57
+ rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
58
+ ).squeeze()
59
+ rms2 = torch.from_numpy(rms2)
60
+ rms2 = F.interpolate(
61
+ rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
62
+ ).squeeze()
63
+ rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
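+ # Scale output by rms1**(1 - rate) * rms2**(rate - 1): rate=1 keeps the output's own envelope, rate=0 imposes the input's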
64
+ data2 *= (
65
+ torch.pow(rms1, torch.tensor(1 - rate))
66
+ * torch.pow(rms2, torch.tensor(rate - 1))
67
+ ).numpy()
68
+ return data2
69
+
70
+
71
+ class Pipeline(object):
72
+ def __init__(self, tgt_sr, config):
73
+ self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
74
+ config.x_pad,
75
+ config.x_query,
76
+ config.x_center,
77
+ config.x_max,
78
+ config.is_half,
79
+ )
80
+ self.sr = 16000  # HuBERT input sampling rate
81
+ self.window = 160  # samples per frame (10 ms at 16 kHz)
82
+ self.t_pad = self.sr * self.x_pad  # padding added before and after each segment
83
+ self.t_pad_tgt = tgt_sr * self.x_pad
84
+ self.t_pad2 = self.t_pad * 2
85
+ self.t_query = self.sr * self.x_query  # search span around each candidate cut point
86
+ self.t_center = self.sr * self.x_center  # spacing between candidate cut points
87
+ self.t_max = self.sr * self.x_max  # duration threshold below which no cutting is needed
88
+ self.device = config.device
89
+ self.model_rmvpe = RMVPE("%s/rmvpe.pt" % os.environ["rmvpe_root"], is_half=self.is_half, device=self.device)
90
+ self.f0_method_dict = {
91
+ "pm": self.get_pm,
92
+ "harvest": self.get_harvest,
93
+ "dio": self.get_dio,
94
+ "rmvpe": self.get_rmvpe,
95
+ "rmvpe+": self.get_pitch_dependant_rmvpe,
96
+ "crepe": self.get_f0_official_crepe_computation,
97
+ "crepe-tiny": partial(self.get_f0_official_crepe_computation, model='tiny'),
98
+ "mangio-crepe": self.get_f0_crepe_computation,
99
+ "mangio-crepe-tiny": partial(self.get_f0_crepe_computation, model='tiny'),
100
+
101
+ }
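+ # Equal-temperament note frequencies (C2 to B7) used to snap f0 to the nearest note when autotune is enabled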
102
+ self.note_dict = [
103
+ 65.41, 69.30, 73.42, 77.78, 82.41, 87.31,
104
+ 92.50, 98.00, 103.83, 110.00, 116.54, 123.47,
105
+ 130.81, 138.59, 146.83, 155.56, 164.81, 174.61,
106
+ 185.00, 196.00, 207.65, 220.00, 233.08, 246.94,
107
+ 261.63, 277.18, 293.66, 311.13, 329.63, 349.23,
108
+ 369.99, 392.00, 415.30, 440.00, 466.16, 493.88,
109
+ 523.25, 554.37, 587.33, 622.25, 659.25, 698.46,
110
+ 739.99, 783.99, 830.61, 880.00, 932.33, 987.77,
111
+ 1046.50, 1108.73, 1174.66, 1244.51, 1318.51, 1396.91,
112
+ 1479.98, 1567.98, 1661.22, 1760.00, 1864.66, 1975.53,
113
+ 2093.00, 2217.46, 2349.32, 2489.02, 2637.02, 2793.83,
114
+ 2959.96, 3135.96, 3322.44, 3520.00, 3729.31, 3951.07
115
+ ]
116
+
117
+ # Fork Feature: Get the best torch device to use for f0 algorithms that require a torch device. Will return the type (torch.device)
118
+ def get_optimal_torch_device(self, index: int = 0) -> torch.device:
119
+ if torch.cuda.is_available():
120
+ return torch.device(
121
+ f"cuda:{index % torch.cuda.device_count()}"
122
+ ) # Very fast
123
+ elif torch.backends.mps.is_available():
124
+ return torch.device("mps")
125
+ return torch.device("cpu")
126
+
127
+ # Fork Feature: Compute f0 with the crepe method
128
+ def get_f0_crepe_computation(
129
+ self,
130
+ x,
131
+ f0_min,
132
+ f0_max,
133
+ p_len,
134
+ *args, # hop length was 512 before; it sets how often pitch is re-estimated. Lower hop lengths track pitch more finely but slow down inference.
135
+ **kwargs, # model: "tiny" for crepe-tiny or "full" for crepe (default "full")
136
+ ):
137
+ x = x.astype(
138
+ np.float32
139
+ ) # torchcrepe expects float32; converting from float64 avoids an F.conv2d dtype error
140
+ x /= np.quantile(np.abs(x), 0.999)
141
+ torch_device = self.get_optimal_torch_device()
142
+ audio = torch.from_numpy(x).to(torch_device, copy=True)
143
+ audio = torch.unsqueeze(audio, dim=0)
144
+ if audio.ndim == 2 and audio.shape[0] > 1:
145
+ audio = torch.mean(audio, dim=0, keepdim=True).detach()
146
+ audio = audio.detach()
147
+ hop_length = kwargs.get('crepe_hop_length', 160)
148
+ model = kwargs.get('model', 'full')
149
+ print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
150
+ pitch: Tensor = torchcrepe.predict(
151
+ audio,
152
+ self.sr,
153
+ hop_length,
154
+ f0_min,
155
+ f0_max,
156
+ model,
157
+ batch_size=hop_length * 2,
158
+ device=torch_device,
159
+ pad=True,
160
+ )
161
+ p_len = p_len or x.shape[0] // hop_length
162
+ # Resize the pitch for final f0
163
+ source = np.array(pitch.squeeze(0).cpu().float().numpy())
164
+ source[source < 0.001] = np.nan
165
+ target = np.interp(
166
+ np.arange(0, len(source) * p_len, len(source)) / p_len,
167
+ np.arange(0, len(source)),
168
+ source,
169
+ )
170
+ f0 = np.nan_to_num(target)
171
+ return f0 # Resized f0
172
+
173
+ def get_f0_official_crepe_computation(
174
+ self,
175
+ x,
176
+ f0_min,
177
+ f0_max,
178
+ *args,
179
+ **kwargs
180
+ ):
181
+ # Pick a batch size that doesn't cause memory errors on your GPU
182
+ batch_size = 512
183
+ # Compute pitch on the configured device
184
+ audio = torch.tensor(np.copy(x))[None].float()
185
+ model = kwargs.get('model', 'full')
186
+ f0, pd = torchcrepe.predict(
187
+ audio,
188
+ self.sr,
189
+ self.window,
190
+ f0_min,
191
+ f0_max,
192
+ model,
193
+ batch_size=batch_size,
194
+ device=self.device,
195
+ return_periodicity=True,
196
+ )
197
+ pd = torchcrepe.filter.median(pd, 3)
198
+ f0 = torchcrepe.filter.mean(f0, 3)
199
+ f0[pd < 0.1] = 0
200
+ f0 = f0[0].cpu().numpy()
201
+ return f0
202
+
203
+ # Fork Feature: Compute pYIN f0 method
204
+ def get_f0_pyin_computation(self, x, f0_min, f0_max):
205
+ y = x.astype(np.float32) # use the audio passed in rather than a hardcoded test file
206
+ f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=f0_min, fmax=f0_max)
207
+ f0 = f0[1:] # Get rid of extra first frame
208
+ return f0
209
+
210
+ def get_pm(self, x, p_len, *args, **kwargs):
211
+ f0 = parselmouth.Sound(x, self.sr).to_pitch_ac(
212
+ time_step=160 / 16000,
213
+ voicing_threshold=0.6,
214
+ pitch_floor=kwargs.get('f0_min'),
215
+ pitch_ceiling=kwargs.get('f0_max'),
216
+ ).selected_array["frequency"]
217
+
218
+ return np.pad(
219
+ f0,
220
+ [[max(0, (p_len - len(f0) + 1) // 2), max(0, p_len - len(f0) - (p_len - len(f0) + 1) // 2)]],
221
+ mode="constant"
222
+ )
223
+
224
+ def get_harvest(self, x, *args, **kwargs):
225
+ f0_spectral = pyworld.harvest(
226
+ x.astype(np.double),
227
+ fs=self.sr,
228
+ f0_ceil=kwargs.get('f0_max'),
229
+ f0_floor=kwargs.get('f0_min'),
230
+ frame_period=1000 * kwargs.get('hop_length', 160) / self.sr,
231
+ )
232
+ return pyworld.stonemask(x.astype(np.double), *f0_spectral, self.sr)
233
+
234
+ def get_dio(self, x, *args, **kwargs):
235
+ f0_spectral = pyworld.dio(
236
+ x.astype(np.double),
237
+ fs=self.sr,
238
+ f0_ceil=kwargs.get('f0_max'),
239
+ f0_floor=kwargs.get('f0_min'),
240
+ frame_period=1000 * kwargs.get('hop_length', 160) / self.sr,
241
+ )
242
+ return pyworld.stonemask(x.astype(np.double), *f0_spectral, self.sr)
243
+
244
+
245
+ def get_rmvpe(self, x, *args, **kwargs):
246
+ if not hasattr(self, "model_rmvpe"):
247
+ from infer.lib.rmvpe import RMVPE
248
+
249
+ logger.info(
250
+ "Loading rmvpe model,%s" % "%s/rmvpe.pt" % os.environ["rmvpe_root"]
251
+ )
252
+ self.model_rmvpe = RMVPE(
253
+ "%s/rmvpe.pt" % os.environ["rmvpe_root"],
254
+ is_half=self.is_half,
255
+ device=self.device,
256
+ )
257
+ f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
258
+
259
+ return f0
260
+
261
+
262
+ def get_pitch_dependant_rmvpe(self, x, f0_min=1, f0_max=40000, *args, **kwargs):
263
+ return self.model_rmvpe.infer_from_audio_with_pitch(x, thred=0.03, f0_min=f0_min, f0_max=f0_max)
264
+
265
+ def autotune_f0(self, f0):
266
+ autotuned_f0 = []
267
+ for freq in f0:
268
+ closest_notes = [note for note in self.note_dict if abs(note - freq) == min(abs(n - freq) for n in self.note_dict)]
269
+ autotuned_f0.append(random.choice(closest_notes))
270
+ return np.array(autotuned_f0, np.float64)
271
+
272
+ # Fork Feature: Acquire median hybrid f0 estimation calculation
273
+ def get_f0_hybrid_computation(
274
+ self,
275
+ methods_str,
276
+ input_audio_path,
277
+ x,
278
+ f0_min,
279
+ f0_max,
280
+ p_len,
281
+ filter_radius,
282
+ crepe_hop_length,
283
+ time_step
284
+ ):
285
+ # Get various f0 methods from input to use in the computation stack
286
+ params = {'x': x, 'p_len': p_len, 'f0_min': f0_min,
287
+ 'f0_max': f0_max, 'time_step': time_step, 'filter_radius': filter_radius,
288
+ 'crepe_hop_length': crepe_hop_length, 'model': "full"
289
+ }
290
+ match = re.search(r'hybrid\[(.+)\]', methods_str) # raw string avoids an invalid escape sequence
291
+ # empty method list if no hybrid[...] pattern matched, so the loop below cannot raise a NameError
+ methods = [method.strip() for method in match.group(1).split('+')] if match else []
292
293
+ f0_computation_stack = []
294
+
295
+ print(f"Calculating f0 pitch estimations for methods: {str(methods)}")
296
+ x = x.astype(np.float32)
297
+ x /= np.quantile(np.abs(x), 0.999)
298
+ # Get f0 calculations for all methods specified
299
+
300
+ for method in methods:
301
+ if method not in self.f0_method_dict:
302
+ print(f"Method {method} not found.")
303
+ continue
304
+ f0 = self.f0_method_dict[method](**params)
305
+ if method == 'harvest' and filter_radius > 2:
306
+ f0 = signal.medfilt(f0, 3)
307
+ f0 = f0[1:] # Get rid of first frame.
308
+ f0_computation_stack.append(f0)
309
+
310
+ for fc in f0_computation_stack:
311
+ print(len(fc))
312
+
313
+ print(f"Calculating hybrid median f0 from the stack of: {str(methods)}")
314
+ f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
315
+ return f0_median_hybrid
316
+
317
+ def get_f0(
318
+ self,
319
+ input_audio_path,
320
+ x,
321
+ p_len,
322
+ f0_up_key,
323
+ f0_method,
324
+ filter_radius,
325
+ crepe_hop_length,
326
+ f0_autotune,
327
+ inp_f0=None,
328
+ f0_min=50,
329
+ f0_max=1100,
330
+ ):
331
+ global input_audio_path2wav
332
+ time_step = self.window / self.sr * 1000
333
+ # f0_min and f0_max come from the function arguments (defaults 50 and 1100 Hz)
334
335
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
336
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
337
+ params = {'x': x, 'p_len': p_len, 'f0_up_key': f0_up_key, 'f0_min': f0_min,
338
+ 'f0_max': f0_max, 'time_step': time_step, 'filter_radius': filter_radius,
339
+ 'crepe_hop_length': crepe_hop_length, 'model': "full"
340
+ }
341
+
342
+ if "hybrid" in f0_method:
343
+ # Perform hybrid median pitch estimation
344
+ input_audio_path2wav[input_audio_path] = x.astype(np.double)
345
+ f0 = self.get_f0_hybrid_computation(
346
+ f0_method,
347
+ input_audio_path,
348
+ x,
349
+ f0_min,
350
+ f0_max,
351
+ p_len,
352
+ filter_radius,
353
+ crepe_hop_length,
354
+ time_step,
355
+ )
356
+ else:
357
+ f0 = self.f0_method_dict[f0_method](**params)
358
+
359
+ if "privateuseone" in str(self.device): # clean ortruntime memory
360
+ del self.model_rmvpe.model
361
+ del self.model_rmvpe
362
+ logger.info("Cleaning ortruntime memory")
363
+
364
+ if f0_autotune:
365
+ f0 = self.autotune_f0(f0)
366
+
367
+ f0 *= pow(2, f0_up_key / 12)
368
+ # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
369
+ tf0 = self.sr // self.window # number of f0 points per second
370
+ if inp_f0 is not None:
371
+ delta_t = np.round(
372
+ (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
373
+ ).astype("int16")
374
+ replace_f0 = np.interp(
375
+ list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
376
+ )
377
+ shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
378
+ f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
379
+ :shape
380
+ ]
381
+ # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
382
+ f0bak = f0.copy()
383
+ f0_mel = 1127 * np.log(1 + f0 / 700)
384
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
385
+ f0_mel_max - f0_mel_min
386
+ ) + 1
387
+ f0_mel[f0_mel <= 1] = 1
388
+ f0_mel[f0_mel > 255] = 255
389
+ f0_coarse = np.rint(f0_mel).astype(np.int32)
390
+ return f0_coarse, f0bak # 1-0
391
+
392
+ def vc(
393
+ self,
394
+ model,
395
+ net_g,
396
+ sid,
397
+ audio0,
398
+ pitch,
399
+ pitchf,
400
+ times,
401
+ index,
402
+ big_npy,
403
+ index_rate,
404
+ version,
405
+ protect,
406
+ ): # ,file_index,file_big_npy
407
+ feats = torch.from_numpy(audio0)
408
+ if self.is_half:
409
+ feats = feats.half()
410
+ else:
411
+ feats = feats.float()
412
+ if feats.dim() == 2: # double channels
413
+ feats = feats.mean(-1)
414
+ assert feats.dim() == 1, feats.dim()
415
+ feats = feats.view(1, -1)
416
+ padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
417
+
418
+ inputs = {
419
+ "source": feats.to(self.device),
420
+ "padding_mask": padding_mask,
421
+ "output_layer": 9 if version == "v1" else 12,
422
+ }
423
+ t0 = ttime()
424
+ with torch.no_grad():
425
+ logits = model.extract_features(**inputs)
426
+ feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
427
+ if protect < 0.5 and pitch is not None and pitchf is not None:
428
+ feats0 = feats.clone()
429
+ if (
430
+ not isinstance(index, type(None))
431
+ and not isinstance(big_npy, type(None))
432
+ and index_rate != 0
433
+ ):
434
+ npy = feats[0].cpu().numpy()
435
+ if self.is_half:
436
+ npy = npy.astype("float32")
437
+
438
+ # _, I = index.search(npy, 1)
439
+ # npy = big_npy[I.squeeze()]
440
+
441
+ score, ix = index.search(npy, k=8)
442
+ weight = np.square(1 / score)
443
+ weight /= weight.sum(axis=1, keepdims=True)
444
+ npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
445
+
446
+ if self.is_half:
447
+ npy = npy.astype("float16")
448
+ feats = (
449
+ torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
450
+ + (1 - index_rate) * feats
451
+ )
452
+
453
+ feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
454
+ if protect < 0.5 and pitch is not None and pitchf is not None:
455
+ feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
456
+ 0, 2, 1
457
+ )
458
+ t1 = ttime()
459
+ p_len = audio0.shape[0] // self.window
460
+ if feats.shape[1] < p_len:
461
+ p_len = feats.shape[1]
462
+ if pitch is not None and pitchf is not None:
463
+ pitch = pitch[:, :p_len]
464
+ pitchf = pitchf[:, :p_len]
465
+
466
+ if protect < 0.5 and pitch is not None and pitchf is not None:
467
+ pitchff = pitchf.clone()
468
+ pitchff[pitchf > 0] = 1
469
+ pitchff[pitchf < 1] = protect
470
+ pitchff = pitchff.unsqueeze(-1)
471
+ feats = feats * pitchff + feats0 * (1 - pitchff)
472
+ feats = feats.to(feats0.dtype)
473
+ p_len = torch.tensor([p_len], device=self.device).long()
474
+ with torch.no_grad():
475
+ hasp = pitch is not None and pitchf is not None
476
+ arg = (feats, p_len, pitch, pitchf, sid) if hasp else (feats, p_len, sid)
477
+ audio1 = (net_g.infer(*arg)[0][0, 0]).data.cpu().float().numpy()
478
+ del hasp, arg
479
+ del feats, p_len, padding_mask
480
+ if torch.cuda.is_available():
481
+ torch.cuda.empty_cache()
482
+ t2 = ttime()
483
+ times[0] += t1 - t0
484
+ times[2] += t2 - t1
485
+ return audio1
486
+
+ def process_t(self, t, s, window, audio_pad, pitch, pitchf, times, index, big_npy, index_rate, version, protect, t_pad_tgt, if_f0, sid, model, net_g):
487
+ t = t // window * window
488
+ if if_f0 == 1:
489
+ return self.vc(
490
+ model,
491
+ net_g,
492
+ sid,
493
+ audio_pad[s : t + t_pad_tgt + window],
494
+ pitch[:, s // window : (t + t_pad_tgt) // window],
495
+ pitchf[:, s // window : (t + t_pad_tgt) // window],
496
+ times,
497
+ index,
498
+ big_npy,
499
+ index_rate,
500
+ version,
501
+ protect,
502
+ )[t_pad_tgt : -t_pad_tgt]
503
+ else:
504
+ return self.vc(
505
+ model,
506
+ net_g,
507
+ sid,
508
+ audio_pad[s : t + t_pad_tgt + window],
509
+ None,
510
+ None,
511
+ times,
512
+ index,
513
+ big_npy,
514
+ index_rate,
515
+ version,
516
+ protect,
517
+ )[t_pad_tgt : -t_pad_tgt]
518
+
519
+
520
+ def pipeline(
521
+ self,
522
+ model,
523
+ net_g,
524
+ sid,
525
+ audio,
526
+ input_audio_path,
527
+ times,
528
+ f0_up_key,
529
+ f0_method,
530
+ file_index,
531
+ index_rate,
532
+ if_f0,
533
+ filter_radius,
534
+ tgt_sr,
535
+ resample_sr,
536
+ rms_mix_rate,
537
+ version,
538
+ protect,
539
+ crepe_hop_length,
540
+ f0_autotune,
541
+ f0_file=None,
542
+ f0_min=50,
543
+ f0_max=1100
544
+ ):
545
+ if (
546
+ file_index != ""
547
+ # and file_big_npy != ""
548
+ # and os.path.exists(file_big_npy) == True
549
+ and os.path.exists(file_index)
550
+ and index_rate != 0
551
+ ):
552
+ try:
553
+ index = faiss.read_index(file_index)
554
+ # big_npy = np.load(file_big_npy)
555
+ big_npy = index.reconstruct_n(0, index.ntotal)
556
+ except:
557
+ traceback.print_exc()
558
+ index = big_npy = None
559
+ else:
560
+ index = big_npy = None
561
+ audio = signal.filtfilt(bh, ah, audio)
562
+ audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
563
+ opt_ts = []
564
+ if audio_pad.shape[0] > self.t_max:
565
+ audio_sum = np.zeros_like(audio)
566
+ for i in range(self.window):
567
+ audio_sum += audio_pad[i : i - self.window]
568
+ for t in range(self.t_center, audio.shape[0], self.t_center):
569
+ opt_ts.append(
570
+ t
571
+ - self.t_query
572
+ + np.where(
573
+ np.abs(audio_sum[t - self.t_query : t + self.t_query])
574
+ == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
575
+ )[0][0]
576
+ )
577
+ s = 0
578
+ audio_opt = []
579
+ t = None
580
+ t1 = ttime()
581
+ audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
582
+ p_len = audio_pad.shape[0] // self.window
583
+ inp_f0 = None
584
+ if hasattr(f0_file, "name"):
585
+ try:
586
+ with open(f0_file.name, "r") as f:
587
+ lines = f.read().strip("\n").split("\n")
588
+ inp_f0 = []
589
+ for line in lines:
590
+ inp_f0.append([float(i) for i in line.split(",")])
591
+ inp_f0 = np.array(inp_f0, dtype="float32")
592
+ except:
593
+ traceback.print_exc()
594
+ sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
595
+ pitch, pitchf = None, None
596
+ if if_f0:
597
+ pitch, pitchf = self.get_f0(
598
+ input_audio_path,
599
+ audio_pad,
600
+ p_len,
601
+ f0_up_key,
602
+ f0_method,
603
+ filter_radius,
604
+ crepe_hop_length,
605
+ f0_autotune,
606
+ inp_f0,
607
+ f0_min,
608
+ f0_max
609
+ )
610
+ pitch = pitch[:p_len]
611
+ pitchf = pitchf[:p_len]
612
+ if self.device == "mps" or "xpu" in self.device:
613
+ pitchf = pitchf.astype(np.float32)
614
+ pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
615
+ pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
616
+ t2 = ttime()
617
+ times[1] += t2 - t1
618
+
619
+ with tqdm(total=len(opt_ts), desc="Processing", unit="window") as pbar:
620
+ for i, t in enumerate(opt_ts):
621
+ t = t // self.window * self.window
622
+ start = s
623
+ end = t + self.t_pad2 + self.window
624
+ audio_slice = audio_pad[start:end]
625
+ pitch_slice = pitch[:, start // self.window:end // self.window] if if_f0 else None
626
+ pitchf_slice = pitchf[:, start // self.window:end // self.window] if if_f0 else None
627
+ audio_opt.append(self.vc(model, net_g, sid, audio_slice, pitch_slice, pitchf_slice, times, index, big_npy, index_rate, version, protect)[self.t_pad_tgt : -self.t_pad_tgt])
628
+ s = t
629
+ pbar.update(1)
630
+ pbar.refresh()
631
+
632
+ audio_slice = audio_pad[t:]
633
+ pitch_slice = pitch[:, t // self.window:] if if_f0 and t is not None else pitch
634
+ pitchf_slice = pitchf[:, t // self.window:] if if_f0 and t is not None else pitchf
635
+ audio_opt.append(self.vc(model, net_g, sid, audio_slice, pitch_slice, pitchf_slice, times, index, big_npy, index_rate, version, protect)[self.t_pad_tgt : -self.t_pad_tgt])
636
+
637
+ audio_opt = np.concatenate(audio_opt)
638
+ if rms_mix_rate != 1:
639
+ audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
640
+ if resample_sr >= 16000 and tgt_sr != resample_sr:
641
+ audio_opt = librosa.resample(
642
+ audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
643
+ )
644
+ audio_max = np.abs(audio_opt).max() / 0.99
645
+ max_int16 = 32768
646
+ if audio_max > 1:
647
+ max_int16 /= audio_max
648
+ audio_opt = (audio_opt * max_int16).astype(np.int16)
649
+ del pitch, pitchf, sid
650
+ if torch.cuda.is_available():
651
+ torch.cuda.empty_cache()
652
+
653
+ print("Returning completed audio...")
654
+ print("-------------------")
655
+ return audio_opt
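
A minimal, self-contained sketch of the coarse-f0 quantization that get_f0 performs above: voiced frames are mapped to the mel scale and squeezed into integer bins 1..255, with bin 1 doubling as "unvoiced". The constants mirror the code; the sample f0 values are made up for illustration.

import numpy as np

f0_min, f0_max = 50.0, 1100.0
f0_mel_min = 1127 * np.log(1 + f0_min / 700)  # Hz -> mel
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

f0 = np.array([0.0, 110.0, 220.0, 440.0, 880.0])  # hypothetical pitch track; 0 = unvoiced
f0_mel = 1127 * np.log(1 + f0 / 700)
# squeeze voiced frames into bins 1..255; unvoiced frames stay at bin 1
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(np.int32)
print(f0_coarse)  # one integer bin per frame, e.g. bin 1 for the unvoiced frame
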
infer/modules/vc/utils.py ADDED
@@ -0,0 +1,42 @@
1
+ import os
2
+ import re
3
+ from fairseq import checkpoint_utils
4
+
5
+
6
+ def get_index_path_from_model(sid):
7
+ sid0strip = re.sub(r'\.(pth|onnx)$', '', sid) # strip either extension only at the end of the name
8
+ sid0name = os.path.split(sid0strip)[-1] # Extract only the name, not the directory
9
+
10
+ # Check if the sid0strip has the specific ending format _eXXX_sXXX
11
+ if re.match(r'.+_e\d+_s\d+$', sid0name):
12
+ base_model_name = sid0name.rsplit('_', 2)[0]
13
+ else:
14
+ base_model_name = sid0name
15
+
16
+ return next(
17
+ (
18
+ f
19
+ for f in [
20
+ os.path.join(root, name)
21
+ for root, _, files in os.walk(os.getenv("index_root"), topdown=False)
22
+ for name in files
23
+ if name.endswith(".index") and "trained" not in name
24
+ ]
25
+ if base_model_name in f
26
+ ),
27
+ "",
28
+ )
29
+
30
+
31
+ def load_hubert(config):
32
+ models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
33
+ ["assets/hubert/hubert_base.pt"],
34
+ suffix="",
35
+ )
36
+ hubert_model = models[0]
37
+ hubert_model = hubert_model.to(config.device)
38
+ if config.is_half:
39
+ hubert_model = hubert_model.half()
40
+ else:
41
+ hubert_model = hubert_model.float()
42
+ return hubert_model.eval()
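
A hedged usage sketch for get_index_path_from_model. The checkpoint name "MyVoice_e100_s2400.pth" and the logs/ layout are hypothetical; the function itself only needs the index_root environment variable (set in .env) to point at a tree containing .index files.

import os

os.environ["index_root"] = "logs"  # matches the default in .env

# "MyVoice_e100_s2400.pth" matches the _eXXX_sXXX suffix pattern, so the base
# name becomes "MyVoice"; the first .index path containing it (and not named
# trained_*) is returned, or "" if nothing matches.
index_path = get_index_path_from_model("weights/MyVoice_e100_s2400.pth")
print(index_path or "no matching .index file found")
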
lib/globals/globals.py ADDED
@@ -0,0 +1,5 @@
1
+ DoFormant: bool = False
2
+ Quefrency: float = 8.0
3
+ Timbre: float = 1.2
4
+
5
+ NotesOrHertz: bool = False
lib/infer_pack/attentions.py ADDED
@@ -0,0 +1,417 @@
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from lib.infer_pack import commons
9
+ from lib.infer_pack import modules
10
+ from lib.infer_pack.modules import LayerNorm
11
+
12
+
13
+ class Encoder(nn.Module):
14
+ def __init__(
15
+ self,
16
+ hidden_channels,
17
+ filter_channels,
18
+ n_heads,
19
+ n_layers,
20
+ kernel_size=1,
21
+ p_dropout=0.0,
22
+ window_size=10,
23
+ **kwargs
24
+ ):
25
+ super().__init__()
26
+ self.hidden_channels = hidden_channels
27
+ self.filter_channels = filter_channels
28
+ self.n_heads = n_heads
29
+ self.n_layers = n_layers
30
+ self.kernel_size = kernel_size
31
+ self.p_dropout = p_dropout
32
+ self.window_size = window_size
33
+
34
+ self.drop = nn.Dropout(p_dropout)
35
+ self.attn_layers = nn.ModuleList()
36
+ self.norm_layers_1 = nn.ModuleList()
37
+ self.ffn_layers = nn.ModuleList()
38
+ self.norm_layers_2 = nn.ModuleList()
39
+ for i in range(self.n_layers):
40
+ self.attn_layers.append(
41
+ MultiHeadAttention(
42
+ hidden_channels,
43
+ hidden_channels,
44
+ n_heads,
45
+ p_dropout=p_dropout,
46
+ window_size=window_size,
47
+ )
48
+ )
49
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
50
+ self.ffn_layers.append(
51
+ FFN(
52
+ hidden_channels,
53
+ hidden_channels,
54
+ filter_channels,
55
+ kernel_size,
56
+ p_dropout=p_dropout,
57
+ )
58
+ )
59
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
60
+
61
+ def forward(self, x, x_mask):
62
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
63
+ x = x * x_mask
64
+ for i in range(self.n_layers):
65
+ y = self.attn_layers[i](x, x, attn_mask)
66
+ y = self.drop(y)
67
+ x = self.norm_layers_1[i](x + y)
68
+
69
+ y = self.ffn_layers[i](x, x_mask)
70
+ y = self.drop(y)
71
+ x = self.norm_layers_2[i](x + y)
72
+ x = x * x_mask
73
+ return x
74
+
75
+
76
+ class Decoder(nn.Module):
77
+ def __init__(
78
+ self,
79
+ hidden_channels,
80
+ filter_channels,
81
+ n_heads,
82
+ n_layers,
83
+ kernel_size=1,
84
+ p_dropout=0.0,
85
+ proximal_bias=False,
86
+ proximal_init=True,
87
+ **kwargs
88
+ ):
89
+ super().__init__()
90
+ self.hidden_channels = hidden_channels
91
+ self.filter_channels = filter_channels
92
+ self.n_heads = n_heads
93
+ self.n_layers = n_layers
94
+ self.kernel_size = kernel_size
95
+ self.p_dropout = p_dropout
96
+ self.proximal_bias = proximal_bias
97
+ self.proximal_init = proximal_init
98
+
99
+ self.drop = nn.Dropout(p_dropout)
100
+ self.self_attn_layers = nn.ModuleList()
101
+ self.norm_layers_0 = nn.ModuleList()
102
+ self.encdec_attn_layers = nn.ModuleList()
103
+ self.norm_layers_1 = nn.ModuleList()
104
+ self.ffn_layers = nn.ModuleList()
105
+ self.norm_layers_2 = nn.ModuleList()
106
+ for i in range(self.n_layers):
107
+ self.self_attn_layers.append(
108
+ MultiHeadAttention(
109
+ hidden_channels,
110
+ hidden_channels,
111
+ n_heads,
112
+ p_dropout=p_dropout,
113
+ proximal_bias=proximal_bias,
114
+ proximal_init=proximal_init,
115
+ )
116
+ )
117
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
118
+ self.encdec_attn_layers.append(
119
+ MultiHeadAttention(
120
+ hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
121
+ )
122
+ )
123
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
124
+ self.ffn_layers.append(
125
+ FFN(
126
+ hidden_channels,
127
+ hidden_channels,
128
+ filter_channels,
129
+ kernel_size,
130
+ p_dropout=p_dropout,
131
+ causal=True,
132
+ )
133
+ )
134
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
135
+
136
+ def forward(self, x, x_mask, h, h_mask):
137
+ """
138
+ x: decoder input
139
+ h: encoder output
140
+ """
141
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
142
+ device=x.device, dtype=x.dtype
143
+ )
144
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
145
+ x = x * x_mask
146
+ for i in range(self.n_layers):
147
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
148
+ y = self.drop(y)
149
+ x = self.norm_layers_0[i](x + y)
150
+
151
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
152
+ y = self.drop(y)
153
+ x = self.norm_layers_1[i](x + y)
154
+
155
+ y = self.ffn_layers[i](x, x_mask)
156
+ y = self.drop(y)
157
+ x = self.norm_layers_2[i](x + y)
158
+ x = x * x_mask
159
+ return x
160
+
161
+
162
+ class MultiHeadAttention(nn.Module):
163
+ def __init__(
164
+ self,
165
+ channels,
166
+ out_channels,
167
+ n_heads,
168
+ p_dropout=0.0,
169
+ window_size=None,
170
+ heads_share=True,
171
+ block_length=None,
172
+ proximal_bias=False,
173
+ proximal_init=False,
174
+ ):
175
+ super().__init__()
176
+ assert channels % n_heads == 0
177
+
178
+ self.channels = channels
179
+ self.out_channels = out_channels
180
+ self.n_heads = n_heads
181
+ self.p_dropout = p_dropout
182
+ self.window_size = window_size
183
+ self.heads_share = heads_share
184
+ self.block_length = block_length
185
+ self.proximal_bias = proximal_bias
186
+ self.proximal_init = proximal_init
187
+ self.attn = None
188
+
189
+ self.k_channels = channels // n_heads
190
+ self.conv_q = nn.Conv1d(channels, channels, 1)
191
+ self.conv_k = nn.Conv1d(channels, channels, 1)
192
+ self.conv_v = nn.Conv1d(channels, channels, 1)
193
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
194
+ self.drop = nn.Dropout(p_dropout)
195
+
196
+ if window_size is not None:
197
+ n_heads_rel = 1 if heads_share else n_heads
198
+ rel_stddev = self.k_channels**-0.5
199
+ self.emb_rel_k = nn.Parameter(
200
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
201
+ * rel_stddev
202
+ )
203
+ self.emb_rel_v = nn.Parameter(
204
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
205
+ * rel_stddev
206
+ )
207
+
208
+ nn.init.xavier_uniform_(self.conv_q.weight)
209
+ nn.init.xavier_uniform_(self.conv_k.weight)
210
+ nn.init.xavier_uniform_(self.conv_v.weight)
211
+ if proximal_init:
212
+ with torch.no_grad():
213
+ self.conv_k.weight.copy_(self.conv_q.weight)
214
+ self.conv_k.bias.copy_(self.conv_q.bias)
215
+
216
+ def forward(self, x, c, attn_mask=None):
217
+ q = self.conv_q(x)
218
+ k = self.conv_k(c)
219
+ v = self.conv_v(c)
220
+
221
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
222
+
223
+ x = self.conv_o(x)
224
+ return x
225
+
226
+ def attention(self, query, key, value, mask=None):
227
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
228
+ b, d, t_s, t_t = (*key.size(), query.size(2))
229
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
230
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
231
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
232
+
233
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
234
+ if self.window_size is not None:
235
+ assert (
236
+ t_s == t_t
237
+ ), "Relative attention is only available for self-attention."
238
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
239
+ rel_logits = self._matmul_with_relative_keys(
240
+ query / math.sqrt(self.k_channels), key_relative_embeddings
241
+ )
242
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
243
+ scores = scores + scores_local
244
+ if self.proximal_bias:
245
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
246
+ scores = scores + self._attention_bias_proximal(t_s).to(
247
+ device=scores.device, dtype=scores.dtype
248
+ )
249
+ if mask is not None:
250
+ scores = scores.masked_fill(mask == 0, -1e4)
251
+ if self.block_length is not None:
252
+ assert (
253
+ t_s == t_t
254
+ ), "Local attention is only available for self-attention."
255
+ block_mask = (
256
+ torch.ones_like(scores)
257
+ .triu(-self.block_length)
258
+ .tril(self.block_length)
259
+ )
260
+ scores = scores.masked_fill(block_mask == 0, -1e4)
261
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
262
+ p_attn = self.drop(p_attn)
263
+ output = torch.matmul(p_attn, value)
264
+ if self.window_size is not None:
265
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
266
+ value_relative_embeddings = self._get_relative_embeddings(
267
+ self.emb_rel_v, t_s
268
+ )
269
+ output = output + self._matmul_with_relative_values(
270
+ relative_weights, value_relative_embeddings
271
+ )
272
+ output = (
273
+ output.transpose(2, 3).contiguous().view(b, d, t_t)
274
+ ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
275
+ return output, p_attn
276
+
277
+ def _matmul_with_relative_values(self, x, y):
278
+ """
279
+ x: [b, h, l, m]
280
+ y: [h or 1, m, d]
281
+ ret: [b, h, l, d]
282
+ """
283
+ ret = torch.matmul(x, y.unsqueeze(0))
284
+ return ret
285
+
286
+ def _matmul_with_relative_keys(self, x, y):
287
+ """
288
+ x: [b, h, l, d]
289
+ y: [h or 1, m, d]
290
+ ret: [b, h, l, m]
291
+ """
292
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
293
+ return ret
294
+
295
+ def _get_relative_embeddings(self, relative_embeddings, length):
296
+ max_relative_position = 2 * self.window_size + 1
297
+ # Pad first before slice to avoid using cond ops.
298
+ pad_length = max(length - (self.window_size + 1), 0)
299
+ slice_start_position = max((self.window_size + 1) - length, 0)
300
+ slice_end_position = slice_start_position + 2 * length - 1
301
+ if pad_length > 0:
302
+ padded_relative_embeddings = F.pad(
303
+ relative_embeddings,
304
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
305
+ )
306
+ else:
307
+ padded_relative_embeddings = relative_embeddings
308
+ used_relative_embeddings = padded_relative_embeddings[
309
+ :, slice_start_position:slice_end_position
310
+ ]
311
+ return used_relative_embeddings
312
+
313
+ def _relative_position_to_absolute_position(self, x):
314
+ """
315
+ x: [b, h, l, 2*l-1]
316
+ ret: [b, h, l, l]
317
+ """
318
+ batch, heads, length, _ = x.size()
319
+ # Concat columns of pad to shift from relative to absolute indexing.
320
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
321
+
322
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
323
+ x_flat = x.view([batch, heads, length * 2 * length])
324
+ x_flat = F.pad(
325
+ x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
326
+ )
327
+
328
+ # Reshape and slice out the padded elements.
329
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
330
+ :, :, :length, length - 1 :
331
+ ]
332
+ return x_final
333
+
334
+ def _absolute_position_to_relative_position(self, x):
335
+ """
336
+ x: [b, h, l, l]
337
+ ret: [b, h, l, 2*l-1]
338
+ """
339
+ batch, heads, length, _ = x.size()
340
+ # padd along column
341
+ x = F.pad(
342
+ x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
343
+ )
344
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
345
+ # add 0's in the beginning that will skew the elements after reshape
346
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
347
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
348
+ return x_final
349
+
350
+ def _attention_bias_proximal(self, length):
351
+ """Bias for self-attention to encourage attention to close positions.
352
+ Args:
353
+ length: an integer scalar.
354
+ Returns:
355
+ a Tensor with shape [1, 1, length, length]
356
+ """
357
+ r = torch.arange(length, dtype=torch.float32)
358
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
359
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
360
+
361
+
362
+ class FFN(nn.Module):
363
+ def __init__(
364
+ self,
365
+ in_channels,
366
+ out_channels,
367
+ filter_channels,
368
+ kernel_size,
369
+ p_dropout=0.0,
370
+ activation=None,
371
+ causal=False,
372
+ ):
373
+ super().__init__()
374
+ self.in_channels = in_channels
375
+ self.out_channels = out_channels
376
+ self.filter_channels = filter_channels
377
+ self.kernel_size = kernel_size
378
+ self.p_dropout = p_dropout
379
+ self.activation = activation
380
+ self.causal = causal
381
+
382
+ if causal:
383
+ self.padding = self._causal_padding
384
+ else:
385
+ self.padding = self._same_padding
386
+
387
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
388
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
389
+ self.drop = nn.Dropout(p_dropout)
390
+
391
+ def forward(self, x, x_mask):
392
+ x = self.conv_1(self.padding(x * x_mask))
393
+ if self.activation == "gelu":
394
+ x = x * torch.sigmoid(1.702 * x)
395
+ else:
396
+ x = torch.relu(x)
397
+ x = self.drop(x)
398
+ x = self.conv_2(self.padding(x * x_mask))
399
+ return x * x_mask
400
+
401
+ def _causal_padding(self, x):
402
+ if self.kernel_size == 1:
403
+ return x
404
+ pad_l = self.kernel_size - 1
405
+ pad_r = 0
406
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
407
+ x = F.pad(x, commons.convert_pad_shape(padding))
408
+ return x
409
+
410
+ def _same_padding(self, x):
411
+ if self.kernel_size == 1:
412
+ return x
413
+ pad_l = (self.kernel_size - 1) // 2
414
+ pad_r = self.kernel_size // 2
415
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
416
+ x = F.pad(x, commons.convert_pad_shape(padding))
417
+ return x
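
A quick smoke test for the Encoder defined above, assuming lib/infer_pack is importable. The hyperparameters are illustrative rather than tied to any particular checkpoint; the shapes follow the forward() contract (x: [b, h, t], x_mask: [b, 1, t]).

import torch
from lib.infer_pack.attentions import Encoder

enc = Encoder(hidden_channels=192, filter_channels=768, n_heads=2,
              n_layers=6, kernel_size=3, p_dropout=0.1, window_size=10)
x = torch.randn(1, 192, 100)    # [batch, hidden_channels, time]
x_mask = torch.ones(1, 1, 100)  # 1 = valid frame, 0 = padding
out = enc(x, x_mask)
print(out.shape)                # torch.Size([1, 192, 100])
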
lib/infer_pack/commons.py ADDED
@@ -0,0 +1,166 @@
1
+ import math
2
+ import numpy as np
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+
8
+ def init_weights(m, mean=0.0, std=0.01):
9
+ classname = m.__class__.__name__
10
+ if classname.find("Conv") != -1:
11
+ m.weight.data.normal_(mean, std)
12
+
13
+
14
+ def get_padding(kernel_size, dilation=1):
15
+ return int((kernel_size * dilation - dilation) / 2)
16
+
17
+
18
+ def convert_pad_shape(pad_shape):
19
+ l = pad_shape[::-1]
20
+ pad_shape = [item for sublist in l for item in sublist]
21
+ return pad_shape
22
+
23
+
24
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
25
+ """KL(P||Q)"""
26
+ kl = (logs_q - logs_p) - 0.5
27
+ kl += (
28
+ 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
29
+ )
30
+ return kl
31
+
32
+
33
+ def rand_gumbel(shape):
34
+ """Sample from the Gumbel distribution, protect from overflows."""
35
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
36
+ return -torch.log(-torch.log(uniform_samples))
37
+
38
+
39
+ def rand_gumbel_like(x):
40
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
41
+ return g
42
+
43
+
44
+ def slice_segments(x, ids_str, segment_size=4):
45
+ ret = torch.zeros_like(x[:, :, :segment_size])
46
+ for i in range(x.size(0)):
47
+ idx_str = ids_str[i]
48
+ idx_end = idx_str + segment_size
49
+ ret[i] = x[i, :, idx_str:idx_end]
50
+ return ret
51
+
52
+
53
+ def slice_segments2(x, ids_str, segment_size=4):
54
+ ret = torch.zeros_like(x[:, :segment_size])
55
+ for i in range(x.size(0)):
56
+ idx_str = ids_str[i]
57
+ idx_end = idx_str + segment_size
58
+ ret[i] = x[i, idx_str:idx_end]
59
+ return ret
60
+
61
+
62
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
63
+ b, d, t = x.size()
64
+ if x_lengths is None:
65
+ x_lengths = t
66
+ ids_str_max = x_lengths - segment_size + 1
67
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
68
+ ret = slice_segments(x, ids_str, segment_size)
69
+ return ret, ids_str
70
+
71
+
72
+ def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
73
+ position = torch.arange(length, dtype=torch.float)
74
+ num_timescales = channels // 2
75
+ log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
76
+ num_timescales - 1
77
+ )
78
+ inv_timescales = min_timescale * torch.exp(
79
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
80
+ )
81
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
82
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
83
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
84
+ signal = signal.view(1, channels, length)
85
+ return signal
86
+
87
+
88
+ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
89
+ b, channels, length = x.size()
90
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
91
+ return x + signal.to(dtype=x.dtype, device=x.device)
92
+
93
+
94
+ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
95
+ b, channels, length = x.size()
96
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
97
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
98
+
99
+
100
+ def subsequent_mask(length):
101
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
102
+ return mask
103
+
104
+
105
+ @torch.jit.script
106
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
107
+ n_channels_int = n_channels[0]
108
+ in_act = input_a + input_b
109
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
110
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
111
+ acts = t_act * s_act
112
+ return acts
113
+
114
+
120
+
121
+ def shift_1d(x):
122
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
123
+ return x
124
+
125
+
126
+ def sequence_mask(length, max_length=None):
127
+ if max_length is None:
128
+ max_length = length.max()
129
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
130
+ return x.unsqueeze(0) < length.unsqueeze(1)
131
+
132
+
133
+ def generate_path(duration, mask):
134
+ """
135
+ duration: [b, 1, t_x]
136
+ mask: [b, 1, t_y, t_x]
137
+ """
138
+ device = duration.device
139
+
140
+ b, _, t_y, t_x = mask.shape
141
+ cum_duration = torch.cumsum(duration, -1)
142
+
143
+ cum_duration_flat = cum_duration.view(b * t_x)
144
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
145
+ path = path.view(b, t_x, t_y)
146
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
147
+ path = path.unsqueeze(1).transpose(2, 3) * mask
148
+ return path
149
+
150
+
151
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
152
+ if isinstance(parameters, torch.Tensor):
153
+ parameters = [parameters]
154
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
155
+ norm_type = float(norm_type)
156
+ if clip_value is not None:
157
+ clip_value = float(clip_value)
158
+
159
+ total_norm = 0
160
+ for p in parameters:
161
+ param_norm = p.grad.data.norm(norm_type)
162
+ total_norm += param_norm.item() ** norm_type
163
+ if clip_value is not None:
164
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
165
+ total_norm = total_norm ** (1.0 / norm_type)
166
+ return total_norm
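
Two of the helpers above in action, as a minimal sketch: sequence_mask turns per-item lengths into a boolean padding mask, and rand_slice_segments crops random fixed-size training windows.

import torch
from lib.infer_pack.commons import sequence_mask, rand_slice_segments

lengths = torch.tensor([3, 5])
print(sequence_mask(lengths))
# tensor([[ True,  True,  True, False, False],
#         [ True,  True,  True,  True,  True]])

x = torch.randn(2, 8, 10)  # [batch, channels, time]
segments, start_ids = rand_slice_segments(x, torch.tensor([10, 10]), segment_size=4)
print(segments.shape, start_ids)  # torch.Size([2, 8, 4]) plus the random start frames
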
lib/infer_pack/models.py ADDED
@@ -0,0 +1,1144 @@
1
+ import math, os
2
+ from time import time as ttime
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ from lib.infer_pack import modules
7
+ from lib.infer_pack import attentions
8
+ from lib.infer_pack import commons
9
+ from lib.infer_pack.commons import init_weights, get_padding
10
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+ import numpy as np
15
+
16
+
17
+ class TextEncoder256(nn.Module):
18
+ def __init__(
19
+ self,
20
+ out_channels,
21
+ hidden_channels,
22
+ filter_channels,
23
+ n_heads,
24
+ n_layers,
25
+ kernel_size,
26
+ p_dropout,
27
+ f0=True,
28
+ ):
29
+ super().__init__()
30
+ self.out_channels = out_channels
31
+ self.hidden_channels = hidden_channels
32
+ self.filter_channels = filter_channels
33
+ self.n_heads = n_heads
34
+ self.n_layers = n_layers
35
+ self.kernel_size = kernel_size
36
+ self.p_dropout = p_dropout
37
+ self.emb_phone = nn.Linear(256, hidden_channels)
38
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39
+ if f0:
40
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41
+ self.encoder = attentions.Encoder(
42
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43
+ )
44
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45
+
46
+ def forward(self, phone, pitch, lengths):
47
+ if pitch is None:
48
+ x = self.emb_phone(phone)
49
+ else:
50
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
51
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52
+ x = self.lrelu(x)
53
+ x = torch.transpose(x, 1, -1) # [b, h, t]
54
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55
+ x.dtype
56
+ )
57
+ x = self.encoder(x * x_mask, x_mask)
58
+ stats = self.proj(x) * x_mask
59
+
60
+ m, logs = torch.split(stats, self.out_channels, dim=1)
61
+ return m, logs, x_mask
62
+
63
+
64
+ class TextEncoder768(nn.Module):
65
+ def __init__(
66
+ self,
67
+ out_channels,
68
+ hidden_channels,
69
+ filter_channels,
70
+ n_heads,
71
+ n_layers,
72
+ kernel_size,
73
+ p_dropout,
74
+ f0=True,
75
+ ):
76
+ super().__init__()
77
+ self.out_channels = out_channels
78
+ self.hidden_channels = hidden_channels
79
+ self.filter_channels = filter_channels
80
+ self.n_heads = n_heads
81
+ self.n_layers = n_layers
82
+ self.kernel_size = kernel_size
83
+ self.p_dropout = p_dropout
84
+ self.emb_phone = nn.Linear(768, hidden_channels)
85
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
+ if f0:
87
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
+ self.encoder = attentions.Encoder(
89
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
+ )
91
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92
+
93
+ def forward(self, phone, pitch, lengths):
94
+ if pitch is None:
95
+ x = self.emb_phone(phone)
96
+ else:
97
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
98
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99
+ x = self.lrelu(x)
100
+ x = torch.transpose(x, 1, -1) # [b, h, t]
101
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102
+ x.dtype
103
+ )
104
+ x = self.encoder(x * x_mask, x_mask)
105
+ stats = self.proj(x) * x_mask
106
+
107
+ m, logs = torch.split(stats, self.out_channels, dim=1)
108
+ return m, logs, x_mask
109
+
110
+
111
+ class ResidualCouplingBlock(nn.Module):
112
+ def __init__(
113
+ self,
114
+ channels,
115
+ hidden_channels,
116
+ kernel_size,
117
+ dilation_rate,
118
+ n_layers,
119
+ n_flows=4,
120
+ gin_channels=0,
121
+ ):
122
+ super().__init__()
123
+ self.channels = channels
124
+ self.hidden_channels = hidden_channels
125
+ self.kernel_size = kernel_size
126
+ self.dilation_rate = dilation_rate
127
+ self.n_layers = n_layers
128
+ self.n_flows = n_flows
129
+ self.gin_channels = gin_channels
130
+
131
+ self.flows = nn.ModuleList()
132
+ for i in range(n_flows):
133
+ self.flows.append(
134
+ modules.ResidualCouplingLayer(
135
+ channels,
136
+ hidden_channels,
137
+ kernel_size,
138
+ dilation_rate,
139
+ n_layers,
140
+ gin_channels=gin_channels,
141
+ mean_only=True,
142
+ )
143
+ )
144
+ self.flows.append(modules.Flip())
145
+
146
+ def forward(self, x, x_mask, g=None, reverse=False):
147
+ if not reverse:
148
+ for flow in self.flows:
149
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
150
+ else:
151
+ for flow in reversed(self.flows):
152
+ x = flow(x, x_mask, g=g, reverse=reverse)
153
+ return x
154
+
155
+ def remove_weight_norm(self):
156
+ for i in range(self.n_flows):
157
+ self.flows[i * 2].remove_weight_norm()
158
+
159
+
160
+ class PosteriorEncoder(nn.Module):
161
+ def __init__(
162
+ self,
163
+ in_channels,
164
+ out_channels,
165
+ hidden_channels,
166
+ kernel_size,
167
+ dilation_rate,
168
+ n_layers,
169
+ gin_channels=0,
170
+ ):
171
+ super().__init__()
172
+ self.in_channels = in_channels
173
+ self.out_channels = out_channels
174
+ self.hidden_channels = hidden_channels
175
+ self.kernel_size = kernel_size
176
+ self.dilation_rate = dilation_rate
177
+ self.n_layers = n_layers
178
+ self.gin_channels = gin_channels
179
+
180
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
181
+ self.enc = modules.WN(
182
+ hidden_channels,
183
+ kernel_size,
184
+ dilation_rate,
185
+ n_layers,
186
+ gin_channels=gin_channels,
187
+ )
188
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
189
+
190
+ def forward(self, x, x_lengths, g=None):
191
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
192
+ x.dtype
193
+ )
194
+ x = self.pre(x) * x_mask
195
+ x = self.enc(x, x_mask, g=g)
196
+ stats = self.proj(x) * x_mask
197
+ m, logs = torch.split(stats, self.out_channels, dim=1)
198
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
199
+ return z, m, logs, x_mask
200
+
201
+ def remove_weight_norm(self):
202
+ self.enc.remove_weight_norm()
203
+
204
+
205
+ class Generator(torch.nn.Module):
206
+ def __init__(
207
+ self,
208
+ initial_channel,
209
+ resblock,
210
+ resblock_kernel_sizes,
211
+ resblock_dilation_sizes,
212
+ upsample_rates,
213
+ upsample_initial_channel,
214
+ upsample_kernel_sizes,
215
+ gin_channels=0,
216
+ ):
217
+ super(Generator, self).__init__()
218
+ self.num_kernels = len(resblock_kernel_sizes)
219
+ self.num_upsamples = len(upsample_rates)
220
+ self.conv_pre = Conv1d(
221
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
222
+ )
223
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
224
+
225
+ self.ups = nn.ModuleList()
226
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
227
+ self.ups.append(
228
+ weight_norm(
229
+ ConvTranspose1d(
230
+ upsample_initial_channel // (2**i),
231
+ upsample_initial_channel // (2 ** (i + 1)),
232
+ k,
233
+ u,
234
+ padding=(k - u) // 2,
235
+ )
236
+ )
237
+ )
238
+
239
+ self.resblocks = nn.ModuleList()
240
+ for i in range(len(self.ups)):
241
+ ch = upsample_initial_channel // (2 ** (i + 1))
242
+ for j, (k, d) in enumerate(
243
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
244
+ ):
245
+ self.resblocks.append(resblock(ch, k, d))
246
+
247
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
248
+ self.ups.apply(init_weights)
249
+
250
+ if gin_channels != 0:
251
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
252
+
253
+ def forward(self, x, g=None):
254
+ x = self.conv_pre(x)
255
+ if g is not None:
256
+ x = x + self.cond(g)
257
+
258
+ for i in range(self.num_upsamples):
259
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
260
+ x = self.ups[i](x)
261
+ xs = None
262
+ for j in range(self.num_kernels):
263
+ if xs is None:
264
+ xs = self.resblocks[i * self.num_kernels + j](x)
265
+ else:
266
+ xs += self.resblocks[i * self.num_kernels + j](x)
267
+ x = xs / self.num_kernels
268
+ x = F.leaky_relu(x)
269
+ x = self.conv_post(x)
270
+ x = torch.tanh(x)
271
+
272
+ return x
273
+
274
+ def remove_weight_norm(self):
275
+ for l in self.ups:
276
+ remove_weight_norm(l)
277
+ for l in self.resblocks:
278
+ l.remove_weight_norm()
279
+
280
+
281
+ class SineGen(torch.nn.Module):
282
+ """Definition of sine generator
283
+ SineGen(samp_rate, harmonic_num = 0,
284
+ sine_amp = 0.1, noise_std = 0.003,
285
+ voiced_threshold = 0,
286
+ flag_for_pulse=False)
287
+ samp_rate: sampling rate in Hz
288
+ harmonic_num: number of harmonic overtones (default 0)
289
+ sine_amp: amplitude of sine waveform (default 0.1)
290
+ noise_std: std of Gaussian noise (default 0.003)
291
+ voiced_threshold: F0 threshold for U/V classification (default 0)
292
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
293
+ Note: when flag_for_pulse is True, the first time step of a voiced
294
+ segment is always sin(np.pi) or cos(0)
295
+ """
296
+
297
+ def __init__(
298
+ self,
299
+ samp_rate,
300
+ harmonic_num=0,
301
+ sine_amp=0.1,
302
+ noise_std=0.003,
303
+ voiced_threshold=0,
304
+ flag_for_pulse=False,
305
+ ):
306
+ super(SineGen, self).__init__()
307
+ self.sine_amp = sine_amp
308
+ self.noise_std = noise_std
309
+ self.harmonic_num = harmonic_num
310
+ self.dim = self.harmonic_num + 1
311
+ self.sampling_rate = samp_rate
312
+ self.voiced_threshold = voiced_threshold
313
+
314
+ def _f02uv(self, f0):
315
+ # generate uv signal
316
+ uv = torch.ones_like(f0)
317
+ uv = uv * (f0 > self.voiced_threshold)
318
+ if uv.device.type == "privateuseone": # for DirectML
319
+ uv = uv.float()
320
+ return uv
321
+
322
+ def forward(self, f0, upp):
323
+ """sine_tensor, uv = forward(f0)
324
+ input F0: tensor(batchsize=1, length, dim=1)
325
+ f0 for unvoiced steps should be 0
326
+ output sine_tensor: tensor(batchsize=1, length, dim)
327
+ output uv: tensor(batchsize=1, length, 1)
328
+ """
329
+ with torch.no_grad():
330
+ f0 = f0[:, None].transpose(1, 2)
331
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
332
+ # fundamental component
333
+ f0_buf[:, :, 0] = f0[:, :, 0]
334
+ for idx in np.arange(self.harmonic_num):
335
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
336
+ idx + 2
337
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
338
+ rad_values = (f0_buf / self.sampling_rate) % 1 # the %1 means the harmonic multiplication cannot be optimized away afterwards
339
+ rand_ini = torch.rand(
340
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
341
+ )
342
+ rand_ini[:, 0] = 0
343
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
344
+ tmp_over_one = torch.cumsum(rad_values, 1) # % 1 # taking %1 here would prevent the following cumsum from being optimized
345
+ tmp_over_one *= upp
346
+ tmp_over_one = F.interpolate(
347
+ tmp_over_one.transpose(2, 1),
348
+ scale_factor=upp,
349
+ mode="linear",
350
+ align_corners=True,
351
+ ).transpose(2, 1)
352
+ rad_values = F.interpolate(
353
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
354
+ ).transpose(
355
+ 2, 1
356
+ ) #######
357
+ tmp_over_one %= 1
358
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
359
+ cumsum_shift = torch.zeros_like(rad_values)
360
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
361
+ sine_waves = torch.sin(
362
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
363
+ )
364
+ sine_waves = sine_waves * self.sine_amp
365
+ uv = self._f02uv(f0)
366
+ uv = F.interpolate(
367
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
368
+ ).transpose(2, 1)
369
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
370
+ noise = noise_amp * torch.randn_like(sine_waves)
371
+ sine_waves = sine_waves * uv + noise
372
+ return sine_waves, uv, noise
373
+
374
+
375
+ class SourceModuleHnNSF(torch.nn.Module):
376
+ """SourceModule for hn-nsf
377
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
378
+ add_noise_std=0.003, voiced_threshod=0)
379
+ sampling_rate: sampling_rate in Hz
380
+ harmonic_num: number of harmonic above F0 (default: 0)
381
+ sine_amp: amplitude of sine source signal (default: 0.1)
382
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
383
+ note that amplitude of noise in unvoiced is decided
384
+ by sine_amp
385
+ voiced_threshold: threhold to set U/V given F0 (default: 0)
386
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
387
+ F0_sampled (batchsize, length, 1)
388
+ Sine_source (batchsize, length, 1)
389
+ noise_source (batchsize, length 1)
390
+ uv (batchsize, length, 1)
391
+ """
392
+
393
+ def __init__(
394
+ self,
395
+ sampling_rate,
396
+ harmonic_num=0,
397
+ sine_amp=0.1,
398
+ add_noise_std=0.003,
399
+ voiced_threshod=0,
400
+ is_half=True,
401
+ ):
402
+ super(SourceModuleHnNSF, self).__init__()
403
+
404
+ self.sine_amp = sine_amp
405
+ self.noise_std = add_noise_std
406
+ self.is_half = is_half
407
+ # to produce sine waveforms
408
+ self.l_sin_gen = SineGen(
409
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
410
+ )
411
+
412
+ # to merge source harmonics into a single excitation
413
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
414
+ self.l_tanh = torch.nn.Tanh()
415
+
416
+ def forward(self, x, upp=None):
417
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
418
+ if self.is_half:
419
+ sine_wavs = sine_wavs.half()
420
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
421
+ return sine_merge, None, None # noise, uv
422
+
423
+
424
+ class GeneratorNSF(torch.nn.Module):
425
+ def __init__(
426
+ self,
427
+ initial_channel,
428
+ resblock,
429
+ resblock_kernel_sizes,
430
+ resblock_dilation_sizes,
431
+ upsample_rates,
432
+ upsample_initial_channel,
433
+ upsample_kernel_sizes,
434
+ gin_channels,
435
+ sr,
436
+ is_half=False,
437
+ ):
438
+ super(GeneratorNSF, self).__init__()
439
+ self.num_kernels = len(resblock_kernel_sizes)
440
+ self.num_upsamples = len(upsample_rates)
441
+
442
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
443
+ self.m_source = SourceModuleHnNSF(
444
+ sampling_rate=sr, harmonic_num=0, is_half=is_half
445
+ )
446
+ self.noise_convs = nn.ModuleList()
447
+ self.conv_pre = Conv1d(
448
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
449
+ )
450
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
451
+
452
+ self.ups = nn.ModuleList()
453
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
454
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
455
+ self.ups.append(
456
+ weight_norm(
457
+ ConvTranspose1d(
458
+ upsample_initial_channel // (2**i),
459
+ upsample_initial_channel // (2 ** (i + 1)),
460
+ k,
461
+ u,
462
+ padding=(k - u) // 2,
463
+ )
464
+ )
465
+ )
466
+ if i + 1 < len(upsample_rates):
467
+ stride_f0 = np.prod(upsample_rates[i + 1 :])
468
+ self.noise_convs.append(
469
+ Conv1d(
470
+ 1,
471
+ c_cur,
472
+ kernel_size=stride_f0 * 2,
473
+ stride=stride_f0,
474
+ padding=stride_f0 // 2,
475
+ )
476
+ )
477
+ else:
478
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
479
+
480
+ self.resblocks = nn.ModuleList()
481
+ for i in range(len(self.ups)):
482
+ ch = upsample_initial_channel // (2 ** (i + 1))
483
+ for j, (k, d) in enumerate(
484
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
485
+ ):
486
+ self.resblocks.append(resblock(ch, k, d))
487
+
488
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
489
+ self.ups.apply(init_weights)
490
+
491
+ if gin_channels != 0:
492
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
493
+
494
+ self.upp = np.prod(upsample_rates)
495
+
496
+ def forward(self, x, f0, g=None):
497
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
498
+ har_source = har_source.transpose(1, 2)
499
+ x = self.conv_pre(x)
500
+ if g is not None:
501
+ x = x + self.cond(g)
502
+
503
+ for i in range(self.num_upsamples):
504
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
505
+ x = self.ups[i](x)
506
+ x_source = self.noise_convs[i](har_source)
507
+ x = x + x_source
508
+ xs = None
509
+ for j in range(self.num_kernels):
510
+ if xs is None:
511
+ xs = self.resblocks[i * self.num_kernels + j](x)
512
+ else:
513
+ xs += self.resblocks[i * self.num_kernels + j](x)
514
+ x = xs / self.num_kernels
515
+ x = F.leaky_relu(x)
516
+ x = self.conv_post(x)
517
+ x = torch.tanh(x)
518
+ return x
519
+
520
+ def remove_weight_norm(self):
521
+ for l in self.ups:
522
+ remove_weight_norm(l)
523
+ for l in self.resblocks:
524
+ l.remove_weight_norm()
525
+
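In the upsampling loop above, each noise_convs entry downsamples the sample-rate harmonic source to the resolution of its stage: stride_f0 is the product of the upsample rates still to come, with kernel_size = 2 * stride and padding = stride // 2, and the last stage uses a kernel-1 conv. A quick sketch of the resulting strides, assuming the [10, 10, 2, 2] rates from this commit's configs/40k.json:

```python
# Replays GeneratorNSF's noise_convs stride arithmetic for one assumed config.
import numpy as np

upsample_rates = [10, 10, 2, 2]  # assumption: 40k config
for i in range(len(upsample_rates)):
    if i + 1 < len(upsample_rates):
        stride_f0 = int(np.prod(upsample_rates[i + 1 :]))
        print(i, f"Conv1d(kernel_size={stride_f0 * 2}, stride={stride_f0})")
    else:
        print(i, "Conv1d(kernel_size=1)")
# 0 Conv1d(kernel_size=80, stride=40)
# 1 Conv1d(kernel_size=8, stride=4)
# 2 Conv1d(kernel_size=4, stride=2)
# 3 Conv1d(kernel_size=1)
```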
526
+
527
+ sr2sr = {
528
+ "32k": 32000,
529
+ "40k": 40000,
530
+ "48k": 48000,
531
+ }
532
+
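The synthesizers below accept sr either as one of these keys or as a raw integer rate; a tiny sketch of that dual handling (resolve_sr is a hypothetical helper, not part of this file):

```python
sr2sr = {"32k": 32000, "40k": 40000, "48k": 48000}  # mirrors the dict above

def resolve_sr(sr):
    # Same check the synthesizer constructors perform inline.
    return sr2sr[sr] if isinstance(sr, str) else sr

assert resolve_sr("40k") == 40000
assert resolve_sr(48000) == 48000
```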
533
+
534
+ class SynthesizerTrnMs256NSFsid(nn.Module):
535
+ def __init__(
536
+ self,
537
+ spec_channels,
538
+ segment_size,
539
+ inter_channels,
540
+ hidden_channels,
541
+ filter_channels,
542
+ n_heads,
543
+ n_layers,
544
+ kernel_size,
545
+ p_dropout,
546
+ resblock,
547
+ resblock_kernel_sizes,
548
+ resblock_dilation_sizes,
549
+ upsample_rates,
550
+ upsample_initial_channel,
551
+ upsample_kernel_sizes,
552
+ spk_embed_dim,
553
+ gin_channels,
554
+ sr,
555
+ **kwargs
556
+ ):
557
+ super().__init__()
558
+ if isinstance(sr, str):
559
+ sr = sr2sr[sr]
560
+ self.spec_channels = spec_channels
561
+ self.inter_channels = inter_channels
562
+ self.hidden_channels = hidden_channels
563
+ self.filter_channels = filter_channels
564
+ self.n_heads = n_heads
565
+ self.n_layers = n_layers
566
+ self.kernel_size = kernel_size
567
+ self.p_dropout = p_dropout
568
+ self.resblock = resblock
569
+ self.resblock_kernel_sizes = resblock_kernel_sizes
570
+ self.resblock_dilation_sizes = resblock_dilation_sizes
571
+ self.upsample_rates = upsample_rates
572
+ self.upsample_initial_channel = upsample_initial_channel
573
+ self.upsample_kernel_sizes = upsample_kernel_sizes
574
+ self.segment_size = segment_size
575
+ self.gin_channels = gin_channels
576
+ # self.hop_length = hop_length#
577
+ self.spk_embed_dim = spk_embed_dim
578
+ self.enc_p = TextEncoder256(
579
+ inter_channels,
580
+ hidden_channels,
581
+ filter_channels,
582
+ n_heads,
583
+ n_layers,
584
+ kernel_size,
585
+ p_dropout,
586
+ )
587
+ self.dec = GeneratorNSF(
588
+ inter_channels,
589
+ resblock,
590
+ resblock_kernel_sizes,
591
+ resblock_dilation_sizes,
592
+ upsample_rates,
593
+ upsample_initial_channel,
594
+ upsample_kernel_sizes,
595
+ gin_channels=gin_channels,
596
+ sr=sr,
597
+ is_half=kwargs["is_half"],
598
+ )
599
+ self.enc_q = PosteriorEncoder(
600
+ spec_channels,
601
+ inter_channels,
602
+ hidden_channels,
603
+ 5,
604
+ 1,
605
+ 16,
606
+ gin_channels=gin_channels,
607
+ )
608
+ self.flow = ResidualCouplingBlock(
609
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
610
+ )
611
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
612
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
613
+
614
+ def remove_weight_norm(self):
615
+ self.dec.remove_weight_norm()
616
+ self.flow.remove_weight_norm()
617
+ self.enc_q.remove_weight_norm()
618
+
619
+ def forward(
620
+ self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
621
+ ):  # here ds is the speaker id, shape [bs, 1]
622
+ # print(1,pitch.shape)#[bs,t]
623
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis, broadcast later
624
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
625
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
626
+ z_p = self.flow(z, y_mask, g=g)
627
+ z_slice, ids_slice = commons.rand_slice_segments(
628
+ z, y_lengths, self.segment_size
629
+ )
630
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
631
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
632
+ # print(-2,pitchf.shape,z_slice.shape)
633
+ o = self.dec(z_slice, pitchf, g=g)
634
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
635
+
636
+ def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
637
+ g = self.emb_g(sid).unsqueeze(-1)
638
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
639
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
640
+ if rate:
641
+ head = int(z_p.shape[2] * rate)
642
+ z_p = z_p[:, :, -head:]
643
+ x_mask = x_mask[:, :, -head:]
644
+ nsff0 = nsff0[:, -head:]
645
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
646
+ o = self.dec(z * x_mask, nsff0, g=g)
647
+ return o, x_mask, (z, z_p, m_p, logs_p)
648
+
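infer()'s optional rate argument trims decoding to the trailing fraction of latent frames, which is what makes low-latency partial synthesis cheap. A standalone sketch of that slice (tensor sizes are made up; 192 channels is only an assumption):

```python
# The tail-slice fast path from infer(), on a dummy latent.
import torch

z_p = torch.randn(1, 192, 300)   # (batch, inter_channels, frames), assumed sizes
rate = 0.5
head = int(z_p.shape[2] * rate)  # keep the most recent 150 frames
z_p = z_p[:, :, -head:]
print(z_p.shape)                 # torch.Size([1, 192, 150])
```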
649
+
650
+ class SynthesizerTrnMs768NSFsid(nn.Module):
651
+ def __init__(
652
+ self,
653
+ spec_channels,
654
+ segment_size,
655
+ inter_channels,
656
+ hidden_channels,
657
+ filter_channels,
658
+ n_heads,
659
+ n_layers,
660
+ kernel_size,
661
+ p_dropout,
662
+ resblock,
663
+ resblock_kernel_sizes,
664
+ resblock_dilation_sizes,
665
+ upsample_rates,
666
+ upsample_initial_channel,
667
+ upsample_kernel_sizes,
668
+ spk_embed_dim,
669
+ gin_channels,
670
+ sr,
671
+ **kwargs
672
+ ):
673
+ super().__init__()
674
+ if isinstance(sr, str):
675
+ sr = sr2sr[sr]
676
+ self.spec_channels = spec_channels
677
+ self.inter_channels = inter_channels
678
+ self.hidden_channels = hidden_channels
679
+ self.filter_channels = filter_channels
680
+ self.n_heads = n_heads
681
+ self.n_layers = n_layers
682
+ self.kernel_size = kernel_size
683
+ self.p_dropout = p_dropout
684
+ self.resblock = resblock
685
+ self.resblock_kernel_sizes = resblock_kernel_sizes
686
+ self.resblock_dilation_sizes = resblock_dilation_sizes
687
+ self.upsample_rates = upsample_rates
688
+ self.upsample_initial_channel = upsample_initial_channel
689
+ self.upsample_kernel_sizes = upsample_kernel_sizes
690
+ self.segment_size = segment_size
691
+ self.gin_channels = gin_channels
692
+ # self.hop_length = hop_length#
693
+ self.spk_embed_dim = spk_embed_dim
694
+ self.enc_p = TextEncoder768(
695
+ inter_channels,
696
+ hidden_channels,
697
+ filter_channels,
698
+ n_heads,
699
+ n_layers,
700
+ kernel_size,
701
+ p_dropout,
702
+ )
703
+ self.dec = GeneratorNSF(
704
+ inter_channels,
705
+ resblock,
706
+ resblock_kernel_sizes,
707
+ resblock_dilation_sizes,
708
+ upsample_rates,
709
+ upsample_initial_channel,
710
+ upsample_kernel_sizes,
711
+ gin_channels=gin_channels,
712
+ sr=sr,
713
+ is_half=kwargs["is_half"],
714
+ )
715
+ self.enc_q = PosteriorEncoder(
716
+ spec_channels,
717
+ inter_channels,
718
+ hidden_channels,
719
+ 5,
720
+ 1,
721
+ 16,
722
+ gin_channels=gin_channels,
723
+ )
724
+ self.flow = ResidualCouplingBlock(
725
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
726
+ )
727
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
728
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
729
+
730
+ def remove_weight_norm(self):
731
+ self.dec.remove_weight_norm()
732
+ self.flow.remove_weight_norm()
733
+ self.enc_q.remove_weight_norm()
734
+
735
+ def forward(
736
+ self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
737
+ ):  # here ds is the speaker id, shape [bs, 1]
738
+ # print(1,pitch.shape)#[bs,t]
739
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis, broadcast later
740
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
741
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
742
+ z_p = self.flow(z, y_mask, g=g)
743
+ z_slice, ids_slice = commons.rand_slice_segments(
744
+ z, y_lengths, self.segment_size
745
+ )
746
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
747
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
748
+ # print(-2,pitchf.shape,z_slice.shape)
749
+ o = self.dec(z_slice, pitchf, g=g)
750
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
751
+
752
+ def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
753
+ g = self.emb_g(sid).unsqueeze(-1)
754
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
755
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
756
+ if rate:
757
+ head = int(z_p.shape[2] * rate)
758
+ z_p = z_p[:, :, -head:]
759
+ x_mask = x_mask[:, :, -head:]
760
+ nsff0 = nsff0[:, -head:]
761
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
762
+ o = self.dec(z * x_mask, nsff0, g=g)
763
+ return o, x_mask, (z, z_p, m_p, logs_p)
764
+
765
+
766
+ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
767
+ def __init__(
768
+ self,
769
+ spec_channels,
770
+ segment_size,
771
+ inter_channels,
772
+ hidden_channels,
773
+ filter_channels,
774
+ n_heads,
775
+ n_layers,
776
+ kernel_size,
777
+ p_dropout,
778
+ resblock,
779
+ resblock_kernel_sizes,
780
+ resblock_dilation_sizes,
781
+ upsample_rates,
782
+ upsample_initial_channel,
783
+ upsample_kernel_sizes,
784
+ spk_embed_dim,
785
+ gin_channels,
786
+ sr=None,
787
+ **kwargs
788
+ ):
789
+ super().__init__()
790
+ self.spec_channels = spec_channels
791
+ self.inter_channels = inter_channels
792
+ self.hidden_channels = hidden_channels
793
+ self.filter_channels = filter_channels
794
+ self.n_heads = n_heads
795
+ self.n_layers = n_layers
796
+ self.kernel_size = kernel_size
797
+ self.p_dropout = p_dropout
798
+ self.resblock = resblock
799
+ self.resblock_kernel_sizes = resblock_kernel_sizes
800
+ self.resblock_dilation_sizes = resblock_dilation_sizes
801
+ self.upsample_rates = upsample_rates
802
+ self.upsample_initial_channel = upsample_initial_channel
803
+ self.upsample_kernel_sizes = upsample_kernel_sizes
804
+ self.segment_size = segment_size
805
+ self.gin_channels = gin_channels
806
+ # self.hop_length = hop_length#
807
+ self.spk_embed_dim = spk_embed_dim
808
+ self.enc_p = TextEncoder256(
809
+ inter_channels,
810
+ hidden_channels,
811
+ filter_channels,
812
+ n_heads,
813
+ n_layers,
814
+ kernel_size,
815
+ p_dropout,
816
+ f0=False,
817
+ )
818
+ self.dec = Generator(
819
+ inter_channels,
820
+ resblock,
821
+ resblock_kernel_sizes,
822
+ resblock_dilation_sizes,
823
+ upsample_rates,
824
+ upsample_initial_channel,
825
+ upsample_kernel_sizes,
826
+ gin_channels=gin_channels,
827
+ )
828
+ self.enc_q = PosteriorEncoder(
829
+ spec_channels,
830
+ inter_channels,
831
+ hidden_channels,
832
+ 5,
833
+ 1,
834
+ 16,
835
+ gin_channels=gin_channels,
836
+ )
837
+ self.flow = ResidualCouplingBlock(
838
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
839
+ )
840
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
841
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
842
+
843
+ def remove_weight_norm(self):
844
+ self.dec.remove_weight_norm()
845
+ self.flow.remove_weight_norm()
846
+ self.enc_q.remove_weight_norm()
847
+
848
+ def forward(self, phone, phone_lengths, y, y_lengths, ds):  # here ds is the speaker id, shape [bs, 1]
849
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis, broadcast later
850
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
851
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
852
+ z_p = self.flow(z, y_mask, g=g)
853
+ z_slice, ids_slice = commons.rand_slice_segments(
854
+ z, y_lengths, self.segment_size
855
+ )
856
+ o = self.dec(z_slice, g=g)
857
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
858
+
859
+ def infer(self, phone, phone_lengths, sid, rate=None):
860
+ g = self.emb_g(sid).unsqueeze(-1)
861
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
862
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
863
+ if rate:
864
+ head = int(z_p.shape[2] * rate)
865
+ z_p = z_p[:, :, -head:]
866
+ x_mask = x_mask[:, :, -head:]
867
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
868
+ o = self.dec(z * x_mask, g=g)
869
+ return o, x_mask, (z, z_p, m_p, logs_p)
870
+
871
+
872
+ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
873
+ def __init__(
874
+ self,
875
+ spec_channels,
876
+ segment_size,
877
+ inter_channels,
878
+ hidden_channels,
879
+ filter_channels,
880
+ n_heads,
881
+ n_layers,
882
+ kernel_size,
883
+ p_dropout,
884
+ resblock,
885
+ resblock_kernel_sizes,
886
+ resblock_dilation_sizes,
887
+ upsample_rates,
888
+ upsample_initial_channel,
889
+ upsample_kernel_sizes,
890
+ spk_embed_dim,
891
+ gin_channels,
892
+ sr=None,
893
+ **kwargs
894
+ ):
895
+ super().__init__()
896
+ self.spec_channels = spec_channels
897
+ self.inter_channels = inter_channels
898
+ self.hidden_channels = hidden_channels
899
+ self.filter_channels = filter_channels
900
+ self.n_heads = n_heads
901
+ self.n_layers = n_layers
902
+ self.kernel_size = kernel_size
903
+ self.p_dropout = p_dropout
904
+ self.resblock = resblock
905
+ self.resblock_kernel_sizes = resblock_kernel_sizes
906
+ self.resblock_dilation_sizes = resblock_dilation_sizes
907
+ self.upsample_rates = upsample_rates
908
+ self.upsample_initial_channel = upsample_initial_channel
909
+ self.upsample_kernel_sizes = upsample_kernel_sizes
910
+ self.segment_size = segment_size
911
+ self.gin_channels = gin_channels
912
+ # self.hop_length = hop_length#
913
+ self.spk_embed_dim = spk_embed_dim
914
+ self.enc_p = TextEncoder768(
915
+ inter_channels,
916
+ hidden_channels,
917
+ filter_channels,
918
+ n_heads,
919
+ n_layers,
920
+ kernel_size,
921
+ p_dropout,
922
+ f0=False,
923
+ )
924
+ self.dec = Generator(
925
+ inter_channels,
926
+ resblock,
927
+ resblock_kernel_sizes,
928
+ resblock_dilation_sizes,
929
+ upsample_rates,
930
+ upsample_initial_channel,
931
+ upsample_kernel_sizes,
932
+ gin_channels=gin_channels,
933
+ )
934
+ self.enc_q = PosteriorEncoder(
935
+ spec_channels,
936
+ inter_channels,
937
+ hidden_channels,
938
+ 5,
939
+ 1,
940
+ 16,
941
+ gin_channels=gin_channels,
942
+ )
943
+ self.flow = ResidualCouplingBlock(
944
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
945
+ )
946
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
947
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
948
+
949
+ def remove_weight_norm(self):
950
+ self.dec.remove_weight_norm()
951
+ self.flow.remove_weight_norm()
952
+ self.enc_q.remove_weight_norm()
953
+
954
+ def forward(self, phone, phone_lengths, y, y_lengths, ds):  # here ds is the speaker id, shape [bs, 1]
955
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis, broadcast later
956
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
957
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
958
+ z_p = self.flow(z, y_mask, g=g)
959
+ z_slice, ids_slice = commons.rand_slice_segments(
960
+ z, y_lengths, self.segment_size
961
+ )
962
+ o = self.dec(z_slice, g=g)
963
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
964
+
965
+ def infer(self, phone, phone_lengths, sid, rate=None):
966
+ g = self.emb_g(sid).unsqueeze(-1)
967
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
968
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
969
+ if rate:
970
+ head = int(z_p.shape[2] * rate)
971
+ z_p = z_p[:, :, -head:]
972
+ x_mask = x_mask[:, :, -head:]
973
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
974
+ o = self.dec(z * x_mask, g=g)
975
+ return o, x_mask, (z, z_p, m_p, logs_p)
976
+
977
+
978
+ class MultiPeriodDiscriminator(torch.nn.Module):
979
+ def __init__(self, use_spectral_norm=False):
980
+ super(MultiPeriodDiscriminator, self).__init__()
981
+ periods = [2, 3, 5, 7, 11, 17]
982
+ # periods = [3, 5, 7, 11, 17, 23, 37]
983
+
984
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
985
+ discs = discs + [
986
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
987
+ ]
988
+ self.discriminators = nn.ModuleList(discs)
989
+
990
+ def forward(self, y, y_hat):
991
+ y_d_rs = [] #
992
+ y_d_gs = []
993
+ fmap_rs = []
994
+ fmap_gs = []
995
+ for i, d in enumerate(self.discriminators):
996
+ y_d_r, fmap_r = d(y)
997
+ y_d_g, fmap_g = d(y_hat)
998
+ # for j in range(len(fmap_r)):
999
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
1000
+ y_d_rs.append(y_d_r)
1001
+ y_d_gs.append(y_d_g)
1002
+ fmap_rs.append(fmap_r)
1003
+ fmap_gs.append(fmap_g)
1004
+
1005
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1006
+
1007
+
1008
+ class MultiPeriodDiscriminatorV2(torch.nn.Module):
1009
+ def __init__(self, use_spectral_norm=False):
1010
+ super(MultiPeriodDiscriminatorV2, self).__init__()
1011
+ # periods = [2, 3, 5, 7, 11, 17]
1012
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
1013
+
1014
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
1015
+ discs = discs + [
1016
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
1017
+ ]
1018
+ self.discriminators = nn.ModuleList(discs)
1019
+
1020
+ def forward(self, y, y_hat):
1021
+ y_d_rs = [] #
1022
+ y_d_gs = []
1023
+ fmap_rs = []
1024
+ fmap_gs = []
1025
+ for i, d in enumerate(self.discriminators):
1026
+ y_d_r, fmap_r = d(y)
1027
+ y_d_g, fmap_g = d(y_hat)
1028
+ # for j in range(len(fmap_r)):
1029
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
1030
+ y_d_rs.append(y_d_r)
1031
+ y_d_gs.append(y_d_g)
1032
+ fmap_rs.append(fmap_r)
1033
+ fmap_gs.append(fmap_g)
1034
+
1035
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1036
+
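The only difference between the two multi-period discriminators is the period set; both prepend a single DiscriminatorS, so the sub-discriminator counts work out as below:

```python
# Sub-discriminator counts for v1 vs v2 (period lists copied from above).
periods_v1 = [2, 3, 5, 7, 11, 17]
periods_v2 = [2, 3, 5, 7, 11, 17, 23, 37]
print(1 + len(periods_v1), 1 + len(periods_v2))  # 7 9
```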
1037
+
1038
+ class DiscriminatorS(torch.nn.Module):
1039
+ def __init__(self, use_spectral_norm=False):
1040
+ super(DiscriminatorS, self).__init__()
1041
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
1042
+ self.convs = nn.ModuleList(
1043
+ [
1044
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
1045
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
1046
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
1047
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
1048
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
1049
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
1050
+ ]
1051
+ )
1052
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
1053
+
1054
+ def forward(self, x):
1055
+ fmap = []
1056
+
1057
+ for l in self.convs:
1058
+ x = l(x)
1059
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
1060
+ fmap.append(x)
1061
+ x = self.conv_post(x)
1062
+ fmap.append(x)
1063
+ x = torch.flatten(x, 1, -1)
1064
+
1065
+ return x, fmap
1066
+
1067
+
1068
+ class DiscriminatorP(torch.nn.Module):
1069
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
1070
+ super(DiscriminatorP, self).__init__()
1071
+ self.period = period
1072
+ self.use_spectral_norm = use_spectral_norm
1073
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
1074
+ self.convs = nn.ModuleList(
1075
+ [
1076
+ norm_f(
1077
+ Conv2d(
1078
+ 1,
1079
+ 32,
1080
+ (kernel_size, 1),
1081
+ (stride, 1),
1082
+ padding=(get_padding(kernel_size, 1), 0),
1083
+ )
1084
+ ),
1085
+ norm_f(
1086
+ Conv2d(
1087
+ 32,
1088
+ 128,
1089
+ (kernel_size, 1),
1090
+ (stride, 1),
1091
+ padding=(get_padding(kernel_size, 1), 0),
1092
+ )
1093
+ ),
1094
+ norm_f(
1095
+ Conv2d(
1096
+ 128,
1097
+ 512,
1098
+ (kernel_size, 1),
1099
+ (stride, 1),
1100
+ padding=(get_padding(kernel_size, 1), 0),
1101
+ )
1102
+ ),
1103
+ norm_f(
1104
+ Conv2d(
1105
+ 512,
1106
+ 1024,
1107
+ (kernel_size, 1),
1108
+ (stride, 1),
1109
+ padding=(get_padding(kernel_size, 1), 0),
1110
+ )
1111
+ ),
1112
+ norm_f(
1113
+ Conv2d(
1114
+ 1024,
1115
+ 1024,
1116
+ (kernel_size, 1),
1117
+ 1,
1118
+ padding=(get_padding(kernel_size, 1), 0),
1119
+ )
1120
+ ),
1121
+ ]
1122
+ )
1123
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
1124
+
1125
+ def forward(self, x):
1126
+ fmap = []
1127
+
1128
+ # 1d to 2d
1129
+ b, c, t = x.shape
1130
+ if t % self.period != 0: # pad first
1131
+ n_pad = self.period - (t % self.period)
1132
+ x = F.pad(x, (0, n_pad), "reflect")
1133
+ t = t + n_pad
1134
+ x = x.view(b, c, t // self.period, self.period)
1135
+
1136
+ for l in self.convs:
1137
+ x = l(x)
1138
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
1139
+ fmap.append(x)
1140
+ x = self.conv_post(x)
1141
+ fmap.append(x)
1142
+ x = torch.flatten(x, 1, -1)
1143
+
1144
+ return x, fmap
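DiscriminatorP's "1d to 2d" step reflect-pads the waveform to a multiple of the period and folds it so each row of the 2-D view spans one period. A minimal sketch with a toy tensor (period and length chosen only for illustration):

```python
# The pad-and-fold from DiscriminatorP.forward, in isolation.
import torch
import torch.nn.functional as F

period = 5
x = torch.randn(1, 1, 13)                   # (b, c, t), t not divisible by period
n_pad = period - (x.shape[-1] % period)     # 2 samples of reflect padding
x = F.pad(x, (0, n_pad), "reflect")
x = x.view(1, 1, x.shape[-1] // period, period)
print(x.shape)                              # torch.Size([1, 1, 3, 5])
```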