nonoJDWAOIDAWKDA committed 15 days ago

Commit

1f5676f

verified ·

1 Parent(s): 33b0cb4

Upload StyleTTS2 checkpoint epoch_2nd_00014.pth with all inference components

Browse files

Files changed (31) hide show

.gitattributes +1 -34
README.md +84 -0
Utils/ASR/config.yml +29 -0
Utils/ASR/epoch_00080.pth +3 -0
Utils/ASR/layers.py +354 -0
Utils/ASR/models.py +186 -0
Utils/JDC/bst.t7 +3 -0
Utils/JDC/model.py +190 -0
Utils/PLBERT/config.yml +30 -0
Utils/PLBERT/step_1000000.t7 +3 -0
Utils/PLBERT/util.py +42 -0
bert.pth +3 -0
bert_encoder.pth +3 -0
checkpoint.pth +3 -0
config.json +202 -0
config.yml +66 -0
decoder.pth +3 -0
diffusion.pth +3 -0
models.py +713 -0
mpd.pth +3 -0
msd.pth +3 -0
pitch_extractor.pth +3 -0
predictor.pth +3 -0
predictor_encoder.pth +3 -0
style_encoder.pth +3 -0
text_aligner.pth +3 -0
text_encoder.pth +3 -0
text_utils.py +26 -0
training_metrics.png +0 -0
utils.py +74 -0
wd.pth +3 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,2 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
























1	*.pth filter=lfs diff=lfs merge=lfs -text
2	+ *.t7 filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,84 @@

+---
+language: en
+tags:
+- text-to-speech
+- StyleTTS2
+- speech-synthesis
+license: mit
+pipeline_tag: text-to-speech
+---
+# StyleTTS2 Fine-tuned Model
+This model is a fine-tuned version of StyleTTS2, containing all necessary components for inference.
+## Model Details
+- **Base Model:** StyleTTS2-LibriTTS
+- **Architecture:** StyleTTS2
+- **Task:** Text-to-Speech
+- **Last Checkpoint:** epoch_2nd_00014.pth
+## Training Details
+- **Total Epochs:** 30
+- **Completed Epochs:** 14
+- **Total Iterations:** 1169
+- **Batch Size:** 2
+- **Max Length:** 120
+- **Learning Rate:** 0.0001
+- **Final Validation Loss:** 0.418901
+## Model Components
+The repository includes all necessary components for inference:
+### Main Model Components:
+- bert.pth
+- bert_encoder.pth
+- predictor.pth
+- decoder.pth
+- text_encoder.pth
+- predictor_encoder.pth
+- style_encoder.pth
+- diffusion.pth
+- text_aligner.pth
+- pitch_extractor.pth
+- mpd.pth
+- msd.pth
+- wd.pth
+### Utility Components:
+- ASR (Automatic Speech Recognition)
+  - epoch_00080.pth
+  - config.yml
+  - models.py
+  - layers.py
+- JDC (F0 Prediction)
+  - bst.t7
+  - model.py
+- PLBERT
+  - step_1000000.t7
+  - config.yml
+  - util.py
+### Additional Files:
+- text_utils.py: Text preprocessing utilities
+- models.py: Model architecture definitions
+- utils.py: Utility functions
+- config.yml: Model configuration
+- config.json: Detailed configuration and training metrics
+## Training Metrics
+Training metrics visualization is available in training_metrics.png
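+## Directory Structure
+```
+├── Utils/
+│   ├── ASR/
+│   ├── JDC/
+│   └── PLBERT/
+├── *.pth            (model component checkpoints, stored at the repository root)
+├── config.yml       (model configuration, stored at the repository root)
+└── config.json      (detailed configuration and training metrics, stored at the repository root)
+```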
+## Usage Instructions
+1. Load the model using the provided config.yml
+2. Ensure all utility components (ASR, JDC, PLBERT) are in their respective directories
+3. Use text_utils.py for text preprocessing
+4. Follow the inference example in the StyleTTS2 documentation

Utils/ASR/config.yml ADDED Viewed

	@@ -0,0 +1,29 @@

+log_dir: "logs/20201006"
+save_freq: 5
+device: "cuda"
+epochs: 180
+batch_size: 64
+pretrained_model: ""
+train_data: "ASRDataset/train_list.txt"
+val_data: "ASRDataset/val_list.txt"
+dataset_params:
+  data_augmentation: false
+preprocess_parasm:
+  sr: 24000
+  spect_params:
+    n_fft: 2048
+    win_length: 1200
+    hop_length: 300
+  mel_params:
+    n_mels: 80
+model_params:
+   input_dim: 80
+   hidden_dim: 256
+   n_token: 178
+   token_embedding_dim: 512
+optimizer_params:
+  lr: 0.0005

Utils/ASR/epoch_00080.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fedd55a1234b0c56e1e8b509c74edf3a5e2f27106a66038a4a946047a775bd6c
+size 94552811

Utils/ASR/layers.py ADDED Viewed

	@@ -0,0 +1,354 @@

+import math
+import torch
+from torch import nn
+from typing import Optional, Any
+from torch import Tensor
+import torch.nn.functional as F
+import torchaudio
+import torchaudio.functional as audio_F
+import random
+random.seed(0)
+def _get_activation_fn(activ):
+    if activ == 'relu':
+        return nn.ReLU()
+    elif activ == 'lrelu':
+        return nn.LeakyReLU(0.2)
+    elif activ == 'swish':
+        return lambda x: x*torch.sigmoid(x)
+    else:
+        raise RuntimeError('Unexpected activ type %s, expected [relu, lrelu, swish]' % activ)
+class LinearNorm(torch.nn.Module):
+    """Linear layer whose weight is Xavier-uniform initialised.
+
+    Args:
+        in_dim: Size of the last dimension of the input.
+        out_dim: Size of the last dimension of the output.
+        bias: Whether the wrapped ``nn.Linear`` has a bias term.
+        w_init_gain: Nonlinearity name passed to
+            ``torch.nn.init.calculate_gain`` to choose the Xavier gain.
+    """
+    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
+        super(LinearNorm, self).__init__()
+        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
+        # Only the weight is re-initialised; the bias keeps nn.Linear's default.
+        torch.nn.init.xavier_uniform_(
+            self.linear_layer.weight,
+            gain=torch.nn.init.calculate_gain(w_init_gain))
+    def forward(self, x):
+        """Apply the wrapped linear layer to ``x`` and return the result."""
+        return self.linear_layer(x)
+class ConvNorm(torch.nn.Module):
+    """1-D convolution whose weight is Xavier-uniform initialised.
+
+    Args:
+        in_channels: Number of input channels.
+        out_channels: Number of output channels.
+        kernel_size: Convolution kernel width.
+        stride: Convolution stride.
+        padding: Padding applied to both ends. When ``None`` it is derived
+            from ``kernel_size`` and ``dilation`` (odd kernels only).
+        dilation: Convolution dilation.
+        bias: Whether the convolution has a bias term.
+        w_init_gain: Nonlinearity name passed to
+            ``torch.nn.init.calculate_gain``.
+        param: Optional parameter forwarded to ``calculate_gain``.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
+                 padding=None, dilation=1, bias=True, w_init_gain='linear', param=None):
+        super(ConvNorm, self).__init__()
+        if padding is None:
+            # The derived padding is only symmetric for odd kernel sizes.
+            assert(kernel_size % 2 == 1)
+            padding = int(dilation * (kernel_size - 1) / 2)
+        self.conv = torch.nn.Conv1d(in_channels, out_channels,
+                                    kernel_size=kernel_size, stride=stride,
+                                    padding=padding, dilation=dilation,
+                                    bias=bias)
+        torch.nn.init.xavier_uniform_(
+            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain, param=param))
+    def forward(self, signal):
+        """Convolve ``signal`` and return the result."""
+        conv_signal = self.conv(signal)
+        return conv_signal
+class CausualConv(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=1, dilation=1, bias=True, w_init_gain='linear', param=None):
+        super(CausualConv, self).__init__()
+        if padding is None:
+            assert(kernel_size % 2 == 1)
+            padding = int(dilation * (kernel_size - 1) / 2) * 2
+        else:
+            self.padding = padding * 2
+        self.conv = nn.Conv1d(in_channels, out_channels,
+                              kernel_size=kernel_size, stride=stride,
+                              padding=self.padding,
+                              dilation=dilation,
+                              bias=bias)
+        torch.nn.init.xavier_uniform_(
+            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain, param=param))
+    def forward(self, x):
+        x = self.conv(x)
+        x = x[:, :, :-self.padding]
+        return x
+class CausualBlock(nn.Module):
+    """Stack of residual causal-convolution sub-blocks.
+
+    Sub-block ``i`` uses dilation ``3**i`` in its first convolution.
+
+    Args:
+        hidden_dim: Channel count, kept constant through the block.
+        n_conv: Number of residual sub-blocks.
+        dropout_p: Dropout probability used inside each sub-block.
+        activ: Activation name understood by ``_get_activation_fn``.
+    """
+    def __init__(self, hidden_dim, n_conv=3, dropout_p=0.2, activ='lrelu'):
+        super(CausualBlock, self).__init__()
+        self.blocks = nn.ModuleList([
+            self._get_conv(hidden_dim, dilation=3**i, activ=activ, dropout_p=dropout_p)
+            for i in range(n_conv)])
+    def forward(self, x):
+        """Run every sub-block with a residual connection around it."""
+        for block in self.blocks:
+            res = x
+            x = block(x)
+            # In-place add onto the sub-block output (not onto the input).
+            x += res
+        return x
+    def _get_conv(self, hidden_dim, dilation, activ='lrelu', dropout_p=0.2):
+        """Build one sub-block: dilated conv, act, BatchNorm, dropout, conv, act, dropout."""
+        layers = [
+            CausualConv(hidden_dim, hidden_dim, kernel_size=3, padding=dilation, dilation=dilation),
+            _get_activation_fn(activ),
+            nn.BatchNorm1d(hidden_dim),
+            nn.Dropout(p=dropout_p),
+            CausualConv(hidden_dim, hidden_dim, kernel_size=3, padding=1, dilation=1),
+            _get_activation_fn(activ),
+            nn.Dropout(p=dropout_p)
+        ]
+        return nn.Sequential(*layers)
+class ConvBlock(nn.Module):
+    """Stack of residual (non-causal) convolution sub-blocks with GroupNorm.
+
+    Sub-block ``i`` uses dilation ``3**i`` in its first convolution.
+
+    Args:
+        hidden_dim: Channel count, kept constant through the block. It must
+            be divisible by the 8 groups used by ``nn.GroupNorm``.
+        n_conv: Number of residual sub-blocks.
+        dropout_p: Dropout probability used inside each sub-block.
+        activ: Activation name understood by ``_get_activation_fn``.
+    """
+    def __init__(self, hidden_dim, n_conv=3, dropout_p=0.2, activ='relu'):
+        super().__init__()
+        # Must be set before the list comprehension: _get_conv reads it.
+        self._n_groups = 8
+        self.blocks = nn.ModuleList([
+            self._get_conv(hidden_dim, dilation=3**i, activ=activ, dropout_p=dropout_p)
+            for i in range(n_conv)])
+    def forward(self, x):
+        """Run every sub-block with a residual connection around it."""
+        for block in self.blocks:
+            res = x
+            x = block(x)
+            # In-place add onto the sub-block output (not onto the input).
+            x += res
+        return x
+    def _get_conv(self, hidden_dim, dilation, activ='relu', dropout_p=0.2):
+        """Build one sub-block: dilated conv, act, GroupNorm, dropout, conv, act, dropout."""
+        layers = [
+            # padding == dilation with kernel_size 3 keeps the length unchanged.
+            ConvNorm(hidden_dim, hidden_dim, kernel_size=3, padding=dilation, dilation=dilation),
+            _get_activation_fn(activ),
+            nn.GroupNorm(num_groups=self._n_groups, num_channels=hidden_dim),
+            nn.Dropout(p=dropout_p),
+            ConvNorm(hidden_dim, hidden_dim, kernel_size=3, padding=1, dilation=1),
+            _get_activation_fn(activ),
+            nn.Dropout(p=dropout_p)
+        ]
+        return nn.Sequential(*layers)
+class LocationLayer(nn.Module):
+    """Location features for attention: conv over attention weights, then a projection.
+
+    Args:
+        attention_n_filters: Number of convolution output channels.
+        attention_kernel_size: Convolution kernel width.
+        attention_dim: Size of the projected location features.
+    """
+    def __init__(self, attention_n_filters, attention_kernel_size,
+                 attention_dim):
+        super(LocationLayer, self).__init__()
+        padding = int((attention_kernel_size - 1) / 2)
+        # Two input channels: the two stacked attention-weight tracks.
+        self.location_conv = ConvNorm(2, attention_n_filters,
+                                      kernel_size=attention_kernel_size,
+                                      padding=padding, bias=False, stride=1,
+                                      dilation=1)
+        self.location_dense = LinearNorm(attention_n_filters, attention_dim,
+                                         bias=False, w_init_gain='tanh')
+    def forward(self, attention_weights_cat):
+        """Convolve the stacked weights, move channels last, and project them."""
+        processed_attention = self.location_conv(attention_weights_cat)
+        # Swap the channel and time axes so the linear layer acts on channels.
+        processed_attention = processed_attention.transpose(1, 2)
+        processed_attention = self.location_dense(processed_attention)
+        return processed_attention
+class Attention(nn.Module):
+    """Additive attention with location features.
+
+    The energy is ``v(tanh(query + location + memory))``, softmax-normalised
+    over the time axis.
+
+    Args:
+        attention_rnn_dim: Size of the query vector.
+        embedding_dim: Size of each memory vector.
+        attention_dim: Size of the shared attention space.
+        attention_location_n_filters: Channels of the location convolution.
+        attention_location_kernel_size: Kernel width of the location convolution.
+    """
+    def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
+                 attention_location_n_filters, attention_location_kernel_size):
+        super(Attention, self).__init__()
+        self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
+                                      bias=False, w_init_gain='tanh')
+        # memory_layer is not used in this class's forward; callers apply it
+        # themselves to produce ``processed_memory``.
+        self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
+                                       w_init_gain='tanh')
+        self.v = LinearNorm(attention_dim, 1, bias=False)
+        self.location_layer = LocationLayer(attention_location_n_filters,
+                                            attention_location_kernel_size,
+                                            attention_dim)
+        # Masked positions get -inf so softmax assigns them zero weight.
+        self.score_mask_value = -float("inf")
+    def get_alignment_energies(self, query, processed_memory,
+                               attention_weights_cat):
+        """
+        Compute unnormalised attention energies.
+
+        PARAMS
+        ------
+        query: decoder output (batch, n_mel_channels * n_frames_per_step)
+        processed_memory: processed encoder outputs (B, T_in, attention_dim)
+        attention_weights_cat: two stacked attention-weight tracks
+            (B, 2, max_time). NOTE(review): the original docstrings disagree
+            on whether the order is (previous, cumulative) or the reverse;
+            confirm against the caller.
+        RETURNS
+        -------
+        alignment (batch, max_time)
+        """
+        # unsqueeze(1) adds a time axis so the query broadcasts over memory.
+        processed_query = self.query_layer(query.unsqueeze(1))
+        processed_attention_weights = self.location_layer(attention_weights_cat)
+        energies = self.v(torch.tanh(
+            processed_query + processed_attention_weights + processed_memory))
+        energies = energies.squeeze(-1)
+        return energies
+    def forward(self, attention_hidden_state, memory, processed_memory,
+                attention_weights_cat, mask):
+        """
+        Compute the attention context and weights for one decoding step.
+
+        PARAMS
+        ------
+        attention_hidden_state: attention rnn last output
+        memory: encoder outputs
+        processed_memory: processed encoder outputs
+        attention_weights_cat: previous and cumulative attention weights
+        mask: binary mask for padded data (True where padded), or None
+        RETURNS
+        -------
+        attention_context: weighted sum of ``memory`` over time
+        attention_weights: softmax-normalised weights (batch, max_time)
+        """
+        alignment = self.get_alignment_energies(
+            attention_hidden_state, processed_memory, attention_weights_cat)
+        if mask is not None:
+            # In-place fill through ``.data``, which is not tracked by autograd.
+            alignment.data.masked_fill_(mask, self.score_mask_value)
+        attention_weights = F.softmax(alignment, dim=1)
+        attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
+        attention_context = attention_context.squeeze(1)
+        return attention_context, attention_weights
+class ForwardAttentionV2(nn.Module):
+    """Location-sensitive attention with a forward (monotonic) recursion in log space.
+
+    Each step combines the previous log-alignment at positions ``t`` and
+    ``t - 1`` via ``logsumexp`` and adds the current energies.
+
+    Args:
+        attention_rnn_dim: Size of the query vector.
+        embedding_dim: Size of each memory vector.
+        attention_dim: Size of the shared attention space.
+        attention_location_n_filters: Channels of the location convolution.
+        attention_location_kernel_size: Kernel width of the location convolution.
+    """
+    def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
+                 attention_location_n_filters, attention_location_kernel_size):
+        super(ForwardAttentionV2, self).__init__()
+        self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
+                                      bias=False, w_init_gain='tanh')
+        # memory_layer is not used in this class's forward; callers apply it
+        # themselves to produce ``processed_memory``.
+        self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
+                                       w_init_gain='tanh')
+        self.v = LinearNorm(attention_dim, 1, bias=False)
+        self.location_layer = LocationLayer(attention_location_n_filters,
+                                            attention_location_kernel_size,
+                                            attention_dim)
+        # A large finite negative value (rather than -inf) is used both as the
+        # mask fill and as the left padding of the shifted log-alignment.
+        self.score_mask_value = -float(1e20)
+    def get_alignment_energies(self, query, processed_memory,
+                               attention_weights_cat):
+        """
+        Compute unnormalised attention energies.
+
+        PARAMS
+        ------
+        query: decoder output (batch, n_mel_channels * n_frames_per_step)
+        processed_memory: processed encoder outputs (B, T_in, attention_dim)
+        attention_weights_cat:  prev. and cumulative att weights (B, 2, max_time)
+        RETURNS
+        -------
+        alignment (batch, max_time)
+        """
+        # unsqueeze(1) adds a time axis so the query broadcasts over memory.
+        processed_query = self.query_layer(query.unsqueeze(1))
+        processed_attention_weights = self.location_layer(attention_weights_cat)
+        energies = self.v(torch.tanh(
+            processed_query + processed_attention_weights + processed_memory))
+        energies = energies.squeeze(-1)
+        return energies
+    def forward(self, attention_hidden_state, memory, processed_memory,
+                attention_weights_cat, mask, log_alpha):
+        """
+        Compute the attention context, weights and updated log-alignment.
+
+        PARAMS
+        ------
+        attention_hidden_state: attention rnn last output
+        memory: encoder outputs
+        processed_memory: processed encoder outputs
+        attention_weights_cat: previous and cumulative attention weights
+        mask: binary mask for padded data (True where padded), or None
+        log_alpha: log-alignment from the previous step, indexed as
+            (batch, max_time) by the slicing below
+        RETURNS
+        -------
+        attention_context: weighted sum of ``memory`` over time
+        attention_weights: softmax of the new log-alignment (batch, max_time)
+        log_alpha_new: unnormalised log-alignment to feed into the next step
+        """
+        log_energy = self.get_alignment_energies(
+            attention_hidden_state, processed_memory, attention_weights_cat)
+        if mask is not None:
+            # In-place fill through ``.data``, which is not tracked by autograd.
+            log_energy.data.masked_fill_(mask, self.score_mask_value)
+        # Build the two candidates for each position: stay (shift 0) or advance
+        # by one (shift 1). The shifted copy is left-padded with the mask value
+        # so that position 0 has no predecessor.
+        log_alpha_shift_padded = []
+        max_time = log_energy.size(1)
+        for sft in range(2):
+            shifted = log_alpha[:,:max_time-sft]
+            shift_padded = F.pad(shifted, (sft,0), 'constant', self.score_mask_value)
+            log_alpha_shift_padded.append(shift_padded.unsqueeze(2))
+        # logsumexp over the two candidates == log(alpha[t] + alpha[t-1]).
+        biased = torch.logsumexp(torch.cat(log_alpha_shift_padded,2), 2)
+        log_alpha_new = biased +  log_energy
+        attention_weights =  F.softmax(log_alpha_new, dim=1)
+        attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
+        attention_context = attention_context.squeeze(1)
+        return attention_context, attention_weights, log_alpha_new
+class PhaseShuffle2d(nn.Module):
+    """Circularly shift a 4-D tensor along its last axis by a random offset.
+
+    Args:
+        n: Maximum absolute shift; offsets are drawn from ``[-n, n]``.
+    """
+    def __init__(self, n=2):
+        super(PhaseShuffle2d, self).__init__()
+        self.n = n
+        # Private, fixed-seed generator: every instance produces the same
+        # sequence of offsets, independent of the global ``random`` state.
+        self.random = random.Random(1)
+    def forward(self, x, move=None):
+        """Rotate ``x`` along dim 3 by ``move`` (drawn at random when None)."""
+        # x.size = (B, C, M, L)
+        if move is None:
+            move = self.random.randint(-self.n, self.n)
+        if move == 0:
+            return x
+        else:
+            # Positive ``move`` sends the first ``move`` frames to the end;
+            # negative ``move`` brings the last ``|move|`` frames to the front.
+            left = x[:, :, :, :move]
+            right = x[:, :, :, move:]
+            shuffled = torch.cat([right, left], dim=3)
+        return shuffled
+class PhaseShuffle1d(nn.Module):
+    """Circularly shift a 3-D tensor along its last axis by a random offset.
+
+    Args:
+        n: Maximum absolute shift; offsets are drawn from ``[-n, n]``.
+    """
+    def __init__(self, n=2):
+        super(PhaseShuffle1d, self).__init__()
+        self.n = n
+        # Private, fixed-seed generator: every instance produces the same
+        # sequence of offsets, independent of the global ``random`` state.
+        self.random = random.Random(1)
+    def forward(self, x, move=None):
+        """Rotate ``x`` along dim 2 by ``move`` (drawn at random when None)."""
+        # x is indexed with three axes and concatenated on dim 2, i.e. (B, C, L)
+        if move is None:
+            move = self.random.randint(-self.n, self.n)
+        if move == 0:
+            return x
+        else:
+            # Positive ``move`` sends the first ``move`` frames to the end;
+            # negative ``move`` brings the last ``|move|`` frames to the front.
+            left = x[:, :,  :move]
+            right = x[:, :, move:]
+            shuffled = torch.cat([right, left], dim=2)
+        return shuffled
+class MFCC(nn.Module):
+    """Project a mel spectrogram onto DCT bases to obtain MFCC-style features.
+
+    Args:
+        n_mfcc: Number of output coefficients.
+        n_mels: Number of mel bins expected in the input.
+    """
+    def __init__(self, n_mfcc=40, n_mels=80):
+        super(MFCC, self).__init__()
+        self.n_mfcc = n_mfcc
+        self.n_mels = n_mels
+        self.norm = 'ortho'
+        dct_mat = audio_F.create_dct(self.n_mfcc, self.n_mels, self.norm)
+        # Registered as a buffer so it follows the module across devices and
+        # is stored in the state_dict without being a trainable parameter.
+        self.register_buffer('dct_mat', dct_mat)
+    def forward(self, mel_specgram):
+        """Return the DCT projection of ``mel_specgram``.
+
+        A 2-D input is treated as a single unbatched item: a leading axis is
+        added for the matmul and removed again before returning.
+        """
+        if len(mel_specgram.shape) == 2:
+            mel_specgram = mel_specgram.unsqueeze(0)
+            unsqueezed = True
+        else:
+            unsqueezed = False
+        # (channel, n_mels, time).transpose(...) dot (n_mels, n_mfcc)
+        # -> (channel, time, n_mfcc).transpose(...)
+        mfcc = torch.matmul(mel_specgram.transpose(1, 2), self.dct_mat).transpose(1, 2)
+        # unpack batch
+        if unsqueezed:
+            mfcc = mfcc.squeeze(0)
+        return mfcc

Utils/ASR/models.py ADDED Viewed

	@@ -0,0 +1,186 @@

+import math
+import torch
+from torch import nn
+from torch.nn import TransformerEncoder
+import torch.nn.functional as F
+from .layers import MFCC, Attention, LinearNorm, ConvNorm, ConvBlock
+class ASRCNN(nn.Module):
+    """Convolutional acoustic model with a CTC head and an attention seq2seq head.
+
+    Pipeline: MFCC -> strided ConvNorm -> ``n_layers`` x (ConvBlock + GroupNorm)
+    -> 1x1 projection to ``hidden_dim // 2`` channels.
+
+    Args:
+        input_dim: Number of mel bins; the first convolution takes
+            ``input_dim // 2`` channels (the MFCC output).
+        hidden_dim: Channel width of the convolutional trunk.
+        n_token: Size of the output token vocabulary.
+        n_layers: Number of ConvBlock stages.
+        token_embedding_dim: Embedding size used by the seq2seq decoder.
+    """
+    def __init__(self,
+                 input_dim=80,
+                 hidden_dim=256,
+                 n_token=35,
+                 n_layers=6,
+                 token_embedding_dim=256,
+    ):
+        super().__init__()
+        self.n_token = n_token
+        self.n_down = 1
+        self.to_mfcc = MFCC()
+        # stride=2 halves the time resolution.
+        self.init_cnn = ConvNorm(input_dim//2, hidden_dim, kernel_size=7, padding=3, stride=2)
+        self.cnns = nn.Sequential(
+            *[nn.Sequential(
+                ConvBlock(hidden_dim),
+                nn.GroupNorm(num_groups=1, num_channels=hidden_dim)
+            ) for n in range(n_layers)])
+        self.projection = ConvNorm(hidden_dim, hidden_dim // 2)
+        self.ctc_linear = nn.Sequential(
+            LinearNorm(hidden_dim//2, hidden_dim),
+            nn.ReLU(),
+            LinearNorm(hidden_dim, n_token))
+        self.asr_s2s = ASRS2S(
+            embedding_dim=token_embedding_dim,
+            hidden_dim=hidden_dim//2,
+            n_token=n_token)
+    def forward(self, x, src_key_padding_mask=None, text_input=None):
+        """Run the acoustic model.
+
+        Args:
+            x: Mel spectrogram fed to the MFCC layer.
+            src_key_padding_mask: Mask over encoder frames, passed through to
+                the seq2seq decoder's attention.
+            text_input: Target token ids for teacher-forced decoding, or None.
+
+        Returns:
+            ``ctc_logit`` alone when ``text_input`` is None; otherwise the
+            tuple ``(ctc_logit, s2s_logit, s2s_attn)``.
+        """
+        x = self.to_mfcc(x)
+        x = self.init_cnn(x)
+        x = self.cnns(x)
+        x = self.projection(x)
+        # Channels-last layout for the linear CTC head and the decoder.
+        x = x.transpose(1, 2)
+        ctc_logit = self.ctc_linear(x)
+        if text_input is not None:
+            _, s2s_logit, s2s_attn = self.asr_s2s(x, src_key_padding_mask, text_input)
+            return ctc_logit, s2s_logit, s2s_attn
+        else:
+            return ctc_logit
+    def get_feature(self, x):
+        """Return the projected trunk features, channels-first (no transpose).
+
+        Unlike ``forward``, dim 1 of ``x`` is squeezed before the MFCC layer.
+        """
+        x = self.to_mfcc(x.squeeze(1))
+        x = self.init_cnn(x)
+        x = self.cnns(x)
+        x = self.projection(x)
+        return x
+    def length_to_mask(self, lengths):
+        """Build a padding mask from sequence lengths.
+
+        Returns a (batch, max_len) boolean tensor that is True at positions
+        at or beyond each sequence's length, i.e. True where padded.
+        """
+        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+        mask = torch.gt(mask+1, lengths.unsqueeze(1)).to(lengths.device)
+        return mask
+    def get_future_mask(self, out_length, unmask_future_steps=0):
+        """
+        Args:
+            out_length (int): returned mask shape is (out_length, out_length).
+            unmask_future_steps (int): unmasking future step size.
+        Return:
+            mask (torch.BoolTensor): mask future timesteps mask[i, j] = True if j > i + unmask_future_steps else False
+        """
+        # index_tensor[i, j] == j and index_tensor.T[i, j] == i.
+        index_tensor = torch.arange(out_length).unsqueeze(0).expand(out_length, -1)
+        mask = torch.gt(index_tensor, index_tensor.T + unmask_future_steps)
+        return mask
+class ASRS2S(nn.Module):
+    """Teacher-forced attention decoder (LSTMCell + location-sensitive attention).
+
+    Decoder state is stored on ``self`` by ``initialize_decoder_states`` and
+    mutated by ``decode``, so one instance handles one batch at a time.
+
+    Args:
+        embedding_dim: Size of the token embeddings.
+        hidden_dim: Size of the encoder memory vectors and of the decoder
+            LSTM state.
+        n_location_filters: Channels of the attention location convolution.
+        location_kernel_size: Kernel width of the attention location convolution.
+        n_token: Size of the token vocabulary.
+    """
+    def __init__(self,
+                 embedding_dim=256,
+                 hidden_dim=512,
+                 n_location_filters=32,
+                 location_kernel_size=63,
+                 n_token=40):
+        super(ASRS2S, self).__init__()
+        self.embedding = nn.Embedding(n_token, embedding_dim)
+        # Uniform init of the embedding table, scaled by hidden_dim.
+        val_range = math.sqrt(6 / hidden_dim)
+        self.embedding.weight.data.uniform_(-val_range, val_range)
+        self.decoder_rnn_dim = hidden_dim
+        self.project_to_n_symbols = nn.Linear(self.decoder_rnn_dim, n_token)
+        self.attention_layer = Attention(
+            self.decoder_rnn_dim,
+            hidden_dim,
+            hidden_dim,
+            n_location_filters,
+            location_kernel_size
+        )
+        # LSTM input is the attention context concatenated with the token embedding.
+        self.decoder_rnn = nn.LSTMCell(self.decoder_rnn_dim + embedding_dim, self.decoder_rnn_dim)
+        self.project_to_hidden = nn.Sequential(
+            LinearNorm(self.decoder_rnn_dim * 2, hidden_dim),
+            nn.Tanh())
+        # Token ids of the start- and end-of-sequence symbols.
+        self.sos = 1
+        self.eos = 2
+    def initialize_decoder_states(self, memory, mask):
+        """
+        Reset all per-batch decoder state.
+
+        memory.shape = (B, L, H) = (Batchsize, Maxtimestep, Hiddendim)
+        """
+        B, L, H = memory.shape
+        self.decoder_hidden = torch.zeros((B, self.decoder_rnn_dim)).type_as(memory)
+        self.decoder_cell = torch.zeros((B, self.decoder_rnn_dim)).type_as(memory)
+        self.attention_weights = torch.zeros((B, L)).type_as(memory)
+        self.attention_weights_cum = torch.zeros((B, L)).type_as(memory)
+        self.attention_context = torch.zeros((B, H)).type_as(memory)
+        self.memory = memory
+        # Projected once per batch and reused by every decode step.
+        self.processed_memory = self.attention_layer.memory_layer(memory)
+        self.mask = mask
+        # Token id substituted for randomly masked inputs, and the masking rate.
+        self.unk_index = 3
+        self.random_mask = 0.1
+    def forward(self, memory, memory_mask, text_input):
+        """
+        Decode ``text_input`` with teacher forcing against ``memory``.
+
+        memory.shape = (B, L, H) = (Batchsize, Maxtimestep, Hiddendim)
+        memory_mask.shape = (B, L, )
+        text_input.shape = (B, T)
+
+        Returns (hidden_outputs, logit_outputs, alignments), each with
+        T + 1 steps because a start token is prepended.
+        """
+        self.initialize_decoder_states(memory, memory_mask)
+        # text random mask
+        # NOTE(review): this masking is not gated on self.training, so it is
+        # also applied in eval mode - confirm that is intended.
+        random_mask = (torch.rand(text_input.shape) < self.random_mask).to(text_input.device)
+        _text_input = text_input.clone()
+        _text_input.masked_fill_(random_mask, self.unk_index)
+        decoder_inputs = self.embedding(_text_input).transpose(0, 1) # -> [T, B, channel]
+        start_embedding = self.embedding(
+            torch.LongTensor([self.sos]*decoder_inputs.size(1)).to(decoder_inputs.device))
+        decoder_inputs = torch.cat((start_embedding.unsqueeze(0), decoder_inputs), dim=0)
+        hidden_outputs, logit_outputs, alignments = [], [], []
+        # One decode step per input position; the list length is the step index.
+        while len(hidden_outputs) < decoder_inputs.size(0):
+            decoder_input = decoder_inputs[len(hidden_outputs)]
+            hidden, logit, attention_weights = self.decode(decoder_input)
+            hidden_outputs += [hidden]
+            logit_outputs += [logit]
+            alignments += [attention_weights]
+        hidden_outputs, logit_outputs, alignments = \
+            self.parse_decoder_outputs(
+                hidden_outputs, logit_outputs, alignments)
+        return hidden_outputs, logit_outputs, alignments
+    def decode(self, decoder_input):
+        """Run one decoder step and update the state stored on ``self``.
+
+        Returns (hidden, logit, attention_weights) for this step.
+        """
+        cell_input = torch.cat((decoder_input, self.attention_context), -1)
+        self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
+            cell_input,
+            (self.decoder_hidden, self.decoder_cell))
+        # Stacked as (previous weights, cumulative weights) on dim 1.
+        attention_weights_cat = torch.cat(
+            (self.attention_weights.unsqueeze(1),
+            self.attention_weights_cum.unsqueeze(1)),dim=1)
+        self.attention_context, self.attention_weights = self.attention_layer(
+            self.decoder_hidden,
+            self.memory,
+            self.processed_memory,
+            attention_weights_cat,
+            self.mask)
+        self.attention_weights_cum += self.attention_weights
+        hidden_and_context = torch.cat((self.decoder_hidden, self.attention_context), -1)
+        hidden = self.project_to_hidden(hidden_and_context)
+        # Dropout (p=0.5, training only) before the output projection;
+        # presumably a regulariser - the original comment was truncated.
+        logit = self.project_to_n_symbols(F.dropout(hidden, 0.5, self.training))
+        return hidden, logit, self.attention_weights
+    def parse_decoder_outputs(self, hidden, logit, alignments):
+        """Stack per-step lists into batch-first tensors."""
+        # -> [B, T_out + 1, max_time]
+        alignments = torch.stack(alignments).transpose(0,1)
+        # [T_out + 1, B, n_symbols] -> [B, T_out + 1,  n_symbols]
+        logit = torch.stack(logit).transpose(0, 1).contiguous()
+        hidden = torch.stack(hidden).transpose(0, 1).contiguous()
+        return hidden, logit, alignments

Utils/JDC/bst.t7 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:54dc94364b97e18ac1dfa6287714ed121248cfaac4cfd39d061c6e0a089ef169
+size 21029926

Utils/JDC/model.py ADDED Viewed

	@@ -0,0 +1,190 @@

+"""
+Implementation of model from:
+Kum et al. - "Joint Detection and Classification of Singing Voice Melody Using
+Convolutional Recurrent Neural Networks" (2019)
+Link: https://www.semanticscholar.org/paper/Joint-Detection-and-Classification-of-Singing-Voice-Kum-Nam/60a2ad4c7db43bace75805054603747fcd062c0d
+"""
+import torch
+from torch import nn
+class JDCNet(nn.Module):
+    """
+    Joint Detection and Classification Network model for singing voice melody.
+
+    NOTE(review): the shape comments below come from the reference
+    implementation (31 frames x 513 bins) and have not been re-derived for
+    this variant; treat them as illustrative only.
+    """
+    def __init__(self, num_class=722, seq_len=31, leaky_relu_slope=0.01):
+        # ``seq_len`` is accepted but not used in this constructor; forward()
+        # derives the sequence length from its input instead.
+        super().__init__()
+        self.num_class = num_class
+        # input = (b, 1, 31, 513), b = batch size
+        self.conv_block = nn.Sequential(
+            nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, padding=1, bias=False),  # out: (b, 64, 31, 513)
+            nn.BatchNorm2d(num_features=64),
+            nn.LeakyReLU(leaky_relu_slope, inplace=True),
+            nn.Conv2d(64, 64, 3, padding=1, bias=False),  # (b, 64, 31, 513)
+        )
+        # res blocks
+        self.res_block1 = ResBlock(in_channels=64, out_channels=128)  # (b, 128, 31, 128)
+        self.res_block2 = ResBlock(in_channels=128, out_channels=192)  # (b, 192, 31, 32)
+        self.res_block3 = ResBlock(in_channels=192, out_channels=256)  # (b, 256, 31, 8)
+        # pool block
+        # Indexed by position in the methods below:
+        # [0] BatchNorm, [1] LeakyReLU, [2] MaxPool, [3] Dropout.
+        self.pool_block = nn.Sequential(
+            nn.BatchNorm2d(num_features=256),
+            nn.LeakyReLU(leaky_relu_slope, inplace=True),
+            nn.MaxPool2d(kernel_size=(1, 4)),  # (b, 256, 31, 2)
+            nn.Dropout(p=0.2),
+        )
+        # maxpool layers (for auxiliary network inputs)
+        # NOTE: maxpool1-3, detector_conv, bilstm_detector and detector are
+        # constructed here but not called by forward(), get_feature() or
+        # get_feature_GAN() in this class.
+        # in = (b, 64, 31, 513) from conv_block, out = (b, 64, 31, 2)
+        self.maxpool1 = nn.MaxPool2d(kernel_size=(1, 40))
+        # in = (b, 128, 31, 128) from res_block1, out = (b, 128, 31, 2)
+        self.maxpool2 = nn.MaxPool2d(kernel_size=(1, 20))
+        # in = (b, 192, 31, 32) from res_block2, out = (b, 192, 31, 2)
+        self.maxpool3 = nn.MaxPool2d(kernel_size=(1, 10))
+        # in = (b, 640, 31, 2), out = (b, 256, 31, 2)
+        self.detector_conv = nn.Sequential(
+            nn.Conv2d(640, 256, 1, bias=False),
+            nn.BatchNorm2d(256),
+            nn.LeakyReLU(leaky_relu_slope, inplace=True),
+            nn.Dropout(p=0.2),
+        )
+        # input: (b, 31, 512) - resized from (b, 256, 31, 2)
+        self.bilstm_classifier = nn.LSTM(
+            input_size=512, hidden_size=256,
+            batch_first=True, bidirectional=True)  # (b, 31, 512)
+        # input: (b, 31, 512) - resized from (b, 256, 31, 2)
+        self.bilstm_detector = nn.LSTM(
+            input_size=512, hidden_size=256,
+            batch_first=True, bidirectional=True)  # (b, 31, 512)
+        # input: (b * 31, 512)
+        self.classifier = nn.Linear(in_features=512, out_features=self.num_class)  # (b * 31, num_class)
+        # input: (b * 31, 512)
+        self.detector = nn.Linear(in_features=512, out_features=2)  # (b * 31, 2) - binary classifier
+        # initialize weights
+        self.apply(self.init_weights)
+    def get_feature_GAN(self, x):
+        """Return trunk features after pool_block's BatchNorm and LeakyReLU,
+        with the last two axes swapped back (no max-pooling, no dropout)."""
+        seq_len = x.shape[-2]  # unused in this method
+        x = x.float().transpose(-1, -2)
+        convblock_out = self.conv_block(x)
+        resblock1_out = self.res_block1(convblock_out)
+        resblock2_out = self.res_block2(resblock1_out)
+        resblock3_out = self.res_block3(resblock2_out)
+        poolblock_out = self.pool_block[0](resblock3_out)
+        poolblock_out = self.pool_block[1](poolblock_out)
+        return poolblock_out.transpose(-1, -2)
+    def get_feature(self, x):
+        """Return trunk features after pool_block's BatchNorm, LeakyReLU and
+        MaxPool (dropout is skipped)."""
+        seq_len = x.shape[-2]  # unused in this method
+        x = x.float().transpose(-1, -2)
+        convblock_out = self.conv_block(x)
+        resblock1_out = self.res_block1(convblock_out)
+        resblock2_out = self.res_block2(resblock1_out)
+        resblock3_out = self.res_block3(resblock2_out)
+        poolblock_out = self.pool_block[0](resblock3_out)
+        poolblock_out = self.pool_block[1](poolblock_out)
+        return self.pool_block[2](poolblock_out)
+    def forward(self, x):
+        """
+        Run the classifier branch.
+
+        Returns:
+            A 3-tuple:
+            - ``abs(classifier_out.squeeze())``: per-frame classifier output,
+              (b, seq_len, num_class) before all size-1 axes are squeezed
+            - ``GAN_feature``: pool_block features before max-pooling, with
+              the last two axes swapped back
+            - ``poolblock_out``: pool_block features after max-pooling
+            No detection output is produced by this method.
+        """
+        ###############################
+        # forward pass for classifier #
+        ###############################
+        # Taken from the last axis *before* the transpose below, so it is the
+        # size of axis -2 of the tensors that flow through the network.
+        seq_len = x.shape[-1]
+        x = x.float().transpose(-1, -2)
+        convblock_out = self.conv_block(x)
+        resblock1_out = self.res_block1(convblock_out)
+        resblock2_out = self.res_block2(resblock1_out)
+        resblock3_out = self.res_block3(resblock2_out)
+        poolblock_out = self.pool_block[0](resblock3_out)
+        poolblock_out = self.pool_block[1](poolblock_out)
+        GAN_feature = poolblock_out.transpose(-1, -2)
+        # pool_block[3] (Dropout) is intentionally not applied here.
+        poolblock_out = self.pool_block[2](poolblock_out)
+        # (b, 256, 31, 2) => (b, 31, 256, 2) => (b, 31, 512)
+        classifier_out = poolblock_out.permute(0, 2, 1, 3).contiguous().view((-1, seq_len, 512))
+        classifier_out, _ = self.bilstm_classifier(classifier_out)  # ignore the hidden states
+        classifier_out = classifier_out.contiguous().view((-1, 512))  # (b * 31, 512)
+        classifier_out = self.classifier(classifier_out)
+        classifier_out = classifier_out.view((-1, seq_len, self.num_class))  # (b, 31, num_class)
+        # NOTE(review): squeeze() drops every size-1 axis, so a batch of one
+        # also loses its batch axis - confirm callers expect that.
+        return torch.abs(classifier_out.squeeze()), GAN_feature, poolblock_out
+    @staticmethod
+    def init_weights(m):
+        """Initialise one submodule; used with ``nn.Module.apply``.
+
+        Linear: Kaiming-uniform weight, zero bias. Conv2d: Xavier-normal
+        weight. LSTM / LSTMCell: orthogonal for parameters with two or more
+        dimensions, standard normal for the rest.
+        """
+        if isinstance(m, nn.Linear):
+            nn.init.kaiming_uniform_(m.weight)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.Conv2d):
+            nn.init.xavier_normal_(m.weight)
+        elif isinstance(m, nn.LSTM) or isinstance(m, nn.LSTMCell):
+            for p in m.parameters():
+                if p.data is None:
+                    continue
+                if len(p.shape) >= 2:
+                    nn.init.orthogonal_(p.data)
+                else:
+                    nn.init.normal_(p.data)
+class ResBlock(nn.Module):
+    """Residual block: BN -> LeakyReLU -> MaxPool, then two 3x3 convs plus a skip.
+
+    Args:
+        in_channels: number of channels of the input feature map.
+        out_channels: number of channels produced by the block.
+        leaky_relu_slope: negative slope used by both LeakyReLU layers.
+    """
+    def __init__(self, in_channels: int, out_channels: int, leaky_relu_slope=0.01):
+        super().__init__()
+        # despite the name, this flag only records whether the channel count changes
+        self.downsample = in_channels != out_channels
+        # BN / LReLU / MaxPool layer before the conv layer - see Figure 1b in the paper
+        self.pre_conv = nn.Sequential(
+            nn.BatchNorm2d(num_features=in_channels),
+            nn.LeakyReLU(leaky_relu_slope, inplace=True),
+            nn.MaxPool2d(kernel_size=(1, 2)),  # apply downsampling on the y axis only
+        )
+        # conv layers
+        self.conv = nn.Sequential(
+            nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
+                      kernel_size=3, padding=1, bias=False),
+            nn.BatchNorm2d(out_channels),
+            nn.LeakyReLU(leaky_relu_slope, inplace=True),
+            nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
+        )
+        # 1 x 1 convolution layer to match the feature dimensions
+        self.conv1by1 = None
+        if self.downsample:
+            self.conv1by1 = nn.Conv2d(in_channels, out_channels, 1, bias=False)
+    def forward(self, x):
+        """Apply the pre-conv stage, then add the conv branch to the skip branch."""
+        x = self.pre_conv(x)
+        if self.downsample:
+            # channel counts differ: project the skip path with the 1x1 conv
+            x = self.conv(x) + self.conv1by1(x)
+        else:
+            x = self.conv(x) + x
+        return x

Utils/PLBERT/config.yml ADDED Viewed

	@@ -0,0 +1,30 @@

+log_dir: "Checkpoint"
+mixed_precision: "fp16"
+data_folder: "wikipedia_20220301.en.processed"
+batch_size: 192
+save_interval: 5000
+log_interval: 10
+num_process: 1 # number of GPUs
+num_steps: 1000000
+dataset_params:
+    tokenizer: "transfo-xl-wt103"
+    token_separator: " " # token used for phoneme separator (space)
+    token_mask: "M" # token used for phoneme mask (M)
+    word_separator: 3039 # token used for word separator (<formula>)
+    token_maps: "token_maps.pkl" # token map path
+    max_mel_length: 512 # max phoneme length
+    word_mask_prob: 0.15 # probability to mask the entire word
+    phoneme_mask_prob: 0.1 # probability to mask each phoneme
+    replace_prob: 0.2 # probability to replace phonemes
+model_params:
+    vocab_size: 178
+    hidden_size: 768
+    num_attention_heads: 12
+    intermediate_size: 2048
+    max_position_embeddings: 512
+    num_hidden_layers: 12
+    dropout: 0.1

Utils/PLBERT/step_1000000.t7 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0714ff85804db43e06b3b0ac5749bf90cf206257c6c5916e8a98c5933b4c21e0
+size 25185187

Utils/PLBERT/util.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import os
+import yaml
+import torch
+from transformers import AlbertConfig, AlbertModel
+class CustomAlbert(AlbertModel):
+    def forward(self, *args, **kwargs):
+        # Call the original forward method
+        outputs = super().forward(*args, **kwargs)
+        # Only return the last_hidden_state
+        return outputs.last_hidden_state
+def load_plbert(log_dir):
+    config_path = os.path.join(log_dir, "config.yml")
+    plbert_config = yaml.safe_load(open(config_path))
+    albert_base_configuration = AlbertConfig(**plbert_config['model_params'])
+    bert = CustomAlbert(albert_base_configuration)
+    files = os.listdir(log_dir)
+    ckpts = []
+    for f in os.listdir(log_dir):
+        if f.startswith("step_"): ckpts.append(f)
+    iters = [int(f.split('_')[-1].split('.')[0]) for f in ckpts if os.path.isfile(os.path.join(log_dir, f))]
+    iters = sorted(iters)[-1]
+    checkpoint = torch.load(log_dir + "/step_" + str(iters) + ".t7", map_location='cpu')
+    state_dict = checkpoint['net']
+    from collections import OrderedDict
+    new_state_dict = OrderedDict()
+    for k, v in state_dict.items():
+        name = k[7:] # remove `module.`
+        if name.startswith('encoder.'):
+            name = name[8:] # remove `encoder.`
+            new_state_dict[name] = v
+    del new_state_dict["embeddings.position_ids"]
+    bert.load_state_dict(new_state_dict, strict=False)
+    return bert

bert.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aca81dd2457b43841b1725c51a8a9d9944d4f29d1b71238ae79784abaf8b89f0
+size 25178740

bert_encoder.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4cec40e1c7015c8d10728c085fb27ce93854c130c89b4494aaf5c689658348c6
+size 1576502

checkpoint.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1e5e8f43daf3e5ef9fc0f6ed604d7479f751bc1dc29cf2a5862df8dbbd0855ba
+size 2201837262

config.json ADDED Viewed

	@@ -0,0 +1,202 @@

+{
+  "model_params": {
+    "decoder": {
+      "resblock_dilation_sizes": [
+        [
+          1,
+          3,
+          5
+        ],
+        [
+          1,
+          3,
+          5
+        ],
+        [
+          1,
+          3,
+          5
+        ]
+      ],
+      "resblock_kernel_sizes": [
+        3,
+        7,
+        11
+      ],
+      "type": "hifigan",
+      "upsample_initial_channel": 512,
+      "upsample_kernel_sizes": [
+        20,
+        10,
+        6,
+        4
+      ],
+      "upsample_rates": [
+        10,
+        5,
+        3,
+        2
+      ]
+    },
+    "diffusion": {
+      "dist": {
+        "estimate_sigma_data": true,
+        "mean": -3.0,
+        "sigma_data": 0.2,
+        "std": 1.0
+      },
+      "embedding_mask_proba": 0.1,
+      "transformer": {
+        "head_features": 64,
+        "multiplier": 2,
+        "num_heads": 8,
+        "num_layers": 3
+      }
+    },
+    "dim_in": 64,
+    "dropout": 0.2,
+    "hidden_dim": 512,
+    "max_conv_dim": 512,
+    "max_dur": 50,
+    "multispeaker": false,
+    "n_layer": 3,
+    "n_mels": 80,
+    "n_token": 178,
+    "slm": {
+      "hidden": 768,
+      "initial_channel": 64,
+      "model": "microsoft/wavlm-base-plus",
+      "nlayers": 13,
+      "sr": 16000
+    },
+    "style_dim": 128
+  },
+  "training_config": {
+    "epochs": 30,
+    "batch_size": 2,
+    "max_len": 120,
+    "optimizer": {
+      "bert_lr": 1e-05,
+      "ft_lr": 0.0001,
+      "lr": 0.0001
+    },
+    "loss_params": {
+      "diff_epoch": 10,
+      "joint_epoch": 110,
+      "lambda_F0": 1.0,
+      "lambda_ce": 20.0,
+      "lambda_diff": 1.0,
+      "lambda_dur": 1.0,
+      "lambda_gen": 1.0,
+      "lambda_mel": 5.0,
+      "lambda_mono": 1.0,
+      "lambda_norm": 1.0,
+      "lambda_s2s": 1.0,
+      "lambda_slm": 1.0,
+      "lambda_sty": 1.0
+    }
+  },
+  "preprocess_params": {
+    "spect_params": {
+      "hop_length": 300,
+      "n_fft": 2048,
+      "win_length": 1200
+    },
+    "sr": 24000
+  },
+  "data_params": {
+    "OOD_data": "Data/OOD_texts.txt",
+    "min_length": 50,
+    "root_path": "Data/wavs",
+    "train_data": "Data/train_list.txt",
+    "val_data": "Data/val_list.txt"
+  },
+  "model_state": {
+    "epoch": 14,
+    "iterations": 1169,
+    "val_loss": 0.4189014434814453
+  },
+  "training_metrics": {
+    "train_loss": [],
+    "val_loss": [
+      41.0,
+      36.0,
+      31.0,
+      29.0,
+      25.0,
+      34.0,
+      33.0,
+      32.0,
+      31.0,
+      27.0,
+      52.0,
+      59.0,
+      4.0,
+      11.0,
+      17.0,
+      31.0,
+      37.0,
+      42.0
+    ],
+    "dur_loss": [
+      0.448,
+      0.449,
+      0.441,
+      0.488,
+      0.469,
+      0.437,
+      0.461,
+      0.42,
+      0.447,
+      0.436,
+      0.428,
+      0.425,
+      0.444,
+      0.44,
+      0.419,
+      0.423,
+      0.427,
+      0.405
+    ],
+    "F0_loss": [
+      1.223,
+      1.189,
+      1.208,
+      1.176,
+      1.141,
+      1.102,
+      1.168,
+      1.081,
+      1.119,
+      1.108,
+      1.108,
+      1.153,
+      1.093,
+      1.211,
+      1.102,
+      1.177,
+      1.162,
+      1.11
+    ],
+    "epochs": [
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      11,
+      12,
+      13,
+      14,
+      15,
+      16,
+      17,
+      18
+    ]
+  }
+}

config.yml ADDED Viewed

	@@ -0,0 +1,66 @@

+ASR_config: Utils/ASR/config.yml
+ASR_path: Utils/ASR/epoch_00080.pth
+F0_path: Utils/JDC/bst.t7
+PLBERT_dir: Utils/PLBERT/
+model_params:
+  decoder:
+    resblock_dilation_sizes:
+    - - 1
+      - 3
+      - 5
+    - - 1
+      - 3
+      - 5
+    - - 1
+      - 3
+      - 5
+    resblock_kernel_sizes:
+    - 3
+    - 7
+    - 11
+    type: hifigan
+    upsample_initial_channel: 512
+    upsample_kernel_sizes:
+    - 20
+    - 10
+    - 6
+    - 4
+    upsample_rates:
+    - 10
+    - 5
+    - 3
+    - 2
+  diffusion:
+    dist:
+      estimate_sigma_data: true
+      mean: -3.0
+      sigma_data: 0.2
+      std: 1.0
+    embedding_mask_proba: 0.1
+    transformer:
+      head_features: 64
+      multiplier: 2
+      num_heads: 8
+      num_layers: 3
+  dim_in: 64
+  dropout: 0.2
+  hidden_dim: 512
+  max_conv_dim: 512
+  max_dur: 50
+  multispeaker: false
+  n_layer: 3
+  n_mels: 80
+  n_token: 178
+  slm:
+    hidden: 768
+    initial_channel: 64
+    model: microsoft/wavlm-base-plus
+    nlayers: 13
+    sr: 16000
+  style_dim: 128
+preprocess_params:
+  spect_params:
+    hop_length: 300
+    n_fft: 2048
+    win_length: 1200
+  sr: 24000

decoder.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7d1a2c57768782aa6528ec5ac49e3ba7773b8526cfeed0b6114dc2f55e860f66
+size 217409318

diffusion.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15ed4a99b48a3f70640b15edf88f64fa3b6a07dc1fa8cebffb72368355f68da0
+size 87699504

models.py ADDED Viewed

	@@ -0,0 +1,713 @@

+#coding:utf-8
+import os
+import os.path as osp
+import copy
+import math
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from Utils.ASR.models import ASRCNN
+from Utils.JDC.model import JDCNet
+from Modules.diffusion.sampler import KDiffusion, LogNormalDistribution
+from Modules.diffusion.modules import Transformer1d, StyleTransformer1d
+from Modules.diffusion.diffusion import AudioDiffusionConditional
+from Modules.discriminators import MultiPeriodDiscriminator, MultiResSpecDiscriminator, WavLMDiscriminator
+from munch import Munch
+import yaml
+class LearnedDownSample(nn.Module):
+    """Learned downsampling implemented as a spectrally-normalised depthwise conv.
+
+    Args:
+        layer_type: 'none', 'timepreserve' or 'half'.
+        dim_in: number of channels (also used as the number of conv groups).
+    """
+    def __init__(self, layer_type, dim_in):
+        super().__init__()
+        self.layer_type = layer_type
+        if self.layer_type == 'none':
+            self.conv = nn.Identity()
+        elif self.layer_type == 'timepreserve':
+            # stride (2, 1): only the first spatial axis is strided
+            self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0)))
+        elif self.layer_type == 'half':
+            # stride (2, 2): both spatial axes are strided
+            self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1))
+        else:
+            raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
+    def forward(self, x):
+        """Apply the configured convolution (identity for 'none')."""
+        return self.conv(x)
+class LearnedUpSample(nn.Module):
+    """Learned upsampling implemented as a depthwise transposed convolution.
+
+    Args:
+        layer_type: 'none', 'timepreserve' or 'half'.
+        dim_in: number of channels (also used as the number of conv groups).
+    """
+    def __init__(self, layer_type, dim_in):
+        super().__init__()
+        self.layer_type = layer_type
+        if self.layer_type == 'none':
+            self.conv = nn.Identity()
+        elif self.layer_type == 'timepreserve':
+            # stride (2, 1): only the first spatial axis is upsampled
+            self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0))
+        elif self.layer_type == 'half':
+            # stride (2, 2): both spatial axes are upsampled
+            self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1)
+        else:
+            raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
+    def forward(self, x):
+        """Apply the configured transposed convolution (identity for 'none')."""
+        return self.conv(x)
+class DownSample(nn.Module):
+    def __init__(self, layer_type):
+        super().__init__()
+        self.layer_type = layer_type
+    def forward(self, x):
+        if self.layer_type == 'none':
+            return x
+        elif self.layer_type == 'timepreserve':
+            return F.avg_pool2d(x, (2, 1))
+        elif self.layer_type == 'half':
+            if x.shape[-1] % 2 != 0:
+                x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
+            return F.avg_pool2d(x, 2)
+        else:
+            raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
+class UpSample(nn.Module):
+    def __init__(self, layer_type):
+        super().__init__()
+        self.layer_type = layer_type
+    def forward(self, x):
+        if self.layer_type == 'none':
+            return x
+        elif self.layer_type == 'timepreserve':
+            return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
+        elif self.layer_type == 'half':
+            return F.interpolate(x, scale_factor=2, mode='nearest')
+        else:
+            raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
+class ResBlk(nn.Module):
+    """2-D residual block with optional instance norm and downsampling.
+
+    Args:
+        dim_in: input channel count.
+        dim_out: output channel count; a 1x1 shortcut conv is created when it
+            differs from ``dim_in``.
+        actv: activation module applied before each convolution.
+        normalize: when True, instance norm is applied before each activation.
+        downsample: 'none', 'timepreserve' or 'half'.
+    """
+    def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
+                 normalize=False, downsample='none'):
+        super().__init__()
+        self.actv = actv
+        self.normalize = normalize
+        # the shortcut path pools; the residual path uses a learned strided conv
+        self.downsample = DownSample(downsample)
+        self.downsample_res = LearnedDownSample(downsample, dim_in)
+        self.learned_sc = dim_in != dim_out
+        self._build_weights(dim_in, dim_out)
+    def _build_weights(self, dim_in, dim_out):
+        """Create the convolutions and, if requested, the norm/shortcut layers."""
+        self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
+        self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
+        if self.normalize:
+            self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
+            self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
+        if self.learned_sc:
+            self.conv1x1 = spectral_norm(nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False))
+    def _shortcut(self, x):
+        """Skip path: optional 1x1 projection followed by pooling."""
+        if self.learned_sc:
+            x = self.conv1x1(x)
+        # NOTE: self.downsample is always a DownSample module, so this test is
+        # always true; the 'none' case is handled inside DownSample.forward.
+        if self.downsample:
+            x = self.downsample(x)
+        return x
+    def _residual(self, x):
+        """Residual path: (norm) -> act -> conv1 -> learned downsample -> (norm) -> act -> conv2."""
+        if self.normalize:
+            x = self.norm1(x)
+        x = self.actv(x)
+        x = self.conv1(x)
+        x = self.downsample_res(x)
+        if self.normalize:
+            x = self.norm2(x)
+        x = self.actv(x)
+        x = self.conv2(x)
+        return x
+    def forward(self, x):
+        """Return (shortcut + residual) / sqrt(2)."""
+        x = self._shortcut(x) + self._residual(x)
+        return x / math.sqrt(2)  # unit variance
+class StyleEncoder(nn.Module):
+    """Convolutional encoder mapping a single-channel 2-D input to a style vector.
+
+    Args:
+        dim_in: channel count after the first convolution.
+        style_dim: size of the returned style vector.
+        max_conv_dim: upper bound on the channel count as it doubles per block.
+    """
+    def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384):
+        super().__init__()
+        blocks = []
+        blocks += [spectral_norm(nn.Conv2d(1, dim_in, 3, 1, 1))]
+        repeat_num = 4
+        # four 'half' residual blocks, doubling channels up to max_conv_dim
+        for _ in range(repeat_num):
+            dim_out = min(dim_in*2, max_conv_dim)
+            blocks += [ResBlk(dim_in, dim_out, downsample='half')]
+            dim_in = dim_out
+        blocks += [nn.LeakyReLU(0.2)]
+        # 5x5 convolution without padding, then global average pooling to 1x1
+        blocks += [spectral_norm(nn.Conv2d(dim_out, dim_out, 5, 1, 0))]
+        blocks += [nn.AdaptiveAvgPool2d(1)]
+        blocks += [nn.LeakyReLU(0.2)]
+        self.shared = nn.Sequential(*blocks)
+        self.unshared = nn.Linear(dim_out, style_dim)
+    def forward(self, x):
+        """Return the style vector of size ``style_dim`` for each batch item."""
+        h = self.shared(x)
+        h = h.view(h.size(0), -1)  # flatten everything but the batch axis
+        s = self.unshared(h)
+        return s
+class LinearNorm(torch.nn.Module):
+    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
+        super(LinearNorm, self).__init__()
+        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
+        torch.nn.init.xavier_uniform_(
+            self.linear_layer.weight,
+            gain=torch.nn.init.calculate_gain(w_init_gain))
+    def forward(self, x):
+        return self.linear_layer(x)
+class Discriminator2d(nn.Module):
+    """2-D convolutional discriminator that also exposes per-layer features.
+
+    Args:
+        dim_in: channel count after the first convolution.
+        num_domains: number of output channels of the final 1x1 convolution.
+        max_conv_dim: upper bound on the channel count as it doubles per block.
+        repeat_num: number of 'half' residual blocks.
+    """
+    def __init__(self, dim_in=48, num_domains=1, max_conv_dim=384, repeat_num=4):
+        super().__init__()
+        blocks = []
+        blocks += [spectral_norm(nn.Conv2d(1, dim_in, 3, 1, 1))]
+        for lid in range(repeat_num):
+            dim_out = min(dim_in*2, max_conv_dim)
+            blocks += [ResBlk(dim_in, dim_out, downsample='half')]
+            dim_in = dim_out
+        blocks += [nn.LeakyReLU(0.2)]
+        blocks += [spectral_norm(nn.Conv2d(dim_out, dim_out, 5, 1, 0))]
+        blocks += [nn.LeakyReLU(0.2)]
+        blocks += [nn.AdaptiveAvgPool2d(1)]
+        blocks += [spectral_norm(nn.Conv2d(dim_out, num_domains, 1, 1, 0))]
+        self.main = nn.Sequential(*blocks)
+    def get_feature(self, x):
+        """Return the flattened final output and the list of every layer's output."""
+        features = []
+        # run the layers one at a time so each intermediate output is recorded
+        for l in self.main:
+            x = l(x)
+            features.append(x)
+        out = features[-1]
+        out = out.view(out.size(0), -1)  # (batch, num_domains)
+        return out, features
+    def forward(self, x):
+        """Return the squeezed output and the per-layer features."""
+        out, features = self.get_feature(x)
+        # NOTE: squeeze() drops every size-1 axis, including the batch axis
+        # when the batch size is 1.
+        out = out.squeeze()  # (batch)
+        return out, features
+class ResBlk1d(nn.Module):
+    """1-D residual block with optional instance norm, dropout and downsampling.
+
+    Args:
+        dim_in: input channel count.
+        dim_out: output channel count; a 1x1 shortcut conv is created when it
+            differs from ``dim_in``.
+        actv: activation module applied before each convolution.
+        normalize: when True, instance norm is applied before each activation.
+        downsample: 'none' disables downsampling; any other value enables it.
+        dropout_p: dropout probability used in the residual path.
+    """
+    def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
+                 normalize=False, downsample='none', dropout_p=0.2):
+        super().__init__()
+        self.actv = actv
+        self.normalize = normalize
+        self.downsample_type = downsample
+        self.learned_sc = dim_in != dim_out
+        self._build_weights(dim_in, dim_out)
+        self.dropout_p = dropout_p
+        if self.downsample_type == 'none':
+            self.pool = nn.Identity()
+        else:
+            # learned stride-2 depthwise conv used on the residual path
+            self.pool = weight_norm(nn.Conv1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1))
+    def _build_weights(self, dim_in, dim_out):
+        """Create the convolutions and, if requested, the norm/shortcut layers."""
+        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_in, 3, 1, 1))
+        self.conv2 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
+        if self.normalize:
+            self.norm1 = nn.InstanceNorm1d(dim_in, affine=True)
+            self.norm2 = nn.InstanceNorm1d(dim_in, affine=True)
+        if self.learned_sc:
+            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
+    def downsample(self, x):
+        """Average-pool the last axis by 2 (shortcut path); identity for 'none'."""
+        if self.downsample_type == 'none':
+            return x
+        else:
+            if x.shape[-1] % 2 != 0:
+                # odd length: repeat the last step so pooling covers every step
+                x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
+            return F.avg_pool1d(x, 2)
+    def _shortcut(self, x):
+        """Skip path: optional 1x1 projection followed by pooling."""
+        if self.learned_sc:
+            x = self.conv1x1(x)
+        x = self.downsample(x)
+        return x
+    def _residual(self, x):
+        """Residual path: (norm) -> act -> dropout -> conv1 -> pool -> (norm) -> act -> dropout -> conv2."""
+        if self.normalize:
+            x = self.norm1(x)
+        x = self.actv(x)
+        x = F.dropout(x, p=self.dropout_p, training=self.training)
+        x = self.conv1(x)
+        x = self.pool(x)
+        if self.normalize:
+            x = self.norm2(x)
+        x = self.actv(x)
+        x = F.dropout(x, p=self.dropout_p, training=self.training)
+        x = self.conv2(x)
+        return x
+    def forward(self, x):
+        """Return (shortcut + residual) / sqrt(2)."""
+        x = self._shortcut(x) + self._residual(x)
+        return x / math.sqrt(2)  # unit variance
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+    def forward(self, x):
+        x = x.transpose(1, -1)
+        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+        return x.transpose(1, -1)
+class TextEncoder(nn.Module):
+    def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
+        super().__init__()
+        self.embedding = nn.Embedding(n_symbols, channels)
+        padding = (kernel_size - 1) // 2
+        self.cnn = nn.ModuleList()
+        for _ in range(depth):
+            self.cnn.append(nn.Sequential(
+                weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)),
+                LayerNorm(channels),
+                actv,
+                nn.Dropout(0.2),
+            ))
+        # self.cnn = nn.Sequential(*self.cnn)
+        self.lstm = nn.LSTM(channels, channels//2, 1, batch_first=True, bidirectional=True)
+    def forward(self, x, input_lengths, m):
+        x = self.embedding(x)  # [B, T, emb]
+        x = x.transpose(1, 2)  # [B, emb, T]
+        m = m.to(input_lengths.device).unsqueeze(1)
+        x.masked_fill_(m, 0.0)
+        for c in self.cnn:
+            x = c(x)
+            x.masked_fill_(m, 0.0)
+        x = x.transpose(1, 2)  # [B, T, chn]
+        input_lengths = input_lengths.cpu().numpy()
+        x = nn.utils.rnn.pack_padded_sequence(
+            x, input_lengths, batch_first=True, enforce_sorted=False)
+        self.lstm.flatten_parameters()
+        x, _ = self.lstm(x)
+        x, _ = nn.utils.rnn.pad_packed_sequence(
+            x, batch_first=True)
+        x = x.transpose(-1, -2)
+        x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
+        x_pad[:, :, :x.shape[-1]] = x
+        x = x_pad.to(x.device)
+        x.masked_fill_(m, 0.0)
+        return x
+    def inference(self, x):
+        x = self.embedding(x)
+        x = x.transpose(1, 2)
+        x = self.cnn(x)
+        x = x.transpose(1, 2)
+        self.lstm.flatten_parameters()
+        x, _ = self.lstm(x)
+        return x
+    def length_to_mask(self, lengths):
+        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+        mask = torch.gt(mask+1, lengths.unsqueeze(1))
+        return mask
+class AdaIN1d(nn.Module):
+    def __init__(self, style_dim, num_features):
+        super().__init__()
+        self.norm = nn.InstanceNorm1d(num_features, affine=False)
+        self.fc = nn.Linear(style_dim, num_features*2)
+    def forward(self, x, s):
+        h = self.fc(s)
+        h = h.view(h.size(0), h.size(1), 1)
+        gamma, beta = torch.chunk(h, chunks=2, dim=1)
+        return (1 + gamma) * self.norm(x) + beta
+class UpSample1d(nn.Module):
+    def __init__(self, layer_type):
+        super().__init__()
+        self.layer_type = layer_type
+    def forward(self, x):
+        if self.layer_type == 'none':
+            return x
+        else:
+            return F.interpolate(x, scale_factor=2, mode='nearest')
+class AdainResBlk1d(nn.Module):
+    """1-D residual block conditioned on a style vector through AdaIN1d.
+
+    Args:
+        dim_in: input channel count.
+        dim_out: output channel count; a 1x1 shortcut conv is created when it
+            differs from ``dim_in``.
+        style_dim: size of the style vector fed to both AdaIN1d layers.
+        actv: activation module applied after each AdaIN1d.
+        upsample: 'none' disables upsampling; any other value (including
+            ``True``) enables 2x upsampling on both paths.
+        dropout_p: dropout probability applied before each convolution.
+    """
+    def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
+                 upsample='none', dropout_p=0.0):
+        super().__init__()
+        self.actv = actv
+        self.upsample_type = upsample
+        self.upsample = UpSample1d(upsample)
+        self.learned_sc = dim_in != dim_out
+        self._build_weights(dim_in, dim_out, style_dim)
+        self.dropout = nn.Dropout(dropout_p)
+        if upsample == 'none':
+            self.pool = nn.Identity()
+        else:
+            # learned stride-2 depthwise transposed conv used on the residual path
+            self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
+    def _build_weights(self, dim_in, dim_out, style_dim):
+        """Create the convolutions, the AdaIN1d layers and the optional shortcut conv."""
+        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
+        self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
+        self.norm1 = AdaIN1d(style_dim, dim_in)
+        self.norm2 = AdaIN1d(style_dim, dim_out)
+        if self.learned_sc:
+            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
+    def _shortcut(self, x):
+        """Skip path: nearest-neighbour upsample followed by the optional 1x1 projection."""
+        x = self.upsample(x)
+        if self.learned_sc:
+            x = self.conv1x1(x)
+        return x
+    def _residual(self, x, s):
+        """Residual path: AdaIN -> act -> pool -> dropout -> conv1 -> AdaIN -> act -> dropout -> conv2."""
+        x = self.norm1(x, s)
+        x = self.actv(x)
+        x = self.pool(x)
+        x = self.conv1(self.dropout(x))
+        x = self.norm2(x, s)
+        x = self.actv(x)
+        x = self.conv2(self.dropout(x))
+        return x
+    def forward(self, x, s):
+        """Return (residual + shortcut) / sqrt(2) for input ``x`` and style ``s``."""
+        out = self._residual(x, s)
+        out = (out + self._shortcut(x)) / math.sqrt(2)
+        return out
+class AdaLayerNorm(nn.Module):
+    """Layer norm over the last axis with scale/shift predicted from a style vector.
+
+    Args:
+        style_dim: size of the style vector.
+        channels: size of the normalised (last) axis.
+        eps: numerical-stability constant passed to ``F.layer_norm``.
+    """
+    def __init__(self, style_dim, channels, eps=1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+        self.fc = nn.Linear(style_dim, channels*2)
+    def forward(self, x, s):
+        """Normalise ``x`` and modulate it with ``s``; the output has the input's layout."""
+        # NOTE: for a 3-D input these two transposes swap the same pair of
+        # axes, so together they leave the layout unchanged.
+        x = x.transpose(-1, -2)
+        x = x.transpose(1, -1)
+        h = self.fc(s)
+        h = h.view(h.size(0), h.size(1), 1)
+        gamma, beta = torch.chunk(h, chunks=2, dim=1)
+        # move the per-channel scale/shift onto the last axis so they broadcast
+        gamma, beta = gamma.transpose(1, -1), beta.transpose(1, -1)
+        x = F.layer_norm(x, (self.channels,), eps=self.eps)
+        x = (1 + gamma) * x + beta
+        return x.transpose(1, -1).transpose(-1, -2)
+class ProsodyPredictor(nn.Module):
+    def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
+        super().__init__()
+        self.text_encoder = DurationEncoder(sty_dim=style_dim,
+                                            d_model=d_hid,
+                                            nlayers=nlayers,
+                                            dropout=dropout)
+        self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
+        self.duration_proj = LinearNorm(d_hid, max_dur)
+        self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
+        self.F0 = nn.ModuleList()
+        self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
+        self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
+        self.F0.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
+        self.N = nn.ModuleList()
+        self.N.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
+        self.N.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
+        self.N.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
+        self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
+        self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
+    def forward(self, texts, style, text_lengths, alignment, m):
+        d = self.text_encoder(texts, style, text_lengths, m)
+        batch_size = d.shape[0]
+        text_size = d.shape[1]
+        # predict duration
+        input_lengths = text_lengths.cpu().numpy()
+        x = nn.utils.rnn.pack_padded_sequence(
+            d, input_lengths, batch_first=True, enforce_sorted=False)
+        m = m.to(text_lengths.device).unsqueeze(1)
+        self.lstm.flatten_parameters()
+        x, _ = self.lstm(x)
+        x, _ = nn.utils.rnn.pad_packed_sequence(
+            x, batch_first=True)
+        x_pad = torch.zeros([x.shape[0], m.shape[-1], x.shape[-1]])
+        x_pad[:, :x.shape[1], :] = x
+        x = x_pad.to(x.device)
+        duration = self.duration_proj(nn.functional.dropout(x, 0.5, training=self.training))
+        en = (d.transpose(-1, -2) @ alignment)
+        return duration.squeeze(-1), en
+    def F0Ntrain(self, x, s):
+        x, _ = self.shared(x.transpose(-1, -2))
+        F0 = x.transpose(-1, -2)
+        for block in self.F0:
+            F0 = block(F0, s)
+        F0 = self.F0_proj(F0)
+        N = x.transpose(-1, -2)
+        for block in self.N:
+            N = block(N, s)
+        N = self.N_proj(N)
+        return F0.squeeze(1), N.squeeze(1)
+    def length_to_mask(self, lengths):
+        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+        mask = torch.gt(mask+1, lengths.unsqueeze(1))
+        return mask
+class DurationEncoder(nn.Module):
+    def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
+        super().__init__()
+        self.lstms = nn.ModuleList()
+        for _ in range(nlayers):
+            self.lstms.append(nn.LSTM(d_model + sty_dim,
+                                 d_model // 2,
+                                 num_layers=1,
+                                 batch_first=True,
+                                 bidirectional=True,
+                                 dropout=dropout))
+            self.lstms.append(AdaLayerNorm(sty_dim, d_model))
+        self.dropout = dropout
+        self.d_model = d_model
+        self.sty_dim = sty_dim
+    def forward(self, x, style, text_lengths, m):
+        masks = m.to(text_lengths.device)
+        x = x.permute(2, 0, 1)
+        s = style.expand(x.shape[0], x.shape[1], -1)
+        x = torch.cat([x, s], axis=-1)
+        x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
+        x = x.transpose(0, 1)
+        input_lengths = text_lengths.cpu().numpy()
+        x = x.transpose(-1, -2)
+        for block in self.lstms:
+            if isinstance(block, AdaLayerNorm):
+                x = block(x.transpose(-1, -2), style).transpose(-1, -2)
+                x = torch.cat([x, s.permute(1, -1, 0)], axis=1)
+                x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
+            else:
+                x = x.transpose(-1, -2)
+                x = nn.utils.rnn.pack_padded_sequence(
+                    x, input_lengths, batch_first=True, enforce_sorted=False)
+                block.flatten_parameters()
+                x, _ = block(x)
+                x, _ = nn.utils.rnn.pad_packed_sequence(
+                    x, batch_first=True)
+                x = F.dropout(x, p=self.dropout, training=self.training)
+                x = x.transpose(-1, -2)
+                x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
+                x_pad[:, :, :x.shape[-1]] = x
+                x = x_pad.to(x.device)
+        return x.transpose(-1, -2)
+    def inference(self, x, style):
+        x = self.embedding(x.transpose(-1, -2)) * math.sqrt(self.d_model)
+        style = style.expand(x.shape[0], x.shape[1], -1)
+        x = torch.cat([x, style], axis=-1)
+        src = self.pos_encoder(x)
+        output = self.transformer_encoder(src).transpose(0, 1)
+        return output
+    def length_to_mask(self, lengths):
+        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+        mask = torch.gt(mask+1, lengths.unsqueeze(1))
+        return mask
+def load_F0_models(path):
+    # load F0 model
+    F0_model = JDCNet(num_class=1, seq_len=192)
+    params = torch.load(path, map_location='cpu')['net']
+    F0_model.load_state_dict(params)
+    _ = F0_model.train()
+    return F0_model
+def load_ASR_models(ASR_MODEL_PATH, ASR_MODEL_CONFIG):
+    # load ASR model
+    def _load_config(path):
+        with open(path) as f:
+            config = yaml.safe_load(f)
+        model_config = config['model_params']
+        return model_config
+    def _load_model(model_config, model_path):
+        model = ASRCNN(**model_config)
+        params = torch.load(model_path, map_location='cpu')['model']
+        model.load_state_dict(params)
+        return model
+    asr_model_config = _load_config(ASR_MODEL_CONFIG)
+    asr_model = _load_model(asr_model_config, ASR_MODEL_PATH)
+    _ = asr_model.train()
+    return asr_model
+def build_model(args, text_aligner, pitch_extractor, bert):
+    assert args.decoder.type in ['istftnet', 'hifigan'], 'Decoder type unknown'
+    if args.decoder.type == "istftnet":
+        from Modules.istftnet import Decoder
+        decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels,
+                resblock_kernel_sizes = args.decoder.resblock_kernel_sizes,
+                upsample_rates = args.decoder.upsample_rates,
+                upsample_initial_channel=args.decoder.upsample_initial_channel,
+                resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
+                upsample_kernel_sizes=args.decoder.upsample_kernel_sizes,
+                gen_istft_n_fft=args.decoder.gen_istft_n_fft, gen_istft_hop_size=args.decoder.gen_istft_hop_size)
+    else:
+        from Modules.hifigan import Decoder
+        decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels,
+                resblock_kernel_sizes = args.decoder.resblock_kernel_sizes,
+                upsample_rates = args.decoder.upsample_rates,
+                upsample_initial_channel=args.decoder.upsample_initial_channel,
+                resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
+                upsample_kernel_sizes=args.decoder.upsample_kernel_sizes)
+    text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
+    predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
+    style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim) # acoustic style encoder
+    predictor_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim) # prosodic style encoder
+    # define diffusion model
+    if args.multispeaker:
+        transformer = StyleTransformer1d(channels=args.style_dim*2,
+                                    context_embedding_features=bert.config.hidden_size,
+                                    context_features=args.style_dim*2,
+                                    **args.diffusion.transformer)
+    else:
+        transformer = Transformer1d(channels=args.style_dim*2,
+                                    context_embedding_features=bert.config.hidden_size,
+                                    **args.diffusion.transformer)
+    diffusion = AudioDiffusionConditional(
+        in_channels=1,
+        embedding_max_length=bert.config.max_position_embeddings,
+        embedding_features=bert.config.hidden_size,
+        embedding_mask_proba=args.diffusion.embedding_mask_proba, # Conditional dropout of batch elements,
+        channels=args.style_dim*2,
+        context_features=args.style_dim*2,
+    )
+    diffusion.diffusion = KDiffusion(
+        net=diffusion.unet,
+        sigma_distribution=LogNormalDistribution(mean = args.diffusion.dist.mean, std = args.diffusion.dist.std),
+        sigma_data=args.diffusion.dist.sigma_data, # a placeholder, will be changed dynamically when start training diffusion model
+        dynamic_threshold=0.0
+    )
+    diffusion.diffusion.net = transformer
+    diffusion.unet = transformer
+    nets = Munch(
+            bert=bert,
+            bert_encoder=nn.Linear(bert.config.hidden_size, args.hidden_dim),
+            predictor=predictor,
+            decoder=decoder,
+            text_encoder=text_encoder,
+            predictor_encoder=predictor_encoder,
+            style_encoder=style_encoder,
+            diffusion=diffusion,
+            text_aligner = text_aligner,
+            pitch_extractor=pitch_extractor,
+            mpd = MultiPeriodDiscriminator(),
+            msd = MultiResSpecDiscriminator(),
+            # slm discriminator head
+            wd = WavLMDiscriminator(args.slm.hidden, args.slm.nlayers, args.slm.initial_channel),
+       )
+    return nets
+def load_checkpoint(model, optimizer, path, load_only_params=True, ignore_modules=[]):
+    state = torch.load(path, map_location='cpu')
+    params = state['net']
+    for key in model:
+        if key in params and key not in ignore_modules:
+            print('%s loaded' % key)
+            model[key].load_state_dict(params[key], strict=False)
+    _ = [model[key].eval() for key in model]
+    if not load_only_params:
+        epoch = state["epoch"]
+        iters = state["iters"]
+        optimizer.load_state_dict(state["optimizer"])
+    else:
+        epoch = 0
+        iters = 0
+    return model, optimizer, epoch, iters

mpd.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:99cb178409d8d768dc41ec1a297f392ac0da1fc49bc1307d5019ac9c657be69e
+size 164447824

msd.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f5c47d1659c30bbb9ff734b062c73048e556ce4e6272d61558c4eff9762b6fc
+size 1139020

pitch_extractor.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8a15c28725403a9d5a479cbcb9f75a0cc62cd1dc32c0248d06e13adb8b7049b2
+size 21028913

predictor.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a1a84eb7a1cc2e81c29c3066db2bf032e7f1dd783769ba0b64c38a74941ab66
+size 64813639

predictor_encoder.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e7f570dc9fee354ac02f161cc1ced4dadd1fa8dfb49f15bc30199465d69056ba
+size 55547155

style_encoder.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0c6778c52c2e4635fed4f77356eca597dee6de3261a259fd4d8cdc0b8283b67f
+size 55546871

text_aligner.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a80a5c2b6298aca1a63211166b4597e95fbfc0f1c185c53e608fb2097ab28b9
+size 31531315

text_encoder.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d03453b1298255ad22d3251e828ee57b66ed601591e0903950716595ef032b08
+size 22432460

text_utils.py ADDED Viewed

	@@ -0,0 +1,26 @@

+# IPA Phonemizer: https://github.com/bootphon/phonemizer
+_pad = "$"
+_punctuation = ';:,.!?¡¿—…"«»“” '
+_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
+# Export all symbols:
+symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
+dicts = {}
+for i in range(len((symbols))):
+    dicts[symbols[i]] = i
+class TextCleaner:
+    def __init__(self, dummy=None):
+        self.word_index_dictionary = dicts
+        print(len(dicts))
+    def __call__(self, text):
+        indexes = []
+        for char in text:
+            try:
+                indexes.append(self.word_index_dictionary[char])
+            except KeyError:
+                print(text)
+        return indexes

training_metrics.png ADDED Viewed

utils.py ADDED Viewed

	@@ -0,0 +1,74 @@

+from monotonic_align import maximum_path
+from monotonic_align import mask_from_lens
+from monotonic_align.core import maximum_path_c
+import numpy as np
+import torch
+import copy
+from torch import nn
+import torch.nn.functional as F
+import torchaudio
+import librosa
+import matplotlib.pyplot as plt
+from munch import Munch
+def maximum_path(neg_cent, mask):
+  """ Cython optimized version.
+  neg_cent: [b, t_t, t_s]
+  mask: [b, t_t, t_s]
+  """
+  device = neg_cent.device
+  dtype = neg_cent.dtype
+  neg_cent =  np.ascontiguousarray(neg_cent.data.cpu().numpy().astype(np.float32))
+  path =  np.ascontiguousarray(np.zeros(neg_cent.shape, dtype=np.int32))
+  t_t_max = np.ascontiguousarray(mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32))
+  t_s_max = np.ascontiguousarray(mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32))
+  maximum_path_c(path, neg_cent, t_t_max, t_s_max)
+  return torch.from_numpy(path).to(device=device, dtype=dtype)
+def get_data_path_list(train_path=None, val_path=None):
+    if train_path is None:
+        train_path = "Data/train_list.txt"
+    if val_path is None:
+        val_path = "Data/val_list.txt"
+    with open(train_path, 'r', encoding='utf-8', errors='ignore') as f:
+        train_list = f.readlines()
+    with open(val_path, 'r', encoding='utf-8', errors='ignore') as f:
+        val_list = f.readlines()
+    return train_list, val_list
+def length_to_mask(lengths):
+    mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+    mask = torch.gt(mask+1, lengths.unsqueeze(1))
+    return mask
+# for norm consistency loss
+def log_norm(x, mean=-4, std=4, dim=2):
+    """
+    normalized log mel -> mel -> norm -> log(norm)
+    """
+    x = torch.log(torch.exp(x * std + mean).norm(dim=dim))
+    return x
+def get_image(arrs):
+    plt.switch_backend('agg')
+    fig = plt.figure()
+    ax = plt.gca()
+    ax.imshow(arrs)
+    return fig
+def recursive_munch(d):
+    if isinstance(d, dict):
+        return Munch((k, recursive_munch(v)) for k, v in d.items())
+    elif isinstance(d, list):
+        return [recursive_munch(v) for v in d]
+    else:
+        return d
+def log_print(message, logger):
+    logger.info(message)
+    print(message)

wd.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b0064fbf02b28a73a1dbae037c63077bc38c661362cfd08402b301606f153dde
+size 4698570