AshwinSankar committed on
Commit
2679972
1 Parent(s): 3d72fa1

Upload model

Browse files
Files changed (3) hide show
  1. config.json +90 -0
  2. configuration_vits.py +240 -0
  3. model.safetensors +3 -0
config.json ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "rasa_boosted",
3
+ "activation_dropout": 0.1,
4
+ "architectures": [
5
+ "IndicVitsModel"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "auto_map": {
9
+ "AutoConfig": "configuration_vits.IndicVitsConfig",
10
+ "AutoModel": "modeling_vits.IndicVitsModel"
11
+ },
12
+ "depth_separable_channels": 2,
13
+ "depth_separable_num_layers": 3,
14
+ "duration_predictor_dropout": 0.5,
15
+ "duration_predictor_filter_channels": 256,
16
+ "duration_predictor_flow_bins": 10,
17
+ "duration_predictor_kernel_size": 3,
18
+ "duration_predictor_num_flows": 4,
19
+ "duration_predictor_tail_bound": 5.0,
20
+ "emotion_embedding_size": 256,
21
+ "ffn_dim": 768,
22
+ "ffn_kernel_size": 3,
23
+ "flow_size": 192,
24
+ "hidden_act": "relu",
25
+ "hidden_dropout": 0.1,
26
+ "hidden_size": 192,
27
+ "initializer_range": 0.02,
28
+ "layer_norm_eps": 1e-05,
29
+ "layerdrop": 0.1,
30
+ "leaky_relu_slope": 0.1,
31
+ "model_type": "indic_vits_model",
32
+ "noise_scale": 0.667,
33
+ "noise_scale_duration": 0.8,
34
+ "num_attention_heads": 2,
35
+ "num_emotions": 32,
36
+ "num_hidden_layers": 6,
37
+ "num_speakers": 1024,
38
+ "posterior_encoder_num_wavenet_layers": 16,
39
+ "prior_encoder_num_flows": 4,
40
+ "prior_encoder_num_wavenet_layers": 4,
41
+ "resblock_dilation_sizes": [
42
+ [
43
+ 1,
44
+ 3,
45
+ 5
46
+ ],
47
+ [
48
+ 1,
49
+ 3,
50
+ 5
51
+ ],
52
+ [
53
+ 1,
54
+ 3,
55
+ 5
56
+ ]
57
+ ],
58
+ "resblock_kernel_sizes": [
59
+ 3,
60
+ 7,
61
+ 11
62
+ ],
63
+ "sampling_rate": 24000,
64
+ "speaker_embedding_size": 256,
65
+ "speaking_rate": 1.0,
66
+ "spectrogram_bins": 513,
67
+ "tokenizer_class": "IndicVitsTokenizer",
68
+ "torch_dtype": "float32",
69
+ "transformers_version": "4.47.1",
70
+ "upsample_initial_channel": 512,
71
+ "upsample_kernel_sizes": [
72
+ 16,
73
+ 16,
74
+ 4,
75
+ 4
76
+ ],
77
+ "upsample_rates": [
78
+ 8,
79
+ 8,
80
+ 2,
81
+ 2
82
+ ],
83
+ "use_bias": true,
84
+ "use_stochastic_duration_prediction": true,
85
+ "vocab_size": 1260,
86
+ "wavenet_dilation_rate": 1,
87
+ "wavenet_dropout": 0.0,
88
+ "wavenet_kernel_size": 5,
89
+ "window_size": 4
90
+ }
configuration_vits.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """VITS model configuration"""
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+ from transformers.utils import logging
5
+
6
+
7
+ logger = logging.get_logger(__name__)
8
+
9
+
10
class IndicVitsConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of an `IndicVitsModel`. It is used to instantiate a
    VITS model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a configuration similar to that of the VITS
    [facebook/mms-tts-eng](https://huggingface.co/facebook/mms-tts-eng) architecture, apart from the 24 kHz default
    sampling rate and the additional emotion-embedding arguments.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 38):
            Vocabulary size of the VITS model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed to the forward method of `IndicVitsModel`.
        hidden_size (`int`, *optional*, defaults to 192):
            Dimensionality of the text encoder layers.
        num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 2):
            Number of attention heads for each attention layer in the Transformer encoder.
        window_size (`int`, *optional*, defaults to 4):
            Window size for the relative positional embeddings in the attention layers of the Transformer encoder.
        use_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the key, query, value projection layers in the Transformer encoder.
        ffn_dim (`int`, *optional*, defaults to 768):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        layerdrop (`float`, *optional*, defaults to 0.1):
            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        ffn_kernel_size (`int`, *optional*, defaults to 3):
            Kernel size of the 1D convolution layers used by the feed-forward network in the Transformer encoder.
        flow_size (`int`, *optional*, defaults to 192):
            Dimensionality of the flow layers.
        spectrogram_bins (`int`, *optional*, defaults to 513):
            Number of frequency bins in the target spectrogram.
        hidden_act (`str` or `function`, *optional*, defaults to `"relu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        hidden_dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings and encoder.
        attention_dropout (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.1):
            The dropout ratio for activations inside the fully connected layer.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        use_stochastic_duration_prediction (`bool`, *optional*, defaults to `True`):
            Whether to use the stochastic duration prediction module or the regular duration predictor.
        num_speakers (`int`, *optional*, defaults to 1):
            Number of speakers if this is a multi-speaker model.
        speaker_embedding_size (`int`, *optional*, defaults to 0):
            Number of channels used by the speaker embeddings. Is zero for single-speaker models.
        num_emotions (`int`, *optional*, defaults to 1):
            Number of emotions if this is a multi-emotion model. NOTE(review): meaning inferred by analogy with
            `num_speakers`; confirm against the modeling code.
        emotion_embedding_size (`int`, *optional*, defaults to 0):
            Number of channels used by the emotion embeddings. NOTE(review): presumably zero disables emotion
            conditioning, mirroring `speaker_embedding_size`; confirm against the modeling code.
        upsample_initial_channel (`int`, *optional*, defaults to 512):
            The number of input channels into the HiFi-GAN upsampling network.
        upsample_rates (`Tuple[int]` or `List[int]`, *optional*, defaults to `[8, 8, 2, 2]`):
            A tuple of integers defining the stride of each 1D convolutional layer in the HiFi-GAN upsampling network.
            The length of `upsample_rates` defines the number of convolutional layers and has to match the length of
            `upsample_kernel_sizes`.
        upsample_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[16, 16, 4, 4]`):
            A tuple of integers defining the kernel size of each 1D convolutional layer in the HiFi-GAN upsampling
            network. The length of `upsample_kernel_sizes` defines the number of convolutional layers and has to match
            the length of `upsample_rates`.
        resblock_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[3, 7, 11]`):
            A tuple of integers defining the kernel sizes of the 1D convolutional layers in the HiFi-GAN
            multi-receptive field fusion (MRF) module.
        resblock_dilation_sizes (`Tuple[Tuple[int]]` or `List[List[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
            A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the
            HiFi-GAN multi-receptive field fusion (MRF) module.
        leaky_relu_slope (`float`, *optional*, defaults to 0.1):
            The angle of the negative slope used by the leaky ReLU activation.
        depth_separable_channels (`int`, *optional*, defaults to 2):
            Number of channels to use in each depth-separable block.
        depth_separable_num_layers (`int`, *optional*, defaults to 3):
            Number of convolutional layers to use in each depth-separable block.
        duration_predictor_flow_bins (`int`, *optional*, defaults to 10):
            Number of channels to map using the unconstrained rational spline in the duration predictor model.
        duration_predictor_tail_bound (`float`, *optional*, defaults to 5.0):
            Value of the tail bin boundary when computing the unconstrained rational spline in the duration predictor
            model.
        duration_predictor_kernel_size (`int`, *optional*, defaults to 3):
            Kernel size of the 1D convolution layers used in the duration predictor model.
        duration_predictor_dropout (`float`, *optional*, defaults to 0.5):
            The dropout ratio for the duration predictor model.
        duration_predictor_num_flows (`int`, *optional*, defaults to 4):
            Number of flow stages used by the duration predictor model.
        duration_predictor_filter_channels (`int`, *optional*, defaults to 256):
            Number of channels for the convolution layers used in the duration predictor model.
        prior_encoder_num_flows (`int`, *optional*, defaults to 4):
            Number of flow stages used by the prior encoder flow model.
        prior_encoder_num_wavenet_layers (`int`, *optional*, defaults to 4):
            Number of WaveNet layers used by the prior encoder flow model.
        posterior_encoder_num_wavenet_layers (`int`, *optional*, defaults to 16):
            Number of WaveNet layers used by the posterior encoder model.
        wavenet_kernel_size (`int`, *optional*, defaults to 5):
            Kernel size of the 1D convolution layers used in the WaveNet model.
        wavenet_dilation_rate (`int`, *optional*, defaults to 1):
            Dilation rates of the dilated 1D convolutional layers used in the WaveNet model.
        wavenet_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the WaveNet layers.
        speaking_rate (`float`, *optional*, defaults to 1.0):
            Speaking rate. Larger values give faster synthesised speech.
        noise_scale (`float`, *optional*, defaults to 0.667):
            How random the speech prediction is. Larger values create more variation in the predicted speech.
        noise_scale_duration (`float`, *optional*, defaults to 0.8):
            How random the duration prediction is. Larger values create more variation in the predicted durations.
        sampling_rate (`int`, *optional*, defaults to 24000):
            The sampling rate at which the output audio waveform is digitalized expressed in hertz (Hz).

    Raises:
        ValueError: If `upsample_kernel_sizes` and `upsample_rates` do not have the same length.

    Note:
        The four sequence arguments (`upsample_rates`, `upsample_kernel_sizes`, `resblock_kernel_sizes` and
        `resblock_dilation_sizes`) are copied into fresh lists, so tuples passed in are stored as lists and the
        configuration never aliases the caller's (or the signature's default) objects.

    Example:

    ```python
    >>> from configuration_vits import IndicVitsConfig

    >>> # Initializing a configuration with the default arguments
    >>> configuration = IndicVitsConfig()

    >>> # Accessing a configuration value
    >>> configuration.sampling_rate
    24000
    ```"""

    model_type = "indic_vits_model"

    def __init__(
        self,
        vocab_size=38,
        hidden_size=192,
        num_hidden_layers=6,
        num_attention_heads=2,
        window_size=4,
        use_bias=True,
        ffn_dim=768,
        layerdrop=0.1,
        ffn_kernel_size=3,
        flow_size=192,
        spectrogram_bins=513,
        hidden_act="relu",
        hidden_dropout=0.1,
        attention_dropout=0.1,
        activation_dropout=0.1,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        use_stochastic_duration_prediction=True,
        num_speakers=1,
        speaker_embedding_size=0,
        num_emotions=1,
        emotion_embedding_size=0,
        upsample_initial_channel=512,
        upsample_rates=[8, 8, 2, 2],
        upsample_kernel_sizes=[16, 16, 4, 4],
        resblock_kernel_sizes=[3, 7, 11],
        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        leaky_relu_slope=0.1,
        depth_separable_channels=2,
        depth_separable_num_layers=3,
        duration_predictor_flow_bins=10,
        duration_predictor_tail_bound=5.0,
        duration_predictor_kernel_size=3,
        duration_predictor_dropout=0.5,
        duration_predictor_num_flows=4,
        duration_predictor_filter_channels=256,
        prior_encoder_num_flows=4,
        prior_encoder_num_wavenet_layers=4,
        posterior_encoder_num_wavenet_layers=16,
        wavenet_kernel_size=5,
        wavenet_dilation_rate=1,
        wavenet_dropout=0.0,
        speaking_rate=1.0,
        noise_scale=0.667,
        noise_scale_duration=0.8,
        sampling_rate=24_000,
        **kwargs,
    ):
        # Fail fast, before any state is stored: the upsampling network pairs
        # each stride with a kernel size, so the two sequences must line up.
        if len(upsample_kernel_sizes) != len(upsample_rates):
            raise ValueError(
                f"The length of `upsample_kernel_sizes` ({len(upsample_kernel_sizes)}) must match the length of "
                f"`upsample_rates` ({len(upsample_rates)})"
            )

        # Text encoder.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.window_size = window_size
        self.use_bias = use_bias
        self.ffn_dim = ffn_dim
        self.layerdrop = layerdrop
        self.ffn_kernel_size = ffn_kernel_size
        self.flow_size = flow_size
        self.spectrogram_bins = spectrogram_bins
        self.hidden_act = hidden_act
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.use_stochastic_duration_prediction = use_stochastic_duration_prediction

        # Speaker / emotion conditioning.
        self.num_speakers = num_speakers
        self.speaker_embedding_size = speaker_embedding_size
        self.num_emotions = num_emotions
        self.emotion_embedding_size = emotion_embedding_size

        # Vocoder. The sequence arguments are copied so that instances never
        # share (or mutate) the list objects used as defaults in the signature.
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_rates = list(upsample_rates)
        self.upsample_kernel_sizes = list(upsample_kernel_sizes)
        self.resblock_kernel_sizes = list(resblock_kernel_sizes)
        self.resblock_dilation_sizes = [list(dilations) for dilations in resblock_dilation_sizes]
        self.leaky_relu_slope = leaky_relu_slope

        # Duration predictor.
        self.depth_separable_channels = depth_separable_channels
        self.depth_separable_num_layers = depth_separable_num_layers
        self.duration_predictor_flow_bins = duration_predictor_flow_bins
        self.duration_predictor_tail_bound = duration_predictor_tail_bound
        self.duration_predictor_kernel_size = duration_predictor_kernel_size
        self.duration_predictor_dropout = duration_predictor_dropout
        self.duration_predictor_num_flows = duration_predictor_num_flows
        self.duration_predictor_filter_channels = duration_predictor_filter_channels

        # Prior / posterior encoders and WaveNet layers.
        self.prior_encoder_num_flows = prior_encoder_num_flows
        self.prior_encoder_num_wavenet_layers = prior_encoder_num_wavenet_layers
        self.posterior_encoder_num_wavenet_layers = posterior_encoder_num_wavenet_layers
        self.wavenet_kernel_size = wavenet_kernel_size
        self.wavenet_dilation_rate = wavenet_dilation_rate
        self.wavenet_dropout = wavenet_dropout

        # Inference-time synthesis controls.
        self.speaking_rate = speaking_rate
        self.noise_scale = noise_scale
        self.noise_scale_duration = noise_scale_duration
        self.sampling_rate = sampling_rate

        super().__init__(**kwargs)
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0596e963e176aa71b4f581ea3e69d9deceff4ae20caa2752b6ffa970e721fc91
3
+ size 160708568