upload model

Files changed (7) hide show

.gitattributes +1 -0
config.json +84 -0
hyperparams.yaml +86 -0
model.ckpt +3 -0
preprocessor_config.json +9 -0
tokenizer.ckpt +3 -0
wav2vec2.ckpt +3 -0

.gitattributes CHANGED Viewed

@@ -31,3 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text

config.json ADDED Viewed

	@@ -0,0 +1,84 @@

+{
+  "_name_or_path": "asafaya/hubert-large-arabic",
+  "speechbrain_interface": "EncoderASR",
+  "activation_dropout": 0.0,
+  "apply_spec_augment": true,
+  "architectures": [
+    "HubertModel"
+  ],
+  "attention_dropout": 0.1,
+  "bos_token_id": 1,
+  "classifier_proj_size": 256,
+  "conv_bias": true,
+  "conv_dim": [
+    512,
+    512,
+    512,
+    512,
+    512,
+    512,
+    512
+  ],
+  "conv_kernel": [
+    10,
+    3,
+    3,
+    3,
+    3,
+    2,
+    2
+  ],
+  "conv_stride": [
+    5,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "ctc_loss_reduction": "sum",
+  "ctc_zero_infinity": false,
+  "do_stable_layer_norm": true,
+  "eos_token_id": 2,
+  "feat_extract_activation": "gelu",
+  "feat_extract_dropout": 0.0,
+  "feat_extract_norm": "layer",
+  "feat_proj_dropout": 0.1,
+  "feat_proj_layer_norm": true,
+  "final_dropout": 0.0,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "layerdrop": 0.1,
+  "mask_channel_length": 10,
+  "mask_channel_min_space": 1,
+  "mask_channel_other": 0.0,
+  "mask_channel_prob": 0.0,
+  "mask_channel_selection": "static",
+  "mask_feature_length": 10,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.0,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_min_space": 1,
+  "mask_time_other": 0.0,
+  "mask_time_prob": 0.075,
+  "mask_time_selection": "static",
+  "model_type": "hubert",
+  "num_attention_heads": 16,
+  "num_conv_pos_embedding_groups": 16,
+  "num_conv_pos_embeddings": 128,
+  "num_feat_extract_layers": 7,
+  "num_hidden_layers": 24,
+  "pad_token_id": 0,
+  "tokenizer_class": "Wav2Vec2CTCTokenizer",
+  "torch_dtype": "float32",
+  "transformers_version": "4.16.2",
+  "use_weighted_layer_sum": false,
+  "vocab_size": 500
+}

hyperparams.yaml ADDED Viewed

	@@ -0,0 +1,86 @@

+# Hugging Face Hub ID of the pretrained Arabic HuBERT-large encoder.
+wav2vec2_hub: asafaya/hubert-large-arabic
+sample_rate: 16000
+# Tokenizer parameters
+token_type: unigram # ["unigram", "bpe", "char"]
+character_coverage: 1.0
+# Model parameters
+activation: !name:torch.nn.GELU
+wav2vec_output_dim: 1024
+dnn_neurons: 1024
+freeze_wav2vec: false
+dropout: 0.2
+# Outputs
+output_neurons: 125 # Number of output tokens (same as ctc_lin n_neurons); blank = 0, bos = 1, eos = 2
+tokenizer: !new:sentencepiece.SentencePieceProcessor
+# Decoding parameters
+# Make sure the bos and eos indices match those used by the tokenizer.
+blank_index: 0
+bos_index: 1
+eos_index: 2
+enc: &id002 !new:speechbrain.nnet.containers.Sequential
+  input_shape: [null, null, 1024]
+  linear1: !name:speechbrain.nnet.linear.Linear
+    n_neurons: 1024
+    bias: true
+  bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
+  activation: !new:torch.nn.GELU
+  drop: !new:torch.nn.Dropout
+    p: 0.2
+  linear2: !name:speechbrain.nnet.linear.Linear
+    n_neurons: 1024
+    bias: true
+  bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
+  activation2: !new:torch.nn.GELU
+  drop2: !new:torch.nn.Dropout
+    p: 0.2
+  linear3: !name:speechbrain.nnet.linear.Linear
+    n_neurons: 1024
+    bias: true
+  bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
+  activation3: !new:torch.nn.GELU
+wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
+  source: asafaya/hubert-large-arabic
+  output_norm: true
+  freeze: false
+  save_path: wav2vec2_checkpoint
+ctc_lin: !new:speechbrain.nnet.linear.Linear
+  input_size: 1024
+  n_neurons: 125
+log_softmax: !new:speechbrain.nnet.activations.Softmax
+  apply_log: true
+ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
+  blank_index: 0
+modules:
+  encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
+    wav2vec2: !ref <wav2vec2>
+    enc: !ref <enc>
+    ctc_lin: !ref <ctc_lin>
+model: !new:torch.nn.ModuleList
+- [!ref <enc>, !ref <ctc_lin>]
+error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
+cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
+  split_tokens: true
+decoding_function: !name:speechbrain.decoders.ctc.ctc_greedy_decode
+  blank_id: !ref <blank_index>
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+  loadables:
+    wav2vec2: !ref <wav2vec2>
+    model: !ref <model>
+    tokenizer: !ref <tokenizer>

model.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4b0a97590671d8b8928824205c9746fed51c2d29f301aebcf254f9cf61795298
+size 13164862

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "do_normalize": true,
+    "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+    "feature_size": 1,
+    "padding_side": "right",
+    "padding_value": 0,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+}

tokenizer.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:218ec5dc7632d1f191bba36d9a499d883ffcf43a2c1dcaf025f130335672bf93
+size 239537

wav2vec2.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1eb5ffad02a5bf771d3b845154de53dc131fa679c9456a3502c19c7c061fa0e5
+size 1261933253