ihanif committed on
Commit
e10304a
·
1 Parent(s): 395f05a

add wav2vec asr without language model

Browse files
Files changed (43) hide show
  1. README.md +93 -0
  2. added_tokens.json +4 -0
  3. all_results.json +15 -0
  4. config.json +107 -0
  5. eval.py +146 -0
  6. eval_results.json +10 -0
  7. google_fleurs_ps_af_test_eval_results.txt +2 -0
  8. log_google_fleurs_ps_af_test_predictions.txt +0 -0
  9. log_google_fleurs_ps_af_test_targets.txt +0 -0
  10. preprocessor_config.json +9 -0
  11. pytorch_model.bin +3 -0
  12. run.sh +33 -0
  13. run_2.sh +35 -0
  14. run_3.sh +34 -0
  15. run_speech_recognition_ctc.py +772 -0
  16. runs/Dec14_13-29-56_129-213-22-31/1671025286.7583845/events.out.tfevents.1671025286.129-213-22-31.83694.1 +3 -0
  17. runs/Dec14_13-29-56_129-213-22-31/events.out.tfevents.1671025286.129-213-22-31.83694.0 +3 -0
  18. runs/Dec14_13-29-56_129-213-22-31/events.out.tfevents.1671041124.129-213-22-31.83694.2 +3 -0
  19. runs/Dec16_13-55-02_129-146-104-29/1671199202.2565184/events.out.tfevents.1671199202.129-146-104-29.128095.1 +3 -0
  20. runs/Dec16_13-55-02_129-146-104-29/events.out.tfevents.1671199202.129-146-104-29.128095.0 +3 -0
  21. runs/Dec16_13-55-02_129-146-104-29/events.out.tfevents.1671201437.129-146-104-29.128095.2 +3 -0
  22. runs/Dec16_14-39-42_129-146-104-29/1671201754.79521/events.out.tfevents.1671201754.129-146-104-29.129288.1 +3 -0
  23. runs/Dec16_14-39-42_129-146-104-29/events.out.tfevents.1671201754.129-146-104-29.129288.0 +3 -0
  24. runs/Dec16_14-39-42_129-146-104-29/events.out.tfevents.1671204006.129-146-104-29.129288.2 +3 -0
  25. runs/Dec16_15-29-40_129-146-104-29/1671204751.2903225/events.out.tfevents.1671204751.129-146-104-29.131453.1 +3 -0
  26. runs/Dec16_15-29-40_129-146-104-29/events.out.tfevents.1671204751.129-146-104-29.131453.0 +3 -0
  27. runs/Dec16_15-39-46_129-146-104-29/1671205356.7546594/events.out.tfevents.1671205356.129-146-104-29.131763.1 +3 -0
  28. runs/Dec16_15-39-46_129-146-104-29/events.out.tfevents.1671205356.129-146-104-29.131763.0 +3 -0
  29. runs/Dec16_15-39-46_129-146-104-29/events.out.tfevents.1671207700.129-146-104-29.131763.2 +3 -0
  30. runs/Dec16_20-12-50_129-146-104-29/1671221741.6851091/events.out.tfevents.1671221741.129-146-104-29.144289.1 +3 -0
  31. runs/Dec16_20-12-50_129-146-104-29/events.out.tfevents.1671221741.129-146-104-29.144289.0 +3 -0
  32. runs/Dec16_20-12-50_129-146-104-29/events.out.tfevents.1671223965.129-146-104-29.144289.2 +3 -0
  33. runs/Dec16_20-56-58_129-146-104-29/1671224389.6246047/events.out.tfevents.1671224389.129-146-104-29.146388.1 +3 -0
  34. runs/Dec16_20-56-58_129-146-104-29/events.out.tfevents.1671224389.129-146-104-29.146388.0 +3 -0
  35. runs/Dec16_21-09-39_129-146-104-29/1671225152.348097/events.out.tfevents.1671225152.129-146-104-29.146624.1 +3 -0
  36. runs/Dec16_21-09-39_129-146-104-29/events.out.tfevents.1671225152.129-146-104-29.146624.0 +3 -0
  37. runs/Dec16_21-09-39_129-146-104-29/events.out.tfevents.1671227491.129-146-104-29.146624.2 +3 -0
  38. special_tokens_map.json +120 -0
  39. tokenizer_config.json +13 -0
  40. train_results.json +8 -0
  41. trainer_state.json +3745 -0
  42. training_args.bin +3 -0
  43. vocab.json +117 -0
README.md ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - google/fleurs
5
+ - generated_from_trainer
6
+ - automatic-speech-recognition
7
+ - hf-asr-leaderboard
8
+ - pashto
9
+ - ps
10
+ datasets:
11
+ - fleurs
12
+ metrics:
13
+ - wer
14
+ model-index:
15
+ - name: facebook/wav2vec2-xls-r-300m
16
+ results:
17
+ - task:
18
+ name: Automatic Speech Recognition
19
+ type: automatic-speech-recognition
20
+ dataset:
21
+ name: google/fleurs
22
+ type: google/fleurs
23
+ args: 'config: ps_af, split: test'
24
+ metrics:
25
+ - name: Wer
26
+ type: wer
27
+ value: 0.5159447476125512
28
+ ---
29
+
30
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
31
+ should probably proofread and complete it, then remove this comment. -->
32
+
33
+ # facebook/wav2vec2-xls-r-300m
34
+
35
+ This model is a fine-tuned version of [facebook/wav2vec2-xls-r-300m](https://huggingface.co/facebook/wav2vec2-xls-r-300m) on the GOOGLE/FLEURS - PS_AF dataset.
36
+ It achieves the following results on the evaluation set:
37
+ - Loss: 0.9162
38
+ - Wer: 0.5159
39
+ - Cer: 0.1972
40
+
41
+ ## Model description
42
+
43
+ More information needed
44
+
45
+ ## Intended uses & limitations
46
+
47
+ More information needed
48
+
49
+ ## Training and evaluation data
50
+
51
+ More information needed
52
+
53
+ ## Training procedure
54
+
55
+ ### Training hyperparameters
56
+
57
+ The following hyperparameters were used during training:
58
+ - learning_rate: 7.5e-07
59
+ - train_batch_size: 16
60
+ - eval_batch_size: 16
61
+ - seed: 42
62
+ - gradient_accumulation_steps: 2
63
+ - total_train_batch_size: 32
64
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
65
+ - lr_scheduler_type: linear
66
+ - lr_scheduler_warmup_steps: 1000
67
+ - training_steps: 6000
68
+ - mixed_precision_training: Native AMP
69
+
70
+ ### Training results
71
+
72
+ | Training Loss | Epoch | Step | Cer | Validation Loss | Wer |
73
+ |:-------------:|:-----:|:----:|:------:|:---------------:|:------:|
74
+ | 5.0767 | 6.33 | 500 | 1.0 | 4.8783 | 1.0 |
75
+ | 3.1156 | 12.66 | 1000 | 1.0 | 3.0990 | 1.0 |
76
+ | 1.3506 | 18.99 | 1500 | 0.2889 | 1.1056 | 0.7031 |
77
+ | 0.9997 | 25.32 | 2000 | 0.2301 | 0.9191 | 0.5944 |
78
+ | 0.7838 | 31.65 | 2500 | 0.2152 | 0.8952 | 0.5556 |
79
+ | 0.6665 | 37.97 | 3000 | 0.2017 | 0.8908 | 0.5252 |
80
+ | 0.6265 | 44.3 | 3500 | 0.1954 | 0.9063 | 0.5133 |
81
+ | 0.5935 | 50.63 | 4000 | 0.1969 | 0.9162 | 0.5156 |
82
+ | 0.5174 | 56.96 | 4500 | 0.1972 | 0.9287 | 0.5140 |
83
+ | 0.5462 | 63.29 | 5000 | 0.1974 | 0.9370 | 0.5138 |
84
+ | 0.5564 | 69.62 | 5500 | 0.1977 | 0.9461 | 0.5148 |
85
+ | 0.5252 | 75.95 | 6000 | 0.1969 | 0.9505 | 0.5118 |
86
+
87
+
88
+ ### Framework versions
89
+
90
+ - Transformers 4.26.0.dev0
91
+ - Pytorch 1.13.1+cu117
92
+ - Datasets 2.7.1.dev0
93
+ - Tokenizers 0.13.2
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "</s>": 116,
3
+ "<s>": 115
4
+ }
all_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 75.95,
3
+ "eval_cer": 0.1972293657199707,
4
+ "eval_loss": 0.9162325859069824,
5
+ "eval_runtime": 45.3436,
6
+ "eval_samples": 481,
7
+ "eval_samples_per_second": 10.608,
8
+ "eval_steps_per_second": 0.684,
9
+ "eval_wer": 0.5159447476125512,
10
+ "train_loss": 0.044292491674423215,
11
+ "train_runtime": 2233.4842,
12
+ "train_samples": 2528,
13
+ "train_samples_per_second": 85.964,
14
+ "train_steps_per_second": 2.686
15
+ }
config.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-xls-r-300m",
3
+ "activation_dropout": 0.1,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForCTC"
10
+ ],
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 768,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": true,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "mean",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": true,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_dropout": 0.0,
51
+ "feat_extract_norm": "layer",
52
+ "feat_proj_dropout": 0.0,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.0,
55
+ "hidden_act": "gelu",
56
+ "hidden_dropout": 0.0,
57
+ "hidden_size": 1024,
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 4096,
60
+ "layer_norm_eps": 1e-05,
61
+ "layerdrop": 0.0,
62
+ "mask_feature_length": 64,
63
+ "mask_feature_min_masks": 0,
64
+ "mask_feature_prob": 0.1,
65
+ "mask_time_length": 10,
66
+ "mask_time_min_masks": 2,
67
+ "mask_time_prob": 0.3,
68
+ "model_type": "wav2vec2",
69
+ "num_adapter_layers": 3,
70
+ "num_attention_heads": 16,
71
+ "num_codevector_groups": 2,
72
+ "num_codevectors_per_group": 320,
73
+ "num_conv_pos_embedding_groups": 16,
74
+ "num_conv_pos_embeddings": 128,
75
+ "num_feat_extract_layers": 7,
76
+ "num_hidden_layers": 24,
77
+ "num_negatives": 100,
78
+ "output_hidden_size": 1024,
79
+ "pad_token_id": 114,
80
+ "proj_codevector_dim": 768,
81
+ "tdnn_dilation": [
82
+ 1,
83
+ 2,
84
+ 3,
85
+ 1,
86
+ 1
87
+ ],
88
+ "tdnn_dim": [
89
+ 512,
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 1500
94
+ ],
95
+ "tdnn_kernel": [
96
+ 5,
97
+ 3,
98
+ 3,
99
+ 1,
100
+ 1
101
+ ],
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.26.0.dev0",
104
+ "use_weighted_layer_sum": false,
105
+ "vocab_size": 117,
106
+ "xvector_output_dim": 512
107
+ }
eval.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ python3 eval.py --model_id ./wav2vec2-xlsr-300m-pashto --dataset google/fleurs --config ps_af --split test
3
+
4
+ '''
5
+ #!/usr/bin/env python3
6
+ import argparse
7
+ import re
8
+ from typing import Dict
9
+
10
+ import torch
11
+ from datasets import Audio, Dataset, load_dataset, load_metric
12
+
13
+ from transformers import AutoFeatureExtractor, pipeline
14
+
15
+
16
def log_results(result: Dataset, args: Dict[str, str]):
    """Compute WER and CER for the predictions, print them and write them to disk.

    The original template marks this function "DO NOT CHANGE"; the metric
    computation below is therefore left exactly as it was. Only the file
    handling around it is hardened.

    Args:
        result: Dataset exposing a "prediction" and a "target" column.
        args: Parsed command-line arguments. `dataset`, `config`, `split` and
            `log_outputs` are read as attributes.

    Side effects:
        Writes `<dataset_id>_eval_results.txt` and, when `args.log_outputs` is
        truthy, `log_<dataset_id>_predictions.txt` / `log_<dataset_id>_targets.txt`
        in the current working directory.
    """

    log_outputs = args.log_outputs
    dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])

    # load metric
    wer = load_metric("wer")
    cer = load_metric("cer")

    # compute metrics
    wer_result = wer.compute(
        references=result["target"], predictions=result["prediction"])
    cer_result = cer.compute(
        references=result["target"], predictions=result["prediction"])

    # print & log results
    result_str = f"WER: {wer_result}\nCER: {cer_result}"
    print(result_str)

    # Explicit UTF-8: the platform default encoding may not be able to
    # represent the transcripts written further down.
    with open(f"{dataset_id}_eval_results.txt", "w", encoding="utf-8") as f:
        f.write(result_str)

    # log all results in text file. Possibly interesting for analysis
    # `--log_outputs` is a store_true flag, so it is False (never None) when
    # omitted; test truthiness so the flag actually controls the logging.
    if log_outputs:
        pred_file = f"log_{dataset_id}_predictions.txt"
        target_file = f"log_{dataset_id}_targets.txt"

        with open(pred_file, "w", encoding="utf-8") as p, open(target_file, "w", encoding="utf-8") as t:

            # mapping function to write output
            def write_to_file(batch, i):
                p.write(f"{i}" + "\n")
                p.write(batch["prediction"] + "\n")
                t.write(f"{i}" + "\n")
                t.write(batch["target"] + "\n")

            result.map(write_to_file, with_indices=True)
54
+
55
+
56
def normalize_text(text: str) -> str:
    """Normalize a target transcript before it is compared with a prediction.

    The text is lower-cased, the punctuation characters listed in
    `chars_to_ignore_regex` are removed, and every run of spaces and/or
    newlines is collapsed into a single space. Leading/trailing whitespace is
    not stripped.

    Args:
        text: Raw reference transcription.

    Returns:
        The normalized transcription.
    """

    # IMPORTANT: this should correspond to the chars that were ignored during training.
    # Raw string so that `\-`, `\;` and `\:` reach the regex engine unchanged
    # instead of relying on Python keeping invalid string escapes as-is.
    chars_to_ignore_regex = r'[,?.!\-\;\:"“%‘”�—’…–]'

    text = re.sub(chars_to_ignore_regex, "", text.lower())

    # Collapse any run of newlines/spaces into one space. This replaces the
    # previous ordered split/join list, whose last entries were single spaces
    # and therefore left repeated spaces in place.
    text = re.sub(r"[ \n]+", " ", text)

    return text
71
+
72
+
73
def main(args):
    """Run the ASR pipeline over one dataset split and log WER/CER.

    Args:
        args: Parsed command-line arguments. Reads `dataset`, `config`,
            `split`, `model_id`, `device`, `chunk_length_s` and
            `stride_length_s`; `device` is overwritten when it is None.
    """
    # load dataset (use_auth_token=True is always passed to load_dataset)
    dataset = load_dataset(args.dataset, args.config,
                           split=args.split, use_auth_token=True)

    # for testing: only process the first ten examples as a test
    # dataset = dataset.select(range(10))

    # load processor
    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
    sampling_rate = feature_extractor.sampling_rate

    # resample audio to the sampling rate reported by the feature extractor
    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

    # load eval pipeline
    # no explicit --device: use GPU 0 when CUDA is available, otherwise CPU (-1)
    if args.device is None:
        args.device = 0 if torch.cuda.is_available() else -1
    asr = pipeline("automatic-speech-recognition",
                   model=args.model_id, device=args.device)

    # map function to decode audio
    def map_to_pred(batch):
        prediction = asr(
            batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
        )

        batch["prediction"] = prediction["text"]
        # the reference text is read from the "transcription" column
        batch["target"] = normalize_text(batch["transcription"])
        return batch

    # run inference on all examples; only "prediction" and "target" are kept
    result = dataset.map(map_to_pred, remove_columns=dataset.column_names)

    # compute and log_results
    # do not change function below
    log_results(result, args)
110
+
111
+
112
if __name__ == "__main__":
    # Command-line entry point: parse the evaluation options and hand them to main().
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets",
    )
    parser.add_argument(
        "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
    )
    parser.add_argument("--split", type=str, required=True,
                        help="Split of the dataset. *E.g.* `'test'`")
    # The two help texts below used to advertise defaults of 5 s / 1 s, but the
    # actual default is None, which is forwarded to the pipeline unchanged.
    parser.add_argument(
        "--chunk_length_s",
        type=float,
        default=None,
        help="Chunk length in seconds. If not set, None is passed to the pipeline.",
    )
    parser.add_argument(
        "--stride_length_s",
        type=float,
        default=None,
        help="Stride of the audio chunks in seconds. If not set, None is passed to the pipeline.",
    )
    parser.add_argument(
        "--log_outputs", action="store_true", help="If defined, write outputs to log file for analysis."
    )
    # When omitted, main() picks GPU 0 if CUDA is available and CPU otherwise,
    # so CPU is not an unconditional default.
    parser.add_argument(
        "--device",
        type=int,
        default=None,
        help=(
            "The device to run the pipeline on. -1 for CPU, 0 for the first GPU and so on. "
            "If omitted, the first GPU is used when CUDA is available, otherwise CPU."
        ),
    )
    args = parser.parse_args()

    main(args)
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 75.95,
3
+ "eval_cer": 0.1972293657199707,
4
+ "eval_loss": 0.9162325859069824,
5
+ "eval_runtime": 45.3436,
6
+ "eval_samples": 481,
7
+ "eval_samples_per_second": 10.608,
8
+ "eval_steps_per_second": 0.684,
9
+ "eval_wer": 0.5159447476125512
10
+ }
google_fleurs_ps_af_test_eval_results.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ WER: 0.5107077764277035
2
+ CER: 0.2001802222741381
log_google_fleurs_ps_af_test_predictions.txt ADDED
The diff for this file is too large to render. See raw diff
 
log_google_fleurs_ps_af_test_targets.txt ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16d5bdabf704db530a89e8f651eaa5068007117858043a754242f0b36576ecc4
3
+ size 1262381549
run.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Fine-tune facebook/wav2vec2-xls-r-300m on google/fleurs (ps_af) for 50 epochs.
# All flags belong to ONE command: every line except the last must end with a
# backslash. The last flag uses the name the training script actually defines
# (`freeze_feature_encoder`).
python run_speech_recognition_ctc.py \
    --dataset_name="google/fleurs" \
    --dataset_config_name="ps_af" \
    --model_name_or_path="facebook/wav2vec2-xls-r-300m" \
    --output_dir="./" \
    --overwrite_output_dir \
    --num_train_epochs="50" \
    --per_device_train_batch_size="8" \
    --per_device_eval_batch_size="8" \
    --gradient_accumulation_steps="4" \
    --learning_rate="7.5e-5" \
    --warmup_steps="2000" \
    --evaluation_strategy="steps" \
    --text_column_name="transcription" \
    --save_steps="500" \
    --eval_steps="500" \
    --logging_steps="10" \
    --layerdrop="0.0" \
    --activation_dropout="0.1" \
    --eval_metrics wer cer \
    --save_total_limit="1" \
    --mask_time_prob="0.3" \
    --mask_time_length="10" \
    --mask_feature_prob="0.1" \
    --fp16 \
    --mask_feature_length="64" \
    --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \
    --group_by_length \
    --push_to_hub \
    --do_train --do_eval \
    --gradient_checkpointing \
    --use_auth_token \
    --freeze_feature_encoder="True"
run_2.sh ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Fine-tune facebook/wav2vec2-xls-r-300m on google/fleurs (ps_af) for 4500 steps,
# keeping up to 3 checkpoints and reloading the best one at the end.
# All flags belong to ONE command: every line except the last must end with a
# backslash. The last flag uses the name the training script actually defines
# (`freeze_feature_encoder`).
python run_speech_recognition_ctc.py \
    --dataset_name="google/fleurs" \
    --dataset_config_name="ps_af" \
    --model_name_or_path="facebook/wav2vec2-xls-r-300m" \
    --output_dir="./" \
    --overwrite_output_dir="False" \
    --max_steps="4500" \
    --per_device_train_batch_size="8" \
    --per_device_eval_batch_size="8" \
    --gradient_accumulation_steps="4" \
    --learning_rate="7.5e-5" \
    --warmup_steps="2000" \
    --evaluation_strategy="steps" \
    --text_column_name="transcription" \
    --save_steps="500" \
    --eval_steps="500" \
    --logging_steps="10" \
    --layerdrop="0.0" \
    --activation_dropout="0.1" \
    --eval_metrics wer cer \
    --greater_is_better="False" \
    --load_best_model_at_end \
    --save_total_limit="3" \
    --mask_time_prob="0.3" \
    --mask_time_length="10" \
    --mask_feature_prob="0.1" \
    --fp16 \
    --mask_feature_length="64" \
    --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \
    --group_by_length \
    --push_to_hub \
    --do_train --do_eval \
    --gradient_checkpointing \
    --use_auth_token \
    --freeze_feature_encoder="True"
run_3.sh ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ python run_speech_recognition_ctc.py \
2
+ --dataset_name="google/fleurs" \
3
+ --dataset_config_name="ps_af" \
4
+ --model_name_or_path="facebook/wav2vec2-xls-r-300m" \
5
+ --output_dir="./" \
6
+ --overwrite_output_dir="False" \
7
+ --max_steps="6000" \
8
+ --per_device_train_batch_size="16" \
9
+ --per_device_eval_batch_size="16" \
10
+ --gradient_accumulation_steps="2" \
11
+ --learning_rate="7.5e-7" \
12
+ --warmup_steps="1000" \
13
+ --evaluation_strategy="steps" \
14
+ --text_column_name="transcription" \
15
+ --save_steps="500" \
16
+ --eval_steps="500" \
17
+ --logging_steps="10" \
18
+ --layerdrop="0.0" \
19
+ --activation_dropout="0.1" \
20
+ --eval_metrics wer cer \
21
+ --greater_is_better="False" \
22
+ --load_best_model_at_end \
23
+ --save_total_limit="10" \
24
+ --mask_time_prob="0.3" \
25
+ --mask_time_length="10" \
26
+ --mask_feature_prob="0.1" \
27
+ --fp16 \
28
+ --mask_feature_length="64" \
29
+ --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \
30
+ --group_by_length \
31
+ --push_to_hub \
32
+ --do_train --do_eval \
33
+ --gradient_checkpointing \
34
+ --use_auth_token
run_speech_recognition_ctc.py ADDED
@@ -0,0 +1,772 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
15
+
16
+ """ Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition"""
17
+
18
+ import functools
19
+ import json
20
+ import logging
21
+ import os
22
+ import re
23
+ import sys
24
+ import warnings
25
+ from dataclasses import dataclass, field
26
+ from typing import Dict, List, Optional, Union
27
+
28
+ import datasets
29
+ import numpy as np
30
+ import torch
31
+ from datasets import DatasetDict, load_dataset
32
+
33
+ import evaluate
34
+ import transformers
35
+ from transformers import (
36
+ AutoConfig,
37
+ AutoFeatureExtractor,
38
+ AutoModelForCTC,
39
+ AutoProcessor,
40
+ AutoTokenizer,
41
+ HfArgumentParser,
42
+ Trainer,
43
+ TrainingArguments,
44
+ Wav2Vec2Processor,
45
+ set_seed,
46
+ )
47
+ from transformers.trainer_utils import get_last_checkpoint, is_main_process
48
+ from transformers.utils import check_min_version, send_example_telemetry
49
+ from transformers.utils.versions import require_version
50
+
51
+
52
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
53
+ check_min_version("4.26.0.dev0")
54
+
55
+ require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
56
+
57
+
58
+ logger = logging.getLogger(__name__)
59
+
60
+
61
def list_field(default=None, metadata=None):
    """Build a dataclass field whose default may be a list.

    Args:
        default: Default value for the field. When it is a list, every call of
            the default factory returns a fresh shallow copy, so instances do
            not share (and cannot corrupt) one mutable default.
        metadata: Passed through to `dataclasses.field`.

    Returns:
        The `dataclasses.field(...)` object to assign in a dataclass body.
    """

    def _fresh_default():
        # Copy list defaults; any other value (e.g. None) is returned as-is.
        return list(default) if isinstance(default, list) else default

    return field(default_factory=_fresh_default, metadata=metadata)
63
+
64
+
65
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.

    Every field carries its command-line help text in `metadata["help"]`.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    tokenizer_name_or_path: Optional[str] = field(
        default=None,
        metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    freeze_feature_encoder: bool = field(
        default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
    )
    attention_dropout: float = field(
        default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
    )
    activation_dropout: float = field(
        default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
    )
    feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
    hidden_dropout: float = field(
        default=0.0,
        metadata={
            "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
        },
    )
    final_dropout: float = field(
        default=0.0,
        metadata={"help": "The dropout probability for the final projection layer."},
    )
    # The help fragments below are concatenated by Python, so each continuation
    # starts with a space; without it the words run together ("vectorspan").
    mask_time_prob: float = field(
        default=0.05,
        metadata={
            "help": (
                "Probability of each feature vector along the time axis to be chosen as the start of the vector"
                " span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
                " vectors will be masked along the time axis."
            )
        },
    )
    mask_time_length: int = field(
        default=10,
        metadata={"help": "Length of vector span to mask along the time axis."},
    )
    mask_feature_prob: float = field(
        default=0.0,
        metadata={
            "help": (
                "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
                " span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length``"
                " feature bins will be masked along the feature axis."
            )
        },
    )
    mask_feature_length: int = field(
        default=10,
        metadata={"help": "Length of vector span to mask along the feature axis."},
    )
    layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
    ctc_loss_reduction: Optional[str] = field(
        default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
    )
134
+
135
+
136
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    dataset_name: str = field(
        metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    # Optional: the default is None, so the annotation must allow it.
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_split_name: str = field(
        default="train+validation",
        metadata={
            "help": (
                "The name of the training data set split to use (via the datasets library). Defaults to "
                "'train+validation'"
            )
        },
    )
    eval_split_name: str = field(
        default="test",
        metadata={
            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
        },
    )
    audio_column_name: str = field(
        default="audio",
        metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
    )
    text_column_name: str = field(
        default="text",
        metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of validation examples to this "
                "value if set."
            )
        },
    )
    chars_to_ignore: Optional[List[str]] = list_field(
        default=None,
        metadata={"help": "A list of characters to remove from the transcripts."},
    )
    eval_metrics: List[str] = list_field(
        default=["wer"],
        metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
    )
    max_duration_in_seconds: float = field(
        default=20.0,
        metadata={"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds"},
    )
    min_duration_in_seconds: float = field(
        default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
    )
    preprocessing_only: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to only do data preprocessing and skip training. This is especially useful when data"
                " preprocessing errors out in distributed training due to timeout. In this case, one should run the"
                " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
                " can consequently be loaded in distributed training"
            )
        },
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "If :obj:`True`, will use the token generated when running"
                " :obj:`huggingface-cli login` as HTTP bearer authorization for remote files."
            )
        },
    )
    unk_token: str = field(
        default="[UNK]",
        metadata={"help": "The unk token for the tokenizer"},
    )
    pad_token: str = field(
        default="[PAD]",
        metadata={"help": "The padding token for the tokenizer"},
    )
    word_delimiter_token: str = field(
        default="|",
        metadata={"help": "The word delimiter token for the tokenizer"},
    )
    phoneme_language: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The target language that should be"
                " passed to the tokenizer for tokenization. Note that"
                " this is only relevant if the model classifies the"
                " input audio to a sequence of phoneme sequences."
            )
        },
    )
263
+
264
+
265
+ @dataclass
266
+ class DataCollatorCTCWithPadding:
267
+ """
268
+ Data collator that will dynamically pad the inputs received.
269
+ Args:
270
+ processor (:class:`~transformers.AutoProcessor`)
271
+ The processor used for proccessing the data.
272
+ padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
273
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
274
+ among:
275
+ * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
276
+ sequence if provided).
277
+ * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
278
+ maximum acceptable input length for the model if that argument is not provided.
279
+ * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
280
+ different lengths).
281
+ max_length (:obj:`int`, `optional`):
282
+ Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
283
+ max_length_labels (:obj:`int`, `optional`):
284
+ Maximum length of the ``labels`` returned list and optionally padding length (see above).
285
+ pad_to_multiple_of (:obj:`int`, `optional`):
286
+ If set will pad the sequence to a multiple of the provided value.
287
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
288
+ 7.5 (Volta).
289
+ """
290
+
291
+ processor: AutoProcessor
292
+ padding: Union[bool, str] = "longest"
293
+ pad_to_multiple_of: Optional[int] = None
294
+ pad_to_multiple_of_labels: Optional[int] = None
295
+
296
+ def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
297
+ # split inputs and labels since they have to be of different lengths and need
298
+ # different padding methods
299
+ input_features = [{"input_values": feature["input_values"]} for feature in features]
300
+ label_features = [{"input_ids": feature["labels"]} for feature in features]
301
+
302
+ batch = self.processor.pad(
303
+ input_features,
304
+ padding=self.padding,
305
+ pad_to_multiple_of=self.pad_to_multiple_of,
306
+ return_tensors="pt",
307
+ )
308
+
309
+ labels_batch = self.processor.pad(
310
+ labels=label_features,
311
+ padding=self.padding,
312
+ pad_to_multiple_of=self.pad_to_multiple_of_labels,
313
+ return_tensors="pt",
314
+ )
315
+
316
+ # replace padding with -100 to ignore loss correctly
317
+ labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
318
+
319
+ batch["labels"] = labels
320
+ if "attention_mask" in batch:
321
+ batch["attention_mask"] = batch["attention_mask"].to(torch.long)
322
+
323
+ return batch
324
+
325
+
326
+ def create_vocabulary_from_data(
327
+ datasets: DatasetDict,
328
+ word_delimiter_token: Optional[str] = None,
329
+ unk_token: Optional[str] = None,
330
+ pad_token: Optional[str] = None,
331
+ ):
332
+ # Given training and test labels create vocabulary
333
+ def extract_all_chars(batch):
334
+ all_text = " ".join(batch["target_text"])
335
+ vocab = list(set(all_text))
336
+ return {"vocab": [vocab], "all_text": [all_text]}
337
+
338
+ vocabs = datasets.map(
339
+ extract_all_chars,
340
+ batched=True,
341
+ batch_size=-1,
342
+ keep_in_memory=True,
343
+ remove_columns=datasets["train"].column_names,
344
+ )
345
+
346
+ # take union of all unique characters in each dataset
347
+ vocab_set = functools.reduce(
348
+ lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
349
+ )
350
+
351
+ vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}
352
+
353
+ # replace white space with delimiter token
354
+ if word_delimiter_token is not None:
355
+ vocab_dict[word_delimiter_token] = vocab_dict[" "]
356
+ del vocab_dict[" "]
357
+
358
+ # add unk and pad token
359
+ if unk_token is not None:
360
+ vocab_dict[unk_token] = len(vocab_dict)
361
+
362
+ if pad_token is not None:
363
+ vocab_dict[pad_token] = len(vocab_dict)
364
+
365
+ return vocab_dict
366
+
367
+
368
+ def main():
369
+ # See all possible arguments in src/transformers/training_args.py
370
+ # or by passing the --help flag to this script.
371
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
372
+
373
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
374
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
375
+ # If we pass only one argument to the script and it's the path to a json file,
376
+ # let's parse it to get our arguments.
377
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
378
+ else:
379
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
380
+
381
+ # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
382
+ # information sent is the one passed as arguments along with your Python/PyTorch versions.
383
+ send_example_telemetry("run_speech_recognition_ctc", model_args, data_args)
384
+
385
+ # Detecting last checkpoint.
386
+ last_checkpoint = None
387
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
388
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
389
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
390
+ raise ValueError(
391
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
392
+ "Use --overwrite_output_dir to overcome."
393
+ )
394
+ elif last_checkpoint is not None:
395
+ logger.info(
396
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
397
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
398
+ )
399
+
400
+ # Setup logging
401
+ logging.basicConfig(
402
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
403
+ datefmt="%m/%d/%Y %H:%M:%S",
404
+ handlers=[logging.StreamHandler(sys.stdout)],
405
+ )
406
+ logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
407
+
408
+ # Log on each process the small summary:
409
+ logger.warning(
410
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
411
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
412
+ )
413
+ # Set the verbosity to info of the Transformers logger (on main process only):
414
+ if is_main_process(training_args.local_rank):
415
+ transformers.utils.logging.set_verbosity_info()
416
+ logger.info("Training/evaluation parameters %s", training_args)
417
+
418
+ # Set seed before initializing model.
419
+ set_seed(training_args.seed)
420
+
421
+ # 1. First, let's load the dataset
422
+ raw_datasets = DatasetDict()
423
+
424
+ if training_args.do_train:
425
+ raw_datasets["train"] = load_dataset(
426
+ data_args.dataset_name,
427
+ data_args.dataset_config_name,
428
+ split=data_args.train_split_name,
429
+ use_auth_token=data_args.use_auth_token,
430
+ )
431
+
432
+ if data_args.audio_column_name not in raw_datasets["train"].column_names:
433
+ raise ValueError(
434
+ f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'."
435
+ " Make sure to set `--audio_column_name` to the correct audio column - one of"
436
+ f" {', '.join(raw_datasets['train'].column_names)}."
437
+ )
438
+
439
+ if data_args.text_column_name not in raw_datasets["train"].column_names:
440
+ raise ValueError(
441
+ f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
442
+ "Make sure to set `--text_column_name` to the correct text column - one of "
443
+ f"{', '.join(raw_datasets['train'].column_names)}."
444
+ )
445
+
446
+ if data_args.max_train_samples is not None:
447
+ raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
448
+
449
+ if training_args.do_eval:
450
+ raw_datasets["eval"] = load_dataset(
451
+ data_args.dataset_name,
452
+ data_args.dataset_config_name,
453
+ split=data_args.eval_split_name,
454
+ use_auth_token=data_args.use_auth_token,
455
+ )
456
+
457
+ if data_args.max_eval_samples is not None:
458
+ raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
459
+
460
+ # 2. We remove some special characters from the datasets
461
+ # that make training complicated and do not help in transcribing the speech
462
+ # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
463
+ # that could be easily picked up by the model
464
+ chars_to_ignore_regex = (
465
+ f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
466
+ )
467
+ text_column_name = data_args.text_column_name
468
+
469
+ def remove_special_characters(batch):
470
+ if chars_to_ignore_regex is not None:
471
+ batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
472
+ else:
473
+ batch["target_text"] = batch[text_column_name].lower() + " "
474
+ return batch
475
+
476
+ with training_args.main_process_first(desc="dataset map special characters removal"):
477
+ raw_datasets = raw_datasets.map(
478
+ remove_special_characters,
479
+ remove_columns=[text_column_name],
480
+ desc="remove special characters from datasets",
481
+ )
482
+
483
+ # save special tokens for tokenizer
484
+ word_delimiter_token = data_args.word_delimiter_token
485
+ unk_token = data_args.unk_token
486
+ pad_token = data_args.pad_token
487
+
488
+ # 3. Next, let's load the config as we might need it to create
489
+ # the tokenizer
490
+ # load config
491
+ config = AutoConfig.from_pretrained(
492
+ model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
493
+ )
494
+
495
+ # 4. Next, if no tokenizer file is defined,
496
+ # we create the vocabulary of the model by extracting all unique characters from
497
+ # the training and evaluation datasets
498
+ # We need to make sure that only first rank saves vocabulary
499
+ # make sure all processes wait until vocab is created
500
+ tokenizer_name_or_path = model_args.tokenizer_name_or_path
501
+ tokenizer_kwargs = {}
502
+ if tokenizer_name_or_path is None:
503
+ # save vocab in training output dir
504
+ tokenizer_name_or_path = training_args.output_dir
505
+
506
+ vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
507
+
508
+ with training_args.main_process_first():
509
+ if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
510
+ try:
511
+ os.remove(vocab_file)
512
+ except OSError:
513
+ # in shared file-systems it might be the case that
514
+ # two processes try to delete the vocab file at the same time
515
+ pass
516
+
517
+ with training_args.main_process_first(desc="dataset map vocabulary creation"):
518
+ if not os.path.isfile(vocab_file):
519
+ os.makedirs(tokenizer_name_or_path, exist_ok=True)
520
+ vocab_dict = create_vocabulary_from_data(
521
+ raw_datasets,
522
+ word_delimiter_token=word_delimiter_token,
523
+ unk_token=unk_token,
524
+ pad_token=pad_token,
525
+ )
526
+
527
+ # save vocab dict to be loaded into tokenizer
528
+ with open(vocab_file, "w") as file:
529
+ json.dump(vocab_dict, file)
530
+
531
+ # if tokenizer has just been created
532
+ # it is defined by `tokenizer_class` if present in config else by `model_type`
533
+ tokenizer_kwargs = {
534
+ "config": config if config.tokenizer_class is not None else None,
535
+ "tokenizer_type": config.model_type if config.tokenizer_class is None else None,
536
+ "unk_token": unk_token,
537
+ "pad_token": pad_token,
538
+ "word_delimiter_token": word_delimiter_token,
539
+ }
540
+
541
+ # 5. Now we can instantiate the feature extractor, tokenizer and model
542
+ # Note for distributed training, the .from_pretrained methods guarantee that only
543
+ # one local process can concurrently download model & vocab.
544
+
545
+ # load feature_extractor and tokenizer
546
+ tokenizer = AutoTokenizer.from_pretrained(
547
+ tokenizer_name_or_path,
548
+ use_auth_token=data_args.use_auth_token,
549
+ **tokenizer_kwargs,
550
+ )
551
+ feature_extractor = AutoFeatureExtractor.from_pretrained(
552
+ model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
553
+ )
554
+
555
+ # adapt config
556
+ config.update(
557
+ {
558
+ "feat_proj_dropout": model_args.feat_proj_dropout,
559
+ "attention_dropout": model_args.attention_dropout,
560
+ "hidden_dropout": model_args.hidden_dropout,
561
+ "final_dropout": model_args.final_dropout,
562
+ "mask_time_prob": model_args.mask_time_prob,
563
+ "mask_time_length": model_args.mask_time_length,
564
+ "mask_feature_prob": model_args.mask_feature_prob,
565
+ "mask_feature_length": model_args.mask_feature_length,
566
+ "gradient_checkpointing": training_args.gradient_checkpointing,
567
+ "layerdrop": model_args.layerdrop,
568
+ "ctc_loss_reduction": model_args.ctc_loss_reduction,
569
+ "pad_token_id": tokenizer.pad_token_id,
570
+ "vocab_size": len(tokenizer),
571
+ "activation_dropout": model_args.activation_dropout,
572
+ }
573
+ )
574
+
575
+ # create model
576
+ model = AutoModelForCTC.from_pretrained(
577
+ model_args.model_name_or_path,
578
+ cache_dir=model_args.cache_dir,
579
+ config=config,
580
+ use_auth_token=data_args.use_auth_token,
581
+ )
582
+
583
+ # freeze encoder
584
+ if model_args.freeze_feature_encoder:
585
+ model.freeze_feature_encoder()
586
+
587
+ # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
588
+ # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
589
+ # so that we just need to set the correct target sampling rate and normalize the input
590
+ # via the `feature_extractor`
591
+
592
+ # make sure that dataset decodes audio with correct sampling rate
593
+ dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
594
+ if dataset_sampling_rate != feature_extractor.sampling_rate:
595
+ raw_datasets = raw_datasets.cast_column(
596
+ data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
597
+ )
598
+
599
+ # derive max & min input length for sample rate & max duration
600
+ max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
601
+ min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
602
+ audio_column_name = data_args.audio_column_name
603
+ num_workers = data_args.preprocessing_num_workers
604
+
605
+ # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
606
+ phoneme_language = data_args.phoneme_language
607
+
608
+ # Preprocessing the datasets.
609
+ # We need to read the audio files as arrays and tokenize the targets.
610
+ def prepare_dataset(batch):
611
+ # load audio
612
+ sample = batch[audio_column_name]
613
+
614
+ inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
615
+ batch["input_values"] = inputs.input_values[0]
616
+ batch["input_length"] = len(batch["input_values"])
617
+
618
+ # encode targets
619
+ additional_kwargs = {}
620
+ if phoneme_language is not None:
621
+ additional_kwargs["phonemizer_lang"] = phoneme_language
622
+
623
+ batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
624
+ return batch
625
+
626
+ with training_args.main_process_first(desc="dataset map preprocessing"):
627
+ vectorized_datasets = raw_datasets.map(
628
+ prepare_dataset,
629
+ remove_columns=next(iter(raw_datasets.values())).column_names,
630
+ num_proc=num_workers,
631
+ desc="preprocess datasets",
632
+ )
633
+
634
+ def is_audio_in_length_range(length):
635
+ return length > min_input_length and length < max_input_length
636
+
637
+ # filter out data that is not strictly between min_input_length and max_input_length
638
+ vectorized_datasets = vectorized_datasets.filter(
639
+ is_audio_in_length_range,
640
+ num_proc=num_workers,
641
+ input_columns=["input_length"],
642
+ )
643
+
644
+ # 7. Next, we can prepare the training.
645
+ # Let's use word error rate (WER) as our evaluation metric,
646
+ # instantiate a data collator and the trainer
647
+
648
+ # Define evaluation metrics during training, *i.e.* word error rate, character error rate
649
+ eval_metrics = {metric: evaluate.load(metric) for metric in data_args.eval_metrics}
650
+
651
+ # for large datasets it is advised to run the preprocessing on a
652
+ # single machine first with ``args.preprocessing_only`` since there will most likely
653
+ # be a timeout when running the script in distributed mode.
654
+ # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
655
+ # cached dataset
656
+ if data_args.preprocessing_only:
657
+ logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
658
+ return
659
+
660
+ def compute_metrics(pred):
661
+ pred_logits = pred.predictions
662
+ pred_ids = np.argmax(pred_logits, axis=-1)
663
+
664
+ pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
665
+
666
+ pred_str = tokenizer.batch_decode(pred_ids)
667
+ # we do not want to group tokens when computing the metrics
668
+ label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
669
+
670
+ metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
671
+
672
+ return metrics
673
+
674
+ # Now save everything to be able to create a single processor later
675
+ if is_main_process(training_args.local_rank):
676
+ # save feature extractor, tokenizer and config
677
+ feature_extractor.save_pretrained(training_args.output_dir)
678
+ tokenizer.save_pretrained(training_args.output_dir)
679
+ config.save_pretrained(training_args.output_dir)
680
+
681
+ try:
682
+ processor = AutoProcessor.from_pretrained(training_args.output_dir)
683
+ except (OSError, KeyError):
684
+ warnings.warn(
685
+ "Loading a processor from a feature extractor config that does not"
686
+ " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
687
+ " attribute to your `preprocessor_config.json` file to suppress this warning: "
688
+ " `'processor_class': 'Wav2Vec2Processor'`",
689
+ FutureWarning,
690
+ )
691
+ processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)
692
+
693
+ # Instantiate custom data collator
694
+ data_collator = DataCollatorCTCWithPadding(processor=processor)
695
+
696
+ # Initialize Trainer
697
+ trainer = Trainer(
698
+ model=model,
699
+ data_collator=data_collator,
700
+ args=training_args,
701
+ compute_metrics=compute_metrics,
702
+ train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
703
+ eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
704
+ tokenizer=feature_extractor,
705
+ )
706
+
707
+ # 8. Finally, we can start training
708
+
709
+ # Training
710
+ if training_args.do_train:
711
+
712
+ # use the last checkpoint if one exists
713
+ if last_checkpoint is not None:
714
+ checkpoint = last_checkpoint
715
+ elif os.path.isdir(model_args.model_name_or_path):
716
+ checkpoint = model_args.model_name_or_path
717
+ else:
718
+ checkpoint = None
719
+
720
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
721
+ trainer.save_model()
722
+
723
+ metrics = train_result.metrics
724
+ max_train_samples = (
725
+ data_args.max_train_samples
726
+ if data_args.max_train_samples is not None
727
+ else len(vectorized_datasets["train"])
728
+ )
729
+ metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))
730
+
731
+ trainer.log_metrics("train", metrics)
732
+ trainer.save_metrics("train", metrics)
733
+ trainer.save_state()
734
+
735
+ # Evaluation
736
+ results = {}
737
+ if training_args.do_eval:
738
+ logger.info("*** Evaluate ***")
739
+ metrics = trainer.evaluate()
740
+ max_eval_samples = (
741
+ data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
742
+ )
743
+ metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
744
+
745
+ trainer.log_metrics("eval", metrics)
746
+ trainer.save_metrics("eval", metrics)
747
+
748
+ # Write model card and (optionally) push to hub
749
+ config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
750
+ kwargs = {
751
+ "finetuned_from": model_args.model_name_or_path,
752
+ "tasks": "automatic-speech-recognition",
753
+ "tags": ["automatic-speech-recognition", data_args.dataset_name],
754
+ "dataset_args": (
755
+ f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split:"
756
+ f" {data_args.eval_split_name}"
757
+ ),
758
+ "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
759
+ }
760
+ if "common_voice" in data_args.dataset_name:
761
+ kwargs["language"] = config_name
762
+
763
+ if training_args.push_to_hub:
764
+ trainer.push_to_hub(**kwargs)
765
+ else:
766
+ trainer.create_model_card(**kwargs)
767
+
768
+ return results
769
+
770
+
771
+ if __name__ == "__main__":
772
+ main()
runs/Dec14_13-29-56_129-213-22-31/1671025286.7583845/events.out.tfevents.1671025286.129-213-22-31.83694.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84fe7f9eb9bdee3f0ce972e8ae5cd67b151f661955efd2d0d8e279a1d976c2b6
3
+ size 5629
runs/Dec14_13-29-56_129-213-22-31/events.out.tfevents.1671025286.129-213-22-31.83694.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:868a191ddd017bc677b5d7108d90ae2c4e031d5cf34f17cf5c6cb23be7122dac
3
+ size 70052
runs/Dec14_13-29-56_129-213-22-31/events.out.tfevents.1671041124.129-213-22-31.83694.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56ec8495be91f4bd3dbb31e66bba57037df719175e531b261efc1d005b8ec6bf
3
+ size 405
runs/Dec16_13-55-02_129-146-104-29/1671199202.2565184/events.out.tfevents.1671199202.129-146-104-29.128095.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:414238c9cc401aa54440b67cd9afd1482b1e3d03117800b010df72223777e4b6
3
+ size 5633
runs/Dec16_13-55-02_129-146-104-29/events.out.tfevents.1671199202.129-146-104-29.128095.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de34b04e2c5880b8861a0e14b0100a15eb8fd48621e0e131dc82e4fe1d0301a7
3
+ size 13738
runs/Dec16_13-55-02_129-146-104-29/events.out.tfevents.1671201437.129-146-104-29.128095.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08206f99f4f7b3fbfa384e19ec6ae571dfe46052f6556baacc5e7735d060a873
3
+ size 405
runs/Dec16_14-39-42_129-146-104-29/1671201754.79521/events.out.tfevents.1671201754.129-146-104-29.129288.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd49e0eafc8f61728599067cf82cdb4c8e2dfc804418f88a9bac0a108f945f5a
3
+ size 5633
runs/Dec16_14-39-42_129-146-104-29/events.out.tfevents.1671201754.129-146-104-29.129288.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:519a0f832341b452e5e021bcb75ae3c6153d9aa9caaa5a99261ad7d742e5e630
3
+ size 13738
runs/Dec16_14-39-42_129-146-104-29/events.out.tfevents.1671204006.129-146-104-29.129288.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80d4d15418295c572aa4591c39561da14f6b9f84af7cb628829d3e04efcf370f
3
+ size 405
runs/Dec16_15-29-40_129-146-104-29/1671204751.2903225/events.out.tfevents.1671204751.129-146-104-29.131453.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c70656d64c1d71a68951d71fc889f492d33a57405e8d161c35d2cbb6085a13e4
3
+ size 5633
runs/Dec16_15-29-40_129-146-104-29/events.out.tfevents.1671204751.129-146-104-29.131453.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dd4b02386c90cfc08d2d69cfaf6701c41bf670404378038bb0a43b850490b37
3
+ size 5642
runs/Dec16_15-39-46_129-146-104-29/1671205356.7546594/events.out.tfevents.1671205356.129-146-104-29.131763.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0275113dba0a656e712b957344ac77a8789fbfa165be78d96631e171b0746187
3
+ size 5633
runs/Dec16_15-39-46_129-146-104-29/events.out.tfevents.1671205356.129-146-104-29.131763.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a08d8d7c897ed3b076f8d828040d3477a13578a5a86f64356debf482e5da28d
3
+ size 13738
runs/Dec16_15-39-46_129-146-104-29/events.out.tfevents.1671207700.129-146-104-29.131763.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f83a25a085a2175f629e030fb53b3c3b594db36c16ed1ac8ad53615b96a60088
3
+ size 405
runs/Dec16_20-12-50_129-146-104-29/1671221741.6851091/events.out.tfevents.1671221741.129-146-104-29.144289.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8a651984c4fb45e3bc756b5f150fcfd878800854f2d3260816244f2d70ec758
3
+ size 5633
runs/Dec16_20-12-50_129-146-104-29/events.out.tfevents.1671221741.129-146-104-29.144289.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c94b3ebe03452e093f885924b88ff8bb7938aee4cf25e9deffade7b9711bb853
3
+ size 13738
runs/Dec16_20-12-50_129-146-104-29/events.out.tfevents.1671223965.129-146-104-29.144289.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80e8f010a29167816de5acc6dfa4ebc9166999dc4d6436f409cabb8252b3b9dc
3
+ size 405
runs/Dec16_20-56-58_129-146-104-29/1671224389.6246047/events.out.tfevents.1671224389.129-146-104-29.146388.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61e148fd7677c43d22f7c5ef9aad820a1b1d94af7c61b0bd4e3bbb4160ca62cd
3
+ size 5633
runs/Dec16_20-56-58_129-146-104-29/events.out.tfevents.1671224389.129-146-104-29.146388.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15a413eb22f0a56eddd96a1300a9c14d3c643429ce13383689d03bf93592f4df
3
+ size 6114
runs/Dec16_21-09-39_129-146-104-29/1671225152.348097/events.out.tfevents.1671225152.129-146-104-29.146624.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:312a6a0e0842bed6b7a490b5edd74a368578cf4383631f88c7e50d37d8a93214
3
+ size 5633
runs/Dec16_21-09-39_129-146-104-29/events.out.tfevents.1671225152.129-146-104-29.146624.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f83beec22f9ef84db038caf7ebeecbf3750ce0e6139f7056f09119fed66b249
3
+ size 13741
runs/Dec16_21-09-39_129-146-104-29/events.out.tfevents.1671227491.129-146-104-29.146624.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40319fc688859220e783165235dc6652e723535f892fdb147887467e472c1a18
3
+ size 405
special_tokens_map.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</s>",
12
+ "lstrip": false,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<s>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "</s>",
26
+ "lstrip": false,
27
+ "normalized": true,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "<s>",
33
+ "lstrip": false,
34
+ "normalized": true,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ {
39
+ "content": "</s>",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ {
46
+ "content": "<s>",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false
51
+ },
52
+ {
53
+ "content": "</s>",
54
+ "lstrip": false,
55
+ "normalized": true,
56
+ "rstrip": false,
57
+ "single_word": false
58
+ },
59
+ {
60
+ "content": "<s>",
61
+ "lstrip": false,
62
+ "normalized": true,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ },
66
+ {
67
+ "content": "</s>",
68
+ "lstrip": false,
69
+ "normalized": true,
70
+ "rstrip": false,
71
+ "single_word": false
72
+ },
73
+ {
74
+ "content": "<s>",
75
+ "lstrip": false,
76
+ "normalized": true,
77
+ "rstrip": false,
78
+ "single_word": false
79
+ },
80
+ {
81
+ "content": "</s>",
82
+ "lstrip": false,
83
+ "normalized": true,
84
+ "rstrip": false,
85
+ "single_word": false
86
+ },
87
+ {
88
+ "content": "<s>",
89
+ "lstrip": false,
90
+ "normalized": true,
91
+ "rstrip": false,
92
+ "single_word": false
93
+ },
94
+ {
95
+ "content": "</s>",
96
+ "lstrip": false,
97
+ "normalized": true,
98
+ "rstrip": false,
99
+ "single_word": false
100
+ },
101
+ {
102
+ "content": "<s>",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false
107
+ },
108
+ {
109
+ "content": "</s>",
110
+ "lstrip": false,
111
+ "normalized": true,
112
+ "rstrip": false,
113
+ "single_word": false
114
+ }
115
+ ],
116
+ "bos_token": "<s>",
117
+ "eos_token": "</s>",
118
+ "pad_token": "[PAD]",
119
+ "unk_token": "[UNK]"
120
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "do_lower_case": false,
4
+ "eos_token": "</s>",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "name_or_path": "./",
7
+ "pad_token": "[PAD]",
8
+ "replace_word_delimiter_char": " ",
9
+ "special_tokens_map_file": null,
10
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
11
+ "unk_token": "[UNK]",
12
+ "word_delimiter_token": "|"
13
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 75.95,
3
+ "train_loss": 0.044292491674423215,
4
+ "train_runtime": 2233.4842,
5
+ "train_samples": 2528,
6
+ "train_samples_per_second": 85.964,
7
+ "train_steps_per_second": 2.686
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,3745 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.9162458181381226,
3
+ "best_model_checkpoint": "./checkpoint-4000",
4
+ "epoch": 75.9493670886076,
5
+ "global_step": 6000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.13,
12
+ "learning_rate": 3.7499999999999996e-07,
13
+ "loss": 23.4903,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.25,
18
+ "learning_rate": 7.499999999999999e-07,
19
+ "loss": 21.1248,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.38,
24
+ "learning_rate": 1.1249999999999998e-06,
25
+ "loss": 22.9317,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.51,
30
+ "learning_rate": 1.4625e-06,
31
+ "loss": 20.6205,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.63,
36
+ "learning_rate": 1.8375e-06,
37
+ "loss": 22.2659,
38
+ "step": 50
39
+ },
40
+ {
41
+ "epoch": 0.76,
42
+ "learning_rate": 2.2124999999999996e-06,
43
+ "loss": 21.4276,
44
+ "step": 60
45
+ },
46
+ {
47
+ "epoch": 0.89,
48
+ "learning_rate": 2.5875e-06,
49
+ "loss": 21.8665,
50
+ "step": 70
51
+ },
52
+ {
53
+ "epoch": 1.01,
54
+ "learning_rate": 2.9624999999999996e-06,
55
+ "loss": 20.8487,
56
+ "step": 80
57
+ },
58
+ {
59
+ "epoch": 1.14,
60
+ "learning_rate": 3.3374999999999994e-06,
61
+ "loss": 21.838,
62
+ "step": 90
63
+ },
64
+ {
65
+ "epoch": 1.27,
66
+ "learning_rate": 3.7125e-06,
67
+ "loss": 19.0875,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 1.39,
72
+ "learning_rate": 4.087499999999999e-06,
73
+ "loss": 19.3293,
74
+ "step": 110
75
+ },
76
+ {
77
+ "epoch": 1.52,
78
+ "learning_rate": 4.462499999999999e-06,
79
+ "loss": 16.2192,
80
+ "step": 120
81
+ },
82
+ {
83
+ "epoch": 1.65,
84
+ "learning_rate": 4.8375e-06,
85
+ "loss": 15.0126,
86
+ "step": 130
87
+ },
88
+ {
89
+ "epoch": 1.77,
90
+ "learning_rate": 5.2125e-06,
91
+ "loss": 13.5756,
92
+ "step": 140
93
+ },
94
+ {
95
+ "epoch": 1.9,
96
+ "learning_rate": 5.5874999999999994e-06,
97
+ "loss": 12.5467,
98
+ "step": 150
99
+ },
100
+ {
101
+ "epoch": 2.03,
102
+ "learning_rate": 5.962499999999999e-06,
103
+ "loss": 11.8743,
104
+ "step": 160
105
+ },
106
+ {
107
+ "epoch": 2.15,
108
+ "learning_rate": 6.3375e-06,
109
+ "loss": 11.4262,
110
+ "step": 170
111
+ },
112
+ {
113
+ "epoch": 2.28,
114
+ "learning_rate": 6.712499999999999e-06,
115
+ "loss": 10.3319,
116
+ "step": 180
117
+ },
118
+ {
119
+ "epoch": 2.41,
120
+ "learning_rate": 7.0874999999999995e-06,
121
+ "loss": 10.0354,
122
+ "step": 190
123
+ },
124
+ {
125
+ "epoch": 2.53,
126
+ "learning_rate": 7.4625e-06,
127
+ "loss": 9.8568,
128
+ "step": 200
129
+ },
130
+ {
131
+ "epoch": 2.66,
132
+ "learning_rate": 7.837499999999999e-06,
133
+ "loss": 9.4804,
134
+ "step": 210
135
+ },
136
+ {
137
+ "epoch": 2.78,
138
+ "learning_rate": 8.2125e-06,
139
+ "loss": 9.0891,
140
+ "step": 220
141
+ },
142
+ {
143
+ "epoch": 2.91,
144
+ "learning_rate": 8.5875e-06,
145
+ "loss": 8.6768,
146
+ "step": 230
147
+ },
148
+ {
149
+ "epoch": 3.04,
150
+ "learning_rate": 8.9625e-06,
151
+ "loss": 8.5948,
152
+ "step": 240
153
+ },
154
+ {
155
+ "epoch": 3.16,
156
+ "learning_rate": 9.3375e-06,
157
+ "loss": 8.4701,
158
+ "step": 250
159
+ },
160
+ {
161
+ "epoch": 3.29,
162
+ "learning_rate": 9.712499999999999e-06,
163
+ "loss": 8.2693,
164
+ "step": 260
165
+ },
166
+ {
167
+ "epoch": 3.42,
168
+ "learning_rate": 1.00875e-05,
169
+ "loss": 8.0611,
170
+ "step": 270
171
+ },
172
+ {
173
+ "epoch": 3.54,
174
+ "learning_rate": 1.04625e-05,
175
+ "loss": 8.0222,
176
+ "step": 280
177
+ },
178
+ {
179
+ "epoch": 3.67,
180
+ "learning_rate": 1.0837499999999997e-05,
181
+ "loss": 7.5698,
182
+ "step": 290
183
+ },
184
+ {
185
+ "epoch": 3.8,
186
+ "learning_rate": 1.1212499999999998e-05,
187
+ "loss": 7.5901,
188
+ "step": 300
189
+ },
190
+ {
191
+ "epoch": 3.92,
192
+ "learning_rate": 1.1587499999999999e-05,
193
+ "loss": 7.2712,
194
+ "step": 310
195
+ },
196
+ {
197
+ "epoch": 4.05,
198
+ "learning_rate": 1.19625e-05,
199
+ "loss": 7.4485,
200
+ "step": 320
201
+ },
202
+ {
203
+ "epoch": 4.18,
204
+ "learning_rate": 1.23375e-05,
205
+ "loss": 7.1484,
206
+ "step": 330
207
+ },
208
+ {
209
+ "epoch": 4.3,
210
+ "learning_rate": 1.2712499999999999e-05,
211
+ "loss": 7.1105,
212
+ "step": 340
213
+ },
214
+ {
215
+ "epoch": 4.43,
216
+ "learning_rate": 1.3087499999999998e-05,
217
+ "loss": 6.8399,
218
+ "step": 350
219
+ },
220
+ {
221
+ "epoch": 4.56,
222
+ "learning_rate": 1.3462499999999999e-05,
223
+ "loss": 6.8718,
224
+ "step": 360
225
+ },
226
+ {
227
+ "epoch": 4.68,
228
+ "learning_rate": 1.38375e-05,
229
+ "loss": 6.5045,
230
+ "step": 370
231
+ },
232
+ {
233
+ "epoch": 4.81,
234
+ "learning_rate": 1.4212499999999998e-05,
235
+ "loss": 6.574,
236
+ "step": 380
237
+ },
238
+ {
239
+ "epoch": 4.94,
240
+ "learning_rate": 1.4587499999999999e-05,
241
+ "loss": 6.1716,
242
+ "step": 390
243
+ },
244
+ {
245
+ "epoch": 5.06,
246
+ "learning_rate": 1.49625e-05,
247
+ "loss": 6.3618,
248
+ "step": 400
249
+ },
250
+ {
251
+ "epoch": 5.19,
252
+ "learning_rate": 1.5337499999999997e-05,
253
+ "loss": 6.012,
254
+ "step": 410
255
+ },
256
+ {
257
+ "epoch": 5.32,
258
+ "learning_rate": 1.57125e-05,
259
+ "loss": 6.0979,
260
+ "step": 420
261
+ },
262
+ {
263
+ "epoch": 5.44,
264
+ "learning_rate": 1.6087499999999998e-05,
265
+ "loss": 5.7887,
266
+ "step": 430
267
+ },
268
+ {
269
+ "epoch": 5.57,
270
+ "learning_rate": 1.6462499999999997e-05,
271
+ "loss": 5.892,
272
+ "step": 440
273
+ },
274
+ {
275
+ "epoch": 5.7,
276
+ "learning_rate": 1.68375e-05,
277
+ "loss": 5.4672,
278
+ "step": 450
279
+ },
280
+ {
281
+ "epoch": 5.82,
282
+ "learning_rate": 1.7212499999999998e-05,
283
+ "loss": 5.6353,
284
+ "step": 460
285
+ },
286
+ {
287
+ "epoch": 5.95,
288
+ "learning_rate": 1.7587499999999997e-05,
289
+ "loss": 5.2754,
290
+ "step": 470
291
+ },
292
+ {
293
+ "epoch": 6.08,
294
+ "learning_rate": 1.7962499999999996e-05,
295
+ "loss": 5.3343,
296
+ "step": 480
297
+ },
298
+ {
299
+ "epoch": 6.2,
300
+ "learning_rate": 1.83375e-05,
301
+ "loss": 5.0586,
302
+ "step": 490
303
+ },
304
+ {
305
+ "epoch": 6.33,
306
+ "learning_rate": 1.8712499999999997e-05,
307
+ "loss": 5.0767,
308
+ "step": 500
309
+ },
310
+ {
311
+ "epoch": 6.33,
312
+ "eval_cer": 1.0,
313
+ "eval_loss": 4.878269672393799,
314
+ "eval_runtime": 44.5452,
315
+ "eval_samples_per_second": 10.798,
316
+ "eval_steps_per_second": 1.369,
317
+ "eval_wer": 1.0,
318
+ "step": 500
319
+ },
320
+ {
321
+ "epoch": 6.46,
322
+ "learning_rate": 1.90875e-05,
323
+ "loss": 4.7637,
324
+ "step": 510
325
+ },
326
+ {
327
+ "epoch": 6.58,
328
+ "learning_rate": 1.94625e-05,
329
+ "loss": 4.8495,
330
+ "step": 520
331
+ },
332
+ {
333
+ "epoch": 6.71,
334
+ "learning_rate": 1.9837499999999998e-05,
335
+ "loss": 4.5651,
336
+ "step": 530
337
+ },
338
+ {
339
+ "epoch": 6.84,
340
+ "learning_rate": 2.02125e-05,
341
+ "loss": 4.7084,
342
+ "step": 540
343
+ },
344
+ {
345
+ "epoch": 6.96,
346
+ "learning_rate": 2.05875e-05,
347
+ "loss": 4.3947,
348
+ "step": 550
349
+ },
350
+ {
351
+ "epoch": 7.09,
352
+ "learning_rate": 2.09625e-05,
353
+ "loss": 4.4641,
354
+ "step": 560
355
+ },
356
+ {
357
+ "epoch": 7.22,
358
+ "learning_rate": 2.1337499999999997e-05,
359
+ "loss": 4.2175,
360
+ "step": 570
361
+ },
362
+ {
363
+ "epoch": 7.34,
364
+ "learning_rate": 2.1712499999999996e-05,
365
+ "loss": 4.2767,
366
+ "step": 580
367
+ },
368
+ {
369
+ "epoch": 7.47,
370
+ "learning_rate": 2.2087499999999998e-05,
371
+ "loss": 4.0216,
372
+ "step": 590
373
+ },
374
+ {
375
+ "epoch": 7.59,
376
+ "learning_rate": 2.2462499999999997e-05,
377
+ "loss": 4.0459,
378
+ "step": 600
379
+ },
380
+ {
381
+ "epoch": 7.72,
382
+ "learning_rate": 2.2837499999999996e-05,
383
+ "loss": 3.8726,
384
+ "step": 610
385
+ },
386
+ {
387
+ "epoch": 7.85,
388
+ "learning_rate": 2.32125e-05,
389
+ "loss": 3.8982,
390
+ "step": 620
391
+ },
392
+ {
393
+ "epoch": 7.97,
394
+ "learning_rate": 2.3587499999999997e-05,
395
+ "loss": 3.7419,
396
+ "step": 630
397
+ },
398
+ {
399
+ "epoch": 8.1,
400
+ "learning_rate": 2.39625e-05,
401
+ "loss": 3.7623,
402
+ "step": 640
403
+ },
404
+ {
405
+ "epoch": 8.23,
406
+ "learning_rate": 2.43375e-05,
407
+ "loss": 3.6173,
408
+ "step": 650
409
+ },
410
+ {
411
+ "epoch": 8.35,
412
+ "learning_rate": 2.4712499999999998e-05,
413
+ "loss": 3.625,
414
+ "step": 660
415
+ },
416
+ {
417
+ "epoch": 8.48,
418
+ "learning_rate": 2.50875e-05,
419
+ "loss": 3.4981,
420
+ "step": 670
421
+ },
422
+ {
423
+ "epoch": 8.61,
424
+ "learning_rate": 2.54625e-05,
425
+ "loss": 3.5114,
426
+ "step": 680
427
+ },
428
+ {
429
+ "epoch": 8.73,
430
+ "learning_rate": 2.5837499999999994e-05,
431
+ "loss": 3.42,
432
+ "step": 690
433
+ },
434
+ {
435
+ "epoch": 8.86,
436
+ "learning_rate": 2.6212499999999997e-05,
437
+ "loss": 3.4173,
438
+ "step": 700
439
+ },
440
+ {
441
+ "epoch": 8.99,
442
+ "learning_rate": 2.6587499999999996e-05,
443
+ "loss": 3.3501,
444
+ "step": 710
445
+ },
446
+ {
447
+ "epoch": 9.11,
448
+ "learning_rate": 2.6962499999999998e-05,
449
+ "loss": 3.3289,
450
+ "step": 720
451
+ },
452
+ {
453
+ "epoch": 9.24,
454
+ "learning_rate": 2.7337499999999997e-05,
455
+ "loss": 3.2786,
456
+ "step": 730
457
+ },
458
+ {
459
+ "epoch": 9.37,
460
+ "learning_rate": 2.7712499999999996e-05,
461
+ "loss": 3.2836,
462
+ "step": 740
463
+ },
464
+ {
465
+ "epoch": 9.49,
466
+ "learning_rate": 2.80875e-05,
467
+ "loss": 3.2514,
468
+ "step": 750
469
+ },
470
+ {
471
+ "epoch": 9.62,
472
+ "learning_rate": 2.8462499999999997e-05,
473
+ "loss": 3.2778,
474
+ "step": 760
475
+ },
476
+ {
477
+ "epoch": 9.75,
478
+ "learning_rate": 2.88375e-05,
479
+ "loss": 3.2042,
480
+ "step": 770
481
+ },
482
+ {
483
+ "epoch": 9.87,
484
+ "learning_rate": 2.92125e-05,
485
+ "loss": 3.1964,
486
+ "step": 780
487
+ },
488
+ {
489
+ "epoch": 10.0,
490
+ "learning_rate": 2.9587499999999998e-05,
491
+ "loss": 3.2052,
492
+ "step": 790
493
+ },
494
+ {
495
+ "epoch": 10.13,
496
+ "learning_rate": 2.99625e-05,
497
+ "loss": 3.1989,
498
+ "step": 800
499
+ },
500
+ {
501
+ "epoch": 10.25,
502
+ "learning_rate": 3.03375e-05,
503
+ "loss": 3.1823,
504
+ "step": 810
505
+ },
506
+ {
507
+ "epoch": 10.38,
508
+ "learning_rate": 3.0712499999999994e-05,
509
+ "loss": 3.1613,
510
+ "step": 820
511
+ },
512
+ {
513
+ "epoch": 10.51,
514
+ "learning_rate": 3.10875e-05,
515
+ "loss": 3.1659,
516
+ "step": 830
517
+ },
518
+ {
519
+ "epoch": 10.63,
520
+ "learning_rate": 3.14625e-05,
521
+ "loss": 3.1798,
522
+ "step": 840
523
+ },
524
+ {
525
+ "epoch": 10.76,
526
+ "learning_rate": 3.1837499999999995e-05,
527
+ "loss": 3.1711,
528
+ "step": 850
529
+ },
530
+ {
531
+ "epoch": 10.89,
532
+ "learning_rate": 3.22125e-05,
533
+ "loss": 3.1659,
534
+ "step": 860
535
+ },
536
+ {
537
+ "epoch": 11.01,
538
+ "learning_rate": 3.25875e-05,
539
+ "loss": 3.1309,
540
+ "step": 870
541
+ },
542
+ {
543
+ "epoch": 11.14,
544
+ "learning_rate": 3.2962499999999995e-05,
545
+ "loss": 3.1706,
546
+ "step": 880
547
+ },
548
+ {
549
+ "epoch": 11.27,
550
+ "learning_rate": 3.33375e-05,
551
+ "loss": 3.1441,
552
+ "step": 890
553
+ },
554
+ {
555
+ "epoch": 11.39,
556
+ "learning_rate": 3.37125e-05,
557
+ "loss": 3.1341,
558
+ "step": 900
559
+ },
560
+ {
561
+ "epoch": 11.52,
562
+ "learning_rate": 3.4087499999999995e-05,
563
+ "loss": 3.1594,
564
+ "step": 910
565
+ },
566
+ {
567
+ "epoch": 11.65,
568
+ "learning_rate": 3.44625e-05,
569
+ "loss": 3.1262,
570
+ "step": 920
571
+ },
572
+ {
573
+ "epoch": 11.77,
574
+ "learning_rate": 3.48375e-05,
575
+ "loss": 3.1541,
576
+ "step": 930
577
+ },
578
+ {
579
+ "epoch": 11.9,
580
+ "learning_rate": 3.5212499999999995e-05,
581
+ "loss": 3.1393,
582
+ "step": 940
583
+ },
584
+ {
585
+ "epoch": 12.03,
586
+ "learning_rate": 3.55875e-05,
587
+ "loss": 3.1398,
588
+ "step": 950
589
+ },
590
+ {
591
+ "epoch": 12.15,
592
+ "learning_rate": 3.596249999999999e-05,
593
+ "loss": 3.1551,
594
+ "step": 960
595
+ },
596
+ {
597
+ "epoch": 12.28,
598
+ "learning_rate": 3.6337499999999996e-05,
599
+ "loss": 3.1391,
600
+ "step": 970
601
+ },
602
+ {
603
+ "epoch": 12.41,
604
+ "learning_rate": 3.67125e-05,
605
+ "loss": 3.1236,
606
+ "step": 980
607
+ },
608
+ {
609
+ "epoch": 12.53,
610
+ "learning_rate": 3.7087499999999993e-05,
611
+ "loss": 3.1327,
612
+ "step": 990
613
+ },
614
+ {
615
+ "epoch": 12.66,
616
+ "learning_rate": 3.7462499999999996e-05,
617
+ "loss": 3.1156,
618
+ "step": 1000
619
+ },
620
+ {
621
+ "epoch": 12.66,
622
+ "eval_cer": 1.0,
623
+ "eval_loss": 3.0990264415740967,
624
+ "eval_runtime": 44.9243,
625
+ "eval_samples_per_second": 10.707,
626
+ "eval_steps_per_second": 1.358,
627
+ "eval_wer": 1.0,
628
+ "step": 1000
629
+ },
630
+ {
631
+ "epoch": 12.78,
632
+ "learning_rate": 3.783749999999999e-05,
633
+ "loss": 3.1365,
634
+ "step": 1010
635
+ },
636
+ {
637
+ "epoch": 12.91,
638
+ "learning_rate": 3.8212499999999994e-05,
639
+ "loss": 3.1123,
640
+ "step": 1020
641
+ },
642
+ {
643
+ "epoch": 13.04,
644
+ "learning_rate": 3.8587499999999996e-05,
645
+ "loss": 3.132,
646
+ "step": 1030
647
+ },
648
+ {
649
+ "epoch": 13.16,
650
+ "learning_rate": 3.896249999999999e-05,
651
+ "loss": 3.1218,
652
+ "step": 1040
653
+ },
654
+ {
655
+ "epoch": 13.29,
656
+ "learning_rate": 3.9337499999999994e-05,
657
+ "loss": 3.1266,
658
+ "step": 1050
659
+ },
660
+ {
661
+ "epoch": 13.42,
662
+ "learning_rate": 3.9712499999999996e-05,
663
+ "loss": 3.1247,
664
+ "step": 1060
665
+ },
666
+ {
667
+ "epoch": 13.54,
668
+ "learning_rate": 4.008749999999999e-05,
669
+ "loss": 3.1203,
670
+ "step": 1070
671
+ },
672
+ {
673
+ "epoch": 13.67,
674
+ "learning_rate": 4.0462499999999994e-05,
675
+ "loss": 3.1071,
676
+ "step": 1080
677
+ },
678
+ {
679
+ "epoch": 13.8,
680
+ "learning_rate": 4.0837499999999997e-05,
681
+ "loss": 3.1095,
682
+ "step": 1090
683
+ },
684
+ {
685
+ "epoch": 13.92,
686
+ "learning_rate": 4.12125e-05,
687
+ "loss": 3.0724,
688
+ "step": 1100
689
+ },
690
+ {
691
+ "epoch": 14.05,
692
+ "learning_rate": 4.1587499999999994e-05,
693
+ "loss": 3.1,
694
+ "step": 1110
695
+ },
696
+ {
697
+ "epoch": 14.18,
698
+ "learning_rate": 4.19625e-05,
699
+ "loss": 3.0862,
700
+ "step": 1120
701
+ },
702
+ {
703
+ "epoch": 14.3,
704
+ "learning_rate": 4.23375e-05,
705
+ "loss": 3.1141,
706
+ "step": 1130
707
+ },
708
+ {
709
+ "epoch": 14.43,
710
+ "learning_rate": 4.2712499999999995e-05,
711
+ "loss": 3.0847,
712
+ "step": 1140
713
+ },
714
+ {
715
+ "epoch": 14.56,
716
+ "learning_rate": 4.30875e-05,
717
+ "loss": 3.0845,
718
+ "step": 1150
719
+ },
720
+ {
721
+ "epoch": 14.68,
722
+ "learning_rate": 4.34625e-05,
723
+ "loss": 3.0537,
724
+ "step": 1160
725
+ },
726
+ {
727
+ "epoch": 14.81,
728
+ "learning_rate": 4.3837499999999995e-05,
729
+ "loss": 3.0811,
730
+ "step": 1170
731
+ },
732
+ {
733
+ "epoch": 14.94,
734
+ "learning_rate": 4.42125e-05,
735
+ "loss": 3.031,
736
+ "step": 1180
737
+ },
738
+ {
739
+ "epoch": 15.06,
740
+ "learning_rate": 4.45875e-05,
741
+ "loss": 3.0431,
742
+ "step": 1190
743
+ },
744
+ {
745
+ "epoch": 15.19,
746
+ "learning_rate": 4.4962499999999995e-05,
747
+ "loss": 2.9891,
748
+ "step": 1200
749
+ },
750
+ {
751
+ "epoch": 15.32,
752
+ "learning_rate": 4.53375e-05,
753
+ "loss": 2.9511,
754
+ "step": 1210
755
+ },
756
+ {
757
+ "epoch": 15.44,
758
+ "learning_rate": 4.57125e-05,
759
+ "loss": 2.8874,
760
+ "step": 1220
761
+ },
762
+ {
763
+ "epoch": 15.57,
764
+ "learning_rate": 4.60875e-05,
765
+ "loss": 2.8216,
766
+ "step": 1230
767
+ },
768
+ {
769
+ "epoch": 15.7,
770
+ "learning_rate": 4.64625e-05,
771
+ "loss": 2.7211,
772
+ "step": 1240
773
+ },
774
+ {
775
+ "epoch": 15.82,
776
+ "learning_rate": 4.68375e-05,
777
+ "loss": 2.6755,
778
+ "step": 1250
779
+ },
780
+ {
781
+ "epoch": 15.95,
782
+ "learning_rate": 4.721249999999999e-05,
783
+ "loss": 2.5301,
784
+ "step": 1260
785
+ },
786
+ {
787
+ "epoch": 16.08,
788
+ "learning_rate": 4.758749999999999e-05,
789
+ "loss": 2.4484,
790
+ "step": 1270
791
+ },
792
+ {
793
+ "epoch": 16.2,
794
+ "learning_rate": 4.7962499999999994e-05,
795
+ "loss": 2.2522,
796
+ "step": 1280
797
+ },
798
+ {
799
+ "epoch": 16.33,
800
+ "learning_rate": 4.8337499999999996e-05,
801
+ "loss": 2.1895,
802
+ "step": 1290
803
+ },
804
+ {
805
+ "epoch": 16.46,
806
+ "learning_rate": 4.871249999999999e-05,
807
+ "loss": 2.0274,
808
+ "step": 1300
809
+ },
810
+ {
811
+ "epoch": 16.58,
812
+ "learning_rate": 4.9087499999999994e-05,
813
+ "loss": 1.9528,
814
+ "step": 1310
815
+ },
816
+ {
817
+ "epoch": 16.71,
818
+ "learning_rate": 4.9462499999999996e-05,
819
+ "loss": 1.8849,
820
+ "step": 1320
821
+ },
822
+ {
823
+ "epoch": 16.84,
824
+ "learning_rate": 4.983749999999999e-05,
825
+ "loss": 1.8432,
826
+ "step": 1330
827
+ },
828
+ {
829
+ "epoch": 16.96,
830
+ "learning_rate": 5.0212499999999994e-05,
831
+ "loss": 1.7181,
832
+ "step": 1340
833
+ },
834
+ {
835
+ "epoch": 17.09,
836
+ "learning_rate": 5.0587499999999996e-05,
837
+ "loss": 1.7099,
838
+ "step": 1350
839
+ },
840
+ {
841
+ "epoch": 17.22,
842
+ "learning_rate": 5.096249999999999e-05,
843
+ "loss": 1.6276,
844
+ "step": 1360
845
+ },
846
+ {
847
+ "epoch": 17.34,
848
+ "learning_rate": 5.1337499999999994e-05,
849
+ "loss": 1.6456,
850
+ "step": 1370
851
+ },
852
+ {
853
+ "epoch": 17.47,
854
+ "learning_rate": 5.1712499999999997e-05,
855
+ "loss": 1.5613,
856
+ "step": 1380
857
+ },
858
+ {
859
+ "epoch": 17.59,
860
+ "learning_rate": 5.20875e-05,
861
+ "loss": 1.5553,
862
+ "step": 1390
863
+ },
864
+ {
865
+ "epoch": 17.72,
866
+ "learning_rate": 5.2462499999999994e-05,
867
+ "loss": 1.4874,
868
+ "step": 1400
869
+ },
870
+ {
871
+ "epoch": 17.85,
872
+ "learning_rate": 5.28375e-05,
873
+ "loss": 1.5079,
874
+ "step": 1410
875
+ },
876
+ {
877
+ "epoch": 17.97,
878
+ "learning_rate": 5.32125e-05,
879
+ "loss": 1.4935,
880
+ "step": 1420
881
+ },
882
+ {
883
+ "epoch": 18.1,
884
+ "learning_rate": 5.3587499999999995e-05,
885
+ "loss": 1.4843,
886
+ "step": 1430
887
+ },
888
+ {
889
+ "epoch": 18.23,
890
+ "learning_rate": 5.39625e-05,
891
+ "loss": 1.416,
892
+ "step": 1440
893
+ },
894
+ {
895
+ "epoch": 18.35,
896
+ "learning_rate": 5.43375e-05,
897
+ "loss": 1.3815,
898
+ "step": 1450
899
+ },
900
+ {
901
+ "epoch": 18.48,
902
+ "learning_rate": 5.4712499999999995e-05,
903
+ "loss": 1.4005,
904
+ "step": 1460
905
+ },
906
+ {
907
+ "epoch": 18.61,
908
+ "learning_rate": 5.50875e-05,
909
+ "loss": 1.3824,
910
+ "step": 1470
911
+ },
912
+ {
913
+ "epoch": 18.73,
914
+ "learning_rate": 5.54625e-05,
915
+ "loss": 1.3038,
916
+ "step": 1480
917
+ },
918
+ {
919
+ "epoch": 18.86,
920
+ "learning_rate": 5.58375e-05,
921
+ "loss": 1.4168,
922
+ "step": 1490
923
+ },
924
+ {
925
+ "epoch": 18.99,
926
+ "learning_rate": 5.62125e-05,
927
+ "loss": 1.3506,
928
+ "step": 1500
929
+ },
930
+ {
931
+ "epoch": 18.99,
932
+ "eval_cer": 0.28892684849736266,
933
+ "eval_loss": 1.1056294441223145,
934
+ "eval_runtime": 44.5929,
935
+ "eval_samples_per_second": 10.786,
936
+ "eval_steps_per_second": 1.368,
937
+ "eval_wer": 0.7031036834924966,
938
+ "step": 1500
939
+ },
940
+ {
941
+ "epoch": 19.11,
942
+ "learning_rate": 5.658749999999999e-05,
943
+ "loss": 1.3407,
944
+ "step": 1510
945
+ },
946
+ {
947
+ "epoch": 19.24,
948
+ "learning_rate": 5.696249999999999e-05,
949
+ "loss": 1.2849,
950
+ "step": 1520
951
+ },
952
+ {
953
+ "epoch": 19.37,
954
+ "learning_rate": 5.733749999999999e-05,
955
+ "loss": 1.3141,
956
+ "step": 1530
957
+ },
958
+ {
959
+ "epoch": 19.49,
960
+ "learning_rate": 5.771249999999999e-05,
961
+ "loss": 1.2858,
962
+ "step": 1540
963
+ },
964
+ {
965
+ "epoch": 19.62,
966
+ "learning_rate": 5.8087499999999996e-05,
967
+ "loss": 1.2842,
968
+ "step": 1550
969
+ },
970
+ {
971
+ "epoch": 19.75,
972
+ "learning_rate": 5.846249999999999e-05,
973
+ "loss": 1.2537,
974
+ "step": 1560
975
+ },
976
+ {
977
+ "epoch": 19.87,
978
+ "learning_rate": 5.8837499999999994e-05,
979
+ "loss": 1.304,
980
+ "step": 1570
981
+ },
982
+ {
983
+ "epoch": 20.0,
984
+ "learning_rate": 5.9212499999999996e-05,
985
+ "loss": 1.243,
986
+ "step": 1580
987
+ },
988
+ {
989
+ "epoch": 20.13,
990
+ "learning_rate": 5.958749999999999e-05,
991
+ "loss": 1.2636,
992
+ "step": 1590
993
+ },
994
+ {
995
+ "epoch": 20.25,
996
+ "learning_rate": 5.9962499999999994e-05,
997
+ "loss": 1.2239,
998
+ "step": 1600
999
+ },
1000
+ {
1001
+ "epoch": 20.38,
1002
+ "learning_rate": 6.0337499999999996e-05,
1003
+ "loss": 1.267,
1004
+ "step": 1610
1005
+ },
1006
+ {
1007
+ "epoch": 20.51,
1008
+ "learning_rate": 6.071249999999999e-05,
1009
+ "loss": 1.216,
1010
+ "step": 1620
1011
+ },
1012
+ {
1013
+ "epoch": 20.63,
1014
+ "learning_rate": 6.10875e-05,
1015
+ "loss": 1.253,
1016
+ "step": 1630
1017
+ },
1018
+ {
1019
+ "epoch": 20.76,
1020
+ "learning_rate": 6.14625e-05,
1021
+ "loss": 1.2137,
1022
+ "step": 1640
1023
+ },
1024
+ {
1025
+ "epoch": 20.89,
1026
+ "learning_rate": 6.183749999999999e-05,
1027
+ "loss": 1.1845,
1028
+ "step": 1650
1029
+ },
1030
+ {
1031
+ "epoch": 21.01,
1032
+ "learning_rate": 6.22125e-05,
1033
+ "loss": 1.1501,
1034
+ "step": 1660
1035
+ },
1036
+ {
1037
+ "epoch": 21.14,
1038
+ "learning_rate": 6.25875e-05,
1039
+ "loss": 1.1904,
1040
+ "step": 1670
1041
+ },
1042
+ {
1043
+ "epoch": 21.27,
1044
+ "learning_rate": 6.296249999999999e-05,
1045
+ "loss": 1.1456,
1046
+ "step": 1680
1047
+ },
1048
+ {
1049
+ "epoch": 21.39,
1050
+ "learning_rate": 6.33375e-05,
1051
+ "loss": 1.1798,
1052
+ "step": 1690
1053
+ },
1054
+ {
1055
+ "epoch": 21.52,
1056
+ "learning_rate": 6.37125e-05,
1057
+ "loss": 1.1122,
1058
+ "step": 1700
1059
+ },
1060
+ {
1061
+ "epoch": 21.65,
1062
+ "learning_rate": 6.408749999999999e-05,
1063
+ "loss": 1.1512,
1064
+ "step": 1710
1065
+ },
1066
+ {
1067
+ "epoch": 21.77,
1068
+ "learning_rate": 6.44625e-05,
1069
+ "loss": 1.1413,
1070
+ "step": 1720
1071
+ },
1072
+ {
1073
+ "epoch": 21.9,
1074
+ "learning_rate": 6.48375e-05,
1075
+ "loss": 1.1652,
1076
+ "step": 1730
1077
+ },
1078
+ {
1079
+ "epoch": 22.03,
1080
+ "learning_rate": 6.521249999999999e-05,
1081
+ "loss": 1.177,
1082
+ "step": 1740
1083
+ },
1084
+ {
1085
+ "epoch": 22.15,
1086
+ "learning_rate": 6.55875e-05,
1087
+ "loss": 1.1326,
1088
+ "step": 1750
1089
+ },
1090
+ {
1091
+ "epoch": 22.28,
1092
+ "learning_rate": 6.596249999999998e-05,
1093
+ "loss": 1.1082,
1094
+ "step": 1760
1095
+ },
1096
+ {
1097
+ "epoch": 22.41,
1098
+ "learning_rate": 6.633749999999999e-05,
1099
+ "loss": 1.1321,
1100
+ "step": 1770
1101
+ },
1102
+ {
1103
+ "epoch": 22.53,
1104
+ "learning_rate": 6.671249999999999e-05,
1105
+ "loss": 1.0721,
1106
+ "step": 1780
1107
+ },
1108
+ {
1109
+ "epoch": 22.66,
1110
+ "learning_rate": 6.70875e-05,
1111
+ "loss": 1.1199,
1112
+ "step": 1790
1113
+ },
1114
+ {
1115
+ "epoch": 22.78,
1116
+ "learning_rate": 6.746249999999999e-05,
1117
+ "loss": 1.1436,
1118
+ "step": 1800
1119
+ },
1120
+ {
1121
+ "epoch": 22.91,
1122
+ "learning_rate": 6.783749999999999e-05,
1123
+ "loss": 1.0941,
1124
+ "step": 1810
1125
+ },
1126
+ {
1127
+ "epoch": 23.04,
1128
+ "learning_rate": 6.82125e-05,
1129
+ "loss": 1.0631,
1130
+ "step": 1820
1131
+ },
1132
+ {
1133
+ "epoch": 23.16,
1134
+ "learning_rate": 6.85875e-05,
1135
+ "loss": 1.064,
1136
+ "step": 1830
1137
+ },
1138
+ {
1139
+ "epoch": 23.29,
1140
+ "learning_rate": 6.896249999999999e-05,
1141
+ "loss": 1.0739,
1142
+ "step": 1840
1143
+ },
1144
+ {
1145
+ "epoch": 23.42,
1146
+ "learning_rate": 6.93375e-05,
1147
+ "loss": 1.0354,
1148
+ "step": 1850
1149
+ },
1150
+ {
1151
+ "epoch": 23.54,
1152
+ "learning_rate": 6.97125e-05,
1153
+ "loss": 1.0343,
1154
+ "step": 1860
1155
+ },
1156
+ {
1157
+ "epoch": 23.67,
1158
+ "learning_rate": 7.008749999999999e-05,
1159
+ "loss": 1.0724,
1160
+ "step": 1870
1161
+ },
1162
+ {
1163
+ "epoch": 23.8,
1164
+ "learning_rate": 7.04625e-05,
1165
+ "loss": 1.0982,
1166
+ "step": 1880
1167
+ },
1168
+ {
1169
+ "epoch": 23.92,
1170
+ "learning_rate": 7.08375e-05,
1171
+ "loss": 1.065,
1172
+ "step": 1890
1173
+ },
1174
+ {
1175
+ "epoch": 24.05,
1176
+ "learning_rate": 7.121249999999999e-05,
1177
+ "loss": 1.0754,
1178
+ "step": 1900
1179
+ },
1180
+ {
1181
+ "epoch": 24.18,
1182
+ "learning_rate": 7.15875e-05,
1183
+ "loss": 1.0708,
1184
+ "step": 1910
1185
+ },
1186
+ {
1187
+ "epoch": 24.3,
1188
+ "learning_rate": 7.19625e-05,
1189
+ "loss": 1.0165,
1190
+ "step": 1920
1191
+ },
1192
+ {
1193
+ "epoch": 24.43,
1194
+ "learning_rate": 7.233749999999999e-05,
1195
+ "loss": 1.02,
1196
+ "step": 1930
1197
+ },
1198
+ {
1199
+ "epoch": 24.56,
1200
+ "learning_rate": 7.27125e-05,
1201
+ "loss": 1.0985,
1202
+ "step": 1940
1203
+ },
1204
+ {
1205
+ "epoch": 24.68,
1206
+ "learning_rate": 7.30875e-05,
1207
+ "loss": 0.9746,
1208
+ "step": 1950
1209
+ },
1210
+ {
1211
+ "epoch": 24.81,
1212
+ "learning_rate": 7.346249999999999e-05,
1213
+ "loss": 1.0644,
1214
+ "step": 1960
1215
+ },
1216
+ {
1217
+ "epoch": 24.94,
1218
+ "learning_rate": 7.38375e-05,
1219
+ "loss": 1.0104,
1220
+ "step": 1970
1221
+ },
1222
+ {
1223
+ "epoch": 25.06,
1224
+ "learning_rate": 7.42125e-05,
1225
+ "loss": 1.028,
1226
+ "step": 1980
1227
+ },
1228
+ {
1229
+ "epoch": 25.19,
1230
+ "learning_rate": 7.45875e-05,
1231
+ "loss": 1.0107,
1232
+ "step": 1990
1233
+ },
1234
+ {
1235
+ "epoch": 25.32,
1236
+ "learning_rate": 7.49625e-05,
1237
+ "loss": 0.9997,
1238
+ "step": 2000
1239
+ },
1240
+ {
1241
+ "epoch": 25.32,
1242
+ "eval_cer": 0.2301165693690988,
1243
+ "eval_loss": 0.919084906578064,
1244
+ "eval_runtime": 44.703,
1245
+ "eval_samples_per_second": 10.76,
1246
+ "eval_steps_per_second": 1.365,
1247
+ "eval_wer": 0.5943894952251023,
1248
+ "step": 2000
1249
+ },
1250
+ {
1251
+ "epoch": 25.44,
1252
+ "learning_rate": 7.465384615384615e-05,
1253
+ "loss": 0.9571,
1254
+ "step": 2010
1255
+ },
1256
+ {
1257
+ "epoch": 25.57,
1258
+ "learning_rate": 7.426923076923075e-05,
1259
+ "loss": 0.9801,
1260
+ "step": 2020
1261
+ },
1262
+ {
1263
+ "epoch": 25.7,
1264
+ "learning_rate": 7.388461538461538e-05,
1265
+ "loss": 0.9779,
1266
+ "step": 2030
1267
+ },
1268
+ {
1269
+ "epoch": 25.82,
1270
+ "learning_rate": 7.35e-05,
1271
+ "loss": 1.0168,
1272
+ "step": 2040
1273
+ },
1274
+ {
1275
+ "epoch": 25.95,
1276
+ "learning_rate": 7.31153846153846e-05,
1277
+ "loss": 0.9302,
1278
+ "step": 2050
1279
+ },
1280
+ {
1281
+ "epoch": 26.08,
1282
+ "learning_rate": 7.273076923076923e-05,
1283
+ "loss": 0.989,
1284
+ "step": 2060
1285
+ },
1286
+ {
1287
+ "epoch": 26.2,
1288
+ "learning_rate": 7.234615384615385e-05,
1289
+ "loss": 0.9357,
1290
+ "step": 2070
1291
+ },
1292
+ {
1293
+ "epoch": 26.33,
1294
+ "learning_rate": 7.196153846153846e-05,
1295
+ "loss": 0.9858,
1296
+ "step": 2080
1297
+ },
1298
+ {
1299
+ "epoch": 26.46,
1300
+ "learning_rate": 7.157692307692307e-05,
1301
+ "loss": 0.9813,
1302
+ "step": 2090
1303
+ },
1304
+ {
1305
+ "epoch": 26.58,
1306
+ "learning_rate": 7.119230769230769e-05,
1307
+ "loss": 0.9554,
1308
+ "step": 2100
1309
+ },
1310
+ {
1311
+ "epoch": 26.71,
1312
+ "learning_rate": 7.08076923076923e-05,
1313
+ "loss": 0.8935,
1314
+ "step": 2110
1315
+ },
1316
+ {
1317
+ "epoch": 26.84,
1318
+ "learning_rate": 7.042307692307692e-05,
1319
+ "loss": 0.9955,
1320
+ "step": 2120
1321
+ },
1322
+ {
1323
+ "epoch": 26.96,
1324
+ "learning_rate": 7.003846153846154e-05,
1325
+ "loss": 0.9205,
1326
+ "step": 2130
1327
+ },
1328
+ {
1329
+ "epoch": 27.09,
1330
+ "learning_rate": 6.965384615384615e-05,
1331
+ "loss": 0.9527,
1332
+ "step": 2140
1333
+ },
1334
+ {
1335
+ "epoch": 27.22,
1336
+ "learning_rate": 6.926923076923075e-05,
1337
+ "loss": 0.8899,
1338
+ "step": 2150
1339
+ },
1340
+ {
1341
+ "epoch": 27.34,
1342
+ "learning_rate": 6.888461538461538e-05,
1343
+ "loss": 0.9594,
1344
+ "step": 2160
1345
+ },
1346
+ {
1347
+ "epoch": 27.47,
1348
+ "learning_rate": 6.85e-05,
1349
+ "loss": 0.9061,
1350
+ "step": 2170
1351
+ },
1352
+ {
1353
+ "epoch": 27.59,
1354
+ "learning_rate": 6.81153846153846e-05,
1355
+ "loss": 0.94,
1356
+ "step": 2180
1357
+ },
1358
+ {
1359
+ "epoch": 27.72,
1360
+ "learning_rate": 6.773076923076923e-05,
1361
+ "loss": 0.8611,
1362
+ "step": 2190
1363
+ },
1364
+ {
1365
+ "epoch": 27.85,
1366
+ "learning_rate": 6.734615384615385e-05,
1367
+ "loss": 0.9391,
1368
+ "step": 2200
1369
+ },
1370
+ {
1371
+ "epoch": 27.97,
1372
+ "learning_rate": 6.696153846153846e-05,
1373
+ "loss": 0.8905,
1374
+ "step": 2210
1375
+ },
1376
+ {
1377
+ "epoch": 28.1,
1378
+ "learning_rate": 6.657692307692307e-05,
1379
+ "loss": 0.888,
1380
+ "step": 2220
1381
+ },
1382
+ {
1383
+ "epoch": 28.23,
1384
+ "learning_rate": 6.619230769230769e-05,
1385
+ "loss": 0.8749,
1386
+ "step": 2230
1387
+ },
1388
+ {
1389
+ "epoch": 28.35,
1390
+ "learning_rate": 6.580769230769231e-05,
1391
+ "loss": 0.9331,
1392
+ "step": 2240
1393
+ },
1394
+ {
1395
+ "epoch": 28.48,
1396
+ "learning_rate": 6.542307692307692e-05,
1397
+ "loss": 0.8135,
1398
+ "step": 2250
1399
+ },
1400
+ {
1401
+ "epoch": 28.61,
1402
+ "learning_rate": 6.503846153846154e-05,
1403
+ "loss": 0.9121,
1404
+ "step": 2260
1405
+ },
1406
+ {
1407
+ "epoch": 28.73,
1408
+ "learning_rate": 6.465384615384615e-05,
1409
+ "loss": 0.859,
1410
+ "step": 2270
1411
+ },
1412
+ {
1413
+ "epoch": 28.86,
1414
+ "learning_rate": 6.426923076923076e-05,
1415
+ "loss": 0.8726,
1416
+ "step": 2280
1417
+ },
1418
+ {
1419
+ "epoch": 28.99,
1420
+ "learning_rate": 6.388461538461538e-05,
1421
+ "loss": 0.8497,
1422
+ "step": 2290
1423
+ },
1424
+ {
1425
+ "epoch": 29.11,
1426
+ "learning_rate": 6.35e-05,
1427
+ "loss": 0.8673,
1428
+ "step": 2300
1429
+ },
1430
+ {
1431
+ "epoch": 29.24,
1432
+ "learning_rate": 6.31153846153846e-05,
1433
+ "loss": 0.8349,
1434
+ "step": 2310
1435
+ },
1436
+ {
1437
+ "epoch": 29.37,
1438
+ "learning_rate": 6.273076923076923e-05,
1439
+ "loss": 0.8946,
1440
+ "step": 2320
1441
+ },
1442
+ {
1443
+ "epoch": 29.49,
1444
+ "learning_rate": 6.234615384615384e-05,
1445
+ "loss": 0.8805,
1446
+ "step": 2330
1447
+ },
1448
+ {
1449
+ "epoch": 29.62,
1450
+ "learning_rate": 6.196153846153846e-05,
1451
+ "loss": 0.8752,
1452
+ "step": 2340
1453
+ },
1454
+ {
1455
+ "epoch": 29.75,
1456
+ "learning_rate": 6.157692307692307e-05,
1457
+ "loss": 0.8197,
1458
+ "step": 2350
1459
+ },
1460
+ {
1461
+ "epoch": 29.87,
1462
+ "learning_rate": 6.119230769230769e-05,
1463
+ "loss": 0.8332,
1464
+ "step": 2360
1465
+ },
1466
+ {
1467
+ "epoch": 30.0,
1468
+ "learning_rate": 6.08076923076923e-05,
1469
+ "loss": 0.7933,
1470
+ "step": 2370
1471
+ },
1472
+ {
1473
+ "epoch": 30.13,
1474
+ "learning_rate": 6.0423076923076924e-05,
1475
+ "loss": 0.8712,
1476
+ "step": 2380
1477
+ },
1478
+ {
1479
+ "epoch": 30.25,
1480
+ "learning_rate": 6.003846153846153e-05,
1481
+ "loss": 0.824,
1482
+ "step": 2390
1483
+ },
1484
+ {
1485
+ "epoch": 30.38,
1486
+ "learning_rate": 5.965384615384615e-05,
1487
+ "loss": 0.8158,
1488
+ "step": 2400
1489
+ },
1490
+ {
1491
+ "epoch": 30.51,
1492
+ "learning_rate": 5.926923076923076e-05,
1493
+ "loss": 0.8218,
1494
+ "step": 2410
1495
+ },
1496
+ {
1497
+ "epoch": 30.63,
1498
+ "learning_rate": 5.888461538461538e-05,
1499
+ "loss": 0.8403,
1500
+ "step": 2420
1501
+ },
1502
+ {
1503
+ "epoch": 30.76,
1504
+ "learning_rate": 5.85e-05,
1505
+ "loss": 0.7986,
1506
+ "step": 2430
1507
+ },
1508
+ {
1509
+ "epoch": 30.89,
1510
+ "learning_rate": 5.8115384615384614e-05,
1511
+ "loss": 0.8391,
1512
+ "step": 2440
1513
+ },
1514
+ {
1515
+ "epoch": 31.01,
1516
+ "learning_rate": 5.773076923076922e-05,
1517
+ "loss": 0.7736,
1518
+ "step": 2450
1519
+ },
1520
+ {
1521
+ "epoch": 31.14,
1522
+ "learning_rate": 5.734615384615384e-05,
1523
+ "loss": 0.8478,
1524
+ "step": 2460
1525
+ },
1526
+ {
1527
+ "epoch": 31.27,
1528
+ "learning_rate": 5.696153846153846e-05,
1529
+ "loss": 0.7728,
1530
+ "step": 2470
1531
+ },
1532
+ {
1533
+ "epoch": 31.39,
1534
+ "learning_rate": 5.6576923076923073e-05,
1535
+ "loss": 0.8231,
1536
+ "step": 2480
1537
+ },
1538
+ {
1539
+ "epoch": 31.52,
1540
+ "learning_rate": 5.619230769230769e-05,
1541
+ "loss": 0.7602,
1542
+ "step": 2490
1543
+ },
1544
+ {
1545
+ "epoch": 31.65,
1546
+ "learning_rate": 5.58076923076923e-05,
1547
+ "loss": 0.7838,
1548
+ "step": 2500
1549
+ },
1550
+ {
1551
+ "epoch": 31.65,
1552
+ "eval_cer": 0.2152122088112177,
1553
+ "eval_loss": 0.8952043056488037,
1554
+ "eval_runtime": 45.1763,
1555
+ "eval_samples_per_second": 10.647,
1556
+ "eval_steps_per_second": 1.35,
1557
+ "eval_wer": 0.555593451568895,
1558
+ "step": 2500
1559
+ },
1560
+ {
1561
+ "epoch": 31.77,
1562
+ "learning_rate": 5.542307692307691e-05,
1563
+ "loss": 0.8065,
1564
+ "step": 2510
1565
+ },
1566
+ {
1567
+ "epoch": 31.9,
1568
+ "learning_rate": 5.503846153846153e-05,
1569
+ "loss": 0.773,
1570
+ "step": 2520
1571
+ },
1572
+ {
1573
+ "epoch": 32.03,
1574
+ "learning_rate": 5.465384615384615e-05,
1575
+ "loss": 0.7854,
1576
+ "step": 2530
1577
+ },
1578
+ {
1579
+ "epoch": 32.15,
1580
+ "learning_rate": 5.426923076923076e-05,
1581
+ "loss": 0.7724,
1582
+ "step": 2540
1583
+ },
1584
+ {
1585
+ "epoch": 32.28,
1586
+ "learning_rate": 5.3884615384615384e-05,
1587
+ "loss": 0.7639,
1588
+ "step": 2550
1589
+ },
1590
+ {
1591
+ "epoch": 32.41,
1592
+ "learning_rate": 5.35e-05,
1593
+ "loss": 0.7993,
1594
+ "step": 2560
1595
+ },
1596
+ {
1597
+ "epoch": 32.53,
1598
+ "learning_rate": 5.311538461538461e-05,
1599
+ "loss": 0.7957,
1600
+ "step": 2570
1601
+ },
1602
+ {
1603
+ "epoch": 32.66,
1604
+ "learning_rate": 5.273076923076922e-05,
1605
+ "loss": 0.7686,
1606
+ "step": 2580
1607
+ },
1608
+ {
1609
+ "epoch": 32.78,
1610
+ "learning_rate": 5.234615384615384e-05,
1611
+ "loss": 0.8096,
1612
+ "step": 2590
1613
+ },
1614
+ {
1615
+ "epoch": 32.91,
1616
+ "learning_rate": 5.196153846153846e-05,
1617
+ "loss": 0.7357,
1618
+ "step": 2600
1619
+ },
1620
+ {
1621
+ "epoch": 33.04,
1622
+ "learning_rate": 5.1576923076923074e-05,
1623
+ "loss": 0.7674,
1624
+ "step": 2610
1625
+ },
1626
+ {
1627
+ "epoch": 33.16,
1628
+ "learning_rate": 5.119230769230769e-05,
1629
+ "loss": 0.7989,
1630
+ "step": 2620
1631
+ },
1632
+ {
1633
+ "epoch": 33.29,
1634
+ "learning_rate": 5.08076923076923e-05,
1635
+ "loss": 0.7474,
1636
+ "step": 2630
1637
+ },
1638
+ {
1639
+ "epoch": 33.42,
1640
+ "learning_rate": 5.042307692307692e-05,
1641
+ "loss": 0.7153,
1642
+ "step": 2640
1643
+ },
1644
+ {
1645
+ "epoch": 33.54,
1646
+ "learning_rate": 5.0038461538461533e-05,
1647
+ "loss": 0.7109,
1648
+ "step": 2650
1649
+ },
1650
+ {
1651
+ "epoch": 33.67,
1652
+ "learning_rate": 4.965384615384615e-05,
1653
+ "loss": 0.7841,
1654
+ "step": 2660
1655
+ },
1656
+ {
1657
+ "epoch": 33.8,
1658
+ "learning_rate": 4.926923076923076e-05,
1659
+ "loss": 0.762,
1660
+ "step": 2670
1661
+ },
1662
+ {
1663
+ "epoch": 33.92,
1664
+ "learning_rate": 4.8884615384615385e-05,
1665
+ "loss": 0.7414,
1666
+ "step": 2680
1667
+ },
1668
+ {
1669
+ "epoch": 34.05,
1670
+ "learning_rate": 4.849999999999999e-05,
1671
+ "loss": 0.7544,
1672
+ "step": 2690
1673
+ },
1674
+ {
1675
+ "epoch": 34.18,
1676
+ "learning_rate": 4.811538461538461e-05,
1677
+ "loss": 0.7338,
1678
+ "step": 2700
1679
+ },
1680
+ {
1681
+ "epoch": 34.3,
1682
+ "learning_rate": 4.773076923076922e-05,
1683
+ "loss": 0.7266,
1684
+ "step": 2710
1685
+ },
1686
+ {
1687
+ "epoch": 34.43,
1688
+ "learning_rate": 4.7346153846153845e-05,
1689
+ "loss": 0.7131,
1690
+ "step": 2720
1691
+ },
1692
+ {
1693
+ "epoch": 34.56,
1694
+ "learning_rate": 4.696153846153846e-05,
1695
+ "loss": 0.7291,
1696
+ "step": 2730
1697
+ },
1698
+ {
1699
+ "epoch": 34.68,
1700
+ "learning_rate": 4.6576923076923074e-05,
1701
+ "loss": 0.7051,
1702
+ "step": 2740
1703
+ },
1704
+ {
1705
+ "epoch": 34.81,
1706
+ "learning_rate": 4.619230769230769e-05,
1707
+ "loss": 0.7643,
1708
+ "step": 2750
1709
+ },
1710
+ {
1711
+ "epoch": 34.94,
1712
+ "learning_rate": 4.5807692307692304e-05,
1713
+ "loss": 0.727,
1714
+ "step": 2760
1715
+ },
1716
+ {
1717
+ "epoch": 35.06,
1718
+ "learning_rate": 4.542307692307692e-05,
1719
+ "loss": 0.7142,
1720
+ "step": 2770
1721
+ },
1722
+ {
1723
+ "epoch": 35.19,
1724
+ "learning_rate": 4.5038461538461534e-05,
1725
+ "loss": 0.7055,
1726
+ "step": 2780
1727
+ },
1728
+ {
1729
+ "epoch": 35.32,
1730
+ "learning_rate": 4.465384615384615e-05,
1731
+ "loss": 0.7339,
1732
+ "step": 2790
1733
+ },
1734
+ {
1735
+ "epoch": 35.44,
1736
+ "learning_rate": 4.426923076923077e-05,
1737
+ "loss": 0.6956,
1738
+ "step": 2800
1739
+ },
1740
+ {
1741
+ "epoch": 35.57,
1742
+ "learning_rate": 4.3884615384615385e-05,
1743
+ "loss": 0.7508,
1744
+ "step": 2810
1745
+ },
1746
+ {
1747
+ "epoch": 35.7,
1748
+ "learning_rate": 4.3499999999999993e-05,
1749
+ "loss": 0.7072,
1750
+ "step": 2820
1751
+ },
1752
+ {
1753
+ "epoch": 35.82,
1754
+ "learning_rate": 4.311538461538461e-05,
1755
+ "loss": 0.7103,
1756
+ "step": 2830
1757
+ },
1758
+ {
1759
+ "epoch": 35.95,
1760
+ "learning_rate": 4.273076923076923e-05,
1761
+ "loss": 0.6783,
1762
+ "step": 2840
1763
+ },
1764
+ {
1765
+ "epoch": 36.08,
1766
+ "learning_rate": 4.2346153846153845e-05,
1767
+ "loss": 0.7419,
1768
+ "step": 2850
1769
+ },
1770
+ {
1771
+ "epoch": 36.2,
1772
+ "learning_rate": 4.196153846153846e-05,
1773
+ "loss": 0.7091,
1774
+ "step": 2860
1775
+ },
1776
+ {
1777
+ "epoch": 36.33,
1778
+ "learning_rate": 4.1576923076923075e-05,
1779
+ "loss": 0.7073,
1780
+ "step": 2870
1781
+ },
1782
+ {
1783
+ "epoch": 36.46,
1784
+ "learning_rate": 4.119230769230768e-05,
1785
+ "loss": 0.6937,
1786
+ "step": 2880
1787
+ },
1788
+ {
1789
+ "epoch": 36.58,
1790
+ "learning_rate": 4.0807692307692305e-05,
1791
+ "loss": 0.756,
1792
+ "step": 2890
1793
+ },
1794
+ {
1795
+ "epoch": 36.71,
1796
+ "learning_rate": 4.042307692307692e-05,
1797
+ "loss": 0.6744,
1798
+ "step": 2900
1799
+ },
1800
+ {
1801
+ "epoch": 36.84,
1802
+ "learning_rate": 4.0038461538461534e-05,
1803
+ "loss": 0.7165,
1804
+ "step": 2910
1805
+ },
1806
+ {
1807
+ "epoch": 36.96,
1808
+ "learning_rate": 3.9653846153846156e-05,
1809
+ "loss": 0.6831,
1810
+ "step": 2920
1811
+ },
1812
+ {
1813
+ "epoch": 37.09,
1814
+ "learning_rate": 3.926923076923077e-05,
1815
+ "loss": 0.6894,
1816
+ "step": 2930
1817
+ },
1818
+ {
1819
+ "epoch": 37.22,
1820
+ "learning_rate": 3.888461538461538e-05,
1821
+ "loss": 0.6419,
1822
+ "step": 2940
1823
+ },
1824
+ {
1825
+ "epoch": 37.34,
1826
+ "learning_rate": 3.8499999999999994e-05,
1827
+ "loss": 0.7187,
1828
+ "step": 2950
1829
+ },
1830
+ {
1831
+ "epoch": 37.47,
1832
+ "learning_rate": 3.811538461538461e-05,
1833
+ "loss": 0.677,
1834
+ "step": 2960
1835
+ },
1836
+ {
1837
+ "epoch": 37.59,
1838
+ "learning_rate": 3.773076923076923e-05,
1839
+ "loss": 0.7263,
1840
+ "step": 2970
1841
+ },
1842
+ {
1843
+ "epoch": 37.72,
1844
+ "learning_rate": 3.734615384615384e-05,
1845
+ "loss": 0.6257,
1846
+ "step": 2980
1847
+ },
1848
+ {
1849
+ "epoch": 37.85,
1850
+ "learning_rate": 3.696153846153846e-05,
1851
+ "loss": 0.7051,
1852
+ "step": 2990
1853
+ },
1854
+ {
1855
+ "epoch": 37.97,
1856
+ "learning_rate": 3.6576923076923075e-05,
1857
+ "loss": 0.6665,
1858
+ "step": 3000
1859
+ },
1860
+ {
1861
+ "epoch": 37.97,
1862
+ "eval_cer": 0.2016781484053836,
1863
+ "eval_loss": 0.8907838463783264,
1864
+ "eval_runtime": 45.6242,
1865
+ "eval_samples_per_second": 10.543,
1866
+ "eval_steps_per_second": 1.337,
1867
+ "eval_wer": 0.5251534788540245,
1868
+ "step": 3000
1869
+ },
1870
+ {
1871
+ "epoch": 38.1,
1872
+ "learning_rate": 3.619230769230769e-05,
1873
+ "loss": 0.7016,
1874
+ "step": 3010
1875
+ },
1876
+ {
1877
+ "epoch": 38.23,
1878
+ "learning_rate": 3.5807692307692305e-05,
1879
+ "loss": 0.6585,
1880
+ "step": 3020
1881
+ },
1882
+ {
1883
+ "epoch": 38.35,
1884
+ "learning_rate": 3.542307692307692e-05,
1885
+ "loss": 0.6673,
1886
+ "step": 3030
1887
+ },
1888
+ {
1889
+ "epoch": 38.48,
1890
+ "learning_rate": 3.5038461538461535e-05,
1891
+ "loss": 0.6411,
1892
+ "step": 3040
1893
+ },
1894
+ {
1895
+ "epoch": 38.61,
1896
+ "learning_rate": 3.465384615384615e-05,
1897
+ "loss": 0.7038,
1898
+ "step": 3050
1899
+ },
1900
+ {
1901
+ "epoch": 38.73,
1902
+ "learning_rate": 3.4269230769230765e-05,
1903
+ "loss": 0.6458,
1904
+ "step": 3060
1905
+ },
1906
+ {
1907
+ "epoch": 38.86,
1908
+ "learning_rate": 3.3884615384615386e-05,
1909
+ "loss": 0.7231,
1910
+ "step": 3070
1911
+ },
1912
+ {
1913
+ "epoch": 38.99,
1914
+ "learning_rate": 3.3499999999999994e-05,
1915
+ "loss": 0.6495,
1916
+ "step": 3080
1917
+ },
1918
+ {
1919
+ "epoch": 39.11,
1920
+ "learning_rate": 3.3115384615384616e-05,
1921
+ "loss": 0.6788,
1922
+ "step": 3090
1923
+ },
1924
+ {
1925
+ "epoch": 39.24,
1926
+ "learning_rate": 3.273076923076923e-05,
1927
+ "loss": 0.6452,
1928
+ "step": 3100
1929
+ },
1930
+ {
1931
+ "epoch": 39.37,
1932
+ "learning_rate": 3.2346153846153846e-05,
1933
+ "loss": 0.7015,
1934
+ "step": 3110
1935
+ },
1936
+ {
1937
+ "epoch": 39.49,
1938
+ "learning_rate": 3.196153846153846e-05,
1939
+ "loss": 0.6518,
1940
+ "step": 3120
1941
+ },
1942
+ {
1943
+ "epoch": 39.62,
1944
+ "learning_rate": 3.1576923076923076e-05,
1945
+ "loss": 0.6757,
1946
+ "step": 3130
1947
+ },
1948
+ {
1949
+ "epoch": 39.75,
1950
+ "learning_rate": 3.119230769230769e-05,
1951
+ "loss": 0.6495,
1952
+ "step": 3140
1953
+ },
1954
+ {
1955
+ "epoch": 39.87,
1956
+ "learning_rate": 3.0807692307692305e-05,
1957
+ "loss": 0.6434,
1958
+ "step": 3150
1959
+ },
1960
+ {
1961
+ "epoch": 40.0,
1962
+ "learning_rate": 3.0423076923076924e-05,
1963
+ "loss": 0.6132,
1964
+ "step": 3160
1965
+ },
1966
+ {
1967
+ "epoch": 40.13,
1968
+ "learning_rate": 3.0038461538461535e-05,
1969
+ "loss": 0.6959,
1970
+ "step": 3170
1971
+ },
1972
+ {
1973
+ "epoch": 40.25,
1974
+ "learning_rate": 2.965384615384615e-05,
1975
+ "loss": 0.6468,
1976
+ "step": 3180
1977
+ },
1978
+ {
1979
+ "epoch": 40.38,
1980
+ "learning_rate": 2.926923076923077e-05,
1981
+ "loss": 0.6681,
1982
+ "step": 3190
1983
+ },
1984
+ {
1985
+ "epoch": 40.51,
1986
+ "learning_rate": 2.888461538461538e-05,
1987
+ "loss": 0.6446,
1988
+ "step": 3200
1989
+ },
1990
+ {
1991
+ "epoch": 40.63,
1992
+ "learning_rate": 2.8499999999999998e-05,
1993
+ "loss": 0.6554,
1994
+ "step": 3210
1995
+ },
1996
+ {
1997
+ "epoch": 40.76,
1998
+ "learning_rate": 2.8115384615384613e-05,
1999
+ "loss": 0.6204,
2000
+ "step": 3220
2001
+ },
2002
+ {
2003
+ "epoch": 40.89,
2004
+ "learning_rate": 2.7730769230769228e-05,
2005
+ "loss": 0.677,
2006
+ "step": 3230
2007
+ },
2008
+ {
2009
+ "epoch": 41.01,
2010
+ "learning_rate": 2.7346153846153843e-05,
2011
+ "loss": 0.5961,
2012
+ "step": 3240
2013
+ },
2014
+ {
2015
+ "epoch": 41.14,
2016
+ "learning_rate": 2.696153846153846e-05,
2017
+ "loss": 0.665,
2018
+ "step": 3250
2019
+ },
2020
+ {
2021
+ "epoch": 41.27,
2022
+ "learning_rate": 2.6576923076923073e-05,
2023
+ "loss": 0.6753,
2024
+ "step": 3260
2025
+ },
2026
+ {
2027
+ "epoch": 41.39,
2028
+ "learning_rate": 2.619230769230769e-05,
2029
+ "loss": 0.6387,
2030
+ "step": 3270
2031
+ },
2032
+ {
2033
+ "epoch": 41.52,
2034
+ "learning_rate": 2.5807692307692306e-05,
2035
+ "loss": 0.6281,
2036
+ "step": 3280
2037
+ },
2038
+ {
2039
+ "epoch": 41.65,
2040
+ "learning_rate": 2.542307692307692e-05,
2041
+ "loss": 0.6287,
2042
+ "step": 3290
2043
+ },
2044
+ {
2045
+ "epoch": 41.77,
2046
+ "learning_rate": 2.5038461538461536e-05,
2047
+ "loss": 0.6413,
2048
+ "step": 3300
2049
+ },
2050
+ {
2051
+ "epoch": 41.9,
2052
+ "learning_rate": 2.4653846153846154e-05,
2053
+ "loss": 0.6061,
2054
+ "step": 3310
2055
+ },
2056
+ {
2057
+ "epoch": 42.03,
2058
+ "learning_rate": 2.4269230769230765e-05,
2059
+ "loss": 0.648,
2060
+ "step": 3320
2061
+ },
2062
+ {
2063
+ "epoch": 42.15,
2064
+ "learning_rate": 2.3884615384615384e-05,
2065
+ "loss": 0.5926,
2066
+ "step": 3330
2067
+ },
2068
+ {
2069
+ "epoch": 42.28,
2070
+ "learning_rate": 2.35e-05,
2071
+ "loss": 0.6366,
2072
+ "step": 3340
2073
+ },
2074
+ {
2075
+ "epoch": 42.41,
2076
+ "learning_rate": 2.3115384615384614e-05,
2077
+ "loss": 0.6625,
2078
+ "step": 3350
2079
+ },
2080
+ {
2081
+ "epoch": 42.53,
2082
+ "learning_rate": 2.273076923076923e-05,
2083
+ "loss": 0.634,
2084
+ "step": 3360
2085
+ },
2086
+ {
2087
+ "epoch": 42.66,
2088
+ "learning_rate": 2.2346153846153847e-05,
2089
+ "loss": 0.618,
2090
+ "step": 3370
2091
+ },
2092
+ {
2093
+ "epoch": 42.78,
2094
+ "learning_rate": 2.1961538461538458e-05,
2095
+ "loss": 0.5911,
2096
+ "step": 3380
2097
+ },
2098
+ {
2099
+ "epoch": 42.91,
2100
+ "learning_rate": 2.1576923076923076e-05,
2101
+ "loss": 0.5936,
2102
+ "step": 3390
2103
+ },
2104
+ {
2105
+ "epoch": 43.04,
2106
+ "learning_rate": 2.119230769230769e-05,
2107
+ "loss": 0.6267,
2108
+ "step": 3400
2109
+ },
2110
+ {
2111
+ "epoch": 43.16,
2112
+ "learning_rate": 2.0807692307692303e-05,
2113
+ "loss": 0.6123,
2114
+ "step": 3410
2115
+ },
2116
+ {
2117
+ "epoch": 43.29,
2118
+ "learning_rate": 2.042307692307692e-05,
2119
+ "loss": 0.6398,
2120
+ "step": 3420
2121
+ },
2122
+ {
2123
+ "epoch": 43.42,
2124
+ "learning_rate": 2.003846153846154e-05,
2125
+ "loss": 0.606,
2126
+ "step": 3430
2127
+ },
2128
+ {
2129
+ "epoch": 43.54,
2130
+ "learning_rate": 1.965384615384615e-05,
2131
+ "loss": 0.6253,
2132
+ "step": 3440
2133
+ },
2134
+ {
2135
+ "epoch": 43.67,
2136
+ "learning_rate": 1.9269230769230766e-05,
2137
+ "loss": 0.5847,
2138
+ "step": 3450
2139
+ },
2140
+ {
2141
+ "epoch": 43.8,
2142
+ "learning_rate": 1.8884615384615384e-05,
2143
+ "loss": 0.6248,
2144
+ "step": 3460
2145
+ },
2146
+ {
2147
+ "epoch": 43.92,
2148
+ "learning_rate": 1.85e-05,
2149
+ "loss": 0.5884,
2150
+ "step": 3470
2151
+ },
2152
+ {
2153
+ "epoch": 44.05,
2154
+ "learning_rate": 1.8115384615384614e-05,
2155
+ "loss": 0.6038,
2156
+ "step": 3480
2157
+ },
2158
+ {
2159
+ "epoch": 44.18,
2160
+ "learning_rate": 1.773076923076923e-05,
2161
+ "loss": 0.5888,
2162
+ "step": 3490
2163
+ },
2164
+ {
2165
+ "epoch": 44.3,
2166
+ "learning_rate": 1.7346153846153844e-05,
2167
+ "loss": 0.6265,
2168
+ "step": 3500
2169
+ },
2170
+ {
2171
+ "epoch": 44.3,
2172
+ "eval_cer": 0.19540855592889456,
2173
+ "eval_loss": 0.9062958359718323,
2174
+ "eval_runtime": 44.6904,
2175
+ "eval_samples_per_second": 10.763,
2176
+ "eval_steps_per_second": 1.365,
2177
+ "eval_wer": 0.5133015006821282,
2178
+ "step": 3500
2179
+ },
2180
+ {
2181
+ "epoch": 44.43,
2182
+ "learning_rate": 1.8412499999999997e-05,
2183
+ "loss": 0.6002,
2184
+ "step": 3510
2185
+ },
2186
+ {
2187
+ "epoch": 44.56,
2188
+ "learning_rate": 1.8037499999999998e-05,
2189
+ "loss": 0.6191,
2190
+ "step": 3520
2191
+ },
2192
+ {
2193
+ "epoch": 44.68,
2194
+ "learning_rate": 1.76625e-05,
2195
+ "loss": 0.5811,
2196
+ "step": 3530
2197
+ },
2198
+ {
2199
+ "epoch": 44.81,
2200
+ "learning_rate": 1.72875e-05,
2201
+ "loss": 0.6299,
2202
+ "step": 3540
2203
+ },
2204
+ {
2205
+ "epoch": 44.94,
2206
+ "learning_rate": 1.6912499999999998e-05,
2207
+ "loss": 0.5605,
2208
+ "step": 3550
2209
+ },
2210
+ {
2211
+ "epoch": 45.06,
2212
+ "learning_rate": 1.65375e-05,
2213
+ "loss": 0.6183,
2214
+ "step": 3560
2215
+ },
2216
+ {
2217
+ "epoch": 45.19,
2218
+ "learning_rate": 1.61625e-05,
2219
+ "loss": 0.5852,
2220
+ "step": 3570
2221
+ },
2222
+ {
2223
+ "epoch": 45.32,
2224
+ "learning_rate": 1.5787499999999997e-05,
2225
+ "loss": 0.594,
2226
+ "step": 3580
2227
+ },
2228
+ {
2229
+ "epoch": 45.44,
2230
+ "learning_rate": 1.54125e-05,
2231
+ "loss": 0.5965,
2232
+ "step": 3590
2233
+ },
2234
+ {
2235
+ "epoch": 45.57,
2236
+ "learning_rate": 1.50375e-05,
2237
+ "loss": 0.6005,
2238
+ "step": 3600
2239
+ },
2240
+ {
2241
+ "epoch": 45.7,
2242
+ "learning_rate": 1.4662499999999999e-05,
2243
+ "loss": 0.5884,
2244
+ "step": 3610
2245
+ },
2246
+ {
2247
+ "epoch": 45.82,
2248
+ "learning_rate": 1.4287499999999998e-05,
2249
+ "loss": 0.5884,
2250
+ "step": 3620
2251
+ },
2252
+ {
2253
+ "epoch": 45.95,
2254
+ "learning_rate": 1.39125e-05,
2255
+ "loss": 0.5628,
2256
+ "step": 3630
2257
+ },
2258
+ {
2259
+ "epoch": 46.08,
2260
+ "learning_rate": 1.3537499999999999e-05,
2261
+ "loss": 0.6339,
2262
+ "step": 3640
2263
+ },
2264
+ {
2265
+ "epoch": 46.2,
2266
+ "learning_rate": 1.3162499999999998e-05,
2267
+ "loss": 0.5578,
2268
+ "step": 3650
2269
+ },
2270
+ {
2271
+ "epoch": 46.33,
2272
+ "learning_rate": 1.2787499999999999e-05,
2273
+ "loss": 0.6239,
2274
+ "step": 3660
2275
+ },
2276
+ {
2277
+ "epoch": 46.46,
2278
+ "learning_rate": 1.24125e-05,
2279
+ "loss": 0.5872,
2280
+ "step": 3670
2281
+ },
2282
+ {
2283
+ "epoch": 46.58,
2284
+ "learning_rate": 1.20375e-05,
2285
+ "loss": 0.5697,
2286
+ "step": 3680
2287
+ },
2288
+ {
2289
+ "epoch": 46.71,
2290
+ "learning_rate": 1.1662499999999999e-05,
2291
+ "loss": 0.5475,
2292
+ "step": 3690
2293
+ },
2294
+ {
2295
+ "epoch": 46.84,
2296
+ "learning_rate": 1.1287499999999998e-05,
2297
+ "loss": 0.5979,
2298
+ "step": 3700
2299
+ },
2300
+ {
2301
+ "epoch": 46.96,
2302
+ "learning_rate": 1.0912499999999998e-05,
2303
+ "loss": 0.5742,
2304
+ "step": 3710
2305
+ },
2306
+ {
2307
+ "epoch": 47.09,
2308
+ "learning_rate": 1.05375e-05,
2309
+ "loss": 0.6054,
2310
+ "step": 3720
2311
+ },
2312
+ {
2313
+ "epoch": 47.22,
2314
+ "learning_rate": 1.01625e-05,
2315
+ "loss": 0.5777,
2316
+ "step": 3730
2317
+ },
2318
+ {
2319
+ "epoch": 47.34,
2320
+ "learning_rate": 9.787499999999999e-06,
2321
+ "loss": 0.5734,
2322
+ "step": 3740
2323
+ },
2324
+ {
2325
+ "epoch": 47.47,
2326
+ "learning_rate": 9.412499999999998e-06,
2327
+ "loss": 0.5322,
2328
+ "step": 3750
2329
+ },
2330
+ {
2331
+ "epoch": 47.59,
2332
+ "learning_rate": 9.0375e-06,
2333
+ "loss": 0.6287,
2334
+ "step": 3760
2335
+ },
2336
+ {
2337
+ "epoch": 47.72,
2338
+ "learning_rate": 8.6625e-06,
2339
+ "loss": 0.547,
2340
+ "step": 3770
2341
+ },
2342
+ {
2343
+ "epoch": 47.85,
2344
+ "learning_rate": 8.2875e-06,
2345
+ "loss": 0.6414,
2346
+ "step": 3780
2347
+ },
2348
+ {
2349
+ "epoch": 47.97,
2350
+ "learning_rate": 7.9125e-06,
2351
+ "loss": 0.5661,
2352
+ "step": 3790
2353
+ },
2354
+ {
2355
+ "epoch": 48.1,
2356
+ "learning_rate": 7.537499999999999e-06,
2357
+ "loss": 0.5893,
2358
+ "step": 3800
2359
+ },
2360
+ {
2361
+ "epoch": 48.23,
2362
+ "learning_rate": 7.1625e-06,
2363
+ "loss": 0.556,
2364
+ "step": 3810
2365
+ },
2366
+ {
2367
+ "epoch": 48.35,
2368
+ "learning_rate": 6.787499999999999e-06,
2369
+ "loss": 0.6265,
2370
+ "step": 3820
2371
+ },
2372
+ {
2373
+ "epoch": 48.48,
2374
+ "learning_rate": 6.4125e-06,
2375
+ "loss": 0.5644,
2376
+ "step": 3830
2377
+ },
2378
+ {
2379
+ "epoch": 48.61,
2380
+ "learning_rate": 6.037499999999999e-06,
2381
+ "loss": 0.6202,
2382
+ "step": 3840
2383
+ },
2384
+ {
2385
+ "epoch": 48.73,
2386
+ "learning_rate": 5.6624999999999996e-06,
2387
+ "loss": 0.5581,
2388
+ "step": 3850
2389
+ },
2390
+ {
2391
+ "epoch": 48.86,
2392
+ "learning_rate": 5.287499999999999e-06,
2393
+ "loss": 0.572,
2394
+ "step": 3860
2395
+ },
2396
+ {
2397
+ "epoch": 48.99,
2398
+ "learning_rate": 4.9125e-06,
2399
+ "loss": 0.5559,
2400
+ "step": 3870
2401
+ },
2402
+ {
2403
+ "epoch": 49.11,
2404
+ "learning_rate": 4.537499999999999e-06,
2405
+ "loss": 0.6013,
2406
+ "step": 3880
2407
+ },
2408
+ {
2409
+ "epoch": 49.24,
2410
+ "learning_rate": 4.1624999999999995e-06,
2411
+ "loss": 0.5498,
2412
+ "step": 3890
2413
+ },
2414
+ {
2415
+ "epoch": 49.37,
2416
+ "learning_rate": 3.7874999999999997e-06,
2417
+ "loss": 0.5883,
2418
+ "step": 3900
2419
+ },
2420
+ {
2421
+ "epoch": 49.49,
2422
+ "learning_rate": 3.4124999999999995e-06,
2423
+ "loss": 0.5777,
2424
+ "step": 3910
2425
+ },
2426
+ {
2427
+ "epoch": 49.62,
2428
+ "learning_rate": 3.0374999999999997e-06,
2429
+ "loss": 0.5768,
2430
+ "step": 3920
2431
+ },
2432
+ {
2433
+ "epoch": 49.75,
2434
+ "learning_rate": 2.6624999999999995e-06,
2435
+ "loss": 0.5603,
2436
+ "step": 3930
2437
+ },
2438
+ {
2439
+ "epoch": 49.87,
2440
+ "learning_rate": 2.2874999999999997e-06,
2441
+ "loss": 0.5814,
2442
+ "step": 3940
2443
+ },
2444
+ {
2445
+ "epoch": 50.0,
2446
+ "learning_rate": 1.9125e-06,
2447
+ "loss": 0.5562,
2448
+ "step": 3950
2449
+ },
2450
+ {
2451
+ "epoch": 50.13,
2452
+ "learning_rate": 1.5374999999999999e-06,
2453
+ "loss": 0.5858,
2454
+ "step": 3960
2455
+ },
2456
+ {
2457
+ "epoch": 50.25,
2458
+ "learning_rate": 1.1624999999999999e-06,
2459
+ "loss": 0.5279,
2460
+ "step": 3970
2461
+ },
2462
+ {
2463
+ "epoch": 50.38,
2464
+ "learning_rate": 7.875e-07,
2465
+ "loss": 0.5734,
2466
+ "step": 3980
2467
+ },
2468
+ {
2469
+ "epoch": 50.51,
2470
+ "learning_rate": 4.124999999999999e-07,
2471
+ "loss": 0.5895,
2472
+ "step": 3990
2473
+ },
2474
+ {
2475
+ "epoch": 50.63,
2476
+ "learning_rate": 3.75e-08,
2477
+ "loss": 0.5935,
2478
+ "step": 4000
2479
+ },
2480
+ {
2481
+ "epoch": 50.63,
2482
+ "eval_cer": 0.1969102547256584,
2483
+ "eval_loss": 0.9162458181381226,
2484
+ "eval_runtime": 44.8405,
2485
+ "eval_samples_per_second": 10.727,
2486
+ "eval_steps_per_second": 1.36,
2487
+ "eval_wer": 0.5156036834924966,
2488
+ "step": 4000
2489
+ },
2490
+ {
2491
+ "epoch": 50.76,
2492
+ "learning_rate": 1.4729999999999998e-05,
2493
+ "loss": 0.543,
2494
+ "step": 4010
2495
+ },
2496
+ {
2497
+ "epoch": 50.89,
2498
+ "learning_rate": 1.4429999999999997e-05,
2499
+ "loss": 0.6044,
2500
+ "step": 4020
2501
+ },
2502
+ {
2503
+ "epoch": 51.01,
2504
+ "learning_rate": 1.413e-05,
2505
+ "loss": 0.5749,
2506
+ "step": 4030
2507
+ },
2508
+ {
2509
+ "epoch": 51.14,
2510
+ "learning_rate": 1.383e-05,
2511
+ "loss": 0.6171,
2512
+ "step": 4040
2513
+ },
2514
+ {
2515
+ "epoch": 51.27,
2516
+ "learning_rate": 1.353e-05,
2517
+ "loss": 0.5767,
2518
+ "step": 4050
2519
+ },
2520
+ {
2521
+ "epoch": 51.39,
2522
+ "learning_rate": 1.3229999999999999e-05,
2523
+ "loss": 0.5749,
2524
+ "step": 4060
2525
+ },
2526
+ {
2527
+ "epoch": 51.52,
2528
+ "learning_rate": 1.2929999999999998e-05,
2529
+ "loss": 0.565,
2530
+ "step": 4070
2531
+ },
2532
+ {
2533
+ "epoch": 51.65,
2534
+ "learning_rate": 1.2629999999999998e-05,
2535
+ "loss": 0.5907,
2536
+ "step": 4080
2537
+ },
2538
+ {
2539
+ "epoch": 51.77,
2540
+ "learning_rate": 1.2329999999999999e-05,
2541
+ "loss": 0.575,
2542
+ "step": 4090
2543
+ },
2544
+ {
2545
+ "epoch": 51.9,
2546
+ "learning_rate": 1.2029999999999998e-05,
2547
+ "loss": 0.5692,
2548
+ "step": 4100
2549
+ },
2550
+ {
2551
+ "epoch": 52.03,
2552
+ "learning_rate": 1.173e-05,
2553
+ "loss": 0.5219,
2554
+ "step": 4110
2555
+ },
2556
+ {
2557
+ "epoch": 52.15,
2558
+ "learning_rate": 1.1429999999999999e-05,
2559
+ "loss": 0.5535,
2560
+ "step": 4120
2561
+ },
2562
+ {
2563
+ "epoch": 52.28,
2564
+ "learning_rate": 1.113e-05,
2565
+ "loss": 0.5519,
2566
+ "step": 4130
2567
+ },
2568
+ {
2569
+ "epoch": 52.41,
2570
+ "learning_rate": 1.083e-05,
2571
+ "loss": 0.5826,
2572
+ "step": 4140
2573
+ },
2574
+ {
2575
+ "epoch": 52.53,
2576
+ "learning_rate": 1.0529999999999999e-05,
2577
+ "loss": 0.5472,
2578
+ "step": 4150
2579
+ },
2580
+ {
2581
+ "epoch": 52.66,
2582
+ "learning_rate": 1.0229999999999999e-05,
2583
+ "loss": 0.5603,
2584
+ "step": 4160
2585
+ },
2586
+ {
2587
+ "epoch": 52.78,
2588
+ "learning_rate": 9.929999999999998e-06,
2589
+ "loss": 0.589,
2590
+ "step": 4170
2591
+ },
2592
+ {
2593
+ "epoch": 52.91,
2594
+ "learning_rate": 9.629999999999998e-06,
2595
+ "loss": 0.6005,
2596
+ "step": 4180
2597
+ },
2598
+ {
2599
+ "epoch": 53.04,
2600
+ "learning_rate": 9.329999999999999e-06,
2601
+ "loss": 0.5844,
2602
+ "step": 4190
2603
+ },
2604
+ {
2605
+ "epoch": 53.16,
2606
+ "learning_rate": 9.029999999999998e-06,
2607
+ "loss": 0.5779,
2608
+ "step": 4200
2609
+ },
2610
+ {
2611
+ "epoch": 53.29,
2612
+ "learning_rate": 8.73e-06,
2613
+ "loss": 0.5611,
2614
+ "step": 4210
2615
+ },
2616
+ {
2617
+ "epoch": 53.42,
2618
+ "learning_rate": 8.429999999999999e-06,
2619
+ "loss": 0.5859,
2620
+ "step": 4220
2621
+ },
2622
+ {
2623
+ "epoch": 53.54,
2624
+ "learning_rate": 8.129999999999998e-06,
2625
+ "loss": 0.5906,
2626
+ "step": 4230
2627
+ },
2628
+ {
2629
+ "epoch": 53.67,
2630
+ "learning_rate": 7.83e-06,
2631
+ "loss": 0.5522,
2632
+ "step": 4240
2633
+ },
2634
+ {
2635
+ "epoch": 53.8,
2636
+ "learning_rate": 7.56e-06,
2637
+ "loss": 0.5762,
2638
+ "step": 4250
2639
+ },
2640
+ {
2641
+ "epoch": 53.92,
2642
+ "learning_rate": 7.259999999999999e-06,
2643
+ "loss": 0.5498,
2644
+ "step": 4260
2645
+ },
2646
+ {
2647
+ "epoch": 54.05,
2648
+ "learning_rate": 6.959999999999999e-06,
2649
+ "loss": 0.5752,
2650
+ "step": 4270
2651
+ },
2652
+ {
2653
+ "epoch": 54.18,
2654
+ "learning_rate": 6.66e-06,
2655
+ "loss": 0.5428,
2656
+ "step": 4280
2657
+ },
2658
+ {
2659
+ "epoch": 54.3,
2660
+ "learning_rate": 6.359999999999999e-06,
2661
+ "loss": 0.5515,
2662
+ "step": 4290
2663
+ },
2664
+ {
2665
+ "epoch": 54.43,
2666
+ "learning_rate": 6.06e-06,
2667
+ "loss": 0.5662,
2668
+ "step": 4300
2669
+ },
2670
+ {
2671
+ "epoch": 54.56,
2672
+ "learning_rate": 5.759999999999999e-06,
2673
+ "loss": 0.5916,
2674
+ "step": 4310
2675
+ },
2676
+ {
2677
+ "epoch": 54.68,
2678
+ "learning_rate": 5.459999999999999e-06,
2679
+ "loss": 0.537,
2680
+ "step": 4320
2681
+ },
2682
+ {
2683
+ "epoch": 54.81,
2684
+ "learning_rate": 5.16e-06,
2685
+ "loss": 0.5744,
2686
+ "step": 4330
2687
+ },
2688
+ {
2689
+ "epoch": 54.94,
2690
+ "learning_rate": 4.859999999999999e-06,
2691
+ "loss": 0.5606,
2692
+ "step": 4340
2693
+ },
2694
+ {
2695
+ "epoch": 55.06,
2696
+ "learning_rate": 4.5599999999999995e-06,
2697
+ "loss": 0.5855,
2698
+ "step": 4350
2699
+ },
2700
+ {
2701
+ "epoch": 55.19,
2702
+ "learning_rate": 4.26e-06,
2703
+ "loss": 0.5486,
2704
+ "step": 4360
2705
+ },
2706
+ {
2707
+ "epoch": 55.32,
2708
+ "learning_rate": 3.959999999999999e-06,
2709
+ "loss": 0.5644,
2710
+ "step": 4370
2711
+ },
2712
+ {
2713
+ "epoch": 55.44,
2714
+ "learning_rate": 3.66e-06,
2715
+ "loss": 0.5525,
2716
+ "step": 4380
2717
+ },
2718
+ {
2719
+ "epoch": 55.57,
2720
+ "learning_rate": 3.3599999999999996e-06,
2721
+ "loss": 0.6088,
2722
+ "step": 4390
2723
+ },
2724
+ {
2725
+ "epoch": 55.7,
2726
+ "learning_rate": 3.06e-06,
2727
+ "loss": 0.5344,
2728
+ "step": 4400
2729
+ },
2730
+ {
2731
+ "epoch": 55.82,
2732
+ "learning_rate": 2.76e-06,
2733
+ "loss": 0.5379,
2734
+ "step": 4410
2735
+ },
2736
+ {
2737
+ "epoch": 55.95,
2738
+ "learning_rate": 2.46e-06,
2739
+ "loss": 0.5204,
2740
+ "step": 4420
2741
+ },
2742
+ {
2743
+ "epoch": 56.08,
2744
+ "learning_rate": 2.1599999999999996e-06,
2745
+ "loss": 0.5754,
2746
+ "step": 4430
2747
+ },
2748
+ {
2749
+ "epoch": 56.2,
2750
+ "learning_rate": 1.8599999999999998e-06,
2751
+ "loss": 0.5507,
2752
+ "step": 4440
2753
+ },
2754
+ {
2755
+ "epoch": 56.33,
2756
+ "learning_rate": 1.5599999999999999e-06,
2757
+ "loss": 0.5592,
2758
+ "step": 4450
2759
+ },
2760
+ {
2761
+ "epoch": 56.46,
2762
+ "learning_rate": 1.2599999999999998e-06,
2763
+ "loss": 0.5396,
2764
+ "step": 4460
2765
+ },
2766
+ {
2767
+ "epoch": 56.58,
2768
+ "learning_rate": 9.6e-07,
2769
+ "loss": 0.579,
2770
+ "step": 4470
2771
+ },
2772
+ {
2773
+ "epoch": 56.71,
2774
+ "learning_rate": 6.6e-07,
2775
+ "loss": 0.545,
2776
+ "step": 4480
2777
+ },
2778
+ {
2779
+ "epoch": 56.84,
2780
+ "learning_rate": 3.5999999999999994e-07,
2781
+ "loss": 0.5919,
2782
+ "step": 4490
2783
+ },
2784
+ {
2785
+ "epoch": 56.96,
2786
+ "learning_rate": 6e-08,
2787
+ "loss": 0.5174,
2788
+ "step": 4500
2789
+ },
2790
+ {
2791
+ "epoch": 56.96,
2792
+ "eval_cer": 0.19719182325005163,
2793
+ "eval_loss": 0.9287102818489075,
2794
+ "eval_runtime": 44.3461,
2795
+ "eval_samples_per_second": 10.847,
2796
+ "eval_steps_per_second": 1.376,
2797
+ "eval_wer": 0.5139836289222374,
2798
+ "step": 4500
2799
+ },
2800
+ {
2801
+ "epoch": 57.09,
2802
+ "learning_rate": 1.2299999999999999e-05,
2803
+ "loss": 0.5852,
2804
+ "step": 4510
2805
+ },
2806
+ {
2807
+ "epoch": 57.22,
2808
+ "learning_rate": 1.205e-05,
2809
+ "loss": 0.5752,
2810
+ "step": 4520
2811
+ },
2812
+ {
2813
+ "epoch": 57.34,
2814
+ "learning_rate": 1.1799999999999999e-05,
2815
+ "loss": 0.5433,
2816
+ "step": 4530
2817
+ },
2818
+ {
2819
+ "epoch": 57.47,
2820
+ "learning_rate": 1.155e-05,
2821
+ "loss": 0.5648,
2822
+ "step": 4540
2823
+ },
2824
+ {
2825
+ "epoch": 57.59,
2826
+ "learning_rate": 1.1299999999999999e-05,
2827
+ "loss": 0.5704,
2828
+ "step": 4550
2829
+ },
2830
+ {
2831
+ "epoch": 57.72,
2832
+ "learning_rate": 1.105e-05,
2833
+ "loss": 0.5216,
2834
+ "step": 4560
2835
+ },
2836
+ {
2837
+ "epoch": 57.85,
2838
+ "learning_rate": 1.0799999999999998e-05,
2839
+ "loss": 0.5998,
2840
+ "step": 4570
2841
+ },
2842
+ {
2843
+ "epoch": 57.97,
2844
+ "learning_rate": 1.0549999999999999e-05,
2845
+ "loss": 0.5439,
2846
+ "step": 4580
2847
+ },
2848
+ {
2849
+ "epoch": 58.1,
2850
+ "learning_rate": 1.03e-05,
2851
+ "loss": 0.5679,
2852
+ "step": 4590
2853
+ },
2854
+ {
2855
+ "epoch": 58.23,
2856
+ "learning_rate": 1.005e-05,
2857
+ "loss": 0.5621,
2858
+ "step": 4600
2859
+ },
2860
+ {
2861
+ "epoch": 58.35,
2862
+ "learning_rate": 9.799999999999998e-06,
2863
+ "loss": 0.5557,
2864
+ "step": 4610
2865
+ },
2866
+ {
2867
+ "epoch": 58.48,
2868
+ "learning_rate": 9.549999999999998e-06,
2869
+ "loss": 0.5525,
2870
+ "step": 4620
2871
+ },
2872
+ {
2873
+ "epoch": 58.61,
2874
+ "learning_rate": 9.299999999999999e-06,
2875
+ "loss": 0.6033,
2876
+ "step": 4630
2877
+ },
2878
+ {
2879
+ "epoch": 58.73,
2880
+ "learning_rate": 9.05e-06,
2881
+ "loss": 0.5059,
2882
+ "step": 4640
2883
+ },
2884
+ {
2885
+ "epoch": 58.86,
2886
+ "learning_rate": 8.799999999999999e-06,
2887
+ "loss": 0.5362,
2888
+ "step": 4650
2889
+ },
2890
+ {
2891
+ "epoch": 58.99,
2892
+ "learning_rate": 8.55e-06,
2893
+ "loss": 0.535,
2894
+ "step": 4660
2895
+ },
2896
+ {
2897
+ "epoch": 59.11,
2898
+ "learning_rate": 8.299999999999998e-06,
2899
+ "loss": 0.586,
2900
+ "step": 4670
2901
+ },
2902
+ {
2903
+ "epoch": 59.24,
2904
+ "learning_rate": 8.05e-06,
2905
+ "loss": 0.55,
2906
+ "step": 4680
2907
+ },
2908
+ {
2909
+ "epoch": 59.37,
2910
+ "learning_rate": 7.799999999999998e-06,
2911
+ "loss": 0.582,
2912
+ "step": 4690
2913
+ },
2914
+ {
2915
+ "epoch": 59.49,
2916
+ "learning_rate": 7.55e-06,
2917
+ "loss": 0.5065,
2918
+ "step": 4700
2919
+ },
2920
+ {
2921
+ "epoch": 59.62,
2922
+ "learning_rate": 7.299999999999999e-06,
2923
+ "loss": 0.5715,
2924
+ "step": 4710
2925
+ },
2926
+ {
2927
+ "epoch": 59.75,
2928
+ "learning_rate": 7.049999999999999e-06,
2929
+ "loss": 0.5359,
2930
+ "step": 4720
2931
+ },
2932
+ {
2933
+ "epoch": 59.87,
2934
+ "learning_rate": 6.8e-06,
2935
+ "loss": 0.5535,
2936
+ "step": 4730
2937
+ },
2938
+ {
2939
+ "epoch": 60.0,
2940
+ "learning_rate": 6.549999999999999e-06,
2941
+ "loss": 0.5256,
2942
+ "step": 4740
2943
+ },
2944
+ {
2945
+ "epoch": 60.13,
2946
+ "learning_rate": 6.3e-06,
2947
+ "loss": 0.5869,
2948
+ "step": 4750
2949
+ },
2950
+ {
2951
+ "epoch": 60.25,
2952
+ "learning_rate": 6.05e-06,
2953
+ "loss": 0.4978,
2954
+ "step": 4760
2955
+ },
2956
+ {
2957
+ "epoch": 60.38,
2958
+ "learning_rate": 5.7999999999999995e-06,
2959
+ "loss": 0.5402,
2960
+ "step": 4770
2961
+ },
2962
+ {
2963
+ "epoch": 60.51,
2964
+ "learning_rate": 5.549999999999999e-06,
2965
+ "loss": 0.5607,
2966
+ "step": 4780
2967
+ },
2968
+ {
2969
+ "epoch": 60.63,
2970
+ "learning_rate": 5.3e-06,
2971
+ "loss": 0.5583,
2972
+ "step": 4790
2973
+ },
2974
+ {
2975
+ "epoch": 60.76,
2976
+ "learning_rate": 5.049999999999999e-06,
2977
+ "loss": 0.5382,
2978
+ "step": 4800
2979
+ },
2980
+ {
2981
+ "epoch": 60.89,
2982
+ "learning_rate": 4.8e-06,
2983
+ "loss": 0.5498,
2984
+ "step": 4810
2985
+ },
2986
+ {
2987
+ "epoch": 61.01,
2988
+ "learning_rate": 4.55e-06,
2989
+ "loss": 0.5443,
2990
+ "step": 4820
2991
+ },
2992
+ {
2993
+ "epoch": 61.14,
2994
+ "learning_rate": 4.2999999999999995e-06,
2995
+ "loss": 0.5579,
2996
+ "step": 4830
2997
+ },
2998
+ {
2999
+ "epoch": 61.27,
3000
+ "learning_rate": 4.049999999999999e-06,
3001
+ "loss": 0.517,
3002
+ "step": 4840
3003
+ },
3004
+ {
3005
+ "epoch": 61.39,
3006
+ "learning_rate": 3.7999999999999996e-06,
3007
+ "loss": 0.566,
3008
+ "step": 4850
3009
+ },
3010
+ {
3011
+ "epoch": 61.52,
3012
+ "learning_rate": 3.5499999999999995e-06,
3013
+ "loss": 0.572,
3014
+ "step": 4860
3015
+ },
3016
+ {
3017
+ "epoch": 61.65,
3018
+ "learning_rate": 3.2999999999999993e-06,
3019
+ "loss": 0.5425,
3020
+ "step": 4870
3021
+ },
3022
+ {
3023
+ "epoch": 61.77,
3024
+ "learning_rate": 3.0499999999999996e-06,
3025
+ "loss": 0.5617,
3026
+ "step": 4880
3027
+ },
3028
+ {
3029
+ "epoch": 61.9,
3030
+ "learning_rate": 2.8e-06,
3031
+ "loss": 0.5352,
3032
+ "step": 4890
3033
+ },
3034
+ {
3035
+ "epoch": 62.03,
3036
+ "learning_rate": 2.55e-06,
3037
+ "loss": 0.5328,
3038
+ "step": 4900
3039
+ },
3040
+ {
3041
+ "epoch": 62.15,
3042
+ "learning_rate": 2.2999999999999996e-06,
3043
+ "loss": 0.567,
3044
+ "step": 4910
3045
+ },
3046
+ {
3047
+ "epoch": 62.28,
3048
+ "learning_rate": 2.05e-06,
3049
+ "loss": 0.554,
3050
+ "step": 4920
3051
+ },
3052
+ {
3053
+ "epoch": 62.41,
3054
+ "learning_rate": 1.8e-06,
3055
+ "loss": 0.5846,
3056
+ "step": 4930
3057
+ },
3058
+ {
3059
+ "epoch": 62.53,
3060
+ "learning_rate": 1.5499999999999998e-06,
3061
+ "loss": 0.5451,
3062
+ "step": 4940
3063
+ },
3064
+ {
3065
+ "epoch": 62.66,
3066
+ "learning_rate": 1.2999999999999998e-06,
3067
+ "loss": 0.5251,
3068
+ "step": 4950
3069
+ },
3070
+ {
3071
+ "epoch": 62.78,
3072
+ "learning_rate": 1.05e-06,
3073
+ "loss": 0.5373,
3074
+ "step": 4960
3075
+ },
3076
+ {
3077
+ "epoch": 62.91,
3078
+ "learning_rate": 7.999999999999999e-07,
3079
+ "loss": 0.5245,
3080
+ "step": 4970
3081
+ },
3082
+ {
3083
+ "epoch": 63.04,
3084
+ "learning_rate": 5.499999999999999e-07,
3085
+ "loss": 0.5535,
3086
+ "step": 4980
3087
+ },
3088
+ {
3089
+ "epoch": 63.16,
3090
+ "learning_rate": 3e-07,
3091
+ "loss": 0.545,
3092
+ "step": 4990
3093
+ },
3094
+ {
3095
+ "epoch": 63.29,
3096
+ "learning_rate": 4.999999999999999e-08,
3097
+ "loss": 0.5462,
3098
+ "step": 5000
3099
+ },
3100
+ {
3101
+ "epoch": 63.29,
3102
+ "eval_cer": 0.19739830683460666,
3103
+ "eval_loss": 0.9370450973510742,
3104
+ "eval_runtime": 45.0983,
3105
+ "eval_samples_per_second": 10.666,
3106
+ "eval_steps_per_second": 1.353,
3107
+ "eval_wer": 0.5138130968622101,
3108
+ "step": 5000
3109
+ },
3110
+ {
3111
+ "epoch": 63.42,
3112
+ "learning_rate": 8.2e-06,
3113
+ "loss": 0.5146,
3114
+ "step": 5010
3115
+ },
3116
+ {
3117
+ "epoch": 63.54,
3118
+ "learning_rate": 8.033333333333333e-06,
3119
+ "loss": 0.5334,
3120
+ "step": 5020
3121
+ },
3122
+ {
3123
+ "epoch": 63.67,
3124
+ "learning_rate": 7.866666666666667e-06,
3125
+ "loss": 0.5824,
3126
+ "step": 5030
3127
+ },
3128
+ {
3129
+ "epoch": 63.8,
3130
+ "learning_rate": 7.699999999999999e-06,
3131
+ "loss": 0.5354,
3132
+ "step": 5040
3133
+ },
3134
+ {
3135
+ "epoch": 63.92,
3136
+ "learning_rate": 7.533333333333333e-06,
3137
+ "loss": 0.5225,
3138
+ "step": 5050
3139
+ },
3140
+ {
3141
+ "epoch": 64.05,
3142
+ "learning_rate": 7.366666666666666e-06,
3143
+ "loss": 0.5296,
3144
+ "step": 5060
3145
+ },
3146
+ {
3147
+ "epoch": 64.18,
3148
+ "learning_rate": 7.2e-06,
3149
+ "loss": 0.525,
3150
+ "step": 5070
3151
+ },
3152
+ {
3153
+ "epoch": 64.3,
3154
+ "learning_rate": 7.033333333333333e-06,
3155
+ "loss": 0.5549,
3156
+ "step": 5080
3157
+ },
3158
+ {
3159
+ "epoch": 64.43,
3160
+ "learning_rate": 6.8666666666666664e-06,
3161
+ "loss": 0.5579,
3162
+ "step": 5090
3163
+ },
3164
+ {
3165
+ "epoch": 64.56,
3166
+ "learning_rate": 6.699999999999999e-06,
3167
+ "loss": 0.5527,
3168
+ "step": 5100
3169
+ },
3170
+ {
3171
+ "epoch": 64.68,
3172
+ "learning_rate": 6.533333333333333e-06,
3173
+ "loss": 0.5191,
3174
+ "step": 5110
3175
+ },
3176
+ {
3177
+ "epoch": 64.81,
3178
+ "learning_rate": 6.366666666666666e-06,
3179
+ "loss": 0.5591,
3180
+ "step": 5120
3181
+ },
3182
+ {
3183
+ "epoch": 64.94,
3184
+ "learning_rate": 6.199999999999999e-06,
3185
+ "loss": 0.5371,
3186
+ "step": 5130
3187
+ },
3188
+ {
3189
+ "epoch": 65.06,
3190
+ "learning_rate": 6.033333333333333e-06,
3191
+ "loss": 0.5527,
3192
+ "step": 5140
3193
+ },
3194
+ {
3195
+ "epoch": 65.19,
3196
+ "learning_rate": 5.866666666666666e-06,
3197
+ "loss": 0.5318,
3198
+ "step": 5150
3199
+ },
3200
+ {
3201
+ "epoch": 65.32,
3202
+ "learning_rate": 5.7e-06,
3203
+ "loss": 0.5684,
3204
+ "step": 5160
3205
+ },
3206
+ {
3207
+ "epoch": 65.44,
3208
+ "learning_rate": 5.533333333333333e-06,
3209
+ "loss": 0.528,
3210
+ "step": 5170
3211
+ },
3212
+ {
3213
+ "epoch": 65.57,
3214
+ "learning_rate": 5.366666666666666e-06,
3215
+ "loss": 0.5366,
3216
+ "step": 5180
3217
+ },
3218
+ {
3219
+ "epoch": 65.7,
3220
+ "learning_rate": 5.199999999999999e-06,
3221
+ "loss": 0.5482,
3222
+ "step": 5190
3223
+ },
3224
+ {
3225
+ "epoch": 65.82,
3226
+ "learning_rate": 5.033333333333332e-06,
3227
+ "loss": 0.5402,
3228
+ "step": 5200
3229
+ },
3230
+ {
3231
+ "epoch": 65.95,
3232
+ "learning_rate": 4.866666666666666e-06,
3233
+ "loss": 0.5568,
3234
+ "step": 5210
3235
+ },
3236
+ {
3237
+ "epoch": 66.08,
3238
+ "learning_rate": 4.699999999999999e-06,
3239
+ "loss": 0.5466,
3240
+ "step": 5220
3241
+ },
3242
+ {
3243
+ "epoch": 66.2,
3244
+ "learning_rate": 4.533333333333333e-06,
3245
+ "loss": 0.5353,
3246
+ "step": 5230
3247
+ },
3248
+ {
3249
+ "epoch": 66.33,
3250
+ "learning_rate": 4.366666666666667e-06,
3251
+ "loss": 0.5629,
3252
+ "step": 5240
3253
+ },
3254
+ {
3255
+ "epoch": 66.46,
3256
+ "learning_rate": 4.2e-06,
3257
+ "loss": 0.5227,
3258
+ "step": 5250
3259
+ },
3260
+ {
3261
+ "epoch": 66.58,
3262
+ "learning_rate": 4.033333333333333e-06,
3263
+ "loss": 0.5126,
3264
+ "step": 5260
3265
+ },
3266
+ {
3267
+ "epoch": 66.71,
3268
+ "learning_rate": 3.866666666666666e-06,
3269
+ "loss": 0.5173,
3270
+ "step": 5270
3271
+ },
3272
+ {
3273
+ "epoch": 66.84,
3274
+ "learning_rate": 3.6999999999999997e-06,
3275
+ "loss": 0.5773,
3276
+ "step": 5280
3277
+ },
3278
+ {
3279
+ "epoch": 66.96,
3280
+ "learning_rate": 3.533333333333333e-06,
3281
+ "loss": 0.5131,
3282
+ "step": 5290
3283
+ },
3284
+ {
3285
+ "epoch": 67.09,
3286
+ "learning_rate": 3.3666666666666665e-06,
3287
+ "loss": 0.5592,
3288
+ "step": 5300
3289
+ },
3290
+ {
3291
+ "epoch": 67.22,
3292
+ "learning_rate": 3.1999999999999994e-06,
3293
+ "loss": 0.5164,
3294
+ "step": 5310
3295
+ },
3296
+ {
3297
+ "epoch": 67.34,
3298
+ "learning_rate": 3.033333333333333e-06,
3299
+ "loss": 0.5166,
3300
+ "step": 5320
3301
+ },
3302
+ {
3303
+ "epoch": 67.47,
3304
+ "learning_rate": 2.866666666666666e-06,
3305
+ "loss": 0.5079,
3306
+ "step": 5330
3307
+ },
3308
+ {
3309
+ "epoch": 67.59,
3310
+ "learning_rate": 2.6999999999999996e-06,
3311
+ "loss": 0.547,
3312
+ "step": 5340
3313
+ },
3314
+ {
3315
+ "epoch": 67.72,
3316
+ "learning_rate": 2.533333333333333e-06,
3317
+ "loss": 0.5188,
3318
+ "step": 5350
3319
+ },
3320
+ {
3321
+ "epoch": 67.85,
3322
+ "learning_rate": 2.3666666666666667e-06,
3323
+ "loss": 0.5779,
3324
+ "step": 5360
3325
+ },
3326
+ {
3327
+ "epoch": 67.97,
3328
+ "learning_rate": 2.1999999999999997e-06,
3329
+ "loss": 0.5424,
3330
+ "step": 5370
3331
+ },
3332
+ {
3333
+ "epoch": 68.1,
3334
+ "learning_rate": 2.033333333333333e-06,
3335
+ "loss": 0.5307,
3336
+ "step": 5380
3337
+ },
3338
+ {
3339
+ "epoch": 68.23,
3340
+ "learning_rate": 1.8666666666666664e-06,
3341
+ "loss": 0.5353,
3342
+ "step": 5390
3343
+ },
3344
+ {
3345
+ "epoch": 68.35,
3346
+ "learning_rate": 1.7e-06,
3347
+ "loss": 0.5521,
3348
+ "step": 5400
3349
+ },
3350
+ {
3351
+ "epoch": 68.48,
3352
+ "learning_rate": 1.5333333333333332e-06,
3353
+ "loss": 0.5024,
3354
+ "step": 5410
3355
+ },
3356
+ {
3357
+ "epoch": 68.61,
3358
+ "learning_rate": 1.3666666666666666e-06,
3359
+ "loss": 0.5765,
3360
+ "step": 5420
3361
+ },
3362
+ {
3363
+ "epoch": 68.73,
3364
+ "learning_rate": 1.2e-06,
3365
+ "loss": 0.497,
3366
+ "step": 5430
3367
+ },
3368
+ {
3369
+ "epoch": 68.86,
3370
+ "learning_rate": 1.0333333333333333e-06,
3371
+ "loss": 0.5822,
3372
+ "step": 5440
3373
+ },
3374
+ {
3375
+ "epoch": 68.99,
3376
+ "learning_rate": 8.666666666666666e-07,
3377
+ "loss": 0.5189,
3378
+ "step": 5450
3379
+ },
3380
+ {
3381
+ "epoch": 69.11,
3382
+ "learning_rate": 7e-07,
3383
+ "loss": 0.5356,
3384
+ "step": 5460
3385
+ },
3386
+ {
3387
+ "epoch": 69.24,
3388
+ "learning_rate": 5.333333333333333e-07,
3389
+ "loss": 0.5289,
3390
+ "step": 5470
3391
+ },
3392
+ {
3393
+ "epoch": 69.37,
3394
+ "learning_rate": 3.666666666666666e-07,
3395
+ "loss": 0.5522,
3396
+ "step": 5480
3397
+ },
3398
+ {
3399
+ "epoch": 69.49,
3400
+ "learning_rate": 1.9999999999999996e-07,
3401
+ "loss": 0.4897,
3402
+ "step": 5490
3403
+ },
3404
+ {
3405
+ "epoch": 69.62,
3406
+ "learning_rate": 3.3333333333333334e-08,
3407
+ "loss": 0.5564,
3408
+ "step": 5500
3409
+ },
3410
+ {
3411
+ "epoch": 69.62,
3412
+ "eval_cer": 0.19773618906387852,
3413
+ "eval_loss": 0.9461079239845276,
3414
+ "eval_runtime": 45.3275,
3415
+ "eval_samples_per_second": 10.612,
3416
+ "eval_steps_per_second": 1.346,
3417
+ "eval_wer": 0.5148362892223738,
3418
+ "step": 5500
3419
+ },
3420
+ {
3421
+ "epoch": 69.75,
3422
+ "learning_rate": 7.38e-06,
3423
+ "loss": 0.525,
3424
+ "step": 5510
3425
+ },
3426
+ {
3427
+ "epoch": 69.87,
3428
+ "learning_rate": 7.229999999999999e-06,
3429
+ "loss": 0.5232,
3430
+ "step": 5520
3431
+ },
3432
+ {
3433
+ "epoch": 70.0,
3434
+ "learning_rate": 7.079999999999999e-06,
3435
+ "loss": 0.5318,
3436
+ "step": 5530
3437
+ },
3438
+ {
3439
+ "epoch": 70.13,
3440
+ "learning_rate": 6.929999999999999e-06,
3441
+ "loss": 0.562,
3442
+ "step": 5540
3443
+ },
3444
+ {
3445
+ "epoch": 70.25,
3446
+ "learning_rate": 6.779999999999999e-06,
3447
+ "loss": 0.494,
3448
+ "step": 5550
3449
+ },
3450
+ {
3451
+ "epoch": 70.38,
3452
+ "learning_rate": 6.63e-06,
3453
+ "loss": 0.5314,
3454
+ "step": 5560
3455
+ },
3456
+ {
3457
+ "epoch": 70.51,
3458
+ "learning_rate": 6.48e-06,
3459
+ "loss": 0.5332,
3460
+ "step": 5570
3461
+ },
3462
+ {
3463
+ "epoch": 70.63,
3464
+ "learning_rate": 6.3299999999999995e-06,
3465
+ "loss": 0.552,
3466
+ "step": 5580
3467
+ },
3468
+ {
3469
+ "epoch": 70.76,
3470
+ "learning_rate": 6.179999999999999e-06,
3471
+ "loss": 0.5538,
3472
+ "step": 5590
3473
+ },
3474
+ {
3475
+ "epoch": 70.89,
3476
+ "learning_rate": 6.029999999999999e-06,
3477
+ "loss": 0.5507,
3478
+ "step": 5600
3479
+ },
3480
+ {
3481
+ "epoch": 71.01,
3482
+ "learning_rate": 5.88e-06,
3483
+ "loss": 0.5207,
3484
+ "step": 5610
3485
+ },
3486
+ {
3487
+ "epoch": 71.14,
3488
+ "learning_rate": 5.729999999999999e-06,
3489
+ "loss": 0.5613,
3490
+ "step": 5620
3491
+ },
3492
+ {
3493
+ "epoch": 71.27,
3494
+ "learning_rate": 5.579999999999999e-06,
3495
+ "loss": 0.5263,
3496
+ "step": 5630
3497
+ },
3498
+ {
3499
+ "epoch": 71.39,
3500
+ "learning_rate": 5.43e-06,
3501
+ "loss": 0.5138,
3502
+ "step": 5640
3503
+ },
3504
+ {
3505
+ "epoch": 71.52,
3506
+ "learning_rate": 5.28e-06,
3507
+ "loss": 0.5268,
3508
+ "step": 5650
3509
+ },
3510
+ {
3511
+ "epoch": 71.65,
3512
+ "learning_rate": 5.13e-06,
3513
+ "loss": 0.5285,
3514
+ "step": 5660
3515
+ },
3516
+ {
3517
+ "epoch": 71.77,
3518
+ "learning_rate": 4.98e-06,
3519
+ "loss": 0.539,
3520
+ "step": 5670
3521
+ },
3522
+ {
3523
+ "epoch": 71.9,
3524
+ "learning_rate": 4.8299999999999995e-06,
3525
+ "loss": 0.5518,
3526
+ "step": 5680
3527
+ },
3528
+ {
3529
+ "epoch": 72.03,
3530
+ "learning_rate": 4.679999999999999e-06,
3531
+ "loss": 0.5392,
3532
+ "step": 5690
3533
+ },
3534
+ {
3535
+ "epoch": 72.15,
3536
+ "learning_rate": 4.53e-06,
3537
+ "loss": 0.5341,
3538
+ "step": 5700
3539
+ },
3540
+ {
3541
+ "epoch": 72.28,
3542
+ "learning_rate": 4.3799999999999996e-06,
3543
+ "loss": 0.528,
3544
+ "step": 5710
3545
+ },
3546
+ {
3547
+ "epoch": 72.41,
3548
+ "learning_rate": 4.229999999999999e-06,
3549
+ "loss": 0.5285,
3550
+ "step": 5720
3551
+ },
3552
+ {
3553
+ "epoch": 72.53,
3554
+ "learning_rate": 4.079999999999999e-06,
3555
+ "loss": 0.5291,
3556
+ "step": 5730
3557
+ },
3558
+ {
3559
+ "epoch": 72.66,
3560
+ "learning_rate": 3.93e-06,
3561
+ "loss": 0.56,
3562
+ "step": 5740
3563
+ },
3564
+ {
3565
+ "epoch": 72.78,
3566
+ "learning_rate": 3.78e-06,
3567
+ "loss": 0.5638,
3568
+ "step": 5750
3569
+ },
3570
+ {
3571
+ "epoch": 72.91,
3572
+ "learning_rate": 3.6299999999999995e-06,
3573
+ "loss": 0.5111,
3574
+ "step": 5760
3575
+ },
3576
+ {
3577
+ "epoch": 73.04,
3578
+ "learning_rate": 3.4799999999999993e-06,
3579
+ "loss": 0.5239,
3580
+ "step": 5770
3581
+ },
3582
+ {
3583
+ "epoch": 73.16,
3584
+ "learning_rate": 3.33e-06,
3585
+ "loss": 0.5386,
3586
+ "step": 5780
3587
+ },
3588
+ {
3589
+ "epoch": 73.29,
3590
+ "learning_rate": 3.1799999999999996e-06,
3591
+ "loss": 0.525,
3592
+ "step": 5790
3593
+ },
3594
+ {
3595
+ "epoch": 73.42,
3596
+ "learning_rate": 3.03e-06,
3597
+ "loss": 0.5051,
3598
+ "step": 5800
3599
+ },
3600
+ {
3601
+ "epoch": 73.54,
3602
+ "learning_rate": 2.8799999999999995e-06,
3603
+ "loss": 0.5119,
3604
+ "step": 5810
3605
+ },
3606
+ {
3607
+ "epoch": 73.67,
3608
+ "learning_rate": 2.7299999999999997e-06,
3609
+ "loss": 0.5209,
3610
+ "step": 5820
3611
+ },
3612
+ {
3613
+ "epoch": 73.8,
3614
+ "learning_rate": 2.58e-06,
3615
+ "loss": 0.5659,
3616
+ "step": 5830
3617
+ },
3618
+ {
3619
+ "epoch": 73.92,
3620
+ "learning_rate": 2.4299999999999996e-06,
3621
+ "loss": 0.5178,
3622
+ "step": 5840
3623
+ },
3624
+ {
3625
+ "epoch": 74.05,
3626
+ "learning_rate": 2.2799999999999998e-06,
3627
+ "loss": 0.5523,
3628
+ "step": 5850
3629
+ },
3630
+ {
3631
+ "epoch": 74.18,
3632
+ "learning_rate": 2.13e-06,
3633
+ "loss": 0.5048,
3634
+ "step": 5860
3635
+ },
3636
+ {
3637
+ "epoch": 74.3,
3638
+ "learning_rate": 1.9799999999999997e-06,
3639
+ "loss": 0.5109,
3640
+ "step": 5870
3641
+ },
3642
+ {
3643
+ "epoch": 74.43,
3644
+ "learning_rate": 1.83e-06,
3645
+ "loss": 0.5092,
3646
+ "step": 5880
3647
+ },
3648
+ {
3649
+ "epoch": 74.56,
3650
+ "learning_rate": 1.6799999999999998e-06,
3651
+ "loss": 0.5439,
3652
+ "step": 5890
3653
+ },
3654
+ {
3655
+ "epoch": 74.68,
3656
+ "learning_rate": 1.53e-06,
3657
+ "loss": 0.5501,
3658
+ "step": 5900
3659
+ },
3660
+ {
3661
+ "epoch": 74.81,
3662
+ "learning_rate": 1.38e-06,
3663
+ "loss": 0.5628,
3664
+ "step": 5910
3665
+ },
3666
+ {
3667
+ "epoch": 74.94,
3668
+ "learning_rate": 1.23e-06,
3669
+ "loss": 0.5097,
3670
+ "step": 5920
3671
+ },
3672
+ {
3673
+ "epoch": 75.06,
3674
+ "learning_rate": 1.0799999999999998e-06,
3675
+ "loss": 0.5363,
3676
+ "step": 5930
3677
+ },
3678
+ {
3679
+ "epoch": 75.19,
3680
+ "learning_rate": 9.299999999999999e-07,
3681
+ "loss": 0.5304,
3682
+ "step": 5940
3683
+ },
3684
+ {
3685
+ "epoch": 75.32,
3686
+ "learning_rate": 7.799999999999999e-07,
3687
+ "loss": 0.5358,
3688
+ "step": 5950
3689
+ },
3690
+ {
3691
+ "epoch": 75.44,
3692
+ "learning_rate": 6.299999999999999e-07,
3693
+ "loss": 0.5262,
3694
+ "step": 5960
3695
+ },
3696
+ {
3697
+ "epoch": 75.57,
3698
+ "learning_rate": 4.8e-07,
3699
+ "loss": 0.5258,
3700
+ "step": 5970
3701
+ },
3702
+ {
3703
+ "epoch": 75.7,
3704
+ "learning_rate": 3.3e-07,
3705
+ "loss": 0.4952,
3706
+ "step": 5980
3707
+ },
3708
+ {
3709
+ "epoch": 75.82,
3710
+ "learning_rate": 1.7999999999999997e-07,
3711
+ "loss": 0.5285,
3712
+ "step": 5990
3713
+ },
3714
+ {
3715
+ "epoch": 75.95,
3716
+ "learning_rate": 3e-08,
3717
+ "loss": 0.5252,
3718
+ "step": 6000
3719
+ },
3720
+ {
3721
+ "epoch": 75.95,
3722
+ "eval_cer": 0.19692902596061795,
3723
+ "eval_loss": 0.9505288004875183,
3724
+ "eval_runtime": 45.9888,
3725
+ "eval_samples_per_second": 10.459,
3726
+ "eval_steps_per_second": 0.674,
3727
+ "eval_wer": 0.5117667121418826,
3728
+ "step": 6000
3729
+ },
3730
+ {
3731
+ "epoch": 75.95,
3732
+ "step": 6000,
3733
+ "total_flos": 6.910110276723645e+19,
3734
+ "train_loss": 0.044292491674423215,
3735
+ "train_runtime": 2233.4842,
3736
+ "train_samples_per_second": 85.964,
3737
+ "train_steps_per_second": 2.686
3738
+ }
3739
+ ],
3740
+ "max_steps": 6000,
3741
+ "num_train_epochs": 76,
3742
+ "total_flos": 6.910110276723645e+19,
3743
+ "trial_name": null,
3744
+ "trial_params": null
3745
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c320c55ba9124e0a7d45da133173ad9a74af1e9959ad8e0d0bd14767c2fba239
3
+ size 3451
vocab.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[PAD]": 114,
3
+ "[UNK]": 113,
4
+ "_": 1,
5
+ "`": 2,
6
+ "a": 3,
7
+ "b": 4,
8
+ "c": 5,
9
+ "d": 6,
10
+ "e": 7,
11
+ "f": 8,
12
+ "g": 9,
13
+ "h": 10,
14
+ "i": 11,
15
+ "j": 12,
16
+ "k": 13,
17
+ "l": 14,
18
+ "m": 15,
19
+ "n": 16,
20
+ "o": 17,
21
+ "p": 18,
22
+ "q": 19,
23
+ "r": 20,
24
+ "s": 21,
25
+ "t": 22,
26
+ "u": 23,
27
+ "v": 24,
28
+ "w": 25,
29
+ "x": 26,
30
+ "y": 27,
31
+ "z": 28,
32
+ "|": 0,
33
+ "¥": 29,
34
+ "°": 30,
35
+ "½": 31,
36
+ "¾": 32,
37
+ "é": 33,
38
+ "í": 34,
39
+ "،": 35,
40
+ "؛": 36,
41
+ "؟": 37,
42
+ "ء": 38,
43
+ "آ": 39,
44
+ "أ": 40,
45
+ "ؤ": 41,
46
+ "ئ": 42,
47
+ "ا": 43,
48
+ "ب": 44,
49
+ "ت": 45,
50
+ "ث": 46,
51
+ "ج": 47,
52
+ "ح": 48,
53
+ "خ": 49,
54
+ "د": 50,
55
+ "ذ": 51,
56
+ "ر": 52,
57
+ "ز": 53,
58
+ "س": 54,
59
+ "ش": 55,
60
+ "ص": 56,
61
+ "ض": 57,
62
+ "ط": 58,
63
+ "ظ": 59,
64
+ "ع": 60,
65
+ "غ": 61,
66
+ "ـ": 62,
67
+ "ف": 63,
68
+ "ق": 64,
69
+ "ك": 65,
70
+ "ل": 66,
71
+ "م": 67,
72
+ "ن": 68,
73
+ "ه": 69,
74
+ "و": 70,
75
+ "ى": 71,
76
+ "ي": 72,
77
+ "ً": 73,
78
+ "ٌ": 74,
79
+ "َ": 75,
80
+ "ُ": 76,
81
+ "ّ": 77,
82
+ "٪": 78,
83
+ "ټ": 79,
84
+ "پ": 80,
85
+ "ځ": 81,
86
+ "څ": 82,
87
+ "چ": 83,
88
+ "ډ": 84,
89
+ "ړ": 85,
90
+ "ږ": 86,
91
+ "ژ": 87,
92
+ "ښ": 88,
93
+ "ک": 89,
94
+ "ګ": 90,
95
+ "گ": 91,
96
+ "ڼ": 92,
97
+ "ھ": 93,
98
+ "ی": 94,
99
+ "ۍ": 95,
100
+ "ې": 96,
101
+ "ے": 97,
102
+ "۔": 98,
103
+ "۰": 99,
104
+ "۱": 100,
105
+ "۲": 101,
106
+ "۳": 102,
107
+ "۴": 103,
108
+ "۵": 104,
109
+ "۶": 105,
110
+ "۷": 106,
111
+ "۸": 107,
112
+ "۹": 108,
113
+ "‌": 109,
114
+ "‍": 110,
115
+ "–": 111,
116
+ "—": 112
117
+ }