infinitejoy committed on
Commit
9f20bd4
·
1 Parent(s): eb50e66

Upload odia_training_script.ipynb

Browse files
Files changed (1) hide show
  1. odia_training_script.ipynb +1449 -0
odia_training_script.ipynb ADDED
@@ -0,0 +1,1449 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# HuggingFace challenge - Debugger notebook\n",
8
+ "Run this notebook to verify your library versions, check the GPU configuration, and run a quick training job."
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "metadata": {
15
+ "id": "T2utsYSKszvv"
16
+ },
17
+ "outputs": [],
18
+ "source": [
19
+ "import platform\n",
20
+ "import multiprocessing\n",
21
+ "\n",
22
+ "import torch\n",
23
+ "import transformers\n",
24
+ "import datasets\n",
25
+ "\n",
26
+ "import soundfile"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "markdown",
31
+ "metadata": {},
32
+ "source": [
33
+ "## Print main information"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 2,
39
+ "metadata": {
40
+ "colab": {
41
+ "base_uri": "https://localhost:8080/"
42
+ },
43
+ "id": "5P6I-W9ts-kR",
44
+ "outputId": "939bd550-1486-46a6-8371-e82ada0f448c"
45
+ },
46
+ "outputs": [
47
+ {
48
+ "name": "stdout",
49
+ "output_type": "stream",
50
+ "text": [
51
+ "Platform: Linux-5.11.0-37-generic-x86_64-with-glibc2.10\n",
52
+ "CPU cores: 60\n",
53
+ "Python version: 3.8.8\n",
54
+ "PyTorch version: 1.10.1+cu102\n",
55
+ "GPU is visible: True\n",
56
+ "Transformers version: 4.16.0.dev0\n",
57
+ "Datasets version: 1.17.1.dev0\n",
58
+ "soundfile version: 0.10.3\n"
59
+ ]
60
+ }
61
+ ],
62
+ "source": [
63
+ "print(f\"Platform: {platform.platform()}\")\n",
64
+ "print(f\"CPU cores: {multiprocessing.cpu_count()}\")\n",
65
+ "\n",
66
+ "print(f\"Python version: {platform.python_version()}\")\n",
67
+ "\n",
68
+ "print(f\"PyTorch version: {torch.__version__}\")\n",
69
+ "print(f\"GPU is visible: {torch.cuda.is_available()}\")\n",
70
+ "\n",
71
+ "print(f\"Transformers version: {transformers.__version__}\")\n",
72
+ "print(f\"Datasets version: {datasets.__version__}\")\n",
73
+ "\n",
74
+ "print(f\"soundfile version: {soundfile.__version__}\")"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "markdown",
79
+ "metadata": {},
80
+ "source": [
81
+ "## Check your GPU information (if any)\n",
82
+ "If you launched an AI Training job with GPU resources, they should be listed below (Tesla V100s 32GB).\n",
83
+ "The driver and CUDA versions are shown in the header of the output."
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": 3,
89
+ "metadata": {
90
+ "colab": {
91
+ "base_uri": "https://localhost:8080/"
92
+ },
93
+ "id": "YT7fRnKctggU",
94
+ "outputId": "f355a3e0-20da-489f-bd1f-5e508e792a68"
95
+ },
96
+ "outputs": [
97
+ {
98
+ "name": "stdout",
99
+ "output_type": "stream",
100
+ "text": [
101
+ "Fri Jan 21 03:07:38 2022 \n",
102
+ "+-----------------------------------------------------------------------------+\n",
103
+ "| NVIDIA-SMI 470.57.02 Driver Version: 470.57.02 CUDA Version: 11.4 |\n",
104
+ "|-------------------------------+----------------------+----------------------+\n",
105
+ "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
106
+ "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
107
+ "| | | MIG M. |\n",
108
+ "|===============================+======================+======================|\n",
109
+ "| 0 Tesla V100S-PCI... Off | 00000000:00:06.0 Off | 0 |\n",
110
+ "| N/A 35C P0 26W / 250W | 4MiB / 32510MiB | 0% Default |\n",
111
+ "| | | N/A |\n",
112
+ "+-------------------------------+----------------------+----------------------+\n",
113
+ " \n",
114
+ "+-----------------------------------------------------------------------------+\n",
115
+ "| Processes: |\n",
116
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
117
+ "| ID ID Usage |\n",
118
+ "|=============================================================================|\n",
119
+ "| No running processes found |\n",
120
+ "+-----------------------------------------------------------------------------+\n"
121
+ ]
122
+ }
123
+ ],
124
+ "source": [
125
+ "!nvidia-smi"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 4,
131
+ "metadata": {},
132
+ "outputs": [
133
+ {
134
+ "data": {
135
+ "application/vnd.jupyter.widget-view+json": {
136
+ "model_id": "1f72bffe678b4bdca366b35305baaab5",
137
+ "version_major": 2,
138
+ "version_minor": 0
139
+ },
140
+ "text/plain": [
141
+ "VBox(children=(HTML(value='<center>\\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
142
+ ]
143
+ },
144
+ "metadata": {},
145
+ "output_type": "display_data"
146
+ }
147
+ ],
148
+ "source": [
149
+ "from huggingface_hub import notebook_login\n",
150
+ "\n",
151
+ "notebook_login()"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "markdown",
156
+ "metadata": {
157
+ "id": "TorMtpwPv6RQ"
158
+ },
159
+ "source": [
160
+ "## Quick training run with a dummy model and data\n",
161
+ "More information: https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": 5,
167
+ "metadata": {
168
+ "colab": {
169
+ "base_uri": "https://localhost:8080/"
170
+ },
171
+ "id": "fevoJD15u4Ss",
172
+ "outputId": "5861d34e-745b-45ee-e780-ed363043e655"
173
+ },
174
+ "outputs": [
175
+ {
176
+ "name": "stdout",
177
+ "output_type": "stream",
178
+ "text": [
179
+ "--2022-01-21 03:07:52-- https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py\n",
180
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
181
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
182
+ "HTTP request sent, awaiting response... 200 OK\n",
183
+ "Length: 30348 (30K) [text/plain]\n",
184
+ "Saving to: ‘run_speech_recognition_ctc.py’\n",
185
+ "\n",
186
+ "run_speech_recognit 100%[===================>] 29.64K --.-KB/s in 0.001s \n",
187
+ "\n",
188
+ "2022-01-21 03:07:52 (21.5 MB/s) - ‘run_speech_recognition_ctc.py’ saved [30348/30348]\n",
189
+ "\n"
190
+ ]
191
+ }
192
+ ],
193
+ "source": [
194
+ "!wget -O run_speech_recognition_ctc.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": 46,
200
+ "metadata": {
201
+ "colab": {
202
+ "base_uri": "https://localhost:8080/"
203
+ },
204
+ "id": "Mz4bubhxxsad",
205
+ "outputId": "23398525-cc19-43c2-9fec-497e06214f29"
206
+ },
207
+ "outputs": [
208
+ {
209
+ "name": "stdout",
210
+ "output_type": "stream",
211
+ "text": [
212
+ "01/21/2022 06:29:10 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: True\n",
213
+ "01/21/2022 06:29:10 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n",
214
+ "_n_gpu=1,\n",
215
+ "adafactor=False,\n",
216
+ "adam_beta1=0.9,\n",
217
+ "adam_beta2=0.999,\n",
218
+ "adam_epsilon=1e-08,\n",
219
+ "bf16=False,\n",
220
+ "bf16_full_eval=False,\n",
221
+ "dataloader_drop_last=False,\n",
222
+ "dataloader_num_workers=0,\n",
223
+ "dataloader_pin_memory=True,\n",
224
+ "ddp_bucket_cap_mb=None,\n",
225
+ "ddp_find_unused_parameters=None,\n",
226
+ "debug=[],\n",
227
+ "deepspeed=None,\n",
228
+ "disable_tqdm=False,\n",
229
+ "do_eval=True,\n",
230
+ "do_predict=False,\n",
231
+ "do_train=True,\n",
232
+ "eval_accumulation_steps=None,\n",
233
+ "eval_steps=500,\n",
234
+ "evaluation_strategy=IntervalStrategy.STEPS,\n",
235
+ "fp16=True,\n",
236
+ "fp16_backend=auto,\n",
237
+ "fp16_full_eval=False,\n",
238
+ "fp16_opt_level=O1,\n",
239
+ "gradient_accumulation_steps=2,\n",
240
+ "gradient_checkpointing=True,\n",
241
+ "greater_is_better=None,\n",
242
+ "group_by_length=True,\n",
243
+ "half_precision_backend=auto,\n",
244
+ "hub_model_id=None,\n",
245
+ "hub_strategy=HubStrategy.EVERY_SAVE,\n",
246
+ "hub_token=<HUB_TOKEN>,\n",
247
+ "ignore_data_skip=False,\n",
248
+ "label_names=None,\n",
249
+ "label_smoothing_factor=0.0,\n",
250
+ "learning_rate=7.5e-05,\n",
251
+ "length_column_name=input_length,\n",
252
+ "load_best_model_at_end=False,\n",
253
+ "local_rank=-1,\n",
254
+ "log_level=-1,\n",
255
+ "log_level_replica=-1,\n",
256
+ "log_on_each_node=True,\n",
257
+ "logging_dir=./wav2vec2-large-xls-r-300m-odia/runs/Jan21_06-29-10_job-8be8b741-e32e-4579-bbec-1e00d9824b4f,\n",
258
+ "logging_first_step=False,\n",
259
+ "logging_nan_inf_filter=True,\n",
260
+ "logging_steps=100,\n",
261
+ "logging_strategy=IntervalStrategy.STEPS,\n",
262
+ "lr_scheduler_type=SchedulerType.LINEAR,\n",
263
+ "max_grad_norm=1.0,\n",
264
+ "max_steps=-1,\n",
265
+ "metric_for_best_model=None,\n",
266
+ "mp_parameters=,\n",
267
+ "no_cuda=False,\n",
268
+ "num_train_epochs=120.0,\n",
269
+ "optim=OptimizerNames.ADAMW_HF,\n",
270
+ "output_dir=./wav2vec2-large-xls-r-300m-odia,\n",
271
+ "overwrite_output_dir=True,\n",
272
+ "past_index=-1,\n",
273
+ "per_device_eval_batch_size=16,\n",
274
+ "per_device_train_batch_size=16,\n",
275
+ "prediction_loss_only=False,\n",
276
+ "push_to_hub=True,\n",
277
+ "push_to_hub_model_id=None,\n",
278
+ "push_to_hub_organization=None,\n",
279
+ "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
280
+ "remove_unused_columns=True,\n",
281
+ "report_to=[],\n",
282
+ "resume_from_checkpoint=None,\n",
283
+ "run_name=./wav2vec2-large-xls-r-300m-odia,\n",
284
+ "save_on_each_node=False,\n",
285
+ "save_steps=500,\n",
286
+ "save_strategy=IntervalStrategy.STEPS,\n",
287
+ "save_total_limit=3,\n",
288
+ "seed=42,\n",
289
+ "sharded_ddp=[],\n",
290
+ "skip_memory_metrics=True,\n",
291
+ "tf32=None,\n",
292
+ "tpu_metrics_debug=False,\n",
293
+ "tpu_num_cores=None,\n",
294
+ "use_legacy_prediction_loop=False,\n",
295
+ "warmup_ratio=0.0,\n",
296
+ "warmup_steps=500,\n",
297
+ "weight_decay=0.0,\n",
298
+ "xpu_backend=None,\n",
299
+ ")\n",
300
+ "01/21/2022 06:29:12 - WARNING - datasets.builder - Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/or/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n",
301
+ "01/21/2022 06:29:15 - WARNING - datasets.builder - Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/or/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n",
302
+ "remove special characters from datasets: 100%|█| 537/537 [00:00<00:00, 5280.48ex\n",
303
+ "remove special characters from datasets: 100%|█| 112/112 [00:00<00:00, 6377.61ex\n",
304
+ "loading configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/config.json from cache at /workspace/.cache/huggingface/transformers/dabc27df63e37bd2a7a221c7774e35f36a280fbdf917cf54cadfc7df8c786f6f.a3e4c3c967d9985881e0ae550a5f6f668f897db5ab2e0802f9b97973b15970e6\n",
305
+ "Model config Wav2Vec2Config {\n",
306
+ " \"_name_or_path\": \"facebook/wav2vec2-xls-r-300m\",\n",
307
+ " \"activation_dropout\": 0.0,\n",
308
+ " \"adapter_kernel_size\": 3,\n",
309
+ " \"adapter_stride\": 2,\n",
310
+ " \"add_adapter\": false,\n",
311
+ " \"apply_spec_augment\": true,\n",
312
+ " \"architectures\": [\n",
313
+ " \"Wav2Vec2ForPreTraining\"\n",
314
+ " ],\n",
315
+ " \"attention_dropout\": 0.1,\n",
316
+ " \"bos_token_id\": 1,\n",
317
+ " \"classifier_proj_size\": 256,\n",
318
+ " \"codevector_dim\": 768,\n",
319
+ " \"contrastive_logits_temperature\": 0.1,\n",
320
+ " \"conv_bias\": true,\n",
321
+ " \"conv_dim\": [\n",
322
+ " 512,\n",
323
+ " 512,\n",
324
+ " 512,\n",
325
+ " 512,\n",
326
+ " 512,\n",
327
+ " 512,\n",
328
+ " 512\n",
329
+ " ],\n",
330
+ " \"conv_kernel\": [\n",
331
+ " 10,\n",
332
+ " 3,\n",
333
+ " 3,\n",
334
+ " 3,\n",
335
+ " 3,\n",
336
+ " 2,\n",
337
+ " 2\n",
338
+ " ],\n",
339
+ " \"conv_stride\": [\n",
340
+ " 5,\n",
341
+ " 2,\n",
342
+ " 2,\n",
343
+ " 2,\n",
344
+ " 2,\n",
345
+ " 2,\n",
346
+ " 2\n",
347
+ " ],\n",
348
+ " \"ctc_loss_reduction\": \"sum\",\n",
349
+ " \"ctc_zero_infinity\": false,\n",
350
+ " \"diversity_loss_weight\": 0.1,\n",
351
+ " \"do_stable_layer_norm\": true,\n",
352
+ " \"eos_token_id\": 2,\n",
353
+ " \"feat_extract_activation\": \"gelu\",\n",
354
+ " \"feat_extract_dropout\": 0.0,\n",
355
+ " \"feat_extract_norm\": \"layer\",\n",
356
+ " \"feat_proj_dropout\": 0.1,\n",
357
+ " \"feat_quantizer_dropout\": 0.0,\n",
358
+ " \"final_dropout\": 0.0,\n",
359
+ " \"gradient_checkpointing\": false,\n",
360
+ " \"hidden_act\": \"gelu\",\n",
361
+ " \"hidden_dropout\": 0.1,\n",
362
+ " \"hidden_size\": 1024,\n",
363
+ " \"initializer_range\": 0.02,\n",
364
+ " \"intermediate_size\": 4096,\n",
365
+ " \"layer_norm_eps\": 1e-05,\n",
366
+ " \"layerdrop\": 0.1,\n",
367
+ " \"mask_feature_length\": 10,\n",
368
+ " \"mask_feature_min_masks\": 0,\n",
369
+ " \"mask_feature_prob\": 0.0,\n",
370
+ " \"mask_time_length\": 10,\n",
371
+ " \"mask_time_min_masks\": 2,\n",
372
+ " \"mask_time_prob\": 0.075,\n",
373
+ " \"model_type\": \"wav2vec2\",\n",
374
+ " \"num_adapter_layers\": 3,\n",
375
+ " \"num_attention_heads\": 16,\n",
376
+ " \"num_codevector_groups\": 2,\n",
377
+ " \"num_codevectors_per_group\": 320,\n",
378
+ " \"num_conv_pos_embedding_groups\": 16,\n",
379
+ " \"num_conv_pos_embeddings\": 128,\n",
380
+ " \"num_feat_extract_layers\": 7,\n",
381
+ " \"num_hidden_layers\": 24,\n",
382
+ " \"num_negatives\": 100,\n",
383
+ " \"output_hidden_size\": 1024,\n",
384
+ " \"pad_token_id\": 0,\n",
385
+ " \"proj_codevector_dim\": 768,\n",
386
+ " \"tdnn_dilation\": [\n",
387
+ " 1,\n",
388
+ " 2,\n",
389
+ " 3,\n",
390
+ " 1,\n",
391
+ " 1\n",
392
+ " ],\n",
393
+ " \"tdnn_dim\": [\n",
394
+ " 512,\n",
395
+ " 512,\n",
396
+ " 512,\n",
397
+ " 512,\n",
398
+ " 1500\n",
399
+ " ],\n",
400
+ " \"tdnn_kernel\": [\n",
401
+ " 5,\n",
402
+ " 3,\n",
403
+ " 3,\n",
404
+ " 1,\n",
405
+ " 1\n",
406
+ " ],\n",
407
+ " \"torch_dtype\": \"float32\",\n",
408
+ " \"transformers_version\": \"4.16.0.dev0\",\n",
409
+ " \"use_weighted_layer_sum\": false,\n",
410
+ " \"vocab_size\": 32,\n",
411
+ " \"xvector_output_dim\": 512\n",
412
+ "}\n",
413
+ "\n",
414
+ "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 44.57ba/s]\n",
415
+ "100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 236.54ba/s]\n",
416
+ "Didn't find file ./wav2vec2-large-xls-r-300m-odia/tokenizer.json. We won't load it.\n",
417
+ "loading file ./wav2vec2-large-xls-r-300m-odia/vocab.json\n",
418
+ "loading file ./wav2vec2-large-xls-r-300m-odia/tokenizer_config.json\n",
419
+ "loading file ./wav2vec2-large-xls-r-300m-odia/added_tokens.json\n",
420
+ "loading file ./wav2vec2-large-xls-r-300m-odia/special_tokens_map.json\n",
421
+ "loading file None\n",
422
+ "Adding <s> to the vocabulary\n",
423
+ "Adding </s> to the vocabulary\n",
424
+ "loading configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/config.json from cache at /workspace/.cache/huggingface/transformers/dabc27df63e37bd2a7a221c7774e35f36a280fbdf917cf54cadfc7df8c786f6f.a3e4c3c967d9985881e0ae550a5f6f668f897db5ab2e0802f9b97973b15970e6\n",
425
+ "Model config Wav2Vec2Config {\n",
426
+ " \"_name_or_path\": \"facebook/wav2vec2-xls-r-300m\",\n",
427
+ " \"activation_dropout\": 0.0,\n",
428
+ " \"adapter_kernel_size\": 3,\n",
429
+ " \"adapter_stride\": 2,\n",
430
+ " \"add_adapter\": false,\n",
431
+ " \"apply_spec_augment\": true,\n",
432
+ " \"architectures\": [\n",
433
+ " \"Wav2Vec2ForPreTraining\"\n",
434
+ " ],\n",
435
+ " \"attention_dropout\": 0.1,\n",
436
+ " \"bos_token_id\": 1,\n",
437
+ " \"classifier_proj_size\": 256,\n",
438
+ " \"codevector_dim\": 768,\n",
439
+ " \"contrastive_logits_temperature\": 0.1,\n",
440
+ " \"conv_bias\": true,\n",
441
+ " \"conv_dim\": [\n",
442
+ " 512,\n",
443
+ " 512,\n",
444
+ " 512,\n",
445
+ " 512,\n",
446
+ " 512,\n",
447
+ " 512,\n",
448
+ " 512\n",
449
+ " ],\n",
450
+ " \"conv_kernel\": [\n",
451
+ " 10,\n",
452
+ " 3,\n",
453
+ " 3,\n",
454
+ " 3,\n",
455
+ " 3,\n",
456
+ " 2,\n",
457
+ " 2\n",
458
+ " ],\n",
459
+ " \"conv_stride\": [\n",
460
+ " 5,\n",
461
+ " 2,\n",
462
+ " 2,\n",
463
+ " 2,\n",
464
+ " 2,\n",
465
+ " 2,\n",
466
+ " 2\n",
467
+ " ],\n",
468
+ " \"ctc_loss_reduction\": \"sum\",\n",
469
+ " \"ctc_zero_infinity\": false,\n",
470
+ " \"diversity_loss_weight\": 0.1,\n",
471
+ " \"do_stable_layer_norm\": true,\n",
472
+ " \"eos_token_id\": 2,\n",
473
+ " \"feat_extract_activation\": \"gelu\",\n",
474
+ " \"feat_extract_dropout\": 0.0,\n",
475
+ " \"feat_extract_norm\": \"layer\",\n",
476
+ " \"feat_proj_dropout\": 0.1,\n",
477
+ " \"feat_quantizer_dropout\": 0.0,\n",
478
+ " \"final_dropout\": 0.0,\n",
479
+ " \"gradient_checkpointing\": false,\n",
480
+ " \"hidden_act\": \"gelu\",\n",
481
+ " \"hidden_dropout\": 0.1,\n",
482
+ " \"hidden_size\": 1024,\n",
483
+ " \"initializer_range\": 0.02,\n",
484
+ " \"intermediate_size\": 4096,\n",
485
+ " \"layer_norm_eps\": 1e-05,\n",
486
+ " \"layerdrop\": 0.1,\n",
487
+ " \"mask_feature_length\": 10,\n",
488
+ " \"mask_feature_min_masks\": 0,\n",
489
+ " \"mask_feature_prob\": 0.0,\n",
490
+ " \"mask_time_length\": 10,\n",
491
+ " \"mask_time_min_masks\": 2,\n",
492
+ " \"mask_time_prob\": 0.075,\n",
493
+ " \"model_type\": \"wav2vec2\",\n",
494
+ " \"num_adapter_layers\": 3,\n",
495
+ " \"num_attention_heads\": 16,\n",
496
+ " \"num_codevector_groups\": 2,\n",
497
+ " \"num_codevectors_per_group\": 320,\n",
498
+ " \"num_conv_pos_embedding_groups\": 16,\n",
499
+ " \"num_conv_pos_embeddings\": 128,\n",
500
+ " \"num_feat_extract_layers\": 7,\n",
501
+ " \"num_hidden_layers\": 24,\n",
502
+ " \"num_negatives\": 100,\n",
503
+ " \"output_hidden_size\": 1024,\n",
504
+ " \"pad_token_id\": 0,\n",
505
+ " \"proj_codevector_dim\": 768,\n",
506
+ " \"tdnn_dilation\": [\n",
507
+ " 1,\n",
508
+ " 2,\n",
509
+ " 3,\n",
510
+ " 1,\n",
511
+ " 1\n",
512
+ " ],\n",
513
+ " \"tdnn_dim\": [\n",
514
+ " 512,\n",
515
+ " 512,\n",
516
+ " 512,\n",
517
+ " 512,\n",
518
+ " 1500\n",
519
+ " ],\n",
520
+ " \"tdnn_kernel\": [\n",
521
+ " 5,\n",
522
+ " 3,\n",
523
+ " 3,\n",
524
+ " 1,\n",
525
+ " 1\n",
526
+ " ],\n",
527
+ " \"torch_dtype\": \"float32\",\n",
528
+ " \"transformers_version\": \"4.16.0.dev0\",\n",
529
+ " \"use_weighted_layer_sum\": false,\n",
530
+ " \"vocab_size\": 32,\n",
531
+ " \"xvector_output_dim\": 512\n",
532
+ "}\n",
533
+ "\n",
534
+ "loading feature extractor configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/preprocessor_config.json from cache at /workspace/.cache/huggingface/transformers/6fb028b95b394059e7d3b367bbca2382b576c66aebe896f04d2cd34e1b575f5b.d4484dc1c81456a2461485e7168b04347a7b9a4e3b1ef3aba723323b33e12326\n",
535
+ "Feature extractor Wav2Vec2FeatureExtractor {\n",
536
+ " \"do_normalize\": true,\n",
537
+ " \"feature_extractor_type\": \"Wav2Vec2FeatureExtractor\",\n",
538
+ " \"feature_size\": 1,\n",
539
+ " \"padding_side\": \"right\",\n",
540
+ " \"padding_value\": 0,\n",
541
+ " \"return_attention_mask\": true,\n",
542
+ " \"sampling_rate\": 16000\n",
543
+ "}\n",
544
+ "\n",
545
+ "loading weights file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/pytorch_model.bin from cache at /workspace/.cache/huggingface/transformers/1e6a6507f3b689035cd4b247e2a37c154e27f39143f31357a49b4e38baeccc36.1edb32803799e27ed554eb7dd935f6745b1a0b17b0ea256442fe24db6eb546cd\n",
546
+ "Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['project_q.weight', 'project_hid.weight', 'project_q.bias', 'project_hid.bias', 'quantizer.weight_proj.weight', 'quantizer.codevectors', 'quantizer.weight_proj.bias']\n",
547
+ "- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
548
+ "- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
549
+ "Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.weight', 'lm_head.bias']\n",
550
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
551
+ "preprocess datasets: 100%|████████████████████| 537/537 [00:05<00:00, 97.46ex/s]\n",
552
+ "preprocess datasets: 100%|███████████████████| 112/112 [00:01<00:00, 107.18ex/s]\n",
553
+ "100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 739.08ba/s]\n",
554
+ "100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1258.42ba/s]\n",
555
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/preprocessor_config.json\n",
556
+ "tokenizer config file saved in ./wav2vec2-large-xls-r-300m-odia/tokenizer_config.json\n",
557
+ "Special tokens file saved in ./wav2vec2-large-xls-r-300m-odia/special_tokens_map.json\n",
558
+ "added tokens file saved in ./wav2vec2-large-xls-r-300m-odia/added_tokens.json\n",
559
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/config.json\n",
560
+ "loading feature extractor configuration file ./wav2vec2-large-xls-r-300m-odia/preprocessor_config.json\n",
561
+ "loading configuration file ./wav2vec2-large-xls-r-300m-odia/config.json\n",
562
+ "Model config Wav2Vec2Config {\n",
563
+ " \"_name_or_path\": \"./wav2vec2-large-xls-r-300m-odia\",\n",
564
+ " \"activation_dropout\": 0.1,\n",
565
+ " \"adapter_kernel_size\": 3,\n",
566
+ " \"adapter_stride\": 2,\n",
567
+ " \"add_adapter\": false,\n",
568
+ " \"apply_spec_augment\": true,\n",
569
+ " \"architectures\": [\n",
570
+ " \"Wav2Vec2ForPreTraining\"\n",
571
+ " ],\n",
572
+ " \"attention_dropout\": 0.0,\n",
573
+ " \"bos_token_id\": 1,\n",
574
+ " \"classifier_proj_size\": 256,\n",
575
+ " \"codevector_dim\": 768,\n",
576
+ " \"contrastive_logits_temperature\": 0.1,\n",
577
+ " \"conv_bias\": true,\n",
578
+ " \"conv_dim\": [\n",
579
+ " 512,\n",
580
+ " 512,\n",
581
+ " 512,\n",
582
+ " 512,\n",
583
+ " 512,\n",
584
+ " 512,\n",
585
+ " 512\n",
586
+ " ],\n",
587
+ " \"conv_kernel\": [\n",
588
+ " 10,\n",
589
+ " 3,\n",
590
+ " 3,\n",
591
+ " 3,\n",
592
+ " 3,\n",
593
+ " 2,\n",
594
+ " 2\n",
595
+ " ],\n",
596
+ " \"conv_stride\": [\n",
597
+ " 5,\n",
598
+ " 2,\n",
599
+ " 2,\n",
600
+ " 2,\n",
601
+ " 2,\n",
602
+ " 2,\n",
603
+ " 2\n",
604
+ " ],\n",
605
+ " \"ctc_loss_reduction\": \"mean\",\n",
606
+ " \"ctc_zero_infinity\": false,\n",
607
+ " \"diversity_loss_weight\": 0.1,\n",
608
+ " \"do_stable_layer_norm\": true,\n",
609
+ " \"eos_token_id\": 2,\n",
610
+ " \"feat_extract_activation\": \"gelu\",\n",
611
+ " \"feat_extract_dropout\": 0.0,\n",
612
+ " \"feat_extract_norm\": \"layer\",\n",
613
+ " \"feat_proj_dropout\": 0.0,\n",
614
+ " \"feat_quantizer_dropout\": 0.0,\n",
615
+ " \"final_dropout\": 0.0,\n",
616
+ " \"hidden_act\": \"gelu\",\n",
617
+ " \"hidden_dropout\": 0.0,\n",
618
+ " \"hidden_size\": 1024,\n",
619
+ " \"initializer_range\": 0.02,\n",
620
+ " \"intermediate_size\": 4096,\n",
621
+ " \"layer_norm_eps\": 1e-05,\n",
622
+ " \"layerdrop\": 0.0,\n",
623
+ " \"mask_feature_length\": 64,\n",
624
+ " \"mask_feature_min_masks\": 0,\n",
625
+ " \"mask_feature_prob\": 0.25,\n",
626
+ " \"mask_time_length\": 10,\n",
627
+ " \"mask_time_min_masks\": 2,\n",
628
+ " \"mask_time_prob\": 0.75,\n",
629
+ " \"model_type\": \"wav2vec2\",\n",
630
+ " \"num_adapter_layers\": 3,\n",
631
+ " \"num_attention_heads\": 16,\n",
632
+ " \"num_codevector_groups\": 2,\n",
633
+ " \"num_codevectors_per_group\": 320,\n",
634
+ " \"num_conv_pos_embedding_groups\": 16,\n",
635
+ " \"num_conv_pos_embeddings\": 128,\n",
636
+ " \"num_feat_extract_layers\": 7,\n",
637
+ " \"num_hidden_layers\": 24,\n",
638
+ " \"num_negatives\": 100,\n",
639
+ " \"output_hidden_size\": 1024,\n",
640
+ " \"pad_token_id\": 62,\n",
641
+ " \"proj_codevector_dim\": 768,\n",
642
+ " \"tdnn_dilation\": [\n",
643
+ " 1,\n",
644
+ " 2,\n",
645
+ " 3,\n",
646
+ " 1,\n",
647
+ " 1\n",
648
+ " ],\n",
649
+ " \"tdnn_dim\": [\n",
650
+ " 512,\n",
651
+ " 512,\n",
652
+ " 512,\n",
653
+ " 512,\n",
654
+ " 1500\n",
655
+ " ],\n",
656
+ " \"tdnn_kernel\": [\n",
657
+ " 5,\n",
658
+ " 3,\n",
659
+ " 3,\n",
660
+ " 1,\n",
661
+ " 1\n",
662
+ " ],\n",
663
+ " \"torch_dtype\": \"float32\",\n",
664
+ " \"transformers_version\": \"4.16.0.dev0\",\n",
665
+ " \"use_weighted_layer_sum\": false,\n",
666
+ " \"vocab_size\": 64,\n",
667
+ " \"xvector_output_dim\": 512\n",
668
+ "}\n",
669
+ "\n",
670
+ "loading feature extractor configuration file ./wav2vec2-large-xls-r-300m-odia/preprocessor_config.json\n",
671
+ "Feature extractor Wav2Vec2FeatureExtractor {\n",
672
+ " \"do_normalize\": true,\n",
673
+ " \"feature_extractor_type\": \"Wav2Vec2FeatureExtractor\",\n",
674
+ " \"feature_size\": 1,\n",
675
+ " \"padding_side\": \"right\",\n",
676
+ " \"padding_value\": 0,\n",
677
+ " \"return_attention_mask\": true,\n",
678
+ " \"sampling_rate\": 16000\n",
679
+ "}\n",
680
+ "\n",
681
+ "Didn't find file ./wav2vec2-large-xls-r-300m-odia/tokenizer.json. We won't load it.\n",
682
+ "loading file ./wav2vec2-large-xls-r-300m-odia/vocab.json\n",
683
+ "loading file ./wav2vec2-large-xls-r-300m-odia/tokenizer_config.json\n",
684
+ "loading file ./wav2vec2-large-xls-r-300m-odia/added_tokens.json\n",
685
+ "loading file ./wav2vec2-large-xls-r-300m-odia/special_tokens_map.json\n",
686
+ "loading file None\n",
687
+ "Adding <s> to the vocabulary\n",
688
+ "Adding </s> to the vocabulary\n",
689
+ "/workspace/oriya_training/./wav2vec2-large-xls-r-300m-odia is already a clone of https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-odia. Make sure you pull the latest changes with `repo.git_pull()`.\n",
690
+ "01/21/2022 06:29:36 - WARNING - huggingface_hub.repository - /workspace/oriya_training/./wav2vec2-large-xls-r-300m-odia is already a clone of https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-odia. Make sure you pull the latest changes with `repo.git_pull()`.\n",
691
+ "Using amp half precision backend\n",
692
+ "The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
693
+ "/opt/conda/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use thePyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
694
+ " warnings.warn(\n",
695
+ "***** Running training *****\n",
696
+ " Num examples = 537\n",
697
+ " Num Epochs = 120\n",
698
+ " Instantaneous batch size per device = 16\n",
699
+ " Total train batch size (w. parallel, distributed & accumulation) = 32\n",
700
+ " Gradient Accumulation steps = 2\n",
701
+ " Total optimization steps = 2040\n",
702
+ "{'loss': 16.0345, 'learning_rate': 1.455e-05, 'epoch': 5.88} \n",
703
+ "{'loss': 6.2463, 'learning_rate': 2.955e-05, 'epoch': 11.76} \n",
704
+ "{'loss': 4.2277, 'learning_rate': 4.454999999999999e-05, 'epoch': 17.65} \n",
705
+ "{'loss': 3.4915, 'learning_rate': 5.955e-05, 'epoch': 23.53} \n",
706
+ "{'loss': 3.3231, 'learning_rate': 7.455e-05, 'epoch': 29.41} \n",
707
+ " 25%|█████████▊ | 500/2040 [15:40<47:23, 1.85s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
708
+ "***** Running Evaluation *****\n",
709
+ " Num examples = 112\n",
710
+ " Batch size = 16\n",
711
+ "\n",
712
+ " 0%| | 0/7 [00:00<?, ?it/s]\u001b[A\n",
713
+ " 29%|████████████▊ | 2/7 [00:00<00:02, 2.32it/s]\u001b[A\n",
714
+ " 43%|███████████████████▎ | 3/7 [00:01<00:02, 1.94it/s]\u001b[A\n",
715
+ " 57%|█████████████████████████▋ | 4/7 [00:02<00:01, 1.67it/s]\u001b[A\n",
716
+ " 71%|████████████████████████████████▏ | 5/7 [00:02<00:01, 1.69it/s]\u001b[A\n",
717
+ " 86%|██████████████████████████████████████▌ | 6/7 [00:03<00:00, 1.56it/s]\u001b[A\n",
718
+ " \u001b[A\n",
719
+ "\u001b[A{'eval_loss': 3.352241277694702, 'eval_wer': 0.998972250770812, 'eval_runtime': 5.0475, 'eval_samples_per_second': 22.189, 'eval_steps_per_second': 1.387, 'epoch': 29.41}\n",
720
+ " 25%|█████████▊ | 500/2040 [15:45<47:23, 1.85s/it]\n",
721
+ "100%|█████████████████████████████████████████████| 7/7 [00:04<00:00, 1.60it/s]\u001b[A\n",
722
+ " \u001b[ASaving model checkpoint to ./wav2vec2-large-xls-r-300m-odia/checkpoint-500\n",
723
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/checkpoint-500/config.json\n",
724
+ "Model weights saved in ./wav2vec2-large-xls-r-300m-odia/checkpoint-500/pytorch_model.bin\n",
725
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/checkpoint-500/preprocessor_config.json\n",
726
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/preprocessor_config.json\n",
727
+ "{'loss': 3.2264, 'learning_rate': 7.027597402597401e-05, 'epoch': 35.29} \n",
728
+ "{'loss': 3.1652, 'learning_rate': 6.540584415584416e-05, 'epoch': 41.18} \n",
729
+ "{'loss': 3.019, 'learning_rate': 6.0535714285714285e-05, 'epoch': 47.06} \n",
730
+ "{'loss': 2.6429, 'learning_rate': 5.566558441558441e-05, 'epoch': 52.94} \n",
731
+ "{'loss': 2.1146, 'learning_rate': 5.0795454545454536e-05, 'epoch': 58.82} \n",
732
+ " 49%|███████████████████ | 1000/2040 [32:52<33:58, 1.96s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
733
+ "***** Running Evaluation *****\n",
734
+ " Num examples = 112\n",
735
+ " Batch size = 16\n",
736
+ "\n",
737
+ " 0%| | 0/7 [00:00<?, ?it/s]\u001b[A\n",
738
+ " 29%|████████████▊ | 2/7 [00:00<00:02, 2.36it/s]\u001b[A\n",
739
+ " 43%|███████████████████▎ | 3/7 [00:01<00:02, 1.95it/s]\u001b[A\n",
740
+ " 57%|█████████████████████████▋ | 4/7 [00:02<00:01, 1.67it/s]\u001b[A\n",
741
+ " 71%|████████████████████████████████▏ | 5/7 [00:02<00:01, 1.70it/s]\u001b[A\n",
742
+ " 86%|██████████████████████████████████████▌ | 6/7 [00:03<00:00, 1.55it/s]\u001b[A\n",
743
+ " \u001b[A\n",
744
+ "\u001b[A{'eval_loss': 1.3367875814437866, 'eval_wer': 0.9383350462487153, 'eval_runtime': 5.0633, 'eval_samples_per_second': 22.12, 'eval_steps_per_second': 1.382, 'epoch': 58.82}\n",
745
+ " 49%|███████████████████ | 1000/2040 [32:57<33:58, 1.96s/it]\n",
746
+ "100%|█████████████████████████████████████████████| 7/7 [00:04<00:00, 1.60it/s]\u001b[A\n",
747
+ " \u001b[ASaving model checkpoint to ./wav2vec2-large-xls-r-300m-odia/checkpoint-1000\n",
748
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/checkpoint-1000/config.json\n",
749
+ "Model weights saved in ./wav2vec2-large-xls-r-300m-odia/checkpoint-1000/pytorch_model.bin\n",
750
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/checkpoint-1000/preprocessor_config.json\n",
751
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/preprocessor_config.json\n",
752
+ "{'loss': 1.7769, 'learning_rate': 4.592532467532467e-05, 'epoch': 64.71} \n",
753
+ "{'loss': 1.5937, 'learning_rate': 4.10551948051948e-05, 'epoch': 70.59} \n",
754
+ "{'loss': 1.4694, 'learning_rate': 3.6185064935064934e-05, 'epoch': 76.47} \n",
755
+ "{'loss': 1.3781, 'learning_rate': 3.131493506493506e-05, 'epoch': 82.35} \n",
756
+ "{'loss': 1.3134, 'learning_rate': 2.6444805194805193e-05, 'epoch': 88.24} \n",
757
+ " 74%|████████████████████████████▋ | 1500/2040 [50:05<15:32, 1.73s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
758
+ "***** Running Evaluation *****\n",
759
+ " Num examples = 112\n",
760
+ " Batch size = 16\n",
761
+ "\n",
762
+ " 0%| | 0/7 [00:00<?, ?it/s]\u001b[A\n",
763
+ " 29%|████████████▊ | 2/7 [00:00<00:02, 2.39it/s]\u001b[A\n",
764
+ " 43%|███████████████████▎ | 3/7 [00:01<00:02, 1.98it/s]\u001b[A\n",
765
+ " 57%|█████████████████████████▋ | 4/7 [00:02<00:01, 1.68it/s]\u001b[A\n",
766
+ " 71%|████████████████████████████████▏ | 5/7 [00:02<00:01, 1.72it/s]\u001b[A\n",
767
+ " 86%|██████████████████████████████████████▌ | 6/7 [00:03<00:00, 1.52it/s]\u001b[A\n",
768
+ " \u001b[A\n",
769
+ "\u001b[A{'eval_loss': 0.7372016310691833, 'eval_wer': 0.9578622816032888, 'eval_runtime': 5.0913, 'eval_samples_per_second': 21.998, 'eval_steps_per_second': 1.375, 'epoch': 88.24}\n",
770
+ " 74%|████████████████████████████▋ | 1500/2040 [50:10<15:32, 1.73s/it]\n",
771
+ "100%|█████████████████████████████████████████████| 7/7 [00:04<00:00, 1.57it/s]\u001b[A\n",
772
+ " \u001b[ASaving model checkpoint to ./wav2vec2-large-xls-r-300m-odia/checkpoint-1500\n",
773
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/checkpoint-1500/config.json\n",
774
+ "Model weights saved in ./wav2vec2-large-xls-r-300m-odia/checkpoint-1500/pytorch_model.bin\n",
775
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/checkpoint-1500/preprocessor_config.json\n",
776
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/preprocessor_config.json\n",
777
+ "{'loss': 1.271, 'learning_rate': 2.162337662337662e-05, 'epoch': 94.12} \n",
778
+ "{'loss': 1.2273, 'learning_rate': 1.6753246753246752e-05, 'epoch': 100.0} \n",
779
+ "{'loss': 1.2121, 'learning_rate': 1.1883116883116881e-05, 'epoch': 105.88} \n",
780
+ "{'loss': 1.169, 'learning_rate': 7.012987012987012e-06, 'epoch': 111.76} \n",
781
+ "{'loss': 1.1506, 'learning_rate': 2.1428571428571427e-06, 'epoch': 117.65} \n",
782
+ " 98%|████████████████████████████████████▎| 2000/2040 [1:07:17<01:13, 1.85s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
783
+ "***** Running Evaluation *****\n",
784
+ " Num examples = 112\n",
785
+ " Batch size = 16\n",
786
+ "\n",
787
+ " 0%| | 0/7 [00:00<?, ?it/s]\u001b[A\n",
788
+ " 29%|████████████▊ | 2/7 [00:00<00:02, 2.36it/s]\u001b[A\n",
789
+ " 43%|███████████████████▎ | 3/7 [00:01<00:02, 1.96it/s]\u001b[A\n",
790
+ " 57%|█████████████████████████▋ | 4/7 [00:02<00:01, 1.67it/s]\u001b[A\n",
791
+ " 71%|████████████████████████████████▏ | 5/7 [00:02<00:01, 1.69it/s]\u001b[A\n",
792
+ " 86%|██████████████████████████████████████▌ | 6/7 [00:03<00:00, 1.54it/s]\u001b[A\n",
793
+ " \u001b[A\n",
794
+ "\u001b[A{'eval_loss': 0.6582115292549133, 'eval_wer': 0.9681397738951696, 'eval_runtime': 5.0953, 'eval_samples_per_second': 21.981, 'eval_steps_per_second': 1.374, 'epoch': 117.65}\n",
795
+ " 98%|████████████████████████████████████▎| 2000/2040 [1:07:22<01:13, 1.85s/it]\n",
796
+ "100%|█████████████████████████████████████████████| 7/7 [00:04<00:00, 1.57it/s]\u001b[A\n",
797
+ " \u001b[ASaving model checkpoint to ./wav2vec2-large-xls-r-300m-odia/checkpoint-2000\n",
798
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/checkpoint-2000/config.json\n",
799
+ "Model weights saved in ./wav2vec2-large-xls-r-300m-odia/checkpoint-2000/pytorch_model.bin\n",
800
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/checkpoint-2000/preprocessor_config.json\n",
801
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/preprocessor_config.json\n",
802
+ "Deleting older checkpoint [wav2vec2-large-xls-r-300m-odia/checkpoint-500] due to args.save_total_limit\n",
803
+ "100%|█████████████████████████████████████| 2040/2040 [1:10:04<00:00, 1.69s/it]\n",
804
+ "\n",
805
+ "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
806
+ "\n",
807
+ "\n",
808
+ "{'train_runtime': 4204.6915, 'train_samples_per_second': 15.326, 'train_steps_per_second': 0.485, 'train_loss': 3.015083034365785, 'epoch': 120.0}\n",
809
+ "100%|█████████████████████████████████████| 2040/2040 [1:10:04<00:00, 2.06s/it]\n",
810
+ "Saving model checkpoint to ./wav2vec2-large-xls-r-300m-odia\n",
811
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/config.json\n",
812
+ "Model weights saved in ./wav2vec2-large-xls-r-300m-odia/pytorch_model.bin\n",
813
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/preprocessor_config.json\n",
814
+ "***** train metrics *****\n",
815
+ " epoch = 120.0\n",
816
+ " train_loss = 3.0151\n",
817
+ " train_runtime = 1:10:04.69\n",
818
+ " train_samples = 537\n",
819
+ " train_samples_per_second = 15.326\n",
820
+ " train_steps_per_second = 0.485\n",
821
+ "01/21/2022 07:39:46 - INFO - __main__ - *** Evaluate ***\n",
822
+ "The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
823
+ "***** Running Evaluation *****\n",
824
+ " Num examples = 112\n",
825
+ " Batch size = 16\n",
826
+ "100%|█████████████████████████████████████████████| 7/7 [00:04<00:00, 1.66it/s]\n",
827
+ "***** eval metrics *****\n",
828
+ " epoch = 120.0\n",
829
+ " eval_loss = 0.658\n",
830
+ " eval_runtime = 0:00:05.06\n",
831
+ " eval_samples = 112\n",
832
+ " eval_samples_per_second = 22.115\n",
833
+ " eval_steps_per_second = 1.382\n",
834
+ " eval_wer = 0.9712\n",
835
+ "Saving model checkpoint to ./wav2vec2-large-xls-r-300m-odia\n",
836
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/config.json\n",
837
+ "Model weights saved in ./wav2vec2-large-xls-r-300m-odia/pytorch_model.bin\n",
838
+ "Configuration saved in ./wav2vec2-large-xls-r-300m-odia/preprocessor_config.json\n",
839
+ "Upload file pytorch_model.bin: 98%|██████▊| 1.15G/1.18G [00:41<00:01, 28.4MB/s]To https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-odia\n",
840
+ " 543c1c1..e870548 main -> main\n",
841
+ "\n",
842
+ "01/21/2022 07:41:54 - WARNING - huggingface_hub.repository - To https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-odia\n",
843
+ " 543c1c1..e870548 main -> main\n",
844
+ "\n",
845
+ "Upload file pytorch_model.bin: 100%|███████| 1.18G/1.18G [00:42<00:00, 29.8MB/s]\n",
846
+ "Dropping the following result as it does not have all the necessary fields:\n",
847
+ "{'dataset': {'name': 'MOZILLA-FOUNDATION/COMMON_VOICE_7_0 - OR', 'type': 'common_voice', 'args': 'Config: or, Training split: train+validation, Eval split: test'}}\n",
848
+ "To https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-odia\n",
849
+ " e870548..eb50e66 main -> main\n",
850
+ "\n",
851
+ "01/21/2022 07:42:00 - WARNING - huggingface_hub.repository - To https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-odia\n",
852
+ " e870548..eb50e66 main -> main\n",
853
+ "\n"
854
+ ]
855
+ }
856
+ ],
857
+ "source": [
858
+ "!python run_speech_recognition_ctc.py \\\n",
859
+ "\t--dataset_name=\"mozilla-foundation/common_voice_7_0\" \\\n",
860
+ "\t--model_name_or_path=\"facebook/wav2vec2-xls-r-300m\" \\\n",
861
+ "\t--dataset_config_name=\"or\" \\\n",
862
+ "\t--output_dir=\"./wav2vec2-large-xls-r-300m-odia\" \\\n",
863
+ "\t--overwrite_output_dir \\\n",
864
+ "\t--num_train_epochs=\"120\" \\\n",
865
+ "\t--per_device_train_batch_size=\"16\" \\\n",
866
+ "\t--per_device_eval_batch_size=\"16\" \\\n",
867
+ "\t--gradient_accumulation_steps=\"2\" \\\n",
868
+ "\t--learning_rate=\"7.5e-5\" \\\n",
869
+ "\t--warmup_steps=\"500\" \\\n",
870
+ "\t--length_column_name=\"input_length\" \\\n",
871
+ "\t--evaluation_strategy=\"steps\" \\\n",
872
+ "\t--text_column_name=\"sentence\" \\\n",
873
+ "\t--chars_to_ignore , ? . ! \\- \\; \\: \\\" “ % ‘ ” � — \\’ … \\– \\' \\’ \\– \\\n",
874
+ "\t--save_steps=\"500\" \\\n",
875
+ "\t--eval_steps=\"500\" \\\n",
876
+ "\t--logging_steps=\"100\" \\\n",
877
+ "\t--layerdrop=\"0.0\" \\\n",
878
+ "\t--activation_dropout=\"0.1\" \\\n",
879
+ "\t--save_total_limit=\"3\" \\\n",
880
+ "\t--freeze_feature_encoder \\\n",
881
+ "\t--feat_proj_dropout=\"0.0\" \\\n",
882
+ "\t--mask_time_prob=\"0.75\" \\\n",
883
+ "\t--mask_time_length=\"10\" \\\n",
884
+ "\t--mask_feature_prob=\"0.25\" \\\n",
885
+ "\t--mask_feature_length=\"64\" \\\n",
886
+ "\t--gradient_checkpointing \\\n",
887
+ "\t--use_auth_token \\\n",
888
+ "\t--fp16 \\\n",
889
+ "\t--group_by_length \\\n",
890
+ "\t--do_train --do_eval \\\n",
891
+ " --push_to_hub"
892
+ ]
893
+ },
894
+ {
895
+ "cell_type": "code",
896
+ "execution_count": null,
897
+ "metadata": {},
898
+ "outputs": [],
899
+ "source": [
900
+ "import pandas as pd\n",
901
+ "\n",
902
+ "df = pd.DataFrame([\n",
903
+ " {}\n",
904
+ "])"
905
+ ]
906
+ },
907
+ {
908
+ "cell_type": "code",
909
+ "execution_count": 13,
910
+ "metadata": {},
911
+ "outputs": [],
912
+ "source": [
913
+ "# !zip -r wav2vec2-large-xls-r-300m-odia.zip wav2vec2-large-xls-r-300m-odia/\n",
914
+ "# !rm wav2vec2-large-xls-r-300m-odia.zip"
915
+ ]
916
+ },
917
+ {
918
+ "cell_type": "code",
919
+ "execution_count": 10,
920
+ "metadata": {
921
+ "collapsed": true,
922
+ "jupyter": {
923
+ "outputs_hidden": true
924
+ }
925
+ },
926
+ "outputs": [
927
+ {
928
+ "name": "stdout",
929
+ "output_type": "stream",
930
+ "text": [
931
+ "Filesystem Size Used Avail Use% Mounted on\n",
932
+ "overlay 3.5T 557G 2.8T 17% /\n",
933
+ "tmpfs 64M 0 64M 0% /dev\n",
934
+ "tmpfs 87G 0 87G 0% /sys/fs/cgroup\n",
935
+ "tmpfs 87G 0 87G 0% /dev/shm\n",
936
+ "/dev/md0 3.5T 557G 2.8T 17% /etc/group\n",
937
+ "tmpfs 87G 12K 87G 1% /proc/driver/nvidia\n",
938
+ "/dev/vda1 49G 6.6G 42G 14% /usr/bin/nvidia-smi\n",
939
+ "udev 87G 0 87G 0% /dev/nvidia0\n",
940
+ "tmpfs 87G 0 87G 0% /proc/acpi\n",
941
+ "tmpfs 87G 0 87G 0% /proc/scsi\n",
942
+ "tmpfs 87G 0 87G 0% /sys/firmware\n"
943
+ ]
944
+ }
945
+ ],
946
+ "source": [
947
+ "!df -h"
948
+ ]
949
+ },
950
+ {
951
+ "cell_type": "code",
952
+ "execution_count": 16,
953
+ "metadata": {},
954
+ "outputs": [
955
+ {
956
+ "name": "stderr",
957
+ "output_type": "stream",
958
+ "text": [
959
+ "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/or/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n",
960
+ "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/or/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n"
961
+ ]
962
+ }
963
+ ],
964
+ "source": [
965
+ "from datasets import load_dataset, load_metric, Audio\n",
966
+ "\n",
967
+ "common_voice_train = load_dataset(\"mozilla-foundation/common_voice_7_0\", \"or\", use_auth_token=True, split=\"train+validation\")\n",
968
+ "common_voice_test = load_dataset(\"mozilla-foundation/common_voice_7_0\", \"or\", use_auth_token=True, split=\"test\")"
969
+ ]
970
+ },
971
+ {
972
+ "cell_type": "code",
973
+ "execution_count": 54,
974
+ "metadata": {},
975
+ "outputs": [
976
+ {
977
+ "data": {
978
+ "text/plain": [
979
+ "2013.75"
980
+ ]
981
+ },
982
+ "execution_count": 54,
983
+ "metadata": {},
984
+ "output_type": "execute_result"
985
+ }
986
+ ],
987
+ "source": [
988
+ "len(common_voice_train) * 120 / 32"
989
+ ]
990
+ },
991
+ {
992
+ "cell_type": "code",
993
+ "execution_count": 17,
994
+ "metadata": {},
995
+ "outputs": [],
996
+ "source": [
997
+ "common_voice_train = common_voice_train.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n",
998
+ "common_voice_test = common_voice_test.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])"
999
+ ]
1000
+ },
1001
+ {
1002
+ "cell_type": "code",
1003
+ "execution_count": 18,
1004
+ "metadata": {},
1005
+ "outputs": [],
1006
+ "source": [
1007
+ "from datasets import ClassLabel\n",
1008
+ "import random\n",
1009
+ "import pandas as pd\n",
1010
+ "from IPython.display import display, HTML\n",
1011
+ "\n",
1012
+ "def show_random_elements(dataset, num_examples=10):\n",
1013
+ " assert num_examples <= len(dataset), \"Can't pick more elements than there are in the dataset.\"\n",
1014
+ " picks = []\n",
1015
+ " for _ in range(num_examples):\n",
1016
+ " pick = random.randint(0, len(dataset)-1)\n",
1017
+ " while pick in picks:\n",
1018
+ " pick = random.randint(0, len(dataset)-1)\n",
1019
+ " picks.append(pick)\n",
1020
+ " \n",
1021
+ " df = pd.DataFrame(dataset[picks])\n",
1022
+ " display(HTML(df.to_html()))"
1023
+ ]
1024
+ },
1025
+ {
1026
+ "cell_type": "code",
1027
+ "execution_count": 19,
1028
+ "metadata": {},
1029
+ "outputs": [
1030
+ {
1031
+ "data": {
1032
+ "text/html": [
1033
+ "<table border=\"1\" class=\"dataframe\">\n",
1034
+ " <thead>\n",
1035
+ " <tr style=\"text-align: right;\">\n",
1036
+ " <th></th>\n",
1037
+ " <th>sentence</th>\n",
1038
+ " </tr>\n",
1039
+ " </thead>\n",
1040
+ " <tbody>\n",
1041
+ " <tr>\n",
1042
+ " <th>0</th>\n",
1043
+ " <td>ସେ କଥା ଯାଉ, ଆମ୍ଭମାନଙ୍କୁ ଆଉ କଥା ଲେଖିବାକୁ ହେବ ।</td>\n",
1044
+ " </tr>\n",
1045
+ " <tr>\n",
1046
+ " <th>1</th>\n",
1047
+ " <td>ଯାହା ଦରମା ଗଣ୍ଡାକ ପାଉଥିଲେ, ପେଟ ପିଠିକୁ ନିଅଣ୍ଟ, ବିଧବା ଲାଗି ସାଇତି ଯିବେ କଣ?</td>\n",
1048
+ " </tr>\n",
1049
+ " <tr>\n",
1050
+ " <th>2</th>\n",
1051
+ " <td>ଯେ ଯେଡ଼େ ହୁସିଆର ହେବ, ଆପଦ ବିପଦ କାହାରିକୁ ଛାଡ଼ିନାହିଁ ।</td>\n",
1052
+ " </tr>\n",
1053
+ " <tr>\n",
1054
+ " <th>3</th>\n",
1055
+ " <td>ମୁଁ ପୂଜା ସାରି ସେମାନଙ୍କୁ କିଛି ଭୋଗ ଦେଇ ଘରେ ଛାଡ଼ିଆସିଲି, ବାକି ଭୋଗକୁ ବାନ୍ଧିଲି ।</td>\n",
1056
+ " </tr>\n",
1057
+ " <tr>\n",
1058
+ " <th>4</th>\n",
1059
+ " <td>ବାସୁ ଦୁଇ ଟଙ୍କାର ନଡ଼ା କିଣି ବାଡ଼ିରେ ଗଦେଇଅଛି, ଶରଣ ଦେବାରୁ ଛପରବନ୍ଦି ହୋଇପାରି ନାହିଁ ।</td>\n",
1060
+ " </tr>\n",
1061
+ " <tr>\n",
1062
+ " <th>5</th>\n",
1063
+ " <td>ଦେଖି ଦେଖି ମନରେ କଲା, ଆଜି ଏ କଣ ହେଉଛି ।</td>\n",
1064
+ " </tr>\n",
1065
+ " <tr>\n",
1066
+ " <th>6</th>\n",
1067
+ " <td>ଶାଶୁ ମୁହଁକୁ ଚାହିଁ ଗାଳି ଦିଏ ନାହିଁ; ଓଢ଼ଣା ପଡ଼ିଥାଏ, ପଛ କରି ବରବର କରି ବକିଯାଏ ।</td>\n",
1068
+ " </tr>\n",
1069
+ " <tr>\n",
1070
+ " <th>7</th>\n",
1071
+ " <td>ଆଜି ମହାପ୍ରସାଦ ଉଠା ପରା ।</td>\n",
1072
+ " </tr>\n",
1073
+ " <tr>\n",
1074
+ " <th>8</th>\n",
1075
+ " <td>\"\"\"ଯାହାର ବାହା ସେ ଖେଳୁଛି ପଶା ଧାଇଁ ବୁଲୁଛନ୍ତି ସାଇ ପଡିଶା ।\"\"\"</td>\n",
1076
+ " </tr>\n",
1077
+ " <tr>\n",
1078
+ " <th>9</th>\n",
1079
+ " <td>ଅଶୀ ବର୍ଷର ପୁରୁଷ ବି ବିଭା ହୋଇ ପାରେ ।</td>\n",
1080
+ " </tr>\n",
1081
+ " </tbody>\n",
1082
+ "</table>"
1083
+ ],
1084
+ "text/plain": [
1085
+ "<IPython.core.display.HTML object>"
1086
+ ]
1087
+ },
1088
+ "metadata": {},
1089
+ "output_type": "display_data"
1090
+ }
1091
+ ],
1092
+ "source": [
1093
+ "show_random_elements(common_voice_train.remove_columns([\"path\", \"audio\"]), num_examples=10)"
1094
+ ]
1095
+ },
1096
+ {
1097
+ "cell_type": "code",
1098
+ "execution_count": 36,
1099
+ "metadata": {},
1100
+ "outputs": [],
1101
+ "source": [
1102
+ "import re\n",
1103
+ "chars_to_remove_regex = '[\\,\\?\\.\\!\\-\\;\\:\\\"\\“\\%\\‘\\”\\�\\'\\’\\–]'\n",
1104
+ "\n",
1105
+ "def remove_special_characters(batch):\n",
1106
+ " batch[\"sentence\"] = re.sub(chars_to_remove_regex, '', batch[\"sentence\"]).lower()\n",
1107
+ " return batch"
1108
+ ]
1109
+ },
1110
+ {
1111
+ "cell_type": "code",
1112
+ "execution_count": 37,
1113
+ "metadata": {},
1114
+ "outputs": [
1115
+ {
1116
+ "data": {
1117
+ "application/vnd.jupyter.widget-view+json": {
1118
+ "model_id": "a9df324b393840628b6a038aa00aa697",
1119
+ "version_major": 2,
1120
+ "version_minor": 0
1121
+ },
1122
+ "text/plain": [
1123
+ " 0%| | 0/537 [00:00<?, ?ex/s]"
1124
+ ]
1125
+ },
1126
+ "metadata": {},
1127
+ "output_type": "display_data"
1128
+ },
1129
+ {
1130
+ "data": {
1131
+ "application/vnd.jupyter.widget-view+json": {
1132
+ "model_id": "35fb5a05a9484b5892a1b41be076c55c",
1133
+ "version_major": 2,
1134
+ "version_minor": 0
1135
+ },
1136
+ "text/plain": [
1137
+ " 0%| | 0/112 [00:00<?, ?ex/s]"
1138
+ ]
1139
+ },
1140
+ "metadata": {},
1141
+ "output_type": "display_data"
1142
+ }
1143
+ ],
1144
+ "source": [
1145
+ "common_voice_train = common_voice_train.map(remove_special_characters)\n",
1146
+ "common_voice_test = common_voice_test.map(remove_special_characters)"
1147
+ ]
1148
+ },
1149
+ {
1150
+ "cell_type": "code",
1151
+ "execution_count": 38,
1152
+ "metadata": {},
1153
+ "outputs": [],
1154
+ "source": [
1155
+ "def replace_hatted_characters(batch):\n",
1156
+ " batch[\"sentence\"] = re.sub('[â]', 'a', batch[\"sentence\"])\n",
1157
+ " batch[\"sentence\"] = re.sub('[î]', 'i', batch[\"sentence\"])\n",
1158
+ " batch[\"sentence\"] = re.sub('[ô]', 'o', batch[\"sentence\"])\n",
1159
+ " batch[\"sentence\"] = re.sub('[û]', 'u', batch[\"sentence\"])\n",
1160
+ " return batch"
1161
+ ]
1162
+ },
1163
+ {
1164
+ "cell_type": "code",
1165
+ "execution_count": 39,
1166
+ "metadata": {},
1167
+ "outputs": [
1168
+ {
1169
+ "data": {
1170
+ "application/vnd.jupyter.widget-view+json": {
1171
+ "model_id": "e57ed45f40c440dc8df26f140b226c0d",
1172
+ "version_major": 2,
1173
+ "version_minor": 0
1174
+ },
1175
+ "text/plain": [
1176
+ " 0%| | 0/537 [00:00<?, ?ex/s]"
1177
+ ]
1178
+ },
1179
+ "metadata": {},
1180
+ "output_type": "display_data"
1181
+ },
1182
+ {
1183
+ "data": {
1184
+ "application/vnd.jupyter.widget-view+json": {
1185
+ "model_id": "863c62f8741f4efcb130ff1f44f3e0e4",
1186
+ "version_major": 2,
1187
+ "version_minor": 0
1188
+ },
1189
+ "text/plain": [
1190
+ " 0%| | 0/112 [00:00<?, ?ex/s]"
1191
+ ]
1192
+ },
1193
+ "metadata": {},
1194
+ "output_type": "display_data"
1195
+ }
1196
+ ],
1197
+ "source": [
1198
+ "common_voice_train = common_voice_train.map(replace_hatted_characters)\n",
1199
+ "common_voice_test = common_voice_test.map(replace_hatted_characters)"
1200
+ ]
1201
+ },
1202
+ {
1203
+ "cell_type": "code",
1204
+ "execution_count": 40,
1205
+ "metadata": {},
1206
+ "outputs": [],
1207
+ "source": [
1208
+ "def extract_all_chars(batch):\n",
1209
+ " all_text = \" \".join(batch[\"sentence\"])\n",
1210
+ " vocab = list(set(all_text))\n",
1211
+ " return {\"vocab\": [vocab], \"all_text\": [all_text]}"
1212
+ ]
1213
+ },
1214
+ {
1215
+ "cell_type": "code",
1216
+ "execution_count": 41,
1217
+ "metadata": {},
1218
+ "outputs": [
1219
+ {
1220
+ "data": {
1221
+ "application/vnd.jupyter.widget-view+json": {
1222
+ "model_id": "a8ab1aeb1bb240ca821b5558280495f3",
1223
+ "version_major": 2,
1224
+ "version_minor": 0
1225
+ },
1226
+ "text/plain": [
1227
+ " 0%| | 0/1 [00:00<?, ?ba/s]"
1228
+ ]
1229
+ },
1230
+ "metadata": {},
1231
+ "output_type": "display_data"
1232
+ },
1233
+ {
1234
+ "data": {
1235
+ "application/vnd.jupyter.widget-view+json": {
1236
+ "model_id": "f3d8bf90831a4e5d8a3feb7f30cf5966",
1237
+ "version_major": 2,
1238
+ "version_minor": 0
1239
+ },
1240
+ "text/plain": [
1241
+ " 0%| | 0/1 [00:00<?, ?ba/s]"
1242
+ ]
1243
+ },
1244
+ "metadata": {},
1245
+ "output_type": "display_data"
1246
+ }
1247
+ ],
1248
+ "source": [
1249
+ "vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)\n",
1250
+ "vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)"
1251
+ ]
1252
+ },
1253
+ {
1254
+ "cell_type": "code",
1255
+ "execution_count": 42,
1256
+ "metadata": {},
1257
+ "outputs": [],
1258
+ "source": [
1259
+ "vocab_list = list(set(vocab_train[\"vocab\"][0]) | set(vocab_test[\"vocab\"][0]))"
1260
+ ]
1261
+ },
1262
+ {
1263
+ "cell_type": "code",
1264
+ "execution_count": 43,
1265
+ "metadata": {},
1266
+ "outputs": [
1267
+ {
1268
+ "data": {
1269
+ "text/plain": [
1270
+ "{' ': 0,\n",
1271
+ " '|': 1,\n",
1272
+ " '।': 2,\n",
1273
+ " 'ଁ': 3,\n",
1274
+ " 'ଂ': 4,\n",
1275
+ " 'ଃ': 5,\n",
1276
+ " 'ଅ': 6,\n",
1277
+ " 'ଆ': 7,\n",
1278
+ " 'ଇ': 8,\n",
1279
+ " 'ଈ': 9,\n",
1280
+ " 'ଉ': 10,\n",
1281
+ " 'ଊ': 11,\n",
1282
+ " 'ଏ': 12,\n",
1283
+ " 'ଓ': 13,\n",
1284
+ " 'କ': 14,\n",
1285
+ " 'ଖ': 15,\n",
1286
+ " 'ଗ': 16,\n",
1287
+ " 'ଘ': 17,\n",
1288
+ " 'ଙ': 18,\n",
1289
+ " 'ଚ': 19,\n",
1290
+ " 'ଛ': 20,\n",
1291
+ " 'ଜ': 21,\n",
1292
+ " 'ଝ': 22,\n",
1293
+ " 'ଞ': 23,\n",
1294
+ " 'ଟ': 24,\n",
1295
+ " 'ଠ': 25,\n",
1296
+ " 'ଡ': 26,\n",
1297
+ " 'ଢ': 27,\n",
1298
+ " 'ଣ': 28,\n",
1299
+ " 'ତ': 29,\n",
1300
+ " 'ଥ': 30,\n",
1301
+ " 'ଦ': 31,\n",
1302
+ " 'ଧ': 32,\n",
1303
+ " 'ନ': 33,\n",
1304
+ " 'ପ': 34,\n",
1305
+ " 'ଫ': 35,\n",
1306
+ " 'ବ': 36,\n",
1307
+ " 'ଭ': 37,\n",
1308
+ " 'ମ': 38,\n",
1309
+ " 'ଯ': 39,\n",
1310
+ " 'ର': 40,\n",
1311
+ " 'ଲ': 41,\n",
1312
+ " 'ଳ': 42,\n",
1313
+ " 'ଵ': 43,\n",
1314
+ " 'ଶ': 44,\n",
1315
+ " 'ଷ': 45,\n",
1316
+ " 'ସ': 46,\n",
1317
+ " 'ହ': 47,\n",
1318
+ " '଼': 48,\n",
1319
+ " 'ା': 49,\n",
1320
+ " 'ି': 50,\n",
1321
+ " 'ୀ': 51,\n",
1322
+ " 'ୁ': 52,\n",
1323
+ " 'ୂ': 53,\n",
1324
+ " 'ୃ': 54,\n",
1325
+ " 'େ': 55,\n",
1326
+ " 'ୈ': 56,\n",
1327
+ " 'ୋ': 57,\n",
1328
+ " 'ୌ': 58,\n",
1329
+ " '୍': 59,\n",
1330
+ " 'ୟ': 60,\n",
1331
+ " 'ୱ': 61}"
1332
+ ]
1333
+ },
1334
+ "execution_count": 43,
1335
+ "metadata": {},
1336
+ "output_type": "execute_result"
1337
+ }
1338
+ ],
1339
+ "source": [
1340
+ "vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}\n",
1341
+ "vocab_dict"
1342
+ ]
1343
+ },
1344
+ {
1345
+ "cell_type": "code",
1346
+ "execution_count": 48,
1347
+ "metadata": {},
1348
+ "outputs": [
1349
+ {
1350
+ "name": "stdout",
1351
+ "output_type": "stream",
1352
+ "text": [
1353
+ "--2022-01-21 08:33:50-- https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py\n",
1354
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\n",
1355
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n",
1356
+ "HTTP request sent, awaiting response... 200 OK\n",
1357
+ "Length: 4419 (4.3K) [text/plain]\n",
1358
+ "Saving to: ‘eval.py’\n",
1359
+ "\n",
1360
+ "eval.py 100%[===================>] 4.32K --.-KB/s in 0s \n",
1361
+ "\n",
1362
+ "2022-01-21 08:33:50 (14.9 MB/s) - ‘eval.py’ saved [4419/4419]\n",
1363
+ "\n",
1364
+ "total 1232676\n",
1365
+ "-rw-r--r-- 1 ovh ovh 686 Jan 21 06:29 vocab.json\n",
1366
+ "-rw-r--r-- 1 ovh ovh 290 Jan 21 06:29 tokenizer_config.json\n",
1367
+ "-rw-r--r-- 1 ovh ovh 502 Jan 21 06:29 special_tokens_map.json\n",
1368
+ "-rw-r--r-- 1 ovh ovh 23 Jan 21 06:29 added_tokens.json\n",
1369
+ "drwxr-xr-x 2 ovh ovh 4096 Jan 21 07:02 checkpoint-1000\n",
1370
+ "drwxr-xr-x 2 ovh ovh 4096 Jan 21 07:19 checkpoint-1500\n",
1371
+ "drwxr-xr-x 2 ovh ovh 4096 Jan 21 07:37 checkpoint-2000\n",
1372
+ "-rw-r--r-- 1 ovh ovh 3953 Jan 21 07:39 trainer_state.json\n",
1373
+ "-rw-r--r-- 1 ovh ovh 194 Jan 21 07:39 train_results.json\n",
1374
+ "-rw-r--r-- 1 ovh ovh 222 Jan 21 07:39 eval_results.json\n",
1375
+ "-rw-r--r-- 1 ovh ovh 2033 Jan 21 07:39 config.json\n",
1376
+ "-rw-r--r-- 1 ovh ovh 394 Jan 21 07:39 all_results.json\n",
1377
+ "-rw-r--r-- 1 ovh ovh 1262186097 Jan 21 07:39 pytorch_model.bin\n",
1378
+ "-rw-r--r-- 1 ovh ovh 3055 Jan 21 07:39 training_args.bin\n",
1379
+ "-rw-r--r-- 1 ovh ovh 212 Jan 21 07:39 preprocessor_config.json\n",
1380
+ "-rw-r--r-- 1 ovh ovh 1825 Jan 21 07:41 README.md\n",
1381
+ "-rw-r--r-- 1 ovh ovh 4419 Jan 21 08:33 eval.py\n"
1382
+ ]
1383
+ }
1384
+ ],
1385
+ "source": [
1386
+ "!wget -O eval.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py\n",
1387
+ "!cp eval.py wav2vec2-large-xls-r-300m-odia\n",
1388
+ "!ls -ltr wav2vec2-large-xls-r-300m-odia"
1389
+ ]
1390
+ },
1391
+ {
1392
+ "cell_type": "code",
1393
+ "execution_count": 50,
1394
+ "metadata": {},
1395
+ "outputs": [
1396
+ {
1397
+ "name": "stdout",
1398
+ "output_type": "stream",
1399
+ "text": [
1400
+ "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/or/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n",
1401
+ "100%|███████████████████████████████████████████| 10/10 [00:06<00:00, 1.55ex/s]\n",
1402
+ "Downloading: 5.61kB [00:00, 2.23MB/s] \n",
1403
+ "WER: 1.0921052631578947\n",
1404
+ "CER: 2.5547945205479454\n",
1405
+ "100%|████████████████████████████████████████| 10/10 [00:00<00:00, 13001.56ex/s]\n"
1406
+ ]
1407
+ }
1408
+ ],
1409
+ "source": [
1410
+ "!cd wav2vec2-large-xls-r-300m-odia; python eval.py --model_id ./ --dataset mozilla-foundation/common_voice_7_0 --config or --split test --log_outputs"
1411
+ ]
1412
+ },
1413
+ {
1414
+ "cell_type": "code",
1415
+ "execution_count": null,
1416
+ "metadata": {},
1417
+ "outputs": [],
1418
+ "source": []
1419
+ }
1420
+ ],
1421
+ "metadata": {
1422
+ "accelerator": "GPU",
1423
+ "colab": {
1424
+ "authorship_tag": "ABX9TyM3OaMlm9YQtKpl28c8gBBd",
1425
+ "include_colab_link": true,
1426
+ "name": "DebugOVHTransformers.ipynb",
1427
+ "provenance": []
1428
+ },
1429
+ "kernelspec": {
1430
+ "display_name": "Python 3",
1431
+ "language": "python",
1432
+ "name": "python3"
1433
+ },
1434
+ "language_info": {
1435
+ "codemirror_mode": {
1436
+ "name": "ipython",
1437
+ "version": 3
1438
+ },
1439
+ "file_extension": ".py",
1440
+ "mimetype": "text/x-python",
1441
+ "name": "python",
1442
+ "nbconvert_exporter": "python",
1443
+ "pygments_lexer": "ipython3",
1444
+ "version": "3.8.8"
1445
+ }
1446
+ },
1447
+ "nbformat": 4,
1448
+ "nbformat_minor": 4
1449
+ }