Srulikbdd committed on
Commit
1a0faec
·
1 Parent(s): a6d0ae4

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +90 -1
README.md CHANGED
@@ -20,4 +20,93 @@ model-index:
20
  - name: Test WER
21
  type: wer
22
  value: 20.93
23
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  - name: Test WER
21
  type: wer
22
  value: 20.93
23
+ ---
24
+
25
+
26
+ Wav2Vec2-Large-XLSR-Welsh
27
+ Fine-tuned facebook/wav2vec2-large-xlsr-53 on the Welsh Common Voice dataset.
28
+
29
+ When using this model, make sure that your speech input is sampled at 16kHz.
30
+
31
+ Usage
32
+ The model can be used directly (without a language model) as follows:
33
+ ```
34
+ import torch
35
+ import torchaudio
36
+ from datasets import load_dataset
37
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
38
+
39
+ test_dataset = load_dataset("common_voice", "cy", split="test[:2%]")
40
+
41
+ processor = Wav2Vec2Processor.from_pretrained("Srulikbdd/Wav2vec2-large-xlsr-welsh")
42
+ model = Wav2Vec2ForCTC.from_pretrained("Srulikbdd/Wav2vec2-large-xlsr-welsh")
43
+
44
+ resampler = torchaudio.transforms.Resample(48_000, 16_000)
45
+
46
+ # Preprocessing the datasets.
47
+ # We need to read the audio files as arrays
48
+ def speech_file_to_array_fn(batch):
49
+ speech_array, sampling_rate = torchaudio.load(batch["path"])
50
+ batch["speech"] = resampler(speech_array).squeeze().numpy()
51
+ return batch
52
+
53
+ test_dataset = test_dataset.map(speech_file_to_array_fn)
54
+ inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
55
+
56
+ with torch.no_grad():
57
+ logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
58
+
59
+ predicted_ids = torch.argmax(logits, dim=-1)
60
+
61
+ print("Prediction:", processor.batch_decode(predicted_ids))
62
+ print("Reference:", test_dataset["sentence"][:2])
63
+ ```
64
+
65
+ Evaluation
66
+ The model can be evaluated as follows on the Welsh test data of Common Voice.
67
+ ```
68
+ import torch
69
+ import torchaudio
70
+ from datasets import load_dataset, load_metric
71
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
72
+ import re
73
+
74
+ test_dataset = load_dataset("common_voice", "cy", split="test")
75
+
76
+ wer = load_metric("wer")
77
+
78
+ processor = Wav2Vec2Processor.from_pretrained("Srulikbdd/Wav2vec2-large-xlsr-welsh")
79
+ model = Wav2Vec2ForCTC.from_pretrained("Srulikbdd/Wav2vec2-large-xlsr-welsh")
80
+
81
+ model.to("cuda")
82
+
83
+ chars_to_ignore_regex = '[\,\?\.\!\-\u2013\u2014\;\:\"\\%\\\]'
84
+
85
+ resampler = torchaudio.transforms.Resample(48_000, 16_000)
86
+
87
+ # Preprocessing the datasets.
88
+ # We need to read the audio files as arrays
89
+ def speech_file_to_array_fn(batch):
90
+ batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
91
+ speech_array, sampling_rate = torchaudio.load(batch["path"])
92
+ batch["speech"] = resampler(speech_array).squeeze().numpy()
93
+ return batch
94
+
95
+ test_dataset = test_dataset.map(speech_file_to_array_fn)
96
+
97
+ # Running inference on the preprocessed data.
98
+ # Each batch of audio arrays is passed through the model and the predicted ids are decoded to text.
99
+ def evaluate(batch):
100
+ inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
101
+
102
+ with torch.no_grad():
103
+ logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
104
+
105
+ pred_ids = torch.argmax(logits, dim=-1)
106
+ batch["pred_strings"] = processor.batch_decode(pred_ids)
107
+ return batch
108
+
109
+ result = test_dataset.map(evaluate, batched=True, batch_size=8)
110
+
111
+ print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
112
+ ```