patrickvonplaten committed on
Commit
6b154c5
1 Parent(s): f1f1647

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +8 -8
README.md CHANGED
@@ -36,13 +36,13 @@ The original model can be found under https://github.com/pytorch/fairseq/tree/ma
36
  To transcribe audio files the model can be used as a standalone acoustic model as follows:
37
 
38
  ```python
39
- from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC
40
  from datasets import load_dataset
41
  import soundfile as sf
42
  import torch
43
 
44
  # load model and tokenizer
45
- tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
46
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
47
 
48
  # define function to read in sound file
@@ -56,14 +56,14 @@ To transcribe audio files the model can be used as a standalone acoustic model a
56
  ds = ds.map(map_to_array)
57
 
58
  # tokenize
59
- input_values = tokenizer(ds["speech"][:2], return_tensors="pt", padding="longest").input_values # Batch size 1
60
 
61
  # retrieve logits
62
  logits = model(input_values).logits
63
 
64
  # take argmax and decode
65
  predicted_ids = torch.argmax(logits, dim=-1)
66
- transcription = tokenizer.batch_decode(predicted_ids)
67
  ```
68
 
69
  ## Evaluation
@@ -72,7 +72,7 @@ To transcribe audio files the model can be used as a standalone acoustic model a
72
 
73
  ```python
74
  from datasets import load_dataset
75
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
76
  import soundfile as sf
77
  import torch
78
  from jiwer import wer
@@ -81,7 +81,7 @@ from jiwer import wer
81
  librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")
82
 
83
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda")
84
- tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
85
 
86
  def map_to_array(batch):
87
  speech, _ = sf.read(batch["file"])
@@ -91,12 +91,12 @@ def map_to_array(batch):
91
  librispeech_eval = librispeech_eval.map(map_to_array)
92
 
93
  def map_to_pred(batch):
94
- input_values = tokenizer(batch["speech"], return_tensors="pt", padding="longest").input_values
95
  with torch.no_grad():
96
  logits = model(input_values.to("cuda")).logits
97
 
98
  predicted_ids = torch.argmax(logits, dim=-1)
99
- transcription = tokenizer.batch_decode(predicted_ids)
100
  batch["transcription"] = transcription
101
  return batch
102
 
 
36
  To transcribe audio files the model can be used as a standalone acoustic model as follows:
37
 
38
  ```python
39
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
40
  from datasets import load_dataset
41
  import soundfile as sf
42
  import torch
43
 
44
  # load model and processor
45
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
46
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
47
 
48
  # define function to read in sound file
 
56
  ds = ds.map(map_to_array)
57
 
58
  # tokenize
59
+ input_values = processor(ds["speech"][:2], return_tensors="pt", padding="longest").input_values # Batch size 2
60
 
61
  # retrieve logits
62
  logits = model(input_values).logits
63
 
64
  # take argmax and decode
65
  predicted_ids = torch.argmax(logits, dim=-1)
66
+ transcription = processor.batch_decode(predicted_ids)
67
  ```
68
 
69
  ## Evaluation
 
72
 
73
  ```python
74
  from datasets import load_dataset
75
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
76
  import soundfile as sf
77
  import torch
78
  from jiwer import wer
 
81
  librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")
82
 
83
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda")
84
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
85
 
86
  def map_to_array(batch):
87
  speech, _ = sf.read(batch["file"])
 
91
  librispeech_eval = librispeech_eval.map(map_to_array)
92
 
93
  def map_to_pred(batch):
94
+ input_values = processor(batch["speech"], return_tensors="pt", padding="longest").input_values
95
  with torch.no_grad():
96
  logits = model(input_values.to("cuda")).logits
97
 
98
  predicted_ids = torch.argmax(logits, dim=-1)
99
+ transcription = processor.batch_decode(predicted_ids)
100
  batch["transcription"] = transcription
101
  return batch
102