pascal lim
committed on
Commit
·
9e32bde
1
Parent(s):
da15453
update eval script with lm
Browse files- eval_lm.py +24 -25
eval_lm.py
CHANGED
@@ -4,7 +4,7 @@ import re
|
|
4 |
from typing import Dict
|
5 |
|
6 |
from datasets import Audio, Dataset, load_dataset, load_metric
|
7 |
-
|
8 |
from transformers import AutoFeatureExtractor, pipeline, Wav2Vec2ProcessorWithLM
|
9 |
|
10 |
|
@@ -62,39 +62,38 @@ def normalize_text(text: str) -> str:
|
|
62 |
|
63 |
return text
|
64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
def main(args):
|
67 |
# load dataset
|
68 |
dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
|
|
|
|
|
69 |
|
70 |
# for testing: only process the first two examples as a test
|
71 |
# dataset = dataset.select(range(10))
|
72 |
|
73 |
# load processor
|
74 |
-
processor = Wav2Vec2ProcessorWithLM.from_pretrained("
|
75 |
|
76 |
-
model = Wav2Vec2ForCTC.from_pretrained(
|
77 |
-
|
78 |
-
sampling_rate = feature_extractor.sampling_rate
|
79 |
-
|
80 |
-
# resample audio
|
81 |
-
dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
|
82 |
-
|
83 |
-
# load eval pipeline
|
84 |
-
asr = pipeline("automatic-speech-recognition", model=args.model_id)
|
85 |
-
|
86 |
-
# map function to decode audio
|
87 |
-
def map_to_pred(batch):
|
88 |
-
prediction = asr(
|
89 |
-
batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
|
90 |
-
)
|
91 |
-
|
92 |
-
batch["prediction"] = prediction["text"]
|
93 |
-
batch["target"] = normalize_text(batch["sentence"])
|
94 |
-
return batch
|
95 |
|
96 |
# run inference on all examples
|
97 |
-
result = dataset.map(
|
98 |
|
99 |
# compute and log_results
|
100 |
# do not change function below
|
@@ -104,9 +103,9 @@ def main(args):
|
|
104 |
if __name__ == "__main__":
|
105 |
parser = argparse.ArgumentParser()
|
106 |
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
parser.add_argument(
|
111 |
"--dataset",
|
112 |
type=str,
|
|
|
from typing import Dict

import torch
from datasets import Audio, Dataset, load_dataset, load_metric
from transformers import (
    AutoFeatureExtractor,
    Wav2Vec2ForCTC,
    Wav2Vec2ProcessorWithLM,
    pipeline,
)
|
|
|
62 |
|
63 |
return text
|
64 |
|
def evaluate_with_lm(batch):
    """Transcribe one dataset example with the LM-boosted CTC decoder.

    Expects a module-level ``processor`` (Wav2Vec2ProcessorWithLM) and
    ``model`` (Wav2Vec2ForCTC, already moved to CUDA) to be in scope.
    NOTE(review): in this commit both are created *inside* ``main()``, so
    ``dataset.map(evaluate_with_lm, ...)`` would raise NameError — they
    need to be promoted to module scope or passed via ``fn_kwargs``; confirm.

    Args:
        batch: one dataset example with ``audio["array"]`` (a 16 kHz
            waveform — presumably, given the ``sampling_rate=16_000``
            resample in ``main``) and a reference ``sentence``.

    Returns:
        The same batch dict with ``prediction`` (decoded string) and
        ``target`` (normalized reference) added.
    """
    inputs = processor(
        batch["audio"]["array"], sampling_rate=16_000, return_tensors="pt", padding=True
    )

    # Inference only — skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**inputs.to('cuda')).logits

    # Beam-search decode with the n-gram LM; pyctcdecode runs on CPU numpy.
    int_result = processor.batch_decode(logits.cpu().numpy())

    # batch_decode returns a list of transcriptions. This map is unbatched
    # (one example per call), so store the single string — the original
    # `int_result.text` stored the whole one-element list, which breaks the
    # downstream string-based WER computation.
    batch["prediction"] = int_result.text[0]
    batch["target"] = normalize_text(batch["sentence"])

    # Drop the decoder output and cached CUDA blocks between examples to
    # keep peak GPU memory down over a long eval run.
    del int_result
    torch.cuda.empty_cache()

    return batch
|
80 |
def main(args):
|
81 |
# load dataset
|
82 |
dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
|
83 |
+
# resample audio
|
84 |
+
dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
|
85 |
|
86 |
# for testing: only process the first two examples as a test
|
87 |
# dataset = dataset.select(range(10))
|
88 |
|
89 |
# load processor
|
90 |
+
processor = Wav2Vec2ProcessorWithLM.from_pretrained("./")
|
91 |
|
92 |
+
model = Wav2Vec2ForCTC.from_pretrained("./")
|
93 |
+
model.to('cuda')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
# run inference on all examples
|
96 |
+
result = dataset.map(evaluate_with_lm, remove_columns=dataset.column_names)
|
97 |
|
98 |
# compute and log_results
|
99 |
# do not change function below
|
|
|
103 |
if __name__ == "__main__":
|
104 |
parser = argparse.ArgumentParser()
|
105 |
|
106 |
+
# parser.add_argument(
|
107 |
+
# "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
|
108 |
+
# )
|
109 |
parser.add_argument(
|
110 |
"--dataset",
|
111 |
type=str,
|