import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import gradio as gr

# Load the pre-trained model and tokenizer
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-large-960h")

def transcribe_speech(audio_file):
    # Load the audio file as a waveform tensor of shape [channels, samples]
    waveform, sample_rate = torchaudio.load(audio_file)
    # Mix down to mono and resample to the 16 kHz rate the model expects
    speech = waveform.mean(dim=0)
    if sample_rate != 16000:
        speech = torchaudio.transforms.Resample(sample_rate, 16000)(speech)
    # Preprocess the 1-D waveform into model input values
    input_values = tokenizer(speech.numpy(), return_tensors="pt").input_values
    # Perform inference without tracking gradients
    with torch.no_grad():
        logits = model(input_values).logits
    # Decode the highest-scoring token ids into a transcription
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.batch_decode(predicted_ids)[0]
    return transcription

# Define the Gradio interface
def speech_recognition(audio_file):
    transcription = transcribe_speech(audio_file)
    return transcription

inputs = gr.Audio(type="filepath", label="Upload Audio File")
outputs = gr.Textbox(label="Transcription")
interface = gr.Interface(fn=speech_recognition, inputs=inputs, outputs=outputs)

# Run the Gradio interface
interface.launch()
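
# A minimal sanity-check sketch (an addition, not part of the original Space):
# calling transcribe_speech directly on a local recording is a quick way to
# verify the model before testing it through the web UI. "sample.wav" is a
# hypothetical placeholder path, not a file shipped with this code. Because
# interface.launch() above blocks until the server is stopped, a direct call
# like this would be run before launch() or in a separate session:
#
#     print(transcribe_speech("sample.wav"))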