# avishkaar-check / app.py
# techysanoj's picture
# Update app.py
# 6ab3f9b
# raw
# history blame
# 1.18 kB
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import gradio as gr
# Single checkpoint identifier shared by the acoustic model and its tokenizer,
# so the two can never drift apart.
MODEL_CHECKPOINT = "facebook/wav2vec2-large-960h"

# Load the pre-trained CTC model and tokenizer once, when the module is
# executed; every transcription request reuses these objects.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_CHECKPOINT)
tokenizer = Wav2Vec2Tokenizer.from_pretrained(MODEL_CHECKPOINT)
def transcribe_speech(audio_file, target_sample_rate=16000):
    """Transcribe a speech recording using the module-level wav2vec2 model.

    Args:
        audio_file: Path to an audio file, or a file object. Objects that
            expose a string ``name`` attribute (e.g. temp-file wrappers) are
            loaded through that path.
        target_sample_rate: Sample rate in Hz the audio is resampled to before
            inference. The default matches the 16 kHz rate documented for the
            ``facebook/wav2vec2-large-960h`` checkpoint.

    Returns:
        The decoded transcription string, or ``""`` when the recording
        contains no samples.
    """
    # Prefer loading by path: it does not rely on the audio backend being able
    # to read from an arbitrary file object.
    path = getattr(audio_file, "name", None)
    source = path if isinstance(path, str) else audio_file

    # torchaudio returns a (channels, frames) tensor plus the native rate.
    waveform, sample_rate = torchaudio.load(source)
    if waveform.numel() == 0:
        return ""

    # The model consumes a single mono signal, so average the channels into a
    # 1-D waveform instead of forwarding the channel dimension.
    waveform = waveform.mean(dim=0)

    # Feeding audio at a different rate than the model was trained on degrades
    # recognition, so bring every input to the expected rate.
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_sample_rate
        )
        waveform = resampler(waveform)

    # Preprocess: the tokenizer takes one utterance as a 1-D float array and
    # returns a batch of size one.
    input_values = tokenizer(waveform.numpy(), return_tensors="pt").input_values

    # Inference only; skip gradient bookkeeping.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: most likely token per frame, then collapse to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return tokenizer.batch_decode(predicted_ids)[0]
# Define Gradio interface
def speech_recognition(audio_file):
    """Gradio callback that returns the transcription of the uploaded audio.

    Args:
        audio_file: The value supplied by the audio input component, or
            ``None`` when the form is submitted without an upload.

    Returns:
        The transcription produced by ``transcribe_speech``, or ``""`` when no
        audio was provided.
    """
    # Without this guard a submit with no upload would pass None down to the
    # audio loader and surface as an opaque error in the UI.
    if audio_file is None:
        return ""
    return transcribe_speech(audio_file)
# Build the UI from the top-level Gradio components: the legacy `gr.inputs` /
# `gr.outputs` namespaces are deprecated and missing from current releases.
# type="filepath" hands the callback the upload as a path string, which
# replaces the deprecated "file" type.
inputs = gr.Audio(type="filepath", label="Upload Audio File")
outputs = gr.Textbox(label="Transcription")
interface = gr.Interface(fn=speech_recognition, inputs=inputs, outputs=outputs)

# Run the Gradio interface
interface.launch()