"""Gradio demo that transcribes an uploaded audio file with Wav2Vec2 (CTC)."""

import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import gradio as gr

MODEL_NAME = "facebook/wav2vec2-large-960h"
# The checkpoint above is published as trained on 16 kHz speech, so every
# input is resampled to this rate before inference.
TARGET_SAMPLE_RATE = 16_000

# Load pre-trained model and tokenizer once at import time so that every
# request reuses the same weights.
# NOTE(review): Wav2Vec2Tokenizer is, to my knowledge, deprecated in favour of
# Wav2Vec2Processor; kept here so the module-level `tokenizer` name is stable.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
tokenizer = Wav2Vec2Tokenizer.from_pretrained(MODEL_NAME)


def _resolve_audio_source(audio_file):
    """Return a filesystem path for *audio_file* when one is available.

    File-like objects that expose a string ``name`` (for example a named
    temporary file) are replaced by that path; anything else (a ``str`` path,
    a ``pathlib.Path``, an anonymous stream) is returned unchanged.
    """
    if hasattr(audio_file, "read"):
        name = getattr(audio_file, "name", None)
        if isinstance(name, str):
            return name
    return audio_file


def transcribe_speech(audio_file):
    """Transcribe the speech contained in *audio_file*.

    Args:
        audio_file: Path to an audio file, or a file-like object, in a format
            ``torchaudio.load`` can decode.

    Returns:
        The decoded transcription as a string; an empty string when the file
        contains no samples.
    """
    # Load audio as a (channels, samples) tensor together with its sample rate.
    waveform, sample_rate = torchaudio.load(_resolve_audio_source(audio_file))

    # Nothing to transcribe; the model cannot run on a zero-length signal.
    if waveform.numel() == 0:
        return ""

    # The model consumes a single channel: average multi-channel audio to mono.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Bring the signal to the rate the model was trained on.
    if sample_rate != TARGET_SAMPLE_RATE:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, TARGET_SAMPLE_RATE
        )

    # Preprocess: hand the tokenizer one 1-D utterance so it yields a
    # (1, samples) batch.
    input_values = tokenizer(
        waveform.squeeze(0).numpy(), return_tensors="pt"
    ).input_values

    # Perform inference without tracking gradients.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: most likely token per frame, then collapse to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return tokenizer.batch_decode(predicted_ids)[0]


# Define Gradio interface
def speech_recognition(audio_file):
    """Gradio callback: return the transcription of the uploaded audio file."""
    return transcribe_speech(audio_file)


# NOTE(review): `gr.inputs` / `gr.outputs` are the legacy Gradio namespaces
# (deprecated in Gradio 3 and, I believe, removed in 4). Left untouched so the
# script keeps working on the Gradio version it was written for; on newer
# releases use `gr.Audio(type="filepath", ...)` and `gr.Textbox(...)`.
inputs = gr.inputs.Audio(type="file", label="Upload Audio File")
outputs = gr.outputs.Textbox(label="Transcription")

interface = gr.Interface(fn=speech_recognition, inputs=inputs, outputs=outputs)

# Run the Gradio interface
interface.launch()