# Importing all the necessary packages
import nltk
import torch
import gradio as gr
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Downloading the NLTK sentence tokenizer data
# (recent NLTK releases look up "punkt_tab" instead of "punkt")
nltk.download("punkt")
nltk.download("punkt_tab")

# Loading the pre-trained model and the processor
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)


def correct_casing(input_sentence):
    # Capitalize the first character of each sentence
    sentences = nltk.sent_tokenize(input_sentence)
    return " ".join(s[0].upper() + s[1:] for s in sentences if s)


def asr_transcript(audio):
    if audio is None or len(audio) == 0:
        return ""
    # Ensure audio is a 1D numpy array
    if isinstance(audio, list):
        audio = np.array(audio)
    if audio.ndim > 1:
        audio = audio.flatten()
    # Gradio delivers microphone audio as int16 samples;
    # the model expects floats in [-1, 1]
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0
    # Process the audio (the model was trained on 16 kHz speech)
    input_values = processor(
        audio, sampling_rate=16000, return_tensors="pt"
    ).input_values
    # Get logits without tracking gradients (inference only)
    with torch.no_grad():
        logits = model(input_values).logits
    # Get the predicted token IDs
    predicted_ids = torch.argmax(logits, dim=-1)
    # Decode the IDs to text
    transcription = processor.decode(predicted_ids[0])
    # Correct the casing
    return correct_casing(transcription.lower())


def real_time_asr(audio, state=""):
    try:
        # Streaming audio arrives as a (sample_rate, numpy_array) tuple
        if isinstance(audio, tuple):
            _, audio = audio
        elif isinstance(audio, dict) and "array" in audio:
            audio = audio["array"]
        transcription = asr_transcript(audio)
        state += " " + transcription
        return state, state
    except Exception as e:
        return str(e), state


# Create the Gradio interface
# (streaming audio input requires the microphone source)
iface = gr.Interface(
    fn=real_time_asr,
    inputs=[gr.Audio(sources=["microphone"], streaming=True), gr.State()],
    outputs=[gr.Textbox(), gr.State()],
    live=True,
    title="Real-Time ASR using Wav2Vec 2.0",
    description="This application displays the transcribed text in real time for the given audio input.",
)

# Launch the interface
iface.launch()
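
# --- Optional: resampling sketch (not part of the app above) ---
# Browser microphones usually capture at 44.1 or 48 kHz, while
# facebook/wav2vec2-base-960h was trained on 16 kHz speech, so passing the
# raw stream with sampling_rate=16000 can degrade accuracy. Below is a
# minimal sketch of a fix, assuming torchaudio is installed; the helper
# name resample_to_16k is illustrative, not part of the original script.
#
# import torchaudio.functional as AF
#
# def resample_to_16k(audio: np.ndarray, orig_sr: int) -> np.ndarray:
#     # torchaudio.functional.resample operates on float tensors
#     waveform = torch.from_numpy(audio).float()
#     return AF.resample(waveform, orig_freq=orig_sr, new_freq=16000).numpy()
#
# To use it, real_time_asr would forward the sample rate it unpacks from
# the (sample_rate, numpy_array) tuple, e.g.:
#     sr, audio = audio
#     audio = resample_to_16k(audio.astype(np.float32) / 32768.0, sr)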