"""Streamlit app that transcribes speech from an uploaded file or a URL."""

from io import BytesIO

import requests
import streamlit as st
import torch
import torchaudio

# Identifier of the speech model requested from torch.hub.
# NOTE(review): this looks like a Hugging Face Hub id rather than a fairseq
# hub entry point -- confirm that torch.hub.load resolves it and that the
# returned object really exposes `feature_extractor`, `feature_aggregator`
# and `decoder`, which the transcription helper below relies on.
model_name = "facebook/wav2vec2-large-xlsr-53"
model = torch.hub.load('pytorch/fairseq', model_name)


def _transcribe_source(source):
    """Run the model on an audio source and return the decoded text.

    Args:
        source: Anything ``torchaudio.load`` accepts; this script passes a
            ``BytesIO`` buffer or the object returned by
            ``st.file_uploader``.

    Returns:
        The ``'text'`` entry of the first item produced by
        ``model.decoder.decode``.
    """
    # The sample rate returned by torchaudio is not used: the waveform is fed
    # to the model as loaded.
    # NOTE(review): no resampling happens here -- confirm the model tolerates
    # arbitrary input sample rates.
    waveform, _ = torchaudio.load(source)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        features = model.feature_extractor(waveform)
        logits = model.feature_aggregator(features)
        transcription = model.decoder.decode(logits)

    return transcription[0]['text']


def transcribe_audio(url, timeout=30):
    """Download the audio file at ``url`` and return its transcription.

    Args:
        url: Location of the audio file to fetch over HTTP(S).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the app indefinitely.

    Returns:
        The transcribed text for the downloaded audio.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx instead of handing an error page to the decoder.
    response.raise_for_status()
    return _transcribe_source(BytesIO(response.content))


# --- Streamlit UI ---------------------------------------------------------

st.title("Speech Recognition with Hugging Face")

# Option 1: transcribe an uploaded audio file.
audio_file = st.file_uploader("Upload an audio file", type=["mp3", "wav"])
if audio_file is not None:
    uploaded_text = _transcribe_source(audio_file)
    st.write("Transcription:")
    st.write(uploaded_text)

# Option 2: transcribe an audio file fetched from a URL.
url = st.text_input("Enter the URL of an audio file")
if url:
    try:
        transcription = transcribe_audio(url)
    except requests.RequestException as exc:
        # Report download problems in the page rather than crashing the run.
        st.error(f"Could not download the audio file: {exc}")
    else:
        st.write("Transcription:")
        st.write(transcription)