# linguAIcoach / src/whisper_transcription.py
# alvaroalon2's picture
# chore: first commit
# 18c0acd
# raw
# history blame
# 3.51 kB
import io
import logging
from typing import Any, Callable, Dict, Optional, Tuple

import streamlit as st
from openai import OpenAI
from streamlit_mic_recorder import mic_recorder
def whisper_stt(  # noqa: C901, PLR0912
    start_prompt: str = "Start recording ⏺️",
    stop_prompt: str = "Stop recording ⏹️",
    just_once: bool = False,
    key: Optional[str] = None,
    use_container_width: bool = False,
    language: Optional[str] = None,
    callback: Optional[Callable[..., None]] = None,
    n_max_retry: int = 3,
    *args: Any,
    **kwargs: Any
) -> Optional[str]:
    """
    Generate speech-to-text (STT) from recorded audio using OpenAI.

    Renders a ``mic_recorder`` widget and, when it reports a recording whose
    ``id`` is greater than the last one seen, sends the recorded bytes to the
    ``whisper-1`` transcription endpoint.

    The function keeps its state in ``st.session_state``:

    * ``openai_client`` - created on first use from
      ``st.session_state.openai_api_key.get_secret_value()``.
    * ``_last_speech_to_text_transcript_id`` - id of the last recording handled.
    * ``_last_speech_to_text_transcript`` - result for that recording
      (``None`` when its transcription failed).
    * ``<key>_output`` - mirror of the returned value, only when ``key`` is set.

    Args:
        start_prompt (str): The prompt to start recording.
        stop_prompt (str): The prompt to stop recording.
        just_once (bool): Forwarded to the recorder. When true, a rerun without
            a new recording returns None instead of the cached transcript.
        key (Optional[str]): Widget key; also the prefix of the
            ``<key>_output`` session-state entry that stores the output.
        use_container_width (bool): Flag to use container width for the recording interface.
        language (Optional[str]): The language for the text transcription.
        callback (Optional[Callable[..., None]]): Function called whenever a new
            recording is detected (even if its transcription failed).
        n_max_retry (int): Maximum number of transcription attempts for a new
            recording; values <= 0 disable transcription attempts entirely.
        args (Any): Positional arguments to pass to the callback function.
        kwargs (Any): Keyword arguments to pass to the callback function.

    Returns:
        Optional[str]: The generated speech-to-text output or None if unsuccessful.
    """
    logger = logging.getLogger(__name__)
    state = st.session_state

    # Lazily initialise everything this function keeps across Streamlit reruns.
    if "openai_client" not in state:
        state.openai_client = OpenAI(api_key=state.openai_api_key.get_secret_value())
    if "_last_speech_to_text_transcript_id" not in state:
        state._last_speech_to_text_transcript_id = 0
    if "_last_speech_to_text_transcript" not in state:
        state._last_speech_to_text_transcript = None
    output_key = key + "_output" if key else None
    if output_key and output_key not in state:
        state[output_key] = None

    audio = mic_recorder(
        start_prompt=start_prompt,
        stop_prompt=stop_prompt,
        just_once=just_once,
        use_container_width=use_container_width,
        key=key,
    )

    output: Optional[str] = None
    new_output = False
    if audio is not None:
        audio_id = audio["id"]
        new_output = audio_id > state._last_speech_to_text_transcript_id
        if new_output:
            state._last_speech_to_text_transcript_id = audio_id
            audio_bio = io.BytesIO(audio["bytes"])
            # The upload needs a named stream.
            # NOTE(review): the extension should match the format the recorder
            # actually produces - confirm against mic_recorder's output.
            audio_bio.name = "audio.mp3"

            for attempt in range(1, n_max_retry + 1):
                # Defensive rewind so a retry never uploads from a consumed stream.
                audio_bio.seek(0)
                try:
                    transcript = state.openai_client.audio.transcriptions.create(
                        model="whisper-1", file=audio_bio, language=language
                    )
                except Exception:
                    # Best effort: keep the UI alive, but leave a traceback in the logs.
                    logger.exception(
                        "Whisper transcription attempt %d/%d failed", attempt, n_max_retry
                    )
                else:
                    output = transcript.text
                    break

            # Always overwrite the cache, even with None: the id above has already
            # advanced, so keeping the previous text would make later reruns return
            # the transcript of an older recording for this (failed) one.
            state._last_speech_to_text_transcript = output
        elif not just_once:
            # Same recording as on the previous run: serve the cached result.
            output = state._last_speech_to_text_transcript

    if output_key:
        state[output_key] = output
    if new_output and callback:
        callback(*args, **kwargs)
    return output