import io
from typing import Any, Callable, Optional

import streamlit as st
from openai import OpenAI
from streamlit_mic_recorder import mic_recorder


def whisper_stt(  # noqa: C901, PLR0912
    start_prompt: str = "Start recording ⏺️",
    stop_prompt: str = "Stop recording ⏹️",
    just_once: bool = False,
    key: Optional[str] = None,
    use_container_width: bool = False,
    language: Optional[str] = None,
    callback: Optional[Callable[..., None]] = None,
    n_max_retry: int = 3,
    *args: Any,
    **kwargs: Any,
) -> Optional[str]:
""" | |
Generate speech-to-text (STT) from recorded audio using OpenAI. | |
Args: | |
start_prompt (str): The prompt to start recording. | |
stop_prompt (str): The prompt to stop recording. | |
just_once (bool): Flag to record audio just once or continuously. | |
use_container_width (bool): Flag to use container width for the recording interface. | |
language (Optional[str]): The language for the text transcription. | |
callback (Optional[Callable[..., None]]): Callback function to execute after new output is generated. | |
args (Tuple[Any, ...]): Positional arguments to pass to the callback function. | |
kwargs (Dict[str, Any]): Keyword arguments to pass to the callback function. | |
key (Optional[str]): Key to store the output in the session state. | |
Returns: | |
Optional[str]: The generated speech-to-text output or None if unsuccessful. | |
""" | |
if "openai_client" not in st.session_state: | |
st.session_state.openai_client = OpenAI(api_key=st.session_state.openai_api_key.get_secret_value()) | |
if "_last_speech_to_text_transcript_id" not in st.session_state: | |
st.session_state._last_speech_to_text_transcript_id = 0 | |
if "_last_speech_to_text_transcript" not in st.session_state: | |
st.session_state._last_speech_to_text_transcript = None | |
if key and key + "_output" not in st.session_state: | |
st.session_state[key + "_output"] = None | |
    audio = mic_recorder(
        start_prompt=start_prompt,
        stop_prompt=stop_prompt,
        just_once=just_once,
        use_container_width=use_container_width,
        key=key,
    )
    new_output = False
    if audio is None:
        output = None
    else:
        audio_id = audio["id"]
        # The id grows with each clip, so an id larger than the last one we
        # transcribed means this rerun carries fresh audio.
        new_output = audio_id > st.session_state._last_speech_to_text_transcript_id
        if new_output:
            output = None
            st.session_state._last_speech_to_text_transcript_id = audio_id
            audio_bio = io.BytesIO(audio["bytes"])
            # The OpenAI API infers the audio format from the file name, so the
            # in-memory buffer needs one.
            audio_bio.name = "audio.mp3"
            success = False
            err = 0
            while not success and err < n_max_retry:
                try:
                    # Rewind in case a failed attempt already consumed the buffer.
                    audio_bio.seek(0)
                    transcript = st.session_state.openai_client.audio.transcriptions.create(
                        model="whisper-1", file=audio_bio, language=language
                    )
                except Exception as e:
                    print(str(e))
                    err += 1
                else:
                    success = True
                    output = transcript.text
                    st.session_state._last_speech_to_text_transcript = output
        elif not just_once:
            # No new clip this rerun: keep returning the last transcript.
            output = st.session_state._last_speech_to_text_transcript
        else:
            output = None
    if key:
        st.session_state[key + "_output"] = output
    if new_output and callback:
        callback(*args, **kwargs)
    return output
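

if __name__ == "__main__":
    # Usage sketch, not part of the component. Assumptions: this file is run
    # with `streamlit run`, `.streamlit/secrets.toml` contains an
    # `openai_api_key` entry, and pydantic's SecretStr provides the
    # `.get_secret_value()` interface the function above expects.
    from pydantic import SecretStr

    if "openai_api_key" not in st.session_state:
        st.session_state.openai_api_key = SecretStr(st.secrets["openai_api_key"])

    text = whisper_stt(language="en", key="stt")
    if text:
        st.write("Transcript:", text)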