import base64
import subprocess
from io import BytesIO

import cv2
import numpy as np
from gtts import gTTS
from PIL import Image
from speech_recognition import AudioFile, Recognizer


def tts(text: str, language: str = "ja", encode: bool = False) -> str:
    """Synthesize speech for ``text`` with gTTS.

    Args:
        text (str): generated answer of bot
        language (str): language of text
        encode (bool): if True, return the audio as a base64 encoded string
            instead of writing it to disk

    Returns:
        str: the base64 encoded audio when ``encode`` is True, otherwise the
        path of the written file (always ``"temp.mp3"`` in the current
        working directory, overwritten on every call).
    """
    tts_object = gTTS(text=text, lang=language, slow=False)
    if encode:
        # Keep the audio in memory; nothing is written to disk on this path.
        buffer = BytesIO()
        tts_object.write_to_fp(buffer)
        return base64.b64encode(buffer.getvalue()).decode()

    tts_object.save("temp.mp3")
    return "temp.mp3"


def stt(audio: object, language: str = "ja") -> str:
    """Converts speech to text.

    Args:
        audio: record of user speech; passed unchanged to
            ``speech_recognition.AudioFile``
        language (str): language of the speech

    Returns:
        text (str): recognized speech of user
    """
    recognizer = Recognizer()

    # Load the whole recording into memory before sending it for recognition.
    with AudioFile(audio) as source:
        audio_data = recognizer.record(source)

    # Transcribe the audio using Google's speech-to-text API.
    return recognizer.recognize_google(audio_data, language=language)


def read_image_file(file: bytes) -> Image.Image:
    """Open an image from its raw encoded bytes.

    Args:
        file (bytes): content of an image file

    Returns:
        Image.Image: the opened PIL image
    """
    return Image.open(BytesIO(file))


def pil_to_base64(img, format="jpeg", encode=False):
    """Serialize a PIL image to a base64 string or to a temporary file.

    Args:
        img: PIL image to serialize
        format (str): image format name; also used as the file extension
            when writing to disk
        encode (bool): if True, return the encoded image as a base64 string

    Returns:
        str: the base64 string when ``encode`` is True, otherwise the path
        of the written file (``temp.<format>`` in the current working
        directory).
    """
    if encode:
        buffer = BytesIO()
        img.save(buffer, format)
        return base64.b64encode(buffer.getvalue()).decode("ascii")

    # Without an explicit format, PIL picks it from the file extension.
    temp_path = f"temp.{format}"
    img.save(temp_path)
    return temp_path


def base64_to_pil(img_str: str) -> Image.Image:
    """Decode a base64 string (optionally a data URI) into a PIL image.

    Args:
        img_str (str): base64 payload, with or without a leading
            ``...base64,`` data-URI header

    Returns:
        Image.Image: the decoded PIL image
    """
    # Keep only what follows the "base64," marker. Splitting on the marker
    # itself (rather than on the first comma) stays correct even if the
    # header part contains a comma.
    _, marker, payload = img_str.partition("base64,")
    if marker:
        img_str = payload
    img_raw = base64.b64decode(img_str)
    return Image.open(BytesIO(img_raw))


def get_hist(image):
    """Compute a normalized, flattened color histogram of an image.

    The image is converted with ``np.array`` and binned into 8 bins per
    channel over channels 0, 1 and 2 (8 * 8 * 8 = 512 values), each channel
    covering the range [0, 256).

    Args:
        image: image convertible with ``np.array``; assumed to have three
            8-bit channels (TODO confirm against callers)

    Returns:
        np.ndarray: the flattened histogram after ``cv2.normalize``
    """
    hist = cv2.calcHist(
        [np.array(image)],
        [0, 1, 2],
        None,
        [8, 8, 8],
        [0, 256, 0, 256, 0, 256],
    )
    return cv2.normalize(hist, hist).flatten()


def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
    """
    Helper function to read an audio file through ffmpeg.

    The payload is piped to an ``ffmpeg`` subprocess, which converts it to
    a single channel of raw little-endian float32 samples at
    ``sampling_rate``.

    Args:
        bpayload (bytes): content of the audio file
        sampling_rate (int): sampling rate requested from ffmpeg

    Returns:
        np.ndarray: one-dimensional float32 array of samples

    Raises:
        ValueError: if the ffmpeg executable cannot be found, or if ffmpeg
            produced no samples for the payload.
    """
    ar = f"{sampling_rate}"
    ac = "1"
    format_for_conversion = "f32le"
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        "pipe:0",
        "-ac",
        ac,
        "-ar",
        ar,
        "-f",
        format_for_conversion,
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",
    ]

    try:
        ffmpeg_process = subprocess.Popen(
            ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
        )
    except FileNotFoundError as error:
        raise ValueError(
            "ffmpeg was not found but is required to load audio files from filename"
        ) from error

    out_bytes, _ = ffmpeg_process.communicate(bpayload)
    audio = np.frombuffer(out_bytes, np.float32)
    # ffmpeg runs with "-loglevel quiet", so a payload it cannot decode shows
    # up only as empty output; fail loudly instead of returning no samples.
    if audio.shape[0] == 0:
        raise ValueError("Malformed soundfile")
    return audio