aisatsu-api / utils.py
vumichien's picture
Update utils.py
cd09ca8
from gtts import gTTS
from io import BytesIO
import base64
from PIL import Image
import cv2
import numpy as np
import subprocess
from speech_recognition import AudioFile, Recognizer
def tts(text: str, language="ja", encode=False) -> object:
    """Synthesize speech for ``text`` with gTTS.

    Args:
        text (str): generated answer of bot
        language (str): language code handed to gTTS as ``lang``
        encode (bool): if True, return the audio as a base64 string
            instead of saving it to disk

    Returns:
        str: the base64-encoded audio bytes when ``encode`` is true,
        otherwise the path ``"temp.mp3"`` the audio was saved to
    """
    speech = gTTS(text=text, lang=language, slow=False)

    if not encode:
        # File mode: the same relative path is reused (and overwritten)
        # on every call.
        speech.save("temp.mp3")
        return "temp.mp3"

    # In-memory mode: let gTTS write into a buffer and encode its contents.
    buffer = BytesIO()
    speech.write_to_fp(buffer)
    return base64.b64encode(buffer.getvalue()).decode()
def stt(audio: object, language='ja') -> str:
    """Transcribe recorded speech into text.

    Args:
        audio: record of user speech, in whatever form ``AudioFile`` accepts
        language (str): language passed to the recognizer

    Returns:
        text (str): recognized speech of user
    """
    recognizer = Recognizer()

    # Load the whole recording into memory first.
    with AudioFile(audio) as source:
        recording = recognizer.record(source)

    # Hand the recording to the recognizer's Google speech-to-text backend.
    return recognizer.recognize_google(recording, language=language)
def read_image_file(file) -> Image.Image:
    """Open an image from its in-memory encoded bytes.

    Args:
        file: bytes-like content of an image file; it is wrapped in a
            ``BytesIO`` before being handed to ``Image.open``

    Returns:
        Image.Image: the image object returned by ``Image.open``
    """
    stream = BytesIO(file)
    return Image.open(stream)
def pil_to_base64(img, format="jpeg", encode=False):
    """Serialize an image to a base64 string or to a temporary file.

    Args:
        img: object exposing a PIL-style ``save`` method
        format (str): image format; used as the ``save`` format when
            encoding and as the file extension otherwise
        encode (bool): if True, return base64 text instead of a file path

    Returns:
        str: base64 (ASCII) text of the saved image when ``encode`` is
        true, otherwise the path ``temp.<format>`` the image was saved to
    """
    if not encode:
        # File mode: only the path is passed to save(); the format
        # argument is not forwarded on this branch.
        output_path = f"temp.{format}"
        img.save(output_path)
        return output_path

    # In-memory mode: save into a buffer, then base64-encode its contents.
    buffer = BytesIO()
    img.save(buffer, format)
    encoded = base64.b64encode(buffer.getvalue())
    return encoded.decode("ascii")
def base64_to_pil(img_str):
    """Decode a base64 image string into a PIL image.

    Args:
        img_str (str): base64 text; when it contains ``"base64,"`` (as in
            a data URI) only the part after the first comma is decoded

    Returns:
        The image object returned by ``Image.open`` on the decoded bytes
    """
    # Drop a leading "...;base64," header when one is present.
    payload = img_str.split(",")[1] if "base64," in img_str else img_str
    decoded = base64.b64decode(payload)
    return Image.open(BytesIO(decoded))
def get_hist(image):
    """Compute a flattened, normalized histogram over channels 0, 1 and 2.

    Args:
        image: anything ``np.array`` can convert; presumably a 3-channel
            image such as a PIL image (verify against callers)

    Returns:
        The flattened histogram (8 bins per channel, values in [0, 256))
        after ``cv2.normalize`` with its default settings
    """
    pixels = np.array(image)
    channels = [0, 1, 2]
    bins_per_channel = [8, 8, 8]
    value_ranges = [0, 256, 0, 256, 0, 256]

    histogram = cv2.calcHist([pixels], channels, None, bins_per_channel, value_ranges)
    # The histogram is passed as both source and destination of normalize.
    return cv2.normalize(histogram, histogram).flatten()
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
    """
    Helper function to read an audio file through ffmpeg.

    The payload is piped to an ``ffmpeg`` subprocess on stdin, and the
    decoded audio is read back from its stdout as single-channel raw
    ``f32le`` samples at ``sampling_rate``.

    Args:
        bpayload (bytes): content of the encoded audio file
        sampling_rate (int): sampling rate passed to ffmpeg's ``-ar`` option

    Returns:
        np.ndarray: one-dimensional ``float32`` array of decoded samples

    Raises:
        ValueError: if the ``ffmpeg`` executable cannot be found, or if
            ffmpeg produced no audio data for the given payload
    """
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        "pipe:0",
        "-ac",
        "1",
        "-ar",
        f"{sampling_rate}",
        "-f",
        "f32le",
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",
    ]

    try:
        ffmpeg_process = subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    except FileNotFoundError as error:
        # Keep the original cause attached so the missing binary is visible
        # in the traceback.
        raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error

    # The context manager guarantees the pipes are closed and the child is
    # reaped even if communicate() is interrupted.
    with ffmpeg_process:
        out_bytes, _ = ffmpeg_process.communicate(bpayload)

    audio = np.frombuffer(out_bytes, np.float32)
    if audio.shape[0] == 0:
        # ffmpeg runs with "-loglevel quiet", so a payload it cannot decode
        # would otherwise surface only as a silent empty array.
        raise ValueError("ffmpeg produced no audio data; the payload may be malformed or in an unsupported format")
    return audio