# NOTE: "Spaces: Sleeping" — scraped hosting-page status header, not part of this module.
import base64
import subprocess
from io import BytesIO

import cv2
import numpy as np
from gtts import gTTS
from PIL import Image
from speech_recognition import AudioFile, Recognizer
def tts(text: str, language: str = "ja", encode: bool = False) -> str:
    """Synthesize speech for *text* with gTTS.

    Args:
        text: the text to speak (e.g. a generated bot answer).
        language: gTTS language code (default Japanese).
        encode: if True, return the MP3 payload as a base64 string
            instead of writing it to disk.

    Returns:
        str: base64-encoded MP3 data when ``encode`` is True, otherwise
        the path of the temporary MP3 file ("temp.mp3").
    """
    tts_object = gTTS(text=text, lang=language, slow=False)
    if encode:
        buffer = BytesIO()
        tts_object.write_to_fp(buffer)
        buffer.seek(0)
        return base64.b64encode(buffer.getvalue()).decode()
    # NOTE(review): fixed filename — concurrent calls overwrite each other.
    tts_object.save("temp.mp3")
    return "temp.mp3"
def stt(audio: object, language='ja') -> str:
    """Transcribe recorded user speech to text.

    Args:
        audio: audio source accepted by ``speech_recognition.AudioFile``.
        language (str): language code passed to the Google recognizer.

    Returns:
        str: the recognized text.
    """
    recognizer = Recognizer()
    # Load the whole recording into memory, then hand it to Google's
    # speech-to-text API for transcription.
    with AudioFile(audio) as audio_source:
        recorded = recognizer.record(audio_source)
    return recognizer.recognize_google(recorded, language=language)
def read_image_file(file) -> Image.Image:
    """Decode raw image bytes into a PIL image."""
    return Image.open(BytesIO(file))
def pil_to_base64(img, format="jpeg", encode=False):
    """Serialize a PIL image to base64 text or to a temporary file.

    Args:
        img: PIL image to serialize.
        format (str): output format/extension (default "jpeg").
        encode (bool): if True, return base64-encoded bytes of the image;
            otherwise save to ``temp.<format>`` and return that path.

    Returns:
        str: base64 string, or the temp-file path.
    """
    if not encode:
        out_path = f"temp.{format}"
        img.save(out_path)
        return out_path
    buffer = BytesIO()
    img.save(buffer, format)
    buffer.seek(0)
    return base64.b64encode(buffer.getvalue()).decode("ascii")
def base64_to_pil(img_str):
    """Decode a base64 string (optionally a data URI) into a PIL image.

    Args:
        img_str (str): base64 payload, with or without a
            ``data:image/...;base64,`` prefix.

    Returns:
        PIL image parsed from the decoded bytes.
    """
    if "base64," in img_str:
        # Drop the data-URI header, keeping the payload segment.
        img_str = img_str.split(",")[1]
    decoded = base64.b64decode(img_str)
    return Image.open(BytesIO(decoded))
def get_hist(image):
    """Compute a normalized, flattened 3-channel colour histogram.

    Args:
        image: image convertible via ``np.array`` (e.g. a PIL image).

    Returns:
        1-D float array of 8*8*8 = 512 normalized bin counts.
    """
    pixels = np.array(image)
    # 8 bins per channel, each spanning the full 0-255 value range.
    hist = cv2.calcHist(
        [pixels], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256]
    )
    return cv2.normalize(hist, hist).flatten()
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
    """Decode an audio payload to mono float32 PCM via ffmpeg.

    Args:
        bpayload: raw bytes of an audio file in any format ffmpeg understands.
        sampling_rate: target sampling rate (Hz) of the decoded audio.

    Returns:
        np.ndarray: 1-D float32 array of mono samples at ``sampling_rate``.

    Raises:
        ValueError: if the ffmpeg binary is not installed, or if ffmpeg
            produced no samples (corrupt or unsupported payload).
    """
    ffmpeg_command = [
        "ffmpeg",
        "-i", "pipe:0",             # read input from stdin
        "-ac", "1",                 # downmix to mono
        "-ar", f"{sampling_rate}",  # resample to the requested rate
        "-f", "f32le",              # raw little-endian float32 output
        "-hide_banner",
        "-loglevel", "quiet",
        "pipe:1",                   # write output to stdout
    ]
    try:
        # Context manager ensures the pipes are closed even on error.
        with subprocess.Popen(
            ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
        ) as ffmpeg_process:
            out_bytes = ffmpeg_process.communicate(bpayload)[0]
    except FileNotFoundError:
        raise ValueError("ffmpeg was not found but is required to load audio files from filename")
    audio = np.frombuffer(out_bytes, np.float32)
    if audio.size == 0:
        # Empty output means ffmpeg rejected the payload; the original code
        # silently returned an empty array, hiding the failure from callers.
        raise ValueError("Malformed soundfile: ffmpeg produced no audio samples")
    return audio