# -*- coding: utf-8 -*-
"""Flask server that turns a short text "scene" into a sound clip with AudioGen.

POST/PUT a form field ``scene`` to ``/``; the server generates audio for it,
writes the result as a WAV file into ``CACHE_DIR`` and sends that file back.
A GET on ``/`` is also registered to render ``README.md`` as HTML.
"""
import numpy as np
import soundfile
import audresample
import text_utils
import re
import subprocess
import markdown
import json
from pathlib import Path
from types import SimpleNamespace
from flask import Flask, request, send_from_directory
from flask_cors import CORS
from audiocraft.builders import AudioGen  # , audio_write

# Loaded once at import time so every request reuses the same model.
sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()

# ====STYLE VECTOR====

# AFFECTIVE = True
# VOICE = 'en_UK/apope_low'  # en_US/m-ailabs_low#mary_ann

# _dir = '/' if AFFECTIVE else '_v2/'
# precomputed_style_vector = msinference.compute_style(
#     'assets/wavs/style_vector' + _dir + VOICE.replace(
#         '/', '_').replace(
#         '#', '_').replace(
#         'cmu-arctic', 'cmu_arctic').replace(
#         '_low', '') + '.wav')
# print('\n  STYLE VECTOR \n', precomputed_style_vector.shape)

# ==== STYLE VECTOR

CACHE_DIR = 'flask_cache/'
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)

# Sample rate of the raw AudioGen output (Hz).
AUDIOGEN_RATE = 16000
# Sample rate of the array returned by tts_multi_sentence() and of the WAV
# file written by serve_wav() (Hz). Both must agree, otherwise the file
# header misreports the speed/pitch of the audio.
OUTPUT_RATE = 24000
# Scenes shorter than this many characters are dropped (silence is returned).
MIN_SCENE_CHARS = 4
# Number of zero samples returned for a dropped scene.
SILENCE_SAMPLES = 400


def tts_multi_sentence(scene=None):
    """Generate a sound clip for a textual scene description.

    Args:
        scene: Text prompt for the sound generator. ``None`` or a prompt
            shorter than ``MIN_SCENE_CHARS`` characters is dropped.

    Returns:
        1-D numpy array of samples at ``OUTPUT_RATE`` Hz. For a dropped
        scene this is ``SILENCE_SAMPLES`` zeros.
    """
    if scene is not None and len(scene) >= MIN_SCENE_CHARS:
        print(f'Processing: {scene} ..')
        # Take the first (only) item of the generated batch.
        x = sound_generator.generate([scene])[0, :, :].detach().cpu().numpy()

        # Peak-normalise; the epsilon avoids dividing by zero on pure silence.
        x /= np.abs(x).max() + 1e-7
        # is 16kHz - AudioGen Fs
        x = audresample.resample(x,
                                 original_rate=AUDIOGEN_RATE,
                                 target_rate=OUTPUT_RATE)[0, :]

        # print(f'Craft Finished for: {scene}\n\n\n\n____{x.shape}')
    else:
        print(scene, '\nDrop\n')
        x = np.zeros(SILENCE_SAMPLES)

    # # StyleTTS2
    # if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
    #     assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
    #     x = []
    #     for _sentence in text:
    #         x.append(msinference.inference(_sentence,
    #                                        precomputed_style_vector,
    #                                        alpha=0.3,
    #                                        beta=0.7,
    #                                        diffusion_steps=7,
    #                                        embedding_scale=1))
    #     x = np.concatenate(x)
    # return overlay(x, sound_background)
    return x


app = Flask(__name__)
cors = CORS(app)


@app.route("/")
def index():
    """Render README.md as HTML."""
    # Explicit encoding so the result does not depend on the host locale.
    with open('README.md', 'r', encoding='utf-8') as f:
        return markdown.markdown(f.read())


# NOTE(review): this rule shares "/" with index() and also lists GET; which
# view answers a GET depends on Flask's rule matching - confirm whether GET
# is meant to be listed here.
@app.route("/", methods=['GET', 'POST', 'PUT'])
def serve_wav():
    """Generate audio for the posted ``scene`` and send it back as a WAV file.

    Reads the form fields ``scene`` (prompt) and ``text`` (currently unused).
    A request without ``scene`` no longer raises; it is passed on as ``None``
    so the generator returns its short silent clip.
    """
    # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
    # object-into-a-representation-suitable-for-mongodb
    r = request.form.to_dict(flat=False)  # every value is a list of strings

    scene_values = r.get('scene')
    args = SimpleNamespace(
        text=r.get('text'),  # string not file?
        # Guard against a missing/empty field instead of indexing None.
        scene=scene_values[0] if scene_values else None,
    )
    # print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')

    x = tts_multi_sentence(args.scene)
    OUT_FILE = 'tmp.wav'
    # Write with the rate the samples actually have (they were resampled
    # to OUTPUT_RATE), not the generator's native rate.
    soundfile.write(CACHE_DIR + OUT_FILE, x, OUTPUT_RATE)

    # send server's output as default file -> srv_result.xx
    print(f'\n=SERVER saved as {OUT_FILE=}\n')
    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
    response.headers['suffix-file-type'] = OUT_FILE
    return response


if __name__ == "__main__":
    app.run(host="0.0.0.0")