File size: 3,640 Bytes
6e78f43 c4effd2 6e78f43 6ab316b 3ac9f34 6ab316b c4effd2 6e78f43 c4effd2 2d0e2b6 3ac9f34 6e78f43 2d0e2b6 2e6c69d 1766442 2e6c69d 2d0e2b6 6e78f43 2d0e2b6 6e78f43 c4effd2 6e78f43 c4effd2 6e78f43 2d0e2b6 6e78f43 c4effd2 2d0e2b6 6e78f43 2d0e2b6 6e78f43 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
# -*- coding: utf-8 -*-
import numpy as np
import soundfile
import audresample
import text_utils
import re
import subprocess
import markdown
import json
from pathlib import Path
from types import SimpleNamespace
from flask import Flask, request, send_from_directory
from flask_cors import CORS
from audiocraft.builders import AudioGen #, audio_write
# How many copies of the prompt are passed to the generator per request; the
# outputs are flattened into one waveform inside tts_multi_sentence().
NUM_SOUND_GENERATIONS = 1 # they differ a lot and are unnatural to concatenate, prefer lm.n_draw
# AudioGen model built once at import time, moved to device 'cuda:0' and put in eval mode.
# NOTE(review): duration=.74 is presumably expressed in seconds - confirm against audiocraft.builders.
sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
# ====STYLE VECTOR====
# AFFECTIVE = True
# VOICE = 'en_UK/apope_low' # en_US/m-ailabs_low#mary_ann
# _dir = '/' if AFFECTIVE else '_v2/'
# precomputed_style_vector = msinference.compute_style(
# 'assets/wavs/style_vector' + _dir + VOICE.replace(
# '/', '_').replace(
# '#', '_').replace(
# 'cmu-arctic', 'cmu_arctic').replace(
# '_low', '') + '.wav')
# print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
# ==== STYLE VECTOR
# Directory into which serve_wav() writes the generated WAV before sending it back.
CACHE_DIR = 'flask_cache/'
# Create the directory (and any missing parents) at import time; an existing directory is not an error.
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
def tts_multi_sentence(scene=None):
    """Generate a sound waveform for a textual scene description.

    Args:
        scene: Prompt describing the sound. ``None`` and prompts shorter than
            4 characters are dropped.

    Returns:
        A 1-D numpy array: the peak-normalised generator output resampled
        from 16 kHz to 24 kHz, or 400 zeros when the prompt is dropped.
    """
    prompt_unusable = scene is None or len(scene) < 4
    if prompt_unusable:
        print(scene, '\nDrop\n')
        return np.zeros(400)

    print(f'Processing: {scene} ..')
    prompts = [scene] * NUM_SOUND_GENERATIONS
    generated = sound_generator.generate(prompts)
    # Flatten every generation into a single (1, samples) row.
    x = generated.reshape(1, -1).detach().cpu().numpy()
    # Peak-normalise in place; the epsilon guards against an all-zero signal.
    x /= np.abs(x).max() + 1e-7
    # AudioGen emits 16 kHz audio; bring it up to 24 kHz.
    resampled = audresample.resample(
        x,
        original_rate=16000,
        target_rate=24000,
    )
    x = resampled[0, :]
    print(f'Craft Finished for: {scene}\n\n\n\n____{x.shape}')
    # NOTE: the StyleTTS2 speech path (style vector + background overlay) is
    # currently disabled; only the generated sound is returned.
    return x
# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)
# Wrap the app with flask_cors.CORS; no options are passed, so the extension's defaults apply.
cors = CORS(app)
@app.route("/")
def index():
    """Return the project's README.md rendered from Markdown to HTML.

    Returns:
        The HTML string produced by ``markdown.markdown``.

    Raises:
        FileNotFoundError: If README.md is absent from the working directory.
    """
    # Read explicitly as UTF-8 so rendering does not depend on the server's
    # locale (the default encoding of open() is platform dependent).
    with open('README.md', 'r', encoding='utf-8') as f:
        return markdown.markdown(f.read())
@app.route("/", methods=['GET', 'POST', 'PUT'])
def serve_wav():
    """Generate audio for the submitted ``scene`` and return it as a WAV file.

    Reads the ``scene`` (and optional ``text``) form fields, passes the scene
    to ``tts_multi_sentence``, writes the waveform to ``CACHE_DIR/tmp.wav``
    and sends that file back.

    Returns:
        The Flask file response, with the extra header ``suffix-file-type``
        set to the output file name.
    """
    # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
    # object-into-a-representation-suitable-for-mongodb
    r = request.form.to_dict(flat=False)
    args = SimpleNamespace(
        text=r.get('text'),  # list of submitted values, or None when absent
        # A request without a 'scene' field yields None instead of raising
        # TypeError on None[0]; tts_multi_sentence() then returns its
        # placeholder of zeros.
        scene=(r.get('scene') or [None])[0],
    )
    x = tts_multi_sentence(args.scene)
    OUT_FILE = 'tmp.wav'
    # tts_multi_sentence() resamples its output to 24 kHz, so the WAV header
    # must declare 24000 Hz; writing 16000 here played the audio too slowly.
    soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
    # send server's output as default file -> srv_result.xx
    print(f'\n=SERVER saved as {OUT_FILE=}\n')
    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
    response.headers['suffix-file-type'] = OUT_FILE
    return response
# Start Flask's built-in server only when this file is executed as a script.
if __name__ == "__main__":
    # host="0.0.0.0" makes the server listen on all network interfaces.
    app.run(host="0.0.0.0")
|