import streamlit as st
import os
from glob import glob
from pathlib import Path
# from TTS.TTS.api import TTS
from TTS.utils.synthesizer import Synthesizer
from Wav2Lip.video_generator import create_video
from diffusers import StableDiffusionPipeline
from diffusers import LMSDiscreteScheduler
# Scratch directory used by the app. `exist_ok=True` avoids the
# check-then-create race when two script runs start at the same time.
os.makedirs('temp', exist_ok=True)
# --- Text-to-speech model configuration -----------------------------------
gpu = False
model_dir = None
model_path = Path(r"tts_model/model_file.pth")
config_path = Path(r"tts_model/config.json")
vocoder_path = None
vocoder_config_path = None

# --- Synthesis defaults ----------------------------------------------------
# `language`, `speaker` and `split_sentences` are forwarded to get_audio() by
# generate_audio(); `file_path` and `pipe_out` are read by get_audio() when it
# saves the synthesized waveform.
language = "en"
speaker = None
split_sentences = True
file_path = "generated_audio.wav"
pipe_out = None

# NOTE(review): this runs at module level, i.e. on every execution of the
# script. Earlier revisions apparently wrapped it in a `get_synthesizer(...)`
# factory -- confirm whether that (with caching) should be restored.
_synthesizer_kwargs = {
    "tts_checkpoint": model_path,
    "tts_config_path": config_path,
    "tts_speakers_file": None,
    "tts_languages_file": None,
    "vocoder_checkpoint": vocoder_path,
    "vocoder_config": vocoder_config_path,
    "encoder_checkpoint": None,
    "encoder_config": None,
    "model_dir": model_dir,
    "use_cuda": gpu,
}
synthesizer = Synthesizer(**_synthesizer_kwargs)
def get_audio(synthesizer, speaker, language, speaker_wav, split_sentences, text,
              output_path=None):
    """Synthesize *text* to speech and save the result as an audio file.

    Args:
        synthesizer: Object exposing ``tts(...)`` and ``save_wav(...)``; the
            script passes its module-level ``Synthesizer`` instance.
        speaker: Forwarded to ``tts`` as ``speaker_name``.
        language: Forwarded to ``tts`` as ``language_name``.
        speaker_wav: Forwarded to ``tts`` as ``speaker_wav`` (the script
            passes the path of the voice sample chosen in the UI).
        split_sentences: Forwarded to ``tts`` as ``split_sentences``.
        text: The text to synthesize.
        output_path: Where to save the audio. Defaults to the module-level
            ``file_path`` so existing callers keep their behavior.

    Returns:
        The path the audio was written to.
    """
    destination = file_path if output_path is None else output_path
    wav = synthesizer.tts(
        text=text,
        speaker_name=speaker,
        language_name=language,
        speaker_wav=speaker_wav,
        reference_wav=None,
        style_wav=None,
        style_text=None,
        reference_speaker_name=None,
        split_sentences=split_sentences,
    )
    synthesizer.save_wav(wav=wav, path=destination, pipe_out=pipe_out)
    return destination
# Avatar gallery contents. The listing is sorted because os.listdir() returns
# entries in arbitrary order, which would otherwise reshuffle the gallery and
# the select box between runs/machines.
avatar_images_list = sorted(os.listdir('avatar_images'))
# Display name = file name without its (last) extension; splitext keeps names
# that contain dots intact (e.g. 'dr.smith.png' -> 'dr.smith').
avatar_names_list = [os.path.splitext(file_name)[0] for file_name in avatar_images_list]
n_cols_avatars = 3
# Ceiling division: number of rows needed to show every avatar.
n_rows_avatars = (len(avatar_images_list) + n_cols_avatars - 1) // n_cols_avatars

# Voice sample gallery contents, built the same way.
voice_audio_list = sorted(os.listdir('voice_audios'))
voice_names_list = [os.path.splitext(file_name)[0] for file_name in voice_audio_list]
n_cols_voices = 3
n_rows_voices = (len(voice_audio_list) + n_cols_voices - 1) // n_cols_voices
st.set_page_config(
    page_title='Avatar service',
    layout='wide'
)
# NOTE(review): the HTML inside this literal was lost in the source (only the
# title text survived, split over several lines, which is a syntax error).
# The centered <h1> below is a reconstruction -- confirm the intended markup.
st.markdown(
    "<h1 style='text-align: center;'>Avatar video generation</h1>",
    unsafe_allow_html=True
)
st.subheader('Step 1: Avatar Selection')
# Gallery of the bundled avatars, laid out n_cols_avatars per row.
with st.expander('Available avatars'):
    # Walk the list one row-sized slice at a time. The stride comes from
    # n_cols_avatars (the original indexed with a hard-coded 3, which breaks
    # as soon as the column count is changed).
    for row_start in range(0, len(avatar_images_list), n_cols_avatars):
        avatar_cols_list = st.columns(n_cols_avatars)
        row_end = row_start + n_cols_avatars
        row_avatars = zip(
            avatar_images_list[row_start:row_end],
            avatar_names_list[row_start:row_end],
        )
        # zip() stops at the shorter side, so a partially filled last row
        # simply leaves its trailing columns empty.
        for column, (image_file, avatar_name) in zip(avatar_cols_list, row_avatars):
            column.image(
                os.path.join('avatar_images', image_file),
                width=150,
                caption=avatar_name
            )
def avatar_callback():
    """Handle a change of the avatar select box (``on_change`` callback).

    Stores the chosen avatar name in ``st.session_state.selected_avatar`` and
    deletes ``generated_avatar.jpg`` if it exists. The preview and the video
    step prefer an existing ``generated_avatar.*`` file over the select box,
    so removing it lets the new selection take effect.
    """
    chosen_avatar = st.session_state.avatar_image
    if not chosen_avatar:
        return
    st.session_state.selected_avatar = chosen_avatar
    generated_avatar_path = 'generated_avatar.jpg'
    if os.path.isfile(generated_avatar_path):
        os.remove(generated_avatar_path)
def uploaded_avatar_callback():
    """Save the uploaded avatar image to disk (``on_change`` callback).

    The file is written to the working directory as ``uploaded_avatar_image``
    plus the extension of the uploaded file. Nothing happens when the
    uploader has been cleared.
    """
    uploaded_file = st.session_state.uploaded_avatar_image
    if uploaded_file is None:
        return
    image_path = "uploaded_avatar_image" + \
        os.path.splitext(uploaded_file.name)[-1]
    # Remove copies left behind by earlier uploads with a different extension
    # so that only the latest upload exists under this base name; otherwise a
    # wildcard lookup of 'uploaded_avatar_image.*' can return a stale image.
    for stale_path in glob('uploaded_avatar_image.*'):
        if stale_path != image_path:
            os.remove(stale_path)
    with open(image_path, "wb") as f:
        f.write(uploaded_file.getvalue())
def generate_avatar():
    """Generate an avatar image from the typed prompt (``on_click`` callback).

    Loads the Stable Diffusion pipeline from the local ``diffusion_model``
    directory, runs it on ``st.session_state.image_prompt`` and saves the
    first resulting image as ``generated_avatar.jpg``.
    """
    if not st.session_state.avatar_generator:
        return
    diffusion_pipe = StableDiffusionPipeline.from_pretrained(
        pretrained_model_name_or_path='diffusion_model'
    )
    diffusion_pipe.scheduler = LMSDiscreteScheduler.from_config(
        diffusion_pipe.scheduler.config
    )
    generation = diffusion_pipe(
        prompt=st.session_state.image_prompt,  # what to generate
        negative_prompt="Oversaturated, blurry, low quality, do not show head",  # what NOT to generate
        height=480,
        width=640,  # output image size
        guidance_scale=13,  # how strongly to follow the prompt
        num_inference_steps=40,  # number of denoising steps
    )
    first_image = generation.images[0]
    first_image.save('generated_avatar.jpg')


# Step 1 layout: controls in the left column, preview in the right one.
step1_col1, step1_col2 = st.columns(2)
with step1_col1:
    # Three alternative ways to provide the avatar picture.
    selected_avatar = st.selectbox(
        label='Please select an avatar',
        options=avatar_names_list,
        key='avatar_image',
        on_change=avatar_callback
    )
    st.write('or')
    uploaded_image = st.file_uploader(
        label='Please upload an avatar',
        type=['png', 'jpg', 'jpeg'],
        on_change=uploaded_avatar_callback,
        key='uploaded_avatar_image'
    )
    st.write('or')
    st.text_area(
        label='Please type a prompt to generate an image for the avatar',
        key='image_prompt'
    )
    st.button(
        label='generate_avatar',
        key='avatar_generator',
        on_click=generate_avatar
    )
with step1_col2:
    # Preview priority: uploaded image, then a generated avatar, then the
    # avatar picked in the select box.
    if uploaded_image is not None:
        # Build the exact path with the same naming rule the upload callback
        # uses when writing the file. Picking glob(...)[0] instead could
        # return a leftover file from an earlier upload with another extension.
        uploaded_avatar_image_path = "uploaded_avatar_image" + \
            os.path.splitext(uploaded_image.name)[-1]
        st.image(uploaded_avatar_image_path, width=300)
    elif glob('generated_avatar.*'):
        st.image('generated_avatar.jpg', width=300)
    else:
        st.image(
            os.path.join('avatar_images', avatar_images_list[avatar_names_list.index(selected_avatar)]),
            width=300
        )
st.subheader('Step 2: Audio Selection')
# Disabled alternative heading, kept on a single comment line (in the source
# it was split so that its tail became a stray, invalid code line):
# st.markdown("Option 1", unsafe_allow_html=True)
option1_expander = st.expander('Option 1')
option1_expander.write(
    '''Please select or upload an audio with a voice you want to be used in the video.
Then provide a text that will be used in the video. Afterwards click on
button to get the audio which will be used in the video:
please, take into account that depending on the size of the text it may take some time.
'''
)
# Gallery of the bundled voice samples, laid out n_cols_voices per row.
with st.expander('Available voice audio'):
    # The stride and the column count both come from n_cols_voices. The
    # original looped over n_cols_avatars and indexed with a hard-coded 3,
    # which only worked while both grids happened to have three columns.
    for row_start in range(0, len(voice_audio_list), n_cols_voices):
        voice_cols_list = st.columns(n_cols_voices)
        row_end = row_start + n_cols_voices
        row_voices = zip(
            voice_audio_list[row_start:row_end],
            voice_names_list[row_start:row_end],
        )
        # zip() stops at the shorter side, so a partially filled last row
        # simply leaves its trailing columns empty.
        for column, (audio_file, voice_name) in zip(voice_cols_list, row_voices):
            column.audio(os.path.join('voice_audios', audio_file))
            column.write(voice_name)
def voice_callback():
    """Handle a change of the voice select box (``on_change`` callback).

    Copies the chosen voice name into ``st.session_state.selected_voice``;
    a falsy selection is ignored.
    """
    chosen_voice = st.session_state.voice_audio
    if chosen_voice:
        st.session_state.selected_voice = chosen_voice
def uploaded_voice_callback():
    """Save the uploaded voice sample to disk (``on_change`` callback).

    The file is written to the working directory as ``uploaded_voice_audio``
    plus the extension of the uploaded file. Nothing happens when the
    uploader has been cleared.
    """
    uploaded_file = st.session_state.uploaded_voice_audio
    if uploaded_file is None:
        return
    audio_path = "uploaded_voice_audio" + \
        os.path.splitext(uploaded_file.name)[-1]
    # Remove copies left behind by earlier uploads with a different extension
    # so that only the latest upload exists under this base name; otherwise a
    # wildcard lookup of 'uploaded_voice_audio.*' can return a stale sample.
    for stale_path in glob('uploaded_voice_audio.*'):
        if stale_path != audio_path:
            os.remove(stale_path)
    with open(audio_path, "wb") as f:
        f.write(uploaded_file.getvalue())
# Step 2 / option 1 layout: voice choice on the left, audio preview on the right.
step21_col1, step21_col2 = st.columns(2)
with step21_col1:
    selected_voice = st.selectbox(
        label='Please select a voice to clone',
        options=voice_names_list,
        key='voice_audio',
        on_change=voice_callback
    )
    st.write('or')
    uploaded_voice = st.file_uploader(
        "Upload a voice to clone",
        type=['mp3', 'wav'],
        key='uploaded_voice_audio',
        on_change=uploaded_voice_callback
    )
with step21_col2:
    # NOTE(review): the markup inside this literal was lost in the source (an
    # empty string broken across two lines, which is a syntax error). '<br>'
    # is a reconstruction, presumably a vertical spacer -- confirm.
    st.markdown('<br>', unsafe_allow_html=True)
    if uploaded_voice is None:
        st.audio(os.path.join('voice_audios', voice_audio_list[voice_names_list.index(selected_voice)]))
    else:
        # Build the exact path with the same naming rule the upload callback
        # uses when writing the file. Picking glob(...)[0] instead could
        # return a leftover file from an earlier upload with another extension.
        uploaded_voice_audio_path = "uploaded_voice_audio" + \
            os.path.splitext(uploaded_voice.name)[-1]
        st.audio(uploaded_voice_audio_path)
step21txt_col1, step21txt_col2 = st.columns(2)
with step21txt_col1:
    uploaded_txt = st.text_area(
        label='Please input text for avatar',
        key='txt4audio'
    )

    def generate_audio():
        """Synthesize the typed text with the chosen voice (``on_click`` callback).

        Uses the uploaded voice sample when there is one, otherwise the
        bundled sample picked in the select box, and passes it together with
        ``st.session_state.txt4audio`` to ``get_audio``.
        """
        if not st.session_state.audio_button:
            return
        if uploaded_voice is None:
            speaker_wav = os.path.join('voice_audios', voice_audio_list[voice_names_list.index(selected_voice)])
        else:
            # The upload callback stores the sample under the uploaded
            # file's own extension, so derive the path from it. The
            # previously hard-coded '.mp3' pointed at a missing file
            # whenever a '.wav' sample was uploaded.
            speaker_wav = "uploaded_voice_audio" + \
                os.path.splitext(uploaded_voice.name)[-1]
        get_audio(
            synthesizer, speaker, language,
            speaker_wav, split_sentences,
            text=st.session_state.txt4audio
        )

with step21txt_col2:
    # NOTE(review): the markup inside this literal was lost in the source (an
    # empty string broken across two lines, which is a syntax error). '<br>'
    # is a reconstruction, presumably a vertical spacer -- confirm.
    st.markdown('<br>', unsafe_allow_html=True)
    st.button(
        label='Generate audio from text',
        key='audio_button',
        on_click=generate_audio
    )
# Offer the synthesized audio for playback on the run triggered by the button.
if st.session_state.audio_button:
    gen_audio_col1, _ = st.columns(2)
    gen_audio_col1.audio("generated_audio.wav")
# Step 2 / option 2: use a ready-made recording instead of synthesized speech.
_option2_help_text = '''Please, just upload an audio that will be reproduced in the video.
'''
# The `option1_expander` name is reused here, as in the original script.
option1_expander = st.expander('Option 2')
option1_expander.write(_option2_help_text)
def uploaded_audio_callback():
    """Save the uploaded audio track to disk (``on_change`` callback).

    The file is written to the working directory as ``uploaded_audio`` plus
    the extension of the uploaded file. Nothing happens when the uploader
    has been cleared.
    """
    uploaded_file = st.session_state.uploaded_audio
    if uploaded_file is None:
        return
    audio_path = "uploaded_audio" + \
        os.path.splitext(uploaded_file.name)[-1]
    # Remove copies left behind by earlier uploads with a different extension
    # so that only the latest upload exists under this base name; otherwise a
    # wildcard lookup of 'uploaded_audio.*' can return a stale track.
    for stale_path in glob('uploaded_audio.*'):
        if stale_path != audio_path:
            os.remove(stale_path)
    with open(audio_path, "wb") as f:
        f.write(uploaded_file.getvalue())
# Step 2 / option 2 layout: uploader on the left, playback on the right.
step22_col1, step22_col2 = st.columns(2)
with step22_col1:
    uploaded_audio = st.file_uploader(
        "Please, upload an audio",
        type=['mp3', 'wav'],
        key='uploaded_audio',
        on_change=uploaded_audio_callback
    )
with step22_col2:
    # NOTE(review): the markup inside this literal was lost in the source (an
    # empty string broken across two lines, which is a syntax error). '<br>'
    # is a reconstruction, presumably a vertical spacer -- confirm.
    st.markdown('<br>', unsafe_allow_html=True)
    if uploaded_audio is not None:
        # Build the exact path with the same naming rule the upload callback
        # uses when writing the file. Picking glob(...)[0] instead could
        # return a leftover file from an earlier upload with another extension.
        st.audio("uploaded_audio" + os.path.splitext(uploaded_audio.name)[-1])
st.subheader('Step 3')


def generate_video():
    """Create the avatar video from the chosen audio and face (``on_click`` callback).

    Audio: the uploaded track when there is one, otherwise the generated
    ``generated_audio.*`` file. Face: the uploaded image, else a generated
    avatar, else the avatar picked in the select box. Both paths are passed
    to ``create_video``.
    """
    if not st.session_state.video_button:
        return

    if uploaded_audio is None:
        generated_audio_files = sorted(glob('generated_audio.*'))
        if not generated_audio_files:
            # Neither option of step 2 was completed. Report it instead of
            # failing with an IndexError on the empty glob result.
            st.warning('No audio available: please generate or upload an audio first.')
            return
        voice_audio = generated_audio_files[0]
    else:
        # Same naming rule the upload callback uses when writing the file;
        # avoids picking a leftover upload that has a different extension.
        voice_audio = "uploaded_audio" + \
            os.path.splitext(uploaded_audio.name)[-1]

    generated_avatar_files = glob('generated_avatar.*')
    if uploaded_image is not None:
        face = "uploaded_avatar_image" + \
            os.path.splitext(uploaded_image.name)[-1]
    elif generated_avatar_files:
        face = generated_avatar_files[0]
    else:
        face = os.path.join('avatar_images', avatar_images_list[avatar_names_list.index(selected_avatar)])

    create_video(voice_audio, face)
step3_button_col1, _, _ = st.columns([3, 4, 5])
with step3_button_col1:
    st.button(
        label='Generate video',
        key='video_button',
        on_click=generate_video
    )
# Show the result on the run triggered by the button.
if st.session_state.video_button:
    step3_col1, _, _ = st.columns([4, 3, 5])
    with step3_col1:
        st.video('generated_video.mp4')

# Disabled download button, kept for reference. In the source one of these
# comment lines was split so that its tail became a stray, invalid code line;
# it is rejoined here.
# with step3_col2:
#     # st.markdown('', unsafe_allow_html=True)
#     # with open(os.path.join('avatar_videos', 'generated_video.mp4'), 'rb') as file:
#     with open('generated_video.mp4', 'rb') as file:
#         st.download_button(
#             label='Download generated video',
#             data=file,
#             file_name='avatar_video.mp4',
#             mime='video/mp4'
#         )