# artificial-styletts2 / audiobook.py
# Dionyssos's picture
# tune voice for audiobook
# 9d6172b
# Creates one .wav file per chapter plus a full-audiobook .wav/.mp4 from assets/audiobook_TTS.docx
# (an edited copy of assets/INCLUSION_IN_MUSEUMS_audiobook.docx).
# __________________________________________________________________________________________________
# ROOT_DIR/<voice>/<voice>_chapter_0.wav, .., ROOT_DIR/<voice>/<voice>_chapter_N.wav
# ROOT_DIR/<voice>/<voice>_full_audiobook.wav
import cv2
import subprocess
import numpy as np
import soundfile
import docx # package = python-docx
import audresample
import urllib
from pathlib import Path
from moviepy.editor import *
# Rate in Hz used by the loop below to size the silence padding; it equals the
# 24000 Hz rate that the TTS output is resampled from.
FS = 24000

# Root of all generated audio; each voice gets its own sub-directory below it.
ROOT_DIR = './tts_audiobooks/voices/'
Path(ROOT_DIR).mkdir(exist_ok=True, parents=True)

# Voices to render - pick any from https://audeering.github.io/shift/
# Entries left commented out are alternatives that were tried before.
voices = [
    # 'en_US/vctk_low#p228',  # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#67854dcbd3e6beb1a78f7f20
    # 'af_ZA_google-nwu_0184',  # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
    'en_US/vctk_low#p326',  # Native voice
    # 'jv_ID_google-gmu_06207',
]

# One-off download of the source document. NOTE: this command saves to the
# working directory, while the loader below reads from assets/.
#urllib.request.urlretrieve("https://github.com/audeering/shift/raw/refs/heads/main/assets/INCLUSION_IN_MUSEUMS_audiobook.docx", "audiobook_TTS.docx")

# The .docx is a lightly edited copy of the original text, with extra filler
# words ('by', 'them', 'from', 'this', 'of', ...) so that it reads well aloud.
d = docx.Document('assets/audiobook_TTS.docx')

# True once a pause has been inserted for a run of empty paragraphs, so that
# only the first empty paragraph of the run adds silence.
last_paragraph_was_silence = False

# Index of the chapter currently being assembled; the text before the first
# 'CHAPTER:' heading counts as chapter 0.
chapter_counter = 0
# Sample rate (Hz) of every .wav written by this script. The TTS output is
# resampled from FS down to this rate before it is stored.
OUT_FS = 16000

# Size of the static title frame used for the video, in pixels.
FRAME_H, FRAME_W = 574, 1024

# Scratch files, overwritten for every voice.
STATIC_FRAME = '_tmp.png'
SILENT_VIDEO = '_tmp.mp4'


def _voice_dirname(vox):
    """Return `vox` rewritten so it can be used as a directory / file stem.

    '/' and '#' become '_', the '_low' quality tag is dropped and any
    remaining '-' is removed ('cmu-arctic' is first turned into 'cmu_arctic'
    so that it keeps a separator).
    """
    return (vox.replace('/', '_')
               .replace('#', '_')
               .replace('cmu-arctic', 'cmu_arctic')
               .replace('_low', '')
               .replace('-', ''))


def _is_speakable(text):
    """Return True if a paragraph should be sent to TTS.

    Skipped are paragraphs of 2 characters or fewer, paragraphs that start
    with '{' or end with '}', and paragraphs containing the word 'Figure'.
    """
    return (len(text) > 2
            and text[0] != '{'
            and text[-1] != '}'
            and 'Figure' not in text)


def _silence(seconds):
    """Return float32 zeros of length int(seconds * FS).

    The length is deliberately still computed with FS (not OUT_FS) to keep
    the pauses exactly as tuned: played back at OUT_FS they last FS / OUT_FS
    times longer than `seconds` (e.g. .24 -> 0.36 s for 24000 / 16000).
    """
    return np.zeros(int(seconds * FS), dtype=np.float32)


def _synthesize(text, vox):
    """Speak `text` with voice `vox` via tts.py and return the audio at OUT_FS.

    Raises subprocess.CalledProcessError if tts.py exits with an error, so a
    failed paragraph cannot silently reuse the previous 'out/_tmp.wav'.
    """
    # tts.py reads the paragraph from a file.
    # WARNING! cast to lower, otherwise e.g. accesibiliTy is mispronounced.
    with open('_tmp.txt', 'w') as f:
        f.write(text.lower())
    # Optional tts.py flags that were left disabled: --affect, --image, --scene
    subprocess.run(
        ["python", "tts.py",
         "--text", "_tmp.txt",
         '--voice', vox,
         '--out_file', '_tmp'],  # tts.py output is read back from out/_tmp.wav
        check=True)
    audio, _fs = soundfile.read('out/_tmp.wav')
    # NOTE(review): the file is assumed to be at FS; its header rate (_fs) is
    # not checked - confirm against tts.py.
    return audresample.resample(audio.astype(np.float32), FS, OUT_FS)[0, :]


def _finish_chapter(pieces, wav_path):
    """Append end-of-chapter silence to `pieces`, write them, return the audio."""
    pieces.append(_silence(.24))
    audio = np.concatenate(pieces)
    soundfile.write(wav_path, audio, OUT_FS)
    return audio


def _write_title_frame(vox, png_path):
    """Draw the static video frame (logo, 'AUDIOBOOK', voice name) to `png_path`.

    Raises FileNotFoundError if the banner image cannot be read.
    """
    frame = np.zeros((FRAME_H, FRAME_W, 3), dtype=np.uint8)
    logo = cv2.imread('assets/shift_banner.png')
    if logo is None:  # cv2.imread signals failure by returning None
        raise FileNotFoundError('cannot read assets/shift_banner.png')
    # Top-left crop of the banner (at most 100 x 400 px) goes into the corner.
    h = min(100, logo.shape[0])
    w = min(400, logo.shape[1])
    frame[:h, :w, :] = logo[:h, :w, :]

    font = cv2.FONT_HERSHEY_SIMPLEX
    color = (69, 74, 74)
    thickness = 4
    line_type = 2
    cv2.putText(frame, 'AUDIOBOOK', (170, 170),
                font, 4, color, thickness, line_type)
    cv2.putText(frame, 'TTS voice =', (0, 500),
                font, 2, color, thickness, line_type)
    # putText takes the bottom-left corner of the text. The former y=640 lay
    # below the 574 px frame, so the voice name was never visible; keep the
    # baseline inside the frame instead.
    cv2.putText(frame, vox, (0, FRAME_H - 24),
                font, 2, color, thickness, line_type)
    cv2.imwrite(png_path, frame)


def _write_video(wav_path, mp4_path):
    """Mux the static frame with `wav_path` into `mp4_path` using ffmpeg."""
    # Short silent clip of the still frame; its video stream is copied as-is.
    clip_silent = ImageClip(STATIC_FRAME).set_duration(5)
    clip_silent.write_videofile(SILENT_VIDEO, fps=24)
    subprocess.call(
        ["ffmpeg",
         "-y",
         "-i", SILENT_VIDEO,
         "-i", wav_path,
         "-c:v", "copy",
         "-map", "0:v:0",
         "-map", "1:a:0",  # was " 1:a:0" with a stray leading space
         mp4_path])


for vox in voices:
    vox_str = _voice_dirname(vox)
    # Directory for chapter_x.wav and the full audiobook of this voice.
    out_dir = ROOT_DIR + vox_str + '/'
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    print(vox)

    # Per-voice state; reset here so every voice starts again at chapter 0.
    total = []            # finished chapters, in order
    chapter = []          # audio pieces of the chapter being assembled
    total_samples = 0     # running length of `total`, for the timestamps
    chapter_counter = 0
    last_paragraph_was_silence = False

    for para in d.paragraphs:
        t = para.text

        # A 'CHAPTER:' heading closes the running chapter and opens the next;
        # the heading itself is spoken as the first paragraph of the new one.
        if t.startswith('CHAPTER:'):
            audio = _finish_chapter(
                chapter,
                out_dir + f'{vox_str}_chapter_{chapter_counter}.wav')
            total.append(audio)
            total_samples += audio.shape[0]
            chapter = []
            chapter_counter += 1
            secs = int(total_samples / OUT_FS)
            print(f'Start Chapter {chapter_counter}, timestamp:{secs // 60}:{secs % 60}')

        if _is_speakable(t):
            chapter.append(_synthesize(t, vox))
            last_paragraph_was_silence = False
        elif not last_paragraph_was_silence:
            # Empty / skipped paragraph (e.g. end of a section): add one short
            # pause, and only once for a run of such paragraphs.
            chapter.append(_silence(.1))
            last_paragraph_was_silence = True

    # The last chapter has no following 'CHAPTER:' heading to close it, so
    # flush it here; otherwise it is missing from the full audiobook.
    total.append(_finish_chapter(
        chapter,
        out_dir + f'{vox_str}_chapter_{chapter_counter}.wav'))

    # Full audiobook for this voice.
    full_wav = out_dir + f'{vox_str}_full_audiobook.wav'
    soundfile.write(full_wav, np.concatenate(total), OUT_FS)

    # Still picture + audio -> video (e.g. for upload to YouTube).
    _write_title_frame(vox, STATIC_FRAME)
    _write_video(full_wav, out_dir + f'{vox_str}_full_audiobook.mp4')