|
import os |
|
import ffmpeg |
|
import librosa |
|
import numpy as np |
|
import soundfile as sf |
|
import tempfile |
|
|
|
from .vad import VoiceActivityDetection |
|
|
|
|
|
class PostProcessor:
    """Post-process synthesized audio with per-language tempo and silence rules.

    The instance keeps two attributes, both set in ``__init__``:
    ``target_sr`` (sample rate used for every file write and read) and
    ``vad`` (the ``VoiceActivityDetection`` object used to trim silence).
    """

    def __init__(self, target_sr: int):
        """Store the working sample rate and create the voice-activity detector.

        Args:
            target_sr: Sample rate used when writing to and reading from the
                temporary WAV files in ``set_tempo``.
        """
        self.target_sr = target_sr
        self.vad = VoiceActivityDetection()

    def set_tempo(self, wav: np.ndarray, atempo: str = '1') -> np.ndarray:
        """Run ``wav`` through ffmpeg's ``atempo`` filter and return the result.

        The samples are written to a temporary WAV file at ``self.target_sr``,
        filtered by ffmpeg into a second temporary file, and read back with
        ``librosa.load`` at the same sample rate. The temporary directory is
        removed when the method returns.

        Args:
            wav: Audio samples to process.
            atempo: Value handed to the ``atempo`` filter as-is; ``'1'``
                leaves the tempo unchanged.

        Returns:
            The filtered audio as loaded by ``librosa.load``.
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            inpath = os.path.join(tmpdirname, 'input.wav')
            # Build the output path from the directory rather than string-
            # replacing inside ``inpath``: a replace would also rewrite any
            # 'input.wav' occurring in the temp directory path itself.
            outpath = os.path.join(tmpdirname, 'output.wav')

            sf.write(inpath, wav, self.target_sr)

            in_stream = ffmpeg.input(inpath)
            audio_stream = ffmpeg.filter_(in_stream, 'atempo', atempo)
            audio_stream = audio_stream.output(outpath)
            ffmpeg.run(audio_stream, overwrite_output=True)

            # Load before leaving the ``with`` block; the files vanish after.
            wav, _ = librosa.load(outpath, sr=self.target_sr)
        return wav

    def trim_silence(self, wav: np.ndarray):
        """Return ``self.vad.process(wav, sc_threshold=40)``.

        NOTE(review): presumably this strips non-speech regions; the meaning
        of ``sc_threshold`` is defined by ``VoiceActivityDetection`` and is
        not visible here -- confirm against the ``vad`` module.
        """
        return self.vad.process(wav, sc_threshold=40)

    def process(self, wav, lang: str, gender: str) -> np.ndarray:
        """Apply the post-processing recipe selected by ``lang`` and ``gender``.

        Rules, checked in order:
            * ``lang == "te"`` and ``gender == "female"``: tempo 0.85, then
              silence trimming.
            * ``lang == "mr"`` and ``gender == "female"``: silence trimming,
              then tempo 1.15.
            * ``lang == "gu"`` (any gender): tempo 1.20.
            * anything else: returned without further processing.

        Args:
            wav: Audio samples; any array-like is accepted and converted to
                ``np.ndarray``.
            lang: Language code (presumably ISO 639-1 -- verify with callers).
            gender: Speaker gender label, compared against ``'female'``.

        Returns:
            The processed audio. For the tempo-only and pass-through cases
            this is an ``np.ndarray``; when trimming runs last, it is
            whatever ``self.vad.process`` returns.
        """
        # ``asarray`` returns an existing ndarray untouched (no copy) and
        # converts lists/tuples/other array-likes, replacing the exact
        # ``type(...) !=`` comparison.
        wav = np.asarray(wav)

        if lang == "te" and gender == 'female':
            wav = self.set_tempo(wav, '0.85')
            wav = self.trim_silence(wav)
        elif lang == 'mr' and gender == 'female':
            wav = self.trim_silence(wav)
            wav = self.set_tempo(wav, '1.15')
        elif lang == 'gu':
            wav = self.set_tempo(wav, '1.20')

        return wav
|
|