"""Voice-conversion driver: device config, model loading and batch inference."""

from soni_translate.logging_setup import logger
import torch
import gc
import numpy as np
import os
import shutil
import warnings
import threading
from tqdm import tqdm
from lib.infer_pack.models import (
    SynthesizerTrnMs256NSFsid,
    SynthesizerTrnMs256NSFsid_nono,
    SynthesizerTrnMs768NSFsid,
    SynthesizerTrnMs768NSFsid_nono,
)
from lib.audio import load_audio
import soundfile as sf
import edge_tts
import asyncio
from soni_translate.utils import remove_directory_contents, create_directories
from scipy import signal
from time import time as ttime
import faiss
from vci_pipeline import VC, change_rms, bh, ah
import librosa

warnings.filterwarnings("ignore")


class Config:
    """Select the inference device/precision and the matching x_* settings."""

    def __init__(self, only_cpu=False):
        """Probe the hardware and store the resulting settings.

        Args:
            only_cpu: When true, CUDA and MPS are ignored and the CPU is used.
        """
        self.device = "cuda:0"
        self.is_half = True
        self.n_cpu = 0
        self.gpu_name = None
        self.gpu_mem = None
        (
            self.x_pad,
            self.x_query,
            self.x_center,
            self.x_max,
        ) = self.device_config(only_cpu)

    def device_config(self, only_cpu) -> tuple:
        """Pick device and precision, then return the x_* tuple.

        Updates ``device``, ``is_half``, ``gpu_name``, ``gpu_mem`` and
        ``n_cpu`` on the instance as a side effect.

        Args:
            only_cpu: When true, CUDA and MPS are ignored.

        Returns:
            ``(x_pad, x_query, x_center, x_max)``.
        """
        if torch.cuda.is_available() and not only_cpu:
            i_device = int(self.device.split(":")[-1])
            self.gpu_name = torch.cuda.get_device_name(i_device)
            gpu_upper = self.gpu_name.upper()
            if (
                ("16" in self.gpu_name and "V100" not in gpu_upper)
                or "P40" in gpu_upper
                or "1060" in self.gpu_name
                or "1070" in self.gpu_name
                or "1080" in self.gpu_name
            ):
                logger.info(
                    "16/10 Series GPUs and P40 excel "
                    "in single-precision tasks."
                )
                self.is_half = False
            else:
                self.gpu_name = None
            # Total device memory in GiB; +0.4 nudges the truncation upwards.
            self.gpu_mem = int(
                torch.cuda.get_device_properties(i_device).total_memory
                / 1024
                / 1024
                / 1024
                + 0.4
            )
        elif torch.backends.mps.is_available() and not only_cpu:
            logger.info("Supported N-card not found, using MPS for inference")
            self.device = "mps"
        else:
            logger.info("No supported N-card found, using CPU for inference")
            self.device = "cpu"
            self.is_half = False

        if self.n_cpu == 0:
            self.n_cpu = os.cpu_count()

        if self.is_half:
            # 6GB VRAM configuration
            x_pad, x_query, x_center, x_max = 3, 10, 60, 65
        else:
            # 5GB VRAM configuration
            x_pad, x_query, x_center, x_max = 1, 6, 38, 41

        if self.gpu_mem is not None and self.gpu_mem <= 4:
            x_pad, x_query, x_center, x_max = 1, 5, 30, 32

        logger.info(
            f"Config: Device is {self.device}, "
            f"half precision is {self.is_half}"
        )

        return x_pad, x_query, x_center, x_max


BASE_DOWNLOAD_LINK = "https://huggingface.co/r3gm/sonitranslate_voice_models/resolve/main/"
BASE_MODELS = [
    "hubert_base.pt",
    "rmvpe.pt",
]
BASE_DIR = "."


def load_hu_bert(config):
    """Download the base models if needed and return the HuBERT model.

    Args:
        config: Object exposing ``device`` and ``is_half``.

    Returns:
        The first model of the loaded ensemble, moved to ``config.device``,
        cast to half or float precision and switched to eval mode.
    """
    from fairseq import checkpoint_utils
    from soni_translate.utils import download_manager

    for id_model in BASE_MODELS:
        # BASE_DOWNLOAD_LINK ends with "/", so plain concatenation builds the
        # URL without relying on OS path rules.
        download_manager(f"{BASE_DOWNLOAD_LINK}{id_model}", BASE_DIR)

    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
        ["hubert_base.pt"],
        suffix="",
    )
    hubert_model = models[0].to(config.device)
    if config.is_half:
        hubert_model = hubert_model.half()
    else:
        hubert_model = hubert_model.float()
    hubert_model.eval()

    return hubert_model


def load_trained_model(model_path, config):
    """Load a voice checkpoint and build its synthesizer and pipeline.

    Args:
        model_path: Path to the checkpoint file.
        config: Object exposing ``device`` and ``is_half``.

    Returns:
        ``(n_spk, tgt_sr, net_g, vc, cpt, version)``.

    Raises:
        ValueError: If ``model_path`` is empty or the checkpoint declares a
            version other than ``"v1"`` or ``"v2"``.
    """
    if not model_path:
        raise ValueError("No model found")

    logger.info("Loading %s" % model_path)
    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # from sources you trust.
    cpt = torch.load(model_path, map_location="cpu")
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    if_f0 = cpt.get("f0", 1)

    version = cpt.get("version", "v1")
    if version == "v1":
        pitch_cls = SynthesizerTrnMs256NSFsid
        no_pitch_cls = SynthesizerTrnMs256NSFsid_nono
    elif version == "v2":
        pitch_cls = SynthesizerTrnMs768NSFsid
        no_pitch_cls = SynthesizerTrnMs768NSFsid_nono
    else:
        # Previously this fell through and crashed with UnboundLocalError.
        raise ValueError(f"Unsupported model version: {version}")

    if if_f0 == 1:
        net_g = pitch_cls(*cpt["config"], is_half=config.is_half)
    else:
        net_g = no_pitch_cls(*cpt["config"])
    del net_g.enc_q

    net_g.load_state_dict(cpt["weight"], strict=False)
    net_g.eval().to(config.device)

    if config.is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()

    vc = VC(tgt_sr, config)
    n_spk = cpt["config"][-3]

    return n_spk, tgt_sr, net_g, vc, cpt, version


def _load_index(file_index, index_rate):
    """Read the faiss index, returning ``(index_rate, index, big_npy)``.

    Falls back to ``(0, None, None)`` when the file is missing, the rate is
    zero, or reading fails.
    """
    if os.path.exists(file_index) and index_rate != 0:
        try:
            index = faiss.read_index(file_index)
            big_npy = index.reconstruct_n(0, index.ntotal)
            return index_rate, index, big_npy
        except Exception as error:  # best effort: continue without index
            logger.error(f"Index: {str(error)}")
    else:
        logger.warning("File index not found")
    return 0, None, None


def _load_f0_file(f0_file):
    """Parse a comma-separated f0 file into a float32 array, or ``None``.

    ``None`` is returned when the file does not exist or cannot be parsed, so
    a half-parsed list never reaches the pitch extractor.
    """
    if not os.path.exists(f0_file):
        return None
    try:
        with open(f0_file, "r") as f:
            lines = f.read().strip("\n").split("\n")
        rows = [[float(value) for value in line.split(",")] for line in lines]
        return np.array(rows, dtype="float32")
    except (OSError, ValueError) as error:
        logger.error(f"f0 file: {str(error)}")
        return None


class ClassVoices:
    """Hold per-tag conversion settings and run conversions over audio files."""

    def __init__(self, only_cpu=False):
        """Create an empty registry; models are loaded lazily.

        Args:
            only_cpu: Forwarded to ``Config`` when it is first created.
        """
        self.model_config = {}
        self.config = None
        self.only_cpu = only_cpu
        # Defined up front so unload_models()/__call__ never hit a missing
        # attribute.
        self.hu_bert_model = None
        self.model_pitch_estimator = None
        self.output_list = []

    def apply_conf(
        self,
        tag="base_model",
        file_model="",
        pitch_algo="pm",
        pitch_lvl=0,
        file_index="",
        index_influence=0.66,
        respiration_median_filtering=3,
        envelope_ratio=0.25,
        consonant_breath_protection=0.33,
        resample_sr=0,
        file_pitch_algo="",
    ):
        """Register (or replace) the conversion settings stored under ``tag``.

        ``None`` for ``file_index`` or ``file_pitch_algo`` is stored as ``""``.
        The shared ``Config`` is created on the first call.

        Returns:
            A confirmation message naming the tag and model file.

        Raises:
            ValueError: If ``file_model`` is empty.
        """
        if not file_model:
            raise ValueError("Model not found")

        if file_index is None:
            file_index = ""

        if file_pitch_algo is None:
            file_pitch_algo = ""

        if not self.config:
            self.config = Config(self.only_cpu)

        self.model_config[tag] = {
            "file_model": file_model,
            "pitch_algo": pitch_algo,
            "pitch_lvl": pitch_lvl,  # no decimal
            "file_index": file_index,
            "index_influence": index_influence,
            "respiration_median_filtering": respiration_median_filtering,
            "envelope_ratio": envelope_ratio,
            "consonant_breath_protection": consonant_breath_protection,
            "resample_sr": resample_sr,
            "file_pitch_algo": file_pitch_algo,
        }
        return f"CONFIGURATION APPLIED FOR {tag}: {file_model}"

    def infer(
        self,
        task_id,
        params,
        # load model
        n_spk,
        tgt_sr,
        net_g,
        pipe,
        cpt,
        version,
        if_f0,
        # load index
        index_rate,
        index,
        big_npy,
        # load f0 file
        inp_f0,
        # audio file
        input_audio_path,
        overwrite,
    ):
        """Convert one audio file and write the result to disk.

        The output path is appended to ``self.model_config[task_id]["result"]``
        and to ``self.output_list``.

        Args:
            task_id: Key in ``self.model_config`` that receives the result.
            params: Settings dict as stored by ``apply_conf``.
            n_spk, tgt_sr, net_g, pipe, cpt, version: Values returned by
                ``load_trained_model`` (``pipe`` is the ``VC`` instance).
            if_f0: ``1`` when the model uses pitch data.
            index_rate, index, big_npy: Index data handed to ``pipe.vc``.
            inp_f0: Optional pitch array handed to ``pipe.get_f0``.
            input_audio_path: File to convert.
            overwrite: Write over the input instead of an ``_edited`` copy.

        Raises:
            ValueError: If ``input_audio_path`` does not exist.
        """
        f0_method = params["pitch_algo"]
        f0_up_key = params["pitch_lvl"]
        filter_radius = params["respiration_median_filtering"]
        resample_sr = params["resample_sr"]
        rms_mix_rate = params["envelope_ratio"]
        protect = params["consonant_breath_protection"]

        if not os.path.exists(input_audio_path):
            raise ValueError(
                "The audio file was not found or is not "
                f"a valid file: {input_audio_path}"
            )

        f0_up_key = int(f0_up_key)

        audio = load_audio(input_audio_path, 16000)

        # Normalize audio
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio /= audio_max

        times = [0, 0, 0]

        # filters audio signal, pads it, computes sliding window sums,
        # and extracts optimized time indices
        audio = signal.filtfilt(bh, ah, audio)
        half_window = pipe.window // 2
        audio_pad = np.pad(audio, (half_window, half_window), mode="reflect")
        opt_ts = []
        if audio_pad.shape[0] > pipe.t_max:
            audio_sum = np.zeros_like(audio)
            for i in range(pipe.window):
                audio_sum += audio_pad[i:i - pipe.window]
            for t in range(pipe.t_center, audio.shape[0], pipe.t_center):
                segment = np.abs(
                    audio_sum[t - pipe.t_query: t + pipe.t_query]
                )
                # First position of the minimum inside the search window.
                opt_ts.append(
                    t
                    - pipe.t_query
                    + np.where(segment == segment.min())[0][0]
                )

        s = 0
        audio_opt = []
        t = None
        t1 = ttime()

        sid_value = 0
        sid = torch.tensor(sid_value, device=pipe.device).unsqueeze(0).long()

        # Pads audio symmetrically, calculates length divided by window size.
        audio_pad = np.pad(audio, (pipe.t_pad, pipe.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // pipe.window

        # Estimates pitch from audio signal
        pitch, pitchf = None, None
        if if_f0 == 1:
            pitch, pitchf = pipe.get_f0(
                input_audio_path,
                audio_pad,
                p_len,
                f0_up_key,
                f0_method,
                filter_radius,
                inp_f0,
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            if pipe.device == "mps":
                pitchf = pitchf.astype(np.float32)
            pitch = torch.tensor(
                pitch, device=pipe.device
            ).unsqueeze(0).long()
            pitchf = torch.tensor(
                pitchf, device=pipe.device
            ).unsqueeze(0).float()

        t2 = ttime()
        times[1] += t2 - t1

        def convert(audio_slice, pitch_slice, pitchf_slice):
            # Run one chunk through the pipeline and trim the target padding.
            return pipe.vc(
                self.hu_bert_model,
                net_g,
                sid,
                audio_slice,
                pitch_slice,
                pitchf_slice,
                times,
                index,
                big_npy,
                index_rate,
                version,
                protect,
            )[pipe.t_pad_tgt:-pipe.t_pad_tgt]

        for t in opt_ts:
            t = t // pipe.window * pipe.window
            if if_f0 == 1:
                start = s // pipe.window
                stop = (t + pipe.t_pad2) // pipe.window
                pitch_slice = pitch[:, start:stop]
                pitchf_slice = pitchf[:, start:stop]
            else:
                pitch_slice = None
                pitchf_slice = None

            audio_slice = audio_pad[s:t + pipe.t_pad2 + pipe.window]
            audio_opt.append(convert(audio_slice, pitch_slice, pitchf_slice))
            s = t

        # Tail chunk. pitch/pitchf are None for models without f0, so they
        # must only be sliced when pitch data actually exists.
        if if_f0 == 1 and t is not None:
            pitch_end_slice = pitch[:, t // pipe.window:]
            pitchf_end_slice = pitchf[:, t // pipe.window:]
        else:
            pitch_end_slice = pitch
            pitchf_end_slice = pitchf

        audio_opt.append(
            convert(audio_pad[t:], pitch_end_slice, pitchf_end_slice)
        )

        audio_opt = np.concatenate(audio_opt)
        if rms_mix_rate != 1:
            audio_opt = change_rms(
                audio, 16000, audio_opt, tgt_sr, rms_mix_rate
            )
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            audio_opt = librosa.resample(
                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
            )
        audio_max = np.abs(audio_opt).max() / 0.99
        max_int16 = 32768
        if audio_max > 1:
            max_int16 /= audio_max
        audio_opt = (audio_opt * max_int16).astype(np.int16)
        del pitch, pitchf, sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        if tgt_sr != resample_sr >= 16000:
            final_sr = resample_sr
        else:
            final_sr = tgt_sr

        logger.debug(
            "Time: npy:%ss, f0:%ss, infer:%ss",
            times[0],
            times[1],
            times[2],
        )

        if overwrite:
            output_audio_path = input_audio_path  # Overwrite
        else:
            dirname, basename = os.path.split(input_audio_path)
            # splitext keeps inner dots ("a.b.wav" -> "a.b_edited.wav").
            root, ext = os.path.splitext(basename)
            new_path = os.path.join(dirname, f"{root}_edited{ext}")
            logger.info(str(new_path))
            output_audio_path = new_path

        # Save file
        sf.write(
            file=output_audio_path,
            samplerate=final_sr,
            data=audio_opt
        )

        self.model_config[task_id]["result"].append(output_audio_path)
        self.output_list.append(output_audio_path)

    def make_test(
        self,
        tts_text,
        tts_voice,
        model_path,
        index_path,
        transpose,
        f0_method,
    ):
        """Synthesize ``tts_text`` with edge-tts and convert it with a model.

        The ``test`` folder is created and emptied first. The voice name
        passed to edge-tts is ``tts_voice`` without its last ``-`` segment.

        Returns:
            ``(converted_path, original_tts_path)``.

        Raises:
            ValueError: If edge-tts fails to produce audio.
        """
        folder_test = "test"
        tag = "test_edge"
        tts_file = "test/test.wav"
        tts_edited = "test/test_edited.wav"

        create_directories(folder_test)
        remove_directory_contents(folder_test)

        if "SET_LIMIT" == os.getenv("DEMO"):
            if len(tts_text) > 60:
                tts_text = tts_text[:60]
                logger.warning("DEMO; limit to 60 characters")

        try:
            asyncio.run(edge_tts.Communicate(
                tts_text, "-".join(tts_voice.split('-')[:-1])
            ).save(tts_file))
        except Exception as e:
            raise ValueError(
                "No audio was received. Please change the "
                f"tts voice for {tts_voice}. Error: {str(e)}"
            ) from e

        shutil.copy(tts_file, tts_edited)

        self.apply_conf(
            tag=tag,
            file_model=model_path,
            pitch_algo=f0_method,
            pitch_lvl=transpose,
            file_index=index_path,
            index_influence=0.66,
            respiration_median_filtering=3,
            envelope_ratio=0.25,
            consonant_breath_protection=0.33,
        )

        self(
            audio_files=tts_edited,
            tag_list=tag,
            overwrite=True
        )

        return tts_edited, tts_file

    def run_threads(self, threads):
        """Start every thread, wait for all of them, then free memory."""
        # Start threads
        for thread in threads:
            thread.start()

        # Wait for all threads to finish
        for thread in threads:
            thread.join()

        gc.collect()
        torch.cuda.empty_cache()

    def unload_models(self):
        """Drop the cached HuBERT and pitch-estimator models and free memory."""
        self.hu_bert_model = None
        self.model_pitch_estimator = None
        gc.collect()
        torch.cuda.empty_cache()

    def __call__(
        self,
        audio_files=None,
        tag_list=None,
        overwrite=False,
        parallel_workers=1,
    ):
        """Convert ``audio_files`` using the settings named in ``tag_list``.

        Files are grouped by tag so each model is loaded once; within a group
        up to ``parallel_workers`` files are converted on threads at a time.

        Args:
            audio_files: A path or a list of paths.
            tag_list: A tag or a list of tags paired with ``audio_files``.
                Empty means the most recently configured tag for every file;
                a shorter list is padded with its first tag, a longer one is
                truncated. The caller's list is never modified.
            overwrite: Write over each input instead of an ``_edited`` copy.
            parallel_workers: Maximum threads run together.

        Returns:
            The output paths stored under each requested tag.

        Raises:
            ValueError: If nothing is configured or no audio is given.
        """
        logger.info(f"Parallel workers: {str(parallel_workers)}")

        self.output_list = []

        if not self.model_config:
            raise ValueError("No model has been configured for inference")

        if isinstance(audio_files, str):
            audio_files = [audio_files]
        if isinstance(tag_list, str):
            tag_list = [tag_list]

        if not audio_files:
            raise ValueError("No audio found to convert")
        if not tag_list:
            tag_list = [list(self.model_config.keys())[-1]] * len(audio_files)
        else:
            # Work on a copy so padding below never touches the caller's list.
            tag_list = list(tag_list)

        if len(audio_files) > len(tag_list):
            logger.info("Extend tag list to match audio files")
            extend_number = len(audio_files) - len(tag_list)
            tag_list.extend([tag_list[0]] * extend_number)

        if len(audio_files) < len(tag_list):
            logger.info("Cut list tags")
            tag_list = tag_list[:len(audio_files)]

        sorted_tag_file = sorted(
            zip(tag_list, audio_files), key=lambda pair: pair[0]
        )

        # Base params
        if self.hu_bert_model is None:
            self.hu_bert_model = load_hu_bert(self.config)

        cache_params = None
        threads = []
        progress_bar = tqdm(total=len(tag_list), desc="Progress")
        for id_tag, input_audio_path in sorted_tag_file:

            if id_tag not in self.model_config:
                logger.info(
                    f"No configured model for {id_tag} with {input_audio_path}"
                )
                continue

            # Flush when the batch is full or the tag (and model) changes.
            if len(threads) >= parallel_workers or (
                cache_params is not None and cache_params != id_tag
            ):
                self.run_threads(threads)
                progress_bar.update(len(threads))
                threads = []

            if cache_params != id_tag:
                self.model_config[id_tag]["result"] = []

                # Unload previous
                n_spk = tgt_sr = net_g = pipe = cpt = version = if_f0 = None
                index_rate = index = big_npy = inp_f0 = None
                gc.collect()
                torch.cuda.empty_cache()

                # Model params
                params = self.model_config[id_tag]
                f0_method = params["pitch_algo"]

                # Load model
                (
                    n_spk,
                    tgt_sr,
                    net_g,
                    pipe,
                    cpt,
                    version
                ) = load_trained_model(params["file_model"], self.config)
                if_f0 = cpt.get("f0", 1)  # pitch data

                # Load index
                index_rate, index, big_npy = _load_index(
                    params["file_index"], params["index_influence"]
                )

                # Load f0 file
                inp_f0 = _load_f0_file(params["file_pitch_algo"])

                if "rmvpe" in f0_method:
                    if not self.model_pitch_estimator:
                        from lib.rmvpe import RMVPE

                        logger.info("Loading vocal pitch estimator model")
                        self.model_pitch_estimator = RMVPE(
                            "rmvpe.pt",
                            is_half=self.config.is_half,
                            device=self.config.device
                        )

                    pipe.model_rmvpe = self.model_pitch_estimator

                cache_params = id_tag

            thread = threading.Thread(
                target=self.infer,
                args=(
                    id_tag,
                    params,
                    # loaded model
                    n_spk,
                    tgt_sr,
                    net_g,
                    pipe,
                    cpt,
                    version,
                    if_f0,
                    # loaded index
                    index_rate,
                    index,
                    big_npy,
                    # loaded f0 file
                    inp_f0,
                    # audio file
                    input_audio_path,
                    overwrite,
                )
            )
            threads.append(thread)

        # Run last
        if threads:
            self.run_threads(threads)

        progress_bar.update(len(threads))
        progress_bar.close()

        final_result = []
        for tag in set(tag_list):
            if tag in self.model_config and "result" in self.model_config[tag]:
                final_result.extend(self.model_config[tag]["result"])

        return final_result