Purupuru188
committed on
Upload 4 files
fastfinetuning_kr/VC_inference.py
ADDED
@@ -0,0 +1,147 @@
import os
import numpy as np
import torch
from torch import no_grad, LongTensor
import argparse
import commons
from mel_processing import spectrogram_torch
import utils
from models import SynthesizerTrn
import gradio as gr
import librosa
import webbrowser

from text import text_to_sequence, _clean_text
device = "cuda:0" if torch.cuda.is_available() else "cpu"
import logging
logging.getLogger("PIL").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("asyncio").setLevel(logging.WARNING)

language_marks = {
    "Japanese": "",
    "日本語": "[JA]",
    "简体中文": "[ZH]",
    "English": "[EN]",
    "한국어": "[KO]",
    "Mix": "",
}
lang = ['日本語', '简体中文', 'English', 'Mix', '한국어']
def get_text(text, hps, is_symbol):
    text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = LongTensor(text_norm)
    return text_norm

def create_tts_fn(model, hps, speaker_ids):
    def tts_fn(text, speaker, language, speed):
        if language is not None:
            text = language_marks[language] + text + language_marks[language]
        speaker_id = speaker_ids[speaker]
        stn_tst = get_text(text, hps, False)
        with no_grad():
            x_tst = stn_tst.unsqueeze(0).to(device)
            x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
            sid = LongTensor([speaker_id]).to(device)
            audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
                                length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
        del stn_tst, x_tst, x_tst_lengths, sid
        return "Success", (hps.data.sampling_rate, audio)

    return tts_fn

def create_vc_fn(model, hps, speaker_ids):
    def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
        input_audio = record_audio if record_audio is not None else upload_audio
        if input_audio is None:
            return "You need to record or upload an audio", None
        sampling_rate, audio = input_audio
        original_speaker_id = speaker_ids[original_speaker]
        target_speaker_id = speaker_ids[target_speaker]

        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        if sampling_rate != hps.data.sampling_rate:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
        with no_grad():
            y = torch.FloatTensor(audio)
            y = y / max(-y.min(), y.max()) / 0.99
            y = y.to(device)
            y = y.unsqueeze(0)
            spec = spectrogram_torch(y, hps.data.filter_length,
                                     hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                                     center=False).to(device)
            spec_lengths = LongTensor([spec.size(-1)]).to(device)
            sid_src = LongTensor([original_speaker_id]).to(device)
            sid_tgt = LongTensor([target_speaker_id]).to(device)
            audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
                0, 0].data.cpu().float().numpy()
        del y, spec, spec_lengths, sid_src, sid_tgt
        return "Success", (hps.data.sampling_rate, audio)

    return vc_fn
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", default="./G_latest.pth", help="path to your fine-tuned model checkpoint")
    parser.add_argument("--config_dir", default="./finetune_speaker.json", help="path to your model config file")
    parser.add_argument("--share", default=False, help="make link public (used in colab)")

    args = parser.parse_args()
    hps = utils.get_hparams_from_file(args.config_dir)

    net_g = SynthesizerTrn(
        len(hps.symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)
    _ = net_g.eval()

    _ = utils.load_checkpoint(args.model_dir, net_g, None)
    speaker_ids = hps.speakers
    speakers = list(hps.speakers.keys())
    tts_fn = create_tts_fn(net_g, hps, speaker_ids)
    vc_fn = create_vc_fn(net_g, hps, speaker_ids)
    app = gr.Blocks()
    with app:
        with gr.Tab("Text-to-Speech"):
            with gr.Row():
                with gr.Column():
                    textbox = gr.TextArea(label="Text",
                                          placeholder="Type your sentence here",
                                          value="こんにちわ。", elem_id=f"tts-input")
                    # select character
                    char_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label='character')
                    language_dropdown = gr.Dropdown(choices=lang, value=lang[0], label='language')
                    duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1,
                                                label='Speed')
                with gr.Column():
                    text_output = gr.Textbox(label="Message")
                    audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
                    btn = gr.Button("Generate!")
                    btn.click(tts_fn,
                              inputs=[textbox, char_dropdown, language_dropdown, duration_slider,],
                              outputs=[text_output, audio_output])
        with gr.Tab("Voice Conversion"):
            gr.Markdown("""
                        Record or upload your voice, then select the target voice to convert to.
            """)
            with gr.Column():
                record_audio = gr.Audio(label="record your voice", source="microphone")
                upload_audio = gr.Audio(label="or upload audio here", source="upload")
                source_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="source speaker")
                target_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="target speaker")
            with gr.Column():
                message_box = gr.Textbox(label="Message")
                converted_audio = gr.Audio(label='converted audio')
            btn = gr.Button("Convert!")
            btn.click(vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio],
                      outputs=[message_box, converted_audio])
    webbrowser.open("http://127.0.0.1:7860")
    app.launch(share=args.share)
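
A minimal usage sketch (not part of the upload): assuming the fine-tuned checkpoint and config sit next to the script, which is what the argparse defaults above expect, the Gradio demo can be started with:

    python VC_inference.py --model_dir ./G_latest.pth --config_dir ./finetune_speaker.json

Pass --share True on Colab to get a public link; note the script opens http://127.0.0.1:7860 in a browser right before launching.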
fastfinetuning_kr/long_audio_transcribe.py
ADDED
@@ -0,0 +1,82 @@
from moviepy.editor import AudioFileClip
import whisper
import os
import json
import torchaudio
import librosa
import torch
import argparse
parent_dir = "./denoised_audio/"
filelist = list(os.walk(parent_dir))[0][2]
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--languages", default="CJE")
    parser.add_argument("--whisper_size", default="medium")
    args = parser.parse_args()
    if args.languages == "CJE":
        lang2token = {
            'zh': "[ZH]",
            'ja': "[JA]",
            "en": "[EN]",
        }
    elif args.languages == "CJ":
        lang2token = {
            'zh': "[ZH]",
            'ja': "[JA]",
        }
    elif args.languages == "C":
        lang2token = {
            'zh': "[ZH]",
        }
    elif args.languages == "CJKE":
        lang2token = {
            'zh': "[ZH]",
            'ja': "[JA]",
            "en": "[EN]",
            "ko": "[KO]",
        }
    assert (torch.cuda.is_available()), "Please enable GPU in order to run Whisper!"
    with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
        hps = json.load(f)
    target_sr = hps['data']['sampling_rate']
    model = whisper.load_model(args.whisper_size)
    speaker_annos = []
    for file in filelist:
        print(f"transcribing {parent_dir + file}...\n")
        options = dict(beam_size=5, best_of=5)
        transcribe_options = dict(task="transcribe", **options)
        result = model.transcribe(parent_dir + file, word_timestamps=True, **transcribe_options)
        segments = result["segments"]
        # result = model.transcribe(parent_dir + file)
        lang = result['language']
        if result['language'] not in list(lang2token.keys()):
            print(f"{lang} not supported, ignoring...\n")
            continue
        # segment audio based on segment results
        character_name = file.rstrip(".wav").split("_")[0]
        code = file.rstrip(".wav").split("_")[1]
        if not os.path.exists("./segmented_character_voice/" + character_name):
            os.mkdir("./segmented_character_voice/" + character_name)
        wav, sr = torchaudio.load(parent_dir + file, frame_offset=0, num_frames=-1, normalize=True,
                                  channels_first=True)

        for i, seg in enumerate(result['segments']):
            start_time = seg['start']
            end_time = seg['end']
            text = seg['text']
            text = lang2token[lang] + text.replace("\n", "") + lang2token[lang]
            text = text + "\n"
            wav_seg = wav[:, int(start_time*sr):int(end_time*sr)]
            wav_seg_name = f"{character_name}_{code}_{i}.wav"
            savepth = "./segmented_character_voice/" + character_name + "/" + wav_seg_name
            speaker_annos.append(savepth + "|" + character_name + "|" + text)
            print(f"Transcribed segment: {speaker_annos[-1]}")
            # trimmed_wav_seg = librosa.effects.trim(wav_seg.squeeze().numpy())
            # trimmed_wav_seg = torch.tensor(trimmed_wav_seg[0]).unsqueeze(0)
            torchaudio.save(savepth, wav_seg, target_sr, channels_first=True)
    if len(speaker_annos) == 0:
        print("Warning: no long audios & videos found, this IS expected if you have only uploaded short audios")
        print("this IS NOT expected if you have uploaded any long audios, videos or video links. Please check your file structure or make sure your audio/video language is supported.")
    with open("./long_character_anno.txt", 'w', encoding='utf-8') as f:
        for line in speaker_annos:
            f.write(line)
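
For orientation, a hedged example of running this step: it assumes the denoising stage has already written wav files into ./denoised_audio/ and that ./configs/finetune_speaker.json exists (both paths are hard-coded above), plus a CUDA GPU since the script asserts one for Whisper:

    python long_audio_transcribe.py --languages CJKE --whisper_size medium

Segments are saved under ./segmented_character_voice/<character_name>/ and their transcriptions are listed in ./long_character_anno.txt.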
fastfinetuning_kr/preprocess_v2.py
ADDED
@@ -0,0 +1,156 @@
import os
import argparse
import json
import sys
sys.setrecursionlimit(500000)  # Avoid "RecursionError: maximum recursion depth exceeded while calling a Python object". You can change the number as you want.

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--add_auxiliary_data", type=bool, help="Whether to add extra data as fine-tuning helper")
    parser.add_argument("--languages", default="CJE")
    args = parser.parse_args()
    if args.languages == "CJE":
        langs = ["[ZH]", "[JA]", "[EN]"]
    elif args.languages == "CJ":
        langs = ["[ZH]", "[JA]"]
    elif args.languages == "C":
        langs = ["[ZH]"]
    elif args.languages == "CJKE":
        langs = ["[ZH]", "[JA]", "[EN]", "[KO]"]
    new_annos = []
    # Source 1: transcribed short audios
    if os.path.exists("short_character_anno.txt"):
        with open("short_character_anno.txt", 'r', encoding='utf-8') as f:
            short_character_anno = f.readlines()
            new_annos += short_character_anno
    # Source 2: transcribed long audio segments
    if os.path.exists("./long_character_anno.txt"):
        with open("./long_character_anno.txt", 'r', encoding='utf-8') as f:
            long_character_anno = f.readlines()
            new_annos += long_character_anno

    # Get all speaker names
    speakers = []
    for line in new_annos:
        path, speaker, text = line.split("|")
        if speaker not in speakers:
            speakers.append(speaker)
    assert (len(speakers) != 0), "No audio file found. Please check your uploaded file structure."
    # Source 3 (Optional): sampled audios as extra training helpers
    if args.add_auxiliary_data:
        with open("./sampled_audio4ft.txt", 'r', encoding='utf-8') as f:
            old_annos = f.readlines()
        # filter old_annos according to supported languages
        filtered_old_annos = []
        for line in old_annos:
            for lang in langs:
                if lang in line:
                    filtered_old_annos.append(line)
        old_annos = filtered_old_annos
        for line in old_annos:
            path, speaker, text = line.split("|")
            if speaker not in speakers:
                speakers.append(speaker)
        num_old_voices = len(old_annos)
        num_new_voices = len(new_annos)
        # STEP 1: balance number of new & old voices
        cc_duplicate = num_old_voices // num_new_voices
        if cc_duplicate == 0:
            cc_duplicate = 1

        # STEP 2: modify config file
        with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
            hps = json.load(f)

        # assign ids to new speakers
        speaker2id = {}
        for i, speaker in enumerate(speakers):
            speaker2id[speaker] = i
        # modify n_speakers
        hps['data']["n_speakers"] = len(speakers)
        # overwrite speaker names
        hps['speakers'] = speaker2id
        hps['train']['log_interval'] = 10
        hps['train']['eval_interval'] = 100
        hps['train']['batch_size'] = 16
        hps['data']['training_files'] = "final_annotation_train.txt"
        hps['data']['validation_files'] = "final_annotation_val.txt"
        # save modified config
        with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
            json.dump(hps, f, indent=2)

        # STEP 3: clean annotations, replace speaker names with assigned speaker IDs
        import text
        cleaned_new_annos = []
        for i, line in enumerate(new_annos):
            path, speaker, txt = line.split("|")
            if len(txt) > 150:
                continue
            cleaned_text = text._clean_text(txt, hps['data']['text_cleaners'])
            cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
            cleaned_new_annos.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)
        cleaned_old_annos = []
        for i, line in enumerate(old_annos):
            path, speaker, txt = line.split("|")
            if len(txt) > 150:
                continue
            cleaned_text = text._clean_text(txt, hps['data']['text_cleaners'])
            cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
            cleaned_old_annos.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)
        # merge with old annotation
        final_annos = cleaned_old_annos + cc_duplicate * cleaned_new_annos
        # save annotation file
        with open("./final_annotation_train.txt", 'w', encoding='utf-8') as f:
            for line in final_annos:
                f.write(line)
        # save annotation file for validation
        with open("./final_annotation_val.txt", 'w', encoding='utf-8') as f:
            for line in cleaned_new_annos:
                f.write(line)
        print("finished")
    else:
        # Do not add extra helper data
        # STEP 1: modify config file
        with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
            hps = json.load(f)

        # assign ids to new speakers
        speaker2id = {}
        for i, speaker in enumerate(speakers):
            speaker2id[speaker] = i
        # modify n_speakers
        hps['data']["n_speakers"] = len(speakers)
        # overwrite speaker names
        hps['speakers'] = speaker2id
        hps['train']['log_interval'] = 10
        hps['train']['eval_interval'] = 100
        hps['train']['batch_size'] = 16
        hps['data']['training_files'] = "final_annotation_train.txt"
        hps['data']['validation_files'] = "final_annotation_val.txt"
        # save modified config
        with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
            json.dump(hps, f, indent=2)

        # STEP 2: clean annotations, replace speaker names with assigned speaker IDs
        import text

        cleaned_new_annos = []
        for i, line in enumerate(new_annos):
            path, speaker, txt = line.split("|")
            if len(txt) > 150:
                continue
            cleaned_text = text._clean_text(txt, hps['data']['text_cleaners']).replace("[ZH]", "")
            cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
            cleaned_new_annos.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)

        final_annos = cleaned_new_annos
        # save annotation file
        with open("./final_annotation_train.txt", 'w', encoding='utf-8') as f:
            for line in final_annos:
                f.write(line)
        # save annotation file for validation
        with open("./final_annotation_val.txt", 'w', encoding='utf-8') as f:
            for line in cleaned_new_annos:
                f.write(line)
        print("finished")
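
A usage sketch for this preprocessing step, assuming the transcription scripts above have already produced short_character_anno.txt and/or long_character_anno.txt, and that ./sampled_audio4ft.txt exists when auxiliary data is requested:

    python preprocess_v2.py --languages CJKE --add_auxiliary_data True

One caveat worth knowing: because --add_auxiliary_data is declared with type=bool, argparse converts the raw string, so any non-empty value (even "False") enables the auxiliary branch; omit the flag entirely to skip it.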
fastfinetuning_kr/short_audio_transcribe.py
ADDED
@@ -0,0 +1,128 @@
import whisper
import os
import json
import torchaudio
import argparse
import torch

lang2token = {
    'zh': "[ZH]",
    'ja': "[JA]",
    "en": "[EN]",
}
def transcribe_one(audio_path):
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # detect the spoken language
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")
    lang = max(probs, key=probs.get)
    # decode the audio
    options = whisper.DecodingOptions(beam_size=5)
    result = whisper.decode(model, mel, options)

    # print the recognized text
    print(result.text)
    return lang, result.text
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--languages", default="CJE")
    parser.add_argument("--whisper_size", default="medium")
    args = parser.parse_args()
    if args.languages == "CJE":
        lang2token = {
            'zh': "[ZH]",
            'ja': "[JA]",
            "en": "[EN]",
        }
    elif args.languages == "CJ":
        lang2token = {
            'zh': "[ZH]",
            'ja': "[JA]",
        }
    elif args.languages == "C":
        lang2token = {
            'zh': "[ZH]",
        }
    elif args.languages == "CJKE":
        lang2token = {
            'zh': "[ZH]",
            'ja': "[JA]",
            "en": "[EN]",
            "ko": "[KO]",
        }
    assert (torch.cuda.is_available()), "Please enable GPU in order to run Whisper!"
    model = whisper.load_model(args.whisper_size)
    parent_dir = "./custom_character_voice/"
    speaker_names = list(os.walk(parent_dir))[0][1]
    speaker_annos = []
    total_files = sum([len(files) for r, d, files in os.walk(parent_dir)])
    # resample audios
    # 2023/4/21: Get the target sampling rate
    with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
        hps = json.load(f)
    target_sr = hps['data']['sampling_rate']
    processed_files = 0
    for speaker in speaker_names:
        for i, wavfile in enumerate(list(os.walk(parent_dir + speaker))[0][2]):
            # try to load file as audio
            if wavfile.startswith("processed_"):
                continue
            try:
                wav, sr = torchaudio.load(parent_dir + speaker + "/" + wavfile, frame_offset=0, num_frames=-1, normalize=True,
                                          channels_first=True)
                wav = wav.mean(dim=0).unsqueeze(0)
                if sr != target_sr:
                    wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav)
                if wav.shape[1] / sr > 20:
                    print(f"{wavfile} too long, ignoring\n")
                save_path = parent_dir + speaker + "/" + f"processed_{i}.wav"
                torchaudio.save(save_path, wav, target_sr, channels_first=True)
                # transcribe text
                lang, text = transcribe_one(save_path)
                if lang not in list(lang2token.keys()):
                    print(f"{lang} not supported, ignoring\n")
                    continue
                text = lang2token[lang] + text + lang2token[lang] + "\n"
                speaker_annos.append(save_path + "|" + speaker + "|" + text)

                processed_files += 1
                print(f"Processed: {processed_files}/{total_files}")
            except:
                continue

    # # clean annotation
    # import argparse
    # import text
    # from utils import load_filepaths_and_text
    # for i, line in enumerate(speaker_annos):
    #     path, sid, txt = line.split("|")
    #     cleaned_text = text._clean_text(txt, ["cjke_cleaners2"])
    #     cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
    #     speaker_annos[i] = path + "|" + sid + "|" + cleaned_text
    # write into annotation
    if len(speaker_annos) == 0:
        print("Warning: no short audios found, this IS expected if you have only uploaded long audios, videos or video links.")
        print("this IS NOT expected if you have uploaded a zip file of short audios. Please check your file structure or make sure your audio language is supported.")
    with open("short_character_anno.txt", 'w', encoding='utf-8') as f:
        for line in speaker_annos:
            f.write(line)

    # import json
    # # generate new config
    # with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
    #     hps = json.load(f)
    # # modify n_speakers
    # hps['data']["n_speakers"] = 1000 + len(speaker2id)
    # # add speaker names
    # for speaker in speaker_names:
    #     hps['speakers'][speaker] = speaker2id[speaker]
    # # save modified config
    # with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
    #     json.dump(hps, f, indent=2)
    # print("finished")
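
Finally, a possible invocation of the short-audio step, assuming one folder of wav files per speaker under ./custom_character_voice/ (the hard-coded parent_dir above) and a CUDA-capable GPU, since the script asserts torch.cuda.is_available():

    python short_audio_transcribe.py --languages CJKE --whisper_size medium

Resampled copies are saved as processed_<i>.wav next to the originals, and the resulting annotations are written to short_character_anno.txt.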