"""Build the fine-tuning config and annotation lists for speaker fine-tuning.

The script reads the newly transcribed annotations
(``short_character_anno.txt`` and ``./long_character_anno.txt``, whichever
exist), optionally mixes in auxiliary annotations from
``./sampled_audio4ft.txt``, assigns an integer id to every speaker, and writes:

* ``./configs/modified_finetune_speaker.json`` -- the base config with the
  speaker table, data file names and a few training settings overwritten;
* ``./final_annotation_train.txt`` and ``./final_annotation_val.txt`` -- the
  cleaned ``path|speaker_id|text`` lines.
"""
import os
import argparse
import json
import sys

# Recursion limit raised exactly as in the original script; the reason is not
# visible in this file (NOTE(review): presumably needed by the text cleaners).
sys.setrecursionlimit(500000)

# Language tags kept from the auxiliary data for each ``--languages`` preset.
LANGUAGE_TAGS = {
    "CJE": ["[ZH]", "[JA]", "[EN]"],
    "CJ": ["[ZH]", "[JA]"],
    "C": ["[ZH]"],
    "CJKE": ["[ZH]", "[JA]", "[EN]", "[KO]"],
}

NEW_ANNOTATION_FILES = ("short_character_anno.txt", "./long_character_anno.txt")
AUXILIARY_ANNOTATION_FILE = "./sampled_audio4ft.txt"
BASE_CONFIG_FILE = "./configs/finetune_speaker.json"
MODIFIED_CONFIG_FILE = "./configs/modified_finetune_speaker.json"
TRAIN_ANNOTATION_FILE = "./final_annotation_train.txt"
VAL_ANNOTATION_FILE = "./final_annotation_val.txt"

# Annotations whose raw text field is longer than this are dropped.
MAX_TEXT_LENGTH = 150

TRUE_STRINGS = frozenset(["true", "t", "yes", "y", "1", "on"])
FALSE_STRINGS = frozenset(["false", "f", "no", "n", "0", "off", ""])


def parse_bool(value):
    """Convert a command-line string to a real boolean.

    ``argparse`` with ``type=bool`` treats every non-empty string -- including
    ``"False"`` -- as ``True``; this parser interprets the text instead.

    Raises:
        argparse.ArgumentTypeError: if *value* is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in TRUE_STRINGS:
        return True
    if normalized in FALSE_STRINGS:
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % (value,))


def split_annotation(line):
    """Split one ``path|speaker|text`` line into its three fields.

    Raises:
        ValueError: if the line does not contain exactly three fields.
    """
    fields = line.split("|")
    if len(fields) != 3:
        raise ValueError(
            "Malformed annotation line (expected 'path|speaker|text'): %r" % (line,)
        )
    return fields


def read_annotations(path):
    """Return the non-blank lines of the UTF-8 annotation file at *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line) carry no annotation and
        # would otherwise break the three-field split.
        return [line for line in f if line.strip()]


def collect_speakers(annotations, speakers=None):
    """Return speaker names in first-seen order.

    Args:
        annotations: iterable of ``path|speaker|text`` lines.
        speakers: optional already-known speakers; they keep their positions
            and new names are appended after them.
    """
    ordered = [] if speakers is None else list(speakers)
    seen = set(ordered)
    for line in annotations:
        _, speaker, _ = split_annotation(line)
        if speaker not in seen:
            seen.add(speaker)
            ordered.append(speaker)
    return ordered


def filter_by_language(annotations, langs):
    """Keep each line that contains at least one tag from *langs*, once."""
    return [line for line in annotations if any(tag in line for tag in langs)]


def duplication_factor(num_old_voices, num_new_voices):
    """Return how many times the new annotations are repeated (at least 1)."""
    return max(1, num_old_voices // num_new_voices)


def apply_finetune_settings(hps, speakers):
    """Overwrite speaker and training settings in *hps* in place.

    Returns:
        The ``speaker -> id`` mapping that was stored under ``hps['speakers']``.
    """
    speaker2id = {speaker: index for index, speaker in enumerate(speakers)}
    hps['data']["n_speakers"] = len(speakers)
    hps['speakers'] = speaker2id
    hps['train']['log_interval'] = 10
    hps['train']['eval_interval'] = 100
    hps['train']['batch_size'] = 16
    hps['data']['training_files'] = "final_annotation_train.txt"
    hps['data']['validation_files'] = "final_annotation_val.txt"
    return speaker2id


def clean_annotations(annotations, speaker2id, text_cleaners, strip_tag=None):
    """Clean annotation text and replace speaker names with their ids.

    Args:
        annotations: iterable of ``path|speaker|text`` lines.
        speaker2id: mapping from speaker name to integer id.
        text_cleaners: cleaner specification passed through to
            ``text._clean_text`` (taken from the config by the caller).
        strip_tag: optional literal removed from the cleaned text.

    Returns:
        List of newline-terminated ``path|speaker_id|cleaned_text`` lines.
    """
    # Project-local module, imported lazily as in the original script.
    import text

    cleaned = []
    for line in annotations:
        path, speaker, raw_text = split_annotation(line)
        if len(raw_text) > MAX_TEXT_LENGTH:
            continue
        cleaned_text = text._clean_text(raw_text, text_cleaners)
        if strip_tag:
            cleaned_text = cleaned_text.replace(strip_tag, "")
        if not cleaned_text.endswith("\n"):
            cleaned_text += "\n"
        cleaned.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)
    return cleaned


def write_lines(path, lines):
    """Write already newline-terminated *lines* to *path* as UTF-8."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(lines)


def main(argv=None):
    """Run the preprocessing; *argv* defaults to the process arguments.

    Raises:
        ValueError: if no new annotations (and therefore no speakers) exist,
            or if an annotation line is malformed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--add_auxiliary_data",
        type=parse_bool,
        nargs="?",
        const=True,  # a bare ``--add_auxiliary_data`` enables the option
        default=False,
        help="Whether to add extra data as fine-tuning helper",
    )
    parser.add_argument("--languages", default="CJE")
    args = parser.parse_args(argv)

    new_annos = []
    for annotation_file in NEW_ANNOTATION_FILES:
        if os.path.exists(annotation_file):
            new_annos += read_annotations(annotation_file)

    speakers = collect_speakers(new_annos)
    if not speakers:
        # Explicit raise instead of ``assert`` so the check survives ``-O``.
        raise ValueError("No audio file found. Please check your uploaded file structure.")

    old_annos = []
    cc_duplicate = 1
    if args.add_auxiliary_data:
        langs = LANGUAGE_TAGS.get(args.languages)
        if langs is None:
            parser.error(
                "unsupported --languages value %r (choose from %s)"
                % (args.languages, ", ".join(sorted(LANGUAGE_TAGS)))
            )
        old_annos = filter_by_language(read_annotations(AUXILIARY_ANNOTATION_FILE), langs)
        speakers = collect_speakers(old_annos, speakers)
        # Repeat the new annotations so they are not outnumbered by the
        # auxiliary ones.
        cc_duplicate = duplication_factor(len(old_annos), len(new_annos))

    with open(BASE_CONFIG_FILE, 'r', encoding='utf-8') as f:
        hps = json.load(f)
    speaker2id = apply_finetune_settings(hps, speakers)
    with open(MODIFIED_CONFIG_FILE, 'w', encoding='utf-8') as f:
        json.dump(hps, f, indent=2)

    text_cleaners = hps['data']['text_cleaners']
    if args.add_auxiliary_data:
        cleaned_new_annos = clean_annotations(new_annos, speaker2id, text_cleaners)
        cleaned_old_annos = clean_annotations(old_annos, speaker2id, text_cleaners)
        final_annos = cleaned_old_annos + cc_duplicate * cleaned_new_annos
    else:
        # The original script strips a literal "[ZH]" from the cleaned text
        # only when no auxiliary data is used; preserved as-is.
        cleaned_new_annos = clean_annotations(
            new_annos, speaker2id, text_cleaners, strip_tag="[ZH]"
        )
        final_annos = cleaned_new_annos

    write_lines(TRAIN_ANNOTATION_FILE, final_annos)
    # Validation always uses the new speakers' annotations only.
    write_lines(VAL_ANNOTATION_FILE, cleaned_new_annos)
    print("finished")


if __name__ == "__main__":
    main()