# NOTE: "Spaces: Running" below was a Hugging Face Spaces page-status header
# captured when this file was scraped; it is not part of the script.
import argparse | |
import json | |
import os | |
import re | |
from typing import Dict, List | |
def list_directories(path):
    """Return the names of the subdirectories found directly under *path*.

    Plain files under *path* are filtered out; only entries that are
    directories are kept (non-recursive).
    """
    return [
        entry
        for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    ]
def remove_control_tokens_openchat(txt: str) -> str:
    """Strip OpenChat prompt-control markers from *txt* and trim whitespace.

    The longer "[EOS]GPT4 Assistant:" token is removed before the shorter
    "GPT4 Assistant:" so a stray "[EOS]" prefix is never left behind.
    """
    for token in ("GPT4 User:", "[EOS]GPT4 Assistant:", "GPT4 Assistant:"):
        txt = txt.replace(token, "")
    return txt.strip()
# Read the result.json file at *path* and build normalized instances from it.
def result_file_process(model, task, path):
    """Load generation results from *path* and normalize each record.

    Args:
        model: model identifier stamped onto every output record.
        task: task name stamped onto every output record.
        path: path of a JSON file containing a list of result dicts; each
            dict must have "source" and "generated_result" keys and may
            optionally have an "instruction" key.

    Returns:
        A list of dicts with keys: model_id, task, instruction, source,
        generated.
    """
    with open(path, encoding="utf8") as f:
        # json.load streams from the file object — no need to read() first.
        instances: List[Dict] = json.load(f)
    processed_instances = []
    for instance in instances:
        source = remove_control_tokens_openchat(instance["source"])
        if "instruction" in instance:  # membership test, not .keys()
            instruction = instance["instruction"]
        else:
            # Records without an explicit instruction carry the prompt in
            # "source": treat it as the instruction and leave source empty.
            instruction, source = source, ""
        processed_instances.append(
            {
                "model_id": model,
                "task": task,
                "instruction": instruction.strip(),
                "source": source.strip(),
                "generated": instance["generated_result"],
            }
        )
    return processed_instances
# Transform every result in the model-results directory tree.
def transform_results_folder(input_path, output_path, model_name_pattern):
    """Collect per-task result.json files and write one JSONL file per model.

    Expects the layout ``<input_path>/<task>/<model>/result.json``. Model
    directories whose names do not match *model_name_pattern* (a regex,
    anchored at the start via ``re.match``) are skipped. For each model, all
    tasks' processed instances are concatenated and written to
    ``<output_path>/<model>.jsonl`` (one JSON object per line).
    """
    regex_pattern = re.compile(model_name_pattern)
    tasks = list_directories(input_path)
    # Preview the filtered model list of the first task; guard against an
    # empty input directory (the original tasks[0] raised IndexError).
    preview_models = (
        [
            m
            for m in list_directories(os.path.join(input_path, tasks[0]))
            if regex_pattern.match(m)
        ]
        if tasks
        else []
    )
    model_results = {}
    print(f"TASKS: {tasks}")
    print(f"MODELS: {preview_models}")
    for task in tasks:
        models = [
            model
            for model in list_directories(os.path.join(input_path, task))
            if regex_pattern.match(model)
        ]
        for model in models:
            result_path = os.path.join(input_path, task, model, "result.json")
            model_name = model
            if task in model:
                # Directory names look like "<model>-<task>-<suffix>";
                # keep only the model part for grouping across tasks.
                model_name = model.split(f"-{task}-")[0]
            instances = result_file_process(model_name, task, result_path)
            model_results.setdefault(model_name, []).extend(instances)
        print(f"{task} results processing is over..")
    for k, v in model_results.items():
        print(f"# of instances in {k} is {len(v)}")
    # All output files share the same parent directory; create it once.
    os.makedirs(output_path, exist_ok=True)
    for model, instances in model_results.items():
        path = os.path.join(output_path, f"{model}.jsonl")
        with open(path, "w", encoding="utf8") as f_out:
            for instance in instances:
                json.dump(instance, f_out, ensure_ascii=False)
                f_out.write("\n")
if __name__ == "__main__":
    # CLI entry point: transform raw generation results into per-model JSONL.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-i", "--input_path", type=str, help="path of generated result directory"
    )
    arg_parser.add_argument(
        "-o", "--output_path", type=str, help="path of processed result directory"
    )
    arg_parser.add_argument(
        "-m",
        "--model_name_pattern",
        type=str,
        default="",
        help="model name's pattern for regex",
    )
    parsed = arg_parser.parse_args()
    transform_results_folder(
        parsed.input_path, parsed.output_path, parsed.model_name_pattern
    )