Spaces:
Running
Running
File size: 3,627 Bytes
c2ba4d5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
import argparse
import json
import os
import re
from typing import Dict, List
def list_directories(path):
    """Return the names of the immediate subdirectories of *path*."""
    # Filter the raw listing down to entries that are directories.
    return [
        entry
        for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    ]
def remove_control_tokens_openchat(txt: str) -> str:
    """Strip OpenChat-style control tokens from *txt* and trim whitespace.

    NOTE: "[EOS]GPT4 Assistant:" must be removed before the shorter
    "GPT4 Assistant:" so the "[EOS]" prefix does not survive on its own.
    """
    for token in ("GPT4 User:", "[EOS]GPT4 Assistant:", "GPT4 Assistant:"):
        txt = txt.replace(token, "")
    return txt.strip()
def result_file_process(model: str, task: str, path: str) -> List[Dict]:
    """Read the result.json file at *path* and return preprocessed instances.

    Each raw instance's "source" field is stripped of OpenChat control
    tokens.  When an instance carries no explicit "instruction", the
    cleaned source doubles as the instruction and the source becomes
    empty (the inputs were prompt-only in that case).

    Args:
        model: model identifier stored into each output record.
        task: task name stored into each output record.
        path: path of a JSON file containing a list of instance dicts,
            each with at least "source" and "generated_result" keys.

    Returns:
        A list of dicts with keys: model_id, task, instruction, source,
        generated.
    """
    with open(path, encoding="utf8") as f:
        # json.load reads the file object directly; no need for read()+loads.
        instances: List[Dict] = json.load(f)
    processed_instances = []
    for instance in instances:
        source = remove_control_tokens_openchat(instance["source"])
        # Plain membership test; calling .keys() is redundant.
        if "instruction" in instance:
            instruction = instance["instruction"]
        else:
            instruction, source = source, ""
        processed_instances.append(
            {
                "model_id": model,
                "task": task,
                "instruction": instruction.strip(),
                "source": source.strip(),
                "generated": instance["generated_result"],
            }
        )
    return processed_instances
# Transform the per-task model-results directory tree into per-model files.
def transform_results_folder(input_path, output_path, model_name_pattern):
    """Collect result.json files under *input_path* and write one JSONL
    file per model into *output_path*.

    Expected layout: <input_path>/<task>/<model>/result.json.  Only model
    directories whose name matches *model_name_pattern* (via re.match)
    are processed.  A model directory whose name embeds the task
    ("<model>-<task>-...") is collapsed to the bare model name so that
    results from every task merge into one file.
    """
    regex_pattern = re.compile(model_name_pattern)
    tasks = list_directories(input_path)
    # Guard: original code indexed tasks[0] and crashed on an empty tree.
    if not tasks:
        print("TASKS: []")
        return
    models = [
        model
        for model in list_directories(os.path.join(input_path, tasks[0]))
        if regex_pattern.match(model)
    ]
    model_results: Dict[str, List[Dict]] = {}
    print(f"TASKS: {tasks}")
    print(f"MODELS: {models}")
    for task in tasks:
        # Recompute per task: each task directory may hold different models.
        models = [
            model
            for model in list_directories(os.path.join(input_path, task))
            if regex_pattern.match(model)
        ]
        for model in models:
            result_path = os.path.join(input_path, task, model, "result.json")
            model_name = model
            if task in model:
                # "<model>-<task>-..." -> "<model>" so tasks merge per model.
                model_name = model.split(f"-{task}-")[0]
            instances = result_file_process(model_name, task, result_path)
            # setdefault replaces the explicit key-membership if/else.
            model_results.setdefault(model_name, []).extend(instances)
        print(f"{task} results processing is over..")
    for k, v in model_results.items():
        print(f"# of instances in {k} is {len(v)}")
    # The output directory is the same for every model; create it once.
    os.makedirs(output_path, exist_ok=True)
    for model in model_results.keys():
        path = os.path.join(output_path, f"{model}.jsonl")
        with open(path, "w", encoding="utf8") as f_out:
            for instance in model_results[model]:
                json.dump(instance, f_out, ensure_ascii=False)
                f_out.write("\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"-i", "--input_path", type=str, help="path of generated result directory"
)
parser.add_argument(
"-o", "--output_path", type=str, help="path of processed result directory"
)
parser.add_argument(
"-m",
"--model_name_pattern",
type=str,
help="model name's pattern for regex",
default="",
)
args = parser.parse_args()
transform_results_folder(args.input_path, args.output_path, args.model_name_pattern)
|