VARCO_Arena / varco_arena /data_convert.py
sonsus's picture
others
c2ba4d5
raw
history blame
3.63 kB
import argparse
import json
import os
import re
from typing import Dict, List
def list_directories(path):
    """Return the names of the directories located directly inside ``path``.

    Only the entry names are returned (not joined with ``path``), in the
    order reported by ``os.listdir``. Entries that are not directories are
    left out.
    """
    subdirectories = []
    # Fetch every entry at the given path, then keep only the directories (folders).
    for entry in os.listdir(path):
        if os.path.isdir(os.path.join(path, entry)):
            subdirectories.append(entry)
    return subdirectories
def remove_control_tokens_openchat(txt: str) -> str:
    """Delete the OpenChat role markers from ``txt`` and trim outer whitespace."""
    # Order matters: the "[EOS]"-prefixed assistant marker has to be removed
    # before the bare assistant marker, or a stray "[EOS]" would be left over.
    control_tokens = ("GPT4 User:", "[EOS]GPT4 Assistant:", "GPT4 Assistant:")
    cleaned = txt
    for token in control_tokens:
        cleaned = cleaned.replace(token, "")
    return cleaned.strip()
# Read the result.json file at `path` and turn it into preprocessed instances.
def result_file_process(model, task, path):
    """Load the JSON result file at ``path`` and normalise each of its records.

    The file is parsed as a JSON list of dicts. Each record must carry a
    ``"source"`` and a ``"generated_result"`` key; ``"instruction"`` is
    optional. OpenChat control tokens are removed from the source text. When
    a record has no ``"instruction"``, the cleaned source is used as the
    instruction and the source becomes an empty string.

    Returns:
        A list of dicts with the keys ``model_id``, ``task``, ``instruction``,
        ``source`` and ``generated``, one per input record.
    """
    with open(path, encoding="utf8") as result_file:
        raw_records: List[Dict] = json.load(result_file)

    normalised = []
    for record in raw_records:
        cleaned_source = remove_control_tokens_openchat(record["source"])
        if "instruction" not in record:
            # No dedicated instruction field: the whole prompt is the instruction.
            instruction, source = cleaned_source, ""
        else:
            instruction, source = record["instruction"], cleaned_source
        normalised.append(
            {
                "model_id": model,
                "task": task,
                "instruction": instruction.strip(),
                "source": source.strip(),
                "generated": record["generated_result"],
            }
        )
    return normalised
# Convert the result values found in the model results directory.
def transform_results_folder(input_path, output_path, model_name_pattern):
    """Convert an ``<input_path>/<task>/<model>/result.json`` tree to per-model JSONL.

    Every sub-directory of ``input_path`` is treated as a task, and every
    sub-directory of a task whose name matches ``model_name_pattern`` is
    treated as a model run holding a ``result.json`` file. Instances of the
    same model are gathered across all tasks and written to
    ``<output_path>/<model_name>.jsonl``, one JSON object per line.

    Tasks and models are visited in sorted order so that the printed summary
    and the order of instances in the output files are reproducible
    (``os.listdir`` order is arbitrary).

    Args:
        input_path: Root directory containing one sub-directory per task.
        output_path: Directory receiving the ``.jsonl`` files. It is created
            if needed; an empty string means the current working directory.
        model_name_pattern: Regular expression applied with ``re.match``
            (anchored at the start of the name) to select model directories.
            The empty pattern selects every directory.
    """
    regex_pattern = re.compile(model_name_pattern)

    def matching_models(task):
        # Model directories of `task` whose name matches the requested pattern.
        task_dir = os.path.join(input_path, task)
        return sorted(
            model for model in list_directories(task_dir) if regex_pattern.match(model)
        )

    tasks = sorted(list_directories(input_path))
    # The MODELS summary reflects the first task only; guard against an input
    # directory without any task sub-directory instead of raising IndexError.
    first_task_models = matching_models(tasks[0]) if tasks else []
    print(f"TASKS: {tasks}")
    print(f"MODELS: {first_task_models}")

    model_results = {}
    for task in tasks:
        for model in matching_models(task):
            result_path = os.path.join(input_path, task, model, "result.json")
            # Directory names may look like "<model>-<task>-<suffix>"; keep only
            # the part before the task marker as the model name.
            model_name = model.split(f"-{task}-")[0] if task in model else model
            instances = result_file_process(model_name, task, result_path)
            model_results.setdefault(model_name, []).extend(instances)
        print(f"{task} results processing is over..")

    for model_name, instances in model_results.items():
        print(f"# of instances in {model_name} is {len(instances)}")

    # Create the output directory once, and only when there is something to
    # write. os.makedirs("") raises, so an empty output_path (current working
    # directory) is skipped.
    if model_results and output_path:
        os.makedirs(output_path, exist_ok=True)
    for model_name, instances in model_results.items():
        path = os.path.join(output_path, f"{model_name}.jsonl")
        with open(path, "w", encoding="utf8") as f_out:
            for instance in instances:
                json.dump(instance, f_out, ensure_ascii=False)
                f_out.write("\n")
if __name__ == "__main__":
    # Command-line entry point: parse the input/output directories and the
    # optional model-name regex, then run the conversion.
    parser = argparse.ArgumentParser()
    # Both paths are mandatory: without them None would be handed to
    # os.path.join and fail with an opaque TypeError deep inside the conversion.
    parser.add_argument(
        "-i",
        "--input_path",
        type=str,
        required=True,
        help="path of generated result directory",
    )
    parser.add_argument(
        "-o",
        "--output_path",
        type=str,
        required=True,
        help="path of processed result directory",
    )
    parser.add_argument(
        "-m",
        "--model_name_pattern",
        type=str,
        help="model name's pattern for regex",
        default="",  # the empty pattern matches every model directory
    )
    args = parser.parse_args()
    transform_results_folder(args.input_path, args.output_path, args.model_name_pattern)