"""Convert per-task model generation results into one JSONL file per model.

The input directory is expected to be laid out as
``<input_path>/<task>/<model>/result.json``. Every ``result.json`` is read,
its instances are normalized, and all instances belonging to the same model
are written to ``<output_path>/<model>.jsonl``.
"""
import argparse
import json
import os
import re
from typing import Dict, List


def list_directories(path: str) -> List[str]:
    """Return the names of the sub-directories directly under *path*.

    Plain files are ignored. Names are returned sorted because
    ``os.listdir`` yields entries in arbitrary order, which would otherwise
    make the order of the generated output depend on the filesystem.
    """
    return sorted(
        item for item in os.listdir(path) if os.path.isdir(os.path.join(path, item))
    )


def remove_control_tokens_openchat(txt: str) -> str:
    """Strip the OpenChat-style role markers from *txt* and trim whitespace."""
    return (
        txt.replace("GPT4 User:", "")
        # The "[EOS]"-prefixed marker must be removed before the bare one,
        # otherwise a dangling "[EOS]" would be left behind.
        .replace("[EOS]GPT4 Assistant:", "")
        .replace("GPT4 Assistant:", "")
        .strip()
    )


def result_file_process(model: str, task: str, path: str) -> List[Dict]:
    """Read the result file at *path* and return its normalized instances.

    Args:
        model: Value stored under ``"model_id"`` in every returned instance.
        task: Value stored under ``"task"`` in every returned instance.
        path: Path of a JSON file containing a list of objects. Each object
            must have ``"source"`` and ``"generated_result"`` keys and may
            have an ``"instruction"`` key.

    Returns:
        One dict per input instance with the keys ``model_id``, ``task``,
        ``instruction``, ``source`` and ``generated``.

    Raises:
        KeyError: If an instance lacks ``"source"`` or ``"generated_result"``.
    """
    with open(path, encoding="utf8") as f:
        instances: List[Dict] = json.load(f)

    processed_instances = []
    for instance in instances:
        source = remove_control_tokens_openchat(instance["source"])
        if "instruction" in instance:
            instruction = instance["instruction"]
        else:
            # Without a separate instruction, the cleaned source text is the
            # instruction and the source is left empty.
            instruction, source = source, ""
        processed_instances.append(
            {
                "model_id": model,
                "task": task,
                "instruction": instruction.strip(),
                "source": source.strip(),
                "generated": instance["generated_result"],
            }
        )
    return processed_instances


def transform_results_folder(input_path, output_path, model_name_pattern):
    """Convert every ``result.json`` under *input_path* into per-model JSONL.

    Args:
        input_path: Directory whose sub-directories are tasks; each task
            directory contains one sub-directory per model holding a
            ``result.json`` file.
        output_path: Directory receiving one ``<model>.jsonl`` file per
            model. It is created when missing; an empty string means the
            current working directory.
        model_name_pattern: Regular expression applied with ``re.match``
            (anchored at the start) to model directory names; only matching
            directories are processed. An empty pattern matches everything.
    """
    regex_pattern = re.compile(model_name_pattern)
    tasks = list_directories(input_path)

    # List the matching model directories once per task. An input directory
    # without any task simply yields empty results instead of failing.
    models_by_task: Dict[str, List[str]] = {
        task: [
            model
            for model in list_directories(os.path.join(input_path, task))
            if regex_pattern.match(model)
        ]
        for task in tasks
    }
    # Ordered, de-duplicated union of the model directories over all tasks.
    models = list(
        dict.fromkeys(model for names in models_by_task.values() for model in names)
    )

    print(f"TASKS: {tasks}")
    print(f"MODELS: {models}")

    model_results: Dict[str, List[Dict]] = {}
    for task in tasks:
        for model in models_by_task[task]:
            result_path = os.path.join(input_path, task, model, "result.json")
            model_name = model
            if task in model:
                # Directory names of the form "<model>-<task>-<suffix>" are
                # reduced to "<model>" so results from different tasks are
                # grouped under the same model.
                model_name = model.split(f"-{task}-")[0]
            instances = result_file_process(model_name, task, result_path)
            model_results.setdefault(model_name, []).extend(instances)
        print(f"{task} results processing is over..")

    for name, instances in model_results.items():
        print(f"# of instances in {name} is {len(instances)}")

    for name, instances in model_results.items():
        path = os.path.join(output_path, f"{name}.jsonl")
        out_dir = os.path.dirname(path)
        # os.makedirs("") raises FileNotFoundError, so only create the
        # directory when the target is not the current working directory.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(path, "w", encoding="utf8") as f_out:
            for instance in instances:
                json.dump(instance, f_out, ensure_ascii=False)
                f_out.write("\n")


def main() -> None:
    """Parse the command-line arguments and run the conversion."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i",
        "--input_path",
        type=str,
        required=True,
        help="path of generated result directory",
    )
    parser.add_argument(
        "-o",
        "--output_path",
        type=str,
        required=True,
        help="path of processed result directory",
    )
    parser.add_argument(
        "-m",
        "--model_name_pattern",
        type=str,
        help="model name's pattern for regex",
        default="",
    )
    args = parser.parse_args()
    transform_results_folder(args.input_path, args.output_path, args.model_name_pattern)


if __name__ == "__main__":
    main()