File size: 3,627 Bytes
c2ba4d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import argparse
import json
import os
import re
from typing import Dict, List


def list_directories(path):
    """Return the names of the immediate subdirectories of ``path``.

    Every entry reported by ``os.listdir(path)`` is tested with
    ``os.path.isdir``; only the entries that are directories are kept.
    The returned names are bare (not joined with ``path``) and appear in
    the order ``os.listdir`` produced them.
    """
    subdirs = []
    for entry in os.listdir(path):
        # Join with the parent so the check is made against the real location
        # of the entry rather than the current working directory.
        if os.path.isdir(os.path.join(path, entry)):
            subdirs.append(entry)
    return subdirs


def remove_control_tokens_openchat(txt: str) -> str:
    """Remove the OpenChat role markers from ``txt`` and trim whitespace.

    The markers ``"GPT4 User:"``, ``"[EOS]GPT4 Assistant:"`` and
    ``"GPT4 Assistant:"`` are deleted wherever they occur, then leading and
    trailing whitespace is stripped from the result.
    """
    # Applied in this order: the "[EOS]"-prefixed marker is removed before the
    # bare assistant marker so that the "[EOS]" prefix goes away with it.
    markers = ("GPT4 User:", "[EOS]GPT4 Assistant:", "GPT4 Assistant:")
    cleaned = txt
    for marker in markers:
        cleaned = cleaned.replace(marker, "")
    return cleaned.strip()


# Read the result.json file at `path` and build preprocessed instances from it.
def result_file_process(model, task, path):
    """Load a result JSON file and normalize each record.

    The file at ``path`` is parsed as JSON (UTF-8) and iterated record by
    record. For each record the ``"source"`` field is cleaned with
    ``remove_control_tokens_openchat``. If the record has an
    ``"instruction"`` key, that value is the instruction and the cleaned
    source stays the source; otherwise the cleaned source becomes the
    instruction and the source is the empty string.

    Returns a list of dicts with the keys ``model_id`` (``model``), ``task``
    (``task``), ``instruction`` and ``source`` (both stripped), and
    ``generated`` (the record's ``"generated_result"``).
    """
    with open(path, encoding="utf8") as result_file:
        raw_records: List[Dict] = json.load(result_file)

    def _normalize(record):
        cleaned_source = remove_control_tokens_openchat(record["source"])
        if "instruction" in record:
            instruction_text, source_text = record["instruction"], cleaned_source
        else:
            # No separate instruction: the whole prompt is the instruction.
            instruction_text, source_text = cleaned_source, ""
        return {
            "model_id": model,
            "task": task,
            "instruction": instruction_text.strip(),
            "source": source_text.strip(),
            "generated": record["generated_result"],
        }

    return [_normalize(record) for record in raw_records]


# Convert the result files found under a model-results directory.
def transform_results_folder(input_path, output_path, model_name_pattern):
    """Merge per-task ``result.json`` files into one JSONL file per model.

    Reads ``<input_path>/<task>/<model>/result.json`` for every task
    directory under ``input_path`` and every model directory inside it whose
    name matches ``model_name_pattern`` (``re.match``, i.e. anchored at the
    start of the name). The instances of each file are produced by
    ``result_file_process`` and accumulated per model name, then written to
    ``<output_path>/<model_name>.jsonl`` with one JSON object per line.

    Args:
        input_path: Directory containing one subdirectory per task.
        output_path: Directory for the ``.jsonl`` files; created if missing.
            An empty string writes into the current working directory.
        model_name_pattern: Regular expression used to select model
            directories.

    Progress (task list, model list, per-task completion and per-model
    instance counts) is printed to stdout. When ``input_path`` has no task
    subdirectories, only the empty TASKS/MODELS lines are printed and nothing
    is written.
    """
    regex_pattern = re.compile(model_name_pattern)

    def matching_models(task):
        # Model directories of one task, filtered by the name pattern.
        return [
            model
            for model in list_directories(os.path.join(input_path, task))
            if regex_pattern.match(model)
        ]

    tasks = list_directories(input_path)
    # The MODELS line previews the first task only. Guard against an input
    # directory with no task folders, where tasks[0] would raise IndexError.
    models = matching_models(tasks[0]) if tasks else []

    model_results = {}
    print(f"TASKS: {tasks}")
    print(f"MODELS: {models}")
    for task in tasks:
        for model in matching_models(task):
            result_path = os.path.join(input_path, task, model, "result.json")
            model_name = model
            if task in model:
                # Keep only the part of the directory name that precedes
                # "-<task>-" so results of one model group across tasks.
                model_name = model.split(f"-{task}-")[0]
            instances = result_file_process(model_name, task, result_path)
            model_results.setdefault(model_name, []).extend(instances)

        print(f"{task} results processing is over..")
    for name, instances in model_results.items():
        print(f"# of instances in {name} is {len(instances)}")

    for name, instances in model_results.items():
        path = os.path.join(output_path, f"{name}.jsonl")
        out_dir = os.path.dirname(path)
        # os.makedirs("") raises FileNotFoundError; an empty dirname means the
        # file goes into the current directory, which already exists.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(path, "w", encoding="utf8") as f_out:
            for instance in instances:
                json.dump(instance, f_out, ensure_ascii=False)
                f_out.write("\n")


if __name__ == "__main__":
    # Command-line entry point: parse the paths and the model-name regex, then
    # run the folder transformation.
    parser = argparse.ArgumentParser()
    # Both paths are mandatory. Declaring them as required makes argparse
    # report a usage error instead of passing None on to os.path.join, which
    # would fail later with a TypeError.
    parser.add_argument(
        "-i",
        "--input_path",
        type=str,
        required=True,
        help="path of generated result directory",
    )
    parser.add_argument(
        "-o",
        "--output_path",
        type=str,
        required=True,
        help="path of processed result directory",
    )
    # The empty default pattern matches every model directory name.
    parser.add_argument(
        "-m",
        "--model_name_pattern",
        type=str,
        help="model name's pattern for regex",
        default="",
    )
    args = parser.parse_args()
    transform_results_folder(args.input_path, args.output_path, args.model_name_pattern)