File size: 4,801 Bytes
b25fb44
 
 
 
 
 
 
 
 
 
 
 
 
a25aa8f
 
 
 
 
 
 
b25fb44
a25aa8f
 
 
b25fb44
 
a25aa8f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b25fb44
 
a25aa8f
b25fb44
 
 
 
 
a25aa8f
 
b25fb44
a25aa8f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
"""
Everything related to parsing the data JSONs into UI-compatible format.
"""

import glob
import os
import re
from typing import Any, Dict, Optional, Tuple, Union

from tqdm import tqdm

from apps.common.auto_zip import AutoZip

ChatHistory = Dict[str, Any]
ParsedChatHistory = Dict[str, Any]
AllChats = Dict[str, Any]
Datasets = Dict[str, AllChats]

REPO_ROOT = os.path.realpath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "../.."))


def parse(raw_chat: ChatHistory) -> Union[ParsedChatHistory, None]:
    """ Gets the JSON raw chat data, validates it and transforms
        into an easy to work with form.

    Args:
        raw_chat (ChatHistory): In-memory loaded JSON data file.

    Returns:
        Union[ParsedChatHistory, None]: Parsed chat data or None
        if there were parsing errors.
    """

    if "role_1" not in raw_chat:
        return None

    role_1 = raw_chat["role_1"]
    if "_RoleType.ASSISTANT" not in role_1:
        return None
    assistant_role = role_1.split("_RoleType.ASSISTANT")
    if len(assistant_role) < 1:
        return None
    if len(assistant_role[0]) <= 0:
        return None
    assistant_role = assistant_role[0]

    role_2 = raw_chat["role_2"]
    if "_RoleType.USER" not in role_2:
        return None
    user_role = role_2.split("_RoleType.USER")
    if len(user_role) < 1:
        return None
    if len(user_role[0]) <= 0:
        return None
    user_role = user_role[0]

    original_task = raw_chat["original_task"]
    if len(original_task) <= 0:
        return None

    specified_task = raw_chat["specified_task"]
    if len(specified_task) <= 0:
        return None

    messages = dict()
    for key in raw_chat:
        match = re.search("message_(?P<number>[0-9]+)", key)
        if match:
            number = int(match.group("number"))
            messages[number] = raw_chat[key]

    return dict(
        assistant_role=assistant_role,
        user_role=user_role,
        original_task=original_task,
        specified_task=specified_task,
        messages=messages,
    )


def load_zip(zip_path: str) -> AllChats:
    """ Load all JSONs from a zip file and parse them.

    Args:
        path (str): path to the ZIP file.

    Returns:
        AllChats: A dictionary with all possible assistant and
        user roles and the matrix of chats.
    """

    zip_inst = AutoZip(zip_path)
    parsed_list = []
    for raw_chat in tqdm(iter(zip_inst)):
        parsed = parse(raw_chat)
        if parsed is None:
            continue
        parsed_list.append(parsed)

    assistant_roles_set = set()
    user_roles_set = set()
    for parsed in parsed_list:
        assistant_roles_set.add(parsed['assistant_role'])
        user_roles_set.add(parsed['user_role'])
    assistant_roles = list(sorted(assistant_roles_set))
    user_roles = list(sorted(user_roles_set))
    matrix: Dict[Tuple[str, str], Dict[str, Dict]] = dict()
    for parsed in parsed_list:
        key = (parsed['assistant_role'], parsed['user_role'])
        original_task: str = parsed['original_task']
        new_item = {
            k: v
            for k, v in parsed.items()
            if k not in {'assistant_role', 'user_role', 'original_task'}
        }
        if key in matrix:
            matrix[key][original_task] = new_item
        else:
            matrix[key] = {original_task: new_item}

    return dict(
        assistant_roles=assistant_roles,
        user_roles=user_roles,
        matrix=matrix,
    )


def load_datasets(path: Optional[str] = None) -> Datasets:
    """ Load all JSONs from a set of zip files and parse them.

    Args:
        path (str): path to the folder with ZIP datasets.

    Returns:
        Datasets: A dictionary of dataset name and dataset contents.
    """

    if path is None:
        path = os.path.join(REPO_ROOT, "datasets")

    filt = os.path.join(path, "*.zip")
    files = glob.glob(filt)
    datasets = {}
    for file_name in tqdm(files):
        name = os.path.splitext(os.path.basename(file_name))[0]
        datasets[name] = load_zip(file_name)
    return datasets