Spaces:

camel-ai
/

camel-data-explorer

Running

File size: 4,801 Bytes

# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
"""
Everything related to parsing the data JSONs into UI-compatible format.
"""

import glob
import os
import re
from typing import Any, Dict, Optional, Tuple, Union

from tqdm import tqdm

from apps.common.auto_zip import AutoZip

ChatHistory = Dict[str, Any]
ParsedChatHistory = Dict[str, Any]
AllChats = Dict[str, Any]
Datasets = Dict[str, AllChats]

REPO_ROOT = os.path.realpath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "../.."))


def parse(raw_chat: ChatHistory) -> Union[ParsedChatHistory, None]:
    """ Gets the JSON raw chat data, validates it and transforms
        into an easy to work with form.

    Args:
        raw_chat (ChatHistory): In-memory loaded JSON data file.

    Returns:
        Union[ParsedChatHistory, None]: Parsed chat data or None
        if there were parsing errors.
    """

    if "role_1" not in raw_chat:
        return None

    role_1 = raw_chat["role_1"]
    if "_RoleType.ASSISTANT" not in role_1:
        return None
    assistant_role = role_1.split("_RoleType.ASSISTANT")
    if len(assistant_role) < 1:
        return None
    if len(assistant_role[0]) <= 0:
        return None
    assistant_role = assistant_role[0]

    role_2 = raw_chat["role_2"]
    if "_RoleType.USER" not in role_2:
        return None
    user_role = role_2.split("_RoleType.USER")
    if len(user_role) < 1:
        return None
    if len(user_role[0]) <= 0:
        return None
    user_role = user_role[0]

    original_task = raw_chat["original_task"]
    if len(original_task) <= 0:
        return None

    specified_task = raw_chat["specified_task"]
    if len(specified_task) <= 0:
        return None

    messages = dict()
    for key in raw_chat:
        match = re.search("message_(?P<number>[0-9]+)", key)
        if match:
            number = int(match.group("number"))
            messages[number] = raw_chat[key]

    return dict(
        assistant_role=assistant_role,
        user_role=user_role,
        original_task=original_task,
        specified_task=specified_task,
        messages=messages,
    )


def load_zip(zip_path: str) -> AllChats:
    """ Load all JSONs from a zip file and parse them.

    Args:
        path (str): path to the ZIP file.

    Returns:
        AllChats: A dictionary with all possible assistant and
        user roles and the matrix of chats.
    """

    zip_inst = AutoZip(zip_path)
    parsed_list = []
    for raw_chat in tqdm(iter(zip_inst)):
        parsed = parse(raw_chat)
        if parsed is None:
            continue
        parsed_list.append(parsed)

    assistant_roles_set = set()
    user_roles_set = set()
    for parsed in parsed_list:
        assistant_roles_set.add(parsed['assistant_role'])
        user_roles_set.add(parsed['user_role'])
    assistant_roles = list(sorted(assistant_roles_set))
    user_roles = list(sorted(user_roles_set))
    matrix: Dict[Tuple[str, str], Dict[str, Dict]] = dict()
    for parsed in parsed_list:
        key = (parsed['assistant_role'], parsed['user_role'])
        original_task: str = parsed['original_task']
        new_item = {
            k: v
            for k, v in parsed.items()
            if k not in {'assistant_role', 'user_role', 'original_task'}
        }
        if key in matrix:
            matrix[key][original_task] = new_item
        else:
            matrix[key] = {original_task: new_item}

    return dict(
        assistant_roles=assistant_roles,
        user_roles=user_roles,
        matrix=matrix,
    )


def load_datasets(path: Optional[str] = None) -> Datasets:
    """ Load all JSONs from a set of zip files and parse them.

    Args:
        path (str): path to the folder with ZIP datasets.

    Returns:
        Datasets: A dictionary of dataset name and dataset contents.
    """

    if path is None:
        path = os.path.join(REPO_ROOT, "datasets")

    filt = os.path.join(path, "*.zip")
    files = glob.glob(filt)
    datasets = {}
    for file_name in tqdm(files):
        name = os.path.splitext(os.path.basename(file_name))[0]
        datasets[name] = load_zip(file_name)
    return datasets