Spaces:

ml-energy
/

leaderboard

Running

File size: 1,526 Bytes

b10121d

import json
import random
import base64

SEED = 68
NUM_SAMPLES = 500


def main() -> None:
    random.seed(SEED)

    with open("full.json") as f:
        data = json.load(f)
        data = random.sample(data, NUM_SAMPLES * 2)

    dataset = []
    data_iter = iter(data)
    while len(dataset) < NUM_SAMPLES:
        sample = next(data_iter)

        # 1. The image should exist.
        # 2. Even index messages in the conversation should be from the human.
        # 3. The first message should contain at most one "<image>" substring, which will be removed.
        # 4. Even index messages will be concatenated to form the prompt.
        image_path = "train2017/" + sample["image"]
        conversation = []
        for conv in sample["conversations"][::2]:
            assert conv["from"] == "human", sample
            conversation.append(conv["value"])
        if (ind := conversation[0].find("<image>")) != -1:
            conversation[0] = conversation[0][:ind] + conversation[0][ind+len("<image>"):]
        
        message = ""
        for conv in conversation:
            assert "<image>" not in conv, sample
            message += conv.strip() + " "
        message = message.strip()

        dataset.append(
            dict(
                image=base64.b64encode(open(image_path, "rb").read()).decode("utf-8"),
                prompt=message,
            ),
        )

    with open(f"llava_conversation_{NUM_SAMPLES}.json", "w") as f:
        json.dump(dataset, f)


if __name__ == "__main__":
    main()