File size: 10,308 Bytes
54f3112
43a97f8
 
 
 
20e4287
 
 
cde8217
c4d90ef
 
cde8217
20e4287
43a97f8
cde8217
54f3112
cde8217
20e4287
cde8217
54f3112
20e4287
 
c4d90ef
9049edd
20e4287
54f3112
cde8217
 
 
54f3112
 
43a97f8
 
 
 
d0b41f3
43a97f8
54f3112
 
1f25992
43a97f8
 
3247730
 
29fa0ea
43a97f8
 
 
 
8eef872
 
43a97f8
8eef872
29fa0ea
2059dd1
 
 
 
 
 
a7b9847
29fa0ea
43a97f8
6d72d0f
 
43a97f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a50df5
c34de5a
54f3112
 
 
 
d093c93
cde8217
d093c93
cde8217
d093c93
8b3d9a9
d093c93
8b3d9a9
d093c93
8b3d9a9
d093c93
8b3d9a9
d093c93
8b3d9a9
54f3112
 
 
 
43a97f8
d0b41f3
54f3112
 
 
 
 
d8549de
54f3112
43a97f8
 
 
 
 
 
 
 
54f3112
 
 
 
 
0f12e4d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# Constants for the SEED-Bench leaderboard.
# Metadata column names. The V2 list is the V1 list without the leading
# "Model Type" entry.
MODEL_INFO_V2 = ["Model", "Language Model", "Evaluation Method"]
MODEL_INFO = ["Model Type", *MODEL_INFO_V2]

# Model-size options ("-" presumably stands for an unspecified size --
# TODO confirm against the caller).
MODEL_SIZE = [
    "<10B",
    ">=10B",
    "-",
]

# Evaluation-method options; TABLE_INTRODUCTION in this file describes "NG"
# as "Not Given".
EVALUATION_METHOD = [
    "PPL",
    "PPL for A/B/C/D",
    "Generate",
    "NG",
]

# Evaluation dimension levels: "L1", "L2", "L3".
DIMENSION_LEVEL = [f"L{level}" for level in (1, 2, 3)]

# NOTE: distinct from the differently spelled LEADERBORAD_VERSION defined
# further down in this file, which holds different values.
LEADERBOARD_VERSION = [f"Version{number}" for number in (1, 2)]
# Aggregate ("Avg. ...") score columns for each table version.
AVG_INFO = ["Avg. All", "Avg. Img", "Avg. Video"]
AVG_V2_INFO = [
    "Avg. Single",
    "Avg. Multi",
    "Avg. Video",
    "Avg. P1",
    "Avg. P2",
    "Avg. P3",
]

# Score columns: the three averages followed by 12 per-dimension columns.
TASK_INFO = AVG_INFO + [
    "Scene Understanding",
    "Instance Identity",
    "Instance Attribute",
    "Instance Location",
    "Instance Counting",
    "Spatial Relation",
    "Instance Interaction",
    "Visual Reasoning",
    "Text Recognition",
    "Action Recognition",
    "Action Prediction",
    "Procedure Understanding",
]

# V2 score columns: the six averages followed by 27 per-dimension columns.
# NOTE(review): "Action Predicion" (sic) differs from the "Action Prediction"
# spelling used in TASK_INFO. It is kept byte-for-byte here -- confirm whether
# consumers (e.g. result CSV headers) rely on this spelling before fixing it.
TASK_V2_INFO = AVG_V2_INFO + [
    "Scene Understanding",
    "Instance Identity",
    "Instance Attribute",
    "Instance Location",
    "Instance Counting",
    "Spatial Relation",
    "Instance Interaction",
    "Visual Reasoning",
    "Text Recognition",
    "Celebrity Recognition",
    "Landmark Recognition",
    "Chart Understanding",
    "Visual Referring Expression",
    "Science Knowledge",
    "Emotion Recognition",
    "Visual Mathematics",
    "Difference Spotting",
    "Meme Comprehension",
    "Global Video Understanding",
    "Action Recognition",
    "Action Predicion",
    "Procedure Understanding",
    "In-Context Captioning",
    "Interleaved Image-Text Analysis",
    "Text-to-Image Generation",
    "Next Image Prediction",
    "Text-Image Creation",
]

# Per-column type tags: a run of "markdown" entries followed by "number"
# entries. The lengths (4 + 15 and 3 + 33) line up with
# MODEL_INFO + TASK_INFO and MODEL_INFO_V2 + TASK_V2_INFO respectively;
# keep them in sync when columns are added or removed.
DATA_TITILE_TYPE = ["markdown"] * 4 + ["number"] * 15
DATA_TITILE_V2_TYPE = ["markdown"] * 3 + ["number"] * 33
# Relative paths of the result CSV files (despite the *_DIR names, each value
# is a file path, not a directory).
_RESULT_DIR = "./file"
CSV_DIR = f"{_RESULT_DIR}/result.csv"
CSV_TASK_DIR = f"{_RESULT_DIR}/result_task.csv"
CSV_V2_DIR = f"{_RESULT_DIR}/result_v2.csv"
CSV_V2_TASK_DIR = f"{_RESULT_DIR}/result_v2_task.csv"

# Full column lists: metadata columns first, then the score columns.
COLUMN_NAMES = [*MODEL_INFO, *TASK_INFO]
COLUMN_V2_NAMES = [*MODEL_INFO_V2, *TASK_V2_INFO]

# 12 integer entries -- presumably the number of questions per evaluation
# dimension, in the order the dimensions appear in TASK_INFO (TODO confirm).
DATA_NUM = [
    3158, 1831, 4649, 978,
    2447, 657, 97, 331,
    85, 1509, 1225, 1023,
]

# 27 integer entries -- presumably the per-dimension counts for the V2 table,
# in TASK_V2_INFO order (TODO confirm). The first eight values equal those of
# DATA_NUM.
DATA_NUM_V2 = [
    3158, 1831, 4649, 978, 2447, 657, 97, 331,
    435, 330, 500, 501, 199, 277, 501, 132, 501, 159,
    1594, 1509, 1225, 1023,
    120, 49, 1008, 81, 79,
]

# Benchmark version labels. NOTE: the identifier's spelling ("LEADERBORAD")
# is part of the module's interface and differs from LEADERBOARD_VERSION,
# which is a separate constant in this file with different values.
LEADERBORAD_VERSION = [f"SEED-Bench-{number}" for number in (1, 2)]

# The two averaging modes; TABLE_INTRODUCTION in this file explains them.
AVERAGE_TYPE = [f"{scope} Average" for scope in ("All", "Task")]

# Markdown introduction text for the leaderboard, with links to both
# SEED-Bench papers. The trophy emoji on the welcome line had been corrupted
# into mojibake (its UTF-8 bytes decoded with the wrong codec); it is restored
# here as the real character.
LEADERBORAD_INTRODUCTION = """# SEED-Bench Leaderboard
    Welcome to the leaderboard of the SEED-Bench! 🏆 

    SEED-Bench-1 consists of 19K multiple-choice questions with accurate human annotations for evaluating Multimodal LLMs, covering 12 evaluation dimensions including both the spatial and temporal understanding.
    Please refer to [SEED-Bench-1 paper](https://arxiv.org/abs/2307.16125) for more details.

    SEED-Bench-2 comprises 24K multiple-choice questions with accurate human annotations, which spans 27 dimensions, including the evaluation of both text and image generation.
    Please refer to [SEED-Bench-2 paper](https://arxiv.org/abs/2311.17092) for more details.
    """


# Markdown instructions for submitting results to the leaderboard.
# Step 2 previously called the form field 'Model Name Revision' while every
# other mention uses 'Revision Model Name'; the label is now consistent.
# NOTE(review): the contact link on the last line reads
# "[email protected]", which looks like an obfuscated placeholder rather than a
# real address -- restore the intended contact address from the upstream file.
SUBMIT_INTRODUCTION = """# Submit on SEED Benchmark Introduction
    1. Obtain JSON file from our [github repository](https://github.com/AILab-CVC/SEED-Bench#leaderboard-submit) after evaluation. For example on SEED-Bench-1, you can obtain InstructBLIP's JSON file as results/results.json after running 
    ```shell
    python eval.py --model instruct_blip --anno_path SEED-Bench.json --output-dir results
    ```
    And for example on SEED-Bench-2, you can obtain InternLM_Xcomposer_VL's JSON file as results/results.json after running 
    ```shell
    python eval.py --model InternLM_Xcomposer_VL --anno_path SEED-Bench_v2_level1_2_3.json --output-dir results --evaluate_level L2 --evaluate_part all --evaluate_version v2
    ```
    2. If you want to update model performance by uploading new results, please ensure 'Revision Model Name' is the same as what's shown in the leaderboard. For example, if you want to modify InstructBLIP's performance, you need to fill in 'InstructBLIP' in 'Revision Model Name'.
    3. Please provide the correct link of your model's repository for each submission.  
    4. For the evaluation dimension, you can choose "All/Image/Video" for SEED-Bench-1 and "L1/L2/L3" for SEED-Bench-2, and the results of dimensions that are not evaluated will be set to zero.
    5. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.


    Note: The format of the submitted json file is a dict for each line. This dict contains two keys: question_id and prediction. Specific examples are as follows: 
    ```shell
    {"question_id": "5_0", "prediction": "B"}
    {"question_id": "3_0", "prediction": "B"}
    ```

    ## Submit Example
    For example on SEED-Bench-1, if you want to upload InstructBLIP's result in the leaderboard, you need to:
    1. Fill in 'InstructBLIP' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
    2. Fill in 'InstructBLIP' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
    3. Select 'ImageLLM' in 'Model Type'.
    4. Fill in 'https://github.com/salesforce/LAVIS' in 'Model Link'.
    5. Fill in '7B' in 'Model size'.
    6. Select 'v1' in 'Benchmark version'.
    7. Select 'Flan-T5-XL' in 'LLM Type'.
    8. Select 'All' in 'Evaluation Dimension for SEED-Bench 1'.
    9. Select 'PPL' in 'Evaluate Method'.
    10. Upload results.json.
    11. Click the 'Submit Eval' button.
    12. Click 'Refresh' to obtain the uploaded leaderboard.

    For example on SEED-Bench-2, if you want to upload InternLM_Xcomposer_VL's result in the leaderboard, you need to:
    1. Fill in 'InternLM_Xcomposer_VL' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
    2. Fill in 'InternLM_Xcomposer_VL' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
    3. Select 'ImageLLM' in 'Model Type'.
    4. Fill in 'https://github.com/InternLM/InternLM-XComposer' in 'Model Link'.
    5. Fill in '7B' in 'Model size'.
    6. Select 'v2' in 'Benchmark version'.
    7. Select 'Other' in 'LLM Type'.
    8. Fill 'InternLM-7B' in 'LLM model(for Other)'
    9. Select 'L2' in 'Evaluation Dimension for SEED-Bench 2'.
    10. Select 'PPL' in 'Evaluate Method'.
    11. Upload results.json.
    12. Click the 'Submit Eval' button.
    13. Click 'Refresh' to obtain the uploaded leaderboard.

    ## If you have any questions, please contact [[email protected]]([email protected]).
"""

# Markdown text explaining the metric, the two averaging modes, and the
# evaluation methods. User-facing typos are corrected: "accurancy" ->
# "accuracy", "each tasks" -> "each task", "please refer [" ->
# "please refer to [", "For detail" / "for detailed" -> "For details" /
# "for details".
TABLE_INTRODUCTION = """In the table below, we summarize each task performance of all the models.
        We use accuracy(%) as the primary evaluation metric for each task. 

        Performance Average Type is All Average means that calculates the overall accuracy by dividing the total number of correct QA answers by the total number of QA questions.

        Performance Average Type is Task Average represents that the overall accuracy using the average accuracy of each dimension.

        For PPL evaluation method, we count the loss for each candidate and select the lowest loss candidate. For details, please refer to [InternLM_Xcomposer_VL_interface](https://github.com/AILab-CVC/SEED-Bench/blob/387a067b6ba99ae5e8231f39ae2d2e453765765c/SEED-Bench-2/model/InternLM_Xcomposer_VL_interface.py#L74).

        For PPL A/B/C/D evaluation method, please refer to [EVAL_SEED.md](https://github.com/QwenLM/Qwen-VL/blob/master/eval_mm/seed_bench/EVAL_SEED.md) for more information.

        For Generate evaluation method, please refer to [Evaluation.md](https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md#seed-bench) for details.

        For the NG evaluation method, we indicate that the evaluation method is Not Given.
        
        If you have any questions, please feel free to contact us.
    """

# Background paragraph about SEED-Bench-1 and SEED-Bench-2. Written as
# adjacent string literals so the trailing spaces at the ends of several lines
# are visible and survive editors that strip trailing whitespace.
LEADERBORAD_INFO = (
    "\n"
    "      Based on powerful Large Language Models (LLMs), recent generative Multimodal Large Language Models (MLLMs) have gained prominence as a pivotal research area, exhibiting remarkable capability for both comprehension and generation. \n"
    "      [SEED-Bench-1](https://arxiv.org/abs/2307.16125) consists of 19K multiple choice questions with accurate human annotations (x6 larger than existing benchmarks), which spans 12 evaluation dimensions including the comprehension of both the image and video modality. \n"
    "      [SEED-Bench-2](https://arxiv.org/abs/2311.17092)  comprises 24K multiple-choice questions with accurate human annotations, which spans 27 dimensions, including the evaluation of both text and image generation.\n"
    "      We develop an advanced pipeline for generating multiple-choice questions that target specific evaluation dimensions, integrating both automatic filtering and manual verification processes. \n"
    "      Multiple-choice questions with groundtruth options derived from human annotation enables an objective and efficient assessment of model performance, eliminating the need for human or GPT intervention during evaluation. \n"
    "      By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights for motivating future research.\n"
)


# Label and BibTeX snippet for citing the two SEED-Bench papers.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

# The two-space indentation before the first entry's closing brace and the
# second entry's "@article" is part of the original text and is kept as-is.
CITATION_BUTTON_TEXT = "\n".join(
    (
        "@article{li2023seed2,",
        "  title={SEED-Bench-2: Benchmarking Multimodal Large Language Models},",
        "  author={Li, Bohao and Ge, Yuying and Ge, Yixiao and Wang, Guangzhi and Wang, Rui and Zhang, Ruimao and Shan, Ying},",
        "  journal={arXiv preprint arXiv:2311.17092},",
        "  year={2023}",
        "  }",
        "",
        "  @article{li2023seed,",
        "  title={SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension},",
        "  author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},",
        "  journal={arXiv preprint arXiv:2307.16125},",
        "  year={2023}",
        "}",
    )
)