kennymckormick committed on
Commit
3c75092
·
1 Parent(s): 577e18a

update leaderboard

Browse files
Files changed (2) hide show
  1. app.py +144 -0
  2. lb_info.py +233 -0
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ import gradio as gr
3
+ from lb_info import *
4
+
5
# Imported here, next to its only use, so this script section stays
# self-contained.
from types import SimpleNamespace

# Entry whose keys historically defined the benchmark list. When this model is
# absent from the downloaded results, the first entry is used instead.
REFERENCE_MODEL = 'LLaVA-v1.5-7B'


def _filtered_component(table, headers, type_map, model_size, model_type):
    """Build a read-only DataFrame component for the filtered leaderboard.

    Args:
        table: Full leaderboard DataFrame; it is copied, never modified.
        headers: Column names to display, in order.
        type_map: Mapping from column name to the Gradio datatype string.
        model_size: Selected size buckets, passed to ``model_size_flag``.
        model_type: Selected model types, passed to ``model_type_flag``.

    Returns:
        A ``gr.components.DataFrame`` holding only the rows that pass both the
        size filter and the type filter.
    """
    df = cp.deepcopy(table)
    df['flag'] = [model_size_flag(x, model_size) for x in df['Parameters (B)']]
    df = df[df['flag']]
    df.pop('flag')
    # An empty frame is skipped here: there is nothing left to filter.
    if len(df):
        df['flag'] = [model_type_flag(row, model_type) for _, row in df.iterrows()]
        df = df[df['flag']]
        df.pop('flag')

    return gr.components.DataFrame(
        value=df[headers],
        type="pandas",
        datatype=[type_map[x] for x in headers],
        interactive=False,
        visible=True)


with gr.Blocks() as demo:
    struct = load_results()
    timestamp = struct['time']
    EVAL_TIME = format_timestamp(timestamp)
    results = struct['results']
    N_MODEL = len(results)
    if REFERENCE_MODEL in results:
        reference_entry = results[REFERENCE_MODEL]
    else:
        reference_entry = next(iter(results.values()))
    # Every key of an entry except 'META' names one benchmark.
    DATASETS = list(reference_entry)
    DATASETS.remove('META')
    N_DATA = len(DATASETS)
    print(DATASETS)

    gr.Markdown(LEADERBORAD_INTRODUCTION.format(N_MODEL, N_DATA, EVAL_TIME))
    # One plain attribute bag per benchmark tab; it stores that tab's table,
    # widgets and metadata so the change callback can look them up later.
    structs = [SimpleNamespace() for _ in range(N_DATA)]

    with gr.Tabs(elem_classes='tab-buttons') as tabs:
        with gr.TabItem('🏅 OpenVLM Main Leaderboard', elem_id='main', id=0):
            gr.Markdown(LEADERBOARD_MD['MAIN'])
            table, check_box = BUILD_L1_DF(results, MAIN_FIELDS)
            type_map = check_box['type_map']
            checkbox_group = gr.CheckboxGroup(
                choices=check_box['all'],
                value=check_box['required'],
                label="Evaluation Dimension",
                interactive=True,
            )
            headers = check_box['essential'] + checkbox_group.value
            with gr.Row():
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label='Model Size',
                    interactive=True
                )
                model_type = gr.CheckboxGroup(
                    choices=MODEL_TYPE,
                    value=MODEL_TYPE,
                    label='Model Type',
                    interactive=True
                )
            data_component = gr.components.DataFrame(
                value=table[headers],
                type="pandas",
                datatype=[type_map[x] for x in headers],
                interactive=False,
                visible=True)

            def filter_df(fields, model_size, model_type):
                """Re-render the main table for the current checkbox selections."""
                headers = check_box['essential'] + fields
                return _filtered_component(table, headers, type_map, model_size, model_type)

            for cbox in [checkbox_group, model_size, model_type]:
                cbox.change(fn=filter_df, inputs=[checkbox_group, model_size, model_type], outputs=data_component)

        with gr.TabItem('🔍 About', elem_id='about', id=1):
            # Close the HTTP response once the README text has been read.
            with urlopen(VLMEVALKIT_README) as readme:
                gr.Markdown(readme.read().decode())

        def filter_df_l2(dataset_name, fields, model_size, model_type):
            """Re-render one benchmark table; the tab is identified by name."""
            s = structs[DATASETS.index(dataset_name)]
            headers = s.check_box['essential'] + fields
            return _filtered_component(s.table, headers, s.type_map, model_size, model_type)

        for i, dataset in enumerate(DATASETS):
            with gr.TabItem(f'📊 {dataset} Leaderboard', elem_id=dataset, id=i + 2):
                if dataset in LEADERBOARD_MD:
                    gr.Markdown(LEADERBOARD_MD[dataset])

                s = structs[i]
                s.table, s.check_box = BUILD_L2_DF(results, dataset)
                s.type_map = s.check_box['type_map']
                s.checkbox_group = gr.CheckboxGroup(
                    choices=s.check_box['all'],
                    value=s.check_box['required'],
                    label=f"{dataset} CheckBoxes",
                    interactive=True,
                )
                s.headers = s.check_box['essential'] + s.checkbox_group.value
                with gr.Row():
                    s.model_size = gr.CheckboxGroup(
                        choices=MODEL_SIZE,
                        value=MODEL_SIZE,
                        label='Model Size',
                        interactive=True
                    )
                    s.model_type = gr.CheckboxGroup(
                        choices=MODEL_TYPE,
                        value=MODEL_TYPE,
                        label='Model Type',
                        interactive=True
                    )
                s.data_component = gr.components.DataFrame(
                    value=s.table[s.headers],
                    type="pandas",
                    datatype=[s.type_map[x] for x in s.headers],
                    interactive=False,
                    visible=True)
                # Hidden textbox that feeds the dataset name to the callback.
                s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)

                for cbox in [s.checkbox_group, s.model_size, s.model_type]:
                    cbox.change(fn=filter_df_l2, inputs=[s.dataset, s.checkbox_group, s.model_size, s.model_type], outputs=s.data_component)

    with gr.Row():
        with gr.Accordion("Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id='citation-button')

if __name__ == '__main__':
    demo.launch(server_name='0.0.0.0')
lb_info.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+ from collections import defaultdict
4
+ import gradio as gr
5
+ import copy as cp
6
+ import numpy as np
7
+ from .misc import listinstr
8
+
9
+ # CONSTANTS-URL
10
+ URL = "http://opencompass.openxlab.space/utils/OpenVLM.json"
11
+ VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md'
12
+ # CONSTANTS-CITATION
13
+ CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
14
+ title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
15
+ author={OpenCompass Contributors},
16
+ howpublished = {\url{https://github.com/open-compass/opencompass}},
17
+ year={2023}
18
+ }"""
19
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
20
+ # CONSTANTS-TEXT
21
+ LEADERBORAD_INTRODUCTION = """# OpenVLM Leaderboard
22
+ ### Welcome to the OpenVLM Leaderboard! On this leaderboard we share the evaluation results of VLMs obtained by the OpenSource Framework [**VLMEvalKit**](https://github.com/open-compass/VLMEvalKit) 🏆
23
+ ### Currently, OpenVLM Leaderboard covers {} different VLMs (including GPT-4v, Gemini, QwenVLPlus, LLaVA, etc.) and {} different multi-modal benchmarks.
24
+
25
+ This leaderboard was last updated: {}.
26
+ """
27
+ # CONSTANTS-FIELDS
28
+ META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
29
+ MAIN_FIELDS = ['MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench', 'MME', 'SEEDBench_IMG', 'MMVet', 'MMMU_VAL', 'MathVista', 'HallusionBench', 'LLaVABench']
30
+ MMBENCH_FIELDS = ['MMBench_TEST_EN', 'MMBench_DEV_EN', 'MMBench_TEST_CN', 'MMBench_DEV_CN', 'CCBench']
31
+ MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
32
+ MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
33
+
34
+ # The README file for each benchmark
35
+ LEADERBOARD_MD = {}
36
+
37
+ LEADERBOARD_MD['MAIN'] = """
38
+ ## Main Evaluation Results
39
+
40
+ - Avg Score: The average score on all VLM Benchmarks (normalized to 0 - 100, the higher the better).
41
+ - Avg Rank: The average rank on all VLM Benchmarks (the lower the better).
42
+ - The overall evaluation results on 10 VLM benchmarks, sorted by the ascending order of Avg Rank.
43
+ """
44
+
45
+ LEADERBOARD_MD['SEEDBench_IMG'] = """
46
+ ## SEEDBench_IMG Scores (Prefetch / ChatGPT Answer Extraction / Official Leaderboard)
47
+
48
+ - **Overall**: The overall accuracy across all questions with **ChatGPT answer matching**.
49
+ - **Overall (prefetch)**: The accuracy when using exact matching for evaluation.
50
+ - **Overall (official)**: SEEDBench_IMG acc on the official leaderboard (if applicable).
51
+ """
52
+
53
# Markdown shown at the top of the MMVet leaderboard tab.
LEADERBOARD_MD['MMVet'] = """
## MMVet Evaluation Results

- In MMVet Evaluation, we use GPT-4-Turbo (gpt-4-1106-preview) as the judge LLM to assign scores to the VLM outputs. We only perform the evaluation once due to the limited variance among results of multiple evaluation pass originally reported.
- No specific prompt template adopted for **ALL VLMs**.
- We also provide performance on the [**Official Leaderboard**](https://paperswithcode.com/sota/visual-question-answering-on-mm-vet) for models that are applicable. Those results are obtained with GPT-4-0314 evaluator (which has been deprecated for new users).
"""
60
+
61
+ LEADERBOARD_MD['MMMU_VAL'] = """
62
+ ## MMMU Validation Evaluation Results
63
+
64
+ - For MMMU, we support the evaluation of the `dev` (150 samples) and `validation` (900 samples) set. Here we only report the results on the `validation` set.
65
+ - **Answer Inference:**
66
+ - For models with `interleave_generate` interface (accept interleaved images & texts as inputs), all testing samples can be inferred. **`interleave_generate` is adopted for inference.**
67
+ - For models without `interleave_generate` interface, samples with more than one images are skipped (42 out of 1050, directly count as wrong). **`generate` is adopted for inference.**
68
+ - **Evaluation**:
69
+ - MMMU include two types of questions: **multi-choice questions** & **open-ended QA**.
70
+ - For **open-ended QA (62/1050)**, we re-formulate it as multi-choice questions: `{'question': 'QQQ', 'answer': 'AAA'} -> {'question': 'QQQ', 'A': 'AAA', 'B': 'Other Answers', 'answer': 'A'}`, and then adopt the same evaluation paradigm for **multi-choice questions**.
71
+ - For **multi-choice questions (988/1050)**, we use **GPT-3.5-Turbo-0613** for matching prediction with options if heuristic matching does not work.
72
+ """
73
+
74
# Markdown shown at the top of the MathVista leaderboard tab.
LEADERBOARD_MD['MathVista'] = """
## MathVista TestMini Evaluation Results

- We report the evaluation results on MathVista **TestMini**, which include 1000 test samples.
- We adopt `GPT-4-Turbo (1106)` as the answer extractor when we failed to extract the answer with heuristic matching.
- The performance of **Human (High school)** and **Random Choice** are copied from the official leaderboard.
**Category Definitions:** **FQA:** figure QA, **GPS:** geometry problem solving, **MWP:** math word problem, **TQA:** textbook QA, **VQA:** visual QA, **ALG:** algebraic, **ARI:** arithmetic, **GEO:** geometry, **LOG:** logical , **NUM:** numeric, **SCI:** scientific, **STA:** statistical.
"""
82
+
83
+ LEADERBOARD_MD['HallusionBench'] = """
84
+ [**HallusionBench**](https://github.com/tianyi-lab/HallusionBench) is a benchmark to evaluate hallucination of VLMs. It asks a set of visual questions with one original image and one modified image (the answers for a question can be different, considering the image content).
85
+
86
+ **Examples in HallusionBench:**
87
+
88
+ | Original Figure | Modified Figure |
89
+ | ------------------------------------------------------------ | ------------------------------------------------------------ |
90
+ | ![](http://opencompass.openxlab.space/utils/Hallu0.png) | ![](http://opencompass.openxlab.space/utils/Hallu1.png) |
91
+ | **Q1.** Is the right orange circle the same size as the left orange circle? **A1. Yes** | **Q1.** Is the right orange circle the same size as the left orange circle? **A1. No** |
92
+ | **Q2.** Is the right orange circle larger than the left orange circle? **A2. No** | **Q2.** Is the right orange circle larger than the left orange circle? **A2. Yes** |
93
+ | **Q3.** Is the right orange circle smaller than the left orange circle? **A3. No** | **Q3.** Is the right orange circle smaller than the left orange circle? **A3. No** |
94
+
95
+ **Metrics**:
96
+
97
+ >- aAcc: The overall accuracy of **all** atomic questions.
98
+ >
99
+ >- qAcc: The mean accuracy of unique **questions**. One question can be asked multiple times with different figures, we consider VLM correctly solved a unique question only if it succeeds in all <question, figure> pairs for this unique question.
100
+ >- fAcc: The mean accuracy of all **figures**. One figure is associated with multiple questions, we consider VLM correct on a figure only if it succeeds to solve all questions of this figure.
101
+
102
+ **Evaluation Setting**:
103
+
104
+ > 1. **No-visual** Questions (questions asked without the associated figure) in HallusionBench are **skipped** during evaluation.
105
+ > 2. When we failed to extract Yes / No from the VLM prediction, we adopt **GPT-3.5-Turbo-0613** as the answer extractor.
106
+ > 3. We report aAcc, qAcc, and fAcc for all evaluated VLMs.
107
+
108
+ ## HallusionBench Evaluation Results
109
+ """
110
+
111
+ LEADERBOARD_MD['LLaVABench'] = """
112
+ ## LLaVABench Evaluation Results
113
+
114
+ - In LLaVABench Evaluation, we use GPT-4-Turbo (gpt-4-1106-preview) as the judge LLM to assign scores to the VLM outputs. We only perform the evaluation once due to the limited variance among results of multiple evaluation pass originally reported.
115
+ - No specific prompt template adopted for **ALL VLMs**.
116
+ - We also include the official results (obtained by gpt-4-0314) for applicable models.
117
+ """
118
+
119
+ from urllib.request import urlopen
120
+
121
def load_results():
    """Download the leaderboard data from ``URL`` and parse it as JSON.

    Returns:
        The decoded JSON document.

    Raises:
        urllib.error.URLError: If the request fails.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # The context manager closes the HTTP response even if parsing fails.
    with urlopen(URL) as response:
        return json.loads(response.read())
124
+
125
def nth_large(val, vals):
    """Return the 1-based rank of ``val`` among ``vals``, largest first.

    The rank is one plus the number of elements strictly greater than
    ``val``, so tied values share the same rank and an empty ``vals``
    yields 1.
    """
    # A generator avoids building a temporary list just to sum it.
    return sum(1 for v in vals if v > val) + 1
127
+
128
def format_timestamp(timestamp):
    """Render a compact digit string as ``AA.BB.CC DD:EE:FF``.

    The first twelve characters of ``timestamp`` are cut into six
    two-character pieces; the first three are joined with dots and the last
    three with colons. Shorter inputs simply produce empty pieces.
    """
    pieces = [timestamp[start:start + 2] for start in range(0, 12, 2)]
    date_part = '.'.join(pieces[:3])
    time_part = ':'.join(pieces[3:])
    return f'{date_part} {time_part}'
130
+
131
def model_size_flag(sz, FIELDS):
    """Tell whether a model of size ``sz`` falls in a selected size bucket.

    Args:
        sz: Parameter count in billions, or a missing value (``None``/NaN).
        FIELDS: Selected bucket labels, drawn from ``'<10B'``, ``'10B-20B'``,
            ``'20B-40B'``, ``'>40B'`` and ``'Unknown'``.

    Returns:
        ``True`` when ``sz`` is missing and ``'Unknown'`` is selected, or when
        ``sz`` lies inside any selected bucket; ``False`` otherwise.
    """
    if pd.isna(sz):
        return 'Unknown' in FIELDS

    # (label, inclusive lower bound, exclusive upper bound); None = unbounded.
    # Note that the '>40B' bucket includes exactly 40.
    buckets = (
        ('<10B', None, 10),
        ('10B-20B', 10, 20),
        ('20B-40B', 20, 40),
        ('>40B', 40, None),
    )
    for label, low, high in buckets:
        if label not in FIELDS:
            continue
        if (low is None or sz >= low) and (high is None or sz < high):
            return True
    return False
145
+
146
def model_type_flag(line, FIELDS):
    """Tell whether a leaderboard row matches one of the selected model types.

    Args:
        line: Row-like mapping with ``'OpenSource'`` and ``'Verified'`` entries
            holding ``'Yes'`` or ``'No'``.
        FIELDS: Selected type labels, drawn from ``'OpenSource'``, ``'API'``
            and ``'Proprietary'``.

    Returns:
        ``True`` when the row is open source and ``'OpenSource'`` is selected,
        or when it is closed source and its verification state matches a
        selected label (``'API'`` = verified, ``'Proprietary'`` = unverified).
    """
    if 'OpenSource' in FIELDS and line['OpenSource'] == 'Yes':
        return True

    # Both closed-source categories differ only in the expected 'Verified' value.
    closed_source_labels = (('API', 'Yes'), ('Proprietary', 'No'))
    for label, verified in closed_source_labels:
        if label in FIELDS and line['OpenSource'] == 'No' and line['Verified'] == verified:
            return True
    return False
154
+
155
def BUILD_L1_DF(results, fields):
    """Build the main leaderboard table and its checkbox configuration.

    Args:
        results: Mapping from model name to that model's entry. Each entry has
            a ``'META'`` mapping and, for every dataset in ``fields``, a
            mapping containing an ``'Overall'`` score.
        fields: Dataset names to include as score columns.

    Returns:
        A ``(df, check_box)`` tuple. ``df`` has one row per model with the
        ``META_FIELDS`` columns, one column per dataset, ``'Avg Score'`` and
        ``'Avg Rank'``, sorted by ascending ``'Avg Rank'``. ``check_box`` holds
        the ``'essential'``, ``'required'`` and ``'all'`` column lists plus a
        ``'type_map'`` from column name to display datatype (default
        ``'number'``).
    """
    # Gather each dataset's scores once, so ranking does not rebuild the same
    # list for every (model, dataset) pair.
    overall_by_dataset = {
        d: [entry[d]['Overall'] for entry in results.values()] for d in fields
    }

    res = defaultdict(list)
    for item in results.values():
        meta = item['META']
        for k in META_FIELDS:
            if k == 'Parameters (B)':
                # Stored as text such as '7B'; an empty string means unknown.
                param = meta['Parameters']
                res[k].append(float(param.replace('B', '')) if param != '' else None)
            elif k == 'Method':
                name, url = meta['Method']
                res[k].append(f'<a href="{url}">{name}</a>')
            else:
                res[k].append(meta[k])

        scores, ranks = [], []
        for d in fields:
            overall = item[d]['Overall']
            res[d].append(overall)
            # MME is divided by 28 before averaging, presumably to bring it
            # onto the same 0-100 scale as the other benchmarks (TODO confirm).
            scores.append(overall / 28 if d == 'MME' else overall)
            ranks.append(nth_large(overall, overall_by_dataset[d]))
        res['Avg Score'].append(round(np.mean(scores), 1))
        res['Avg Rank'].append(round(np.mean(ranks), 2))

    df = pd.DataFrame(res)
    df = df.sort_values('Avg Rank')

    check_box = {}
    check_box['essential'] = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model']
    check_box['required'] = ['Avg Score', 'Avg Rank']
    check_box['all'] = check_box['required'] + ['OpenSource', 'Verified'] + fields
    # Any column not listed below is displayed as a number.
    type_map = defaultdict(lambda: 'number')
    type_map['Method'] = 'html'
    type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
    check_box['type_map'] = type_map
    return df, check_box
192
+
193
def BUILD_L2_DF(results, dataset):
    """Build the detailed table for one dataset and its checkbox configuration.

    Args:
        results: Mapping from model name to that model's entry. Each entry has
            a ``'META'`` mapping and a mapping of metric name to value under
            ``dataset``.
        dataset: Name of the dataset whose metrics are tabulated.

    Returns:
        A ``(df, check_box)`` tuple. ``df`` has one row per model with the
        ``META_FIELDS`` columns followed by the dataset's metrics, ordered by
        ``'Overall'`` from highest to lowest. ``check_box`` holds the
        ``'essential'``, ``'required'`` and ``'all'`` column lists plus a
        ``'type_map'`` from column name to display datatype (default
        ``'number'``).
    """
    res = defaultdict(list)
    # The first model's entry defines which metric columns exist.
    first_entry = list(results.values())[0]
    fields = list(first_entry[dataset])
    non_overall_fields = [x for x in fields if 'Overall' not in x]
    overall_fields = [x for x in fields if 'Overall' in x]
    if dataset == 'MME':
        # For MME, 'Perception' and 'Cognition' are moved into the overall
        # group, which is the set of columns shown by default.
        non_overall_fields = [x for x in non_overall_fields if not listinstr(['Perception', 'Cognition'], x)]
        overall_fields = overall_fields + ['Perception', 'Cognition']

    for item in results.values():
        meta = item['META']
        for k in META_FIELDS:
            if k == 'Parameters (B)':
                # Stored as text such as '7B'; an empty string means unknown.
                param = meta['Parameters']
                res[k].append(float(param.replace('B', '')) if param != '' else None)
            elif k == 'Method':
                name, url = meta['Method']
                res[k].append(f'<a href="{url}">{name}</a>')
            else:
                res[k].append(meta[k])

        for d in non_overall_fields:
            res[d].append(item[dataset][d])
        for d in overall_fields:
            res[d].append(item[dataset][d])

    df = pd.DataFrame(res)
    # Ascending sort followed by a reversal gives highest 'Overall' first.
    df = df.sort_values('Overall')
    df = df.iloc[::-1]

    check_box = {}
    check_box['essential'] = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model']
    check_box['required'] = overall_fields
    check_box['all'] = non_overall_fields + overall_fields
    # Any column not listed below is displayed as a number.
    type_map = defaultdict(lambda: 'number')
    type_map['Method'] = 'html'
    type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
    check_box['type_map'] = type_map
    return df, check_box