orionweller committed on
Commit
f11b057
·
1 Parent(s): 5250137
.github/workflows/update_leaderboard.yml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
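+ # This workflow installs the Python dependencies, runs refresh.py to rebuild the leaderboard data, commits the regenerated *.pkl / *.json files, and pushes the result to the Hugging Face Space (triggered on a daily schedule and on pushes to main)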
2
+ # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3
+
4
+ name: update_leaderboard_daily
5
+
6
+ on:
7
+ schedule:
8
+ - cron: '30 2 * * *'
9
+ push:
10
+ branches: [ main ]
11
+
12
+ jobs:
13
+ build:
14
+
15
+ runs-on: ubuntu-latest
16
+
17
+ steps:
18
+ - uses: actions/checkout@v3
19
+ with:
20
+ fetch-depth: 0
21
+ lfs: true
22
+ - name: Set up Python 3.9
23
+ uses: actions/setup-python@v4
24
+ with:
25
+ python-version: 3.9
26
+ - name: Install requirements
27
+ run: |
28
+ pip install -r requirements.txt
29
+ - name: Run leaderboard updating code
30
+ run: |
31
+ python refresh.py
32
+ - name: Commit updates
33
+ uses: stefanzweifel/git-auto-commit-action@v4
34
+ with:
35
+ commit_message: Automated Leaderboard Update
36
+ file_pattern: '*.pkl *.json'
37
+ - name: Push to hub
38
+ env:
39
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
40
+ run: git push https://mteb:[email protected]/spaces/mteb/leaderboard-in-progress main
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  *.pyc
2
- model_infos.json
 
 
1
  *.pyc
2
+ model_infos.json
3
+ space
EXTERNAL_MODEL_RESULTS.json CHANGED
The diff for this file is too large to render. See raw diff
 
all_data_tasks.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05e5d083e86af0a00fe10e97c8d55d4c06280fde7396ff172158d69fa216cb50
3
+ size 531551
app.py CHANGED
@@ -1,58 +1,19 @@
1
  from functools import reduce
2
  import json
 
3
  import os
4
  import re
5
 
6
- from datasets import load_dataset
7
  import gradio as gr
8
- from huggingface_hub import hf_hub_download
9
- from huggingface_hub.repocard import metadata_load
10
  import pandas as pd
11
  from tqdm.autonotebook import tqdm
12
 
13
  from utils.model_size import get_model_parameters_memory
14
- from envs import LEADERBOARD_CONFIG, MODEL_META, REPO_ID, RESULTS_REPO, API
 
 
15
 
16
- TASKS_CONFIG = LEADERBOARD_CONFIG["tasks"]
17
- BOARDS_CONFIG = LEADERBOARD_CONFIG["boards"]
18
 
19
- TASKS = list(TASKS_CONFIG.keys())
20
- PRETTY_NAMES = {
21
- "InstructionRetrieval": "Retrieval w/Instructions",
22
- "PairClassification": "Pair Classification",
23
- "BitextMining": "Bitext Mining",
24
- }
25
-
26
- TASK_TO_METRIC = {k: [v["metric"]] for k, v in TASKS_CONFIG.items()}
27
- # Add legacy metric names
28
- TASK_TO_METRIC["STS"].append("cos_sim_spearman")
29
- TASK_TO_METRIC["STS"].append("cosine_spearman")
30
- TASK_TO_METRIC["Summarization"].append("cos_sim_spearman")
31
- TASK_TO_METRIC["Summarization"].append("cosine_spearman")
32
- TASK_TO_METRIC["PairClassification"].append("cos_sim_ap")
33
- TASK_TO_METRIC["PairClassification"].append("cosine_ap")
34
-
35
-
36
- def make_clickable_model(model_name, link=None):
37
- if link is None:
38
- link = "https://huggingface.co/" + model_name
39
- # Remove user from model name
40
- return (
41
- f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
42
- )
43
-
44
- EXTERNAL_MODELS = {k for k,v in MODEL_META["model_meta"].items() if v.get("is_external", False)}
45
- EXTERNAL_MODEL_TO_LINK = {k: v["link"] for k,v in MODEL_META["model_meta"].items() if v.get("link", False)}
46
- EXTERNAL_MODEL_TO_DIM = {k: v["dim"] for k,v in MODEL_META["model_meta"].items() if v.get("dim", False)}
47
- EXTERNAL_MODEL_TO_SEQLEN = {k: v["seq_len"] for k,v in MODEL_META["model_meta"].items() if v.get("seq_len", False)}
48
- EXTERNAL_MODEL_TO_SIZE = {k: v["size"] for k,v in MODEL_META["model_meta"].items() if v.get("size", False)}
49
- PROPRIETARY_MODELS = {k for k,v in MODEL_META["model_meta"].items() if v.get("is_proprietary", False)}
50
- TASK_DESCRIPTIONS = {k: v["task_description"] for k,v in TASKS_CONFIG.items()}
51
- TASK_DESCRIPTIONS["Overall"] = "Overall performance across MTEB tasks."
52
- SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {k for k,v in MODEL_META["model_meta"].items() if v.get("is_sentence_transformers_compatible", False)}
53
- MODELS_TO_SKIP = MODEL_META["models_to_skip"]
54
- CROSS_ENCODERS = MODEL_META["cross_encoders"]
55
- BI_ENCODERS = [k for k, _ in MODEL_META["model_meta"].items() if k not in CROSS_ENCODERS + ["bm25"]]
56
 
57
  PROPRIETARY_MODELS = {
58
  make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))
@@ -72,104 +33,6 @@ BI_ENCODERS = {
72
  }
73
 
74
 
75
- TASK_TO_TASK_TYPE = {task_category: [] for task_category in TASKS}
76
- for board_config in BOARDS_CONFIG.values():
77
- for task_category, task_list in board_config["tasks"].items():
78
- TASK_TO_TASK_TYPE[task_category].extend(task_list)
79
-
80
- def add_lang(examples):
81
- if not(examples["eval_language"]):
82
- examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
83
- else:
84
- examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"] + f' ({examples["eval_language"]})'
85
- return examples
86
-
87
- def norm(names): return set([name.split(" ")[0] for name in names])
88
-
89
- def add_task(examples):
90
- # Could be added to the dataset loading script instead
91
- task_name = examples["mteb_dataset_name"]
92
- task_type = None
93
- for task_category, task_list in TASK_TO_TASK_TYPE.items():
94
- if task_name in norm(task_list):
95
- task_type = task_category
96
- break
97
- if task_type is not None:
98
- examples["mteb_task"] = task_type
99
- else:
100
- print("WARNING: Task not found for dataset", examples["mteb_dataset_name"])
101
- examples["mteb_task"] = "Unknown"
102
- return examples
103
-
104
- def filter_metric_external(x, task, metrics):
105
- # This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
106
- if x['mteb_dataset_name'] in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval']:
107
- return x["mteb_task"] == task and x['metric'] == 'ndcg_at_1'
108
- else:
109
- return x["mteb_task"] == task and x["metric"] in metrics
110
-
111
- def filter_metric_fetched(name, metric, expected_metrics):
112
- # This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
113
- return metric == 'ndcg_at_1' if name in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval'] else metric in expected_metrics
114
-
115
- if os.path.exists("EXTERNAL_MODEL_RESULTS.json"):
116
- with open("EXTERNAL_MODEL_RESULTS.json") as f:
117
- EXTERNAL_MODEL_RESULTS = json.load(f)
118
- # Update with models not contained
119
- models_to_run = []
120
- for model in EXTERNAL_MODELS:
121
- if model not in EXTERNAL_MODEL_RESULTS:
122
- models_to_run.append(model)
123
- EXTERNAL_MODEL_RESULTS[model] = {k: {v[0]: []} for k, v in TASK_TO_METRIC.items()}
124
- else:
125
- EXTERNAL_MODEL_RESULTS = {model: {k: {v[0]: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
126
- models_to_run = EXTERNAL_MODELS
127
-
128
- pbar = tqdm(models_to_run, desc="Fetching external model results")
129
- for model in pbar:
130
- pbar.set_description(f"Fetching external model results for {model!r}")
131
- ds = load_dataset(RESULTS_REPO, model, trust_remote_code=True)
132
- # For local debugging:
133
- #, download_mode='force_redownload', verification_mode="no_checks")
134
- ds = ds.map(add_lang)
135
- ds = ds.map(add_task)
136
- base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
137
-
138
- for task, metrics in TASK_TO_METRIC.items():
139
- ds_dict = ds.filter(lambda x: filter_metric_external(x, task, metrics))["test"].to_dict()
140
- ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
141
- # metrics[0] is the main name for this metric; other names in the list are legacy for backward-compat
142
- EXTERNAL_MODEL_RESULTS[model][task][metrics[0]].append({**base_dict, **ds_dict})
143
-
144
- # Save & cache EXTERNAL_MODEL_RESULTS
145
- with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
146
- json.dump(EXTERNAL_MODEL_RESULTS, f)
147
-
148
- def get_dim_seq_size(model):
149
- filenames = [sib.rfilename for sib in model.siblings]
150
- dim, seq = "", ""
151
- for filename in filenames:
152
- if re.match("\d+_Pooling/config.json", filename):
153
- st_config_path = hf_hub_download(model.modelId, filename=filename)
154
- dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
155
- break
156
- for filename in filenames:
157
- if re.match("\d+_Dense/config.json", filename):
158
- st_config_path = hf_hub_download(model.modelId, filename=filename)
159
- dim = json.load(open(st_config_path)).get("out_features", dim)
160
- if "config.json" in filenames:
161
- config_path = hf_hub_download(model.modelId, filename="config.json")
162
- config = json.load(open(config_path))
163
- if not dim:
164
- dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
165
- seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))
166
-
167
- if dim == "" or seq == "":
168
- raise Exception(f"Could not find dim or seq for model {model.modelId}")
169
-
170
- # Get model file size without downloading. Parameters in million parameters and memory in GB
171
- parameters, memory = get_model_parameters_memory(model)
172
- return dim, seq, parameters, memory
173
 
174
  def make_datasets_clickable(df):
175
  """Does not work"""
@@ -179,235 +42,7 @@ def make_datasets_clickable(df):
179
  columns={f'BornholmBitextMining': '<a target="_blank" style="text-decoration: underline" href="{link}">BornholmBitextMining</a>',})
180
  return df
181
 
182
- def add_rank(df):
183
- cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens"]]
184
- if len(cols_to_rank) == 1:
185
- df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
186
- else:
187
- df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
188
- df.sort_values("Average", ascending=False, inplace=True)
189
- df.insert(0, "Rank", list(range(1, len(df) + 1)))
190
- df = df.round(2)
191
- # Fill NaN after averaging
192
- df.fillna("", inplace=True)
193
- return df
194
 
195
- model_infos_path = "model_infos.json"
196
- MODEL_INFOS = {}
197
- if os.path.exists(model_infos_path):
198
- with open(model_infos_path) as f:
199
- MODEL_INFOS = json.load(f)
200
-
201
- def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=True, task_to_metric=TASK_TO_METRIC, rank=True, refresh=True):
202
- global MODEL_INFOS
203
- api = API
204
- models = api.list_models(filter="mteb")
205
- # Legacy names changes; Also fetch the old results & merge later
206
- if ('MLSUMClusteringP2P (fr)' in datasets):
207
- datasets.append('MLSUMClusteringP2P')
208
- if ('MLSUMClusteringS2S (fr)' in datasets):
209
- datasets.append('MLSUMClusteringS2S')
210
- # Initialize list to models that we cannot fetch metadata from
211
- df_list = []
212
- for model in EXTERNAL_MODEL_RESULTS:
213
- results_list = []
214
- for task in tasks:
215
- # Not all models have InstructionRetrieval, other new tasks
216
- if task not in EXTERNAL_MODEL_RESULTS[model]: continue
217
- results_list += EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task][0]]
218
-
219
- if len(datasets) > 0:
220
- res = {k: v for d in results_list for k, v in d.items() if (k == "Model") or any([x in k for x in datasets])}
221
- elif langs:
222
- # Would be cleaner to rely on an extra language column instead
223
- langs_format = [f"({lang})" for lang in langs]
224
- res = {k: v for d in results_list for k, v in d.items() if any([k.split(" ")[-1] in (k, x) for x in langs_format])}
225
- else:
226
- res = {k: v for d in results_list for k, v in d.items()}
227
- # Model & at least one result
228
- if len(res) > 1:
229
- if add_emb_dim:
230
- res["Model Size (Million Parameters)"] = EXTERNAL_MODEL_TO_SIZE.get(model, "")
231
- res["Memory Usage (GB, fp32)"] = round(res["Model Size (Million Parameters)"] * 1e6 * 4 / 1024**3, 2) if res["Model Size (Million Parameters)"] != "" else ""
232
- res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
233
- res["Max Tokens"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
234
- df_list.append(res)
235
-
236
- for model in models:
237
- if model.modelId in MODELS_TO_SKIP: continue
238
- print("MODEL", model.modelId)
239
- if model.modelId not in MODEL_INFOS or refresh:
240
- readme_path = hf_hub_download(model.modelId, filename="README.md")
241
- meta = metadata_load(readme_path)
242
- MODEL_INFOS[model.modelId] = {
243
- "metadata": meta
244
- }
245
- meta = MODEL_INFOS[model.modelId]["metadata"]
246
- if "model-index" not in meta:
247
- continue
248
- # meta['model-index'][0]["results"] is list of elements like:
249
- # {
250
- # "task": {"type": "Classification"},
251
- # "dataset": {
252
- # "type": "mteb/amazon_massive_intent",
253
- # "name": "MTEB MassiveIntentClassification (nb)",
254
- # "config": "nb",
255
- # "split": "test",
256
- # },
257
- # "metrics": [
258
- # {"type": "accuracy", "value": 39.81506388702084},
259
- # {"type": "f1", "value": 38.809586587791664},
260
- # ],
261
- # },
262
- # Use "get" instead of dict indexing to skip incompat metadata instead of erroring out
263
- if len(datasets) > 0:
264
- task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and any([x in sub_res.get("dataset", {}).get("name", "") for x in datasets])]
265
- elif langs:
266
- task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and (sub_res.get("dataset", {}).get("config", "default") in ("default", *langs))]
267
- else:
268
- task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks)]
269
- try:
270
- out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if filter_metric_fetched(res["dataset"]["name"].replace("MTEB ", ""), score["type"], task_to_metric.get(res["task"]["type"]))][0]} for res in task_results]
271
- except Exception as e:
272
- print("ERROR", model.modelId, e)
273
- continue
274
- out = {k: v for d in out for k, v in d.items()}
275
- out["Model"] = make_clickable_model(model.modelId)
276
- # Model & at least one result
277
- if len(out) > 1:
278
- if add_emb_dim:
279
- # The except clause triggers on gated repos, we can use external metadata for those
280
- try:
281
- if "dim_seq_size" not in MODEL_INFOS[model.modelId] or refresh:
282
- MODEL_INFOS[model.modelId]["dim_seq_size"] = list(get_dim_seq_size(model))
283
- except:
284
- name_without_org = model.modelId.split("/")[-1]
285
- # EXTERNAL_MODEL_TO_SIZE[name_without_org] refers to millions of parameters, so for memory usage
286
- # we multiply by 1e6 to get just the number of parameters, then by 4 to get the number of bytes
287
- # given fp32 precision (4 bytes per float), then divide by 1024**3 to get the number of GB
288
- MODEL_INFOS[model.modelId]["dim_seq_size"] = (
289
- EXTERNAL_MODEL_TO_DIM.get(name_without_org, ""),
290
- EXTERNAL_MODEL_TO_SEQLEN.get(name_without_org, ""),
291
- EXTERNAL_MODEL_TO_SIZE.get(name_without_org, ""),
292
- round(EXTERNAL_MODEL_TO_SIZE[name_without_org] * 1e6 * 4 / 1024**3, 2) if name_without_org in EXTERNAL_MODEL_TO_SIZE else "",
293
- )
294
- out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (Million Parameters)"], out["Memory Usage (GB, fp32)"] = tuple(MODEL_INFOS[model.modelId]["dim_seq_size"])
295
- df_list.append(out)
296
- if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
297
- SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
298
-
299
- # Save & cache MODEL_INFOS
300
- with open("model_infos.json", "w") as f:
301
- json.dump(MODEL_INFOS, f)
302
-
303
- df = pd.DataFrame(df_list)
304
- # If there are any models that are the same, merge them
305
- # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
306
- df = df.groupby("Model", as_index=False).first()
307
- # Put 'Model' column first
308
- cols = sorted(list(df.columns))
309
- base_columns = ["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens"]
310
- if len(datasets) > 0:
311
- # Update legacy column names to be merged with newer ones
312
- # Update 'MLSUMClusteringP2P (fr)' with values from 'MLSUMClusteringP2P'
313
- if ('MLSUMClusteringP2P (fr)' in datasets) and ('MLSUMClusteringP2P' in cols):
314
- df['MLSUMClusteringP2P (fr)'] = df['MLSUMClusteringP2P (fr)'].fillna(df['MLSUMClusteringP2P'])
315
- datasets.remove('MLSUMClusteringP2P')
316
- if ('MLSUMClusteringS2S (fr)' in datasets) and ('MLSUMClusteringS2S' in cols):
317
- df['MLSUMClusteringS2S (fr)'] = df['MLSUMClusteringS2S (fr)'].fillna(df['MLSUMClusteringS2S'])
318
- datasets.remove('MLSUMClusteringS2S')
319
- # Filter invalid columns
320
- cols = [col for col in cols if col in base_columns + datasets]
321
- i = 0
322
- for column in base_columns:
323
- if column in cols:
324
- cols.insert(i, cols.pop(cols.index(column)))
325
- i += 1
326
- df = df[cols]
327
- if rank:
328
- df = add_rank(df)
329
- if fillna:
330
- df.fillna("", inplace=True)
331
- return df
332
-
333
- # Get dict with a task list for each task category
334
- # E.g. {"Classification": ["AmazonMassiveIntentClassification (en)", ...], "PairClassification": ["SprintDuplicateQuestions", ...]}
335
- def get_mteb_average(task_dict: dict, refresh=True):
336
- all_tasks = reduce(lambda x, y: x + y, task_dict.values())
337
- DATA_OVERALL = get_mteb_data(
338
- tasks=list(task_dict.keys()),
339
- datasets=all_tasks,
340
- fillna=False,
341
- add_emb_dim=True,
342
- rank=False,
343
- refresh=refresh
344
- )
345
- # Debugging:
346
- # DATA_OVERALL.to_csv("overall.csv")
347
-
348
- DATA_OVERALL.insert(1, f"Average ({len(all_tasks)} datasets)", DATA_OVERALL[all_tasks].mean(axis=1, skipna=False))
349
- for i, (task_category, task_category_list) in enumerate(task_dict.items()):
350
- DATA_OVERALL.insert(i+2, f"{task_category} Average ({len(task_category_list)} datasets)", DATA_OVERALL[task_category_list].mean(axis=1, skipna=False))
351
- DATA_OVERALL.sort_values(f"Average ({len(all_tasks)} datasets)", ascending=False, inplace=True)
352
- # Start ranking from 1
353
- DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))
354
-
355
- DATA_OVERALL = DATA_OVERALL.round(2)
356
-
357
- DATA_TASKS = {}
358
- for task_category, task_category_list in task_dict.items():
359
- DATA_TASKS[task_category] = add_rank(DATA_OVERALL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + task_category_list])
360
- DATA_TASKS[task_category] = DATA_TASKS[task_category][DATA_TASKS[task_category].iloc[:, 4:].ne("").any(axis=1)]
361
-
362
- # Fill NaN after averaging
363
- DATA_OVERALL.fillna("", inplace=True)
364
-
365
- data_overall_rows = ["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens", f"Average ({len(all_tasks)} datasets)"]
366
- for task_category, task_category_list in task_dict.items():
367
- data_overall_rows.append(f"{task_category} Average ({len(task_category_list)} datasets)")
368
-
369
- DATA_OVERALL = DATA_OVERALL[data_overall_rows]
370
- DATA_OVERALL = DATA_OVERALL[DATA_OVERALL.iloc[:, 5:].ne("").any(axis=1)]
371
-
372
- return DATA_OVERALL, DATA_TASKS
373
-
374
- boards_data = {}
375
- all_data_tasks = []
376
- for board, board_config in BOARDS_CONFIG.items():
377
- boards_data[board] = {
378
- "data_overall": None,
379
- "data_tasks": {}
380
- }
381
- if board_config["has_overall"]:
382
- data_overall, data_tasks = get_mteb_average(board_config["tasks"], refresh=False)
383
- boards_data[board]["data_overall"] = data_overall
384
- boards_data[board]["data_tasks"] = data_tasks
385
- all_data_tasks.extend(data_tasks.values())
386
- else:
387
- for task_category, task_category_list in board_config["tasks"].items():
388
- data_task_category = get_mteb_data(tasks=[task_category], datasets=task_category_list, refresh=False)
389
- data_task_category.drop(columns=["Embedding Dimensions", "Max Tokens"], inplace=True)
390
- boards_data[board]["data_tasks"][task_category] = data_task_category
391
- all_data_tasks.append(data_task_category)
392
-
393
- # Exact, add all non-nan integer values for every dataset
394
- NUM_SCORES = 0
395
- DATASETS = []
396
- MODELS = []
397
- # LANGUAGES = []
398
- for d in all_data_tasks:
399
- # NUM_SCORES += d.iloc[:, 1:].apply(lambda x: sum([1 for y in x if isinstance(y, float) and not np.isnan(y)]), axis=1).sum()
400
- cols_to_ignore = 4 if "Average" in d.columns else 3
401
- # Count number of scores including only non-nan floats & excluding the rank column
402
- NUM_SCORES += d.iloc[:, cols_to_ignore:].notna().sum().sum()
403
- # Exclude rank & model name column (first two); Do not count different language versions as different datasets
404
- DATASETS += [i.split(" ")[0] for i in d.columns[cols_to_ignore:]]
405
- # LANGUAGES += [i.split(" ")[-1] for i in d.columns[cols_to_ignore:]]
406
- MODELS += d["Model"].tolist()
407
-
408
- NUM_DATASETS = len(set(DATASETS))
409
- # NUM_LANGUAGES = len(set(LANGUAGES))
410
- NUM_MODELS = len(set(MODELS))
411
 
412
  # 1. Force headers to wrap
413
  # 2. Force model column (maximum) width
@@ -438,20 +73,49 @@ Each inner tab can have the following keys:
438
  - description: The description of the leaderboard
439
  - credits: [optional] The credits for the leaderboard
440
  - data: The data for the leaderboard
441
- - refresh: The function to refresh the leaderboard
442
  """
443
 
444
- def get_refresh_function(task_category, task_list):
445
- def _refresh():
446
- data_task_category = get_mteb_data(tasks=[task_category], datasets=task_list)
447
- data_task_category.drop(columns=["Embedding Dimensions", "Max Tokens"], inplace=True)
448
- return data_task_category
449
- return _refresh
 
 
 
 
 
450
 
451
 
452
- def get_refresh_overall_function(tasks):
453
- return lambda: get_mteb_average(tasks)[0]
 
 
454
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
 
456
  data = {
457
  "Overall": {"metric": "Various, refer to task tabs", "data": []}
@@ -480,7 +144,7 @@ for board, board_config in BOARDS_CONFIG.items():
480
  "language_long": board_config["language_long"],
481
  "description": f"**Overall MTEB {overall_pretty_name}** 🔮{board_icon}",
482
  "data": boards_data[board]["data_overall"],
483
- "refresh": get_refresh_overall_function(board_config["tasks"]),
484
  "credits": credits,
485
  "metric": metric,
486
  })
@@ -493,7 +157,7 @@ for board, board_config in BOARDS_CONFIG.items():
493
  "language_long": board_config["language_long"],
494
  "description": f"**{task_category} {board_pretty_name}** {task_icon}{board_icon}",
495
  "data": boards_data[board]["data_tasks"][task_category],
496
- "refresh": get_refresh_function(task_category, task_category_list),
497
  "credits": credits,
498
  "metric": metric,
499
  })
@@ -672,9 +336,9 @@ with gr.Blocks(css=css) as block:
672
  full_dataframe = gr.Dataframe(item["data"], datatype=datatype, type="pandas", visible=False)
673
  full_dataframes.append(full_dataframe)
674
 
675
- with gr.Row():
676
- refresh_button = gr.Button("Refresh")
677
- refresh_button.click(item["refresh"], inputs=None, outputs=dataframe, concurrency_limit=20)
678
 
679
  gr.Markdown(f"""
680
  - **Total Datasets**: {NUM_DATASETS}
 
1
  from functools import reduce
2
  import json
3
+ import pickle
4
  import os
5
  import re
6
 
 
7
  import gradio as gr
 
 
8
  import pandas as pd
9
  from tqdm.autonotebook import tqdm
10
 
11
  from utils.model_size import get_model_parameters_memory
12
+ from refresh import TASK_TO_METRIC, TASKS, PRETTY_NAMES, TASKS_CONFIG, BOARDS_CONFIG
13
+ from envs import REPO_ID
14
+ from refresh import PROPRIETARY_MODELS, SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS, CROSS_ENCODERS, BI_ENCODERS, TASK_DESCRIPTIONS, EXTERNAL_MODEL_TO_LINK, make_clickable_model
15
 
 
 
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  PROPRIETARY_MODELS = {
19
  make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))
 
33
  }
34
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  def make_datasets_clickable(df):
38
  """Does not work"""
 
42
  columns={f'BornholmBitextMining': '<a target="_blank" style="text-decoration: underline" href="{link}">BornholmBitextMining</a>',})
43
  return df
44
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  # 1. Force headers to wrap
48
  # 2. Force model column (maximum) width
 
73
  - description: The description of the leaderboard
74
  - credits: [optional] The credits for the leaderboard
75
  - data: The data for the leaderboard
 
76
  """
77
 
78
+ # No more refreshing manually, happens daily
79
+ # def get_refresh_function(task_category, task_list):
80
+ # def _refresh():
81
+ # data_task_category = get_mteb_data(tasks=[task_category], datasets=task_list)
82
+ # data_task_category.drop(columns=["Embedding Dimensions", "Max Tokens"], inplace=True)
83
+ # return data_task_category
84
+ # return _refresh
85
+
86
+
87
+ # def get_refresh_overall_function(tasks):
88
+ # return lambda: get_mteb_average(tasks)[0]
89
 
90
 
91
+ # load in the pre-calculated `all_data_tasks` and `boards_data`
92
+ print(f"Loading pre-calculated data....")
93
+ with open("all_data_tasks.pkl", "rb") as f:
94
+ all_data_tasks = pickle.load(f)
95
 
96
+ with open("boards_data.pkl", "rb") as f:
97
+ boards_data = pickle.load(f)
98
+
99
+ #### Caclulate Metadata
100
+ # Exact, add all non-nan integer values for every dataset
101
+ NUM_SCORES = 0
102
+ DATASETS = []
103
+ MODELS = []
104
+ # LANGUAGES = []
105
+ for d in all_data_tasks:
106
+ # NUM_SCORES += d.iloc[:, 1:].apply(lambda x: sum([1 for y in x if isinstance(y, float) and not np.isnan(y)]), axis=1).sum()
107
+ cols_to_ignore = 4 if "Average" in d.columns else 3
108
+ # Count number of scores including only non-nan floats & excluding the rank column
109
+ NUM_SCORES += d.iloc[:, cols_to_ignore:].notna().sum().sum()
110
+ # Exclude rank & model name column (first two); Do not count different language versions as different datasets
111
+ DATASETS += [i.split(" ")[0] for i in d.columns[cols_to_ignore:]]
112
+ # LANGUAGES += [i.split(" ")[-1] for i in d.columns[cols_to_ignore:]]
113
+ MODELS += d["Model"].tolist()
114
+
115
+
116
+ NUM_DATASETS = len(set(DATASETS))
117
+ # NUM_LANGUAGES = len(set(LANGUAGES))
118
+ NUM_MODELS = len(set(MODELS))
119
 
120
  data = {
121
  "Overall": {"metric": "Various, refer to task tabs", "data": []}
 
144
  "language_long": board_config["language_long"],
145
  "description": f"**Overall MTEB {overall_pretty_name}** 🔮{board_icon}",
146
  "data": boards_data[board]["data_overall"],
147
+ # "refresh": get_refresh_overall_function(board_config["tasks"]),
148
  "credits": credits,
149
  "metric": metric,
150
  })
 
157
  "language_long": board_config["language_long"],
158
  "description": f"**{task_category} {board_pretty_name}** {task_icon}{board_icon}",
159
  "data": boards_data[board]["data_tasks"][task_category],
160
+ # "refresh": get_refresh_function(task_category, task_category_list),
161
  "credits": credits,
162
  "metric": metric,
163
  })
 
336
  full_dataframe = gr.Dataframe(item["data"], datatype=datatype, type="pandas", visible=False)
337
  full_dataframes.append(full_dataframe)
338
 
339
+ # with gr.Row():
340
+ # refresh_button = gr.Button("Refresh")
341
+ # refresh_button.click(item["refresh"], inputs=None, outputs=dataframe, concurrency_limit=20)
342
 
343
  gr.Markdown(f"""
344
  - **Total Datasets**: {NUM_DATASETS}
boards_data.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d838cd56fea0716c0263a4e93f176154071f6877cb4df06f767d423b8f7485b
3
+ size 680288
refresh.py ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import reduce
2
+ import json
3
+ import os
4
+ import pickle
5
+ import re
6
+
7
+ from datasets import load_dataset
8
+ from huggingface_hub import hf_hub_download
9
+ from huggingface_hub.repocard import metadata_load
10
+ import pandas as pd
11
+ from tqdm.autonotebook import tqdm
12
+
13
+ from utils.model_size import get_model_parameters_memory
14
+ from envs import LEADERBOARD_CONFIG, MODEL_META, REPO_ID, RESULTS_REPO, API
15
+
16
+
17
+
18
# Leaderboard configuration: the task categories and the boards assembled from them.
TASKS_CONFIG = LEADERBOARD_CONFIG["tasks"]
BOARDS_CONFIG = LEADERBOARD_CONFIG["boards"]

TASKS = list(TASKS_CONFIG.keys())
# Display names for task categories whose identifier is not already readable.
PRETTY_NAMES = {
    "InstructionRetrieval": "Retrieval w/Instructions",
    "PairClassification": "Pair Classification",
    "BitextMining": "Bitext Mining",
}

# Accepted metric names per task category; index 0 is the main name, the rest are legacy aliases.
TASK_TO_METRIC = {category: [config["metric"]] for category, config in TASKS_CONFIG.items()}
# Add legacy metric names
TASK_TO_METRIC["STS"].extend(["cos_sim_spearman", "cosine_spearman"])
TASK_TO_METRIC["Summarization"].extend(["cos_sim_spearman", "cosine_spearman"])
TASK_TO_METRIC["PairClassification"].extend(["cos_sim_ap", "cosine_ap"])


# Lookups derived from the per-model metadata; entries with a missing or falsy value are left out.
EXTERNAL_MODELS = {
    name for name, meta in MODEL_META["model_meta"].items() if meta.get("is_external", False)
}
EXTERNAL_MODEL_TO_LINK = {
    name: meta["link"] for name, meta in MODEL_META["model_meta"].items() if meta.get("link", False)
}
EXTERNAL_MODEL_TO_DIM = {
    name: meta["dim"] for name, meta in MODEL_META["model_meta"].items() if meta.get("dim", False)
}
EXTERNAL_MODEL_TO_SEQLEN = {
    name: meta["seq_len"] for name, meta in MODEL_META["model_meta"].items() if meta.get("seq_len", False)
}
EXTERNAL_MODEL_TO_SIZE = {
    name: meta["size"] for name, meta in MODEL_META["model_meta"].items() if meta.get("size", False)
}
PROPRIETARY_MODELS = {
    name for name, meta in MODEL_META["model_meta"].items() if meta.get("is_proprietary", False)
}
TASK_DESCRIPTIONS = {category: config["task_description"] for category, config in TASKS_CONFIG.items()}
TASK_DESCRIPTIONS["Overall"] = "Overall performance across MTEB tasks."
SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {
    name
    for name, meta in MODEL_META["model_meta"].items()
    if meta.get("is_sentence_transformers_compatible", False)
}
MODELS_TO_SKIP = MODEL_META["models_to_skip"]
CROSS_ENCODERS = MODEL_META["cross_encoders"]
# Every known model that is neither a cross-encoder nor "bm25".
BI_ENCODERS = [name for name in MODEL_META["model_meta"].keys() if name not in CROSS_ENCODERS + ["bm25"]]


# Task category -> dataset names, accumulated over every board that lists the category.
TASK_TO_TASK_TYPE = {task_category: [] for task_category in TASKS}
for board_config in BOARDS_CONFIG.values():
    for task_category, task_list in board_config["tasks"].items():
        TASK_TO_TASK_TYPE[task_category].extend(task_list)


## Don't cache this because we want to re-compute every time
# model_infos_path = "model_infos.json"
MODEL_INFOS = {}
# if os.path.exists(model_infos_path):
#     with open(model_infos_path) as f:
#         MODEL_INFOS = json.load(f)
65
+
66
def add_rank(df):
    """Return a ranked, rounded copy of ``df`` with a leading "Rank" column.

    Every column that is not one of the model-metadata columns counts as a
    score column. With exactly one score column the rows are sorted by it;
    otherwise an "Average" column (row mean, NaN if any score is missing) is
    inserted just before the score columns and used as the sort key. Rows are
    sorted in descending order and numbered from 1.

    Args:
        df: DataFrame with a "Model" column, optional metadata columns and
            one or more numeric score columns. It is not modified.

    Returns:
        A new DataFrame, rounded to 2 decimals, with NaN replaced by "".
    """
    meta_columns = [
        "Model",
        "Model Size (Million Parameters)",
        "Memory Usage (GB, fp32)",
        "Embedding Dimensions",
        "Max Tokens",
    ]
    cols_to_rank = [col for col in df.columns if col not in meta_columns]

    # Work on a copy: inserting/sorting in place would leak "Rank"/"Average"
    # into the caller's frame and make a second call on it fail.
    df = df.copy()
    if len(cols_to_rank) == 1:
        df = df.sort_values(cols_to_rank[0], ascending=False)
    else:
        df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
        df = df.sort_values("Average", ascending=False)
    df.insert(0, "Rank", list(range(1, len(df) + 1)))
    df = df.round(2)
    # Fill NaN after averaging. Non-inplace on purpose: filling float columns
    # with "" in place is a deprecated silent upcast in recent pandas.
    return df.fillna("")
78
+
79
+
80
def make_clickable_model(model_name, link=None):
    """Return an HTML anchor for ``model_name`` that opens in a new tab.

    Args:
        model_name: Model identifier, optionally prefixed with "<user>/".
        link: Target URL. When None, the model's page under
            https://huggingface.co/ is used.

    Returns:
        The anchor markup; the visible text is the name without the user prefix.
    """
    href = link if link is not None else "https://huggingface.co/" + model_name
    # Remove user from model name
    short_name = model_name.split("/")[-1]
    return f'<a target="_blank" style="text-decoration: underline" href="{href}">{short_name}</a>'
87
+
88
+
89
def add_lang(examples):
    """Add the key "mteb_dataset_name_with_lang" to ``examples`` and return it.

    The new value is the dataset name, followed by " (<eval_language>)" when
    "eval_language" is truthy.
    """
    language = examples["eval_language"]
    suffix = f' ({language})' if language else ""
    examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"] + suffix
    return examples
95
+
96
def norm(names):
    """Return the set of first space-separated tokens of ``names`` (e.g. "Task (en)" -> "Task")."""
    return {name.split(" ")[0] for name in names}
97
+
98
def add_task(examples):
    """Set ``examples["mteb_task"]`` to the task category of the dataset and return ``examples``.

    The category is the first entry of TASK_TO_TASK_TYPE whose normalized
    dataset names contain ``examples["mteb_dataset_name"]``. When none
    matches, a warning is printed and the category is set to "Unknown".
    """
    # Could be added to the dataset loading script instead
    dataset_name = examples["mteb_dataset_name"]
    matched_category = next(
        (category for category, dataset_names in TASK_TO_TASK_TYPE.items() if dataset_name in norm(dataset_names)),
        None,
    )
    if matched_category is None:
        print("WARNING: Task not found for dataset", examples["mteb_dataset_name"])
        examples["mteb_task"] = "Unknown"
    else:
        examples["mteb_task"] = matched_category
    return examples
112
+
113
def filter_metric_external(x, task, metrics):
    """Return whether result row ``x`` belongs to ``task`` and carries an accepted metric.

    Args:
        x: Row with the keys "mteb_dataset_name", "mteb_task" and "metric".
        task: Task category the row must belong to.
        metrics: Accepted metric names for that category.
    """
    # This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy),
    # rather than the ndcg_at_10 that is commonly used for retrieval tasks.
    reports_ndcg_at_1 = x["mteb_dataset_name"] in ("LEMBNeedleRetrieval", "LEMBPasskeyRetrieval")
    if x["mteb_task"] != task:
        return False
    if reports_ndcg_at_1:
        return x["metric"] == "ndcg_at_1"
    return x["metric"] in metrics
119
+
120
def filter_metric_fetched(name, metric, expected_metrics):
    """Return whether ``metric`` is the one to report for the dataset ``name``.

    Args:
        name: Dataset name.
        metric: Metric name found in the model's metadata.
        expected_metrics: Accepted metric names for the dataset's task category.
    """
    # This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy),
    # rather than the ndcg_at_10 that is commonly used for retrieval tasks.
    if name in ("LEMBNeedleRetrieval", "LEMBPasskeyRetrieval"):
        return metric == "ndcg_at_1"
    return metric in expected_metrics
123
+
124
+
125
def _load_hub_json(repo_id, filename):
    """Download ``filename`` from the Hub repo ``repo_id`` and return its parsed JSON content."""
    local_path = hf_hub_download(repo_id, filename=filename)
    # Use a context manager so the file handle is closed deterministically.
    with open(local_path, encoding="utf-8") as f:
        return json.load(f)


def get_dim_seq_size(model):
    """Look up embedding dimension, max sequence length, parameter count and memory for ``model``.

    The embedding dimension is taken, in order of precedence, from the last
    "<n>_Dense/config.json" ("out_features"), the first
    "<n>_Pooling/config.json" ("word_embedding_dimension"), then the
    top-level "config.json". The sequence length is read from "config.json" only.

    Args:
        model: Hub model info object exposing ``siblings`` (each with
            ``rfilename``) and ``modelId``.

    Returns:
        Tuple ``(dim, seq, parameters, memory)``; parameters are in millions
        and memory in GB, as returned by ``get_model_parameters_memory``.

    Raises:
        Exception: If the dimension or the sequence length cannot be determined.
    """
    filenames = [sib.rfilename for sib in model.siblings]
    dim, seq = "", ""
    # Raw strings: "\d" in a normal string literal is an invalid escape sequence.
    for filename in filenames:
        if re.match(r"\d+_Pooling/config.json", filename):
            dim = _load_hub_json(model.modelId, filename).get("word_embedding_dimension", "")
            break
    # A Dense projection changes the output size, so it overrides the pooling dimension.
    for filename in filenames:
        if re.match(r"\d+_Dense/config.json", filename):
            dim = _load_hub_json(model.modelId, filename).get("out_features", dim)
    if "config.json" in filenames:
        config = _load_hub_json(model.modelId, "config.json")
        if not dim:
            dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
        seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))

    if dim == "" or seq == "":
        raise Exception(f"Could not find dim or seq for model {model.modelId}")

    # Get model file size without downloading. Parameters in million parameters and memory in GB
    parameters, memory = get_model_parameters_memory(model)
    return dim, seq, parameters, memory
150
+
151
+
152
def get_external_model_results():
    """Load, complete and re-save the cached results of all external models.

    Results already present in "EXTERNAL_MODEL_RESULTS.json" are reused; only
    models in EXTERNAL_MODELS that are missing from the cache are fetched
    from RESULTS_REPO. To force a re-fetch for a model whose results
    changed, delete its entry from the JSON file.

    Returns:
        dict mapping model name -> task category -> main metric name -> list
        of result rows, each with a "Model" entry plus one score per dataset.
    """
    cache_path = "EXTERNAL_MODEL_RESULTS.json"

    def empty_entry():
        # One empty result list per task category, keyed by that category's main metric name
        return {task: {metric_names[0]: []} for task, metric_names in TASK_TO_METRIC.items()}

    if os.path.exists(cache_path):
        with open(cache_path) as f:
            results = json.load(f)
    else:
        results = {}

    # Update with models not contained
    models_to_run = [model for model in EXTERNAL_MODELS if model not in results]
    for model in models_to_run:
        results[model] = empty_entry()

    pbar = tqdm(models_to_run, desc="Fetching external model results")
    for model in pbar:
        pbar.set_description(f"Fetching external model results for {model!r}")
        ds = load_dataset(RESULTS_REPO, model, trust_remote_code=True, download_mode='force_redownload', verification_mode="no_checks")
        ds = ds.map(add_lang)
        ds = ds.map(add_task)
        fallback_link = f"https://huggingface.co/spaces/{REPO_ID}"
        base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, fallback_link))}

        for task, metrics in TASK_TO_METRIC.items():
            matching = ds.filter(lambda x: filter_metric_external(x, task, metrics))["test"].to_dict()
            scores = {
                name: round(score, 2)
                for name, score in zip(matching["mteb_dataset_name_with_lang"], matching["score"])
            }
            # metrics[0] is the main name for this metric; other names in the list are legacy for backward-compat
            results[model][task][metrics[0]].append({**base_dict, **scores})

    # Save & cache EXTERNAL_MODEL_RESULTS
    with open(cache_path, "w") as f:
        json.dump(results, f, indent=4)

    return results
188
+
189
+
190
def get_mteb_data(tasks=None, langs=None, datasets=None, fillna=True, add_emb_dim=True, task_to_metric=TASK_TO_METRIC, rank=True):
    """Build one results table for the given task categories.

    Rows come from two sources: the cached external results in
    "EXTERNAL_MODEL_RESULTS.json" and the "model-index" metadata in the
    README of every Hub model tagged "mteb". Rows sharing the same "Model"
    value are merged, taking the first non-null value per column.

    Args:
        tasks: Task categories to include. Defaults to ["Clustering"].
        langs: Language codes used for filtering when ``datasets`` is empty.
        datasets: Dataset names to keep; takes precedence over ``langs``.
            The caller's list is never modified.
        fillna: Replace remaining NaN values with "".
        add_emb_dim: Add the model size, memory usage, embedding dimension
            and max tokens columns.
        task_to_metric: Task category -> accepted metric names (main name first).
        rank: Pass the table through ``add_rank``.

    Returns:
        pandas.DataFrame with the metadata columns first, then the score columns.
    """
    global MODEL_INFOS

    tasks = ["Clustering"] if tasks is None else tasks
    langs = [] if langs is None else langs
    # Copy: legacy names are appended/removed below and must not leak into the
    # caller's list (often a list taken straight from the board config).
    datasets = [] if datasets is None else list(datasets)

    with open("EXTERNAL_MODEL_RESULTS.json", "r") as f:
        external_model_results = json.load(f)

    models = list(API.list_models(filter="mteb"))
    # Legacy names changes; Also fetch the old results & merge later
    if 'MLSUMClusteringP2P (fr)' in datasets:
        datasets.append('MLSUMClusteringP2P')
    if 'MLSUMClusteringS2S (fr)' in datasets:
        datasets.append('MLSUMClusteringS2S')

    # One dict per model (external and Hub), later turned into DataFrame rows
    df_list = []
    for model in external_model_results:
        results_list = []
        for task in tasks:
            # Not all models have InstructionRetrieval, other new tasks
            if task not in external_model_results[model]:
                continue
            results_list += external_model_results[model][task][task_to_metric[task][0]]

        if len(datasets) > 0:
            res = {k: v for d in results_list for k, v in d.items() if (k == "Model") or any([x in k for x in datasets])}
        elif langs:
            # Would be cleaner to rely on an extra language column instead
            langs_format = [f"({lang})" for lang in langs]
            # Keeps single-token keys ("Model", datasets without a language suffix) and keys ending in a requested "(lang)"
            res = {k: v for d in results_list for k, v in d.items() if any([k.split(" ")[-1] in (k, x) for x in langs_format])}
        else:
            res = {k: v for d in results_list for k, v in d.items()}
        # Model & at least one result
        if len(res) > 1:
            if add_emb_dim:
                res["Model Size (Million Parameters)"] = EXTERNAL_MODEL_TO_SIZE.get(model, "")
                # Millions of parameters * 4 bytes (fp32) expressed in GB
                res["Memory Usage (GB, fp32)"] = round(res["Model Size (Million Parameters)"] * 1e6 * 4 / 1024**3, 2) if res["Model Size (Million Parameters)"] != "" else ""
                res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
                res["Max Tokens"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
            df_list.append(res)

    pbar = tqdm(models, desc="Fetching model metadata")
    for model in pbar:
        if model.modelId in MODELS_TO_SKIP:
            continue
        pbar.set_description(f"Fetching {model.modelId!r} metadata")
        readme_path = hf_hub_download(model.modelId, filename="README.md")
        meta = metadata_load(readme_path)
        MODEL_INFOS[model.modelId] = {
            "metadata": meta
        }
        if "model-index" not in meta:
            continue
        # meta['model-index'][0]["results"] is list of elements like:
        # {
        #     "task": {"type": "Classification"},
        #     "dataset": {
        #         "type": "mteb/amazon_massive_intent",
        #         "name": "MTEB MassiveIntentClassification (nb)",
        #         "config": "nb",
        #         "split": "test",
        #     },
        #     "metrics": [
        #         {"type": "accuracy", "value": 39.81506388702084},
        #         {"type": "f1", "value": 38.809586587791664},
        #     ],
        # },
        # Use "get" instead of dict indexing to skip incompat metadata instead of erroring out
        all_results = meta["model-index"][0]["results"]
        if len(datasets) > 0:
            task_results = [sub_res for sub_res in all_results if (sub_res.get("task", {}).get("type", "") in tasks) and any([x in sub_res.get("dataset", {}).get("name", "") for x in datasets])]
        elif langs:
            task_results = [sub_res for sub_res in all_results if (sub_res.get("task", {}).get("type", "") in tasks) and (sub_res.get("dataset", {}).get("config", "default") in ("default", *langs))]
        else:
            task_results = [sub_res for sub_res in all_results if (sub_res.get("task", {}).get("type", "") in tasks)]
        try:
            out = {}
            for res in task_results:
                dataset_name = res["dataset"]["name"].replace("MTEB ", "")
                expected_metrics = task_to_metric.get(res["task"]["type"])
                scores = [round(score["value"], 2) for score in res["metrics"] if filter_metric_fetched(dataset_name, score["type"], expected_metrics)]
                # Raises IndexError when no reported metric is accepted; the model is then skipped below
                out[dataset_name] = scores[0]
        except Exception as e:
            print("ERROR", model.modelId, e)
            continue
        out["Model"] = make_clickable_model(model.modelId)
        # Model & at least one result
        if len(out) > 1:
            if add_emb_dim:
                # The except clause triggers on gated repos, we can use external metadata for those
                try:
                    MODEL_INFOS[model.modelId]["dim_seq_size"] = list(get_dim_seq_size(model))
                except Exception:
                    name_without_org = model.modelId.split("/")[-1]
                    # EXTERNAL_MODEL_TO_SIZE[name_without_org] refers to millions of parameters, so for memory usage
                    # we multiply by 1e6 to get just the number of parameters, then by 4 to get the number of bytes
                    # given fp32 precision (4 bytes per float), then divide by 1024**3 to get the number of GB
                    MODEL_INFOS[model.modelId]["dim_seq_size"] = (
                        EXTERNAL_MODEL_TO_DIM.get(name_without_org, ""),
                        EXTERNAL_MODEL_TO_SEQLEN.get(name_without_org, ""),
                        EXTERNAL_MODEL_TO_SIZE.get(name_without_org, ""),
                        round(EXTERNAL_MODEL_TO_SIZE[name_without_org] * 1e6 * 4 / 1024**3, 2) if name_without_org in EXTERNAL_MODEL_TO_SIZE else "",
                    )
                out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (Million Parameters)"], out["Memory Usage (GB, fp32)"] = tuple(MODEL_INFOS[model.modelId]["dim_seq_size"])
            df_list.append(out)
        if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
            SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])

    # # Save & cache MODEL_INFOS
    # with open("model_infos.json", "w") as f:
    #     json.dump(MODEL_INFOS, f)

    df = pd.DataFrame(df_list)
    # If there are any models that are the same, merge them
    # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    cols = sorted(list(df.columns))
    base_columns = ["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens"]
    if len(datasets) > 0:
        # Update legacy column names to be merged with newer ones
        # Update 'MLSUMClusteringP2P (fr)' with values from 'MLSUMClusteringP2P'
        if ('MLSUMClusteringP2P (fr)' in datasets) and ('MLSUMClusteringP2P' in cols):
            df['MLSUMClusteringP2P (fr)'] = df['MLSUMClusteringP2P (fr)'].fillna(df['MLSUMClusteringP2P'])
            datasets.remove('MLSUMClusteringP2P')
        if ('MLSUMClusteringS2S (fr)' in datasets) and ('MLSUMClusteringS2S' in cols):
            df['MLSUMClusteringS2S (fr)'] = df['MLSUMClusteringS2S (fr)'].fillna(df['MLSUMClusteringS2S'])
            datasets.remove('MLSUMClusteringS2S')
        # Filter invalid columns
        cols = [col for col in cols if col in base_columns + datasets]
    # Move the metadata columns to the front, in base_columns order
    i = 0
    for column in base_columns:
        if column in cols:
            cols.insert(i, cols.pop(cols.index(column)))
            i += 1
    df = df[cols]
    if rank:
        df = add_rank(df)
    if fillna:
        # Non-inplace: filling float columns with "" in place is a deprecated silent upcast in recent pandas
        df = df.fillna("")
    return df
324
+
325
+
326
# Get dict with a task list for each task category
# E.g. {"Classification": ["AmazonMassiveIntentClassification (en)", ...], "PairClassification": ["SprintDuplicateQuestions", ...]}
def get_mteb_average(task_dict: dict):
    """Compute the overall table and the per-category tables for one board.

    Args:
        task_dict: Mapping of task category -> list of dataset names.

    Returns:
        Tuple ``(DATA_OVERALL, DATA_TASKS)``. ``DATA_OVERALL`` holds rank,
        model metadata, the average over all datasets and one average per
        category. ``DATA_TASKS`` maps each category to its own ranked table.
        Averages are NaN (shown as "") when any contributing score is missing.
    """
    # Flatten into a fresh list; this never aliases a caller-owned list and
    # also works for an empty or single-category mapping.
    all_tasks = [task for task_list in task_dict.values() for task in task_list]
    DATA_OVERALL = get_mteb_data(
        tasks=list(task_dict.keys()),
        datasets=all_tasks,
        fillna=False,
        add_emb_dim=True,
        rank=False,
    )
    # Debugging:
    # DATA_OVERALL.to_csv("overall.csv")

    overall_average = f"Average ({len(all_tasks)} datasets)"
    DATA_OVERALL.insert(1, overall_average, DATA_OVERALL[all_tasks].mean(axis=1, skipna=False))
    for i, (task_category, task_category_list) in enumerate(task_dict.items()):
        DATA_OVERALL.insert(i+2, f"{task_category} Average ({len(task_category_list)} datasets)", DATA_OVERALL[task_category_list].mean(axis=1, skipna=False))
    DATA_OVERALL = DATA_OVERALL.sort_values(overall_average, ascending=False)
    # Start ranking from 1
    DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))

    DATA_OVERALL = DATA_OVERALL.round(2)

    DATA_TASKS = {}
    for task_category, task_category_list in task_dict.items():
        DATA_TASKS[task_category] = add_rank(DATA_OVERALL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + task_category_list])
        # Columns 0-3 are Rank, Model, size and memory; keep rows with at least one non-empty value after them
        DATA_TASKS[task_category] = DATA_TASKS[task_category][DATA_TASKS[task_category].iloc[:, 4:].ne("").any(axis=1)]

    # Fill NaN after averaging (non-inplace: in-place filling of float columns
    # with "" is a deprecated silent upcast in recent pandas)
    DATA_OVERALL = DATA_OVERALL.fillna("")

    data_overall_rows = ["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens", overall_average]
    for task_category, task_category_list in task_dict.items():
        data_overall_rows.append(f"{task_category} Average ({len(task_category_list)} datasets)")

    DATA_OVERALL = DATA_OVERALL[data_overall_rows]
    # NOTE(review): column 5 is "Max Tokens", so a row with a known max token
    # count survives even when every average is empty - confirm whether 6 was intended.
    DATA_OVERALL = DATA_OVERALL[DATA_OVERALL.iloc[:, 5:].ne("").any(axis=1)]

    return DATA_OVERALL, DATA_TASKS
365
+
366
+
367
def refresh_leaderboard():
    """
    Recompute every leaderboard table.

    First refreshes the cached external model results, then builds the tables of each board in
    BOARDS_CONFIG: boards with "has_overall" get an overall table plus one table per task category,
    the others get one table per task category only (without the embedding dimension and max tokens columns).

    Returns:
        tuple: ``(all_data_tasks, boards_data)`` where ``all_data_tasks`` is the flat list of all
        per-category tables and ``boards_data`` maps each board name to a dict with the keys
        "data_overall" (table or None) and "data_tasks" (category -> table).
    """

    # get external model results and cache them (the return value is not needed here)
    get_external_model_results()

    boards_data = {}
    all_data_tasks = []
    pbar_tasks = tqdm(BOARDS_CONFIG.items(), desc="Fetching leaderboard results for ???", total=len(BOARDS_CONFIG), leave=True)
    for board, board_config in pbar_tasks:
        board_entry = {"data_overall": None, "data_tasks": {}}
        boards_data[board] = board_entry
        pbar_tasks.set_description(f"Fetching leaderboard results for {board!r}")
        pbar_tasks.refresh()

        if not board_config["has_overall"]:
            for task_category, task_category_list in board_config["tasks"].items():
                category_table = get_mteb_data(tasks=[task_category], datasets=task_category_list)
                category_table = category_table.drop(columns=["Embedding Dimensions", "Max Tokens"])
                board_entry["data_tasks"][task_category] = category_table
                all_data_tasks.append(category_table)
            continue

        data_overall, data_tasks = get_mteb_average(board_config["tasks"])
        board_entry["data_overall"] = data_overall
        board_entry["data_tasks"] = data_tasks
        all_data_tasks.extend(data_tasks.values())

    return all_data_tasks, boards_data
402
+
403
+
404
+
405
if __name__ == "__main__":
    print("Refreshing leaderboard statistics...")
    all_data_tasks, boards_data = refresh_leaderboard()

    print("Done calculating, saving...")
    # save them so that the leaderboard can use them, as pickles because they're quite complex objects
    for filename, payload in (("all_data_tasks.pkl", all_data_tasks), ("boards_data.pkl", boards_data)):
        with open(filename, "wb") as f:
            pickle.dump(payload, f)
test.txt DELETED
@@ -1 +0,0 @@
1
- This is a test