Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Muennighoff
committed on
Commit
·
767d579
1
Parent(s):
f8ed0b8
Multiple LEMB metrics & fix legacy french naming
Browse files
- EXTERNAL_MODEL_RESULTS.json +0 -0
- app.py +38 -17
- config.yaml +2 -2
- model_meta.yaml +8 -0
EXTERNAL_MODEL_RESULTS.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
app.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from functools import
|
2 |
import json
|
3 |
import os
|
4 |
import re
|
@@ -23,7 +23,7 @@ PRETTY_NAMES = {
|
|
23 |
"BitextMining": "Bitext Mining",
|
24 |
}
|
25 |
|
26 |
-
TASK_TO_METRIC = {k:v["metric"] for k,v in TASKS_CONFIG.items()}
|
27 |
|
28 |
def make_clickable_model(model_name, link=None):
|
29 |
if link is None:
|
@@ -93,6 +93,17 @@ def add_task(examples):
|
|
93 |
examples["mteb_task"] = "Unknown"
|
94 |
return examples
|
95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
if os.path.exists("EXTERNAL_MODEL_RESULTS.json"):
|
97 |
with open("EXTERNAL_MODEL_RESULTS.json") as f:
|
98 |
EXTERNAL_MODEL_RESULTS = json.load(f)
|
@@ -115,17 +126,9 @@ for model in pbar:
|
|
115 |
ds = ds.map(add_lang)
|
116 |
ds = ds.map(add_task)
|
117 |
base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
|
118 |
-
|
119 |
-
|
120 |
-
def filter_function(x, task, metric):
|
121 |
-
# This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
|
122 |
-
if x['mteb_dataset_name'] in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval']:
|
123 |
-
return x["mteb_task"] == task and x['metric'] == 'ndcg_at_1'
|
124 |
-
else:
|
125 |
-
return x["mteb_task"] == task and x["metric"] == metric
|
126 |
-
|
127 |
for task, metric in TASK_TO_METRIC.items():
|
128 |
-
ds_dict = ds.filter(lambda x:
|
129 |
ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
|
130 |
EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
|
131 |
|
@@ -190,6 +193,11 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
190 |
global MODEL_INFOS
|
191 |
api = API
|
192 |
models = api.list_models(filter="mteb")
|
|
|
|
|
|
|
|
|
|
|
193 |
# Initialize list to models that we cannot fetch metadata from
|
194 |
df_list = []
|
195 |
for model in EXTERNAL_MODEL_RESULTS:
|
@@ -253,7 +261,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
253 |
# if model.modelId == "w601sxs/b1ade-embed-kd_3":
|
254 |
# import pdb; pdb.set_trace()
|
255 |
try:
|
256 |
-
out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if score["type"]
|
257 |
except:
|
258 |
print("ERROR", model.modelId)
|
259 |
continue
|
@@ -281,7 +289,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
281 |
df_list.append(out)
|
282 |
if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
|
283 |
SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
|
284 |
-
|
285 |
# Save & cache MODEL_INFOS
|
286 |
with open("model_infos.json", "w") as f:
|
287 |
json.dump(MODEL_INFOS, f)
|
@@ -294,7 +302,18 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
294 |
cols = sorted(list(df.columns))
|
295 |
base_columns = ["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens"]
|
296 |
if len(datasets) > 0:
|
297 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
298 |
cols = [col for col in cols if col in base_columns + datasets]
|
299 |
i = 0
|
300 |
for column in base_columns:
|
@@ -447,6 +466,7 @@ for board, board_config in BOARDS_CONFIG.items():
|
|
447 |
if board_icon is None:
|
448 |
board_icon = ""
|
449 |
credits = board_config.get("credits", None)
|
|
|
450 |
|
451 |
if board_config["has_overall"]:
|
452 |
overall_pretty_name = board_pretty_name
|
@@ -459,6 +479,7 @@ for board, board_config in BOARDS_CONFIG.items():
|
|
459 |
"data": boards_data[board]["data_overall"],
|
460 |
"refresh": get_refresh_overall_function(board_config["tasks"]),
|
461 |
"credits": credits,
|
|
|
462 |
})
|
463 |
for task_category, task_category_list in board_config["tasks"].items():
|
464 |
task_icon = TASKS_CONFIG[task_category]['icon']
|
@@ -471,7 +492,7 @@ for board, board_config in BOARDS_CONFIG.items():
|
|
471 |
"data": boards_data[board]["data_tasks"][task_category],
|
472 |
"refresh": get_refresh_function(task_category, task_category_list),
|
473 |
"credits": credits,
|
474 |
-
"metric":
|
475 |
})
|
476 |
|
477 |
dataframes = []
|
@@ -635,7 +656,7 @@ with gr.Blocks(css=css) as block:
|
|
635 |
gr.Markdown(f"""
|
636 |
{item['description']}
|
637 |
|
638 |
-
- **Metric:** {
|
639 |
- **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
|
640 |
{"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
|
641 |
""")
|
|
|
1 |
+
from functools import reduce
|
2 |
import json
|
3 |
import os
|
4 |
import re
|
|
|
23 |
"BitextMining": "Bitext Mining",
|
24 |
}
|
25 |
|
26 |
+
TASK_TO_METRIC = {k: v["metric"] for k, v in TASKS_CONFIG.items()}
|
27 |
|
28 |
def make_clickable_model(model_name, link=None):
|
29 |
if link is None:
|
|
|
93 |
examples["mteb_task"] = "Unknown"
|
94 |
return examples
|
95 |
|
96 |
+
def filter_metric_external(x, task, metric):
|
97 |
+
# This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
|
98 |
+
if x['mteb_dataset_name'] in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval']:
|
99 |
+
return x["mteb_task"] == task and x['metric'] == 'ndcg_at_1'
|
100 |
+
else:
|
101 |
+
return x["mteb_task"] == task and x["metric"] == metric
|
102 |
+
|
103 |
+
def filter_metric_fetched(name, metric, expected_metric):
|
104 |
+
# This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
|
105 |
+
return metric == 'ndcg_at_1' if name in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval'] else metric == expected_metric
|
106 |
+
|
107 |
if os.path.exists("EXTERNAL_MODEL_RESULTS.json"):
|
108 |
with open("EXTERNAL_MODEL_RESULTS.json") as f:
|
109 |
EXTERNAL_MODEL_RESULTS = json.load(f)
|
|
|
126 |
ds = ds.map(add_lang)
|
127 |
ds = ds.map(add_task)
|
128 |
base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
|
129 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
for task, metric in TASK_TO_METRIC.items():
|
131 |
+
ds_dict = ds.filter(lambda x: filter_metric_external(x, task, metric))["test"].to_dict()
|
132 |
ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
|
133 |
EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
|
134 |
|
|
|
193 |
global MODEL_INFOS
|
194 |
api = API
|
195 |
models = api.list_models(filter="mteb")
|
196 |
+
# Legacy name changes: also fetch the old results and merge them later
|
197 |
+
if ('MLSUMClusteringP2P (fr)' in datasets):
|
198 |
+
datasets.append('MLSUMClusteringP2P')
|
199 |
+
if ('MLSUMClusteringS2S (fr)' in datasets):
|
200 |
+
datasets.append('MLSUMClusteringS2S')
|
201 |
# Initialize list to models that we cannot fetch metadata from
|
202 |
df_list = []
|
203 |
for model in EXTERNAL_MODEL_RESULTS:
|
|
|
261 |
# if model.modelId == "w601sxs/b1ade-embed-kd_3":
|
262 |
# import pdb; pdb.set_trace()
|
263 |
try:
|
264 |
+
out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if filter_metric_fetched(res["dataset"]["name"].replace("MTEB ", ""), score["type"], task_to_metric.get(res["task"]["type"]))][0]} for res in task_results]
|
265 |
except:
|
266 |
print("ERROR", model.modelId)
|
267 |
continue
|
|
|
289 |
df_list.append(out)
|
290 |
if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
|
291 |
SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
|
292 |
+
|
293 |
# Save & cache MODEL_INFOS
|
294 |
with open("model_infos.json", "w") as f:
|
295 |
json.dump(MODEL_INFOS, f)
|
|
|
302 |
cols = sorted(list(df.columns))
|
303 |
base_columns = ["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens"]
|
304 |
if len(datasets) > 0:
|
305 |
+
# Update legacy column names to be merged with newer ones
|
306 |
+
# Update 'MLSUMClusteringP2P (fr)' with values from 'MLSUMClusteringP2P'
|
307 |
+
#if ('MLSUMClusteringP2P (fr)' in datasets):
|
308 |
+
# import pdb; pdb.set_trace()
|
309 |
+
if ('MLSUMClusteringP2P (fr)' in datasets) and ('MLSUMClusteringP2P' in cols):
|
310 |
+
#import pdb; pdb.set_trace()
|
311 |
+
df['MLSUMClusteringP2P (fr)'] = df['MLSUMClusteringP2P (fr)'].fillna(df['MLSUMClusteringP2P'])
|
312 |
+
datasets.remove('MLSUMClusteringP2P')
|
313 |
+
if ('MLSUMClusteringS2S (fr)' in datasets) and ('MLSUMClusteringS2S' in cols):
|
314 |
+
df['MLSUMClusteringS2S (fr)'] = df['MLSUMClusteringS2S (fr)'].fillna(df['MLSUMClusteringS2S'])
|
315 |
+
datasets.remove('MLSUMClusteringS2S')
|
316 |
+
# Filter invalid columns
|
317 |
cols = [col for col in cols if col in base_columns + datasets]
|
318 |
i = 0
|
319 |
for column in base_columns:
|
|
|
466 |
if board_icon is None:
|
467 |
board_icon = ""
|
468 |
credits = board_config.get("credits", None)
|
469 |
+
metric = board_config.get("metric", None)
|
470 |
|
471 |
if board_config["has_overall"]:
|
472 |
overall_pretty_name = board_pretty_name
|
|
|
479 |
"data": boards_data[board]["data_overall"],
|
480 |
"refresh": get_refresh_overall_function(board_config["tasks"]),
|
481 |
"credits": credits,
|
482 |
+
"metric": metric,
|
483 |
})
|
484 |
for task_category, task_category_list in board_config["tasks"].items():
|
485 |
task_icon = TASKS_CONFIG[task_category]['icon']
|
|
|
492 |
"data": boards_data[board]["data_tasks"][task_category],
|
493 |
"refresh": get_refresh_function(task_category, task_category_list),
|
494 |
"credits": credits,
|
495 |
+
"metric": metric,
|
496 |
})
|
497 |
|
498 |
dataframes = []
|
|
|
656 |
gr.Markdown(f"""
|
657 |
{item['description']}
|
658 |
|
659 |
+
- **Metric:** {item.get('metric', metric)}
|
660 |
- **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
|
661 |
{"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
|
662 |
""")
|
config.yaml
CHANGED
@@ -224,8 +224,8 @@ boards:
|
|
224 |
- AlloProfClusteringP2P
|
225 |
- AlloProfClusteringS2S
|
226 |
- HALClusteringS2S
|
227 |
-
- MLSUMClusteringP2P
|
228 |
-
- MLSUMClusteringS2S
|
229 |
- MasakhaNEWSClusteringP2P (fra)
|
230 |
- MasakhaNEWSClusteringS2S (fra)
|
231 |
PairClassification:
|
|
|
224 |
- AlloProfClusteringP2P
|
225 |
- AlloProfClusteringS2S
|
226 |
- HALClusteringS2S
|
227 |
+
- MLSUMClusteringP2P (fr)
|
228 |
+
- MLSUMClusteringS2S (fr)
|
229 |
- MasakhaNEWSClusteringP2P (fra)
|
230 |
- MasakhaNEWSClusteringS2S (fra)
|
231 |
PairClassification:
|
model_meta.yaml
CHANGED
@@ -1195,6 +1195,14 @@ model_meta:
|
|
1195 |
is_external: true
|
1196 |
is_proprietary: true
|
1197 |
is_sentence_transformers_compatible: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1198 |
xlm-roberta-base:
|
1199 |
link: https://huggingface.co/xlm-roberta-base
|
1200 |
seq_len: 514
|
|
|
1195 |
is_external: true
|
1196 |
is_proprietary: true
|
1197 |
is_sentence_transformers_compatible: false
|
1198 |
+
voyage-multilingual-2:
|
1199 |
+
link: https://docs.voyageai.com/embeddings/
|
1200 |
+
seq_len: 32000
|
1201 |
+
size: null
|
1202 |
+
dim: 1024
|
1203 |
+
is_external: true
|
1204 |
+
is_proprietary: true
|
1205 |
+
is_sentence_transformers_compatible: false
|
1206 |
xlm-roberta-base:
|
1207 |
link: https://huggingface.co/xlm-roberta-base
|
1208 |
seq_len: 514
|