Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Muennighoff
committed on
Commit
·
4d67578
1
Parent(s):
803802d
Add German clustering; remove models without scores; remove duplicate models; increment dataset count
Browse files
app.py
CHANGED
@@ -48,6 +48,13 @@ TASK_LIST_CLUSTERING = [
|
|
48 |
"TwentyNewsgroupsClustering",
|
49 |
]
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
TASK_LIST_PAIR_CLASSIFICATION = [
|
52 |
"SprintDuplicateQuestions",
|
53 |
"TwitterSemEval2015",
|
@@ -117,6 +124,7 @@ TASK_LIST_EN = TASK_LIST_CLASSIFICATION + TASK_LIST_CLUSTERING + TASK_LIST_PAIR_
|
|
117 |
TASK_TO_METRIC = {
|
118 |
"BitextMining": "f1",
|
119 |
"Clustering": "v_measure",
|
|
|
120 |
"Classification": "accuracy",
|
121 |
"PairClassification": "cos_sim_ap",
|
122 |
"Reranking": "map",
|
@@ -255,6 +263,9 @@ MODELS_TO_SKIP = {
|
|
255 |
"radames/e5-large", # Duplicate
|
256 |
"gentlebowl/instructor-large-safetensors", # Duplicate
|
257 |
"Consensus/instructor-base", # Duplicate
|
|
|
|
|
|
|
258 |
}
|
259 |
|
260 |
|
@@ -271,7 +282,7 @@ def add_task(examples):
|
|
271 |
# Could be added to the dataset loading script instead
|
272 |
if examples["mteb_dataset_name"] in TASK_LIST_CLASSIFICATION_NORM:
|
273 |
examples["mteb_task"] = "Classification"
|
274 |
-
elif examples["mteb_dataset_name"] in TASK_LIST_CLUSTERING:
|
275 |
examples["mteb_task"] = "Clustering"
|
276 |
elif examples["mteb_dataset_name"] in TASK_LIST_PAIR_CLASSIFICATION:
|
277 |
examples["mteb_task"] = "PairClassification"
|
@@ -288,7 +299,7 @@ def add_task(examples):
|
|
288 |
return examples
|
289 |
|
290 |
for model in EXTERNAL_MODELS:
|
291 |
-
ds = load_dataset("mteb/results", model
|
292 |
# For local debugging:
|
293 |
#, download_mode='force_redownload', verification_mode="no_checks")
|
294 |
ds = ds.map(add_lang)
|
@@ -321,14 +332,16 @@ def get_emb_dim(model):
|
|
321 |
return dim
|
322 |
|
323 |
|
324 |
-
def get_mteb_data(tasks=["Clustering"], langs=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
|
325 |
api = HfApi()
|
326 |
models = api.list_models(filter="mteb")
|
327 |
# Initialize list to models that we cannot fetch metadata from
|
328 |
df_list = []
|
329 |
for model in EXTERNAL_MODEL_RESULTS:
|
330 |
results_list = [res for task in tasks for res in EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task]]]
|
331 |
-
if
|
|
|
|
|
332 |
# Would be cleaner to rely on an extra language column instead
|
333 |
langs_format = [f"({lang})" for lang in langs]
|
334 |
res = {k: v for d in results_list for k, v in d.items() if any([k.split(" ")[-1] in (k, x) for x in langs_format])}
|
@@ -359,16 +372,20 @@ def get_mteb_data(tasks=["Clustering"], langs=[], fillna=True, add_emb_dim=False
|
|
359 |
# ],
|
360 |
# },
|
361 |
# Use "get" instead of dict indexing to skip incompat metadata instead of erroring out
|
362 |
-
if
|
|
|
|
|
363 |
task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and (sub_res.get("dataset", {}).get("config", "default") in ("default", *langs))]
|
364 |
else:
|
365 |
task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks)]
|
366 |
out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if score["type"] == task_to_metric.get(res["task"]["type"])][0]} for res in task_results]
|
367 |
out = {k: v for d in out for k, v in d.items()}
|
368 |
out["Model"] = make_clickable_model(model.modelId)
|
369 |
-
|
370 |
-
|
371 |
-
|
|
|
|
|
372 |
df = pd.DataFrame(df_list)
|
373 |
# Put 'Model' column first
|
374 |
cols = sorted(list(df.columns))
|
@@ -437,7 +454,7 @@ with block:
|
|
437 |
gr.Markdown(f"""
|
438 |
Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗
|
439 |
|
440 |
-
- **Total Datasets**:
|
441 |
- **Total Languages**: 112
|
442 |
- **Total Scores**: >{NUM_SCORES}
|
443 |
- **Total Models**: {len(DATA_OVERALL)}
|
@@ -531,27 +548,53 @@ with block:
|
|
531 |
outputs=data_classification,
|
532 |
)
|
533 |
with gr.TabItem("Clustering"):
|
534 |
-
with gr.
|
535 |
-
gr.
|
536 |
-
|
537 |
-
|
538 |
-
|
539 |
-
|
540 |
-
|
541 |
-
|
542 |
-
|
543 |
-
|
544 |
-
|
545 |
-
|
546 |
-
|
547 |
-
|
548 |
-
|
549 |
-
|
550 |
-
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
555 |
with gr.TabItem("Pair Classification"):
|
556 |
with gr.Row():
|
557 |
gr.Markdown("""
|
@@ -681,9 +724,7 @@ with block:
|
|
681 |
)
|
682 |
gr.Markdown(r"""
|
683 |
|
684 |
-
Made with ❤️ for NLP
|
685 |
-
|
686 |
-
If this work is useful to you, please consider citing:
|
687 |
|
688 |
```bibtex
|
689 |
@article{muennighoff2022mteb,
|
@@ -702,7 +743,8 @@ with block:
|
|
702 |
block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
|
703 |
block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
|
704 |
block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
|
705 |
-
block.load(get_mteb_data, inputs=[task_clustering], outputs=data_clustering)
|
|
|
706 |
block.load(get_mteb_data, inputs=[task_pair_classification], outputs=data_pair_classification)
|
707 |
block.load(get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval)
|
708 |
block.load(get_mteb_data, inputs=[task_reranking], outputs=data_reranking)
|
|
|
48 |
"TwentyNewsgroupsClustering",
|
49 |
]
|
50 |
|
51 |
+
TASK_LIST_CLUSTERING_DE = [
|
52 |
+
"BlurbsClusteringP2P",
|
53 |
+
"BlurbsClusteringS2S",
|
54 |
+
"TenKGnadClusteringP2P",
|
55 |
+
"TenKGnadClusteringS2S",
|
56 |
+
]
|
57 |
+
|
58 |
TASK_LIST_PAIR_CLASSIFICATION = [
|
59 |
"SprintDuplicateQuestions",
|
60 |
"TwitterSemEval2015",
|
|
|
124 |
TASK_TO_METRIC = {
|
125 |
"BitextMining": "f1",
|
126 |
"Clustering": "v_measure",
|
127 |
+
"Clustering (DE)": "v_measure",
|
128 |
"Classification": "accuracy",
|
129 |
"PairClassification": "cos_sim_ap",
|
130 |
"Reranking": "map",
|
|
|
263 |
"radames/e5-large", # Duplicate
|
264 |
"gentlebowl/instructor-large-safetensors", # Duplicate
|
265 |
"Consensus/instructor-base", # Duplicate
|
266 |
+
"GovCompete/instructor-xl", # Duplicate
|
267 |
+
"GovCompete/e5-large-v2", # Duplicate
|
268 |
+
"t12e/instructor-base", # Duplicate
|
269 |
}
|
270 |
|
271 |
|
|
|
282 |
# Could be added to the dataset loading script instead
|
283 |
if examples["mteb_dataset_name"] in TASK_LIST_CLASSIFICATION_NORM:
|
284 |
examples["mteb_task"] = "Classification"
|
285 |
+
elif examples["mteb_dataset_name"] in TASK_LIST_CLUSTERING + TASK_LIST_CLUSTERING_DE:
|
286 |
examples["mteb_task"] = "Clustering"
|
287 |
elif examples["mteb_dataset_name"] in TASK_LIST_PAIR_CLASSIFICATION:
|
288 |
examples["mteb_task"] = "PairClassification"
|
|
|
299 |
return examples
|
300 |
|
301 |
for model in EXTERNAL_MODELS:
|
302 |
+
ds = load_dataset("mteb/results", model)#, download_mode='force_redownload', verification_mode="no_checks")
|
303 |
# For local debugging:
|
304 |
#, download_mode='force_redownload', verification_mode="no_checks")
|
305 |
ds = ds.map(add_lang)
|
|
|
332 |
return dim
|
333 |
|
334 |
|
335 |
+
def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
|
336 |
api = HfApi()
|
337 |
models = api.list_models(filter="mteb")
|
338 |
# Initialize list to models that we cannot fetch metadata from
|
339 |
df_list = []
|
340 |
for model in EXTERNAL_MODEL_RESULTS:
|
341 |
results_list = [res for task in tasks for res in EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task]]]
|
342 |
+
if len(datasets) > 0:
|
343 |
+
res = {k: v for d in results_list for k, v in d.items() if (k == "Model") or any([x in k for x in datasets])}
|
344 |
+
elif langs:
|
345 |
# Would be cleaner to rely on an extra language column instead
|
346 |
langs_format = [f"({lang})" for lang in langs]
|
347 |
res = {k: v for d in results_list for k, v in d.items() if any([k.split(" ")[-1] in (k, x) for x in langs_format])}
|
|
|
372 |
# ],
|
373 |
# },
|
374 |
# Use "get" instead of dict indexing to skip incompat metadata instead of erroring out
|
375 |
+
if len(datasets) > 0:
|
376 |
+
task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and any([x in sub_res.get("dataset", {}).get("name", "") for x in datasets])]
|
377 |
+
elif langs:
|
378 |
task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and (sub_res.get("dataset", {}).get("config", "default") in ("default", *langs))]
|
379 |
else:
|
380 |
task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks)]
|
381 |
out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if score["type"] == task_to_metric.get(res["task"]["type"])][0]} for res in task_results]
|
382 |
out = {k: v for d in out for k, v in d.items()}
|
383 |
out["Model"] = make_clickable_model(model.modelId)
|
384 |
+
# Model & at least one result
|
385 |
+
if len(out) > 1:
|
386 |
+
if add_emb_dim:
|
387 |
+
out["Embedding Dimensions"] = get_emb_dim(model)
|
388 |
+
df_list.append(out)
|
389 |
df = pd.DataFrame(df_list)
|
390 |
# Put 'Model' column first
|
391 |
cols = sorted(list(df.columns))
|
|
|
454 |
gr.Markdown(f"""
|
455 |
Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗
|
456 |
|
457 |
+
- **Total Datasets**: 62
|
458 |
- **Total Languages**: 112
|
459 |
- **Total Scores**: >{NUM_SCORES}
|
460 |
- **Total Models**: {len(DATA_OVERALL)}
|
|
|
548 |
outputs=data_classification,
|
549 |
)
|
550 |
with gr.TabItem("Clustering"):
|
551 |
+
with gr.TabItem("English"):
|
552 |
+
with gr.Row():
|
553 |
+
gr.Markdown("""
|
554 |
+
**Clustering Leaderboard ✨**
|
555 |
+
|
556 |
+
- **Metric:** Validity Measure (v_measure)
|
557 |
+
- **Languages:** English
|
558 |
+
""")
|
559 |
+
with gr.Row():
|
560 |
+
data_clustering = gr.components.Dataframe(
|
561 |
+
DATA_CLUSTERING,
|
562 |
+
datatype=["markdown"] + ["number"] * len(DATA_CLUSTERING.columns),
|
563 |
+
type="pandas",
|
564 |
+
)
|
565 |
+
with gr.Row():
|
566 |
+
data_run = gr.Button("Refresh")
|
567 |
+
task_clustering = gr.Variable(value=["Clustering"])
|
568 |
+
empty = gr.Variable(value=[])
|
569 |
+
datasets_clustering = gr.Variable(value=TASK_LIST_CLUSTERING)
|
570 |
+
data_run.click(
|
571 |
+
get_mteb_data,
|
572 |
+
inputs=[task_clustering, empty, datasets_clustering],
|
573 |
+
outputs=data_clustering,
|
574 |
+
)
|
575 |
+
with gr.TabItem("German"):
|
576 |
+
with gr.Row():
|
577 |
+
gr.Markdown("""
|
578 |
+
**Clustering Leaderboard ✨🇩🇪**
|
579 |
+
|
580 |
+
- **Metric:** Validity Measure (v_measure)
|
581 |
+
- **Languages:** German
|
582 |
+
""")
|
583 |
+
with gr.Row():
|
584 |
+
data_clustering_de = gr.components.Dataframe(
|
585 |
+
datatype=["markdown"] + ["number"] * len(TASK_LIST_CLUSTERING_DE),
|
586 |
+
type="pandas",
|
587 |
+
)
|
588 |
+
with gr.Row():
|
589 |
+
data_run = gr.Button("Refresh")
|
590 |
+
task_clustering_de = gr.Variable(value=["Clustering"])
|
591 |
+
empty_de = gr.Variable(value=[])
|
592 |
+
datasets_clustering_de = gr.Variable(value=TASK_LIST_CLUSTERING_DE)
|
593 |
+
data_run.click(
|
594 |
+
get_mteb_data,
|
595 |
+
inputs=[task_clustering_de, empty_de, datasets_clustering_de],
|
596 |
+
outputs=data_clustering_de,
|
597 |
+
)
|
598 |
with gr.TabItem("Pair Classification"):
|
599 |
with gr.Row():
|
600 |
gr.Markdown("""
|
|
|
724 |
)
|
725 |
gr.Markdown(r"""
|
726 |
|
727 |
+
Made with ❤️ for NLP. If this work is useful to you, please consider citing:
|
|
|
|
|
728 |
|
729 |
```bibtex
|
730 |
@article{muennighoff2022mteb,
|
|
|
743 |
block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
|
744 |
block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
|
745 |
block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
|
746 |
+
block.load(get_mteb_data, inputs=[task_clustering, empty, datasets_clustering], outputs=data_clustering)
|
747 |
+
block.load(get_mteb_data, inputs=[task_clustering_de, empty_de, datasets_clustering_de], outputs=data_clustering_de)
|
748 |
block.load(get_mteb_data, inputs=[task_pair_classification], outputs=data_pair_classification)
|
749 |
block.load(get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval)
|
750 |
block.load(get_mteb_data, inputs=[task_reranking], outputs=data_reranking)
|