Muennighoff committed on
Commit
556c58e
·
1 Parent(s): a1e84d6

Add C-MTEB

Browse files
Files changed (1) hide show
  1. app.py +524 -116
app.py CHANGED
@@ -66,7 +66,19 @@ TASK_LIST_CLASSIFICATION_SV = [
66
  "SweRecClassification",
67
  ]
68
 
69
- TASK_LIST_CLASSIFICATION_OTHER = ['AmazonCounterfactualClassification (de)', 'AmazonCounterfactualClassification (ja)', 'AmazonReviewsClassification (de)', 'AmazonReviewsClassification (es)', 'AmazonReviewsClassification (fr)', 'AmazonReviewsClassification (ja)', 'AmazonReviewsClassification (zh)', 'MTOPDomainClassification (de)', 'MTOPDomainClassification (es)', 'MTOPDomainClassification (fr)', 'MTOPDomainClassification (hi)', 'MTOPDomainClassification (th)', 'MTOPIntentClassification (de)', 'MTOPIntentClassification (es)', 'MTOPIntentClassification (fr)', 'MTOPIntentClassification (hi)', 'MTOPIntentClassification (th)', 'MassiveIntentClassification (af)', 'MassiveIntentClassification (am)', 'MassiveIntentClassification (ar)', 'MassiveIntentClassification (az)', 'MassiveIntentClassification (bn)', 'MassiveIntentClassification (cy)', 'MassiveIntentClassification (de)', 'MassiveIntentClassification (el)', 'MassiveIntentClassification (es)', 'MassiveIntentClassification (fa)', 'MassiveIntentClassification (fi)', 'MassiveIntentClassification (fr)', 'MassiveIntentClassification (he)', 'MassiveIntentClassification (hi)', 'MassiveIntentClassification (hu)', 'MassiveIntentClassification (hy)', 'MassiveIntentClassification (id)', 'MassiveIntentClassification (is)', 'MassiveIntentClassification (it)', 'MassiveIntentClassification (ja)', 'MassiveIntentClassification (jv)', 'MassiveIntentClassification (ka)', 'MassiveIntentClassification (km)', 'MassiveIntentClassification (kn)', 'MassiveIntentClassification (ko)', 'MassiveIntentClassification (lv)', 'MassiveIntentClassification (ml)', 'MassiveIntentClassification (mn)', 'MassiveIntentClassification (ms)', 'MassiveIntentClassification (my)', 'MassiveIntentClassification (nl)', 'MassiveIntentClassification (pl)', 'MassiveIntentClassification (pt)', 'MassiveIntentClassification (ro)', 'MassiveIntentClassification (ru)', 'MassiveIntentClassification (sl)', 'MassiveIntentClassification (sq)', 'MassiveIntentClassification (sw)', 
'MassiveIntentClassification (ta)', 'MassiveIntentClassification (te)', 'MassiveIntentClassification (th)', 'MassiveIntentClassification (tl)', 'MassiveIntentClassification (tr)', 'MassiveIntentClassification (ur)', 'MassiveIntentClassification (vi)', 'MassiveIntentClassification (zh-CN)', 'MassiveIntentClassification (zh-TW)', 'MassiveScenarioClassification (af)', 'MassiveScenarioClassification (am)', 'MassiveScenarioClassification (ar)', 'MassiveScenarioClassification (az)', 'MassiveScenarioClassification (bn)', 'MassiveScenarioClassification (cy)', 'MassiveScenarioClassification (de)', 'MassiveScenarioClassification (el)', 'MassiveScenarioClassification (es)', 'MassiveScenarioClassification (fa)', 'MassiveScenarioClassification (fi)', 'MassiveScenarioClassification (fr)', 'MassiveScenarioClassification (he)', 'MassiveScenarioClassification (hi)', 'MassiveScenarioClassification (hu)', 'MassiveScenarioClassification (hy)', 'MassiveScenarioClassification (id)', 'MassiveScenarioClassification (is)', 'MassiveScenarioClassification (it)', 'MassiveScenarioClassification (ja)', 'MassiveScenarioClassification (jv)', 'MassiveScenarioClassification (ka)', 'MassiveScenarioClassification (km)', 'MassiveScenarioClassification (kn)', 'MassiveScenarioClassification (ko)', 'MassiveScenarioClassification (lv)', 'MassiveScenarioClassification (ml)', 'MassiveScenarioClassification (mn)', 'MassiveScenarioClassification (ms)', 'MassiveScenarioClassification (my)', 'MassiveScenarioClassification (nl)', 'MassiveScenarioClassification (pl)', 'MassiveScenarioClassification (pt)', 'MassiveScenarioClassification (ro)', 'MassiveScenarioClassification (ru)', 'MassiveScenarioClassification (sl)', 'MassiveScenarioClassification (sq)', 'MassiveScenarioClassification (sw)', 'MassiveScenarioClassification (ta)', 'MassiveScenarioClassification (te)', 'MassiveScenarioClassification (th)', 'MassiveScenarioClassification (tl)', 'MassiveScenarioClassification (tr)', 'MassiveScenarioClassification 
(ur)', 'MassiveScenarioClassification (vi)', 'MassiveScenarioClassification (zh-CN)', 'MassiveScenarioClassification (zh-TW)']
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  TASK_LIST_CLUSTERING = [
72
  "ArxivClusteringP2P",
@@ -90,12 +102,24 @@ TASK_LIST_CLUSTERING_DE = [
90
  "TenKGnadClusteringS2S",
91
  ]
92
 
 
 
 
 
 
 
 
93
  TASK_LIST_PAIR_CLASSIFICATION = [
94
  "SprintDuplicateQuestions",
95
  "TwitterSemEval2015",
96
  "TwitterURLCorpus",
97
  ]
98
 
 
 
 
 
 
99
  TASK_LIST_RERANKING = [
100
  "AskUbuntuDupQuestions",
101
  "MindSmallReranking",
@@ -103,6 +127,13 @@ TASK_LIST_RERANKING = [
103
  "StackOverflowDupQuestions",
104
  ]
105
 
 
 
 
 
 
 
 
106
  TASK_LIST_RETRIEVAL = [
107
  "ArguAna",
108
  "ClimateFEVER",
@@ -124,7 +155,7 @@ TASK_LIST_RETRIEVAL = [
124
  TASK_LIST_RETRIEVAL_PL = [
125
  "ArguAna-PL",
126
  "DBPedia-PL",
127
- "FiQA2018-PL",
128
  "HotpotQA-PL",
129
  "MSMARCO-PL",
130
  "NFCorpus-PL",
@@ -135,6 +166,17 @@ TASK_LIST_RETRIEVAL_PL = [
135
  "TRECCOVID-PL",
136
  ]
137
 
 
 
 
 
 
 
 
 
 
 
 
138
  TASK_LIST_RETRIEVAL_NORM = TASK_LIST_RETRIEVAL + [
139
  "CQADupstackAndroidRetrieval",
140
  "CQADupstackEnglishRetrieval",
@@ -163,13 +205,24 @@ TASK_LIST_STS = [
163
  "STSBenchmark",
164
  ]
165
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  TASK_LIST_STS_NORM = [x.replace(" (en)", "").replace(" (en-en)", "") for x in TASK_LIST_STS]
167
 
168
- TASK_LIST_SUMMARIZATION = [
169
- "SummEval",
170
- ]
171
 
172
  TASK_LIST_EN = TASK_LIST_CLASSIFICATION + TASK_LIST_CLUSTERING + TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_RERANKING + TASK_LIST_RETRIEVAL + TASK_LIST_STS + TASK_LIST_SUMMARIZATION
 
173
 
174
  TASK_TO_METRIC = {
175
  "BitextMining": "f1",
@@ -198,6 +251,10 @@ EXTERNAL_MODELS = [
198
  "allenai-specter",
199
  "bert-base-swedish-cased",
200
  "bert-base-uncased",
 
 
 
 
201
  "contriever-base-msmarco",
202
  "cross-en-de-roberta-sentence-transformer",
203
  "dfm-encoder-large-v1",
@@ -220,8 +277,11 @@ EXTERNAL_MODELS = [
220
  "gtr-t5-xl",
221
  "gtr-t5-xxl",
222
  "komninos",
 
223
  "LASER2",
224
- "LaBSE",
 
 
225
  "msmarco-bert-co-condensor",
226
  "multilingual-e5-base",
227
  "multilingual-e5-large",
@@ -238,6 +298,8 @@ EXTERNAL_MODELS = [
238
  "sentence-t5-xl",
239
  "sentence-t5-xxl",
240
  "sup-simcse-bert-base-uncased",
 
 
241
  "text-embedding-ada-002",
242
  "text-similarity-ada-001",
243
  "text-similarity-babbage-001",
@@ -262,6 +324,10 @@ EXTERNAL_MODEL_TO_LINK = {
262
  "all-mpnet-base-v2": "https://huggingface.co/sentence-transformers/all-mpnet-base-v2",
263
  "bert-base-swedish-cased": "https://huggingface.co/KB/bert-base-swedish-cased",
264
  "bert-base-uncased": "https://huggingface.co/bert-base-uncased",
 
 
 
 
265
  "contriever-base-msmarco": "https://huggingface.co/nthakur/contriever-base-msmarco",
266
  "cross-en-de-roberta-sentence-transformer": "https://huggingface.co/T-Systems-onsite/cross-en-de-roberta-sentence-transformer",
267
  "DanskBERT": "https://huggingface.co/vesteinn/DanskBERT",
@@ -284,8 +350,11 @@ EXTERNAL_MODEL_TO_LINK = {
284
  "gtr-t5-xl": "https://huggingface.co/sentence-transformers/gtr-t5-xl",
285
  "gtr-t5-xxl": "https://huggingface.co/sentence-transformers/gtr-t5-xxl",
286
  "komninos": "https://huggingface.co/sentence-transformers/average_word_embeddings_komninos",
 
287
  "LASER2": "https://github.com/facebookresearch/LASER",
288
  "LaBSE": "https://huggingface.co/sentence-transformers/LaBSE",
 
 
289
  "msmarco-bert-co-condensor": "https://huggingface.co/sentence-transformers/msmarco-bert-co-condensor",
290
  "multilingual-e5-base": "https://huggingface.co/intfloat/multilingual-e5-base",
291
  "multilingual-e5-large": "https://huggingface.co/intfloat/multilingual-e5-large",
@@ -302,6 +371,8 @@ EXTERNAL_MODEL_TO_LINK = {
302
  "sentence-t5-xl": "https://huggingface.co/sentence-transformers/sentence-t5-xl",
303
  "sentence-t5-xxl": "https://huggingface.co/sentence-transformers/sentence-t5-xxl",
304
  "sup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/sup-simcse-bert-base-uncased",
 
 
305
  "text-embedding-ada-002": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
306
  "text-similarity-ada-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
307
  "text-similarity-babbage-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
@@ -326,6 +397,10 @@ EXTERNAL_MODEL_TO_DIM = {
326
  "allenai-specter": 768,
327
  "bert-base-swedish-cased": 768,
328
  "bert-base-uncased": 768,
 
 
 
 
329
  "contriever-base-msmarco": 768,
330
  "cross-en-de-roberta-sentence-transformer": 768,
331
  "DanskBERT": 768,
@@ -337,6 +412,7 @@ EXTERNAL_MODEL_TO_DIM = {
337
  "e5-large": 1024,
338
  "electra-small-nordic": 256,
339
  "electra-small-swedish-cased-discriminator": 256,
 
340
  "LASER2": 1024,
341
  "LaBSE": 768,
342
  "gbert-base": 768,
@@ -350,6 +426,8 @@ EXTERNAL_MODEL_TO_DIM = {
350
  "gtr-t5-xl": 768,
351
  "gtr-t5-xxl": 768,
352
  "komninos": 300,
 
 
353
  "msmarco-bert-co-condensor": 768,
354
  "multilingual-e5-base": 768,
355
  "multilingual-e5-small": 384,
@@ -366,8 +444,8 @@ EXTERNAL_MODEL_TO_DIM = {
366
  "sentence-t5-xl": 768,
367
  "sentence-t5-xxl": 768,
368
  "sup-simcse-bert-base-uncased": 768,
369
- "use-cmlm-multilingual": 768,
370
- "unsup-simcse-bert-base-uncased": 768,
371
  "text-embedding-ada-002": 1536,
372
  "text-similarity-ada-001": 1024,
373
  "text-similarity-babbage-001": 2048,
@@ -379,11 +457,12 @@ EXTERNAL_MODEL_TO_DIM = {
379
  "text-search-babbage-001": 2048,
380
  "text-search-curie-001": 4096,
381
  "text-search-davinci-001": 12288,
 
 
382
  "xlm-roberta-base": 768,
383
  "xlm-roberta-large": 1024,
384
  }
385
 
386
-
387
  EXTERNAL_MODEL_TO_SEQLEN = {
388
  "all-MiniLM-L12-v2": 512,
389
  "all-MiniLM-L6-v2": 512,
@@ -391,6 +470,10 @@ EXTERNAL_MODEL_TO_SEQLEN = {
391
  "allenai-specter": 512,
392
  "bert-base-swedish-cased": 512,
393
  "bert-base-uncased": 512,
 
 
 
 
394
  "contriever-base-msmarco": 512,
395
  "cross-en-de-roberta-sentence-transformer": 514,
396
  "DanskBERT": 514,
@@ -413,8 +496,11 @@ EXTERNAL_MODEL_TO_SEQLEN = {
413
  "gtr-t5-xl": 512,
414
  "gtr-t5-xxl": 512,
415
  "komninos": "N/A",
 
416
  "LASER2": "N/A",
417
- "LaBSE": 512,
 
 
418
  "msmarco-bert-co-condensor": 512,
419
  "multilingual-e5-base": 514,
420
  "multilingual-e5-large": 514,
@@ -431,6 +517,8 @@ EXTERNAL_MODEL_TO_SEQLEN = {
431
  "sentence-t5-xl": 512,
432
  "sentence-t5-xxl": 512,
433
  "sup-simcse-bert-base-uncased": 512,
 
 
434
  "text-embedding-ada-002": 8191,
435
  "text-similarity-ada-001": 2046,
436
  "text-similarity-babbage-001": 2046,
@@ -455,6 +543,10 @@ EXTERNAL_MODEL_TO_SIZE = {
455
  "all-mpnet-base-v2": 0.44,
456
  "bert-base-uncased": 0.44,
457
  "bert-base-swedish-cased": 0.50,
 
 
 
 
458
  "cross-en-de-roberta-sentence-transformer": 1.11,
459
  "contriever-base-msmarco": 0.44,
460
  "DanskBERT": 0.50,
@@ -477,8 +569,11 @@ EXTERNAL_MODEL_TO_SIZE = {
477
  "gtr-t5-xl": 2.48,
478
  "gtr-t5-xxl": 9.73,
479
  "komninos": 0.27,
 
480
  "LASER2": 0.17,
481
  "LaBSE": 1.88,
 
 
482
  "msmarco-bert-co-condensor": 0.44,
483
  "multilingual-e5-base": 1.11,
484
  "multilingual-e5-small": 0.47,
@@ -495,6 +590,8 @@ EXTERNAL_MODEL_TO_SIZE = {
495
  "sentence-t5-xl": 2.48,
496
  "sentence-t5-xxl": 9.73,
497
  "sup-simcse-bert-base-uncased": 0.44,
 
 
498
  "unsup-simcse-bert-base-uncased": 0.44,
499
  "use-cmlm-multilingual": 1.89,
500
  "xlm-roberta-base": 1.12,
@@ -523,9 +620,9 @@ MODELS_TO_SKIP = {
523
  "newsrx/instructor-xl",
524
  "dmlls/all-mpnet-base-v2",
525
  "cgldo/semanticClone",
 
526
  }
527
 
528
-
529
  EXTERNAL_MODEL_RESULTS = {model: {k: {v: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
530
 
531
  def add_lang(examples):
@@ -537,26 +634,29 @@ def add_lang(examples):
537
 
538
  def add_task(examples):
539
  # Could be added to the dataset loading script instead
540
- if examples["mteb_dataset_name"] in TASK_LIST_CLASSIFICATION_NORM + TASK_LIST_CLASSIFICATION_DA + TASK_LIST_CLASSIFICATION_SV + TASK_LIST_CLASSIFICATION_NB:
541
  examples["mteb_task"] = "Classification"
542
- elif examples["mteb_dataset_name"] in TASK_LIST_CLUSTERING + TASK_LIST_CLUSTERING_DE:
543
  examples["mteb_task"] = "Clustering"
544
- elif examples["mteb_dataset_name"] in TASK_LIST_PAIR_CLASSIFICATION:
545
  examples["mteb_task"] = "PairClassification"
546
- elif examples["mteb_dataset_name"] in TASK_LIST_RERANKING:
547
  examples["mteb_task"] = "Reranking"
548
- elif examples["mteb_dataset_name"] in TASK_LIST_RETRIEVAL_NORM + TASK_LIST_RETRIEVAL_PL:
549
  examples["mteb_task"] = "Retrieval"
550
- elif examples["mteb_dataset_name"] in TASK_LIST_STS_NORM:
551
  examples["mteb_task"] = "STS"
552
  elif examples["mteb_dataset_name"] in TASK_LIST_SUMMARIZATION:
553
  examples["mteb_task"] = "Summarization"
554
- else:
555
  examples["mteb_task"] = "BitextMining"
 
 
 
556
  return examples
557
 
558
  for model in EXTERNAL_MODELS:
559
- ds = load_dataset("mteb/results", model)#, download_mode='force_redownload', verification_mode="no_checks")
560
  # For local debugging:
561
  #, download_mode='force_redownload', verification_mode="no_checks")
562
  ds = ds.map(add_lang)
@@ -609,7 +709,6 @@ def make_datasets_clickable(df):
609
  columns={f'BornholmBitextMining': '<a target="_blank" style="text-decoration: underline" href="{link}">BornholmBitextMining</a>',})
610
  return df
611
 
612
-
613
  def add_rank(df):
614
  cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length"]]
615
  if len(cols_to_rank) == 1:
@@ -694,7 +793,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
694
  return df
695
 
696
  def get_mteb_average():
697
- global DATA_OVERALL, DATA_CLASSIFICATION_EN, DATA_CLUSTERING, DATA_PAIR_CLASSIFICATION, DATA_RERANKING, DATA_RETRIEVAL, DATA_STS_EN, DATA_SUMMARIZATION, NUM_SCORES
698
  DATA_OVERALL = get_mteb_data(
699
  tasks=[
700
  "Classification",
@@ -705,7 +804,7 @@ def get_mteb_average():
705
  "STS",
706
  "Summarization",
707
  ],
708
- langs=["en", "en-en"],
709
  fillna=False,
710
  add_emb_dim=True,
711
  rank=False,
@@ -728,36 +827,134 @@ def get_mteb_average():
728
  DATA_OVERALL = DATA_OVERALL.round(2)
729
 
730
  DATA_CLASSIFICATION_EN = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_CLASSIFICATION])
 
 
 
731
  DATA_CLUSTERING = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_CLUSTERING])
 
 
732
  DATA_PAIR_CLASSIFICATION = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_PAIR_CLASSIFICATION])
 
 
733
  DATA_RERANKING = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_RERANKING])
 
 
734
  DATA_RETRIEVAL = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_RETRIEVAL])
 
 
735
  DATA_STS_EN = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_STS])
 
 
736
  DATA_SUMMARIZATION = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION])
 
737
 
738
  # Fill NaN after averaging
739
  DATA_OVERALL.fillna("", inplace=True)
740
 
741
  DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
 
742
 
743
  return DATA_OVERALL
744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
745
  get_mteb_average()
 
746
  DATA_BITEXT_MINING = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING)
747
  DATA_BITEXT_MINING_OTHER = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING_OTHER)
748
  DATA_CLASSIFICATION_DA = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_DA)
749
  DATA_CLASSIFICATION_NB = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_NB)
750
  DATA_CLASSIFICATION_SV = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_SV)
751
  DATA_CLASSIFICATION_OTHER = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_OTHER)
752
- DATA_CLUSTERING_GERMAN = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
753
  DATA_RETRIEVAL_PL = get_mteb_data(["Retrieval"], [], TASK_LIST_RETRIEVAL_PL)
754
- DATA_STS = get_mteb_data(["STS"])
755
 
756
  # Exact, add all non-nan integer values for every dataset
757
  NUM_SCORES = 0
758
  DATASETS = []
 
759
  # LANGUAGES = []
760
- for d in [DATA_BITEXT_MINING, DATA_BITEXT_MINING_OTHER, DATA_CLASSIFICATION_EN, DATA_CLASSIFICATION_DA, DATA_CLASSIFICATION_NB, DATA_CLASSIFICATION_SV, DATA_CLASSIFICATION_OTHER, DATA_CLUSTERING, DATA_CLUSTERING_GERMAN, DATA_PAIR_CLASSIFICATION, DATA_RERANKING, DATA_RETRIEVAL, DATA_STS_EN, DATA_STS, DATA_SUMMARIZATION]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
761
  # NUM_SCORES += d.iloc[:, 1:].apply(lambda x: sum([1 for y in x if isinstance(y, float) and not np.isnan(y)]), axis=1).sum()
762
  cols_to_ignore = 3 if "Average" in d.columns else 2
763
  # Count number of scores including only non-nan floats & excluding the rank column
@@ -765,9 +962,11 @@ for d in [DATA_BITEXT_MINING, DATA_BITEXT_MINING_OTHER, DATA_CLASSIFICATION_EN,
765
  # Exclude rank & model name column (first two); Do not count different language versions as different datasets
766
  DATASETS += [i.split(" ")[0] for i in d.columns[cols_to_ignore:]]
767
  # LANGUAGES += [i.split(" ")[-1] for i in d.columns[cols_to_ignore:]]
 
768
 
769
  NUM_DATASETS = len(set(DATASETS))
770
  # NUM_LANGUAGES = len(set(LANGUAGES))
 
771
 
772
  block = gr.Blocks()
773
  with block:
@@ -777,32 +976,52 @@ with block:
777
  - **Total Datasets**: {NUM_DATASETS}
778
  - **Total Languages**: 113
779
  - **Total Scores**: {NUM_SCORES}
780
- - **Total Models**: {len(DATA_OVERALL)}
781
  """)
782
  with gr.Tabs():
783
  with gr.TabItem("Overall"):
784
- with gr.Row():
785
- gr.Markdown("""
786
- **Overall MTEB English leaderboard 🔮**
787
-
788
- - **Metric:** Various, refer to task tabs
789
- - **Languages:** English, refer to task tabs for others
790
- """)
791
- with gr.Row():
792
- data_overall = gr.components.Dataframe(
793
- DATA_OVERALL,
794
- datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL.columns),
795
- type="pandas",
796
- wrap=True,
797
- )
798
- with gr.Row():
799
- data_run = gr.Button("Refresh")
800
- data_run.click(get_mteb_average, inputs=None, outputs=data_overall)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
801
  with gr.TabItem("Bitext Mining"):
802
  with gr.TabItem("English-X"):
803
  with gr.Row():
804
  gr.Markdown("""
805
- **Bitext Mining Leaderboard 🎌**
806
 
807
  - **Metric:** [F1](https://huggingface.co/spaces/evaluate-metric/f1)
808
  - **Languages:** 117 (Pairs of: English & other language)
@@ -814,11 +1033,11 @@ with block:
814
  type="pandas",
815
  )
816
  with gr.Row():
817
- data_run = gr.Button("Refresh")
818
  task_bitext_mining = gr.Variable(value=["BitextMining"])
819
  lang_bitext_mining = gr.Variable(value=[])
820
  datasets_bitext_mining = gr.Variable(value=TASK_LIST_BITEXT_MINING)
821
- data_run.click(
822
  get_mteb_data,
823
  inputs=[task_bitext_mining, lang_bitext_mining, datasets_bitext_mining],
824
  outputs=data_bitext_mining,
@@ -839,11 +1058,11 @@ with block:
839
  type="pandas",
840
  )
841
  with gr.Row():
842
- data_run = gr.Button("Refresh")
843
  task_bitext_mining_da = gr.Variable(value=["BitextMining"])
844
  lang_bitext_mining_da = gr.Variable(value=[])
845
  datasets_bitext_mining_da = gr.Variable(value=TASK_LIST_BITEXT_MINING_OTHER)
846
- data_run.click(
847
  get_mteb_data,
848
  inputs=[
849
  task_bitext_mining_da,
@@ -856,7 +1075,7 @@ with block:
856
  with gr.TabItem("English"):
857
  with gr.Row():
858
  gr.Markdown("""
859
- **Classification Leaderboard ❤️**
860
 
861
  - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
862
  - **Languages:** English
@@ -879,6 +1098,35 @@ with block:
879
  ],
880
  outputs=data_classification_en,
881
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
882
  with gr.TabItem("Danish"):
883
  with gr.Row():
884
  gr.Markdown("""
@@ -981,11 +1229,11 @@ with block:
981
  type="pandas",
982
  )
983
  with gr.Row():
984
- data_run = gr.Button("Refresh")
985
  task_classification = gr.Variable(value=["Classification"])
986
  lang_classification = gr.Variable(value=[])
987
  datasets_classification = gr.Variable(value=TASK_LIST_CLASSIFICATION_OTHER)
988
- data_run.click(
989
  get_mteb_data,
990
  inputs=[
991
  task_classification,
@@ -1010,15 +1258,40 @@ with block:
1010
  type="pandas",
1011
  )
1012
  with gr.Row():
1013
- data_run = gr.Button("Refresh")
1014
  task_clustering = gr.Variable(value=["Clustering"])
1015
  lang_clustering = gr.Variable(value=[])
1016
  datasets_clustering = gr.Variable(value=TASK_LIST_CLUSTERING)
1017
- data_run.click(
1018
  get_mteb_data,
1019
  inputs=[task_clustering, lang_clustering, datasets_clustering],
1020
  outputs=data_clustering,
1021
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1022
  with gr.TabItem("German"):
1023
  with gr.Row():
1024
  gr.Markdown("""
@@ -1030,68 +1303,137 @@ with block:
1030
  """)
1031
  with gr.Row():
1032
  data_clustering_de = gr.components.Dataframe(
1033
- DATA_CLUSTERING_GERMAN,
1034
- datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING_GERMAN.columns) * 2,
1035
  type="pandas",
1036
  )
1037
  with gr.Row():
1038
- data_run = gr.Button("Refresh")
1039
  task_clustering_de = gr.Variable(value=["Clustering"])
1040
  lang_clustering_de = gr.Variable(value=[])
1041
  datasets_clustering_de = gr.Variable(value=TASK_LIST_CLUSTERING_DE)
1042
- data_run.click(
1043
  get_mteb_data,
1044
  inputs=[task_clustering_de, lang_clustering_de, datasets_clustering_de],
1045
  outputs=data_clustering_de,
1046
  )
1047
  with gr.TabItem("Pair Classification"):
1048
- with gr.Row():
1049
- gr.Markdown("""
1050
- **Pair Classification Leaderboard 🎭**
1051
-
1052
- - **Metric:** Average Precision based on Cosine Similarities (cos_sim_ap)
1053
- - **Languages:** English
1054
- """)
1055
- with gr.Row():
1056
- data_pair_classification = gr.components.Dataframe(
1057
- DATA_PAIR_CLASSIFICATION,
1058
- datatype=["number", "markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION.columns),
1059
- type="pandas",
1060
- )
1061
- with gr.Row():
1062
- data_run = gr.Button("Refresh")
1063
- task_pair_classification = gr.Variable(value=["PairClassification"])
1064
- data_run.click(
1065
- get_mteb_data,
1066
- inputs=[task_pair_classification],
1067
- outputs=data_pair_classification,
1068
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1069
  with gr.TabItem("Reranking"):
1070
- with gr.Row():
1071
- gr.Markdown("""
1072
- **Reranking Leaderboard 🥈**
1073
-
1074
- - **Metric:** Mean Average Precision (MAP)
1075
- - **Languages:** English
1076
- """)
1077
- with gr.Row():
1078
- data_reranking = gr.components.Dataframe(
1079
- DATA_RERANKING,
1080
- datatype=["number", "markdown"] + ["number"] * len(DATA_RERANKING.columns),
1081
- type="pandas",
1082
- )
1083
- with gr.Row():
1084
- data_run = gr.Button("Refresh")
1085
- task_reranking = gr.Variable(value=["Reranking"])
1086
- metric_reranking = gr.Variable(value="map")
1087
- data_run.click(
1088
- get_mteb_data, inputs=[task_reranking], outputs=data_reranking
1089
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1090
  with gr.TabItem("Retrieval"):
1091
  with gr.TabItem("English"):
1092
  with gr.Row():
1093
  gr.Markdown("""
1094
- **Retrieval Leaderboard 🔎**
1095
 
1096
  - **Metric:** Normalized Discounted Cumulative Gain @ k (ndcg_at_10)
1097
  - **Languages:** English
@@ -1104,10 +1446,44 @@ with block:
1104
  type="pandas",
1105
  )
1106
  with gr.Row():
1107
- data_run = gr.Button("Refresh")
1108
  task_retrieval = gr.Variable(value=["Retrieval"])
1109
- data_run.click(
1110
- get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1111
  )
1112
  with gr.TabItem("Polish"):
1113
  with gr.Row():
@@ -1126,11 +1502,11 @@ with block:
1126
  type="pandas",
1127
  )
1128
  with gr.Row():
1129
- data_run = gr.Button("Refresh")
1130
  task_retrieval_pl = gr.Variable(value=["Retrieval"])
1131
  lang_retrieval_pl = gr.Variable(value=[])
1132
  datasets_retrieval_pl = gr.Variable(value=TASK_LIST_RETRIEVAL_PL)
1133
- data_run.click(
1134
  get_mteb_data,
1135
  inputs=[task_retrieval_pl, lang_retrieval_pl, datasets_retrieval_pl],
1136
  outputs=data_retrieval_pl
@@ -1139,7 +1515,7 @@ with block:
1139
  with gr.TabItem("English"):
1140
  with gr.Row():
1141
  gr.Markdown("""
1142
- **STS Leaderboard 🤖**
1143
 
1144
  - **Metric:** Spearman correlation based on cosine similarity
1145
  - **Languages:** English
@@ -1153,30 +1529,62 @@ with block:
1153
  with gr.Row():
1154
  data_run_sts_en = gr.Button("Refresh")
1155
  task_sts_en = gr.Variable(value=["STS"])
1156
- lang_sts_en = gr.Variable(value=["en", "en-en"])
 
1157
  data_run_sts_en.click(
1158
  get_mteb_data,
1159
- inputs=[task_sts_en, lang_sts_en],
1160
  outputs=data_sts_en,
1161
  )
1162
- with gr.TabItem("Multilingual"):
1163
  with gr.Row():
1164
  gr.Markdown("""
1165
- **STS Multilingual Leaderboard 👽**
1166
 
1167
  - **Metric:** Spearman correlation based on cosine similarity
1168
- - **Languages:** Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Russian, Spanish
 
1169
  """)
1170
  with gr.Row():
1171
- data_sts = gr.components.Dataframe(
1172
- DATA_STS,
1173
- datatype=["number", "markdown"] + ["number"] * len(DATA_STS.columns) * 2,
1174
  type="pandas",
1175
  )
1176
  with gr.Row():
1177
- data_run = gr.Button("Refresh")
1178
- task_sts = gr.Variable(value=["STS"])
1179
- data_run.click(get_mteb_data, inputs=[task_sts], outputs=data_sts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1180
  with gr.TabItem("Summarization"):
1181
  with gr.Row():
1182
  gr.Markdown("""
 
66
  "SweRecClassification",
67
  ]
68
 
69
+ TASK_LIST_CLASSIFICATION_ZH = [
70
+ "AmazonReviewsClassification (zh)",
71
+ "IFlyTek",
72
+ "JDReview",
73
+ "MassiveIntentClassification (zh-CN)",
74
+ "MassiveScenarioClassification (zh-CN)",
75
+ "MultilingualSentiment",
76
+ "OnlineShopping",
77
+ "TNews",
78
+ "Waimai",
79
+ ]
80
+
81
+ TASK_LIST_CLASSIFICATION_OTHER = ['AmazonCounterfactualClassification (de)', 'AmazonCounterfactualClassification (ja)', 'AmazonReviewsClassification (de)', 'AmazonReviewsClassification (es)', 'AmazonReviewsClassification (fr)', 'AmazonReviewsClassification (ja)', 'AmazonReviewsClassification (zh)', 'MTOPDomainClassification (de)', 'MTOPDomainClassification (es)', 'MTOPDomainClassification (fr)', 'MTOPDomainClassification (hi)', 'MTOPDomainClassification (th)', 'MTOPIntentClassification (de)', 'MTOPIntentClassification (es)', 'MTOPIntentClassification (fr)', 'MTOPIntentClassification (hi)', 'MTOPIntentClassification (th)', 'MassiveIntentClassification (af)', 'MassiveIntentClassification (am)', 'MassiveIntentClassification (ar)', 'MassiveIntentClassification (az)', 'MassiveIntentClassification (bn)', 'MassiveIntentClassification (cy)', 'MassiveIntentClassification (de)', 'MassiveIntentClassification (el)', 'MassiveIntentClassification (es)', 'MassiveIntentClassification (fa)', 'MassiveIntentClassification (fi)', 'MassiveIntentClassification (fr)', 'MassiveIntentClassification (he)', 'MassiveIntentClassification (hi)', 'MassiveIntentClassification (hu)', 'MassiveIntentClassification (hy)', 'MassiveIntentClassification (id)', 'MassiveIntentClassification (is)', 'MassiveIntentClassification (it)', 'MassiveIntentClassification (ja)', 'MassiveIntentClassification (jv)', 'MassiveIntentClassification (ka)', 'MassiveIntentClassification (km)', 'MassiveIntentClassification (kn)', 'MassiveIntentClassification (ko)', 'MassiveIntentClassification (lv)', 'MassiveIntentClassification (ml)', 'MassiveIntentClassification (mn)', 'MassiveIntentClassification (ms)', 'MassiveIntentClassification (my)', 'MassiveIntentClassification (nl)', 'MassiveIntentClassification (pl)', 'MassiveIntentClassification (pt)', 'MassiveIntentClassification (ro)', 'MassiveIntentClassification (ru)', 'MassiveIntentClassification (sl)', 'MassiveIntentClassification (sq)', 'MassiveIntentClassification (sw)', 
'MassiveIntentClassification (ta)', 'MassiveIntentClassification (te)', 'MassiveIntentClassification (th)', 'MassiveIntentClassification (tl)', 'MassiveIntentClassification (tr)', 'MassiveIntentClassification (ur)', 'MassiveIntentClassification (vi)', 'MassiveIntentClassification (zh-TW)', 'MassiveScenarioClassification (af)', 'MassiveScenarioClassification (am)', 'MassiveScenarioClassification (ar)', 'MassiveScenarioClassification (az)', 'MassiveScenarioClassification (bn)', 'MassiveScenarioClassification (cy)', 'MassiveScenarioClassification (de)', 'MassiveScenarioClassification (el)', 'MassiveScenarioClassification (es)', 'MassiveScenarioClassification (fa)', 'MassiveScenarioClassification (fi)', 'MassiveScenarioClassification (fr)', 'MassiveScenarioClassification (he)', 'MassiveScenarioClassification (hi)', 'MassiveScenarioClassification (hu)', 'MassiveScenarioClassification (hy)', 'MassiveScenarioClassification (id)', 'MassiveScenarioClassification (is)', 'MassiveScenarioClassification (it)', 'MassiveScenarioClassification (ja)', 'MassiveScenarioClassification (jv)', 'MassiveScenarioClassification (ka)', 'MassiveScenarioClassification (km)', 'MassiveScenarioClassification (kn)', 'MassiveScenarioClassification (ko)', 'MassiveScenarioClassification (lv)', 'MassiveScenarioClassification (ml)', 'MassiveScenarioClassification (mn)', 'MassiveScenarioClassification (ms)', 'MassiveScenarioClassification (my)', 'MassiveScenarioClassification (nl)', 'MassiveScenarioClassification (pl)', 'MassiveScenarioClassification (pt)', 'MassiveScenarioClassification (ro)', 'MassiveScenarioClassification (ru)', 'MassiveScenarioClassification (sl)', 'MassiveScenarioClassification (sq)', 'MassiveScenarioClassification (sw)', 'MassiveScenarioClassification (ta)', 'MassiveScenarioClassification (te)', 'MassiveScenarioClassification (th)', 'MassiveScenarioClassification (tl)', 'MassiveScenarioClassification (tr)', 'MassiveScenarioClassification (ur)', 'MassiveScenarioClassification 
(vi)', 'MassiveScenarioClassification (zh-TW)']
82
 
83
  TASK_LIST_CLUSTERING = [
84
  "ArxivClusteringP2P",
 
102
  "TenKGnadClusteringS2S",
103
  ]
104
 
105
+ TASK_LIST_CLUSTERING_ZH = [
106
+ "CLSClusteringP2P",
107
+ "CLSClusteringS2S",
108
+ "ThuNewsClusteringP2P",
109
+ "ThuNewsClusteringS2S",
110
+ ]
111
+
112
  TASK_LIST_PAIR_CLASSIFICATION = [
113
  "SprintDuplicateQuestions",
114
  "TwitterSemEval2015",
115
  "TwitterURLCorpus",
116
  ]
117
 
118
+ TASK_LIST_PAIR_CLASSIFICATION_ZH = [
119
+ "Cmnli",
120
+ "Ocnli",
121
+ ]
122
+
123
  TASK_LIST_RERANKING = [
124
  "AskUbuntuDupQuestions",
125
  "MindSmallReranking",
 
127
  "StackOverflowDupQuestions",
128
  ]
129
 
130
+ TASK_LIST_RERANKING_ZH = [
131
+ "CMedQAv1",
132
+ "CMedQAv2",
133
+ "MmarcoReranking",
134
+ "T2Reranking",
135
+ ]
136
+
137
  TASK_LIST_RETRIEVAL = [
138
  "ArguAna",
139
  "ClimateFEVER",
 
155
  TASK_LIST_RETRIEVAL_PL = [
156
  "ArguAna-PL",
157
  "DBPedia-PL",
158
+ "FiQA-PL",
159
  "HotpotQA-PL",
160
  "MSMARCO-PL",
161
  "NFCorpus-PL",
 
166
  "TRECCOVID-PL",
167
  ]
168
 
169
+ TASK_LIST_RETRIEVAL_ZH = [
170
+ "CmedqaRetrieval",
171
+ "CovidRetrieval",
172
+ "DuRetrieval",
173
+ "EcomRetrieval",
174
+ "MedicalRetrieval",
175
+ "MMarcoRetrieval",
176
+ "T2Retrieval",
177
+ "VideoRetrieval",
178
+ ]
179
+
180
  TASK_LIST_RETRIEVAL_NORM = TASK_LIST_RETRIEVAL + [
181
  "CQADupstackAndroidRetrieval",
182
  "CQADupstackEnglishRetrieval",
 
205
  "STSBenchmark",
206
  ]
207
 
208
+ TASK_LIST_STS_ZH = [
209
+ "AFQMC",
210
+ "ATEC",
211
+ "BQ",
212
+ "LCQMC",
213
+ "PAWSX",
214
+ "QBQTC",
215
+ "STS22 (zh)",
216
+ "STSB",
217
+ ]
218
+
219
+ TASK_LIST_STS_OTHER = ["STS17 (ar-ar)", "STS17 (en-ar)", "STS17 (en-de)", "STS17 (en-tr)", "STS17 (es-en)", "STS17 (es-es)", "STS17 (fr-en)", "STS17 (it-en)", "STS17 (ko-ko)", "STS17 (nl-en)", "STS22 (ar)", "STS22 (de)", "STS22 (de-en)", "STS22 (de-fr)", "STS22 (de-pl)", "STS22 (es)", "STS22 (es-en)", "STS22 (es-it)", "STS22 (fr)", "STS22 (fr-pl)", "STS22 (it)", "STS22 (pl)", "STS22 (pl-en)", "STS22 (ru)", "STS22 (tr)", "STS22 (zh-en)", "STSBenchmark",]
220
  TASK_LIST_STS_NORM = [x.replace(" (en)", "").replace(" (en-en)", "") for x in TASK_LIST_STS]
221
 
222
+ TASK_LIST_SUMMARIZATION = ["SummEval",]
 
 
223
 
224
  TASK_LIST_EN = TASK_LIST_CLASSIFICATION + TASK_LIST_CLUSTERING + TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_RERANKING + TASK_LIST_RETRIEVAL + TASK_LIST_STS + TASK_LIST_SUMMARIZATION
225
+ TASK_LIST_ZH = TASK_LIST_CLASSIFICATION_ZH + TASK_LIST_CLUSTERING_ZH + TASK_LIST_PAIR_CLASSIFICATION_ZH + TASK_LIST_RERANKING_ZH + TASK_LIST_RETRIEVAL_ZH + TASK_LIST_STS_ZH
226
 
227
  TASK_TO_METRIC = {
228
  "BitextMining": "f1",
 
251
  "allenai-specter",
252
  "bert-base-swedish-cased",
253
  "bert-base-uncased",
254
+ "bge-base-zh",
255
+ "bge-large-zh",
256
+ "bge-large-zh-noinstruct",
257
+ "bge-small-zh",
258
  "contriever-base-msmarco",
259
  "cross-en-de-roberta-sentence-transformer",
260
  "dfm-encoder-large-v1",
 
277
  "gtr-t5-xl",
278
  "gtr-t5-xxl",
279
  "komninos",
280
+ "luotuo-bert-medium",
281
  "LASER2",
282
+ "LaBSE",
283
+ "m3e-base",
284
+ "m3e-large",
285
  "msmarco-bert-co-condensor",
286
  "multilingual-e5-base",
287
  "multilingual-e5-large",
 
298
  "sentence-t5-xl",
299
  "sentence-t5-xxl",
300
  "sup-simcse-bert-base-uncased",
301
+ "text2vec-base-chinese",
302
+ "text2vec-large-chinese",
303
  "text-embedding-ada-002",
304
  "text-similarity-ada-001",
305
  "text-similarity-babbage-001",
 
324
  "all-mpnet-base-v2": "https://huggingface.co/sentence-transformers/all-mpnet-base-v2",
325
  "bert-base-swedish-cased": "https://huggingface.co/KB/bert-base-swedish-cased",
326
  "bert-base-uncased": "https://huggingface.co/bert-base-uncased",
327
+ "bge-base-zh": "https://huggingface.co/BAAI/bge-base-zh",
328
+ "bge-large-zh": "https://huggingface.co/BAAI/bge-large-zh",
329
+ "bge-large-zh-noinstruct": "https://huggingface.co/BAAI/bge-large-zh-noinstruct",
330
+ "bge-small-zh": "https://huggingface.co/BAAI/bge-small-zh",
331
  "contriever-base-msmarco": "https://huggingface.co/nthakur/contriever-base-msmarco",
332
  "cross-en-de-roberta-sentence-transformer": "https://huggingface.co/T-Systems-onsite/cross-en-de-roberta-sentence-transformer",
333
  "DanskBERT": "https://huggingface.co/vesteinn/DanskBERT",
 
350
  "gtr-t5-xl": "https://huggingface.co/sentence-transformers/gtr-t5-xl",
351
  "gtr-t5-xxl": "https://huggingface.co/sentence-transformers/gtr-t5-xxl",
352
  "komninos": "https://huggingface.co/sentence-transformers/average_word_embeddings_komninos",
353
+ "luotuo-bert-medium": "https://huggingface.co/silk-road/luotuo-bert-medium",
354
  "LASER2": "https://github.com/facebookresearch/LASER",
355
  "LaBSE": "https://huggingface.co/sentence-transformers/LaBSE",
356
+ "m3e-base": "https://huggingface.co/moka-ai/m3e-base",
357
+ "m3e-large": "https://huggingface.co/moka-ai/m3e-large",
358
  "msmarco-bert-co-condensor": "https://huggingface.co/sentence-transformers/msmarco-bert-co-condensor",
359
  "multilingual-e5-base": "https://huggingface.co/intfloat/multilingual-e5-base",
360
  "multilingual-e5-large": "https://huggingface.co/intfloat/multilingual-e5-large",
 
371
  "sentence-t5-xl": "https://huggingface.co/sentence-transformers/sentence-t5-xl",
372
  "sentence-t5-xxl": "https://huggingface.co/sentence-transformers/sentence-t5-xxl",
373
  "sup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/sup-simcse-bert-base-uncased",
374
+ "text2vec-base-chinese": "https://huggingface.co/shibing624/text2vec-base-chinese",
375
+ "text2vec-large-chinese": "https://huggingface.co/GanymedeNil/text2vec-large-chinese",
376
  "text-embedding-ada-002": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
377
  "text-similarity-ada-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
378
  "text-similarity-babbage-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
 
397
  "allenai-specter": 768,
398
  "bert-base-swedish-cased": 768,
399
  "bert-base-uncased": 768,
400
+ "bge-base-zh": 768,
401
+ "bge-large-zh": 1024,
402
+ "bge-large-zh-noinstruct": 1024,
403
+ "bge-small-zh": 512,
404
  "contriever-base-msmarco": 768,
405
  "cross-en-de-roberta-sentence-transformer": 768,
406
  "DanskBERT": 768,
 
412
  "e5-large": 1024,
413
  "electra-small-nordic": 256,
414
  "electra-small-swedish-cased-discriminator": 256,
415
+ "luotuo-bert-medium": 768,
416
  "LASER2": 1024,
417
  "LaBSE": 768,
418
  "gbert-base": 768,
 
426
  "gtr-t5-xl": 768,
427
  "gtr-t5-xxl": 768,
428
  "komninos": 300,
429
+ "m3e-base": 768,
430
+ "m3e-large": 768,
431
  "msmarco-bert-co-condensor": 768,
432
  "multilingual-e5-base": 768,
433
  "multilingual-e5-small": 384,
 
444
  "sentence-t5-xl": 768,
445
  "sentence-t5-xxl": 768,
446
  "sup-simcse-bert-base-uncased": 768,
447
+ "text2vec-base-chinese": 768,
448
+ "text2vec-large-chinese": 1024,
449
  "text-embedding-ada-002": 1536,
450
  "text-similarity-ada-001": 1024,
451
  "text-similarity-babbage-001": 2048,
 
457
  "text-search-babbage-001": 2048,
458
  "text-search-curie-001": 4096,
459
  "text-search-davinci-001": 12288,
460
+ "unsup-simcse-bert-base-uncased": 768,
461
+ "use-cmlm-multilingual": 768,
462
  "xlm-roberta-base": 768,
463
  "xlm-roberta-large": 1024,
464
  }
465
 
 
466
  EXTERNAL_MODEL_TO_SEQLEN = {
467
  "all-MiniLM-L12-v2": 512,
468
  "all-MiniLM-L6-v2": 512,
 
470
  "allenai-specter": 512,
471
  "bert-base-swedish-cased": 512,
472
  "bert-base-uncased": 512,
473
+ "bge-base-zh": 512,
474
+ "bge-large-zh": 512,
475
+ "bge-large-zh-noinstruct": 512,
476
+ "bge-small-zh": 512,
477
  "contriever-base-msmarco": 512,
478
  "cross-en-de-roberta-sentence-transformer": 514,
479
  "DanskBERT": 514,
 
496
  "gtr-t5-xl": 512,
497
  "gtr-t5-xxl": 512,
498
  "komninos": "N/A",
499
+ "luotuo-bert-medium": 512,
500
  "LASER2": "N/A",
501
+ "LaBSE": 512,
502
+ "m3e-base": 512,
503
+ "m3e-large": 512,
504
  "msmarco-bert-co-condensor": 512,
505
  "multilingual-e5-base": 514,
506
  "multilingual-e5-large": 514,
 
517
  "sentence-t5-xl": 512,
518
  "sentence-t5-xxl": 512,
519
  "sup-simcse-bert-base-uncased": 512,
520
+ "text2vec-base-chinese": 512,
521
+ "text2vec-large-chinese": 512,
522
  "text-embedding-ada-002": 8191,
523
  "text-similarity-ada-001": 2046,
524
  "text-similarity-babbage-001": 2046,
 
543
  "all-mpnet-base-v2": 0.44,
544
  "bert-base-uncased": 0.44,
545
  "bert-base-swedish-cased": 0.50,
546
+ "bge-base-zh": 0.41,
547
+ "bge-large-zh": 1.30,
548
+ "bge-large-zh-noinstruct": 1.30,
549
+ "bge-small-zh": 0.10,
550
  "cross-en-de-roberta-sentence-transformer": 1.11,
551
  "contriever-base-msmarco": 0.44,
552
  "DanskBERT": 0.50,
 
569
  "gtr-t5-xl": 2.48,
570
  "gtr-t5-xxl": 9.73,
571
  "komninos": 0.27,
572
+ "luotuo-bert-medium": 1.31,
573
  "LASER2": 0.17,
574
  "LaBSE": 1.88,
575
+ "m3e-base": 0.41,
576
+ "m3e-large": 0.41,
577
  "msmarco-bert-co-condensor": 0.44,
578
  "multilingual-e5-base": 1.11,
579
  "multilingual-e5-small": 0.47,
 
590
  "sentence-t5-xl": 2.48,
591
  "sentence-t5-xxl": 9.73,
592
  "sup-simcse-bert-base-uncased": 0.44,
593
+ "text2vec-base-chinese": 0.41,
594
+ "text2vec-large-chinese": 1.30,
595
  "unsup-simcse-bert-base-uncased": 0.44,
596
  "use-cmlm-multilingual": 1.89,
597
  "xlm-roberta-base": 1.12,
 
620
  "newsrx/instructor-xl",
621
  "dmlls/all-mpnet-base-v2",
622
  "cgldo/semanticClone",
623
+ "Malmuk1/e5-large-v2_Sharded",
624
  }
625
 
 
626
  EXTERNAL_MODEL_RESULTS = {model: {k: {v: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
627
 
628
  def add_lang(examples):
 
634
 
635
# Cache mapping dataset name -> MTEB task type. It is filled on the first call
# to add_task() so the (large) task lists are only combined once instead of
# being re-concatenated and linearly scanned for every dataset row.
_DATASET_TO_TASK = {}


def _dataset_to_task():
    """Return the dataset-name -> task-type lookup table, building it on first use."""
    if not _DATASET_TO_TASK:
        task_groups = (
            ("Classification", TASK_LIST_CLASSIFICATION_NORM + TASK_LIST_CLASSIFICATION_DA + TASK_LIST_CLASSIFICATION_SV + TASK_LIST_CLASSIFICATION_NB + TASK_LIST_CLASSIFICATION_ZH),
            ("Clustering", TASK_LIST_CLUSTERING + TASK_LIST_CLUSTERING_DE + TASK_LIST_CLUSTERING_ZH),
            ("PairClassification", TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_PAIR_CLASSIFICATION_ZH),
            ("Reranking", TASK_LIST_RERANKING + TASK_LIST_RERANKING_ZH),
            ("Retrieval", TASK_LIST_RETRIEVAL_NORM + TASK_LIST_RETRIEVAL_PL + TASK_LIST_RETRIEVAL_ZH),
            ("STS", TASK_LIST_STS_NORM + TASK_LIST_STS_ZH),
            ("Summarization", TASK_LIST_SUMMARIZATION),
            # Bitext mining entries carry a language suffix ("Name (xx-yy)");
            # only the part before the first space is compared.
            ("BitextMining", [x.split(" ")[0] for x in TASK_LIST_BITEXT_MINING + TASK_LIST_BITEXT_MINING_OTHER]),
        )
        for task, names in task_groups:
            for name in names:
                # setdefault keeps the first group that claims a name, which
                # matches the precedence of the former if/elif chain.
                _DATASET_TO_TASK.setdefault(name, task)
    return _DATASET_TO_TASK


def add_task(examples):
    """Annotate one results row with the MTEB task type of its dataset.

    Args:
        examples: Mapping for a single row; must contain "mteb_dataset_name".

    Returns:
        The same mapping, with "mteb_task" set to the matching task type, or
        to "Unknown" (after printing a warning) when the dataset name is not
        in any known task list.
    """
    # Could be added to the dataset loading script instead
    dataset_name = examples["mteb_dataset_name"]
    task = _dataset_to_task().get(dataset_name)
    if task is None:
        print("WARNING: Task not found for dataset", dataset_name)
        task = "Unknown"
    examples["mteb_task"] = task
    return examples
657
 
658
  for model in EXTERNAL_MODELS:
659
+ ds = load_dataset("mteb/results", model)
660
  # For local debugging:
661
  #, download_mode='force_redownload', verification_mode="no_checks")
662
  ds = ds.map(add_lang)
 
709
  columns={f'BornholmBitextMining': '<a target="_blank" style="text-decoration: underline" href="{link}">BornholmBitextMining</a>',})
710
  return df
711
 
 
712
  def add_rank(df):
713
  cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length"]]
714
  if len(cols_to_rank) == 1:
 
793
  return df
794
 
795
  def get_mteb_average():
796
+ global DATA_OVERALL, DATA_CLASSIFICATION_EN, DATA_CLUSTERING, DATA_PAIR_CLASSIFICATION, DATA_RERANKING, DATA_RETRIEVAL, DATA_STS_EN, DATA_SUMMARIZATION
797
  DATA_OVERALL = get_mteb_data(
798
  tasks=[
799
  "Classification",
 
804
  "STS",
805
  "Summarization",
806
  ],
807
+ datasets=TASK_LIST_CLASSIFICATION + TASK_LIST_CLUSTERING + TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_RERANKING + TASK_LIST_RETRIEVAL + TASK_LIST_STS + TASK_LIST_SUMMARIZATION,
808
  fillna=False,
809
  add_emb_dim=True,
810
  rank=False,
 
827
  DATA_OVERALL = DATA_OVERALL.round(2)
828
 
829
  DATA_CLASSIFICATION_EN = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_CLASSIFICATION])
830
+ # Only keep rows with at least one score in addition to the "Model" & rank column
831
+ DATA_CLASSIFICATION_EN = DATA_CLASSIFICATION_EN[DATA_CLASSIFICATION_EN.iloc[:, 2:].ne("").any(axis=1)]
832
+
833
  DATA_CLUSTERING = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_CLUSTERING])
834
+ DATA_CLUSTERING = DATA_CLUSTERING[DATA_CLUSTERING.iloc[:, 2:].ne("").any(axis=1)]
835
+
836
  DATA_PAIR_CLASSIFICATION = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_PAIR_CLASSIFICATION])
837
+ DATA_PAIR_CLASSIFICATION = DATA_PAIR_CLASSIFICATION[DATA_PAIR_CLASSIFICATION.iloc[:, 2:].ne("").any(axis=1)]
838
+
839
  DATA_RERANKING = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_RERANKING])
840
+ DATA_RERANKING = DATA_RERANKING[DATA_RERANKING.iloc[:, 2:].ne("").any(axis=1)]
841
+
842
  DATA_RETRIEVAL = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_RETRIEVAL])
843
+ DATA_RETRIEVAL = DATA_RETRIEVAL[DATA_RETRIEVAL.iloc[:, 2:].ne("").any(axis=1)]
844
+
845
  DATA_STS_EN = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_STS])
846
+ DATA_STS_EN = DATA_STS_EN[DATA_STS_EN.iloc[:, 2:].ne("").any(axis=1)]
847
+
848
  DATA_SUMMARIZATION = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION])
849
+ DATA_SUMMARIZATION = DATA_SUMMARIZATION[DATA_SUMMARIZATION.iloc[:, 1:].ne("").any(axis=1)]
850
 
851
  # Fill NaN after averaging
852
  DATA_OVERALL.fillna("", inplace=True)
853
 
854
  DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
855
+ DATA_OVERALL = DATA_OVERALL[DATA_OVERALL.iloc[:, 5:].ne("").any(axis=1)]
856
 
857
  return DATA_OVERALL
858
 
859
def get_mteb_average_zh():
    """Rebuild the Chinese (C-MTEB) leaderboard tables and return the overall one.

    Fetches the scores for every Chinese dataset, adds the overall and the
    per-task average columns, ranks the models by the overall average and
    refreshes the module-level DATA_*_ZH tables as a side effect.

    Returns:
        The overall table (DATA_OVERALL_ZH), restricted to the summary columns
        and to rows that have at least one non-empty value from column index 5
        (the overall average) onwards.
    """
    global DATA_OVERALL_ZH, DATA_CLASSIFICATION_ZH, DATA_CLUSTERING_ZH, DATA_PAIR_CLASSIFICATION_ZH, DATA_RERANKING_ZH, DATA_RETRIEVAL_ZH, DATA_STS_ZH

    def _ranked_task_table(overall, dataset_columns):
        # Rank the models on one task's datasets, then only keep rows with at
        # least one score in addition to the "Model" & rank column.
        ranked = add_rank(overall[["Model"] + dataset_columns])
        has_score = ranked.iloc[:, 2:].ne("").any(axis=1)
        return ranked[has_score]

    # (label used in the average column header, datasets of that task)
    task_groups = [
        ("Classification", TASK_LIST_CLASSIFICATION_ZH),
        ("Clustering", TASK_LIST_CLUSTERING_ZH),
        ("Pair Classification", TASK_LIST_PAIR_CLASSIFICATION_ZH),
        ("Reranking", TASK_LIST_RERANKING_ZH),
        ("Retrieval", TASK_LIST_RETRIEVAL_ZH),
        ("STS", TASK_LIST_STS_ZH),
    ]
    overall_avg_col = f"Average ({len(TASK_LIST_ZH)} datasets)"
    task_avg_cols = [f"{label} Average ({len(names)} datasets)" for label, names in task_groups]

    DATA_OVERALL_ZH = get_mteb_data(
        tasks=["Classification", "Clustering", "PairClassification", "Reranking", "Retrieval", "STS"],
        datasets=[name for _, names in task_groups for name in names],
        fillna=False,
        add_emb_dim=True,
        rank=False,
    )
    # For debugging, the raw table can be dumped with DATA_OVERALL_ZH.to_csv("overall.csv")

    # skipna=False: a model only gets an average when it has every score of the group
    DATA_OVERALL_ZH.insert(1, overall_avg_col, DATA_OVERALL_ZH[TASK_LIST_ZH].mean(axis=1, skipna=False))
    for position, ((_, names), column) in enumerate(zip(task_groups, task_avg_cols), start=2):
        DATA_OVERALL_ZH.insert(position, column, DATA_OVERALL_ZH[names].mean(axis=1, skipna=False))

    DATA_OVERALL_ZH.sort_values(overall_avg_col, ascending=False, inplace=True)
    # Ranks start from 1
    DATA_OVERALL_ZH.insert(0, "Rank", list(range(1, len(DATA_OVERALL_ZH) + 1)))

    DATA_OVERALL_ZH = DATA_OVERALL_ZH.round(2)

    DATA_CLASSIFICATION_ZH = _ranked_task_table(DATA_OVERALL_ZH, TASK_LIST_CLASSIFICATION_ZH)
    DATA_CLUSTERING_ZH = _ranked_task_table(DATA_OVERALL_ZH, TASK_LIST_CLUSTERING_ZH)
    DATA_PAIR_CLASSIFICATION_ZH = _ranked_task_table(DATA_OVERALL_ZH, TASK_LIST_PAIR_CLASSIFICATION_ZH)
    DATA_RERANKING_ZH = _ranked_task_table(DATA_OVERALL_ZH, TASK_LIST_RERANKING_ZH)
    DATA_RETRIEVAL_ZH = _ranked_task_table(DATA_OVERALL_ZH, TASK_LIST_RETRIEVAL_ZH)
    DATA_STS_ZH = _ranked_task_table(DATA_OVERALL_ZH, TASK_LIST_STS_ZH)

    # Fill NaN only after averaging, so the means above were computed on numbers
    DATA_OVERALL_ZH.fillna("", inplace=True)

    summary_columns = ["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", overall_avg_col] + task_avg_cols
    DATA_OVERALL_ZH = DATA_OVERALL_ZH[summary_columns]
    DATA_OVERALL_ZH = DATA_OVERALL_ZH[DATA_OVERALL_ZH.iloc[:, 5:].ne("").any(axis=1)]

    return DATA_OVERALL_ZH
917
+
918
  get_mteb_average()
919
+ get_mteb_average_zh()
920
  DATA_BITEXT_MINING = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING)
921
  DATA_BITEXT_MINING_OTHER = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING_OTHER)
922
  DATA_CLASSIFICATION_DA = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_DA)
923
  DATA_CLASSIFICATION_NB = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_NB)
924
  DATA_CLASSIFICATION_SV = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_SV)
925
  DATA_CLASSIFICATION_OTHER = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_OTHER)
926
+ DATA_CLUSTERING_DE = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
927
  DATA_RETRIEVAL_PL = get_mteb_data(["Retrieval"], [], TASK_LIST_RETRIEVAL_PL)
928
+ DATA_STS_OTHER = get_mteb_data(["STS"], [], TASK_LIST_STS_OTHER)
929
 
930
  # Exact, add all non-nan integer values for every dataset
931
  NUM_SCORES = 0
932
  DATASETS = []
933
+ MODELS = []
934
  # LANGUAGES = []
935
+ for d in [
936
+ DATA_BITEXT_MINING,
937
+ DATA_BITEXT_MINING_OTHER,
938
+ DATA_CLASSIFICATION_EN,
939
+ DATA_CLASSIFICATION_DA,
940
+ DATA_CLASSIFICATION_NB,
941
+ DATA_CLASSIFICATION_SV,
942
+ DATA_CLASSIFICATION_ZH,
943
+ DATA_CLASSIFICATION_OTHER,
944
+ DATA_CLUSTERING,
945
+ DATA_CLUSTERING_DE,
946
+ DATA_CLUSTERING_ZH,
947
+ DATA_PAIR_CLASSIFICATION,
948
+ DATA_PAIR_CLASSIFICATION_ZH,
949
+ DATA_RERANKING,
950
+ DATA_RERANKING_ZH,
951
+ DATA_RETRIEVAL,
952
+ DATA_RETRIEVAL_ZH,
953
+ DATA_STS_EN,
954
+ DATA_STS_ZH,
955
+ DATA_STS_OTHER,
956
+ DATA_SUMMARIZATION,
957
+ ]:
958
  # NUM_SCORES += d.iloc[:, 1:].apply(lambda x: sum([1 for y in x if isinstance(y, float) and not np.isnan(y)]), axis=1).sum()
959
  cols_to_ignore = 3 if "Average" in d.columns else 2
960
  # Count number of scores including only non-nan floats & excluding the rank column
 
962
  # Exclude rank & model name column (first two); Do not count different language versions as different datasets
963
  DATASETS += [i.split(" ")[0] for i in d.columns[cols_to_ignore:]]
964
  # LANGUAGES += [i.split(" ")[-1] for i in d.columns[cols_to_ignore:]]
965
+ MODELS += d["Model"].tolist()
966
 
967
  NUM_DATASETS = len(set(DATASETS))
968
  # NUM_LANGUAGES = len(set(LANGUAGES))
969
+ NUM_MODELS = len(set(MODELS))
970
 
971
  block = gr.Blocks()
972
  with block:
 
976
  - **Total Datasets**: {NUM_DATASETS}
977
  - **Total Languages**: 113
978
  - **Total Scores**: {NUM_SCORES}
979
+ - **Total Models**: {NUM_MODELS}
980
  """)
981
  with gr.Tabs():
982
  with gr.TabItem("Overall"):
983
+ with gr.TabItem("English"):
984
+ with gr.Row():
985
+ gr.Markdown("""
986
+ **Overall MTEB English leaderboard 🔮**
987
+
988
+ - **Metric:** Various, refer to task tabs
989
+ - **Languages:** English
990
+ """)
991
+ with gr.Row():
992
+ data_overall = gr.components.Dataframe(
993
+ DATA_OVERALL,
994
+ datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL.columns),
995
+ type="pandas",
996
+ wrap=True,
997
+ )
998
+ with gr.Row():
999
+ data_run_overall = gr.Button("Refresh")
1000
+ data_run_overall.click(get_mteb_average, inputs=None, outputs=data_overall)
1001
+ with gr.TabItem("Chinese"):
1002
+ with gr.Row():
1003
+ gr.Markdown("""
1004
+ **Overall MTEB Chinese leaderboard (C-MTEB) 🔮🇨🇳**
1005
+
1006
+ - **Metric:** Various, refer to task tabs
1007
+ - **Languages:** Chinese
1008
+ - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1009
+ """)
1010
+ with gr.Row():
1011
+ data_overall_zh = gr.components.Dataframe(
1012
+ DATA_OVERALL_ZH,
1013
+ datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL_ZH.columns),
1014
+ type="pandas",
1015
+ wrap=True,
1016
+ )
1017
+ with gr.Row():
1018
+ data_run_overall_zh = gr.Button("Refresh")
1019
+ data_run_overall_zh.click(get_mteb_average_zh, inputs=None, outputs=data_overall_zh)
1020
  with gr.TabItem("Bitext Mining"):
1021
  with gr.TabItem("English-X"):
1022
  with gr.Row():
1023
  gr.Markdown("""
1024
+ **Bitext Mining English-X Leaderboard 🎌**
1025
 
1026
  - **Metric:** [F1](https://huggingface.co/spaces/evaluate-metric/f1)
1027
  - **Languages:** 117 (Pairs of: English & other language)
 
1033
  type="pandas",
1034
  )
1035
  with gr.Row():
1036
+ data_run_bitext_mining = gr.Button("Refresh")
1037
  task_bitext_mining = gr.Variable(value=["BitextMining"])
1038
  lang_bitext_mining = gr.Variable(value=[])
1039
  datasets_bitext_mining = gr.Variable(value=TASK_LIST_BITEXT_MINING)
1040
+ data_run_bitext_mining.click(
1041
  get_mteb_data,
1042
  inputs=[task_bitext_mining, lang_bitext_mining, datasets_bitext_mining],
1043
  outputs=data_bitext_mining,
 
1058
  type="pandas",
1059
  )
1060
  with gr.Row():
1061
+ data_run_bitext_mining_da = gr.Button("Refresh")
1062
  task_bitext_mining_da = gr.Variable(value=["BitextMining"])
1063
  lang_bitext_mining_da = gr.Variable(value=[])
1064
  datasets_bitext_mining_da = gr.Variable(value=TASK_LIST_BITEXT_MINING_OTHER)
1065
+ data_run_bitext_mining_da.click(
1066
  get_mteb_data,
1067
  inputs=[
1068
  task_bitext_mining_da,
 
1075
  with gr.TabItem("English"):
1076
  with gr.Row():
1077
  gr.Markdown("""
1078
+ **Classification English Leaderboard ❤️**
1079
 
1080
  - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1081
  - **Languages:** English
 
1098
  ],
1099
  outputs=data_classification_en,
1100
  )
1101
+ with gr.TabItem("Chinese"):
1102
+ with gr.Row():
1103
+ gr.Markdown("""
1104
+ **Classification Chinese Leaderboard 🧡🇨🇳**
1105
+
1106
+ - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1107
+ - **Languages:** Chinese
1108
+ - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1109
+ """)
1110
+ with gr.Row():
1111
+ data_classification_zh = gr.components.Dataframe(
1112
+ DATA_CLASSIFICATION_ZH,
1113
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_ZH.columns),
1114
+ type="pandas",
1115
+ )
1116
+ with gr.Row():
1117
+ data_run_classification_zh = gr.Button("Refresh")
1118
+ task_classification_zh = gr.Variable(value=["Classification"])
1119
+ lang_classification_zh = gr.Variable([])
1120
+ datasets_classification_zh = gr.Variable(value=TASK_LIST_CLASSIFICATION_ZH)
1121
+ data_run_classification_zh.click(
1122
+ get_mteb_data,
1123
+ inputs=[
1124
+ task_classification_zh,
1125
+ lang_classification_zh,
1126
+ datasets_classification_zh,
1127
+ ],
1128
+ outputs=data_classification_zh,
1129
+ )
1130
  with gr.TabItem("Danish"):
1131
  with gr.Row():
1132
  gr.Markdown("""
 
1229
  type="pandas",
1230
  )
1231
  with gr.Row():
1232
+ data_run_classification = gr.Button("Refresh")
1233
  task_classification = gr.Variable(value=["Classification"])
1234
  lang_classification = gr.Variable(value=[])
1235
  datasets_classification = gr.Variable(value=TASK_LIST_CLASSIFICATION_OTHER)
1236
+ data_run_classification.click(
1237
  get_mteb_data,
1238
  inputs=[
1239
  task_classification,
 
1258
  type="pandas",
1259
  )
1260
  with gr.Row():
1261
+ data_run_clustering_en = gr.Button("Refresh")
1262
  task_clustering = gr.Variable(value=["Clustering"])
1263
  lang_clustering = gr.Variable(value=[])
1264
  datasets_clustering = gr.Variable(value=TASK_LIST_CLUSTERING)
1265
+ data_run_clustering_en.click(
1266
  get_mteb_data,
1267
  inputs=[task_clustering, lang_clustering, datasets_clustering],
1268
  outputs=data_clustering,
1269
  )
1270
+ with gr.TabItem("Chinese"):
1271
+ with gr.Row():
1272
+ gr.Markdown("""
1273
+ **Clustering Chinese Leaderboard ✨🇨🇳**
1274
+
1275
+ - **Metric:** Validity Measure (v_measure)
1276
+ - **Languages:** Chinese
1277
+ - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1278
+ """)
1279
+ with gr.Row():
1280
+ data_clustering_zh = gr.components.Dataframe(
1281
+ DATA_CLUSTERING_ZH,
1282
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING_ZH.columns),
1283
+ type="pandas",
1284
+ )
1285
+ with gr.Row():
1286
+ data_run_clustering_zh = gr.Button("Refresh")
1287
+ task_clustering_zh = gr.Variable(value=["Clustering"])
1288
+ lang_clustering_zh = gr.Variable(value=[])
1289
+ datasets_clustering_zh = gr.Variable(value=TASK_LIST_CLUSTERING_ZH)
1290
+ data_run_clustering_zh.click(
1291
+ get_mteb_data,
1292
+ inputs=[task_clustering_zh, lang_clustering_zh, datasets_clustering_zh],
1293
+ outputs=data_clustering_zh,
1294
+ )
1295
  with gr.TabItem("German"):
1296
  with gr.Row():
1297
  gr.Markdown("""
 
1303
  """)
1304
  with gr.Row():
1305
  data_clustering_de = gr.components.Dataframe(
1306
+ DATA_CLUSTERING_DE,
1307
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING_DE.columns) * 2,
1308
  type="pandas",
1309
  )
1310
  with gr.Row():
1311
+ data_run_clustering_de = gr.Button("Refresh")
1312
  task_clustering_de = gr.Variable(value=["Clustering"])
1313
  lang_clustering_de = gr.Variable(value=[])
1314
  datasets_clustering_de = gr.Variable(value=TASK_LIST_CLUSTERING_DE)
1315
+ data_run_clustering_de.click(
1316
  get_mteb_data,
1317
  inputs=[task_clustering_de, lang_clustering_de, datasets_clustering_de],
1318
  outputs=data_clustering_de,
1319
  )
1320
  with gr.TabItem("Pair Classification"):
1321
+ with gr.TabItem("English"):
1322
+ with gr.Row():
1323
+ gr.Markdown("""
1324
+ **Pair Classification English Leaderboard 🎭**
1325
+
1326
+ - **Metric:** Average Precision based on Cosine Similarities (cos_sim_ap)
1327
+ - **Languages:** English
1328
+ """)
1329
+ with gr.Row():
1330
+ data_pair_classification = gr.components.Dataframe(
1331
+ DATA_PAIR_CLASSIFICATION,
1332
+ datatype=["number", "markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION.columns),
1333
+ type="pandas",
1334
+ )
1335
+ with gr.Row():
1336
+ data_run_pair_classification = gr.Button("Refresh")
1337
+ task_pair_classification = gr.Variable(value=["PairClassification"])
1338
+ lang_pair_classification = gr.Variable(value=[])
1339
+ datasets_pair_classification = gr.Variable(value=TASK_LIST_PAIR_CLASSIFICATION)
1340
+ data_run_pair_classification.click(
1341
+ get_mteb_data,
1342
+ inputs=[
1343
+ task_pair_classification,
1344
+ lang_pair_classification,
1345
+ datasets_pair_classification,
1346
+ ],
1347
+ outputs=data_pair_classification,
1348
+ )
1349
+ with gr.TabItem("Chinese"):
1350
+ with gr.Row():
1351
+ gr.Markdown("""
1352
+ **Pair Classification Chinese Leaderboard 🎭🇨🇳**
1353
+
1354
+ - **Metric:** Average Precision based on Cosine Similarities (cos_sim_ap)
1355
+ - **Languages:** Chinese
1356
+ - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1357
+ """)
1358
+ with gr.Row():
1359
+ data_pair_classification_zh = gr.components.Dataframe(
1360
+ DATA_PAIR_CLASSIFICATION_ZH,
1361
+ datatype=["number", "markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION_ZH.columns),
1362
+ type="pandas",
1363
+ )
1364
+ with gr.Row():
1365
+ data_run = gr.Button("Refresh")
1366
+ task_pair_classification_zh = gr.Variable(value=["PairClassification"])
1367
+ lang_pair_classification_zh = gr.Variable(value=[])
1368
+ datasets_pair_classification_zh = gr.Variable(value=TASK_LIST_PAIR_CLASSIFICATION_ZH)
1369
+ data_run_classification_zh.click(
1370
+ get_mteb_data,
1371
+ inputs=[
1372
+ task_pair_classification_zh,
1373
+ lang_pair_classification_zh,
1374
+ datasets_pair_classification_zh,
1375
+ ],
1376
+ outputs=data_pair_classification_zh,
1377
+ )
1378
  with gr.TabItem("Reranking"):
1379
+ with gr.TabItem("English"):
1380
+ with gr.Row():
1381
+ gr.Markdown("""
1382
+ **Reranking English Leaderboard 🥈**
1383
+
1384
+ - **Metric:** Mean Average Precision (MAP)
1385
+ - **Languages:** English
1386
+ """)
1387
+ with gr.Row():
1388
+ data_reranking = gr.components.Dataframe(
1389
+ DATA_RERANKING,
1390
+ datatype=["number", "markdown"] + ["number"] * len(DATA_RERANKING.columns),
1391
+ type="pandas",
1392
+ )
1393
+ with gr.Row():
1394
+ data_run_reranking = gr.Button("Refresh")
1395
+ task_reranking = gr.Variable(value=["Reranking"])
1396
+ lang_reranking = gr.Variable(value=[])
1397
+ datasets_reranking = gr.Variable(value=TASK_LIST_RERANKING)
1398
+ data_run_reranking.click(
1399
+ get_mteb_data,
1400
+ inputs=[
1401
+ task_reranking,
1402
+ lang_reranking,
1403
+ datasets_reranking,
1404
+ ],
1405
+ outputs=data_reranking
1406
+ )
1407
+ with gr.TabItem("Chinese"):
1408
+ with gr.Row():
1409
+ gr.Markdown("""
1410
+ **Reranking Chinese Leaderboard 🥈🇨🇳**
1411
+
1412
+ - **Metric:** Mean Average Precision (MAP)
1413
+ - **Languages:** Chinese
1414
+ - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1415
+ """)
1416
+ with gr.Row():
1417
+ data_reranking_zh = gr.components.Dataframe(
1418
+ DATA_RERANKING_ZH,
1419
+ datatype=["number", "markdown"] + ["number"] * len(DATA_RERANKING_ZH.columns),
1420
+ type="pandas",
1421
+ )
1422
+ with gr.Row():
1423
+ data_run_reranking_zh = gr.Button("Refresh")
1424
+ task_reranking_zh = gr.Variable(value=["Reranking"])
1425
+ lang_reranking_zh = gr.Variable(value=[])
1426
+ datasets_reranking_zh = gr.Variable(value=TASK_LIST_RERANKING_ZH)
1427
+ data_run_reranking_zh.click(
1428
+ get_mteb_data,
1429
+ inputs=[task_reranking_zh, lang_reranking_zh, datasets_reranking_zh],
1430
+ outputs=data_reranking_zh,
1431
+ )
1432
  with gr.TabItem("Retrieval"):
1433
  with gr.TabItem("English"):
1434
  with gr.Row():
1435
  gr.Markdown("""
1436
+ **Retrieval English Leaderboard 🔎**
1437
 
1438
  - **Metric:** Normalized Discounted Cumulative Gain @ k (ndcg_at_10)
1439
  - **Languages:** English
 
1446
  type="pandas",
1447
  )
1448
  with gr.Row():
1449
+ data_run_retrieval = gr.Button("Refresh")
1450
  task_retrieval = gr.Variable(value=["Retrieval"])
1451
+ lang_retrieval = gr.Variable(value=[])
1452
+ datasets_retrieval = gr.Variable(value=TASK_LIST_RETRIEVAL)
1453
+ data_run_retrieval.click(
1454
+ get_mteb_data,
1455
+ inputs=[
1456
+ task_retrieval,
1457
+ lang_retrieval,
1458
+ datasets_retrieval,
1459
+ ],
1460
+ outputs=data_retrieval
1461
+ )
1462
+ with gr.TabItem("Chinese"):
1463
+ with gr.Row():
1464
+ gr.Markdown("""
1465
+ **Retrieval Chinese Leaderboard 🔎🇨🇳**
1466
+
1467
+ - **Metric:** Normalized Discounted Cumulative Gain @ k (ndcg_at_10)
1468
+ - **Languages:** Chinese
1469
+ - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1470
+ """)
1471
+ with gr.Row():
1472
+ data_retrieval_zh = gr.components.Dataframe(
1473
+ DATA_RETRIEVAL_ZH,
1474
+ # Add support for more columns than existing as a buffer for CQADupstack & other Retrieval tasks (e.g. MSMARCOv2)
1475
+ datatype=["number", "markdown"] + ["number"] * len(DATA_RETRIEVAL_ZH.columns) * 2,
1476
+ type="pandas",
1477
+ )
1478
+ with gr.Row():
1479
+ data_run_retrieval_zh = gr.Button("Refresh")
1480
+ task_retrieval_zh = gr.Variable(value=["Retrieval"])
1481
+ lang_retrieval_zh = gr.Variable(value=[])
1482
+ datasets_retrieval_zh = gr.Variable(value=TASK_LIST_RETRIEVAL_ZH)
1483
+ data_run_retrieval_zh.click(
1484
+ get_mteb_data,
1485
+ inputs=[task_retrieval_zh, lang_retrieval_zh, datasets_retrieval_zh],
1486
+ outputs=data_retrieval_zh,
1487
  )
1488
  with gr.TabItem("Polish"):
1489
  with gr.Row():
 
1502
  type="pandas",
1503
  )
1504
  with gr.Row():
1505
+ data_run_retrieval_pl = gr.Button("Refresh")
1506
  task_retrieval_pl = gr.Variable(value=["Retrieval"])
1507
  lang_retrieval_pl = gr.Variable(value=[])
1508
  datasets_retrieval_pl = gr.Variable(value=TASK_LIST_RETRIEVAL_PL)
1509
+ data_run_retrieval_pl.click(
1510
  get_mteb_data,
1511
  inputs=[task_retrieval_pl, lang_retrieval_pl, datasets_retrieval_pl],
1512
  outputs=data_retrieval_pl
 
1515
  with gr.TabItem("English"):
1516
  with gr.Row():
1517
  gr.Markdown("""
1518
+ **STS English Leaderboard 🤖**
1519
 
1520
  - **Metric:** Spearman correlation based on cosine similarity
1521
  - **Languages:** English
 
1529
  with gr.Row():
1530
  data_run_sts_en = gr.Button("Refresh")
1531
  task_sts_en = gr.Variable(value=["STS"])
1532
+ lang_sts_en = gr.Variable(value=[])
1533
+ datasets_sts_en = gr.Variable(value=TASK_LIST_STS)
1534
  data_run_sts_en.click(
1535
  get_mteb_data,
1536
+ inputs=[task_sts_en, lang_sts_en, datasets_sts_en],
1537
  outputs=data_sts_en,
1538
  )
1539
+ with gr.TabItem("Chinese"):
1540
  with gr.Row():
1541
  gr.Markdown("""
1542
+ **STS Chinese Leaderboard 🤖🇨🇳**
1543
 
1544
  - **Metric:** Spearman correlation based on cosine similarity
1545
+ - **Languages:** Chinese
1546
+ - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1547
  """)
1548
  with gr.Row():
1549
+ data_sts_zh = gr.components.Dataframe(
1550
+ DATA_STS_ZH,
1551
+ datatype=["number", "markdown"] + ["number"] * len(DATA_STS_ZH.columns),
1552
  type="pandas",
1553
  )
1554
  with gr.Row():
1555
+ data_run_sts_zh = gr.Button("Refresh")
1556
+ task_sts_zh = gr.Variable(value=["STS"])
1557
+ lang_sts_zh = gr.Variable(value=[])
1558
+ datasets_sts_zh = gr.Variable(value=TASK_LIST_STS_ZH)
1559
+ data_run_sts_zh.click(
1560
+ get_mteb_data,
1561
+ inputs=[task_sts_zh, lang_sts_zh, datasets_sts_zh],
1562
+ outputs=data_sts_zh,
1563
+ )
1564
+ with gr.TabItem("Other"):
1565
+ with gr.Row():
1566
+ gr.Markdown("""
1567
+ **STS Other Leaderboard 👽**
1568
+
1569
+ - **Metric:** Spearman correlation based on cosine similarity
1570
+ - **Languages:** Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Russian, Spanish (Only language combos not included in the other tabs)
1571
+ """)
1572
+ with gr.Row():
1573
+ data_sts_other = gr.components.Dataframe(
1574
+ DATA_STS_OTHER,
1575
+ datatype=["number", "markdown"] + ["number"] * len(DATA_STS_OTHER.columns) * 2,
1576
+ type="pandas",
1577
+ )
1578
+ with gr.Row():
1579
+ data_run_sts_other = gr.Button("Refresh")
1580
+ task_sts_other = gr.Variable(value=["STS"])
1581
+ lang_sts_other = gr.Variable(value=[])
1582
+ datasets_sts_other = gr.Variable(value=TASK_LIST_STS_OTHER)
1583
+ data_run_sts_other.click(
1584
+ get_mteb_data,
1585
+ inputs=[task_sts_other, lang_sts_other, task_sts_other, datasets_sts_other],
1586
+ outputs=data_sts_other
1587
+ )
1588
  with gr.TabItem("Summarization"):
1589
  with gr.Row():
1590
  gr.Markdown("""