Muennighoff committed on
Commit
f8ed0b8
·
verified ·
1 Parent(s): 70c518e
Files changed (2) hide show
  1. app.py +15 -2
  2. config.yaml +1 -0
app.py CHANGED
@@ -116,8 +116,16 @@ for model in pbar:
116
  ds = ds.map(add_task)
117
  base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
118
  # For now only one metric per task - Could add more metrics later on
 
 
 
 
 
 
 
 
119
  for task, metric in TASK_TO_METRIC.items():
120
- ds_dict = ds.filter(lambda x: (x["mteb_task"] == task) and (x["metric"] == metric))["test"].to_dict()
121
  ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
122
  EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
123
 
@@ -463,6 +471,7 @@ for board, board_config in BOARDS_CONFIG.items():
463
  "data": boards_data[board]["data_tasks"][task_category],
464
  "refresh": get_refresh_function(task_category, task_category_list),
465
  "credits": credits,
 
466
  })
467
 
468
  dataframes = []
@@ -618,11 +627,15 @@ with gr.Blocks(css=css) as block:
618
  # For updating the 'language' in the URL
619
  item_tab.select(update_url_language, [current_task_language, language_per_task], [current_task_language, language_per_task], trigger_mode="always_last").then(None, [current_task_language], [], js=set_window_url_params)
620
 
 
 
 
 
621
  with gr.Row():
622
  gr.Markdown(f"""
623
  {item['description']}
624
 
625
- - **Metric:** {metric}
626
  - **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
627
  {"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
628
  """)
 
116
  ds = ds.map(add_task)
117
  base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
118
  # For now only one metric per task - Could add more metrics later on
119
+
120
+ def filter_function(x, task, metric):
121
+ # This is a hack for the passkey and needle retrieval tests, which report ndcg_at_1 (i.e. accuracy) rather than the ndcg_at_10 that is commonly used for retrieval tasks.
122
+ if x['mteb_dataset_name'] in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval']:
123
+ return x["mteb_task"] == task and x['metric'] == 'ndcg_at_1'
124
+ else:
125
+ return x["mteb_task"] == task and x["metric"] == metric
126
+
127
  for task, metric in TASK_TO_METRIC.items():
128
+ ds_dict = ds.filter(lambda x: filter_function(x, task, metric))["test"].to_dict()
129
  ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
130
  EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
131
 
 
471
  "data": boards_data[board]["data_tasks"][task_category],
472
  "refresh": get_refresh_function(task_category, task_category_list),
473
  "credits": credits,
474
+ "metric": board_config.get("metric", None),
475
  })
476
 
477
  dataframes = []
 
627
  # For updating the 'language' in the URL
628
  item_tab.select(update_url_language, [current_task_language, language_per_task], [current_task_language, language_per_task], trigger_mode="always_last").then(None, [current_task_language], [], js=set_window_url_params)
629
 
630
+ specific_metric = metric
631
+ if item.get("metric", None) is not None:
632
+ specific_metric = item['metric']
633
+
634
  with gr.Row():
635
  gr.Markdown(f"""
636
  {item['description']}
637
 
638
+ - **Metric:** {specific_metric}
639
  - **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
640
  {"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
641
  """)
config.yaml CHANGED
@@ -301,6 +301,7 @@ boards:
301
  icon: "📚"
302
  special_icons: null
303
  credits: "[LongEmbed](https://arxiv.org/abs/2404.12096v2)"
 
304
  tasks:
305
  Retrieval:
306
  - LEMBNarrativeQARetrieval
 
301
  icon: "📚"
302
  special_icons: null
303
  credits: "[LongEmbed](https://arxiv.org/abs/2404.12096v2)"
304
+ metric: nDCG@10 (for NarrativeQA, QMSum, SummScreenFD, WikimQA) & nDCG@1 (for passkey and needle)
305
  tasks:
306
  Retrieval:
307
  - LEMBNarrativeQARetrieval