Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Muennighoff
committed on
fix metrics for LongEmbed (#124)
Browse files
- init branch (b12b1dc2ff1f79ab6e8f4d7e10fe15eee4c3a0a9)
- small fix (877acad403f31fb7c96c051de7324e9874c47336)
- app.py +15 -2
- config.yaml +1 -0
app.py
CHANGED
@@ -116,8 +116,16 @@ for model in pbar:
|
|
116 |
ds = ds.map(add_task)
|
117 |
base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
|
118 |
# For now only one metric per task - Could add more metrics later on
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
for task, metric in TASK_TO_METRIC.items():
|
120 |
-
ds_dict = ds.filter(lambda x: (x
|
121 |
ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
|
122 |
EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
|
123 |
|
@@ -463,6 +471,7 @@ for board, board_config in BOARDS_CONFIG.items():
|
|
463 |
"data": boards_data[board]["data_tasks"][task_category],
|
464 |
"refresh": get_refresh_function(task_category, task_category_list),
|
465 |
"credits": credits,
|
|
|
466 |
})
|
467 |
|
468 |
dataframes = []
|
@@ -618,11 +627,15 @@ with gr.Blocks(css=css) as block:
|
|
618 |
# For updating the 'language' in the URL
|
619 |
item_tab.select(update_url_language, [current_task_language, language_per_task], [current_task_language, language_per_task], trigger_mode="always_last").then(None, [current_task_language], [], js=set_window_url_params)
|
620 |
|
|
|
|
|
|
|
|
|
621 |
with gr.Row():
|
622 |
gr.Markdown(f"""
|
623 |
{item['description']}
|
624 |
|
625 |
-
- **Metric:** {
|
626 |
- **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
|
627 |
{"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
|
628 |
""")
|
|
|
116 |
ds = ds.map(add_task)
|
117 |
base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
|
118 |
# For now only one metric per task - Could add more metrics later on
|
119 |
+
|
120 |
+
def filter_function(x, task, metric):
|
121 |
+
# This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
|
122 |
+
if x['mteb_dataset_name'] in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval']:
|
123 |
+
return x["mteb_task"] == task and x['metric'] == 'ndcg_at_1'
|
124 |
+
else:
|
125 |
+
return x["mteb_task"] == task and x["metric"] == metric
|
126 |
+
|
127 |
for task, metric in TASK_TO_METRIC.items():
|
128 |
+
ds_dict = ds.filter(lambda x: filter_function(x, task, metric))["test"].to_dict()
|
129 |
ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
|
130 |
EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
|
131 |
|
|
|
471 |
"data": boards_data[board]["data_tasks"][task_category],
|
472 |
"refresh": get_refresh_function(task_category, task_category_list),
|
473 |
"credits": credits,
|
474 |
+
"metric": board_config.get("metric", None),
|
475 |
})
|
476 |
|
477 |
dataframes = []
|
|
|
627 |
# For updating the 'language' in the URL
|
628 |
item_tab.select(update_url_language, [current_task_language, language_per_task], [current_task_language, language_per_task], trigger_mode="always_last").then(None, [current_task_language], [], js=set_window_url_params)
|
629 |
|
630 |
+
specific_metric = metric
|
631 |
+
if item.get("metric", None) is not None:
|
632 |
+
specific_metric = item['metric']
|
633 |
+
|
634 |
with gr.Row():
|
635 |
gr.Markdown(f"""
|
636 |
{item['description']}
|
637 |
|
638 |
+
- **Metric:** {specific_metric}
|
639 |
- **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
|
640 |
{"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
|
641 |
""")
|
config.yaml
CHANGED
@@ -301,6 +301,7 @@ boards:
|
|
301 |
icon: "📚"
|
302 |
special_icons: null
|
303 |
credits: "[LongEmbed](https://arxiv.org/abs/2404.12096v2)"
|
|
|
304 |
tasks:
|
305 |
Retrieval:
|
306 |
- LEMBNarrativeQARetrieval
|
|
|
301 |
icon: "📚"
|
302 |
special_icons: null
|
303 |
credits: "[LongEmbed](https://arxiv.org/abs/2404.12096v2)"
|
304 |
+
metric: nDCG@10 (for NarrativeQA, QMSum, SummScreenFD, WikimQA) & nDCG@1 (for passkey and needle)
|
305 |
tasks:
|
306 |
Retrieval:
|
307 |
- LEMBNarrativeQARetrieval
|