Spaces:

mteb
/

leaderboard

Running on CPU Upgrade

App Files Community

149

zdwls committed on Jun 12, 2024

Commit

b12b1dc

1 Parent(s): a812c3b

init branch

Browse files

Files changed (2) hide show

app.py +16 -2
config.yaml +1 -0

app.py CHANGED Viewed

@@ -116,11 +116,20 @@ for model in pbar:
     ds = ds.map(add_task)
     base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
     # For now only one metric per task - could add more metrics later on
     for task, metric in TASK_TO_METRIC.items():
-        ds_dict = ds.filter(lambda x: (x["mteb_task"] == task) and (x["metric"] == metric))["test"].to_dict()
         ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
         EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
 # Save & cache EXTERNAL_MODEL_RESULTS
 with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
     json.dump(EXTERNAL_MODEL_RESULTS, f)
@@ -457,6 +466,7 @@ for board, board_config in BOARDS_CONFIG.items():
             "data": boards_data[board]["data_tasks"][task_category],
             "refresh": get_refresh_function(task_category, task_category_list),
             "credits": credits,
         })
 dataframes = []
@@ -612,11 +622,15 @@ with gr.Blocks(css=css) as block:
                             # For updating the 'language' in the URL
                             item_tab.select(update_url_language, [current_task_language, language_per_task], [current_task_language, language_per_task], trigger_mode="always_last").then(None, [current_task_language], [], js=set_window_url_params)
                             with gr.Row():
                                 gr.Markdown(f"""
                                 {item['description']}
-                                - **Metric:** {metric}
                                 - **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
                                 {"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
                                 """)

     ds = ds.map(add_task)
     base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
     # For now only one metric per task - could add more metrics later on
+    def filter_function(x, task, metric):  # Row predicate passed to ds.filter: True when row `x` belongs to `task` and carries the metric that applies to its dataset.
+        # HACK: the passkey and needle retrieval tests report ndcg_at_1 (i.e. accuracy) rather than the ndcg_at_10 commonly used for retrieval tasks, so for these two datasets the `metric` argument is ignored and ndcg_at_1 is matched instead.
+        if x['mteb_dataset_name'] in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval']:
+            return x["mteb_task"] == task and x['metric'] == 'ndcg_at_1'
+        else:
+            return x["mteb_task"] == task and x["metric"] == metric  # every other dataset is matched on the caller-supplied metric
     for task, metric in TASK_TO_METRIC.items():
+        ds_dict = ds.filter(lambda x: filter_function(x, task, metric))["test"].to_dict()
         ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
         EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
+print("********************hello********************")
 # Save & cache EXTERNAL_MODEL_RESULTS
 with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
     json.dump(EXTERNAL_MODEL_RESULTS, f)
             "data": boards_data[board]["data_tasks"][task_category],
             "refresh": get_refresh_function(task_category, task_category_list),
             "credits": credits,
+            "metric": board_config.get("metric", None),
         })
 dataframes = []
                             # For updating the 'language' in the URL
                             item_tab.select(update_url_language, [current_task_language, language_per_task], [current_task_language, language_per_task], trigger_mode="always_last").then(None, [current_task_language], [], js=set_window_url_params)
+                            specific_metric = metric
+                            if item.get("metric", None) is not None:
+                                specific_metric = item['metric']
                             with gr.Row():
                                 gr.Markdown(f"""
                                 {item['description']}
+                                - **Metric:** {specific_metric}
                                 - **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
                                 {"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
                                 """)

config.yaml CHANGED Viewed

@@ -301,6 +301,7 @@ boards:
     icon: "📚"
     special_icons: null
     credits: "[LongEmbed](https://arxiv.org/abs/2404.12096v2)"
     tasks:
       Retrieval:
         - LEMBNarrativeQARetrieval

     icon: "📚"
     special_icons: null
     credits: "[LongEmbed](https://arxiv.org/abs/2404.12096v2)"
+    metric: nDCG@10 (for NarrativeQA, QMSum, SummScreenFD, WikimQA) & nDCG@1 (for passkey and needle)
     tasks:
       Retrieval:
         - LEMBNarrativeQARetrieval