zdwls committed on
Commit
b12b1dc
·
1 Parent(s): a812c3b

init branch

Browse files
Files changed (2) hide show
  1. app.py +16 -2
  2. config.yaml +1 -0
app.py CHANGED
@@ -116,11 +116,20 @@ for model in pbar:
116
  ds = ds.map(add_task)
117
  base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
118
  # For now only one metric per task - Could add more metrics later on
 
 
 
 
 
 
 
 
119
  for task, metric in TASK_TO_METRIC.items():
120
- ds_dict = ds.filter(lambda x: (x["mteb_task"] == task) and (x["metric"] == metric))["test"].to_dict()
121
  ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
122
  EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
123
 
 
124
  # Save & cache EXTERNAL_MODEL_RESULTS
125
  with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
126
  json.dump(EXTERNAL_MODEL_RESULTS, f)
@@ -457,6 +466,7 @@ for board, board_config in BOARDS_CONFIG.items():
457
  "data": boards_data[board]["data_tasks"][task_category],
458
  "refresh": get_refresh_function(task_category, task_category_list),
459
  "credits": credits,
 
460
  })
461
 
462
  dataframes = []
@@ -612,11 +622,15 @@ with gr.Blocks(css=css) as block:
612
  # For updating the 'language' in the URL
613
  item_tab.select(update_url_language, [current_task_language, language_per_task], [current_task_language, language_per_task], trigger_mode="always_last").then(None, [current_task_language], [], js=set_window_url_params)
614
 
 
 
 
 
615
  with gr.Row():
616
  gr.Markdown(f"""
617
  {item['description']}
618
 
619
- - **Metric:** {metric}
620
  - **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
621
  {"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
622
  """)
 
116
  ds = ds.map(add_task)
117
  base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
118
  # For now only one metric per task - Could add more metrics later on
119
+
120
+ def filter_function(x, task, metric):
121
+ # This is a hack for the passkey and needle retrieval tests, which report ndcg_at_1 (i.e. accuracy) rather than the ndcg_at_10 that is commonly used for retrieval tasks.
122
+ if x['mteb_dataset_name'] in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval']:
123
+ return x["mteb_task"] == task and x['metric'] == 'ndcg_at_1'
124
+ else:
125
+ return x["mteb_task"] == task and x["metric"] == metric
126
+
127
  for task, metric in TASK_TO_METRIC.items():
128
+ ds_dict = ds.filter(lambda x: filter_function(x, task, metric))["test"].to_dict()
129
  ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
130
  EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
131
 
132
+ print("********************hello********************")
133
  # Save & cache EXTERNAL_MODEL_RESULTS
134
  with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
135
  json.dump(EXTERNAL_MODEL_RESULTS, f)
 
466
  "data": boards_data[board]["data_tasks"][task_category],
467
  "refresh": get_refresh_function(task_category, task_category_list),
468
  "credits": credits,
469
+ "metric": board_config.get("metric", None),
470
  })
471
 
472
  dataframes = []
 
622
  # For updating the 'language' in the URL
623
  item_tab.select(update_url_language, [current_task_language, language_per_task], [current_task_language, language_per_task], trigger_mode="always_last").then(None, [current_task_language], [], js=set_window_url_params)
624
 
625
+ specific_metric = metric
626
+ if item.get("metric", None) is not None:
627
+ specific_metric = item['metric']
628
+
629
  with gr.Row():
630
  gr.Markdown(f"""
631
  {item['description']}
632
 
633
+ - **Metric:** {specific_metric}
634
  - **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
635
  {"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
636
  """)
config.yaml CHANGED
@@ -301,6 +301,7 @@ boards:
301
  icon: "📚"
302
  special_icons: null
303
  credits: "[LongEmbed](https://arxiv.org/abs/2404.12096v2)"
 
304
  tasks:
305
  Retrieval:
306
  - LEMBNarrativeQARetrieval
 
301
  icon: "📚"
302
  special_icons: null
303
  credits: "[LongEmbed](https://arxiv.org/abs/2404.12096v2)"
304
+ metric: nDCG@10 (for NarrativeQA, QMSum, SummScreenFD, WikimQA) & nDCG@1 (for passkey and needle)
305
  tasks:
306
  Retrieval:
307
  - LEMBNarrativeQARetrieval