Commit 09b313f · 1 Parent(s): 9244862 · committed by tathagataraha

[ADD] Harness tasks, data display
.gitignore CHANGED
@@ -10,4 +10,6 @@ eval-queue/
 eval-results/
 eval-queue-bk/
 eval-results-bk/
+eval-queue-local/
+eval-results-local/
 logs/
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: MEDIC Benchmark
+title: Clinical NER Leaderboard
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
@@ -7,8 +7,17 @@ sdk: gradio
 app_file: app.py
 pinned: true
 license: apache-2.0
+tags:
+- leaderboard
+- submission:automatic
+- test:public
+- judge:auto
+- modality:text
 ---
 
+Also known as the NCER leaderboard on Hugging Face. See the paper for more information: https://huggingface.co/papers/2410.05046.
+
+
 # Start the configuration
 
 Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
@@ -41,4 +50,4 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
 You'll find
 - the main table's column names and properties in `src/display/utils.py`
 - the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py CHANGED
@@ -1,5 +1,6 @@
 
 
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from huggingface_hub import snapshot_download
@@ -9,30 +10,41 @@ from src.about import (
9
  CITATION_BUTTON_TEXT,
10
  EVALUATION_QUEUE_TEXT,
11
  INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
 
 
 
 
13
  TITLE,
 
14
  )
15
  from src.display.css_html_js import custom_css
16
  from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
 
 
19
  EVAL_COLS,
20
  EVAL_TYPES,
 
 
21
  AutoEvalColumn,
22
  ModelType,
23
- fields,
 
 
24
  WeightType,
25
- Precision
26
  )
27
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
 
31
 
32
  def restart_space():
33
  API.restart_space(repo_id=REPO_ID)
34
 
35
- ### Space initialisation
36
  try:
37
  print(EVAL_REQUESTS_PATH)
38
  snapshot_download(
@@ -48,8 +60,20 @@ try:
48
  except Exception:
49
  restart_space()
50
51
 
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
 
54
  (
55
  finished_eval_queue_df,
@@ -57,51 +81,288 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
57
  pending_eval_queue_df,
58
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
 
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
89
- )
90
91
92
  demo = gr.Blocks(css=custom_css)
93
  with demo:
94
  gr.HTML(TITLE)
 
95
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
 
97
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
 
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
 
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  with gr.Column():
106
  with gr.Row():
107
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -146,8 +407,16 @@ with demo:
146
 
147
  with gr.Row():
148
  with gr.Column():
 
149
  model_name_textbox = gr.Textbox(label="Model name")
 
150
  revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
 
 
 
 
 
 
151
  model_type = gr.Dropdown(
152
  choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
  label="Model type",
@@ -157,21 +426,29 @@ with demo:
157
  )
158
 
159
  with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
  )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
  multiselect=False,
171
- value="Original",
172
  interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
 
176
  submit_button = gr.Button("Submit Eval")
177
  submission_result = gr.Markdown()
@@ -179,15 +456,20 @@ with demo:
179
  add_new_eval,
180
  [
181
  model_name_textbox,
182
- base_model_name_textbox,
183
  revision_name_textbox,
184
- precision,
185
- weight_type,
 
 
 
 
186
  model_type,
187
  ],
188
  submission_result,
189
  )
190
 
 
191
  with gr.Row():
192
  with gr.Accordion("📙 Citation", open=False):
193
  citation_button = gr.Textbox(
@@ -201,4 +483,4 @@ with demo:
201
  scheduler = BackgroundScheduler()
202
  scheduler.add_job(restart_space, "interval", seconds=1800)
203
  scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
 
1
+ import subprocess
2
+
3
  import gradio as gr
 
4
  import pandas as pd
5
  from apscheduler.schedulers.background import BackgroundScheduler
6
  from huggingface_hub import snapshot_download
 
10
  CITATION_BUTTON_TEXT,
11
  EVALUATION_QUEUE_TEXT,
12
  INTRODUCTION_TEXT,
13
+ LLM_BENCHMARKS_TEXT_1,
14
+ EVALUATION_EXAMPLE_IMG,
15
+ LLM_BENCHMARKS_TEXT_2,
16
+ # ENTITY_DISTRIBUTION_IMG,
17
+ LLM_BENCHMARKS_TEXT_3,
18
  TITLE,
19
+ LOGO
20
  )
21
  from src.display.css_html_js import custom_css
22
  from src.display.utils import (
23
+ DATASET_BENCHMARK_COLS,
24
+ TYPES_BENCHMARK_COLS,
25
+ DATASET_COLS,
26
+ Clinical_TYPES_COLS,
27
  EVAL_COLS,
28
  EVAL_TYPES,
29
+ NUMERIC_INTERVALS,
30
+ TYPES,
31
  AutoEvalColumn,
32
  ModelType,
33
+ ModelArch,
34
+ PromptTemplateName,
35
+ Precision,
36
  WeightType,
37
+ fields,
38
  )
39
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
40
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
41
+ from src.submission.submit import add_new_eval, PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG
42
 
43
 
44
  def restart_space():
45
  API.restart_space(repo_id=REPO_ID)
46
 
47
+
48
  try:
49
  print(EVAL_REQUESTS_PATH)
50
  snapshot_download(
 
60
  except Exception:
61
  restart_space()
62
 
63
+ # Span based results
64
+ _, harness_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "accuracy", "datasets")
65
+ harness_datasets_leaderboard_df = harness_datasets_original_df.copy()
66
+
67
+ # _, span_based_types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "SpanBasedWithPartialOverlap", "clinical_types")
68
+ # span_based_types_leaderboard_df = span_based_types_original_df.copy()
69
+
70
+ # # Token based results
71
+ # _, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
72
+ # token_based_datasets_leaderboard_df = token_based_datasets_original_df.copy()
73
+
74
+ # _, token_based_types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "clinical_types")
75
+ # token_based_types_leaderboard_df = token_based_types_original_df.copy()
76
 
 
77
 
78
  (
79
  finished_eval_queue_df,
 
81
  pending_eval_queue_df,
82
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
83
84
 
85
+ def update_df(shown_columns, subset="datasets"):
86
+ leaderboard_table_df = harness_datasets_leaderboard_df.copy()
87
+ hidden_leader_board_df = harness_datasets_original_df
88
+ # else:
89
+ # match evaluation_metric:
90
+ # case "Span Based":
91
+ # leaderboard_table_df = span_based_types_leaderboard_df.copy()
92
+ # hidden_leader_board_df = span_based_types_original_df
93
+ # case "Token Based":
94
+ # leaderboard_table_df = token_based_types_leaderboard_df.copy()
95
+ # hidden_leader_board_df = token_based_types_original_df
96
+ # case _:
97
+ # pass
98
+
99
+
100
+ value_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns
101
+
102
+ return leaderboard_table_df[value_cols], hidden_leader_board_df
103
+
104
+
105
+ # Searching and filtering
106
+ def update_table(
107
+ hidden_df: pd.DataFrame,
108
+ columns: list,
109
+ query: str,
110
+ type_query: list = None,
111
+ architecture_query: list = None,
112
+ size_query: list = None,
113
+ precision_query: str = None,
114
+ show_deleted: bool = False,
115
+ ):
116
+ filtered_df = filter_models(hidden_df, type_query, architecture_query, size_query, precision_query, show_deleted)
117
+ filtered_df = filter_queries(query, filtered_df)
118
+ df = select_columns(filtered_df, columns, list(hidden_df.columns))
119
+ return df
120
+
121
+
122
+ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
123
+ return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
124
+
125
+
126
+ def select_columns(df: pd.DataFrame, columns: list, cols:list) -> pd.DataFrame:
127
+ always_here_cols = [
128
+ AutoEvalColumn.model_type_symbol.name,
129
+ AutoEvalColumn.model.name,
130
+ ]
131
+ # We use COLS to maintain sorting
132
+ filtered_df = df[always_here_cols + [c for c in cols if c in df.columns and c in columns]]
133
+ return filtered_df
134
+
135
+
136
+ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
137
+ final_df = []
138
+ if query != "":
139
+ queries = [q.strip() for q in query.split(";")]
140
+ for _q in queries:
141
+ _q = _q.strip()
142
+ if _q != "":
143
+ temp_filtered_df = search_table(filtered_df, _q)
144
+ if len(temp_filtered_df) > 0:
145
+ final_df.append(temp_filtered_df)
146
+ if len(final_df) > 0:
147
+ filtered_df = pd.concat(final_df)
148
+ filtered_df = filtered_df.drop_duplicates(
149
+ subset=[
150
+ AutoEvalColumn.model.name,
151
+ # AutoEvalColumn.precision.name,
152
+ # AutoEvalColumn.revision.name,
153
+ ]
154
+ )
155
+
156
+ return filtered_df
157
+
158
+
159
+ def filter_models(
160
+ df: pd.DataFrame, type_query: list, architecture_query: list, size_query: list, precision_query: list, show_deleted: bool
161
+ ) -> pd.DataFrame:
162
+ # Show all models
163
+ # if show_deleted:
164
+ # filtered_df = df
165
+ # else: # Show only still on the hub models
166
+ # filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
167
+
168
+ filtered_df = df
169
+
170
+ if type_query is not None:
171
+ type_emoji = [t[0] for t in type_query]
172
+ filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
173
+
174
+ if architecture_query is not None:
175
+ arch_types = [t for t in architecture_query]
176
+ filtered_df = filtered_df.loc[df[AutoEvalColumn.architecture.name].isin(arch_types)]
177
+ # filtered_df = filtered_df.loc[df[AutoEvalColumn.architecture.name].isin(architecture_query + ["None"])]
178
+
179
+ if precision_query is not None:
180
+ if AutoEvalColumn.precision.name in df.columns:
181
+ filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
182
+
183
+ if size_query is not None:
184
+ numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
185
+ params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
186
+ mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
187
+ filtered_df = filtered_df.loc[mask]
188
+
189
+ return filtered_df
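`NUMERIC_INTERVALS` is imported from `src/display/utils.py` and its definition is not part of this diff; conceptually it maps size-bucket labels to pandas `Interval`s. A minimal sketch with hypothetical labels and edges, showing how `filter_models` above consumes such a mapping:

```python
import pandas as pd

# Hypothetical size buckets (labels and edges are illustrative, not the real
# NUMERIC_INTERVALS from src/display/utils.py).
SIZE_BUCKETS = {
    "~7": pd.Interval(4, 9, closed="right"),
    "~13": pd.Interval(9, 20, closed="right"),
    "70+": pd.Interval(70, 10_000, closed="right"),
}

# Same filtering logic as filter_models(): keep rows whose #Params (B) falls
# inside any of the selected buckets.
selected = ["~7", "70+"]
intervals = pd.IntervalIndex(sorted(SIZE_BUCKETS[s] for s in selected))
params = pd.Series([8.0, 72.0, 1.3])
mask = params.apply(lambda x: any(intervals.contains(x)))
print(mask.tolist())  # [True, True, False]
```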
190
+
191
+ def change_submit_request_form(model_architecture):
192
+ match model_architecture:
193
+ case "Encoder":
194
+ return (
195
+ gr.Textbox(label="Threshold for gliner models", visible=False),
196
+ gr.Radio(
197
+ choices=["True", "False"],
198
+ label="Load GLiNER Tokenizer",
199
+ visible=False
200
+ ),
201
+ gr.Dropdown(
202
+ choices=[prompt_template.value for prompt_template in PromptTemplateName],
203
+ label="Prompt for generation",
204
+ multiselect=False,
205
+ # value="HTML Highlighted Spans",
206
+ interactive=True,
207
+ visible=False
208
+ )
209
+ )
210
+ case "Decoder":
211
+ return (
212
+ gr.Textbox(label="Threshold for gliner models", visible=False),
213
+ gr.Radio(
214
+ choices=["True", "False"],
215
+ label="Load GLiNER Tokenizer",
216
+ visible=False
217
+ ),
218
+ gr.Dropdown(
219
+ choices=[prompt_template.value for prompt_template in PromptTemplateName],
220
+ label="Prompt for generation",
221
+ multiselect=False,
222
+ # value="HTML Highlighted Spans",
223
+ interactive=True,
224
+ visible=True
225
+ )
226
+ )
227
+ case "GLiNER Encoder":
228
+ return (
229
+ gr.Textbox(label="Threshold for gliner models", visible=True),
230
+ gr.Radio(
231
+ choices=["True", "False"],
232
+ label="Load GLiNER Tokenizer",
233
+ visible=True
234
+ ),
235
+ gr.Dropdown(
236
+ choices=[prompt_template.value for prompt_template in PromptTemplateName],
237
+ label="Prompt for generation",
238
+ multiselect=False,
239
+ # value="HTML Highlighted Spans",
240
+ interactive=True,
241
+ visible=False
242
+ )
243
+ )
244
 
245
+
246
  demo = gr.Blocks(css=custom_css)
247
  with demo:
248
  gr.HTML(TITLE)
249
+ gr.HTML(LOGO, elem_classes="logo")
250
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
251
 
252
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
253
+ with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-table", id=0):
254
+ with gr.Row():
255
+ with gr.Column():
256
+ with gr.Row():
257
+ search_bar = gr.Textbox(
258
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
259
+ show_label=False,
260
+ elem_id="search-bar",
261
+ )
262
+ with gr.Row():
263
+ shown_columns = gr.CheckboxGroup(
264
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.clinical_type_col],
265
+ value=[
266
+ c.name
267
+ for c in fields(AutoEvalColumn)
268
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.clinical_type_col
269
+ ],
270
+ label="Select columns to show",
271
+ elem_id="column-select",
272
+ interactive=True,
273
+ )
274
+ # with gr.Row():
275
+ # deleted_models_visibility = gr.Checkbox(
276
+ # value=False, label="Show gated/private/deleted models", interactive=True
277
+ # )
278
+ with gr.Column(min_width=320):
279
+ # with gr.Box(elem_id="box-filter"):
280
+ filter_columns_type = gr.CheckboxGroup(
281
+ label="Model Types",
282
+ choices=[t.to_str() for t in ModelType],
283
+ value=[t.to_str() for t in ModelType],
284
+ interactive=True,
285
+ elem_id="filter-columns-type",
286
+ )
287
+ # filter_columns_architecture = gr.CheckboxGroup(
288
+ # label="Architecture Types",
289
+ # choices=[i.value.name for i in ModelArch],
290
+ # value=[i.value.name for i in ModelArch],
291
+ # interactive=True,
292
+ # elem_id="filter-columns-architecture",
293
+ # )
294
+ filter_columns_size = gr.CheckboxGroup(
295
+ label="Model sizes (in billions of parameters)",
296
+ choices=list(NUMERIC_INTERVALS.keys()),
297
+ value=list(NUMERIC_INTERVALS.keys()),
298
+ interactive=True,
299
+ elem_id="filter-columns-size",
300
+ )
301
 
302
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")
 
303
 
304
+ leaderboard_table = gr.components.Dataframe(
305
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
306
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
307
+ datatype=TYPES,
308
+ elem_id="leaderboard-table",
309
+ interactive=False,
310
+ visible=True,
311
+ )
312
+
313
+ # Dummy leaderboard for handling the case when the user uses backspace key
314
+ # hidden_leaderboard_table_for_search = gr.components.Dataframe(
315
+ # value=datasets_original_df[DATASET_COLS],
316
+ # headers=DATASET_COLS,
317
+ # datatype=TYPES,
318
+ # visible=False,
319
+ # )
320
+
321
+
322
+ # search_bar.submit(
323
+ # update_table,
324
+ # [
325
+ # hidden_leaderboard_table_for_search,
326
+ # shown_columns,
327
+ # search_bar,
328
+ # filter_columns_type,
329
+ # # filter_columns_architecture
330
+ # ],
331
+ # leaderboard_table,
332
+ # )
333
+ # for selector in [
334
+ # shown_columns,
335
+ # filter_columns_type,
336
+ # # filter_columns_architecture,
337
+ # # filter_columns_size,
338
+ # # deleted_models_visibility,
339
+ # ]:
340
+ # selector.change(
341
+ # update_table,
342
+ # [
343
+ # hidden_leaderboard_table_for_search,
344
+ # shown_columns,
345
+ # search_bar,
346
+ # filter_columns_type,
347
+ # # filter_columns_architecture,
348
+ # ],
349
+ # leaderboard_table,
350
+ # queue=True,
351
+ # )
352
+
353
+ with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
354
+ pass
355
+ with gr.TabItem("🏅 Med Safety", elem_id="llm-benchmark-tab-table", id=2):
356
+ pass
357
+
358
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
359
+ gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
360
+ gr.HTML(EVALUATION_EXAMPLE_IMG, elem_classes="logo")
361
+ gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
362
+ # gr.HTML(ENTITY_DISTRIBUTION_IMG, elem_classes="logo")
363
+ gr.Markdown(LLM_BENCHMARKS_TEXT_3, elem_classes="markdown-text")
364
+
365
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=4):
366
  with gr.Column():
367
  with gr.Row():
368
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
407
 
408
  with gr.Row():
409
  with gr.Column():
410
+
411
  model_name_textbox = gr.Textbox(label="Model name")
412
+
413
  revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
414
+
415
+ model_arch = gr.Radio(
416
+ choices=[t.to_str(" : ") for t in ModelArch if t != ModelArch.Unknown],
417
+ label="Model Architecture",
418
+ )
419
+
420
  model_type = gr.Dropdown(
421
  choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
422
  label="Model type",
 
426
  )
427
 
428
  with gr.Column():
429
+ label_normalization_map = gr.Textbox(lines=6, label="Label Normalization Map", placeholder=PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG)
430
+ gliner_threshold = gr.Textbox(label="Threshold for GLiNER models", visible=False)
431
+ gliner_tokenizer_bool = gr.Radio(
432
+ choices=["True", "False"],
433
+ label="Load GLiNER Tokenizer",
434
+ visible=False
435
  )
436
+ prompt_name = gr.Dropdown(
437
+ choices=[prompt_template.value for prompt_template in PromptTemplateName],
438
+ label="Prompt for generation",
439
  multiselect=False,
440
+ value="HTML Highlighted Spans",
441
  interactive=True,
442
+ visible=False
443
+ )# should be a dropdown
444
+
445
+ # parsing_function - this is tied to the prompt & therefore does not need to be specified
446
+ # generation_parameters = gr.Textbox(label="Generation params in json format") just default for now
447
+
448
+ model_arch.change(fn=change_submit_request_form, inputs=model_arch, outputs=[
449
+ gliner_threshold,
450
+ gliner_tokenizer_bool,
451
+ prompt_name])
452
 
453
  submit_button = gr.Button("Submit Eval")
454
  submission_result = gr.Markdown()
 
456
  add_new_eval,
457
  [
458
  model_name_textbox,
459
+ # base_model_name_textbox,
460
  revision_name_textbox,
461
+ model_arch,
462
+ label_normalization_map,
463
+ gliner_threshold,
464
+ gliner_tokenizer_bool,
465
+ prompt_name,
466
+ # weight_type,
467
  model_type,
468
  ],
469
  submission_result,
470
  )
471
 
472
+
473
  with gr.Row():
474
  with gr.Accordion("📙 Citation", open=False):
475
  citation_button = gr.Textbox(
 
483
  scheduler = BackgroundScheduler()
484
  scheduler.add_job(restart_space, "interval", seconds=1800)
485
  scheduler.start()
486
+ demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'])
assets/entity_distribution.png ADDED
assets/image.png ADDED
assets/ner_evaluation_example.png ADDED
eval_metrics_app.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ # Function to compute evaluation metrics (dummy implementation)
4
+ def compute_metrics(gt_spans, pred_spans):
5
+ # Dummy implementation of a metric computation
6
+ # Replace this with actual metric computation logic
7
+ tp = len(set(gt_spans) & set(pred_spans))
8
+ fp = len(set(pred_spans) - set(gt_spans))
9
+ fn = len(set(gt_spans) - set(pred_spans))
10
+ precision = tp / (tp + fp) if (tp + fp) > 0 else 0
11
+ recall = tp / (tp + fn) if (tp + fn) > 0 else 0
12
+ f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
13
+
14
+ return {"precision": precision, "recall": recall, "f1_score": f1_score}
15
+
16
+ def create_app():
17
+ with gr.Blocks() as demo:
18
+ # Input components
19
+ text_input = gr.Textbox(label="Input Text")
20
+ highlight_input = gr.Textbox(label="Highlight Text and Press Add")
21
+
22
+ gt_spans_state = gr.State([])
23
+ pred_spans_state = gr.State([])
24
+
25
+ # Buttons for ground truth and prediction
26
+ add_gt_button = gr.Button("Add to Ground Truth")
27
+ add_pred_button = gr.Button("Add to Predictions")
28
+
29
+ # Outputs for highlighted spans
30
+ gt_output = gr.HighlightedText(label="Ground Truth Spans")
31
+ pred_output = gr.HighlightedText(label="Predicted Spans")
32
+
33
+ # Compute metrics button and its output
34
+ compute_button = gr.Button("Compute Metrics")
35
+ metrics_output = gr.JSON(label="Metrics")
36
+
37
+ # Function to update spans
38
+ def update_spans(text, span, gt_spans, pred_spans, is_gt):
39
+ start_idx = text.find(span)
40
+ end_idx = start_idx + len(span)
41
+ new_span = (start_idx, end_idx)
42
+ if is_gt:
43
+ gt_spans.append(new_span)
44
+ gt_spans = list(set(gt_spans))
45
+ else:
46
+ pred_spans.append(new_span)
47
+ pred_spans = list(set(pred_spans))
48
+ return gt_spans, pred_spans, highlight_spans(text, gt_spans), highlight_spans(text, pred_spans)
49
+
50
+ # Function to highlight spans
51
+ def highlight_spans(text, spans):
52
+ span_dict = {}
53
+ for span in spans:
54
+ span_dict[(span[0], span[1])] = "highlight"
55
+ return span_dict
56
+
57
+ # Event handlers for buttons
58
+ add_gt_button.click(fn=update_spans, inputs=[text_input, highlight_input, gt_spans_state, pred_spans_state, gr.State(True)], outputs=[gt_spans_state, pred_spans_state, gt_output, pred_output])
59
+ add_pred_button.click(fn=update_spans, inputs=[text_input, highlight_input, gt_spans_state, pred_spans_state, gr.State(False)], outputs=[gt_spans_state, pred_spans_state, gt_output, pred_output])
60
+
61
+ # Function to compute metrics
62
+ def on_compute_metrics(gt_spans, pred_spans):
63
+ metrics = compute_metrics(gt_spans, pred_spans)
64
+ return metrics
65
+
66
+ compute_button.click(fn=on_compute_metrics, inputs=[gt_spans_state, pred_spans_state], outputs=metrics_output)
67
+
68
+ # Layout arrangement
69
+ text_input.change(fn=lambda x: x, inputs=text_input, outputs=[gt_output, pred_output])
70
+
71
+ return demo
72
+
73
+ # Run the app
74
+ demo = create_app()
75
+ demo.launch()
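As a sanity check of the arithmetic in the dummy `compute_metrics` above (spans are whatever hashable `(start, end)` tuples the app stores; the values here are made up for illustration), a standalone example reproducing the same set-based counts:

```python
# Same set arithmetic as compute_metrics() above; spans are (start, end) offsets.
gt_spans = [(27, 48), (66, 77)]      # two ground-truth spans
pred_spans = [(27, 48), (80, 85)]    # one exact match, one spurious prediction

tp = len(set(gt_spans) & set(pred_spans))  # 1
fp = len(set(pred_spans) - set(gt_spans))  # 1
fn = len(set(gt_spans) - set(pred_spans))  # 1
precision = tp / (tp + fp)                 # 0.5
recall = tp / (tp + fn)                    # 0.5
f1 = 2 * precision * recall / (precision + recall)  # 0.5
print({"precision": precision, "recall": recall, "f1_score": f1})
```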
medic-harness-requests/.gitattributes ADDED
@@ -0,0 +1,58 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
+ *.model filter=lfs diff=lfs merge=lfs -text
14
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
15
+ *.npy filter=lfs diff=lfs merge=lfs -text
16
+ *.npz filter=lfs diff=lfs merge=lfs -text
17
+ *.onnx filter=lfs diff=lfs merge=lfs -text
18
+ *.ot filter=lfs diff=lfs merge=lfs -text
19
+ *.parquet filter=lfs diff=lfs merge=lfs -text
20
+ *.pb filter=lfs diff=lfs merge=lfs -text
21
+ *.pickle filter=lfs diff=lfs merge=lfs -text
22
+ *.pkl filter=lfs diff=lfs merge=lfs -text
23
+ *.pt filter=lfs diff=lfs merge=lfs -text
24
+ *.pth filter=lfs diff=lfs merge=lfs -text
25
+ *.rar filter=lfs diff=lfs merge=lfs -text
26
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
27
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar filter=lfs diff=lfs merge=lfs -text
30
+ *.tflite filter=lfs diff=lfs merge=lfs -text
31
+ *.tgz filter=lfs diff=lfs merge=lfs -text
32
+ *.wasm filter=lfs diff=lfs merge=lfs -text
33
+ *.xz filter=lfs diff=lfs merge=lfs -text
34
+ *.zip filter=lfs diff=lfs merge=lfs -text
35
+ *.zst filter=lfs diff=lfs merge=lfs -text
36
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ # Audio files - uncompressed
38
+ *.pcm filter=lfs diff=lfs merge=lfs -text
39
+ *.sam filter=lfs diff=lfs merge=lfs -text
40
+ *.raw filter=lfs diff=lfs merge=lfs -text
41
+ # Audio files - compressed
42
+ *.aac filter=lfs diff=lfs merge=lfs -text
43
+ *.flac filter=lfs diff=lfs merge=lfs -text
44
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
45
+ *.ogg filter=lfs diff=lfs merge=lfs -text
46
+ *.wav filter=lfs diff=lfs merge=lfs -text
47
+ # Image files - uncompressed
48
+ *.bmp filter=lfs diff=lfs merge=lfs -text
49
+ *.gif filter=lfs diff=lfs merge=lfs -text
50
+ *.png filter=lfs diff=lfs merge=lfs -text
51
+ *.tiff filter=lfs diff=lfs merge=lfs -text
52
+ # Image files - compressed
53
+ *.jpg filter=lfs diff=lfs merge=lfs -text
54
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
55
+ *.webp filter=lfs diff=lfs merge=lfs -text
56
+ # Video files - compressed
57
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
58
+ *.webm filter=lfs diff=lfs merge=lfs -text
medic-harness-results/.gitattributes ADDED
@@ -0,0 +1,58 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
+ *.model filter=lfs diff=lfs merge=lfs -text
14
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
15
+ *.npy filter=lfs diff=lfs merge=lfs -text
16
+ *.npz filter=lfs diff=lfs merge=lfs -text
17
+ *.onnx filter=lfs diff=lfs merge=lfs -text
18
+ *.ot filter=lfs diff=lfs merge=lfs -text
19
+ *.parquet filter=lfs diff=lfs merge=lfs -text
20
+ *.pb filter=lfs diff=lfs merge=lfs -text
21
+ *.pickle filter=lfs diff=lfs merge=lfs -text
22
+ *.pkl filter=lfs diff=lfs merge=lfs -text
23
+ *.pt filter=lfs diff=lfs merge=lfs -text
24
+ *.pth filter=lfs diff=lfs merge=lfs -text
25
+ *.rar filter=lfs diff=lfs merge=lfs -text
26
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
27
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar filter=lfs diff=lfs merge=lfs -text
30
+ *.tflite filter=lfs diff=lfs merge=lfs -text
31
+ *.tgz filter=lfs diff=lfs merge=lfs -text
32
+ *.wasm filter=lfs diff=lfs merge=lfs -text
33
+ *.xz filter=lfs diff=lfs merge=lfs -text
34
+ *.zip filter=lfs diff=lfs merge=lfs -text
35
+ *.zst filter=lfs diff=lfs merge=lfs -text
36
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ # Audio files - uncompressed
38
+ *.pcm filter=lfs diff=lfs merge=lfs -text
39
+ *.sam filter=lfs diff=lfs merge=lfs -text
40
+ *.raw filter=lfs diff=lfs merge=lfs -text
41
+ # Audio files - compressed
42
+ *.aac filter=lfs diff=lfs merge=lfs -text
43
+ *.flac filter=lfs diff=lfs merge=lfs -text
44
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
45
+ *.ogg filter=lfs diff=lfs merge=lfs -text
46
+ *.wav filter=lfs diff=lfs merge=lfs -text
47
+ # Image files - uncompressed
48
+ *.bmp filter=lfs diff=lfs merge=lfs -text
49
+ *.gif filter=lfs diff=lfs merge=lfs -text
50
+ *.png filter=lfs diff=lfs merge=lfs -text
51
+ *.tiff filter=lfs diff=lfs merge=lfs -text
52
+ # Image files - compressed
53
+ *.jpg filter=lfs diff=lfs merge=lfs -text
54
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
55
+ *.webp filter=lfs diff=lfs merge=lfs -text
56
+ # Video files - compressed
57
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
58
+ *.webm filter=lfs diff=lfs merge=lfs -text
medic-harness-results/aaditya/Llama3-OpenBioLLM-70B/results_2024-07-24T15:26:36Z.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "config": {
3
+ "model_name": "aaditya/Llama3-OpenBioLLM-70B",
4
+ "revision": "main",
5
+ "submitted_time": "2024-07-24 14:33:56+00:00",
6
+ "model_type": "domain-specific",
7
+ "num_params": 70000000000,
8
+ "private": false,
9
+ "evaluated_time": "2024-07-24T15:26:36Z"
10
+ },
11
+ "results": {
12
+ "MMLU": {
13
+ "accuracy": 90.4
14
+ },
15
+ "MMLU-Pro": {
16
+ "accuracy": 64.2
17
+ },
18
+ "MedMCQA": {
19
+ "accuracy": 73.2
20
+ },
21
+ "MedQA": {
22
+ "accuracy": 76.9
23
+ },
24
+ "USMLE": {
25
+ "accuracy": 79.0
26
+ },
27
+ "PubMedQA": {
28
+ "accuracy": 73.2
29
+ },
30
+ "ToxiGen": {
31
+ "accuracy": 91.3
32
+ },
33
+ "Average": {
34
+ "accuracy": 78.3
35
+ }
36
+ }
37
+ }
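Each results file above follows a flat config/results schema with one accuracy value per benchmark. A minimal sketch (a hypothetical helper, independent of the actual `get_leaderboard_df` logic in `src/populate.py`) of flattening such a file into a single leaderboard row:

```python
import json
from pathlib import Path

def results_file_to_row(path: str) -> dict:
    """Flatten one results_*.json file into a flat dict (one leaderboard row)."""
    data = json.loads(Path(path).read_text())
    row = {
        "model": data["config"]["model_name"],
        "model_type": data["config"]["model_type"],
        "params_b": data["config"]["num_params"] / 1e9,
    }
    # Adds e.g. {"MMLU": 90.4, "MMLU-Pro": 64.2, ..., "Average": 78.3}
    row.update({task: scores["accuracy"] for task, scores in data["results"].items()})
    return row

# row = results_file_to_row(
#     "medic-harness-results/aaditya/Llama3-OpenBioLLM-70B/results_2024-07-24T15:26:36Z.json"
# )
```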
medic-harness-results/meta-llama/Llama-3.1-8B-Instruct/results_2024-07-24T15:26:36Z.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "config": {
3
+ "model_name": "meta-llama/Llama-3.1-8B-Instruct",
4
+ "revision": "main",
5
+ "submitted_time": "2024-07-24 14:33:56+00:00",
6
+ "model_type": "instruct-tuned",
7
+ "num_params": 8000000000,
8
+ "private": false,
9
+ "evaluated_time": "2024-07-24T15:26:36Z"
10
+ },
11
+ "results": {
12
+ "MMLU": {
13
+ "accuracy": 73.4
14
+ },
15
+ "MMLU-Pro": {
16
+ "accuracy": 49.9
17
+ },
18
+ "MedMCQA": {
19
+ "accuracy": 58.4
20
+ },
21
+ "MedQA": {
22
+ "accuracy": 62.0
23
+ },
24
+ "USMLE": {
25
+ "accuracy": 68.2
26
+ },
27
+ "PubMedQA": {
28
+ "accuracy": 76.2
29
+ },
30
+ "ToxiGen": {
31
+ "accuracy": 82.3
32
+ },
33
+ "Average": {
34
+ "accuracy": 67.2
35
+ }
36
+ }
37
+ }
38
+
39
+
requirements.txt CHANGED
@@ -1,16 +1,18 @@
 APScheduler
 black
+click
 datasets
 gradio
-gradio[oauth]
-gradio_leaderboard==0.0.9
 gradio_client
 huggingface-hub>=0.18.0
 matplotlib
 numpy
 pandas
 python-dateutil
+requests
 tqdm
 transformers
 tokenizers>=0.15.0
+git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
+accelerate
 sentencepiece
src/about.py CHANGED
@@ -1,72 +1,267 @@
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
 
4
  @dataclass
5
- class Task:
6
  benchmark: str
7
  metric: str
8
  col_name: str
9
-
10
 
11
  # Select your tasks here
12
  # ---------------------------------------------------
13
- class Tasks(Enum):
14
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("anli_r1", "acc", "ANLI")
16
- task1 = Task("logiqa", "acc_norm", "LogiQA")
17
 
18
- NUM_FEWSHOT = 0 # Change with your few shot
19
- # ---------------------------------------------------
20
 
 
 
21
 
22
 
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
 
 
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- Intro text
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
32
- LLM_BENCHMARKS_TEXT = f"""
33
- ## How it works
 
34
 
35
- ## Reproducibility
36
- To reproduce our results, here is the commands you can run:
37
 
 
 
 
 
38
  """
 
 
 
 
39
 
40
- EVALUATION_QUEUE_TEXT = """
41
- ## Some good practices before submitting a model
42
-
43
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
44
- ```python
45
- from transformers import AutoConfig, AutoModel, AutoTokenizer
46
- config = AutoConfig.from_pretrained("your model name", revision=revision)
47
- model = AutoModel.from_pretrained("your model name", revision=revision)
48
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
49
  ```
50
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
51
 
52
- Note: make sure your model is public!
53
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
 
 
54
 
55
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
56
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
 
57
 
58
- ### 3) Make sure your model has an open license!
59
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
60
 
61
- ### 4) Fill up your model card
62
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
63
 
64
- ## In case of model failure
65
- If your model is displayed in the `FAILED` category, its execution stopped.
66
- Make sure you have followed the above steps first.
67
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
68
  """
69
 
70
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
CITATION_BUTTON_TEXT = r"""
72
  """
 
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
4
+
5
  @dataclass
6
+ class HarnessTask:
7
  benchmark: str
8
  metric: str
9
  col_name: str
10
+
11
 
12
  # Select your tasks here
13
  # ---------------------------------------------------
14
+ class HarnessTasks(Enum):
15
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
16
+ # task0 = Task("anli_r1", "acc", "ANLI")
17
+ # task1 = Task("logiqa", "acc_norm", "LogiQA")
18
+ task0 = HarnessTask("MMLU", "accuracy", "MMLU")
19
+ task1 = HarnessTask("MMLU-Pro", "accuracy", "MMLU-Pro")
20
+ task2 = HarnessTask("MedMCQA", "accuracy", "MedMCQA")
21
+ task3 = HarnessTask("MedQA", "accuracy", "MedQA")
22
+ task4 = HarnessTask("USMLE", "accuracy", "USMLE")
23
+ task5 = HarnessTask("PubMedQA", "accuracy", "PubMedQA")
24
+ task6 = HarnessTask("ToxiGen", "accuracy", "ToxiGen")
25
+ task7 = HarnessTask("Average", "accuracy", "Harness-Average")
26
+ # task5 = Task("", "f1", "")
27
+ # task6 = Task("", "f1", "")
28
+
29
+ @dataclass
30
+ class ClinicalType:
31
+ benchmark: str
32
+ metric: str
33
+ col_name: str
34
+
35
+ class ClinicalTypes(Enum):
36
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
37
+ type0 = ClinicalType("condition", "f1", "CONDITION")
38
+ type1 = ClinicalType("measurement", "f1", "MEASUREMENT")
39
+ type2 = ClinicalType("drug", "f1", "DRUG")
40
+ type3 = ClinicalType("procedure", "f1", "PROCEDURE")
41
+ type4 = ClinicalType("gene", "f1", "GENE")
42
+ type5 = ClinicalType("gene variant", "f1", "GENE VARIANT")
43
 
 
 
44
 
45
+ NUM_FEWSHOT = 0 # Change with your few shot
46
+ # ---------------------------------------------------
47
 
48
 
49
  # Your leaderboard name
50
+ TITLE = """""" #<h1 align="center" id="space-title"> NER Leaderboard</h1>"""
51
+ LOGO = """<img src="https://equalengineers.com/wp-content/uploads/2024/04/dummy-logo-5b.png" alt="Clinical X HF" width="500" height="333">"""
52
+ # LOGO = """<img src="https://huggingface.co/spaces/m42-health/clinical_ner_leaderboard/resolve/main/assets/image.png" alt="Clinical X HF" width="500" height="333">"""
53
 
54
  # What does your leaderboard evaluate?
55
  INTRODUCTION_TEXT = """
56
+ The rapid development of Large Language Models (LLMs) for healthcare applications has spurred calls for holistic evaluation beyond frequently-cited benchmarks like USMLE, to better reflect real-world performance. While real-world assessments are valuable indicators of utility, they often lag behind the pace of LLM evolution, likely rendering findings obsolete upon deployment. This temporal disconnect necessitates a comprehensive upfront evaluation that can guide model selection for specific clinical applications. We introduce MEDIC, a framework assessing LLMs across five critical dimensions of clinical competence: medical reasoning, ethics and bias, data and language understanding, in-context learning, and clinical safety. MEDIC features a novel cross-examination framework quantifying LLM performance across areas like coverage and hallucination detection, without requiring reference outputs. We apply MEDIC to evaluate LLMs on medical question-answering, safety, summarization, note generation, and other tasks. Our results show performance disparities across model sizes, baseline vs medically finetuned models, and have implications on model selection for applications requiring specific model strengths, such as low hallucination or lower cost of inference. MEDIC's multifaceted evaluation reveals these performance trade-offs, bridging the gap between theoretical capabilities and practical implementation in healthcare settings, ensuring that the most promising models are identified and adapted for diverse healthcare applications.
57
  """
58
 
59
  # Which evaluations are you running? how can people reproduce what you have?
60
+ LLM_BENCHMARKS_TEXT_1 = f"""
61
+
62
+ ## About
63
 
64
+ The Named Clinical Entity Recognition Leaderboard is aimed at advancing the field of natural language processing in healthcare. It provides a standardized platform for evaluating and comparing the performance of various language models in recognizing named clinical entities, a critical task for applications such as clinical documentation, decision support, and information extraction. By fostering transparency and facilitating benchmarking, the leaderboard's goal is to drive innovation and improvement in NLP models. It also helps researchers identify the strengths and weaknesses of different approaches, ultimately contributing to the development of more accurate and reliable tools for clinical use. Despite its exploratory nature, the leaderboard aims to play a role in guiding research and ensuring that advancements are grounded in rigorous and comprehensive evaluations.
 
65
 
66
+ ## Evaluation method and metrics
67
+ When training a Named Entity Recognition (NER) system, the most common evaluation methods measure precision, recall, and F1-score at the token level. While these metrics are useful for fine-tuning the NER system, evaluating the predicted named entities for downstream tasks requires metrics at the full named-entity level. We include both evaluation methods, token-based and span-based, and provide an example below to illustrate the difference between them.
68
+ Example Sentence: "The patient was diagnosed with a skin cancer disease."
69
+ For simplicity, let's assume an example sentence that contains 10 tokens, with a single two-token disease entity (as shown in the figure below).
70
  """
71
+ EVALUATION_EXAMPLE_IMG = """<img src="https://huggingface.co/spaces/m42-health/clinical_ner_leaderboard/resolve/main/assets/ner_evaluation_example.png" alt="Clinical X HF" width="750" height="500">"""
72
+ LLM_BENCHMARKS_TEXT_2 = """
73
+ Token-based evaluation involves obtaining the set of token labels (ground-truth annotations) for the annotated entities and the set of token predictions, comparing these sets, and computing a classification report. Hence, the results for the example above are shown below.
74
+ **Token-based metrics:**
75
 
76
+
77
+
78
+ | Model | TP | FP | FN | Precision | Recall | F1-Score |
79
+ | ------- | --- | --- | --- | --------- | ------ | -------- |
80
+ | Model D | 0 | 1 | 0 | 0.00 | 0.00 | 0.00 |
81
+ | Model C | 1 | 1 | 1 | 0.50 | 0.50 | 0.50 |
82
+ | Model B | 2 | 2 | 0 | 0.50 | 1.00 | 0.67 |
83
+ | Model A | 2 | 1 | 0 | 0.67 | 1.00 | 0.80 |
84
+
85
+
86
+ Where,
87
+ $$ Precision = TP / (TP + FP)$$
88
+ $$ Recall = TP / (TP + FN)$$
89
+ $$ f1score = 2 * (Prec * Rec) / (Prec + Rec)$$
90
+
91
+
92
+
93
+ With this token-based approach, we get a broad idea of the model's performance at the token level. However, it may misrepresent performance at the entity level when an entity includes more than one token (which may be more relevant for certain applications). In addition, depending on the annotations of certain datasets, we may not want to penalize a model for a "partial" match with a certain entity.
94
+ The span-based method attempts to address some of these issues by determining full or partial matches at the entity level, classifying predictions as correct, incorrect, missed, or spurious. These counts are then used to calculate precision, recall, and F1-score, giving the results below for the same example.
95
+
96
+ **Span-based metrics:**
97
+
98
+
99
+ | Model | Correct | Incorrect | Missed | Spurious | Precision | Recall | F1-Score |
100
+ | ------- | ------- | --------- | ------ | -------- | --------- | ------ | -------- |
101
+ | Model A | 1 | 0 | 0 | 0 | 1.00 | 1.00 | 1.00 |
102
+ | Model B | 1 | 0 | 0 | 0 | 1.00 | 1.00 | 1.00 |
103
+ | Model C | 1 | 0 | 0 | 0 | 1.00 | 1.00 | 1.00 |
104
+ | Model D | 0 | 0 | 1 | 1 | 0.00 | 0.00 | 0.00 |
105
+
106
+
107
+ Where,
108
+ $$ Precision = COR / (COR + INC + SPU)$$
109
+ $$ Recall = COR / (COR + INC + MIS)$$
110
+ $$ f1score = 2 * (Prec * Rec) / (Prec + Rec)$$
111
+
112
+ Note:
113
+ 1. Span-based approach here is equivalent to the 'Span Based Evaluation with Partial Overlap' in [NER Metrics Showdown!](https://huggingface.co/spaces/wadood/ner_evaluation_metrics) and is equivalent to Partial Match ("Type") in the nervaluate python package.
114
+ 2. Token-based approach here is equivalent to the 'Token Based Evaluation With Macro Average' in [NER Metrics Showdown!](https://huggingface.co/spaces/wadood/ner_evaluation_metrics)
115
+
116
+ Additional examples can be tested on the [NER Metrics Showdown!](https://huggingface.co/spaces/wadood/ner_evaluation_metrics) huggingface space.
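As a quick worked check of the two formula sets above, plugging in the table counts (Model A for the token-based metrics, Model D for the span-based ones) reproduces the reported values:

```python
# Token-based, Model A: TP=2, FP=1, FN=0
tp, fp, fn = 2, 1, 0
precision = tp / (tp + fp)                          # 0.67
recall = tp / (tp + fn)                             # 1.00
f1 = 2 * precision * recall / (precision + recall)  # 0.80

# Span-based, Model D: COR=0, INC=0, MIS=1, SPU=1
cor, inc, mis, spu = 0, 0, 1, 1
span_precision = cor / (cor + inc + spu)            # 0.00
span_recall = cor / (cor + inc + mis)               # 0.00
# F1 is taken as 0.00 when precision + recall == 0.
```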
117
+
118
+ ## Datasets
119
+ The following datasets (test splits only) have been included in the evaluation.
120
+
121
+ ### [NCBI Disease](https://huggingface.co/datasets/m42-health/clinical_ncbi)
122
+ The NCBI Disease corpus includes mention and concept level annotations on PubMed abstracts. It covers annotations of diseases.
123
+
124
+ | | Counts |
125
+ | ---------- | ------ |
126
+ | Samples | 100 |
127
+ | Annotation | 960 |
128
+
129
+
130
+ ### [CHIA](https://huggingface.co/datasets/m42-health/clinical_chia)
131
+ This is a large, annotated corpus of patient eligibility criteria extracted from registered clinical trials (ClinicalTrials.gov). Annotations cover 15 different entity types, including conditions, drugs, procedures, and measurements.
132
+
133
+
134
+ | | Counts |
135
+ | ---------- | ------ |
136
+ | Samples | 194 |
137
+ | Annotation | 3981 |
138
+
139
+
140
+ ### [BC5CDR](https://huggingface.co/datasets/m42-health/clinical_bc5cdr)
141
+ The BC5CDR corpus consists of 1500 PubMed articles with annotated chemicals and diseases.
142
+
143
+
144
+ | | Counts |
145
+ | ---------- | ------ |
146
+ | Samples | 500 |
147
+ | Annotation | 9928 |
148
+
149
+
150
+ ### [BIORED](https://huggingface.co/datasets/m42-health/clinical_biored)
151
+ The BIORED corpus includes a set of PubMed abstracts with annotations of multiple entity types (e.g., gene/protein, disease, chemical).
152
+
153
+
154
+ | | Counts |
155
+ | ---------- | ------ |
156
+ | Samples | 100 |
157
+ | Annotation | 3535 |
158
+
159
+
160
+ ### Datasets summary
161
+
162
+ A summary of the datasets used is provided below.
163
+
164
+
165
+ | Dataset | # samples | # annotations | # original entities | # clinical entities |
166
+ | ------- | --------- | ------------- | ------------------- | ------------------- |
167
+ | NCBI | 100 | 960 | 4 | 1 |
168
+ | CHIA | 194 | 3981 | 16 | 4 |
169
+ | BIORED | 500 | 9928 | 2 | 4 |
170
+ | BC5CDR | 100 | 3535 | 6 | 2 |
171
+
172
+
173
+ ## Clinical Entity Types
174
+
175
+ The above datasets are modified to cater to the clinical setting. For this, only the clinically relevant entity types are retained and the rest are dropped. Further, entity type names are standardized across datasets, yielding the total of 6 clinical entity types shown below.
176
+
177
+
178
+ | Clinical Entity | Combined Annotation |
179
+ | --------------- | ------------------- |
180
+ | Condition | 7514 |
181
+ | Drug | 6443 |
182
+ | Procedure | 300 |
183
+ | Measurement | 258 |
184
+ | Gene | 1180 |
185
+ | Gene Variant | 241 |
186
+
187
+
188
+ """
189
+
190
+ ENTITY_DISTRIBUTION_IMG = """<img src="file/assets/entity_distribution.png" alt="Clinical X HF" width="750" height="500">"""
191
+ LLM_BENCHMARKS_TEXT_3="""
192
+ ## Decoder Model Evaluation
193
+ Evaluating encoder models, such as BERT, for token classification tasks (e.g., NER) is straightforward given that these models process the entire input sequence simultaneously. This allows them to generate token-level classifications by leveraging bidirectional context, facilitating a direct comparison of predicted tags against the gold standard labels for each token in the input sequence.
194
+
195
+ In contrast, decoder-only models, like GPT, generate responses sequentially, predicting one token at a time based on the preceding context. Evaluating the performance of these models for token classification tasks requires a different approach. First, we prompt the decoder-only LLM with the specific task of tagging the different entity types within a given text. This task is clearly defined to the model, ensuring it understands which types of entities to identify (e.g., conditions, drugs, procedures, etc.).
196
+ An example of the task prompt is shown below.
197
+ ```
198
+ ## Instruction
199
+ Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: symptom, disorder. Use HTML <span > tags to highlight these entities. Each <span > should have a class attribute indicating the type of the entity. Do NOT provide further examples and just consider the input provided below. Do NOT provide an explanation nor notes about the reasoning. Do NOT reformat nor summarize the input text. Follow the instruction and the format of the example below.
200
+
201
+ ## Entity markup guide
202
+ Use <span class='symptom' > to denote a symptom.
203
+ Use <span class='disorder' > to denote a disorder.
204
  ```
205
+ To ensure deterministic and consistent outputs, the temperature for generation is kept at 0.0. The model then generates a sequential response that includes the tagged entities, as shown in the example below.
206
+ ```
207
+ ## Input:
208
+ He had been diagnosed with osteoarthritis of the knees and had undergone arthroscopy years prior to admission.
209
+ ## Output:
210
+ He had been diagnosed with <span class="disease" >osteoarthritis of the knees</span >and had undergone <span class="procedure" >arthroscopy</span >years prior to admission.
211
+ ```
212
+
213
+ After the tagged output is generated, it is parsed to extract the tagged entities. The parsed data are then compared against the gold standard labels, and performance metrics are computed as above. This evaluation method ensures a consistent and objective assessment of decoder-only LLM's performance in NER tasks, despite the differences in their architecture compared to encoder models.
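A minimal sketch of that parsing step, assuming the exact markup format shown in the example output above (the actual parsing function is tied to the prompt template in the clinical_ner_benchmark code and may differ):

```python
import re

# Matches the markup format used in the example output above, e.g.
# <span class="procedure" >arthroscopy</span >
SPAN_PATTERN = re.compile(r"""<span\s+class=['"](?P<label>[^'"]+)['"]\s*>(?P<text>.*?)</span\s*>""")

def parse_tagged_output(html: str) -> list[tuple[str, str]]:
    """Extract (label, text) pairs from the model's tagged output."""
    return [(m.group("label"), m.group("text").strip()) for m in SPAN_PATTERN.finditer(html)]

output = ('He had been diagnosed with <span class="disease" >osteoarthritis of the knees</span > '
          'and had undergone <span class="procedure" >arthroscopy</span > years prior to admission.')
print(parse_tagged_output(output))
# [('disease', 'osteoarthritis of the knees'), ('procedure', 'arthroscopy')]
```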
214
+
215
+ # Reproducibility
216
+ To reproduce our results, follow the steps detailed [here](https://github.com/WadoodAbdul/clinical_ner_benchmark/blob/master/docs/reproducing_results.md)
217
+
218
+ # Disclaimer and Advisory
219
+ The Leaderboard is maintained by the authors and affiliated entity as part of our ongoing contribution to open research in the field of NLP in healthcare. The leaderboard is intended for academic and exploratory purposes only. The language models evaluated on this platform (to the best knowledge of the authors) have not been approved for clinical use, and their performance should not be interpreted as clinically validated or suitable for real-world medical applications.
220
+
221
+ Users are advised to approach the results with an understanding of the inherent limitations and the experimental nature of this evaluation. The authors and affiliated entity do not endorse any specific model or approach, and the leaderboard is provided without any warranties or guarantees. Researchers and practitioners are encouraged to use the leaderboard as a resource to guide further research and development, keeping in mind the necessity for rigorous testing and validation in clinical settings before any practical application.
222
+
223
+
224
+
225
+
226
+ """
227
+
228
+ EVALUATION_QUEUE_TEXT = """
229
+
230
+ Currently, the benchmark supports the evaluation of encoder, decoder, and GLiNER models hosted on the Hugging Face Hub.
231
+ If your model needs a custom implementation, follow the steps outlined in the [clinical_ner_benchmark](https://github.com/WadoodAbdul/clinical_ner_benchmark/blob/e66eb566f34e33c4b6c3e5258ac85aba42ec7894/docs/custom_model_implementation.md) repo or reach out to our team!
232
+
233
+
234
+ ### Fields Explanation
235
+
236
+ #### Model Type:
237
+ - Fine-Tuned: If the training data consisted of any split/variation of the datasets on the leaderboard.
238
+ - Zero-Shot: If the model did not have any exposure to the datasets on the leaderboard while training.
239
 
240
+ #### Model Architecture:
241
+ - Encoder: The standard transformer encoder architecture with a token classification head on top.
242
+ - Decoder: Transformer based autoregressive token generation model.
243
+ - GLiNER: Architecture outlined in the [GLiNER Paper](https://arxiv.org/abs/2311.08526)
244
 
245
+ #### Label Normalization Map:
246
+ Not all models have been tuned to output the NER label names used in the clinical datasets on this leaderboard; some models produce a synonym of the expected entity name instead.
247
+ The normalization map can be used to ensure that the model's outputs are aligned with the labels expected in the datasets.
248
 
249
+ Note: Multiple model labels can be mapped to a single entity type in the leaderboard dataset. Ex: 'synonym' and 'disease' to 'condition'
 
250
 
 
 
251
 
252
+ Upon successful submission of your request, your model's results will be updated on the leaderboard within 5 working days!
 
 
 
253
  """
254
 
255
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
256
  CITATION_BUTTON_TEXT = r"""
257
+ @misc{abdul2024namedclinicalentityrecognition,
258
+ title={Named Clinical Entity Recognition Benchmark},
259
+ author={Wadood M Abdul and Marco AF Pimentel and Muhammad Umar Salman and Tathagata Raha and Clément Christophe and Praveen K Kanithi and Nasir Hayat and Ronnie Rajan and Shadab Khan},
260
+ year={2024},
261
+ eprint={2410.05046},
262
+ archivePrefix={arXiv},
263
+ primaryClass={cs.CL},
264
+ url={https://arxiv.org/abs/2410.05046},
265
+ }
266
+
267
  """
src/display/css_html_js.py CHANGED
@@ -1,4 +1,11 @@
 custom_css = """
+.logo {
+    width: 500px;
+    height: auto;
+    margin: 0 auto;
+    max-width: 100%;
+    object-fit: contain;
+}
 
 .markdown-text {
     font-size: 16px !important;
src/display/utils.py CHANGED
@@ -3,7 +3,9 @@ from enum import Enum
3
 
4
  import pandas as pd
5
 
6
- from src.about import Tasks
 
 
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -19,53 +21,64 @@ class ColumnContent:
19
  displayed_by_default: bool
20
  hidden: bool = False
21
  never_hidden: bool = False
 
 
 
22
 
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
- #Scores
29
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
- for task in Tasks:
31
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
  # Model information
33
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 
35
  auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
36
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
37
  auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
38
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
39
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 
42
 
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
45
 
 
46
  ## For the queue columns in the submission tab
47
  @dataclass(frozen=True)
48
  class EvalQueueColumn: # Queue column
49
  model = ColumnContent("model", "markdown", True)
50
  revision = ColumnContent("revision", "str", True)
51
  private = ColumnContent("private", "bool", True)
52
- precision = ColumnContent("precision", "str", True)
53
- weight_type = ColumnContent("weight_type", "str", "Original")
 
54
  status = ColumnContent("status", "str", True)
55
 
 
56
  ## All the model information that we might need
57
  @dataclass
58
  class ModelDetails:
59
  name: str
60
  display_name: str = ""
61
- symbol: str = "" # emoji
62
 
63
 
64
  class ModelType(Enum):
 
 
65
  PT = ModelDetails(name="pretrained", symbol="🟢")
66
  FT = ModelDetails(name="fine-tuned", symbol="🔶")
67
- IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
68
- RL = ModelDetails(name="RL-tuned", symbol="🟦")
69
  Unknown = ModelDetails(name="", symbol="?")
70
 
71
  def to_str(self, separator=" "):
@@ -73,24 +86,55 @@ class ModelType(Enum):
73
 
74
  @staticmethod
75
  def from_str(type):
76
- if "fine-tuned" in type or "🔶" in type:
77
- return ModelType.FT
78
- if "pretrained" in type or "🟢" in type:
79
- return ModelType.PT
80
- if "RL-tuned" in type or "🟦" in type:
81
- return ModelType.RL
82
- if "instruction-tuned" in type or "" in type:
83
- return ModelType.IFT
 
 
 
 
84
  return ModelType.Unknown
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  class WeightType(Enum):
87
  Adapter = ModelDetails("Adapter")
88
  Original = ModelDetails("Original")
89
  Delta = ModelDetails("Delta")
90
 
 
91
  class Precision(Enum):
92
  float16 = ModelDetails("float16")
93
  bfloat16 = ModelDetails("bfloat16")
 
 
 
 
94
  Unknown = ModelDetails("?")
95
 
96
  def from_str(precision):
@@ -98,13 +142,49 @@ class Precision(Enum):
98
  return Precision.float16
99
  if precision in ["torch.bfloat16", "bfloat16"]:
100
  return Precision.bfloat16
 
 
 
 
 
 
 
 
101
  return Precision.Unknown
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  # Column selection
104
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
 
 
 
105
 
106
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
107
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
108
 
109
- BENCHMARK_COLS = [t.value.col_name for t in Tasks]
110
-
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  import pandas as pd
5
 
6
+ from src.about import HarnessTasks
7
+ from src.about import ClinicalTypes
8
+
9
 
10
  def fields(raw_class):
11
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
21
  displayed_by_default: bool
22
  hidden: bool = False
23
  never_hidden: bool = False
24
+ dataset_task_col: bool = False
25
+ clinical_type_col: bool = False
26
+
27
 
28
  ## Leaderboard columns
29
  auto_eval_column_dict = []
30
  # Init
31
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
32
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
33
+ # Scores
34
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
35
+ for task in HarnessTasks:
36
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True)])
37
  # Model information
38
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
39
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
40
+ auto_eval_column_dict.append(["backbone", ColumnContent, ColumnContent("Base Model", "str", False)])
41
  auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
42
+ auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False, True)])
43
  auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
44
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
45
+ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False, True)])
46
+ auto_eval_column_dict.append(
47
+ ["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, True)]
48
+ )
49
+ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
50
 
51
  # We use make dataclass to dynamically fill the scores from Tasks
52
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
53
 
54
+
55
  ## For the queue columns in the submission tab
56
  @dataclass(frozen=True)
57
  class EvalQueueColumn: # Queue column
58
  model = ColumnContent("model", "markdown", True)
59
  revision = ColumnContent("revision", "str", True)
60
  private = ColumnContent("private", "bool", True)
61
+ architecture = ColumnContent("model_architecture", "bool", True)
62
+ # precision = ColumnContent("precision", "str", True)
63
+ # weight_type = ColumnContent("weight_type", "str", "Original")
64
  status = ColumnContent("status", "str", True)
65
 
66
+
67
  ## All the model information that we might need
68
  @dataclass
69
  class ModelDetails:
70
  name: str
71
  display_name: str = ""
72
+ symbol: str = "" # emoji
73
 
74
 
75
  class ModelType(Enum):
76
+ ZEROSHOT = ModelDetails(name="zero-shot", symbol="⚫")
77
+ FINETUNED = ModelDetails(name="fine-tuned", symbol="⚪")
78
  PT = ModelDetails(name="pretrained", symbol="🟢")
79
  FT = ModelDetails(name="fine-tuned", symbol="🔶")
80
+ # IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
81
+ # RL = ModelDetails(name="RL-tuned", symbol="🟦")
82
  Unknown = ModelDetails(name="", symbol="?")
83
 
84
  def to_str(self, separator=" "):
 
86
 
87
  @staticmethod
88
  def from_str(type):
89
+ if "zero-shot" in type or "⚫" in type:
90
+ return ModelType.ZEROSHOT
91
+ if "fine-tuned" in type or "⚪" in type:
92
+ return ModelType.FINETUNED
93
+ # if "fine-tuned" in type or "🔶" in type:
94
+ # return ModelType.FT
95
+ # if "pretrained" in type or "🟢" in type:
96
+ # return ModelType.PT
97
+ # if "RL-tuned" in type or "🟦" in type:
98
+ # return ModelType.RL
99
+ # if "instruction-tuned" in type or "⭕" in type:
100
+ # return ModelType.IFT
101
  return ModelType.Unknown
102
 
103
+ class ModelArch(Enum):
104
+ Encoder = ModelDetails("Encoder")
105
+ Decoder = ModelDetails("Decoder")
106
+ GLiNEREncoder = ModelDetails("GLiNER Encoder")
107
+ Unknown = ModelDetails(name="Other", symbol="?")
108
+
109
+ def to_str(self, separator=" "):
110
+ return f"{self.value.name}"
111
+
112
+ @staticmethod
113
+ def from_str(type):
114
+ if "Encoder" == type:
115
+ return ModelArch.Encoder
116
+ if "Decoder" == type:
117
+ return ModelArch.Decoder
118
+ if "GLiNER Encoder" == type:
119
+ return ModelArch.GLiNEREncoder
120
+ # if "unknown" in type:
121
+ # return ModelArch.Unknown
122
+ return ModelArch.Unknown
123
+
124
+
125
  class WeightType(Enum):
126
  Adapter = ModelDetails("Adapter")
127
  Original = ModelDetails("Original")
128
  Delta = ModelDetails("Delta")
129
 
130
+
131
  class Precision(Enum):
132
  float16 = ModelDetails("float16")
133
  bfloat16 = ModelDetails("bfloat16")
134
+ float32 = ModelDetails("float32")
135
+ # qt_8bit = ModelDetails("8bit")
136
+ # qt_4bit = ModelDetails("4bit")
137
+ # qt_GPTQ = ModelDetails("GPTQ")
138
  Unknown = ModelDetails("?")
139
 
140
  def from_str(precision):
 
142
  return Precision.float16
143
  if precision in ["torch.bfloat16", "bfloat16"]:
144
  return Precision.bfloat16
145
+ if precision in ["float32"]:
146
+ return Precision.float32
147
+ # if precision in ["8bit"]:
148
+ # return Precision.qt_8bit
149
+ # if precision in ["4bit"]:
150
+ # return Precision.qt_4bit
151
+ # if precision in ["GPTQ", "None"]:
152
+ # return Precision.qt_GPTQ
153
  return Precision.Unknown
154
 
155
+
156
+ class PromptTemplateName(Enum):
157
+ UniversalNERTemplate = "universal_ner"
158
+ LLMHTMLHighlightedSpansTemplate = "llm_html_highlighted_spans"
159
+ LLMHTMLHighlightedSpansTemplateV1 = "llm_html_highlighted_spans_v1"
160
+ LLamaNERTemplate = "llama_70B_ner"
161
+ # MixtralNERTemplate = "mixtral_ner_v0.3"
162
+
163
+ class EvaluationMetrics(Enum):
164
+ SpanBased = "Span Based"
165
+ TokenBased = "Token Based"
166
+
167
+
168
  # Column selection
169
+ DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.clinical_type_col]
170
+ Clinical_TYPES_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col]
171
+ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
172
+ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
173
+ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
174
 
175
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
176
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
177
 
178
+ DATASET_BENCHMARK_COLS = [t.value.col_name for t in HarnessTasks]
179
+ TYPES_BENCHMARK_COLS = [t.value.col_name for t in ClinicalTypes]
180
+
181
+ NUMERIC_INTERVALS = {
182
+ "?": pd.Interval(-1, 0, closed="right"),
183
+ "~1.5": pd.Interval(0, 2, closed="right"),
184
+ "~3": pd.Interval(2, 4, closed="right"),
185
+ "~7": pd.Interval(4, 9, closed="right"),
186
+ "~13": pd.Interval(9, 20, closed="right"),
187
+ "~35": pd.Interval(20, 45, closed="right"),
188
+ "~60": pd.Interval(45, 70, closed="right"),
189
+ "70+": pd.Interval(70, 10000, closed="right"),
190
+ }
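As a rough illustration of how these intervals are typically used for the parameter-count filter; the `size_bucket` helper below is a hypothetical sketch, not code from this repository:

```python
from src.display.utils import NUMERIC_INTERVALS

def size_bucket(params_in_billions: float) -> str:
    """Map a parameter count (in billions) to its display bucket."""
    for label, interval in NUMERIC_INTERVALS.items():
        if params_in_billions in interval:  # pd.Interval supports the `in` operator
            return label
    return "?"

# e.g. size_bucket(7.24) -> "~7", size_bucket(0) -> "?" (unknown size)
```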
src/envs.py CHANGED
@@ -4,22 +4,22 @@ from huggingface_hub import HfApi
4
 
5
  # Info to change for your repository
6
  # ----------------------------------
7
- TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
- REPO_ID = f"{OWNER}/leaderboard"
13
- QUEUE_REPO = f"{OWNER}/requests"
14
- RESULTS_REPO = f"{OWNER}/results"
15
 
16
  # If you setup a cache later, just change HF_HOME
17
- CACHE_PATH=os.getenv("HF_HOME", ".")
18
 
19
  # Local caches
20
- EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
- EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
- EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
- EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
 
25
  API = HfApi(token=TOKEN)
 
4
 
5
  # Info to change for your repository
6
  # ----------------------------------
7
+ TOKEN = os.environ.get("TOKEN") # A read/write token for your org
8
 
9
+ OWNER = "m42-health" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
+ REPO_ID = f"{OWNER}/MEDIC-Benchmark"
13
+ QUEUE_REPO = f"{OWNER}/medic-harness-requests"
14
+ RESULTS_REPO = f"{OWNER}/medic-harness-results"
15
 
16
  # If you setup a cache later, just change HF_HOME
17
+ CACHE_PATH = os.getenv("HF_HOME", ".")
18
 
19
  # Local caches
20
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "medic-harness-requests")
21
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "medic-harness-results")
22
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "medic-harness-requests-bk")
23
+ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "medic-harness-results-bk")
24
 
25
  API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py CHANGED
@@ -8,40 +8,48 @@ import dateutil
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
  from src.submission.check_validity import is_model_on_hub
13
 
14
 
15
  @dataclass
16
  class EvalResult:
17
- """Represents one full evaluation. Built from a combination of the result and request file for a given run.
18
- """
19
- eval_name: str # org_model_precision (uid)
20
- full_model: str # org/model (path on hub)
21
- org: str
22
  model: str
23
- revision: str # commit hash, "" if main
24
- results: dict
 
25
  precision: Precision = Precision.Unknown
26
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
- weight_type: WeightType = WeightType.Original # Original or Adapter
28
- architecture: str = "Unknown"
 
29
  license: str = "?"
30
  likes: int = 0
31
  num_params: int = 0
32
- date: str = "" # submission date of request file
33
  still_on_hub: bool = False
 
34
 
35
  @classmethod
36
- def init_from_json_file(self, json_filepath):
37
  """Inits the result from the specific model result file"""
38
  with open(json_filepath) as fp:
39
  data = json.load(fp)
40
-
41
  config = data.get("config")
42
 
43
  # Precision
44
  precision = Precision.from_str(config.get("model_dtype"))
 
 
 
 
 
45
 
46
  # Get model and org
47
  org_and_model = config.get("model_name", config.get("model_args", None))
@@ -58,17 +66,17 @@ class EvalResult:
58
  full_model = "/".join(org_and_model)
59
 
60
  still_on_hub, _, model_config = is_model_on_hub(
61
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
  )
63
- architecture = "?"
64
  if model_config is not None:
65
- architectures = getattr(model_config, "architectures", None)
66
- if architectures:
67
- architecture = ";".join(architectures)
68
 
69
  # Extract results available in this file (some results are split in several files)
70
- results = {}
71
- for task in Tasks:
72
  task = task.value
73
 
74
  # We average all scores of a given metric (not all metrics are present in all files)
@@ -76,19 +84,37 @@ class EvalResult:
76
  if accs.size == 0 or any([acc is None for acc in accs]):
77
  continue
78
 
79
- mean_acc = np.mean(accs) * 100.0
80
- results[task.benchmark] = mean_acc
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  return self(
83
  eval_name=result_key,
84
  full_model=full_model,
85
  org=org,
86
  model=model,
87
- results=results,
88
- precision=precision,
89
- revision= config.get("model_sha", ""),
 
90
  still_on_hub=still_on_hub,
91
- architecture=architecture
 
 
 
 
 
92
  )
93
 
94
  def update_with_request_file(self, requests_path):
@@ -104,32 +130,66 @@ class EvalResult:
104
  self.likes = request.get("likes", 0)
105
  self.num_params = request.get("params", 0)
106
  self.date = request.get("submitted_time", "")
 
107
  except Exception:
108
- print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
 
 
 
109
 
110
- def to_dict(self):
111
  """Converts the Eval Result to a dict compatible with our dataframe display"""
112
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
113
- data_dict = {
114
- "eval_name": self.eval_name, # not a column, just a save name,
115
- AutoEvalColumn.precision.name: self.precision.value.name,
116
- AutoEvalColumn.model_type.name: self.model_type.value.name,
117
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
118
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
119
- AutoEvalColumn.architecture.name: self.architecture,
120
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
121
- AutoEvalColumn.revision.name: self.revision,
122
- AutoEvalColumn.average.name: average,
123
- AutoEvalColumn.license.name: self.license,
124
- AutoEvalColumn.likes.name: self.likes,
125
- AutoEvalColumn.params.name: self.num_params,
126
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
127
- }
128
-
129
- for task in Tasks:
130
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
131
-
132
- return data_dict
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
 
135
  def get_request_file_for_model(requests_path, model_name, precision):
@@ -146,15 +206,12 @@ def get_request_file_for_model(requests_path, model_name, precision):
146
  for tmp_request_file in request_files:
147
  with open(tmp_request_file, "r") as f:
148
  req_content = json.load(f)
149
- if (
150
- req_content["status"] in ["FINISHED"]
151
- and req_content["precision"] == precision.split(".")[-1]
152
- ):
153
  request_file = tmp_request_file
154
  return request_file
155
 
156
 
157
- def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
158
  """From the path of the results folder root, extract all needed info for results"""
159
  model_result_filepaths = []
160
 
@@ -175,20 +232,23 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
175
  eval_results = {}
176
  for model_result_filepath in model_result_filepaths:
177
  # Creation of result
178
- eval_result = EvalResult.init_from_json_file(model_result_filepath)
179
- eval_result.update_with_request_file(requests_path)
180
 
181
  # Store results of same eval together
182
  eval_name = eval_result.eval_name
183
- if eval_name in eval_results.keys():
184
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
185
- else:
186
- eval_results[eval_name] = eval_result
187
 
188
  results = []
 
189
  for v in eval_results.values():
190
  try:
191
- v.to_dict() # we test if the dict version is complete
 
 
192
  results.append(v)
193
  except KeyError: # not all eval values present
194
  continue
 
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
+ from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, ClinicalTypes
12
  from src.submission.check_validity import is_model_on_hub
13
 
14
 
15
  @dataclass
16
  class EvalResult:
17
+ """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
18
+
19
+ eval_name: str # org_model_precision (uid)
20
+ full_model: str # org/model (path on hub)
21
+ org: str
22
  model: str
23
+ revision: str # commit hash, "" if main
24
+ dataset_results: dict
25
+ # clinical_type_results:dict
26
  precision: Precision = Precision.Unknown
27
+ model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
28
+ weight_type: WeightType = WeightType.Original # Original or Adapter
29
+ architecture: str = "Unknown"
30
+ backbone:str = "Unknown"
31
  license: str = "?"
32
  likes: int = 0
33
  num_params: int = 0
34
+ date: str = "" # submission date of request file
35
  still_on_hub: bool = False
36
+ display_result:bool = True
37
 
38
  @classmethod
39
+ def init_from_json_file(self, json_filepath, evaluation_metric):
40
  """Inits the result from the specific model result file"""
41
  with open(json_filepath) as fp:
42
  data = json.load(fp)
43
+
44
  config = data.get("config")
45
 
46
  # Precision
47
  precision = Precision.from_str(config.get("model_dtype"))
48
+ model_type = ModelType.from_str(config.get("model_type", ""))
49
+ license = config.get("license", "?")
50
+ num_params = config.get("num_params", "?")
51
+ display_result = config.get("display_result", True)
52
+ display_result = False if display_result=="False" else True
53
 
54
  # Get model and org
55
  org_and_model = config.get("model_name", config.get("model_args", None))
 
66
  full_model = "/".join(org_and_model)
67
 
68
  still_on_hub, _, model_config = is_model_on_hub(
69
+ full_model, config.get("revision", "main"), trust_remote_code=True, test_tokenizer=False
70
  )
71
+ backbone = "?"
72
  if model_config is not None:
73
+ backbones = getattr(model_config, "architectures", None)
74
+ if backbones:
75
+ backbone = ";".join(backbones)
76
 
77
  # Extract results available in this file (some results are split in several files)
78
+ dataset_results = {}
79
+ for task in HarnessTasks:
80
  task = task.value
81
 
82
  # We average all scores of a given metric (not all metrics are present in all files)
 
84
  if accs.size == 0 or any([acc is None for acc in accs]):
85
  continue
86
 
87
+ mean_acc = np.mean(accs) # * 100.0
88
+ dataset_results[task.benchmark] = mean_acc
89
+ print(dataset_results)
90
+ # types_results = {}
91
+ # for clinical_type in ClinicalTypes:
92
+ # clinical_type = clinical_type.value
93
+
94
+ # # We average all scores of a given metric (not all metrics are present in all files)
95
+ # accs = np.array([v.get(clinical_type.metric, None) for k, v in data[evaluation_metric]["clinical_type_results"].items() if clinical_type.benchmark == k])
96
+ # if accs.size == 0 or any([acc is None for acc in accs]):
97
+ # continue
98
+
99
+ # mean_acc = np.mean(accs) # * 100.0
100
+ # types_results[clinical_type.benchmark] = mean_acc
101
 
102
  return self(
103
  eval_name=result_key,
104
  full_model=full_model,
105
  org=org,
106
  model=model,
107
+ dataset_results=dataset_results,
108
+ # clinical_type_results=types_results,
109
+ precision=precision,
110
+ revision=config.get("revision", ""),
111
  still_on_hub=still_on_hub,
112
+ # architecture=model_architecture,
113
+ backbone=backbone,
114
+ model_type=model_type,
115
+ num_params=num_params,
116
+ license=license,
117
+ display_result=display_result
118
  )
119
 
120
  def update_with_request_file(self, requests_path):
 
130
  self.likes = request.get("likes", 0)
131
  self.num_params = request.get("params", 0)
132
  self.date = request.get("submitted_time", "")
133
+ # self.precision = request.get("precision", "float32")
134
  except Exception:
135
+ pass
136
+ # print(
137
+ # f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
138
+ # )
139
+ # print(f" Args used were - {request_file=}, {requests_path=}, {self.full_model=},")
140
 
141
+ def to_dict(self, subset):
142
  """Converts the Eval Result to a dict compatible with our dataframe display"""
143
+ if subset == "datasets":
144
+ average = sum([v for v in self.dataset_results.values() if v is not None]) / len(HarnessTasks)
145
+ data_dict = {
146
+ "eval_name": self.eval_name, # not a column, just a save name,
147
+ AutoEvalColumn.precision.name: self.precision.value.name,
148
+ AutoEvalColumn.model_type.name: self.model_type.value.name,
149
+ AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
150
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
151
+ # AutoEvalColumn.architecture.name: self.architecture.value.name,
152
+ AutoEvalColumn.backbone.name: self.backbone,
153
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
154
+ AutoEvalColumn.revision.name: self.revision,
155
+ AutoEvalColumn.average.name: average,
156
+ AutoEvalColumn.license.name: self.license,
157
+ AutoEvalColumn.likes.name: self.likes,
158
+ AutoEvalColumn.params.name: self.num_params,
159
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
160
+ "display_result" : self.display_result,
161
+ }
162
+
163
+ for task in HarnessTasks:
164
+ data_dict[task.value.col_name] = self.dataset_results[task.value.benchmark]
165
+
166
+ return data_dict
167
+
168
+ if subset == "clinical_types":
169
+ average = sum([v for v in self.clinical_type_results.values() if v is not None]) / len(ClinicalTypes)
170
+ data_dict = {
171
+ "eval_name": self.eval_name, # not a column, just a save name,
172
+ AutoEvalColumn.precision.name: self.precision.value.name,
173
+ AutoEvalColumn.model_type.name: self.model_type.value.name,
174
+ AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
175
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
176
+ AutoEvalColumn.architecture.name: self.architecture.value.name,
177
+ AutoEvalColumn.backbone.name: self.backbone,
178
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
179
+ AutoEvalColumn.revision.name: self.revision,
180
+ AutoEvalColumn.average.name: average,
181
+ AutoEvalColumn.license.name: self.license,
182
+ AutoEvalColumn.likes.name: self.likes,
183
+ AutoEvalColumn.params.name: self.num_params,
184
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
185
+ "display_result" : self.display_result,
186
+ }
187
+
188
+ for clinical_type in ClinicalTypes:
189
+ data_dict[clinical_type.value.col_name] = self.clinical_type_results[clinical_type.value.benchmark]
190
+
191
+ return data_dict
192
+
193
 
194
 
195
  def get_request_file_for_model(requests_path, model_name, precision):
 
206
  for tmp_request_file in request_files:
207
  with open(tmp_request_file, "r") as f:
208
  req_content = json.load(f)
209
+ if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
 
 
 
210
  request_file = tmp_request_file
211
  return request_file
212
 
213
 
214
+ def get_raw_eval_results(results_path: str, requests_path: str, evaluation_metric: str) -> list[EvalResult]:
215
  """From the path of the results folder root, extract all needed info for results"""
216
  model_result_filepaths = []
217
 
 
232
  eval_results = {}
233
  for model_result_filepath in model_result_filepaths:
234
  # Creation of result
235
+ eval_result = EvalResult.init_from_json_file(model_result_filepath, evaluation_metric)
236
+ # eval_result.update_with_request_file(requests_path)
237
 
238
  # Store results of same eval together
239
  eval_name = eval_result.eval_name
240
+ # if eval_name in eval_results.keys():
241
+ # eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
242
+ # else:
243
+ eval_results[eval_name] = eval_result
244
 
245
  results = []
246
+ # clinical_type_results = []
247
  for v in eval_results.values():
248
  try:
249
+ v.to_dict(subset="datasets") # we test if the dict version is complete
250
+ if not v.display_result:
251
+ continue
252
  results.append(v)
253
  except KeyError: # not all eval values present
254
  continue
src/populate.py CHANGED
@@ -8,18 +8,21 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
- raw_data = get_raw_eval_results(results_path, requests_path)
14
- all_data_json = [v.to_dict() for v in raw_data]
 
 
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
 
18
  df = df[cols].round(decimals=2)
19
 
20
  # filter out if any of the benchmarks have not been produced
21
  df = df[has_no_nan_values(df, benchmark_cols)]
22
- return df
23
 
24
 
25
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
@@ -33,19 +36,19 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
33
  with open(file_path) as fp:
34
  data = json.load(fp)
35
 
36
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
37
  data[EvalQueueColumn.revision.name] = data.get("revision", "main")
38
 
39
  all_evals.append(data)
40
  elif ".md" not in entry:
41
  # this is a folder
42
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
43
  for sub_entry in sub_entries:
44
  file_path = os.path.join(save_path, entry, sub_entry)
45
  with open(file_path) as fp:
46
  data = json.load(fp)
47
-
48
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
49
  data[EvalQueueColumn.revision.name] = data.get("revision", "main")
50
  all_evals.append(data)
51
 
 
8
  from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
+ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, evaluation_metric: str, subset: str) -> tuple[list, pd.DataFrame]:
12
  """Creates a dataframe from all the individual experiment results"""
13
+ raw_data = get_raw_eval_results(results_path, requests_path, evaluation_metric)
14
+ # print(raw_data)
15
+ # raise Exception("stop")
16
+ all_data_json = [v.to_dict(subset=subset) for v in raw_data]
17
 
18
  df = pd.DataFrame.from_records(all_data_json)
19
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
20
+ cols = list(set(df.columns).intersection(set(cols)))
21
  df = df[cols].round(decimals=2)
22
 
23
  # filter out if any of the benchmarks have not been produced
24
  df = df[has_no_nan_values(df, benchmark_cols)]
25
+ return raw_data, df
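A hedged sketch of how the updated signature might be called (the call site in app.py is not shown in this diff); the `evaluation_metric` value is an assumption about the key used in the result files, loosely based on the `EvaluationMetrics` enum:

```python
from src.display.utils import DATASET_COLS, DATASET_BENCHMARK_COLS
from src.envs import EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH
from src.populate import get_leaderboard_df

# get_leaderboard_df now returns the raw EvalResult objects alongside the dataframe.
raw_data, leaderboard_df = get_leaderboard_df(
    EVAL_RESULTS_PATH,
    EVAL_REQUESTS_PATH,
    DATASET_COLS,
    DATASET_BENCHMARK_COLS,
    evaluation_metric="Span Based",  # assumed result-file key, see EvaluationMetrics
    subset="datasets",
)
```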
26
 
27
 
28
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
 
36
  with open(file_path) as fp:
37
  data = json.load(fp)
38
 
39
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
40
  data[EvalQueueColumn.revision.name] = data.get("revision", "main")
41
 
42
  all_evals.append(data)
43
  elif ".md" not in entry:
44
  # this is a folder
45
+ sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
46
  for sub_entry in sub_entries:
47
  file_path = os.path.join(save_path, entry, sub_entry)
48
  with open(file_path) as fp:
49
  data = json.load(fp)
50
+ # print(data)
51
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
52
  data[EvalQueueColumn.revision.name] = data.get("revision", "main")
53
  all_evals.append(data)
54
 
src/submission/check_validity.py CHANGED
@@ -59,14 +59,24 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
59
  return False, "was not found on hub!", None
60
 
61
 
62
- def get_model_size(model_info: ModelInfo, precision: str):
63
  """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
 
 
64
  try:
65
  model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
  except (AttributeError, TypeError):
67
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
68
-
69
- size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
 
 
 
 
 
 
 
 
70
  model_size = size_factor * model_size
71
  return model_size
72
 
@@ -88,12 +98,12 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
88
  continue
89
  with open(os.path.join(root, file), "r") as f:
90
  info = json.load(f)
91
- file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
92
 
93
  # Select organisation
94
- if info["model"].count("/") == 0 or "submitted_time" not in info:
95
  continue
96
- organisation, _ = info["model"].split("/")
97
  users_to_submission_dates[organisation].append(info["submitted_time"])
98
 
99
  return set(file_names), users_to_submission_dates
 
59
  return False, "was not found on hub!", None
60
 
61
 
62
+ def get_model_size(model_info: ModelInfo, precision: str=None):
63
  """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
64
+ size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
65
+
66
  try:
67
  model_size = round(model_info.safetensors["total"] / 1e9, 3)
68
  except (AttributeError, TypeError):
69
+ try:
70
+ size_match = re.search(size_pattern, model_info.id.lower())
71
+ model_size = size_match.group(0)
72
+ model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
73
+ except AttributeError:
74
+ return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
75
+
76
+ if precision:
77
+ size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
78
+ else:
79
+ size_factor = 1
80
  model_size = size_factor * model_size
81
  return model_size
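To make the name-based fallback concrete, here is a small sketch of the size regex applied to hypothetical Hub model ids (not ids from this leaderboard):

```python
import re

size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")

# Mirrors the fallback in get_model_size when safetensors metadata is missing.
for model_id in ["org/clinical-ner-7b", "org/tiny-ner-350m", "org/ner-1.3b"]:
    size = re.search(size_pattern, model_id.lower()).group(0)
    params_b = float(size[:-1]) if size[-1] == "b" else float(size[:-1]) / 1e3
    print(model_id, round(params_b, 3))  # 7.0, 0.35, 1.3
```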
82
 
 
98
  continue
99
  with open(os.path.join(root, file), "r") as f:
100
  info = json.load(f)
101
+ file_names.append(f"{info['model_name']}_{info['revision']}")
102
 
103
  # Select organisation
104
+ if info["model_name"].count("/") == 0 or "submitted_time" not in info:
105
  continue
106
+ organisation, _ = info["model_name"].split("/")
107
  users_to_submission_dates[organisation].append(info["submitted_time"])
108
 
109
  return set(file_names), users_to_submission_dates
src/submission/submit.py CHANGED
@@ -1,5 +1,6 @@
1
  import json
2
  import os
 
3
  from datetime import datetime, timezone
4
 
5
  from src.display.formatting import styled_error, styled_message, styled_warning
@@ -10,18 +11,57 @@ from src.submission.check_validity import (
10
  get_model_size,
11
  is_model_on_hub,
12
  )
 
13
 
14
  REQUESTED_MODELS = None
15
  USERS_TO_SUBMISSION_DATES = None
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def add_new_eval(
18
  model: str,
19
- base_model: str,
20
  revision: str,
21
- precision: str,
22
- weight_type: str,
 
 
 
 
 
23
  model_type: str,
24
  ):
 
 
 
 
 
 
 
 
 
25
  global REQUESTED_MODELS
26
  global USERS_TO_SUBMISSION_DATES
27
  if not REQUESTED_MODELS:
@@ -33,26 +73,35 @@ def add_new_eval(
33
  user_name = model.split("/")[0]
34
  model_path = model.split("/")[1]
35
 
36
- precision = precision.split(" ")[0]
37
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
 
39
  if model_type is None or model_type == "":
40
  return styled_error("Please select a model type.")
 
 
41
 
42
  # Does the model actually exist?
43
  if revision == "":
44
  revision = "main"
45
 
46
- # Is the model on the hub?
47
- if weight_type in ["Delta", "Adapter"]:
48
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
- if not base_model_on_hub:
50
- return styled_error(f'Base model "{base_model}" {error}')
51
 
52
- if not weight_type == "Adapter":
53
  model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
  if not model_on_hub:
55
  return styled_error(f'Model "{model}" {error}')
 
 
 
 
 
 
 
56
 
57
  # Is the model info correctly filled?
58
  try:
@@ -60,7 +109,7 @@ def add_new_eval(
60
  except Exception:
61
  return styled_error("Could not get your model information. Please fill it up properly.")
62
 
63
- model_size = get_model_size(model_info=model_info, precision=precision)
64
 
65
  # Were the model card and license filled?
66
  try:
@@ -72,32 +121,71 @@ def add_new_eval(
72
  if not modelcard_OK:
73
  return styled_error(error_msg)
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  # Seems good, creating the eval
76
  print("Adding new eval")
77
 
 
78
  eval_entry = {
79
- "model": model,
80
- "base_model": base_model,
81
  "revision": revision,
82
- "precision": precision,
83
- "weight_type": weight_type,
 
84
  "status": "PENDING",
85
  "submitted_time": current_time,
86
  "model_type": model_type,
87
  "likes": model_info.likes,
88
- "params": model_size,
89
  "license": license,
90
  "private": False,
 
91
  }
92
 
93
  # Check for duplicate submission
94
- if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
- return styled_warning("This model has been already submitted.")
 
96
 
97
  print("Creating eval file")
98
  OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
  os.makedirs(OUT_DIR, exist_ok=True)
100
- out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
 
102
  with open(out_path, "w") as f:
103
  f.write(json.dumps(eval_entry))
 
1
  import json
2
  import os
3
+ import ast
4
  from datetime import datetime, timezone
5
 
6
  from src.display.formatting import styled_error, styled_message, styled_warning
 
11
  get_model_size,
12
  is_model_on_hub,
13
  )
14
+ from src.display.utils import PromptTemplateName
15
 
16
  REQUESTED_MODELS = None
17
  USERS_TO_SUBMISSION_DATES = None
18
 
19
+ PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG = """{
20
+ "NCBI" : {
21
+ "" : "condition"
22
+ },
23
+ "CHIA" : {
24
+ "" : "condition",
25
+ "" : "drug",
26
+ "" : "procedure",
27
+ "" : "measurement"
28
+ },
29
+ "BIORED" : {
30
+ "" : "condition",
31
+ "" : "drug",
32
+ "" : "gene",
33
+ "" : "gene variant"
34
+ },
35
+ "BC5CDR" : {
36
+ "" : "condition",
37
+ "" : "drug"
38
+ }
39
+ }
40
+
41
+ """
42
+
43
  def add_new_eval(
44
  model: str,
45
+ # base_model: str,
46
  revision: str,
47
+ # precision: str,
48
+ # weight_type: str,
49
+ model_arch: str,
50
+ label_normalization_map: str,
51
+ gliner_threshold:str,
52
+ gliner_tokenizer_bool:str,
53
+ prompt_template_name:str,
54
  model_type: str,
55
  ):
56
+ """
57
+ Saves request if valid else returns the error.
58
+ Validity is checked based on -
59
+ - model's existence on hub
60
+ - necessary info on the model's card
61
+ - label normalization is a valid python dict and contains the keys for all datasets
62
+ - threshold for gliner is a valid float
63
+
64
+ """
65
  global REQUESTED_MODELS
66
  global USERS_TO_SUBMISSION_DATES
67
  if not REQUESTED_MODELS:
 
73
  user_name = model.split("/")[0]
74
  model_path = model.split("/")[1]
75
 
76
+ # precision = precision.split(" ")[0]
77
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
78
 
79
  if model_type is None or model_type == "":
80
  return styled_error("Please select a model type.")
81
+
82
+ model_type = model_type.split(":")[-1].strip()
83
 
84
  # Does the model actually exist?
85
  if revision == "":
86
  revision = "main"
87
 
88
+ # # Is the model on the hub?
89
+ # if weight_type in ["Delta", "Adapter"]:
90
+ # base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
91
+ # if not base_model_on_hub:
92
+ # return styled_error(f'Base model "{base_model}" {error}')
93
 
94
+ if not model_arch == "GLiNER Encoder":
95
  model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
96
  if not model_on_hub:
97
  return styled_error(f'Model "{model}" {error}')
98
+ else:
99
+ model_name_matches = list(API.list_models(model_name=model))
100
+ if len(model_name_matches) < 1:
101
+ return styled_error(f'Model "{model}" does not exist on the hub!')
102
+ elif model_name_matches[0].id != model:
103
+ return styled_error(f'Model "{model}" does not exist on the hub! There might be a typo in the name')
104
+
105
 
106
  # Is the model info correctly filled?
107
  try:
 
109
  except Exception:
110
  return styled_error("Could not get your model information. Please fill it up properly.")
111
 
112
+ model_size = get_model_size(model_info=model_info)
113
 
114
  # Were the model card and license filled?
115
  try:
 
121
  if not modelcard_OK:
122
  return styled_error(error_msg)
123
 
124
+ # Verify the inference config now
125
+ try:
126
+ label_normalization_map = ast.literal_eval(label_normalization_map)
127
+ except Exception as e:
128
+ return styled_error("Please enter a valid JSON for the label normalization map")
129
+
130
+ inference_config = {
131
+ # "model_arch" : model_arch,
132
+ "label_normalization_map": label_normalization_map,
133
+ }
134
+
135
+ match model_arch:
136
+ case "Encoder":
137
+ pass
138
+ case "Decoder":
139
+ if not prompt_template_name in [prompt_template.value for prompt_template in PromptTemplateName]:
140
+ return styled_error("Prompt template name is invalid")
141
+ inference_config = {
142
+ **inference_config,
143
+ "prompt_template_identifier": prompt_template_name,
144
+ }
145
+ case "GLiNER Encoder":
146
+ try:
147
+ gliner_threshold = float(gliner_threshold)
148
+ gliner_tokenizer_bool = ast.literal_eval(gliner_tokenizer_bool)
149
+ inference_config = {
150
+ **inference_config,
151
+ "gliner_threshold": gliner_threshold,
152
+ "gliner_tokenizer_bool" : gliner_tokenizer_bool
153
+ }
154
+ except Exception as e:
155
+ return styled_error("Please enter a valid float for the threshold and a valid boolean for the tokenizer flag")
156
+ case _:
157
+ return styled_error("Model Architecture is invalid")
158
+
159
  # Seems good, creating the eval
160
  print("Adding new eval")
161
 
162
+
163
  eval_entry = {
164
+ "model_name": model,
165
+ # "base_model": base_model,
166
  "revision": revision,
167
+ # "precision": precision,
168
+ # "weight_type": weight_type,
169
+ "model_architecture": model_arch,
170
  "status": "PENDING",
171
  "submitted_time": current_time,
172
  "model_type": model_type,
173
  "likes": model_info.likes,
174
+ "num_params": model_size,
175
  "license": license,
176
  "private": False,
177
+ "inference_config":inference_config,
178
  }
179
 
180
  # Check for duplicate submission
181
+
182
+ if f"{model}_{revision}" in REQUESTED_MODELS:
183
+ return styled_warning("This model has already been submitted. Add the revision if the model has been updated.")
184
 
185
  print("Creating eval file")
186
  OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
187
  os.makedirs(OUT_DIR, exist_ok=True)
188
+ out_path = f"{OUT_DIR}/{model_path}_{revision}_eval_request.json"
189
 
190
  with open(out_path, "w") as f:
191
  f.write(json.dumps(eval_entry))