Spaces · Running
tathagataraha committed · Commit 09b313f · Parent(s): 9244862

[ADD] Harness tasks, data display
Files changed:
- .gitignore +2 -0
- README.md +11 -2
- app.py +342 -60
- assets/entity_distribution.png +0 -0
- assets/image.png +0 -0
- assets/ner_evaluation_example.png +0 -0
- eval_metrics_app.py +75 -0
- medic-harness-requests/.gitattributes +58 -0
- medic-harness-results/.gitattributes +58 -0
- medic-harness-results/aaditya/Llama3-OpenBioLLM-70B/results_2024-07-24T15:26:36Z.json +37 -0
- medic-harness-results/meta-llama/Llama-3.1-8B-Instruct/results_2024-07-24T15:26:36Z.json +39 -0
- requirements.txt +4 -2
- src/about.py +231 -36
- src/display/css_html_js.py +7 -0
- src/display/utils.py +105 -25
- src/envs.py +10 -10
- src/leaderboard/read_evals.py +122 -62
- src/populate.py +11 -8
- src/submission/check_validity.py +17 -7
- src/submission/submit.py +107 -19
.gitignore
CHANGED
@@ -10,4 +10,6 @@ eval-queue/
eval-results/
eval-queue-bk/
eval-results-bk/
+eval-queue-local/
+eval-results-local/
logs/
README.md
CHANGED
@@ -1,5 +1,5 @@
---
-title:
+title: Clinical NER Leaderboard
emoji: 🥇
colorFrom: green
colorTo: indigo
@@ -7,8 +7,17 @@ sdk: gradio
app_file: app.py
pinned: true
license: apache-2.0
+tags:
+  - leaderboard
+  - submission:automatic
+  - test:public
+  - judge:auto
+  - modality:text
---

+Also known as the NCER leaderboard on Hugging Face. See the paper for more info: https://huggingface.co/papers/2410.05046.
+
+
# Start the configuration

Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
@@ -41,4 +50,4 @@ If you encounter problems on the space, don't hesitate to restart it to remove the
You'll find
- the main table's column names and properties in `src/display/utils.py`
- the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
--
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
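For orientation, the task configuration that the README points to in `src/about.py` follows a small dataclass-plus-Enum pattern (the concrete `HarnessTask` entries appear in the `src/about.py` diff further down in this commit). A minimal, illustrative sketch of that pattern — the entries here are examples, not the full list:

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # task key in the results JSON
    metric: str     # metric key in the results JSON
    col_name: str   # column name shown in the leaderboard


class Tasks(Enum):
    # Illustrative entries only; the real definitions are in the src/about.py diff below.
    task0 = Task("MMLU", "accuracy", "MMLU")
    task1 = Task("MedQA", "accuracy", "MedQA")
```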
app.py
CHANGED
@@ -1,5 +1,6 @@
+import subprocess
+
import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
@@ -9,30 +10,41 @@ from src.about import (
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT_1,
+    EVALUATION_EXAMPLE_IMG,
+    LLM_BENCHMARKS_TEXT_2,
+    # ENTITY_DISTRIBUTION_IMG,
+    LLM_BENCHMARKS_TEXT_3,
    TITLE,
+    LOGO
)
from src.display.css_html_js import custom_css
from src.display.utils import (
+    DATASET_BENCHMARK_COLS,
+    TYPES_BENCHMARK_COLS,
+    DATASET_COLS,
+    Clinical_TYPES_COLS,
    EVAL_COLS,
    EVAL_TYPES,
+    NUMERIC_INTERVALS,
+    TYPES,
    AutoEvalColumn,
    ModelType,
+    ModelArch,
+    PromptTemplateName,
+    Precision,
    WeightType,
+    fields,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
+from src.submission.submit import add_new_eval, PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG


def restart_space():
    API.restart_space(repo_id=REPO_ID)

+
try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
@@ -48,8 +60,20 @@ try:
except Exception:
    restart_space()

+# Span based results
+_, harness_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "accuracy", "datasets")
+harness_datasets_leaderboard_df = harness_datasets_original_df.copy()
+
+# _, span_based_types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "SpanBasedWithPartialOverlap", "clinical_types")
+# span_based_types_leaderboard_df = span_based_types_original_df.copy()
+
+# # Token based results
+# _, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
+# token_based_datasets_leaderboard_df = token_based_datasets_original_df.copy()
+
+# _, token_based_types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "clinical_types")
+# token_based_types_leaderboard_df = token_based_types_original_df.copy()

-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

(
    finished_eval_queue_df,
@@ -57,51 +81,288 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
-    )

+def update_df(shown_columns, subset="datasets"):
+    leaderboard_table_df = harness_datasets_leaderboard_df.copy()
+    hidden_leader_board_df = harness_datasets_original_df
+    # else:
+    #     match evaluation_metric:
+    #         case "Span Based":
+    #             leaderboard_table_df = span_based_types_leaderboard_df.copy()
+    #             hidden_leader_board_df = span_based_types_original_df
+    #         case "Token Based":
+    #             leaderboard_table_df = token_based_types_leaderboard_df.copy()
+    #             hidden_leader_board_df = token_based_types_original_df
+    #         case _:
+    #             pass
+
+    value_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns
+
+    return leaderboard_table_df[value_cols], hidden_leader_board_df
+
+
+# Searching and filtering
+def update_table(
+    hidden_df: pd.DataFrame,
+    columns: list,
+    query: str,
+    type_query: list = None,
+    architecture_query: list = None,
+    size_query: list = None,
+    precision_query: str = None,
+    show_deleted: bool = False,
+):
+    filtered_df = filter_models(hidden_df, type_query, architecture_query, size_query, precision_query, show_deleted)
+    filtered_df = filter_queries(query, filtered_df)
+    df = select_columns(filtered_df, columns, list(hidden_df.columns))
+    return df
+
+
+def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
+    return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
+
+
+def select_columns(df: pd.DataFrame, columns: list, cols: list) -> pd.DataFrame:
+    always_here_cols = [
+        AutoEvalColumn.model_type_symbol.name,
+        AutoEvalColumn.model.name,
+    ]
+    # We use COLS to maintain sorting
+    filtered_df = df[always_here_cols + [c for c in cols if c in df.columns and c in columns]]
+    return filtered_df
+
+
+def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
+    final_df = []
+    if query != "":
+        queries = [q.strip() for q in query.split(";")]
+        for _q in queries:
+            _q = _q.strip()
+            if _q != "":
+                temp_filtered_df = search_table(filtered_df, _q)
+                if len(temp_filtered_df) > 0:
+                    final_df.append(temp_filtered_df)
+        if len(final_df) > 0:
+            filtered_df = pd.concat(final_df)
+            filtered_df = filtered_df.drop_duplicates(
+                subset=[
+                    AutoEvalColumn.model.name,
+                    # AutoEvalColumn.precision.name,
+                    # AutoEvalColumn.revision.name,
+                ]
+            )
+
+    return filtered_df
+
+
+def filter_models(
+    df: pd.DataFrame, type_query: list, architecture_query: list, size_query: list, precision_query: list, show_deleted: bool
+) -> pd.DataFrame:
+    # Show all models
+    # if show_deleted:
+    #     filtered_df = df
+    # else:  # Show only still on the hub models
+    #     filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
+
+    filtered_df = df
+
+    if type_query is not None:
+        type_emoji = [t[0] for t in type_query]
+        filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
+
+    if architecture_query is not None:
+        arch_types = [t for t in architecture_query]
+        filtered_df = filtered_df.loc[df[AutoEvalColumn.architecture.name].isin(arch_types)]
+        # filtered_df = filtered_df.loc[df[AutoEvalColumn.architecture.name].isin(architecture_query + ["None"])]
+
+    if precision_query is not None:
+        if AutoEvalColumn.precision.name in df.columns:
+            filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
+
+    if size_query is not None:
+        numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
+        params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
+        mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
+        filtered_df = filtered_df.loc[mask]
+
+    return filtered_df
+
+
+def change_submit_request_form(model_architecture):
+    match model_architecture:
+        case "Encoder":
+            return (
+                gr.Textbox(label="Threshold for gliner models", visible=False),
+                gr.Radio(
+                    choices=["True", "False"],
+                    label="Load GLiNER Tokenizer",
+                    visible=False
+                ),
+                gr.Dropdown(
+                    choices=[prompt_template.value for prompt_template in PromptTemplateName],
+                    label="Prompt for generation",
+                    multiselect=False,
+                    # value="HTML Highlighted Spans",
+                    interactive=True,
+                    visible=False
+                )
+            )
+        case "Decoder":
+            return (
+                gr.Textbox(label="Threshold for gliner models", visible=False),
+                gr.Radio(
+                    choices=["True", "False"],
+                    label="Load GLiNER Tokenizer",
+                    visible=False
+                ),
+                gr.Dropdown(
+                    choices=[prompt_template.value for prompt_template in PromptTemplateName],
+                    label="Prompt for generation",
+                    multiselect=False,
+                    # value="HTML Highlighted Spans",
+                    interactive=True,
+                    visible=True
+                )
+            )
+        case "GLiNER Encoder":
+            return (
+                gr.Textbox(label="Threshold for gliner models", visible=True),
+                gr.Radio(
+                    choices=["True", "False"],
+                    label="Load GLiNER Tokenizer",
+                    visible=True
+                ),
+                gr.Dropdown(
+                    choices=[prompt_template.value for prompt_template in PromptTemplateName],
+                    label="Prompt for generation",
+                    multiselect=False,
+                    # value="HTML Highlighted Spans",
+                    interactive=True,
+                    visible=False
+                )
+            )

+
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
+    gr.HTML(LOGO, elem_classes="logo")
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+        with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-table", id=0):
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        search_bar = gr.Textbox(
+                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                            show_label=False,
+                            elem_id="search-bar",
+                        )
+                    with gr.Row():
+                        shown_columns = gr.CheckboxGroup(
+                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.clinical_type_col],
+                            value=[
+                                c.name
+                                for c in fields(AutoEvalColumn)
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.clinical_type_col
+                            ],
+                            label="Select columns to show",
+                            elem_id="column-select",
+                            interactive=True,
+                        )
+                    # with gr.Row():
+                    #     deleted_models_visibility = gr.Checkbox(
+                    #         value=False, label="Show gated/private/deleted models", interactive=True
+                    #     )
+                with gr.Column(min_width=320):
+                    # with gr.Box(elem_id="box-filter"):
+                    filter_columns_type = gr.CheckboxGroup(
+                        label="Model Types",
+                        choices=[t.to_str() for t in ModelType],
+                        value=[t.to_str() for t in ModelType],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    # filter_columns_architecture = gr.CheckboxGroup(
+                    #     label="Architecture Types",
+                    #     choices=[i.value.name for i in ModelArch],
+                    #     value=[i.value.name for i in ModelArch],
+                    #     interactive=True,
+                    #     elem_id="filter-columns-architecture",
+                    # )
+                    filter_columns_size = gr.CheckboxGroup(
+                        label="Model sizes (in billions of parameters)",
+                        choices=list(NUMERIC_INTERVALS.keys()),
+                        value=list(NUMERIC_INTERVALS.keys()),
+                        interactive=True,
+                        elem_id="filter-columns-size",
+                    )

+            datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")

+            leaderboard_table = gr.components.Dataframe(
+                value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                datatype=TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+            )
+
+            # Dummy leaderboard for handling the case when the user uses backspace key
+            # hidden_leaderboard_table_for_search = gr.components.Dataframe(
+            #     value=datasets_original_df[DATASET_COLS],
+            #     headers=DATASET_COLS,
+            #     datatype=TYPES,
+            #     visible=False,
+            # )
+
+            # search_bar.submit(
+            #     update_table,
+            #     [
+            #         hidden_leaderboard_table_for_search,
+            #         shown_columns,
+            #         search_bar,
+            #         filter_columns_type,
+            #         # filter_columns_architecture
+            #     ],
+            #     leaderboard_table,
+            # )
+            # for selector in [
+            #     shown_columns,
+            #     filter_columns_type,
+            #     # filter_columns_architecture,
+            #     # filter_columns_size,
+            #     # deleted_models_visibility,
+            # ]:
+            #     selector.change(
+            #         update_table,
+            #         [
+            #             hidden_leaderboard_table_for_search,
+            #             shown_columns,
+            #             search_bar,
+            #             filter_columns_type,
+            #             # filter_columns_architecture,
+            #         ],
+            #         leaderboard_table,
+            #         queue=True,
+            #     )
+
+        with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
+            pass
+        with gr.TabItem("🏅 Med Safety", elem_id="llm-benchmark-tab-table", id=2):
+            pass
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
+            gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
+            gr.HTML(EVALUATION_EXAMPLE_IMG, elem_classes="logo")
+            gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
+            # gr.HTML(ENTITY_DISTRIBUTION_IMG, elem_classes="logo")
+            gr.Markdown(LLM_BENCHMARKS_TEXT_3, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=4):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -146,8 +407,16 @@ with demo:

            with gr.Row():
                with gr.Column():
+
                    model_name_textbox = gr.Textbox(label="Model name")
+
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+
+                    model_arch = gr.Radio(
+                        choices=[t.to_str(" : ") for t in ModelArch if t != ModelArch.Unknown],
+                        label="Model Architecture",
+                    )
+
                    model_type = gr.Dropdown(
                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                        label="Model type",
@@ -157,21 +426,29 @@
                    )

                with gr.Column():
+                    label_normalization_map = gr.Textbox(lines=6, label="Label Normalization Map", placeholder=PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG)
+                    gliner_threshold = gr.Textbox(label="Threshold for GLiNER models", visible=False)
+                    gliner_tokenizer_bool = gr.Radio(
+                        choices=["True", "False"],
+                        label="Load GLiNER Tokenizer",
+                        visible=False
                    )
+                    prompt_name = gr.Dropdown(
+                        choices=[prompt_template.value for prompt_template in PromptTemplateName],
+                        label="Prompt for generation",
                        multiselect=False,
+                        value="HTML Highlighted Spans",
                        interactive=True,
+                        visible=False
+                    )  # should be a dropdown
+
+                    # parsing_function - this is tied to the prompt & therefore does not need to be specified
+                    # generation_parameters = gr.Textbox(label="Generation params in json format") just default for now
+
+                    model_arch.change(fn=change_submit_request_form, inputs=model_arch, outputs=[
+                        gliner_threshold,
+                        gliner_tokenizer_bool,
+                        prompt_name])

            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
@@ -179,15 +456,20 @@ with demo:
                add_new_eval,
                [
                    model_name_textbox,
-                    base_model_name_textbox,
+                    # base_model_name_textbox,
                    revision_name_textbox,
+                    model_arch,
+                    label_normalization_map,
+                    gliner_threshold,
+                    gliner_tokenizer_bool,
+                    prompt_name,
+                    # weight_type,
                    model_type,
                ],
                submission_result,
            )

+
        with gr.Row():
            with gr.Accordion("📙 Citation", open=False):
                citation_button = gr.Textbox(
@@ -201,4 +483,4 @@ with demo:
    scheduler = BackgroundScheduler()
    scheduler.add_job(restart_space, "interval", seconds=1800)
    scheduler.start()
-    demo.queue(default_concurrency_limit=40).launch()
+    demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'])
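The search-and-filter path added above is plain pandas: `update_table` chains `filter_models`, `filter_queries` and `select_columns` over a hidden copy of the full leaderboard table. A rough, self-contained illustration of the same chaining idea on a toy frame (this is not the Space's actual column objects or helpers, just the pattern):

```python
import pandas as pd

# Toy stand-in for the hidden leaderboard dataframe.
df = pd.DataFrame({
    "T": ["🟢", "🔶"],                       # model-type symbol column
    "Model": ["org/model-a", "org/model-b"],
    "Average": [78.3, 67.2],
    "#Params (B)": [70, 8],
})

def search(frame: pd.DataFrame, query: str) -> pd.DataFrame:
    # Same idea as search_table(): case-insensitive substring match on the model name.
    return frame[frame["Model"].str.contains(query, case=False)]

def filter_size(frame: pd.DataFrame, low: float, high: float) -> pd.DataFrame:
    # Same idea as the NUMERIC_INTERVALS size filter, with one explicit interval.
    params = pd.to_numeric(frame["#Params (B)"], errors="coerce")
    return frame.loc[params.between(low, high)]

# Chain the filters the way update_table() does.
print(filter_size(search(df, "model-a"), 10, 150))  # keeps only the 70B row
```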
assets/entity_distribution.png
ADDED
assets/image.png
ADDED
assets/ner_evaluation_example.png
ADDED
eval_metrics_app.py
ADDED
@@ -0,0 +1,75 @@
+import gradio as gr
+
+# Function to compute evaluation metrics (dummy implementation)
+def compute_metrics(gt_spans, pred_spans):
+    # Dummy implementation of a metric computation
+    # Replace this with actual metric computation logic
+    tp = len(set(gt_spans) & set(pred_spans))
+    fp = len(set(pred_spans) - set(gt_spans))
+    fn = len(set(gt_spans) - set(pred_spans))
+    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
+    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
+    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
+
+    return {"precision": precision, "recall": recall, "f1_score": f1_score}
+
+def create_app():
+    with gr.Blocks() as demo:
+        # Input components
+        text_input = gr.Textbox(label="Input Text")
+        highlight_input = gr.Textbox(label="Highlight Text and Press Add")
+
+        gt_spans_state = gr.State([])
+        pred_spans_state = gr.State([])
+
+        # Buttons for ground truth and prediction
+        add_gt_button = gr.Button("Add to Ground Truth")
+        add_pred_button = gr.Button("Add to Predictions")
+
+        # Outputs for highlighted spans
+        gt_output = gr.HighlightedText(label="Ground Truth Spans")
+        pred_output = gr.HighlightedText(label="Predicted Spans")
+
+        # Compute metrics button and its output
+        compute_button = gr.Button("Compute Metrics")
+        metrics_output = gr.JSON(label="Metrics")
+
+        # Function to update spans
+        def update_spans(text, span, gt_spans, pred_spans, is_gt):
+            start_idx = text.find(span)
+            end_idx = start_idx + len(span)
+            new_span = (start_idx, end_idx)
+            if is_gt:
+                gt_spans.append(new_span)
+                gt_spans = list(set(gt_spans))
+            else:
+                pred_spans.append(new_span)
+                pred_spans = list(set(pred_spans))
+            return gt_spans, pred_spans, highlight_spans(text, gt_spans), highlight_spans(text, pred_spans)
+
+        # Function to highlight spans
+        def highlight_spans(text, spans):
+            span_dict = {}
+            for span in spans:
+                span_dict[(span[0], span[1])] = "highlight"
+            return span_dict
+
+        # Event handlers for buttons
+        add_gt_button.click(fn=update_spans, inputs=[text_input, highlight_input, gt_spans_state, pred_spans_state, gr.State(True)], outputs=[gt_spans_state, pred_spans_state, gt_output, pred_output])
+        add_pred_button.click(fn=update_spans, inputs=[text_input, highlight_input, gt_spans_state, pred_spans_state, gr.State(False)], outputs=[gt_spans_state, pred_spans_state, gt_output, pred_output])
+
+        # Function to compute metrics
+        def on_compute_metrics(gt_spans, pred_spans):
+            metrics = compute_metrics(gt_spans, pred_spans)
+            return metrics
+
+        compute_button.click(fn=on_compute_metrics, inputs=[gt_spans_state, pred_spans_state], outputs=metrics_output)
+
+        # Layout arrangement
+        text_input.change(fn=lambda x: x, inputs=text_input, outputs=[gt_output, pred_output])
+
+    return demo
+
+# Run the app
+demo = create_app()
+demo.launch()
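As a quick, hand-worked illustration of the set arithmetic used in `compute_metrics` above (this snippet just restates the same counting so it runs standalone; the span offsets are made up for the example):

```python
# Exact-match span counting, as in compute_metrics() in eval_metrics_app.py.
gt_spans = [(33, 52), (73, 84)]   # ground-truth (start, end) offsets for two entities
pred_spans = [(33, 52), (0, 3)]   # one exact match, one spurious prediction

tp = len(set(gt_spans) & set(pred_spans))   # 1
fp = len(set(pred_spans) - set(gt_spans))   # 1
fn = len(set(gt_spans) - set(pred_spans))   # 1

precision = tp / (tp + fp)                            # 0.5
recall = tp / (tp + fn)                               # 0.5
f1 = 2 * precision * recall / (precision + recall)    # 0.5
print(precision, recall, f1)
```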
medic-harness-requests/.gitattributes
ADDED
@@ -0,0 +1,58 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.lz4 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Audio files - uncompressed
+*.pcm filter=lfs diff=lfs merge=lfs -text
+*.sam filter=lfs diff=lfs merge=lfs -text
+*.raw filter=lfs diff=lfs merge=lfs -text
+# Audio files - compressed
+*.aac filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+# Image files - uncompressed
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+# Image files - compressed
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
+# Video files - compressed
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.webm filter=lfs diff=lfs merge=lfs -text
medic-harness-results/.gitattributes
ADDED
@@ -0,0 +1,58 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.lz4 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Audio files - uncompressed
+*.pcm filter=lfs diff=lfs merge=lfs -text
+*.sam filter=lfs diff=lfs merge=lfs -text
+*.raw filter=lfs diff=lfs merge=lfs -text
+# Audio files - compressed
+*.aac filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+# Image files - uncompressed
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+# Image files - compressed
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
+# Video files - compressed
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.webm filter=lfs diff=lfs merge=lfs -text
medic-harness-results/aaditya/Llama3-OpenBioLLM-70B/results_2024-07-24T15:26:36Z.json
ADDED
@@ -0,0 +1,37 @@
+{
+    "config": {
+        "model_name": "aaditya/Llama3-OpenBioLLM-70B",
+        "revision": "main",
+        "submitted_time": "2024-07-24 14:33:56+00:00",
+        "model_type": "domain-specific",
+        "num_params": 70000000000,
+        "private": false,
+        "evaluated_time": "2024-07-24T15:26:36Z"
+    },
+    "results": {
+        "MMLU": {
+            "accuracy": 90.4
+        },
+        "MMLU-Pro": {
+            "accuracy": 64.2
+        },
+        "MedMCQA": {
+            "accuracy": 73.2
+        },
+        "MedQA": {
+            "accuracy": 76.9
+        },
+        "USMLE": {
+            "accuracy": 79.0
+        },
+        "PubMedQA": {
+            "accuracy": 73.2
+        },
+        "ToxiGen": {
+            "accuracy": 91.3
+        },
+        "Average": {
+            "accuracy": 78.3
+        }
+    }
+}
medic-harness-results/meta-llama/Llama-3.1-8B-Instruct/results_2024-07-24T15:26:36Z.json
ADDED
@@ -0,0 +1,39 @@
+{
+    "config": {
+        "model_name": "meta-llama/Llama-3.1-8B-Instruct",
+        "revision": "main",
+        "submitted_time": "2024-07-24 14:33:56+00:00",
+        "model_type": "instruct-tuned",
+        "num_params": 8000000000,
+        "private": false,
+        "evaluated_time": "2024-07-24T15:26:36Z"
+    },
+    "results": {
+        "MMLU": {
+            "accuracy": 73.4
+        },
+        "MMLU-Pro": {
+            "accuracy": 49.9
+        },
+        "MedMCQA": {
+            "accuracy": 58.4
+        },
+        "MedQA": {
+            "accuracy": 62.0
+        },
+        "USMLE": {
+            "accuracy": 68.2
+        },
+        "PubMedQA": {
+            "accuracy": 76.2
+        },
+        "ToxiGen": {
+            "accuracy": 82.3
+        },
+        "Average": {
+            "accuracy": 67.2
+        }
+    }
+}
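The two result files above share one schema: a `config` block plus per-benchmark `accuracy` scores. As a sketch of how such a file can be consumed — illustrative only, not the repository's actual `read_evals.py` logic — the average can be recomputed directly; for both files above it matches the stored `Average` value:

```python
import json

# One of the result files added in this commit.
path = "medic-harness-results/aaditya/Llama3-OpenBioLLM-70B/results_2024-07-24T15:26:36Z.json"

with open(path) as f:
    data = json.load(f)

# Per-benchmark accuracies, excluding the precomputed "Average" entry.
scores = {task: v["accuracy"] for task, v in data["results"].items() if task != "Average"}
recomputed = sum(scores.values()) / len(scores)

print(data["config"]["model_name"], round(recomputed, 1), data["results"]["Average"]["accuracy"])
```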
requirements.txt
CHANGED
@@ -1,16 +1,18 @@
APScheduler
black
+click
datasets
gradio
-gradio[oauth]
-gradio_leaderboard==0.0.9
gradio_client
huggingface-hub>=0.18.0
matplotlib
numpy
pandas
python-dateutil
+requests
tqdm
transformers
tokenizers>=0.15.0
+git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
+accelerate
sentencepiece
src/about.py
CHANGED
@@ -1,72 +1,267 @@
|
|
1 |
from dataclasses import dataclass
|
2 |
from enum import Enum
|
3 |
|
|
|
4 |
@dataclass
|
5 |
-
class
|
6 |
benchmark: str
|
7 |
metric: str
|
8 |
col_name: str
|
9 |
-
|
10 |
|
11 |
# Select your tasks here
|
12 |
# ---------------------------------------------------
|
13 |
-
class
|
14 |
-
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
15 |
-
task0 = Task("anli_r1", "acc", "ANLI")
|
16 |
-
task1 = Task("logiqa", "acc_norm", "LogiQA")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
-
NUM_FEWSHOT = 0 # Change with your few shot
|
19 |
-
# ---------------------------------------------------
|
20 |
|
|
|
|
|
21 |
|
22 |
|
23 |
# Your leaderboard name
|
24 |
-
TITLE = """
|
|
|
|
|
25 |
|
26 |
# What does your leaderboard evaluate?
|
27 |
INTRODUCTION_TEXT = """
|
28 |
-
|
29 |
"""
|
30 |
|
31 |
# Which evaluations are you running? how can people reproduce what you have?
|
32 |
-
|
33 |
-
|
|
|
34 |
|
35 |
-
|
36 |
-
To reproduce our results, here is the commands you can run:
|
37 |
|
|
|
|
|
|
|
|
|
38 |
"""
|
|
|
|
|
|
|
|
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
```
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
-
|
53 |
-
|
|
|
|
|
54 |
|
55 |
-
|
56 |
-
|
|
|
57 |
|
58 |
-
|
59 |
-
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
|
60 |
|
61 |
-
### 4) Fill up your model card
|
62 |
-
When we add extra information about models to the leaderboard, it will be automatically taken from the model card
|
63 |
|
64 |
-
|
65 |
-
If your model is displayed in the `FAILED` category, its execution stopped.
|
66 |
-
Make sure you have followed the above steps first.
|
67 |
-
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
68 |
"""
|
69 |
|
70 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
71 |
CITATION_BUTTON_TEXT = r"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
"""
|
|
|
1 |
from dataclasses import dataclass
|
2 |
from enum import Enum
|
3 |
|
4 |
+
|
5 |
@dataclass
|
6 |
+
class HarnessTask:
|
7 |
benchmark: str
|
8 |
metric: str
|
9 |
col_name: str
|
10 |
+
|
11 |
|
12 |
# Select your tasks here
|
13 |
# ---------------------------------------------------
|
14 |
+
class HarnessTasks(Enum):
|
15 |
+
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
16 |
+
# task0 = Task("anli_r1", "acc", "ANLI")
|
17 |
+
# task1 = Task("logiqa", "acc_norm", "LogiQA")
|
18 |
+
task0 = HarnessTask("MMLU", "accuracy", "MMLU")
|
19 |
+
task1 = HarnessTask("MMLU-Pro", "accuracy", "MMLU-Pro")
|
20 |
+
task2 = HarnessTask("MedMCQA", "accuracy", "MedMCQA")
|
21 |
+
task3 = HarnessTask("MedQA", "accuracy", "MedQA")
|
22 |
+
task4 = HarnessTask("USMLE", "accuracy", "USMLE")
|
23 |
+
task5 = HarnessTask("PubMedQA", "accuracy", "PubMedQA")
|
24 |
+
task6 = HarnessTask("ToxiGen", "accuracy", "ToxiGen")
|
25 |
+
task7 = HarnessTask("Average", "accuracy", "Harness-Average")
|
26 |
+
# task5 = Task("", "f1", "")
|
27 |
+
# task6 = Task("", "f1", "")
|
28 |
+
|
29 |
+
@dataclass
|
30 |
+
class ClinicalType:
|
31 |
+
benchmark: str
|
32 |
+
metric: str
|
33 |
+
col_name: str
|
34 |
+
|
35 |
+
class ClinicalTypes(Enum):
|
36 |
+
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
37 |
+
type0 = ClinicalType("condition", "f1", "CONDITION")
|
38 |
+
type1 = ClinicalType("measurement", "f1", "MEASUREMENT")
|
39 |
+
type2 = ClinicalType("drug", "f1", "DRUG")
|
40 |
+
type3 = ClinicalType("procedure", "f1", "PROCEDURE")
|
41 |
+
type4 = ClinicalType("gene", "f1", "GENE")
|
42 |
+
type5 = ClinicalType("gene variant", "f1", "GENE VARIANT")
|
43 |
|
|
|
|
|
44 |
|
45 |
+
NUM_FEWSHOT = 0 # Change with your few shot
|
46 |
+
# ---------------------------------------------------
|
47 |
|
48 |
|
49 |
# Your leaderboard name
|
50 |
+
TITLE = """""" #<h1 align="center" id="space-title"> NER Leaderboard</h1>"""
|
51 |
+
LOGO = """<img src="https://equalengineers.com/wp-content/uploads/2024/04/dummy-logo-5b.png" alt="Clinical X HF" width="500" height="333">"""
|
52 |
+
# LOGO = """<img src="https://huggingface.co/spaces/m42-health/clinical_ner_leaderboard/resolve/main/assets/image.png" alt="Clinical X HF" width="500" height="333">"""
|
53 |
|
54 |
# What does your leaderboard evaluate?
|
55 |
INTRODUCTION_TEXT = """
|
56 |
+
The rapid development of Large Language Models (LLMs) for healthcare applications has spurred calls for holistic evaluation beyond frequently-cited benchmarks like USMLE, to better reflect real-world performance. While real-world assessments are valuable indicators of utility, they often lag behind the pace of LLM evolution, likely rendering findings obsolete upon deployment. This temporal disconnect necessitates a comprehensive upfront evaluation that can guide model selection for specific clinical applications. We introduce MEDIC, a framework assessing LLMs across five critical dimensions of clinical competence: medical reasoning, ethics and bias, data and language understanding, in-context learning, and clinical safety. MEDIC features a novel cross-examination framework quantifying LLM performance across areas like coverage and hallucination detection, without requiring reference outputs. We apply MEDIC to evaluate LLMs on medical question-answering, safety, summarization, note generation, and other tasks. Our results show performance disparities across model sizes, baseline vs medically finetuned models, and have implications on model selection for applications requiring specific model strengths, such as low hallucination or lower cost of inference. MEDIC's multifaceted evaluation reveals these performance trade-offs, bridging the gap between theoretical capabilities and practical implementation in healthcare settings, ensuring that the most promising models are identified and adapted for diverse healthcare applications.
|
57 |
"""
|
58 |
|
59 |
# Which evaluations are you running? how can people reproduce what you have?
|
60 |
+
LLM_BENCHMARKS_TEXT_1 = f"""
|
61 |
+
|
62 |
+
## About
|
63 |
|
64 |
+
The Named Clinical Entity Recognition Leaderboard is aimed at advancing the field of natural language processing in healthcare. It provides a standardized platform for evaluating and comparing the performance of various language models in recognizing named clinical entities, a critical task for applications such as clinical documentation, decision support, and information extraction. By fostering transparency and facilitating benchmarking, the leaderboard's goal is to drive innovation and improvement in NLP models. It also helps researchers identify the strengths and weaknesses of different approaches, ultimately contributing to the development of more accurate and reliable tools for clinical use. Despite its exploratory nature, the leaderboard aims to play a role in guiding research and ensuring that advancements are grounded in rigorous and comprehensive evaluations.
|
|
|
65 |
|
66 |
+
## Evaluation method and metrics
|
67 |
+
When training a Named Entity Recognition (NER) system, the most common evaluation methods involve measuring precision, recall, and F1-score at the token level. While these metrics are useful for fine-tuning the NER system, evaluating the predicted named entities for downstream tasks requires metrics at the full named-entity level. We include both evaluation methods: token-based and span-based. We provide an example below which helps in understanding the difference between the methods.
|
68 |
+
Example Sentence: "The patient was diagnosed with a skin cancer disease."
|
69 |
+
For simplicity, let's assume the an example sentence which contains 10 tokens, with a single two-token disease entity (as shown in the figure below).
|
70 |
"""
|
71 |
+
EVALUATION_EXAMPLE_IMG = """<img src="https://huggingface.co/spaces/m42-health/clinical_ner_leaderboard/resolve/main/assets/ner_evaluation_example.png" alt="Clinical X HF" width="750" height="500">"""
|
72 |
+
LLM_BENCHMARKS_TEXT_2 = """
|
73 |
+
Token-based evaluation involves obtaining the set of token labels (ground-truth annotations) for the annotated entities and the set of token predictions, comparing these sets, and computing a classification report. Hence, the results for the example above are shown below.
|
74 |
+
**Token-based metrics:**
|
75 |
|
76 |
+
|
77 |
+
|
78 |
+
| Model | TP | FP | FN | Precision | Recall | F1-Score |
|
79 |
+
| ------- | --- | --- | --- | --------- | ------ | -------- |
|
80 |
+
| Model D | 0 | 1 | 0 | 0.00 | 0.00 | 0.00 |
|
81 |
+
| Model C | 1 | 1 | 1 | 0.50 | 0.50 | 0.50 |
|
82 |
+
| Model B | 2 | 2 | 0 | 0.50 | 1.00 | 0.67 |
|
83 |
+
| Model A | 2 | 1 | 0 | 0.67 | 1.00 | 0.80 |
|
84 |
+
|
85 |
+
|
86 |
+
Where,
|
87 |
+
$$ Precision = TP / (TP + FP)$$
|
88 |
+
$$ Recall = TP / (TP + FN)$$
|
89 |
+
$$ f1score = 2 * (Prec * Rec) / (Prec + Rec)$$
|
90 |
+
|
91 |
+
|
92 |
+
|
93 |
+
With this token-based approach, we have a broad idea of the performance of the model at the token level. However, it may misrepresent the performance at the entity level when the entity includes more than 1 token (which may be more relevant for certain applications). In addition, depending on the annotations of certain datasets, we may not want to penalize a model for a "partial" match with a certain entity.
|
94 |
+
The span-based method attempts to address some of these issues, by determining the full or partial matches at the entity level to classify the predictions as correct, incorrect, missed and spurious. These are then used to calculate precision, recall and F1-score. Given this, for the example below.
|
95 |
+
|
96 |
+
**Span-based metrics:**
|
97 |
+
|
98 |
+
|
99 |
+
| Model | Correct | Incorrect | Missed | Spurious | Precision | Recall | F1-Score |
|
100 |
+
| ------- | ------- | --------- | ------ | -------- | --------- | ------ | -------- |
|
101 |
+
| Model A | 1 | 0 | 0 | 0 | 1.00 | 1.00 | 1.00 |
|
102 |
+
| Model B | 1 | 0 | 0 | 0 | 1.00 | 1.00 | 1.00 |
|
103 |
+
| Model C | 1 | 0 | 0 | 0 | 1.00 | 1.00 | 1.00 |
|
104 |
+
| Model D | 0 | 0 | 1 | 1 | 0.00 | 0.00 | 0.00 |
|
105 |
+
|
106 |
+
|
107 |
+
Where,
|
108 |
+
$$ Precision = COR / (COR + INC + SPU)$$
|
109 |
+
$$ Recall = COR / (COR + INC + MIS)$$
|
110 |
+
$$ f1score = 2 * (Prec * Rec) / (Prec + Rec)$$
|
111 |
+
|
112 |
+
Note:
|
113 |
+
1. Span-based approach here is equivalent to the 'Span Based Evaluation with Partial Overlap' in [NER Metrics Showdown!](https://huggingface.co/spaces/wadood/ner_evaluation_metrics) and is equivalent to Partial Match ("Type") in the nervaluate python package.
|
114 |
+
2. Token-based approach here is equivalent to the 'Token Based Evaluation With Macro Average' in [NER Metrics Showdown!](https://huggingface.co/spaces/wadood/ner_evaluation_metrics)
|
115 |
+
|
116 |
+
Additional examples can be tested on the [NER Metrics Showdown!](https://huggingface.co/spaces/wadood/ner_evaluation_metrics) huggingface space.
|
117 |
+
|
118 |
+
## Datasets
|
119 |
+
The following datasets (test splits only) have been included in the evaluation.
|
120 |
+
|
121 |
+
### [NCBI Disease](https://huggingface.co/datasets/m42-health/clinical_ncbi)
|
122 |
+
The NCBI Disease corpus includes mention and concept level annotations on PubMed abstracts. It covers annotations of diseases.
|
123 |
+
|
124 |
+
| | Counts |
|
125 |
+
| ---------- | ------ |
|
126 |
+
| Samples | 100 |
|
127 |
+
| Annotation | 960 |
|
128 |
+
|
129 |
+
|
130 |
+
### [CHIA](https://huggingface.co/datasets/m42-health/clinical_chia)
|
131 |
+
This is a large, annotated corpus of patient eligibility criteria extracted from registered clinical trials (ClinicalTrials.gov). Annotations cover 15 different entity types, including conditions, drugs, procedures, and measurements.
|
132 |
+
|
133 |
+
|
134 |
+
| | Counts |
|
135 |
+
| ---------- | ------ |
|
136 |
+
| Samples | 194 |
|
137 |
+
| Annotation | 3981 |
|
138 |
+
|
139 |
+
|
140 |
+
### [BC5CDR](https://huggingface.co/datasets/m42-health/clinical_bc5cdr)
|
141 |
+
The BC5CDR corpus consists of 1500 PubMed articles with annotated chemicals and diseases.
|
142 |
+
|
143 |
+
|
144 |
+
| | Counts |
|
145 |
+
| ---------- | ------ |
|
146 |
+
| Samples | 500 |
|
147 |
+
| Annotation | 9928 |
|
148 |
+
|
149 |
+
|
150 |
+
### [BIORED](https://huggingface.co/datasets/m42-health/clinical_biored)
|
151 |
+
The BIORED corpus includes a set of PubMed abstracts with annotations of multiple entity types (e.g., gene/protein, disease, chemical).
|
152 |
+
|
153 |
+
|
154 |
+
| | Counts |
|
155 |
+
| ---------- | ------ |
|
156 |
+
| Samples | 100 |
|
157 |
+
| Annotation | 3535 |
|
158 |
+
|
159 |
+
|
160 |
+
Datasets summary
|
161 |
+
|
162 |
+
A summary of the datasets used are summarized here.
|
163 |
+
|
164 |
+
|
165 |
+
| Dataset | # samples | # annotations | # original entities | # clinical entities |
|
166 |
+
| ------- | --------- | ------------- | ------------------- | ------------------- |
|
167 |
+
| NCBI | 100 | 960 | 4 | 1 |
|
168 |
+
| CHIA | 194 | 3981 | 16 | 4 |
|
169 |
+
| BIORED | 500 | 9928 | 2 | 4 |
|
170 |
+
| BC5CDR | 100 | 3535 | 6 | 2 |
|
171 |
+
|
172 |
+
|
173 |
+
## Clinical Entity Types
|
174 |
+
|
175 |
+
The above datasets are modified to cater to the clinical setting. For this, the entity types that are clinically relevant are retained and the rest are dropped. Further, the clinical entity type is standardized across the dataset to obtain a total of 6 clinical entity types shown below.
|
176 |
+
|
177 |
+
|
178 |
+
| Clinical Entity | Combined Annotation |
|
179 |
+
| --------------- | ------------------- |
|
180 |
+
| Condition | 7514 |
|
181 |
+
| Drug | 6443 |
|
182 |
+
| Procedure | 300 |
|
183 |
+
| Measurement | 258 |
|
184 |
+
| Gene | 1180 |
|
185 |
+
| Gene Variant | 241 |
|
186 |
+
|
187 |
+
|
188 |
+
"""
|
189 |
+
|
190 |
+
ENTITY_DISTRIBUTION_IMG = """<img src="file/assets/entity_distribution.png" alt="Clinical X HF" width="750" height="500">"""
|
191 |
+
LLM_BENCHMARKS_TEXT_3="""
|
192 |
+
## Decoder Model Evaluation
|
193 |
+
Evaluating encoder models, such as BERT, for token classification tasks (e.g., NER) is straightforward given that these models process the entire input sequence simultaneously. This allows them to generate token-level classifications by leveraging bidirectional context, facilitating a direct comparison of predicted tags against the gold standard labels for each token in the input sequence.
|
194 |
+
|
195 |
+
In contrast, decoder-only models, like GPT, generate responses sequentially, predicting one token at a time based on the preceding context. Evaluating the performance of these models for token classification tasks requires a different approach. First, we prompt the decoder-only LLM with a specific task of tagging the different entity types within a given text. This task is clearly defined to the model, ensuring it understands which types of entities to identify (i.e., conditions, drugs, procedures, etc).
|
196 |
+
An example of the task prompt is shown below.
|
197 |
+
```
|
198 |
+
## Instruction
|
199 |
+
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: symptom, disorder. Use HTML <span > tags to highlight these entities. Each <span > should have a class attribute indicating the type of the entity. Do NOT provide further examples and just consider the input provided below. Do NOT provide an explanation nor notes about the reasoning. Do NOT reformat nor summarize the input text. Follow the instruction and the format of the example below.
|
200 |
+
|
201 |
+
## Entity markup guide
|
202 |
+
Use <span class='symptom' > to denote a symptom.
|
203 |
+
Use <span class='disorder' > to denote a disorder.
|
204 |
```
|
205 |
+
To ensure deterministic and consistent outputs, the temperature for generation is kept at 0.0. The model then generates a sequential response that includes the tagged entities, as shown in the example below.
|
206 |
+
```
|
207 |
+
## Input:
|
208 |
+
He had been diagnosed with osteoarthritis of the knees and had undergone arthroscopy years prior to admission.
|
209 |
+
## Output:
|
210 |
+
He had been diagnosed with <span class="disease" >osteoarthritis of the knees</span >and had undergone <span class="procedure" >arthroscopy</span >years prior to admission.
|
211 |
+
```
|
212 |
+
|
213 |
+
After the tagged output is generated, it is parsed to extract the tagged entities. The parsed data are then compared against the gold standard labels, and performance metrics are computed as above. This evaluation method ensures a consistent and objective assessment of decoder-only LLM's performance in NER tasks, despite the differences in their architecture compared to encoder models.
|
214 |
+
|
215 |
+
+# Reproducibility
+To reproduce our results, follow the steps detailed [here](https://github.com/WadoodAbdul/clinical_ner_benchmark/blob/master/docs/reproducing_results.md).
+
+# Disclaimer and Advisory
+The Leaderboard is maintained by the authors and affiliated entity as part of our ongoing contribution to open research in the field of NLP in healthcare. The leaderboard is intended for academic and exploratory purposes only. The language models evaluated on this platform (to the best knowledge of the authors) have not been approved for clinical use, and their performance should not be interpreted as clinically validated or suitable for real-world medical applications.
+
+Users are advised to approach the results with an understanding of the inherent limitations and the experimental nature of this evaluation. The authors and affiliated entity do not endorse any specific model or approach, and the leaderboard is provided without any warranties or guarantees. Researchers and practitioners are encouraged to use the leaderboard as a resource to guide further research and development, keeping in mind the necessity for rigorous testing and validation in clinical settings before any practical application.
+
+"""

+EVALUATION_QUEUE_TEXT = """
+Currently, the benchmark supports evaluation for models hosted on the Hugging Face Hub that are of the encoder, decoder, or GLiNER type.
+If your model needs a custom implementation, follow the steps outlined in the [clinical_ner_benchmark](https://github.com/WadoodAbdul/clinical_ner_benchmark/blob/e66eb566f34e33c4b6c3e5258ac85aba42ec7894/docs/custom_model_implementation.md) repo or reach out to our team!
+
+### Fields Explanation
+
+#### Model Type:
+- Fine-Tuned: If the training data consisted of any split/variation of the datasets on the leaderboard.
+- Zero-Shot: If the model did not have any exposure to the datasets on the leaderboard while training.

+#### Model Architecture:
+- Encoder: The standard transformer encoder architecture with a token classification head on top.
+- Decoder: Transformer based autoregressive token generation model.
+- GLiNER: Architecture outlined in the [GLiNER Paper](https://arxiv.org/abs/2311.08526)

+#### Label Normalization Map:
+Not all models have been tuned to output the NER label names used in the clinical datasets on this leaderboard; some models emit a synonym of the expected entity name.
+The normalization map is used to ensure that the model's outputs are aligned with the labels expected in the datasets; an illustrative map is sketched after the note below.

+Note: Multiple model labels can be mapped to a single entity type in the leaderboard dataset. Ex: 'synonym' and 'disease' to 'condition'
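For illustration, a normalization map for the four leaderboard datasets might look like the sketch below; the dataset keys and the target entity types come from this page, while the model-side labels on the left are invented for the example.

```python
# Illustrative only: for each dataset, map the labels a model emits to the
# leaderboard's clinical entity types (condition, drug, procedure, measurement,
# gene, gene variant).
label_normalization_map = {
    "NCBI":   {"disease": "condition"},
    "BC5CDR": {"disease": "condition", "chemical": "drug"},
    "CHIA":   {"disease": "condition", "medication": "drug",
               "procedure": "procedure", "measurement": "measurement"},
    "BIORED": {"disease": "condition", "chemical": "drug",
               "gene": "gene", "variant": "gene variant"},
}
```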
+Upon successful submission of your request, your model's results will be updated on the leaderboard within 5 working days!
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
+@misc{abdul2024namedclinicalentityrecognition,
+      title={Named Clinical Entity Recognition Benchmark},
+      author={Wadood M Abdul and Marco AF Pimentel and Muhammad Umar Salman and Tathagata Raha and Clément Christophe and Praveen K Kanithi and Nasir Hayat and Ronnie Rajan and Shadab Khan},
+      year={2024},
+      eprint={2410.05046},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2410.05046},
+}
+
"""
src/display/css_html_js.py
CHANGED
@@ -1,4 +1,11 @@
custom_css = """
+.logo {
+    width: 500px;
+    height: auto;
+    margin: 0 auto;
+    max-width: 100%;
+    object-fit: contain;
+}

.markdown-text {
    font-size: 16px !important;
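A hedged sketch of how this stylesheet and the new `.logo` class are typically wired into the Gradio app; the image path below is illustrative.

```python
import gradio as gr

from src.display.css_html_js import custom_css

# Minimal usage sketch: pass the CSS to gr.Blocks and tag the header image with
# the .logo class so the width/centering rules above apply to it.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML('<img class="logo" src="file/assets/image.png" alt="Clinical NER Leaderboard">')

if __name__ == "__main__":
    demo.launch()
```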
src/display/utils.py
CHANGED
@@ -3,7 +3,9 @@ from enum import Enum

import pandas as pd

+from src.about import HarnessTasks
+from src.about import ClinicalTypes
+
def fields(raw_class):
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -19,53 +21,64 @@ class ColumnContent:
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
+    dataset_task_col: bool = False
+    clinical_type_col: bool = False

## Leaderboard columns
auto_eval_column_dict = []
# Init
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+# Scores
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
+for task in HarnessTasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True)])
# Model information
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["backbone", ColumnContent, ColumnContent("Base Model", "str", False)])
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False, True)])
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False, True)])
+auto_eval_column_dict.append(
+    ["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, True)]
+)
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])

# We use make dataclass to dynamically fill the scores from Tasks
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

+
## For the queue columns in the submission tab
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
+    architecture = ColumnContent("model_architecture", "bool", True)
+    # precision = ColumnContent("precision", "str", True)
+    # weight_type = ColumnContent("weight_type", "str", "Original")
    status = ColumnContent("status", "str", True)

+
## All the model information that we might need
@dataclass
class ModelDetails:
    name: str
    display_name: str = ""
+    symbol: str = ""  # emoji


class ModelType(Enum):
+    ZEROSHOT = ModelDetails(name="zero-shot", symbol="⚫")
+    FINETUNED = ModelDetails(name="fine-tuned", symbol="⚪")
    PT = ModelDetails(name="pretrained", symbol="🟢")
    FT = ModelDetails(name="fine-tuned", symbol="🔶")
+    # IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
+    # RL = ModelDetails(name="RL-tuned", symbol="🟦")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
@@ -73,24 +86,55 @@ class ModelType(Enum):

    @staticmethod
    def from_str(type):
+        if "zero-shot" in type or "⚫" in type:
+            return ModelType.ZEROSHOT
+        if "fine-tuned" in type or "⚪" in type:
+            return ModelType.FINETUNED
+        # if "fine-tuned" in type or "🔶" in type:
+        #     return ModelType.FT
+        # if "pretrained" in type or "🟢" in type:
+        #     return ModelType.PT
+        # if "RL-tuned" in type or "🟦" in type:
+        #     return ModelType.RL
+        # if "instruction-tuned" in type or "⭕" in type:
+        #     return ModelType.IFT
        return ModelType.Unknown

+class ModelArch(Enum):
+    Encoder = ModelDetails("Encoder")
+    Decoder = ModelDetails("Decoder")
+    GLiNEREncoder = ModelDetails("GLiNER Encoder")
+    Unknown = ModelDetails(name="Other", symbol="?")
+
+    def to_str(self, separator=" "):
+        return f"{self.value.name}"
+
+    @staticmethod
+    def from_str(type):
+        if "Encoder" == type:
+            return ModelArch.Encoder
+        if "Decoder" == type:
+            return ModelArch.Decoder
+        if "GLiNER Encoder" == type:
+            return ModelArch.GLiNEREncoder
+        # if "unknown" in type:
+        #     return ModelArch.Unknown
+        return ModelArch.Unknown
+
+
class WeightType(Enum):
    Adapter = ModelDetails("Adapter")
    Original = ModelDetails("Original")
    Delta = ModelDetails("Delta")

+
class Precision(Enum):
    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
+    float32 = ModelDetails("float32")
+    # qt_8bit = ModelDetails("8bit")
+    # qt_4bit = ModelDetails("4bit")
+    # qt_GPTQ = ModelDetails("GPTQ")
    Unknown = ModelDetails("?")

    def from_str(precision):
@@ -98,13 +142,49 @@ class Precision(Enum):
            return Precision.float16
        if precision in ["torch.bfloat16", "bfloat16"]:
            return Precision.bfloat16
+        if precision in ["float32"]:
+            return Precision.float32
+        # if precision in ["8bit"]:
+        #     return Precision.qt_8bit
+        # if precision in ["4bit"]:
+        #     return Precision.qt_4bit
+        # if precision in ["GPTQ", "None"]:
+        #     return Precision.qt_GPTQ
        return Precision.Unknown

+
+class PromptTemplateName(Enum):
+    UniversalNERTemplate = "universal_ner"
+    LLMHTMLHighlightedSpansTemplate = "llm_html_highlighted_spans"
+    LLMHTMLHighlightedSpansTemplateV1 = "llm_html_highlighted_spans_v1"
+    LLamaNERTemplate = "llama_70B_ner"
+    # MixtralNERTemplate = "mixtral_ner_v0.3"
+
+class EvaluationMetrics(Enum):
+    SpanBased = "Span Based"
+    TokenBased = "Token Based"
+
+
# Column selection
+DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.clinical_type_col]
+Clinical_TYPES_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col]
+TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
+COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]

EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

+DATASET_BENCHMARK_COLS = [t.value.col_name for t in HarnessTasks]
+TYPES_BENCHMARK_COLS = [t.value.col_name for t in ClinicalTypes]
+
+NUMERIC_INTERVALS = {
+    "?": pd.Interval(-1, 0, closed="right"),
+    "~1.5": pd.Interval(0, 2, closed="right"),
+    "~3": pd.Interval(2, 4, closed="right"),
+    "~7": pd.Interval(4, 9, closed="right"),
+    "~13": pd.Interval(9, 20, closed="right"),
+    "~35": pd.Interval(20, 45, closed="right"),
+    "~60": pd.Interval(45, 70, closed="right"),
+    "70+": pd.Interval(70, 10000, closed="right"),
+}
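The intervals above drive the parameter-count filter in the UI. The helper below is an illustrative sketch, not the app's actual filtering code, of how a size in billions falls into one of these buckets, with 0 reserved for unknown sizes.

```python
from src.display.utils import NUMERIC_INTERVALS

def size_bucket(num_params_billions: float) -> str:
    # Illustrative helper: unknown sizes are stored as 0 and land in the "?" bucket (-1, 0].
    for label, interval in NUMERIC_INTERVALS.items():
        if num_params_billions in interval:
            return label
    return "?"

print(size_bucket(0))     # "?"  (unknown)
print(size_bucket(7.24))  # "~7"
print(size_bucket(70.6))  # "70+"
```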
src/envs.py
CHANGED
@@ -4,22 +4,22 @@ from huggingface_hub import HfApi

# Info to change for your repository
# ----------------------------------
+TOKEN = os.environ.get("TOKEN")  # A read/write token for your org

+OWNER = "m42-health"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
# ----------------------------------

+REPO_ID = f"{OWNER}/MEDIC-Benchmark"
+QUEUE_REPO = f"{OWNER}/medic-harness-requests"
+RESULTS_REPO = f"{OWNER}/medic-harness-results"

# If you setup a cache later, just change HF_HOME
+CACHE_PATH = os.getenv("HF_HOME", ".")

# Local caches
+EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "medic-harness-requests")
+EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "medic-harness-results")
+EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "medic-harness-requests-bk")
+EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "medic-harness-results-bk")

API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py
CHANGED
@@ -8,40 +8,48 @@ import dateutil
import numpy as np

from src.display.formatting import make_clickable_model
+from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, ClinicalTypes
from src.submission.check_validity import is_model_on_hub


@dataclass
class EvalResult:
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
    model: str
+    revision: str  # commit hash, "" if main
+    dataset_results: dict
+    # clinical_type_results: dict
    precision: Precision = Precision.Unknown
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
+    backbone: str = "Unknown"
    license: str = "?"
    likes: int = 0
    num_params: int = 0
+    date: str = ""  # submission date of request file
    still_on_hub: bool = False
+    display_result: bool = True

    @classmethod
+    def init_from_json_file(self, json_filepath, evaluation_metric):
        """Inits the result from the specific model result file"""
        with open(json_filepath) as fp:
            data = json.load(fp)
+
        config = data.get("config")

        # Precision
        precision = Precision.from_str(config.get("model_dtype"))
+        model_type = ModelType.from_str(config.get("model_type", ""))
+        license = config.get("license", "?")
+        num_params = config.get("num_params", "?")
+        display_result = config.get("display_result", True)
+        display_result = False if display_result == "False" else True

        # Get model and org
        org_and_model = config.get("model_name", config.get("model_args", None))
@@ -58,17 +66,17 @@ class EvalResult:
        full_model = "/".join(org_and_model)

        still_on_hub, _, model_config = is_model_on_hub(
+            full_model, config.get("revision", "main"), trust_remote_code=True, test_tokenizer=False
        )
+        backbone = "?"
        if model_config is not None:
+            backbones = getattr(model_config, "architectures", None)
+            if backbones:
+                backbone = ";".join(backbones)

        # Extract results available in this file (some results are split in several files)
+        dataset_results = {}
+        for task in HarnessTasks:
            task = task.value

            # We average all scores of a given metric (not all metrics are present in all files)
@@ -76,19 +84,37 @@ class EvalResult:
            if accs.size == 0 or any([acc is None for acc in accs]):
                continue

+            mean_acc = np.mean(accs)  # * 100.0
+            dataset_results[task.benchmark] = mean_acc
+        print(dataset_results)
+        # types_results = {}
+        # for clinical_type in ClinicalTypes:
+        #     clinical_type = clinical_type.value
+
+        #     # We average all scores of a given metric (not all metrics are present in all files)
+        #     accs = np.array([v.get(clinical_type.metric, None) for k, v in data[evaluation_metric]["clinical_type_results"].items() if clinical_type.benchmark == k])
+        #     if accs.size == 0 or any([acc is None for acc in accs]):
+        #         continue
+
+        #     mean_acc = np.mean(accs)  # * 100.0
+        #     types_results[clinical_type.benchmark] = mean_acc

        return self(
            eval_name=result_key,
            full_model=full_model,
            org=org,
            model=model,
+            dataset_results=dataset_results,
+            # clinical_type_results=types_results,
+            precision=precision,
+            revision=config.get("revision", ""),
            still_on_hub=still_on_hub,
+            # architecture=model_architecture,
+            backbone=backbone,
+            model_type=model_type,
+            num_params=num_params,
+            license=license,
+            display_result=display_result
        )

    def update_with_request_file(self, requests_path):
@@ -104,32 +130,66 @@ class EvalResult:
            self.likes = request.get("likes", 0)
            self.num_params = request.get("params", 0)
            self.date = request.get("submitted_time", "")
+            # self.precision = request.get("precision", "float32")
        except Exception:
+            pass
+            # print(
+            #     f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+            # )
+            # print(f" Args used were - {request_file=}, {requests_path=}, {self.full_model=},")

+    def to_dict(self, subset):
        """Converts the Eval Result to a dict compatible with our dataframe display"""
+        if subset == "datasets":
+            average = sum([v for v in self.dataset_results.values() if v is not None]) / len(HarnessTasks)
+            data_dict = {
+                "eval_name": self.eval_name,  # not a column, just a save name,
+                AutoEvalColumn.precision.name: self.precision.value.name,
+                AutoEvalColumn.model_type.name: self.model_type.value.name,
+                AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+                AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+                # AutoEvalColumn.architecture.name: self.architecture.value.name,
+                AutoEvalColumn.backbone.name: self.backbone,
+                AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+                AutoEvalColumn.revision.name: self.revision,
+                AutoEvalColumn.average.name: average,
+                AutoEvalColumn.license.name: self.license,
+                AutoEvalColumn.likes.name: self.likes,
+                AutoEvalColumn.params.name: self.num_params,
+                AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+                "display_result": self.display_result,
+            }
+
+            for task in HarnessTasks:
+                data_dict[task.value.col_name] = self.dataset_results[task.value.benchmark]
+
+            return data_dict
+
+        if subset == "clinical_types":
+            average = sum([v for v in self.clinical_type_results.values() if v is not None]) / len(ClinicalTypes)
+            data_dict = {
+                "eval_name": self.eval_name,  # not a column, just a save name,
+                AutoEvalColumn.precision.name: self.precision.value.name,
+                AutoEvalColumn.model_type.name: self.model_type.value.name,
+                AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+                AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+                AutoEvalColumn.architecture.name: self.architecture.value.name,
+                AutoEvalColumn.backbone.name: self.backbone,
+                AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+                AutoEvalColumn.revision.name: self.revision,
+                AutoEvalColumn.average.name: average,
+                AutoEvalColumn.license.name: self.license,
+                AutoEvalColumn.likes.name: self.likes,
+                AutoEvalColumn.params.name: self.num_params,
+                AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+                "display_result": self.display_result,
+            }
+
+            for clinical_type in ClinicalTypes:
+                data_dict[clinical_type.value.col_name] = self.clinical_type_results[clinical_type.value.benchmark]
+
+            return data_dict


def get_request_file_for_model(requests_path, model_name, precision):
@@ -146,15 +206,12 @@ def get_request_file_for_model(requests_path, model_name, precision):
    for tmp_request_file in request_files:
        with open(tmp_request_file, "r") as f:
            req_content = json.load(f)
+            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
                request_file = tmp_request_file
    return request_file


+def get_raw_eval_results(results_path: str, requests_path: str, evaluation_metric: str) -> list[EvalResult]:
    """From the path of the results folder root, extract all needed info for results"""
    model_result_filepaths = []

@@ -175,20 +232,23 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        # Creation of result
+        eval_result = EvalResult.init_from_json_file(model_result_filepath, evaluation_metric)
+        # eval_result.update_with_request_file(requests_path)

        # Store results of same eval together
        eval_name = eval_result.eval_name
+        # if eval_name in eval_results.keys():
+        #     eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        # else:
        eval_results[eval_name] = eval_result

    results = []
+    # clinical_type_results = []
    for v in eval_results.values():
        try:
+            v.to_dict(subset="datasets")  # we test if the dict version is complete
+            if not v.display_result:
+                continue
            results.append(v)
        except KeyError:  # not all eval values present
            continue
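For reference, here is a hedged sketch of the `config` block that `init_from_json_file` consumes. The key names are exactly the ones read above; the values are invented, and the per-dataset scores are read from the block of the result file keyed by the chosen evaluation metric (e.g. "Span Based").

```python
# Illustrative values only; the keys mirror the config.get(...) calls above.
example_config = {
    "model_name": "m42-health/Llama3-Med42-8B",  # split into org / model
    "model_dtype": "bfloat16",                   # -> Precision.from_str
    "model_type": "zero-shot ⚫",                # -> ModelType.from_str
    "license": "llama3",
    "num_params": 8.03,
    "display_result": "True",                    # the string "False" hides the row
    "revision": "main",
}
```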
src/populate.py
CHANGED
@@ -8,18 +8,21 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
from src.leaderboard.read_evals import get_raw_eval_results


+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, evaluation_metric: str, subset: str) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results"""
+    raw_data = get_raw_eval_results(results_path, requests_path, evaluation_metric)
+    # print(raw_data)
+    # raise Exception("stop")
+    all_data_json = [v.to_dict(subset=subset) for v in raw_data]

    df = pd.DataFrame.from_records(all_data_json)
    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    cols = list(set(df.columns).intersection(set(cols)))
    df = df[cols].round(decimals=2)

    # filter out if any of the benchmarks have not been produced
    df = df[has_no_nan_values(df, benchmark_cols)]
+    return raw_data, df


def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
@@ -33,19 +36,19 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
            with open(file_path) as fp:
                data = json.load(fp)

+            data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
            data[EvalQueueColumn.revision.name] = data.get("revision", "main")

            all_evals.append(data)
        elif ".md" not in entry:
            # this is a folder
+            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
            for sub_entry in sub_entries:
                file_path = os.path.join(save_path, entry, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)
+                    # print(data)
+                    data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
                    data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                    all_evals.append(data)
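A hedged sketch of how the app can call the new signature; the argument values (metric name and subset) follow the enums and checks defined in `src/display/utils.py` and `src/leaderboard/read_evals.py`, but the exact call in `app.py` may differ.

```python
from src.display.utils import DATASET_BENCHMARK_COLS, DATASET_COLS
from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
from src.populate import get_leaderboard_df

# Span-based scores: one row per model, one column per dataset task.
raw_data, leaderboard_df = get_leaderboard_df(
    EVAL_RESULTS_PATH,
    EVAL_REQUESTS_PATH,
    DATASET_COLS,
    DATASET_BENCHMARK_COLS,
    evaluation_metric="Span Based",
    subset="datasets",
)
print(leaderboard_df.head())
```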
src/submission/check_validity.py
CHANGED
@@ -59,14 +59,24 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
        return False, "was not found on hub!", None


+def get_model_size(model_info: ModelInfo, precision: str = None):
    """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
+    size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
+
    try:
        model_size = round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError):
+        try:
+            size_match = re.search(size_pattern, model_info.id.lower())
+            model_size = size_match.group(0)
+            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
+        except AttributeError:
+            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
+
+    if precision:
+        size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
+    else:
+        size_factor = 1
    model_size = size_factor * model_size
    return model_size

@@ -88,12 +98,12 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
                continue
            with open(os.path.join(root, file), "r") as f:
                info = json.load(f)
+                file_names.append(f"{info['model_name']}_{info['revision']}")

                # Select organisation
+                if info["model_name"].count("/") == 0 or "submitted_time" not in info:
                    continue
+                organisation, _ = info["model_name"].split("/")
                users_to_submission_dates[organisation].append(info["submitted_time"])

    return set(file_names), users_to_submission_dates
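To illustrate the fallback path added above: when safetensors metadata is missing, the size is parsed from the repo id with the pattern `(\d+\.)?\d+(b|m)`. The repo names below are invented.

```python
import re

size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")

for repo_id in ["org/clinical-ner-7b", "org/ner-tagger-350m", "org/no-size-in-name"]:
    match = re.search(size_pattern, repo_id.lower())
    if match is None:
        print(repo_id, "-> 0 (unknown size)")
        continue
    raw = match.group(0)                      # e.g. "7b" or "350m"
    billions = float(raw[:-1]) if raw[-1] == "b" else float(raw[:-1]) / 1e3
    print(repo_id, "->", round(billions, 3))  # 7.0 and 0.35
```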
src/submission/submit.py
CHANGED
@@ -1,5 +1,6 @@
import json
import os
+import ast
from datetime import datetime, timezone

from src.display.formatting import styled_error, styled_message, styled_warning
@@ -10,18 +11,57 @@ from src.submission.check_validity import (
    get_model_size,
    is_model_on_hub,
)
+from src.display.utils import PromptTemplateName

REQUESTED_MODELS = None
USERS_TO_SUBMISSION_DATES = None

+PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG = """{
+    "NCBI" : {
+        "" : "condition"
+    },
+    "CHIA" : {
+        "" : "condition",
+        "" : "drug",
+        "" : "procedure",
+        "" : "measurement"
+    },
+    "BIORED" : {
+        "" : "condition",
+        "" : "drug",
+        "" : "gene",
+        "" : "gene variant"
+    },
+    "BC5CDR" : {
+        "" : "condition",
+        "" : "drug"
+    }
+}
+
+"""
+
def add_new_eval(
    model: str,
+    # base_model: str,
    revision: str,
+    # precision: str,
+    # weight_type: str,
+    model_arch: str,
+    label_normalization_map: str,
+    gliner_threshold: str,
+    gliner_tokenizer_bool: str,
+    prompt_template_name: str,
    model_type: str,
):
+    """
+    Saves the request if valid, else returns the error.
+    Validity is checked based on:
+    - the model's existence on the hub
+    - necessary info on the model's card
+    - the label normalization map being a valid python dict that contains the keys for all datasets
+    - the threshold for GLiNER being a valid float
+    """
    global REQUESTED_MODELS
    global USERS_TO_SUBMISSION_DATES
    if not REQUESTED_MODELS:
@@ -33,26 +73,35 @@ def add_new_eval(
    user_name = model.split("/")[0]
    model_path = model.split("/")[1]

+    # precision = precision.split(" ")[0]
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    if model_type is None or model_type == "":
        return styled_error("Please select a model type.")
+
+    model_type = model_type.split(":")[-1].strip()

    # Does the model actually exist?
    if revision == "":
        revision = "main"

+    # # Is the model on the hub?
+    # if weight_type in ["Delta", "Adapter"]:
+    #     base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
+    #     if not base_model_on_hub:
+    #         return styled_error(f'Base model "{base_model}" {error}')

+    if not model_arch == "GLiNER Encoder":
        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
        if not model_on_hub:
            return styled_error(f'Model "{model}" {error}')
+    else:
+        model_name_matches = list(API.list_models(model_name=model))
+        if len(model_name_matches) < 1:
+            return styled_error(f'Model "{model}" does not exist on the hub!')
+        elif model_name_matches[0].id != model:
+            return styled_error(f'Model "{model}" does not exist on the hub! There might be a typo in the name')

    # Is the model info correctly filled?
    try:
@@ -60,7 +109,7 @@ def add_new_eval(
    except Exception:
        return styled_error("Could not get your model information. Please fill it up properly.")

+    model_size = get_model_size(model_info=model_info)

    # Were the model card and license filled?
    try:
@@ -72,32 +121,71 @@ def add_new_eval(
    if not modelcard_OK:
        return styled_error(error_msg)

+    # Verify the inference config now
+    try:
+        label_normalization_map = ast.literal_eval(label_normalization_map)
+    except Exception as e:
+        return styled_error("Please enter a valid JSON for the label normalization map")
+
+    inference_config = {
+        # "model_arch" : model_arch,
+        "label_normalization_map": label_normalization_map,
+    }
+
+    match model_arch:
+        case "Encoder":
+            pass
+        case "Decoder":
+            if not prompt_template_name in [prompt_template.value for prompt_template in PromptTemplateName]:
+                return styled_error("Prompt template name is invalid")
+            inference_config = {
+                **inference_config,
+                "prompt_template_identifier": prompt_template_name,
+            }
+        case "GLiNER Encoder":
+            try:
+                gliner_threshold = float(gliner_threshold)
+                gliner_tokenizer_bool = ast.literal_eval(gliner_tokenizer_bool)
+                inference_config = {
+                    **inference_config,
+                    "gliner_threshold": gliner_threshold,
+                    "gliner_tokenizer_bool": gliner_tokenizer_bool
+                }
+            except Exception as e:
+                return styled_error("Please enter a valid float for the threshold")
+        case _:
+            return styled_error("Model Architecture is invalid")

    # Seems good, creating the eval
    print("Adding new eval")

    eval_entry = {
+        "model_name": model,
+        # "base_model": base_model,
        "revision": revision,
+        # "precision": precision,
+        # "weight_type": weight_type,
+        "model_architecture": model_arch,
        "status": "PENDING",
        "submitted_time": current_time,
        "model_type": model_type,
        "likes": model_info.likes,
+        "num_params": model_size,
        "license": license,
        "private": False,
+        "inference_config": inference_config,
    }

    # Check for duplicate submission
+    if f"{model}_{revision}" in REQUESTED_MODELS:
+        return styled_warning("This model has been already submitted. Add the revision if the model has been updated.")

    print("Creating eval file")
    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
+    out_path = f"{OUT_DIR}/{model_path}_{revision}_eval_request.json"

    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry))
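For reference, a hedged example of the request file this function would write for a GLiNER submission; the keys mirror `eval_entry` above, while the model name and values are invented for illustration.

```python
# Illustrative only: what a saved *_eval_request.json could contain.
example_eval_entry = {
    "model_name": "urchade/gliner_medium-v2.1",   # hypothetical submission
    "revision": "main",
    "model_architecture": "GLiNER Encoder",
    "status": "PENDING",
    "submitted_time": "2024-10-01T12:00:00Z",
    "model_type": "zero-shot",
    "likes": 0,
    "num_params": 0.209,
    "license": "apache-2.0",
    "private": False,
    "inference_config": {
        "label_normalization_map": {"NCBI": {"disease": "condition"}},
        "gliner_threshold": 0.5,
        "gliner_tokenizer_bool": True,
    },
}
```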