change to MASE and geometric mean to aggregate results
- app.py +4 -4
- requirements.txt +2 -1
- src/utils.py +12 -6
app.py
CHANGED
@@ -119,15 +119,15 @@ def init_leaderboard(ori_dataframe, model_info_df, sort_val: str|None = None):
     merged_df = get_merged_df(ori_dataframe, model_info_df)
     new_cols = ['T'] + [col for col in merged_df.columns if col != 'T']
     merged_df = merged_df[new_cols]
-    print('Merged df: ', merged_df)
     if sort_val:
         if sort_val in merged_df.columns:
             merged_df = merged_df.sort_values(by=[sort_val])
         else:
             print(f'Warning: cannot sort by {sort_val}')
+    print('Merged df: ', merged_df)
     # get the data type
     datatype_list = [col2type_dict[col] if col in col2type_dict else 'number' for col in merged_df.columns]
-    print('datatype_list: ', datatype_list)
+    # print('datatype_list: ', datatype_list)
     # print('merged_df.column: ', merged_df.columns)
     # ipdb.set_trace()
     return Leaderboard(
@@ -164,7 +164,7 @@ def init_leaderboard(ori_dataframe, model_info_df, sort_val: str|None = None):
             ColumnFilter(ModelInfoColumn.model_type.name, type="checkboxgroup", label="Model types"),
         ],
         # bool_checkboxgroup_label="",
-        column_widths=[40, 150] + [
+        column_widths=[40, 150] + [180 for _ in range(len(merged_df.columns)-2)],
         interactive=False,
     )
 
@@ -176,7 +176,7 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem('π Overall', elem_id="llm-benchmark-tab-table", id=5):
-            leaderboard = init_leaderboard(overall_df, model_info_df, sort_val='
+            leaderboard = init_leaderboard(overall_df, model_info_df, sort_val='Rank')
             print(f'FINAL Overall LEADERBOARD {overall_df}')
         with gr.TabItem("π By Domain", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(domain_df, model_info_df)
requirements.txt
CHANGED
@@ -14,4 +14,5 @@ tqdm
 transformers
 tokenizers>=0.15.0
 sentencepiece
-ipdb
+ipdb
+scipy
src/utils.py
CHANGED
@@ -2,6 +2,7 @@ import ipdb
 import pandas as pd
 import os
 import re
+from scipy import stats
 
 # Define the formatting function
 def format_number(num):
@@ -45,7 +46,7 @@ def pivot_df(file_name, tab_name):
 
 def rename_metrics(df):
     df = df.rename(columns={
-        'eval_metrics/
+        'eval_metrics/MASE[0.5]': 'MASE',
         'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS',
         'rank': 'Rank'
     })
@@ -89,7 +90,7 @@ def pivot_existed_df(df, tab_name):
     print('columns', df.columns)
     df_melted = pd.melt(df, id_vars=[tab_name, 'model'], var_name='metric', value_name='value')
     df_melted['metric'] = df_melted['metric'].replace({
-        'eval_metrics/
+        'eval_metrics/MASE[0.5]': 'MASE',
         'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS',
         'rank': 'Rank',
     })
@@ -168,9 +169,12 @@ def get_grouped_dfs(root_dir='results', ds_properties='results/dataset_propertie
     df['univariate'] = df['num_variates'] == 1
 
     # group by domain
-    METRIC_CHOICES = ["eval_metrics/
+    METRIC_CHOICES = ["eval_metrics/MASE[0.5]", "eval_metrics/mean_weighted_sum_quantile_loss"]
+    # ipdb.set_trace()
+    grouped_results_overall = df.groupby(['model'])[METRIC_CHOICES].agg(stats.gmean)
+    grouped_results_overall_rank = df.groupby(['model'])[['rank']].mean()
+    grouped_results_overall = pd.concat([grouped_results_overall, grouped_results_overall_rank], axis=1)
 
-    grouped_results_overall = df.groupby(['model'])[METRIC_CHOICES].mean()
     # grouped_results_overall = grouped_results_overall.rename(columns={'model':'Model'})
     # grouped_results.to_csv(f'artefacts/grouped_results_by_model.csv')
     grouped_dfs = {}
@@ -236,8 +240,10 @@ def standardize_df(df):
     return df
 
 def group_by(df, col_name):
-    METRIC_CHOICES = ["eval_metrics/
-    grouped_results = df.groupby([col_name, 'model'])[METRIC_CHOICES].
+    METRIC_CHOICES = ["eval_metrics/MASE[0.5]", "eval_metrics/mean_weighted_sum_quantile_loss"]
+    grouped_results = df.groupby([col_name, 'model'])[METRIC_CHOICES].agg(stats.gmean)
+    grouped_results_rank = df.groupby([col_name, 'model'])[['rank']].mean()
+    grouped_results = pd.concat([grouped_results, grouped_results_rank], axis=1)
     # Display the results
     # Write the results to a csv file
     # grouped_results.to_csv(f'grouped_results_by_{col_name}.csv')
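For reference, the new aggregation scheme in src/utils.py can be exercised on its own: error metrics are pooled across datasets with a geometric mean (scipy.stats.gmean) while the rank column keeps an arithmetic mean, and the two results are concatenated. The snippet below is a minimal sketch of that pattern with invented toy values; only the column names ('model', 'rank', and the eval_metrics/* keys) come from the diff.

import pandas as pd
from scipy import stats

METRIC_CHOICES = [
    "eval_metrics/MASE[0.5]",
    "eval_metrics/mean_weighted_sum_quantile_loss",
]

# Toy per-dataset results for two models (values invented for illustration).
toy = pd.DataFrame({
    "model": ["A", "A", "B", "B"],
    "eval_metrics/MASE[0.5]": [0.8, 1.2, 0.5, 2.0],
    "eval_metrics/mean_weighted_sum_quantile_loss": [0.10, 0.40, 0.20, 0.20],
    "rank": [1, 2, 2, 1],
})

# Geometric mean across datasets for the error metrics ...
per_model_metrics = toy.groupby("model")[METRIC_CHOICES].agg(stats.gmean)
# ... plain arithmetic mean for the rank column.
per_model_rank = toy.groupby("model")[["rank"]].mean()
overall = pd.concat([per_model_metrics, per_model_rank], axis=1)
print(overall)

The geometric mean is a common choice for combining relative error metrics such as MASE and the normalized quantile loss (CRPS) across heterogeneous datasets, since one dataset with an unusually large error skews it less than an arithmetic mean would; note that scipy.stats.gmean expects strictly positive values.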