juncliu committed
Commit d56df1b · Parent: 5f6be9e

Change to MASE and use the geometric mean to aggregate results

Files changed (3)
  1. app.py +4 -4
  2. requirements.txt +2 -1
  3. src/utils.py +12 -6
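The heart of the commit is the aggregation change in src/utils.py: per-dataset MASE and CRPS are now combined with a geometric mean (scipy.stats.gmean) rather than an arithmetic mean, while rank keeps the arithmetic mean and is concatenated back on. A minimal sketch of that logic on a made-up results frame (the column names match the diff below; the data and the printed comparison are purely illustrative):

import pandas as pd
from scipy import stats

# Toy per-dataset results for two models (illustrative numbers only).
df = pd.DataFrame({
    'model': ['A', 'A', 'B', 'B'],
    'eval_metrics/MASE[0.5]': [0.5, 2.0, 1.0, 1.0],
    'eval_metrics/mean_weighted_sum_quantile_loss': [0.1, 0.4, 0.2, 0.2],
    'rank': [1, 2, 2, 1],
})

METRIC_CHOICES = ['eval_metrics/MASE[0.5]', 'eval_metrics/mean_weighted_sum_quantile_loss']

# Geometric mean for the positive, scale-free accuracy metrics...
gmean_metrics = df.groupby('model')[METRIC_CHOICES].agg(stats.gmean)
# ...arithmetic mean for rank, then stitch the two back together.
mean_rank = df.groupby('model')[['rank']].mean()
overall = pd.concat([gmean_metrics, mean_rank], axis=1)
print(overall)

# For model A, gmean(0.5, 2.0) = 1.0 while the arithmetic mean is 1.25:
# a single blown-up dataset no longer dominates the aggregate the way it
# would under the old .mean() aggregation.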
app.py CHANGED
@@ -119,15 +119,15 @@ def init_leaderboard(ori_dataframe, model_info_df, sort_val: str|None = None):
    merged_df = get_merged_df(ori_dataframe, model_info_df)
    new_cols = ['T'] + [col for col in merged_df.columns if col != 'T']
    merged_df = merged_df[new_cols]
-    print('Merged df: ', merged_df)
    if sort_val:
        if sort_val in merged_df.columns:
            merged_df = merged_df.sort_values(by=[sort_val])
        else:
            print(f'Warning: cannot sort by {sort_val}')
+    print('Merged df: ', merged_df)
    # get the data type
    datatype_list = [col2type_dict[col] if col in col2type_dict else 'number' for col in merged_df.columns]
-    print('datatype_list: ', datatype_list)
+    # print('datatype_list: ', datatype_list)
    # print('merged_df.column: ', merged_df.columns)
    # ipdb.set_trace()
    return Leaderboard(
@@ -164,7 +164,7 @@ def init_leaderboard(ori_dataframe, model_info_df, sort_val: str|None = None):
            ColumnFilter(ModelInfoColumn.model_type.name, type="checkboxgroup", label="Model types"),
        ],
        # bool_checkboxgroup_label="",
-        column_widths=[40, 150] + [150 for _ in range(len(merged_df.columns)-2)],
+        column_widths=[40, 150] + [180 for _ in range(len(merged_df.columns)-2)],
        interactive=False,
    )

@@ -176,7 +176,7 @@ with demo:

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem('🏅 Overall', elem_id="llm-benchmark-tab-table", id=5):
-            leaderboard = init_leaderboard(overall_df, model_info_df, sort_val='MAPE')
+            leaderboard = init_leaderboard(overall_df, model_info_df, sort_val='Rank')
            print(f'FINAL Overall LEADERBOARD {overall_df}')
        with gr.TabItem("🏅 By Domain", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(domain_df, model_info_df)
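A note on the sort_val change above: after the rename in src/utils.py below there is no 'MAPE' column in the merged table anymore, so sort_val='MAPE' would fall through to the "cannot sort" warning and leave the leaderboard unsorted, whereas sorting by 'Rank' (the arithmetic mean of per-dataset ranks) puts the best model first. A rough sketch of the guard's behaviour; sort_leaderboard and the toy frame are illustrative, not code from the repo:

import pandas as pd

def sort_leaderboard(merged_df, sort_val=None):
    # Same guard as in init_leaderboard: sort only when the column exists.
    if sort_val:
        if sort_val in merged_df.columns:
            merged_df = merged_df.sort_values(by=[sort_val])
        else:
            print(f'Warning: cannot sort by {sort_val}')
    return merged_df

merged_df = pd.DataFrame({'model': ['A', 'B'], 'MASE': [1.1, 0.9], 'Rank': [2.0, 1.0]})
print(sort_leaderboard(merged_df, 'Rank'))   # model B rises to the top
print(sort_leaderboard(merged_df, 'MAPE'))   # prints the warning, returns unsorted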
requirements.txt CHANGED
@@ -14,4 +14,5 @@ tqdm
transformers
tokenizers>=0.15.0
sentencepiece
-ipdb
+ipdb
+scipy
src/utils.py CHANGED
@@ -2,6 +2,7 @@ import ipdb
import pandas as pd
import os
import re
+from scipy import stats

# Define the formatting function
def format_number(num):
@@ -45,7 +46,7 @@ def pivot_df(file_name, tab_name):

def rename_metrics(df):
    df = df.rename(columns={
-        'eval_metrics/MAPE[0.5]': 'MAPE',
+        'eval_metrics/MASE[0.5]': 'MASE',
        'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS',
        'rank': 'Rank'
    })
@@ -89,7 +90,7 @@ def pivot_existed_df(df, tab_name):
    print('columns', df.columns)
    df_melted = pd.melt(df, id_vars=[tab_name, 'model'], var_name='metric', value_name='value')
    df_melted['metric'] = df_melted['metric'].replace({
-        'eval_metrics/MAPE[0.5]': 'MAPE',
+        'eval_metrics/MASE[0.5]': 'MASE',
        'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS',
        'rank': 'Rank',
    })
@@ -168,9 +169,12 @@ def get_grouped_dfs(root_dir='results', ds_properties='results/dataset_propertie
    df['univariate'] = df['num_variates'] == 1

    # group by domain
-    METRIC_CHOICES = ["eval_metrics/MAPE[0.5]", "eval_metrics/mean_weighted_sum_quantile_loss", "rank"]
+    METRIC_CHOICES = ["eval_metrics/MASE[0.5]", "eval_metrics/mean_weighted_sum_quantile_loss"]
+    # ipdb.set_trace()
+    grouped_results_overall = df.groupby(['model'])[METRIC_CHOICES].agg(stats.gmean)
+    grouped_results_overall_rank = df.groupby(['model'])[['rank']].mean()
+    grouped_results_overall = pd.concat([grouped_results_overall, grouped_results_overall_rank], axis=1)

-    grouped_results_overall = df.groupby(['model'])[METRIC_CHOICES].mean()
    # grouped_results_overall = grouped_results_overall.rename(columns={'model':'Model'})
    # grouped_results.to_csv(f'artefacts/grouped_results_by_model.csv')
    grouped_dfs = {}
@@ -236,8 +240,10 @@
    return df

def group_by(df, col_name):
-    METRIC_CHOICES = ["eval_metrics/MAPE[0.5]", "eval_metrics/mean_weighted_sum_quantile_loss", "rank"]
-    grouped_results = df.groupby([col_name, 'model'])[METRIC_CHOICES].mean()
+    METRIC_CHOICES = ["eval_metrics/MASE[0.5]", "eval_metrics/mean_weighted_sum_quantile_loss"]
+    grouped_results = df.groupby([col_name, 'model'])[METRIC_CHOICES].agg(stats.gmean)
+    grouped_results_rank = df.groupby([col_name, 'model'])[['rank']].mean()
+    grouped_results = pd.concat([grouped_results, grouped_results_rank], axis=1)
    # Display the results
    # Write the results to a csv file
    # grouped_results.to_csv(f'grouped_results_by_{col_name}.csv')
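For context on the metric swap: MASE (mean absolute scaled error) is the forecast MAE divided by the MAE of a seasonal-naive forecast on the training data, so it is scale-free and comparable across datasets of very different magnitudes, which suits the geometric-mean aggregation above better than a percentage metric like MAPE, which can blow up when actuals are near zero. The leaderboard's 'eval_metrics/MASE[0.5]' values are computed upstream of this repo; the sketch below is only the textbook definition, with a hypothetical mase() helper:

import numpy as np

def mase(y_true, y_pred, y_train, season=1):
    # Forecast MAE, scaled by the in-sample MAE of a seasonal-naive forecast.
    y_true, y_pred, y_train = map(np.asarray, (y_true, y_pred, y_train))
    mae_forecast = np.mean(np.abs(y_true - y_pred))
    mae_naive = np.mean(np.abs(y_train[season:] - y_train[:-season]))
    return mae_forecast / mae_naive

# MASE < 1 beats the naive forecast, MASE > 1 is worse than it.
print(mase([10.0, 12.0], [11.0, 11.0], [8.0, 9.0, 10.0, 11.0]))  # 1.0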