hynky committed
Commit 97e6937
Parent(s): d10da92

allow multiple metrics + concise the cols

Files changed (1):
  app.py  +47 -47
app.py CHANGED
@@ -31,7 +31,8 @@ def get_task_type(df):
 def fix_df(df):
     # For some reason some metrics and predictions are stored as strings
     for col in ["predictions", "metrics", "choices", "gold", "gold_index"]:
-        df[col] = [ast.literal_eval(x) if isinstance(x, str) else x for x in df[col].values]
+        if col in df.columns:
+            df[col] = [ast.literal_eval(x) if isinstance(x, str) else x for x in df[col].values]
     return df

 def get_run_name_seed(run_name):
@@ -119,15 +120,18 @@ def fetch_run_results(repo_name, runs_to_fetch, checkpoint,
     return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict


-def filter_with_metric(df, selected_runs, metric_name):
-    if df is None or not selected_runs or not metric_name:
+def render_table(df, selected_runs, metric_names):
+    if df is None or not selected_runs or not metric_names:
         return None
-    kept_metrics = [f"metric_{metric_name}_{run_name}" for run_name in selected_runs]
+    kept_metrics = [f"metric_{metric_name}_{run_name}" for run_name in selected_runs for metric_name in metric_names]
     other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
     df = df.drop(columns=other_metrics)
     # widths = get_column_widths(df)
-    df = consize_runname_metric(df, selected_runs, metric_name)
-    return gr.update(value=df, row_count=(100, 'fixed'))
+    df = shorten_column_names(df, selected_runs, metric_names)
+
+    # Sample 100
+    df = df.sample(n=min(100, len(df)), random_state=42)
+    return df

 def get_column_widths(df):
     column_widths = []
@@ -143,17 +147,26 @@ def get_column_widths(df):
     return column_widths


-def consize_runname_metric(df, run_names, metric_name):
+def shorten_column_names(df, run_names: list[str], metric_names: list[str]):
     """
     Turns metric columns (metric_{metric}_{run_name}) into {metric}_i
+    Turns generation_{run_name} into generation_i
     """
-    # Initialize the new column with empty strings
+    # Handle metric columns
+    # Aggregate columns to rename
+    columns_to_rename = {}
     for idx, run_name in enumerate(run_names):
-        original_column = f"metric_{metric_name}_{run_name}"
-        if original_column in df.columns:
-            # Append the run name and metric value to the concise column
-            df[f"{metric_name}_{idx}"] = df[original_column]
-            df = df.drop(columns=[original_column])
+        for metric_name in metric_names:
+            original_metric_column = f"metric_{metric_name}_{run_name}"
+            if original_metric_column in df.columns:
+                columns_to_rename[original_metric_column] = f"{metric_name}_{idx}"
+
+        original_generation_column = f"generation_{run_name}"
+        if original_generation_column in df.columns:
+            columns_to_rename[original_generation_column] = f"generation_{idx}"
+
+    # Rename columns in a single operation
+    df = df.rename(columns=columns_to_rename)
     return df


@@ -192,16 +205,23 @@ def load_task_data(repo_name, runs_to_fetch, checkpoint, task_name, tasks_files,

         if task_type == "multiple_choice":
             n_choices = len(df['choices'])
-            return df['choices'][np.argmax([pred[0] for pred in predictions[:n_choices]])]
+            return [pred[0] for pred in predictions[:n_choices]]

         if task_type == "mixed":
             return predictions[0]

         return predictions
+
+    generative_columns = {
+        f"generation_{run_name}": df.apply(partial(get_choice_predictions, task_type=task_type), axis=1)
+    } if task_type == "generative" or task_type == "mixed" else {}
+
+
+

     prepared_df = pd.DataFrame({
         'full_prompt': df['full_prompt'],
-        f'{run_name}': df.apply(partial(get_choice_predictions, task_type=task_type), axis=1)
+        **generative_columns,
     })
     # For some reason some metrics are stored as strings
     metrics = df['metrics']
@@ -213,10 +233,13 @@ def load_task_data(repo_name, runs_to_fetch, checkpoint, task_name, tasks_files,
     def get_gold_label(df, task_type):
         if task_type == "generative":
             return df['gold']
-        return [df['choices'][idx] for idx in df['gold_index']]
+        return df['gold_index']

     # Prepare the first DataFrame with choices and gold
-    combined_df = dfs[0][['full_prompt', 'choices']].set_index('full_prompt')
+    combined_df = dfs[0][['full_prompt']].set_index('full_prompt')
+    if task_type in ["multiple_choice", "mixed"]:
+        combined_df["choices"] = dfs[0]["choices"].values
+
     combined_df['gold'] = dfs[0].apply(lambda row: get_gold_label(row, task_type), axis=1).values

     # Join all prepared DataFrames
@@ -227,32 +250,9 @@ def load_task_data(repo_name, runs_to_fetch, checkpoint, task_name, tasks_files,

     available_metrics = list(set("_".join(col.split('_')[1:-1]) for col in combined_df.columns if col.startswith("metric_")))
     combined_df = combined_df.reset_index()
+    chosen_metrics = available_metrics[:1]

-    return combined_df, filter_with_metric(combined_df, runs_to_fetch, available_metrics[0]), gr.update(choices=available_metrics, value=available_metrics[0])
-
-def render_results_table(df: pd.DataFrame):
-    if df is None or df.empty:
-        return None
-
-    # Select a subset of 100 examples
-    df_subset = df.sample(n=min(100, len(df)), random_state=42)
-
-    # Prepare the data for display
-    display_data = []
-    for _, row in df_subset.iterrows():
-        example_data = {
-            'text': row['example'],
-            'choices': row['choices'],
-            'gold_index': row['gold_index'],
-        }
-        for run in df['run'].unique():
-            run_data = df[(df['run'] == run) & (df['example'] == row['example'])]
-            if not run_data.empty:
-                example_data[f'{run}_prediction'] = run_data['predictions'].values[0]
-                example_data[f'{run}_score'] = run_data['metrics'].values[0]
-        display_data.append(example_data)
-
-    return pd.DataFrame(display_data)
+    return combined_df, render_table(combined_df, runs_to_fetch, chosen_metrics), gr.update(choices=available_metrics, value=chosen_metrics)

 with gr.Blocks() as demo:
     runs_checkpoints = gr.State({})
@@ -275,7 +275,7 @@ with gr.Blocks() as demo:
     checkpoint = gr.Dropdown(choices=[], interactive=True, label="Checkpoint")
     fetch_res = gr.Button("Fetch results")
     task_name = gr.Dropdown(choices=[], interactive=True, label="Task name")
-    metric_name = gr.Dropdown(choices=[], interactive=True, label="Metric")
+    metric_names = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Metric")
     results_df = gr.Dataframe(interactive=False, wrap=True)

     # Run selection
@@ -316,13 +316,13 @@ with gr.Blocks() as demo:
         triggers=[task_name.change],
         fn=load_task_data,
         inputs=[repo, selected_runs, checkpoint, task_name, tasks_files],
-        outputs=[results_df_full, results_df, metric_name]
+        outputs=[results_df_full, results_df, metric_names]
     )

     gr.on(
-        triggers=[metric_names.change],
-        fn=filter_with_metric,
-        inputs=[results_df_full, selected_runs, metric_name],
+        triggers=[metric_names.change],
+        fn=render_table,
+        inputs=[results_df_full, selected_runs, metric_names],
         outputs=[results_df]
     )