hynky HF staff committed on
Commit
a684ff8
1 Parent(s): 6ecb679

show only different examples

Browse files
Files changed (1) hide show
  1. app.py +49 -10
app.py CHANGED
@@ -7,6 +7,7 @@ import re
7
  from concurrent.futures import ThreadPoolExecutor
8
  import numpy as np
9
  from datetime import datetime
 
10
 
11
  import gradio as gr
12
  import pandas as pd
@@ -143,7 +144,9 @@ def fetch_run_results(results_uri, selected_run_checkpoint: list[str],
143
  return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict
144
 
145
 
146
- def render_table(df, selected_run_checkpoint: list[str], metric_names):
 
 
147
  if df is None or not selected_run_checkpoint or not metric_names:
148
  return None, "0"
149
 
@@ -152,18 +155,24 @@ def render_table(df, selected_run_checkpoint: list[str], metric_names):
152
  for metric_name in metric_names]
153
  other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
154
  df = df.drop(columns=other_metrics)
 
 
 
 
155
  df = shorten_column_names(df, selected_run_checkpoint, metric_names)
156
 
157
- # Sample 100
158
- n_samples = len(df)
159
- df = df.sample(n=min(100, len(df)), random_state=42)
 
 
160
 
161
  # Get column widths for better display
162
  column_widths = get_column_widths(df)
163
  return gr.Dataframe(
164
  value=df,
165
  column_widths=column_widths
166
- ), str(n_samples)
167
 
168
  def update_selected_run_checkpoint(selected_runs: list[str] | None, selected_checkpoint: list[str] | None, split_checkpoints: bool):
169
  if not selected_runs:
@@ -305,7 +314,7 @@ def load_task_data(results_uri, selected_run_checkpoint: list[str], task_name, t
305
  prepared_df = pd.DataFrame({
306
  'prompt': df[prompt_column],
307
  'choices': df['choices'].apply(tuple), # Convert lists to tuples
308
- 'gold': df['gold'].apply(lambda x: tuple(x) if isinstance(x, list) else x), # Convert lists to tuples
309
  'gold_index': df['gold_index'],
310
  **generative_columns,
311
  })
@@ -352,6 +361,34 @@ def load_task_data(results_uri, selected_run_checkpoint: list[str], task_name, t
352
 
353
  return combined_df, gr.update(choices=available_metrics, value=chosen_metrics)
354
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  with gr.Blocks() as demo:
356
  available_runs_checkpoints = gr.State({})
357
  results_df_full = gr.State(None)
@@ -389,6 +426,8 @@ with gr.Blocks() as demo:
389
  with gr.Column():
390
  num_samples = gr.Text(interactive=False, label="# Samples")
391
  prompt_column = gr.Radio(choices=["full_prompt", "example"], label="Prompt display", value="example")
 
 
392
 
393
  # Run selection
394
  gr.on(
@@ -439,7 +478,7 @@ with gr.Blocks() as demo:
439
  outputs=[results_df_full, metric_names]
440
  ).then(
441
  fn=render_table,
442
- inputs=[results_df_full, selected_run_checkpoint, metric_names],
443
  outputs=[results_df, num_samples]
444
  )
445
 
@@ -451,14 +490,14 @@ with gr.Blocks() as demo:
451
  outputs=[results_df_full, metric_names]
452
  ).then(
453
  fn=render_table,
454
- inputs=[results_df_full, selected_run_checkpoint, metric_names],
455
  outputs=[results_df, num_samples]
456
  )
457
 
458
  gr.on(
459
- triggers=[metric_names.input],
460
  fn=render_table,
461
- inputs=[results_df_full, selected_run_checkpoint, metric_names],
462
  outputs=[results_df, num_samples]
463
  )
464
 
 
7
  from concurrent.futures import ThreadPoolExecutor
8
  import numpy as np
9
  from datetime import datetime
10
+ from typing import Any
11
 
12
  import gradio as gr
13
  import pandas as pd
 
144
  return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict
145
 
146
 
147
+ def render_table(df: pd.DataFrame | None, selected_run_checkpoint: list[str],
148
+ metric_names: list[str], filter_different: bool = False,
149
+ n_samples: int = 100):
150
  if df is None or not selected_run_checkpoint or not metric_names:
151
  return None, "0"
152
 
 
155
  for metric_name in metric_names]
156
  other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
157
  df = df.drop(columns=other_metrics)
158
+
159
+ if filter_different:
160
+ df = df[df.apply(lambda row: has_different_values(row, selected_run_checkpoint, metric_names), axis=1)]
161
+
162
  df = shorten_column_names(df, selected_run_checkpoint, metric_names)
163
 
164
+ # Get total number of samples before limiting
165
+ total_samples = len(df)
166
+
167
+ # Take first n_samples instead of random sampling
168
+ df = df.head(n_samples)
169
 
170
  # Get column widths for better display
171
  column_widths = get_column_widths(df)
172
  return gr.Dataframe(
173
  value=df,
174
  column_widths=column_widths
175
+ ), str(total_samples)
176
 
177
  def update_selected_run_checkpoint(selected_runs: list[str] | None, selected_checkpoint: list[str] | None, split_checkpoints: bool):
178
  if not selected_runs:
 
314
  prepared_df = pd.DataFrame({
315
  'prompt': df[prompt_column],
316
  'choices': df['choices'].apply(tuple), # Convert lists to tuples
317
+ 'gold': df['gold'].apply(lambda x: tuple(x) if is_arary_like(x) else x), # Convert lists to tuples
318
  'gold_index': df['gold_index'],
319
  **generative_columns,
320
  })
 
361
 
362
  return combined_df, gr.update(choices=available_metrics, value=chosen_metrics)
363
 
364
def _to_hashable(value):
    """Recursively convert array-like containers into tuples so they can be hashed."""
    if isinstance(value, np.ndarray):
        # ndarray is unhashable; go through plain Python lists first.
        value = value.tolist()
    if isinstance(value, (list, tuple)):
        return tuple(_to_hashable(item) for item in value)
    return value


def has_different_values(row: pd.Series, selected_run_checkpoint: list[str], metric_names: list[str]) -> bool:
    """Check if a row has different values across runs for any metric or generation.

    Args:
        row: One row of the results frame. Only the columns named
            ``generation_<run>`` and ``metric_<metric>_<run>`` are inspected;
            columns that are absent from the row are ignored.
        selected_run_checkpoint: Run identifiers used to build the column names.
        metric_names: Metric identifiers used to build the metric column names.

    Returns:
        True if, after dropping missing values, the generation columns or the
        columns of any single metric hold more than one distinct value;
        False otherwise.
    """
    # Check generations
    generation_cols = [
        col
        for col in (f"generation_{run}" for run in selected_run_checkpoint)
        if col in row.index
    ]
    if generation_cols:
        generations = row[generation_cols].dropna()
        # Generations may be lists, nested lists or numpy arrays, none of which
        # can be put in a set directly, so normalise them to (nested) tuples.
        unique_generations = {_to_hashable(gen) for gen in generations}
        if len(unique_generations) > 1:
            return True

    # Check metrics
    for metric in metric_names:
        metric_cols = [
            col
            for col in (f"metric_{metric}_{run}" for run in selected_run_checkpoint)
            if col in row.index
        ]
        if metric_cols:
            metrics = row[metric_cols].dropna()
            if len(metrics.unique()) > 1:
                return True

    return False
391
+
392
  with gr.Blocks() as demo:
393
  available_runs_checkpoints = gr.State({})
394
  results_df_full = gr.State(None)
 
426
  with gr.Column():
427
  num_samples = gr.Text(interactive=False, label="# Samples")
428
  prompt_column = gr.Radio(choices=["full_prompt", "example"], label="Prompt display", value="example")
429
+ filter_different = gr.Checkbox(label="Show only samples with differences", value=False)
430
+ n_samples_input = gr.Number(value=100, label="Number of samples to show", minimum=1, maximum=1000, step=1)
431
 
432
  # Run selection
433
  gr.on(
 
478
  outputs=[results_df_full, metric_names]
479
  ).then(
480
  fn=render_table,
481
+ inputs=[results_df_full, selected_run_checkpoint, metric_names, filter_different, n_samples_input],
482
  outputs=[results_df, num_samples]
483
  )
484
 
 
490
  outputs=[results_df_full, metric_names]
491
  ).then(
492
  fn=render_table,
493
+ inputs=[results_df_full, selected_run_checkpoint, metric_names, filter_different, n_samples_input],
494
  outputs=[results_df, num_samples]
495
  )
496
 
497
  gr.on(
498
+ triggers=[metric_names.input, filter_different.change, n_samples_input.change],
499
  fn=render_table,
500
+ inputs=[results_df_full, selected_run_checkpoint, metric_names, filter_different, n_samples_input],
501
  outputs=[results_df, num_samples]
502
  )
503