hynky HF staff committed on
Commit
1c68162
2 Parent(s): 4564048 a684ff8

Merge branch 'HuggingFaceFW-Dev/Tasks-Explorer' into 'HuggingFaceFW/Tasks-Explorer'

Browse files
Files changed (1) hide show
  1. app.py +54 -11
app.py CHANGED
@@ -7,6 +7,7 @@ import re
7
  from concurrent.futures import ThreadPoolExecutor
8
  import numpy as np
9
  from datetime import datetime
 
10
 
11
  import gradio as gr
12
  import pandas as pd
@@ -54,7 +55,11 @@ def fetch_repo_structure(results_uri, split_checkpoints=False, oauth_token: gr.O
54
  token = oauth_token.token
55
 
56
  data_folder = DataFolder(results_uri, token=token)
57
- runs = [f.removeprefix("details/") for f in data_folder.list_files("details", recursive=False, include_directories=True) if f != "details"]
 
 
 
 
58
 
59
  if not runs:
60
  return {}, gr.update(choices=[], value=None)
@@ -139,7 +144,9 @@ def fetch_run_results(results_uri, selected_run_checkpoint: list[str],
139
  return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict
140
 
141
 
142
- def render_table(df, selected_run_checkpoint: list[str], metric_names):
 
 
143
  if df is None or not selected_run_checkpoint or not metric_names:
144
  return None, "0"
145
 
@@ -148,18 +155,24 @@ def render_table(df, selected_run_checkpoint: list[str], metric_names):
148
  for metric_name in metric_names]
149
  other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
150
  df = df.drop(columns=other_metrics)
 
 
 
 
151
  df = shorten_column_names(df, selected_run_checkpoint, metric_names)
152
 
153
- # Sample 100
154
- n_samples = len(df)
155
- df = df.sample(n=min(100, len(df)), random_state=42)
 
 
156
 
157
  # Get column widths for better display
158
  column_widths = get_column_widths(df)
159
  return gr.Dataframe(
160
  value=df,
161
  column_widths=column_widths
162
- ), str(n_samples)
163
 
164
  def update_selected_run_checkpoint(selected_runs: list[str] | None, selected_checkpoint: list[str] | None, split_checkpoints: bool):
165
  if not selected_runs:
@@ -301,7 +314,7 @@ def load_task_data(results_uri, selected_run_checkpoint: list[str], task_name, t
301
  prepared_df = pd.DataFrame({
302
  'prompt': df[prompt_column],
303
  'choices': df['choices'].apply(tuple), # Convert lists to tuples
304
- 'gold': df['gold'].apply(lambda x: tuple(x) if isinstance(x, list) else x), # Convert lists to tuples
305
  'gold_index': df['gold_index'],
306
  **generative_columns,
307
  })
@@ -348,6 +361,34 @@ def load_task_data(results_uri, selected_run_checkpoint: list[str], task_name, t
348
 
349
  return combined_df, gr.update(choices=available_metrics, value=chosen_metrics)
350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  with gr.Blocks() as demo:
352
  available_runs_checkpoints = gr.State({})
353
  results_df_full = gr.State(None)
@@ -385,6 +426,8 @@ with gr.Blocks() as demo:
385
  with gr.Column():
386
  num_samples = gr.Text(interactive=False, label="# Samples")
387
  prompt_column = gr.Radio(choices=["full_prompt", "example"], label="Prompt display", value="example")
 
 
388
 
389
  # Run selection
390
  gr.on(
@@ -435,7 +478,7 @@ with gr.Blocks() as demo:
435
  outputs=[results_df_full, metric_names]
436
  ).then(
437
  fn=render_table,
438
- inputs=[results_df_full, selected_run_checkpoint, metric_names],
439
  outputs=[results_df, num_samples]
440
  )
441
 
@@ -447,14 +490,14 @@ with gr.Blocks() as demo:
447
  outputs=[results_df_full, metric_names]
448
  ).then(
449
  fn=render_table,
450
- inputs=[results_df_full, selected_run_checkpoint, metric_names],
451
  outputs=[results_df, num_samples]
452
  )
453
 
454
  gr.on(
455
- triggers=[metric_names.input],
456
  fn=render_table,
457
- inputs=[results_df_full, selected_run_checkpoint, metric_names],
458
  outputs=[results_df, num_samples]
459
  )
460
 
 
7
  from concurrent.futures import ThreadPoolExecutor
8
  import numpy as np
9
  from datetime import datetime
10
+ from typing import Any
11
 
12
  import gradio as gr
13
  import pandas as pd
 
55
  token = oauth_token.token
56
 
57
  data_folder = DataFolder(results_uri, token=token)
58
+ try:
59
+ runs = [f.removeprefix("details/") for f in data_folder.list_files("details", recursive=False, include_directories=True) if f != "details"]
60
+ except Exception as e:
61
+ print(f"Error fetching repo structure: {e}")
62
+ runs = []
63
 
64
  if not runs:
65
  return {}, gr.update(choices=[], value=None)
 
144
  return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict
145
 
146
 
147
+ def render_table(df: pd.DataFrame | None, selected_run_checkpoint: list[str],
148
+ metric_names: list[str], filter_different: bool = False,
149
+ n_samples: int = 100):
150
  if df is None or not selected_run_checkpoint or not metric_names:
151
  return None, "0"
152
 
 
155
  for metric_name in metric_names]
156
  other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
157
  df = df.drop(columns=other_metrics)
158
+
159
+ if filter_different:
160
+ df = df[df.apply(lambda row: has_different_values(row, selected_run_checkpoint, metric_names), axis=1)]
161
+
162
  df = shorten_column_names(df, selected_run_checkpoint, metric_names)
163
 
164
+ # Get total number of samples before limiting
165
+ total_samples = len(df)
166
+
167
+ # Take first n_samples instead of random sampling
168
+ df = df.head(n_samples)
169
 
170
  # Get column widths for better display
171
  column_widths = get_column_widths(df)
172
  return gr.Dataframe(
173
  value=df,
174
  column_widths=column_widths
175
+ ), str(total_samples)
176
 
177
  def update_selected_run_checkpoint(selected_runs: list[str] | None, selected_checkpoint: list[str] | None, split_checkpoints: bool):
178
  if not selected_runs:
 
314
  prepared_df = pd.DataFrame({
315
  'prompt': df[prompt_column],
316
  'choices': df['choices'].apply(tuple), # Convert lists to tuples
317
+ 'gold': df['gold'].apply(lambda x: tuple(x) if is_arary_like(x) else x), # Convert lists to tuples
318
  'gold_index': df['gold_index'],
319
  **generative_columns,
320
  })
 
361
 
362
  return combined_df, gr.update(choices=available_metrics, value=chosen_metrics)
363
 
364
def has_different_values(row: pd.Series, selected_run_checkpoint: list[str], metric_names: list[str]) -> bool:
    """Check if a row has different values across runs for any metric or generation.

    Generations are read from the ``generation_{run}`` columns and metrics from
    the ``metric_{metric}_{run}`` columns, for every run in
    ``selected_run_checkpoint``. Columns absent from ``row`` and missing (NA)
    cells are ignored.

    Args:
        row: One sample (a single DataFrame row) holding the per-run columns.
        selected_run_checkpoint: Run identifiers used as column suffixes.
        metric_names: Metric names to compare across the runs.

    Returns:
        True if any generation or any listed metric has more than one distinct
        non-missing value across the runs, otherwise False.
    """

    def _hashable(value):
        # Sequence-like cells cannot be stored in a set as-is: lists and numpy
        # arrays are unhashable. Normalise them (recursively) to tuples so that
        # equal sequences collapse to one entry regardless of container type.
        if isinstance(value, np.ndarray):
            value = value.tolist()
        if isinstance(value, (list, tuple)):
            return tuple(_hashable(item) for item in value)
        return value

    def _differs(columns) -> bool:
        # Only compare columns that actually exist for this row.
        present = [col for col in columns if col in row.index]
        if not present:
            return False
        values = row[present].dropna()
        return len({_hashable(value) for value in values}) > 1

    # Check generations
    if _differs(f"generation_{run}" for run in selected_run_checkpoint):
        return True

    # Check metrics
    return any(
        _differs(f"metric_{metric}_{run}" for run in selected_run_checkpoint)
        for metric in metric_names
    )
391
+
392
  with gr.Blocks() as demo:
393
  available_runs_checkpoints = gr.State({})
394
  results_df_full = gr.State(None)
 
426
  with gr.Column():
427
  num_samples = gr.Text(interactive=False, label="# Samples")
428
  prompt_column = gr.Radio(choices=["full_prompt", "example"], label="Prompt display", value="example")
429
+ filter_different = gr.Checkbox(label="Show only samples with differences", value=False)
430
+ n_samples_input = gr.Number(value=100, label="Number of samples to show", minimum=1, maximum=1000, step=1)
431
 
432
  # Run selection
433
  gr.on(
 
478
  outputs=[results_df_full, metric_names]
479
  ).then(
480
  fn=render_table,
481
+ inputs=[results_df_full, selected_run_checkpoint, metric_names, filter_different, n_samples_input],
482
  outputs=[results_df, num_samples]
483
  )
484
 
 
490
  outputs=[results_df_full, metric_names]
491
  ).then(
492
  fn=render_table,
493
+ inputs=[results_df_full, selected_run_checkpoint, metric_names, filter_different, n_samples_input],
494
  outputs=[results_df, num_samples]
495
  )
496
 
497
  gr.on(
498
+ triggers=[metric_names.input, filter_different.change, n_samples_input.change],
499
  fn=render_table,
500
+ inputs=[results_df_full, selected_run_checkpoint, metric_names, filter_different, n_samples_input],
501
  outputs=[results_df, num_samples]
502
  )
503