Spaces:
Running
Running
show only different examples
Browse files
app.py
CHANGED
@@ -7,6 +7,7 @@ import re
|
|
7 |
from concurrent.futures import ThreadPoolExecutor
|
8 |
import numpy as np
|
9 |
from datetime import datetime
|
|
|
10 |
|
11 |
import gradio as gr
|
12 |
import pandas as pd
|
@@ -143,7 +144,9 @@ def fetch_run_results(results_uri, selected_run_checkpoint: list[str],
|
|
143 |
return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict
|
144 |
|
145 |
|
146 |
-
def render_table(df, selected_run_checkpoint: list[str],
|
|
|
|
|
147 |
if df is None or not selected_run_checkpoint or not metric_names:
|
148 |
return None, "0"
|
149 |
|
@@ -152,18 +155,24 @@ def render_table(df, selected_run_checkpoint: list[str], metric_names):
|
|
152 |
for metric_name in metric_names]
|
153 |
other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
|
154 |
df = df.drop(columns=other_metrics)
|
|
|
|
|
|
|
|
|
155 |
df = shorten_column_names(df, selected_run_checkpoint, metric_names)
|
156 |
|
157 |
-
#
|
158 |
-
|
159 |
-
|
|
|
|
|
160 |
|
161 |
# Get column widths for better display
|
162 |
column_widths = get_column_widths(df)
|
163 |
return gr.Dataframe(
|
164 |
value=df,
|
165 |
column_widths=column_widths
|
166 |
-
), str(
|
167 |
|
168 |
def update_selected_run_checkpoint(selected_runs: list[str] | None, selected_checkpoint: list[str] | None, split_checkpoints: bool):
|
169 |
if not selected_runs:
|
@@ -305,7 +314,7 @@ def load_task_data(results_uri, selected_run_checkpoint: list[str], task_name, t
|
|
305 |
prepared_df = pd.DataFrame({
|
306 |
'prompt': df[prompt_column],
|
307 |
'choices': df['choices'].apply(tuple), # Convert lists to tuples
|
308 |
-
'gold': df['gold'].apply(lambda x: tuple(x) if
|
309 |
'gold_index': df['gold_index'],
|
310 |
**generative_columns,
|
311 |
})
|
@@ -352,6 +361,34 @@ def load_task_data(results_uri, selected_run_checkpoint: list[str], task_name, t
|
|
352 |
|
353 |
return combined_df, gr.update(choices=available_metrics, value=chosen_metrics)
|
354 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
355 |
with gr.Blocks() as demo:
|
356 |
available_runs_checkpoints = gr.State({})
|
357 |
results_df_full = gr.State(None)
|
@@ -389,6 +426,8 @@ with gr.Blocks() as demo:
|
|
389 |
with gr.Column():
|
390 |
num_samples = gr.Text(interactive=False, label="# Samples")
|
391 |
prompt_column = gr.Radio(choices=["full_prompt", "example"], label="Prompt display", value="example")
|
|
|
|
|
392 |
|
393 |
# Run selection
|
394 |
gr.on(
|
@@ -439,7 +478,7 @@ with gr.Blocks() as demo:
|
|
439 |
outputs=[results_df_full, metric_names]
|
440 |
).then(
|
441 |
fn=render_table,
|
442 |
-
inputs=[results_df_full, selected_run_checkpoint, metric_names],
|
443 |
outputs=[results_df, num_samples]
|
444 |
)
|
445 |
|
@@ -451,14 +490,14 @@ with gr.Blocks() as demo:
|
|
451 |
outputs=[results_df_full, metric_names]
|
452 |
).then(
|
453 |
fn=render_table,
|
454 |
-
inputs=[results_df_full, selected_run_checkpoint, metric_names],
|
455 |
outputs=[results_df, num_samples]
|
456 |
)
|
457 |
|
458 |
gr.on(
|
459 |
-
triggers=[metric_names.input],
|
460 |
fn=render_table,
|
461 |
-
inputs=[results_df_full, selected_run_checkpoint, metric_names],
|
462 |
outputs=[results_df, num_samples]
|
463 |
)
|
464 |
|
|
|
7 |
from concurrent.futures import ThreadPoolExecutor
|
8 |
import numpy as np
|
9 |
from datetime import datetime
|
10 |
+
from typing import Any
|
11 |
|
12 |
import gradio as gr
|
13 |
import pandas as pd
|
|
|
144 |
return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict
|
145 |
|
146 |
|
147 |
+
def render_table(df: pd.DataFrame | None, selected_run_checkpoint: list[str],
|
148 |
+
metric_names: list[str], filter_different: bool = False,
|
149 |
+
n_samples: int = 100):
|
150 |
if df is None or not selected_run_checkpoint or not metric_names:
|
151 |
return None, "0"
|
152 |
|
|
|
155 |
for metric_name in metric_names]
|
156 |
other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
|
157 |
df = df.drop(columns=other_metrics)
|
158 |
+
|
159 |
+
if filter_different:
|
160 |
+
df = df[df.apply(lambda row: has_different_values(row, selected_run_checkpoint, metric_names), axis=1)]
|
161 |
+
|
162 |
df = shorten_column_names(df, selected_run_checkpoint, metric_names)
|
163 |
|
164 |
+
# Get total number of samples before limiting
|
165 |
+
total_samples = len(df)
|
166 |
+
|
167 |
+
# Take first n_samples instead of random sampling
|
168 |
+
df = df.head(n_samples)
|
169 |
|
170 |
# Get column widths for better display
|
171 |
column_widths = get_column_widths(df)
|
172 |
return gr.Dataframe(
|
173 |
value=df,
|
174 |
column_widths=column_widths
|
175 |
+
), str(total_samples)
|
176 |
|
177 |
def update_selected_run_checkpoint(selected_runs: list[str] | None, selected_checkpoint: list[str] | None, split_checkpoints: bool):
|
178 |
if not selected_runs:
|
|
|
314 |
prepared_df = pd.DataFrame({
|
315 |
'prompt': df[prompt_column],
|
316 |
'choices': df['choices'].apply(tuple), # Convert lists to tuples
|
317 |
+
'gold': df['gold'].apply(lambda x: tuple(x) if is_arary_like(x) else x), # Convert lists to tuples
|
318 |
'gold_index': df['gold_index'],
|
319 |
**generative_columns,
|
320 |
})
|
|
|
361 |
|
362 |
return combined_df, gr.update(choices=available_metrics, value=chosen_metrics)
|
363 |
|
364 |
+
def _as_hashable(value):
    """Recursively turn lists, tuples and NumPy arrays into nested tuples.

    Any other value is returned unchanged, so scalars and strings keep their
    normal equality semantics when placed in a set.
    """
    if isinstance(value, np.ndarray):
        # tolist() yields plain Python scalars / nested lists.
        value = value.tolist()
    if isinstance(value, (list, tuple)):
        return tuple(_as_hashable(item) for item in value)
    return value


def has_different_values(row: pd.Series, selected_run_checkpoint: list[str], metric_names: list[str]) -> bool:
    """Check if a row has different values across runs for any metric or generation.

    Args:
        row: One sample row. Columns named ``generation_<run>`` and
            ``metric_<metric>_<run>`` are inspected; columns absent from the
            row are ignored.
        selected_run_checkpoint: Run identifiers used to build column names.
        metric_names: Metric names used to build column names.

    Returns:
        True if, after dropping missing values, the generation columns or the
        columns of any single metric hold more than one distinct value;
        False otherwise.
    """
    # Check generations
    generation_cols = [
        col
        for col in (f"generation_{run}" for run in selected_run_checkpoint)
        if col in row.index
    ]
    if generation_cols:
        # Sequence-like generations (lists, nested lists, arrays) are not
        # hashable, so normalise them to nested tuples before de-duplicating.
        unique_generations = {
            _as_hashable(gen) for gen in row[generation_cols].dropna()
        }
        if len(unique_generations) > 1:
            return True

    # Check metrics
    for metric in metric_names:
        metric_cols = [
            col
            for col in (f"metric_{metric}_{run}" for run in selected_run_checkpoint)
            if col in row.index
        ]
        # nunique() ignores missing values by default.
        if metric_cols and row[metric_cols].nunique() > 1:
            return True

    return False
|
391 |
+
|
392 |
with gr.Blocks() as demo:
|
393 |
available_runs_checkpoints = gr.State({})
|
394 |
results_df_full = gr.State(None)
|
|
|
426 |
with gr.Column():
|
427 |
num_samples = gr.Text(interactive=False, label="# Samples")
|
428 |
prompt_column = gr.Radio(choices=["full_prompt", "example"], label="Prompt display", value="example")
|
429 |
+
filter_different = gr.Checkbox(label="Show only samples with differences", value=False)
|
430 |
+
n_samples_input = gr.Number(value=100, label="Number of samples to show", minimum=1, maximum=1000, step=1)
|
431 |
|
432 |
# Run selection
|
433 |
gr.on(
|
|
|
478 |
outputs=[results_df_full, metric_names]
|
479 |
).then(
|
480 |
fn=render_table,
|
481 |
+
inputs=[results_df_full, selected_run_checkpoint, metric_names, filter_different, n_samples_input],
|
482 |
outputs=[results_df, num_samples]
|
483 |
)
|
484 |
|
|
|
490 |
outputs=[results_df_full, metric_names]
|
491 |
).then(
|
492 |
fn=render_table,
|
493 |
+
inputs=[results_df_full, selected_run_checkpoint, metric_names, filter_different, n_samples_input],
|
494 |
outputs=[results_df, num_samples]
|
495 |
)
|
496 |
|
497 |
gr.on(
|
498 |
+
triggers=[metric_names.input, filter_different.change, n_samples_input.change],
|
499 |
fn=render_table,
|
500 |
+
inputs=[results_df_full, selected_run_checkpoint, metric_names, filter_different, n_samples_input],
|
501 |
outputs=[results_df, num_samples]
|
502 |
)
|
503 |
|