allow multiple metrics + concise the cols
app.py
CHANGED
@@ -31,7 +31,8 @@ def get_task_type(df):
 def fix_df(df):
     # For some reason some metrics and predictions are stored as strings
     for col in ["predictions", "metrics", "choices", "gold", "gold_index"]:
-
+        if col in df.columns:
+            df[col] = [ast.literal_eval(x) if isinstance(x, str) else x for x in df[col].values]
     return df
 
 def get_run_name_seed(run_name):
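Note: the new guard matters because ast.literal_eval raises on anything that is not a string (or AST node), so already-parsed cells must be skipped. A minimal standalone sketch of the same pattern on made-up data:

import ast
import pandas as pd

# Toy frame mimicking the issue: list-valued cells that were serialized as strings.
df = pd.DataFrame({"predictions": ['["A", "B"]', ["C"]], "gold_index": ["[1]", [0]]})

for col in ["predictions", "metrics", "gold_index"]:
    if col in df.columns:  # "metrics" is absent in this toy frame, so it is simply skipped
        df[col] = [ast.literal_eval(x) if isinstance(x, str) else x for x in df[col].values]

print(df["predictions"].tolist())  # [['A', 'B'], ['C']]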
@@ -119,15 +120,18 @@ def fetch_run_results(repo_name, runs_to_fetch, checkpoint,
     return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict
 
 
-def
-    if df is None or not selected_runs or not
+def render_table(df, selected_runs, metric_names):
+    if df is None or not selected_runs or not metric_names:
         return None
-    kept_metrics = [f"metric_{metric_name}_{run_name}" for run_name in selected_runs]
+    kept_metrics = [f"metric_{metric_name}_{run_name}" for run_name in selected_runs for metric_name in metric_names]
     other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
     df = df.drop(columns=other_metrics)
     # widths = get_column_widths(df)
-    df =
-
+    df = shorten_column_names(df, selected_runs, metric_names)
+
+    # Sample 100
+    df = df.sample(n=min(100, len(df)), random_state=42)
+    return df
 
 def get_column_widths(df):
     column_widths = []
@@ -143,17 +147,26 @@ def get_column_widths(df):
     return column_widths
 
 
-def
+def shorten_column_names(df, run_names: list[str], metric_names: list[str]):
     """
     Turns metric columns (metric_{metric}_{run_name}) into {metric}_i
+    Turns generation_{run_name} into generation_i
     """
-    #
+    # Handle metric columns
+    # Aggregate columns to rename
+    columns_to_rename = {}
     for idx, run_name in enumerate(run_names):
-
-
-
-
-
+        for metric_name in metric_names:
+            original_metric_column = f"metric_{metric_name}_{run_name}"
+            if original_metric_column in df.columns:
+                columns_to_rename[original_metric_column] = f"{metric_name}_{idx}"
+
+        original_generation_column = f"generation_{run_name}"
+        if original_generation_column in df.columns:
+            columns_to_rename[original_generation_column] = f"generation_{idx}"
+
+    # Rename columns in a single operation
+    df = df.rename(columns=columns_to_rename)
     return df
 
 
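Note: to make the renaming concrete, a small self-contained sketch of the metric_{metric}_{run_name} -> {metric}_{idx} and generation_{run_name} -> generation_{idx} mapping described in the docstring, using made-up run and metric names:

import pandas as pd

run_names = ["run_a", "run_b"]
metric_names = ["acc", "f1"]

df = pd.DataFrame(columns=[
    "full_prompt",
    "metric_acc_run_a", "metric_f1_run_a", "generation_run_a",
    "metric_acc_run_b", "metric_f1_run_b", "generation_run_b",
])

columns_to_rename = {}
for idx, run_name in enumerate(run_names):
    for metric_name in metric_names:
        col = f"metric_{metric_name}_{run_name}"
        if col in df.columns:
            columns_to_rename[col] = f"{metric_name}_{idx}"
    gen_col = f"generation_{run_name}"
    if gen_col in df.columns:
        columns_to_rename[gen_col] = f"generation_{idx}"

print(list(df.rename(columns=columns_to_rename).columns))
# ['full_prompt', 'acc_0', 'f1_0', 'generation_0', 'acc_1', 'f1_1', 'generation_1']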
@@ -192,16 +205,23 @@ def load_task_data(repo_name, runs_to_fetch, checkpoint, task_name, tasks_files,
 
         if task_type == "multiple_choice":
             n_choices = len(df['choices'])
-            return
+            return [pred[0] for pred in predictions[:n_choices]]
 
         if task_type == "mixed":
             return predictions[0]
 
         return predictions
+
+    generative_columns = {
+        f"generation_{run_name}": df.apply(partial(get_choice_predictions, task_type=task_type), axis=1)
+    } if task_type == "generative" or task_type == "mixed" else {}
+
+
+
 
     prepared_df = pd.DataFrame({
         'full_prompt': df['full_prompt'],
-
+        **generative_columns,
     })
     # For some reason some metrics are stored as strings
     metrics = df['metrics']
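Note: the **generative_columns splat is what makes the generation column appear only for generative and mixed tasks. A tiny standalone illustration of that conditional-column pattern, with toy rows and a simplified stand-in for get_choice_predictions:

import pandas as pd
from functools import partial

df = pd.DataFrame({"full_prompt": ["p1", "p2"], "predictions": [["yes"], ["no"]]})
task_type = "generative"  # toy value; "multiple_choice" would skip the extra column

def get_choice_predictions(row, task_type):
    # Simplified stand-in for the real helper: just surface the first prediction.
    return row["predictions"][0]

generative_columns = {
    "generation_run_a": df.apply(partial(get_choice_predictions, task_type=task_type), axis=1)
} if task_type in ("generative", "mixed") else {}

prepared_df = pd.DataFrame({"full_prompt": df["full_prompt"], **generative_columns})
print(list(prepared_df.columns))  # ['full_prompt', 'generation_run_a']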
@@ -213,10 +233,13 @@ def load_task_data(repo_name, runs_to_fetch, checkpoint, task_name, tasks_files,
     def get_gold_label(df, task_type):
         if task_type == "generative":
             return df['gold']
-        return
+        return df['gold_index']
 
     # Prepare the first DataFrame with choices and gold
-    combined_df = dfs[0][['full_prompt'
+    combined_df = dfs[0][['full_prompt']].set_index('full_prompt')
+    if task_type in ["multiple_choice", "mixed"]:
+        combined_df["choices"] = dfs[0]["choices"].values
+
     combined_df['gold'] = dfs[0].apply(lambda row: get_gold_label(row, task_type), axis=1).values
 
     # Join all prepared DataFrames
@@ -227,32 +250,9 @@ def load_task_data(repo_name, runs_to_fetch, checkpoint, task_name, tasks_files,
 
     available_metrics = list(set("_".join(col.split('_')[1:-1]) for col in combined_df.columns if col.startswith("metric_")))
     combined_df = combined_df.reset_index()
+    chosen_metrics = available_metrics[:1]
 
-    return combined_df,
+    return combined_df, render_table(combined_df, runs_to_fetch, chosen_metrics), gr.update(choices=available_metrics, value=chosen_metrics)
-
-def render_results_table(df: pd.DataFrame):
-    if df is None or df.empty:
-        return None
-
-    # Select a subset of 100 examples
-    df_subset = df.sample(n=min(100, len(df)), random_state=42)
-
-    # Prepare the data for display
-    display_data = []
-    for _, row in df_subset.iterrows():
-        example_data = {
-            'text': row['example'],
-            'choices': row['choices'],
-            'gold_index': row['gold_index'],
-        }
-        for run in df['run'].unique():
-            run_data = df[(df['run'] == run) & (df['example'] == row['example'])]
-            if not run_data.empty:
-                example_data[f'{run}_prediction'] = run_data['predictions'].values[0]
-                example_data[f'{run}_score'] = run_data['metrics'].values[0]
-        display_data.append(example_data)
-
-    return pd.DataFrame(display_data)
 
 with gr.Blocks() as demo:
     runs_checkpoints = gr.State({})
@@ -275,7 +275,7 @@ with gr.Blocks() as demo:
     checkpoint = gr.Dropdown(choices=[], interactive=True, label="Checkpoint")
     fetch_res = gr.Button("Fetch results")
     task_name = gr.Dropdown(choices=[], interactive=True, label="Task name")
-
+    metric_names = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Metric")
     results_df = gr.Dataframe(interactive=False, wrap=True)
 
     # Run selection
@@ -316,13 +316,13 @@ with gr.Blocks() as demo:
         triggers=[task_name.change],
         fn=load_task_data,
         inputs=[repo, selected_runs, checkpoint, task_name, tasks_files],
-        outputs=[results_df_full, results_df,
+        outputs=[results_df_full, results_df, metric_names]
     )
 
     gr.on(
-        triggers=[
-        fn=
-        inputs=[results_df_full, selected_runs,
+        triggers=[metric_names.change],
+        fn=render_table,
+        inputs=[results_df_full, selected_runs, metric_names],
         outputs=[results_df]
     )
 
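Note: with this wiring, changing the metric selection re-renders the visible table from the cached full DataFrame without reloading the task data. A minimal self-contained sketch of the same gr.on pattern (hypothetical component names and a much-simplified render_table, not the Space's actual app):

import gradio as gr
import pandas as pd

# Stand-in for the cached full results table held in the app's state.
FULL_DF = pd.DataFrame({"prompt": ["p1", "p2"], "metric_acc": [1.0, 0.0], "metric_f1": [0.5, 0.7]})

def render_table(metrics: list[str]) -> pd.DataFrame:
    # Keep the prompt plus whichever metric columns are currently selected.
    if not metrics:
        return FULL_DF[["prompt"]]
    return FULL_DF[["prompt"] + [f"metric_{m}" for m in metrics]]

with gr.Blocks() as demo:
    metric_names = gr.Dropdown(choices=["acc", "f1"], value=["acc"], multiselect=True, label="Metric")
    results_df = gr.Dataframe(value=render_table(["acc"]), interactive=False, wrap=True)

    gr.on(
        triggers=[metric_names.change],
        fn=render_table,
        inputs=[metric_names],
        outputs=[results_df],
    )

if __name__ == "__main__":
    demo.launch()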