Tasks-Explorer

Running

App Files Files Community

hynky HF staff committed on Nov 28, 2024

Commit

90e7c81

1 Parent(s): da65ce9

small prompt + only show prompt

Browse files

Files changed (1) hide show

app.py +60 -36

app.py CHANGED Viewed

@@ -18,7 +18,8 @@ def is_arary_like(x):
     return isinstance(x, list) or isinstance(x, tuple) or isinstance(x, np.ndarray)
 def get_task_type(df):
-    if all(isinstance(pred, str) for pred in df['predictions'].iloc[0]):
         return "generative"
     if all(is_arary_like(pred) and all(isinstance(item, float) for item in pred) for pred in df['predictions'].iloc[0]):
         return "multiple_choice"
@@ -31,7 +32,10 @@ def fix_df(df):
             df[col] = [ast.literal_eval(x) if isinstance(x, str) else x for x in df[col].values]
         if col == "predictions":
             df[col] = df[col].apply(lambda x: [[z[0] for z in x]] if is_arary_like(x) and len(x[0]) == 2 else x)
     return df
 def get_run_name_seed(run_name):
@@ -116,8 +120,7 @@ def fetch_available_tasks(results_uri, runs_to_fetch, checkpoint) -> dict[str, d
     return available_tasks
 def fetch_run_results(results_uri, runs_to_fetch, checkpoint,
-                      oauth_token: gr.OAuthToken | None = None, progress=gr.Progress()):
     task_runs_dict = fetch_available_tasks(results_uri, runs_to_fetch, checkpoint)
     task_names = list(task_runs_dict.keys())
     return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict
@@ -126,21 +129,21 @@ def fetch_run_results(results_uri, runs_to_fetch, checkpoint,
 def render_table(df, selected_runs, metric_names):
     if df is None or not selected_runs or not metric_names:
         return None, "0"
     kept_metrics = [f"metric_{metric_name}_{run_name}" for run_name in selected_runs for metric_name in metric_names]
     other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
     df = df.drop(columns=other_metrics)
-    # widths = get_column_widths(df)
     df = shorten_column_names(df, selected_runs, metric_names)
     # Sample 100
     n_samples = len(df)
     df = df.sample(n=min(100, len(df)), random_state=42)
-    return df, n_samples
 def get_column_widths(df):
     column_widths = []
     for col in df.columns:
-        if col == "full_prompt":
             column_widths.append("300px")
         elif col in ["choices", "gold"]:
             column_widths.append("250px")
@@ -155,9 +158,9 @@ def shorten_column_names(df, run_names: list[str], metric_names: list[str]):
     """
     Turns metric columns (metric_{metric}_{run_name}) into {metric}_i
     Turns generation_{run_name} into generation_i
     """
     # Handle metric columns
-    # Aggregate columns to rename
     columns_to_rename = {}
     for idx, run_name in enumerate(run_names):
         for metric_name in metric_names:
@@ -171,13 +174,20 @@ def shorten_column_names(df, run_names: list[str], metric_names: list[str]):
     # Rename columns in a single operation
     df = df.rename(columns=columns_to_rename)
     return df
-def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_files, progress=gr.Progress()):
     token = os.environ.get(FALLBACK_TOKEN_NAME)
     if not runs_to_fetch or not task_name:
-        return None, None, None
@@ -204,9 +214,8 @@ def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_file
         return None, None, gr.update(choices=[], value=None)
     task_type = get_task_type(dfs[0])
-    def prepare_df(df, run_name, task_type):
         def get_choice_predictions(df, task_type):
-            # For some evals it's string for other it's list
             predictions = df['predictions']
             if task_type == "generative":
                 return predictions
@@ -223,24 +232,25 @@ def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_file
         generative_columns = {
             f"generation_{run_name}": df.apply(partial(get_choice_predictions, task_type=task_type), axis=1)
         } if task_type == "generative" or task_type == "mixed" else {}
         prepared_df = pd.DataFrame({
-            'full_prompt': df['full_prompt'],
             **generative_columns,
         })
         # For some reason some metrics are stored as strings
         metrics = df['metrics']
-        # Assume all metrics are the same
         available_metrics = set(metric for row_metrics in metrics for metric in row_metrics)
         for metric_key in available_metrics:
             prepared_df[f'metric_{metric_key}_{run_name}'] = [metric.get(metric_key, None) for metric in metrics]
         # Merge rows with the same full_prompt
-        prepared_df = prepared_df.groupby('full_prompt').agg(lambda x: next((item for item in x if item is not None), None)).reset_index()
-        return prepared_df.set_index('full_prompt')
     def get_gold_label(df, task_type):
         if task_type == "generative":
@@ -248,20 +258,27 @@ def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_file
         return df['gold_index']
     # Prepare the first DataFrame with choices and gold
-    combined_df = dfs[0][['full_prompt']].set_index('full_prompt')
     if task_type in ["multiple_choice", "mixed"]:
-        combined_df["choices"] = dfs[0]["choices"].values
-    combined_df['gold'] = dfs[0].apply(lambda row: get_gold_label(row, task_type), axis=1).values
-    # Join all prepared DataFrames
-    for df, run_name in zip(dfs, run_names):
-        prepared_df = prepare_df(df, run_name, task_type)
-        combined_df = combined_df.join(prepared_df, how='outer')
-    available_metrics = list(set("_".join(col.split('_')[1:-1]) for col in combined_df.columns if col.startswith("metric_")))
-    combined_df = combined_df.reset_index()
     chosen_metrics = available_metrics[:1]
     return combined_df, gr.update(choices=available_metrics, value=chosen_metrics)
@@ -274,24 +291,31 @@ with gr.Blocks() as demo:
     results_uri = gr.Textbox(label="Results URI", value="s3://fineweb-multilingual-v1/evals/test/", visible=True)
     with gr.Column():
         gr.Markdown("# FineWeb experiments results explorer")
         with gr.Row():
             with gr.Column():
                 select_by_regex_text = gr.Textbox(label="Regex to select runs",
-                                                  value="ind_minhash(-CC-MAIN-|_)\\d{4}-\\d{2}-seed.*")
                 select_by_regex_button = gr.Button("Select matching runs")
             with gr.Column():
                 select_by_language = gr.Dropdown(choices=["ar", "fr", "ru", "hi", "th", "tr", "zh", "sw", "te"],
-                                                 interactive=True, label="Select by language",
-                                                 info="Choose a language to prefill the regex")
         selected_runs = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Selected runs")
-        checkpoint = gr.Dropdown(choices=[], interactive=True, label="Checkpoint")
         fetch_res = gr.Button("Fetch results")
         task_name = gr.Dropdown(choices=[], interactive=True, label="Task name")
         metric_names = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Metric")
-        results_df = gr.Dataframe(interactive=False, wrap=True)
         with gr.Row():
             with gr.Column():
                 num_samples = gr.Text(interactive=False, label="# Samples")
     # Run selection
     gr.on(
@@ -325,7 +349,7 @@ with gr.Blocks() as demo:
         outputs=[task_name, tasks_files]
     ).then(
         fn=load_task_data,
-        inputs=[results_uri, selected_runs, checkpoint, task_name, tasks_files],
         outputs=[results_df_full, metric_names]
     ).then(
         fn=render_table,
@@ -337,7 +361,7 @@ with gr.Blocks() as demo:
     gr.on(
         triggers=[task_name.input],
         fn=load_task_data,
-        inputs=[results_uri, selected_runs, checkpoint, task_name, tasks_files],
         outputs=[results_df_full, metric_names]
     ).then(
         fn=render_table,

     return isinstance(x, list) or isinstance(x, tuple) or isinstance(x, np.ndarray)
 def get_task_type(df):
+    # Compatibility with old lighteval
+    if all(isinstance(pred, str) or (is_arary_like(pred) and all(isinstance(item, str) for item in pred)) for pred in df['predictions'].iloc[0]):
         return "generative"
     if all(is_arary_like(pred) and all(isinstance(item, float) for item in pred) for pred in df['predictions'].iloc[0]):
         return "multiple_choice"
             df[col] = [ast.literal_eval(x) if isinstance(x, str) else x for x in df[col].values]
         if col == "predictions":
+            # For multiple choice
             df[col] = df[col].apply(lambda x: [[z[0] for z in x]] if is_arary_like(x) and len(x[0]) == 2 else x)
+            # For unwraping of generative
+            df[col] = df[col].apply(lambda x: x[0] if is_arary_like(x) and len(x) == 1 else x)
     return df
 def get_run_name_seed(run_name):
     return available_tasks
 def fetch_run_results(results_uri, runs_to_fetch, checkpoint,
+                                   oauth_token: gr.OAuthToken | None = None, progress=gr.Progress()):
     task_runs_dict = fetch_available_tasks(results_uri, runs_to_fetch, checkpoint)
     task_names = list(task_runs_dict.keys())
     return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict
 def render_table(df, selected_runs, metric_names):
     if df is None or not selected_runs or not metric_names:
         return None, "0"
     kept_metrics = [f"metric_{metric_name}_{run_name}" for run_name in selected_runs for metric_name in metric_names]
     other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
     df = df.drop(columns=other_metrics)
     df = shorten_column_names(df, selected_runs, metric_names)
     # Sample 100
     n_samples = len(df)
     df = df.sample(n=min(100, len(df)), random_state=42)
+    return df, str(n_samples)
 def get_column_widths(df):
     column_widths = []
     for col in df.columns:
+        if col == "prompt":
             column_widths.append("300px")
         elif col in ["choices", "gold"]:
             column_widths.append("250px")
     """
     Turns metric columns (metric_{metric}_{run_name}) into {metric}_i
     Turns generation_{run_name} into generation_i
+    Also collapses prompt column values longer than 100 chars into an expandable <details> view
     """
     # Handle metric columns
     columns_to_rename = {}
     for idx, run_name in enumerate(run_names):
         for metric_name in metric_names:
     # Rename columns in a single operation
     df = df.rename(columns=columns_to_rename)
+    # Add markdown formatting to the prompt column for truncation with expansion
+    if 'prompt' in df.columns:
+        df['prompt'] = df['prompt'].apply(
+            lambda x: f"<details><summary>{x[:100]}...</summary>\n\n{x}</details>" if len(x) > 100 else x
+        )
     return df
+def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_files, prompt_column, progress=gr.Progress()):
     token = os.environ.get(FALLBACK_TOKEN_NAME)
     if not runs_to_fetch or not task_name:
+        return None, None
         return None, None, gr.update(choices=[], value=None)
     task_type = get_task_type(dfs[0])
+    def prepare_df(df, run_name, task_type, prompt_column):
         def get_choice_predictions(df, task_type):
             predictions = df['predictions']
             if task_type == "generative":
                 return predictions
         generative_columns = {
             f"generation_{run_name}": df.apply(partial(get_choice_predictions, task_type=task_type), axis=1)
         } if task_type == "generative" or task_type == "mixed" else {}
         prepared_df = pd.DataFrame({
+            'prompt': df[prompt_column],
+            'choices': df['choices'].apply(tuple),  # Convert lists to tuples
+            'gold': df['gold'].apply(lambda x: tuple(x) if isinstance(x, list) else x),  # Convert lists to tuples
+            'gold_index': df['gold_index'],
             **generative_columns,
         })
         # For some reason some metrics are stored as strings
         metrics = df['metrics']
         available_metrics = set(metric for row_metrics in metrics for metric in row_metrics)
         for metric_key in available_metrics:
             prepared_df[f'metric_{metric_key}_{run_name}'] = [metric.get(metric_key, None) for metric in metrics]
         # Merge rows with the same full_prompt
+        prepared_df = prepared_df.groupby('prompt').agg(lambda x: next((item for item in x if item is not None), None)).reset_index()
+        prepared_df["prompt"] = prepared_df["prompt"].astype(str)
+        return prepared_df
     def get_gold_label(df, task_type):
         if task_type == "generative":
         return df['gold_index']
     # Prepare the first DataFrame with choices and gold
+    # Join all prepared DataFrames
+    prepared_dfs = [
+        prepare_df(df, run_name, task_type, prompt_column)
+        for df, run_name in zip(dfs, run_names)
+    ]
+    combined_df = prepared_dfs[0]
+    for idx, prepared_df in enumerate(prepared_dfs[1:]):
+        combined_df = combined_df.merge(prepared_df, how='outer', on=("prompt", "gold"), suffixes=(None, f"_{idx}"))
+    to_keep = ["prompt", "gold"]
     if task_type in ["multiple_choice", "mixed"]:
+        to_keep.append("choices")
+    elif task_type == "generative":
+        to_keep.extend([col for col in combined_df.columns if col.startswith("generation_")])
+    combined_df['gold'] = combined_df.apply(lambda row: get_gold_label(row, task_type), axis=1).values
+    metric_cols = [col for col in combined_df.columns if col.startswith("metric_")]
+    combined_df = combined_df[to_keep + metric_cols]
+    available_metrics = list(set("_".join(col.split('_')[1:-1]) for col in metric_cols))
     chosen_metrics = available_metrics[:1]
     return combined_df, gr.update(choices=available_metrics, value=chosen_metrics)
     results_uri = gr.Textbox(label="Results URI", value="s3://fineweb-multilingual-v1/evals/test/", visible=True)
     with gr.Column():
         gr.Markdown("# FineWeb experiments results explorer")
+        split_checkpoints = gr.Checkbox(label="Split checkpoints from models", value=True)
         with gr.Row():
             with gr.Column():
                 select_by_regex_text = gr.Textbox(label="Regex to select runs",
+                                                value="ind_minhash(-CC-MAIN-|_)\\d{4}-\\d{2}-seed.*")
                 select_by_regex_button = gr.Button("Select matching runs")
             with gr.Column():
                 select_by_language = gr.Dropdown(choices=["ar", "fr", "ru", "hi", "th", "tr", "zh", "sw", "te"],
+                                               interactive=True, label="Select by language",
+                                               info="Choose a language to prefill the regex")
         selected_runs = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Selected runs")
+        checkpoint = gr.Dropdown(choices=[], interactive=True, label="Checkpoint", visible=True)
         fetch_res = gr.Button("Fetch results")
         task_name = gr.Dropdown(choices=[], interactive=True, label="Task name")
         metric_names = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Metric")
+        results_df = gr.Dataframe(
+            interactive=False,
+            wrap=True,
+            line_breaks=True,
+            datatype="markdown"
+        )
         with gr.Row():
             with gr.Column():
                 num_samples = gr.Text(interactive=False, label="# Samples")
+                prompt_column = gr.Radio(choices=["full_prompt", "example"], label="Prompt display", value="example")
     # Run selection
     gr.on(
         outputs=[task_name, tasks_files]
     ).then(
         fn=load_task_data,
+        inputs=[results_uri, selected_runs, checkpoint, task_name, tasks_files, prompt_column],
         outputs=[results_df_full, metric_names]
     ).then(
         fn=render_table,
     gr.on(
         triggers=[task_name.input],
         fn=load_task_data,
+        inputs=[results_uri, selected_runs, checkpoint, task_name, tasks_files, prompt_column],
         outputs=[results_df_full, metric_names]
     ).then(
         fn=render_table,