hynky (HF staff) committed
Commit f14f2bb · Parent: 90e7c81

add support for split checkpoints
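
In short: with the split_checkpoints toggle off, the runs dropdown lists full "run/checkpoint" entries directly; with it on, runs and checkpoints are picked in separate dropdowns and recombined into "run/checkpoint" keys, which the rest of the app uses to build details paths (details/{run_checkpoint}/...). Below is a minimal, illustrative sketch of that recombination, mirroring the update_selected_run_checkpoint helper added in the diff; the combine_selection name and the sample values are invented for illustration and are not part of the app.

# Illustrative sketch only: mirrors update_selected_run_checkpoint from the diff below.
def combine_selection(selected_runs, selected_checkpoints, split_checkpoints):
    if not selected_runs:
        return []
    if not split_checkpoints:
        # The runs dropdown already holds "run/checkpoint" entries.
        return selected_runs
    # Cross-product of selected runs and selected checkpoints.
    return [f"{run}/{ckpt}" for run in selected_runs for ckpt in (selected_checkpoints or [])]

# Example with split mode on (hypothetical run/checkpoint names):
# combine_selection(["run-fr-1", "run-fr-2"], ["5000"], True)
# -> ["run-fr-1/5000", "run-fr-2/5000"]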

Files changed (1)
  1. app.py  +136 -61
app.py CHANGED
@@ -19,8 +19,11 @@ def is_arary_like(x):
 
 def get_task_type(df):
     # Compatibility with old lighteval
+    # [[Pour calculer le bénéfice net de C]] in new lighteval, "Pour calculer le bénéfice net de C" in old lighteval
     if all(isinstance(pred, str) or (is_arary_like(pred) and all(isinstance(item, str) for item in pred)) for pred in df['predictions'].iloc[0]):
         return "generative"
+
+    # [["1", "2"], ["3", "4"]] in new lighteval, ["1", "2"] in old lighteval
     if all(is_arary_like(pred) and all(isinstance(item, float) for item in pred) for pred in df['predictions'].iloc[0]):
         return "multiple_choice"
     return "mixed"
@@ -44,7 +47,8 @@ def get_run_name_seed(run_name):
     run_name, seed = run_name.split("-seed-")
     return run_name, int(seed)
 
-def fetch_repo_structure(results_uri, oauth_token: gr.OAuthToken | None = None):
+
+def fetch_repo_structure(results_uri, split_checkpoints=False, oauth_token: gr.OAuthToken | None = None):
     token = os.environ.get(FALLBACK_TOKEN_NAME)
     if oauth_token:
         token = oauth_token.token
@@ -63,12 +67,16 @@ def fetch_repo_structure(results_uri, oauth_token: gr.OAuthToken | None = None):
         results = list(executor.map(process_run, runs))
 
     checkpoints_dict = dict(results)
+    runs = list(checkpoints_dict.keys())
 
-    return checkpoints_dict, gr.update(choices=list(checkpoints_dict), value=None)
+    if not split_checkpoints:
+        runs = [f"{run}/{checkpoint}" for run, checkpoints in checkpoints_dict.items() for checkpoint in checkpoints]
 
-def update_checkpoints(selected_runs, checkpoints):
-    if not selected_runs:
-        return gr.update(choices=[], value=None)
+    return checkpoints_dict, gr.update(choices=runs, value=[])
+
+def update_checkpoints(selected_runs, checkpoints, split_checkpoints):
+    if not selected_runs or not split_checkpoints:
+        return gr.update(choices=[], value=[])
 
     common_checkpoints = set(checkpoints[selected_runs[0]])
     for run in selected_runs[1:]:
@@ -76,7 +84,8 @@ def update_checkpoints(selected_runs, checkpoints):
 
     common_checkpoints = sorted(list(common_checkpoints))
 
-    return gr.update(choices=common_checkpoints, value=common_checkpoints[0] if common_checkpoints else None)
+    return gr.update(choices=common_checkpoints, value=[common_checkpoints[0]] if common_checkpoints else [])
+
 
 
 def select_runs_by_regex(runs, current_selected, regex_to_select):
@@ -89,15 +98,15 @@ def select_runs_by_language(runs, current_selected, language):
         return select_runs_by_regex(runs, current_selected, f".*-{language}-.*")
     return current_selected
 
-def fetch_available_tasks(results_uri, runs_to_fetch, checkpoint) -> dict[str, dict[str, str]]:
+def fetch_available_tasks(results_uri, selected_run_checkpoint: list[str]) -> dict[str, dict[str, str]]:
    token = os.environ.get(FALLBACK_TOKEN_NAME)
 
    data_folder = DataFolder(results_uri, token=token)
    all_tasks = defaultdict(lambda: defaultdict(dict))
 
-    for run in runs_to_fetch:
+    for run_checkpoint in selected_run_checkpoint:
        try:
-            details_folder = f"details/{run}/{checkpoint}"
+            details_folder = f"details/{run_checkpoint}"
            files = data_folder.list_files(details_folder, recursive=True)
            parquet_files = [f.removeprefix(details_folder + "/") for f in files if f.endswith('.parquet')]
 
@@ -105,52 +114,73 @@ def fetch_available_tasks(results_uri, runs_to_fetch, checkpoint) -> dict[str, d
                task_name, date_str = full_filename.replace('.parquet', '').rsplit('_', 1)
                date = datetime.strptime(date_str, '%Y-%m-%dT%H-%M-%S.%f')
 
-                if run not in all_tasks[task_name] or date > all_tasks[task_name][run]['date']:
-                    all_tasks[task_name][run] = {'filename': full_filename, 'date': date}
+                if run_checkpoint not in all_tasks[task_name] or date > all_tasks[task_name][run_checkpoint]['date']:
+                    all_tasks[task_name][run_checkpoint] = {'filename': full_filename, 'date': date}
        except FileNotFoundError:
-            print(f"Checkpoint not found for run: {run}")
-
+            print(f"Checkpoint not found for run: {run_checkpoint}")
 
+    # Get tasks that have data for all selected runs
    available_tasks = {
-        task: {run: info['filename'] for run, info in runs.items()}
-        for task, runs in all_tasks.items()
-        if set(runs.keys()) == set(runs_to_fetch)
+        task: {run_checkpoint: info['filename'] for run_checkpoint, info in runs_info.items()}
+        for task, runs_info in all_tasks.items()
+        if set(runs_info.keys()) == set(selected_run_checkpoint)
    }
 
    return available_tasks
 
-def fetch_run_results(results_uri, runs_to_fetch, checkpoint,
+def fetch_run_results(results_uri, selected_run_checkpoint: list[str],
                      oauth_token: gr.OAuthToken | None = None, progress=gr.Progress()):
-    task_runs_dict = fetch_available_tasks(results_uri, runs_to_fetch, checkpoint)
+    task_runs_dict = fetch_available_tasks(results_uri, selected_run_checkpoint)
    task_names = list(task_runs_dict.keys())
    return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict
 
 
-def render_table(df, selected_runs, metric_names):
-    if df is None or not selected_runs or not metric_names:
+def render_table(df, selected_run_checkpoint: list[str], metric_names):
+    if df is None or not selected_run_checkpoint or not metric_names:
        return None, "0"
 
-    kept_metrics = [f"metric_{metric_name}_{run_name}" for run_name in selected_runs for metric_name in metric_names]
+    kept_metrics = [f"metric_{metric_name}_{run_checkpoint}"
+                    for run_checkpoint in selected_run_checkpoint
+                    for metric_name in metric_names]
    other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
    df = df.drop(columns=other_metrics)
-    df = shorten_column_names(df, selected_runs, metric_names)
+    df = shorten_column_names(df, selected_run_checkpoint, metric_names)
 
    # Sample 100
    n_samples = len(df)
    df = df.sample(n=min(100, len(df)), random_state=42)
-    return df, str(n_samples)
+
+    # Get column widths for better display
+    column_widths = get_column_widths(df)
+    return gr.Dataframe(
+        value=df,
+        column_widths=column_widths
+    ), str(n_samples)
+
+def update_selected_run_checkpoint(selected_runs: list[str] | None, selected_checkpoint: list[str] | None, split_checkpoints: bool):
+    if not selected_runs:
+        return []
+
+    # In this case we simply return the selected runs which already contain checkpoints
+    if not split_checkpoints:
+        return selected_runs
+
+    # Otherwise combine runs with checkpoints
+    return [f"{run}/{checkpoint}" for run in selected_runs for checkpoint in (selected_checkpoint if selected_checkpoint else [])]
+
 
 def get_column_widths(df):
    column_widths = []
    for col in df.columns:
        if col == "prompt":
-            column_widths.append("300px")
+            column_widths.append("300px")  # Fixed width with overflow
+        elif col.startswith("generation_"):
+            column_widths.append("200px")
        elif col in ["choices", "gold"]:
-            column_widths.append("250px")
-        elif col.startswith("metric_"):
-            column_widths.append("50px")
+            column_widths.append("100px")
        else:
-            column_widths.append("200px")  # Default width for other columns
+            # Metrics
+            column_widths.append("50px")  # Default width for other columns
    return column_widths
 
 
@@ -158,7 +188,7 @@ def shorten_column_names(df, run_names: list[str], metric_names: list[str]):
    """
    Turns metric columns (metric_{metric}_{run_name}) into {metric}_i
    Turns generation_{run_name} into generation_i
-    Also truncates full_prompt column to 200 chars with expandable view
+    Also truncates full_prompt and generation columns to 100 chars with expandable view
    """
    # Handle metric columns
    columns_to_rename = {}
@@ -175,37 +205,54 @@ def shorten_column_names(df, run_names: list[str], metric_names: list[str]):
    # Rename columns in a single operation
    df = df.rename(columns=columns_to_rename)
 
-    # Add markdown formatting to full_prompt column for truncation with expansion
+    # Add markdown formatting to prompt and generation columns for truncation with expansion
+    def truncate_with_details(text: str | list[str]):
+        if is_arary_like(text) and all(isinstance(item, str) for item in text):
+            return [truncate_with_details(item) for item in text]
+        elif isinstance(text, str):
+            text = text.replace('\n', ' ').strip()  # Replace newlines with spaces
+            if len(text) <= 100:
+                return text
+            return f"""<details><summary>{text[:100]}...</summary>\n\n{text[100:]}</details>"""
+
+        return text
+
    if 'prompt' in df.columns:
-        df['prompt'] = df['prompt'].apply(
-            lambda x: f"<details><summary>{x[:100]}...</summary>\n\n{x}</details>" if len(x) > 100 else x
-        )
+        df['prompt'] = df['prompt'].apply(truncate_with_details)
+
+    # Apply the same truncation to all generation columns
+    generation_columns = [col for col in df.columns if col.startswith('generation_')]
+
+    for col in generation_columns:
+        df[col] = df[col].apply(truncate_with_details)
 
    return df
 
 
-def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_files, prompt_column, progress=gr.Progress()):
+def unwrap_selected_run_checkpoint(selected_run_checkpoint: list[str]) -> list[str]:
+    return selected_run_checkpoint  # Now just returns the list directly
+
+def load_task_data(results_uri, selected_run_checkpoint: list[str], task_name, tasks_files, prompt_column, progress=gr.Progress()):
    token = os.environ.get(FALLBACK_TOKEN_NAME)
-    if not runs_to_fetch or not task_name:
+    if not selected_run_checkpoint or not task_name:
        return None, None
 
-
-
    data_folder = DataFolder(f"filecache::{results_uri}", token=token, cache_storage="./results-cache")
 
-    def fetch_run_file(run_to_fetch):
-        file_path = f"details/{run_to_fetch}/{checkpoint}/{tasks_files[task_name][run_to_fetch]}"
+    def fetch_run_file(run_checkpoint):
+        file_path = f"details/{run_checkpoint}/{tasks_files[task_name][run_checkpoint]}"
        try:
            with data_folder.open(file_path, "rb") as f:
                df = pd.read_parquet(f)
-            return df, run_to_fetch
+            return df, run_checkpoint
        except FileNotFoundError:
-            print(f"File not found: {tasks_files[task_name][run_to_fetch]}")
-            return None, run_to_fetch
+            print(f"File not found: {tasks_files[task_name][run_checkpoint]}")
+            return None, run_checkpoint
 
    with ThreadPoolExecutor() as pool:
-        results = list(progress.tqdm(pool.map(fetch_run_file, runs_to_fetch), total=len(runs_to_fetch),
-                                     desc="Fetching run data..."))
+        results = list(progress.tqdm(pool.map(fetch_run_file, selected_run_checkpoint),
+                                     total=len(selected_run_checkpoint),
+                                     desc="Fetching run data..."))
 
    dfs = [fix_df(df) for df, _ in results if df is not None]
    run_names = [run for _, run in results if run is not None]
@@ -215,9 +262,20 @@ def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_file
 
    task_type = get_task_type(dfs[0])
    def prepare_df(df, run_name, task_type, prompt_column):
+        # Mixed in lighteval-old will look like this: ['광', -13.964999198913574, -13.539217948913574, -13.964999198913574, -13.539217948913574, -12.90467357635498, -13.07825756072998]
+        # Generative in lighteval-old will look like this "prediction"
+        # Multiple choice in lighteval-old will look like this ["choice1", "choice2"]
+        # [np.float64(-132.9295196533203), np.float64(-207.1309356689453), np.float64(-186.64553833007812), np.float64(-230.01414489746094), np.float64(-132.9295196533203), np.float64(-207.1309356689453), np.float64(-186.64553833007812), np.float64(-230.01414489746094), np.float64(-128.63824462890625), np.float64(-203.9550018310547), np.float64(-185.35267639160156), np.float64(-228.23837280273438)]
+
+        # For the new lighteval we have:
+        # Generative: [[Pour calculer le bénéfice net de C]]
+
        def get_choice_predictions(df, task_type):
            predictions = df['predictions']
            if task_type == "generative":
+                # This is strange representation in new lighteval...
+                if is_arary_like(predictions) and all(is_arary_like(item) for item in predictions):
+                    return predictions[0]
                return predictions
 
            if task_type == "multiple_choice":
@@ -284,9 +342,10 @@ def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_file
    return combined_df, gr.update(choices=available_metrics, value=chosen_metrics)
 
 with gr.Blocks() as demo:
-    runs_checkpoints = gr.State({})
+    available_runs_checkpoints = gr.State({})
    results_df_full = gr.State(None)
    tasks_files = gr.State({})
+    selected_run_checkpoint = gr.State([])
    login_button = gr.LoginButton(visible=False)
    results_uri = gr.Textbox(label="Results URI", value="s3://fineweb-multilingual-v1/evals/test/", visible=True)
    with gr.Column():
@@ -301,8 +360,10 @@ with gr.Blocks() as demo:
        select_by_language = gr.Dropdown(choices=["ar", "fr", "ru", "hi", "th", "tr", "zh", "sw", "te"],
                                         interactive=True, label="Select by language",
                                         info="Choose a language to prefill the regex")
-        selected_runs = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Selected runs")
-        checkpoint = gr.Dropdown(choices=[], interactive=True, label="Checkpoint", visible=True)
+        with gr.Row() as run_selection_row:
+            selected_runs = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Selected runs")
+            checkpoint = gr.Dropdown(choices=[], interactive=True, label="Checkpoint", multiselect=True)
+
        fetch_res = gr.Button("Fetch results")
        task_name = gr.Dropdown(choices=[], interactive=True, label="Task name")
        metric_names = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Metric")
@@ -310,7 +371,8 @@ with gr.Blocks() as demo:
                                 interactive=False,
                                 wrap=True,
                                 line_breaks=True,
-                                 datatype="markdown"
+                                 datatype="markdown",
+                                 column_widths=get_column_widths(pd.DataFrame())  # Initialize with empty dataframe
                                 )
    with gr.Row():
        with gr.Column():
@@ -319,63 +381,76 @@ with gr.Blocks() as demo:
 
    # Run selection
    gr.on(
-        triggers=[results_uri.change],
-        fn=fetch_repo_structure, inputs=[results_uri], outputs=[runs_checkpoints, selected_runs],
+        triggers=[split_checkpoints.change],
+        fn=lambda split_checkpoints: gr.update(visible=split_checkpoints),
+        inputs=[split_checkpoints],
+        outputs=[checkpoint]
+    )
+    gr.on(
+        triggers=[results_uri.change, split_checkpoints.change],
+        fn=fetch_repo_structure, inputs=[results_uri, split_checkpoints], outputs=[available_runs_checkpoints, selected_runs],
    )
    gr.on(
        triggers=[select_by_regex_button.click],
        fn=select_runs_by_regex,
-        inputs=[runs_checkpoints, selected_runs, select_by_regex_text], outputs=[selected_runs]
+        inputs=[available_runs_checkpoints, selected_runs, select_by_regex_text], outputs=[selected_runs]
    )
    gr.on(
        triggers=[select_by_language.change],
        fn=select_runs_by_language,
-        inputs=[runs_checkpoints, selected_runs, select_by_language], outputs=[selected_runs]
+        inputs=[available_runs_checkpoints, selected_runs, select_by_language], outputs=[selected_runs]
    )
 
    # Update checkpoints based on selected runs
    gr.on(
        triggers=[selected_runs.change],
        fn=update_checkpoints,
-        inputs=[selected_runs, runs_checkpoints],
+        inputs=[selected_runs, available_runs_checkpoints, split_checkpoints],
        outputs=[checkpoint]
    )
+
+    gr.on(
+        triggers=[checkpoint.change, selected_runs.change],
+        fn=update_selected_run_checkpoint,
+        inputs=[selected_runs, checkpoint, split_checkpoints],
+        outputs=[selected_run_checkpoint]
+    )
 
    # Fetch available tasks
    gr.on(
        triggers=[fetch_res.click],
        fn=fetch_run_results,
-        inputs=[results_uri, selected_runs, checkpoint],
+        inputs=[results_uri, selected_run_checkpoint],
        outputs=[task_name, tasks_files]
    ).then(
        fn=load_task_data,
-        inputs=[results_uri, selected_runs, checkpoint, task_name, tasks_files, prompt_column],
+        inputs=[results_uri, selected_run_checkpoint, task_name, tasks_files, prompt_column],
        outputs=[results_df_full, metric_names]
    ).then(
        fn=render_table,
-        inputs=[results_df_full, selected_runs, metric_names],
+        inputs=[results_df_full, selected_run_checkpoint, metric_names],
        outputs=[results_df, num_samples]
    )
 
    # Update results when task name or metric changes
    gr.on(
-        triggers=[task_name.input],
+        triggers=[task_name.input, prompt_column.input],
        fn=load_task_data,
-        inputs=[results_uri, selected_runs, checkpoint, task_name, tasks_files, prompt_column],
+        inputs=[results_uri, selected_run_checkpoint, task_name, tasks_files, prompt_column],
        outputs=[results_df_full, metric_names]
    ).then(
        fn=render_table,
-        inputs=[results_df_full, selected_runs, metric_names],
+        inputs=[results_df_full, selected_run_checkpoint, metric_names],
        outputs=[results_df, num_samples]
    )
 
    gr.on(
        triggers=[metric_names.input],
        fn=render_table,
-        inputs=[results_df_full, selected_runs, metric_names],
+        inputs=[results_df_full, selected_run_checkpoint, metric_names],
        outputs=[results_df, num_samples]
    )
 
-    demo.load(fn=fetch_repo_structure, inputs=[results_uri], outputs=[runs_checkpoints, selected_runs])
+    demo.load(fn=fetch_repo_structure, inputs=[results_uri, split_checkpoints], outputs=[available_runs_checkpoints, selected_runs])
 
 demo.launch()