allow multiple metrics + concise the cols
app.py
CHANGED
@@ -31,7 +31,8 @@ def get_task_type(df):
 def fix_df(df):
     # For some reason some metrics and predictions are stored as strings
     for col in ["predictions", "metrics", "choices", "gold", "gold_index"]:
-
+        if col in df.columns:
+            df[col] = [ast.literal_eval(x) if isinstance(x, str) else x for x in df[col].values]
     return df
 
 def get_run_name_seed(run_name):
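Note: the new guard matters because ast.literal_eval raises on anything that is not a string (or AST node), so already-parsed cells must be skipped. A minimal standalone sketch of the same pattern on made-up data:

import ast
import pandas as pd

# Toy frame mimicking the issue: list-valued cells that were serialized as strings.
df = pd.DataFrame({"predictions": ['["A", "B"]', ["C"]], "gold_index": ["[1]", [0]]})

for col in ["predictions", "metrics", "gold_index"]:
    if col in df.columns:  # "metrics" is absent in this toy frame, so it is simply skipped
        df[col] = [ast.literal_eval(x) if isinstance(x, str) else x for x in df[col].values]

print(df["predictions"].tolist())  # [['A', 'B'], ['C']]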
@@ -119,15 +120,18 @@ def fetch_run_results(repo_name, runs_to_fetch, checkpoint,
     return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict
 
 
-def
-    if df is None or not selected_runs or not
+def render_table(df, selected_runs, metric_names):
+    if df is None or not selected_runs or not metric_names:
         return None
-    kept_metrics = [f"metric_{metric_name}_{run_name}" for run_name in selected_runs]
+    kept_metrics = [f"metric_{metric_name}_{run_name}" for run_name in selected_runs for metric_name in metric_names]
     other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
     df = df.drop(columns=other_metrics)
     # widths = get_column_widths(df)
-    df =
-
+    df = shorten_column_names(df, selected_runs, metric_names)
+
+    # Sample 100
+    df = df.sample(n=min(100, len(df)), random_state=42)
+    return df
 
 def get_column_widths(df):
     column_widths = []
@@ -143,17 +147,26 @@ def get_column_widths(df):
     return column_widths
 
 
-def
+def shorten_column_names(df, run_names: list[str], metric_names: list[str]):
     """
     Turns metric columns (metric_{metric}_{run_name}) into {metric}_i
+    Turns generation_{run_name} into generation_i
     """
-    #
+    # Handle metric columns
+    # Aggregate columns to rename
+    columns_to_rename = {}
     for idx, run_name in enumerate(run_names):
-
-
-
-
-
+        for metric_name in metric_names:
+            original_metric_column = f"metric_{metric_name}_{run_name}"
+            if original_metric_column in df.columns:
+                columns_to_rename[original_metric_column] = f"{metric_name}_{idx}"
+
+        original_generation_column = f"generation_{run_name}"
+        if original_generation_column in df.columns:
+            columns_to_rename[original_generation_column] = f"generation_{idx}"
+
+    # Rename columns in a single operation
+    df = df.rename(columns=columns_to_rename)
     return df
 
 
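Note: to make the renaming concrete, a small self-contained sketch of the metric_{metric}_{run_name} -> {metric}_{idx} and generation_{run_name} -> generation_{idx} mapping described in the docstring, using made-up run and metric names:

import pandas as pd

run_names = ["run_a", "run_b"]
metric_names = ["acc", "f1"]

df = pd.DataFrame(columns=[
    "full_prompt",
    "metric_acc_run_a", "metric_f1_run_a", "generation_run_a",
    "metric_acc_run_b", "metric_f1_run_b", "generation_run_b",
])

columns_to_rename = {}
for idx, run_name in enumerate(run_names):
    for metric_name in metric_names:
        col = f"metric_{metric_name}_{run_name}"
        if col in df.columns:
            columns_to_rename[col] = f"{metric_name}_{idx}"
    gen_col = f"generation_{run_name}"
    if gen_col in df.columns:
        columns_to_rename[gen_col] = f"generation_{idx}"

print(list(df.rename(columns=columns_to_rename).columns))
# ['full_prompt', 'acc_0', 'f1_0', 'generation_0', 'acc_1', 'f1_1', 'generation_1']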
@@ -192,16 +205,23 @@ def load_task_data(repo_name, runs_to_fetch, checkpoint, task_name, tasks_files,
 
         if task_type == "multiple_choice":
             n_choices = len(df['choices'])
-            return
+            return [pred[0] for pred in predictions[:n_choices]]
 
         if task_type == "mixed":
             return predictions[0]
 
         return predictions
+
+    generative_columns = {
+        f"generation_{run_name}": df.apply(partial(get_choice_predictions, task_type=task_type), axis=1)
+    } if task_type == "generative" or task_type == "mixed" else {}
+
+
+
 
     prepared_df = pd.DataFrame({
         'full_prompt': df['full_prompt'],
-
+        **generative_columns,
     })
     # For some reason some metrics are stored as strings
     metrics = df['metrics']
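Note: the **generative_columns splat is what makes the generation column appear only for generative and mixed tasks. A tiny standalone illustration of that conditional-column pattern, with toy rows and a simplified stand-in for get_choice_predictions:

import pandas as pd
from functools import partial

df = pd.DataFrame({"full_prompt": ["p1", "p2"], "predictions": [["yes"], ["no"]]})
task_type = "generative"  # toy value; "multiple_choice" would skip the extra column

def get_choice_predictions(row, task_type):
    # Simplified stand-in for the real helper: just surface the first prediction.
    return row["predictions"][0]

generative_columns = {
    "generation_run_a": df.apply(partial(get_choice_predictions, task_type=task_type), axis=1)
} if task_type in ("generative", "mixed") else {}

prepared_df = pd.DataFrame({"full_prompt": df["full_prompt"], **generative_columns})
print(list(prepared_df.columns))  # ['full_prompt', 'generation_run_a']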
@@ -213,10 +233,13 @@ def load_task_data(repo_name, runs_to_fetch, checkpoint, task_name, tasks_files,
     def get_gold_label(df, task_type):
         if task_type == "generative":
             return df['gold']
-        return
+        return df['gold_index']
 
     # Prepare the first DataFrame with choices and gold
-    combined_df = dfs[0][['full_prompt'
+    combined_df = dfs[0][['full_prompt']].set_index('full_prompt')
+    if task_type in ["multiple_choice", "mixed"]:
+        combined_df["choices"] = dfs[0]["choices"].values
+
     combined_df['gold'] = dfs[0].apply(lambda row: get_gold_label(row, task_type), axis=1).values
 
     # Join all prepared DataFrames
@@ -227,32 +250,9 @@ def load_task_data(repo_name, runs_to_fetch, checkpoint, task_name, tasks_files,
 
     available_metrics = list(set("_".join(col.split('_')[1:-1]) for col in combined_df.columns if col.startswith("metric_")))
     combined_df = combined_df.reset_index()
+    chosen_metrics = available_metrics[:1]
 
-    return combined_df,
+    return combined_df, render_table(combined_df, runs_to_fetch, chosen_metrics), gr.update(choices=available_metrics, value=chosen_metrics)
-
-def render_results_table(df: pd.DataFrame):
-    if df is None or df.empty:
-        return None
-
-    # Select a subset of 100 examples
-    df_subset = df.sample(n=min(100, len(df)), random_state=42)
-
-    # Prepare the data for display
-    display_data = []
-    for _, row in df_subset.iterrows():
-        example_data = {
-            'text': row['example'],
-            'choices': row['choices'],
-            'gold_index': row['gold_index'],
-        }
-        for run in df['run'].unique():
-            run_data = df[(df['run'] == run) & (df['example'] == row['example'])]
-            if not run_data.empty:
-                example_data[f'{run}_prediction'] = run_data['predictions'].values[0]
-                example_data[f'{run}_score'] = run_data['metrics'].values[0]
-        display_data.append(example_data)
-
-    return pd.DataFrame(display_data)
 
 with gr.Blocks() as demo:
     runs_checkpoints = gr.State({})
@@ -275,7 +275,7 @@ with gr.Blocks() as demo:
     checkpoint = gr.Dropdown(choices=[], interactive=True, label="Checkpoint")
     fetch_res = gr.Button("Fetch results")
     task_name = gr.Dropdown(choices=[], interactive=True, label="Task name")
-
+    metric_names = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Metric")
     results_df = gr.Dataframe(interactive=False, wrap=True)
 
     # Run selection
@@ -316,13 +316,13 @@ with gr.Blocks() as demo:
         triggers=[task_name.change],
         fn=load_task_data,
         inputs=[repo, selected_runs, checkpoint, task_name, tasks_files],
-        outputs=[results_df_full, results_df,
+        outputs=[results_df_full, results_df, metric_names]
     )
 
     gr.on(
-        triggers=[
-        fn=
-        inputs=[results_df_full, selected_runs,
+        triggers=[metric_names.change],
+        fn=render_table,
+        inputs=[results_df_full, selected_runs, metric_names],
         outputs=[results_df]
     )
 
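Note: with this wiring, changing the metric selection re-renders the visible table from the cached full DataFrame without reloading the task data. A minimal self-contained sketch of the same gr.on pattern (hypothetical component names and a much-simplified render_table, not the Space's actual app):

import gradio as gr
import pandas as pd

# Stand-in for the cached full results table held in the app's state.
FULL_DF = pd.DataFrame({"prompt": ["p1", "p2"], "metric_acc": [1.0, 0.0], "metric_f1": [0.5, 0.7]})

def render_table(metrics: list[str]) -> pd.DataFrame:
    # Keep the prompt plus whichever metric columns are currently selected.
    if not metrics:
        return FULL_DF[["prompt"]]
    return FULL_DF[["prompt"] + [f"metric_{m}" for m in metrics]]

with gr.Blocks() as demo:
    metric_names = gr.Dropdown(choices=["acc", "f1"], value=["acc"], multiselect=True, label="Metric")
    results_df = gr.Dataframe(value=render_table(["acc"]), interactive=False, wrap=True)

    gr.on(
        triggers=[metric_names.change],
        fn=render_table,
        inputs=[metric_names],
        outputs=[results_df],
    )

if __name__ == "__main__":
    demo.launch()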