hynky (HF staff) committed on
Commit 90e7c81
1 Parent(s): da65ce9

small prompt + only show prompt

Files changed (1)
  1. app.py +60 -36
app.py CHANGED
@@ -18,7 +18,8 @@ def is_arary_like(x):
     return isinstance(x, list) or isinstance(x, tuple) or isinstance(x, np.ndarray)
 
 def get_task_type(df):
-    if all(isinstance(pred, str) for pred in df['predictions'].iloc[0]):
+    # Compatibility with old lighteval
+    if all(isinstance(pred, str) or (is_arary_like(pred) and all(isinstance(item, str) for item in pred)) for pred in df['predictions'].iloc[0]):
         return "generative"
     if all(is_arary_like(pred) and all(isinstance(item, float) for item in pred) for pred in df['predictions'].iloc[0]):
         return "multiple_choice"
@@ -31,7 +32,10 @@ def fix_df(df):
         df[col] = [ast.literal_eval(x) if isinstance(x, str) else x for x in df[col].values]
 
         if col == "predictions":
+            # For multiple choice
             df[col] = df[col].apply(lambda x: [[z[0] for z in x]] if is_arary_like(x) and len(x[0]) == 2 else x)
+            # For unwraping of generative
+            df[col] = df[col].apply(lambda x: x[0] if is_arary_like(x) and len(x) == 1 else x)
     return df
 
 def get_run_name_seed(run_name):
@@ -116,8 +120,7 @@ def fetch_available_tasks(results_uri, runs_to_fetch, checkpoint) -> dict[str, d
     return available_tasks
 
 def fetch_run_results(results_uri, runs_to_fetch, checkpoint,
-                      oauth_token: gr.OAuthToken | None = None, progress=gr.Progress()):
-
+                      oauth_token: gr.OAuthToken | None = None, progress=gr.Progress()):
     task_runs_dict = fetch_available_tasks(results_uri, runs_to_fetch, checkpoint)
     task_names = list(task_runs_dict.keys())
     return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict
@@ -126,21 +129,21 @@ def fetch_run_results(results_uri, runs_to_fetch, checkpoint,
 def render_table(df, selected_runs, metric_names):
     if df is None or not selected_runs or not metric_names:
         return None, "0"
+
     kept_metrics = [f"metric_{metric_name}_{run_name}" for run_name in selected_runs for metric_name in metric_names]
     other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
     df = df.drop(columns=other_metrics)
-    # widths = get_column_widths(df)
     df = shorten_column_names(df, selected_runs, metric_names)
 
     # Sample 100
     n_samples = len(df)
     df = df.sample(n=min(100, len(df)), random_state=42)
-    return df, n_samples
+    return df, str(n_samples)
 
 def get_column_widths(df):
     column_widths = []
     for col in df.columns:
-        if col == "full_prompt":
+        if col == "prompt":
             column_widths.append("300px")
         elif col in ["choices", "gold"]:
             column_widths.append("250px")
@@ -155,9 +158,9 @@ def shorten_column_names(df, run_names: list[str], metric_names: list[str]):
     """
     Turns metric columns (metric_{metric}_{run_name}) into {metric}_i
     Turns generation_{run_name} into generation_i
+    Also truncates full_prompt column to 200 chars with expandable view
     """
     # Handle metric columns
-    # Aggregate columns to rename
     columns_to_rename = {}
     for idx, run_name in enumerate(run_names):
         for metric_name in metric_names:
@@ -171,13 +174,20 @@ def shorten_column_names(df, run_names: list[str], metric_names: list[str]):
 
     # Rename columns in a single operation
     df = df.rename(columns=columns_to_rename)
+
+    # Add markdown formatting to full_prompt column for truncation with expansion
+    if 'prompt' in df.columns:
+        df['prompt'] = df['prompt'].apply(
+            lambda x: f"<details><summary>{x[:100]}...</summary>\n\n{x}</details>" if len(x) > 100 else x
+        )
+
     return df
 
 
-def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_files, progress=gr.Progress()):
+def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_files, prompt_column, progress=gr.Progress()):
     token = os.environ.get(FALLBACK_TOKEN_NAME)
     if not runs_to_fetch or not task_name:
-        return None, None, None
+        return None, None
 
 
 
@@ -204,9 +214,8 @@ def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_file
         return None, None, gr.update(choices=[], value=None)
 
     task_type = get_task_type(dfs[0])
-    def prepare_df(df, run_name, task_type):
+    def prepare_df(df, run_name, task_type, prompt_column):
         def get_choice_predictions(df, task_type):
-            # For some evals it's string for other it's list
             predictions = df['predictions']
             if task_type == "generative":
                 return predictions
@@ -223,24 +232,25 @@ def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_file
         generative_columns = {
             f"generation_{run_name}": df.apply(partial(get_choice_predictions, task_type=task_type), axis=1)
         } if task_type == "generative" or task_type == "mixed" else {}
-
-
-
 
         prepared_df = pd.DataFrame({
-            'full_prompt': df['full_prompt'],
+            'prompt': df[prompt_column],
+            'choices': df['choices'].apply(tuple),  # Convert lists to tuples
+            'gold': df['gold'].apply(lambda x: tuple(x) if isinstance(x, list) else x),  # Convert lists to tuples
+            'gold_index': df['gold_index'],
             **generative_columns,
         })
+
         # For some reason some metrics are stored as strings
         metrics = df['metrics']
-        # Assume all metrics are the same
         available_metrics = set(metric for row_metrics in metrics for metric in row_metrics)
         for metric_key in available_metrics:
             prepared_df[f'metric_{metric_key}_{run_name}'] = [metric.get(metric_key, None) for metric in metrics]
 
         # Merge rows with the same full_prompt
-        prepared_df = prepared_df.groupby('full_prompt').agg(lambda x: next((item for item in x if item is not None), None)).reset_index()
-        return prepared_df.set_index('full_prompt')
+        prepared_df = prepared_df.groupby('prompt').agg(lambda x: next((item for item in x if item is not None), None)).reset_index()
+        prepared_df["prompt"] = prepared_df["prompt"].astype(str)
+        return prepared_df
 
     def get_gold_label(df, task_type):
         if task_type == "generative":
@@ -248,20 +258,27 @@ def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_file
         return df['gold_index']
 
     # Prepare the first DataFrame with choices and gold
-    combined_df = dfs[0][['full_prompt']].set_index('full_prompt')
+    # Join all prepared DataFrames
+    prepared_dfs = [
+        prepare_df(df, run_name, task_type, prompt_column)
+        for df, run_name in zip(dfs, run_names)
+    ]
+
+    combined_df = prepared_dfs[0]
+    for idx, prepared_df in enumerate(prepared_dfs[1:]):
+        combined_df = combined_df.merge(prepared_df, how='outer', on=("prompt", "gold"), suffixes=(None, f"_{idx}"))
+    to_keep = ["prompt", "gold"]
+
     if task_type in ["multiple_choice", "mixed"]:
-        combined_df["choices"] = dfs[0]["choices"].values
+        to_keep.append("choices")
+    elif task_type == "generative":
+        to_keep.extend([col for col in combined_df.columns if col.startswith("generation_")])
 
-    combined_df['gold'] = dfs[0].apply(lambda row: get_gold_label(row, task_type), axis=1).values
-
-    # Join all prepared DataFrames
-    for df, run_name in zip(dfs, run_names):
-        prepared_df = prepare_df(df, run_name, task_type)
-        combined_df = combined_df.join(prepared_df, how='outer')
-
+    combined_df['gold'] = combined_df.apply(lambda row: get_gold_label(row, task_type), axis=1).values
+    metric_cols = [col for col in combined_df.columns if col.startswith("metric_")]
+    combined_df = combined_df[to_keep + metric_cols]
 
-    available_metrics = list(set("_".join(col.split('_')[1:-1]) for col in combined_df.columns if col.startswith("metric_")))
-    combined_df = combined_df.reset_index()
+    available_metrics = list(set("_".join(col.split('_')[1:-1]) for col in metric_cols))
     chosen_metrics = available_metrics[:1]
 
     return combined_df, gr.update(choices=available_metrics, value=chosen_metrics)
@@ -274,24 +291,31 @@ with gr.Blocks() as demo:
             results_uri = gr.Textbox(label="Results URI", value="s3://fineweb-multilingual-v1/evals/test/", visible=True)
         with gr.Column():
             gr.Markdown("# FineWeb experiments results explorer")
+            split_checkpoints = gr.Checkbox(label="Split checkpoints from models", value=True)
     with gr.Row():
         with gr.Column():
             select_by_regex_text = gr.Textbox(label="Regex to select runs",
-                                              value="ind_minhash(-CC-MAIN-|_)\\d{4}-\\d{2}-seed.*")
+                                              value="ind_minhash(-CC-MAIN-|_)\\d{4}-\\d{2}-seed.*")
             select_by_regex_button = gr.Button("Select matching runs")
         with gr.Column():
             select_by_language = gr.Dropdown(choices=["ar", "fr", "ru", "hi", "th", "tr", "zh", "sw", "te"],
-                                             interactive=True, label="Select by language",
-                                             info="Choose a language to prefill the regex")
+                                             interactive=True, label="Select by language",
+                                             info="Choose a language to prefill the regex")
     selected_runs = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Selected runs")
-    checkpoint = gr.Dropdown(choices=[], interactive=True, label="Checkpoint")
+    checkpoint = gr.Dropdown(choices=[], interactive=True, label="Checkpoint", visible=True)
     fetch_res = gr.Button("Fetch results")
     task_name = gr.Dropdown(choices=[], interactive=True, label="Task name")
    metric_names = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Metric")
-    results_df = gr.Dataframe(interactive=False, wrap=True)
+    results_df = gr.Dataframe(
+        interactive=False,
+        wrap=True,
+        line_breaks=True,
+        datatype="markdown"
+    )
     with gr.Row():
         with gr.Column():
             num_samples = gr.Text(interactive=False, label="# Samples")
+            prompt_column = gr.Radio(choices=["full_prompt", "example"], label="Prompt display", value="example")
 
     # Run selection
     gr.on(
@@ -325,7 +349,7 @@ with gr.Blocks() as demo:
         outputs=[task_name, tasks_files]
     ).then(
         fn=load_task_data,
-        inputs=[results_uri, selected_runs, checkpoint, task_name, tasks_files],
+        inputs=[results_uri, selected_runs, checkpoint, task_name, tasks_files, prompt_column],
        outputs=[results_df_full, metric_names]
     ).then(
         fn=render_table,
@@ -337,7 +361,7 @@ with gr.Blocks() as demo:
     gr.on(
         triggers=[task_name.input],
         fn=load_task_data,
-        inputs=[results_uri, selected_runs, checkpoint, task_name, tasks_files],
+        inputs=[results_uri, selected_runs, checkpoint, task_name, tasks_files, prompt_column],
         outputs=[results_df_full, metric_names]
     ).then(
         fn=render_table,
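
Note: the prompt truncation added in shorten_column_names above relies on gr.Dataframe rendering markdown/HTML inside its cells. Below is a minimal standalone sketch of that same technique, assuming a recent Gradio 4.x release where Dataframe supports datatype="markdown" and line_breaks; the column names and sample data are illustrative only and are not taken from app.py.

import gradio as gr
import pandas as pd

def truncate_markdown(text: str, limit: int = 100) -> str:
    # Short values are shown as-is; long ones are collapsed behind a
    # click-to-expand <details> block so the table stays compact.
    if len(text) <= limit:
        return text
    return f"<details><summary>{text[:limit]}...</summary>\n\n{text}</details>"

# Illustrative data: one short prompt and one long one.
df = pd.DataFrame({
    "prompt": ["short prompt", "a very long prompt " * 30],
    "metric_acc": [1.0, 0.0],
})
df["prompt"] = df["prompt"].apply(truncate_markdown)

with gr.Blocks() as demo:
    # datatype="markdown" makes Gradio render the <details> markup in cells.
    gr.Dataframe(value=df, interactive=False, wrap=True,
                 line_breaks=True, datatype="markdown")

if __name__ == "__main__":
    demo.launch()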