hynky (HF staff) committed
Commit f14f2bb · Parent: 90e7c81

add support for split checkpoints
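
In short: with the split_checkpoints toggle off, the runs dropdown lists full "run/checkpoint" entries directly; with it on, runs and checkpoints are picked in separate dropdowns and recombined into "run/checkpoint" keys, which the rest of the app uses to build details paths (details/{run_checkpoint}/...). Below is a minimal, illustrative sketch of that recombination, mirroring the update_selected_run_checkpoint helper added in the diff; the combine_selection name and the sample values are invented for illustration and are not part of the app.

# Illustrative sketch only: mirrors update_selected_run_checkpoint from the diff below.
def combine_selection(selected_runs, selected_checkpoints, split_checkpoints):
    if not selected_runs:
        return []
    if not split_checkpoints:
        # The runs dropdown already holds "run/checkpoint" entries.
        return selected_runs
    # Cross-product of selected runs and selected checkpoints.
    return [f"{run}/{ckpt}" for run in selected_runs for ckpt in (selected_checkpoints or [])]

# Example with split mode on (hypothetical run/checkpoint names):
# combine_selection(["run-fr-1", "run-fr-2"], ["5000"], True)
# -> ["run-fr-1/5000", "run-fr-2/5000"]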

Files changed (1)
  1. app.py  +136 -61
app.py CHANGED
@@ -19,8 +19,11 @@ def is_arary_like(x):
 
 def get_task_type(df):
     # Compatibility with old lighteval
+    # [[Pour calculer le bénéfice net de C]] in new lighteval, "Pour calculer le bénéfice net de C" in old lighteval
     if all(isinstance(pred, str) or (is_arary_like(pred) and all(isinstance(item, str) for item in pred)) for pred in df['predictions'].iloc[0]):
         return "generative"
+
+    # [["1", "2"], ["3", "4"]] in new lighteval, ["1", "2"] in old lighteval
     if all(is_arary_like(pred) and all(isinstance(item, float) for item in pred) for pred in df['predictions'].iloc[0]):
         return "multiple_choice"
     return "mixed"
@@ -44,7 +47,8 @@ def get_run_name_seed(run_name):
     run_name, seed = run_name.split("-seed-")
     return run_name, int(seed)
 
-def fetch_repo_structure(results_uri, oauth_token: gr.OAuthToken | None = None):
+
+def fetch_repo_structure(results_uri, split_checkpoints=False, oauth_token: gr.OAuthToken | None = None):
     token = os.environ.get(FALLBACK_TOKEN_NAME)
     if oauth_token:
         token = oauth_token.token
@@ -63,12 +67,16 @@ def fetch_repo_structure(results_uri, oauth_token: gr.OAuthToken | None = None):
         results = list(executor.map(process_run, runs))
 
     checkpoints_dict = dict(results)
+    runs = list(checkpoints_dict.keys())
 
-    return checkpoints_dict, gr.update(choices=list(checkpoints_dict), value=None)
+    if not split_checkpoints:
+        runs = [f"{run}/{checkpoint}" for run, checkpoints in checkpoints_dict.items() for checkpoint in checkpoints]
 
-def update_checkpoints(selected_runs, checkpoints):
-    if not selected_runs:
-        return gr.update(choices=[], value=None)
+    return checkpoints_dict, gr.update(choices=runs, value=[])
+
+def update_checkpoints(selected_runs, checkpoints, split_checkpoints):
+    if not selected_runs or not split_checkpoints:
+        return gr.update(choices=[], value=[])
 
     common_checkpoints = set(checkpoints[selected_runs[0]])
     for run in selected_runs[1:]:
@@ -76,7 +84,8 @@ def update_checkpoints(selected_runs, checkpoints):
 
     common_checkpoints = sorted(list(common_checkpoints))
 
-    return gr.update(choices=common_checkpoints, value=common_checkpoints[0] if common_checkpoints else None)
+    return gr.update(choices=common_checkpoints, value=[common_checkpoints[0]] if common_checkpoints else [])
+
 
 
 def select_runs_by_regex(runs, current_selected, regex_to_select):
@@ -89,15 +98,15 @@ def select_runs_by_language(runs, current_selected, language):
         return select_runs_by_regex(runs, current_selected, f".*-{language}-.*")
     return current_selected
 
-def fetch_available_tasks(results_uri, runs_to_fetch, checkpoint) -> dict[str, dict[str, str]]:
+def fetch_available_tasks(results_uri, selected_run_checkpoint: list[str]) -> dict[str, dict[str, str]]:
    token = os.environ.get(FALLBACK_TOKEN_NAME)
 
    data_folder = DataFolder(results_uri, token=token)
    all_tasks = defaultdict(lambda: defaultdict(dict))
 
-    for run in runs_to_fetch:
+    for run_checkpoint in selected_run_checkpoint:
        try:
-            details_folder = f"details/{run}/{checkpoint}"
+            details_folder = f"details/{run_checkpoint}"
            files = data_folder.list_files(details_folder, recursive=True)
            parquet_files = [f.removeprefix(details_folder + "/") for f in files if f.endswith('.parquet')]
 
@@ -105,52 +114,73 @@ def fetch_available_tasks(results_uri, runs_to_fetch, checkpoint) -> dict[str, d
                task_name, date_str = full_filename.replace('.parquet', '').rsplit('_', 1)
                date = datetime.strptime(date_str, '%Y-%m-%dT%H-%M-%S.%f')
 
-                if run not in all_tasks[task_name] or date > all_tasks[task_name][run]['date']:
-                    all_tasks[task_name][run] = {'filename': full_filename, 'date': date}
+                if run_checkpoint not in all_tasks[task_name] or date > all_tasks[task_name][run_checkpoint]['date']:
+                    all_tasks[task_name][run_checkpoint] = {'filename': full_filename, 'date': date}
        except FileNotFoundError:
-            print(f"Checkpoint not found for run: {run}")
-
+            print(f"Checkpoint not found for run: {run_checkpoint}")
 
+    # Get tasks that have data for all selected runs
    available_tasks = {
-        task: {run: info['filename'] for run, info in runs.items()}
-        for task, runs in all_tasks.items()
-        if set(runs.keys()) == set(runs_to_fetch)
+        task: {run_checkpoint: info['filename'] for run_checkpoint, info in runs_info.items()}
+        for task, runs_info in all_tasks.items()
+        if set(runs_info.keys()) == set(selected_run_checkpoint)
    }
 
    return available_tasks
 
-def fetch_run_results(results_uri, runs_to_fetch, checkpoint,
+def fetch_run_results(results_uri, selected_run_checkpoint: list[str],
                      oauth_token: gr.OAuthToken | None = None, progress=gr.Progress()):
-    task_runs_dict = fetch_available_tasks(results_uri, runs_to_fetch, checkpoint)
+    task_runs_dict = fetch_available_tasks(results_uri, selected_run_checkpoint)
    task_names = list(task_runs_dict.keys())
    return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict
 
 
-def render_table(df, selected_runs, metric_names):
-    if df is None or not selected_runs or not metric_names:
+def render_table(df, selected_run_checkpoint: list[str], metric_names):
+    if df is None or not selected_run_checkpoint or not metric_names:
        return None, "0"
 
-    kept_metrics = [f"metric_{metric_name}_{run_name}" for run_name in selected_runs for metric_name in metric_names]
+    kept_metrics = [f"metric_{metric_name}_{run_checkpoint}"
+                    for run_checkpoint in selected_run_checkpoint
+                    for metric_name in metric_names]
    other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
    df = df.drop(columns=other_metrics)
-    df = shorten_column_names(df, selected_runs, metric_names)
+    df = shorten_column_names(df, selected_run_checkpoint, metric_names)
 
    # Sample 100
    n_samples = len(df)
    df = df.sample(n=min(100, len(df)), random_state=42)
-    return df, str(n_samples)
+
+    # Get column widths for better display
+    column_widths = get_column_widths(df)
+    return gr.Dataframe(
+        value=df,
+        column_widths=column_widths
+    ), str(n_samples)
+
+def update_selected_run_checkpoint(selected_runs: list[str] | None, selected_checkpoint: list[str] | None, split_checkpoints: bool):
+    if not selected_runs:
+        return []
+
+    # In this case we simply return the selected runs which already contain checkpoints
+    if not split_checkpoints:
+        return selected_runs
+
+    # Otherwise combine runs with checkpoints
+    return [f"{run}/{checkpoint}" for run in selected_runs for checkpoint in (selected_checkpoint if selected_checkpoint else [])]
+
 
 def get_column_widths(df):
    column_widths = []
    for col in df.columns:
        if col == "prompt":
-            column_widths.append("300px")
+            column_widths.append("300px")  # Fixed width with overflow
+        elif col.startswith("generation_"):
+            column_widths.append("200px")
        elif col in ["choices", "gold"]:
-            column_widths.append("250px")
-        elif col.startswith("metric_"):
-            column_widths.append("50px")
+            column_widths.append("100px")
        else:
-            column_widths.append("200px")  # Default width for other columns
+            # Metrics
+            column_widths.append("50px")  # Default width for other columns
    return column_widths
 
 
@@ -158,7 +188,7 @@ def shorten_column_names(df, run_names: list[str], metric_names: list[str]):
    """
    Turns metric columns (metric_{metric}_{run_name}) into {metric}_i
    Turns generation_{run_name} into generation_i
-    Also truncates full_prompt column to 200 chars with expandable view
+    Also truncates full_prompt and generation columns to 100 chars with expandable view
    """
    # Handle metric columns
    columns_to_rename = {}
@@ -175,37 +205,54 @@ def shorten_column_names(df, run_names: list[str], metric_names: list[str]):
    # Rename columns in a single operation
    df = df.rename(columns=columns_to_rename)
 
-    # Add markdown formatting to full_prompt column for truncation with expansion
+    # Add markdown formatting to prompt and generation columns for truncation with expansion
+    def truncate_with_details(text: str | list[str]):
+        if is_arary_like(text) and all(isinstance(item, str) for item in text):
+            return [truncate_with_details(item) for item in text]
+        elif isinstance(text, str):
+            text = text.replace('\n', ' ').strip()  # Replace newlines with spaces
+            if len(text) <= 100:
+                return text
+            return f"""<details><summary>{text[:100]}...</summary>\n\n{text[100:]}</details>"""
+
+        return text
+
    if 'prompt' in df.columns:
-        df['prompt'] = df['prompt'].apply(
-            lambda x: f"<details><summary>{x[:100]}...</summary>\n\n{x}</details>" if len(x) > 100 else x
-        )
+        df['prompt'] = df['prompt'].apply(truncate_with_details)
+
+    # Apply the same truncation to all generation columns
+    generation_columns = [col for col in df.columns if col.startswith('generation_')]
+
+    for col in generation_columns:
+        df[col] = df[col].apply(truncate_with_details)
 
    return df
 
 
-def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_files, prompt_column, progress=gr.Progress()):
+def unwrap_selected_run_checkpoint(selected_run_checkpoint: list[str]) -> list[str]:
+    return selected_run_checkpoint  # Now just returns the list directly
+
+def load_task_data(results_uri, selected_run_checkpoint: list[str], task_name, tasks_files, prompt_column, progress=gr.Progress()):
    token = os.environ.get(FALLBACK_TOKEN_NAME)
-    if not runs_to_fetch or not task_name:
+    if not selected_run_checkpoint or not task_name:
        return None, None
 
-
-
    data_folder = DataFolder(f"filecache::{results_uri}", token=token, cache_storage="./results-cache")
 
-    def fetch_run_file(run_to_fetch):
-        file_path = f"details/{run_to_fetch}/{checkpoint}/{tasks_files[task_name][run_to_fetch]}"
+    def fetch_run_file(run_checkpoint):
+        file_path = f"details/{run_checkpoint}/{tasks_files[task_name][run_checkpoint]}"
        try:
            with data_folder.open(file_path, "rb") as f:
                df = pd.read_parquet(f)
-            return df, run_to_fetch
+            return df, run_checkpoint
        except FileNotFoundError:
-            print(f"File not found: {tasks_files[task_name][run_to_fetch]}")
-            return None, run_to_fetch
+            print(f"File not found: {tasks_files[task_name][run_checkpoint]}")
+            return None, run_checkpoint
 
    with ThreadPoolExecutor() as pool:
-        results = list(progress.tqdm(pool.map(fetch_run_file, runs_to_fetch), total=len(runs_to_fetch),
-                                     desc="Fetching run data..."))
+        results = list(progress.tqdm(pool.map(fetch_run_file, selected_run_checkpoint),
+                                     total=len(selected_run_checkpoint),
+                                     desc="Fetching run data..."))
 
    dfs = [fix_df(df) for df, _ in results if df is not None]
    run_names = [run for _, run in results if run is not None]
@@ -215,9 +262,20 @@ def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_file
 
    task_type = get_task_type(dfs[0])
    def prepare_df(df, run_name, task_type, prompt_column):
+        # Mixed in lighteval-old will look like this: ['광', -13.964999198913574, -13.539217948913574, -13.964999198913574, -13.539217948913574, -12.90467357635498, -13.07825756072998]
+        # Generative in lighteval-old will look like this "prediction"
+        # Multiple choice in lighteval-old will look like this ["choice1", "choice2"]
+        # [np.float64(-132.9295196533203), np.float64(-207.1309356689453), np.float64(-186.64553833007812), np.float64(-230.01414489746094), np.float64(-132.9295196533203), np.float64(-207.1309356689453), np.float64(-186.64553833007812), np.float64(-230.01414489746094), np.float64(-128.63824462890625), np.float64(-203.9550018310547), np.float64(-185.35267639160156), np.float64(-228.23837280273438)]
+
+        # For the new lighteval we have:
+        # Generative: [[Pour calculer le bénéfice net de C]]
+
        def get_choice_predictions(df, task_type):
            predictions = df['predictions']
            if task_type == "generative":
+                # This is strange representation in new lighteval...
+                if is_arary_like(predictions) and all(is_arary_like(item) for item in predictions):
+                    return predictions[0]
                return predictions
 
            if task_type == "multiple_choice":
@@ -284,9 +342,10 @@ def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_file
    return combined_df, gr.update(choices=available_metrics, value=chosen_metrics)
 
 with gr.Blocks() as demo:
-    runs_checkpoints = gr.State({})
+    available_runs_checkpoints = gr.State({})
    results_df_full = gr.State(None)
    tasks_files = gr.State({})
+    selected_run_checkpoint = gr.State([])
    login_button = gr.LoginButton(visible=False)
    results_uri = gr.Textbox(label="Results URI", value="s3://fineweb-multilingual-v1/evals/test/", visible=True)
    with gr.Column():
@@ -301,8 +360,10 @@ with gr.Blocks() as demo:
        select_by_language = gr.Dropdown(choices=["ar", "fr", "ru", "hi", "th", "tr", "zh", "sw", "te"],
                                         interactive=True, label="Select by language",
                                         info="Choose a language to prefill the regex")
-        selected_runs = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Selected runs")
-        checkpoint = gr.Dropdown(choices=[], interactive=True, label="Checkpoint", visible=True)
+        with gr.Row() as run_selection_row:
+            selected_runs = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Selected runs")
+            checkpoint = gr.Dropdown(choices=[], interactive=True, label="Checkpoint", multiselect=True)
+
        fetch_res = gr.Button("Fetch results")
        task_name = gr.Dropdown(choices=[], interactive=True, label="Task name")
        metric_names = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Metric")
@@ -310,7 +371,8 @@ with gr.Blocks() as demo:
                                 interactive=False,
                                 wrap=True,
                                 line_breaks=True,
-                                 datatype="markdown"
+                                 datatype="markdown",
+                                 column_widths=get_column_widths(pd.DataFrame())  # Initialize with empty dataframe
                                 )
    with gr.Row():
        with gr.Column():
@@ -319,63 +381,76 @@ with gr.Blocks() as demo:
 
    # Run selection
    gr.on(
-        triggers=[results_uri.change],
-        fn=fetch_repo_structure, inputs=[results_uri], outputs=[runs_checkpoints, selected_runs],
+        triggers=[split_checkpoints.change],
+        fn=lambda split_checkpoints: gr.update(visible=split_checkpoints),
+        inputs=[split_checkpoints],
+        outputs=[checkpoint]
+    )
+    gr.on(
+        triggers=[results_uri.change, split_checkpoints.change],
+        fn=fetch_repo_structure, inputs=[results_uri, split_checkpoints], outputs=[available_runs_checkpoints, selected_runs],
    )
    gr.on(
        triggers=[select_by_regex_button.click],
        fn=select_runs_by_regex,
-        inputs=[runs_checkpoints, selected_runs, select_by_regex_text], outputs=[selected_runs]
+        inputs=[available_runs_checkpoints, selected_runs, select_by_regex_text], outputs=[selected_runs]
    )
    gr.on(
        triggers=[select_by_language.change],
        fn=select_runs_by_language,
-        inputs=[runs_checkpoints, selected_runs, select_by_language], outputs=[selected_runs]
+        inputs=[available_runs_checkpoints, selected_runs, select_by_language], outputs=[selected_runs]
    )
 
    # Update checkpoints based on selected runs
    gr.on(
        triggers=[selected_runs.change],
        fn=update_checkpoints,
-        inputs=[selected_runs, runs_checkpoints],
+        inputs=[selected_runs, available_runs_checkpoints, split_checkpoints],
        outputs=[checkpoint]
    )
+
+    gr.on(
+        triggers=[checkpoint.change, selected_runs.change],
+        fn=update_selected_run_checkpoint,
+        inputs=[selected_runs, checkpoint, split_checkpoints],
+        outputs=[selected_run_checkpoint]
+    )
 
    # Fetch available tasks
    gr.on(
        triggers=[fetch_res.click],
        fn=fetch_run_results,
-        inputs=[results_uri, selected_runs, checkpoint],
+        inputs=[results_uri, selected_run_checkpoint],
        outputs=[task_name, tasks_files]
    ).then(
        fn=load_task_data,
-        inputs=[results_uri, selected_runs, checkpoint, task_name, tasks_files, prompt_column],
+        inputs=[results_uri, selected_run_checkpoint, task_name, tasks_files, prompt_column],
        outputs=[results_df_full, metric_names]
    ).then(
        fn=render_table,
-        inputs=[results_df_full, selected_runs, metric_names],
+        inputs=[results_df_full, selected_run_checkpoint, metric_names],
        outputs=[results_df, num_samples]
    )
 
    # Update results when task name or metric changes
    gr.on(
-        triggers=[task_name.input],
+        triggers=[task_name.input, prompt_column.input],
        fn=load_task_data,
-        inputs=[results_uri, selected_runs, checkpoint, task_name, tasks_files, prompt_column],
+        inputs=[results_uri, selected_run_checkpoint, task_name, tasks_files, prompt_column],
        outputs=[results_df_full, metric_names]
    ).then(
        fn=render_table,
-        inputs=[results_df_full, selected_runs, metric_names],
+        inputs=[results_df_full, selected_run_checkpoint, metric_names],
        outputs=[results_df, num_samples]
    )
 
    gr.on(
        triggers=[metric_names.input],
        fn=render_table,
-        inputs=[results_df_full, selected_runs, metric_names],
+        inputs=[results_df_full, selected_run_checkpoint, metric_names],
        outputs=[results_df, num_samples]
    )
 
-    demo.load(fn=fetch_repo_structure, inputs=[results_uri], outputs=[runs_checkpoints, selected_runs])
+    demo.load(fn=fetch_repo_structure, inputs=[results_uri, split_checkpoints], outputs=[available_runs_checkpoints, selected_runs])
 
 demo.launch()