hynky HF staff committed on
Commit
85f65ce
·
1 Parent(s): f14f2bb

add jsonl support and fix taskname identification

Browse files
Files changed (1) hide show
  1. app.py +12 -5
app.py CHANGED
@@ -108,14 +108,18 @@ def fetch_available_tasks(results_uri, selected_run_checkpoint: list[str]) -> di
108
  try:
109
  details_folder = f"details/{run_checkpoint}"
110
  files = data_folder.list_files(details_folder, recursive=True)
111
- parquet_files = [f.removeprefix(details_folder + "/") for f in files if f.endswith('.parquet')]
112
 
113
- for full_filename in parquet_files:
114
- task_name, date_str = full_filename.replace('.parquet', '').rsplit('_', 1)
 
 
 
115
  date = datetime.strptime(date_str, '%Y-%m-%dT%H-%M-%S.%f')
116
 
117
  if run_checkpoint not in all_tasks[task_name] or date > all_tasks[task_name][run_checkpoint]['date']:
118
  all_tasks[task_name][run_checkpoint] = {'filename': full_filename, 'date': date}
 
119
  except FileNotFoundError:
120
  print(f"Checkpoint not found for run: {run_checkpoint}")
121
 
@@ -243,7 +247,10 @@ def load_task_data(results_uri, selected_run_checkpoint: list[str], task_name, t
243
  file_path = f"details/{run_checkpoint}/{tasks_files[task_name][run_checkpoint]}"
244
  try:
245
  with data_folder.open(file_path, "rb") as f:
246
- df = pd.read_parquet(f)
 
 
 
247
  return df, run_checkpoint
248
  except FileNotFoundError:
249
  print(f"File not found: {tasks_files[task_name][run_checkpoint]}")
@@ -347,7 +354,7 @@ with gr.Blocks() as demo:
347
  tasks_files = gr.State({})
348
  selected_run_checkpoint = gr.State([])
349
  login_button = gr.LoginButton(visible=False)
350
- results_uri = gr.Textbox(label="Results URI", value="s3://fineweb-multilingual-v1/evals/test/", visible=True)
351
  with gr.Column():
352
  gr.Markdown("# FineWeb experiments results explorer")
353
  split_checkpoints = gr.Checkbox(label="Split checkpoints from models", value=True)
 
108
  try:
109
  details_folder = f"details/{run_checkpoint}"
110
  files = data_folder.list_files(details_folder, recursive=True)
111
+ result_files = [f.removeprefix(details_folder + "/") for f in files if f.endswith('.parquet') or f.endswith('.json')]
112
 
113
+ for full_filename in result_files:
114
+ file_ext = '.parquet' if full_filename.endswith('.parquet') else '.json'
115
+ # new lighteval uses date/task_name_date, old lighteval uses task_name_date
116
+ filename = full_filename.replace(file_ext, '').split("/")[-1]
117
+ task_name, date_str = filename.rsplit('_', 1)
118
  date = datetime.strptime(date_str, '%Y-%m-%dT%H-%M-%S.%f')
119
 
120
  if run_checkpoint not in all_tasks[task_name] or date > all_tasks[task_name][run_checkpoint]['date']:
121
  all_tasks[task_name][run_checkpoint] = {'filename': full_filename, 'date': date}
122
+
123
  except FileNotFoundError:
124
  print(f"Checkpoint not found for run: {run_checkpoint}")
125
 
 
247
  file_path = f"details/{run_checkpoint}/{tasks_files[task_name][run_checkpoint]}"
248
  try:
249
  with data_folder.open(file_path, "rb") as f:
250
+ if file_path.endswith('.parquet'):
251
+ df = pd.read_parquet(f)
252
+ else:
253
+ df = pd.read_json(f, lines=True)
254
  return df, run_checkpoint
255
  except FileNotFoundError:
256
  print(f"File not found: {tasks_files[task_name][run_checkpoint]}")
 
354
  tasks_files = gr.State({})
355
  selected_run_checkpoint = gr.State([])
356
  login_button = gr.LoginButton(visible=False)
357
+ results_uri = gr.Textbox(label="Results URI", value="s3://fineweb-multilingual-v1/evals/test/", visible=True, placeholder="s3://bucket/path/to/results")
358
  with gr.Column():
359
  gr.Markdown("# FineWeb experiments results explorer")
360
  split_checkpoints = gr.Checkbox(label="Split checkpoints from models", value=True)