Spaces:
Running
Running
add jsonl support and fix taskname identification
Browse files
app.py
CHANGED
@@ -108,14 +108,18 @@ def fetch_available_tasks(results_uri, selected_run_checkpoint: list[str]) -> di
|
|
108 |
try:
|
109 |
details_folder = f"details/{run_checkpoint}"
|
110 |
files = data_folder.list_files(details_folder, recursive=True)
|
111 |
-
|
112 |
|
113 |
-
for full_filename in
|
114 |
-
|
|
|
|
|
|
|
115 |
date = datetime.strptime(date_str, '%Y-%m-%dT%H-%M-%S.%f')
|
116 |
|
117 |
if run_checkpoint not in all_tasks[task_name] or date > all_tasks[task_name][run_checkpoint]['date']:
|
118 |
all_tasks[task_name][run_checkpoint] = {'filename': full_filename, 'date': date}
|
|
|
119 |
except FileNotFoundError:
|
120 |
print(f"Checkpoint not found for run: {run_checkpoint}")
|
121 |
|
@@ -243,7 +247,10 @@ def load_task_data(results_uri, selected_run_checkpoint: list[str], task_name, t
|
|
243 |
file_path = f"details/{run_checkpoint}/{tasks_files[task_name][run_checkpoint]}"
|
244 |
try:
|
245 |
with data_folder.open(file_path, "rb") as f:
|
246 |
-
|
|
|
|
|
|
|
247 |
return df, run_checkpoint
|
248 |
except FileNotFoundError:
|
249 |
print(f"File not found: {tasks_files[task_name][run_checkpoint]}")
|
@@ -347,7 +354,7 @@ with gr.Blocks() as demo:
|
|
347 |
tasks_files = gr.State({})
|
348 |
selected_run_checkpoint = gr.State([])
|
349 |
login_button = gr.LoginButton(visible=False)
|
350 |
-
results_uri = gr.Textbox(label="Results URI", value="s3://fineweb-multilingual-v1/evals/test/", visible=True)
|
351 |
with gr.Column():
|
352 |
gr.Markdown("# FineWeb experiments results explorer")
|
353 |
split_checkpoints = gr.Checkbox(label="Split checkpoints from models", value=True)
|
|
|
108 |
try:
|
109 |
details_folder = f"details/{run_checkpoint}"
|
110 |
files = data_folder.list_files(details_folder, recursive=True)
|
111 |
+
result_files = [f.removeprefix(details_folder + "/") for f in files if f.endswith('.parquet') or f.endswith('.json')]
|
112 |
|
113 |
+
for full_filename in result_files:
|
114 |
+
file_ext = '.parquet' if full_filename.endswith('.parquet') else '.json'
|
115 |
+
# new lighteval uses date/task_name_date, old lighteval uses task_name_date
|
116 |
+
filename = full_filename.replace(file_ext, '').split("/")[-1]
|
117 |
+
task_name, date_str = filename.rsplit('_', 1)
|
118 |
date = datetime.strptime(date_str, '%Y-%m-%dT%H-%M-%S.%f')
|
119 |
|
120 |
if run_checkpoint not in all_tasks[task_name] or date > all_tasks[task_name][run_checkpoint]['date']:
|
121 |
all_tasks[task_name][run_checkpoint] = {'filename': full_filename, 'date': date}
|
122 |
+
|
123 |
except FileNotFoundError:
|
124 |
print(f"Checkpoint not found for run: {run_checkpoint}")
|
125 |
|
|
|
247 |
file_path = f"details/{run_checkpoint}/{tasks_files[task_name][run_checkpoint]}"
|
248 |
try:
|
249 |
with data_folder.open(file_path, "rb") as f:
|
250 |
+
if file_path.endswith('.parquet'):
|
251 |
+
df = pd.read_parquet(f)
|
252 |
+
else:
|
253 |
+
df = pd.read_json(f, lines=True)
|
254 |
return df, run_checkpoint
|
255 |
except FileNotFoundError:
|
256 |
print(f"File not found: {tasks_files[task_name][run_checkpoint]}")
|
|
|
354 |
tasks_files = gr.State({})
|
355 |
selected_run_checkpoint = gr.State([])
|
356 |
login_button = gr.LoginButton(visible=False)
|
357 |
+
results_uri = gr.Textbox(label="Results URI", value="s3://fineweb-multilingual-v1/evals/test/", visible=True, placeholder="s3://bucket/path/to/results")
|
358 |
with gr.Column():
|
359 |
gr.Markdown("# FineWeb experiments results explorer")
|
360 |
split_checkpoints = gr.Checkbox(label="Split checkpoints from models", value=True)
|