Terry Zhuo committed
Commit · 2e84cf2
Parent(s): b65f8f6
update

Files changed:
- app.py +76 -109
- src/display/utils.py +2 -2
- src/tools/plots.py +12 -12
app.py
CHANGED
@@ -38,8 +38,8 @@ from src.envs import (
     DATA_VERSION,
     DATA_REPO,
     HARD_RESULT_REPO,
-    ELO_REPO,
-    HARD_ELO_REPO,
+    # ELO_REPO, # Comment out
+    # HARD_ELO_REPO, # Comment out
     SOLVE_REPO,
     HARD_SOLVE_REPO,
     HF_TOKEN,
@@ -51,7 +51,7 @@ from src.envs import (
 )
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.execute import generate_command, default_command, stream_logs, find_result_file
-from src.tools.plots import
+from src.tools.plots import plot_solve_rate
 # from src.voting.vote_system import VoteManager, run_scheduler

 # Configure logging
@@ -66,10 +66,10 @@ DO_FULL_INIT = True # os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
 NEW_DATA_ON_LEADERBOARD = True
 LEADERBOARD_DF = None
 HARD_LEADERBOARD_DF = None
-ELO_TASK_DF = None
-ELO_BENCH_DF = None
-HARD_ELO_TASK_DF = None
-HARD_ELO_BENCH_DF = None
+# ELO_TASK_DF = None # Comment out
+# ELO_BENCH_DF = None # Comment out
+# HARD_ELO_TASK_DF = None # Comment out
+# HARD_ELO_BENCH_DF = None # Comment out
 COMPLETE_SOLVE_DF = None
 INSTRUCT_SOLVE_DF = None
 HARD_COMPLETE_SOLVE_DF = None
@@ -154,10 +154,10 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
 def get_latest_data_leaderboard(
     leaderboard_initial_df = None,
     hard_leaderboard_initial_df = None,
-    elo_task_df = None,
-    elo_bench_df = None,
-    hard_elo_task_df = None,
-    hard_elo_bench_df = None,
+    # elo_task_df = None, # Comment out
+    # elo_bench_df = None, # Comment out
+    # hard_elo_task_df = None, # Comment out
+    # hard_elo_bench_df = None, # Comment out
     complete_solve_df = None,
     instruct_solve_df = None,
     hard_complete_solve_df = None,
@@ -166,10 +166,10 @@ def get_latest_data_leaderboard(
     global NEW_DATA_ON_LEADERBOARD
     global LEADERBOARD_DF
     global HARD_LEADERBOARD_DF
-    global ELO_TASK_DF
-    global ELO_BENCH_DF
-    global HARD_ELO_TASK_DF
-    global HARD_ELO_BENCH_DF
+    # global ELO_TASK_DF # Comment out
+    # global ELO_BENCH_DF # Comment out
+    # global HARD_ELO_TASK_DF # Comment out
+    # global HARD_ELO_BENCH_DF # Comment out
     global COMPLETE_SOLVE_DF
     global INSTRUCT_SOLVE_DF
     global HARD_COMPLETE_SOLVE_DF
@@ -182,7 +182,7 @@ def get_latest_data_leaderboard(
             "default",
             split="train",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).filter(lambda x: x['complete'] is not None or x['instruct'] is not None)
         LEADERBOARD_DF = get_leaderboard_df(
@@ -194,7 +194,7 @@ def get_latest_data_leaderboard(
             "default",
             split="train",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).filter(lambda x: x['complete'] is not None or x['instruct'] is not None)
         hard_leaderboard_df = get_leaderboard_df(
@@ -202,51 +202,23 @@ def get_latest_data_leaderboard(
             cols=COLS,
         )
         HARD_LEADERBOARD_DF = hard_leaderboard_df
-
-        elo_task_df = datasets.load_dataset(
-            ELO_REPO,
-            "default",
-            split="task_no_tie",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        elo_bench_df = datasets.load_dataset(
-            ELO_REPO,
-            "default",
-            split="benchmark_tie",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        ELO_TASK_DF = elo_task_df
-        ELO_BENCH_DF = elo_bench_df
-
-        hard_elo_task_df = datasets.load_dataset(
-            HARD_ELO_REPO,
-            "default",
-            split="task_no_tie",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        hard_elo_bench_df = datasets.load_dataset(
-            HARD_ELO_REPO,
-            "default",
-            split="benchmark_tie",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        HARD_ELO_TASK_DF = hard_elo_task_df
-        HARD_ELO_BENCH_DF = hard_elo_bench_df
+
+        # Comment out Elo dataset loading
+        # elo_task_df = datasets.load_dataset(...)
+        # elo_bench_df = datasets.load_dataset(...)
+        # ELO_TASK_DF = elo_task_df
+        # ELO_BENCH_DF = elo_bench_df
+        # hard_elo_task_df = datasets.load_dataset(...)
+        # hard_elo_bench_df = datasets.load_dataset(...)
+        # HARD_ELO_TASK_DF = hard_elo_task_df
+        # HARD_ELO_BENCH_DF = hard_elo_bench_df

         complete_solve_df = datasets.load_dataset(
             SOLVE_REPO,
             "default",
             split="complete",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).to_pandas()
         instruct_solve_df = datasets.load_dataset(
@@ -254,7 +226,7 @@ def get_latest_data_leaderboard(
             "default",
             split="instruct",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).to_pandas()
         COMPLETE_SOLVE_DF = complete_solve_df
@@ -265,7 +237,7 @@ def get_latest_data_leaderboard(
             "default",
             split="complete",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).to_pandas()
         hard_instruct_solve_df = datasets.load_dataset(
@@ -273,7 +245,7 @@ def get_latest_data_leaderboard(
             "default",
             split="instruct",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).to_pandas()
         HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
@@ -283,18 +255,17 @@ def get_latest_data_leaderboard(

     else:
         LEADERBOARD_DF = leaderboard_initial_df
-
-        ELO_TASK_DF = elo_task_df
-        # ELO_BENCH_DF = elo_bench_df
-        # HARD_ELO_TASK_DF = hard_elo_task_df
-        HARD_ELO_BENCH_DF = hard_elo_bench_df
+        HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
+        # ELO_TASK_DF = elo_task_df # Comment out
+        # ELO_BENCH_DF = elo_bench_df # Comment out
+        # HARD_ELO_TASK_DF = hard_elo_task_df # Comment out
+        # HARD_ELO_BENCH_DF = hard_elo_bench_df # Comment out
         COMPLETE_SOLVE_DF = complete_solve_df
-
-
+        INSTRUCT_SOLVE_DF = instruct_solve_df
+        HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
         HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df

-    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF,
-    # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)


 def init_space():
@@ -303,19 +274,19 @@ def init_space():
     # Always redownload the leaderboard DataFrame
     global LEADERBOARD_DF
     global HARD_LEADERBOARD_DF
-    global ELO_TASK_DF
-    global ELO_BENCH_DF
-    global HARD_ELO_TASK_DF
-    global HARD_ELO_BENCH_DF
+    # global ELO_TASK_DF # Comment out
+    # global ELO_BENCH_DF # Comment out
+    # global HARD_ELO_TASK_DF # Comment out
+    # global HARD_ELO_BENCH_DF # Comment out
     global COMPLETE_SOLVE_DF
     global INSTRUCT_SOLVE_DF
     global HARD_COMPLETE_SOLVE_DF
     global HARD_INSTRUCT_SOLVE_DF

-    LEADERBOARD_DF, HARD_LEADERBOARD_DF,
+    LEADERBOARD_DF, HARD_LEADERBOARD_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
     # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()

-    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF,
+    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
     # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)

     # Initialize VoteManager
@@ -331,10 +302,7 @@ def init_space():

 # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
 # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
-LEADERBOARD_DF, HARD_LEADERBOARD_DF,
-ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
-COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
-HARD_INSTRUCT_SOLVE_DF = init_space()
+LEADERBOARD_DF, HARD_LEADERBOARD_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = init_space()
 # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = init_space()

 # Data processing for plots now only on demand in the respective Gradio tab
@@ -399,7 +367,6 @@ with main_block as demo:
                     - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
                     - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
                     - `Average` is the average of `Complete` and `Instruct` when both are available.
-                    - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
                     - `#Act Params (B)` is the number of activated model parameters during inference.
                     - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
                     - For more details check the 📝 About section.
@@ -407,20 +374,21 @@ with main_block as demo:
                     elem_classes="markdown-text",
                 )

-                with gr.TabItem("📊 Elo Rating", id="hard_elo"):
-                    with gr.Column():
-                        with gr.Group():
-                            gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
-                            hard_task_elo_map = gr.Plot()
-                            hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
-                            demo.load(plot_elo_mle, [hard_elo_task_gr],
-                                      hard_task_elo_map)
-                        with gr.Group():
-                            gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
-                            hard_bench_elo_map = gr.Plot()
-                            hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
-                            demo.load(plot_elo_mle, [hard_elo_bench_gr],
-                                      hard_bench_elo_map)
+                # Comment out or remove the Elo Rating tab
+                # with gr.TabItem("📊 Elo Rating", id="hard_elo"):
+                #     with gr.Column():
+                #         with gr.Group():
+                #             gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+                #             hard_task_elo_map = gr.Plot()
+                #             hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
+                #             demo.load(plot_elo_mle, [hard_elo_task_gr],
+                #                       hard_task_elo_map)
+                #         with gr.Group():
+                #             gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+                #             hard_bench_elo_map = gr.Plot()
+                #             hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
+                #             demo.load(plot_elo_mle, [hard_elo_bench_gr],
+                #                       hard_bench_elo_map)

                 with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
                     with gr.Column():
@@ -448,27 +416,26 @@ with main_block as demo:
                     - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
                     - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
                     - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
-                    - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
-                    - `size` is the amount of activated model weight during inference.
                     - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
                     - For more details check the 📝 About section.
                     """,
                     elem_classes="markdown-text",
                 )

-                with gr.TabItem("📊 Elo Rating", id="full_elo"):
-                    with gr.Column():
-                        with gr.Group():
-
-                            gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
-                            task_elo_map = gr.Plot()
-                            elo_task_gr = init_others(ELO_TASK_DF)
-                            demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
-                        with gr.Group():
-                            gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
-                            bench_elo_map = gr.Plot()
-                            elo_bench_gr = init_others(ELO_BENCH_DF)
-                            demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
+                # Comment out or remove the Elo Rating tab
+                # with gr.TabItem("📊 Elo Rating", id="full_elo"):
+                #     with gr.Column():
+                #         with gr.Group():
+                #
+                #             gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+                #             task_elo_map = gr.Plot()
+                #             elo_task_gr = init_others(ELO_TASK_DF)
+                #             demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
+                #         with gr.Group():
+                #             gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+                #             bench_elo_map = gr.Plot()
+                #             elo_bench_gr = init_others(ELO_BENCH_DF)
+                #             demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)

                 with gr.TabItem("🧩 Solve Rate", id="full_solve"):
                     with gr.Column():
@@ -602,7 +569,7 @@ with main_block as demo:
         show_copy_button=True,
     )

-    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard,
+    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
     # main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
     # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
     # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
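For reference, a minimal sketch of the dataset-loading pattern the remaining app.py code relies on: each split is pulled from a results repo with the cached download mode, checksum verification skipped, and converted to pandas. The repo id and cache path below are placeholders rather than the real values from src.envs.

import datasets

def load_split_df(repo_id, split, cache_dir):
    # Load one split of a results dataset, reusing the local cache when available
    # and skipping checksum verification, then convert it to a pandas DataFrame.
    return datasets.load_dataset(
        repo_id,
        "default",
        split=split,
        cache_dir=cache_dir,
        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
        verification_mode="no_checks",
    ).to_pandas()

# Hypothetical usage; "user/solve-rate-data" stands in for SOLVE_REPO from src.envs.
# complete_solve_df = load_split_df("user/solve-rate-data", "complete", "~/.cache/huggingface")
# instruct_solve_df = load_split_df("user/solve-rate-data", "instruct", "~/.cache/huggingface")

The six DataFrames returned by get_latest_data_leaderboard() and init_space() line up, in order, with the inputs and outputs lists passed to main_block.load() at the bottom of the file.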
src/display/utils.py
CHANGED
@@ -54,7 +54,7 @@ column_map = {
     "complete": "Complete",
     "instruct": "Instruct",
     "average": "Average",
-    "elo_mle": "Elo Rating",
+    # "elo_mle": "Elo Rating",
     "link": "Link",
     "act_param": "#Act Params (B)",
     "size": "#Params (B)",
@@ -96,7 +96,7 @@ auto_eval_column_dict.append(["size_range", ColumnContent, ColumnContent(column_
 auto_eval_column_dict.append(["complete", ColumnContent, ColumnContent(column_map["complete"], "number", True)])
 auto_eval_column_dict.append(["instruct", ColumnContent, ColumnContent(column_map["instruct"], "number", True)])
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent(column_map["average"], "number", True)])
-auto_eval_column_dict.append(["elo_mle", ColumnContent, ColumnContent(column_map["elo_mle"], "number", True)])
+# auto_eval_column_dict.append(["elo_mle", ColumnContent, ColumnContent(column_map["elo_mle"], "number", True)])

 # Model information
 auto_eval_column_dict.append(["act_param", ColumnContent, ColumnContent(column_map["act_param"], "number", True)])
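A small sketch of how the column_map / auto_eval_column_dict pattern above decides which leaderboard columns exist. ColumnContent is assumed here to be a simple (name, type, displayed_by_default) record; only the constructor shape ColumnContent(name, "number", True) is taken from the diff.

from dataclasses import dataclass

@dataclass(frozen=True)
class ColumnContent:          # assumed shape, inferred from the calls above
    name: str
    type: str                 # e.g. "str" or "number"
    displayed_by_default: bool

column_map = {"complete": "Complete", "instruct": "Instruct", "average": "Average"}

auto_eval_column_dict = []
for key in ("complete", "instruct", "average"):
    auto_eval_column_dict.append([key, ColumnContent, ColumnContent(column_map[key], "number", True)])

# Any key left out of column_map / auto_eval_column_dict (such as "elo_mle" above)
# is never registered, so the table and its column selector simply do not offer it.
print([entry[2].name for entry in auto_eval_column_dict])  # ['Complete', 'Instruct', 'Average']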
src/tools/plots.py
CHANGED
@@ -3,18 +3,18 @@ import plotly.express as px
 import numpy as np


-def plot_elo_mle(df):
-    fig = px.scatter(df, x="model", y="rating", error_y="error_y",
-                     error_y_minus="error_y_minus",
-                     # title="Bootstrap of Elo MLE Estimates (BigCodeBench-Complete)"
-                     )
-    fig.update_layout(xaxis_title="Model",
-                      yaxis_title="Rating",
-                      autosize=True,
-                      # width=1300,
-                      # height=900,
-                      )
-    return fig
+# def plot_elo_mle(df):
+#     fig = px.scatter(df, x="model", y="rating", error_y="error_y",
+#                      error_y_minus="error_y_minus",
+#                      # title="Bootstrap of Elo MLE Estimates (BigCodeBench-Complete)"
+#                      )
+#     fig.update_layout(xaxis_title="Model",
+#                       yaxis_title="Rating",
+#                       autosize=True,
+#                       # width=1300,
+#                       # height=900,
+#                       )
+#     return fig


 def plot_solve_rate(df, task, rows=30, cols=38):
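If plot_elo_mle were re-enabled, the columns it passes to px.scatter imply a DataFrame with one row per model plus a rating and asymmetric error columns. A small sketch of such an input; the numbers are invented purely for illustration and do not come from the Elo dataset repos.

import pandas as pd
import plotly.express as px

# Invented ratings for illustration only.
elo_df = pd.DataFrame({
    "model": ["model-a", "model-b", "model-c"],
    "rating": [1042.0, 1001.3, 958.7],
    "error_y": [18.0, 22.5, 20.1],        # upper bootstrap error bar
    "error_y_minus": [16.5, 21.0, 19.4],  # lower bootstrap error bar
})

fig = px.scatter(elo_df, x="model", y="rating",
                 error_y="error_y", error_y_minus="error_y_minus")
fig.update_layout(xaxis_title="Model", yaxis_title="Rating", autosize=True)
# fig.show()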