Terry Zhuo committed
Commit 2e84cf2 · 1 Parent(s): b65f8f6
Files changed (3):
  1. app.py +76 -109
  2. src/display/utils.py +2 -2
  3. src/tools/plots.py +12 -12
app.py CHANGED
@@ -38,8 +38,8 @@ from src.envs import (
     DATA_VERSION,
     DATA_REPO,
     HARD_RESULT_REPO,
-    ELO_REPO,
-    HARD_ELO_REPO,
+    # ELO_REPO, # Comment out
+    # HARD_ELO_REPO, # Comment out
     SOLVE_REPO,
     HARD_SOLVE_REPO,
     HF_TOKEN,
@@ -51,7 +51,7 @@ from src.envs import (
 )
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.execute import generate_command, default_command, stream_logs, find_result_file
-from src.tools.plots import plot_elo_mle, plot_solve_rate
+from src.tools.plots import plot_solve_rate
 # from src.voting.vote_system import VoteManager, run_scheduler
 
 # Configure logging
@@ -66,10 +66,10 @@ DO_FULL_INIT = True # os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
 NEW_DATA_ON_LEADERBOARD = True
 LEADERBOARD_DF = None
 HARD_LEADERBOARD_DF = None
-ELO_TASK_DF = None
-ELO_BENCH_DF = None
-HARD_ELO_TASK_DF = None
-HARD_ELO_BENCH_DF = None
+# ELO_TASK_DF = None # Comment out
+# ELO_BENCH_DF = None # Comment out
+# HARD_ELO_TASK_DF = None # Comment out
+# HARD_ELO_BENCH_DF = None # Comment out
 COMPLETE_SOLVE_DF = None
 INSTRUCT_SOLVE_DF = None
 HARD_COMPLETE_SOLVE_DF = None
@@ -154,10 +154,10 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
 def get_latest_data_leaderboard(
     leaderboard_initial_df = None,
     hard_leaderboard_initial_df = None,
-    elo_task_df = None,
-    elo_bench_df = None,
-    hard_elo_task_df = None,
-    hard_elo_bench_df = None,
+    # elo_task_df = None, # Comment out
+    # elo_bench_df = None, # Comment out
+    # hard_elo_task_df = None, # Comment out
+    # hard_elo_bench_df = None, # Comment out
     complete_solve_df = None,
     instruct_solve_df = None,
     hard_complete_solve_df = None,
@@ -166,10 +166,10 @@ def get_latest_data_leaderboard(
     global NEW_DATA_ON_LEADERBOARD
     global LEADERBOARD_DF
     global HARD_LEADERBOARD_DF
-    global ELO_TASK_DF
-    global ELO_BENCH_DF
-    global HARD_ELO_TASK_DF
-    global HARD_ELO_BENCH_DF
+    # global ELO_TASK_DF # Comment out
+    # global ELO_BENCH_DF # Comment out
+    # global HARD_ELO_TASK_DF # Comment out
+    # global HARD_ELO_BENCH_DF # Comment out
     global COMPLETE_SOLVE_DF
     global INSTRUCT_SOLVE_DF
     global HARD_COMPLETE_SOLVE_DF
@@ -182,7 +182,7 @@ def get_latest_data_leaderboard(
             "default",
             split="train",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
            verification_mode="no_checks"
         ).filter(lambda x: x['complete'] is not None or x['instruct'] is not None)
         LEADERBOARD_DF = get_leaderboard_df(
@@ -194,7 +194,7 @@ def get_latest_data_leaderboard(
             "default",
             split="train",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).filter(lambda x: x['complete'] is not None or x['instruct'] is not None)
         hard_leaderboard_df = get_leaderboard_df(
@@ -202,51 +202,23 @@ def get_latest_data_leaderboard(
             cols=COLS,
         )
         HARD_LEADERBOARD_DF = hard_leaderboard_df
-
-        elo_task_df = datasets.load_dataset(
-            ELO_REPO,
-            "default",
-            split="task_no_tie",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        elo_bench_df = datasets.load_dataset(
-            ELO_REPO,
-            "default",
-            split="benchmark_tie",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        ELO_TASK_DF = elo_task_df
-        ELO_BENCH_DF = elo_bench_df
-
-        hard_elo_task_df = datasets.load_dataset(
-            HARD_ELO_REPO,
-            "default",
-            split="task_no_tie",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        hard_elo_bench_df = datasets.load_dataset(
-            HARD_ELO_REPO,
-            "default",
-            split="benchmark_tie",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        HARD_ELO_TASK_DF = hard_elo_task_df
-        HARD_ELO_BENCH_DF = hard_elo_bench_df
+
+        # Comment out Elo dataset loading
+        # elo_task_df = datasets.load_dataset(...)
+        # elo_bench_df = datasets.load_dataset(...)
+        # ELO_TASK_DF = elo_task_df
+        # ELO_BENCH_DF = elo_bench_df
+        # hard_elo_task_df = datasets.load_dataset(...)
+        # hard_elo_bench_df = datasets.load_dataset(...)
+        # HARD_ELO_TASK_DF = hard_elo_task_df
+        # HARD_ELO_BENCH_DF = hard_elo_bench_df
 
         complete_solve_df = datasets.load_dataset(
             SOLVE_REPO,
             "default",
             split="complete",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).to_pandas()
         instruct_solve_df = datasets.load_dataset(
@@ -254,7 +226,7 @@ def get_latest_data_leaderboard(
             "default",
             split="instruct",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).to_pandas()
         COMPLETE_SOLVE_DF = complete_solve_df
@@ -265,7 +237,7 @@ def get_latest_data_leaderboard(
             "default",
             split="complete",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).to_pandas()
         hard_instruct_solve_df = datasets.load_dataset(
@@ -273,7 +245,7 @@ def get_latest_data_leaderboard(
             "default",
             split="instruct",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).to_pandas()
         HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
@@ -283,18 +255,17 @@ def get_latest_data_leaderboard(
 
     else:
         LEADERBOARD_DF = leaderboard_initial_df
-        # HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
-        ELO_TASK_DF = elo_task_df
-        # ELO_BENCH_DF = elo_bench_df
-        # HARD_ELO_TASK_DF = hard_elo_task_df
-        HARD_ELO_BENCH_DF = hard_elo_bench_df
+        HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
+        # ELO_TASK_DF = elo_task_df # Comment out
+        # ELO_BENCH_DF = elo_bench_df # Comment out
+        # HARD_ELO_TASK_DF = hard_elo_task_df # Comment out
+        # HARD_ELO_BENCH_DF = hard_elo_bench_df # Comment out
         COMPLETE_SOLVE_DF = complete_solve_df
-        # INSTRUCT_SOLVE_DF = instruct_solve_df
-        # HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
+        INSTRUCT_SOLVE_DF = instruct_solve_df
+        HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
         HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
 
-    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
-    # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
 
 
 def init_space():
@@ -303,19 +274,19 @@ def init_space():
     # Always redownload the leaderboard DataFrame
     global LEADERBOARD_DF
     global HARD_LEADERBOARD_DF
-    global ELO_TASK_DF
-    global ELO_BENCH_DF
-    global HARD_ELO_TASK_DF
-    global HARD_ELO_BENCH_DF
+    # global ELO_TASK_DF # Comment out
+    # global ELO_BENCH_DF # Comment out
+    # global HARD_ELO_TASK_DF # Comment out
+    # global HARD_ELO_BENCH_DF # Comment out
     global COMPLETE_SOLVE_DF
     global INSTRUCT_SOLVE_DF
     global HARD_COMPLETE_SOLVE_DF
     global HARD_INSTRUCT_SOLVE_DF
 
-    LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
+    LEADERBOARD_DF, HARD_LEADERBOARD_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
     # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
 
-    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
     # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
 
 # Initialize VoteManager
@@ -331,10 +302,7 @@ def init_space():
 
 # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
 # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
-LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, \
-    ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
-    COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
-    HARD_INSTRUCT_SOLVE_DF = init_space()
+LEADERBOARD_DF, HARD_LEADERBOARD_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = init_space()
 # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = init_space()
 
 # Data processing for plots now only on demand in the respective Gradio tab
@@ -399,7 +367,6 @@ with main_block as demo:
                 - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
                 - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
                 - `Average` is the average of `Complete` and `Instruct` when both are available.
-                - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
                 - `#Act Params (B)` is the number of activated model parameters during inference.
                 - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
                 - For more details check the 📝 About section.
@@ -407,20 +374,21 @@ with main_block as demo:
                 elem_classes="markdown-text",
             )
 
-            with gr.TabItem("📊 Elo Rating", id="hard_elo"):
-                with gr.Column():
-                    with gr.Group():
-                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
-                        hard_task_elo_map = gr.Plot()
-                        hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
-                        demo.load(plot_elo_mle, [hard_elo_task_gr],
-                                  hard_task_elo_map)
-                    with gr.Group():
-                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
-                        hard_bench_elo_map = gr.Plot()
-                        hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
-                        demo.load(plot_elo_mle, [hard_elo_bench_gr],
-                                  hard_bench_elo_map)
+            # Comment out or remove the Elo Rating tab
+            # with gr.TabItem("📊 Elo Rating", id="hard_elo"):
+            #     with gr.Column():
+            #         with gr.Group():
+            #             gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+            #             hard_task_elo_map = gr.Plot()
+            #             hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
+            #             demo.load(plot_elo_mle, [hard_elo_task_gr],
+            #                       hard_task_elo_map)
+            #         with gr.Group():
+            #             gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+            #             hard_bench_elo_map = gr.Plot()
+            #             hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
+            #             demo.load(plot_elo_mle, [hard_elo_bench_gr],
+            #                       hard_bench_elo_map)
 
             with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
                 with gr.Column():
@@ -448,27 +416,26 @@ with main_block as demo:
                 - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
                 - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
                 - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
-                - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
-                - `size` is the amount of activated model weight during inference.
                 - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
                 - For more details check the 📝 About section.
                 """,
                 elem_classes="markdown-text",
            )
 
-            with gr.TabItem("📊 Elo Rating", id="full_elo"):
-                with gr.Column():
-                    with gr.Group():
-
-                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
-                        task_elo_map = gr.Plot()
-                        elo_task_gr = init_others(ELO_TASK_DF)
-                        demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
-                    with gr.Group():
-                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
-                        bench_elo_map = gr.Plot()
-                        elo_bench_gr = init_others(ELO_BENCH_DF)
-                        demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
+            # Comment out or remove the Elo Rating tab
+            # with gr.TabItem("📊 Elo Rating", id="full_elo"):
+            #     with gr.Column():
+            #         with gr.Group():
+            #
+            #             gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+            #             task_elo_map = gr.Plot()
+            #             elo_task_gr = init_others(ELO_TASK_DF)
+            #             demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
+            #         with gr.Group():
+            #             gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+            #             bench_elo_map = gr.Plot()
+            #             elo_bench_gr = init_others(ELO_BENCH_DF)
+            #             demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
 
             with gr.TabItem("🧩 Solve Rate", id="full_solve"):
                 with gr.Column():
@@ -602,7 +569,7 @@ with main_block as demo:
        show_copy_button=True,
    )
 
-    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
+    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
     # main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
     # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
     # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
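For reference, the data-refresh path that survives this change is plain `datasets` plus Gradio Blocks wiring: load a split with the cached download mode, convert it to pandas, and re-run the loader on every page load via `Blocks.load`. Below is a minimal, self-contained sketch of that pattern; the repo id, split name, and component names are placeholders rather than the leaderboard's actual values.

```python
import datasets
import gradio as gr


def get_latest_data(current_df=None):
    # Reuse the locally prepared dataset if it exists; skip checksum verification.
    return datasets.load_dataset(
        "your-org/results-repo",   # placeholder repo id
        "default",
        split="train",             # placeholder split name
        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
        verification_mode="no_checks",
    ).to_pandas()


with gr.Blocks() as demo:
    table = gr.Dataframe()
    # Refresh the table each time the page is (re)loaded, mirroring main_block.load above.
    demo.load(fn=get_latest_data, inputs=[table], outputs=[table])

if __name__ == "__main__":
    demo.launch()
```

With `REUSE_DATASET_IF_EXISTS`, repeated page loads reuse the cached copy instead of re-downloading the repository.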
src/display/utils.py CHANGED
@@ -54,7 +54,7 @@ column_map = {
     "complete": "Complete",
     "instruct": "Instruct",
     "average": "Average",
-    "elo_mle": "Elo Rating",
+    # "elo_mle": "Elo Rating",
     "link": "Link",
     "act_param": "#Act Params (B)",
     "size": "#Params (B)",
@@ -96,7 +96,7 @@ auto_eval_column_dict.append(["size_range", ColumnContent, ColumnContent(column_
 auto_eval_column_dict.append(["complete", ColumnContent, ColumnContent(column_map["complete"], "number", True)])
 auto_eval_column_dict.append(["instruct", ColumnContent, ColumnContent(column_map["instruct"], "number", True)])
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent(column_map["average"], "number", True)])
-auto_eval_column_dict.append(["elo_mle", ColumnContent, ColumnContent(column_map["elo_mle"], "number", True)])
+# auto_eval_column_dict.append(["elo_mle", ColumnContent, ColumnContent(column_map["elo_mle"], "number", True)])
 
 # Model information
 auto_eval_column_dict.append(["act_param", ColumnContent, ColumnContent(column_map["act_param"], "number", True)])
src/tools/plots.py CHANGED
@@ -3,18 +3,18 @@ import plotly.express as px
 import numpy as np
 
 
-def plot_elo_mle(df):
-    fig = px.scatter(df, x="model", y="rating", error_y="error_y",
-                     error_y_minus="error_y_minus",
-                     # title="Bootstrap of Elo MLE Estimates (BigCodeBench-Complete)"
-                     )
-    fig.update_layout(xaxis_title="Model",
-                      yaxis_title="Rating",
-                      autosize=True,
-                      # width=1300,
-                      # height=900,
-                      )
-    return fig
+# def plot_elo_mle(df):
+#     fig = px.scatter(df, x="model", y="rating", error_y="error_y",
+#                      error_y_minus="error_y_minus",
+#                      # title="Bootstrap of Elo MLE Estimates (BigCodeBench-Complete)"
+#                      )
+#     fig.update_layout(xaxis_title="Model",
+#                       yaxis_title="Rating",
+#                       autosize=True,
+#                       # width=1300,
+#                       # height=900,
+#                       )
+#     return fig
 
 
 def plot_solve_rate(df, task, rows=30, cols=38):
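The removed `plot_elo_mle` expected a DataFrame with `model`, `rating`, `error_y`, and `error_y_minus` columns. Should the chart ever be needed outside the app, an equivalent standalone call would look roughly like this (the ratings below are made-up toy values, not benchmark results):

```python
import pandas as pd
import plotly.express as px

# Toy ratings with asymmetric bootstrap error bars; values are illustrative only.
df = pd.DataFrame({
    "model": ["model-a", "model-b", "model-c"],
    "rating": [1105.0, 1042.0, 987.0],
    "error_y": [18.0, 22.0, 15.0],        # upper error bar
    "error_y_minus": [16.0, 20.0, 14.0],  # lower error bar
})

fig = px.scatter(df, x="model", y="rating",
                 error_y="error_y", error_y_minus="error_y_minus")
fig.update_layout(xaxis_title="Model", yaxis_title="Rating", autosize=True)
fig.show()
```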