yzabc007 committed on
Commit
a56e40b
·
1 Parent(s): 2f5cc84

Update space

Browse files
Files changed (1) hide show
  1. app.py +28 -8
app.py CHANGED
@@ -103,7 +103,8 @@ def init_leaderboard(dataframe):
103
  # model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
104
  # model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
105
  # model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
106
- model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
 
107
  # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
108
 
109
 
@@ -156,7 +157,7 @@ with demo:
156
  AutoEvalColumn.rank_math_probability.name,
157
  AutoEvalColumn.rank_reason_logical.name,
158
  AutoEvalColumn.rank_reason_social.name,
159
- # AutoEvalColumn.rank_chemistry.name,
160
  ],
161
  rank_col=[],
162
  )
@@ -274,6 +275,7 @@ with demo:
274
  [SocialIQA](https://arxiv.org/abs/1904.09728),
275
  [NormBank](https://arxiv.org/abs/2305.17008), covering challenging social reasoning tasks,
276
  such as social commonsense reasoning, social normative reasoning, Theory of Mind (ToM) reasoning, etc.
 
277
 
278
  """
279
  gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
@@ -314,9 +316,10 @@ with demo:
314
 
315
  with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
316
  CURRENT_TEXT = """
317
- Sicnece domain is a critical area for evaluating LLMs.
318
- We are working on adding several tasks on scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
319
- We have diversely and aggressively collected recent science datasets, including but not limited to
 
320
  [GPQA](https://arxiv.org/abs/2311.12022),
321
  [JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
322
  [MMLU-Pro](https://arxiv.org/abs/2406.01574),
@@ -359,8 +362,7 @@ with demo:
359
 
360
  with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
361
  CURRENT_TEXT = """
362
- # Coming soon!
363
- We are working on adding more tasks in coding domains to the leaderboard.
364
  The forthcoming ones focus on Python, Java, and C++, with plans to expand to more languages.
365
  We collect a variety of recent coding datasets, including
366
  [HumanEval](https://huggingface.co/datasets/openai/openai_humaneval),
@@ -371,6 +373,24 @@ with demo:
371
  Our efforts also include synthesizing new code-related queries to ensure diversity!
372
  """
373
  gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
 
375
 
376
 
@@ -386,7 +406,7 @@ with demo:
386
 
387
  ## Team members
388
  Yanbin Yin, [Zhen Wang](https://zhenwang9102.github.io/), [Kun Zhou](https://lancelot39.github.io/), Xiangdong Zhang,
389
- [Shibo Hao](https://ber666.github.io/), [Yi Gu](https://www.yigu.page/), Jieyuan Liu, Somanshu Singla, [Tianyang Liu](https://leolty.github.io/),
390
  [Eric P. Xing](https://www.cs.cmu.edu/~epxing/), [Zhengzhong Liu](https://hunterhector.github.io/), [Haojian Jin](https://www.haojianj.in/),
391
  [Zhiting Hu](https://zhiting.ucsd.edu/)
392
 
 
103
  # model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
104
  # model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
105
  # model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
106
+ # model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
107
+ model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
108
  # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
109
 
110
 
 
157
  AutoEvalColumn.rank_math_probability.name,
158
  AutoEvalColumn.rank_reason_logical.name,
159
  AutoEvalColumn.rank_reason_social.name,
160
+ AutoEvalColumn.rank_chemistry.name,
161
  ],
162
  rank_col=[],
163
  )
 
275
  [SocialIQA](https://arxiv.org/abs/1904.09728),
276
  [NormBank](https://arxiv.org/abs/2305.17008), covering challenging social reasoning tasks,
277
  such as social commonsense reasoning, social normative reasoning, Theory of Mind (ToM) reasoning, etc.
278
+ More fine-grained types of reasoning, such as symbolic, analogical, counterfactual reasoning, are planned to be added in the future.
279
 
280
  """
281
  gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
 
316
 
317
  with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
318
  CURRENT_TEXT = """
319
+ Scientific tasks are crucial for evaluating LLMs, requiring both domain-specific knowledge and reasoning capabilities.
320
+
321
+ We are adding several fine-grained scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
322
+ We have diversely and aggressively collected recent scientific datasets, including but not limited to
323
  [GPQA](https://arxiv.org/abs/2311.12022),
324
  [JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
325
  [MMLU-Pro](https://arxiv.org/abs/2406.01574),
 
362
 
363
  with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
364
  CURRENT_TEXT = """
365
+ We are working on adding more fine-grained tasks in coding domains to the leaderboard.
 
366
  The forthcoming ones focus on Python, Java, and C++, with plans to expand to more languages.
367
  We collect a variety of recent coding datasets, including
368
  [HumanEval](https://huggingface.co/datasets/openai/openai_humaneval),
 
373
  Our efforts also include synthesizing new code-related queries to ensure diversity!
374
  """
375
  gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
376
+
377
+ with gr.TabItem("🐍 Python", elem_id="python_subtab", id=0, elem_classes="subtab"):
378
+ CURRENT_TEXT = """
379
+ # Coming soon!
380
+ """
381
+ gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
382
+
383
+ with gr.TabItem("☕ Java", elem_id="java_subtab", id=1, elem_classes="subtab"):
384
+ CURRENT_TEXT = """
385
+ # Coming soon!
386
+ """
387
+ gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
388
+
389
+ with gr.TabItem("➕ C++", elem_id="cpp_subtab", id=2, elem_classes="subtab"):
390
+ CURRENT_TEXT = """
391
+ # Coming soon!
392
+ """
393
+ gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
394
 
395
 
396
 
 
406
 
407
  ## Team members
408
  Yanbin Yin, [Zhen Wang](https://zhenwang9102.github.io/), [Kun Zhou](https://lancelot39.github.io/), Xiangdong Zhang,
409
+ [Shibo Hao](https://ber666.github.io/), [Yi Gu](https://www.yigu.page/), [Jieyuan Liu](https://www.linkedin.com/in/jieyuan-liu/), [Somanshu Singla](https://www.linkedin.com/in/somanshu-singla-105636214/), [Tianyang Liu](https://leolty.github.io/),
410
  [Eric P. Xing](https://www.cs.cmu.edu/~epxing/), [Zhengzhong Liu](https://hunterhector.github.io/), [Haojian Jin](https://www.haojianj.in/),
411
  [Zhiting Hu](https://zhiting.ucsd.edu/)
412