Update space
Browse files
app.py
CHANGED
@@ -103,7 +103,8 @@ def init_leaderboard(dataframe):
|
|
103 |
# model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
|
104 |
# model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
|
105 |
# model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
|
106 |
-
model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
|
|
|
107 |
# model_leaderboard_df = get_model_leaderboard_df(model_result_path)
|
108 |
|
109 |
|
@@ -156,7 +157,7 @@ with demo:
|
|
156 |
AutoEvalColumn.rank_math_probability.name,
|
157 |
AutoEvalColumn.rank_reason_logical.name,
|
158 |
AutoEvalColumn.rank_reason_social.name,
|
159 |
-
|
160 |
],
|
161 |
rank_col=[],
|
162 |
)
|
@@ -274,6 +275,7 @@ with demo:
|
|
274 |
[SocialIQA](https://arxiv.org/abs/1904.09728),
|
275 |
[NormBank](https://arxiv.org/abs/2305.17008), covering challenging social reasoning tasks,
|
276 |
such as social commonsense reasoning, social normative reasoning, Theory of Mind (ToM) reasoning, etc.
|
|
|
277 |
|
278 |
"""
|
279 |
gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
|
@@ -314,9 +316,10 @@ with demo:
|
|
314 |
|
315 |
with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
|
316 |
CURRENT_TEXT = """
|
317 |
-
|
318 |
-
|
319 |
-
We
|
|
|
320 |
[GPQA](https://arxiv.org/abs/2311.12022),
|
321 |
[JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
|
322 |
[MMLU-Pro](https://arxiv.org/abs/2406.01574),
|
@@ -359,8 +362,7 @@ with demo:
|
|
359 |
|
360 |
with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
|
361 |
CURRENT_TEXT = """
|
362 |
-
|
363 |
-
We are working on adding more tasks in coding domains to the leaderboard.
|
364 |
The forthcoming ones focus on Python, Java, and C++, with plans to expand to more languages.
|
365 |
We collect a variety of recent coding datasets, including
|
366 |
[HumanEval](https://huggingface.co/datasets/openai/openai_humaneval),
|
@@ -371,6 +373,24 @@ with demo:
|
|
371 |
Our efforts also include synthesizing new code-related queries to ensure diversity!
|
372 |
"""
|
373 |
gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
374 |
|
375 |
|
376 |
|
@@ -386,7 +406,7 @@ with demo:
|
|
386 |
|
387 |
## Team members
|
388 |
Yanbin Yin, [Zhen Wang](https://zhenwang9102.github.io/), [Kun Zhou](https://lancelot39.github.io/), Xiangdong Zhang,
|
389 |
-
[Shibo Hao](https://ber666.github.io/), [Yi Gu](https://www.yigu.page/), Jieyuan Liu, Somanshu Singla, [Tianyang Liu](https://leolty.github.io/),
|
390 |
[Eric P. Xing](https://www.cs.cmu.edu/~epxing/), [Zhengzhong Liu](https://hunterhector.github.io/), [Haojian Jin](https://www.haojianj.in/),
|
391 |
[Zhiting Hu](https://zhiting.ucsd.edu/)
|
392 |
|
|
|
103 |
# model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
|
104 |
# model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
|
105 |
# model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
|
106 |
+
# model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
|
107 |
+
model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
|
108 |
# model_leaderboard_df = get_model_leaderboard_df(model_result_path)
|
109 |
|
110 |
|
|
|
157 |
AutoEvalColumn.rank_math_probability.name,
|
158 |
AutoEvalColumn.rank_reason_logical.name,
|
159 |
AutoEvalColumn.rank_reason_social.name,
|
160 |
+
AutoEvalColumn.rank_chemistry.name,
|
161 |
],
|
162 |
rank_col=[],
|
163 |
)
|
|
|
275 |
[SocialIQA](https://arxiv.org/abs/1904.09728),
|
276 |
[NormBank](https://arxiv.org/abs/2305.17008), covering challenging social reasoning tasks,
|
277 |
such as social commonsense reasoning, social normative reasoning, Theory of Mind (ToM) reasoning, etc.
|
278 |
+
More fine-grained types of reasoning, such as symbolic, analogical, counterfactual reasoning, are planned to be added in the future.
|
279 |
|
280 |
"""
|
281 |
gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
|
|
|
316 |
|
317 |
with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
|
318 |
CURRENT_TEXT = """
|
319 |
+
Scientific tasks are crucial for evaluating LLMs, requiring both domain-specific knowledge and reasoning capabilities.
|
320 |
+
|
321 |
+
We are adding several fine-grained scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
|
322 |
+
We have diversely and aggressively collected recent scientific datasets, including but not limited to
|
323 |
[GPQA](https://arxiv.org/abs/2311.12022),
|
324 |
[JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
|
325 |
[MMLU-Pro](https://arxiv.org/abs/2406.01574),
|
|
|
362 |
|
363 |
with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
|
364 |
CURRENT_TEXT = """
|
365 |
+
We are working on adding more fine-grained tasks in coding domains to the leaderboard.
|
|
|
366 |
The forthcoming ones focus on Python, Java, and C++, with plans to expand to more languages.
|
367 |
We collect a variety of recent coding datasets, including
|
368 |
[HumanEval](https://huggingface.co/datasets/openai/openai_humaneval),
|
|
|
373 |
Our efforts also include synthesizing new code-related queries to ensure diversity!
|
374 |
"""
|
375 |
gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
|
376 |
+
|
377 |
+
with gr.TabItem("🐍 Python", elem_id="python_subtab", id=0, elem_classes="subtab"):
|
378 |
+
CURRENT_TEXT = """
|
379 |
+
# Coming soon!
|
380 |
+
"""
|
381 |
+
gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
|
382 |
+
|
383 |
+
with gr.TabItem("☕ Java", elem_id="java_subtab", id=1, elem_classes="subtab"):
|
384 |
+
CURRENT_TEXT = """
|
385 |
+
# Coming soon!
|
386 |
+
"""
|
387 |
+
gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
|
388 |
+
|
389 |
+
with gr.TabItem("➕ C++", elem_id="cpp_subtab", id=2, elem_classes="subtab"):
|
390 |
+
CURRENT_TEXT = """
|
391 |
+
# Coming soon!
|
392 |
+
"""
|
393 |
+
gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
|
394 |
|
395 |
|
396 |
|
|
|
406 |
|
407 |
## Team members
|
408 |
Yanbin Yin, [Zhen Wang](https://zhenwang9102.github.io/), [Kun Zhou](https://lancelot39.github.io/), Xiangdong Zhang,
|
409 |
+
[Shibo Hao](https://ber666.github.io/), [Yi Gu](https://www.yigu.page/), [Jieyuan Liu](https://www.linkedin.com/in/jieyuan-liu/), [Somanshu Singla](https://www.linkedin.com/in/somanshu-singla-105636214/), [Tianyang Liu](https://leolty.github.io/),
|
410 |
[Eric P. Xing](https://www.cs.cmu.edu/~epxing/), [Zhengzhong Liu](https://hunterhector.github.io/), [Haojian Jin](https://www.haojianj.in/),
|
411 |
[Zhiting Hu](https://zhiting.ucsd.edu/)
|
412 |
|