Spaces:
Running
Running
from pathlib import Path | |
from collections import OrderedDict | |
# Default "K" setting, stored as a display string.
# NOTE(review): what K controls is not visible in this file — confirm against
# the code that reads DEFAULT_K.
DEFAULT_K = "∞"
# DEFAULT_K = "1500"

# Location of the banner image (served raw from the GitHub repository).
banner_url = "https://github.com/yuchenlin/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.

# HTML snippet that shows the banner image, left-aligned inside a flex
# container.  The image URL is interpolated once, when this module is imported.
BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
# TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
# BibTeX entries offered to users for citing this benchmark.
#
# This is a raw string on purpose: BibTeX spells the umlaut in "Zaïd" as
# {\"i}, and in a normal Python string literal the sequence \" is an escape
# that collapses to a bare double quote, silently dropping the backslash and
# corrupting the entry.
CITATION_TEXT = r"""
@misc{zebralogic2024,
    title={ZebraLogic: Benchmarking the Logical Reasoning Ability of Language Models},
    author={Bill Yuchen Lin and Ronan Le Bras and Yejin Choi},
    url={https://huggingface.co/spaces/allenai/ZebraLogic},
    year={2024}
}
@article{dziri2024faith,
    title={Faith and fate: Limits of transformers on compositionality},
    author={Nouha Dziri and Ximing Lu and Melanie Sclar and Xiang Lorraine Li and Liwei Jian and Bill Yuchen Lin and Peter West and Chandra Bhagavatula and Ronan Le Bras and Jena D. Hwang and Soumya Sanyal and Sean Welleck and Xiang Ren and Allyson Ettinger and Za{\"i}d Harchaoui and Yejin Choi},
    journal={Advances in Neural Information Processing Systems},
    volume={36},
    year={2024}
}
"""
# Ordered mapping of column name -> display name.  Every entry currently maps
# a name to itself; the pairs are listed explicitly so the order is obvious
# and an individual display name can be changed without touching its key.
column_names = OrderedDict([
    ("Model", "Model"),
    ("Mode", "Mode"),
    ("Puzzle Acc", "Puzzle Acc"),
    ("Cell Acc", "Cell Acc"),
    ("No answer", "No answer"),
    ("Easy Puzzle Acc", "Easy Puzzle Acc"),
    ("Hard Puzzle Acc", "Hard Puzzle Acc"),
    # ("Total Puzzles", "Total Puzzles"),
    # ("Reason Lens", "Reason Lens"),
])
# Markdown remark text for the leaderboard.
# NOTE(review): this text explains a pairwise "WB Reward" metric, while the
# columns defined in this file are puzzle/cell accuracies — it looks like a
# leftover from another leaderboard; confirm whether it is still displayed.
LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.
"""
# **WB Reward**: for each pairwise comparison, a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; 0 for a **Tie**.
# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three.
# **WB Score** individually scores each model based on checklists.
# Evaluator is GPT-4-Turbo.

# Remark text for the main view; intentionally just a newline at the moment.
LEADERBOARD_REMARKS_MAIN = """
"""
# Name of the column designated for ranking; it is one of the entries of
# ORDERED_COLUMN_NAMES below.
RANKING_COLUMN = "Puzzle Acc"

# Column names in their intended left-to-right order.
ORDERED_COLUMN_NAMES = [
    "Model",
    "Mode",
    "Puzzle Acc",
    "Easy Puzzle Acc",
    "Hard Puzzle Acc",
    "Cell Acc",
    "No answer",
]
# JavaScript source defining `refresh()`.  The function:
#   1. forces the light theme by adding `__theme=light` to the page URL and
#      navigating to it when the parameter is missing or different;
#   2. otherwise prepends a bold "Decoding Mode: " label to the element with
#      id "rank-column-radio" and lays its children out in a flex row.
# Two guards keep the script from failing: it returns right after triggering
# the redirect (the page is being replaced, so DOM work would be wasted), and
# it returns when the target element does not exist, instead of raising a
# TypeError on `fieldset.firstChild`.
js_light = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
        // Navigation has been requested; the reloaded page will run this again.
        return;
    }
    // Find the fieldset with the given id
    const fieldset = document.getElementById("rank-column-radio");
    if (!fieldset) {
        // Element not rendered (yet): nothing to decorate.
        return;
    }
    // Create a new span element with the text "Decoding Mode:"
    const rankBySpan = document.createElement("span");
    rankBySpan.textContent = "Decoding Mode: ";
    rankBySpan.style.fontWeight = "bold"; // Optional: make the text bold
    rankBySpan.style.fontSize = "19px"; // Larger font size
    rankBySpan.style.paddingRight = "18px"; // Add padding on the right
    // Wrap the span and the labels in a flex container
    const flexContainer = document.createElement("div");
    flexContainer.style.display = "flex";
    flexContainer.style.alignItems = "center";
    // Insert the rankBySpan at the beginning of the flex container
    flexContainer.appendChild(rankBySpan);
    // Move all existing labels into the flex container
    while (fieldset.firstChild) {
        flexContainer.appendChild(fieldset.firstChild);
    }
    // Append the flex container back to the fieldset
    fieldset.appendChild(flexContainer);
}
"""
# JavaScript source defining `scroll_top()`: it logs a greeting to the
# console, then resets the vertical scroll position of every element with
# class "bubble-wrap", one after another with a 100 ms stagger between them.
js_code = """
function scroll_top() {
    console.log("Hello from Gradio!");
    const bubbles = document.querySelectorAll('.bubble-wrap');
    bubbles.forEach((bubble, index) => {
        setTimeout(() => {
            bubble.scrollTop = 0;
        }, index * 100); // Delay of 100ms between each iteration
    });
}
"""
# Markdown legend that expands the abbreviated task-category names.
# NOTE(review): none of these categories appear among the column names in this
# file — possibly a leftover from another leaderboard; confirm it is used.
TASK_TYPE_STR = "**Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtWrt**), Coding&Debugging (**Code**), Reasoning (**Reason**), Editing (**Edit**), **Math**, Planning (**Plan**), Brainstorming (**Brnstrm**), Role playing (**RolPly**), Advice seeking (**AdvSek**), Data Analysis (**DataAna**)"
# Stylesheet text for the app UI.
#
# Only valid CSS is kept here, because browsers silently drop declarations
# they do not recognise:
#   * italic text is requested with `font-style`, bold with `font-weight`
#     (`font-weight: italic` and `font-decoration: bold` are not valid CSS);
#   * text colour is set with `color` only (`font-color` is not a property);
#   * disabled declarations use /* ... */, since `#` does not start a comment
#     in CSS;
#   * `.markdown-text-tiny` is defined once, with the 12pt size that used to
#     win over an earlier, shadowed 10pt definition.
css = """
code {
    font-size: large;
}
footer {visibility: hidden}
.top-left-LP{
    margin-top: 6px;
    margin-left: 5px;
}
.no_margin{
    margin-top: 0px;
    margin-left: 0px;
    margin-right: 0px;
    margin-bottom: 0px;
    padding-top: 0px;
    padding-left: 0px;
    padding-right: 0px;
    padding-bottom: 0px;
}
.markdown-text{font-size: 14pt}
.markdown-text-small{font-size: 13pt}
.markdown-text-tiny{font-size: 12pt}
.markdown-text-tiny-red{
    font-size: 12pt;
    color: red;
    background-color: yellow;
    font-weight: bold;
}
th {
    text-align: center;
    font-size: 17px; /* Adjust the font size as needed */
}
td {
    font-size: 15px; /* Adjust the font size as needed */
    text-align: center;
}
.sample_button{
    border: 2px solid #000000;
    border-radius: 10px;
    padding: 10px;
    font-size: 17pt;
    font-weight: bold;
    margin: 5px;
    background-color: #D8BFD8;
}
.chat-common{
    height: auto;
    max-height: 400px;
    min-height: 100px;
}
.chat-specific{
    height: auto;
    max-height: 600px;
    min-height: 200px;
}
#od-benchmark-tab-table-button{
    font-size: 15pt;
    font-weight: bold;
}
.btn_boderline{
    border: 1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.btn_boderline_next{
    border: 0.1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.btn_boderline_gray{
    border: 0.5px solid gray;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-style: italic;
}
.btn_boderline_selected{
    border: 2px solid purple;
    background-color: #f2f2f2;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.accordion-label button span{
    font-size: 14pt;
    font-weight: bold;
}
#show-task-categorized span{
    font-size: 13pt;
    font-weight: bold;
}
#show-open-source-models span{
    font-size: 13pt;
    font-weight: bold;
}
#select-models span{
    font-size: 10pt;
}
#select-tasks span{
    font-size: 10pt;
}
.markdown-text-details{
    margin: 10px;
    padding: 10px;
}
button.selected[role="tab"][aria-selected="true"] {
    font-size: 18px; /* or any other size you prefer */
    font-weight: bold;
}
#od-benchmark-tab-table-ablation-button {
    font-size: larger; /* Adjust the font size as needed */
}
.plotly-plot{
    height: auto;
    max-height: 600px;
    min-height: 600px;
}
#length-margin-radio{
    font-size: 10pt;
    /* padding: 0px; */
    /* margin: 1px; */
}
#show-task-categorized{
    font-size: 12pt;
    font-weight: bold;
}
#show-open-source-models{
    font-size: 12pt;
    font-weight: bold;
}
.box_md{
    border: 1px solid #000000;
    border-radius: 10px;
    padding: 10px;
    font-size: 12pt;
    margin: 5px;
}
"""