import os
import json

import gradio as gr
import pandas as pd
import numpy as np
from pathlib import Path
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
    ABOUT_TEXT,
)
from src.display.css_html_js import custom_css
# from src.display.utils import (
#     BENCHMARK_COLS,
#     COLS,
#     EVAL_COLS,
#     EVAL_TYPES,
#     NUMERIC_INTERVALS,
#     TYPES,
#     AutoEvalColumn,
#     ModelType,
#     fields,
#     WeightType,
#     Precision
# )
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception:
    # Download failures are swallowed so the Space can still boot without fresh results.
    pass
    # restart_space()
SUBSET_COUNTS = {
    "Alignment-Object": 250,
    "Alignment-Attribute": 229,
    "Alignment-Action": 115,
    "Alignment-Count": 55,
    "Alignment-Location": 75,
    "Safety-Toxicity-Crime": 29,
    "Safety-Toxicity-Shocking": 31,
    "Safety-Toxicity-Disgust": 42,
    "Safety-Nsfw-Evident": 197,
    "Safety-Nsfw-Evasive": 177,
    "Safety-Nsfw-Subtle": 98,
    "Quality-Distortion-Human_face": 169,
    "Quality-Distortion-Human_limb": 152,
    "Quality-Distortion-Object": 100,
    "Quality-Blurry-Defocused": 350,
    "Quality-Blurry-Motion": 350,
    "Bias-Age": 80,
    "Bias-Gender": 140,
    "Bias-Race": 140,
    "Bias-Nationality": 120,
    "Bias-Religion": 60,
}
PERSPECTIVE_COUNTS = {
    "Alignment": 724,
    "Safety": 574,
    "Quality": 1121,
    "Bias": 540,
}
META_DATA = ['Model']
def restart_space():
    API.restart_space(repo_id=REPO_ID)
# color_map = {
#     "Score Model": "#7497db",
#     "Opensource VLM": "#E8ECF2",
#     "Closesource VLM": "#ffcd75",
#     "Others": "#75809c",
#     # #7497db #E8ECF2 #ffcd75 #75809c
# }
# def color_model_type_column(df, color_map):
#     """
#     Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
#
#     Parameters:
#         df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
#         color_map (dict): A dictionary mapping model types to colors.
#
#     Returns:
#         pd.Styler: The styled DataFrame.
#     """
#     # Apply a background color based on the model type
#     def apply_color(val):
#         color = color_map.get(val, "default")  # default color if not specified in color_map
#         return f'background-color: {color}'
#
#     # Number formats for the different columns
#     format_dict = {col: "{:.1f}" for col in df.columns if col not in META_DATA}
#     format_dict['Overall Score'] = "{:.2f}"
#     format_dict[''] = "{:d}"
#
#     return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='')
def regex_table(dataframe, regex, filter_button, style=True):
    """
    Takes a comma-separated list of regexes and returns only the rows whose
    model name matches at least one of them.
    """
    # Split the query on commas and trim whitespace around each regex
    regex_list = [x.strip() for x in regex.split(",")]
    # Join the list into a single pattern, with '|' acting as OR
    combined_regex = '|'.join(regex_list)

    # Drop rows whose model type was unchecked in the filter
    if isinstance(filter_button, (list, str)):
        if "Integrated LVLM" not in filter_button:
            dataframe = dataframe[~dataframe["Model Type"].str.contains("Integrated LVLM", case=False, na=False)]
        if "Interleaved LVLM" not in filter_button:
            dataframe = dataframe[~dataframe["Model Type"].str.contains("Interleaved LVLM", case=False, na=False)]

    # Keep rows whose 'Model' matches any of the regex patterns
    data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
    data.reset_index(drop=True, inplace=True)
    # Replace the unnamed first column with a 1-based rank
    data.insert(0, '', range(1, 1 + len(data)))
    # if style:
    #     # apply color
    #     data = color_model_type_column(data, color_map)
    return data
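# Usage sketch (values are illustrative): a query such as "GPT, Gemini" is
# split on the comma and OR-joined, so regex_table(df, "GPT, Gemini",
# ["Integrated LVLM"]) keeps Integrated-LVLM rows whose Model matches either
# pattern, then re-ranks them from 1.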
def get_leaderboard_results(results_path):
    data_dir = Path(results_path)
    df = pd.DataFrame()
    # Read every JSON result file in the directory and stack them into one frame
    for file in sorted(data_dir.glob("*.json")):
        if not file.is_file():
            continue
        with open(file) as rf:
            result = json.load(rf)
        result = pd.DataFrame(result)
        df = pd.concat([result, df])
    df.reset_index(drop=True, inplace=True)
    return df
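# Usage sketch (assumes EVAL_RESULTS_PATH holds one JSON file per model, each
# loadable with pd.DataFrame):
#     results_df = get_leaderboard_results(EVAL_RESULTS_PATH)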
def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=META_DATA, perspective_counts=PERSPECTIVE_COUNTS):
    new_df = orig_df[meta_data + columns_name].copy()
    # Weight each selected perspective by its sample count so the overall
    # score is a per-sample average; normalize over the selected columns only.
    new_perspective_counts = {col: perspective_counts[col] for col in columns_name}
    total_count = sum(new_perspective_counts.values())
    weights = {perspective: count / total_count for perspective, count in new_perspective_counts.items()}

    def calculate_weighted_avg(row):
        return sum(row[col] * weights[col] for col in columns_name)

    new_df["Overall Score"] = new_df.apply(calculate_weighted_avg, axis=1)
    cols = meta_data + ["Overall Score"] + columns_name
    new_df = new_df[cols].sort_values(by="Overall Score", ascending=False).reset_index(drop=True)
    return new_df
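# Worked example with the full PERSPECTIVE_COUNTS (724 + 574 + 1121 + 540 = 2959):
# weights are Alignment 724/2959 ≈ 0.245, Safety 574/2959 ≈ 0.194,
# Quality 1121/2959 ≈ 0.379, Bias 540/2959 ≈ 0.182, so
# Overall = 0.245*Alignment + 0.194*Safety + 0.379*Quality + 0.182*Bias.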
data = {
    "Model": [
        "MiniGPT-5", "EMU-2", "GILL", "Anole",
        "GPT-4o | Openjourney", "GPT-4o | SD-3", "GPT-4o | SD-XL", "GPT-4o | Flux",
        "Gemini-1.5 | Openjourney", "Gemini-1.5 | SD-3", "Gemini-1.5 | SD-XL", "Gemini-1.5 | Flux",
        "LLAVA-34b | Openjourney", "LLAVA-34b | SD-3", "LLAVA-34b | SD-XL", "LLAVA-34b | Flux",
        "Qwen-VL-70b | Openjourney", "Qwen-VL-70b | SD-3", "Qwen-VL-70b | SD-XL", "Qwen-VL-70b | Flux"
    ],
    "Model Type": [
        "Interleaved LVLM", "Interleaved LVLM", "Interleaved LVLM", "Interleaved LVLM",
        "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
        "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
        "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
        "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
    ],
    "Situational analysis": [
        47.63, 39.65, 46.72, 48.95,
        53.05, 53.00, 56.12, 54.97,
        48.08, 47.48, 49.43, 47.07,
        54.12, 54.72, 55.97, 54.23,
        52.73, 54.98, 52.58, 54.23
    ],
    "Project-based learning": [
        55.12, 46.12, 57.57, 59.05,
        71.40, 71.20, 73.25, 68.80,
        67.93, 68.70, 71.85, 68.33,
        73.47, 72.55, 74.60, 71.32,
        71.63, 71.87, 73.57, 69.47
    ],
    "Multi-step reasoning": [
        42.17, 50.75, 39.33, 51.72,
        53.67, 53.67, 53.67, 53.67,
        60.05, 60.05, 60.05, 60.05,
        47.28, 47.28, 47.28, 47.28,
        55.63, 55.63, 55.63, 55.63
    ],
    "AVG": [
        50.92, 45.33, 51.58, 55.22,
        63.65, 63.52, 65.47, 62.63,
        61.57, 61.87, 64.15, 61.55,
        63.93, 63.57, 65.05, 62.73,
        64.05, 64.75, 65.12, 63.18
    ]
}
df = pd.DataFrame(data)
total_models = len(df)
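# NOTE: the table above is a static snapshot kept inline; once result files
# are published to EVAL_RESULTS_PATH, get_leaderboard_results() could rebuild
# it dynamically, assuming the JSON files share this column schema.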
with gr.Blocks(css=custom_css) as app:
    with gr.Row():
        with gr.Column(scale=6):
            gr.Markdown(INTRODUCTION_TEXT.format(str(total_models)))
        with gr.Column(scale=4):
            # Use the /resolve/ URL so the raw image is served; /blob/ returns
            # the Hub's HTML file viewer rather than the image itself.
            gr.Markdown("![](https://huggingface.co/spaces/MMIE/Leaderboard/resolve/main/src/overview.jpeg)")
            # gr.HTML(BGB_LOGO, elem_classes="logo")
with gr.Tabs(elem_classes="tab-buttons") as tabs: | |
with gr.TabItem("🏆 MMIE Leaderboard"): | |
with gr.Row(): | |
search_overall = gr.Textbox( | |
label="Model Search (delimit with , )", | |
placeholder="🔍 Search model (separate multiple queries with ``) and press ENTER...", | |
show_label=False | |
) | |
model_type_overall = gr.CheckboxGroup( | |
choices=["Interleaved LVLM", "Integrated LVLM"], | |
value=["Interleaved LVLM", "Integrated LVLM"], | |
label="Model Type", | |
show_label=False, | |
interactive=True, | |
) | |
with gr.Row(): | |
mmie_table_overall_hidden = gr.Dataframe( | |
df, | |
headers=df.columns.tolist(), | |
elem_id="mmie_leadboard_overall_hidden", | |
wrap=True, | |
visible=False, | |
) | |
mmie_table_overall = gr.Dataframe( | |
regex_table( | |
df.copy(), | |
"", | |
["Interleaved LVLM", "Integrated LVLM"] | |
), | |
headers=df.columns.tolist(), | |
elem_id="mmie_leadboard_overall", | |
wrap=True, | |
) | |
with gr.TabItem("About"): | |
with gr.Row(): | |
gr.Markdown(ABOUT_TEXT) | |
with gr.Accordion("📚 Citation", open=False): | |
citation_button = gr.Textbox( | |
value=CITATION_BUTTON_TEXT, | |
lines=7, | |
label="Copy the following to cite these results.", | |
elem_id="citation-button", | |
show_copy_button=True, | |
) | |
    search_overall.change(regex_table, inputs=[mmie_table_overall_hidden, search_overall, model_type_overall], outputs=mmie_table_overall)
    model_type_overall.change(regex_table, inputs=[mmie_table_overall_hidden, search_overall, model_type_overall], outputs=mmie_table_overall)
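    # Both events re-filter from mmie_table_overall_hidden (the full,
    # unfiltered table), so the search text and model-type checkboxes compose
    # instead of each filtering an already-filtered view.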
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=18000)  # restart the Space every 5 hours (18000 s)
scheduler.start()
# app.queue(default_concurrency_limit=40).launch()
app.launch(allowed_paths=['./', "./src", "./evals"])