Clémentine commited on
Commit
4b2522c
·
1 Parent(s): 509661e

need to add the selectors now

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.pyc
2
+ .vscode
README.md CHANGED
@@ -9,4 +9,50 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  pinned: false
10
  ---
11
 
12
+ If you want your leaderboard to appear, feel free to add relevant information in its metadata, and it will be displayed here.
13
+
14
+ # Categories
15
+
16
+ ## Submission type
17
+ This category does not apply to arenas.
18
+
19
+ - `submission:automatic`: users can submit their models as such to the leaderboard, and evaluation is run automatically without human intervention
20
+ - `submission:semiautomatic`: the leaderboard requires the model owner to run evaluations on their side and submit the results
21
+ - `submission:manual`: the leaderboard requires the leaderboard owner to run evaluations for new submissions
22
+ - `submission:closed`: the leaderboard does not accept submissions at the moment
23
+
24
+ ## Test set status
25
+ This category does not apply to arenas.
26
+
27
+ - `test:public`: all the test sets used are public, the evaluations are completely reproducible
28
+ - `test:mix`: some test sets are public and some private
29
+ - `test:private`: all the test sets used are private, the evaluations are hard to game
30
+ - `test:rolling`: the test sets used change regularly through time and evaluation scores are refreshed
31
+
32
+ ## Judges
33
+ - `judge:auto`: evaluations are run automatically, using an evaluation suite such as `lm_eval` or `lighteval`
34
+ - `judge:model`: evaluations are run using a model-as-a-judge approach to rate answers
35
+ - `judge:humans`: evaluations are done by humans who rate answers - this is an arena
36
+ - `judge:vibe_check`: evaluations are done manually by one human
37
+
38
+ ## Modalities
39
+ Can be any (or several) of the following list:
40
+ - `modality:text`
41
+ - `modality:image`
42
+ - `modality:video`
43
+ - `modality:audio`
44
+ A bit outside of usual modalities
45
+ - `modality:tools`: requires added tool usage - mostly for assistant models
46
+ - `modality:artefacts`: the leaderboard evaluates machine learning artefacts themselves, for example the quality of text embeddings.
47
+
48
+ ## Evaluation categories
49
+ Can be any (or several) of the following list:
50
+ - `eval:generation`: the evaluation looks at generation capabilities specifically (can be image generation, text generation, ...)
51
+ - `eval:math`
52
+ - `eval:code`
53
+ - `eval:performance`: model performance (speed, energy consumption, ...)
54
+ - `eval:safety`: safety, toxicity, bias evaluations
55
+
56
+ ## Language
57
+ You can indicate the languages covered by your benchmark like so: `language:mylanguage`.
58
+ At the moment, we do not support language codes; please use the language name in English.
app.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from apscheduler.schedulers.background import BackgroundScheduler
3
+ from src.static.env import API, REPO_ID, HF_TOKEN
4
+ from src.static.about import TITLE, INTRO, ABOUT
5
+
6
+ from src.leaderboards.get_from_hub import get_leaderboard_info
7
+
8
+
9
def restart_space():
    """Ask the Hub to restart the Space `REPO_ID`, authenticating with `HF_TOKEN`."""
    API.restart_space(token=HF_TOKEN, repo_id=REPO_ID)
11
+
12
# Tag information for every known leaderboard, computed once at start-up.
leaderboards_to_info, info_to_leaderboards = get_leaderboard_info()


# Page layout: a title, a short introduction, then the two tabs.
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("Search"):
            gr.Markdown("Let's look for leaderboards relevant for you! Select the categories of your choice")

        with gr.TabItem("About"):
            gr.Markdown(ABOUT, elem_classes="markdown-text")

# Restart the Space every 3 hours (3 * 60 * 60 seconds); the leaderboard
# information above is only computed when the app starts.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", seconds=3 * 60 * 60)
scheduler.start()

demo.queue(default_concurrency_limit=40).launch()
leaderboards_metadata.py DELETED
@@ -1,107 +0,0 @@
1
- from enum import Enum, auto
2
- #from dataclasses import dataclass
3
-
4
- SubmissionType = Enum(
5
- "SubmissionType",
6
- [
7
- "Automatic",
8
- "SemiAutomatic",
9
- "Manual",
10
- "Closed",
11
- "Arena"
12
- ]
13
- )
14
-
15
- Evaluators = Enum(
16
- "Evaluators",
17
- [
18
- "Humans", # Arena
19
- "Automatic",
20
- "Model"
21
- ]
22
- )
23
-
24
- TestSet = Enum(
25
- "TestSet",
26
- [
27
- "Private",
28
- "Public",
29
- "Mix",
30
- "Rolling",
31
- "N/A"
32
- ]
33
- )
34
-
35
- Categories = Enum(
36
- "Categories",
37
- [
38
- "Text",
39
- "Image",
40
- "Audio",
41
- "Video",
42
- "Multimodal",
43
- "Generation",
44
- "Math",
45
- "Code",
46
- "LanguageSpecific",
47
- "Performance",
48
- "Safety",
49
- "VibeCheck",
50
- "Tools",
51
- "Artefacts"
52
- ]
53
- )
54
-
55
- Languages = Enum(
56
- "Languages",
57
- [
58
- "Chinese",
59
- "Korean",
60
- "Dutch",
61
- "Portuguese",
62
- "Italian",
63
- "Malay",
64
- "Polish",
65
- "Turkish"
66
-
67
- ]
68
- )
69
-
70
- leaderboard_to_tags = {
71
- "HuggingFaceH4/open_llm_leaderboard": [SubmissionType.Automatic, Evaluators.Automatic, TestSet.Public, Categories.Text, Categories.Math],
72
- "bigcode/bigcode-models-leaderboard": [SubmissionType.SemiAutomatic, Evaluators.Automatic, TestSet.Public, Categories.Code],
73
- "optimum/llm-perf-leaderboard": [SubmissionType.Manual, Evaluators.Automatic, Categories.Performance],
74
- "lmsys/chatbot-arena-leaderboard": [SubmissionType.Arena, Evaluators.Humans, Categories.Text, Categories.Generation],
75
- "llmonitor/benchmarks": [SubmissionType.Manual, Evaluators.Humans, Categories.Text, Categories.VibeCheck],
76
- "mteb/leaderboard": [SubmissionType.SemiAutomatic, Categories.Text, "Embeddings", Categories.Artefacts],
77
- "gaia-benchmark/leaderboard": [SubmissionType.Automatic, TestSet.Private, Evaluators.Automatic, Categories.Text, Categories.Tools, Categories.Multimodal],
78
- "opencompass/opencompass-llm-leaderboard": [SubmissionType.Manual, Categories.Text, Categories.LanguageSpecific, Languages.Chinese],
79
- "upstage/open-ko-llm-leaderboard": [SubmissionType.Automatic, Evaluators.Automatic, TestSet.Mix, Categories.Text, Languages.Korean],
80
- "BramVanroy/open_dutch_llm_leaderboard": [SubmissionType.Manual, Evaluators.Automatic, Categories.Text, Languages.Dutch],
81
- "vectara/leaderboard": [SubmissionType.SemiAutomatic, Evaluators.Model, Categories.Text, "Hallucinations"],
82
- "facebook/CyberSecEval": [SubmissionType.Closed, Categories.Code, Categories.Safety],
83
- "mlabonne/Yet_Another_LLM_Leaderboard": [SubmissionType.Manual, Categories.Text, Evaluators.Automatic],
84
- "AI-Secure/llm-trustworthy-leaderboard": [SubmissionType.Automatic, Categories.Safety, Categories.Text],
85
- "AILab-CVC/EvalCrafter": [SubmissionType.Closed, Categories.Video, Categories.Generation],
86
- "mike-ravkine/can-ai-code-results": [SubmissionType.Closed, Categories.Code],
87
- "echo840/ocrbench-leaderboard": [SubmissionType.Closed, Categories.Image, "OCR"],
88
- "NPHardEval/NPHardEval-leaderboard": [SubmissionType.Closed, Categories.Text, Categories.Math, TestSet.Rolling],
89
- "HaizeLabs/red-teaming-resistance-benchmark": [SubmissionType.Manual, Categories.Safety, Categories.Text],
90
- "devingulliver/subquadratic-llm-leaderboard": [SubmissionType.SemiAutomatic, Categories.Text, Categories.Math],
91
- "WildVision/vision-arena": [SubmissionType.Arena, Categories.Image, Categories.Multimodal],
92
- "Vchitect/VBench_Leaderboard": [SubmissionType.SemiAutomatic, Categories.Video, Categories.Generation],
93
- "eduagarcia/open_pt_llm_leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Portuguese],
94
- "FinancialSupport/open_ita_llm_leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Italian],
95
- "mesolitica/malay-llm-leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Malay],
96
- "TIGER-Lab/GenAI-Arena": [Categories.Image, Categories.Generation, Evaluators.Humans, SubmissionType.Arena],
97
- "q-future/Q-Bench-Leaderboard": [Categories.Image, Evaluators.Automatic, SubmissionType.Closed],
98
- "OpenGenAI/parti-prompts-leaderboard": [Categories.Image, Categories.Generation, SubmissionType.Arena, Evaluators.Humans],
99
- "speakleash/open_pl_llm_leaderboard": [Categories.LanguageSpecific, Categories.Text, Languages.Polish],
100
- "malhajar/OpenLLMTurkishLeaderboard": [Categories.LanguageSpecific, Categories.Text, Languages.Turkish],
101
- "allenai/WildBench": [Evaluators.Humans, SubmissionType.Arena, Evaluators.Model, Categories.Text, Categories.Generation],
102
- "hf-audio/open_asr_leaderboard": [Evaluators.Automatic, Categories.Audio],
103
- "opencompass/open_vlm_leaderboard": [Evaluators.Automatic, Categories.Generation, Categories.Image],
104
- "livecodebench/benchmarks": [Evaluators.Automatic, Categories.Code],
105
- "allenai/reward-bench": [Evaluators.Automatic, Categories.Artefacts, "Models", Categories.Text],
106
- "TTS-AGI/TTS-Arena": [Evaluators.Humans, Categories.Audio]
107
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ huggingface_hub
src/leaderboards/get_from_hub.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+
3
+ from src.leaderboards.saved import leaderboard_to_tags
4
+ from src.static.env import API
5
+
6
def group_all_tags(input_tags: list[str]) -> dict:
    """Groups the tags by categories, following the division in the README.

    Tags are expected in the ``category:value`` form (e.g. ``modality:text``).
    The bare ``arena`` tag is treated as an alias of ``judge:humans``. Any
    other tag that does not split into exactly two parts on ``:`` is ignored.
    A value is listed at most once per category.

    Args:
        input_tags (list[str]): list of tags

    Returns:
        dict: category to tag list (a ``defaultdict(list)``)
    """
    output_tags = defaultdict(list)
    for tag in input_tags:
        if tag == "arena":
            # An arena is judged by humans. The value must be stored under
            # its category: the mapping itself has no `append` method.
            category, value = "judge", "humans"
        else:
            try:
                category, value = tag.split(":")
            except ValueError:
                # Not a `category:value` tag (no colon, or more than one).
                continue

        values = output_tags[category]
        if value not in values:
            values.append(value)

    return output_tags
28
+
29
+
30
def get_leaderboard_info() -> tuple[list, dict]:
    """Looks up all spaces tagged as leaderboards or arenas on the hub,
    and homogenizes their tags.

    The hub results are completed with the default leaderboards of
    `leaderboard_to_tags`. Each leaderboard is processed once, in the order
    hub leaderboards, hub arenas, saved defaults. When a leaderboard has
    saved default tags, they are merged with its hub tags.

    Returns:
        tuple[list, dict]:
            - a list with one mapping per leaderboard: category to list of
              values, plus a "name" key holding the leaderboard name
            - a nested mapping: category to tag value to the list of
              leaderboard names carrying that tag
    """
    leaderboards = [(s.id, s.tags) for s in API.list_spaces(filter=["leaderboard"])]
    arenas = [(s.id, s.tags) for s in API.list_spaces(filter=["arena"])]
    saved_leaderboards = list(leaderboard_to_tags.items())

    seen_leaderboards = set()
    leaderboard_df = []
    info_to_leaderboard = defaultdict(lambda: defaultdict(list))
    for name, source_tags in leaderboards + arenas + saved_leaderboards:
        if name in seen_leaderboards:
            continue
        seen_leaderboards.add(name)

        # Build a new, duplicate-free list instead of extending in place:
        # for a saved entry `source_tags` IS the list stored in
        # `leaderboard_to_tags`, so `+=` would double the module-level data.
        # `or []` guards against an entry that carries no tag list.
        saved_tags = leaderboard_to_tags.get(name, [])
        all_tags = list(dict.fromkeys([*(source_tags or []), *saved_tags]))

        grouped_tags = group_all_tags(all_tags)

        # Index the leaderboard before the "name" key is added below,
        # otherwise the name string would be indexed character by character.
        for category, values in grouped_tags.items():
            for value in values:
                info_to_leaderboard[category][value].append(name)

        current_info = grouped_tags
        current_info["name"] = name
        leaderboard_df.append(current_info)
    return leaderboard_df, info_to_leaderboard
src/leaderboards/saved.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Default leaderboards with which we initialize the space.
3
+ """
4
+
5
# Space id -> default tags, using the `category:value` scheme described in the
# README. Tags without a category prefix ("Embeddings", "OCR", ...) are kept
# as free-form labels; `group_all_tags` ignores them.
leaderboard_to_tags = {
    "HuggingFaceH4/open_llm_leaderboard": ["submission:automatic", "judge:auto", "test:public", "modality:text", "eval:math"],
    "bigcode/bigcode-models-leaderboard": ["submission:semiautomatic", "judge:auto", "test:public", "eval:code"],
    "optimum/llm-perf-leaderboard": ["submission:manual", "judge:auto", "eval:performance"],
    "lmsys/chatbot-arena-leaderboard": ["judge:humans", "modality:text", "eval:generation"],
    "llmonitor/benchmarks": ["submission:manual", "judge:humans", "modality:text", "judge:vibe_check"],
    "mteb/leaderboard": ["submission:semiautomatic", "modality:text", "Embeddings", "modality:artefacts"],
    "gaia-benchmark/leaderboard": ["submission:automatic", "test:private", "judge:auto", "modality:text", "modality:tools", "modality:image", "modality:video"],
    "opencompass/opencompass-llm-leaderboard": ["submission:manual", "modality:text", "language:chinese"],
    "upstage/open-ko-llm-leaderboard": ["submission:automatic", "judge:auto", "test:mix", "modality:text", "language:korean"],
    "BramVanroy/open_dutch_llm_leaderboard": ["submission:manual", "judge:auto", "modality:text", "language:dutch"],
    "vectara/leaderboard": ["submission:semiautomatic", "judge:model", "modality:text", "Hallucinations"],
    "facebook/CyberSecEval": ["submission:closed", "eval:code", "eval:safety"],
    "mlabonne/Yet_Another_LLM_Leaderboard": ["submission:manual", "modality:text", "judge:auto"],
    "AI-Secure/llm-trustworthy-leaderboard": ["submission:automatic", "eval:safety", "modality:text"],
    "AILab-CVC/EvalCrafter": ["submission:closed", "modality:video", "eval:generation"],
    "mike-ravkine/can-ai-code-results": ["submission:closed", "eval:code"],
    "echo840/ocrbench-leaderboard": ["submission:closed", "modality:image", "OCR"],
    "NPHardEval/NPHardEval-leaderboard": ["submission:closed", "modality:text", "eval:math", "test:rolling"],
    "HaizeLabs/red-teaming-resistance-benchmark": ["submission:manual", "eval:safety", "modality:text"],
    "devingulliver/subquadratic-llm-leaderboard": ["submission:semiautomatic", "modality:text", "eval:math"],
    "WildVision/vision-arena": ["modality:image", "modality:text", "judge:humans"],
    "Vchitect/VBench_Leaderboard": ["submission:semiautomatic", "modality:video", "eval:generation"],
    "eduagarcia/open_pt_llm_leaderboard": ["modality:text", "language:portuguese"],
    "FinancialSupport/open_ita_llm_leaderboard": ["modality:text", "language:italian"],
    "mesolitica/malay-llm-leaderboard": ["modality:text", "language:malay"],
    "TIGER-Lab/GenAI-Arena": ["modality:image", "eval:generation", "judge:humans"],
    "q-future/Q-Bench-Leaderboard": ["modality:image", "judge:auto", "submission:closed"],
    "OpenGenAI/parti-prompts-leaderboard": ["modality:image", "eval:generation", "judge:humans"],
    "speakleash/open_pl_llm_leaderboard": ["modality:text", "language:polish"],
    "malhajar/OpenLLMTurkishLeaderboard": ["modality:text", "language:turkish"],
    "allenai/WildBench": ["judge:humans", "judge:model", "modality:text", "eval:generation"],
    "hf-audio/open_asr_leaderboard": ["judge:auto", "modality:audio"],
    "opencompass/open_vlm_leaderboard": ["judge:auto", "eval:generation", "modality:image"],
    "livecodebench/benchmarks": ["judge:auto", "eval:code"],
    "allenai/reward-bench": ["judge:auto", "modality:artefacts", "Models", "modality:text"],
    "TTS-AGI/TTS-Arena": ["judge:humans", "modality:audio"],
}
src/static/about.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Rendered through `gr.HTML` in app.py, so the heading is written as HTML:
# a Markdown `#` prefix would be displayed literally.
TITLE = "<h1>Leaderboard explorer</h1>"
2
+
3
+ INTRO = """
4
+ Have you ever wondered which leaderboard would be best for your use case?
5
+ """
6
+
7
+ ABOUT = """
8
+ If you want your leaderboard to appear in our suggestions, feel free to add relevant information in its tag metadata, and it will be displayed here.
9
+
10
+ # First step
11
+
12
+ Make sure to either use the tag `leaderboard` or `arena` to your space, by adding the following to your README
13
+
14
+ ```
15
+ tags:
16
+ - leaderboard
17
+ ```
18
+
19
+ # Extra tags
20
+
21
+ ## Submission type
22
+ Arenas are not concerned by this category.
23
+
24
+ - `submission:automatic`: users can submit their models as such to the leaderboard, and evaluation is run automatically without human intervention
25
+ - `submission:semiautomatic`: the leaderboard requires the model owner to run evaluations on his side and submit the results
26
+ - `submission:manual`: the leaderboard requires the leaderboard owner to run evaluations for new submissions
27
+ - `submission:closed`: the leaderboard does not accept submissions at the moment
28
+
29
+ ## Test set status
30
+ Arenas are not concerned by this category.
31
+
32
+ - `test:public`: all the test sets used are public, the evaluations are completely reproducible
33
+ - `test:mix`: some test sets are public and some private
34
+ - `test:private`: all the test sets used are private, the evaluations are hard to game
35
+ - `test:rolling`: the test sets used change regularly through time and evaluation scores are refreshed
36
+
37
+ ## Judges
38
+ - `judge:auto`: evaluations are run automatically, using an evaluation suite such as `lm_eval` or `lighteval`
39
+ - `judge:model`: evaluations are run using a model as a judge approach to rate answer
40
+ - `judge:humans`: evaluations are done by humans to rate answer - this is an arena
41
+ - `judge:vibe_check`: evaluations are done manually by one human
42
+
43
+ ## Modalities
44
+ Can be any (or several) of the following list:
45
+ - `modality:text`
46
+ - `modality:image`
47
+ - `modality:video`
48
+ - `modality:audio`
49
+ A bit outside of usual modalities
50
+ - `modality:tools`: requires added tool usage - mostly for assistant models
51
+ - `modality:artefacts`: the leaderboard concerns itself with machine learning artefacts as themselves, for example, quality evaluation of text embeddings.
52
+
53
+ ## Evaluation categories
54
+ Can be any (or several) of the following list:
55
+ - `eval:generation`: the evaluation looks at generation capabilities specifically (can be image generation, text generation, ...)
56
+ - `eval:math`
57
+ - `eval:code`
58
+ - `eval:performance`: model performance (speed, energy consumption, ...)
59
+ - `eval:safety`: safety, toxicity, bias evaluations
60
+
61
+ ## Language
62
+ You can indicate the languages covered by your benchmark like so: `language:mylanguage`.
63
+ At the moment, we do not support language codes, please use the language name in English.
64
+ """
src/static/env.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import HfApi
3
+
4
# Id of the Space that `restart_space` in app.py restarts.
REPO_ID = "clefourrier/LeaderboardFinder"
# Access token read from the environment; None when the variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")

# The token is passed by keyword: the first positional parameter of `HfApi`
# is `endpoint`, so `HfApi(HF_TOKEN)` would use the token as the Hub URL.
API = HfApi(token=HF_TOKEN)