Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Clémentine
committed on
Commit
·
509661e
1
Parent(s):
270109b
hardcoded metadata
Browse files- leaderboards_metadata.py +107 -0
leaderboards_metadata.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from enum import Enum, auto
|
2 |
+
#from dataclasses import dataclass
|
3 |
+
|
# How entries get onto a leaderboard: fully automated pipeline, maintainer-assisted
# (semi-automatic), manually curated, closed to new submissions, or head-to-head arena.
# Functional Enum API with a whitespace-separated names string: members are
# auto-numbered starting at 1, exactly like the explicit-list form.
SubmissionType = Enum("SubmissionType", "Automatic SemiAutomatic Manual Closed Arena")
# Who (or what) produces the scores: human raters (arena-style voting),
# an automatic harness, or a judge model.
Evaluators = Enum("Evaluators", "Humans Automatic Model")
# Visibility of the evaluation data. "N/A" is not a valid Python identifier,
# so the sequence form of the functional API is required here and that member
# is only reachable by subscript: TestSet["N/A"].
TestSet = Enum("TestSet", ("Private", "Public", "Mix", "Rolling", "N/A"))
# Broad task/domain tags a leaderboard can carry (modality, task type,
# language-specificity, performance, safety, etc.).
Categories = Enum(
    "Categories",
    "Text Image Audio Video Multimodal Generation Math Code "
    "LanguageSpecific Performance Safety VibeCheck Tools Artefacts",
)
# Natural languages used by the language-specific leaderboards.
Languages = Enum(
    "Languages",
    "Chinese Korean Dutch Portuguese Italian Malay Polish Turkish",
)
# Hardcoded metadata: maps a Hugging Face Space id ("org/space") to the list of
# tags describing that leaderboard. Tags are members of the SubmissionType /
# Evaluators / TestSet / Categories / Languages enums defined above.
# NOTE(review): a few entries mix in plain strings ("Embeddings",
# "Hallucinations", "OCR", "Models") for one-off tags with no enum member —
# any consumer presumably must handle both enum members and raw strings; confirm
# with the code that reads this mapping.
leaderboard_to_tags = {
    "HuggingFaceH4/open_llm_leaderboard": [SubmissionType.Automatic, Evaluators.Automatic, TestSet.Public, Categories.Text, Categories.Math],
    "bigcode/bigcode-models-leaderboard": [SubmissionType.SemiAutomatic, Evaluators.Automatic, TestSet.Public, Categories.Code],
    "optimum/llm-perf-leaderboard": [SubmissionType.Manual, Evaluators.Automatic, Categories.Performance],
    "lmsys/chatbot-arena-leaderboard": [SubmissionType.Arena, Evaluators.Humans, Categories.Text, Categories.Generation],
    "llmonitor/benchmarks": [SubmissionType.Manual, Evaluators.Humans, Categories.Text, Categories.VibeCheck],
    "mteb/leaderboard": [SubmissionType.SemiAutomatic, Categories.Text, "Embeddings", Categories.Artefacts],
    "gaia-benchmark/leaderboard": [SubmissionType.Automatic, TestSet.Private, Evaluators.Automatic, Categories.Text, Categories.Tools, Categories.Multimodal],
    "opencompass/opencompass-llm-leaderboard": [SubmissionType.Manual, Categories.Text, Categories.LanguageSpecific, Languages.Chinese],
    "upstage/open-ko-llm-leaderboard": [SubmissionType.Automatic, Evaluators.Automatic, TestSet.Mix, Categories.Text, Languages.Korean],
    "BramVanroy/open_dutch_llm_leaderboard": [SubmissionType.Manual, Evaluators.Automatic, Categories.Text, Languages.Dutch],
    "vectara/leaderboard": [SubmissionType.SemiAutomatic, Evaluators.Model, Categories.Text, "Hallucinations"],
    "facebook/CyberSecEval": [SubmissionType.Closed, Categories.Code, Categories.Safety],
    "mlabonne/Yet_Another_LLM_Leaderboard": [SubmissionType.Manual, Categories.Text, Evaluators.Automatic],
    "AI-Secure/llm-trustworthy-leaderboard": [SubmissionType.Automatic, Categories.Safety, Categories.Text],
    "AILab-CVC/EvalCrafter": [SubmissionType.Closed, Categories.Video, Categories.Generation],
    "mike-ravkine/can-ai-code-results": [SubmissionType.Closed, Categories.Code],
    "echo840/ocrbench-leaderboard": [SubmissionType.Closed, Categories.Image, "OCR"],
    "NPHardEval/NPHardEval-leaderboard": [SubmissionType.Closed, Categories.Text, Categories.Math, TestSet.Rolling],
    "HaizeLabs/red-teaming-resistance-benchmark": [SubmissionType.Manual, Categories.Safety, Categories.Text],
    "devingulliver/subquadratic-llm-leaderboard": [SubmissionType.SemiAutomatic, Categories.Text, Categories.Math],
    "WildVision/vision-arena": [SubmissionType.Arena, Categories.Image, Categories.Multimodal],
    "Vchitect/VBench_Leaderboard": [SubmissionType.SemiAutomatic, Categories.Video, Categories.Generation],
    "eduagarcia/open_pt_llm_leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Portuguese],
    "FinancialSupport/open_ita_llm_leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Italian],
    "mesolitica/malay-llm-leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Malay],
    "TIGER-Lab/GenAI-Arena": [Categories.Image, Categories.Generation, Evaluators.Humans, SubmissionType.Arena],
    "q-future/Q-Bench-Leaderboard": [Categories.Image, Evaluators.Automatic, SubmissionType.Closed],
    "OpenGenAI/parti-prompts-leaderboard": [Categories.Image, Categories.Generation, SubmissionType.Arena, Evaluators.Humans],
    "speakleash/open_pl_llm_leaderboard": [Categories.LanguageSpecific, Categories.Text, Languages.Polish],
    "malhajar/OpenLLMTurkishLeaderboard": [Categories.LanguageSpecific, Categories.Text, Languages.Turkish],
    "allenai/WildBench": [Evaluators.Humans, SubmissionType.Arena, Evaluators.Model, Categories.Text, Categories.Generation],
    "hf-audio/open_asr_leaderboard": [Evaluators.Automatic, Categories.Audio],
    "opencompass/open_vlm_leaderboard": [Evaluators.Automatic, Categories.Generation, Categories.Image],
    "livecodebench/benchmarks": [Evaluators.Automatic, Categories.Code],
    "allenai/reward-bench": [Evaluators.Automatic, Categories.Artefacts, "Models", Categories.Text],
    "TTS-AGI/TTS-Arena": [Evaluators.Humans, Categories.Audio]
}