davidadamczyk committed: Fix git conflict

Changed files:
- app.py (+11 -3)
- src/display/about.py (+46 -5)
- src/display/utils.py (+52 -22)
- src/populate.py (+3 -2)
app.py

@@ -306,9 +306,17 @@ with demo:
     with gr.Row():
         with gr.Column():
             model_name_textbox = gr.Textbox(label="Model name")
-            precision = gr.Radio(["bfloat16", "float16", "4bit"], label="Precision", info="What precision are you using for inference?")
-
-
+            #precision = gr.Radio(["bfloat16", "float16", "4bit"], label="Precision", info="What precision are you using for inference?")
+            precision = gr.Dropdown(
+                choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                label="Precision",
+                multiselect=False,
+                value="other",
+                interactive=True,
+                info="What weight precision were you using during the evaluation?"
+            )
+            hf_model_id = gr.Textbox(label="Model link (Optional)", info="URL to the model's Hugging Face repository, or its official website")
+            contact_email = gr.Textbox(label="Your E-Mail")
             file_output = gr.File()
             upload_button = gr.UploadButton("Upload json", file_types=['.json'])
             upload_button.upload(validate_upload, upload_button, file_output)
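For readers who want to poke at the new submission widgets in isolation, here is a minimal Gradio sketch of the form this hunk builds. It is an illustration only: the literal `choices` list stands in for the comprehension over the `Precision` enum (extended in the src/display/utils.py hunk below, and only partially visible there), and `validate_upload` is reduced to a placeholder.

```python
# Minimal sketch of the new submission form (assumes gradio is installed).
import gradio as gr


def validate_upload(file):
    # Placeholder for the Space's real validation logic.
    return file


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            model_name_textbox = gr.Textbox(label="Model name")
            precision = gr.Dropdown(
                # Stand-in for: [i.value.name for i in Precision if i != Precision.Unknown]
                choices=["other", "float64", "float32", "float16", "bfloat16", "8bit", "4bit", "GPTQ"],
                label="Precision",
                multiselect=False,
                value="other",
                interactive=True,
                info="What weight precision were you using during the evaluation?",
            )
            hf_model_id = gr.Textbox(label="Model link (Optional)")
            contact_email = gr.Textbox(label="Your E-Mail")
            file_output = gr.File()
            upload_button = gr.UploadButton("Upload json", file_types=[".json"])
            upload_button.upload(validate_upload, upload_button, file_output)

if __name__ == "__main__":
    demo.launch()
```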
src/display/about.py

@@ -31,6 +31,8 @@ class Tasks(Enum):
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">🇨🇿 CzechBench Leaderboard</h1>"""
 
+TABLE_DESC = "The values presented in the table represent the accuracy metric."
+
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 Czech-Bench is a collection of LLM benchmarks available for the Czech language. It currently consists of 15 Czech benchmarks, including new machine translations of the popular ARC, GSM8K, MMLU, and TruthfulQA datasets.
@@ -38,20 +40,55 @@ Czech-Bench is a collection of LLM benchmarks available for the Czech language.
 Czech-Bench is developed by <a href="https://huggingface.co/CIIRC-NLP">CIIRC-NLP</a>.
 """
 
-TABLE_DESC = "The values presented in the table represent the accuracy metric."
-
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## Basic Information
-The goal of
+The goal of the CzechBench project is to provide a comprehensive and practical benchmark for evaluating Czech language models.
+Our [evaluation suite](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench#readme)
+currently consists of 15 individual tasks, leveraging pre-existing Czech datasets together with new machine translations of popular LLM benchmarks,
+including ARC, GSM8K, MMLU, and TruthfulQA.
 
 Key Features and Benefits:
 - **Tailored for the Czech Language:** The benchmark includes both original Czech datasets and adapted versions of international datasets, ensuring relevant evaluation of model performance in the Czech context.
 - **Wide Range of Tasks:** It contains 15 different tasks that cover various aspects of language understanding and text generation, enabling a comprehensive assessment of the model's capabilities.
+- **Universal model support:** The universal text-to-text evaluation approach adopted in CzechBench allows for direct comparison of models with varying levels of internal access, including commercial APIs.
 - **Ease of Use:** The benchmark is designed to be easily integrated into your development process, saving time and resources during model testing and improvement.
 - **Up-to-date and Relevant:** We regularly update our datasets to reflect the latest findings and trends in language model development.
+By using CzechBench, you will gain deep insights into the strengths and weaknesses of your models, allowing you to better focus on key areas for optimization.
+This will not only improve the performance of your models but also enhance their real-world deployment in various Czech contexts.
+
+Below, you can find the up-to-date leaderboard of models evaluated on CzechBench.
+For more information on the included benchmarks and instructions on evaluating your own models, please visit the "About" section below.
+
+"""
+# Czech-Bench is developed by <a href="https://huggingface.co/CIIRC-NLP">CIIRC-NLP</a>.
+
+# Which evaluations are you running? how can people reproduce what you have?
+LLM_BENCHMARKS_TEXT = f"""
+## Basic Information
 
-
+The CzechBench evaluation suite is hosted on [GitHub](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench#readme).
+It is implemented on top of the popular [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) framework, which provides extensive model compatibility and optimal evaluation efficiency.
+
+All currently supported benchmarks are listed in the table below:
+
+| Dataset | Language | Task type | Metrics | Samples | Task ID |
+| ------------------------------------------------------------ | ----------------------------- | -------------------------- | -------------- | ------: | --------------- |
+| [AGREE](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/agree_cs) | CS (Original) | Subject-verb agreement | Acc | 627 | agree_cs |
+| [ANLI](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/anli_cs) | CS (Translated) | Natural Language Inference | Acc, Macro F1 | 1200 | anli_cs |
+| [ARC Challenge](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/arc_cs) | CS (Translated) | Knowledge-Based QA | Acc | 1172 | arc_cs |
+| [ARC Easy](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/arc_cs) | CS (Translated) | Knowledge-Based QA | Acc | 2376 | arc_cs |
+| [Belebele](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/belebele_cs) | CS (Professional translation) | Reading Comprehension / QA | Acc | 895 | belebele_cs |
+| [CTKFacts](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/ctkfacts_cs) | CS (Original) | Natural Language Inference | Acc, Macro F1 | 558 | ctkfacts_cs |
+| [Czech News](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/czechnews_cs) | CS (Original) | News Topic Classification | Acc, Macro F1 | 1000 | czechnews_cs |
+| [Facebook Comments](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/fb_comments_cs) | CS (Original) | Sentiment Analysis | Acc, Macro F1 | 1000 | fb_comments_cs |
+| [GSM8K](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/gsm8k_cs) | CS (Translated) | Mathematical inference | EM Acc | 1319 | gsm8k_cs |
+| [Klokánek](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/klokanek_cs) | CS (Original) | Math/Logical Inference | Acc | 808 | klokanek_cs |
+| [Mall Reviews](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/mall_reviews_cs) | CS (Original) | Sentiment Analysis | Acc, Macro F1 | 3000 | mall_reviews_cs |
+| [MMLU](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/mmlu_cs) | CS (Translated) | Knowledge-Based QA | Acc | 12408 | mmlu_cs |
+| [SQAD](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/sqad_cs) | CS (Original) | Reading Comprehension / QA | EM Acc, BoW F1 | 843 | sqad_cs |
+| [Subjectivity](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/subjectivity_cs) | CS (Original) | Subjectivity Analysis | Acc, Macro F1 | 2000 | subjectivity_cs |
+| [TruthfulQA](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/truthfulqa_cs) | CS (Translated) | Knowledge-Based QA | Acc | 813 | truthfulqa_cs |
 
 ## Evaluation Process
 
@@ -79,10 +116,14 @@ lm_eval --model hf \\
 --output_path $OUTPUT_PATH \\
 --apply_chat_template \\
 ```
+
+For advanced usage instructions, please inspect the [CzechBench README on GitHub](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench#readme)
+or the official [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) documentation.
 
 
 ### 3. Upload results to Leaderboard
-
+Inside the `$OUTPUT_PATH` directory, you can find the file `results.json`.
+To submit your evaluation results to our leaderboard, please visit the "Submit here!" section above and upload your `results.json` file.
 
 """
 
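To complement the new submission instructions in this hunk, here is a small, hypothetical Python sketch for eyeballing the `results.json` file before uploading it. The top-level "results" dict keyed by task ID is the usual LM Evaluation Harness layout, but exact keys vary between harness versions, so treat this as an assumption rather than a guaranteed schema.

```python
import json
from pathlib import Path

# $OUTPUT_PATH from the lm_eval command above; adjust to your own path.
results_file = Path("output") / "results.json"

with results_file.open(encoding="utf-8") as f:
    report = json.load(f)

# Assumed harness layout: a "results" dict keyed by task ID
# (e.g. "agree_cs", "mmlu_cs"), each holding metric-name -> value pairs.
for task_id, metrics in report.get("results", {}).items():
    numeric = {name: value for name, value in metrics.items() if isinstance(value, (int, float))}
    print(task_id, numeric)
```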
src/display/utils.py

@@ -47,30 +47,53 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
 """
 
-
+
+auto_eval_column_dict.append(["eval_name", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
-auto_eval_column_dict.append(["hf_model_id", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["agree_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["anli_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["arc_challenge_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["arc_easy_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["belebele_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["ctkfacts_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["czechnews_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["fb_comments_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["gsm8k_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["klokanek_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["mall_reviews_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["mmlu_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["sqad_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["subjectivity_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["truthfulqa_cs", ColumnContent, ColumnContent("
+auto_eval_column_dict.append(["hf_model_id", ColumnContent, ColumnContent("Model link (temporary)", "str", True)])
+auto_eval_column_dict.append(["agree_cs", ColumnContent, ColumnContent("AGREE", "number", True)])
+auto_eval_column_dict.append(["anli_cs", ColumnContent, ColumnContent("ANLI", "number", True)])
+auto_eval_column_dict.append(["arc_challenge_cs", ColumnContent, ColumnContent("ARC-Challenge", "number", True)])
+auto_eval_column_dict.append(["arc_easy_cs", ColumnContent, ColumnContent("ARC-Easy", "number", True)])
+auto_eval_column_dict.append(["belebele_cs", ColumnContent, ColumnContent("Belebele", "number", True)])
+auto_eval_column_dict.append(["ctkfacts_cs", ColumnContent, ColumnContent("CTKFacts", "number", True)])
+auto_eval_column_dict.append(["czechnews_cs", ColumnContent, ColumnContent("Czech News", "number", True)])
+auto_eval_column_dict.append(["fb_comments_cs", ColumnContent, ColumnContent("Facebook Comments", "number", True)])
+auto_eval_column_dict.append(["gsm8k_cs", ColumnContent, ColumnContent("GSM8K", "number", True)])
+auto_eval_column_dict.append(["klokanek_cs", ColumnContent, ColumnContent("Klokanek", "number", True)])
+auto_eval_column_dict.append(["mall_reviews_cs", ColumnContent, ColumnContent("Mall Reviews", "number", True)])
+auto_eval_column_dict.append(["mmlu_cs", ColumnContent, ColumnContent("MMLU", "number", True)])
+auto_eval_column_dict.append(["sqad_cs", ColumnContent, ColumnContent("SQAD", "number", True)])
+auto_eval_column_dict.append(["subjectivity_cs", ColumnContent, ColumnContent("Subjectivity", "number", True)])
+auto_eval_column_dict.append(["truthfulqa_cs", ColumnContent, ColumnContent("TruthfulQA", "number", True)])
 
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 
+HEADER_MAP = {
+    "eval_name": "Model",
+    "precision": "Precision",
+    "hf_model_id": "Model link (temporary)",
+    "agree_cs": "AGREE",
+    "anli_cs": "ANLI",
+    "arc_challenge_cs": "ARC-Challenge",
+    "arc_easy_cs": "ARC-Easy",
+    "belebele_cs": "Belebele",
+    "ctkfacts_cs": "CTKFacts",
+    "czechnews_cs": "Czech News",
+    "fb_comments_cs": "Facebook Comments",
+    "gsm8k_cs": "GSM8K",
+    "klokanek_cs": "Klokanek",
+    "mall_reviews_cs": "Mall Reviews",
+    "mmlu_cs": "MMLU",
+    "sqad_cs": "SQAD",
+    "subjectivity_cs": "Subjectivity",
+    "truthfulqa_cs": "TruthfulQA",
+}
+
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
@@ -120,6 +143,9 @@ class WeightType(Enum):
 
 
 class Precision(Enum):
+    other = ModelDetails("other")
+    float64 = ModelDetails("float64")
+    float32 = ModelDetails("float32")
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     qt_8bit = ModelDetails("8bit")
@@ -128,17 +154,21 @@ class Precision(Enum):
     Unknown = ModelDetails("?")
 
     def from_str(precision):
-        if precision in ["torch.
+        if precision in ["torch.float64", "torch.double", "float64"]:
+            return Precision.float64
+        if precision in ["torch.float32", "torch.float", "float32"]:
+            return Precision.float32
+        if precision in ["torch.float16", "torch.half", "float16"]:
             return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
             return Precision.bfloat16
-        if precision in ["8bit"]:
+        if precision in ["8bit", "int8"]:
            return Precision.qt_8bit
-        if precision in ["4bit"]:
+        if precision in ["4bit", "int4"]:
            return Precision.qt_4bit
        if precision in ["GPTQ", "None"]:
            return Precision.qt_GPTQ
-        return Precision.
+        return Precision.other
 
 
 # Column selection
@@ -150,7 +180,7 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+BENCHMARK_COLS = [HEADER_MAP[t.value.col_name] for t in Tasks]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
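A self-contained sketch of the behaviour these hunks add may help: the extended `Precision.from_str` normalizer and the `HEADER_MAP` lookup. `ModelDetails` is simplified to a single field and only a few map entries are reproduced, so this is an illustration of the logic, not the full module.

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class ModelDetails:
    # Simplified stand-in; the real ModelDetails in src/display/utils.py has more fields.
    name: str


class Precision(Enum):
    other = ModelDetails("other")
    float64 = ModelDetails("float64")
    float32 = ModelDetails("float32")
    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
    qt_8bit = ModelDetails("8bit")
    qt_4bit = ModelDetails("4bit")
    qt_GPTQ = ModelDetails("GPTQ")
    Unknown = ModelDetails("?")

    @staticmethod
    def from_str(precision: str) -> "Precision":
        # Normalizes the many spellings a submitter might report.
        if precision in ["torch.float64", "torch.double", "float64"]:
            return Precision.float64
        if precision in ["torch.float32", "torch.float", "float32"]:
            return Precision.float32
        if precision in ["torch.float16", "torch.half", "float16"]:
            return Precision.float16
        if precision in ["torch.bfloat16", "bfloat16"]:
            return Precision.bfloat16
        if precision in ["8bit", "int8"]:
            return Precision.qt_8bit
        if precision in ["4bit", "int4"]:
            return Precision.qt_4bit
        if precision in ["GPTQ", "None"]:
            return Precision.qt_GPTQ
        return Precision.other  # anything unrecognized falls back to "other"


# A few HEADER_MAP entries: internal result keys -> display column headers.
HEADER_MAP = {
    "agree_cs": "AGREE",
    "gsm8k_cs": "GSM8K",
    "truthfulqa_cs": "TruthfulQA",
}

print(Precision.from_str("torch.half").value.name)        # float16
print(Precision.from_str("fp8").value.name)               # other (unrecognized)
print([HEADER_MAP[k] for k in ["agree_cs", "gsm8k_cs"]])  # ['AGREE', 'GSM8K']
```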
src/populate.py

@@ -4,7 +4,8 @@ import numpy as np
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model, model_hyperlink
-from src.display.
+from src.display.formatting import has_no_nan_values, make_clickable_model
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, HEADER_MAP
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
@@ -13,7 +14,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     #all_data_json = [v.to_dict() for v in raw_data]
     df = pd.DataFrame.from_records(raw_data)
     #df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df.rename(columns=
+    df = df.rename(columns=HEADER_MAP)
     df = df[cols].round(decimals=2)
     df.replace(r'\s+', np.nan, regex=True)
     # filter out if any of the benchmarks have not been produced
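Since `get_leaderboard_df` now routes raw result keys through `HEADER_MAP` before selecting display columns, here is a hedged pandas sketch of that rename-then-select pattern with made-up records and a subset of columns; the completeness filter below uses plain pandas in place of the Space's `has_no_nan_values` helper.

```python
import numpy as np
import pandas as pd

# Illustrative HEADER_MAP subset and fake raw records; the real records come
# from src.leaderboard.read_evals.get_raw_eval_results.
HEADER_MAP = {"eval_name": "Model", "precision": "Precision", "agree_cs": "AGREE", "gsm8k_cs": "GSM8K"}
raw_data = [
    {"eval_name": "model-a", "precision": "bfloat16", "agree_cs": 0.712345, "gsm8k_cs": 0.3141},
    {"eval_name": "model-b", "precision": "float16", "agree_cs": 0.6543, "gsm8k_cs": np.nan},
]
cols = ["Model", "Precision", "AGREE", "GSM8K"]

df = pd.DataFrame.from_records(raw_data)
df = df.rename(columns=HEADER_MAP)   # internal task keys -> display headers
df = df[cols].round(decimals=2)      # keep only display columns, round scores

# Same completeness idea as populate.py: drop rows missing any benchmark score.
benchmark_cols = ["AGREE", "GSM8K"]
df = df[df[benchmark_cols].notna().all(axis=1)]
print(df)
```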