from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Init: to update with your specific keys
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("agree_cs", "accuracy", "agree_cs")
    task1 = Task("anli_cs", "accuracy", "anli_cs")
    task2 = Task("arc_challenge_cs", "accuracy", "arc_challenge_cs")
    task3 = Task("arc_easy_cs", "accuracy", "arc_easy_cs")
    task4 = Task("belebele_cs", "accuracy", "belebele_cs")
    task5 = Task("ctkfacts_cs", "accuracy", "ctkfacts_cs")
    task6 = Task("czechnews_cs", "accuracy", "czechnews_cs")
    task7 = Task("fb_comments_cs", "accuracy", "fb_comments_cs")
    task8 = Task("gsm8k_cs", "accuracy", "gsm8k_cs")
    task9 = Task("klokanek_cs", "accuracy", "klokanek_cs")
    task10 = Task("mall_reviews_cs", "accuracy", "mall_reviews_cs")
    task11 = Task("mmlu_cs", "accuracy", "mmlu_cs")
    task12 = Task("sqad_cs", "accuracy", "sqad_cs")
    task13 = Task("subjectivity_cs", "accuracy", "subjectivity_cs")
    task14 = Task("truthfulqa_cs", "accuracy", "truthfulqa_cs")
# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">🇨🇿 CzechBench Leaderboard</h1>"""
TABLE_DESC = "The values shown in the leaderboard table are accuracy scores, expressed as percentages."
# What does your leaderboard evaluate?
INTRODUCTION_OLD = """
Czech-Bench is a collection of LLM benchmarks available for the Czech language. It currently consists of 15 Czech benchmarks, including new machine translations of the popular ARC, GSM8K, MMLU, and TruthfulQA datasets.
Czech-Bench is developed by <a href="https://huggingface.co/CIIRC-NLP">CIIRC-NLP</a>.
"""
# Which evaluations are you running? how can people reproduce what you have?
INTRODUCTION_TEXT = f"""
The goal of the CzechBench project is to provide a comprehensive and practical benchmark for evaluating Czech language models.
Our [evaluation suite](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench#readme)
currently consists of 15 individual tasks, leveraging pre-existing Czech datasets together with new machine translations of popular LLM benchmarks,
including ARC, GSM8K, MMLU, and TruthfulQA. This work is brought to you by CIIRC CTU and VSB Ostrava.
Key Features and Benefits:
- **Tailored for the Czech Language:**
CzechBench includes both original Czech datasets and adapted versions of international datasets, ensuring relevant evaluation of model performance in the Czech context.
- **Wide Range of Tasks:**
It contains 15 different tasks that cover various aspects of language understanding and text generation, enabling a comprehensive assessment of the model's capabilities.
- **Bilingual performance analysis:**
CzechBench also offers a parallel collection of 9 English tasks corresponding to the Czech versions included in the main suite.
This allows for direct comparison of model performance across both languages with equivalent conditions in terms of prompt formulation and few-shot example selection.
- **Universal model support:**
The universal text-to-text evaluation approach adopted in CzechBench allows for direct comparison of models with varying levels of internal access, including commercial APIs.
- **Ease of Use:**
The benchmark is built upon a commonly used evaluation framework with wide support for state-of-the-art models and inference acceleration tools.
- **Empowering decisions:**
Whether you are a business looking for the best LLM solution to base your application on, or a research team trying to maximize the capabilities of the models they are developing,
CzechBench will help you gain insights into the particular strengths and weaknesses of individual models and better focus on key areas for optimization.
Below, you can find the up-to-date leaderboard of models evaluated on CzechBench.
For more information on the included benchmarks and instructions on evaluating your own models, please visit the "About" section below.
"""
# Czech-Bench is developed by <a href="https://huggingface.co/CIIRC-NLP">CIIRC-NLP</a>.
# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = """
## Basic Information
The CzechBench evaluation suite is hosted on [GitHub](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench#readme).
It is implemented on top of the popular [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) framework, which provides extensive model compatibility and optimal evaluation efficiency.
All currently supported benchmarks are listed in the table below:
| Dataset | Language | Task type | Metrics | Samples | Task ID |
| ------------------------------------------------------------ | ----------------------------- | -------------------------- | -------------- | ------: | --------------- |
| [AGREE](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/agree_cs) | CS (Original) | Subject-verb agreement | Acc | 627 | agree_cs |
| [ANLI](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/anli_cs) | CS (Translated) | Natural Language Inference | Acc, Macro F1 | 1200 | anli_cs |
| [ARC Challenge](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/arc_cs) | CS (Translated) | Knowledge-Based QA | Acc | 1172 | arc_challenge_cs |
| [ARC Easy](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/arc_cs) | CS (Translated) | Knowledge-Based QA | Acc | 2376 | arc_easy_cs |
| [Belebele](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/belebele_cs) | CS (Professional translation) | Reading Comprehension / QA | Acc | 895 | belebele_cs |
| [CTKFacts](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/ctkfacts_cs) | CS (Original) | Natural Language Inference | Acc, Macro F1 | 558 | ctkfacts_cs |
| [Czech News](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/czechnews_cs) | CS (Original) | News Topic Classification | Acc, Macro F1 | 1000 | czechnews_cs |
| [Facebook Comments](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/fb_comments_cs) | CS (Original) | Sentiment Analysis | Acc, Macro F1 | 1000 | fb_comments_cs |
| [GSM8K](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/gsm8k_cs) | CS (Translated) | Mathematical inference | EM Acc | 1319 | gsm8k_cs |
| [Klokánek](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/klokanek_cs) | CS (Original) | Math/Logical Inference | Acc | 808 | klokanek_cs |
| [Mall Reviews](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/mall_reviews_cs) | CS (Original) | Sentiment Analysis | Acc, Macro F1 | 3000 | mall_reviews_cs |
| [MMLU](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/mmlu_cs) | CS (Translated) | Knowledge-Based QA | Acc | 12408 | mmlu_cs |
| [SQAD](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/sqad_cs) | CS (Original) | Reading Comprehension / QA | EM Acc, BoW F1 | 843 | sqad_cs |
| [Subjectivity](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/subjectivity_cs) | CS (Original) | Subjectivity Analysis | Acc, Macro F1 | 2000 | subjectivity_cs |
| [TruthfulQA](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/truthfulqa_cs) | CS (Translated) | Knowledge-Based QA | Acc | 813 | truthfulqa_cs |
The leaderboard table also displays aggregated scores across task categories, including:
- **Grammar (Avg.):** AGREE
- **Knowledge (Avg.):** ARC-Challenge, ARC-Easy, MMLU, TruthfulQA
- **Reasoning (Avg.):** ANLI, Belebele, CTKFacts, SQAD
- **Math (Avg.):** GSM8K, Klokánek
- **Classification (Avg.):** Czech News, Facebook Comments, Mall Reviews, Subjectivity
- **Aggregate Score:** Average over the above categories
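
A minimal sketch of this aggregation (the category-to-task mapping restates the list above; the per-task accuracy dictionary is a hypothetical input, not the leaderboard's actual code):

```
CATEGORIES = {
    "Grammar (Avg.)": ["agree_cs"],
    "Knowledge (Avg.)": ["arc_challenge_cs", "arc_easy_cs", "mmlu_cs", "truthfulqa_cs"],
    "Reasoning (Avg.)": ["anli_cs", "belebele_cs", "ctkfacts_cs", "sqad_cs"],
    "Math (Avg.)": ["gsm8k_cs", "klokanek_cs"],
    "Classification (Avg.)": ["czechnews_cs", "fb_comments_cs", "mall_reviews_cs", "subjectivity_cs"],
}

def aggregate(accuracy: dict) -> dict:
    # Average the per-task accuracies within each category...
    scores = {
        name: sum(accuracy[task] for task in tasks) / len(tasks)
        for name, tasks in CATEGORIES.items()
    }
    # ...then average the five category scores into the final Aggregate Score.
    scores["Aggregate Score"] = sum(scores.values()) / len(scores)
    return scores
```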
## Evaluation Process
### 1. Install CzechBench:
```
git clone https://github.com/jirkoada/czechbench_eval_harness.git
cd czechbench_eval_harness
pip install -e ".[api]"
```
### 2. Run evaluation
* `export MODEL=your_model_name`, where `your_model_name` is the Hugging Face path of a public model. For example: `export MODEL=meta-llama/Meta-Llama-3.1-8B-Instruct`
* `export OUTPUT_PATH=my_output_path`, where `my_output_path` is the directory for storing evaluation reports
Then run the following command (you can adjust parameters such as `batch_size` or `device`):
```
lm_eval --model hf \\
    --model_args pretrained=$MODEL \\
    --tasks czechbench_tasks \\
    --device cuda:0 \\
    --batch_size 1 \\
    --write_out \\
    --log_samples \\
    --output_path $OUTPUT_PATH \\
    --apply_chat_template
```
For advanced usage instructions, please inspect the [CzechBench README on GitHub](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench#readme)
or the official [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) documentation.
### 3. Upload results to Leaderboard
Inside the `$OUTPUT_PATH` directory, you can find the file `results.json`.
To submit your evaluation results to our leaderboard, please visit the "Submit here!" section above and upload your `results.json` file.
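As an optional sanity check before submitting, you can inspect the reported metrics (illustrative snippet; exact metric key names depend on your harness version):

```
import json

with open("results.json") as f:
    report = json.load(f)

# Print the reported metrics for each CzechBench task
for task_name, metrics in report["results"].items():
    print(task_name, metrics)
```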
"""
EVALUATION_QUEUE_TEXT = """
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{czechbench,
  title = {CzechBench Leaderboard},
  author = {Adam Jirkovský and David Adamczyk and Jan Hůla and Jan Šedivý},
  year = {2024},
  url = {https://huggingface.co/spaces/CIIRC-NLP/czechbench_leaderboard}
}

@mastersthesis{jirkovsky-thesis,
  author = {Jirkovský, Adam},
  title = {Benchmarking Techniques for Evaluation of Large Language Models},
  school = {Czech Technical University in Prague, Faculty of Electrical Engineering},
  year = {2024},
  url = {https://dspace.cvut.cz/handle/10467/115227}
}"""