davidadamczyk
committed on
Commit
·
66b85dd
1
Parent(s):
d48413e
Update
Browse files
- src/display/about.py +12 -28
- src/envs.py +6 -6
src/display/about.py
CHANGED
@@ -38,12 +38,22 @@ TITLE = """<h1 align="center" id="space-title">🇨🇿 CzechBench Leaderboard</
|
|
38 |
|
39 |
# What does your leaderboard evaluate?
|
40 |
INTRODUCTION_TEXT = """
|
41 |
-
|
42 |
"""
|
43 |
|
44 |
# Which evaluations are you running? how can people reproduce what you have?
|
45 |
LLM_BENCHMARKS_TEXT = f"""
|
46 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
## Reproducibility
|
49 |
To reproduce our results, here is the commands you can run:
|
@@ -51,33 +61,7 @@ To reproduce our results, here is the commands you can run:
|
|
51 |
"""
|
52 |
|
53 |
EVALUATION_QUEUE_TEXT = """
|
54 |
-
## Some good practices before submitting a model
|
55 |
-
|
56 |
-
### 1) Make sure you can load your model and tokenizer using AutoClasses:
|
57 |
-
```python
|
58 |
-
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
59 |
-
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
60 |
-
model = AutoModel.from_pretrained("your model name", revision=revision)
|
61 |
-
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
62 |
-
```
|
63 |
-
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
|
64 |
-
|
65 |
-
Note: make sure your model is public!
|
66 |
-
Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
|
67 |
-
|
68 |
-
### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
|
69 |
-
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
|
70 |
-
|
71 |
-
### 3) Make sure your model has an open license!
|
72 |
-
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
|
73 |
-
|
74 |
-
### 4) Fill up your model card
|
75 |
-
When we add extra information about models to the leaderboard, it will be automatically taken from the model card
|
76 |
|
77 |
-
## In case of model failure
|
78 |
-
If your model is displayed in the `FAILED` category, its execution stopped.
|
79 |
-
Make sure you have followed the above steps first.
|
80 |
-
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
81 |
"""
|
82 |
|
83 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
|
|
38 |
|
39 |
# What does your leaderboard evaluate?
|
40 |
INTRODUCTION_TEXT = """
|
41 |
+
Czech-Bench is a collection of LLM benchmarks available for the Czech language. It currently consists of 15 Czech benchmarks, including new machine translations of the popular ARC, GSM8K, MMLU, and TruthfulQA datasets.
|
42 |
"""
|
43 |
|
44 |
# Which evaluations are you running? how can people reproduce what you have?
|
45 |
LLM_BENCHMARKS_TEXT = f"""
|
46 |
+
## Basic Information
|
47 |
+
The goal of this project is to provide a comprehensive and practical benchmark for evaluating Czech language models. This benchmark consists of 15 selected test tasks containing test data in the Czech language. It includes both original Czech datasets and machine translations of popular datasets such as ARC, GSM8K, MMLU, and TruthfulQA. A list of all datasets can be found at [link na tabulku umístěnou na GitHubu]
|
48 |
+
|
49 |
+
Key Features and Benefits:
|
50 |
+
- **Tailored for the Czech Language:** The benchmark includes both original Czech datasets and adapted versions of international datasets, ensuring relevant evaluation of model performance in the Czech context.
|
51 |
+
- **Wide Range of Tasks:** It contains 15 different tasks that cover various aspects of language understanding and text generation, enabling a comprehensive assessment of the model's capabilities.
|
52 |
+
- **Ease of Use:** The benchmark is designed to be easily integrated into your development process, saving time and resources during model testing and improvement.
|
53 |
+
- **Up-to-date and Relevant:** We regularly update our datasets to reflect the latest findings and trends in language model development.
|
54 |
+
|
55 |
+
By using this benchmark, you will gain deep insights into the strengths and weaknesses of your models, allowing you to better focus on key areas for optimization. This will not only improve the performance of your models but also enhance their real-world deployment in various Czech contexts.
|
56 |
+
|
57 |
|
58 |
## Reproducibility
|
59 |
To reproduce our results, here is the commands you can run:
|
|
|
61 |
"""
|
62 |
|
63 |
EVALUATION_QUEUE_TEXT = """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
|
|
|
|
|
|
|
|
|
65 |
"""
|
66 |
|
67 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
src/envs.py
CHANGED
@@ -4,13 +4,13 @@ from huggingface_hub import HfApi
|
|
4 |
|
5 |
# clone / pull the lmeh eval data
|
6 |
TOKEN = os.environ.get("TOKEN", None)
|
7 |
-
print(TOKEN)
|
8 |
-
OWNER = "davidadamczyk"
|
9 |
-
REPO_ID = f"{OWNER}/leaderboard"
|
10 |
-
QUEUE_REPO = f"{OWNER}/leaderboard"
|
11 |
-
RESULTS_REPO = f"{OWNER}/leaderboard"
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
# Local caches
|
16 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
|
|
4 |
|
5 |
# clone / pull the lmeh eval data
|
6 |
TOKEN = os.environ.get("TOKEN", None)
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
+
OWNER = "CIIRC-NLP"
|
9 |
+
REPO_ID = f"{OWNER}/czechbench_leaderboard"
|
10 |
+
QUEUE_REPO = f"{OWNER}/czechbench_leaderboard"
|
11 |
+
RESULTS_REPO = f"{OWNER}/czechbench_leaderboard"
|
12 |
+
|
13 |
+
CACHE_PATH = os.getenv("HF_HOME", ".")
|
14 |
|
15 |
# Local caches
|
16 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|