davidadamczyk committed
Commit 2b8e93d · 2 parents: bc7fa0c, cd24b99

Fix git conflict

Files changed (4)
  1. app.py +11 -3
  2. src/display/about.py +46 -5
  3. src/display/utils.py +52 -22
  4. src/populate.py +3 -2
app.py CHANGED
@@ -306,9 +306,17 @@ with demo:
     with gr.Row():
         with gr.Column():
             model_name_textbox = gr.Textbox(label="Model name")
-            precision = gr.Radio(["bfloat16", "float16", "4bit"], label="Precision", info="What precision are you using for inference?")
-            hf_model_id = gr.Textbox(label="Model URL")
-            contact_email = gr.Textbox(label="E-Mail")
+            #precision = gr.Radio(["bfloat16", "float16", "4bit"], label="Precision", info="What precision are you using for inference?")
+            precision = gr.Dropdown(
+                choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                label="Precision",
+                multiselect=False,
+                value="other",
+                interactive=True,
+                info="What weight precision were you using during the evaluation?"
+            )
+            hf_model_id = gr.Textbox(label="Model link (Optional)", info="URL to the model's Hugging Face repository, or its official website")
+            contact_email = gr.Textbox(label="Your E-Mail")
             file_output = gr.File()
             upload_button = gr.UploadButton("Upload json", file_types=['.json'])
             upload_button.upload(validate_upload, upload_button, file_output)
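The submission form now derives its precision options from the `Precision` enum in `src/display/utils.py`, so the dropdown choices and the result-parsing logic share one source of truth. A minimal, self-contained sketch of that pattern (the stripped-down `ModelDetails` and `Precision` definitions below are illustrative stand-ins, not the repo's exact classes):

```python
from dataclasses import dataclass
from enum import Enum

import gradio as gr


@dataclass
class ModelDetails:  # stand-in for the repo's ModelDetails
    name: str


class Precision(Enum):  # stand-in subset of the repo's Precision enum
    other = ModelDetails("other")
    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
    Unknown = ModelDetails("?")


with gr.Blocks() as demo:
    # Choices come straight from the enum; "Unknown" is excluded, and the
    # default matches the newly added "other" member.
    precision = gr.Dropdown(
        choices=[p.value.name for p in Precision if p != Precision.Unknown],
        label="Precision",
        multiselect=False,
        value="other",
        interactive=True,
    )

if __name__ == "__main__":
    demo.launch()
```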
src/display/about.py CHANGED
@@ -31,6 +31,8 @@ class Tasks(Enum):
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">🇨🇿 CzechBench Leaderboard</h1>"""
 
+TABLE_DESC = "The values presented in the table represent the accuracy metric."
+
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 Czech-Bench is a collection of LLM benchmarks available for the Czech language. It currently consists of 15 Czech benchmarks, including new machine translations of the popular ARC, GSM8K, MMLU, and TruthfulQA datasets.
@@ -38,20 +40,55 @@ Czech-Bench is a collection of LLM benchmarks available for the Czech language.
 Czech-Bench is developed by <a href="https://huggingface.co/CIIRC-NLP">CIIRC-NLP</a>.
 """
 
-TABLE_DESC = "The values presented in the table represent the accuracy metric."
-
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## Basic Information
-The goal of this project is to provide a comprehensive and practical benchmark for evaluating Czech language models. This benchmark consists of 15 selected test tasks containing test data in the Czech language. It includes both original Czech datasets and machine translations of popular datasets such as ARC, GSM8K, MMLU, and TruthfulQA. A list of all datasets can be found at [GitHub](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench#readme)
+The goal of the CzechBench project is to provide a comprehensive and practical benchmark for evaluating Czech language models.
+Our [evaluation suite](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench#readme)
+currently consists of 15 individual tasks, leveraging pre-existing Czech datasets together with new machine translations of popular LLM benchmarks,
+including ARC, GSM8K, MMLU, and TruthfulQA.
 
 Key Features and Benefits:
 - **Tailored for the Czech Language:** The benchmark includes both original Czech datasets and adapted versions of international datasets, ensuring relevant evaluation of model performance in the Czech context.
 - **Wide Range of Tasks:** It contains 15 different tasks that cover various aspects of language understanding and text generation, enabling a comprehensive assessment of the model's capabilities.
+- **Universal model support:** The universal text-to-text evaluation approach adopted in CzechBench allows for direct comparison of models with varying levels of internal access, including commercial APIs.
 - **Ease of Use:** The benchmark is designed to be easily integrated into your development process, saving time and resources during model testing and improvement.
 - **Up-to-date and Relevant:** We regularly update our datasets to reflect the latest findings and trends in language model development.
+By using CzechBench, you will gain deep insights into the strengths and weaknesses of your models, allowing you to better focus on key areas for optimization.
+This will not only improve the performance of your models but also enhance their real-world deployment in various Czech contexts.
+
+Below, you can find the up-to-date leaderboard of models evaluated on CzechBench.
+For more information on the included benchmarks and instructions on evaluating your own models, please visit the "About" section below.
+
+"""
+# Czech-Bench is developed by <a href="https://huggingface.co/CIIRC-NLP">CIIRC-NLP</a>.
+
+# Which evaluations are you running? how can people reproduce what you have?
+LLM_BENCHMARKS_TEXT = f"""
+## Basic Information
 
-By using this benchmark, you will gain deep insights into the strengths and weaknesses of your models, allowing you to better focus on key areas for optimization. This will not only improve the performance of your models but also enhance their real-world deployment in various Czech contexts.
+The CzechBench evaluation suite is hosted on [GitHub](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench#readme).
+It is implemented on top of the popular [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) framework, which provides extensive model compatibility and optimal evaluation efficiency.
+
+All currently supported benchmarks are listed in the table below:
+
+| Dataset | Language | Task type | Metrics | Samples | Task ID |
+| --- | --- | --- | --- | ---: | --- |
+| [AGREE](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/agree_cs) | CS (Original) | Subject-verb agreement | Acc | 627 | agree_cs |
+| [ANLI](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/anli_cs) | CS (Translated) | Natural Language Inference | Acc, Macro F1 | 1200 | anli_cs |
+| [ARC Challenge](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/arc_cs) | CS (Translated) | Knowledge-Based QA | Acc | 1172 | arc_cs |
+| [ARC Easy](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/arc_cs) | CS (Translated) | Knowledge-Based QA | Acc | 2376 | arc_cs |
+| [Belebele](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/belebele_cs) | CS (Professional translation) | Reading Comprehension / QA | Acc | 895 | belebele_cs |
+| [CTKFacts](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/ctkfacts_cs) | CS (Original) | Natural Language Inference | Acc, Macro F1 | 558 | ctkfacts_cs |
+| [Czech News](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/czechnews_cs) | CS (Original) | News Topic Classification | Acc, Macro F1 | 1000 | czechnews_cs |
+| [Facebook Comments](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/fb_comments_cs) | CS (Original) | Sentiment Analysis | Acc, Macro F1 | 1000 | fb_comments_cs |
+| [GSM8K](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/gsm8k_cs) | CS (Translated) | Mathematical inference | EM Acc | 1319 | gsm8k_cs |
+| [Klokánek](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/klokanek_cs) | CS (Original) | Math/Logical Inference | Acc | 808 | klokanek_cs |
+| [Mall Reviews](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/mall_reviews_cs) | CS (Original) | Sentiment Analysis | Acc, Macro F1 | 3000 | mall_reviews_cs |
+| [MMLU](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/mmlu_cs) | CS (Translated) | Knowledge-Based QA | Acc | 12408 | mmlu_cs |
+| [SQAD](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/sqad_cs) | CS (Original) | Reading Comprehension / QA | EM Acc, BoW F1 | 843 | sqad_cs |
+| [Subjectivity](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/subjectivity_cs) | CS (Original) | Subjectivity Analysis | Acc, Macro F1 | 2000 | subjectivity_cs |
+| [TruthfulQA](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/truthfulqa_cs) | CS (Translated) | Knowledge-Based QA | Acc | 813 | truthfulqa_cs |
 
 ## Evaluation Process
 
@@ -79,10 +116,14 @@ lm_eval --model hf \\
 --output_path $OUTPUT_PATH \\
 --apply_chat_template \\
 ```
+
+For advanced usage instructions, please inspect the [CzechBench README on GitHub](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench#readme)
+or the official [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) documentation.
 
 
 ### 3. Upload results to Leaderboard
-in `$OUTPUT_PATH` directory you can find file `results.json` upload `result.json` to [CzechBench Leaderboard](https://huggingface.co/spaces/CIIRC-NLP/czechbench_leaderboard) on **Submit Here!** tab.
+Inside the `$OUTPUT_PATH` directory, you can find the file `results.json`.
+To submit your evaluation results to our leaderboard, please visit the "Submit here!" section above and upload your `results.json` file.
 
 """
 
src/display/utils.py CHANGED
@@ -47,30 +47,53 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
 """
 
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+
+auto_eval_column_dict.append(["eval_name", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
-auto_eval_column_dict.append(["hf_model_id", ColumnContent, ColumnContent("hf_model_id", "str", False)])
-auto_eval_column_dict.append(["agree_cs", ColumnContent, ColumnContent("agree_cs", "number", True)])
-auto_eval_column_dict.append(["anli_cs", ColumnContent, ColumnContent("anli_cs", "number", True)])
-auto_eval_column_dict.append(["arc_challenge_cs", ColumnContent, ColumnContent("arc_challenge_cs", "number", True)])
-auto_eval_column_dict.append(["arc_easy_cs", ColumnContent, ColumnContent("arc_easy_cs", "number", True)])
-auto_eval_column_dict.append(["belebele_cs", ColumnContent, ColumnContent("belebele_cs", "number", True)])
-auto_eval_column_dict.append(["ctkfacts_cs", ColumnContent, ColumnContent("ctkfacts_cs", "number", True)])
-auto_eval_column_dict.append(["czechnews_cs", ColumnContent, ColumnContent("czechnews_cs", "number", True)])
-auto_eval_column_dict.append(["fb_comments_cs", ColumnContent, ColumnContent("fb_comments_cs", "number", True)])
-auto_eval_column_dict.append(["gsm8k_cs", ColumnContent, ColumnContent("gsm8k_cs", "number", True)])
-auto_eval_column_dict.append(["klokanek_cs", ColumnContent, ColumnContent("klokanek_cs", "number", True)])
-auto_eval_column_dict.append(["mall_reviews_cs", ColumnContent, ColumnContent("mall_reviews_cs", "number", True)])
-auto_eval_column_dict.append(["mmlu_cs", ColumnContent, ColumnContent("mmlu_cs", "number", True)])
-auto_eval_column_dict.append(["sqad_cs", ColumnContent, ColumnContent("sqad_cs", "number", True)])
-auto_eval_column_dict.append(["subjectivity_cs", ColumnContent, ColumnContent("subjectivity_cs", "number", True)])
-auto_eval_column_dict.append(["truthfulqa_cs", ColumnContent, ColumnContent("truthfulqa_cs", "number", True)])
+auto_eval_column_dict.append(["hf_model_id", ColumnContent, ColumnContent("Model link (temporary)", "str", True)])
+auto_eval_column_dict.append(["agree_cs", ColumnContent, ColumnContent("AGREE", "number", True)])
+auto_eval_column_dict.append(["anli_cs", ColumnContent, ColumnContent("ANLI", "number", True)])
+auto_eval_column_dict.append(["arc_challenge_cs", ColumnContent, ColumnContent("ARC-Challenge", "number", True)])
+auto_eval_column_dict.append(["arc_easy_cs", ColumnContent, ColumnContent("ARC-Easy", "number", True)])
+auto_eval_column_dict.append(["belebele_cs", ColumnContent, ColumnContent("Belebele", "number", True)])
+auto_eval_column_dict.append(["ctkfacts_cs", ColumnContent, ColumnContent("CTKFacts", "number", True)])
+auto_eval_column_dict.append(["czechnews_cs", ColumnContent, ColumnContent("Czech News", "number", True)])
+auto_eval_column_dict.append(["fb_comments_cs", ColumnContent, ColumnContent("Facebook Comments", "number", True)])
+auto_eval_column_dict.append(["gsm8k_cs", ColumnContent, ColumnContent("GSM8K", "number", True)])
+auto_eval_column_dict.append(["klokanek_cs", ColumnContent, ColumnContent("Klokanek", "number", True)])
+auto_eval_column_dict.append(["mall_reviews_cs", ColumnContent, ColumnContent("Mall Reviews", "number", True)])
+auto_eval_column_dict.append(["mmlu_cs", ColumnContent, ColumnContent("MMLU", "number", True)])
+auto_eval_column_dict.append(["sqad_cs", ColumnContent, ColumnContent("SQAD", "number", True)])
+auto_eval_column_dict.append(["subjectivity_cs", ColumnContent, ColumnContent("Subjectivity", "number", True)])
+auto_eval_column_dict.append(["truthfulqa_cs", ColumnContent, ColumnContent("TruthfulQA", "number", True)])
 
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 
+HEADER_MAP = {
+    "eval_name": "Model",
+    "precision": "Precision",
+    "hf_model_id": "Model link (temporary)",
+    "agree_cs": "AGREE",
+    "anli_cs": "ANLI",
+    "arc_challenge_cs": "ARC-Challenge",
+    "arc_easy_cs": "ARC-Easy",
+    "belebele_cs": "Belebele",
+    "ctkfacts_cs": "CTKFacts",
+    "czechnews_cs": "Czech News",
+    "fb_comments_cs": "Facebook Comments",
+    "gsm8k_cs": "GSM8K",
+    "klokanek_cs": "Klokanek",
+    "mall_reviews_cs": "Mall Reviews",
+    "mmlu_cs": "MMLU",
+    "sqad_cs": "SQAD",
+    "subjectivity_cs": "Subjectivity",
+    "truthfulqa_cs": "TruthfulQA",
+}
+
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
@@ -120,6 +143,9 @@ class WeightType(Enum):
 
 
 class Precision(Enum):
+    other = ModelDetails("other")
+    float64 = ModelDetails("float64")
+    float32 = ModelDetails("float32")
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     qt_8bit = ModelDetails("8bit")
@@ -128,17 +154,21 @@ class Precision(Enum):
     Unknown = ModelDetails("?")
 
     def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
+        if precision in ["torch.float64", "torch.double", "float64"]:
+            return Precision.float64
+        if precision in ["torch.float32", "torch.float", "float32"]:
+            return Precision.float32
+        if precision in ["torch.float16", "torch.half", "float16"]:
             return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
            return Precision.bfloat16
-        if precision in ["8bit"]:
+        if precision in ["8bit", "int8"]:
            return Precision.qt_8bit
-        if precision in ["4bit"]:
+        if precision in ["4bit", "int4"]:
            return Precision.qt_4bit
         if precision in ["GPTQ", "None"]:
            return Precision.qt_GPTQ
-        return Precision.Unknown
+        return Precision.other
 
 
 # Column selection
@@ -150,7 +180,7 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+BENCHMARK_COLS = [HEADER_MAP[t.value.col_name] for t in Tasks]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
src/populate.py CHANGED
@@ -4,7 +4,8 @@ import numpy as np
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model, model_hyperlink
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.display.formatting import has_no_nan_values, make_clickable_model
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, HEADER_MAP
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
@@ -13,7 +14,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     #all_data_json = [v.to_dict() for v in raw_data]
     df = pd.DataFrame.from_records(raw_data)
     #df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df.rename(columns={'eval_name': 'Model', 'precision': 'Precision'})
+    df = df.rename(columns=HEADER_MAP)
     df = df[cols].round(decimals=2)
     df.replace(r'\s+', np.nan, regex=True)
     # filter out if any of the benchmarks have not been produced
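With `HEADER_MAP` doing the renaming, `get_leaderboard_df` no longer hard-codes display names. A small sketch of the rename-and-select step on toy records (the model names, scores, and column subset below are made up for illustration):

```python
import pandas as pd

from src.display.utils import HEADER_MAP

# Toy records shaped like the raw eval results (values are illustrative).
raw_data = [
    {"eval_name": "model-a", "precision": "bfloat16", "gsm8k_cs": 0.4123, "mmlu_cs": 0.5571},
    {"eval_name": "model-b", "precision": "float16",  "gsm8k_cs": 0.3987, "mmlu_cs": 0.5012},
]

df = pd.DataFrame.from_records(raw_data)
# A single rename call now covers the model, precision, and task columns,
# so the display headers are defined in exactly one place (HEADER_MAP).
df = df.rename(columns=HEADER_MAP)
cols = ["Model", "Precision", "GSM8K", "MMLU"]
print(df[cols].round(decimals=2))
```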