davidadamczyk committed: Fix git conflict

Changed files:
- app.py (+11 -3)
- src/display/about.py (+46 -5)
- src/display/utils.py (+52 -22)
- src/populate.py (+3 -2)
app.py

@@ -306,9 +306,17 @@ with demo:
     with gr.Row():
         with gr.Column():
             model_name_textbox = gr.Textbox(label="Model name")
-            precision = gr.Radio(["bfloat16", "float16", "4bit"], label="Precision", info="What precision are you using for inference?")
-
-
+            #precision = gr.Radio(["bfloat16", "float16", "4bit"], label="Precision", info="What precision are you using for inference?")
+            precision = gr.Dropdown(
+                choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                label="Precision",
+                multiselect=False,
+                value="other",
+                interactive=True,
+                info="What weight precision were you using during the evaluation?"
+            )
+            hf_model_id = gr.Textbox(label="Model link (Optional)", info="URL to the model's Hugging Face repository, or its official website")
+            contact_email = gr.Textbox(label="Your E-Mail")
             file_output = gr.File()
             upload_button = gr.UploadButton("Upload json", file_types=['.json'])
             upload_button.upload(validate_upload, upload_button, file_output)
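For readers who want to poke at the new submission widgets in isolation, here is a minimal Gradio sketch of the form this hunk builds. It is an illustration only: the literal `choices` list stands in for the comprehension over the `Precision` enum (extended in the src/display/utils.py hunk below, and only partially visible there), and `validate_upload` is reduced to a placeholder.

```python
# Minimal sketch of the new submission form (assumes gradio is installed).
import gradio as gr


def validate_upload(file):
    # Placeholder for the Space's real validation logic.
    return file


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            model_name_textbox = gr.Textbox(label="Model name")
            precision = gr.Dropdown(
                # Stand-in for: [i.value.name for i in Precision if i != Precision.Unknown]
                choices=["other", "float64", "float32", "float16", "bfloat16", "8bit", "4bit", "GPTQ"],
                label="Precision",
                multiselect=False,
                value="other",
                interactive=True,
                info="What weight precision were you using during the evaluation?",
            )
            hf_model_id = gr.Textbox(label="Model link (Optional)")
            contact_email = gr.Textbox(label="Your E-Mail")
            file_output = gr.File()
            upload_button = gr.UploadButton("Upload json", file_types=[".json"])
            upload_button.upload(validate_upload, upload_button, file_output)

if __name__ == "__main__":
    demo.launch()
```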
src/display/about.py

@@ -31,6 +31,8 @@ class Tasks(Enum):
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">🇨🇿 CzechBench Leaderboard</h1>"""
 
+TABLE_DESC = "The values presented in the table represent the accuracy metric."
+
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 Czech-Bench is a collection of LLM benchmarks available for the Czech language. It currently consists of 15 Czech benchmarks, including new machine translations of the popular ARC, GSM8K, MMLU, and TruthfulQA datasets.
@@ -38,20 +40,55 @@ Czech-Bench is a collection of LLM benchmarks available for the Czech language.
 Czech-Bench is developed by <a href="https://huggingface.co/CIIRC-NLP">CIIRC-NLP</a>.
 """
 
-TABLE_DESC = "The values presented in the table represent the accuracy metric."
-
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## Basic Information
-The goal of
+The goal of the CzechBench project is to provide a comprehensive and practical benchmark for evaluating Czech language models.
+Our [evaluation suite](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench#readme)
+currently consists of 15 individual tasks, leveraging pre-existing Czech datasets together with new machine translations of popular LLM benchmarks,
+including ARC, GSM8K, MMLU, and TruthfulQA.
 
 Key Features and Benefits:
 - **Tailored for the Czech Language:** The benchmark includes both original Czech datasets and adapted versions of international datasets, ensuring relevant evaluation of model performance in the Czech context.
 - **Wide Range of Tasks:** It contains 15 different tasks that cover various aspects of language understanding and text generation, enabling a comprehensive assessment of the model's capabilities.
+- **Universal model support:** The universal text-to-text evaluation approach adopted in CzechBench allows for direct comparison of models with varying levels of internal access, including commercial APIs.
 - **Ease of Use:** The benchmark is designed to be easily integrated into your development process, saving time and resources during model testing and improvement.
 - **Up-to-date and Relevant:** We regularly update our datasets to reflect the latest findings and trends in language model development.
+By using CzechBench, you will gain deep insights into the strengths and weaknesses of your models, allowing you to better focus on key areas for optimization.
+This will not only improve the performance of your models but also enhance their real-world deployment in various Czech contexts.
+
+Below, you can find the up-to-date leaderboard of models evaluated on CzechBench.
+For more information on the included benchmarks and instructions on evaluating your own models, please visit the "About" section below.
+
+"""
+# Czech-Bench is developed by <a href="https://huggingface.co/CIIRC-NLP">CIIRC-NLP</a>.
+
+# Which evaluations are you running? how can people reproduce what you have?
+LLM_BENCHMARKS_TEXT = f"""
+## Basic Information
 
-
+The CzechBench evaluation suite is hosted on [GitHub](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench#readme).
+It is implemented on top of the popular [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) framework, which provides extensive model compatibility and optimal evaluation efficiency.
+
+All currently supported benchmarks are listed in the table below:
+
+| Dataset | Language | Task type | Metrics | Samples | Task ID |
+| ------------------------------------------------------------ | ----------------------------- | -------------------------- | -------------- | ------: | --------------- |
+| [AGREE](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/agree_cs) | CS (Original) | Subject-verb agreement | Acc | 627 | agree_cs |
+| [ANLI](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/anli_cs) | CS (Translated) | Natural Language Inference | Acc, Macro F1 | 1200 | anli_cs |
+| [ARC Challenge](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/arc_cs) | CS (Translated) | Knowledge-Based QA | Acc | 1172 | arc_cs |
+| [ARC Easy](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/arc_cs) | CS (Translated) | Knowledge-Based QA | Acc | 2376 | arc_cs |
+| [Belebele](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/belebele_cs) | CS (Professional translation) | Reading Comprehension / QA | Acc | 895 | belebele_cs |
+| [CTKFacts](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/ctkfacts_cs) | CS (Original) | Natural Language Inference | Acc, Macro F1 | 558 | ctkfacts_cs |
+| [Czech News](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/czechnews_cs) | CS (Original) | News Topic Classification | Acc, Macro F1 | 1000 | czechnews_cs |
+| [Facebook Comments](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/fb_comments_cs) | CS (Original) | Sentiment Analysis | Acc, Macro F1 | 1000 | fb_comments_cs |
+| [GSM8K](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/gsm8k_cs) | CS (Translated) | Mathematical inference | EM Acc | 1319 | gsm8k_cs |
+| [Klokánek](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/klokanek_cs) | CS (Original) | Math/Logical Inference | Acc | 808 | klokanek_cs |
+| [Mall Reviews](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/mall_reviews_cs) | CS (Original) | Sentiment Analysis | Acc, Macro F1 | 3000 | mall_reviews_cs |
+| [MMLU](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/mmlu_cs) | CS (Translated) | Knowledge-Based QA | Acc | 12408 | mmlu_cs |
+| [SQAD](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/sqad_cs) | CS (Original) | Reading Comprehension / QA | EM Acc, BoW F1 | 843 | sqad_cs |
+| [Subjectivity](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/subjectivity_cs) | CS (Original) | Subjectivity Analysis | Acc, Macro F1 | 2000 | subjectivity_cs |
+| [TruthfulQA](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/truthfulqa_cs) | CS (Translated) | Knowledge-Based QA | Acc | 813 | truthfulqa_cs |
 
 ## Evaluation Process
 
@@ -79,10 +116,14 @@ lm_eval --model hf \\
 --output_path $OUTPUT_PATH \\
 --apply_chat_template \\
 ```
+
+For advanced usage instructions, please inspect the [CzechBench README on GitHub](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench#readme)
+or the official [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) documentation.
 
 
 ### 3. Upload results to Leaderboard
-
+Inside the `$OUTPUT_PATH` directory, you can find the file `results.json`.
+To submit your evaluation results to our leaderboard, please visit the "Submit here!" section above and upload your `results.json` file.
 
 """
 
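To complement the new submission instructions in this hunk, here is a small, hypothetical Python sketch for eyeballing the `results.json` file before uploading it. The top-level "results" dict keyed by task ID is the usual LM Evaluation Harness layout, but exact keys vary between harness versions, so treat this as an assumption rather than a guaranteed schema.

```python
import json
from pathlib import Path

# $OUTPUT_PATH from the lm_eval command above; adjust to your own path.
results_file = Path("output") / "results.json"

with results_file.open(encoding="utf-8") as f:
    report = json.load(f)

# Assumed harness layout: a "results" dict keyed by task ID
# (e.g. "agree_cs", "mmlu_cs"), each holding metric-name -> value pairs.
for task_id, metrics in report.get("results", {}).items():
    numeric = {name: value for name, value in metrics.items() if isinstance(value, (int, float))}
    print(task_id, numeric)
```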
src/display/utils.py

@@ -47,30 +47,53 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
 """
 
-
+
+auto_eval_column_dict.append(["eval_name", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
-auto_eval_column_dict.append(["hf_model_id", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["agree_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["anli_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["arc_challenge_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["arc_easy_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["belebele_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["ctkfacts_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["czechnews_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["fb_comments_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["gsm8k_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["klokanek_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["mall_reviews_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["mmlu_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["sqad_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["subjectivity_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["truthfulqa_cs", ColumnContent, ColumnContent("
+auto_eval_column_dict.append(["hf_model_id", ColumnContent, ColumnContent("Model link (temporary)", "str", True)])
+auto_eval_column_dict.append(["agree_cs", ColumnContent, ColumnContent("AGREE", "number", True)])
+auto_eval_column_dict.append(["anli_cs", ColumnContent, ColumnContent("ANLI", "number", True)])
+auto_eval_column_dict.append(["arc_challenge_cs", ColumnContent, ColumnContent("ARC-Challenge", "number", True)])
+auto_eval_column_dict.append(["arc_easy_cs", ColumnContent, ColumnContent("ARC-Easy", "number", True)])
+auto_eval_column_dict.append(["belebele_cs", ColumnContent, ColumnContent("Belebele", "number", True)])
+auto_eval_column_dict.append(["ctkfacts_cs", ColumnContent, ColumnContent("CTKFacts", "number", True)])
+auto_eval_column_dict.append(["czechnews_cs", ColumnContent, ColumnContent("Czech News", "number", True)])
+auto_eval_column_dict.append(["fb_comments_cs", ColumnContent, ColumnContent("Facebook Comments", "number", True)])
+auto_eval_column_dict.append(["gsm8k_cs", ColumnContent, ColumnContent("GSM8K", "number", True)])
+auto_eval_column_dict.append(["klokanek_cs", ColumnContent, ColumnContent("Klokanek", "number", True)])
+auto_eval_column_dict.append(["mall_reviews_cs", ColumnContent, ColumnContent("Mall Reviews", "number", True)])
+auto_eval_column_dict.append(["mmlu_cs", ColumnContent, ColumnContent("MMLU", "number", True)])
+auto_eval_column_dict.append(["sqad_cs", ColumnContent, ColumnContent("SQAD", "number", True)])
+auto_eval_column_dict.append(["subjectivity_cs", ColumnContent, ColumnContent("Subjectivity", "number", True)])
+auto_eval_column_dict.append(["truthfulqa_cs", ColumnContent, ColumnContent("TruthfulQA", "number", True)])
 
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 
+HEADER_MAP = {
+    "eval_name": "Model",
+    "precision": "Precision",
+    "hf_model_id": "Model link (temporary)",
+    "agree_cs": "AGREE",
+    "anli_cs": "ANLI",
+    "arc_challenge_cs": "ARC-Challenge",
+    "arc_easy_cs": "ARC-Easy",
+    "belebele_cs": "Belebele",
+    "ctkfacts_cs": "CTKFacts",
+    "czechnews_cs": "Czech News",
+    "fb_comments_cs": "Facebook Comments",
+    "gsm8k_cs": "GSM8K",
+    "klokanek_cs": "Klokanek",
+    "mall_reviews_cs": "Mall Reviews",
+    "mmlu_cs": "MMLU",
+    "sqad_cs": "SQAD",
+    "subjectivity_cs": "Subjectivity",
+    "truthfulqa_cs": "TruthfulQA",
+}
+
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
@@ -120,6 +143,9 @@ class WeightType(Enum):
 
 
 class Precision(Enum):
+    other = ModelDetails("other")
+    float64 = ModelDetails("float64")
+    float32 = ModelDetails("float32")
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     qt_8bit = ModelDetails("8bit")
@@ -128,17 +154,21 @@ class Precision(Enum):
     Unknown = ModelDetails("?")
 
     def from_str(precision):
-        if precision in ["torch.
+        if precision in ["torch.float64", "torch.double", "float64"]:
+            return Precision.float64
+        if precision in ["torch.float32", "torch.float", "float32"]:
+            return Precision.float32
+        if precision in ["torch.float16", "torch.half", "float16"]:
             return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
             return Precision.bfloat16
-        if precision in ["8bit"]:
+        if precision in ["8bit", "int8"]:
            return Precision.qt_8bit
-        if precision in ["4bit"]:
+        if precision in ["4bit", "int4"]:
            return Precision.qt_4bit
        if precision in ["GPTQ", "None"]:
            return Precision.qt_GPTQ
-        return Precision.
+        return Precision.other
 
 
 # Column selection
@@ -150,7 +180,7 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+BENCHMARK_COLS = [HEADER_MAP[t.value.col_name] for t in Tasks]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
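A self-contained sketch of the behaviour these hunks add may help: the extended `Precision.from_str` normalizer and the `HEADER_MAP` lookup. `ModelDetails` is simplified to a single field and only a few map entries are reproduced, so this is an illustration of the logic, not the full module.

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class ModelDetails:
    # Simplified stand-in; the real ModelDetails in src/display/utils.py has more fields.
    name: str


class Precision(Enum):
    other = ModelDetails("other")
    float64 = ModelDetails("float64")
    float32 = ModelDetails("float32")
    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
    qt_8bit = ModelDetails("8bit")
    qt_4bit = ModelDetails("4bit")
    qt_GPTQ = ModelDetails("GPTQ")
    Unknown = ModelDetails("?")

    @staticmethod
    def from_str(precision: str) -> "Precision":
        # Normalizes the many spellings a submitter might report.
        if precision in ["torch.float64", "torch.double", "float64"]:
            return Precision.float64
        if precision in ["torch.float32", "torch.float", "float32"]:
            return Precision.float32
        if precision in ["torch.float16", "torch.half", "float16"]:
            return Precision.float16
        if precision in ["torch.bfloat16", "bfloat16"]:
            return Precision.bfloat16
        if precision in ["8bit", "int8"]:
            return Precision.qt_8bit
        if precision in ["4bit", "int4"]:
            return Precision.qt_4bit
        if precision in ["GPTQ", "None"]:
            return Precision.qt_GPTQ
        return Precision.other  # anything unrecognized falls back to "other"


# A few HEADER_MAP entries: internal result keys -> display column headers.
HEADER_MAP = {
    "agree_cs": "AGREE",
    "gsm8k_cs": "GSM8K",
    "truthfulqa_cs": "TruthfulQA",
}

print(Precision.from_str("torch.half").value.name)        # float16
print(Precision.from_str("fp8").value.name)               # other (unrecognized)
print([HEADER_MAP[k] for k in ["agree_cs", "gsm8k_cs"]])  # ['AGREE', 'GSM8K']
```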
src/populate.py

@@ -4,7 +4,8 @@ import numpy as np
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model, model_hyperlink
-from src.display.
+from src.display.formatting import has_no_nan_values, make_clickable_model
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, HEADER_MAP
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
@@ -13,7 +14,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     #all_data_json = [v.to_dict() for v in raw_data]
     df = pd.DataFrame.from_records(raw_data)
     #df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df.rename(columns=
+    df = df.rename(columns=HEADER_MAP)
     df = df[cols].round(decimals=2)
     df.replace(r'\s+', np.nan, regex=True)
     # filter out if any of the benchmarks have not been produced
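Since `get_leaderboard_df` now routes raw result keys through `HEADER_MAP` before selecting display columns, here is a hedged pandas sketch of that rename-then-select pattern with made-up records and a subset of columns; the completeness filter below uses plain pandas in place of the Space's `has_no_nan_values` helper.

```python
import numpy as np
import pandas as pd

# Illustrative HEADER_MAP subset and fake raw records; the real records come
# from src.leaderboard.read_evals.get_raw_eval_results.
HEADER_MAP = {"eval_name": "Model", "precision": "Precision", "agree_cs": "AGREE", "gsm8k_cs": "GSM8K"}
raw_data = [
    {"eval_name": "model-a", "precision": "bfloat16", "agree_cs": 0.712345, "gsm8k_cs": 0.3141},
    {"eval_name": "model-b", "precision": "float16", "agree_cs": 0.6543, "gsm8k_cs": np.nan},
]
cols = ["Model", "Precision", "AGREE", "GSM8K"]

df = pd.DataFrame.from_records(raw_data)
df = df.rename(columns=HEADER_MAP)   # internal task keys -> display headers
df = df[cols].round(decimals=2)      # keep only display columns, round scores

# Same completeness idea as populate.py: drop rows missing any benchmark score.
benchmark_cols = ["AGREE", "GSM8K"]
df = df[df[benchmark_cols].notna().all(axis=1)]
print(df)
```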