Bram Vanroy committed on
Commit
5693ee5
·
1 Parent(s): 0ae29b2

revision for Dutch only

Browse files
Files changed (30) hide show
  1. .gitignore +107 -126
  2. app.py +58 -91
  3. content.py +15 -23
  4. css.py +0 -13
  5. evals/arc/arc_nl_Llama-2-7b-chat-hf.json +6 -6
  6. evals/arc/arc_nl_Llama-2-7b-hf.json +6 -6
  7. evals/arc/{arc_nl_Mistral-7B-v0.1.json → arc_nl_Orca-2-7b.json} +6 -6
  8. evals/{truthfulqa/truthfulqa_nl-Llama-2-13b-hf.json → arc/arc_nl_gpt2-large-dutch.json} +8 -8
  9. evals/arc/arc_nl_gpt2-medium-dutch.json +23 -0
  10. evals/arc/arc_nl_zephyr-7b-beta.json +6 -6
  11. evals/hellaswag/hellaswag_nl_Llama-2-7b-chat-hf.json +6 -6
  12. evals/hellaswag/hellaswag_nl_Llama-2-7b-hf.json +6 -6
  13. evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json +6 -6
  14. evals/hellaswag/{hellaswag_nl_zephyr-7b-beta.json → hellaswag_nl_Orca-2-7b.json} +6 -6
  15. evals/hellaswag/hellaswag_nl_gpt2-large-dutch.json +23 -0
  16. evals/hellaswag/hellaswag_nl_gpt2-medium-dutch.json +23 -0
  17. evals/hellaswag/hellaswag_nl_neural-chat-7b-v3-1.json +23 -0
  18. evals/{truthfulqa/truthfulqa_nl_Mistral-7B-v0.1.json → mmlu/mmlu_nl_Mistral-7B-v0.1.json} +8 -8
  19. evals/mmlu/mmlu_nl_gpt2-large-dutch.json +23 -0
  20. evals/mmlu/mmlu_nl_gpt2-medium-dutch.json +23 -0
  21. evals/truthfulqa/truthfulqa_nl_Llama-2-13b-hf.json +0 -23
  22. evals/truthfulqa/truthfulqa_nl_Llama-2-7b-chat-hf.json +6 -6
  23. evals/truthfulqa/truthfulqa_nl_Llama-2-7b-hf.json +4 -4
  24. evals/truthfulqa/{truthfulqa_nl-falcon-40b-ft-alpaca-dolly-dutch.json → truthfulqa_nl_Orca-2-7b.json} +6 -6
  25. evals/truthfulqa/truthfulqa_nl_falcon-40b-ft-alpaca-dolly-dutch.json +0 -23
  26. evals/truthfulqa/truthfulqa_nl_falcon-40b.json +0 -23
  27. evals/truthfulqa/{truthfulqa_nl-llama2-13b-ft-mc4_nl_cleaned_tiny.json → truthfulqa_nl_gpt2-large-dutch.json} +6 -6
  28. evals/truthfulqa/{truthfulqa_nl-falcon-40b.json → truthfulqa_nl_gpt2-medium-dutch.json} +6 -6
  29. evals/truthfulqa/truthfulqa_nl_llama2-13b-ft-mc4_nl_cleaned_tiny.json +0 -23
  30. evals/truthfulqa/truthfulqa_nl_zephyr-7b-beta.json +0 -23
.gitignore CHANGED
@@ -1,92 +1,42 @@
1
- *.txt
2
- !src/**/*.txt
3
- runs*
4
- wandb*
5
- Pipfile*
6
- data/*
7
- muss
8
- models/*
9
- *config.json
10
-
11
- # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
12
- # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
13
-
14
- .idea/
15
- # User-specific stuff
16
- .idea/**/workspace.xml
17
- .idea/**/tasks.xml
18
- .idea/**/usage.statistics.xml
19
- .idea/**/dictionaries
20
- .idea/**/shelf
21
-
22
- # AWS User-specific
23
- .idea/**/aws.xml
24
-
25
- # Generated files
26
- .idea/**/contentModel.xml
27
-
28
- # Sensitive or high-churn files
29
- .idea/**/dataSources/
30
- .idea/**/dataSources.ids
31
- .idea/**/dataSources.local.xml
32
- .idea/**/sqlDataSources.xml
33
- .idea/**/dynamic.xml
34
- .idea/**/uiDesigner.xml
35
- .idea/**/dbnavigator.xml
36
-
37
- # Gradle
38
- .idea/**/gradle.xml
39
- .idea/**/libraries
40
-
41
- # Gradle and Maven with auto-import
42
- # When using Gradle or Maven with auto-import, you should exclude module files,
43
- # since they will be recreated, and may cause churn. Uncomment if using
44
- # auto-import.
45
- # .idea/artifacts
46
- # .idea/compiler.xml
47
- # .idea/jarRepositories.xml
48
- # .idea/modules.xml
49
- # .idea/*.iml
50
- # .idea/modules
51
- # *.iml
52
- # *.ipr
53
-
54
- # CMake
55
- cmake-build-*/
56
 
57
- # Mongo Explorer plugin
58
- .idea/**/mongoSettings.xml
59
 
60
- # File-based project format
61
- *.iws
62
-
63
- # IntelliJ
64
- out/
65
 
66
- # mpeltonen/sbt-idea plugin
67
- .idea_modules/
68
 
69
- # JIRA plugin
70
- atlassian-ide-plugin.xml
71
 
72
- # Cursive Clojure plugin
73
- .idea/replstate.xml
74
 
75
- # SonarLint plugin
76
- .idea/sonarlint/
 
 
 
 
77
 
78
- # Crashlytics plugin (for Android Studio and IntelliJ)
79
- com_crashlytics_export_strings.xml
80
- crashlytics.properties
81
- crashlytics-build.properties
82
- fabric.properties
83
 
84
- # Editor-based Rest Client
85
- .idea/httpRequests
86
 
87
- # Android studio 3.1+ serialized cache file
88
- .idea/caches/build_file_checksums.ser
 
89
 
 
 
90
 
91
  # Byte-compiled / optimized / DLL files
92
  __pycache__/
@@ -110,7 +60,6 @@ parts/
110
  sdist/
111
  var/
112
  wheels/
113
- share/python-wheels/
114
  *.egg-info/
115
  .installed.cfg
116
  *.egg
@@ -129,17 +78,14 @@ pip-delete-this-directory.txt
129
  # Unit test / coverage reports
130
  htmlcov/
131
  .tox/
132
- .nox/
133
  .coverage
134
  .coverage.*
135
  .cache
136
  nosetests.xml
137
  coverage.xml
138
  *.cover
139
- *.py,cover
140
  .hypothesis/
141
  .pytest_cache/
142
- cover/
143
 
144
  # Translations
145
  *.mo
@@ -149,7 +95,6 @@ cover/
149
  *.log
150
  local_settings.py
151
  db.sqlite3
152
- db.sqlite3-journal
153
 
154
  # Flask stuff:
155
  instance/
@@ -162,41 +107,16 @@ instance/
162
  docs/_build/
163
 
164
  # PyBuilder
165
- .pybuilder/
166
  target/
167
 
168
  # Jupyter Notebook
169
  .ipynb_checkpoints
170
 
171
- # IPython
172
- profile_default/
173
- ipython_config.py
174
-
175
  # pyenv
176
- # For a library or package, you might want to ignore these files since the code is
177
- # intended to run in multiple environments; otherwise, check them in:
178
- # .python-version
179
-
180
- # pipenv
181
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
182
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
183
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
184
- # install all needed dependencies.
185
- #Pipfile.lock
186
-
187
- # poetry
188
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
189
- # This is especially recommended for binary packages to ensure reproducibility, and is more
190
- # commonly ignored for libraries.
191
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
192
- #poetry.lock
193
-
194
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow
195
- __pypackages__/
196
-
197
- # Celery stuff
198
  celerybeat-schedule
199
- celerybeat.pid
200
 
201
  # SageMath parsed files
202
  *.sage.py
@@ -222,21 +142,82 @@ venv.bak/
222
 
223
  # mypy
224
  .mypy_cache/
225
- .dmypy.json
226
- dmypy.json
227
 
228
- # Pyre type checker
229
- .pyre/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
- # pytype static type analyzer
232
- .pytype/
 
 
 
233
 
234
- # Cython debug symbols
235
- cython_debug/
236
 
237
- # PyCharm
238
- # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can
239
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
240
- # and can be added to the global gitignore or merged into this file. For a more nuclear
241
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
242
- #.idea/
 
1
+ run-backend.ps
2
+ .eslintrc.js
3
+ .venv
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ # ignore compiled styles
6
+ *.css
7
 
8
+ # dependencies
9
+ **/node_modules/
10
+ **/.pnp
11
+ *.pnp.js
 
12
 
13
+ # testing
14
+ /coverage
15
 
16
+ # VSCode
17
+ **/.vscode/
18
 
19
+ # production
20
+ **/build/
21
 
22
+ # misc
23
+ .DS_Store
24
+ .env.local
25
+ .env.development.local
26
+ .env.test.local
27
+ .env.production.local
28
 
29
+ npm-debug.log*
30
+ yarn-debug.log*
31
+ yarn-error.log*
 
 
32
 
 
 
33
 
34
+ # python
35
+ data/
36
+ Pipfile*
37
 
38
+ # .idea (JetBrains)
39
+ **/.idea/
40
 
41
  # Byte-compiled / optimized / DLL files
42
  __pycache__/
 
60
  sdist/
61
  var/
62
  wheels/
 
63
  *.egg-info/
64
  .installed.cfg
65
  *.egg
 
78
  # Unit test / coverage reports
79
  htmlcov/
80
  .tox/
 
81
  .coverage
82
  .coverage.*
83
  .cache
84
  nosetests.xml
85
  coverage.xml
86
  *.cover
 
87
  .hypothesis/
88
  .pytest_cache/
 
89
 
90
  # Translations
91
  *.mo
 
95
  *.log
96
  local_settings.py
97
  db.sqlite3
 
98
 
99
  # Flask stuff:
100
  instance/
 
107
  docs/_build/
108
 
109
  # PyBuilder
 
110
  target/
111
 
112
  # Jupyter Notebook
113
  .ipynb_checkpoints
114
 
 
 
 
 
115
  # pyenv
116
+ .python-version
117
+
118
+ # celery beat schedule file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  celerybeat-schedule
 
120
 
121
  # SageMath parsed files
122
  *.sage.py
 
142
 
143
  # mypy
144
  .mypy_cache/
145
+ test.py
 
146
 
147
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
148
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
149
+
150
+ # User-specific stuff
151
+ .idea/**/workspace.xml
152
+ .idea/**/tasks.xml
153
+ .idea/**/usage.statistics.xml
154
+ .idea/**/dictionaries
155
+ .idea/**/shelf
156
+
157
+ # AWS User-specific
158
+ .idea/**/aws.xml
159
+
160
+ # Generated files
161
+ .idea/**/contentModel.xml
162
+
163
+ # Sensitive or high-churn files
164
+ .idea/**/dataSources/
165
+ .idea/**/dataSources.ids
166
+ .idea/**/dataSources.local.xml
167
+ .idea/**/sqlDataSources.xml
168
+ .idea/**/dynamic.xml
169
+ .idea/**/uiDesigner.xml
170
+ .idea/**/dbnavigator.xml
171
+
172
+ # Gradle
173
+ .idea/**/gradle.xml
174
+ .idea/**/libraries
175
+
176
+ # Gradle and Maven with auto-import
177
+ # When using Gradle or Maven with auto-import, you should exclude module files,
178
+ # since they will be recreated, and may cause churn. Uncomment if using
179
+ # auto-import.
180
+ # .idea/artifacts
181
+ # .idea/compiler.xml
182
+ # .idea/jarRepositories.xml
183
+ # .idea/modules.xml
184
+ # .idea/*.iml
185
+ # .idea/modules
186
+ # *.iml
187
+ # *.ipr
188
+
189
+ # CMake
190
+ cmake-build-*/
191
+
192
+ # Mongo Explorer plugin
193
+ .idea/**/mongoSettings.xml
194
+
195
+ # File-based project format
196
+ *.iws
197
+
198
+ # IntelliJ
199
+ out/
200
+
201
+ # mpeltonen/sbt-idea plugin
202
+ .idea_modules/
203
+
204
+ # JIRA plugin
205
+ atlassian-ide-plugin.xml
206
+
207
+ # Cursive Clojure plugin
208
+ .idea/replstate.xml
209
+
210
+ # SonarLint plugin
211
+ .idea/sonarlint/
212
 
213
+ # Crashlytics plugin (for Android Studio and IntelliJ)
214
+ com_crashlytics_export_strings.xml
215
+ crashlytics.properties
216
+ crashlytics-build.properties
217
+ fabric.properties
218
 
219
+ # Editor-based Rest Client
220
+ .idea/httpRequests
221
 
222
+ # Android studio 3.1+ serialized cache file
223
+ .idea/caches/build_file_checksums.ser
 
 
 
 
app.py CHANGED
@@ -2,12 +2,13 @@ import json
2
  from collections import defaultdict
3
  from pathlib import Path
4
 
 
5
  import pandas as pd
6
  import gradio as gr
 
 
7
 
8
  from content import *
9
- from css import *
10
- import glob
11
 
12
  ARC = "arc"
13
  HELLASWAG = "hellaswag"
@@ -17,51 +18,17 @@ BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
17
 
18
  METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
19
 
20
- LANGS = "ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh".split(",")
21
-
22
- LANG_NAME = {
23
- "ar": "Arabic",
24
- "bn": "Bengali",
25
- "ca": "Catalan",
26
- "da": "Danish",
27
- "de": "German",
28
- "es": "Spanish",
29
- "eu": "Basque",
30
- "fr": "French",
31
- "gu": "Gujarati",
32
- "hi": "Hindi",
33
- "hr": "Croatian",
34
- "hu": "Hungarian",
35
- "hy": "Armenian",
36
- "id": "Indonesian",
37
- "it": "Italian",
38
- "kn": "Kannada",
39
- "ml": "Malayalam",
40
- "mr": "Marathi",
41
- "ne": "Nepali",
42
- "nl": "Dutch",
43
- "pt": "Portuguese",
44
- "ro": "Romanian",
45
- "ru": "Russian",
46
- "sk": "Slovak",
47
- "sr": "Serbian",
48
- "sv": "Swedish",
49
- "ta": "Tamil",
50
- "te": "Telugu",
51
- "uk": "Ukrainian",
52
- "vi": "Vietnamese",
53
- "zh": "Chinese",
54
- }
55
-
56
-
57
- def collect_results():
58
  performance_dict = defaultdict(dict)
59
- pretrained_models = set()
60
  for pfin in Path("evals").rglob("*.json"):
61
  data = json.loads(pfin.read_text(encoding="utf-8"))
62
- if "results" not in data:
63
- continue
64
- if "config" not in data:
65
  continue
66
  results = data["results"]
67
  config = data["config"]
@@ -74,7 +41,6 @@ def collect_results():
74
  continue
75
  pretrained = pretrained[0].split("=")[1]
76
  pretrained = pretrained.split("/")[-1]
77
- pretrained_models.add(pretrained)
78
 
79
  for lang_task, perfs in results.items():
80
  task, lang = lang_task.split("_")
@@ -85,33 +51,46 @@ def collect_results():
85
  p = round(perfs[metric] * 100, 1)
86
  performance_dict[(pretrained, lang)][task] = p
87
 
88
- return performance_dict, pretrained_models
89
 
90
 
91
- def get_leaderboard_df(performance_dict, pretrained_models):
92
- df = list()
 
 
 
 
 
 
93
  for (pretrained, lang), perfs in performance_dict.items():
94
- lang_name = LANG_NAME[lang]
95
  arc_perf = perfs.get(ARC, 0.0)
96
  hellaswag_perf = perfs.get(HELLASWAG, 0.0)
97
  mmlu_perf = perfs.get(MMLU, 0.0)
98
  truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
99
 
100
  avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
101
- notes = " ".join([pretrained, lang_name])
102
- row = [pretrained, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf, notes]
103
- df.append(row)
104
 
105
- df = pd.DataFrame.from_records(df, columns=COLS)
106
  df = df.sort_values(by=[AVERAGE_COL], ascending=False)
107
- df = df[COLS]
108
-
109
  return df
110
 
111
 
112
- def search_table(df, query):
113
- filtered_df = df[df[NOTES_COL].str.contains(query, case=False)]
114
- return filtered_df
 
 
 
 
 
 
 
 
 
 
 
115
 
116
 
117
  MODEL_COL = "Model"
@@ -120,43 +99,31 @@ ARC_COL = "ARC (25-shot)"
120
  HELLASWAG_COL = "HellaSwag (10-shot)️"
121
  MMLU_COL = "MMLU (5-shot)"
122
  TRUTHFULQA_COL = "TruthfulQA (0-shot)"
123
- NOTES_COL = "Notes" # For search only
124
-
125
- COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL, NOTES_COL]
126
- TYPES = ["str", "number", "number", "number", "number", "number", "str"]
127
 
128
- args = collect_results()
129
- original_df = get_leaderboard_df(*args)
130
 
131
- demo = gr.Blocks(css=CUSTOM_CSS)
132
- with demo:
 
 
133
  gr.HTML(TITLE)
134
- gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
135
- gr.Markdown(HOW_TO, elem_classes="markdown-text")
136
-
137
- with gr.Box():
138
- search_bar = gr.Textbox(placeholder="Search models and languages...", show_label=False, elem_id="search-bar")
139
-
140
- leaderboard_table = gr.components.Dataframe(
141
- value=original_df,
142
- headers=COLS,
143
- datatype=TYPES,
144
- max_rows=5,
145
- elem_id="leaderboard-table",
146
- )
147
-
148
- # # Dummy leaderboard for handling the case when the user uses backspace key
149
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
150
- value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
151
- )
152
-
153
- search_bar.change(
154
- search_table,
155
- [hidden_leaderboard_table_for_search, search_bar],
156
- leaderboard_table,
157
- )
158
 
159
  gr.Markdown(CREDIT, elem_classes="markdown-text")
160
  gr.Markdown(CITATION, elem_classes="markdown-text")
161
 
162
- demo.launch()
 
 
 
2
  from collections import defaultdict
3
  from pathlib import Path
4
 
5
+ import numpy as np
6
  import pandas as pd
7
  import gradio as gr
8
+ from pandas import DataFrame
9
+ from pandas.io.formats.style import Styler
10
 
11
  from content import *
 
 
12
 
13
  ARC = "arc"
14
  HELLASWAG = "hellaswag"
 
18
 
19
  METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
20
 
21
+
22
+ def collect_results() -> dict[tuple[str, str], dict[str, float]]:
23
+ """
24
+ Collects results from the evals folder and returns a dictionary of results
25
+ :return: a dictionary of results where the keys are tuples of (model_name, language) and the values are
26
+ dictionaries of the form {benchmark_name: performance_score}
27
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  performance_dict = defaultdict(dict)
 
29
  for pfin in Path("evals").rglob("*.json"):
30
  data = json.loads(pfin.read_text(encoding="utf-8"))
31
+ if "results" not in data or "config" not in data:
 
 
32
  continue
33
  results = data["results"]
34
  config = data["config"]
 
41
  continue
42
  pretrained = pretrained[0].split("=")[1]
43
  pretrained = pretrained.split("/")[-1]
 
44
 
45
  for lang_task, perfs in results.items():
46
  task, lang = lang_task.split("_")
 
51
  p = round(perfs[metric] * 100, 1)
52
  performance_dict[(pretrained, lang)][task] = p
53
 
54
+ return dict(performance_dict)
55
 
56
 
57
+ def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float]]) -> DataFrame:
58
+ """
59
+ Builds a dataframe from the performance dictionary
60
+ :param performance_dict: a dictionary of results where the keys are tuples of (model_name, language) and the values are
61
+ dictionaries of the form {benchmark_name: performance_score}
62
+ :return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
63
+ """
64
+ data = []
65
  for (pretrained, lang), perfs in performance_dict.items():
 
66
  arc_perf = perfs.get(ARC, 0.0)
67
  hellaswag_perf = perfs.get(HELLASWAG, 0.0)
68
  mmlu_perf = perfs.get(MMLU, 0.0)
69
  truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
70
 
71
  avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
72
+ row = [pretrained, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
73
+ data.append(row)
 
74
 
75
+ df = pd.DataFrame.from_records(data, columns=COLS)
76
  df = df.sort_values(by=[AVERAGE_COL], ascending=False)
 
 
77
  return df
78
 
79
 
80
+ def style_df(df: DataFrame) -> Styler:
81
+ """
82
+ Styles the dataframe by rounding to two decimals and putting the max value in bold per column
83
+ :param df: the dataframe to style
84
+ :return: the Styler
85
+ """
86
+ styler = df.style.format("{:.2f}", subset=df.columns[1:])
87
+
88
+ def highlight_max(col):
89
+ return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
90
+
91
+ styler = styler.apply(highlight_max, axis=1, subset=df.columns[1:])
92
+
93
+ return styler
94
 
95
 
96
  MODEL_COL = "Model"
 
99
  HELLASWAG_COL = "HellaSwag (10-shot)️"
100
  MMLU_COL = "MMLU (5-shot)"
101
  TRUTHFULQA_COL = "TruthfulQA (0-shot)"
 
 
 
 
102
 
103
+ COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
104
+ TYPES = ["str", "number", "number", "number", "number", "number"]
105
 
106
+ results = collect_results()
107
+ original_df = build_performance_df(results)
108
+ styled_df = style_df(original_df)
109
+ with gr.Blocks() as demo:
110
  gr.HTML(TITLE)
111
+ gr.Markdown(INTRO_TEXT)
112
+
113
+ gr.Markdown("## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
114
+ gr.components.Dataframe(
115
+ value=original_df,
116
+ headers=COLS,
117
+ datatype=TYPES,
118
+ elem_id="leaderboard-table",
119
+ )
120
+
121
+ gr.Markdown("## LaTeX")
122
+ gr.Code(styled_df.to_latex(convert_css=True))
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
  gr.Markdown(CREDIT, elem_classes="markdown-text")
125
  gr.Markdown(CITATION, elem_classes="markdown-text")
126
 
127
+ if __name__ == '__main__':
128
+ demo.launch()
129
+
content.py CHANGED
@@ -1,44 +1,29 @@
1
- TITLE = '<h1 align="center" id="space-title">Open Multilingual LLM Evaluation Leaderboard</h1>'
2
 
3
  INTRO_TEXT = f"""
4
  ## About
5
 
6
- This leaderboard tracks progress and ranks performance of large language models (LLMs) developed for different languages,
7
- emphasizing on non-English languages to democratize benefits of LLMs to broader society.
8
- Our current leaderboard provides evaluation data for 29 languages, i.e.,
9
- Arabic, Armenian, Basque, Bengali, Catalan, Chinese, Croatian, Danish, Dutch,
10
- French, German, Gujarati, Hindi, Hungarian, Indonesian, Italian, Kannada, Malayalam,
11
- Marathi, Nepali, Portuguese, Romanian, Russian, Serbian, Slovak, Spanish, Swedish,
12
- Tamil, Telugu, Ukrainian, and Vietnamese, that will be expanded along the way.
13
- Both multilingual and language-specific LLMs are welcome in this leaderboard.
14
- We currently evaluate models over four benchmarks:
15
 
16
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot)
17
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot)
18
  - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot)
19
  - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot)
20
 
21
- The evaluation data was translated into these languages using ChatGPT (gpt-35-turbo).
22
 
23
  """
24
 
25
- HOW_TO = f"""
26
- ## How to list your model performance on this leaderboard:
27
-
28
- Run the evaluation of your model using this repo: <a href="https://github.com/laiviet/lm-evaluation-harness" target="_blank">https://github.com/laiviet/lm-evaluation-harness</a>.
29
-
30
- And then, push the evaluation log and make a pull request.
31
- """
32
-
33
  CREDIT = f"""
34
  ## Credit
35
 
36
- To make this website, we use the following resources:
37
 
38
  - Datasets (AI2_ARC, HellaSwag, MMLU, TruthfulQA)
39
- - Funding and GPU access (Adobe Research)
40
  - Evaluation code (EleutherAI's lm_evaluation_harness repo)
41
  - Leaderboard code (Huggingface4's open_llm_leaderboard repo)
 
42
 
43
  """
44
 
@@ -46,12 +31,19 @@ To make this website, we use the following resources:
46
  CITATION = f"""
47
  ## Citation
48
 
49
- ```
50
 
 
 
 
 
 
 
 
 
51
  @misc{{lai2023openllmbenchmark,
52
  author = {{Viet Lai and Nghia Trung Ngo and Amir Pouran Ben Veyseh and Franck Dernoncourt and Thien Huu Nguyen}},
53
  title={{Open Multilingual LLM Evaluation Leaderboard}},
54
  year={{2023}}
55
  }}
56
  ```
57
- """
 
1
+ TITLE = '<h1 align="center" id="space-title">Open Multilingual LLM Evaluation Leaderboard (Dutch only)</h1>'
2
 
3
  INTRO_TEXT = f"""
4
  ## About
5
 
6
+ This is a fork of the [Open Multilingual LLM Evaluation Leaderboard](https://huggingface.co/spaces/uonlp/open_multilingual_llm_leaderboard), but restricted to only Dutch models and augmented with additional model results.
7
+ We test the models on the following benchmarks **for the Dutch version only!!**, which have been translated into Dutch automatically by the original authors of the Open Multilingual LLM Evaluation Leaderboard with `gpt-35-turbo`.
 
 
 
 
 
 
 
8
 
9
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot)
10
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot)
11
  - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot)
12
  - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot)
13
 
14
+ I do not maintain those datasets; I only run benchmarks and add the results to this space. For questions regarding the test sets or running them yourself, see [the original GitHub repository](https://github.com/laiviet/lm-evaluation-harness).
15
 
16
  """
17
 
 
 
 
 
 
 
 
 
18
  CREDIT = f"""
19
  ## Credit
20
 
21
+ This leaderboard has borrowed heavily from the following sources:
22
 
23
  - Datasets (AI2_ARC, HellaSwag, MMLU, TruthfulQA)
 
24
  - Evaluation code (EleutherAI's lm_evaluation_harness repo)
25
  - Leaderboard code (Huggingface4's open_llm_leaderboard repo)
26
+ - The multilingual version of the leaderboard (uonlp's open_multilingual_llm_leaderboard repo)
27
 
28
  """
29
 
 
31
  CITATION = f"""
32
  ## Citation
33
 
 
34
 
35
+ If you use or cite the Dutch benchmark results or this specific leaderboard page, please cite the following paper:
36
+
37
+ TBD
38
+
39
+
40
+ If you use the multilingual benchmarks, please cite the following paper:
41
+
42
+ ```bibtex
43
  @misc{{lai2023openllmbenchmark,
44
  author = {{Viet Lai and Nghia Trung Ngo and Amir Pouran Ben Veyseh and Franck Dernoncourt and Thien Huu Nguyen}},
45
  title={{Open Multilingual LLM Evaluation Leaderboard}},
46
  year={{2023}}
47
  }}
48
  ```
49
+ """
css.py DELETED
@@ -1,13 +0,0 @@
1
- CUSTOM_CSS = """
2
- /* Hides the final column */
3
- table td:last-child,
4
- table th:last-child {
5
- display: none;
6
- }
7
- # table td:first-child,
8
- # table th:first-child {
9
- # max-width: 400px;
10
- # overflow: auto;
11
- # white-space: nowrap;
12
- # }
13
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_nl_Llama-2-7b-chat-hf.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "arc_nl": {
4
- "acc": 0.3609923011120616,
5
- "acc_stderr": 0.014053373664144792,
6
- "acc_norm": 0.3618477331052181,
7
- "acc_norm_stderr": 0.014060593893704966
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "arc_nl": {
4
+ "acc": 0.3550042771599658,
5
+ "acc_stderr": 0.014001474982174305,
6
+ "acc_norm": 0.3609923011120616,
7
+ "acc_norm_stderr": 0.014053373664144789
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/arc/arc_nl_Llama-2-7b-hf.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "arc_nl": {
4
- "acc": 0.33704020530367834,
5
- "acc_stderr": 0.013831300903580639,
6
- "acc_norm": 0.3567151411462789,
7
- "acc_norm_stderr": 0.014016546277185005
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "arc_nl": {
4
+ "acc": 0.33447390932420873,
5
+ "acc_stderr": 0.013805185437125271,
6
+ "acc_norm": 0.3558597091531223,
7
+ "acc_norm_stderr": 0.014009035017396714
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/arc/{arc_nl_Mistral-7B-v0.1.json → arc_nl_Orca-2-7b.json} RENAMED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "arc_nl": {
4
- "acc": 0.42087254063301965,
5
- "acc_stderr": 0.014445778557368833,
6
- "acc_norm": 0.4294268605645851,
7
- "acc_norm_stderr": 0.014483677397351059
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "arc_nl": {
4
+ "acc": 0.3661248930710009,
5
+ "acc_stderr": 0.014095972894279241,
6
+ "acc_norm": 0.3678357570573139,
7
+ "acc_norm_stderr": 0.014109788842173
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=microsoft/Orca-2-7b,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/{truthfulqa/truthfulqa_nl-Llama-2-13b-hf.json → arc/arc_nl_gpt2-large-dutch.json} RENAMED
@@ -1,19 +1,19 @@
1
  {
2
  "results": {
3
- "truthfulqa_nl": {
4
- "mc1": 0.2764331210191083,
5
- "mc1_stderr": 0.01597262688062874,
6
- "mc2": 0.4103755310313891,
7
- "mc2_stderr": 0.014811313488625848
8
  }
9
  },
10
  "versions": {
11
- "truthfulqa_nl": 1
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=meta-llama/Llama-2-13b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
+ "arc_nl": {
4
+ "acc": 0.20102651839178784,
5
+ "acc_stderr": 0.011726581781869408,
6
+ "acc_norm": 0.24037639007698888,
7
+ "acc_norm_stderr": 0.01250327289928353
8
  }
9
  },
10
  "versions": {
11
+ "arc_nl": 0
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=yhavinga/gpt2-large-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/arc/arc_nl_gpt2-medium-dutch.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_nl": {
4
+ "acc": 0.21471343028229256,
5
+ "acc_stderr": 0.012014958326088981,
6
+ "acc_norm": 0.24294268605645852,
7
+ "acc_norm_stderr": 0.012548588352773891
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_nl": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=yhavinga/gpt2-medium-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc/arc_nl_zephyr-7b-beta.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "arc_nl": {
4
- "acc": 0.43798118049615054,
5
- "acc_stderr": 0.01451716231691793,
6
- "acc_norm": 0.4328485885372113,
7
- "acc_norm_stderr": 0.01449759923259859
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "arc_nl": {
4
+ "acc": 0.4311377245508982,
5
+ "acc_stderr": 0.014490726457652989,
6
+ "acc_norm": 0.43199315654405473,
7
+ "acc_norm_stderr": 0.014494184864971338
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/hellaswag/hellaswag_nl_Llama-2-7b-chat-hf.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "hellaswag_nl": {
4
- "acc": 0.38467350242849435,
5
- "acc_stderr": 0.005054749888300686,
6
- "acc_norm": 0.4823529411764706,
7
- "acc_norm_stderr": 0.005191586180318448
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 64,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "hellaswag_nl": {
4
+ "acc": 0.3838100377765785,
5
+ "acc_stderr": 0.005052614927289456,
6
+ "acc_norm": 0.4819212088505127,
7
+ "acc_norm_stderr": 0.005191425828002782
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/hellaswag/hellaswag_nl_Llama-2-7b-hf.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "hellaswag_nl": {
4
- "acc": 0.3878035617916892,
5
- "acc_stderr": 0.005062348307428708,
6
- "acc_norm": 0.5000539665407447,
7
- "acc_norm_stderr": 0.005194822688012659
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 64,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "hellaswag_nl": {
4
+ "acc": 0.386184565569347,
5
+ "acc_stderr": 0.00505844561828187,
6
+ "acc_norm": 0.4957366432811657,
7
+ "acc_norm_stderr": 0.0051946338704556266
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "hellaswag_nl": {
4
- "acc": 0.43486238532110094,
5
- "acc_stderr": 0.005150551758279897,
6
- "acc_norm": 0.5676200755531571,
7
- "acc_norm_stderr": 0.005147097096977192
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 64,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "hellaswag_nl": {
4
+ "acc": 0.4336751214247167,
5
+ "acc_stderr": 0.0051489159372014965,
6
+ "acc_norm": 0.5662169454937939,
7
+ "acc_norm_stderr": 0.005149065890785751
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/hellaswag/{hellaswag_nl_zephyr-7b-beta.json → hellaswag_nl_Orca-2-7b.json} RENAMED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "hellaswag_nl": {
4
- "acc": 0.4478143550998381,
5
- "acc_stderr": 0.005166450687025188,
6
- "acc_norm": 0.575067458175931,
7
- "acc_norm_stderr": 0.005135942094754352
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 64,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "hellaswag_nl": {
4
+ "acc": 0.38456556934700487,
5
+ "acc_stderr": 0.005054483938257531,
6
+ "acc_norm": 0.48041014570966,
7
+ "acc_norm_stderr": 0.005190834031799853
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=microsoft/Orca-2-7b,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/hellaswag/hellaswag_nl_gpt2-large-dutch.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag_nl": {
4
+ "acc": 0.3043712898003238,
5
+ "acc_stderr": 0.004780698091128437,
6
+ "acc_norm": 0.34279546681057743,
7
+ "acc_norm_stderr": 0.004931380767300367
8
+ }
9
+ },
10
+ "versions": {
11
+ "hellaswag_nl": 1
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=yhavinga/gpt2-large-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/hellaswag/hellaswag_nl_gpt2-medium-dutch.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag_nl": {
4
+ "acc": 0.31246627091203455,
5
+ "acc_stderr": 0.004815587775923881,
6
+ "acc_norm": 0.36438208310847275,
7
+ "acc_norm_stderr": 0.00500008398696681
8
+ }
9
+ },
10
+ "versions": {
11
+ "hellaswag_nl": 1
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=yhavinga/gpt2-medium-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/hellaswag/hellaswag_nl_neural-chat-7b-v3-1.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag_nl": {
4
+ "acc": 0.44069077172153265,
5
+ "acc_stderr": 0.0051581467942195215,
6
+ "acc_norm": 0.5429033998920669,
7
+ "acc_norm_stderr": 0.005175663147811796
8
+ }
9
+ },
10
+ "versions": {
11
+ "hellaswag_nl": 1
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=Intel/neural-chat-7b-v3-1,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/{truthfulqa/truthfulqa_nl_Mistral-7B-v0.1.json → mmlu/mmlu_nl_Mistral-7B-v0.1.json} RENAMED
@@ -1,19 +1,19 @@
1
  {
2
  "results": {
3
- "truthfulqa_nl": {
4
- "mc1": 0.3070063694267516,
5
- "mc1_stderr": 0.01647328769082192,
6
- "mc2": 0.45280570817630444,
7
- "mc2_stderr": 0.015014728029135574
8
  }
9
  },
10
  "versions": {
11
- "truthfulqa_nl": 1
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 64,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
+ "mmlu_nl": {
4
+ "acc": 0.45974045685664416,
5
+ "acc_stderr": 0.004341759787221058,
6
+ "acc_norm": 0.36912802610609396,
7
+ "acc_norm_stderr": 0.0042040447899996366
8
  }
9
  },
10
  "versions": {
11
+ "mmlu_nl": 0
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/mmlu/mmlu_nl_gpt2-large-dutch.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "mmlu_nl": {
4
+ "acc": 0.2301737876603172,
5
+ "acc_stderr": 0.003667182186959482,
6
+ "acc_norm": 0.2436821734841011,
7
+ "acc_norm_stderr": 0.0037400056232706905
8
+ }
9
+ },
10
+ "versions": {
11
+ "mmlu_nl": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=yhavinga/gpt2-large-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/mmlu/mmlu_nl_gpt2-medium-dutch.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "mmlu_nl": {
4
+ "acc": 0.23343704940426502,
5
+ "acc_stderr": 0.0036852504856799066,
6
+ "acc_norm": 0.2483873415800258,
7
+ "acc_norm_stderr": 0.003764176503735655
8
+ }
9
+ },
10
+ "versions": {
11
+ "mmlu_nl": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=yhavinga/gpt2-medium-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/truthfulqa/truthfulqa_nl_Llama-2-13b-hf.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "truthfulqa_nl": {
4
- "mc1": 0.2764331210191083,
5
- "mc1_stderr": 0.01597262688062874,
6
- "mc2": 0.4103755310313891,
7
- "mc2_stderr": 0.014811313488625848
8
- }
9
- },
10
- "versions": {
11
- "truthfulqa_nl": 1
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=meta-llama/Llama-2-13b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/truthfulqa/truthfulqa_nl_Llama-2-7b-chat-hf.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "truthfulqa_nl": {
4
- "mc1": 0.2917197452229299,
5
- "mc1_stderr": 0.016234071293195287,
6
- "mc2": 0.4462996697687161,
7
- "mc2_stderr": 0.016161710042968205
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 64,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "truthfulqa_nl": {
4
+ "mc1": 0.289171974522293,
5
+ "mc1_stderr": 0.016192068781346693,
6
+ "mc2": 0.4445882138885173,
7
+ "mc2_stderr": 0.016144169053565395
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/truthfulqa/truthfulqa_nl_Llama-2-7b-hf.json CHANGED
@@ -3,8 +3,8 @@
3
  "truthfulqa_nl": {
4
  "mc1": 0.28152866242038216,
5
  "mc1_stderr": 0.016062309899461683,
6
- "mc2": 0.41626070733921117,
7
- "mc2_stderr": 0.014914193769419527
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 64,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
3
  "truthfulqa_nl": {
4
  "mc1": 0.28152866242038216,
5
  "mc1_stderr": 0.016062309899461683,
6
+ "mc2": 0.41449853431238814,
7
+ "mc2_stderr": 0.014922005996963188
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/truthfulqa/{truthfulqa_nl-falcon-40b-ft-alpaca-dolly-dutch.json → truthfulqa_nl_Orca-2-7b.json} RENAMED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "truthfulqa_nl": {
4
- "mc1": 0.310828025477707,
5
- "mc1_stderr": 0.016529733724696277,
6
- "mc2": 0.4460845208916539,
7
- "mc2_stderr": 0.01476856418537487
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=BramVanroy/falcon-40b-ft-alpaca-dolly-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "truthfulqa_nl": {
4
+ "mc1": 0.3146496815286624,
5
+ "mc1_stderr": 0.01658486445168711,
6
+ "mc2": 0.4488463711895695,
7
+ "mc2_stderr": 0.016292493035951996
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=microsoft/Orca-2-7b,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/truthfulqa/truthfulqa_nl_falcon-40b-ft-alpaca-dolly-dutch.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "truthfulqa_nl": {
4
- "mc1": 0.310828025477707,
5
- "mc1_stderr": 0.016529733724696277,
6
- "mc2": 0.4460845208916539,
7
- "mc2_stderr": 0.01476856418537487
8
- }
9
- },
10
- "versions": {
11
- "truthfulqa_nl": 1
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=BramVanroy/falcon-40b-ft-alpaca-dolly-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/truthfulqa/truthfulqa_nl_falcon-40b.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "truthfulqa_nl": {
4
- "mc1": 0.2764331210191083,
5
- "mc1_stderr": 0.01597262688062875,
6
- "mc2": 0.4091336161450544,
7
- "mc2_stderr": 0.014605140809282338
8
- }
9
- },
10
- "versions": {
11
- "truthfulqa_nl": 1
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=tiiuae/falcon-40b,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/truthfulqa/{truthfulqa_nl-llama2-13b-ft-mc4_nl_cleaned_tiny.json → truthfulqa_nl_gpt2-large-dutch.json} RENAMED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "truthfulqa_nl": {
4
- "mc1": 0.2751592356687898,
5
- "mc1_stderr": 0.0159498029022655,
6
- "mc2": 0.41816127879466414,
7
- "mc2_stderr": 0.01474120131034505
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=BramVanroy/llama2-13b-ft-mc4_nl_cleaned_tiny,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "truthfulqa_nl": {
4
+ "mc1": 0.25987261146496815,
5
+ "mc1_stderr": 0.015663018533664023,
6
+ "mc2": 0.41961324970531233,
7
+ "mc2_stderr": 0.01509691194885121
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=yhavinga/gpt2-large-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/truthfulqa/{truthfulqa_nl-falcon-40b.json → truthfulqa_nl_gpt2-medium-dutch.json} RENAMED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "truthfulqa_nl": {
4
- "mc1": 0.2764331210191083,
5
- "mc1_stderr": 0.01597262688062875,
6
- "mc2": 0.4091336161450544,
7
- "mc2_stderr": 0.014605140809282338
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=tiiuae/falcon-40b,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "truthfulqa_nl": {
4
+ "mc1": 0.2878980891719745,
5
+ "mc1_stderr": 0.0161708346142461,
6
+ "mc2": 0.4527386932512769,
7
+ "mc2_stderr": 0.015417954968769677
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=yhavinga/gpt2-medium-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/truthfulqa/truthfulqa_nl_llama2-13b-ft-mc4_nl_cleaned_tiny.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "truthfulqa_nl": {
4
- "mc1": 0.2751592356687898,
5
- "mc1_stderr": 0.0159498029022655,
6
- "mc2": 0.41816127879466414,
7
- "mc2_stderr": 0.01474120131034505
8
- }
9
- },
10
- "versions": {
11
- "truthfulqa_nl": 1
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=BramVanroy/llama2-13b-ft-mc4_nl_cleaned_tiny,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/truthfulqa/truthfulqa_nl_zephyr-7b-beta.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "truthfulqa_nl": {
4
- "mc1": 0.3719745222929936,
5
- "mc1_stderr": 0.0172618443903749,
6
- "mc2": 0.5294532108691418,
7
- "mc2_stderr": 0.016221848481192833
8
- }
9
- },
10
- "versions": {
11
- "truthfulqa_nl": 1
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 64,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }