Sean Cho committed on
Commit
097981b
·
1 Parent(s): 86e581e

Big update

Browse files
README.md CHANGED
@@ -4,11 +4,12 @@ emoji: 📉
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 3.43.2
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
11
  duplicated_from: HuggingFaceH4/open_llm_leaderboard
 
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.9.0
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
11
  duplicated_from: HuggingFaceH4/open_llm_leaderboard
12
+ fullWidth: true
13
  ---
14
 
15
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,106 +1,69 @@
1
- import json
2
- import os
3
- from datetime import datetime, timezone
4
- import re
5
- from distutils.util import strtobool
6
-
7
  import gradio as gr
8
  import pandas as pd
9
  from apscheduler.schedulers.background import BackgroundScheduler
10
- from huggingface_hub import HfApi, snapshot_download
 
11
 
12
- from src.assets.css_html_js import custom_css, get_window_url_params
13
- from src.assets.text_content import (
14
  CITATION_BUTTON_LABEL,
15
  CITATION_BUTTON_TEXT,
16
  EVALUATION_QUEUE_TEXT,
17
  INTRODUCTION_TEXT,
18
  LLM_BENCHMARKS_TEXT,
 
19
  TITLE,
20
  BOTTOM_LOGO,
21
  )
22
- from src.display_models.get_model_metadata import DO_NOT_SUBMIT_MODELS, ModelType
23
- from src.display_models.utils import (
 
 
 
 
 
 
24
  AutoEvalColumn,
25
- EvalQueueColumn,
26
  fields,
27
- styled_error,
28
- styled_message,
29
- styled_warning,
 
 
 
 
 
 
 
 
30
  )
31
- from src.load_from_hub import get_all_requested_models, get_evaluation_queue_df, get_leaderboard_df, is_model_on_hub
32
- from src.rate_limiting import user_submission_permission
33
-
34
- pd.set_option("display.precision", 1)
35
-
36
- # clone / pull the lmeh eval data
37
- H4_TOKEN = os.environ.get("H4_TOKEN", None)
38
-
39
- QUEUE_REPO = "open-ko-llm-leaderboard/requests"
40
- RESULTS_REPO = "open-ko-llm-leaderboard/results"
41
-
42
- PRIVATE_QUEUE_REPO = "open-ko-llm-leaderboard/private-requests"
43
- PRIVATE_RESULTS_REPO = "open-ko-llm-leaderboard/private-results"
44
-
45
- IS_PUBLIC = bool(strtobool(os.environ.get("IS_PUBLIC", "True")))
46
-
47
- EVAL_REQUESTS_PATH = "eval-queue"
48
- EVAL_RESULTS_PATH = "eval-results"
49
-
50
- EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
51
- EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
52
-
53
- api = HfApi(token=H4_TOKEN)
54
 
55
 
56
  def restart_space():
57
- api.restart_space(repo_id="upstage/open-ko-llm-leaderboard", token=H4_TOKEN)
58
-
59
- # Rate limit variables
60
- RATE_LIMIT_PERIOD = 7
61
- RATE_LIMIT_QUOTA = 5
62
-
63
- # Column selection
64
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
65
- TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
66
- COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
67
- TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
68
-
69
- if not IS_PUBLIC:
70
- COLS.insert(2, AutoEvalColumn.precision.name)
71
- TYPES.insert(2, AutoEvalColumn.precision.type)
72
-
73
- EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
74
- EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
75
-
76
- BENCHMARK_COLS = [
77
- c.name
78
- for c in [
79
- AutoEvalColumn.arc,
80
- AutoEvalColumn.hellaswag,
81
- AutoEvalColumn.mmlu,
82
- AutoEvalColumn.truthfulqa,
83
- AutoEvalColumn.commongen_v2,
84
- # TODO: Uncomment when we have results for these
85
- # AutoEvalColumn.ethicalverification,
86
- ]
87
- ]
88
-
89
- snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None)
90
- snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None)
91
- requested_models, users_to_submission_dates = get_all_requested_models(EVAL_REQUESTS_PATH)
92
 
93
- original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
94
- leaderboard_df = original_df.copy()
95
- models = original_df["model_name_for_query"].tolist() # needed for model backlinks in their to the leaderboard
 
 
 
 
 
 
 
 
 
 
 
96
 
97
- # Commented out because it causes infinite restart loops in local
98
- # to_be_dumped = f"models = {repr(models)}\n"
99
 
100
- # with open("models_backlinks.py", "w") as f:
101
- # f.write(to_be_dumped)
 
 
102
 
103
- # print(to_be_dumped)
104
 
105
  (
106
  finished_eval_queue_df,
@@ -109,142 +72,34 @@ models = original_df["model_name_for_query"].tolist() # needed for model backlin
109
  failed_eval_queue_df,
110
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
111
 
112
- ## INTERACTION FUNCTIONS
113
- def add_new_eval(
114
- model: str,
115
- base_model: str,
116
- revision: str,
117
- precision: str,
118
- private: bool,
119
- weight_type: str,
120
- model_type: str,
121
- ):
122
- precision = precision.split(" ")[0]
123
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
124
-
125
- num_models_submitted_in_period = user_submission_permission(model, users_to_submission_dates, RATE_LIMIT_PERIOD)
126
- if num_models_submitted_in_period > RATE_LIMIT_QUOTA:
127
- error_msg = f"Organisation or user `{model.split('/')[0]}`"
128
- error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
129
- error_msg += f"in the last {RATE_LIMIT_PERIOD} days.\n"
130
- error_msg += "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard πŸ€—"
131
- return styled_error(error_msg)
132
-
133
- if model_type is None or model_type == "":
134
- return styled_error("Please select a model type.")
135
-
136
- # check the model actually exists before adding the eval
137
- if revision == "":
138
- revision = "main"
139
-
140
- if weight_type in ["Delta", "Adapter"]:
141
- base_model_on_hub, error = is_model_on_hub(base_model, revision)
142
- if not base_model_on_hub:
143
- return styled_error(f'Base model "{base_model}" {error}')
144
-
145
- if not weight_type == "Adapter":
146
- model_on_hub, error = is_model_on_hub(model, revision)
147
- if not model_on_hub:
148
- return styled_error(f'Model "{model}" {error}')
149
-
150
- model_info = api.model_info(repo_id=model, revision=revision)
151
-
152
- size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
153
- try:
154
- model_size = round(model_info.safetensors["total"] / 1e9, 3)
155
- except AttributeError:
156
- try:
157
- size_match = re.search(size_pattern, model.lower())
158
- model_size = size_match.group(0)
159
- model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
160
- except AttributeError:
161
- return 65
162
-
163
- size_factor = 8 if (precision == "GPTQ" or "GPTQ" in model) else 1
164
- model_size = size_factor * model_size
165
-
166
- try:
167
- license = model_info.cardData["license"]
168
- except Exception:
169
- license = "?"
170
-
171
- print("adding new eval")
172
-
173
- eval_entry = {
174
- "model": model,
175
- "base_model": base_model,
176
- "revision": revision,
177
- "private": private,
178
- "precision": precision,
179
- "weight_type": weight_type,
180
- "status": "PENDING",
181
- "submitted_time": current_time,
182
- "model_type": model_type,
183
- }
184
-
185
- user_name = ""
186
- model_path = model
187
- if "/" in model:
188
- user_name = model.split("/")[0]
189
- model_path = model.split("/")[1]
190
-
191
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
192
- os.makedirs(OUT_DIR, exist_ok=True)
193
- out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
194
-
195
- if user_name == "upstage":
196
- return styled_warning("The model participating as a Host in Upstage does not conduct evaluations to ensure the transparency and fairness of the leaderboard. Please take this into consideration.")
197
-
198
- # Check if the model has been forbidden:
199
- if out_path.split("eval-queue/")[1] in DO_NOT_SUBMIT_MODELS:
200
- return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")
201
-
202
- # Check for duplicate submission
203
- if f"{model}_{revision}_{precision}" in requested_models:
204
- return styled_warning("This model has been already submitted.")
205
-
206
- with open(out_path, "w") as f:
207
- f.write(json.dumps(eval_entry))
208
-
209
- api.upload_file(
210
- path_or_fileobj=out_path,
211
- path_in_repo=out_path.split("eval-queue/")[1],
212
- repo_id=QUEUE_REPO,
213
- repo_type="dataset",
214
- commit_message=f"Add {model} to eval queue",
215
- )
216
-
217
- # remove the local file
218
- os.remove(out_path)
219
-
220
- return styled_message(
221
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
222
- )
223
-
224
-
225
- # Basics
226
- def change_tab(query_param: str):
227
- query_param = query_param.replace("'", '"')
228
- query_param = json.loads(query_param)
229
-
230
- if isinstance(query_param, dict) and "tab" in query_param and query_param["tab"] == "evaluation":
231
- return gr.Tabs.update(selected=1)
232
- else:
233
- return gr.Tabs.update(selected=0)
234
-
235
 
236
  # Searching and filtering
237
- def update_table(hidden_df: pd.DataFrame, current_columns_df: pd.DataFrame, columns: list, type_query: list, precision_query: str, size_query: list, show_deleted: bool, query: str):
238
- filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
239
- if query != "":
240
- filtered_df = search_table(filtered_df, query)
 
 
 
 
 
 
 
 
 
241
  df = select_columns(filtered_df, columns)
242
-
243
  return df
244
 
 
 
 
 
 
 
245
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
246
  return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
247
 
 
248
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
249
  always_here_cols = [
250
  AutoEvalColumn.model_type_symbol.name,
@@ -256,18 +111,29 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
256
  ]
257
  return filtered_df
258
 
259
- NUMERIC_INTERVALS = {
260
- "Unknown": pd.Interval(-1, 0, closed="right"),
261
- "0~3B": pd.Interval(0, 3, closed="right"),
262
- "3~7B": pd.Interval(3, 7.3, closed="right"),
263
- "7~13B": pd.Interval(7.3, 13, closed="right"),
264
- "13~35B": pd.Interval(13, 35, closed="right"),
265
- "35~60B": pd.Interval(35, 60, closed="right"),
266
- "60B+": pd.Interval(60, 10000, closed="right"),
267
- }
 
 
 
 
 
 
 
 
 
 
 
268
 
269
  def filter_models(
270
- df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
271
  ) -> pd.DataFrame:
272
  # Show all models
273
  if show_deleted:
@@ -275,9 +141,15 @@ def filter_models(
275
  else: # Show only still on the hub models
276
  filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
277
 
 
 
 
 
 
 
278
  type_emoji = [t[0] for t in type_query]
279
- filtered_df = filtered_df[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
280
- filtered_df = filtered_df[df[AutoEvalColumn.precision.name].isin(precision_query)]
281
 
282
  numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
283
  params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
@@ -286,6 +158,7 @@ def filter_models(
286
 
287
  return filtered_df
288
 
 
289
 
290
  demo = gr.Blocks(css=custom_css)
291
  with demo:
@@ -298,33 +171,21 @@ with demo:
298
  with gr.Column():
299
  with gr.Row():
300
  search_bar = gr.Textbox(
301
- placeholder=" πŸ” Search for your model and press ENTER...",
302
  show_label=False,
303
  elem_id="search-bar",
304
  )
305
  with gr.Row():
306
  shown_columns = gr.CheckboxGroup(
307
  choices=[
308
- c
309
- for c in COLS
310
- if c
311
- not in [
312
- AutoEvalColumn.dummy.name,
313
- AutoEvalColumn.model.name,
314
- AutoEvalColumn.model_type_symbol.name,
315
- AutoEvalColumn.still_on_hub.name,
316
- ]
317
  ],
318
  value=[
319
- c
320
- for c in COLS_LITE
321
- if c
322
- not in [
323
- AutoEvalColumn.dummy.name,
324
- AutoEvalColumn.model.name,
325
- AutoEvalColumn.model_type_symbol.name,
326
- AutoEvalColumn.still_on_hub.name,
327
- ]
328
  ],
329
  label="Select columns to show",
330
  elem_id="column-select",
@@ -332,160 +193,132 @@ with demo:
332
  )
333
  with gr.Row():
334
  deleted_models_visibility = gr.Checkbox(
335
- value=True, label="πŸ‘€ Show gated/private/deleted models", interactive=True
336
- )
337
- with gr.Column(min_width=320):
338
- with gr.Box(elem_id="box-filter"):
339
- filter_columns_type = gr.CheckboxGroup(
340
- label="Model types",
341
- choices=[
342
- ModelType.PT.to_str(),
343
- # ModelType.FT.to_str(),
344
- ModelType.IFT.to_str(),
345
- ModelType.RL.to_str(),
346
- ],
347
- value=[
348
- ModelType.PT.to_str(),
349
- # ModelType.FT.to_str(),
350
- ModelType.IFT.to_str(),
351
- ModelType.RL.to_str(),
352
- ],
353
- interactive=True,
354
- elem_id="filter-columns-type",
355
  )
356
- filter_columns_precision = gr.CheckboxGroup(
357
- label="Precision",
358
- choices=["torch.float16"], #, "torch.bfloat16", "torch.float32", "8bit", "4bit", "GPTQ"],
359
- value=["torch.float16"], #, "torch.bfloat16", "torch.float32", "8bit", "4bit", "GPTQ"],
360
- interactive=False,
361
- elem_id="filter-columns-precision",
362
  )
363
- filter_columns_size = gr.CheckboxGroup(
364
- label="Model sizes",
365
- choices=list(NUMERIC_INTERVALS.keys()),
366
- value=list(NUMERIC_INTERVALS.keys()),
367
- interactive=True,
368
- elem_id="filter-columns-size",
369
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
 
371
  leaderboard_table = gr.components.Dataframe(
372
  value=leaderboard_df[
373
- [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
374
  + shown_columns.value
375
  + [AutoEvalColumn.dummy.name]
376
  ],
377
- headers=[
378
- AutoEvalColumn.model_type_symbol.name,
379
- AutoEvalColumn.model.name,
380
- ]
381
- + shown_columns.value
382
- + [AutoEvalColumn.dummy.name],
383
  datatype=TYPES,
384
- max_rows=None,
385
  elem_id="leaderboard-table",
386
  interactive=False,
387
  visible=True,
 
388
  )
389
 
390
  # Dummy leaderboard for handling the case when the user uses backspace key
391
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
392
- value=original_df,
393
  headers=COLS,
394
  datatype=TYPES,
395
- max_rows=None,
396
  visible=False,
397
  )
398
  search_bar.submit(
399
  update_table,
400
  [
401
  hidden_leaderboard_table_for_search,
402
- leaderboard_table,
403
  shown_columns,
404
  filter_columns_type,
405
  filter_columns_precision,
406
  filter_columns_size,
407
  deleted_models_visibility,
 
 
408
  search_bar,
409
  ],
410
  leaderboard_table,
411
  )
412
- shown_columns.change(
413
- update_table,
414
- [
415
- hidden_leaderboard_table_for_search,
416
- leaderboard_table,
417
- shown_columns,
418
- filter_columns_type,
419
- filter_columns_precision,
420
- filter_columns_size,
421
- deleted_models_visibility,
422
- search_bar,
423
- ],
424
- leaderboard_table,
425
- queue=True,
426
- )
427
- filter_columns_type.change(
428
- update_table,
429
- [
430
- hidden_leaderboard_table_for_search,
431
- leaderboard_table,
432
- shown_columns,
433
- filter_columns_type,
434
- filter_columns_precision,
435
- filter_columns_size,
436
- deleted_models_visibility,
437
- search_bar,
438
- ],
439
- leaderboard_table,
440
- queue=True,
441
- )
442
- filter_columns_precision.change(
443
- update_table,
444
- [
445
- hidden_leaderboard_table_for_search,
446
- leaderboard_table,
447
- shown_columns,
448
- filter_columns_type,
449
- filter_columns_precision,
450
- filter_columns_size,
451
- deleted_models_visibility,
452
- search_bar,
453
- ],
454
- leaderboard_table,
455
- queue=True,
456
- )
457
- filter_columns_size.change(
458
  update_table,
459
  [
460
  hidden_leaderboard_table_for_search,
461
- leaderboard_table,
462
  shown_columns,
463
  filter_columns_type,
464
  filter_columns_precision,
465
  filter_columns_size,
466
  deleted_models_visibility,
 
 
467
  search_bar,
468
  ],
469
  leaderboard_table,
470
- queue=True,
471
  )
472
- deleted_models_visibility.change(
473
- update_table,
474
- [
475
- hidden_leaderboard_table_for_search,
 
 
 
 
 
 
 
 
 
 
 
 
 
476
  leaderboard_table,
477
- shown_columns,
478
- filter_columns_type,
479
- filter_columns_precision,
480
- filter_columns_size,
481
- deleted_models_visibility,
482
- search_bar,
483
- ],
484
- leaderboard_table,
485
- queue=True,
486
- )
 
 
 
 
 
 
 
 
 
487
  with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=2):
488
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
489
 
490
  with gr.TabItem("πŸš€ Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
491
  with gr.Column():
@@ -502,7 +335,7 @@ with demo:
502
  value=finished_eval_queue_df,
503
  headers=EVAL_COLS,
504
  datatype=EVAL_TYPES,
505
- max_rows=5,
506
  )
507
  with gr.Accordion(
508
  f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
@@ -513,7 +346,7 @@ with demo:
513
  value=running_eval_queue_df,
514
  headers=EVAL_COLS,
515
  datatype=EVAL_TYPES,
516
- max_rows=5,
517
  )
518
 
519
  with gr.Accordion(
@@ -525,7 +358,7 @@ with demo:
525
  value=pending_eval_queue_df,
526
  headers=EVAL_COLS,
527
  datatype=EVAL_TYPES,
528
- max_rows=5,
529
  )
530
  with gr.Accordion(
531
  f"❌ Failed Evaluations ({len(failed_eval_queue_df)})",
@@ -536,7 +369,7 @@ with demo:
536
  value=failed_eval_queue_df,
537
  headers=EVAL_COLS,
538
  datatype=EVAL_TYPES,
539
- max_rows=5,
540
  )
541
  with gr.Row():
542
  gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text")
@@ -544,37 +377,26 @@ with demo:
544
  with gr.Row():
545
  with gr.Column():
546
  model_name_textbox = gr.Textbox(label="Model name")
547
- revision_name_textbox = gr.Textbox(label="Revision", placeholder="main")
548
  private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
549
  model_type = gr.Dropdown(
550
- choices=[
551
- ModelType.PT.to_str(" : "),
552
- # ModelType.FT.to_str(" : "),
553
- ModelType.IFT.to_str(" : "),
554
- ModelType.RL.to_str(" : "),
555
- ],
556
  label="Model type",
557
  multiselect=False,
558
- value=None,
559
  interactive=True,
560
  )
561
 
562
  with gr.Column():
563
  precision = gr.Dropdown(
564
- choices=[
565
- "float16",
566
- # "bfloat16",
567
- # "8bit (LLM.int8)",
568
- # "4bit (QLoRA / FP4)",
569
- # "GPTQ"
570
- ],
571
  label="Precision",
572
  multiselect=False,
573
  value="float16",
574
  interactive=True,
575
  )
576
  weight_type = gr.Dropdown(
577
- choices=["Original", "Delta", "Adapter"],
578
  label="Weights type",
579
  multiselect=False,
580
  value="Original",
@@ -603,20 +425,23 @@ with demo:
603
  citation_button = gr.Textbox(
604
  value=CITATION_BUTTON_TEXT,
605
  label=CITATION_BUTTON_LABEL,
 
606
  elem_id="citation-button",
607
- ).style(show_copy_button=True)
608
-
609
  gr.HTML(BOTTOM_LOGO)
610
 
611
- dummy = gr.Textbox(visible=False)
612
- demo.load(
613
- change_tab,
614
- dummy,
615
- tabs,
616
- _js=get_window_url_params,
617
- )
618
-
619
  scheduler = BackgroundScheduler()
620
  scheduler.add_job(restart_space, "interval", seconds=1800)
621
  scheduler.start()
622
- demo.queue(concurrency_count=40).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
+ from huggingface_hub import snapshot_download
5
+ from gradio_space_ci import configure_space_ci # FOR CI
6
 
7
+ from src.display.about import (
 
8
  CITATION_BUTTON_LABEL,
9
  CITATION_BUTTON_TEXT,
10
  EVALUATION_QUEUE_TEXT,
11
  INTRODUCTION_TEXT,
12
  LLM_BENCHMARKS_TEXT,
13
+ FAQ_TEXT,
14
  TITLE,
15
  BOTTOM_LOGO,
16
  )
17
+ from src.display.css_html_js import custom_css
18
+ from src.display.utils import (
19
+ BENCHMARK_COLS,
20
+ COLS,
21
+ EVAL_COLS,
22
+ EVAL_TYPES,
23
+ NUMERIC_INTERVALS,
24
+ TYPES,
25
  AutoEvalColumn,
26
+ ModelType,
27
  fields,
28
+ WeightType,
29
+ Precision
30
+ )
31
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
32
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
33
+ from src.submission.submit import add_new_eval
34
+ from src.tools.collections import update_collections
35
+ from src.tools.plots import (
36
+ create_metric_plot_obj,
37
+ create_plot_df,
38
+ create_scores_df,
39
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
 
42
  def restart_space():
43
+ API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
+ try:
46
+ print(EVAL_REQUESTS_PATH)
47
+ snapshot_download(
48
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
49
+ )
50
+ except Exception:
51
+ restart_space()
52
+ try:
53
+ print(EVAL_RESULTS_PATH)
54
+ snapshot_download(
55
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
56
+ )
57
+ except Exception:
58
+ restart_space()
59
 
 
 
60
 
61
+ raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
62
+ if REPO_ID == "upstage/open-ko-llm-leaderboard": # update only when it's from real leaderboard
63
+ update_collections(original_df.copy())
64
+ leaderboard_df = original_df.copy()
65
 
66
+ plot_df = create_plot_df(create_scores_df(raw_data))
67
 
68
  (
69
  finished_eval_queue_df,
 
72
  failed_eval_queue_df,
73
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  # Searching and filtering
77
+ def update_table(
78
+ hidden_df: pd.DataFrame,
79
+ columns: list,
80
+ type_query: list,
81
+ precision_query: str,
82
+ size_query: list,
83
+ show_deleted: bool,
84
+ show_merges: bool,
85
+ show_flagged: bool,
86
+ query: str,
87
+ ):
88
+ filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted, show_merges, show_flagged)
89
+ filtered_df = filter_queries(query, filtered_df)
90
  df = select_columns(filtered_df, columns)
 
91
  return df
92
 
93
+
94
+ def load_query(request: gr.Request): # triggered only once at startup => read query parameter if it exists
95
+ query = request.query_params.get("query") or ""
96
+ return query, query # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed
97
+
98
+
99
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
100
  return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
101
 
102
+
103
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
104
  always_here_cols = [
105
  AutoEvalColumn.model_type_symbol.name,
 
111
  ]
112
  return filtered_df
113
 
114
+
115
+ def filter_queries(query: str, filtered_df: pd.DataFrame):
116
+ """Added by Abishek"""
117
+ final_df = []
118
+ if query != "":
119
+ queries = [q.strip() for q in query.split(";")]
120
+ for _q in queries:
121
+ _q = _q.strip()
122
+ if _q != "":
123
+ temp_filtered_df = search_table(filtered_df, _q)
124
+ if len(temp_filtered_df) > 0:
125
+ final_df.append(temp_filtered_df)
126
+ if len(final_df) > 0:
127
+ filtered_df = pd.concat(final_df)
128
+ filtered_df = filtered_df.drop_duplicates(
129
+ subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
130
+ )
131
+
132
+ return filtered_df
133
+
134
 
135
  def filter_models(
136
+ df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool, show_merges: bool, show_flagged: bool
137
  ) -> pd.DataFrame:
138
  # Show all models
139
  if show_deleted:
 
141
  else: # Show only still on the hub models
142
  filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
143
 
144
+ if not show_merges:
145
+ filtered_df = filtered_df[filtered_df[AutoEvalColumn.merged.name] == False]
146
+
147
+ if not show_flagged:
148
+ filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
149
+
150
  type_emoji = [t[0] for t in type_query]
151
+ filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
152
+ filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
153
 
154
  numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
155
  params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
 
158
 
159
  return filtered_df
160
 
161
+ leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision], False, False, False)
162
 
163
  demo = gr.Blocks(css=custom_css)
164
  with demo:
 
171
  with gr.Column():
172
  with gr.Row():
173
  search_bar = gr.Textbox(
174
+ placeholder=" πŸ” Search for your model (separate multiple queries with `;`) and press ENTER...",
175
  show_label=False,
176
  elem_id="search-bar",
177
  )
178
  with gr.Row():
179
  shown_columns = gr.CheckboxGroup(
180
  choices=[
181
+ c.name
182
+ for c in fields(AutoEvalColumn)
183
+ if not c.hidden and not c.never_hidden and not c.dummy
 
 
 
 
 
 
184
  ],
185
  value=[
186
+ c.name
187
+ for c in fields(AutoEvalColumn)
188
+ if c.displayed_by_default and not c.hidden and not c.never_hidden
 
 
 
 
 
 
189
  ],
190
  label="Select columns to show",
191
  elem_id="column-select",
 
193
  )
194
  with gr.Row():
195
  deleted_models_visibility = gr.Checkbox(
196
+ value=False, label="Show private/deleted models", interactive=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  )
198
+ merged_models_visibility = gr.Checkbox(
199
+ value=False, label="Show merges", interactive=True
 
 
 
 
200
  )
201
+ flagged_models_visibility = gr.Checkbox(
202
+ value=False, label="Show flagged models", interactive=True
 
 
 
 
203
  )
204
+ with gr.Column(min_width=320):
205
+ #with gr.Box(elem_id="box-filter"):
206
+ filter_columns_type = gr.CheckboxGroup(
207
+ label="Model types",
208
+ choices=[t.to_str() for t in ModelType],
209
+ value=[t.to_str() for t in ModelType],
210
+ interactive=True,
211
+ elem_id="filter-columns-type",
212
+ )
213
+ filter_columns_precision = gr.CheckboxGroup(
214
+ label="Precision",
215
+ choices=[i.value.name for i in Precision],
216
+ value=[i.value.name for i in Precision],
217
+ interactive=True,
218
+ elem_id="filter-columns-precision",
219
+ )
220
+ filter_columns_size = gr.CheckboxGroup(
221
+ label="Model sizes (in billions of parameters)",
222
+ choices=list(NUMERIC_INTERVALS.keys()),
223
+ value=list(NUMERIC_INTERVALS.keys()),
224
+ interactive=True,
225
+ elem_id="filter-columns-size",
226
+ )
227
 
228
  leaderboard_table = gr.components.Dataframe(
229
  value=leaderboard_df[
230
+ [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
231
  + shown_columns.value
232
  + [AutoEvalColumn.dummy.name]
233
  ],
234
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
 
 
 
 
 
235
  datatype=TYPES,
 
236
  elem_id="leaderboard-table",
237
  interactive=False,
238
  visible=True,
239
+ #column_widths=["2%", "33%"]
240
  )
241
 
242
  # Dummy leaderboard for handling the case when the user uses backspace key
243
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
244
+ value=original_df[COLS],
245
  headers=COLS,
246
  datatype=TYPES,
 
247
  visible=False,
248
  )
249
  search_bar.submit(
250
  update_table,
251
  [
252
  hidden_leaderboard_table_for_search,
 
253
  shown_columns,
254
  filter_columns_type,
255
  filter_columns_precision,
256
  filter_columns_size,
257
  deleted_models_visibility,
258
+ merged_models_visibility,
259
+ flagged_models_visibility,
260
  search_bar,
261
  ],
262
  leaderboard_table,
263
  )
264
+
265
+ # Define a hidden component that will trigger a reload only if a query parameter has been set
266
+ hidden_search_bar = gr.Textbox(value="", visible=False)
267
+ hidden_search_bar.change(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  update_table,
269
  [
270
  hidden_leaderboard_table_for_search,
 
271
  shown_columns,
272
  filter_columns_type,
273
  filter_columns_precision,
274
  filter_columns_size,
275
  deleted_models_visibility,
276
+ merged_models_visibility,
277
+ flagged_models_visibility,
278
  search_bar,
279
  ],
280
  leaderboard_table,
 
281
  )
282
+ # Check query parameter once at startup and update search bar + hidden component
283
+ demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])
284
+
285
+ for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility, merged_models_visibility, flagged_models_visibility]:
286
+ selector.change(
287
+ update_table,
288
+ [
289
+ hidden_leaderboard_table_for_search,
290
+ shown_columns,
291
+ filter_columns_type,
292
+ filter_columns_precision,
293
+ filter_columns_size,
294
+ deleted_models_visibility,
295
+ merged_models_visibility,
296
+ flagged_models_visibility,
297
+ search_bar,
298
+ ],
299
  leaderboard_table,
300
+ queue=True,
301
+ )
302
+
303
+ with gr.TabItem("πŸ“ˆ Metrics through time", elem_id="llm-benchmark-tab-table", id=4):
304
+ with gr.Row():
305
+ with gr.Column():
306
+ chart = create_metric_plot_obj(
307
+ plot_df,
308
+ [AutoEvalColumn.average.name],
309
+ title="Average of Top Scores Over Time (from last update)",
310
+ )
311
+ gr.Plot(value=chart, min_width=500)
312
+ with gr.Column():
313
+ chart = create_metric_plot_obj(
314
+ plot_df,
315
+ BENCHMARK_COLS,
316
+ title="Top Scores Over Time (from last update)",
317
+ )
318
+ gr.Plot(value=chart, min_width=500)
319
  with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=2):
320
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
321
+ gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
322
 
323
  with gr.TabItem("πŸš€ Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
324
  with gr.Column():
 
335
  value=finished_eval_queue_df,
336
  headers=EVAL_COLS,
337
  datatype=EVAL_TYPES,
338
+ row_count=5,
339
  )
340
  with gr.Accordion(
341
  f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
 
346
  value=running_eval_queue_df,
347
  headers=EVAL_COLS,
348
  datatype=EVAL_TYPES,
349
+ row_count=5,
350
  )
351
 
352
  with gr.Accordion(
 
358
  value=pending_eval_queue_df,
359
  headers=EVAL_COLS,
360
  datatype=EVAL_TYPES,
361
+ row_count=5,
362
  )
363
  with gr.Accordion(
364
  f"❌ Failed Evaluations ({len(failed_eval_queue_df)})",
 
369
  value=failed_eval_queue_df,
370
  headers=EVAL_COLS,
371
  datatype=EVAL_TYPES,
372
+ row_count=5,
373
  )
374
  with gr.Row():
375
  gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text")
 
377
  with gr.Row():
378
  with gr.Column():
379
  model_name_textbox = gr.Textbox(label="Model name")
380
+ revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
381
  private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
382
  model_type = gr.Dropdown(
383
+ choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
 
 
 
 
 
384
  label="Model type",
385
  multiselect=False,
386
+ value=ModelType.IFT.to_str(" : "),
387
  interactive=True,
388
  )
389
 
390
  with gr.Column():
391
  precision = gr.Dropdown(
392
+ choices=[i.value.name for i in Precision if i != Precision.Unknown],
 
 
 
 
 
 
393
  label="Precision",
394
  multiselect=False,
395
  value="float16",
396
  interactive=True,
397
  )
398
  weight_type = gr.Dropdown(
399
+ choices=[i.value.name for i in WeightType],
400
  label="Weights type",
401
  multiselect=False,
402
  value="Original",
 
425
  citation_button = gr.Textbox(
426
  value=CITATION_BUTTON_TEXT,
427
  label=CITATION_BUTTON_LABEL,
428
+ lines=20,
429
  elem_id="citation-button",
430
+ show_copy_button=True,
431
+ )
432
  gr.HTML(BOTTOM_LOGO)
433
 
 
 
 
 
 
 
 
 
434
  scheduler = BackgroundScheduler()
435
  scheduler.add_job(restart_space, "interval", seconds=1800)
436
  scheduler.start()
437
+
438
+ # Launches both the Space and its CI
439
+ configure_space_ci(
440
+ demo.queue(default_concurrency_limit=40),
441
+ trusted_authors=[], # add manually trusted authors
442
+ private="True", # ephemeral spaces will have same visibility as the main space. Otherwise, set to `True` or `False` explicitly.
443
+ variables={}, # We overwrite HF_HOME as tmp CI spaces will have no cache
444
+ secrets=["HF_TOKEN", "H4_TOKEN"], # which secret do I want to copy from the main space? Can be a `List[str]`.
445
+ hardware=None, # "cpu-basic" by default. Otherwise set to "auto" to have same hardware as the main space or any valid string value.
446
+ storage=None, # no storage by default. Otherwise set to "auto" to have same storage as the main space or any valid string value.
447
+ ).launch()
model_info_cache.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:337f1fb80e92327e7c7b130c03617439f7923e3f7c5383f5abb07e017ef9cae3
3
- size 715983
 
 
 
 
model_size_cache.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:64d63b51e6f5d6dd985b44ef6ddf513d9a7a138e734d77ae7382fd7a49a137ea
3
- size 20652
 
 
 
 
models_backlinks.py DELETED
@@ -1 +0,0 @@
1
- models = ['upstage/Llama-2-70b-instruct-v2', 'upstage/Llama-2-70b-instruct', 'upstage/llama-65b-instruct', 'upstage/llama-65b-instruct', 'upstage/llama-30b-instruct-2048', 'upstage/llama-30b-instruct', 'baseline']
 
 
package-lock.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "name": "open_llm_leaderboard",
3
+ "lockfileVersion": 3,
4
+ "requires": true,
5
+ "packages": {}
6
+ }
requirements.txt CHANGED
@@ -1,71 +1,18 @@
1
- accelerate==0.23.0
2
- aiofiles==23.1.0
3
- aiohttp==3.8.4
4
- aiosignal==1.3.1
5
- altair==4.2.2
6
- anyio==3.6.2
7
  APScheduler==3.10.1
8
- async-timeout==4.0.2
9
- attrs==23.1.0
10
- certifi==2022.12.7
11
- charset-normalizer==3.1.0
12
  click==8.1.3
13
- contourpy==1.0.7
14
- cycler==0.11.0
15
- datasets==2.12.0
16
- entrypoints==0.4
17
- fastapi==0.95.1
18
- ffmpy==0.3.0
19
- filelock==3.11.0
20
- fonttools==4.39.3
21
- frozenlist==1.3.3
22
- fsspec==2023.4.0
23
- gradio==3.43.2
24
- gradio-client==0.5.0
25
- h11==0.14.0
26
- httpcore==0.17.0
27
- httpx==0.24.0
28
- huggingface-hub==0.16.4
29
- idna==3.4
30
- Jinja2==3.1.2
31
- jsonschema==4.17.3
32
- kiwisolver==1.4.4
33
- linkify-it-py==2.0.0
34
- markdown-it-py==2.2.0
35
- MarkupSafe==2.1.2
36
  matplotlib==3.7.1
37
- mdit-py-plugins==0.3.3
38
- mdurl==0.1.2
39
- multidict==6.0.4
40
  numpy==1.24.2
41
- orjson==3.8.10
42
- packaging==23.1
43
  pandas==2.0.0
44
- Pillow==9.5.0
45
  plotly==5.14.1
46
- pyarrow==11.0.0
47
- pydantic==1.10.7
48
- pydub==0.25.1
49
- pyparsing==3.0.9
50
- pyrsistent==0.19.3
51
  python-dateutil==2.8.2
52
- python-multipart==0.0.6
53
- pytz==2023.3
54
- pytz-deprecation-shim==0.1.0.post0
55
- PyYAML==6.0
56
  requests==2.28.2
57
- semantic-version==2.10.0
58
- six==1.16.0
59
- sniffio==1.3.0
60
- starlette==0.26.1
61
- toolz==0.12.0
62
  tqdm==4.65.0
63
- transformers==4.34.0
64
- typing_extensions==4.5.0
65
- tzdata==2023.3
66
- tzlocal==4.3
67
- uc-micro-py==1.0.1
68
- urllib3==1.26.15
69
- uvicorn==0.21.1
70
- websockets==11.0.1
71
- yarl==1.8.2
 
 
 
 
 
 
 
1
  APScheduler==3.10.1
2
+ black==23.11.0
 
 
 
3
  click==8.1.3
4
+ datasets==2.14.5
5
+ gradio==4.9.0
6
+ gradio_client==0.7.2
7
+ huggingface-hub>=0.18.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  matplotlib==3.7.1
 
 
 
9
  numpy==1.24.2
 
 
10
  pandas==2.0.0
 
11
  plotly==5.14.1
 
 
 
 
 
12
  python-dateutil==2.8.2
 
 
 
 
13
  requests==2.28.2
14
+ sentencepiece
 
 
 
 
15
  tqdm==4.65.0
16
+ transformers==4.36.0
17
+ tokenizers>=0.15.0
18
+ gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.1.2 # CI !!!
 
 
 
 
 
 
scripts/create_request_file.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import pprint
4
+ import re
5
+ from datetime import datetime, timezone
6
+
7
+ import click
8
+ from colorama import Fore
9
+ from huggingface_hub import HfApi, snapshot_download
10
+
11
+ EVAL_REQUESTS_PATH = "eval-queue"
12
+ QUEUE_REPO = "open-ko-llm-leaderboard/requests"
13
+
14
+ precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
15
+ model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
16
+ weight_types = ("Original", "Delta", "Adapter")
17
+
18
+
19
def get_model_size(model_info, precision: str):
    """Return the model size in billions of parameters, or 0 when unknown.

    The size is read from ``model_info.safetensors["total"]`` when available.
    Otherwise it is parsed from a ``<number>b`` / ``<number>m`` token in the
    repository name (the part of ``model_info.modelId`` after the last "/").

    Args:
        model_info: object exposing ``safetensors`` and ``modelId``.
        precision: precision label; ``"GPTQ"`` multiplies the size by 8.

    Returns:
        The (possibly GPTQ-scaled) size, or 0 if no size could be determined.
    """
    size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
    try:
        model_size = round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError, KeyError):
        # No usable safetensors metadata: fall back to the repository name.
        try:
            # Search only the repo name so that digits in the organisation
            # (e.g. "42MARU/...") are not mistaken for a parameter count.
            repo_name = model_info.modelId.split("/")[-1].lower()
            size_match = size_pattern.search(repo_name)
            model_size = size_match.group(0)
            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
        except AttributeError:
            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py

    is_gptq = precision == "GPTQ" or "gptq" in model_info.modelId.split("/")[-1].lower()
    size_factor = 8 if is_gptq else 1
    return size_factor * model_size
34
+
35
+
36
+ def main():
37
+ api = HfApi()
38
+ current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
39
+ snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset")
40
+
41
+ model_name = click.prompt("Enter model name")
42
+ revision = click.prompt("Enter revision", default="main")
43
+ precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
44
+ model_type = click.prompt("Enter model type", type=click.Choice(model_types))
45
+ weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
46
+ base_model = click.prompt("Enter base model", default="")
47
+ status = click.prompt("Enter status", default="FINISHED")
48
+
49
+ try:
50
+ model_info = api.model_info(repo_id=model_name, revision=revision)
51
+ except Exception as e:
52
+ print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
53
+ return 1
54
+
55
+ model_size = get_model_size(model_info=model_info, precision=precision)
56
+
57
+ try:
58
+ license = model_info.cardData["license"]
59
+ except Exception:
60
+ license = "?"
61
+
62
+ eval_entry = {
63
+ "model": model_name,
64
+ "base_model": base_model,
65
+ "revision": revision,
66
+ "private": False,
67
+ "precision": precision,
68
+ "weight_type": weight_type,
69
+ "status": status,
70
+ "submitted_time": current_time,
71
+ "model_type": model_type,
72
+ "likes": model_info.likes,
73
+ "params": model_size,
74
+ "license": license,
75
+ }
76
+
77
+ user_name = ""
78
+ model_path = model_name
79
+ if "/" in model_name:
80
+ user_name = model_name.split("/")[0]
81
+ model_path = model_name.split("/")[1]
82
+
83
+ pprint.pprint(eval_entry)
84
+
85
+ if click.confirm("Do you want to continue? This request file will be pushed to the hub"):
86
+ click.echo("continuing...")
87
+
88
+ out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
89
+ os.makedirs(out_dir, exist_ok=True)
90
+ out_path = f"{out_dir}/{model_path}_eval_request_{False}_{precision}_{weight_type}.json"
91
+
92
+ with open(out_path, "w") as f:
93
+ f.write(json.dumps(eval_entry))
94
+
95
+ api.upload_file(
96
+ path_or_fileobj=out_path,
97
+ path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1],
98
+ repo_id=QUEUE_REPO,
99
+ repo_type="dataset",
100
+ commit_message=f"Add {model_name} to eval queue",
101
+ )
102
+ else:
103
+ click.echo("aborting...")
104
+
105
+
106
if __name__ == "__main__":
    # main() returns 1 when the model lookup fails; surface that as the
    # process exit status instead of discarding it (None maps to status 0).
    raise SystemExit(main())
scripts/update_request_files.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import glob
4
+ import pprint
5
+ import re
6
+ from datetime import datetime, timezone
7
+
8
+ import click
9
+ from colorama import Fore
10
+ from huggingface_hub import HfApi, snapshot_download
11
+ from huggingface_hub.hf_api import ModelInfo
12
+
13
+ API = HfApi()
14
+
15
+
16
def get_model_size(model_info: "ModelInfo", precision: str):
    """Return the model size in billions of parameters, or 0 when unknown.

    Prefers ``model_info.safetensors["total"]``; when that metadata is missing
    or has no ``"total"`` entry, falls back to a ``<number>b`` / ``<number>m``
    token found in the repository name (the last "/"-separated part of
    ``model_info.modelId``).

    Args:
        model_info: Hub model description exposing ``safetensors`` and ``modelId``.
        precision: precision label; ``"GPTQ"`` multiplies the size by 8.

    Returns:
        The (possibly GPTQ-scaled) size, or 0 if no size could be determined.
    """
    size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
    try:
        model_size = round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError, KeyError):
        # KeyError covers safetensors metadata without a "total" entry.
        try:
            found = re.search(size_pattern, model_info.modelId.split("/")[-1].lower())
            token = found.group(0)  # AttributeError when nothing matched
            number = float(token[:-1])
            # A trailing "m" means millions: convert to billions.
            model_size = round(number if token[-1] == "b" else number / 1e3, 3)
        except AttributeError:
            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py

    repo_name = model_info.modelId.split("/")[-1].lower()
    size_factor = 8 if (precision == "GPTQ" or "gptq" in repo_name) else 1
    return size_factor * model_size
31
+
32
+
33
def update_request_files(requests_path):
    """Refresh every ``<requests_path>/*/*.json`` request file in place.

    Each file is loaded, passed through ``add_model_info`` and rewritten
    (indented JSON) only when the returned content differs from what was read.
    Files are visited in reverse-sorted path order.
    """
    pattern = os.path.join(requests_path, "*/*.json")
    for request_path in sorted(glob.glob(pattern), reverse=True):
        with open(request_path, "r") as src:
            current = json.load(src)
        refreshed = add_model_info(current)

        # Nothing changed: leave the file untouched.
        if refreshed == current:
            continue

        with open(request_path, "w") as dst:
            dst.write(json.dumps(refreshed, indent=4))
49
+
50
def add_model_info(entry):
    """Return a copy of a request entry refreshed with Hub metadata.

    Looks up ``entry["model"]`` at ``entry["revision"]`` and updates the
    ``params``, ``likes`` and (when the model card has one) ``license`` keys.
    The input dict is not modified. If the Hub lookup fails the original
    ``entry`` is returned unchanged.
    """
    model = entry["model"]
    revision = entry["revision"]

    try:
        model_info = API.model_info(repo_id=model, revision=revision)
    except Exception:
        print(f"Could not get model information for {model} revision {revision}")
        return entry

    new_entry = entry.copy()

    # Use the precision recorded in the request so GPTQ entries keep their
    # 8x size factor; entries without one fall back to "float16".
    precision = entry.get("precision", "float16")
    new_entry["params"] = get_model_size(model_info=model_info, precision=precision)

    new_entry["likes"] = model_info.likes

    # Were the model card and license filled?
    try:
        new_entry["license"] = model_info.cardData["license"]
    except Exception:
        print(f"No license for {model} revision {revision}")

    print(json.dumps(new_entry, indent=4))
    return new_entry
77
+
78
+
79
if __name__ == "__main__":
    import sys

    # The default is the original hard-coded location; pass a different
    # requests directory as the first command-line argument to override it.
    default_requests_path = "/Volumes/Data-case-sensitive/requests"
    update_request_files(sys.argv[1] if len(sys.argv) > 1 else default_requests_path)
82
+
src/assets/hardcoded_evals.py DELETED
@@ -1,14 +0,0 @@
1
- from src.display_models.utils import AutoEvalColumn, model_hyperlink
2
-
3
- baseline = {
4
- AutoEvalColumn.model.name: "<p>Baseline</p>",
5
- AutoEvalColumn.revision.name: "N/A",
6
- AutoEvalColumn.precision.name: None,
7
- AutoEvalColumn.average.name: 25.0,
8
- AutoEvalColumn.arc.name: 25.0,
9
- AutoEvalColumn.hellaswag.name: 25.0,
10
- AutoEvalColumn.mmlu.name: 25.0,
11
- AutoEvalColumn.truthfulqa.name: 25.0,
12
- AutoEvalColumn.dummy.name: "baseline",
13
- AutoEvalColumn.model_type.name: "",
14
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/{assets/text_content.py → display/about.py} RENAMED
@@ -1,4 +1,5 @@
1
- from src.display_models.model_metadata_type import ModelType
 
2
 
3
  TITLE = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/header_logo.png" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
4
  BOTTOM_LOGO = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/footer_logo_1.png" style="width:50%;display:block;margin-left:auto;margin-right:auto">"""
@@ -20,7 +21,6 @@ While outstanding LLM models are being released competitively, most of them are
20
 
21
  ## Icons
22
  {ModelType.PT.to_str(" : ")} model
23
- {ModelType.FT.to_str(" : ")} model
24
  {ModelType.IFT.to_str(" : ")} model
25
  {ModelType.RL.to_str(" : ")} model
26
  If there is no icon, it indicates that there is insufficient information about the model.
@@ -52,6 +52,11 @@ GPUs are provided by __[KT](https://cloud.kt.com/)__ for the evaluations.
52
  If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/1)!
53
  """
54
 
 
 
 
 
 
55
  EVALUATION_QUEUE_TEXT = f"""
56
  # Evaluation Queue for the 🚀 Open Ko-LLM Leaderboard
57
  Models added here will be automatically evaluated on the KT GPU cluster.
 
1
+ from src.display.utils import ModelType
2
+
3
 
4
  TITLE = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/header_logo.png" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
5
  BOTTOM_LOGO = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/footer_logo_1.png" style="width:50%;display:block;margin-left:auto;margin-right:auto">"""
 
21
 
22
  ## Icons
23
  {ModelType.PT.to_str(" : ")} model
 
24
  {ModelType.IFT.to_str(" : ")} model
25
  {ModelType.RL.to_str(" : ")} model
26
  If there is no icon, it indicates that there is insufficient information about the model.
 
52
  If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/1)!
53
  """
54
 
55
+
56
+ FAQ_TEXT = """
57
+ """
58
+
59
+
60
  EVALUATION_QUEUE_TEXT = f"""
61
  # Evaluation Queue for the 🚀 Open Ko-LLM Leaderboard
62
  Models added here will be automatically evaluated on the KT GPU cluster.
src/{assets → display}/css_html_js.py RENAMED
@@ -1,5 +1,24 @@
1
  custom_css = """
 
 
 
 
 
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  .markdown-text {
4
  font-size: 16px !important;
5
  }
@@ -21,14 +40,6 @@ custom_css = """
21
  transform: scale(1.3);
22
  }
23
 
24
- #leaderboard-table {
25
- margin-top: 15px
26
- }
27
-
28
- #leaderboard-table-lite {
29
- margin-top: 15px
30
- }
31
-
32
  #search-bar-table-box > div:first-child {
33
  background: none;
34
  border: none;
@@ -38,36 +49,11 @@ custom_css = """
38
  padding: 0px;
39
  }
40
 
41
- /* Hides the final AutoEvalColumn */
42
- #llm-benchmark-tab-table table td:last-child,
43
- #llm-benchmark-tab-table table th:last-child {
44
- display: none;
45
- }
46
-
47
- /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
48
- table td:first-child,
49
- table th:first-child {
50
- max-width: 400px;
51
- overflow: auto;
52
- white-space: nowrap;
53
- }
54
-
55
  .tab-buttons button {
56
  font-size: 20px;
57
  }
58
 
59
- #scale-logo {
60
- border-style: none !important;
61
- box-shadow: none;
62
- display: block;
63
- margin-left: auto;
64
- margin-right: auto;
65
- max-width: 600px;
66
- }
67
-
68
- #scale-logo .download {
69
- display: none;
70
- }
71
  #filter_type{
72
  border: 0;
73
  padding-left: 0;
 
1
  custom_css = """
2
+ /* Hides the final AutoEvalColumn */
3
+ #llm-benchmark-tab-table table td:last-child,
4
+ #llm-benchmark-tab-table table th:last-child {
5
+ display: none;
6
+ }
7
 
8
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
9
+ table td:first-child,
10
+ table th:first-child {
11
+ max-width: 400px;
12
+ overflow: auto;
13
+ white-space: nowrap;
14
+ }
15
+
16
+ /* Full width space */
17
+ .gradio-container {
18
+ max-width: 95%!important;
19
+ }
20
+
21
+ /* Text style and margins */
22
  .markdown-text {
23
  font-size: 16px !important;
24
  }
 
40
  transform: scale(1.3);
41
  }
42
 
 
 
 
 
 
 
 
 
43
  #search-bar-table-box > div:first-child {
44
  background: none;
45
  border: none;
 
49
  padding: 0px;
50
  }
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  .tab-buttons button {
53
  font-size: 20px;
54
  }
55
 
56
+ /* Filters style */
 
 
 
 
 
 
 
 
 
 
 
57
  #filter_type{
58
  border: 0;
59
  padding-left: 0;
src/display/formatting.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime, timezone
3
+
4
+ from huggingface_hub import HfApi
5
+ from huggingface_hub.hf_api import ModelInfo
6
+
7
+
8
+ API = HfApi()
9
+
10
def model_hyperlink(link, model_name):
    """Return an HTML anchor that opens *link* in a new tab, labelled *model_name*."""
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(model_name):
    """Return HTML with two links for a model: its Hub page and its details dataset.

    The details dataset name is ``details_<model_name>`` with every "/" in the
    model name replaced by "__".
    """
    link = f"https://huggingface.co/{model_name}"

    details_model_name = model_name.replace("/", "__")
    details_link = f"https://huggingface.co/datasets/open-ko-llm-leaderboard/details_{details_model_name}"

    # The second link is labelled with the bookmark-tabs emoji.
    return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
21
+
22
+
23
def styled_error(error):
    """Wrap *error* in a centered, red, 20px HTML paragraph."""
    html = f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
    return html
25
+
26
+
27
def styled_warning(warn):
    """Wrap *warn* in a centered, orange, 20px HTML paragraph."""
    html = f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
    return html
29
+
30
+
31
def styled_message(message):
    """Wrap *message* in a centered, green, 20px HTML paragraph."""
    html = f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
    return html
33
+
34
+
35
def has_no_nan_values(df, columns):
    """Return a boolean Series: True for rows with no missing value in *columns*."""
    any_missing = df[columns].isna().any(axis=1)
    return ~any_missing
37
+
38
+
39
def has_nan_values(df, columns):
    """Return a boolean Series: True for rows with at least one missing value in *columns*."""
    all_present = df[columns].notna().all(axis=1)
    return ~all_present
src/display/utils.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, make_dataclass
2
+ from enum import Enum
3
+
4
+ import pandas as pd
5
+
6
def fields(raw_class):
    """Return the values stored in ``raw_class.__dict__``, in definition order.

    Entries whose key starts with "__" or ends with "__" are left out.
    """
    values = []
    for key, value in raw_class.__dict__.items():
        if key.startswith("__") or key.endswith("__"):
            continue
        values.append(value)
    return values
8
+
9
+
10
@dataclass
class Task:
    """One evaluation task of the leaderboard."""

    benchmark: str  # benchmark identifier
    metric: str  # metric identifier for this benchmark
    col_name: str  # label used for the task's leaderboard column


class Tasks(Enum):
    """The evaluated tasks; each member's value is its ``Task`` description."""

    arc = Task(benchmark="ko_arc_challenge", metric="acc_norm", col_name="Ko-ARC")
    hellaswag = Task(benchmark="ko_hellaswag", metric="acc_norm", col_name="Ko-HellaSwag")
    mmlu = Task(benchmark="ko_mmlu", metric="acc", col_name="Ko-MMLU")
    truthfulqa = Task(benchmark="ko_truthfulqa_mc", metric="mc2", col_name="Ko-TruthfulQA")
    commongen_v2 = Task(benchmark="ko_commongen_v2", metric="acc_norm", col_name="Ko-CommonGen V2")
22
+
23
# These classes hold the user-facing column names, so that a rename only has
# to be made here instead of everywhere the column is used.
# Frozen (hence hashable): instances are used as default values of the
# dynamically built dataclass, and Python 3.11+ rejects unhashable defaults
# with "mutable default ... is not allowed".
@dataclass(frozen=True)
class ColumnContent:
    """Description of one displayed column."""

    name: str  # column label
    type: str  # column datatype label, e.g. "str", "number", "markdown", "bool"
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False
34
+
35
# Each item is [attribute name, type, default ColumnContent], the field spec
# format expected by `make_dataclass` below.
auto_eval_column_dict = []
# Init
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
# Scores
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
for task in Tasks:
    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
# Model information
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, hidden=True)])
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
auto_eval_column_dict.append(["merged", ColumnContent, ColumnContent("Merged", "bool", False)])
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, hidden=False)])
auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=False)])
# Dummy column for the search bar (hidden by the custom CSS)
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])

# We use make dataclass to dynamically fill the scores from Tasks
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
60
+
61
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    """Columns of the evaluation-queue tables.

    The attributes carry no annotations, so they are plain class attributes
    rather than dataclass fields.
    """

    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    precision = ColumnContent("precision", "str", True)
    # `displayed_by_default` is a bool; the original passed the string "Original" here.
    weight_type = ColumnContent("weight_type", "str", True)
    status = ColumnContent("status", "str", True)
69
+
70
+ # Define the human baselines
71
+ human_baseline_row = {
72
+ AutoEvalColumn.model.name: "<p>Human performance</p>",
73
+ }
74
+
75
@dataclass
class ModelDetails:
    """Display details shared by the model-related enums below."""

    name: str
    symbol: str = ""  # emoji, only for the model type


class ModelType(Enum):
    """Training category of a model, with its display symbol."""

    PT = ModelDetails(name="pretrained", symbol="🟢")
    # FT = ModelDetails(name="fine-tuned", symbol="🔶")
    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
    RL = ModelDetails(name="RL-tuned", symbol="🟦")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        """Return ``<symbol><separator><name>`` for this member."""
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_str(type):
        """Map a string containing a type name or symbol to a member (Unknown if none match)."""
        # if "fine-tuned" in type or "🔶" in type:
        #     return ModelType.FT
        if "pretrained" in type or "🟢" in type:
            return ModelType.PT
        if "RL-tuned" in type or "🟦" in type:
            return ModelType.RL
        if "instruction-tuned" in type or "⭕" in type:
            return ModelType.IFT
        return ModelType.Unknown


class WeightType(Enum):
    """Kind of weights a submission provides."""

    Adapter = ModelDetails("Adapter")
    Original = ModelDetails("Original")
    Delta = ModelDetails("Delta")


class Precision(Enum):
    """Supported evaluation precisions."""

    float16 = ModelDetails("float16")
    # bfloat16 = ModelDetails("bfloat16")
    # qt_8bit = ModelDetails("8bit")
    # qt_4bit = ModelDetails("4bit")
    # qt_GPTQ = ModelDetails("GPTQ")
    Unknown = ModelDetails("?")

    @staticmethod
    def from_str(precision):
        """Map a precision string to a member.

        Only float16 is currently enabled. The bfloat16 / 8bit / 4bit / GPTQ
        members are commented out above, so those strings map to ``Unknown``;
        returning the disabled members would raise ``AttributeError``.
        """
        if precision in ["torch.float16", "float16"]:
            return Precision.float16
        return Precision.Unknown
128
+
129
+
130
+
131
+
132
+ # Column selection
133
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
134
+ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
135
+ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
136
+ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
137
+
138
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
139
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
140
+
141
+ BENCHMARK_COLS = [t.value.col_name for t in Tasks]
142
+
143
+ NUMERIC_INTERVALS = {
144
+ "Unknown": pd.Interval(-1, 0, closed="right"),
145
+ "0~3B": pd.Interval(0, 3, closed="right"),
146
+ "3~7B": pd.Interval(3, 7.3, closed="right"),
147
+ "7~13B": pd.Interval(7.3, 13, closed="right"),
148
+ "13~35B": pd.Interval(13, 35, closed="right"),
149
+ "35~60B": pd.Interval(35, 60, closed="right"),
150
+ "60B+": pd.Interval(60, 10000, closed="right"),
151
+ }
src/display_models/get_model_metadata.py DELETED
@@ -1,167 +0,0 @@
1
- import glob
2
- import json
3
- import os
4
- import re
5
- import pickle
6
- from typing import List
7
-
8
- import huggingface_hub
9
- from huggingface_hub import HfApi
10
- from tqdm import tqdm
11
- from transformers import AutoModel, AutoConfig
12
- from accelerate import init_empty_weights
13
-
14
- from src.display_models.model_metadata_flags import DO_NOT_SUBMIT_MODELS, FLAGGED_MODELS
15
- from src.display_models.model_metadata_type import MODEL_TYPE_METADATA, ModelType, model_type_from_str
16
- from src.display_models.utils import AutoEvalColumn, model_hyperlink
17
-
18
- api = HfApi(token=os.environ.get("H4_TOKEN", None))
19
-
20
-
21
- def get_model_infos_from_hub(leaderboard_data: List[dict]):
22
- # load cache from disk
23
- try:
24
- with open("model_info_cache.pkl", "rb") as f:
25
- model_info_cache = pickle.load(f)
26
- except (EOFError, FileNotFoundError):
27
- model_info_cache = {}
28
- try:
29
- with open("model_size_cache.pkl", "rb") as f:
30
- model_size_cache = pickle.load(f)
31
- except (EOFError, FileNotFoundError):
32
- model_size_cache = {}
33
-
34
- for model_data in tqdm(leaderboard_data):
35
- model_name = model_data["model_name_for_query"]
36
-
37
- if model_name in model_info_cache:
38
- model_info = model_info_cache[model_name]
39
- else:
40
- try:
41
- model_info = api.model_info(model_name)
42
- model_info_cache[model_name] = model_info
43
- except huggingface_hub.utils._errors.RepositoryNotFoundError:
44
- print("Repo not found!", model_name)
45
- model_data[AutoEvalColumn.license.name] = None
46
- model_data[AutoEvalColumn.likes.name] = None
47
- if model_name not in model_size_cache:
48
- model_size_cache[model_name] = get_model_size(model_name, None)
49
- model_data[AutoEvalColumn.params.name] = model_size_cache[model_name]
50
-
51
- model_data[AutoEvalColumn.license.name] = get_model_license(model_info)
52
- model_data[AutoEvalColumn.likes.name] = get_model_likes(model_info)
53
- if model_name not in model_size_cache:
54
- model_size_cache[model_name] = get_model_size(model_name, model_info)
55
- model_data[AutoEvalColumn.params.name] = model_size_cache[model_name]
56
-
57
- # save cache to disk in pickle format
58
- with open("model_info_cache.pkl", "wb") as f:
59
- pickle.dump(model_info_cache, f)
60
- with open("model_size_cache.pkl", "wb") as f:
61
- pickle.dump(model_size_cache, f)
62
-
63
-
64
- def get_model_license(model_info):
65
- try:
66
- return model_info.cardData["license"]
67
- except Exception:
68
- return "?"
69
-
70
-
71
- def get_model_likes(model_info):
72
- return model_info.likes
73
-
74
-
75
- size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
76
-
77
-
78
- def get_model_size(model_name, model_info):
79
- # In billions
80
- try:
81
- return round(model_info.safetensors["total"] / 1e9, 3)
82
- except AttributeError:
83
- try:
84
- config = AutoConfig.from_pretrained(model_name, trust_remote_code=False)
85
- with init_empty_weights():
86
- model = AutoModel.from_config(config, trust_remote_code=False)
87
- return round(sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e9, 3)
88
- except (EnvironmentError, ValueError): # model config not found, likely private
89
- try:
90
- size_match = re.search(size_pattern, model_name.lower())
91
- size = size_match.group(0)
92
- return round(float(size[:-1]) if size[-1] == "b" else float(size[:-1]) / 1e3, 3)
93
- except AttributeError:
94
- return 0
95
-
96
-
97
- def get_model_type(leaderboard_data: List[dict]):
98
- for model_data in leaderboard_data:
99
- request_files = os.path.join(
100
- "eval-queue",
101
- model_data["model_name_for_query"] + "_eval_request_*" + ".json",
102
- )
103
- request_files = glob.glob(request_files)
104
-
105
- # Select correct request file (precision)
106
- request_file = ""
107
- if len(request_files) == 1:
108
- request_file = request_files[0]
109
- elif len(request_files) > 1:
110
- request_files = sorted(request_files, reverse=True)
111
- for tmp_request_file in request_files:
112
- with open(tmp_request_file, "r") as f:
113
- req_content = json.load(f)
114
- if (
115
- req_content["status"] == "FINISHED"
116
- and req_content["precision"] == model_data["Precision"].split(".")[-1]
117
- ):
118
- request_file = tmp_request_file
119
-
120
- try:
121
- with open(request_file, "r") as f:
122
- request = json.load(f)
123
- model_type = model_type_from_str(request["model_type"])
124
- model_data[AutoEvalColumn.model_type.name] = model_type.value.name
125
- model_data[AutoEvalColumn.model_type_symbol.name] = model_type.value.symbol # + ("πŸ”Ί" if is_delta else "")
126
- except Exception:
127
- if model_data["model_name_for_query"] in MODEL_TYPE_METADATA:
128
- model_data[AutoEvalColumn.model_type.name] = MODEL_TYPE_METADATA[
129
- model_data["model_name_for_query"]
130
- ].value.name
131
- model_data[AutoEvalColumn.model_type_symbol.name] = MODEL_TYPE_METADATA[
132
- model_data["model_name_for_query"]
133
- ].value.symbol # + ("πŸ”Ί" if is_delta else "")
134
- else:
135
- model_data[AutoEvalColumn.model_type.name] = ModelType.Unknown.value.name
136
- model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.Unknown.value.symbol
137
-
138
-
139
- def flag_models(leaderboard_data: List[dict]):
140
- for model_data in leaderboard_data:
141
- if model_data["model_name_for_query"] in FLAGGED_MODELS:
142
- issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1]
143
- issue_link = model_hyperlink(
144
- FLAGGED_MODELS[model_data["model_name_for_query"]],
145
- f"See discussion #{issue_num}",
146
- )
147
- model_data[
148
- AutoEvalColumn.model.name
149
- ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
150
-
151
-
152
- def remove_forbidden_models(leaderboard_data: List[dict]):
153
- indices_to_remove = []
154
- for ix, model in enumerate(leaderboard_data):
155
- if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
156
- indices_to_remove.append(ix)
157
-
158
- for ix in reversed(indices_to_remove):
159
- leaderboard_data.pop(ix)
160
- return leaderboard_data
161
-
162
-
163
- def apply_metadata(leaderboard_data: List[dict]):
164
- leaderboard_data = remove_forbidden_models(leaderboard_data)
165
- get_model_type(leaderboard_data)
166
- get_model_infos_from_hub(leaderboard_data)
167
- flag_models(leaderboard_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/display_models/model_metadata_flags.py DELETED
@@ -1,8 +0,0 @@
1
- # Models which have been flagged by users as being problematic for a reason or another
2
- # (Model name to forum discussion link)
3
- FLAGGED_MODELS = {
4
- }
5
-
6
- # Models which have been requested by orgs to not be submitted on the leaderboard
7
- DO_NOT_SUBMIT_MODELS = [
8
- ]
 
 
 
 
 
 
 
 
 
src/display_models/model_metadata_type.py DELETED
@@ -1,553 +0,0 @@
1
- from dataclasses import dataclass
2
- from enum import Enum
3
- from typing import Dict
4
-
5
-
6
- @dataclass
7
- class ModelInfo:
8
- name: str
9
- symbol: str # emoji
10
-
11
-
12
- class ModelType(Enum):
13
- PT = ModelInfo(name="pretrained", symbol="🟢")
14
- FT = ModelInfo(name="fine-tuned", symbol="🔶")
15
- IFT = ModelInfo(name="instruction-tuned", symbol="⭕")
16
- RL = ModelInfo(name="RL-tuned", symbol="🟦")
17
- Unknown = ModelInfo(name="Unknown, add type to request file!", symbol="?")
18
-
19
- def to_str(self, separator=" "):
20
- return f"{self.value.symbol}{separator}{self.value.name}"
21
-
22
-
23
- MODEL_TYPE_METADATA: Dict[str, ModelType] = {
24
- "tiiuae/falcon-180B": ModelType.PT,
25
- "Qwen/Qwen-7B": ModelType.PT,
26
- "Qwen/Qwen-7B-Chat": ModelType.RL,
27
- "notstoic/PygmalionCoT-7b": ModelType.IFT,
28
- "aisquared/dlite-v1-355m": ModelType.IFT,
29
- "aisquared/dlite-v1-1_5b": ModelType.IFT,
30
- "aisquared/dlite-v1-774m": ModelType.IFT,
31
- "aisquared/dlite-v1-124m": ModelType.IFT,
32
- "aisquared/chopt-2_7b": ModelType.IFT,
33
- "aisquared/dlite-v2-124m": ModelType.IFT,
34
- "aisquared/dlite-v2-774m": ModelType.IFT,
35
- "aisquared/dlite-v2-1_5b": ModelType.IFT,
36
- "aisquared/chopt-1_3b": ModelType.IFT,
37
- "aisquared/dlite-v2-355m": ModelType.IFT,
38
- "augtoma/qCammel-13": ModelType.IFT,
39
- "Aspik101/Llama-2-7b-hf-instruct-pl-lora_unload": ModelType.IFT,
40
- "Aspik101/vicuna-7b-v1.3-instruct-pl-lora_unload": ModelType.IFT,
41
- "TheBloke/alpaca-lora-65B-HF": ModelType.FT,
42
- "TheBloke/tulu-7B-fp16": ModelType.IFT,
43
- "TheBloke/guanaco-7B-HF": ModelType.FT,
44
- "TheBloke/koala-7B-HF": ModelType.FT,
45
- "TheBloke/wizardLM-7B-HF": ModelType.IFT,
46
- "TheBloke/airoboros-13B-HF": ModelType.IFT,
47
- "TheBloke/koala-13B-HF": ModelType.FT,
48
- "TheBloke/Wizard-Vicuna-7B-Uncensored-HF": ModelType.FT,
49
- "TheBloke/dromedary-65b-lora-HF": ModelType.IFT,
50
- "TheBloke/wizardLM-13B-1.0-fp16": ModelType.IFT,
51
- "TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16": ModelType.FT,
52
- "TheBloke/Wizard-Vicuna-30B-Uncensored-fp16": ModelType.FT,
53
- "TheBloke/wizard-vicuna-13B-HF": ModelType.IFT,
54
- "TheBloke/UltraLM-13B-fp16": ModelType.IFT,
55
- "TheBloke/OpenAssistant-FT-7-Llama-30B-HF": ModelType.FT,
56
- "TheBloke/vicuna-13B-1.1-HF": ModelType.IFT,
57
- "TheBloke/guanaco-13B-HF": ModelType.FT,
58
- "TheBloke/guanaco-65B-HF": ModelType.FT,
59
- "TheBloke/airoboros-7b-gpt4-fp16": ModelType.IFT,
60
- "TheBloke/llama-30b-supercot-SuperHOT-8K-fp16": ModelType.IFT,
61
- "TheBloke/Llama-2-13B-fp16": ModelType.PT,
62
- "TheBloke/llama-2-70b-Guanaco-QLoRA-fp16": ModelType.FT,
63
- "TheBloke/landmark-attention-llama7b-fp16": ModelType.IFT,
64
- "TheBloke/Planner-7B-fp16": ModelType.IFT,
65
- "TheBloke/Wizard-Vicuna-13B-Uncensored-HF": ModelType.FT,
66
- "TheBloke/gpt4-alpaca-lora-13B-HF": ModelType.IFT,
67
- "TheBloke/gpt4-x-vicuna-13B-HF": ModelType.IFT,
68
- "TheBloke/gpt4-alpaca-lora_mlp-65B-HF": ModelType.IFT,
69
- "TheBloke/tulu-13B-fp16": ModelType.IFT,
70
- "TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16": ModelType.IFT,
71
- "TheBloke/Llama-2-70B-fp16": ModelType.IFT,
72
- "TheBloke/WizardLM-30B-fp16": ModelType.IFT,
73
- "TheBloke/robin-13B-v2-fp16": ModelType.FT,
74
- "TheBloke/robin-33B-v2-fp16": ModelType.FT,
75
- "TheBloke/Vicuna-13B-CoT-fp16": ModelType.IFT,
76
- "TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16": ModelType.IFT,
77
- "TheBloke/Wizard-Vicuna-30B-Superhot-8K-fp16": ModelType.FT,
78
- "TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16": ModelType.IFT,
79
- "TheBloke/GPlatty-30B-SuperHOT-8K-fp16": ModelType.FT,
80
- "TheBloke/CAMEL-33B-Combined-Data-SuperHOT-8K-fp16": ModelType.IFT,
81
- "TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16": ModelType.IFT,
82
- "jphme/orca_mini_v2_ger_7b": ModelType.IFT,
83
- "Ejafa/vicuna_7B_vanilla_1.1": ModelType.FT,
84
- "kevinpro/Vicuna-13B-CoT": ModelType.IFT,
85
- "AlekseyKorshuk/pygmalion-6b-vicuna-chatml": ModelType.FT,
86
- "AlekseyKorshuk/chatml-pyg-v1": ModelType.FT,
87
- "concedo/Vicuzard-30B-Uncensored": ModelType.FT,
88
- "concedo/OPT-19M-ChatSalad": ModelType.FT,
89
- "concedo/Pythia-70M-ChatSalad": ModelType.FT,
90
- "digitous/13B-HyperMantis": ModelType.IFT,
91
- "digitous/Adventien-GPTJ": ModelType.FT,
92
- "digitous/Alpacino13b": ModelType.IFT,
93
- "digitous/GPT-R": ModelType.IFT,
94
- "digitous/Javelin-R": ModelType.IFT,
95
- "digitous/Javalion-GPTJ": ModelType.IFT,
96
- "digitous/Javalion-R": ModelType.IFT,
97
- "digitous/Skegma-GPTJ": ModelType.FT,
98
- "digitous/Alpacino30b": ModelType.IFT,
99
- "digitous/Janin-GPTJ": ModelType.FT,
100
- "digitous/Janin-R": ModelType.FT,
101
- "digitous/Javelin-GPTJ": ModelType.FT,
102
- "SaylorTwift/gpt2_test": ModelType.PT,
103
- "anton-l/gpt-j-tiny-random": ModelType.FT,
104
- "Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca": ModelType.FT,
105
- "Lazycuber/pyg-instruct-wizardlm": ModelType.FT,
106
- "Lazycuber/Janemalion-6B": ModelType.FT,
107
- "IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1": ModelType.FT,
108
- "IDEA-CCNL/Ziya-LLaMA-13B-v1": ModelType.IFT,
109
- "dsvv-cair/alpaca-cleaned-llama-30b-bf16": ModelType.FT,
110
- "gpt2-medium": ModelType.PT,
111
- "camel-ai/CAMEL-13B-Combined-Data": ModelType.IFT,
112
- "camel-ai/CAMEL-13B-Role-Playing-Data": ModelType.FT,
113
- "camel-ai/CAMEL-33B-Combined-Data": ModelType.IFT,
114
- "PygmalionAI/pygmalion-6b": ModelType.FT,
115
- "PygmalionAI/metharme-1.3b": ModelType.IFT,
116
- "PygmalionAI/pygmalion-1.3b": ModelType.FT,
117
- "PygmalionAI/pygmalion-350m": ModelType.FT,
118
- "PygmalionAI/pygmalion-2.7b": ModelType.FT,
119
- "medalpaca/medalpaca-7b": ModelType.FT,
120
- "lilloukas/Platypus-30B": ModelType.IFT,
121
- "lilloukas/GPlatty-30B": ModelType.FT,
122
- "mncai/chatdoctor": ModelType.FT,
123
- "chaoyi-wu/MedLLaMA_13B": ModelType.FT,
124
- "LoupGarou/WizardCoder-Guanaco-15B-V1.0": ModelType.IFT,
125
- "LoupGarou/WizardCoder-Guanaco-15B-V1.1": ModelType.FT,
126
- "hakurei/instruct-12b": ModelType.IFT,
127
- "hakurei/lotus-12B": ModelType.FT,
128
- "shibing624/chinese-llama-plus-13b-hf": ModelType.IFT,
129
- "shibing624/chinese-alpaca-plus-7b-hf": ModelType.IFT,
130
- "shibing624/chinese-alpaca-plus-13b-hf": ModelType.IFT,
131
- "mosaicml/mpt-7b-instruct": ModelType.IFT,
132
- "mosaicml/mpt-30b-chat": ModelType.IFT,
133
- "mosaicml/mpt-7b-storywriter": ModelType.FT,
134
- "mosaicml/mpt-30b-instruct": ModelType.IFT,
135
- "mosaicml/mpt-7b-chat": ModelType.IFT,
136
- "mosaicml/mpt-30b": ModelType.PT,
137
- "Corianas/111m": ModelType.IFT,
138
- "Corianas/Quokka_1.3b": ModelType.IFT,
139
- "Corianas/256_5epoch": ModelType.FT,
140
- "Corianas/Quokka_256m": ModelType.IFT,
141
- "Corianas/Quokka_590m": ModelType.IFT,
142
- "Corianas/gpt-j-6B-Dolly": ModelType.FT,
143
- "Corianas/Quokka_2.7b": ModelType.IFT,
144
- "cyberagent/open-calm-7b": ModelType.FT,
145
- "Aspik101/Nous-Hermes-13b-pl-lora_unload": ModelType.IFT,
146
- "THUDM/chatglm2-6b": ModelType.IFT,
147
- "MetaIX/GPT4-X-Alpasta-30b": ModelType.IFT,
148
- "NYTK/PULI-GPTrio": ModelType.PT,
149
- "EleutherAI/pythia-1.3b": ModelType.PT,
150
- "EleutherAI/pythia-2.8b-deduped": ModelType.PT,
151
- "EleutherAI/gpt-neo-125m": ModelType.PT,
152
- "EleutherAI/pythia-160m": ModelType.PT,
153
- "EleutherAI/gpt-neo-2.7B": ModelType.PT,
154
- "EleutherAI/pythia-1b-deduped": ModelType.PT,
155
- "EleutherAI/pythia-6.7b": ModelType.PT,
156
- "EleutherAI/pythia-70m-deduped": ModelType.PT,
157
- "EleutherAI/gpt-neox-20b": ModelType.PT,
158
- "EleutherAI/pythia-1.4b-deduped": ModelType.PT,
159
- "EleutherAI/pythia-2.7b": ModelType.PT,
160
- "EleutherAI/pythia-6.9b-deduped": ModelType.PT,
161
- "EleutherAI/pythia-70m": ModelType.PT,
162
- "EleutherAI/gpt-j-6b": ModelType.PT,
163
- "EleutherAI/pythia-12b-deduped": ModelType.PT,
164
- "EleutherAI/gpt-neo-1.3B": ModelType.PT,
165
- "EleutherAI/pythia-410m-deduped": ModelType.PT,
166
- "EleutherAI/pythia-160m-deduped": ModelType.PT,
167
- "EleutherAI/polyglot-ko-12.8b": ModelType.PT,
168
- "EleutherAI/pythia-12b": ModelType.PT,
169
- "roneneldan/TinyStories-33M": ModelType.PT,
170
- "roneneldan/TinyStories-28M": ModelType.PT,
171
- "roneneldan/TinyStories-1M": ModelType.PT,
172
- "roneneldan/TinyStories-8M": ModelType.PT,
173
- "roneneldan/TinyStories-3M": ModelType.PT,
174
- "jerryjalapeno/nart-100k-7b": ModelType.FT,
175
- "lmsys/vicuna-13b-v1.3": ModelType.IFT,
176
- "lmsys/vicuna-7b-v1.3": ModelType.IFT,
177
- "lmsys/vicuna-13b-v1.1": ModelType.IFT,
178
- "lmsys/vicuna-13b-delta-v1.1": ModelType.IFT,
179
- "lmsys/vicuna-7b-delta-v1.1": ModelType.IFT,
180
- "abhiramtirumala/DialoGPT-sarcastic-medium": ModelType.FT,
181
- "haonan-li/bactrian-x-llama-13b-merged": ModelType.IFT,
182
- "Gryphe/MythoLogic-13b": ModelType.IFT,
183
- "Gryphe/MythoBoros-13b": ModelType.IFT,
184
- "pillowtalks-ai/delta13b": ModelType.FT,
185
- "wannaphong/openthaigpt-0.1.0-beta-full-model_for_open_llm_leaderboard": ModelType.FT,
186
- "bigscience/bloom-7b1": ModelType.PT,
187
- "bigcode/tiny_starcoder_py": ModelType.PT,
188
- "bigcode/starcoderplus": ModelType.FT,
189
- "bigcode/gpt_bigcode-santacoder": ModelType.PT,
190
- "bigcode/starcoder": ModelType.PT,
191
- "Open-Orca/OpenOrca-Preview1-13B": ModelType.IFT,
192
- "microsoft/DialoGPT-large": ModelType.FT,
193
- "microsoft/DialoGPT-small": ModelType.FT,
194
- "microsoft/DialoGPT-medium": ModelType.FT,
195
- "microsoft/CodeGPT-small-py": ModelType.FT,
196
- "Tincando/fiction_story_generator": ModelType.FT,
197
- "Pirr/pythia-13b-deduped-green_devil": ModelType.FT,
198
- "Aeala/GPT4-x-AlpacaDente2-30b": ModelType.FT,
199
- "Aeala/GPT4-x-AlpacaDente-30b": ModelType.FT,
200
- "Aeala/GPT4-x-Alpasta-13b": ModelType.FT,
201
- "Aeala/VicUnlocked-alpaca-30b": ModelType.IFT,
202
- "Tap-M/Luna-AI-Llama2-Uncensored": ModelType.FT,
203
- "illuin/test-custom-llama": ModelType.FT,
204
- "dvruette/oasst-llama-13b-2-epochs": ModelType.FT,
205
- "dvruette/oasst-gpt-neox-20b-1000-steps": ModelType.FT,
206
- "dvruette/llama-13b-pretrained-dropout": ModelType.PT,
207
- "dvruette/llama-13b-pretrained": ModelType.PT,
208
- "dvruette/llama-13b-pretrained-sft-epoch-1": ModelType.FT,
209
- "dvruette/llama-13b-pretrained-sft-do2": ModelType.FT,
210
- "dvruette/oasst-gpt-neox-20b-3000-steps": ModelType.FT,
211
- "dvruette/oasst-pythia-12b-pretrained-sft": ModelType.FT,
212
- "dvruette/oasst-pythia-6.9b-4000-steps": ModelType.FT,
213
- "dvruette/gpt-neox-20b-full-precision": ModelType.FT,
214
- "dvruette/oasst-llama-13b-1000-steps": ModelType.FT,
215
- "openlm-research/open_llama_7b_700bt_preview": ModelType.PT,
216
- "openlm-research/open_llama_7b": ModelType.PT,
217
- "openlm-research/open_llama_7b_v2": ModelType.PT,
218
- "openlm-research/open_llama_3b": ModelType.PT,
219
- "openlm-research/open_llama_13b": ModelType.PT,
220
- "openlm-research/open_llama_3b_v2": ModelType.PT,
221
- "PocketDoc/Dans-PileOfSets-Mk1-llama-13b-merged": ModelType.IFT,
222
- "GeorgiaTechResearchInstitute/galpaca-30b": ModelType.IFT,
223
- "GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct": ModelType.IFT,
224
- "databricks/dolly-v2-7b": ModelType.IFT,
225
- "databricks/dolly-v2-3b": ModelType.IFT,
226
- "databricks/dolly-v2-12b": ModelType.IFT,
227
- "Rachneet/gpt2-xl-alpaca": ModelType.FT,
228
- "Locutusque/gpt2-conversational-or-qa": ModelType.FT,
229
- "psyche/kogpt": ModelType.FT,
230
- "NbAiLab/nb-gpt-j-6B-alpaca": ModelType.IFT,
231
- "Mikael110/llama-2-7b-guanaco-fp16": ModelType.FT,
232
- "Mikael110/llama-2-13b-guanaco-fp16": ModelType.FT,
233
- "Fredithefish/CrimsonPajama": ModelType.IFT,
234
- "Fredithefish/RedPajama-INCITE-Chat-3B-ShareGPT-11K": ModelType.FT,
235
- "Fredithefish/ScarletPajama-3B-HF": ModelType.FT,
236
- "Fredithefish/RedPajama-INCITE-Chat-3B-Instruction-Tuning-with-GPT-4": ModelType.IFT,
237
- "acrastt/RedPajama-INCITE-Chat-Instruct-3B-V1": ModelType.IFT,
238
- "eachadea/vicuna-13b-1.1": ModelType.FT,
239
- "eachadea/vicuna-7b-1.1": ModelType.FT,
240
- "eachadea/vicuna-13b": ModelType.FT,
241
- "openaccess-ai-collective/wizard-mega-13b": ModelType.IFT,
242
- "openaccess-ai-collective/manticore-13b": ModelType.IFT,
243
- "openaccess-ai-collective/manticore-30b-chat-pyg-alpha": ModelType.IFT,
244
- "openaccess-ai-collective/minotaur-13b": ModelType.IFT,
245
- "openaccess-ai-collective/minotaur-13b-fixed": ModelType.IFT,
246
- "openaccess-ai-collective/hippogriff-30b-chat": ModelType.IFT,
247
- "openaccess-ai-collective/manticore-13b-chat-pyg": ModelType.IFT,
248
- "pythainlp/wangchanglm-7.5B-sft-enth": ModelType.IFT,
249
- "pythainlp/wangchanglm-7.5B-sft-en-sharded": ModelType.IFT,
250
- "euclaise/gpt-neox-122m-minipile-digits": ModelType.FT,
251
- "stabilityai/StableBeluga1-Delta": ModelType.IFT,
252
- "stabilityai/stablelm-tuned-alpha-7b": ModelType.IFT,
253
- "stabilityai/StableBeluga2": ModelType.IFT,
254
- "stabilityai/StableBeluga-13B": ModelType.IFT,
255
- "stabilityai/StableBeluga-7B": ModelType.IFT,
256
- "stabilityai/stablelm-base-alpha-7b": ModelType.PT,
257
- "stabilityai/stablelm-base-alpha-3b": ModelType.PT,
258
- "stabilityai/stablelm-tuned-alpha-3b": ModelType.IFT,
259
- "alibidaran/medical_transcription_generator": ModelType.FT,
260
- "CalderaAI/30B-Lazarus": ModelType.IFT,
261
- "CalderaAI/13B-BlueMethod": ModelType.IFT,
262
- "CalderaAI/13B-Ouroboros": ModelType.IFT,
263
- "KoboldAI/OPT-13B-Erebus": ModelType.FT,
264
- "KoboldAI/GPT-J-6B-Janeway": ModelType.FT,
265
- "KoboldAI/GPT-J-6B-Shinen": ModelType.FT,
266
- "KoboldAI/fairseq-dense-2.7B": ModelType.PT,
267
- "KoboldAI/OPT-6B-nerys-v2": ModelType.FT,
268
- "KoboldAI/GPT-NeoX-20B-Skein": ModelType.FT,
269
- "KoboldAI/PPO_Pygway-6b-Mix": ModelType.FT,
270
- "KoboldAI/fairseq-dense-6.7B": ModelType.PT,
271
- "KoboldAI/fairseq-dense-125M": ModelType.PT,
272
- "KoboldAI/OPT-13B-Nerybus-Mix": ModelType.FT,
273
- "KoboldAI/OPT-2.7B-Erebus": ModelType.FT,
274
- "KoboldAI/OPT-350M-Nerys-v2": ModelType.FT,
275
- "KoboldAI/OPT-2.7B-Nerys-v2": ModelType.FT,
276
- "KoboldAI/OPT-2.7B-Nerybus-Mix": ModelType.FT,
277
- "KoboldAI/OPT-13B-Nerys-v2": ModelType.FT,
278
- "KoboldAI/GPT-NeoX-20B-Erebus": ModelType.FT,
279
- "KoboldAI/OPT-6.7B-Erebus": ModelType.FT,
280
- "KoboldAI/fairseq-dense-355M": ModelType.PT,
281
- "KoboldAI/OPT-6.7B-Nerybus-Mix": ModelType.FT,
282
- "KoboldAI/GPT-J-6B-Adventure": ModelType.FT,
283
- "KoboldAI/OPT-350M-Erebus": ModelType.FT,
284
- "KoboldAI/GPT-J-6B-Skein": ModelType.FT,
285
- "KoboldAI/OPT-30B-Erebus": ModelType.FT,
286
- "klosax/pythia-160m-deduped-step92k-193bt": ModelType.PT,
287
- "klosax/open_llama_3b_350bt_preview": ModelType.PT,
288
- "klosax/openllama-3b-350bt": ModelType.PT,
289
- "klosax/pythia-70m-deduped-step44k-92bt": ModelType.PT,
290
- "klosax/open_llama_13b_600bt_preview": ModelType.PT,
291
- "klosax/open_llama_7b_400bt_preview": ModelType.PT,
292
- "kfkas/Llama-2-ko-7b-Chat": ModelType.IFT,
293
- "WeOpenML/Alpaca-7B-v1": ModelType.IFT,
294
- "WeOpenML/PandaLM-Alpaca-7B-v1": ModelType.IFT,
295
- "TFLai/gpt2-turkish-uncased": ModelType.FT,
296
- "ehartford/WizardLM-13B-Uncensored": ModelType.IFT,
297
- "ehartford/dolphin-llama-13b": ModelType.IFT,
298
- "ehartford/Wizard-Vicuna-30B-Uncensored": ModelType.FT,
299
- "ehartford/WizardLM-30B-Uncensored": ModelType.IFT,
300
- "ehartford/Wizard-Vicuna-13B-Uncensored": ModelType.FT,
301
- "ehartford/WizardLM-7B-Uncensored": ModelType.IFT,
302
- "ehartford/based-30b": ModelType.FT,
303
- "ehartford/Wizard-Vicuna-7B-Uncensored": ModelType.FT,
304
- "wahaha1987/llama_7b_sharegpt94k_fastchat": ModelType.FT,
305
- "wahaha1987/llama_13b_sharegpt94k_fastchat": ModelType.FT,
306
- "OpenAssistant/oasst-sft-1-pythia-12b": ModelType.FT,
307
- "OpenAssistant/stablelm-7b-sft-v7-epoch-3": ModelType.IFT,
308
- "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5": ModelType.FT,
309
- "OpenAssistant/pythia-12b-sft-v8-2.5k-steps": ModelType.IFT,
310
- "OpenAssistant/pythia-12b-sft-v8-7k-steps": ModelType.IFT,
311
- "OpenAssistant/pythia-12b-pre-v8-12.5k-steps": ModelType.IFT,
312
- "OpenAssistant/llama2-13b-orca-8k-3319": ModelType.IFT,
313
- "junelee/wizard-vicuna-13b": ModelType.FT,
314
- "BreadAi/gpt-YA-1-1_160M": ModelType.PT,
315
- "BreadAi/MuseCan": ModelType.PT,
316
- "BreadAi/MusePy-1-2": ModelType.PT,
317
- "BreadAi/DiscordPy": ModelType.PT,
318
- "BreadAi/PM_modelV2": ModelType.PT,
319
- "BreadAi/gpt-Youtube": ModelType.PT,
320
- "BreadAi/StoryPy": ModelType.FT,
321
- "julianweng/Llama-2-7b-chat-orcah": ModelType.FT,
322
- "AGI-inc/lora_moe_7b_baseline": ModelType.FT,
323
- "AGI-inc/lora_moe_7b": ModelType.FT,
324
- "togethercomputer/GPT-NeoXT-Chat-Base-20B": ModelType.IFT,
325
- "togethercomputer/RedPajama-INCITE-Chat-7B-v0.1": ModelType.IFT,
326
- "togethercomputer/RedPajama-INCITE-Instruct-7B-v0.1": ModelType.IFT,
327
- "togethercomputer/RedPajama-INCITE-7B-Base": ModelType.PT,
328
- "togethercomputer/RedPajama-INCITE-7B-Instruct": ModelType.IFT,
329
- "togethercomputer/RedPajama-INCITE-Base-3B-v1": ModelType.PT,
330
- "togethercomputer/Pythia-Chat-Base-7B": ModelType.IFT,
331
- "togethercomputer/RedPajama-INCITE-Base-7B-v0.1": ModelType.PT,
332
- "togethercomputer/GPT-JT-6B-v1": ModelType.IFT,
333
- "togethercomputer/GPT-JT-6B-v0": ModelType.IFT,
334
- "togethercomputer/RedPajama-INCITE-Chat-3B-v1": ModelType.IFT,
335
- "togethercomputer/RedPajama-INCITE-7B-Chat": ModelType.IFT,
336
- "togethercomputer/RedPajama-INCITE-Instruct-3B-v1": ModelType.IFT,
337
- "Writer/camel-5b-hf": ModelType.IFT,
338
- "Writer/palmyra-base": ModelType.PT,
339
- "MBZUAI/LaMini-GPT-1.5B": ModelType.IFT,
340
- "MBZUAI/lamini-cerebras-111m": ModelType.IFT,
341
- "MBZUAI/lamini-neo-1.3b": ModelType.IFT,
342
- "MBZUAI/lamini-cerebras-1.3b": ModelType.IFT,
343
- "MBZUAI/lamini-cerebras-256m": ModelType.IFT,
344
- "MBZUAI/LaMini-GPT-124M": ModelType.IFT,
345
- "MBZUAI/lamini-neo-125m": ModelType.IFT,
346
- "TehVenom/DiffMerge-DollyGPT-Pygmalion": ModelType.FT,
347
- "TehVenom/PPO_Shygmalion-6b": ModelType.FT,
348
- "TehVenom/Dolly_Shygmalion-6b-Dev_V8P2": ModelType.FT,
349
- "TehVenom/Pygmalion_AlpacaLora-7b": ModelType.FT,
350
- "TehVenom/PPO_Pygway-V8p4_Dev-6b": ModelType.FT,
351
- "TehVenom/Dolly_Malion-6b": ModelType.FT,
352
- "TehVenom/PPO_Shygmalion-V8p4_Dev-6b": ModelType.FT,
353
- "TehVenom/ChanMalion": ModelType.FT,
354
- "TehVenom/GPT-J-Pyg_PPO-6B": ModelType.IFT,
355
- "TehVenom/Pygmalion-13b-Merged": ModelType.FT,
356
- "TehVenom/Metharme-13b-Merged": ModelType.IFT,
357
- "TehVenom/Dolly_Shygmalion-6b": ModelType.FT,
358
- "TehVenom/GPT-J-Pyg_PPO-6B-Dev-V8p4": ModelType.IFT,
359
- "georgesung/llama2_7b_chat_uncensored": ModelType.FT,
360
- "vicgalle/gpt2-alpaca": ModelType.IFT,
361
- "vicgalle/alpaca-7b": ModelType.FT,
362
- "vicgalle/gpt2-alpaca-gpt4": ModelType.IFT,
363
- "facebook/opt-350m": ModelType.PT,
364
- "facebook/opt-125m": ModelType.PT,
365
- "facebook/xglm-4.5B": ModelType.PT,
366
- "facebook/opt-2.7b": ModelType.PT,
367
- "facebook/opt-6.7b": ModelType.PT,
368
- "facebook/galactica-30b": ModelType.PT,
369
- "facebook/opt-13b": ModelType.PT,
370
- "facebook/opt-66b": ModelType.PT,
371
- "facebook/xglm-7.5B": ModelType.PT,
372
- "facebook/xglm-564M": ModelType.PT,
373
- "facebook/opt-30b": ModelType.PT,
374
- "golaxy/gogpt-7b": ModelType.FT,
375
- "golaxy/gogpt2-7b": ModelType.FT,
376
- "golaxy/gogpt-7b-bloom": ModelType.FT,
377
- "golaxy/gogpt-3b-bloom": ModelType.FT,
378
- "psmathur/orca_mini_v2_7b": ModelType.IFT,
379
- "psmathur/orca_mini_7b": ModelType.IFT,
380
- "psmathur/orca_mini_3b": ModelType.IFT,
381
- "psmathur/orca_mini_v2_13b": ModelType.IFT,
382
- "gpt2-xl": ModelType.PT,
383
- "lxe/Cerebras-GPT-2.7B-Alpaca-SP": ModelType.FT,
384
- "Monero/Manticore-13b-Chat-Pyg-Guanaco": ModelType.FT,
385
- "Monero/WizardLM-Uncensored-SuperCOT-StoryTelling-30b": ModelType.IFT,
386
- "Monero/WizardLM-13b-OpenAssistant-Uncensored": ModelType.IFT,
387
- "Monero/WizardLM-30B-Uncensored-Guanaco-SuperCOT-30b": ModelType.IFT,
388
- "jzjiao/opt-1.3b-rlhf": ModelType.FT,
389
- "HuggingFaceH4/starchat-beta": ModelType.IFT,
390
- "KnutJaegersberg/gpt-2-xl-EvolInstruct": ModelType.IFT,
391
- "KnutJaegersberg/megatron-GPT-2-345m-EvolInstruct": ModelType.IFT,
392
- "KnutJaegersberg/galactica-orca-wizardlm-1.3b": ModelType.IFT,
393
- "openchat/openchat_8192": ModelType.IFT,
394
- "openchat/openchat_v2": ModelType.IFT,
395
- "openchat/openchat_v2_w": ModelType.IFT,
396
- "ausboss/llama-13b-supercot": ModelType.IFT,
397
- "ausboss/llama-30b-supercot": ModelType.IFT,
398
- "Neko-Institute-of-Science/metharme-7b": ModelType.IFT,
399
- "Neko-Institute-of-Science/pygmalion-7b": ModelType.FT,
400
- "SebastianSchramm/Cerebras-GPT-111M-instruction": ModelType.IFT,
401
- "victor123/WizardLM-13B-1.0": ModelType.IFT,
402
- "OpenBuddy/openbuddy-openllama-13b-v7-fp16": ModelType.FT,
403
- "OpenBuddy/openbuddy-llama2-13b-v8.1-fp16": ModelType.FT,
404
- "OpenBuddyEA/openbuddy-llama-30b-v7.1-bf16": ModelType.FT,
405
- "baichuan-inc/Baichuan-7B": ModelType.PT,
406
- "tiiuae/falcon-40b-instruct": ModelType.IFT,
407
- "tiiuae/falcon-40b": ModelType.PT,
408
- "tiiuae/falcon-7b": ModelType.PT,
409
- "YeungNLP/firefly-llama-13b": ModelType.FT,
410
- "YeungNLP/firefly-llama-13b-v1.2": ModelType.FT,
411
- "YeungNLP/firefly-llama2-13b": ModelType.FT,
412
- "YeungNLP/firefly-ziya-13b": ModelType.FT,
413
- "shaohang/Sparse0.5_OPT-1.3": ModelType.FT,
414
- "xzuyn/Alpacino-SuperCOT-13B": ModelType.IFT,
415
- "xzuyn/MedicWizard-7B": ModelType.FT,
416
- "xDAN-AI/xDAN_13b_l2_lora": ModelType.FT,
417
- "beomi/KoAlpaca-Polyglot-5.8B": ModelType.FT,
418
- "beomi/llama-2-ko-7b": ModelType.IFT,
419
- "Salesforce/codegen-6B-multi": ModelType.PT,
420
- "Salesforce/codegen-16B-nl": ModelType.PT,
421
- "Salesforce/codegen-6B-nl": ModelType.PT,
422
- "ai-forever/rugpt3large_based_on_gpt2": ModelType.FT,
423
- "gpt2-large": ModelType.PT,
424
- "frank098/orca_mini_3b_juniper": ModelType.FT,
425
- "frank098/WizardLM_13B_juniper": ModelType.FT,
426
- "FPHam/Free_Sydney_13b_HF": ModelType.FT,
427
- "huggingface/llama-13b": ModelType.PT,
428
- "huggingface/llama-7b": ModelType.PT,
429
- "huggingface/llama-65b": ModelType.PT,
430
- "huggingface/llama-30b": ModelType.PT,
431
- "Henk717/chronoboros-33B": ModelType.IFT,
432
- "jondurbin/airoboros-13b-gpt4-1.4": ModelType.IFT,
433
- "jondurbin/airoboros-7b": ModelType.IFT,
434
- "jondurbin/airoboros-7b-gpt4": ModelType.IFT,
435
- "jondurbin/airoboros-7b-gpt4-1.1": ModelType.IFT,
436
- "jondurbin/airoboros-7b-gpt4-1.2": ModelType.IFT,
437
- "jondurbin/airoboros-7b-gpt4-1.3": ModelType.IFT,
438
- "jondurbin/airoboros-7b-gpt4-1.4": ModelType.IFT,
439
- "jondurbin/airoboros-l2-7b-gpt4-1.4.1": ModelType.IFT,
440
- "jondurbin/airoboros-l2-13b-gpt4-1.4.1": ModelType.IFT,
441
- "jondurbin/airoboros-l2-70b-gpt4-1.4.1": ModelType.IFT,
442
- "jondurbin/airoboros-13b": ModelType.IFT,
443
- "jondurbin/airoboros-33b-gpt4-1.4": ModelType.IFT,
444
- "jondurbin/airoboros-33b-gpt4-1.2": ModelType.IFT,
445
- "jondurbin/airoboros-65b-gpt4-1.2": ModelType.IFT,
446
- "ariellee/SuperPlatty-30B": ModelType.IFT,
447
- "danielhanchen/open_llama_3b_600bt_preview": ModelType.FT,
448
- "cerebras/Cerebras-GPT-256M": ModelType.PT,
449
- "cerebras/Cerebras-GPT-1.3B": ModelType.PT,
450
- "cerebras/Cerebras-GPT-13B": ModelType.PT,
451
- "cerebras/Cerebras-GPT-2.7B": ModelType.PT,
452
- "cerebras/Cerebras-GPT-111M": ModelType.PT,
453
- "cerebras/Cerebras-GPT-6.7B": ModelType.PT,
454
- "Yhyu13/oasst-rlhf-2-llama-30b-7k-steps-hf": ModelType.RL,
455
- "Yhyu13/llama-30B-hf-openassitant": ModelType.FT,
456
- "NousResearch/Nous-Hermes-Llama2-13b": ModelType.IFT,
457
- "NousResearch/Nous-Hermes-llama-2-7b": ModelType.IFT,
458
- "NousResearch/Redmond-Puffin-13B": ModelType.IFT,
459
- "NousResearch/Nous-Hermes-13b": ModelType.IFT,
460
- "project-baize/baize-v2-7b": ModelType.IFT,
461
- "project-baize/baize-v2-13b": ModelType.IFT,
462
- "LLMs/WizardLM-13B-V1.0": ModelType.FT,
463
- "LLMs/AlpacaGPT4-7B-elina": ModelType.FT,
464
- "wenge-research/yayi-7b": ModelType.FT,
465
- "wenge-research/yayi-7b-llama2": ModelType.FT,
466
- "wenge-research/yayi-13b-llama2": ModelType.FT,
467
- "yhyhy3/open_llama_7b_v2_med_instruct": ModelType.IFT,
468
- "llama-anon/instruct-13b": ModelType.IFT,
469
- "huggingtweets/jerma985": ModelType.FT,
470
- "huggingtweets/gladosystem": ModelType.FT,
471
- "huggingtweets/bladeecity-jerma985": ModelType.FT,
472
- "huggyllama/llama-13b": ModelType.PT,
473
- "huggyllama/llama-65b": ModelType.PT,
474
- "FabbriSimo01/Facebook_opt_1.3b_Quantized": ModelType.PT,
475
- "upstage/Llama-2-70b-instruct": ModelType.IFT,
476
- "upstage/Llama-2-70b-instruct-1024": ModelType.IFT,
477
- "upstage/llama-65b-instruct": ModelType.IFT,
478
- "upstage/llama-30b-instruct-2048": ModelType.IFT,
479
- "upstage/llama-30b-instruct": ModelType.IFT,
480
- "WizardLM/WizardLM-13B-1.0": ModelType.IFT,
481
- "WizardLM/WizardLM-13B-V1.1": ModelType.IFT,
482
- "WizardLM/WizardLM-13B-V1.2": ModelType.IFT,
483
- "WizardLM/WizardLM-30B-V1.0": ModelType.IFT,
484
- "WizardLM/WizardCoder-15B-V1.0": ModelType.IFT,
485
- "gpt2": ModelType.PT,
486
- "keyfan/vicuna-chinese-replication-v1.1": ModelType.IFT,
487
- "nthngdy/pythia-owt2-70m-100k": ModelType.FT,
488
- "nthngdy/pythia-owt2-70m-50k": ModelType.FT,
489
- "quantumaikr/KoreanLM-hf": ModelType.FT,
490
- "quantumaikr/open_llama_7b_hf": ModelType.FT,
491
- "quantumaikr/QuantumLM-70B-hf": ModelType.IFT,
492
- "MayaPH/FinOPT-Lincoln": ModelType.FT,
493
- "MayaPH/FinOPT-Franklin": ModelType.FT,
494
- "MayaPH/GodziLLa-30B": ModelType.IFT,
495
- "MayaPH/GodziLLa-30B-plus": ModelType.IFT,
496
- "MayaPH/FinOPT-Washington": ModelType.FT,
497
- "ogimgio/gpt-neo-125m-neurallinguisticpioneers": ModelType.FT,
498
- "layoric/llama-2-13b-code-alpaca": ModelType.FT,
499
- "CobraMamba/mamba-gpt-3b": ModelType.FT,
500
- "CobraMamba/mamba-gpt-3b-v2": ModelType.FT,
501
- "CobraMamba/mamba-gpt-3b-v3": ModelType.FT,
502
- "timdettmers/guanaco-33b-merged": ModelType.FT,
503
- "elinas/chronos-33b": ModelType.IFT,
504
- "heegyu/RedTulu-Uncensored-3B-0719": ModelType.IFT,
505
- "heegyu/WizardVicuna-Uncensored-3B-0719": ModelType.IFT,
506
- "heegyu/WizardVicuna-3B-0719": ModelType.IFT,
507
- "meta-llama/Llama-2-7b-chat-hf": ModelType.RL,
508
- "meta-llama/Llama-2-7b-hf": ModelType.PT,
509
- "meta-llama/Llama-2-13b-chat-hf": ModelType.RL,
510
- "meta-llama/Llama-2-13b-hf": ModelType.PT,
511
- "meta-llama/Llama-2-70b-chat-hf": ModelType.RL,
512
- "meta-llama/Llama-2-70b-hf": ModelType.PT,
513
- "xhyi/PT_GPTNEO350_ATG": ModelType.FT,
514
- "h2oai/h2ogpt-gm-oasst1-en-1024-20b": ModelType.FT,
515
- "h2oai/h2ogpt-gm-oasst1-en-1024-open-llama-7b-preview-400bt": ModelType.FT,
516
- "h2oai/h2ogpt-oig-oasst1-512-6_9b": ModelType.IFT,
517
- "h2oai/h2ogpt-oasst1-512-12b": ModelType.IFT,
518
- "h2oai/h2ogpt-oig-oasst1-256-6_9b": ModelType.IFT,
519
- "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt": ModelType.FT,
520
- "h2oai/h2ogpt-oasst1-512-20b": ModelType.IFT,
521
- "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2": ModelType.FT,
522
- "h2oai/h2ogpt-gm-oasst1-en-1024-12b": ModelType.FT,
523
- "h2oai/h2ogpt-gm-oasst1-multilang-1024-20b": ModelType.FT,
524
- "bofenghuang/vigogne-13b-instruct": ModelType.IFT,
525
- "bofenghuang/vigogne-13b-chat": ModelType.FT,
526
- "bofenghuang/vigogne-2-7b-instruct": ModelType.IFT,
527
- "bofenghuang/vigogne-7b-instruct": ModelType.IFT,
528
- "bofenghuang/vigogne-7b-chat": ModelType.FT,
529
- "Vmware/open-llama-7b-v2-open-instruct": ModelType.IFT,
530
- "VMware/open-llama-0.7T-7B-open-instruct-v1.1": ModelType.IFT,
531
- "ewof/koishi-instruct-3b": ModelType.IFT,
532
- "gywy/llama2-13b-chinese-v1": ModelType.FT,
533
- "GOAT-AI/GOAT-7B-Community": ModelType.FT,
534
- "psyche/kollama2-7b": ModelType.FT,
535
- "TheTravellingEngineer/llama2-7b-hf-guanaco": ModelType.FT,
536
- "beaugogh/pythia-1.4b-deduped-sharegpt": ModelType.FT,
537
- "augtoma/qCammel-70-x": ModelType.IFT,
538
- "Lajonbot/Llama-2-7b-chat-hf-instruct-pl-lora_unload": ModelType.IFT,
539
- "anhnv125/pygmalion-6b-roleplay": ModelType.FT,
540
- "64bits/LexPodLM-13B": ModelType.FT,
541
- }
542
-
543
-
544
- def model_type_from_str(type):
545
- if "fine-tuned" in type or "🔶" in type:
546
- return ModelType.FT
547
- if "pretrained" in type or "🟢" in type:
548
- return ModelType.PT
549
- if "RL-tuned" in type or "🟦" in type:
550
- return ModelType.RL
551
- if "instruction-tuned" in type or "⭕" in type:
552
- return ModelType.IFT
553
- return ModelType.Unknown
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/display_models/read_results.py DELETED
@@ -1,152 +0,0 @@
1
- import json
2
- import os
3
- from dataclasses import dataclass
4
- from typing import Dict, List, Tuple
5
- from distutils.util import strtobool
6
-
7
- import dateutil
8
- import numpy as np
9
-
10
- from src.display_models.utils import AutoEvalColumn, make_clickable_model
11
-
12
- # 현우 - ko_commongen_v2 : acc_norm인지 체크 필요함
13
- METRICS = ["acc_norm", "acc_norm", "acc", "mc2", "acc_norm"]
14
- BENCHMARKS = ["ko_arc_challenge", "ko_hellaswag", "ko_mmlu", "ko_truthfulqa_mc", "ko_commongen_v2"] #, "ethicalverification"]
15
- BENCH_TO_NAME = {
16
- "ko_arc_challenge": AutoEvalColumn.arc.name,
17
- "ko_hellaswag": AutoEvalColumn.hellaswag.name,
18
- "ko_mmlu": AutoEvalColumn.mmlu.name,
19
- "ko_truthfulqa_mc": AutoEvalColumn.truthfulqa.name,
20
- "ko_commongen_v2": AutoEvalColumn.commongen_v2.name,
21
- # TODO: Uncomment when we have results for these
22
- # "ethicalverification": AutoEvalColumn.ethicalverification.name,
23
- }
24
- IS_PUBLIC = bool(strtobool(os.environ.get("IS_PUBLIC", "True")))
25
-
26
- @dataclass
27
- class EvalResult:
28
- eval_name: str
29
- org: str
30
- model: str
31
- revision: str
32
- results: dict
33
- precision: str = ""
34
- model_type: str = ""
35
- weight_type: str = ""
36
-
37
- def to_dict(self):
38
- from src.load_from_hub import is_model_on_hub
39
-
40
- if self.org is not None:
41
- base_model = f"{self.org}/{self.model}"
42
- else:
43
- base_model = f"{self.model}"
44
- data_dict = {}
45
-
46
- data_dict["eval_name"] = self.eval_name # not a column, just a save name
47
- data_dict["weight_type"] = self.weight_type # not a column, just a save name
48
- data_dict[AutoEvalColumn.precision.name] = self.precision
49
- data_dict[AutoEvalColumn.model_type.name] = self.model_type
50
- data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
51
- data_dict[AutoEvalColumn.dummy.name] = base_model
52
- data_dict[AutoEvalColumn.revision.name] = self.revision
53
- data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 5.0
54
- data_dict[AutoEvalColumn.still_on_hub.name] = (
55
- is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
56
- )
57
-
58
- for benchmark in BENCHMARKS:
59
- if benchmark not in self.results.keys():
60
- self.results[benchmark] = None
61
-
62
- for k, v in BENCH_TO_NAME.items():
63
- data_dict[v] = self.results[k]
64
-
65
- return data_dict
66
-
67
-
68
- def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
69
- with open(json_filepath) as fp:
70
- data = json.load(fp)
71
-
72
- try:
73
- config = data["config"]
74
- except KeyError:
75
- config = data["config_general"]
76
- model = config.get("model_name", None)
77
- if model is None:
78
- model = config.get("model_args", None)
79
-
80
- model_sha = config.get("model_sha", "")
81
- model_split = model.split("/", 1)
82
-
83
- precision = config.get("model_dtype")
84
-
85
- model = model_split[-1]
86
-
87
- if len(model_split) == 1:
88
- org = None
89
- model = model_split[0]
90
- result_key = f"{model}_{precision}"
91
- else:
92
- org = model_split[0]
93
- model = model_split[1]
94
- result_key = f"{org}_{model}_{precision}"
95
-
96
- eval_results = []
97
- for benchmark, metric in zip(BENCHMARKS, METRICS):
98
- accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
99
- if accs.size == 0 or any([acc is None for acc in accs]):
100
- continue
101
- mean_acc = np.mean(accs) * 100.0
102
- eval_results.append(
103
- EvalResult(
104
- eval_name=result_key,
105
- org=org,
106
- model=model,
107
- revision=model_sha,
108
- results={benchmark: mean_acc},
109
- precision=precision, # todo model_type=, weight_type=
110
- )
111
- )
112
-
113
- return result_key, eval_results
114
-
115
-
116
- def get_eval_results(results_path: str) -> List[EvalResult]:
117
- json_filepaths = []
118
-
119
- for root, dir, files in os.walk(results_path + ("-private" if not IS_PUBLIC else "")):
120
- # We should only have json files in model results
121
- if len(files) == 0 or any([not f.endswith(".json") for f in files]):
122
- continue
123
-
124
- # Sort the files by date
125
- # store results by precision maybe?
126
- try:
127
- files.sort(key=lambda x: dateutil.parser.parse(x.split("_", 1)[-1][:-5]))
128
- except dateutil.parser._parser.ParserError:
129
- files = [files[-1]]
130
-
131
- # up_to_date = files[-1]
132
- for file in files:
133
- json_filepaths.append(os.path.join(root, file))
134
-
135
- eval_results = {}
136
- for json_filepath in json_filepaths:
137
- result_key, results = parse_eval_result(json_filepath)
138
- for eval_result in results:
139
- if result_key in eval_results.keys():
140
- eval_results[result_key].results.update(eval_result.results)
141
- else:
142
- eval_results[result_key] = eval_result
143
-
144
- eval_results = [v for v in eval_results.values()]
145
-
146
- return eval_results
147
-
148
-
149
- def get_eval_results_dicts(results_path: str) -> List[Dict]:
150
- eval_results = get_eval_results(results_path)
151
-
152
- return [e.to_dict() for e in eval_results]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/display_models/utils.py DELETED
@@ -1,149 +0,0 @@
1
- import os
2
- from dataclasses import dataclass
3
-
4
- from huggingface_hub import HfApi
5
-
6
- API = HfApi()
7
-
8
-
9
- # These classes are for user facing column names, to avoid having to change them
10
- # all around the code when a modification is needed
11
- @dataclass
12
- class ColumnContent:
13
- name: str
14
- type: str
15
- displayed_by_default: bool
16
- hidden: bool = False
17
-
18
-
19
- def fields(raw_class):
20
- return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
21
-
22
-
23
- @dataclass(frozen=True)
24
- class AutoEvalColumn: # Auto evals column
25
- model_type_symbol = ColumnContent("T", "str", True)
26
- model = ColumnContent("Model", "markdown", True)
27
- average = ColumnContent("Average ⬆️", "number", True)
28
- arc = ColumnContent("Ko-ARC", "number", True)
29
- hellaswag = ColumnContent("Ko-HellaSwag", "number", True)
30
- mmlu = ColumnContent("Ko-MMLU", "number", True)
31
- truthfulqa = ColumnContent("Ko-TruthfulQA", "number", True)
32
- commongen_v2 = ColumnContent("Ko-CommonGen V2", "number", True)
33
- # TODO: Uncomment when we have results for these
34
- # ethicalverification = ColumnContent("EthicalVerification", "number", True)
35
- model_type = ColumnContent("Type", "str", False)
36
- precision = ColumnContent("Precision", "str", False) # , True)
37
- license = ColumnContent("Hub License", "str", False)
38
- params = ColumnContent("#Params (B)", "number", False)
39
- likes = ColumnContent("Hub ❀️", "number", False)
40
- still_on_hub = ColumnContent("Available on the hub", "bool", False)
41
- revision = ColumnContent("Model sha", "str", False, False)
42
- dummy = ColumnContent(
43
- "model_name_for_query", "str", True
44
- ) # dummy col to implement search bar (hidden by custom CSS)
45
-
46
-
47
- @dataclass(frozen=True)
48
- class EloEvalColumn: # Elo evals column
49
- model = ColumnContent("Model", "markdown", True)
50
- gpt4 = ColumnContent("GPT-4 (all)", "number", True)
51
- human_all = ColumnContent("Human (all)", "number", True)
52
- human_instruct = ColumnContent("Human (instruct)", "number", True)
53
- human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)
54
-
55
-
56
- @dataclass(frozen=True)
57
- class EvalQueueColumn: # Queue column
58
- model = ColumnContent("model", "markdown", True)
59
- revision = ColumnContent("revision", "str", True)
60
- private = ColumnContent("private", "bool", True)
61
- precision = ColumnContent("precision", "str", True)
62
- weight_type = ColumnContent("weight_type", "str", "Original")
63
- status = ColumnContent("status", "str", True)
64
-
65
-
66
- LLAMAS = [
67
- "huggingface/llama-7b",
68
- "huggingface/llama-13b",
69
- "huggingface/llama-30b",
70
- "huggingface/llama-65b",
71
- ]
72
-
73
-
74
- KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
75
- VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
76
- OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
77
- DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
78
- MODEL_PAGE = "https://huggingface.co/models"
79
- LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
80
- VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
81
- ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
82
-
83
-
84
- def model_hyperlink(link, model_name):
85
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
86
-
87
-
88
- def make_clickable_model(model_name):
89
- link = f"https://huggingface.co/{model_name}"
90
-
91
- if model_name in LLAMAS:
92
- link = LLAMA_LINK
93
- model_name = model_name.split("/")[1]
94
- elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
95
- link = VICUNA_LINK
96
- model_name = "stable-vicuna-13b"
97
- elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
98
- link = ALPACA_LINK
99
- model_name = "alpaca-13b"
100
- if model_name == "dolly-12b":
101
- link = DOLLY_LINK
102
- elif model_name == "vicuna-13b":
103
- link = VICUNA_LINK
104
- elif model_name == "koala-13b":
105
- link = KOALA_LINK
106
- elif model_name == "oasst-12b":
107
- link = OASST_LINK
108
-
109
- details_model_name = model_name.replace("/", "__")
110
- # details_link = f"https://huggingface.co/datasets/open-ko-llm-leaderboard/details_{details_model_name}"
111
-
112
- # if not bool(os.getenv("DEBUG", "False")):
113
- # # We only add these checks when not debugging, as they are extremely slow
114
- # print(f"details_link: {details_link}")
115
- # try:
116
- # check_path = list(
117
- # API.list_files_info(
118
- # repo_id=f"open-ko-llm-leaderboard/details_{details_model_name}",
119
- # paths="README.md",
120
- # repo_type="dataset",
121
- # )
122
- # )
123
- # print(f"check_path: {check_path}")
124
- # except Exception as err:
125
- # # No details repo for this model
126
- # print(f"No details repo for this model: {err}")
127
- # return model_hyperlink(link, model_name)
128
-
129
- return model_hyperlink(link, model_name) # + " " + model_hyperlink(details_link, "πŸ“‘")
130
-
131
-
132
- def styled_error(error):
133
- return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
134
-
135
-
136
- def styled_warning(warn):
137
- return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
138
-
139
-
140
- def styled_message(message):
141
- return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
142
-
143
-
144
- def has_no_nan_values(df, columns):
145
- return df[columns].notna().all(axis=1)
146
-
147
-
148
- def has_nan_values(df, columns):
149
- return df[columns].isna().any(axis=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/envs.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from huggingface_hub import HfApi
4
+
5
+ # clone / pull the lmeh eval data
6
+ H4_TOKEN = os.environ.get("H4_TOKEN", None)
7
+
8
+ REPO_ID = "upstage/open-ko-llm-leaderboard"
9
+ QUEUE_REPO = "open-ko-llm-leaderboard/requests"
10
+ RESULTS_REPO = "open-ko-llm-leaderboard/results"
11
+
12
+ PRIVATE_QUEUE_REPO = "open-ko-llm-leaderboard/private-requests"
13
+ PRIVATE_RESULTS_REPO = "open-ko-llm-leaderboard/private-results"
14
+
15
+ IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
16
+
17
+ CACHE_PATH=os.getenv("HF_HOME", ".")
18
+
19
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
20
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
21
+
22
+ EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
23
+ EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
24
+
25
+ PATH_TO_COLLECTION = "open-ko-llm-leaderboard/ko-llm-leaderboard-best-models-659c7e45a481ceea4c883506"
26
+
27
+ # Rate limit variables
28
+ RATE_LIMIT_PERIOD = 7
29
+ RATE_LIMIT_QUOTA = 5
30
+ HAS_HIGHER_RATE_LIMIT = []
31
+
32
+ API = HfApi(token=H4_TOKEN)
src/leaderboard/filter_models.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.display.formatting import model_hyperlink
2
+ from src.display.utils import AutoEvalColumn
3
+
4
+ # Models which have been flagged by users as being problematic for one reason or another
5
+ # (Model name to forum discussion link)
6
+ FLAGGED_MODELS = {
7
+ "merged": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
8
+ }
9
+
10
+ # Models which have been requested by orgs to not be submitted on the leaderboard
11
+ DO_NOT_SUBMIT_MODELS = [
12
+ ]
13
+
14
+
15
+ def flag_models(leaderboard_data: list[dict]):
16
+ for model_data in leaderboard_data:
17
+ # Merges are flagged automatically
18
+ if model_data[AutoEvalColumn.flagged.name] == True:
19
+ flag_key = "merged"
20
+ else:
21
+ flag_key = model_data["model_name_for_query"]
22
+
23
+ if flag_key in FLAGGED_MODELS:
24
+ issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
25
+ issue_link = model_hyperlink(
26
+ FLAGGED_MODELS[flag_key],
27
+ f"See discussion #{issue_num}",
28
+ )
29
+ model_data[
30
+ AutoEvalColumn.model.name
31
+ ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
32
+ model_data[AutoEvalColumn.flagged.name] = True
33
+ else:
34
+ model_data[AutoEvalColumn.flagged.name] = False
35
+
36
+
37
+ def remove_forbidden_models(leaderboard_data: list[dict]):
38
+ indices_to_remove = []
39
+ for ix, model in enumerate(leaderboard_data):
40
+ if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
41
+ indices_to_remove.append(ix)
42
+
43
+ for ix in reversed(indices_to_remove):
44
+ leaderboard_data.pop(ix)
45
+ return leaderboard_data
46
+
47
+
48
+ def filter_models(leaderboard_data: list[dict]):
49
+ leaderboard_data = remove_forbidden_models(leaderboard_data)
50
+ flag_models(leaderboard_data)
src/leaderboard/read_evals.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import json
3
+ import math
4
+ import os
5
+ from dataclasses import dataclass
6
+
7
+ import dateutil
8
+ import numpy as np
9
+
10
+ from huggingface_hub import ModelCard
11
+
12
+ from src.display.formatting import make_clickable_model
13
+ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
14
+ from src.submission.check_validity import is_model_on_hub, check_model_card
15
+
16
+
17
+ @dataclass
18
+ class EvalResult:
19
+ # Also see src.display.utils.AutoEvalColumn for what will be displayed.
20
+ eval_name: str # org_model_precision (uid)
21
+ full_model: str # org/model (path on hub)
22
+ org: str
23
+ model: str
24
+ revision: str # commit hash, "" if main
25
+ results: dict
26
+ precision: Precision = Precision.Unknown
27
+ model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
28
+ weight_type: WeightType = WeightType.Original # Original or Adapter
29
+ architecture: str = "Unknown" # From config file
30
+ license: str = "?"
31
+ likes: int = 0
32
+ num_params: int = 0
33
+ date: str = "" # submission date of request file
34
+ still_on_hub: bool = False
35
+ is_merge: bool = False
36
+ flagged: bool = False
37
+
38
+ @classmethod
39
+ def init_from_json_file(self, json_filepath):
40
+ """Inits the result from the specific model result file"""
41
+ with open(json_filepath) as fp:
42
+ data = json.load(fp)
43
+
44
+ # We manage the legacy config format
45
+ config = data.get("config", data.get("config_general", None))
46
+
47
+ # Precision
48
+ precision = Precision.from_str(config.get("model_dtype"))
49
+
50
+ # Get model and org
51
+ org_and_model = config.get("model_name", config.get("model_args", None))
52
+ org_and_model = org_and_model.split("/", 1)
53
+
54
+ if len(org_and_model) == 1:
55
+ org = None
56
+ model = org_and_model[0]
57
+ result_key = f"{model}_{precision.value.name}"
58
+ else:
59
+ org = org_and_model[0]
60
+ model = org_and_model[1]
61
+ result_key = f"{org}_{model}_{precision.value.name}"
62
+ full_model = "/".join(org_and_model)
63
+
64
+ still_on_hub, error, model_config = is_model_on_hub(
65
+ full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
66
+ )
67
+ architecture = "?"
68
+ if model_config is not None:
69
+ architectures = getattr(model_config, "architectures", None)
70
+ if architectures:
71
+ architecture = ";".join(architectures)
72
+
73
+ # If the model doesn't have a model card or a license, we consider it deleted
74
+ if still_on_hub:
75
+ try:
76
+ if check_model_card(full_model)[0] is False:
77
+ still_on_hub = False
78
+ except Exception:
79
+ still_on_hub = False
80
+
81
+ # Check if the model is a merge
82
+ is_merge_from_metadata = False
83
+ flagged = False
84
+ if still_on_hub:
85
+ model_card = ModelCard.load(full_model)
86
+
87
+ if model_card.data.tags:
88
+ is_merge_from_metadata = "merge" in model_card.data.tags
89
+ merge_keywords = ["mergekit", "merged model", "merge model", "merging", "merge", "merged", "Carbon"]
90
+ # If the model is a merge but not saying it in the metadata, we flag it
91
+ is_merge_from_model_card = any(keyword in model_card.text.lower() for keyword in merge_keywords)
92
+ flagged = is_merge_from_model_card and not is_merge_from_metadata
93
+
94
+
95
+ # Extract results available in this file (some results are split in several files)
96
+ results = {}
97
+ for task in Tasks:
98
+ task = task.value
99
+
100
+ # Some truthfulQA values are NaNs
101
+ if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
102
+ if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
103
+ results[task.benchmark] = 0.0
104
+ continue
105
+
106
+ # We average all scores of a given metric (mostly for mmlu)
107
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
108
+ if accs.size == 0 or any([acc is None for acc in accs]):
109
+ continue
110
+
111
+ mean_acc = np.mean(accs) * 100.0
112
+ results[task.benchmark] = mean_acc
113
+
114
+ return self(
115
+ eval_name=result_key,
116
+ full_model=full_model,
117
+ org=org,
118
+ model=model,
119
+ results=results,
120
+ precision=precision,
121
+ revision= config.get("model_sha", ""),
122
+ still_on_hub=still_on_hub,
123
+ architecture=architecture,
124
+ is_merge=is_merge_from_metadata,
125
+ flagged=flagged,
126
+ )
127
+
128
+ def update_with_request_file(self, requests_path):
129
+ """Finds the relevant request file for the current model and updates info with it"""
130
+ request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
131
+
132
+ try:
133
+ with open(request_file, "r") as f:
134
+ request = json.load(f)
135
+ self.model_type = ModelType.from_str(request.get("model_type", ""))
136
+ self.weight_type = WeightType[request.get("weight_type", "Original")]
137
+ self.license = request.get("license", "?")
138
+ self.likes = request.get("likes", 0)
139
+ self.num_params = request.get("params", 0)
140
+ self.date = request.get("submitted_time", "")
141
+ except Exception:
142
+ print(f"Could not find request file for {self.org}/{self.model}")
143
+
144
+ def to_dict(self):
145
+ """Converts the Eval Result to a dict compatible with our dataframe display"""
146
+ average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
147
+ data_dict = {
148
+ "eval_name": self.eval_name, # not a column, just a save name,
149
+ AutoEvalColumn.precision.name: self.precision.value.name,
150
+ AutoEvalColumn.model_type.name: self.model_type.value.name,
151
+ AutoEvalColumn.merged.name: self.is_merge,
152
+ AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol, # + "πŸ₯¦" if self.is_merge,
153
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
154
+ AutoEvalColumn.architecture.name: self.architecture,
155
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
156
+ AutoEvalColumn.dummy.name: self.full_model,
157
+ AutoEvalColumn.revision.name: self.revision,
158
+ AutoEvalColumn.average.name: average,
159
+ AutoEvalColumn.license.name: self.license,
160
+ AutoEvalColumn.likes.name: self.likes,
161
+ AutoEvalColumn.params.name: self.num_params,
162
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
163
+ AutoEvalColumn.flagged.name: self.flagged
164
+
165
+ }
166
+
167
+ for task in Tasks:
168
+ data_dict[task.value.col_name] = self.results[task.value.benchmark]
169
+
170
+ return data_dict
171
+
172
+
173
+ def get_request_file_for_model(requests_path, model_name, precision):
174
+ """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
175
+ request_files = os.path.join(
176
+ requests_path,
177
+ f"{model_name}_eval_request_*.json",
178
+ )
179
+ request_files = glob.glob(request_files)
180
+
181
+ # Select correct request file (precision)
182
+ request_file = ""
183
+ request_files = sorted(request_files, reverse=True)
184
+ for tmp_request_file in request_files:
185
+ with open(tmp_request_file, "r") as f:
186
+ req_content = json.load(f)
187
+ if (
188
+ req_content["status"] in ["FINISHED"]
189
+ and req_content["precision"] == precision.split(".")[-1]
190
+ ):
191
+ request_file = tmp_request_file
192
+ return request_file
193
+
194
+
195
+ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
196
+ """From the path of the results folder root, extract all needed info for results"""
197
+ model_result_filepaths = []
198
+
199
+ for root, _, files in os.walk(results_path):
200
+ # We should only have json files in model results
201
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
202
+ continue
203
+
204
+ # Sort the files by date
205
+ try:
206
+ files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
207
+ except dateutil.parser._parser.ParserError:
208
+ files = [files[-1]]
209
+
210
+ for file in files:
211
+ model_result_filepaths.append(os.path.join(root, file))
212
+
213
+ eval_results = {}
214
+ for model_result_filepath in model_result_filepaths:
215
+ # Creation of result
216
+ eval_result = EvalResult.init_from_json_file(model_result_filepath)
217
+ eval_result.update_with_request_file(requests_path)
218
+
219
+ # Store results of same eval together
220
+ eval_name = eval_result.eval_name
221
+ if eval_name in eval_results.keys():
222
+ eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
223
+ else:
224
+ eval_results[eval_name] = eval_result
225
+
226
+ results = []
227
+ for v in eval_results.values():
228
+ try:
229
+ v.to_dict() # we test if the dict version is complete
230
+ results.append(v)
231
+ except KeyError: # not all eval values present
232
+ continue
233
+
234
+ return results
src/{load_from_hub.py β†’ populate.py} RENAMED
@@ -1,56 +1,30 @@
1
  import json
2
  import os
3
- from collections import defaultdict
4
 
5
  import pandas as pd
6
- from transformers import AutoConfig
7
 
8
- from src.assets.hardcoded_evals import baseline
9
- from src.display_models.get_model_metadata import apply_metadata
10
- from src.display_models.read_results import get_eval_results_dicts, make_clickable_model
11
- from src.display_models.utils import AutoEvalColumn, EvalQueueColumn, has_no_nan_values
12
 
13
 
14
- def get_all_requested_models(requested_models_dir: str) -> set[str]:
15
- depth = 1
16
- file_names = []
17
- users_to_submission_dates = defaultdict(list)
 
18
 
19
- for root, _, files in os.walk(requested_models_dir):
20
- current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
21
- if current_depth == depth:
22
- for file in files:
23
- if not file.endswith(".json"): continue
24
- with open(os.path.join(root, file), "r") as f:
25
- info = json.load(f)
26
- file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
27
-
28
- # Select organisation
29
- if info["model"].count("/") == 0 or "submitted_time" not in info:
30
- continue
31
- organisation, _ = info["model"].split("/")
32
- users_to_submission_dates[organisation].append(info["submitted_time"])
33
-
34
- return set(file_names), users_to_submission_dates
35
-
36
-
37
- def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
38
- all_data = get_eval_results_dicts(results_path)
39
-
40
- # all_data.append(baseline)
41
- apply_metadata(all_data) # Populate model type based on known hardcoded values in `metadata.py`
42
-
43
- df = pd.DataFrame.from_records(all_data)
44
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
45
  df = df[cols].round(decimals=2)
46
 
47
  # filter out if any of the benchmarks have not been produced
48
  df = df[has_no_nan_values(df, benchmark_cols)]
49
- return df
50
 
51
 
52
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
53
-
54
  entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
55
  all_evals = []
56
 
@@ -85,19 +59,3 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
85
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
86
  df_failed = pd.DataFrame.from_records(failed_list, columns=cols)
87
  return df_finished[cols], df_running[cols], df_pending[cols], df_failed[cols]
88
-
89
-
90
-
91
- def is_model_on_hub(model_name: str, revision: str) -> bool:
92
- try:
93
- AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
94
- return True, None
95
-
96
- except ValueError:
97
- return (
98
- False,
99
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
100
- )
101
-
102
- except Exception:
103
- return False, "was not found on hub!"
 
1
  import json
2
  import os
 
3
 
4
  import pandas as pd
 
5
 
6
+ from src.display.formatting import has_no_nan_values, make_clickable_model
7
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
+ from src.leaderboard.filter_models import filter_models
9
+ from src.leaderboard.read_evals import get_raw_eval_results
10
 
11
 
12
+ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
13
+ raw_data = get_raw_eval_results(results_path, requests_path)
14
+ all_data_json = [v.to_dict() for v in raw_data]
15
+ # all_data_json.append(baseline_row)
16
+ filter_models(all_data_json)
17
 
18
+ df = pd.DataFrame.from_records(all_data_json)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
20
  df = df[cols].round(decimals=2)
21
 
22
  # filter out if any of the benchmarks have not been produced
23
  df = df[has_no_nan_values(df, benchmark_cols)]
24
+ return raw_data, df
25
 
26
 
27
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
 
28
  entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
29
  all_evals = []
30
 
 
59
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
60
  df_failed = pd.DataFrame.from_records(failed_list, columns=cols)
61
  return df_finished[cols], df_running[cols], df_pending[cols], df_failed[cols]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/rate_limiting.py DELETED
@@ -1,16 +0,0 @@
1
-
2
- from datetime import datetime, timezone, timedelta
3
-
4
-
5
- def user_submission_permission(submission_name, users_to_submission_dates, rate_limit_period):
6
- org_or_user, _ = submission_name.split("/")
7
- if org_or_user not in users_to_submission_dates:
8
- return 0
9
- submission_dates = sorted(users_to_submission_dates[org_or_user])
10
-
11
- time_limit = (datetime.now(timezone.utc) - timedelta(days=rate_limit_period)).strftime("%Y-%m-%dT%H:%M:%SZ")
12
- submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
13
-
14
- return len(submissions_after_timelimit)
15
-
16
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/submission/check_validity.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from collections import defaultdict
5
+ from datetime import datetime, timedelta, timezone
6
+
7
+ import huggingface_hub
8
+ from huggingface_hub import ModelCard
9
+ from huggingface_hub.hf_api import ModelInfo
10
+ from transformers import AutoConfig, AutoTokenizer
11
+
12
+ from src.envs import HAS_HIGHER_RATE_LIMIT
13
+
14
+
15
+ # ht to @Wauplin, thank you for the snippet!
16
+ # See https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/317
17
+ def check_model_card(repo_id: str) -> tuple[bool, str]:
18
+ # Returns operation status, and error message
19
+ try:
20
+ card = ModelCard.load(repo_id)
21
+ except huggingface_hub.utils.EntryNotFoundError:
22
+ return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
23
+
24
+ # Enforce license metadata
25
+ if card.data.license is None:
26
+ if not ("license_name" in card.data and "license_link" in card.data):
27
+ return False, (
28
+ "License not found. Please add a license to your model card using the `license` metadata or a"
29
+ " `license_name`/`license_link` pair."
30
+ )
31
+
32
+ # Enforce card content
33
+ if len(card.text) < 200:
34
+ return False, "Please add a description to your model card, it is too short."
35
+
36
+ return True, ""
37
+
38
+
39
+ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
40
+ try:
41
+ config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token) #, force_download=True)
42
+ if test_tokenizer:
43
+ try:
44
+ tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
45
+ except ValueError as e:
46
+ return (
47
+ False,
48
+ f"uses a tokenizer which is not in a transformers release: {e}",
49
+ None
50
+ )
51
+ except Exception as e:
52
+ return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
53
+ return True, None, config
54
+
55
+ except ValueError:
56
+ return (
57
+ False,
58
+ "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
59
+ None
60
+ )
61
+
62
+ except Exception as e:
63
+ return False, "was not found on hub!", None
64
+
65
+
66
+ def get_model_size(model_info: ModelInfo, precision: str):
67
+ size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
68
+ try:
69
+ model_size = round(model_info.safetensors["total"] / 1e9, 3)
70
+ except (AttributeError, TypeError ):
71
+ try:
72
+ size_match = re.search(size_pattern, model_info.modelId.split("/")[-1].lower())
73
+ model_size = size_match.group(0)
74
+ model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
75
+ except AttributeError:
76
+ return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
77
+
78
+ size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.split("/")[-1].lower()) else 1
79
+ model_size = size_factor * model_size
80
+ return model_size
81
+
82
+ def get_model_arch(model_info: ModelInfo):
83
+ return model_info.config.get("architectures", "Unknown")
84
+
85
+ def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
86
+ if org_or_user not in users_to_submission_dates:
87
+ return True, ""
88
+ submission_dates = sorted(users_to_submission_dates[org_or_user])
89
+
90
+ time_limit = (datetime.now(timezone.utc) - timedelta(days=rate_limit_period)).strftime("%Y-%m-%dT%H:%M:%SZ")
91
+ submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
92
+
93
+ num_models_submitted_in_period = len(submissions_after_timelimit)
94
+ if org_or_user in HAS_HIGHER_RATE_LIMIT:
95
+ rate_limit_quota = 2 * rate_limit_quota
96
+
97
+ if num_models_submitted_in_period > rate_limit_quota:
98
+ error_msg = f"Organisation or user `{org_or_user}`"
99
+ error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
100
+ error_msg += f"in the last {rate_limit_period} days.\n"
101
+ error_msg += (
102
+ "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard πŸ€—"
103
+ )
104
+ return False, error_msg
105
+ return True, ""
106
+
107
+
108
+ def already_submitted_models(requested_models_dir: str) -> set[str]:
109
+ depth = 1
110
+ file_names = []
111
+ users_to_submission_dates = defaultdict(list)
112
+
113
+ for root, _, files in os.walk(requested_models_dir):
114
+ current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
115
+ if current_depth == depth:
116
+ for file in files:
117
+ if not file.endswith(".json"):
118
+ continue
119
+ with open(os.path.join(root, file), "r") as f:
120
+ info = json.load(f)
121
+ file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
122
+
123
+ # Select organisation
124
+ if info["model"].count("/") == 0 or "submitted_time" not in info:
125
+ continue
126
+ organisation, _ = info["model"].split("/")
127
+ users_to_submission_dates[organisation].append(info["submitted_time"])
128
+
129
+ return set(file_names), users_to_submission_dates
src/submission/submit.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from datetime import datetime, timezone
4
+
5
+ from src.display.formatting import styled_error, styled_message, styled_warning
6
+ from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
7
+ from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
8
+ from src.submission.check_validity import (
9
+ already_submitted_models,
10
+ check_model_card,
11
+ get_model_size,
12
+ is_model_on_hub,
13
+ user_submission_permission,
14
+ )
15
+
16
+ REQUESTED_MODELS = None
17
+ USERS_TO_SUBMISSION_DATES = None
18
+
19
def add_new_eval(
    model: str,
    base_model: str,
    revision: str,
    precision: str,
    private: bool,
    weight_type: str,
    model_type: str,
):
    """Validate a submission and, if every check passes, queue it for evaluation.

    The checks run in this order: model type selected, owner not blocked,
    owner within the rate limit, model not on the do-not-submit list, model
    (and base model for Delta/Adapter weights) loadable from the Hub, model
    info retrievable, license present, model card valid, not a duplicate.
    On success a JSON request file is written locally, uploaded to
    ``QUEUE_REPO`` and removed again.

    Args:
        model: Hub id of the model, usually ``"owner/name"``.
        base_model: Hub id of the base model; checked for Delta/Adapter weights.
        revision: Revision to evaluate; an empty string means ``"main"``.
        precision: Precision label; only the part before the first space is kept.
        private: Stored in the request and used in the request file name.
        weight_type: ``"Delta"`` and ``"Adapter"`` trigger the base-model check;
            ``"Adapter"`` skips the check of the model itself.
        model_type: Model type chosen by the submitter; must not be empty.

    Returns:
        Whatever ``styled_error``, ``styled_warning`` or ``styled_message``
        returns for the outcome.
    """
    global REQUESTED_MODELS
    global USERS_TO_SUBMISSION_DATES
    # An empty cache is rescanned as well, so an initially empty queue
    # directory does not stay cached forever.
    if not REQUESTED_MODELS:
        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)

    user_name = ""
    model_path = model
    if "/" in model:
        user_name = model.split("/")[0]
        model_path = model.split("/")[1]

    precision = precision.split(" ")[0]
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    if model_type is None or model_type == "":
        return styled_error("Please select a model type.")

    # Upstage models are not allowed to be submitted to ensure the transparency and fairness of the leaderboard.
    if user_name == "upstage":
        return styled_warning("We do not conduct evaluations on Upstage models to ensure the transparency and fairness of the leaderboard. Please take this into consideration.")

    # Is the user rate limited?
    if user_name != "":
        user_can_submit, error_msg = user_submission_permission(
            user_name, USERS_TO_SUBMISSION_DATES, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
        )
        if not user_can_submit:
            return styled_error(error_msg)

    # Did the model authors forbid its submission to the leaderboard?
    if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS:
        return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")

    # Does the model actually exist?
    if revision == "":
        revision = "main"

    # Is the model on the hub?
    if weight_type in ["Delta", "Adapter"]:
        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True)
        if not base_model_on_hub:
            return styled_error(f'Base model "{base_model}" {error}')

    if weight_type != "Adapter":
        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
        if not model_on_hub:
            return styled_error(f'Model "{model}" {error}')

    # Is the model info correctly filled?
    try:
        model_info = API.model_info(repo_id=model, revision=revision)
    except Exception:
        return styled_error("Could not get your model information. Please fill it up properly.")

    model_size = get_model_size(model_info=model_info, precision=precision)

    # Were the model card and license filled?
    try:
        model_license = model_info.cardData["license"]
    except Exception:
        return styled_error("Please select a license for your model")

    modelcard_OK, error_msg = check_model_card(model)
    if not modelcard_OK:
        return styled_error(error_msg)

    # Seems good, creating the eval
    print("Adding new eval")

    eval_entry = {
        "model": model,
        "base_model": base_model,
        "revision": revision,
        "private": private,
        "precision": precision,
        "weight_type": weight_type,
        "status": "PENDING",
        "submitted_time": current_time,
        "model_type": model_type,
        "likes": model_info.likes,
        "params": model_size,
        "license": model_license,
    }

    # Check for duplicate submission
    request_key = f"{model}_{revision}_{precision}"
    if request_key in REQUESTED_MODELS:
        return styled_warning("This model has been already submitted.")

    print("Creating eval file")
    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"

    try:
        with open(out_path, "w") as f:
            f.write(json.dumps(eval_entry))

        print("Uploading eval file")
        API.upload_file(
            path_or_fileobj=out_path,
            path_in_repo=out_path.split("eval-queue/")[1],
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model} to eval queue",
        )
    finally:
        # Remove the local file even when the upload fails, so a failed
        # attempt leaves no stray request file behind.
        if os.path.exists(out_path):
            os.remove(out_path)

    # Keep the in-memory caches in step with the queue; otherwise a repeat
    # submission in the same process passes the duplicate check and is not
    # counted against the owner's rate limit.
    REQUESTED_MODELS.add(request_key)
    if user_name != "":
        USERS_TO_SUBMISSION_DATES.setdefault(user_name, []).append(current_time)

    return styled_message(
        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
    )
src/tools/collections.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import pandas as pd
4
+ from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
5
+ from huggingface_hub.utils._errors import HfHubHTTPError
6
+ from pandas import DataFrame
7
+
8
+ from src.display.utils import AutoEvalColumn, ModelType
9
+ from src.envs import H4_TOKEN, PATH_TO_COLLECTION
10
+
11
+ # Specific intervals for the collections
12
+ intervals = {
13
+ "0~3B": pd.Interval(0, 3, closed="right"),
14
+ "3~7B": pd.Interval(3, 7.3, closed="right"),
15
+ "7~13B": pd.Interval(7.3, 13, closed="right"),
16
+ "13~35B": pd.Interval(13, 35, closed="right"),
17
+ "35~60B": pd.Interval(35, 60, closed="right"),
18
+ "60B+": pd.Interval(60, 10000, closed="right"),
19
+ }
20
+
21
def update_collections(df: DataFrame):
    """This function updates the Open Ko LLM Leaderboard model collection with the latest best models for
    each size category and type.

    For every model type with a non-empty name and every size bucket in
    ``intervals``, candidates are tried in descending order of average score
    until one is accepted by the collection. Afterwards every collection
    item that was not selected in this run is deleted.

    Args:
        df: Leaderboard data; the params, model-type-symbol, average and
            dummy (model id) columns named by ``AutoEvalColumn`` are read.
    """
    collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")

    # Bucket membership depends only on the parameter count, so compute it
    # once per size instead of once per (type, size) pair.
    size_masks = {
        size: params_column.apply(lambda value, bucket=bucket: value in bucket)
        for size, bucket in intervals.items()
    }

    cur_best_models = []

    ix = 0
    for model_type in ModelType:
        if model_type.value.name == "":
            continue
        type_emoji = [t[0] for t in model_type.value.symbol]
        for size in intervals:
            # We filter the df to gather the relevant models
            filtered_df = df[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
            filtered_df = filtered_df.loc[size_masks[size]]

            best_models = list(
                filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.dummy.name]
            )
            print(model_type.value.symbol, size, best_models[:10])

            # We add them one by one to the leaderboard
            for model in best_models:
                ix += 1
                cur_len_collection = len(collection.items)
                try:
                    collection = add_collection_item(
                        PATH_TO_COLLECTION,
                        item_id=model,
                        item_type="model",
                        exists_ok=True,
                        note=f"Best {model_type.to_str(' ')} model of size {size} on the leaderboard today!",
                        token=H4_TOKEN,
                    )
                    if (
                        len(collection.items) > cur_len_collection
                    ):  # we added an item - we make sure its position is correct
                        item_object_id = collection.items[-1].item_object_id
                        # Pass the token like every other Hub call in this function.
                        update_collection_item(
                            collection_slug=PATH_TO_COLLECTION,
                            item_object_id=item_object_id,
                            position=ix,
                            token=H4_TOKEN,
                        )
                    cur_best_models.append(model)
                    break
                except HfHubHTTPError:
                    # Best effort: fall through to the next-best candidate.
                    continue

    collection = get_collection(PATH_TO_COLLECTION, token=H4_TOKEN)
    for item in collection.items:
        if item.item_id not in cur_best_models:
            try:
                delete_collection_item(
                    collection_slug=PATH_TO_COLLECTION, item_object_id=item.item_object_id, token=H4_TOKEN
                )
            except HfHubHTTPError:
                continue
src/tools/model_backlinks.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ models = [
2
+ "baseline",
3
+ ]
src/tools/plots.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import plotly.express as px
4
+ from plotly.graph_objs import Figure
5
+
6
+ from src.leaderboard.filter_models import FLAGGED_MODELS
7
+ from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS
8
+ from src.leaderboard.read_evals import EvalResult
9
+
10
+
11
+
12
def create_scores_df(raw_data: list[EvalResult]) -> dict[str, pd.DataFrame]:
    """
    Builds, for every metric, the history of record-setting scores over time.

    :param raw_data: Evaluation results; each one must provide ``date``, ``full_model`` and ``results``
        once loaded into a DataFrame.
    :return: A dict mapping each benchmark column (plus the average column) to a DataFrame with
        ``model``, ``date`` and ``score`` columns, one row per new maximum.
    """
    # Step 1: Load the results and sort them by 'date'.
    # NOTE(review): dates are sorted as stored (no datetime conversion) - confirm they sort chronologically.
    results_df = pd.DataFrame(raw_data)
    results_df.sort_values(by="date", inplace=True)

    # Step 2: Initialize the scores dictionary
    scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}

    # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
    tasks = [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]
    for task in tasks:
        current_max = 0
        last_date = ""
        column = task.col_name
        for _, row in results_df.iterrows():
            current_model = row["full_model"]
            if current_model in FLAGGED_MODELS:
                continue

            current_date = row["date"]
            if task.benchmark == "Average":
                current_score = np.mean(list(row["results"].values()))
            else:
                current_score = row["results"][task.benchmark]

            if current_score > current_max:
                entry = {"model": current_model, "date": current_date, "score": current_score}
                # Keep a single point per date: a better score on the same date replaces the previous one.
                if current_date == last_date and len(scores[column]) > 0:
                    scores[column][-1] = entry
                else:
                    scores[column].append(entry)
                current_max = current_score
                last_date = current_date

    # Step 4: Return all dictionaries as DataFrames
    return {k: pd.DataFrame(v) for k, v in scores.items()}
53
+
54
+
55
def create_plot_df(scores_df: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Flattens the per-metric score histories into one long DataFrame suitable for plotting.

    :param scores_df: A dict mapping each benchmark column (plus the average column) to its DataFrame
        of scores and dates.
    :return: A single DataFrame with an added ``task`` column naming the metric, sorted by ``date``.
    """
    # Tag a fresh copy of each per-metric frame with its metric name
    dfs = [
        scores_df[col].reset_index(drop=True).assign(task=col)
        for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]
    ]

    # Concatenate all the created DataFrames
    concat_df = pd.concat(dfs, ignore_index=True)

    # Sort values by 'date'
    concat_df.sort_values(by="date", inplace=True)
    concat_df.reset_index(drop=True, inplace=True)
    return concat_df
78
+
79
+
80
def create_metric_plot_obj(
    df: pd.DataFrame, metrics: list[str], title: str
) -> Figure:
    """
    Build a Plotly line chart of the selected metrics, overlaid with a dotted
    horizontal line for each metric that has a human baseline.

    :param df: Long-format scores with ``task``, ``date``, ``score`` and ``model`` columns.
    :param metrics: Names of the metrics to draw.
    :param title: Title of the figure.
    :return: The assembled Plotly figure.
    """
    # Restrict both the data and the baselines to the requested metrics
    selected = df[df["task"].isin(metrics)]
    baselines = {name: level for name, level in HUMAN_BASELINE.items() if name in metrics}

    # One line per task; custom data feeds the hover text below
    fig = px.line(
        selected,
        x="date",
        y="score",
        color="task",
        markers=True,
        custom_data=["task", "score", "model"],
        title=title,
    )

    hover_lines = [
        "Model Name: %{customdata[2]}",
        "Metric Name: %{customdata[0]}",
        "Date: %{x}",
        "Metric Value: %{y}",
    ]
    fig.update_traces(hovertemplate="<br>".join(hover_lines))

    # Scores are displayed on a fixed 0-100 axis
    fig.update_layout(yaxis_range=[0, 100])

    # Remember which colour each metric's trace received
    trace_colors = {trace.name: trace.line.color for trace in fig.data}

    for name, level in baselines.items():
        # Reuse the trace colour for the baseline; fall back to blue when the metric has no trace
        line_color = trace_colors.get(name, "blue")
        if name == "Ko-HellaSwag":
            label_position = "top left"
        else:
            label_position = "bottom left"
        fig.add_hline(
            y=level,
            line_dash="dot",
            annotation_text=f"{name} human baseline",
            annotation_position=label_position,
            annotation_font_size=10,
            annotation_font_color=line_color,
            line_color=line_color,
        )

    return fig
150
+
151
+
152
+ # Example Usage:
153
+ # Human baselines are read from the module-level HUMAN_BASELINE mapping; they are not passed as an argument.
154
+ # chart = create_metric_plot_obj(plot_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], "Graph Title")