Sean Cho committed on
Commit
4c0ff9d
·
1 Parent(s): 41ebc94

Korean to English

Browse files
Files changed (2) hide show
  1. app.py +8 -8
  2. src/assets/text_content.py +42 -41
app.py CHANGED
@@ -325,11 +325,11 @@ with demo:
325
  )
326
  with gr.Row():
327
  deleted_models_visibility = gr.Checkbox(
328
- value=True, label="👀 삭제/비공개된 모델도 함께 보기", interactive=True
329
  )
330
  with gr.Column(min_width=320):
331
  search_bar = gr.Textbox(
332
- placeholder="🔍 찾고자 하는 모델 명을 입력하세요",
333
  show_label=False,
334
  elem_id="search-bar",
335
  )
@@ -447,7 +447,7 @@ with demo:
447
 
448
  with gr.Column():
449
  with gr.Accordion(
450
- f"✅ 평가 완료 ({len(finished_eval_queue_df)})",
451
  open=False,
452
  ):
453
  with gr.Row():
@@ -458,7 +458,7 @@ with demo:
458
  max_rows=5,
459
  )
460
  with gr.Accordion(
461
- f"🔄 평가 진행 중 ({len(running_eval_queue_df)})",
462
  open=False,
463
  ):
464
  with gr.Row():
@@ -470,7 +470,7 @@ with demo:
470
  )
471
 
472
  with gr.Accordion(
473
- f"⏳ 평가 대기 중 ({len(pending_eval_queue_df)})",
474
  open=False,
475
  ):
476
  with gr.Row():
@@ -481,7 +481,7 @@ with demo:
481
  max_rows=5,
482
  )
483
  with gr.Row():
484
- gr.Markdown("# ✉️✨ 여기에서 모델을 제출해주세요!", elem_classes="markdown-text")
485
 
486
  with gr.Row():
487
  with gr.Column():
@@ -524,7 +524,7 @@ with demo:
524
  )
525
  base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
526
 
527
- submit_button = gr.Button("제출하고 평가받기")
528
  submission_result = gr.Markdown()
529
  submit_button.click(
530
  add_new_eval,
@@ -541,7 +541,7 @@ with demo:
541
  )
542
 
543
  with gr.Row():
544
- refresh_button = gr.Button("새로고침")
545
  refresh_button.click(
546
  refresh,
547
  inputs=[],
 
325
  )
326
  with gr.Row():
327
  deleted_models_visibility = gr.Checkbox(
328
+ value=True, label="👀 Show gated/private/deleted models", interactive=True
329
  )
330
  with gr.Column(min_width=320):
331
  search_bar = gr.Textbox(
332
+ placeholder="🔍 Search for your model and press ENTER...",
333
  show_label=False,
334
  elem_id="search-bar",
335
  )
 
447
 
448
  with gr.Column():
449
  with gr.Accordion(
450
+ f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
451
  open=False,
452
  ):
453
  with gr.Row():
 
458
  max_rows=5,
459
  )
460
  with gr.Accordion(
461
+ f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
462
  open=False,
463
  ):
464
  with gr.Row():
 
470
  )
471
 
472
  with gr.Accordion(
473
+ f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
474
  open=False,
475
  ):
476
  with gr.Row():
 
481
  max_rows=5,
482
  )
483
  with gr.Row():
484
+ gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
485
 
486
  with gr.Row():
487
  with gr.Column():
 
524
  )
525
  base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
526
 
527
+ submit_button = gr.Button("Submit Evaluation!")
528
  submission_result = gr.Markdown()
529
  submit_button.click(
530
  add_new_eval,
 
541
  )
542
 
543
  with gr.Row():
544
+ refresh_button = gr.Button("Refresh")
545
  refresh_button.click(
546
  refresh,
547
  inputs=[],
src/assets/text_content.py CHANGED
@@ -1,62 +1,63 @@
1
  from src.display_models.model_metadata_type import ModelType
2
 
3
- TITLE = """<h1 align="center" id="space-title">🚀 Open Ko-LLM Leaderboard</h1>"""
4
 
5
  INTRODUCTION_TEXT = f"""
6
- ๐Ÿš€ Open Ko-LLM Leaderboard๋Š” ํ•œ๊ตญ์–ด ์ดˆ๊ฑฐ๋Œ€ ์–ธ์–ด๋ชจ๋ธ์˜ ์„ฑ๋Šฅ์„ ๊ฐ๊ด€์ ์œผ๋กœ ํ‰๊ฐ€ํ•ฉ๋‹ˆ๋‹ค.
7
 
8
- "Submit here!" ํŽ˜์ด์ง€์—์„œ ๋ชจ๋ธ ์ œ์ถœ ์‹œ ์ž๋™์œผ๋กœ ํ‰๊ฐ€๋ฉ๋‹ˆ๋‹ค. ํ‰๊ฐ€์— ์‚ฌ์šฉ๋˜๋Š” GPU๋Š” KT์˜ ์ง€์›์œผ๋กœ ์šด์˜๋ฉ๋‹ˆ๋‹ค.
9
- ํ‰๊ฐ€์— ์‚ฌ์šฉ๋˜๋Š” ๋ฐ์ดํ„ฐ๋Š” ์ „๋ฌธ ์ง€์‹, ์ถ”๋ก  ๋Šฅ๋ ฅ, ํ™˜๊ฐ, ์ƒ์‹์˜ ๋„ค ๊ฐ€์ง€ ์š”์†Œ๋ฅผ ํ‰๊ฐ€ํ•˜๊ธฐ ์œ„ํ•œ ๋ฐ์ดํ„ฐ์…‹์œผ๋กœ ๊ตฌ์„ฑ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค.
10
- ๋ฒค์น˜๋งˆํฌ ๋ฐ์ดํ„ฐ์…‹์— ๋Œ€ํ•œ ๋” ์ž์„ธํ•œ ์ •๋ณด๋Š” "About" ํŽ˜์ด์ง€์—์„œ ์ œ๊ณต๋˜๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค.
 
11
 
12
- ์—…์Šคํ…Œ์ด์ง€์™€ NIA๊ฐ€ ๊ณต๋™ ์ฃผ์ตœํ•˜๋ฉฐ ์—…์Šคํ…Œ์ด์ง€๊ฐ€ ์šด์˜ํ•ฉ๋‹ˆ๋‹ค.
13
  """
14
 
15
  LLM_BENCHMARKS_TEXT = f"""
16
  # Context
17
- ๋›ฐ์–ด๋‚œ LLM ๋ชจ๋ธ๋“ค์ด ์•ž๋‹คํˆฌ์–ด ๊ณต๊ฐœ๋˜๊ณ  ์žˆ์ง€๋งŒ ์ด๋Š” ๋Œ€๋ถ€๋ถ„ ์˜์–ด ์ค‘์‹ฌ์˜, ์˜์–ด ๋ฌธํ™”๊ถŒ์— ์ต์ˆ™ํ•œ ๋ชจ๋ธ์ž…๋‹ˆ๋‹ค. ์ €ํฌ๋Š” ํ•œ๊ตญ์–ด ๋ฆฌ๋”๋ณด๋“œ ๐Ÿš€ Open Ko-LLM์„ ์šด์˜ํ•˜์—ฌ ํ•œ๊ตญ์–ด์™€ ํ•œ๊ตญ ๋ฌธํ™”์˜ ํŠน์„ฑ์„ ๋ฐ˜์˜ํ•œ ๋ชจ๋ธ์„ ํ‰๊ฐ€ํ•˜๊ณ ์ž ํ•ฉ๋‹ˆ๋‹ค. ์ด๋ฅผ ํ†ตํ•ด ํ•œ๊ตญ์–ด ์‚ฌ์šฉ์ž๋“ค์ด ํŽธ๋ฆฌํ•˜๊ฒŒ ๋ฆฌ๋”๋ณด๋“œ๋ฅผ ์ด์šฉํ•˜๊ณ  ์ฐธ์—ฌํ•˜์—ฌ ํ•œ๊ตญ์˜ ์—ฐ๊ตฌ ์ˆ˜์ค€ ํ–ฅ์ƒ์— ๊ธฐ์—ฌํ•  ์ˆ˜ ์žˆ๊ธฐ๋ฅผ ๋ฐ”๋ž๋‹ˆ๋‹ค.
18
 
19
  ## Icons
20
  {ModelType.PT.to_str(" : ")} model
21
  {ModelType.FT.to_str(" : ")} model
22
  {ModelType.IFT.to_str(" : ")} model
23
  {ModelType.RL.to_str(" : ")} model
24
- ๋งŒ์•ฝ ์•„์ด์ฝ˜์ด ์—†๋‹ค๋ฉด ์•„์ง ๋ชจ๋ธ์— ๋Œ€ํ•œ ์ •๋ณด๊ฐ€ ๋ถ€์กฑํ•จ์„ ๋‚˜ํƒ€๋ƒ…๋‹ˆ๋‹ค.
25
- ๋ชจ๋ธ์— ๋Œ€ํ•œ ์ •๋ณด๋Š” issue๋ฅผ ํ†ตํ•ด ์ „๋‹ฌํ•ด์ฃผ์„ธ์š”! ๐Ÿคฉ
26
 
27
- ๐Ÿดโ€โ˜ ๏ธ : ํ•ด๋‹น ์•„์ด์ฝ˜์€ ์ด ๋ชจ๋ธ์ด ์ปค๋ฎค๋‹ˆํ‹ฐ์— ์˜ํ•ด ์ฃผ์˜ ๋Œ€์ƒ์œผ๋กœ ์„ ์ •๋˜์—ˆ์œผ๋ฏ€๋กœ ์ด์šฉ ์ž์ œ๋ฅผ ๋ฐ”๋ž€๋‹ค๋Š” ์˜๋ฏธ์ž…๋‹ˆ๋‹ค. ์•„์ด์ฝ˜์„ ํด๋ฆญ ์‹œ ํ•ด๋‹น ๋ชจ๋ธ์— ๋Œ€ํ•œ discussion์œผ๋กœ ์ด๋™ํ•ฉ๋‹ˆ๋‹ค.
28
- (๋†’์€ ๋ฆฌ๋”๋ณด๋“œ ์ˆœ์œ„๋ฅผ ์œ„ํ•ด ํ‰๊ฐ€์…‹์„ ํ•™์Šต์— ์ด์šฉํ•œ ๋ชจ๋ธ ๋“ฑ์ด ์ฃผ์˜ ๋Œ€์ƒ์œผ๋กœ ์„ ์ •๋ฉ๋‹ˆ๋‹ค)
29
 
30
  ## How it works
31
 
32
- ๐Ÿ“ˆ HuggingFace OpenLLM์—์„œ ์šด์˜ํ•˜๋Š” 4๊ฐœ์˜ ํƒœ์Šคํฌ(HellaSwag, MMLU, Arc, Truthful QA)์˜ ๋ฐ์ดํ„ฐ๋ฅผ ํ•œ๊ตญ์–ด๋กœ ๋ฒˆ์—ญํ•œ ๋ฐ์ดํ„ฐ์…‹์œผ๋กœ ๋ฒค์น˜๋งˆํฌ๋ฅผ ๊ตฌ์„ฑํ–ˆ์Šต๋‹ˆ๋‹ค.
33
- - Ko-HellaSwag (์—…์Šคํ…Œ์ด์ง€ ์ œ๊ณต)
34
- - Ko-MMLU (์—…์Šคํ…Œ์ด์ง€ ์ œ๊ณต)
35
- - Ko-Arc (์—…์Šคํ…Œ์ด์ง€ ์ œ๊ณต)
36
- - Ko-Truthful QA (์—…์Šคํ…Œ์ด์ง€ ์ œ๊ณต)
37
- LLM ์‹œ๋Œ€์— ๊ฑธ๋งž๋Š” ํ‰๊ฐ€๋ฅผ ์œ„ํ•ด ์ „๋ฌธ ์ง€์‹, ์ถ”๋ก , ํ™˜๊ฐ, ์ƒ์‹์˜ ๋„ค ๊ฐ€์ง€ ์š”์†Œ๋ฅผ ํ‰๊ฐ€ํ•˜๊ธฐ์— ์ ํ•ฉํ•œ ๋ฐ์ดํ„ฐ์…‹๋“ค์„ ๋ฒค์น˜๋งˆํฌ๋กœ ์„ ์ •ํ–ˆ์Šต๋‹ˆ๋‹ค. ์ตœ์ข… ์ ์ˆ˜๋Š” 4๊ฐœ์˜ ํ‰๊ฐ€ ๋ฐ์ดํ„ฐ์— ๋Œ€ํ•œ ํ‰๊ท  ์ ์ˆ˜๋กœ ํ™˜์‚ฐํ•ฉ๋‹ˆ๋‹ค.
38
 
39
- KT๋กœ๋ถ€ํ„ฐ ํ‰๊ฐ€์— ์‚ฌ์šฉ๋˜๋Š” GPU๋ฅผ ์ œ๊ณต๋ฐ›์•˜์Šต๋‹ˆ๋‹ค.
40
 
41
- ## ์ข€ ๋” ์ž์„ธํ•œ ์ •๋ณด
42
- - ์ข€ ๋” ์ž์„ธํ•œ ์ˆ˜์น˜ ์ •๋ณด๋Š”: https://huggingface.co/datasets/open-ko-llm-leaderboard/results
43
- - ๋ชจ๋ธ์˜ ํ‰๊ฐ€ ํ์™€ ํ‰๊ฐ€ ์ƒํƒœ๋Š”: https://huggingface.co/datasets/open-ko-llm-leaderboard/requests
44
 
45
- ## ๊ฒฐ๊ณผ ์žฌํ˜„
46
- ํ‰๊ฐ€ ๊ฒฐ๊ณผ๋ฅผ ์žฌํ˜„ํ•˜๊ธฐ ์œ„ํ•ด์„œ๋Š” [์ด ๋ฒ„์ „](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463)์˜ ๋ฐ์ดํ„ฐ์…‹์„ ์ด์šฉํ•˜์„ธ์š”. (๋ฐ‘์—๋Š” ์ฝ”๋“œ ๋ฐ ํ‰๊ฐ€ ํ™˜๊ฒฝ์ด๋ผ์„œ ์ผ๋‹จ skip)
47
 
48
- ## ๋”๋ณด๊ธฐ
49
- ์งˆ๋ฌธ์ด ์žˆ์œผ์‹œ๋ฉด [์—ฌ๊ธฐ](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/1)์„œ FAQ๋ฅผ ํ™•์ธํ•˜์‹ค ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค!
50
- ๋˜ํ•œ ์ปค๋ฎค๋‹ˆํ‹ฐ, ๋‹ค๋ฅธ ํŒ€ ๋ฐ ์—ฐ๊ตฌ์†Œ์—์„œ ์ œ๊ณตํ•˜๋Š” ๋ฉ‹์ง„ ์ž๋ฃŒ๋ฅผ [์—ฌ๊ธฐ](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/2)์— ๋ชจ์•„ ๋‘์—ˆ์Šต๋‹ˆ๋‹ค. ์ถ”๊ฐ€๋กœ ์ œ์•ˆ ์‚ฌํ•ญ์ด ์žˆ์œผ์‹œ๋‹ค๋ฉด ์ด ๊ณณ์„ ์ด์šฉํ•ด์ฃผ์„ธ์š”.
51
  """
52
 
53
  EVALUATION_QUEUE_TEXT = f"""
54
- # ๐Ÿš€ Open-Ko LLM ๋ฆฌ๋”๋ณด๋“œ์˜ ํ‰๊ฐ€ ํ์ž…๋‹ˆ๋‹ค.
55
- ์ด๊ณณ์— ์ถ”๊ฐ€๋œ ๋ชจ๋ธ๋“ค์€ ๊ณง ์ž๋™์ ์œผ๋กœ KT์˜ GPU ์œ„์—์„œ ํ‰๊ฐ€๋  ์˜ˆ์ •์ž…๋‹ˆ๋‹ค!
56
 
57
- ## <๋ชจ๋ธ ์ œ์ถœ ์ „ ํ™•์ธํ•˜๋ฉด ์ข‹์€ ๊ฒƒ๋“ค>
58
 
59
- ### 1๏ธโƒฃ ๋ชจ๋ธ๊ณผ ํ† ํฌ๋‚˜์ด์ €๊ฐ€ AutoClasses๋กœ ๋ถˆ๋Ÿฌ์˜ฌ ์ˆ˜ ์žˆ๋‚˜์š”?
60
  ```
61
  from transformers import AutoConfig, AutoModel, AutoTokenizer
62
  config = AutoConfig.from_pretrained("your model name", revision=revision)
@@ -64,21 +65,21 @@ model = AutoModel.from_pretrained("your model name", revision=revision)
64
  tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
65
  ```
66
 
67
- ๋งŒ์•ฝ ์ด ๋‹จ๊ณ„๊ฐ€ ์‹คํŒจํ–ˆ๋‹ค๋ฉด ์—๋Ÿฌ ๋ฉ”์„ธ์ง€๋ฅผ ๋”ฐ๋ผ ๋ชจ๋ธ์„ ๋””๋ฒ„๊น…ํ•œ ํ›„์— ์ œ์ถœํ•ด์ฃผ์„ธ์š”.
68
- โš ๏ธ ๋ชจ๋ธ์ด public ์ƒํƒœ์—ฌ์•ผ ํ•ฉ๋‹ˆ๋‹ค!
69
- โš ๏ธ ๋งŒ์•ฝ ๋ชจ๋ธ์ด use_remote_code=True์—ฌ์•ผ ํ•œ๋‹ค๋ฉด ์ž ์‹œ ๊ธฐ๋‹ค๋ ค์ฃผ์„ธ์š”. ํ˜„์žฌ๋กœ์„œ๋Š” ์•„์ง ์ด ์˜ต์…˜์„ ์ง€์›ํ•˜์ง€ ์•Š์ง€๋งŒ ์ž‘๋™ํ•  ์ˆ˜ ์žˆ๋„๋ก ํ•˜๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค!
70
 
71
- ### 2๏ธโƒฃ ๋ชจ๋ธ์˜ weight๋ฅผ safetensors๋กœ ๋ฐ”๊ฟจ๋‚˜์š”?
72
- safetensors๋Š” weight๋ฅผ ๋ณด๊ด€ํ•˜๋Š” ์ƒˆ๋กœ์šด ํฌ๋งท์œผ๋กœ, ํ›จ์”ฌ ์•ˆ์ „ํ•˜๊ณ  ๋น ๋ฅด๊ฒŒ ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ๋˜ํ•œ ๋ชจ๋ธ์˜ parameter ๊ฐœ์ˆ˜๋ฅผ Extended Viewer์— ์ถ”๊ฐ€ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค
73
 
74
- ### 3๏ธโƒฃ ๋ชจ๋ธ์ด ์˜คํ”ˆ ๋ผ์ด์„ผ์Šค๋ฅผ ๋”ฐ๋ฅด๋‚˜์š”?
75
- ๐Ÿš€ Open-Ko LLM์€ Open LLM์„ ์œ„ํ•œ ๋ฆฌ๋”๋ณด๋“œ๋กœ, ๋งŽ์€ ์‚ฌ๋žŒ๋“ค์ด ๋‹ค์–‘ํ•œ ๋ชจ๋ธ์„ ์‚ฌ์šฉํ•˜๊ธฐ๋ฅผ ๋ฐ”๋ž๋‹ˆ๋‹ค
76
 
77
- ### 4๏ธโƒฃ ๋ชจ๋ธ ์นด๋“œ๋ฅผ ์ž‘์„ฑํ•˜์…จ๋‚˜์š”?
78
- ๋ฆฌ๋”๋ณด๋“œ์— ๋ชจ๋ธ์— ๋Œ€ํ•œ ์ถ”๊ฐ€ ์ •๋ณด๋ฅผ ์—…๋กœ๋“œํ•  ๋•Œ ์ž‘์„ฑํ•˜์‹  ๋ชจ๋ธ ์นด๋“œ๊ฐ€ ์—…๋กœ๋“œ๋ฉ๋‹ˆ๋‹ค
79
 
80
- ## ๋ชจ๋ธ์ด ์‹คํŒจํ•œ ๊ฒฝ์šฐ:
81
- ๋งŒ์•ฝ ์ œ์ถœํ•œ ๋ชจ๋ธ์˜ ์ƒํƒœ๊ฐ€ FAILED๊ฐ€ ๋œ๋‹ค๋ฉด ์ด๋Š” ๋ชจ๋ธ์ด ์‹คํ–‰ ์ค‘๋‹จ๋˜์—ˆ์Œ์„ ์˜๋ฏธํ•ฉ๋‹ˆ๋‹ค. ๋จผ์ € ์œ„์˜ ๋„ค ๋‹จ๊ณ„๋ฅผ ๋ชจ๋‘ ๋”ฐ๋ž๋Š”์ง€ ํ™•์ธํ•ด๋ณด์„ธ์š”. ๋ชจ๋“  ๋‹จ๊ณ„๋ฅผ ๋”ฐ๋ž์Œ์—๋„ ๋ถˆ๊ตฌํ•˜๊ณ  ์‹คํ–‰ ์ค‘๋‹จ๋˜์—ˆ์„ ๋•Œ๋Š” EleutherAIHarness ๋ฅผ ๋กœ์ปฌ์—์„œ ์‹คํ–‰ํ•  ์ˆ˜ ์žˆ๋Š”์ง€ ํ™•์ธํ•˜๊ธฐ ์œ„ํ•ด ์œ„์˜ ์ฝ”๋“œ๋ฅผ ์ˆ˜์ • ์—†์ด ์‹คํ–‰ํ•˜์„ธ์š”. (ํƒœ์Šคํฌ ๋ณ„ ์˜ˆ์‹œ์˜ ์ˆ˜๋ฅผ ์ œํ•œํ•˜๊ธฐ ์œ„ํ•ด โ€”limit ํŒŒ๋ผ๋ฏธํ„ฐ๋ฅผ ์ถ”๊ฐ€ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.)
82
  """
83
 
84
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 
1
  from src.display_models.model_metadata_type import ModelType
2
 
3
+ TITLE = """<h1 align="center" id="space-title">🚀 Open Ko-LLM Leaderboard 🇰🇷</h1>"""
4
 
5
  INTRODUCTION_TEXT = f"""
6
+ 🚀 The Open Ko-LLM Leaderboard 🇰🇷 objectively evaluates the performance of Korean large language models.
7
 
8
+ When you submit a model on the "Submit here!" page, it is automatically evaluated. The GPU used for evaluation is operated with the support of KT.
9
+ The data used for evaluation consists of datasets to assess expertise, inference ability, hallucination, and common sense.
10
+ The evaluation dataset is exclusively private and only available for evaluation process.
11
+ More detailed information about the benchmark dataset is provided on the “About” page.
12
 
13
+ This leaderboard is co-hosted by Upstage and NIA, and operated by Upstage.
14
  """
15
 
16
  LLM_BENCHMARKS_TEXT = f"""
17
  # Context
18
+ While outstanding LLM models are being released competitively, most of them are centered on English and are familiar with the English cultural sphere. We operate the Korean leaderboard, 🚀 Open Ko-LLM, to evaluate models that reflect the characteristics of the Korean language and Korean culture. Through this, we hope that users can conveniently use the leaderboard, participate, and contribute to the advancement of research in Korean.
19
 
20
  ## Icons
21
  {ModelType.PT.to_str(" : ")} model
22
  {ModelType.FT.to_str(" : ")} model
23
  {ModelType.IFT.to_str(" : ")} model
24
  {ModelType.RL.to_str(" : ")} model
25
+ If there is no icon, it indicates that there is insufficient information about the model.
26
+ Please provide information about the model through an issue! 🤩
27
 
28
+ ๐Ÿดโ€โ˜ ๏ธ : This icon indicates that the model has been selected as a subject of caution by the community, implying that users should exercise restraint when using it. Clicking on the icon will take you to a discussion about that model.
29
+ (Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, are selected as subjects of caution.)
30
 
31
  ## How it works
32
 
33
+ 📈 We have set up a benchmark using datasets translated into Korean from the four tasks (HellaSwag, MMLU, Arc, Truthful QA) operated by HuggingFace OpenLLM.
34
+ - Ko-HellaSwag (provided by Upstage)
35
+ - Ko-MMLU (provided by Upstage)
36
+ - Ko-Arc (provided by Upstage)
37
+ - Ko-Truthful QA (provided by Upstage)
38
+ To provide an evaluation befitting the LLM era, we've selected benchmark datasets suitable for assessing four elements: expertise, inference, hallucination, and common sense. The final score is converted to the average score from the four evaluation datasets.
39
 
40
+ GPUs are provided by KT for the evaluations.
41
 
42
+ ## Details and Logs
43
+ - Detailed numerical results in the `results` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/results
44
+ - community queries and running status in the `requests` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/requests
45
 
46
+ ## Reproducibility
47
+ To reproduce our results, use [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of dataset.
48
 
49
+ ## More resources
50
+ If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/1)!
51
+ We also gather cool resources from the community, other teams, and other labs [here](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/2)!
52
  """
53
 
54
  EVALUATION_QUEUE_TEXT = f"""
55
+ # Evaluation Queue for the 🚀 Open-Ko LLM Leaderboard
56
+ Models added here will be automatically evaluated on the KT GPU cluster.
57
 
58
+ ## <Some good practices before submitting a model>
59
 
60
+ ### 1๏ธโƒฃ Make sure you can load your model and tokenizer using AutoClasses
61
  ```
62
  from transformers import AutoConfig, AutoModel, AutoTokenizer
63
  config = AutoConfig.from_pretrained("your model name", revision=revision)
 
65
  tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
66
  ```
67
 
68
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
69
+ โš ๏ธ Make sure your model is public!
70
+ โš ๏ธ If your model needs use_remote_code=True, we do not support this option yet but we are working on adding it, stay posted!
71
 
72
+ ### 2๏ธโƒฃ Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
73
+ It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
74
 
75
+ ### 3๏ธโƒฃ Make sure your model has an open license!
76
+ This is a leaderboard for 🚀 Open-Ko LLMs, and we'd love for as many people as possible to know they can use your model
77
 
78
+ ### 4๏ธโƒฃ Fill up your model card
79
+ When we add extra information about models to the leaderboard, it will be automatically taken from the model card
80
 
81
+ ## In case of model failure
82
+ If your model is displayed in the `FAILED` category, its execution stopped. Make sure you have followed the above steps first. If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
83
  """
84
 
85
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"