Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Sean Cho
committed on
Commit
·
4c0ff9d
1
Parent(s):
41ebc94
Korean to English
Browse files- app.py +8 -8
- src/assets/text_content.py +42 -41
app.py
CHANGED
@@ -325,11 +325,11 @@ with demo:
|
|
325 |
)
|
326 |
with gr.Row():
|
327 |
deleted_models_visibility = gr.Checkbox(
|
328 |
-
value=True, label="๐
|
329 |
)
|
330 |
with gr.Column(min_width=320):
|
331 |
search_bar = gr.Textbox(
|
332 |
-
placeholder="๐
|
333 |
show_label=False,
|
334 |
elem_id="search-bar",
|
335 |
)
|
@@ -447,7 +447,7 @@ with demo:
|
|
447 |
|
448 |
with gr.Column():
|
449 |
with gr.Accordion(
|
450 |
-
f"โ
|
451 |
open=False,
|
452 |
):
|
453 |
with gr.Row():
|
@@ -458,7 +458,7 @@ with demo:
|
|
458 |
max_rows=5,
|
459 |
)
|
460 |
with gr.Accordion(
|
461 |
-
f"๐
|
462 |
open=False,
|
463 |
):
|
464 |
with gr.Row():
|
@@ -470,7 +470,7 @@ with demo:
|
|
470 |
)
|
471 |
|
472 |
with gr.Accordion(
|
473 |
-
f"โณ
|
474 |
open=False,
|
475 |
):
|
476 |
with gr.Row():
|
@@ -481,7 +481,7 @@ with demo:
|
|
481 |
max_rows=5,
|
482 |
)
|
483 |
with gr.Row():
|
484 |
-
gr.Markdown("# โ๏ธโจ
|
485 |
|
486 |
with gr.Row():
|
487 |
with gr.Column():
|
@@ -524,7 +524,7 @@ with demo:
|
|
524 |
)
|
525 |
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
526 |
|
527 |
-
submit_button = gr.Button("
|
528 |
submission_result = gr.Markdown()
|
529 |
submit_button.click(
|
530 |
add_new_eval,
|
@@ -541,7 +541,7 @@ with demo:
|
|
541 |
)
|
542 |
|
543 |
with gr.Row():
|
544 |
-
refresh_button = gr.Button("
|
545 |
refresh_button.click(
|
546 |
refresh,
|
547 |
inputs=[],
|
|
|
325 |
)
|
326 |
with gr.Row():
|
327 |
deleted_models_visibility = gr.Checkbox(
|
328 |
+
value=True, label="๐ Show gated/private/deleted models", interactive=True
|
329 |
)
|
330 |
with gr.Column(min_width=320):
|
331 |
search_bar = gr.Textbox(
|
332 |
+
placeholder="🔍 Search for your model and press ENTER...",
|
333 |
show_label=False,
|
334 |
elem_id="search-bar",
|
335 |
)
|
|
|
447 |
|
448 |
with gr.Column():
|
449 |
with gr.Accordion(
|
450 |
+
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
|
451 |
open=False,
|
452 |
):
|
453 |
with gr.Row():
|
|
|
458 |
max_rows=5,
|
459 |
)
|
460 |
with gr.Accordion(
|
461 |
+
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
|
462 |
open=False,
|
463 |
):
|
464 |
with gr.Row():
|
|
|
470 |
)
|
471 |
|
472 |
with gr.Accordion(
|
473 |
+
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
474 |
open=False,
|
475 |
):
|
476 |
with gr.Row():
|
|
|
481 |
max_rows=5,
|
482 |
)
|
483 |
with gr.Row():
|
484 |
+
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
|
485 |
|
486 |
with gr.Row():
|
487 |
with gr.Column():
|
|
|
524 |
)
|
525 |
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
526 |
|
527 |
+
submit_button = gr.Button("Submit Evaluation!")
|
528 |
submission_result = gr.Markdown()
|
529 |
submit_button.click(
|
530 |
add_new_eval,
|
|
|
541 |
)
|
542 |
|
543 |
with gr.Row():
|
544 |
+
refresh_button = gr.Button("Refresh")
|
545 |
refresh_button.click(
|
546 |
refresh,
|
547 |
inputs=[],
|
src/assets/text_content.py
CHANGED
@@ -1,62 +1,63 @@
|
|
1 |
from src.display_models.model_metadata_type import ModelType
|
2 |
|
3 |
-
TITLE = """<h1 align="center" id="space-title">๐ Open Ko-LLM Leaderboard
|
4 |
|
5 |
INTRODUCTION_TEXT = f"""
|
6 |
-
๐ Open Ko-LLM Leaderboard
|
7 |
|
8 |
-
"Submit here!"
|
9 |
-
|
10 |
-
|
|
|
11 |
|
12 |
-
|
13 |
"""
|
14 |
|
15 |
LLM_BENCHMARKS_TEXT = f"""
|
16 |
# Context
|
17 |
-
|
18 |
|
19 |
## Icons
|
20 |
{ModelType.PT.to_str(" : ")} model
|
21 |
{ModelType.FT.to_str(" : ")} model
|
22 |
{ModelType.IFT.to_str(" : ")} model
|
23 |
{ModelType.RL.to_str(" : ")} model
|
24 |
-
|
25 |
-
|
26 |
|
27 |
-
๐ดโโ ๏ธ :
|
28 |
-
(
|
29 |
|
30 |
## How it works
|
31 |
|
32 |
-
๐
|
33 |
-
- Ko-HellaSwag (
|
34 |
-
- Ko-MMLU (
|
35 |
-
- Ko-Arc (
|
36 |
-
- Ko-Truthful QA (
|
37 |
-
LLM
|
38 |
|
39 |
-
KT
|
40 |
|
41 |
-
##
|
42 |
-
-
|
43 |
-
-
|
44 |
|
45 |
-
##
|
46 |
-
|
47 |
|
48 |
-
##
|
49 |
-
|
50 |
-
|
51 |
"""
|
52 |
|
53 |
EVALUATION_QUEUE_TEXT = f"""
|
54 |
-
# ๐ Open-Ko LLM
|
55 |
-
|
56 |
|
57 |
-
##
|
58 |
|
59 |
-
### 1๏ธโฃ
|
60 |
```
|
61 |
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
62 |
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
@@ -64,21 +65,21 @@ model = AutoModel.from_pretrained("your model name", revision=revision)
|
|
64 |
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
65 |
```
|
66 |
|
67 |
-
|
68 |
-
โ ๏ธ
|
69 |
-
โ ๏ธ
|
70 |
|
71 |
-
### 2๏ธโฃ
|
72 |
-
|
73 |
|
74 |
-
### 3๏ธโฃ
|
75 |
-
๐ Open-Ko
|
76 |
|
77 |
-
### 4๏ธโฃ
|
78 |
-
|
79 |
|
80 |
-
##
|
81 |
-
|
82 |
"""
|
83 |
|
84 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
|
|
1 |
from src.display_models.model_metadata_type import ModelType
|
2 |
|
3 |
+
TITLE = """<h1 align="center" id="space-title">🚀 Open Ko-LLM Leaderboard 🇰🇷</h1>"""
|
4 |
|
5 |
INTRODUCTION_TEXT = f"""
|
6 |
+
🚀 The Open Ko-LLM Leaderboard 🇰🇷 objectively evaluates the performance of Korean large language models.
|
7 |
|
8 |
+
When you submit a model on the "Submit here!" page, it is automatically evaluated. The GPU used for evaluation is operated with the support of KT.
|
9 |
+
The data used for evaluation consists of datasets to assess expertise, inference ability, hallucination, and common sense.
|
10 |
+
The evaluation datasets are kept private and are available only for the evaluation process.
|
11 |
+
More detailed information about the benchmark dataset is provided on the "About" page.
|
12 |
|
13 |
+
This leaderboard is co-hosted by Upstage and NIA, and operated by Upstage.
|
14 |
"""
|
15 |
|
16 |
LLM_BENCHMARKS_TEXT = f"""
|
17 |
# Context
|
18 |
+
While outstanding LLMs are being released competitively, most of them are centered on English and are familiar with the English cultural sphere. We operate the Korean leaderboard, 🚀 Open Ko-LLM, to evaluate models that reflect the characteristics of the Korean language and Korean culture. Through this, we hope that users can conveniently use the leaderboard, participate, and contribute to the advancement of research in Korean.
|
19 |
|
20 |
## Icons
|
21 |
{ModelType.PT.to_str(" : ")} model
|
22 |
{ModelType.FT.to_str(" : ")} model
|
23 |
{ModelType.IFT.to_str(" : ")} model
|
24 |
{ModelType.RL.to_str(" : ")} model
|
25 |
+
If there is no icon, it indicates that there is insufficient information about the model.
|
26 |
+
Please provide information about the model through an issue! 🤩
|
27 |
|
28 |
+
🏴‍☠️ : This icon indicates that the model has been selected as a subject of caution by the community, implying that users should exercise restraint when using it. Clicking on the icon will take you to a discussion about that model.
|
29 |
+
(Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, are selected as subjects of caution.)
|
30 |
|
31 |
## How it works
|
32 |
|
33 |
+
📈 We have set up a benchmark using datasets translated into Korean from the four tasks (HellaSwag, MMLU, Arc, Truthful QA) operated by HuggingFace OpenLLM.
|
34 |
+
- Ko-HellaSwag (provided by Upstage)
|
35 |
+
- Ko-MMLU (provided by Upstage)
|
36 |
+
- Ko-Arc (provided by Upstage)
|
37 |
+
- Ko-Truthful QA (provided by Upstage)
|
38 |
+
To provide an evaluation befitting the LLM era, we've selected benchmark datasets suitable for assessing four elements: expertise, inference, hallucination, and common sense. The final score is the average of the scores on the four evaluation datasets.
|
39 |
|
40 |
+
GPUs are provided by KT for the evaluations.
|
41 |
|
42 |
+
## Details and Logs
|
43 |
+
- Detailed numerical results in the `results` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/results
|
44 |
+
- Community queries and running status in the `requests` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/requests
|
45 |
|
46 |
+
## Reproducibility
|
47 |
+
To reproduce our results, use [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of the EleutherAI lm-evaluation-harness.
|
48 |
|
49 |
+
## More resources
|
50 |
+
If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/1)!
|
51 |
+
We also gather cool resources from the community, other teams, and other labs [here](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/2)!
|
52 |
"""
|
53 |
|
54 |
EVALUATION_QUEUE_TEXT = f"""
|
55 |
+
# Evaluation Queue for the 🚀 Open-Ko LLM Leaderboard
|
56 |
+
Models added here will be automatically evaluated on the KT GPU cluster.
|
57 |
|
58 |
+
## Some good practices before submitting a model
|
59 |
|
60 |
+
### 1️⃣ Make sure you can load your model and tokenizer using AutoClasses
|
61 |
```
|
62 |
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
63 |
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
|
|
65 |
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
66 |
```
|
67 |
|
68 |
+
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
|
69 |
+
⚠️ Make sure your model is public!
|
70 |
+
⚠️ If your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it. Stay posted!
|
71 |
|
72 |
+
### 2️⃣ Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
|
73 |
+
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
|
74 |
|
75 |
+
### 3️⃣ Make sure your model has an open license!
|
76 |
+
This is a leaderboard for 🚀 Open-Ko LLMs, and we'd love for as many people as possible to know they can use your model.
|
77 |
|
78 |
+
### 4️⃣ Fill up your model card
|
79 |
+
When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
|
80 |
|
81 |
+
## In case of model failure
|
82 |
+
If your model is displayed in the `FAILED` category, its execution stopped. Make sure you have followed the above steps first. If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
83 |
"""
|
84 |
|
85 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|