add open-source
- app.py +4 -2
- eval-results/demo-leaderboard/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +4 -4
- eval-results/demo-leaderboard/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +4 -4
- eval-results/demo-leaderboard/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json +4 -4
- eval-results/demo-leaderboard/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json +4 -4
- eval-results/demo-leaderboard/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +4 -4
- eval-results/demo-leaderboard/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +4 -4
- eval-results/demo-leaderboard/gte-qwen2-1.5b_qwen2-72b/results_2023-12-08 15:46:20.425378.json +4 -4
- eval-results/demo-leaderboard/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json +4 -4
- eval-results/demo-leaderboard/qwen2-72b_bge-large-zh/results_2023-12-08 15:46:20.425378.json +6 -6
- eval-results/demo-leaderboard/qwen2-72b_bge-m3/results_2023-12-08 15:46:20.425378.json +6 -6
- eval-results/demo-leaderboard/qwen2-72b_e5-mistral-7b/results_2023-12-08 15:46:20.425378.json +6 -6
- eval-results/demo-leaderboard/qwen2-72b_gte-qwen2-1.5b/results_2023-12-08 15:46:20.425378.json +6 -6
- eval-results/demo-leaderboard/qwen2-72b_jina-zh/results_2023-12-08 15:46:20.425378.json +8 -8
- src/about.py +119 -17
- src/display/formatting.py +2 -0
- src/display/utils.py +7 -3
- src/leaderboard/read_evals.py +22 -16
- src/populate.py +3 -1
app.py
@@ -65,13 +65,15 @@ def init_leaderboard(dataframe):
         datatype=[c.type for c in fields(AutoEvalColumn)],
         select_columns=SelectColumns(
             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.never_displayed],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.
+        search_columns=[AutoEvalColumn.generative_model_link.name, AutoEvalColumn.retrieval_model_link.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(AutoEvalColumn.retrieval_model.name, type="checkboxgroup", label="Retrieval models"),
+            ColumnFilter(AutoEvalColumn.generative_model.name, type="checkboxgroup", label="Generative models"),
             # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             # ColumnFilter(
             #     AutoEvalColumn.params.name,
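For orientation, here is a minimal sketch of how the patched options sit inside `init_leaderboard`, assuming the `gradio_leaderboard` components this leaderboard template is built on (the `value=dataframe` wiring is an assumption; only the changed lines are confirmed by the hunk):

```python
# Sketch, not the repository's exact file: the patched select/search/filter
# options in context. AutoEvalColumn comes from src/display/utils.py (see below).
from dataclasses import fields

from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

from src.display.utils import AutoEvalColumn


def init_leaderboard(dataframe):
    return Leaderboard(
        value=dataframe,  # assumed: the populated results dataframe
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            # never_displayed columns must stay selected so the filters below keep their data
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.never_displayed],
            label="Select Columns to Display:",
        ),
        # free-text search runs over the clickable (markdown) model columns
        search_columns=[AutoEvalColumn.generative_model_link.name, AutoEvalColumn.retrieval_model_link.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
            # the two new checkbox groups filter on the plain-name columns
            ColumnFilter(AutoEvalColumn.retrieval_model.name, type="checkboxgroup", label="Retrieval models"),
            ColumnFilter(AutoEvalColumn.generative_model.name, type="checkboxgroup", label="Generative models"),
        ],
    )
```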
eval-results/demo-leaderboard/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json
@@ -19,16 +19,16 @@
     },
     "config": {
         "eval_name": "CLOSE_deepseek-v2-chat",
-        "
-        "
+        "generative_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
+        "generative_model_args": {
             "name": "deepseek-ai/DeepSeek-V2-Chat-0628",
-            "num_params":
+            "num_params": 236,
             "open_source": true
         },
         "retrieval_model": "CLOSE",
         "retrieval_model_args": {
             "num_params": 0.0,
-            "open_source":
+            "open_source": true
         }
     }
 }
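The remaining result files receive the same treatment, so this first one is representative: the commit fills in a `generative_model` / `generative_model_args` pair that mirrors the existing `retrieval_model` / `retrieval_model_args`. A small sketch of consuming the patched `config` block (the dict reproduces the file above; the `open_source` check anticipates the `read_evals.py` logic further down):

```python
# Shape of the "config" block after this commit, copied from the file above.
config = {
    "eval_name": "CLOSE_deepseek-v2-chat",
    "generative_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
    "generative_model_args": {
        "name": "deepseek-ai/DeepSeek-V2-Chat-0628",
        "num_params": 236,  # billions
        "open_source": True,
    },
    "retrieval_model": "CLOSE",  # closed-book runs use no retriever
    "retrieval_model_args": {"num_params": 0.0, "open_source": True},
}

# A pairing counts as open-source only if both halves are (cf. read_evals.py below).
open_source = (config["generative_model_args"]["open_source"]
               and config["retrieval_model_args"]["open_source"])
```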
eval-results/demo-leaderboard/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json
@@ -19,16 +19,16 @@
     },
     "config": {
         "eval_name": "CLOSE_llama3-70b-instruct",
-        "
-        "
+        "generative_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+        "generative_model_args": {
             "name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "num_params": 70,
+            "num_params": 70.6,
             "open_source": true
         },
         "retrieval_model": "CLOSE",
         "retrieval_model_args": {
             "num_params": 0.0,
-            "open_source":
+            "open_source": true
         }
     }
 }
eval-results/demo-leaderboard/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json
@@ -19,16 +19,16 @@
     },
     "config": {
         "eval_name": "CLOSE_qwen2-72b",
-        "
-        "
+        "generative_model": "Qwen/Qwen2.5-72B-Instruct",
+        "generative_model_args": {
             "name": "Qwen/Qwen2.5-72B-Instruct",
-            "num_params": 72,
+            "num_params": 72.7,
             "open_source": true
         },
         "retrieval_model": "CLOSE",
         "retrieval_model_args": {
             "num_params": 0.0,
-            "open_source":
+            "open_source": true
         }
     }
 }
eval-results/demo-leaderboard/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json
@@ -19,16 +19,16 @@
     },
     "config": {
         "eval_name": "CLOSE_yi15-34b",
-        "
-        "
+        "generative_model": "01ai/Yi-1.5-34B-Chat-16K",
+        "generative_model_args": {
             "name": "01ai/Yi-1.5-34B-Chat-16K",
-            "num_params": 34,
+            "num_params": 34.4,
             "open_source": true
         },
         "retrieval_model": "CLOSE",
         "retrieval_model_args": {
             "num_params": 0.0,
-            "open_source":
+            "open_source": true
         }
     }
 }
eval-results/demo-leaderboard/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json
@@ -19,16 +19,16 @@
     },
     "config": {
         "eval_name": "gte-qwen2-1.5b_deepseek-v2-chat",
-        "
-        "
+        "generative_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
+        "generative_model_args": {
             "name": "deepseek-ai/DeepSeek-V2-Chat-0628",
-            "num_params":
+            "num_params": 236,
             "open_source": true
         },
         "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         "retrieval_model_args": {
             "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-            "num_params": 1.
+            "num_params": 1.78,
             "open_source": true
         }
     }
 }
eval-results/demo-leaderboard/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json
@@ -19,16 +19,16 @@
     },
     "config": {
         "eval_name": "gte-qwen2-1.5b_llama3-70b-instruct",
-        "
-        "
+        "generative_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+        "generative_model_args": {
             "name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "num_params": 70,
+            "num_params": 70.6,
             "open_source": true
         },
         "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         "retrieval_model_args": {
             "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-            "num_params": 1.
+            "num_params": 1.78,
             "open_source": true
         }
     }
 }
eval-results/demo-leaderboard/gte-qwen2-1.5b_qwen2-72b/results_2023-12-08 15:46:20.425378.json
@@ -19,16 +19,16 @@
     },
     "config": {
         "eval_name": "gte-qwen2-1.5b_qwen2-72b",
-        "
-        "
+        "generative_model": "Qwen/Qwen2.5-72B-Instruct",
+        "generative_model_args": {
             "name": "Qwen/Qwen2.5-72B-Instruct",
-            "num_params": 72,
+            "num_params": 72.7,
             "open_source": true
         },
         "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         "retrieval_model_args": {
             "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-            "num_params": 1.
+            "num_params": 1.78,
             "open_source": true
         }
     }
 }
eval-results/demo-leaderboard/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json
@@ -19,16 +19,16 @@
     },
     "config": {
         "eval_name": "gte-qwen2-1.5b_yi15-34b",
-        "
-        "
+        "generative_model": "01ai/Yi-1.5-34B-Chat-16K",
+        "generative_model_args": {
             "name": "01ai/Yi-1.5-34B-Chat-16K",
-            "num_params": 34,
+            "num_params": 34.4,
             "open_source": true
         },
         "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         "retrieval_model_args": {
             "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-            "num_params": 1.
+            "num_params": 1.78,
             "open_source": true
         }
     }
 }
eval-results/demo-leaderboard/qwen2-72b_bge-large-zh/results_2023-12-08 15:46:20.425378.json
@@ -18,17 +18,17 @@
     }
     },
     "config": {
-        "eval_name": "
-        "
-        "
-            "name": "
-            "num_params":
+        "eval_name": "qwen2-72b_bge-large-zh",
+        "generative_model": "Qwen/Qwen2.5-72B-Instruct",
+        "generative_model_args": {
+            "name": "Qwen/Qwen2.5-72B-Instruct",
+            "num_params": 72.7,
             "open_source": true
         },
         "retrieval_model": "BAAI/bge-large-zh",
         "retrieval_model_args": {
             "name": "BAAI/bge-large-zh",
-            "num_params": 0.
+            "num_params": 0.326,
             "open_source": true
         }
     }
 }
eval-results/demo-leaderboard/qwen2-72b_bge-m3/results_2023-12-08 15:46:20.425378.json
@@ -18,17 +18,17 @@
     }
     },
     "config": {
-        "eval_name": "
-        "
-        "
-            "name": "
-            "num_params":
+        "eval_name": "qwen2-72b_bge-m3",
+        "generative_model": "Qwen/Qwen2.5-72B-Instruct",
+        "generative_model_args": {
+            "name": "Qwen/Qwen2.5-72B-Instruct",
+            "num_params": 72.7,
             "open_source": true
         },
         "retrieval_model": "BAAI/bge-m3",
         "retrieval_model_args": {
             "name": "BAAI/bge-m3",
-            "num_params": 0.
+            "num_params": 0.5,
             "open_source": true
         }
     }
 }
eval-results/demo-leaderboard/qwen2-72b_e5-mistral-7b/results_2023-12-08 15:46:20.425378.json
@@ -18,17 +18,17 @@
     }
     },
     "config": {
-        "eval_name": "
-        "
-        "
-            "name": "
-            "num_params": 7,
+        "eval_name": "qwen2-72b_e5-mistral-7b",
+        "generative_model": "Qwen/Qwen2.5-72B-Instruct",
+        "generative_model_args": {
+            "name": "Qwen/Qwen2.5-72B-Instruct",
+            "num_params": 72.7,
             "open_source": true
         },
         "retrieval_model": "intfloat/e5-mistral-7b-instruct",
         "retrieval_model_args": {
             "name": "intfloat/e5-mistral-7b-instruct",
-            "num_params": 7,
+            "num_params": 7.11,
             "open_source": true
         }
     }
 }
eval-results/demo-leaderboard/qwen2-72b_gte-qwen2-1.5b/results_2023-12-08 15:46:20.425378.json
@@ -18,17 +18,17 @@
     }
     },
     "config": {
-        "eval_name": "
-        "
-        "
-            "name": "
-            "num_params":
+        "eval_name": "qwen2-72b_gte-qwen2-1.5b",
+        "generative_model": "Qwen/Qwen2.5-72B-Instruct",
+        "generative_model_args": {
+            "name": "Qwen/Qwen2.5-72B-Instruct",
+            "num_params": 72.7,
             "open_source": true
         },
         "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         "retrieval_model_args": {
             "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-            "num_params": 1.
+            "num_params": 1.78,
             "open_source": true
         }
     }
 }
eval-results/demo-leaderboard/qwen2-72b_jina-zh/results_2023-12-08 15:46:20.425378.json
@@ -18,17 +18,17 @@
     }
     },
     "config": {
-        "eval_name": "
-        "
-        "
-            "name": "
-            "num_params":
+        "eval_name": "qwen2-72b_jina-zh",
+        "generative_model": "Qwen/Qwen2.5-72B-Instruct",
+        "generative_model_args": {
+            "name": "Qwen/Qwen2.5-72B-Instruct",
+            "num_params": 72.7,
             "open_source": true
         },
-        "retrieval_model": "jinaai/
+        "retrieval_model": "jinaai/jina-embeddings-v2-base-zh",
         "retrieval_model_args": {
-            "name": "jinaai/
-            "num_params": 0.
+            "name": "jinaai/jina-embeddings-v2-base-zh",
+            "num_params": 0.161,
             "open_source": true
         }
     }
 }
src/about.py
@@ -16,21 +16,21 @@ class Tasks(Enum):
     # task1 = Task("logiqa", "acc_norm", "LogiQA")
 
     # retrieval tasks
-    mrr = Task("retrieval", "mrr", "MRR")
-    map = Task("retrieval", "map", "MAP")
+    mrr = Task("retrieval", "mrr", "MRR ⬆️")
+    map = Task("retrieval", "map", "MAP ⬆️")
 
     # generation tasks
-    em = Task("generation", "em", "EM")
-    f1 = Task("generation", "f1", "F1")
-    rouge1 = Task("generation", "rouge1", "Rouge-1")
-    rouge2 = Task("generation", "rouge2", "Rouge-2")
-    rougeL = Task("generation", "rougeL", "Rouge-L")
+    em = Task("generation", "em", "EM ⬆️")
+    f1 = Task("generation", "f1", "F1 ⬆️")
+    rouge1 = Task("generation", "rouge1", "Rouge-1 ⬆️")
+    rouge2 = Task("generation", "rouge2", "Rouge-2 ⬆️")
+    rougeL = Task("generation", "rougeL", "Rouge-L ⬆️")
 
-    accuracy = Task("generation", "accuracy", "ACC")
-    completeness = Task("generation", "completeness", "COMP")
-    hallucination = Task("generation", "hallucination", "HAL")
-    utilization = Task("generation", "utilization", "UTIL")
-    numerical_accuracy = Task("generation", "numerical_accuracy", "MACC")
+    accuracy = Task("generation", "accuracy", "ACC ⬆️")
+    completeness = Task("generation", "completeness", "COMP ⬆️")
+    hallucination = Task("generation", "hallucination", "HAL ⬇️")
+    utilization = Task("generation", "utilization", "UTIL ⬆️")
+    numerical_accuracy = Task("generation", "numerical_accuracy", "MACC ⬆️")
 
 
 NUM_FEWSHOT = 0 # Change with your few shot
@@ -39,19 +39,121 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title"
+TITLE = """<h1 align="center" id="space-title">🏅 OmniEval Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
+<div align="center">OmniEval: Omnidirectional and Automatic RAG Evaluation Benchmark in Financial Domain</div>
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-
-## Reproducibility
-To reproduce our results, here is the commands you can run:
+# <div align="center">OmniEval: Omnidirectional and Automatic RAG Evaluation Benchmark in Financial Domain</div>
+
+
+<div align="center">
+<!-- <a href="https://arxiv.org/abs/2405.13576" target="_blank"><img src=https://img.shields.io/badge/arXiv-b5212f.svg?logo=arxiv></a> -->
+<!-- <a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace%20Datasets-27b3b4.svg></a> -->
+<!-- <a href="https://huggingface.co/ShootingWong/OmniEval-ModelEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace%20Checkpoint-5fc372.svg></a> -->
+<!-- <a href="https://huggingface.co/ShootingWong/OmniEval-HallucinationEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace%20Checkpoint-b181d9.svg></a> -->
+<a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-27b3b4></a>
+<a href="https://huggingface.co/ShootingWong/OmniEval-ModelEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Checkpoint-5fc372></a>
+<a href="https://huggingface.co/ShootingWong/OmniEval-HallucinationEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Checkpoint-b181d9></a>
+<a href="https://huggingface.co/spaces/NLPIR-RAG/OmniEval" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Leaderboard-blue></a>
+<a href="https://github.com/RUC-NLPIR/FlashRAG/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/badge/LICENSE-MIT-green"></a>
+<a><img alt="Static Badge" src="https://img.shields.io/badge/made_with-Python-blue"></a>
+</div>
+
+<!-- [![HuggingFace](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Leaderboard-blue)](https://huggingface.co/spaces/Vchitect/VBench_Leaderboard) -->
+
+<h4 align="center">
+
+<p>
+<a href="#wrench-installation">Installation</a> |
+<!-- <a href="#sparkles-features">Features</a> | -->
+<a href="#rocket-quick-start">Quick-Start</a> |
+<a href="#bookmark-license">License</a> |
+<a href="#star2-citation">Citation</a>
+
+</p>
+
+</h4>
+
+<!--
+With FlashRAG and provided resources, you can effortlessly reproduce existing SOTA works in the RAG domain or implement your custom RAG processes and components. -->
+
+
+## :wrench: Installation
+`conda env create -f environment.yml && conda activate finrag`
+
+<!-- ## :sparkles: Features
+1. -->
+## :rocket: Quick-Start
+Note:
+1. Commands below are run from `./OpenFinBench`.
+2. We provide our auto-generated evaluation dataset in <a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-27b3b4></a>
+### 1. Build the Retrieval Corpus
+```
+# cd OpenFinBench
+sh corpus_builder/build_corpus.sh # See the comments in the bash file to set parameters.
+```
+### 2. Generate Evaluation Data Samples
+1. Generate evaluation instances
+```
+# cd OpenFinBench
+sh data_generator/generate_data.sh
+```
+2. Filter (quality-inspect) the evaluation instances
+```
+sh data_generator/generate_data_filter.sh
+```
+### 3. Run Inference with Your Models
+```
+# cd OpenFinBench
+sh evaluator/inference/rag_inference.sh
+```
+### 4. Evaluate Your Models
+#### (a) Rule-based Evaluation
+```
+# cd OpenFinBench
+sh evaluator/judgement/judger.sh # by setting judge_type="rule"
+```
+#### (b) Model-based Evaluation
+We propose five model-based metrics: accuracy, completeness, utilization, numerical_accuracy, and hallucination. We trained two evaluators from Qwen2.5-7B, using LoRA and human-annotated labels, to implement model-based evaluation.
+
+Note that the hallucination evaluator is distinct from the other four. The checkpoints can be loaded from the following Hugging Face links:
+1. The evaluator for the hallucination metric: <a href="https://huggingface.co/ShootingWong/OmniEval-HallucinationEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Checkpoint-b181d9></a>
+2. The evaluator for the other metrics: <a href="https://huggingface.co/ShootingWong/OmniEval-ModelEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Checkpoint-5fc372></a>
+
+
+To run model-based evaluation, first set up two vllm servers:
+```
+```
+
+Then run the model-based evaluation with the following command (adjust the parameters in the bash file):
+```
+sh evaluator/judgement/judger.sh
+```
+
+## :bookmark: License
+
+OmniEval is licensed under the [<u>MIT License</u>](./LICENSE).
+
+## :star2: Citation
+The paper will be released soon!
+
+<!-- # Check Infos
+## Pipeline
+1. Build corpus
+2. Data generation
+3. RAG inference
+4. Result evaluation
+
+## Code
+1. remove "baichuan"
+2. remove useless annotations -->
 
 """
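The fenced block after "set up two vllm servers" is left empty in this commit. A purely hypothetical sketch of what it might contain, assuming the two evaluator checkpoints are served via vLLM's OpenAI-compatible entrypoint; the ports and launch command are illustrative, not the repository's actual script:

```python
# Hypothetical: one vLLM OpenAI-compatible server per evaluator checkpoint.
import subprocess

evaluators = {
    8001: "ShootingWong/OmniEval-ModelEvaluator",          # ACC / COMP / UTIL / MACC
    8002: "ShootingWong/OmniEval-HallucinationEvaluator",  # HAL
}
procs = [
    subprocess.Popen([
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--model", repo_id, "--port", str(port),
    ])
    for port, repo_id in evaluators.items()
]
```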
src/display/formatting.py
@@ -5,6 +5,8 @@ def model_hyperlink(link, model_name):
 def make_clickable_model(model_name, model_link=None):
     if model_link:
         return model_hyperlink(model_link, model_name)
+    if model_name == "CLOSE":
+        return model_name
     link = f"https://huggingface.co/{model_name}"
     return model_hyperlink(link, model_name)
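The new branch keeps the closed-book marker "CLOSE" from being rendered as a dead Hub link. A self-contained sketch of the behavior, where `model_hyperlink` is assumed to wrap the name in an anchor tag (its real definition sits just above in this file):

```python
def model_hyperlink(link, model_name):
    # assumed shape of the helper defined earlier in formatting.py
    return f'<a target="_blank" href="{link}">{model_name}</a>'


def make_clickable_model(model_name, model_link=None):
    if model_link:
        return model_hyperlink(model_link, model_name)
    if model_name == "CLOSE":  # closed-book runs have no Hub page to link to
        return model_name
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink(link, model_name)


print(make_clickable_model("CLOSE"))        # CLOSE (plain text)
print(make_clickable_model("BAAI/bge-m3"))  # <a ... href="https://huggingface.co/BAAI/bge-m3">BAAI/bge-m3</a>
```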
src/display/utils.py
@@ -17,17 +17,21 @@ class ColumnContent:
     displayed_by_default: bool
     hidden: bool = False
     never_hidden: bool = False
+    never_displayed: bool = False
 
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("Type Symbol", "str", True, never_hidden=True)])
 # auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-auto_eval_column_dict.append(["retrieval_model", ColumnContent, ColumnContent("Retrieval Model", "markdown",
-auto_eval_column_dict.append(["
+auto_eval_column_dict.append(["retrieval_model", ColumnContent, ColumnContent("Retrieval Model Plain", "markdown", False, never_displayed=True)])
+auto_eval_column_dict.append(["generative_model", ColumnContent, ColumnContent("Generative Model Plain", "markdown", False, never_displayed=True)])
+auto_eval_column_dict.append(["retrieval_model_link", ColumnContent, ColumnContent("Retrieval Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["generative_model_link", ColumnContent, ColumnContent("Generative Model", "markdown", True, never_hidden=True)])
 
 #Scores
-auto_eval_column_dict.append(["
+auto_eval_column_dict.append(["gen_average", ColumnContent, ColumnContent("Gen Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["ret_average", ColumnContent, ColumnContent("Ret Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
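The pattern here keeps two copies of each model column: a plain one (flagged `never_displayed`, consumed by the checkbox filters in app.py) and a `_link` one carrying the clickable markdown shown in the table. A minimal sketch of the dataclass, assuming the first two fields are the column header and the Gradio datatype (only the last four fields are visible in the hunk):

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str                      # assumed: header shown in the dataframe
    type: str                      # assumed: gradio datatype ("str", "markdown", "number")
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    never_displayed: bool = False  # new: present in the data for filters, never rendered


# e.g. the plain column that feeds the "Retrieval models" checkbox filter
plain = ColumnContent("Retrieval Model Plain", "markdown", False, never_displayed=True)
```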
src/leaderboard/read_evals.py
@@ -18,12 +18,12 @@ class EvalResult:
     """
     eval_name: str # org_model_precision (uid)
     org: str
-
+    generative_model: str
     retrieval_model: str
     # revision: str # commit hash, "" if main
     results: dict
-
-
+    generative_model_link: str = "" # link to the model on the hub
+    generative_model_args: dict = None
     retrieval_model_link: str = "" # link to the model on the hub
     retrieval_model_args: dict = None
     precision: Precision = Precision.Unknown
@@ -48,7 +48,7 @@ class EvalResult:
 
         # Get model and org
         eval_name= config.get("eval_name", "")
-
+        generative_model = config.get("generative_model", "")
         retrieval_model = config.get("retrieval_model", "")
         org= config.get("org", "")
         # org_and_model = org_and_model.split("/", 1)
@@ -77,17 +77,21 @@ class EvalResult:
             task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k and task.col_name != "hallucination"])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
+            results[task.metric] = data["results"][task.benchmark].get(task.metric, None)
+            if results[task.metric] is not None:
+                results[task.metric] = results[task.metric] * 100.0
 
+        generative_model_args = config.get("generative_model_args", None)
         retrieval_model_args = config.get("retrieval_model_args", None)
         open_source= True
-        if not
+        if not generative_model_args or not generative_model_args.get("open_source", False):
             open_source = False
         if not retrieval_model_args or not retrieval_model_args.get("open_source", False):
             open_source = False
@@ -96,10 +100,10 @@ class EvalResult:
             eval_name=eval_name,
             # full_model=full_model,
             org=org,
+            generative_model=generative_model,
             retrieval_model=retrieval_model,
             results=results,
+            generative_model_args=generative_model_args,
             retrieval_model_args=retrieval_model_args,
             model_type=ModelType.OpenSource if open_source else ModelType.ClosedSource,
             # precision=precision,
@@ -126,7 +130,6 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             # AutoEvalColumn.precision.name: self.precision.value.name,
@@ -134,20 +137,23 @@ class EvalResult:
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
             # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             # AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.
-            AutoEvalColumn.retrieval_model.name:
+            AutoEvalColumn.generative_model.name: self.generative_model,
+            AutoEvalColumn.retrieval_model.name: self.retrieval_model,
+            AutoEvalColumn.generative_model_link.name: make_clickable_model(self.generative_model, self.generative_model_link),
+            AutoEvalColumn.retrieval_model_link.name: make_clickable_model(self.retrieval_model, self.retrieval_model_link),
             # AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.
+            AutoEvalColumn.ret_average.name: self.results["retrieval"],
+            AutoEvalColumn.gen_average.name: self.results["generation"],
             # AutoEvalColumn.license.name: self.license,
             # AutoEvalColumn.likes.name: self.likes,
-            # AutoEvalColumn.
+            # AutoEvalColumn.generative_model_params.name: self.num_params,
             # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-            "Gen#Params (B)": self.
+            "Gen#Params (B)": self.generative_model_args.get("num_params", "Unknown"),
             "Ret#Params (B)": self.retrieval_model_args.get("num_params", "Unknown"),
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.
+            data_dict[task.value.col_name] = self.results[task.value.metric]
 
         return data_dict
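To see what the reworked aggregation produces, here is a condensed sketch of the per-task loop under illustrative numbers (`(benchmark, metric)` pairs stand in for the `Tasks` enum; the patched code additionally skips the hallucination column when averaging, and guards the per-metric value against `None` before scaling):

```python
import numpy as np

# Illustrative "results" block in the shape of the eval-results files above.
data = {"results": {
    "retrieval": {"mrr": 0.412, "map": 0.387},
    "generation": {"em": 0.214, "f1": 0.296},
}}

results = {}
for benchmark, metric in [("retrieval", "mrr"), ("retrieval", "map"),
                          ("generation", "em"), ("generation", "f1")]:
    accs = np.array([v.get(metric) for k, v in data["results"].items() if k == benchmark])
    if accs.size == 0 or any(acc is None for acc in accs):
        continue
    results[benchmark] = np.mean(accs) * 100.0                    # feeds Ret/Gen Average ⬆️
    results[metric] = data["results"][benchmark][metric] * 100.0  # feeds the per-metric columns

# Each task overwrites results[benchmark], so the "average" slot ends up holding the
# benchmark's last metric in the enum (here map and f1): 38.7 and 29.6.
print(results["retrieval"], results["generation"])
```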
src/populate.py
@@ -15,8 +15,10 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     print(all_data_json)
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.
+    df = df.sort_values(by=[AutoEvalColumn.gen_average.name], ascending=False)
     df = df[cols].round(decimals=2)
+    # assert column 'Retrieval Model Plain' in df.columns
+    # assert AutoEvalColumn.retrieval_model.name in df.columns, f"Column {df.columns} does not contain 'Retrieval Model Plain'"
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
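Finally, a small sketch of the new default ordering, using the column names defined in the utils.py hunk above (rows are made up):

```python
import pandas as pd

# "Gen Average ⬆️" matches AutoEvalColumn.gen_average.name from src/display/utils.py.
df = pd.DataFrame.from_records([
    {"Generative Model Plain": "Qwen/Qwen2.5-72B-Instruct",              "Gen Average ⬆️": 43.18},
    {"Generative Model Plain": "meta-llama/Meta-Llama-3.1-70B-Instruct", "Gen Average ⬆️": 47.52},
])
df = df.sort_values(by=["Gen Average ⬆️"], ascending=False)  # best generation average first
print(df.iloc[0]["Generative Model Plain"])  # meta-llama/Meta-Llama-3.1-70B-Instruct
```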