Commit 921b8ba by zstanjj
1 Parent(s): 67ce912

add open-source

app.py CHANGED
@@ -65,13 +65,15 @@ def init_leaderboard(dataframe):
         datatype=[c.type for c in fields(AutoEvalColumn)],
         select_columns=SelectColumns(
             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.never_displayed],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.generation_model.name, AutoEvalColumn.retrieval_model.name],
+        search_columns=[AutoEvalColumn.generative_model_link.name, AutoEvalColumn.retrieval_model_link.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(AutoEvalColumn.retrieval_model.name, type="checkboxgroup", label="Retrieval models"),
+            ColumnFilter(AutoEvalColumn.generative_model.name, type="checkboxgroup", label="Generative models"),
             # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             # ColumnFilter(
             #     AutoEvalColumn.params.name,
eval-results/demo-leaderboard/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json CHANGED
@@ -19,16 +19,16 @@
     },
     "config": {
         "eval_name": "CLOSE_deepseek-v2-chat",
-        "generation_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
-        "generation_model_args": {
+        "generative_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
+        "generative_model_args": {
             "name": "deepseek-ai/DeepSeek-V2-Chat-0628",
-            "num_params": 80,
+            "num_params": 236,
             "open_source": true
         },
         "retrieval_model": "CLOSE",
         "retrieval_model_args": {
             "num_params": 0.0,
-            "open_source": false
+            "open_source": true
         }
     }
 }
eval-results/demo-leaderboard/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json CHANGED
@@ -19,16 +19,16 @@
     },
     "config": {
         "eval_name": "CLOSE_llama3-70b-instruct",
-        "generation_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-        "generation_model_args": {
+        "generative_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+        "generative_model_args": {
             "name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "num_params": 70,
+            "num_params": 70.6,
             "open_source": true
         },
         "retrieval_model": "CLOSE",
         "retrieval_model_args": {
             "num_params": 0.0,
-            "open_source": false
+            "open_source": true
         }
     }
 }
eval-results/demo-leaderboard/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json CHANGED
@@ -19,16 +19,16 @@
     },
     "config": {
         "eval_name": "CLOSE_qwen2-72b",
-        "generation_model": "Qwen/Qwen2.5-72B-Instruct",
-        "generation_model_args": {
+        "generative_model": "Qwen/Qwen2.5-72B-Instruct",
+        "generative_model_args": {
             "name": "Qwen/Qwen2.5-72B-Instruct",
-            "num_params": 72,
+            "num_params": 72.7,
             "open_source": true
         },
         "retrieval_model": "CLOSE",
         "retrieval_model_args": {
             "num_params": 0.0,
-            "open_source": false
+            "open_source": true
         }
     }
 }
eval-results/demo-leaderboard/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json CHANGED
@@ -19,16 +19,16 @@
     },
     "config": {
         "eval_name": "CLOSE_yi15-34b",
-        "generation_model": "01ai/Yi-1.5-34B-Chat-16K",
-        "generation_model_args": {
+        "generative_model": "01ai/Yi-1.5-34B-Chat-16K",
+        "generative_model_args": {
             "name": "01ai/Yi-1.5-34B-Chat-16K",
-            "num_params": 34,
+            "num_params": 34.4,
             "open_source": true
         },
         "retrieval_model": "CLOSE",
         "retrieval_model_args": {
             "num_params": 0.0,
-            "open_source": false
+            "open_source": true
         }
     }
 }
eval-results/demo-leaderboard/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json CHANGED
@@ -19,16 +19,16 @@
     },
     "config": {
         "eval_name": "gte-qwen2-1.5b_deepseek-v2-chat",
-        "generation_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
-        "generation_model_args": {
+        "generative_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
+        "generative_model_args": {
             "name": "deepseek-ai/DeepSeek-V2-Chat-0628",
-            "num_params": 80,
+            "num_params": 236,
             "open_source": true
         },
         "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         "retrieval_model_args": {
             "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-            "num_params": 1.5,
+            "num_params": 1.78,
             "open_source": true
         }
     }
eval-results/demo-leaderboard/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json CHANGED
@@ -19,16 +19,16 @@
     },
     "config": {
         "eval_name": "gte-qwen2-1.5b_llama3-70b-instruct",
-        "generation_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-        "generation_model_args": {
+        "generative_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+        "generative_model_args": {
             "name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "num_params": 70,
+            "num_params": 70.6,
             "open_source": true
         },
         "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         "retrieval_model_args": {
             "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-            "num_params": 1.5,
+            "num_params": 1.78,
             "open_source": true
         }
     }
eval-results/demo-leaderboard/gte-qwen2-1.5b_qwen2-72b/results_2023-12-08 15:46:20.425378.json CHANGED
@@ -19,16 +19,16 @@
     },
     "config": {
         "eval_name": "gte-qwen2-1.5b_qwen2-72b",
-        "generation_model": "Qwen/Qwen2.5-72B-Instruct",
-        "generation_model_args": {
+        "generative_model": "Qwen/Qwen2.5-72B-Instruct",
+        "generative_model_args": {
             "name": "Qwen/Qwen2.5-72B-Instruct",
-            "num_params": 72,
+            "num_params": 72.7,
             "open_source": true
         },
         "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         "retrieval_model_args": {
             "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-            "num_params": 1.5,
+            "num_params": 1.78,
             "open_source": true
         }
     }
eval-results/demo-leaderboard/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json CHANGED
@@ -19,16 +19,16 @@
     },
     "config": {
         "eval_name": "gte-qwen2-1.5b_yi15-34b",
-        "generation_model": "01ai/Yi-1.5-34B-Chat-16K",
-        "generation_model_args": {
+        "generative_model": "01ai/Yi-1.5-34B-Chat-16K",
+        "generative_model_args": {
             "name": "01ai/Yi-1.5-34B-Chat-16K",
-            "num_params": 34,
+            "num_params": 34.4,
             "open_source": true
         },
         "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         "retrieval_model_args": {
             "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-            "num_params": 1.5,
+            "num_params": 1.78,
             "open_source": true
         }
     }
eval-results/demo-leaderboard/qwen2-72b_bge-large-zh/results_2023-12-08 15:46:20.425378.json CHANGED
@@ -18,17 +18,17 @@
         }
     },
     "config": {
-        "eval_name": "bge-large-zh_bge-large-zh",
-        "generation_model": "BAAI/bge-large-zh",
-        "generation_model_args": {
-            "name": "BAAI/bge-large-zh",
-            "num_params": 0.2,
+        "eval_name": "qwen2-72b_bge-large-zh",
+        "generative_model": "Qwen/Qwen2.5-72B-Instruct",
+        "generative_model_args": {
+            "name": "Qwen/Qwen2.5-72B-Instruct",
+            "num_params": 72.7,
             "open_source": true
         },
         "retrieval_model": "BAAI/bge-large-zh",
         "retrieval_model_args": {
             "name": "BAAI/bge-large-zh",
-            "num_params": 0.2,
+            "num_params": 0.326,
             "open_source": true
         }
     }
eval-results/demo-leaderboard/qwen2-72b_bge-m3/results_2023-12-08 15:46:20.425378.json CHANGED
@@ -18,17 +18,17 @@
         }
     },
     "config": {
-        "eval_name": "bge-m3_bge-m3",
-        "generation_model": "BAAI/bge-m3",
-        "generation_model_args": {
-            "name": "BAAI/bge-m3",
-            "num_params": 0.2,
+        "eval_name": "qwen2-72b_bge-m3",
+        "generative_model": "Qwen/Qwen2.5-72B-Instruct",
+        "generative_model_args": {
+            "name": "Qwen/Qwen2.5-72B-Instruct",
+            "num_params": 72.7,
             "open_source": true
         },
         "retrieval_model": "BAAI/bge-m3",
         "retrieval_model_args": {
             "name": "BAAI/bge-m3",
-            "num_params": 0.2,
+            "num_params": 0.5,
             "open_source": true
         }
     }
eval-results/demo-leaderboard/qwen2-72b_e5-mistral-7b/results_2023-12-08 15:46:20.425378.json CHANGED
@@ -18,17 +18,17 @@
         }
     },
     "config": {
-        "eval_name": "e5-mistral-7b_e5-mistral-7b",
-        "generation_model": "intfloat/e5-mistral-7b-instruct",
-        "generation_model_args": {
-            "name": "intfloat/e5-mistral-7b-instruct",
-            "num_params": 7,
+        "eval_name": "qwen2-72b_e5-mistral-7b",
+        "generative_model": "Qwen/Qwen2.5-72B-Instruct",
+        "generative_model_args": {
+            "name": "Qwen/Qwen2.5-72B-Instruct",
+            "num_params": 72.7,
             "open_source": true
         },
         "retrieval_model": "intfloat/e5-mistral-7b-instruct",
         "retrieval_model_args": {
             "name": "intfloat/e5-mistral-7b-instruct",
-            "num_params": 7,
+            "num_params": 7.11,
             "open_source": true
         }
     }
eval-results/demo-leaderboard/qwen2-72b_gte-qwen2-1.5b/results_2023-12-08 15:46:20.425378.json CHANGED
@@ -18,17 +18,17 @@
         }
     },
     "config": {
-        "eval_name": "gte-qwen2-1.5b_gte-qwen2-1.5b",
-        "generation_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-        "generation_model_args": {
-            "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-            "num_params": 1.5,
+        "eval_name": "qwen2-72b_gte-qwen2-1.5b",
+        "generative_model": "Qwen/Qwen2.5-72B-Instruct",
+        "generative_model_args": {
+            "name": "Qwen/Qwen2.5-72B-Instruct",
+            "num_params": 72.7,
             "open_source": true
         },
         "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         "retrieval_model_args": {
             "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-            "num_params": 1.5,
+            "num_params": 1.78,
             "open_source": true
         }
     }
eval-results/demo-leaderboard/qwen2-72b_jina-zh/results_2023-12-08 15:46:20.425378.json CHANGED
@@ -18,17 +18,17 @@
         }
     },
     "config": {
-        "eval_name": "jina-zh_jina-zh",
-        "generation_model": "jinaai/reader-lm-0.5b",
-        "generation_model_args": {
-            "name": "jinaai/reader-lm-0.5b",
-            "num_params": 0.2,
+        "eval_name": "qwen2-72b_jina-zh",
+        "generative_model": "Qwen/Qwen2.5-72B-Instruct",
+        "generative_model_args": {
+            "name": "Qwen/Qwen2.5-72B-Instruct",
+            "num_params": 72.7,
             "open_source": true
         },
-        "retrieval_model": "jinaai/reader-lm-0.5b",
+        "retrieval_model": "jinaai/jina-embeddings-v2-base-zh",
         "retrieval_model_args": {
-            "name": "jinaai/reader-lm-0.5b",
-            "num_params": 0.2,
+            "name": "jinaai/jina-embeddings-v2-base-zh",
+            "num_params": 0.161,
             "open_source": true
         }
     }
src/about.py CHANGED
@@ -16,21 +16,21 @@ class Tasks(Enum):
     # task1 = Task("logiqa", "acc_norm", "LogiQA")
 
     # retrieval tasks
-    mrr = Task("retrieval", "mrr", "MRR")
-    map = Task("retrieval", "map", "MAP")
+    mrr = Task("retrieval", "mrr", "MRR ⬆️")
+    map = Task("retrieval", "map", "MAP ⬆️")
 
     # generation tasks
-    em = Task("generation", "em", "EM")
-    f1 = Task("generation", "f1", "F1")
-    rouge1 = Task("generation", "rouge1", "Rouge-1")
-    rouge2 = Task("generation", "rouge2", "Rouge-2")
-    rougeL = Task("generation", "rougeL", "Rouge-L")
+    em = Task("generation", "em", "EM ⬆️")
+    f1 = Task("generation", "f1", "F1 ⬆️")
+    rouge1 = Task("generation", "rouge1", "Rouge-1 ⬆️")
+    rouge2 = Task("generation", "rouge2", "Rouge-2 ⬆️")
+    rougeL = Task("generation", "rougeL", "Rouge-L ⬆️")
 
-    accuracy = Task("generation", "accuracy", "ACC")
-    completeness = Task("generation", "completeness", "COMP")
-    hallucination = Task("generation", "hallucination", "HAL")
-    utilization = Task("generation", "utilization", "UTIL")
-    numerical_accuracy = Task("generation", "numerical_accuracy", "MACC")
+    accuracy = Task("generation", "accuracy", "ACC ⬆️")
+    completeness = Task("generation", "completeness", "COMP ⬆️")
+    hallucination = Task("generation", "hallucination", "HAL ⬇️")
+    utilization = Task("generation", "utilization", "UTIL ⬆️")
+    numerical_accuracy = Task("generation", "numerical_accuracy", "MACC ⬆️")
 
 
 NUM_FEWSHOT = 0 # Change with your few shot
@@ -39,19 +39,121 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Fin Benchmark leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">🏅 OmniEval Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+<div align="center">OmniEval: Omnidirectional and Automatic RAG Evaluation Benchmark in Financial Domain</div>
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-## How it works
+# <div align="center">OmniEval: Omnidirectional and Automatic RAG Evaluation Benchmark in Financial Domain</div>
+
+
+<div align="center">
+<!-- <a href="https://arxiv.org/abs/2405.13576" target="_blank"><img src=https://img.shields.io/badge/arXiv-b5212f.svg?logo=arxiv></a> -->
+<!-- <a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace%20Datasets-27b3b4.svg></a> -->
+<!-- <a href="https://huggingface.co/ShootingWong/OmniEval-ModelEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace%20Checkpoint-5fc372.svg></a> -->
+<!-- <a href="https://huggingface.co/ShootingWong/OmniEval-HallucinationEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace%20Checkpoint-b181d9.svg></a> -->
+<a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-27b3b4></a>
+<a href="https://huggingface.co/ShootingWong/OmniEval-ModelEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Checkpoint-5fc372></a>
+<a href="https://huggingface.co/ShootingWong/OmniEval-HallucinationEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Checkpoint-b181d9></a>
+<a href="https://huggingface.co/spaces/NLPIR-RAG/OmniEval" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Leaderboard-blue></a>
+<a href="https://github.com/RUC-NLPIR/FlashRAG/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/badge/LICENSE-MIT-green"></a>
+<a><img alt="Static Badge" src="https://img.shields.io/badge/made_with-Python-blue"></a>
+</div>
+
+<!-- [![HuggingFace](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Leaderboard-blue)](https://huggingface.co/spaces/Vchitect/VBench_Leaderboard) -->
+
+<h4 align="center">
+
+<p>
+<a href="#wrench-installation">Installation</a> |
+<!-- <a href="#sparkles-features">Features</a> | -->
+<a href="#rocket-quick-start">Quick-Start</a> |
+<a href="#bookmark-license">License</a> |
+<a href="#star2-citation">Citation</a>
+
+</p>
+
+</h4>
+
+<!--
+With FlashRAG and provided resources, you can effortlessly reproduce existing SOTA works in the RAG domain or implement your custom RAG processes and components. -->
+
+
+## :wrench: Installation
+`conda env create -f environment.yml && conda activate finrag`
+
+<!-- ## :sparkles: Features
+1. -->
+## :rocket: Quick-Start
+Note:
+1. The working directory for the commands below is `./OpenFinBench`.
+2. We provide our auto-generated evaluation dataset in <a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-27b3b4></a>
+### 1. Build the Retrieval Corpus
+```
+# cd OpenFinBench
+sh corpus_builder/build_corpus.sh # Please see the annotations inside the bash file to set parameters.
+```
+### 2. Generate Evaluation Data Samples
+1. Generate evaluation instances
+```
+# cd OpenFinBench
+sh data_generator/generate_data.sh
+```
+2. Filter (quality inspection) evaluation instances
+```
+sh data_generator/generate_data_filter.sh
+```
+### 3. Inference Your Models
+```
+# cd OpenFinBench
+sh evaluator/inference/rag_inference.sh
+```
+### 4. Evaluate Your Models
+#### (a) Rule-based Evaluation
+```
+# cd OpenFinBench
+sh evaluator/judgement/judger.sh # by setting judge_type="rule"
+```
+#### (b) Model-based Evaluation
+We propose five model-based metrics: accuracy, completeness, utilization, numerical_accuracy, and hallucination. We trained two evaluator models from Qwen2.5-7B, using the LoRA strategy and human-annotated labels, to implement model-based evaluation.
+
+Note that the evaluator for hallucination is different from the other four. The model checkpoints can be loaded from the following Hugging Face links:
+1. The evaluator for the hallucination metric: <a href="https://huggingface.co/ShootingWong/OmniEval-HallucinationEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Checkpoint-b181d9></a>
+2. The evaluator for the other metrics: <a href="https://huggingface.co/ShootingWong/OmniEval-ModelEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Checkpoint-5fc372></a>
+
+
+
+To implement model-based evaluation, first set up two vllm servers with the following commands:
+```
+```
+
+Then run the model-based evaluation with the following command (change the parameters inside the bash file):
+```
+sh evaluator/judgement/judger.sh
+```
+
+## :bookmark: License
+
+OmniEval is licensed under the [<u>MIT License</u>](./LICENSE).
+
+## :star2: Citation
+The paper is waiting to be released!
+
+<!-- # Check Infos
+## Pipeline
+1. Build corpus
+2. Data generation
+3. RAG inference
+4. Result evaluation
+
+## Code
+1. remove "baichuan"
+2. remove useless annotation -->
 
-## Reproducibility
-To reproduce our results, here is the commands you can run:
 
 """
 
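The vllm launch block in the quick-start above is committed empty. As a rough sketch only: serving the two evaluator checkpoints with vLLM's OpenAI-compatible server could look like the commands below, where the ports and GPU assignments are assumptions rather than values taken from the repository.
```
# Hypothetical launch commands: the model IDs are the two evaluator checkpoints
# linked above; ports and CUDA_VISIBLE_DEVICES values are illustrative assumptions.
CUDA_VISIBLE_DEVICES=0 python -m vllm.entrypoints.openai.api_server \
    --model ShootingWong/OmniEval-ModelEvaluator --port 8000 &
CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server \
    --model ShootingWong/OmniEval-HallucinationEvaluator --port 8001 &
```
The endpoints these servers expose would then be passed to `evaluator/judgement/judger.sh` through the parameters the quick-start says to change inside the bash file.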
src/display/formatting.py CHANGED
@@ -5,6 +5,8 @@ def model_hyperlink(link, model_name):
 def make_clickable_model(model_name, model_link=None):
     if model_link:
         return model_hyperlink(model_link, model_name)
+    if model_name == "CLOSE":
+        return model_name
     link = f"https://huggingface.co/{model_name}"
     return model_hyperlink(link, model_name)
 
src/display/utils.py CHANGED
@@ -17,17 +17,21 @@ class ColumnContent:
     displayed_by_default: bool
     hidden: bool = False
     never_hidden: bool = False
+    never_displayed: bool = False
 
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("Type Symbol", "str", True, never_hidden=True)])
 # auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-auto_eval_column_dict.append(["retrieval_model", ColumnContent, ColumnContent("Retrieval Model", "markdown", True, never_hidden=True)])
-auto_eval_column_dict.append(["generation_model", ColumnContent, ColumnContent("Generation Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["retrieval_model", ColumnContent, ColumnContent("Retrieval Model Plain", "markdown", False, never_displayed=True)])
+auto_eval_column_dict.append(["generative_model", ColumnContent, ColumnContent("Generative Model Plain", "markdown", False, never_displayed=True)])
+auto_eval_column_dict.append(["retrieval_model_link", ColumnContent, ColumnContent("Retrieval Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["generative_model_link", ColumnContent, ColumnContent("Generative Model", "markdown", True, never_hidden=True)])
 
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["gen_average", ColumnContent, ColumnContent("Gen Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["ret_average", ColumnContent, ColumnContent("Ret Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 
src/leaderboard/read_evals.py CHANGED
@@ -18,12 +18,12 @@ class EvalResult:
     """
     eval_name: str # org_model_precision (uid)
     org: str
-    generation_model: str
+    generative_model: str
     retrieval_model: str
     # revision: str # commit hash, "" if main
     results: dict
-    generation_model_link: str = "" # link to the model on the hub
-    generation_model_args: dict = None
+    generative_model_link: str = "" # link to the model on the hub
+    generative_model_args: dict = None
     retrieval_model_link: str = "" # link to the model on the hub
     retrieval_model_args: dict = None
     precision: Precision = Precision.Unknown
@@ -48,7 +48,7 @@ class EvalResult:
 
         # Get model and org
         eval_name= config.get("eval_name", "")
-        generation_model = config.get("generation_model", "")
+        generative_model = config.get("generative_model", "")
         retrieval_model = config.get("retrieval_model", "")
         org= config.get("org", "")
         # org_and_model = org_and_model.split("/", 1)
@@ -77,17 +77,21 @@ class EvalResult:
             task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k and task.col_name != "hallucination"])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
+            results[task.metric] = data["results"][task.benchmark].get(task.metric, None)
+            if results[task.metric] is not None:
+                results[task.metric] = results[task.metric] * 100.0
 
-        generation_model_args = config.get("generation_model_args", None)
+
+        generative_model_args = config.get("generative_model_args", None)
         retrieval_model_args = config.get("retrieval_model_args", None)
         open_source= True
-        if not generation_model_args or not generation_model_args.get("open_source", False):
+        if not generative_model_args or not generative_model_args.get("open_source", False):
             open_source = False
         if not retrieval_model_args or not retrieval_model_args.get("open_source", False):
             open_source = False
@@ -96,10 +100,10 @@ class EvalResult:
             eval_name=eval_name,
             # full_model=full_model,
             org=org,
-            generation_model=generation_model,
+            generative_model=generative_model,
             retrieval_model=retrieval_model,
             results=results,
-            generation_model_args=generation_model_args,
+            generative_model_args=generative_model_args,
             retrieval_model_args=retrieval_model_args,
             model_type=ModelType.OpenSource if open_source else ModelType.ClosedSource,
             # precision=precision,
@@ -126,7 +130,6 @@ class EvalResult:
 
     def to_dict(self):
        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             # AutoEvalColumn.precision.name: self.precision.value.name,
@@ -134,20 +137,23 @@ class EvalResult:
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
             # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             # AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.generation_model.name: make_clickable_model(self.generation_model, self.generation_model_link),
-            AutoEvalColumn.retrieval_model.name: make_clickable_model(self.retrieval_model, self.retrieval_model_link),
+            AutoEvalColumn.generative_model.name: self.generative_model,
+            AutoEvalColumn.retrieval_model.name: self.retrieval_model,
+            AutoEvalColumn.generative_model_link.name: make_clickable_model(self.generative_model, self.generative_model_link),
+            AutoEvalColumn.retrieval_model_link.name: make_clickable_model(self.retrieval_model, self.retrieval_model_link),
             # AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
+            AutoEvalColumn.ret_average.name: self.results["retrieval"],
+            AutoEvalColumn.gen_average.name: self.results["generation"],
             # AutoEvalColumn.license.name: self.license,
             # AutoEvalColumn.likes.name: self.likes,
-            # AutoEvalColumn.generation_model_params.name: self.num_params,
+            # AutoEvalColumn.generative_model_params.name: self.num_params,
             # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-            "Gen#Params (B)": self.generation_model_args.get("num_params", "Unknown"),
+            "Gen#Params (B)": self.generative_model_args.get("num_params", "Unknown"),
             "Ret#Params (B)": self.retrieval_model_args.get("num_params", "Unknown"),
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            data_dict[task.value.col_name] = self.results[task.value.metric]
 
         return data_dict
 
src/populate.py CHANGED
@@ -15,8 +15,10 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     print(all_data_json)
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df.sort_values(by=[AutoEvalColumn.gen_average.name], ascending=False)
     df = df[cols].round(decimals=2)
+    # assert column 'Retrieval Model Plain' in df.columns
+    # assert AutoEvalColumn.retrieval_model.name in df.columns, f"Column {df.columns} does not contain 'Retrieval Model Plain'"
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]