yzabc007 committed on
Commit
4575ae2
·
1 Parent(s): c400723

Update space

Browse files
Files changed (2) hide show
  1. src/about.py +4 -5
  2. src/display/utils.py +20 -20
src/about.py CHANGED
@@ -57,11 +57,10 @@ TITLE = """<h1 align="center" id="space-title">Decentralized Arena</h1>"""
57
 
58
  # What does your leaderboard evaluate?
59
  INTRODUCTION_TEXT = """
60
- # Introduction
61
- TL;DR: We release Decentralized Arena that automates and scales “Chatbot Arena” for LLM evaluation across various
62
- fine-grained dimensions (e.g., math algebra, geometry, probability; logical reasoning, social reasoning,
63
- biology, chemistry, …). The evaluation is decentralized and democratic, with all LLMs participating
64
- in evaluating others. It achieves a 97\% correlation with Chatbot Arena's overall rankings, while being fully transparent and reproducible.
65
  """
66
 
67
  # Which evaluations are you running? How can people reproduce what you have?
 
57
 
58
  # What does your leaderboard evaluate?
59
  INTRODUCTION_TEXT = """
60
+ Decentralized Arena automates and scales "Chatbot Arena" for LLM evaluation across various fine-grained dimensions
61
+ (e.g., math algebra, geometry, probability; logical reasoning, social reasoning, biology, chemistry, …).
62
+ The evaluation is decentralized and democratic, with all LLMs participating in evaluating others.
63
+ It achieves a 95\% correlation with Chatbot Arena's overall rankings, while being fully transparent and reproducible.
 
64
  """
65
 
66
  # Which evaluations are you running? How can people reproduce what you have?
src/display/utils.py CHANGED
@@ -64,26 +64,26 @@ auto_eval_column_dict.append(["score_sd", ColumnContent, field(default_factory=l
64
  auto_eval_column_dict.append(["rank", ColumnContent, field(default_factory=lambda: ColumnContent("Rank", "number", True))])
65
 
66
  # fine-grained dimensions
67
- auto_eval_column_dict.append(["score_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Overall", "number", True))])
68
- auto_eval_column_dict.append(["score_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Math (Algebra)", "number", True))])
69
- auto_eval_column_dict.append(["score_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Math (Geometry)", "number", True))])
70
- auto_eval_column_dict.append(["score_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Math (Probability)", "number", True))])
71
- auto_eval_column_dict.append(["score_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Logical Reasoning", "number", True))])
72
- auto_eval_column_dict.append(["score_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Social Reasoning", "number", True))])
73
-
74
- auto_eval_column_dict.append(["sd_overall", ColumnContent, field(default_factory=lambda: ColumnContent("SD Overall", "number", True))])
75
- auto_eval_column_dict.append(["sd_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("SD Math (Algebra)", "number", True))])
76
- auto_eval_column_dict.append(["sd_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("SD Math (Geometry)", "number", True))])
77
- auto_eval_column_dict.append(["sd_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("SD Math (Probability)", "number", True))])
78
- auto_eval_column_dict.append(["sd_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("SD Logical Reasoning", "number", True))])
79
- auto_eval_column_dict.append(["sd_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("SD Social Reasoning", "number", True))])
80
-
81
- auto_eval_column_dict.append(["rank_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Overall", "number", True))])
82
- auto_eval_column_dict.append(["rank_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Math (Algebra)", "number", True))])
83
- auto_eval_column_dict.append(["rank_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Math (Geometry)", "number", True))])
84
- auto_eval_column_dict.append(["rank_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Math (Probability)", "number", True))])
85
- auto_eval_column_dict.append(["rank_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Logical Reasoning", "number", True))])
86
- auto_eval_column_dict.append(["rank_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Social Reasoning", "number", True))])
87
 
88
 
89
  for task in Tasks:
 
64
  auto_eval_column_dict.append(["rank", ColumnContent, field(default_factory=lambda: ColumnContent("Rank", "number", True))])
65
 
66
  # fine-grained dimensions
67
+ auto_eval_column_dict.append(["score_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Overall)", "number", True))])
68
+ auto_eval_column_dict.append(["score_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Algebra)", "number", True))])
69
+ auto_eval_column_dict.append(["score_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Geometry)", "number", True))])
70
+ auto_eval_column_dict.append(["score_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Probability)", "number", True))])
71
+ auto_eval_column_dict.append(["score_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Logical Reasoning)", "number", True))])
72
+ auto_eval_column_dict.append(["score_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Social Reasoning)", "number", True))])
73
+
74
+ auto_eval_column_dict.append(["sd_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev(Overall)", "number", True))])
75
+ auto_eval_column_dict.append(["sd_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Algebra)", "number", True))])
76
+ auto_eval_column_dict.append(["sd_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Geometry)", "number", True))])
77
+ auto_eval_column_dict.append(["sd_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Probability)", "number", True))])
78
+ auto_eval_column_dict.append(["sd_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Logical Reasoning)", "number", True))])
79
+ auto_eval_column_dict.append(["sd_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Social Reasoning)", "number", True))])
80
+
81
+ auto_eval_column_dict.append(["rank_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Overall)", "number", True))])
82
+ auto_eval_column_dict.append(["rank_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Algebra)", "number", True))])
83
+ auto_eval_column_dict.append(["rank_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Geometry)", "number", True))])
84
+ auto_eval_column_dict.append(["rank_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Probability)", "number", True))])
85
+ auto_eval_column_dict.append(["rank_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Logical Reasoning)", "number", True))])
86
+ auto_eval_column_dict.append(["rank_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Social Reasoning)", "number", True))])
87
 
88
 
89
  for task in Tasks: