benediktstroebl committed on
Commit
56a86ce
·
1 Parent(s): 3a9a3a3

big update with dynamic pricing, agent metadata, about page on top, and new benchmarks

Browse files
Files changed (8) hide show
  1. .gitignore +2 -0
  2. agents_metadata.yaml +272 -0
  3. app.py +0 -0
  4. config.py +1 -0
  5. css.css +4 -0
  6. utils/db.py +395 -153
  7. utils/viz.py +1 -1
  8. verified_agents.yaml +0 -129
.gitignore CHANGED
@@ -3,3 +3,5 @@
3
  evals_upload/*
4
  evals_live/*
5
  evals_processed/*
 
 
 
3
  evals_upload/*
4
  evals_live/*
5
  evals_processed/*
6
+ *.db
7
+ .env
agents_metadata.yaml ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file contains metadata about agents for different benchmarks.
2
+ # Format:
3
+ # benchmark_name:
4
+ # - agent_name: "Name of the agent"
5
+ # verification_date: YYYY-MM-DD # Optional verification date
6
+ # url: "https://..." # Optional link to agent code/paper
7
+
8
+ gaia:
9
+ - agent_name: "Inspect ReAct Agent (gpt-4o-mini-2024-07-18)"
10
+ verification_date: 2024-11-30
11
+ url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
12
+ - agent_name: "Inspect ReAct Agent (gpt-4o-2024-11-20)"
13
+ verification_date: 2024-11-30
14
+ url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
15
+ - agent_name: "Inspect ReAct Agent (claude-3-5-sonnet-20241022)"
16
+ verification_date: 2024-11-30
17
+ url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
18
+ - agent_name: "Inspect ReAct Agent (Meta-Llama-3.1-405B-Instruct-Turbo)"
19
+ verification_date: 2024-11-30
20
+ url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
21
+ - agent_name: "Inspect ReAct Agent (o1-mini-2024-09-12)"
22
+ verification_date: 2024-11-30
23
+ url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
24
+
25
+ cybench:
26
+ - agent_name: "Inspect ReAct Agent (gpt-4o-mini-2024-07-18)"
27
+ verification_date: 2024-11-30
28
+ url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/cybench"
29
+ - agent_name: "Inspect ReAct Agent (gpt-4o-2024-11-20)"
30
+ verification_date: 2024-11-30
31
+ url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/cybench"
32
+ - agent_name: "Inspect ReAct Agent (claude-3-5-sonnet-20241022)"
33
+ verification_date: 2024-11-30
34
+ url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/cybench"
35
+ - agent_name: "Inspect ReAct Agent (o1-mini-2024-09-12)"
36
+ verification_date: 2024-11-30
37
+ url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/cybench"
38
+ - agent_name: "Inspect ReAct Agent (Meta-Llama-3.1-405B-Instruct-Turbo)"
39
+ verification_date: 2024-11-30
40
+ url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/cybench"
41
+
42
+ swebench_verified:
43
+ - agent_name: "Moatless (gpt-4o-2024-08-06)"
44
+ verification_date: 2024-10-30
45
+ url: "https://github.com/aorwall/moatless-tools"
46
+ - agent_name: "Agentless (o1-mini-2024-09-12)"
47
+ verification_date: 2024-10-30
48
+ url: "https://github.com/OpenAutoCoder/Agentless"
49
+ - agent_name: "Moatless (claude-3-5-sonnet-20241022)"
50
+ verification_date: 2024-11-30
51
+ url: "https://github.com/aorwall/moatless-tools"
52
+ - agent_name: "Agentless (gpt-4o-mini-2024-07-18)"
53
+ verification_date: 2024-11-30
54
+ url: "https://github.com/OpenAutoCoder/Agentless"
55
+
56
+ swebench_verified_mini:
57
+ - agent_name: "Agentless (gpt-4o-mini-2024-07-18)"
58
+ verification_date: 2024-08-17
59
+ url: "https://github.com/OpenAutoCoder/Agentless"
60
+ - agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1)"
61
+ verification_date: 2024-08-19
62
+ url: "https://github.com/princeton-nlp/SWE-agent"
63
+ - agent_name: "Moatless (gpt-4o-mini-2024-07-18)"
64
+ verification_date: 2024-10-30
65
+ url: "https://github.com/aorwall/moatless-tools"
66
+ - agent_name: "Moatless (gpt-4o-2024-08-06)"
67
+ verification_date: 2024-10-30
68
+ url: "https://github.com/aorwall/moatless-tools"
69
+ - agent_name: "Moatless (claude-3-5-sonnet-20241022)"
70
+ verification_date: 2024-10-30
71
+ url: "https://github.com/aorwall/moatless-tools"
72
+ - agent_name: "Agentless (o1-mini-2024-09-12)"
73
+ verification_date: 2024-10-30
74
+ url: "https://github.com/OpenAutoCoder/Agentless"
75
+
76
+ corebench_easy:
77
+ - agent_name: "AutoGPT (GPT-4o)"
78
+ verification_date: 2024-09-28
79
+ url: "https://github.com/Significant-Gravitas/AutoGPT"
80
+ - agent_name: "AutoGPT (GPT-4o-mini)"
81
+ verification_date: 2024-09-28
82
+ url: "https://github.com/Significant-Gravitas/AutoGPT"
83
+ - agent_name: "CORE-Agent (GPT-4o)"
84
+ verification_date: 2024-09-28
85
+ url: "https://github.com/siegelz/core-bench"
86
+ - agent_name: "CORE-Agent (GPT-4o-mini)"
87
+ verification_date: 2024-09-28
88
+ url: "https://github.com/siegelz/core-bench"
89
+
90
+ corebench_medium:
91
+ - agent_name: "AutoGPT (GPT-4o)"
92
+ verification_date: 2024-09-28
93
+ url: "https://github.com/Significant-Gravitas/AutoGPT"
94
+ - agent_name: "AutoGPT (GPT-4o-mini)"
95
+ verification_date: 2024-09-28
96
+ url: "https://github.com/Significant-Gravitas/AutoGPT"
97
+ - agent_name: "CORE-Agent (GPT-4o)"
98
+ verification_date: 2024-09-28
99
+ url: "https://github.com/siegelz/core-bench"
100
+ - agent_name: "CORE-Agent (GPT-4o-mini)"
101
+ verification_date: 2024-09-28
102
+ url: "https://github.com/siegelz/core-bench"
103
+
104
+ corebench_hard:
105
+ - agent_name: "AutoGPT (GPT-4o)"
106
+ verification_date: 2024-09-28
107
+ url: "https://github.com/Significant-Gravitas/AutoGPT"
108
+ - agent_name: "AutoGPT (GPT-4o-mini)"
109
+ verification_date: 2024-09-28
110
+ url: "https://github.com/Significant-Gravitas/AutoGPT"
111
+ - agent_name: "CORE-Agent (GPT-4o)"
112
+ verification_date: 2024-09-28
113
+ url: "https://github.com/siegelz/core-bench"
114
+ - agent_name: "CORE-Agent (GPT-4o-mini)"
115
+ verification_date: 2024-09-28
116
+ url: "https://github.com/siegelz/core-bench"
117
+
118
+ usaco:
119
+ - agent_name: "USACO Reflexion + Episodic (gpt-4o-mini-2024-07-18)"
120
+ verification_date: 2024-08-20
121
+ url: "https://github.com/princeton-nlp/USACO"
122
+ - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-mini-2024-07-18)"
123
+ verification_date: 2024-08-20
124
+ url: "https://github.com/princeton-nlp/USACO"
125
+ - agent_name: "USACO Reflexion (gpt-4o-mini-2024-07-18)"
126
+ verification_date: 2024-08-20
127
+ url: "https://github.com/princeton-nlp/USACO"
128
+ - agent_name: "USACO Episodic (gpt-4o-mini-2024-07-18)"
129
+ verification_date: 2024-08-12
130
+ url: "https://github.com/princeton-nlp/USACO"
131
+ - agent_name: "USACO Reflexion + Semantic (gpt-4o-mini-2024-07-18)"
132
+ verification_date: 2024-08-20
133
+ url: "https://github.com/princeton-nlp/USACO"
134
+ - agent_name: "USACO Zero-shot (gpt-4o-mini-2024-07-18)"
135
+ verification_date: 2024-08-11
136
+ url: "https://github.com/princeton-nlp/USACO"
137
+ - agent_name: "USACO Semantic (gpt-4o-mini-2024-07-18)"
138
+ verification_date: 2024-08-12
139
+ url: "https://github.com/princeton-nlp/USACO"
140
+ - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-2024-05-13)"
141
+ verification_date: 2024-08-25
142
+ url: "https://github.com/princeton-nlp/USACO"
143
+ - agent_name: "USACO Reflexion + Episodic (gpt-4o-2024-05-13)"
144
+ verification_date: 2024-08-25
145
+ url: "https://github.com/princeton-nlp/USACO"
146
+ - agent_name: "USACO Reflexion + Semantic (gpt-4o-2024-05-13)"
147
+ verification_date: 2024-08-25
148
+ url: "https://github.com/princeton-nlp/USACO"
149
+ - agent_name: "Episodic Retrial (2x) (gpt-4o-2024-05-13)"
150
+ verification_date: 2024-08-25
151
+ url: "https://github.com/princeton-nlp/USACO"
152
+ - agent_name: "Episodic Retrial (3x) (gpt-4o-mini-2024-07-18)"
153
+ verification_date: 2024-08-25
154
+ url: "https://github.com/princeton-nlp/USACO"
155
+ - agent_name: "Episodic Retrial (2x) (gpt-4o-mini-2024-07-18)"
156
+ verification_date: 2024-08-25
157
+ url: "https://github.com/princeton-nlp/USACO"
158
+ - agent_name: "Episodic Retrial (5x) (gpt-4o-mini-2024-07-18)"
159
+ verification_date: 2024-08-25
160
+ url: "https://github.com/princeton-nlp/USACO"
161
+ - agent_name: "Episodic Warming (3 Steps) (gpt-4o-mini-2024-07-18)"
162
+ verification_date: 2024-08-24
163
+ url: "https://github.com/princeton-nlp/USACO"
164
+ - agent_name: "USACO Episodic (gpt-4o-2024-05-13)"
165
+ verification_date: 2024-08-24
166
+ url: "https://github.com/princeton-nlp/USACO"
167
+ - agent_name: "USACO Semantic (gpt-4o-2024-05-13)"
168
+ verification_date: 2024-08-24
169
+ url: "https://github.com/princeton-nlp/USACO"
170
+ - agent_name: "Zero-shot Retrial (2x) (gpt-4o-mini-2024-07-18)"
171
+ verification_date: 2024-08-24
172
+ url: "https://github.com/princeton-nlp/USACO"
173
+ - agent_name: "Zero-shot Retrial (3x) (gpt-4o-mini-2024-07-18)"
174
+ verification_date: 2024-08-24
175
+ url: "https://github.com/princeton-nlp/USACO"
176
+ - agent_name: "Zero-shot Retrial (5x) (gpt-4o-mini-2024-07-18)"
177
+ verification_date: 2024-08-24
178
+ url: "https://github.com/princeton-nlp/USACO"
179
+ - agent_name: "USACO Zero-shot (gpt-4o-2024-05-13)"
180
+ verification_date: 2024-08-24
181
+ url: "https://github.com/princeton-nlp/USACO"
182
+ - agent_name: "USACO Zero-Shot (claude-3-5-sonnet-20241022)"
183
+ verification_date: 2024-11-30
184
+ url: "https://github.com/princeton-nlp/USACO"
185
+
186
+ appworld_test_normal:
187
+ - agent_name: "ReAct (gpt-4o-2024-05-13)"
188
+ verification_date: 2024-12-03
189
+ url: "https://appworld.dev/"
190
+ - agent_name: "PlanExec (gpt-4o-2024-05-13)"
191
+ verification_date: 2024-12-03
192
+ url: "https://appworld.dev/"
193
+ - agent_name: "FullCodeRefl (gpt-4o-2024-05-13)"
194
+ verification_date: 2024-12-03
195
+ url: "https://appworld.dev/"
196
+ - agent_name: "PlanExec (gpt-4-turbo-2024-04-09)"
197
+ verification_date: 2024-12-03
198
+ url: "https://appworld.dev/"
199
+ - agent_name: "IPFunCall (gpt-4o-2024-05-13)"
200
+ verification_date: 2024-12-03
201
+ url: "https://appworld.dev/"
202
+ - agent_name: "IPFunCall (gpt-4-turbo-2024-04-09)"
203
+ verification_date: 2024-12-03
204
+ url: "https://appworld.dev/"
205
+ - agent_name: "ReAct (gpt-4-turbo-2024-04-09)"
206
+ verification_date: 2024-12-03
207
+ url: "https://appworld.dev/"
208
+ - agent_name: "FullCodeRefl (gpt-4-turbo-2024-04-09)"
209
+ verification_date: 2024-12-03
210
+ url: "https://appworld.dev/"
211
+ - agent_name: "FullCodeRefl (meta-llama/Llama-3-70b-chat-hf)"
212
+ verification_date: 2024-12-03
213
+ url: "https://appworld.dev/"
214
+ - agent_name: "ReAct (meta-llama/Llama-3-70b-chat-hf)"
215
+ verification_date: 2024-12-03
216
+ url: "https://appworld.dev/"
217
+ - agent_name: "FullCodeRefl (deepseek-ai/deepseek-coder-33b-instruct)"
218
+ verification_date: 2024-12-03
219
+ url: "https://appworld.dev/"
220
+ - agent_name: "PlanExec (meta-llama/Llama-3-70b-chat-hf)"
221
+ verification_date: 2024-12-03
222
+ url: "https://appworld.dev/"
223
+ - agent_name: "ReAct (deepseek-ai/deepseek-coder-33b-instruct)"
224
+ verification_date: 2024-12-03
225
+ url: "https://appworld.dev/"
226
+ - agent_name: "PlanExec (deepseek-ai/deepseek-coder-33b-instruct)"
227
+ verification_date: 2024-12-03
228
+ url: "https://appworld.dev/"
229
+
230
+ appworld_test_challenge:
231
+ - agent_name: "ReAct (gpt-4o-2024-05-13)"
232
+ verification_date: 2024-12-03
233
+ url: "https://appworld.dev/"
234
+ - agent_name: "PlanExec (gpt-4o-2024-05-13)"
235
+ verification_date: 2024-12-03
236
+ url: "https://appworld.dev/"
237
+ - agent_name: "FullCodeRefl (gpt-4o-2024-05-13)"
238
+ verification_date: 2024-12-03
239
+ url: "https://appworld.dev/"
240
+ - agent_name: "PlanExec (gpt-4-turbo-2024-04-09)"
241
+ verification_date: 2024-12-03
242
+ url: "https://appworld.dev/"
243
+ - agent_name: "IPFunCall (gpt-4o-2024-05-13)"
244
+ verification_date: 2024-12-03
245
+ url: "https://appworld.dev/"
246
+ - agent_name: "IPFunCall (gpt-4-turbo-2024-04-09)"
247
+ verification_date: 2024-12-03
248
+ url: "https://appworld.dev/"
249
+ - agent_name: "ReAct (gpt-4-turbo-2024-04-09)"
250
+ verification_date: 2024-12-03
251
+ url: "https://appworld.dev/"
252
+ - agent_name: "FullCodeRefl (gpt-4-turbo-2024-04-09)"
253
+ verification_date: 2024-12-03
254
+ url: "https://appworld.dev/"
255
+ - agent_name: "FullCodeRefl (meta-llama/Llama-3-70b-chat-hf)"
256
+ verification_date: 2024-12-03
257
+ url: "https://appworld.dev/"
258
+ - agent_name: "ReAct (meta-llama/Llama-3-70b-chat-hf)"
259
+ verification_date: 2024-12-03
260
+ url: "https://appworld.dev/"
261
+ - agent_name: "FullCodeRefl (deepseek-ai/deepseek-coder-33b-instruct)"
262
+ verification_date: 2024-12-03
263
+ url: "https://appworld.dev/"
264
+ - agent_name: "PlanExec (meta-llama/Llama-3-70b-chat-hf)"
265
+ verification_date: 2024-12-03
266
+ url: "https://appworld.dev/"
267
+ - agent_name: "ReAct (deepseek-ai/deepseek-coder-33b-instruct)"
268
+ verification_date: 2024-12-03
269
+ url: "https://appworld.dev/"
270
+ - agent_name: "PlanExec (deepseek-ai/deepseek-coder-33b-instruct)"
271
+ verification_date: 2024-12-03
272
+ url: "https://appworld.dev/"
app.py CHANGED
The diff for this file is too large to render. See raw diff
 
config.py CHANGED
@@ -69,6 +69,7 @@ APPWORLD_ON_LOAD_COLUMNS = [
69
  "Accuracy",
70
  "Total Cost",
71
  "Runs",
 
72
  ]
73
  APPWORLD_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
74
  APPWORLD_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score', "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]
 
69
  "Accuracy",
70
  "Total Cost",
71
  "Runs",
72
+ "Scenario Goal Completion"
73
  ]
74
  APPWORLD_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
75
  APPWORLD_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score', "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]
css.css CHANGED
@@ -25,6 +25,10 @@ cursor: pointer;
25
  transition: background-color 0.3s;
26
  }
27
 
 
 
 
 
28
  .tab-nav button:hover,
29
  .tab-nav button.selected {
30
  background-color: var(--primary-color);
 
25
  transition: background-color 0.3s;
26
  }
27
 
28
+ a {
29
+ color: #000;
30
+ }
31
+
32
  .tab-nav button:hover,
33
  .tab-nav button.selected {
34
  background-color: var(--primary-color);
utils/db.py CHANGED
@@ -10,18 +10,162 @@ from scipy import stats
10
  import yaml
11
  import numpy as np
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  class TracePreprocessor:
14
- def __init__(self, db_path='preprocessed_traces.db'):
15
- self.db_path = db_path
 
16
  self.local = threading.local()
 
17
 
18
- def get_conn(self):
19
- if not hasattr(self.local, 'conn'):
20
- self.local.conn = sqlite3.connect(self.db_path)
21
- return self.local.conn
 
 
 
 
 
 
 
 
 
 
22
 
23
- def create_tables(self):
24
- with self.get_conn() as conn:
 
 
 
 
 
 
 
 
 
 
25
  conn.execute('''
26
  CREATE TABLE IF NOT EXISTS preprocessed_traces (
27
  benchmark_name TEXT,
@@ -43,37 +187,23 @@ class TracePreprocessor:
43
  )
44
  ''')
45
  conn.execute('''
46
- CREATE TABLE IF NOT EXISTS parsed_results (
47
  benchmark_name TEXT,
48
  agent_name TEXT,
49
- date TEXT,
50
  run_id TEXT,
51
- successful_tasks TEXT,
52
- failed_tasks TEXT,
53
- total_cost REAL,
54
- accuracy REAL,
55
- precision REAL,
56
- recall REAL,
57
- f1_score REAL,
58
- auc REAL,
59
- overall_score REAL,
60
- vectorization_score REAL,
61
- fathomnet_score REAL,
62
- feedback_score REAL,
63
- house_price_score REAL,
64
- spaceship_titanic_score REAL,
65
- amp_parkinsons_disease_progression_prediction_score REAL,
66
- cifar10_score REAL,
67
- imdb_score REAL,
68
- level_1_accuracy REAL,
69
- level_2_accuracy REAL,
70
- level_3_accuracy REAL,
71
- PRIMARY KEY (benchmark_name, agent_name, run_id)
72
  )
73
  ''')
74
 
75
  def preprocess_traces(self, processed_dir="evals_live"):
76
- self.create_tables()
77
  processed_dir = Path(processed_dir)
78
  for file in processed_dir.glob('*.json'):
79
  with open(file, 'r') as f:
@@ -85,9 +215,12 @@ class TracePreprocessor:
85
  date = data['config']['date']
86
  config = data['config']
87
 
 
 
 
88
  try:
89
  raw_logging_results = pickle.dumps(data['raw_logging_results'])
90
- with self.get_conn() as conn:
91
  conn.execute('''
92
  INSERT OR REPLACE INTO preprocessed_traces
93
  (benchmark_name, agent_name, date, run_id, raw_logging_results)
@@ -98,63 +231,76 @@ class TracePreprocessor:
98
 
99
  try:
100
  failure_report = pickle.dumps(data['failure_report'])
101
- with self.get_conn() as conn:
102
  conn.execute('''
103
  INSERT INTO failure_reports
104
  (benchmark_name, agent_name, date, run_id, failure_report)
105
- VALUES (?, ?, ?, ? ,?)
106
  ''', (benchmark_name, agent_name, date, config['run_id'], failure_report))
107
  except Exception as e:
108
  print(f"Error preprocessing failure_report in {file}: {e}")
109
 
110
  try:
111
- config = data['config']
112
  results = data['results']
113
- with self.get_conn() as conn:
114
- conn.execute('''
115
- INSERT INTO parsed_results
116
- (benchmark_name, agent_name, date, run_id, successful_tasks, failed_tasks, total_cost, accuracy, precision, recall, f1_score, auc, overall_score, vectorization_score, fathomnet_score, feedback_score, house_price_score, spaceship_titanic_score, amp_parkinsons_disease_progression_prediction_score, cifar10_score, imdb_score, level_1_accuracy, level_2_accuracy, level_3_accuracy)
117
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
118
- ''', (
 
119
  benchmark_name,
120
  agent_name,
121
  config['date'],
122
- config['run_id'],
123
- str(results.get('successful_tasks')),
124
- str(results.get('failed_tasks')),
125
- results.get('total_cost'),
126
- results.get('accuracy'),
127
- results.get('precision'),
128
- results.get('recall'),
129
- results.get('f1_score'),
130
- results.get('auc'),
131
- results.get('overall_score'),
132
- results.get('vectorization_score'),
133
- results.get('fathomnet_score'),
134
- results.get('feedback_score'),
135
- results.get('house-price_score'),
136
- results.get('spaceship-titanic_score'),
137
- results.get('amp-parkinsons-disease-progression-prediction_score'),
138
- results.get('cifar10_score'),
139
- results.get('imdb_score'),
140
- results.get('level_1_accuracy'),
141
- results.get('level_2_accuracy'),
142
- results.get('level_3_accuracy')
143
- ))
144
  except Exception as e:
145
  print(f"Error preprocessing parsed results in {file}: {e}")
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  @lru_cache(maxsize=100)
148
  def get_analyzed_traces(self, agent_name, benchmark_name):
149
- with self.get_conn() as conn:
150
  query = '''
151
  SELECT agent_name, raw_logging_results, date FROM preprocessed_traces
152
  WHERE benchmark_name = ? AND agent_name = ?
153
  '''
154
  df = pd.read_sql_query(query, conn, params=(benchmark_name, agent_name))
155
 
156
-
157
- # check for each row if raw_logging_results is not None with pickle.loads because it is stored as a byte string
158
  df = df[df['raw_logging_results'].apply(lambda x: pickle.loads(x) is not None and x != 'None')]
159
 
160
  if len(df) == 0:
@@ -163,32 +309,26 @@ class TracePreprocessor:
163
  # select latest run
164
  df = df.sort_values('date', ascending=False).groupby('agent_name').first().reset_index()
165
 
166
-
167
  return pickle.loads(df['raw_logging_results'][0])
168
 
169
-
170
  @lru_cache(maxsize=100)
171
  def get_failure_report(self, agent_name, benchmark_name):
172
- with self.get_conn() as conn:
173
  query = '''
174
  SELECT agent_name, date, failure_report FROM failure_reports
175
  WHERE benchmark_name = ? AND agent_name = ?
176
  '''
177
  df = pd.read_sql_query(query, conn, params=(benchmark_name, agent_name))
178
 
179
- # Select only rows for which failure report is not None and None is a string
180
  df = df[df['failure_report'].apply(lambda x: pickle.loads(x) is not None and x != 'None')]
181
 
182
  if len(df) == 0:
183
  return None
184
-
185
 
186
- # if there is multiple failure reports, take the last one
187
  df = df.sort_values('date', ascending=False).groupby('agent_name').first().reset_index()
188
 
189
- # if there is a failure report, return the first one
190
  return pickle.loads(df['failure_report'][0])
191
-
192
  def _calculate_ci(self, data, confidence=0.95, type='minmax'):
193
  data = data[np.isfinite(data)]
194
 
@@ -209,7 +349,7 @@ class TracePreprocessor:
209
  return mean, ci[0], ci[1]
210
 
211
  def get_parsed_results(self, benchmark_name, aggregate=True):
212
- with self.get_conn() as conn:
213
  query = '''
214
  SELECT * FROM parsed_results
215
  WHERE benchmark_name = ?
@@ -217,21 +357,44 @@ class TracePreprocessor:
217
  '''
218
  df = pd.read_sql_query(query, conn, params=(benchmark_name,))
219
 
220
- # Load verified agents
221
- verified_agents = self.load_verified_agents()
 
 
 
 
 
 
 
 
222
 
223
  # Add 'Verified' column
 
224
  df['Verified'] = df.apply(lambda row: '✓' if (benchmark_name, row['agent_name']) in verified_agents else '', axis=1)
225
 
226
-
 
 
 
227
 
228
  # Add column for how many times an agent_name appears in the DataFrame
229
  df['Runs'] = df.groupby('agent_name')['agent_name'].transform('count')
230
 
231
  # Compute the 95% confidence interval for accuracy and cost for agents that have been run more than once
232
- df['acc_ci'] = None
233
  df['cost_ci'] = None
234
 
 
 
 
 
 
 
 
 
 
 
 
235
  for agent_name in df['agent_name'].unique():
236
  agent_df = df[df['agent_name'] == agent_name]
237
 
@@ -239,86 +402,28 @@ class TracePreprocessor:
239
  accuracy_mean, accuracy_lower, accuracy_upper = self._calculate_ci(agent_df['accuracy'], type='minmax')
240
  cost_mean, cost_lower, cost_upper = self._calculate_ci(agent_df['total_cost'], type='minmax')
241
 
242
- # format the confidence interval with +/- sign
243
- # accuracy_ci = f"± {abs(accuracy_mean - accuracy_lower):.3f}"
244
- # cost_ci = f"± {abs(cost_mean - cost_lower):.3f}"
245
-
246
- accuracy_ci = f"-{abs(accuracy_mean - accuracy_lower):.3f}/+{abs(accuracy_mean - accuracy_upper):.3f}"
247
- cost_ci = f"-{abs(cost_mean - cost_lower):.3f}/+{abs(cost_mean - cost_upper):.3f}"
248
 
249
- df.loc[df['agent_name'] == agent_name, 'acc_ci'] = accuracy_ci
250
  df.loc[df['agent_name'] == agent_name, 'cost_ci'] = cost_ci
251
 
252
-
253
  df = df.drop(columns=['successful_tasks', 'failed_tasks', 'run_id'], axis=1)
254
 
255
  if aggregate:
256
- # For agents that have been run more than once, compute the average accuracy and cost and use that as the value in the DataFrame
257
- df = df.groupby('agent_name').agg({
258
- 'date': 'first',
259
- 'total_cost': 'mean',
260
- 'accuracy': 'mean',
261
- 'precision': 'mean',
262
- 'recall': 'mean',
263
- 'f1_score': 'mean',
264
- 'auc': 'mean',
265
- 'overall_score': 'mean',
266
- 'vectorization_score': 'mean',
267
- 'fathomnet_score': 'mean',
268
- 'feedback_score': 'mean',
269
- 'house_price_score': 'mean',
270
- 'spaceship_titanic_score': 'mean',
271
- 'amp_parkinsons_disease_progression_prediction_score': 'mean',
272
- 'cifar10_score': 'mean',
273
- 'imdb_score': 'mean',
274
- 'level_1_accuracy': 'mean',
275
- 'level_2_accuracy': 'mean',
276
- 'level_3_accuracy': 'mean',
277
- 'Verified': 'first',
278
- 'Runs': 'first',
279
- 'acc_ci': 'first',
280
- 'cost_ci': 'first'
281
- }).reset_index()
282
-
283
- # Round float columns to 3 decimal places
284
- float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc', 'overall_score', 'vectorization_score', 'fathomnet_score', 'feedback_score', 'house-price_score', 'spaceship-titanic_score', 'amp-parkinsons-disease-progression-prediction_score', 'cifar10_score', 'imdb_score', 'level_1_accuracy', 'level_2_accuracy', 'level_3_accuracy']
285
- for column in float_columns:
286
- if column in df.columns:
287
- df[column] = df[column].round(3)
288
-
289
- # sort by accuracy
290
- df = df.sort_values('accuracy', ascending=False)
291
 
292
- # Rename columns
293
- df = df.rename(columns={
294
- 'agent_name': 'Agent Name',
295
- 'date': 'Date',
296
- 'total_cost': 'Total Cost',
297
- 'accuracy': 'Accuracy',
298
- 'precision': 'Precision',
299
- 'recall': 'Recall',
300
- 'f1_score': 'F1 Score',
301
- 'auc': 'AUC',
302
- 'overall_score': 'Overall Score',
303
- 'vectorization_score': 'Vectorization Score',
304
- 'fathomnet_score': 'Fathomnet Score',
305
- 'feedback_score': 'Feedback Score',
306
- 'house_price_score': 'House Price Score',
307
- 'spaceship_titanic_score': 'Spaceship Titanic Score',
308
- 'amp_parkinsons_disease_progression_prediction_score': 'AMP Parkinsons Disease Progression Prediction Score',
309
- 'cifar10_score': 'CIFAR10 Score',
310
- 'imdb_score': 'IMDB Score',
311
- 'level_1_accuracy': 'Level 1 Accuracy',
312
- 'level_2_accuracy': 'Level 2 Accuracy',
313
- 'level_3_accuracy': 'Level 3 Accuracy',
314
- 'acc_ci': 'Accuracy CI',
315
- 'cost_ci': 'Total Cost CI'
316
- })
317
 
318
  return df
319
 
320
  def get_task_success_data(self, benchmark_name):
321
- with self.get_conn() as conn:
322
  query = '''
323
  SELECT agent_name, accuracy, successful_tasks, failed_tasks
324
  FROM parsed_results
@@ -359,17 +464,154 @@ class TracePreprocessor:
359
 
360
  return df
361
 
362
- def load_verified_agents(self, file_path='verified_agents.yaml'):
363
  with open(file_path, 'r') as f:
364
- verified_data = yaml.safe_load(f)
365
 
366
  verified_agents = set()
367
- for benchmark, agents in verified_data.items():
368
  for agent in agents:
369
- verified_agents.add((benchmark, agent['agent_name']))
 
370
 
371
  return verified_agents
372
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  if __name__ == '__main__':
374
  preprocessor = TracePreprocessor()
375
  preprocessor.preprocess_traces()
 
10
  import yaml
11
  import numpy as np
12
 
13
+ # Define column schemas
14
+ PARSED_RESULTS_COLUMNS = {
15
+ 'benchmark_name': 'TEXT',
16
+ 'agent_name': 'TEXT',
17
+ 'date': 'TEXT',
18
+ 'run_id': 'TEXT',
19
+ 'successful_tasks': 'TEXT',
20
+ 'failed_tasks': 'TEXT',
21
+ 'total_cost': 'REAL',
22
+ 'accuracy': 'REAL',
23
+ 'precision': 'REAL',
24
+ 'recall': 'REAL',
25
+ 'f1_score': 'REAL',
26
+ 'auc': 'REAL',
27
+ 'overall_score': 'REAL',
28
+ 'vectorization_score': 'REAL',
29
+ 'fathomnet_score': 'REAL',
30
+ 'feedback_score': 'REAL',
31
+ 'house_price_score': 'REAL',
32
+ 'spaceship_titanic_score': 'REAL',
33
+ 'amp_parkinsons_disease_progression_prediction_score': 'REAL',
34
+ 'cifar10_score': 'REAL',
35
+ 'imdb_score': 'REAL',
36
+ 'level_1_accuracy': 'REAL',
37
+ 'level_2_accuracy': 'REAL',
38
+ 'level_3_accuracy': 'REAL',
39
+ 'task_goal_completion': 'REAL', # New column
40
+ 'scenario_goal_completion': 'REAL', # New column
41
+ 'accuracy_ci': 'TEXT', # Using TEXT since it stores formatted strings like "-0.123/+0.456"
42
+ 'cost_ci': 'TEXT',
43
+ }
44
+
45
+ # Define which columns should be included in aggregation and how
46
+ AGGREGATION_RULES = {
47
+ 'date': 'first',
48
+ 'total_cost': 'mean',
49
+ 'accuracy': 'mean',
50
+ 'precision': 'mean',
51
+ 'recall': 'mean',
52
+ 'f1_score': 'mean',
53
+ 'auc': 'mean',
54
+ 'overall_score': 'mean',
55
+ 'vectorization_score': 'mean',
56
+ 'fathomnet_score': 'mean',
57
+ 'feedback_score': 'mean',
58
+ 'house_price_score': 'mean',
59
+ 'spaceship_titanic_score': 'mean',
60
+ 'amp_parkinsons_disease_progression_prediction_score': 'mean',
61
+ 'cifar10_score': 'mean',
62
+ 'imdb_score': 'mean',
63
+ 'level_1_accuracy': 'mean',
64
+ 'level_2_accuracy': 'mean',
65
+ 'level_3_accuracy': 'mean',
66
+ 'task_goal_completion': 'mean',
67
+ 'scenario_goal_completion': 'mean',
68
+ 'Verified': 'first',
69
+ 'Runs': 'first',
70
+ 'accuracy_ci': 'first',
71
+ 'cost_ci': 'first',
72
+ }
73
+
74
+ # Define column display names
75
+ COLUMN_DISPLAY_NAMES = {
76
+ 'agent_name': 'Agent Name',
77
+ 'date': 'Date',
78
+ 'total_cost': 'Total Cost',
79
+ 'accuracy': 'Accuracy',
80
+ 'precision': 'Precision',
81
+ 'recall': 'Recall',
82
+ 'f1_score': 'F1 Score',
83
+ 'auc': 'AUC',
84
+ 'overall_score': 'Overall Score',
85
+ 'vectorization_score': 'Vectorization Score',
86
+ 'fathomnet_score': 'Fathomnet Score',
87
+ 'feedback_score': 'Feedback Score',
88
+ 'house_price_score': 'House Price Score',
89
+ 'spaceship_titanic_score': 'Spaceship Titanic Score',
90
+ 'amp_parkinsons_disease_progression_prediction_score': 'AMP Parkinsons Disease Progression Prediction Score',
91
+ 'cifar10_score': 'CIFAR10 Score',
92
+ 'imdb_score': 'IMDB Score',
93
+ 'level_1_accuracy': 'Level 1 Accuracy',
94
+ 'level_2_accuracy': 'Level 2 Accuracy',
95
+ 'level_3_accuracy': 'Level 3 Accuracy',
96
+ 'task_goal_completion': 'Task Goal Completion',
97
+ 'scenario_goal_completion': 'Scenario Goal Completion',
98
+ 'accuracy_ci': 'Accuracy CI',
99
+ 'cost_ci': 'Total Cost CI',
100
+ }
101
+
102
+ DEFAULT_PRICING = {
103
+ "text-embedding-3-small": {"prompt_tokens": 0.02, "completion_tokens": 0},
104
+ "text-embedding-3-large": {"prompt_tokens": 0.13, "completion_tokens": 0},
105
+ "gpt-4o-2024-05-13": {"prompt_tokens": 2.5, "completion_tokens": 10},
106
+ "gpt-4o-2024-08-06": {"prompt_tokens": 2.5, "completion_tokens": 10},
107
+ "gpt-3.5-turbo-0125": {"prompt_tokens": 0.5, "completion_tokens": 1.5},
108
+ "gpt-3.5-turbo": {"prompt_tokens": 0.5, "completion_tokens": 1.5},
109
+ "gpt-4-turbo-2024-04-09": {"prompt_tokens": 10, "completion_tokens": 30},
110
+ "gpt-4-turbo": {"prompt_tokens": 10, "completion_tokens": 30},
111
+ "gpt-4o-mini-2024-07-18": {"prompt_tokens": 0.15, "completion_tokens": 0.6},
112
+ "meta-llama/Meta-Llama-3.1-8B-Instruct": {"prompt_tokens": 0.18, "completion_tokens": 0.18},
113
+ "meta-llama/Meta-Llama-3.1-70B-Instruct": {"prompt_tokens": 0.88, "completion_tokens": 0.88},
114
+ "meta-llama/Meta-Llama-3.1-405B-Instruct": {"prompt_tokens": 5, "completion_tokens": 15},
115
+ "gpt-4o": {"prompt_tokens": 2.5, "completion_tokens": 10},
116
+ "o1-mini-2024-09-12": {"prompt_tokens": 3, "completion_tokens": 12},
117
+ "o1-preview-2024-09-12": {"prompt_tokens": 15, "completion_tokens": 60},
118
+ "claude-3-5-sonnet-20240620": {"prompt_tokens": 3, "completion_tokens": 15},
119
+ "claude-3-5-sonnet-20241022": {"prompt_tokens": 3, "completion_tokens": 15},
120
+ "us.anthropic.claude-3-5-sonnet-20240620-v1:0": {"prompt_tokens": 3, "completion_tokens": 15},
121
+ "us.anthropic.claude-3-5-sonnet-20241022-v2:0": {"prompt_tokens": 3, "completion_tokens": 15},
122
+ "openai/gpt-4o-2024-11-20": {"prompt_tokens": 2.5, "completion_tokens": 10},
123
+ "openai/gpt-4o-2024-08-06": {"prompt_tokens": 2.5, "completion_tokens": 10},
124
+ "openai/gpt-4o-mini-2024-07-18": {"prompt_tokens": 0.15, "completion_tokens": 0.6},
125
+ "openai/o1-mini-2024-09-12": {"prompt_tokens": 3, "completion_tokens": 12},
126
+ "openai/o1-preview-2024-09-12": {"prompt_tokens": 15, "completion_tokens": 60},
127
+ "anthropic/claude-3-5-sonnet-20240620": {"prompt_tokens": 3, "completion_tokens": 15},
128
+ "anthropic/claude-3-5-sonnet-20241022": {"prompt_tokens": 3, "completion_tokens": 15},
129
+ "google/gemini-1.5-pro": {"prompt_tokens": 1.25, "completion_tokens": 5},
130
+ "google/gemini-1.5-flash": {"prompt_tokens": 0.075, "completion_tokens": 0.3},
131
+ "together/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo": {"prompt_tokens": 3.5, "completion_tokens": 3.5},
132
+ "together/meta-llama/Meta-Llama-3.1-70B-Instruct": {"prompt_tokens": 0.88, "completion_tokens": 0.88},
133
+ }
134
+
135
  class TracePreprocessor:
136
+ def __init__(self, db_dir='preprocessed_traces'):
137
+ self.db_dir = Path(db_dir)
138
+ self.db_dir.mkdir(exist_ok=True)
139
  self.local = threading.local()
140
+ self.connections = {}
141
 
142
+ def get_conn(self, benchmark_name):
143
+ # Sanitize benchmark name for filename
144
+ safe_name = benchmark_name.replace('/', '_').replace('\\', '_')
145
+ db_path = self.db_dir / f"{safe_name}.db"
146
+
147
+ # Get thread-specific connections dictionary
148
+ if not hasattr(self.local, 'connections'):
149
+ self.local.connections = {}
150
+
151
+ # Create new connection if not exists for this benchmark
152
+ if safe_name not in self.local.connections:
153
+ self.local.connections[safe_name] = sqlite3.connect(db_path)
154
+
155
+ return self.local.connections[safe_name]
156
 
157
+ def create_tables(self, benchmark_name):
158
+ with self.get_conn(benchmark_name) as conn:
159
+ # Create parsed_results table dynamically from schema
160
+ columns = [f"{col} {dtype}" for col, dtype in PARSED_RESULTS_COLUMNS.items()]
161
+ create_parsed_results = f'''
162
+ CREATE TABLE IF NOT EXISTS parsed_results (
163
+ {', '.join(columns)},
164
+ PRIMARY KEY (benchmark_name, agent_name, run_id)
165
+ )
166
+ '''
167
+ conn.execute(create_parsed_results)
168
+
169
  conn.execute('''
170
  CREATE TABLE IF NOT EXISTS preprocessed_traces (
171
  benchmark_name TEXT,
 
187
  )
188
  ''')
189
  conn.execute('''
190
+ CREATE TABLE IF NOT EXISTS token_usage (
191
  benchmark_name TEXT,
192
  agent_name TEXT,
 
193
  run_id TEXT,
194
+ model_name TEXT,
195
+ prompt_tokens INTEGER,
196
+ completion_tokens INTEGER,
197
+ input_tokens INTEGER,
198
+ output_tokens INTEGER,
199
+ total_tokens INTEGER,
200
+ input_tokens_cache_write INTEGER,
201
+ input_tokens_cache_read INTEGER,
202
+ PRIMARY KEY (benchmark_name, agent_name, run_id, model_name)
 
 
 
 
 
 
 
 
 
 
 
 
203
  )
204
  ''')
205
 
206
  def preprocess_traces(self, processed_dir="evals_live"):
 
207
  processed_dir = Path(processed_dir)
208
  for file in processed_dir.glob('*.json'):
209
  with open(file, 'r') as f:
 
215
  date = data['config']['date']
216
  config = data['config']
217
 
218
+ # Create tables for this benchmark if they don't exist
219
+ self.create_tables(benchmark_name)
220
+
221
  try:
222
  raw_logging_results = pickle.dumps(data['raw_logging_results'])
223
+ with self.get_conn(benchmark_name) as conn:
224
  conn.execute('''
225
  INSERT OR REPLACE INTO preprocessed_traces
226
  (benchmark_name, agent_name, date, run_id, raw_logging_results)
 
231
 
232
  try:
233
  failure_report = pickle.dumps(data['failure_report'])
234
+ with self.get_conn(benchmark_name) as conn:
235
  conn.execute('''
236
  INSERT INTO failure_reports
237
  (benchmark_name, agent_name, date, run_id, failure_report)
238
+ VALUES (?, ?, ?, ?, ?)
239
  ''', (benchmark_name, agent_name, date, config['run_id'], failure_report))
240
  except Exception as e:
241
  print(f"Error preprocessing failure_report in {file}: {e}")
242
 
243
  try:
 
244
  results = data['results']
245
+ with self.get_conn(benchmark_name) as conn:
246
+ # Dynamically create placeholders and values list
247
+ columns = [col for col in PARSED_RESULTS_COLUMNS.keys()
248
+ if col not in ['benchmark_name', 'agent_name', 'date', 'run_id']]
249
+ placeholders = ','.join(['?'] * (len(columns) + 4)) # +4 for benchmark_name, agent_name, date, run_id
250
+
251
+ values = [
252
  benchmark_name,
253
  agent_name,
254
  config['date'],
255
+ config['run_id']
256
+ ] + [str(results.get(col)) if col in ['successful_tasks', 'failed_tasks']
257
+ else results.get(col) for col in columns]
258
+
259
+ query = f'''
260
+ INSERT INTO parsed_results
261
+ ({', '.join(PARSED_RESULTS_COLUMNS.keys())})
262
+ VALUES ({placeholders})
263
+ '''
264
+ conn.execute(query, values)
 
 
 
 
 
 
 
 
 
 
 
 
265
  except Exception as e:
266
  print(f"Error preprocessing parsed results in {file}: {e}")
267
 
268
+ try:
269
+ total_usage = data.get('total_usage', {})
270
+ for model_name, usage in total_usage.items():
271
+ with self.get_conn(benchmark_name) as conn:
272
+ conn.execute('''
273
+ INSERT INTO token_usage
274
+ (benchmark_name, agent_name, run_id, model_name,
275
+ prompt_tokens, completion_tokens, input_tokens, output_tokens, total_tokens,
276
+ input_tokens_cache_write, input_tokens_cache_read)
277
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
278
+ ''', (
279
+ benchmark_name,
280
+ agent_name,
281
+ config['run_id'],
282
+ model_name,
283
+ usage.get('prompt_tokens', 0),
284
+ usage.get('completion_tokens', 0),
285
+ usage.get('input_tokens', 0),
286
+ usage.get('output_tokens', 0),
287
+ usage.get('total_tokens', 0),
288
+ usage.get('input_tokens_cache_write', 0),
289
+ usage.get('input_tokens_cache_read', 0)
290
+ ))
291
+ except Exception as e:
292
+ print(f"Error preprocessing token usage in {file}: {e}")
293
+
294
  @lru_cache(maxsize=100)
295
  def get_analyzed_traces(self, agent_name, benchmark_name):
296
+ with self.get_conn(benchmark_name) as conn:
297
  query = '''
298
  SELECT agent_name, raw_logging_results, date FROM preprocessed_traces
299
  WHERE benchmark_name = ? AND agent_name = ?
300
  '''
301
  df = pd.read_sql_query(query, conn, params=(benchmark_name, agent_name))
302
 
303
+ # check for each row if raw_logging_results is not None
 
304
  df = df[df['raw_logging_results'].apply(lambda x: pickle.loads(x) is not None and x != 'None')]
305
 
306
  if len(df) == 0:
 
309
  # select latest run
310
  df = df.sort_values('date', ascending=False).groupby('agent_name').first().reset_index()
311
 
 
312
  return pickle.loads(df['raw_logging_results'][0])
313
 
 
314
  @lru_cache(maxsize=100)
315
  def get_failure_report(self, agent_name, benchmark_name):
316
+ with self.get_conn(benchmark_name) as conn:
317
  query = '''
318
  SELECT agent_name, date, failure_report FROM failure_reports
319
  WHERE benchmark_name = ? AND agent_name = ?
320
  '''
321
  df = pd.read_sql_query(query, conn, params=(benchmark_name, agent_name))
322
 
 
323
  df = df[df['failure_report'].apply(lambda x: pickle.loads(x) is not None and x != 'None')]
324
 
325
  if len(df) == 0:
326
  return None
 
327
 
 
328
  df = df.sort_values('date', ascending=False).groupby('agent_name').first().reset_index()
329
 
 
330
  return pickle.loads(df['failure_report'][0])
331
+
332
  def _calculate_ci(self, data, confidence=0.95, type='minmax'):
333
  data = data[np.isfinite(data)]
334
 
 
349
  return mean, ci[0], ci[1]
350
 
351
  def get_parsed_results(self, benchmark_name, aggregate=True):
352
+ with self.get_conn(benchmark_name) as conn:
353
  query = '''
354
  SELECT * FROM parsed_results
355
  WHERE benchmark_name = ?
 
357
  '''
358
  df = pd.read_sql_query(query, conn, params=(benchmark_name,))
359
 
360
+ # Load metadata
361
+ with open('agents_metadata.yaml', 'r') as f:
362
+ metadata = yaml.safe_load(f)
363
+
364
+ # Create URL mapping
365
+ url_mapping = {}
366
+ if benchmark_name in metadata:
367
+ for agent in metadata[benchmark_name]:
368
+ if 'url' in agent and agent['url']: # Only add if URL exists and is not empty
369
+ url_mapping[agent['agent_name']] = agent['url']
370
 
371
  # Add 'Verified' column
372
+ verified_agents = self.load_verified_agents()
373
  df['Verified'] = df.apply(lambda row: '✓' if (benchmark_name, row['agent_name']) in verified_agents else '', axis=1)
374
 
375
+ # Add URLs to agent names if they exist
376
+ df['agent_name'] = df['agent_name'].apply(
377
+ lambda x: f'[{x}]({url_mapping[x]})' if x in url_mapping else x
378
+ )
379
 
380
  # Add column for how many times an agent_name appears in the DataFrame
381
  df['Runs'] = df.groupby('agent_name')['agent_name'].transform('count')
382
 
383
  # Compute the 95% confidence interval for accuracy and cost for agents that have been run more than once
384
+ df['accuracy_ci'] = None
385
  df['cost_ci'] = None
386
 
387
+ # Round float columns to 2 decimal places
388
+ float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc',
389
+ 'overall_score', 'vectorization_score', 'fathomnet_score', 'feedback_score',
390
+ 'house_price_score', 'spaceship_titanic_score',
391
+ 'amp_parkinsons_disease_progression_prediction_score', 'cifar10_score',
392
+ 'imdb_score', 'level_1_accuracy', 'level_2_accuracy', 'level_3_accuracy']
393
+
394
+ for column in float_columns:
395
+ if column in df.columns:
396
+ df[column] = df[column].round(2)
397
+
398
  for agent_name in df['agent_name'].unique():
399
  agent_df = df[df['agent_name'] == agent_name]
400
 
 
402
  accuracy_mean, accuracy_lower, accuracy_upper = self._calculate_ci(agent_df['accuracy'], type='minmax')
403
  cost_mean, cost_lower, cost_upper = self._calculate_ci(agent_df['total_cost'], type='minmax')
404
 
405
+ # Round CI values to 2 decimals
406
+ accuracy_ci = f"-{abs(accuracy_mean - accuracy_lower):.2f}/+{abs(accuracy_mean - accuracy_upper):.2f}"
407
+ cost_ci = f"-{abs(cost_mean - cost_lower):.2f}/+{abs(cost_mean - cost_upper):.2f}"
 
 
 
408
 
409
+ df.loc[df['agent_name'] == agent_name, 'accuracy_ci'] = accuracy_ci
410
  df.loc[df['agent_name'] == agent_name, 'cost_ci'] = cost_ci
411
 
 
412
  df = df.drop(columns=['successful_tasks', 'failed_tasks', 'run_id'], axis=1)
413
 
414
  if aggregate:
415
+ df = df.groupby('agent_name').agg(AGGREGATION_RULES).reset_index()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
 
417
+ # Rename columns using the display names mapping
418
+ df = df.rename(columns=COLUMN_DISPLAY_NAMES)
419
+
420
+ # Sort by Accuracy in descending order
421
+ df = df.sort_values('Accuracy', ascending=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
 
423
  return df
424
 
425
  def get_task_success_data(self, benchmark_name):
426
+ with self.get_conn(benchmark_name) as conn:
427
  query = '''
428
  SELECT agent_name, accuracy, successful_tasks, failed_tasks
429
  FROM parsed_results
 
464
 
465
  return df
466
 
467
def load_verified_agents(self, file_path='agents_metadata.yaml'):
    """Return the set of ``(benchmark, agent_name)`` pairs that are verified.

    An agent counts as verified when its metadata entry has a non-empty
    ``verification_date``; entries without the key, or with an empty value,
    are skipped (the same "present and non-empty" rule the URL lookup uses).

    Args:
        file_path: Path to the agents metadata YAML file, laid out as
            ``benchmark_name -> list of agent dicts``.

    Returns:
        set[tuple[str, str]]: ``(benchmark_name, agent_name)`` pairs. Empty
        when the file has no content or no agent is verified.

    Raises:
        OSError: If ``file_path`` cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; normalise to a dict.
        metadata = yaml.safe_load(f) or {}

    verified_agents = set()
    for benchmark, agents in metadata.items():
        # A benchmark key with no entries parses as None, not an empty list.
        for agent in agents or []:
            if agent.get('verification_date'):  # only add if actually verified
                verified_agents.add((benchmark, agent['agent_name']))

    return verified_agents
478
 
479
def get_token_usage_with_costs(self, benchmark_name, pricing_config=None):
    """Return per-agent, per-model token totals with a computed ``total_cost``.

    Token counts are summed over all rows of ``token_usage`` for the given
    benchmark, grouped by ``(agent_name, model_name)``.

    Args:
        benchmark_name: Benchmark whose database is queried.
        pricing_config: Mapping ``model_name -> {"prompt_tokens": price,
            "completion_tokens": price}`` with prices per 1M tokens.
            Defaults to ``DEFAULT_PRICING``.

    Returns:
        pandas.DataFrame: One row per ``(agent_name, model_name)`` with the
        summed token columns and a float ``total_cost`` column. Models that
        are not in ``pricing_config`` get a cost of ``0.0``.
    """
    if pricing_config is None:
        pricing_config = DEFAULT_PRICING

    with self.get_conn(benchmark_name) as conn:
        query = '''
            SELECT agent_name, model_name,
            SUM(prompt_tokens) as prompt_tokens,
            SUM(completion_tokens) as completion_tokens,
            SUM(input_tokens) as input_tokens,
            SUM(output_tokens) as output_tokens,
            SUM(total_tokens) as total_tokens,
            SUM(input_tokens_cache_write) as input_tokens_cache_write,
            SUM(input_tokens_cache_read) as input_tokens_cache_read
            FROM token_usage
            WHERE benchmark_name = ?
            GROUP BY agent_name, model_name
        '''
        df = pd.read_sql_query(query, conn, params=(benchmark_name,))

    # SUM() yields NULL when every value in a group is NULL; treat that as
    # zero tokens so a single missing counter cannot turn the cost into NaN.
    token_columns = [
        'prompt_tokens', 'completion_tokens', 'input_tokens', 'output_tokens',
        'total_tokens', 'input_tokens_cache_write', 'input_tokens_cache_read',
    ]
    df[token_columns] = df[token_columns].apply(pd.to_numeric, errors='coerce').fillna(0)

    # Per-row prices (per 1M tokens); models without a pricing entry cost 0.
    prompt_price = df['model_name'].map(
        lambda m: pricing_config[m]['prompt_tokens'] if m in pricing_config else 0.0
    ).astype(float)
    completion_price = df['model_name'].map(
        lambda m: pricing_config[m]['completion_tokens'] if m in pricing_config else 0.0
    ).astype(float)

    # Both naming schemes (prompt/completion and input/output) are billed,
    # and cache reads/writes are charged at the prompt price.
    prompt_side = (
        df['input_tokens']
        + df['input_tokens_cache_read']
        + df['input_tokens_cache_write']
        + df['prompt_tokens']
    )
    completion_side = df['output_tokens'] + df['completion_tokens']

    df['total_cost'] = (prompt_side * prompt_price + completion_side * completion_price) / 1e6

    return df
514
+
515
def get_parsed_results_with_costs(self, benchmark_name, pricing_config=None, aggregate=True):
    """Return parsed results with ``Total Cost`` recomputed from token usage.

    The stored ``Total Cost`` column is replaced by the cost derived from
    ``get_token_usage_with_costs`` under ``pricing_config``.

    Args:
        benchmark_name: Benchmark to load.
        pricing_config: Optional pricing mapping forwarded to
            ``get_token_usage_with_costs``.
        aggregate: When true, collapse multiple runs of the same agent into
            one row (means for metrics, first value for descriptive columns).

    Returns:
        pandas.DataFrame: Results sorted by ``Accuracy`` in descending
        order. Agents without token usage get ``NaN`` as ``Total Cost``.
    """
    # Un-aggregated base results, already using display column names.
    results_df = self.get_parsed_results(benchmark_name, aggregate=False)

    # Token costs are summed per agent name across all of its models.
    # NOTE(review): this sum is not split per run, so every run row of an
    # agent receives the same value - confirm that is the intended semantics.
    token_costs = self.get_token_usage_with_costs(benchmark_name, pricing_config)
    agent_costs = (
        token_costs.groupby('agent_name')['total_cost'].sum().reset_index()
        .rename(columns={'agent_name': 'agent_name_temp', 'total_cost': 'Total Cost'})
    )

    # Drop the stored cost so the merge below supplies the recomputed one.
    if 'Total Cost' in results_df.columns:
        results_df = results_df.drop('Total Cost', axis=1)

    def _plain_agent_name(display_name):
        """Undo the ``[name](url)`` markdown wrapping; pass plain names through."""
        text = str(display_name)
        split_at = text.rfind('](')
        if text.startswith('[') and text.endswith(')') and split_at != -1:
            return text[1:split_at]
        return text

    # Agent names may be rendered as markdown links; join on the bare name.
    results_df['agent_name_temp'] = results_df['Agent Name'].apply(_plain_agent_name)
    results_df = results_df.merge(agent_costs, on='agent_name_temp', how='left')
    results_df = results_df.drop('agent_name_temp', axis=1)

    if aggregate:
        aggregation_rules = {
            'Date': 'first',
            'Total Cost': 'mean',
            'Accuracy': 'mean',
            'Precision': 'mean',
            'Recall': 'mean',
            'F1 Score': 'mean',
            'AUC': 'mean',
            'Overall Score': 'mean',
            'Vectorization Score': 'mean',
            'Fathomnet Score': 'mean',
            'Feedback Score': 'mean',
            'House Price Score': 'mean',
            'Spaceship Titanic Score': 'mean',
            'AMP Parkinsons Disease Progression Prediction Score': 'mean',
            'CIFAR10 Score': 'mean',
            'IMDB Score': 'mean',
            'Level 1 Accuracy': 'mean',
            'Level 2 Accuracy': 'mean',
            'Level 3 Accuracy': 'mean',
            'Task Goal Completion': 'mean',
            'Scenario Goal Completion': 'mean',
            'Verified': 'first',
            'Runs': 'first',
            'Accuracy CI': 'first',
            'Total Cost CI': 'first',
        }
        # Aggregate only the columns that are actually present so a missing
        # metric column cannot abort the whole aggregation.
        aggregation_rules = {
            column: how for column, how in aggregation_rules.items()
            if column in results_df.columns
        }
        results_df = results_df.groupby('Agent Name').agg(aggregation_rules).reset_index()

    # Round the cost values
    results_df['Total Cost'] = results_df['Total Cost'].round(3)

    # Sort by Accuracy in descending order
    results_df = results_df.sort_values('Accuracy', ascending=False)

    return results_df
582
+
583
+ def check_token_usage_data(self, benchmark_name):
584
+ """Debug helper to check token usage data"""
585
+ with self.get_conn(benchmark_name) as conn:
586
+ query = '''
587
+ SELECT * FROM token_usage
588
+ WHERE benchmark_name = ?
589
+ '''
590
+ df = pd.read_sql_query(query, conn, params=(benchmark_name,))
591
+ return df
592
+
593
+ def get_models_for_benchmark(self, benchmark_name):
594
+ """Get list of unique model names used in a benchmark"""
595
+ with self.get_conn(benchmark_name) as conn:
596
+ query = '''
597
+ SELECT DISTINCT model_name
598
+ FROM token_usage
599
+ WHERE benchmark_name = ?
600
+ '''
601
+ df = pd.read_sql_query(query, conn, params=(benchmark_name,))
602
+ return df['model_name'].tolist()
603
+
604
+ def get_all_agents(self, benchmark_name):
605
+ """Get list of all agent names for a benchmark"""
606
+ with self.get_conn(benchmark_name) as conn:
607
+ query = '''
608
+ SELECT DISTINCT agent_name
609
+ FROM parsed_results
610
+ WHERE benchmark_name = ?
611
+ '''
612
+ df = pd.read_sql_query(query, conn, params=(benchmark_name,))
613
+ return df['agent_name'].tolist()
614
+
615
  if __name__ == '__main__':
616
  preprocessor = TracePreprocessor()
617
  preprocessor.preprocess_traces()
utils/viz.py CHANGED
@@ -19,7 +19,7 @@ def create_leaderboard(df, ci_metrics = None):
19
  # for rows in the df for which CI metric is not None, join the metric and CI columns by looping through the CI metrics columns
20
  for i, row in df.iterrows():
21
  if str(row[CI_metric]) != 'None':
22
- df.at[i, metric] = str(row[metric]) + " (" + str(row[CI_metric]) + ")"
23
 
24
  return df
25
 
 
19
  # for rows in the df for which CI metric is not None, join the metric and CI columns by looping through the CI metrics columns
20
  for i, row in df.iterrows():
21
  if str(row[CI_metric]) != 'None':
22
+ df.at[i, metric] = str(round(float(row[metric]), 2)) + " (" + str(row[CI_metric]) + ")"
23
 
24
  return df
25
 
verified_agents.yaml DELETED
@@ -1,129 +0,0 @@
1
- # This file contains information about verified agent results for different benchmarks.
2
- # Format:
3
- # benchmark_name:
4
- # - agent_name: "Name of the agent"
5
- # verification_date: YYYY-MM-DD
6
-
7
- usaco:
8
- - agent_name: "USACO Reflexion + Episodic (gpt-4o-mini-2024-07-18)"
9
- verification_date: 2024-08-20
10
- - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-mini-2024-07-18)"
11
- verification_date: 2024-08-20
12
- - agent_name: "USACO Reflexion (gpt-4o-mini-2024-07-18)"
13
- verification_date: 2024-08-20
14
- - agent_name: "USACO Episodic (gpt-4o-mini-2024-07-18)"
15
- verification_date: 2024-08-12
16
- - agent_name: "USACO Reflexion + Semantic (gpt-4o-mini-2024-07-18)"
17
- verification_date: 2024-08-20
18
- - agent_name: "USACO Zero-shot (gpt-4o-mini-2024-07-18)"
19
- verification_date: 2024-08-11
20
- - agent_name: "USACO Semantic (gpt-4o-mini-2024-07-18)"
21
- verification_date: 2024-08-12
22
- - agent_name: USACO Reflexion + Episodic + Semantic (gpt-4o-2024-05-13)
23
- verification_date: 2024-08-25
24
- - agent_name: USACO Reflexion + Episodic (gpt-4o-2024-05-13)
25
- verification_date: 2024-08-25
26
- - agent_name: USACO Reflexion + Semantic (gpt-4o-2024-05-13)
27
- verification_date: 2024-08-25
28
- - agent_name: Episodic Retrial (2x) (gpt-4o-2024-05-13)
29
- verification_date: 2024-08-25
30
- - agent_name: Episodic Retrial (3x) (gpt-4o-mini-2024-07-18)
31
- verification_date: 2024-08-25
32
- - agent_name: Episodic Retrial (2x) (gpt-4o-mini-2024-07-18)
33
- verification_date: 2024-08-25
34
- - agent_name: Episodic Retrial (5x) (gpt-4o-mini-2024-07-18)
35
- verification_date: 2024-08-25
36
- - agent_name: Episodic Warming (3 Steps) (gpt-4o-mini-2024-07-18)
37
- verification_date: 2024-08-24
38
- - agent_name: USACO Episodic (gpt-4o-2024-05-13)
39
- verification_date: 2024-08-24
40
- - agent_name: USACO Semantic (gpt-4o-2024-05-13)
41
- verification_date: 2024-08-24
42
- - agent_name: Zero-shot Retrial (2x) (gpt-4o-mini-2024-07-18)
43
- verification_date: 2024-08-24
44
- - agent_name: Zero-shot Retrial (3x) (gpt-4o-mini-2024-07-18)
45
- verification_date: 2024-08-24
46
- - agent_name: Zero-shot Retrial (5x) (gpt-4o-mini-2024-07-18)
47
- verification_date: 2024-08-24
48
- - agent_name: USACO Zero-shot (gpt-4o-2024-05-13)
49
- verification_date: 2024-08-24
50
-
51
- swebench_verified_mini:
52
- - agent_name: "Agentless (gpt-4o-mini-2024-07-18)"
53
- verification_date: 2024-08-17
54
- - agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1)"
55
- verification_date: 2024-08-19
56
- - agent_name: "Moatless (gpt-4o-mini-2024-07-18)"
57
- verification_date: 2024-10-30
58
- - agent_name: "Moatless (gpt-4o-2024-08-06)"
59
- verification_date: 2024-10-30
60
- - agent_name: "Moatless (claude-3-5-sonnet-20241022)"
61
- verification_date: 2024-10-30
62
- - agent_name: "Agentless (o1-mini-2024-09-12)"
63
- verification_date: 2024-10-30
64
-
65
-
66
- swebench_verified:
67
- - agent_name: "Moatless (gpt-4o-2024-08-06)"
68
- verification_date: 2024-10-30
69
- - agent_name: "Agentless (o1-mini-2024-09-12)"
70
- verification_date: 2024-10-30
71
-
72
- mlagentbench:
73
- - agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
74
- verification_date: 2024-08-19
75
-
76
-
77
- corebench_easy:
78
- - agent_name: "AutoGPT (GPT-4o)"
79
- verification_date: 2024-09-28
80
- - agent_name: "AutoGPT (GPT-4o-mini)"
81
- verification_date: 2024-09-28
82
- - agent_name: "CORE-Agent (GPT-4o)"
83
- verification_date: 2024-09-28
84
- - agent_name: "CORE-Agent (GPT-4o-mini)"
85
- verification_date: 2024-09-28
86
-
87
- corebench_medium:
88
- - agent_name: "AutoGPT (GPT-4o)"
89
- verification_date: 2024-09-28
90
- - agent_name: "AutoGPT (GPT-4o-mini)"
91
- verification_date: 2024-09-28
92
- - agent_name: "CORE-Agent (GPT-4o)"
93
- verification_date: 2024-09-28
94
- - agent_name: "CORE-Agent (GPT-4o-mini)"
95
- verification_date: 2024-09-28
96
-
97
- corebench_hard:
98
- - agent_name: "AutoGPT (GPT-4o)"
99
- verification_date: 2024-09-28
100
- - agent_name: "AutoGPT (GPT-4o-mini)"
101
- verification_date: 2024-09-28
102
- - agent_name: "CORE-Agent (GPT-4o)"
103
- verification_date: 2024-09-28
104
- - agent_name: "CORE-Agent (GPT-4o-mini)"
105
- verification_date: 2024-09-28
106
-
107
- gaia:
108
- - agent_name: "Inspect Default Agent (gpt-4o-mini-2024-07-18)"
109
- verification_date: 2024-11-30
110
- - agent_name: "Inspect Default Agent (gpt-4o-2024-11-20)"
111
- verification_date: 2024-11-30
112
- - agent_name: "Inspect Default Agent (claude-3-5-sonnet-20241022)"
113
- verification_date: 2024-11-30
114
- - agent_name: "Inspect Default Agent (Meta-Llama-3.1-405B-Instruct-Turbo)"
115
- verification_date: 2024-11-30
116
-
117
- cybench:
118
- - agent_name: "Inspect Default Agent (gpt-4o-mini-2024-07-18)"
119
- verification_date: 2024-11-30
120
- - agent_name: "Inspect Default Agent (gpt-4o-2024-11-20)"
121
- verification_date: 2024-11-30
122
- - agent_name: "Inspect Default Agent (claude-3-5-sonnet-20241022)"
123
- verification_date: 2024-11-30
124
- - agent_name: "Inspect Default Agent (o1-mini-2024-09-12)"
125
- verification_date: 2024-11-30
126
- - agent_name: "Inspect Default Agent (Meta-Llama-3.1-405B-Instruct-Turbo)"
127
- verification_date: 2024-11-30
128
-
129
-