Spaces: Running
benediktstroebl committed
Commit 56a86ce · 1 Parent(s): 3a9a3a3
big update with dynamic pricing, agent metadata, about page on top, and new benchmarks
Browse files
- .gitignore +2 -0
- agents_metadata.yaml +272 -0
- app.py +0 -0
- config.py +1 -0
- css.css +4 -0
- utils/db.py +395 -153
- utils/viz.py +1 -1
- verified_agents.yaml +0 -129
.gitignore
CHANGED
@@ -3,3 +3,5 @@
 evals_upload/*
 evals_live/*
 evals_processed/*
+*.db
+.env
agents_metadata.yaml
ADDED
@@ -0,0 +1,272 @@
+# This file contains metadata about agents for different benchmarks.
+# Format:
+# benchmark_name:
+#   - agent_name: "Name of the agent"
+#     verification_date: YYYY-MM-DD  # Optional verification date
+#     url: "https://..."  # Optional link to agent code/paper
+
+gaia:
+  - agent_name: "Inspect ReAct Agent (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
+  - agent_name: "Inspect ReAct Agent (gpt-4o-2024-11-20)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
+  - agent_name: "Inspect ReAct Agent (claude-3-5-sonnet-20241022)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
+  - agent_name: "Inspect ReAct Agent (Meta-Llama-3.1-405B-Instruct-Turbo)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
+  - agent_name: "Inspect ReAct Agent (o1-mini-2024-09-12)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
+
+cybench:
+  - agent_name: "Inspect ReAct Agent (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/cybench"
+  - agent_name: "Inspect ReAct Agent (gpt-4o-2024-11-20)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/cybench"
+  - agent_name: "Inspect ReAct Agent (claude-3-5-sonnet-20241022)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/cybench"
+  - agent_name: "Inspect ReAct Agent (o1-mini-2024-09-12)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/cybench"
+  - agent_name: "Inspect ReAct Agent (Meta-Llama-3.1-405B-Instruct-Turbo)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/cybench"
+
+swebench_verified:
+  - agent_name: "Moatless (gpt-4o-2024-08-06)"
+    verification_date: 2024-10-30
+    url: "https://github.com/aorwall/moatless-tools"
+  - agent_name: "Agentless (o1-mini-2024-09-12)"
+    verification_date: 2024-10-30
+    url: "https://github.com/OpenAutoCoder/Agentless"
+  - agent_name: "Moatless (claude-3-5-sonnet-20241022)"
+    verification_date: 2024-11-30
+    url: "https://github.com/aorwall/moatless-tools"
+  - agent_name: "Agentless (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-11-30
+    url: "https://github.com/OpenAutoCoder/Agentless"
+
+swebench_verified_mini:
+  - agent_name: "Agentless (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-17
+    url: "https://github.com/OpenAutoCoder/Agentless"
+  - agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1)"
+    verification_date: 2024-08-19
+    url: "https://github.com/princeton-nlp/SWE-agent"
+  - agent_name: "Moatless (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-10-30
+    url: "https://github.com/aorwall/moatless-tools"
+  - agent_name: "Moatless (gpt-4o-2024-08-06)"
+    verification_date: 2024-10-30
+    url: "https://github.com/aorwall/moatless-tools"
+  - agent_name: "Moatless (claude-3-5-sonnet-20241022)"
+    verification_date: 2024-10-30
+    url: "https://github.com/aorwall/moatless-tools"
+  - agent_name: "Agentless (o1-mini-2024-09-12)"
+    verification_date: 2024-10-30
+    url: "https://github.com/OpenAutoCoder/Agentless"
+
+corebench_easy:
+  - agent_name: "AutoGPT (GPT-4o)"
+    verification_date: 2024-09-28
+    url: "https://github.com/princeton-nlp/SWE-bench"
+  - agent_name: "AutoGPT (GPT-4o-mini)"
+    verification_date: 2024-09-28
+    url: "https://github.com/princeton-nlp/SWE-bench"
+  - agent_name: "CORE-Agent (GPT-4o)"
+    verification_date: 2024-09-28
+    url: "https://github.com/princeton-nlp/SWE-bench"
+  - agent_name: "CORE-Agent (GPT-4o-mini)"
+    verification_date: 2024-09-28
+    url: "https://github.com/princeton-nlp/SWE-bench"
+
+corebench_medium:
+  - agent_name: "AutoGPT (GPT-4o)"
+    verification_date: 2024-09-28
+    url: "https://github.com/Significant-Gravitas/AutoGPT"
+  - agent_name: "AutoGPT (GPT-4o-mini)"
+    verification_date: 2024-09-28
+    url: "https://github.com/Significant-Gravitas/AutoGPT"
+  - agent_name: "CORE-Agent (GPT-4o)"
+    verification_date: 2024-09-28
+    url: "https://github.com/siegelz/core-bench"
+  - agent_name: "CORE-Agent (GPT-4o-mini)"
+    verification_date: 2024-09-28
+    url: "https://github.com/siegelz/core-bench"
+
+corebench_hard:
+  - agent_name: "AutoGPT (GPT-4o)"
+    verification_date: 2024-09-28
+    url: "https://github.com/Significant-Gravitas/AutoGPT"
+  - agent_name: "AutoGPT (GPT-4o-mini)"
+    verification_date: 2024-09-28
+    url: "https://github.com/Significant-Gravitas/AutoGPT"
+  - agent_name: "CORE-Agent (GPT-4o)"
+    verification_date: 2024-09-28
+    url: "https://github.com/siegelz/core-bench"
+  - agent_name: "CORE-Agent (GPT-4o-mini)"
+    verification_date: 2024-09-28
+    url: "https://github.com/siegelz/core-bench"
+
+usaco:
+  - agent_name: "USACO Reflexion + Episodic (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-20
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-20
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Reflexion (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-20
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Episodic (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-12
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Reflexion + Semantic (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-20
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Zero-shot (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-11
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Semantic (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-12
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-2024-05-13)"
+    verification_date: 2024-08-25
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Reflexion + Episodic (gpt-4o-2024-05-13)"
+    verification_date: 2024-08-25
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Reflexion + Semantic (gpt-4o-2024-05-13)"
+    verification_date: 2024-08-25
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "Episodic Retrial (2x) (gpt-4o-2024-05-13)"
+    verification_date: 2024-08-25
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "Episodic Retrial (3x) (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-25
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "Episodic Retrial (2x) (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-25
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "Episodic Retrial (5x) (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-25
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "Episodic Warming (3 Steps) (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-24
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Episodic (gpt-4o-2024-05-13)"
+    verification_date: 2024-08-24
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Semantic (gpt-4o-2024-05-13)"
+    verification_date: 2024-08-24
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "Zero-shot Retrial (2x) (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-24
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "Zero-shot Retrial (3x) (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-24
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "Zero-shot Retrial (5x) (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-24
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Zero-shot (gpt-4o-2024-05-13)"
+    verification_date: 2024-08-24
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Zero-Shot (claude-3-5-sonnet-20241022)"
+    verification_date: 2024-11-30
+    url: "https://github.com/princeton-nlp/USACO"
+
+appworld_test_normal:
+  - agent_name: "ReAct (gpt-4o-2024-05-13)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "PlanExec (gpt-4o-2024-05-13)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "FullCodeRefl (gpt-4o-2024-05-13)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "PlanExec (gpt-4-turbo-2024-04-09)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "IPFunCall (gpt-4o-2024-05-13)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "IPFunCall (gpt-4-turbo-2024-04-09)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "ReAct (gpt-4-turbo-2024-04-09)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "FullCodeRefl (gpt-4-turbo-2024-04-09)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "FullCodeRefl (meta-llama/Llama-3-70b-chat-hf)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "ReAct (meta-llama/Llama-3-70b-chat-hf)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "FullCodeRefl (deepseek-ai/deepseek-coder-33b-instruct)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "PlanExec (meta-llama/Llama-3-70b-chat-hf)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "ReAct (deepseek-ai/deepseek-coder-33b-instruct)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "PlanExec (deepseek-ai/deepseek-coder-33b-instruct)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+
+appworld_test_challenge:
+  - agent_name: "ReAct (gpt-4o-2024-05-13)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "PlanExec (gpt-4o-2024-05-13)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "FullCodeRefl (gpt-4o-2024-05-13)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "PlanExec (gpt-4-turbo-2024-04-09)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "IPFunCall (gpt-4o-2024-05-13)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "IPFunCall (gpt-4-turbo-2024-04-09)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "ReAct (gpt-4-turbo-2024-04-09)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "FullCodeRefl (gpt-4-turbo-2024-04-09)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "FullCodeRefl (meta-llama/Llama-3-70b-chat-hf)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "ReAct (meta-llama/Llama-3-70b-chat-hf)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "FullCodeRefl (deepseek-ai/deepseek-coder-33b-instruct)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "PlanExec (meta-llama/Llama-3-70b-chat-hf)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "ReAct (deepseek-ai/deepseek-coder-33b-instruct)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "PlanExec (deepseek-ai/deepseek-coder-33b-instruct)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
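Note: a minimal sketch (not part of the commit) of how this file is consumed; it mirrors `load_verified_agents` and the URL mapping added to `utils/db.py` below. The `'gaia'` key is just an example benchmark:

```python
# Sketch: reading agents_metadata.yaml the way utils/db.py does.
import yaml

with open('agents_metadata.yaml', 'r') as f:
    metadata = yaml.safe_load(f)

# An agent counts as "verified" if its entry carries a verification_date.
verified = {(bench, a['agent_name'])
            for bench, agents in metadata.items()
            for a in agents if 'verification_date' in a}

# Optional per-benchmark URL mapping used to link agent names.
urls = {a['agent_name']: a['url']
        for a in metadata.get('gaia', []) if a.get('url')}
```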
app.py
CHANGED
The diff for this file is too large to render. See raw diff.
config.py
CHANGED
@@ -69,6 +69,7 @@ APPWORLD_ON_LOAD_COLUMNS = [
     "Accuracy",
     "Total Cost",
     "Runs",
+    "Scenario Goal Completion"
 ]
 APPWORLD_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
 APPWORLD_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score', "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]
css.css
CHANGED
@@ -25,6 +25,10 @@ cursor: pointer;
 transition: background-color 0.3s;
 }
 
+a {
+    color: #000;
+}
+
 .tab-nav button:hover,
 .tab-nav button.selected {
 background-color: var(--primary-color);
utils/db.py
CHANGED
@@ -10,18 +10,162 @@ from scipy import stats
 import yaml
 import numpy as np
 
+# Define column schemas
+PARSED_RESULTS_COLUMNS = {
+    'benchmark_name': 'TEXT',
+    'agent_name': 'TEXT',
+    'date': 'TEXT',
+    'run_id': 'TEXT',
+    'successful_tasks': 'TEXT',
+    'failed_tasks': 'TEXT',
+    'total_cost': 'REAL',
+    'accuracy': 'REAL',
+    'precision': 'REAL',
+    'recall': 'REAL',
+    'f1_score': 'REAL',
+    'auc': 'REAL',
+    'overall_score': 'REAL',
+    'vectorization_score': 'REAL',
+    'fathomnet_score': 'REAL',
+    'feedback_score': 'REAL',
+    'house_price_score': 'REAL',
+    'spaceship_titanic_score': 'REAL',
+    'amp_parkinsons_disease_progression_prediction_score': 'REAL',
+    'cifar10_score': 'REAL',
+    'imdb_score': 'REAL',
+    'level_1_accuracy': 'REAL',
+    'level_2_accuracy': 'REAL',
+    'level_3_accuracy': 'REAL',
+    'task_goal_completion': 'REAL',  # New column
+    'scenario_goal_completion': 'REAL',  # New column
+    'accuracy_ci': 'TEXT',  # Using TEXT since it stores formatted strings like "-0.123/+0.456"
+    'cost_ci': 'TEXT',
+}
+
+# Define which columns should be included in aggregation and how
+AGGREGATION_RULES = {
+    'date': 'first',
+    'total_cost': 'mean',
+    'accuracy': 'mean',
+    'precision': 'mean',
+    'recall': 'mean',
+    'f1_score': 'mean',
+    'auc': 'mean',
+    'overall_score': 'mean',
+    'vectorization_score': 'mean',
+    'fathomnet_score': 'mean',
+    'feedback_score': 'mean',
+    'house_price_score': 'mean',
+    'spaceship_titanic_score': 'mean',
+    'amp_parkinsons_disease_progression_prediction_score': 'mean',
+    'cifar10_score': 'mean',
+    'imdb_score': 'mean',
+    'level_1_accuracy': 'mean',
+    'level_2_accuracy': 'mean',
+    'level_3_accuracy': 'mean',
+    'task_goal_completion': 'mean',
+    'scenario_goal_completion': 'mean',
+    'Verified': 'first',
+    'Runs': 'first',
+    'accuracy_ci': 'first',
+    'cost_ci': 'first',
+}
+
+# Define column display names
+COLUMN_DISPLAY_NAMES = {
+    'agent_name': 'Agent Name',
+    'date': 'Date',
+    'total_cost': 'Total Cost',
+    'accuracy': 'Accuracy',
+    'precision': 'Precision',
+    'recall': 'Recall',
+    'f1_score': 'F1 Score',
+    'auc': 'AUC',
+    'overall_score': 'Overall Score',
+    'vectorization_score': 'Vectorization Score',
+    'fathomnet_score': 'Fathomnet Score',
+    'feedback_score': 'Feedback Score',
+    'house_price_score': 'House Price Score',
+    'spaceship_titanic_score': 'Spaceship Titanic Score',
+    'amp_parkinsons_disease_progression_prediction_score': 'AMP Parkinsons Disease Progression Prediction Score',
+    'cifar10_score': 'CIFAR10 Score',
+    'imdb_score': 'IMDB Score',
+    'level_1_accuracy': 'Level 1 Accuracy',
+    'level_2_accuracy': 'Level 2 Accuracy',
+    'level_3_accuracy': 'Level 3 Accuracy',
+    'task_goal_completion': 'Task Goal Completion',
+    'scenario_goal_completion': 'Scenario Goal Completion',
+    'accuracy_ci': 'Accuracy CI',
+    'cost_ci': 'Total Cost CI',
+}
+
+DEFAULT_PRICING = {
+    "text-embedding-3-small": {"prompt_tokens": 0.02, "completion_tokens": 0},
+    "text-embedding-3-large": {"prompt_tokens": 0.13, "completion_tokens": 0},
+    "gpt-4o-2024-05-13": {"prompt_tokens": 2.5, "completion_tokens": 10},
+    "gpt-4o-2024-08-06": {"prompt_tokens": 2.5, "completion_tokens": 10},
+    "gpt-3.5-turbo-0125": {"prompt_tokens": 0.5, "completion_tokens": 1.5},
+    "gpt-3.5-turbo": {"prompt_tokens": 0.5, "completion_tokens": 1.5},
+    "gpt-4-turbo-2024-04-09": {"prompt_tokens": 10, "completion_tokens": 30},
+    "gpt-4-turbo": {"prompt_tokens": 10, "completion_tokens": 30},
+    "gpt-4o-mini-2024-07-18": {"prompt_tokens": 0.15, "completion_tokens": 0.6},
+    "meta-llama/Meta-Llama-3.1-8B-Instruct": {"prompt_tokens": 0.18, "completion_tokens": 0.18},
+    "meta-llama/Meta-Llama-3.1-70B-Instruct": {"prompt_tokens": 0.88, "completion_tokens": 0.88},
+    "meta-llama/Meta-Llama-3.1-405B-Instruct": {"prompt_tokens": 5, "completion_tokens": 15},
+    "gpt-4o": {"prompt_tokens": 2.5, "completion_tokens": 10},
+    "o1-mini-2024-09-12": {"prompt_tokens": 3, "completion_tokens": 12},
+    "o1-preview-2024-09-12": {"prompt_tokens": 15, "completion_tokens": 60},
+    "claude-3-5-sonnet-20240620": {"prompt_tokens": 3, "completion_tokens": 15},
+    "claude-3-5-sonnet-20241022": {"prompt_tokens": 3, "completion_tokens": 15},
+    "us.anthropic.claude-3-5-sonnet-20240620-v1:0": {"prompt_tokens": 3, "completion_tokens": 15},
+    "us.anthropic.claude-3-5-sonnet-20241022-v2:0": {"prompt_tokens": 3, "completion_tokens": 15},
+    "openai/gpt-4o-2024-11-20": {"prompt_tokens": 2.5, "completion_tokens": 10},
+    "openai/gpt-4o-2024-08-06": {"prompt_tokens": 2.5, "completion_tokens": 10},
+    "openai/gpt-4o-mini-2024-07-18": {"prompt_tokens": 0.15, "completion_tokens": 0.6},
+    "openai/o1-mini-2024-09-12": {"prompt_tokens": 3, "completion_tokens": 12},
+    "openai/o1-preview-2024-09-12": {"prompt_tokens": 15, "completion_tokens": 60},
+    "anthropic/claude-3-5-sonnet-20240620": {"prompt_tokens": 3, "completion_tokens": 15},
+    "anthropic/claude-3-5-sonnet-20241022": {"prompt_tokens": 3, "completion_tokens": 15},
+    "google/gemini-1.5-pro": {"prompt_tokens": 1.25, "completion_tokens": 5},
+    "google/gemini-1.5-flash": {"prompt_tokens": 0.075, "completion_tokens": 0.3},
+    "together/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo": {"prompt_tokens": 3.5, "completion_tokens": 3.5},
+    "together/meta-llama/Meta-Llama-3.1-70B-Instruct": {"prompt_tokens": 0.88, "completion_tokens": 0.88},
+}
+
 class TracePreprocessor:
-    def __init__(self, …
-        self.…
+    def __init__(self, db_dir='preprocessed_traces'):
+        self.db_dir = Path(db_dir)
+        self.db_dir.mkdir(exist_ok=True)
         self.local = threading.local()
+        self.connections = {}
 
-    def get_conn(self):
+    def get_conn(self, benchmark_name):
+        # Sanitize benchmark name for filename
+        safe_name = benchmark_name.replace('/', '_').replace('\\', '_')
+        db_path = self.db_dir / f"{safe_name}.db"
+
+        # Get thread-specific connections dictionary
+        if not hasattr(self.local, 'connections'):
+            self.local.connections = {}
+
+        # Create new connection if not exists for this benchmark
+        if safe_name not in self.local.connections:
+            self.local.connections[safe_name] = sqlite3.connect(db_path)
+
+        return self.local.connections[safe_name]
 
-    def create_tables(self):
-        with self.get_conn() as conn:
+    def create_tables(self, benchmark_name):
+        with self.get_conn(benchmark_name) as conn:
+            # Create parsed_results table dynamically from schema
+            columns = [f"{col} {dtype}" for col, dtype in PARSED_RESULTS_COLUMNS.items()]
+            create_parsed_results = f'''
+                CREATE TABLE IF NOT EXISTS parsed_results (
+                    {', '.join(columns)},
+                    PRIMARY KEY (benchmark_name, agent_name, run_id)
+                )
+            '''
+            conn.execute(create_parsed_results)
+
             conn.execute('''
                 CREATE TABLE IF NOT EXISTS preprocessed_traces (
                     benchmark_name TEXT,
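Note: a standalone sketch (not from the commit) of the two mechanisms this hunk introduces — one SQLite file per benchmark, and a `parsed_results` table generated from the schema dict. The three-column `SCHEMA` and the benchmark name are illustrative stand-ins for `PARSED_RESULTS_COLUMNS`:

```python
# Sketch: per-benchmark DB files plus a schema-driven CREATE TABLE.
import sqlite3
from pathlib import Path

SCHEMA = {'benchmark_name': 'TEXT', 'agent_name': 'TEXT', 'accuracy': 'REAL'}

db_dir = Path('preprocessed_traces')
db_dir.mkdir(exist_ok=True)
safe_name = 'swebench_verified_mini'.replace('/', '_')  # one .db file per benchmark
conn = sqlite3.connect(db_dir / f"{safe_name}.db")

columns = [f"{col} {dtype}" for col, dtype in SCHEMA.items()]
conn.execute(f"CREATE TABLE IF NOT EXISTS parsed_results ({', '.join(columns)})")
# Generated SQL: CREATE TABLE IF NOT EXISTS parsed_results
#   (benchmark_name TEXT, agent_name TEXT, accuracy REAL)
```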
@@ -43,37 +187,23 @@ class TracePreprocessor:
                 )
             ''')
             conn.execute('''
-                CREATE TABLE IF NOT EXISTS parsed_results (
+                CREATE TABLE IF NOT EXISTS token_usage (
                     benchmark_name TEXT,
                     agent_name TEXT,
-                    date TEXT,
                     run_id TEXT,
-                    successful_tasks TEXT,
-                    failed_tasks TEXT,
-                    total_cost REAL,
-                    accuracy REAL,
-                    precision REAL,
-                    recall REAL,
-                    f1_score REAL,
-                    auc REAL,
-                    overall_score REAL,
-                    vectorization_score REAL,
-                    fathomnet_score REAL,
-                    feedback_score REAL,
-                    house_price_score REAL,
-                    spaceship_titanic_score REAL,
-                    amp_parkinsons_disease_progression_prediction_score REAL,
-                    cifar10_score REAL,
-                    imdb_score REAL,
-                    level_1_accuracy REAL,
-                    level_2_accuracy REAL,
-                    level_3_accuracy REAL,
-                    PRIMARY KEY (benchmark_name, agent_name, run_id)
+                    model_name TEXT,
+                    prompt_tokens INTEGER,
+                    completion_tokens INTEGER,
+                    input_tokens INTEGER,
+                    output_tokens INTEGER,
+                    total_tokens INTEGER,
+                    input_tokens_cache_write INTEGER,
+                    input_tokens_cache_read INTEGER,
+                    PRIMARY KEY (benchmark_name, agent_name, run_id, model_name)
                 )
             ''')
 
     def preprocess_traces(self, processed_dir="evals_live"):
-        self.create_tables()
         processed_dir = Path(processed_dir)
         for file in processed_dir.glob('*.json'):
             with open(file, 'r') as f:
@@ -85,9 +215,12 @@ class TracePreprocessor:
                 date = data['config']['date']
                 config = data['config']
 
+                # Create tables for this benchmark if they don't exist
+                self.create_tables(benchmark_name)
+
                 try:
                     raw_logging_results = pickle.dumps(data['raw_logging_results'])
-                    with self.get_conn() as conn:
+                    with self.get_conn(benchmark_name) as conn:
                         conn.execute('''
                             INSERT OR REPLACE INTO preprocessed_traces
                             (benchmark_name, agent_name, date, run_id, raw_logging_results)
@@ -98,63 +231,76 @@ class TracePreprocessor:
 
                 try:
                     failure_report = pickle.dumps(data['failure_report'])
-                    with self.get_conn() as conn:
+                    with self.get_conn(benchmark_name) as conn:
                         conn.execute('''
                             INSERT INTO failure_reports
                             (benchmark_name, agent_name, date, run_id, failure_report)
-                            VALUES (?, ?, ?, ?)
+                            VALUES (?, ?, ?, ?, ?)
                         ''', (benchmark_name, agent_name, date, config['run_id'], failure_report))
                 except Exception as e:
                     print(f"Error preprocessing failure_report in {file}: {e}")
 
                 try:
-                    config = data['config']
                     results = data['results']
-                    with self.get_conn() as conn:
+                    with self.get_conn(benchmark_name) as conn:
+                        # Dynamically create placeholders and values list
+                        columns = [col for col in PARSED_RESULTS_COLUMNS.keys()
+                                   if col not in ['benchmark_name', 'agent_name', 'date', 'run_id']]
+                        placeholders = ','.join(['?'] * (len(columns) + 4))  # +4 for benchmark_name, agent_name, date, run_id
+
+                        values = [
                             benchmark_name,
                             agent_name,
                             config['date'],
-                            config['run_id'],
-                            results.get('vectorization_score'),
-                            results.get('fathomnet_score'),
-                            results.get('feedback_score'),
-                            results.get('house-price_score'),
-                            results.get('spaceship-titanic_score'),
-                            results.get('amp-parkinsons-disease-progression-prediction_score'),
-                            results.get('cifar10_score'),
-                            results.get('imdb_score'),
-                            results.get('level_1_accuracy'),
-                            results.get('level_2_accuracy'),
-                            results.get('level_3_accuracy')
-                        ))
+                            config['run_id']
+                        ] + [str(results.get(col)) if col in ['successful_tasks', 'failed_tasks']
+                             else results.get(col) for col in columns]
+
+                        query = f'''
+                            INSERT INTO parsed_results
+                            ({', '.join(PARSED_RESULTS_COLUMNS.keys())})
+                            VALUES ({placeholders})
+                        '''
+                        conn.execute(query, values)
                 except Exception as e:
                     print(f"Error preprocessing parsed results in {file}: {e}")
 
+                try:
+                    total_usage = data.get('total_usage', {})
+                    for model_name, usage in total_usage.items():
+                        with self.get_conn(benchmark_name) as conn:
+                            conn.execute('''
+                                INSERT INTO token_usage
+                                (benchmark_name, agent_name, run_id, model_name,
+                                 prompt_tokens, completion_tokens, input_tokens, output_tokens, total_tokens,
+                                 input_tokens_cache_write, input_tokens_cache_read)
+                                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                            ''', (
+                                benchmark_name,
+                                agent_name,
+                                config['run_id'],
+                                model_name,
+                                usage.get('prompt_tokens', 0),
+                                usage.get('completion_tokens', 0),
+                                usage.get('input_tokens', 0),
+                                usage.get('output_tokens', 0),
+                                usage.get('total_tokens', 0),
+                                usage.get('input_tokens_cache_write', 0),
+                                usage.get('input_tokens_cache_read', 0)
+                            ))
+                except Exception as e:
+                    print(f"Error preprocessing token usage in {file}: {e}")
+
     @lru_cache(maxsize=100)
     def get_analyzed_traces(self, agent_name, benchmark_name):
-        with self.get_conn() as conn:
+        with self.get_conn(benchmark_name) as conn:
             query = '''
                 SELECT agent_name, raw_logging_results, date FROM preprocessed_traces
                 WHERE benchmark_name = ? AND agent_name = ?
             '''
             df = pd.read_sql_query(query, conn, params=(benchmark_name, agent_name))
 
-            # check for each row if raw_logging_results is not None with pickle.loads because it is stored as a byte string
+            # check for each row if raw_logging_results is not None
             df = df[df['raw_logging_results'].apply(lambda x: pickle.loads(x) is not None and x != 'None')]
 
             if len(df) == 0:
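Note: the write path above derives its placeholders and values from the same schema dict, so adding a column only means editing `PARSED_RESULTS_COLUMNS`. A toy sketch with assumed example values:

```python
# Sketch: placeholders and values derived from a schema dict (toy example).
SCHEMA = {'benchmark_name': 'TEXT', 'agent_name': 'TEXT', 'accuracy': 'REAL'}
results = {'accuracy': 0.42}  # assumed parsed-results payload

fixed = ['benchmark_name', 'agent_name']
columns = [c for c in SCHEMA if c not in fixed]
placeholders = ','.join(['?'] * (len(fixed) + len(columns)))
values = ['usaco', 'ReAct (gpt-4o)'] + [results.get(c) for c in columns]
query = f"INSERT INTO parsed_results ({', '.join(SCHEMA)}) VALUES ({placeholders})"
# query == "INSERT INTO parsed_results (benchmark_name, agent_name, accuracy) VALUES (?,?,?)"
```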
@@ -163,32 +309,26 @@ class TracePreprocessor:
             # select latest run
             df = df.sort_values('date', ascending=False).groupby('agent_name').first().reset_index()
 
             return pickle.loads(df['raw_logging_results'][0])
 
     @lru_cache(maxsize=100)
     def get_failure_report(self, agent_name, benchmark_name):
-        with self.get_conn() as conn:
+        with self.get_conn(benchmark_name) as conn:
             query = '''
                 SELECT agent_name, date, failure_report FROM failure_reports
                 WHERE benchmark_name = ? AND agent_name = ?
             '''
             df = pd.read_sql_query(query, conn, params=(benchmark_name, agent_name))
 
-            # Select only rows for which failure report is not None and None is a string
             df = df[df['failure_report'].apply(lambda x: pickle.loads(x) is not None and x != 'None')]
 
             if len(df) == 0:
                 return None
 
-            # if there is multiple failure reports, take the last one
             df = df.sort_values('date', ascending=False).groupby('agent_name').first().reset_index()
 
-            # if there is a failure report, return the first one
             return pickle.loads(df['failure_report'][0])
 
     def _calculate_ci(self, data, confidence=0.95, type='minmax'):
         data = data[np.isfinite(data)]
@@ -209,7 +349,7 @@ class TracePreprocessor:
         return mean, ci[0], ci[1]
 
     def get_parsed_results(self, benchmark_name, aggregate=True):
-        with self.get_conn() as conn:
+        with self.get_conn(benchmark_name) as conn:
             query = '''
                 SELECT * FROM parsed_results
                 WHERE benchmark_name = ?
@@ -217,21 +357,44 @@ class TracePreprocessor:
             '''
             df = pd.read_sql_query(query, conn, params=(benchmark_name,))
 
-            # Load verified agents
-            verified_agents = self.load_verified_agents()
+            # Load metadata
+            with open('agents_metadata.yaml', 'r') as f:
+                metadata = yaml.safe_load(f)
+
+            # Create URL mapping
+            url_mapping = {}
+            if benchmark_name in metadata:
+                for agent in metadata[benchmark_name]:
+                    if 'url' in agent and agent['url']:  # Only add if URL exists and is not empty
+                        url_mapping[agent['agent_name']] = agent['url']
 
             # Add 'Verified' column
+            verified_agents = self.load_verified_agents()
             df['Verified'] = df.apply(lambda row: '✓' if (benchmark_name, row['agent_name']) in verified_agents else '', axis=1)
 
+            # Add URLs to agent names if they exist
+            df['agent_name'] = df['agent_name'].apply(
+                lambda x: f'[{x}]({url_mapping[x]})' if x in url_mapping else x
+            )
 
             # Add column for how many times an agent_name appears in the DataFrame
             df['Runs'] = df.groupby('agent_name')['agent_name'].transform('count')
 
             # Compute the 95% confidence interval for accuracy and cost for agents that have been run more than once
-            df['acc_ci'] = None
+            df['accuracy_ci'] = None
             df['cost_ci'] = None
 
+            # Round float columns to 2 decimal places
+            float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc',
+                             'overall_score', 'vectorization_score', 'fathomnet_score', 'feedback_score',
+                             'house_price_score', 'spaceship_titanic_score',
+                             'amp_parkinsons_disease_progression_prediction_score', 'cifar10_score',
+                             'imdb_score', 'level_1_accuracy', 'level_2_accuracy', 'level_3_accuracy']
+
+            for column in float_columns:
+                if column in df.columns:
+                    df[column] = df[column].round(2)
+
             for agent_name in df['agent_name'].unique():
                 agent_df = df[df['agent_name'] == agent_name]
 
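Note: a small illustration (with an assumed mapping) of the markdown-link transform added above. The linked form is also why `get_parsed_results_with_costs` later re-extracts the bare name with `split('[')[1].split(']')[0]`:

```python
# Sketch: a known URL turns an agent name into a markdown link (assumed mapping).
url_mapping = {'Moatless (gpt-4o-2024-08-06)': 'https://github.com/aorwall/moatless-tools'}
name = 'Moatless (gpt-4o-2024-08-06)'
linked = f'[{name}]({url_mapping[name]})' if name in url_mapping else name
print(linked)  # [Moatless (gpt-4o-2024-08-06)](https://github.com/aorwall/moatless-tools)
```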
@@ -239,86 +402,28 @@ class TracePreprocessor:
                 accuracy_mean, accuracy_lower, accuracy_upper = self._calculate_ci(agent_df['accuracy'], type='minmax')
                 cost_mean, cost_lower, cost_upper = self._calculate_ci(agent_df['total_cost'], type='minmax')
 
-                accuracy_ci = f"-{abs(accuracy_mean - accuracy_lower):.3f}/+{abs(accuracy_mean - accuracy_upper):.3f}"
-                cost_ci = f"-{abs(cost_mean - cost_lower):.3f}/+{abs(cost_mean - cost_upper):.3f}"
+                # Round CI values to 2 decimals
+                accuracy_ci = f"-{abs(accuracy_mean - accuracy_lower):.2f}/+{abs(accuracy_mean - accuracy_upper):.2f}"
+                cost_ci = f"-{abs(cost_mean - cost_lower):.2f}/+{abs(cost_mean - cost_upper):.2f}"
 
-                df.loc[df['agent_name'] == agent_name, 'acc_ci'] = accuracy_ci
+                df.loc[df['agent_name'] == agent_name, 'accuracy_ci'] = accuracy_ci
                 df.loc[df['agent_name'] == agent_name, 'cost_ci'] = cost_ci
 
             df = df.drop(columns=['successful_tasks', 'failed_tasks', 'run_id'], axis=1)
 
             if aggregate:
-                df = df.groupby('agent_name').agg({
-                    'date': 'first',
-                    'total_cost': 'mean',
-                    'accuracy': 'mean',
-                    'precision': 'mean',
-                    'recall': 'mean',
-                    'f1_score': 'mean',
-                    'auc': 'mean',
-                    'overall_score': 'mean',
-                    'vectorization_score': 'mean',
-                    'fathomnet_score': 'mean',
-                    'feedback_score': 'mean',
-                    'house_price_score': 'mean',
-                    'spaceship_titanic_score': 'mean',
-                    'amp_parkinsons_disease_progression_prediction_score': 'mean',
-                    'cifar10_score': 'mean',
-                    'imdb_score': 'mean',
-                    'level_1_accuracy': 'mean',
-                    'level_2_accuracy': 'mean',
-                    'level_3_accuracy': 'mean',
-                    'Verified': 'first',
-                    'Runs': 'first',
-                    'acc_ci': 'first',
-                    'cost_ci': 'first'
-                }).reset_index()
-
-                # Round float columns to 3 decimal places
-                float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc', 'overall_score', 'vectorization_score', 'fathomnet_score', 'feedback_score', 'house-price_score', 'spaceship-titanic_score', 'amp-parkinsons-disease-progression-prediction_score', 'cifar10_score', 'imdb_score', 'level_1_accuracy', 'level_2_accuracy', 'level_3_accuracy']
-                for column in float_columns:
-                    if column in df.columns:
-                        df[column] = df[column].round(3)
-
-                # sort by accuracy
-                df = df.sort_values('accuracy', ascending=False)
+                df = df.groupby('agent_name').agg(AGGREGATION_RULES).reset_index()
 
-                # Rename columns
-                df = df.rename(columns={
-                    'agent_name': 'Agent Name',
-                    'date': 'Date',
-                    'total_cost': 'Total Cost',
-                    'accuracy': 'Accuracy',
-                    'precision': 'Precision',
-                    'recall': 'Recall',
-                    'f1_score': 'F1 Score',
-                    'auc': 'AUC',
-                    'overall_score': 'Overall Score',
-                    'vectorization_score': 'Vectorization Score',
-                    'fathomnet_score': 'Fathomnet Score',
-                    'feedback_score': 'Feedback Score',
-                    'house_price_score': 'House Price Score',
-                    'spaceship_titanic_score': 'Spaceship Titanic Score',
-                    'amp_parkinsons_disease_progression_prediction_score': 'AMP Parkinsons Disease Progression Prediction Score',
-                    'cifar10_score': 'CIFAR10 Score',
-                    'imdb_score': 'IMDB Score',
-                    'level_1_accuracy': 'Level 1 Accuracy',
-                    'level_2_accuracy': 'Level 2 Accuracy',
-                    'level_3_accuracy': 'Level 3 Accuracy',
-                    'acc_ci': 'Accuracy CI',
-                    'cost_ci': 'Total Cost CI'
-                })
+            # Rename columns using the display names mapping
+            df = df.rename(columns=COLUMN_DISPLAY_NAMES)
+
+            # Sort by Accuracy in descending order
+            df = df.sort_values('Accuracy', ascending=False)
 
             return df
 
     def get_task_success_data(self, benchmark_name):
-        with self.get_conn() as conn:
+        with self.get_conn(benchmark_name) as conn:
             query = '''
                 SELECT agent_name, accuracy, successful_tasks, failed_tasks
                 FROM parsed_results
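Note: the CI strings built above are asymmetric offsets around the mean, stored as text; with made-up numbers:

```python
# Sketch: the "-lower/+upper" CI string format used above (illustrative values).
mean, lower, upper = 0.30, 0.27, 0.35
accuracy_ci = f"-{abs(mean - lower):.2f}/+{abs(mean - upper):.2f}"
print(accuracy_ci)  # -0.03/+0.05
```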
@@ -359,17 +464,154 @@ class TracePreprocessor:
 
             return df
 
-    def load_verified_agents(self, file_path='verified_agents.yaml'):
+    def load_verified_agents(self, file_path='agents_metadata.yaml'):
         with open(file_path, 'r') as f:
-            verified_data = yaml.safe_load(f)
+            metadata = yaml.safe_load(f)
 
         verified_agents = set()
-        for benchmark, agents in verified_data.items():
+        for benchmark, agents in metadata.items():
             for agent in agents:
-                verified_agents.add((benchmark, agent['agent_name']))
+                if 'verification_date' in agent:  # Only add if verified
+                    verified_agents.add((benchmark, agent['agent_name']))
 
         return verified_agents
 
+    def get_token_usage_with_costs(self, benchmark_name, pricing_config=None):
+        """Get token usage data with configurable pricing"""
+        if pricing_config is None:
+            pricing_config = DEFAULT_PRICING
+
+        with self.get_conn(benchmark_name) as conn:
+            query = '''
+                SELECT agent_name, model_name,
+                       SUM(prompt_tokens) as prompt_tokens,
+                       SUM(completion_tokens) as completion_tokens,
+                       SUM(input_tokens) as input_tokens,
+                       SUM(output_tokens) as output_tokens,
+                       SUM(total_tokens) as total_tokens,
+                       SUM(input_tokens_cache_write) as input_tokens_cache_write,
+                       SUM(input_tokens_cache_read) as input_tokens_cache_read
+                FROM token_usage
+                WHERE benchmark_name = ?
+                GROUP BY agent_name, model_name
+            '''
+            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
+
+        # Calculate costs based on pricing config (prices are per 1M tokens)
+        df['total_cost'] = 0.0
+        for model, prices in pricing_config.items():
+            mask = df['model_name'] == model
+            df.loc[mask, 'total_cost'] = (
+                df.loc[mask, 'input_tokens'] * prices['prompt_tokens'] / 1e6 +
+                df.loc[mask, 'output_tokens'] * prices['completion_tokens'] / 1e6 +
+                df.loc[mask, 'input_tokens_cache_read'] * prices['prompt_tokens'] / 1e6 +
+                df.loc[mask, 'input_tokens_cache_write'] * prices['prompt_tokens'] / 1e6 +
+                df.loc[mask, 'prompt_tokens'] * prices['prompt_tokens'] / 1e6 +
+                df.loc[mask, 'completion_tokens'] * prices['completion_tokens'] / 1e6
+            )
+
+        return df
+
+    def get_parsed_results_with_costs(self, benchmark_name, pricing_config=None, aggregate=True):
+        """Get parsed results with recalculated costs based on token usage"""
+        # Get base results
+        results_df = self.get_parsed_results(benchmark_name, aggregate=False)
+
+        # Get token usage with new costs
+        token_costs = self.get_token_usage_with_costs(benchmark_name, pricing_config)
+
+        # Group token costs by agent
+        agent_costs = token_costs.groupby('agent_name')['total_cost'].sum().reset_index()
+
+        agent_costs = agent_costs.rename(columns={
+            'agent_name': 'agent_name_temp',
+            'total_cost': 'Total Cost'
+        })
+
+        # Drop existing Total Cost column if it exists
+        if 'Total Cost' in results_df.columns:
+            results_df = results_df.drop('Total Cost', axis=1)
+
+        # create temp column that is whatever is in agent_name [x] because of url we added to agent_name
+        results_df['agent_name_temp'] = results_df['Agent Name'].apply(lambda x: x.split('[')[1].split(']')[0])
+
+        # Update costs in results
+        results_df = results_df.merge(agent_costs, on='agent_name_temp', how='left')
+
+        # Drop temp column
+        results_df = results_df.drop('agent_name_temp', axis=1)
+
+        # Fill any missing costs with 0
+        # results_df['Total Cost'] = results_df['Total Cost'].fillna(0)
+
+        if aggregate:
+            # Aggregate results
+            results_df = results_df.groupby('Agent Name').agg({
+                'Date': 'first',
+                'Total Cost': 'mean',
+                'Accuracy': 'mean',
+                'Precision': 'mean',
+                'Recall': 'mean',
+                'F1 Score': 'mean',
+                'AUC': 'mean',
+                'Overall Score': 'mean',
+                'Vectorization Score': 'mean',
+                'Fathomnet Score': 'mean',
+                'Feedback Score': 'mean',
+                'House Price Score': 'mean',
+                'Spaceship Titanic Score': 'mean',
+                'AMP Parkinsons Disease Progression Prediction Score': 'mean',
+                'CIFAR10 Score': 'mean',
+                'IMDB Score': 'mean',
+                'Level 1 Accuracy': 'mean',
+                'Level 2 Accuracy': 'mean',
+                'Level 3 Accuracy': 'mean',
+                'Verified': 'first',
+                'Runs': 'first',
+                'Accuracy CI': 'first',
+                'Total Cost CI': 'first'
+            }).reset_index()
+
+            # Round the cost values
+            results_df['Total Cost'] = results_df['Total Cost'].round(3)
+
+            # Sort by Accuracy in descending order
+            results_df = results_df.sort_values('Accuracy', ascending=False)
+
+        return results_df
+
+    def check_token_usage_data(self, benchmark_name):
+        """Debug helper to check token usage data"""
+        with self.get_conn(benchmark_name) as conn:
+            query = '''
+                SELECT * FROM token_usage
+                WHERE benchmark_name = ?
+            '''
+            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
+            return df
+
+    def get_models_for_benchmark(self, benchmark_name):
+        """Get list of unique model names used in a benchmark"""
+        with self.get_conn(benchmark_name) as conn:
+            query = '''
+                SELECT DISTINCT model_name
+                FROM token_usage
+                WHERE benchmark_name = ?
+            '''
+            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
+            return df['model_name'].tolist()
+
+    def get_all_agents(self, benchmark_name):
+        """Get list of all agent names for a benchmark"""
+        with self.get_conn(benchmark_name) as conn:
+            query = '''
+                SELECT DISTINCT agent_name
+                FROM parsed_results
+                WHERE benchmark_name = ?
+            '''
+            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
+            return df['agent_name'].tolist()
+
 if __name__ == '__main__':
     preprocessor = TracePreprocessor()
     preprocessor.preprocess_traces()
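Note: the dynamic pricing above is linear in token counts, with prices quoted per 1M tokens; the real method sums the prompt/completion, input/output, and cache buckets with the same formula. A worked example with assumed token counts:

```python
# Sketch: recomputing a run's cost under DEFAULT_PRICING (per-1M-token prices).
prices = {"prompt_tokens": 3, "completion_tokens": 15}  # claude-3-5-sonnet-20241022

input_tokens, output_tokens = 1_200_000, 300_000  # illustrative usage
cost = (input_tokens * prices["prompt_tokens"] / 1e6 +
        output_tokens * prices["completion_tokens"] / 1e6)
print(cost)  # 1.2 * 3 + 0.3 * 15 = 8.1 dollars
```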
utils/viz.py
CHANGED
@@ -19,7 +19,7 @@ def create_leaderboard(df, ci_metrics = None):
     # for rows in the df for which CI metric is not None, join the metric and CI columns by looping through the CI metrics columns
     for i, row in df.iterrows():
         if str(row[CI_metric]) != 'None':
-            df.at[i, metric] = str(row[metric]) + " (" + str(row[CI_metric]) + ")"
+            df.at[i, metric] = str(round(float(row[metric]), 2)) + " (" + str(row[CI_metric]) + ")"
 
     return df
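Note: with this change a leaderboard cell is the rounded metric joined with its CI string; with illustrative values:

```python
# Sketch: the metric/CI join after the change above.
metric_value, ci = 0.30456, "-0.03/+0.05"
cell = str(round(float(metric_value), 2)) + " (" + ci + ")"
print(cell)  # 0.3 (-0.03/+0.05)
```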
verified_agents.yaml
DELETED
@@ -1,129 +0,0 @@
-# This file contains information about verified agent results for different benchmarks.
-# Format:
-# benchmark_name:
-#   - agent_name: "Name of the agent"
-#     verification_date: YYYY-MM-DD
-
-usaco:
-  - agent_name: "USACO Reflexion + Episodic (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-20
-  - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-20
-  - agent_name: "USACO Reflexion (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-20
-  - agent_name: "USACO Episodic (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-12
-  - agent_name: "USACO Reflexion + Semantic (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-20
-  - agent_name: "USACO Zero-shot (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-11
-  - agent_name: "USACO Semantic (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-12
-  - agent_name: USACO Reflexion + Episodic + Semantic (gpt-4o-2024-05-13)
-    verification_date: 2024-08-25
-  - agent_name: USACO Reflexion + Episodic (gpt-4o-2024-05-13)
-    verification_date: 2024-08-25
-  - agent_name: USACO Reflexion + Semantic (gpt-4o-2024-05-13)
-    verification_date: 2024-08-25
-  - agent_name: Episodic Retrial (2x) (gpt-4o-2024-05-13)
-    verification_date: 2024-08-25
-  - agent_name: Episodic Retrial (3x) (gpt-4o-mini-2024-07-18)
-    verification_date: 2024-08-25
-  - agent_name: Episodic Retrial (2x) (gpt-4o-mini-2024-07-18)
-    verification_date: 2024-08-25
-  - agent_name: Episodic Retrial (5x) (gpt-4o-mini-2024-07-18)
-    verification_date: 2024-08-25
-  - agent_name: Episodic Warming (3 Steps) (gpt-4o-mini-2024-07-18)
-    verification_date: 2024-08-24
-  - agent_name: USACO Episodic (gpt-4o-2024-05-13)
-    verification_date: 2024-08-24
-  - agent_name: USACO Semantic (gpt-4o-2024-05-13)
-    verification_date: 2024-08-24
-  - agent_name: Zero-shot Retrial (2x) (gpt-4o-mini-2024-07-18)
-    verification_date: 2024-08-24
-  - agent_name: Zero-shot Retrial (3x) (gpt-4o-mini-2024-07-18)
-    verification_date: 2024-08-24
-  - agent_name: Zero-shot Retrial (5x) (gpt-4o-mini-2024-07-18)
-    verification_date: 2024-08-24
-  - agent_name: USACO Zero-shot (gpt-4o-2024-05-13)
-    verification_date: 2024-08-24
-
-swebench_verified_mini:
-  - agent_name: "Agentless (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-17
-  - agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1)"
-    verification_date: 2024-08-19
-  - agent_name: "Moatless (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-10-30
-  - agent_name: "Moatless (gpt-4o-2024-08-06)"
-    verification_date: 2024-10-30
-  - agent_name: "Moatless (claude-3-5-sonnet-20241022)"
-    verification_date: 2024-10-30
-  - agent_name: "Agentless (o1-mini-2024-09-12)"
-    verification_date: 2024-10-30
-
-
-swebench_verified:
-  - agent_name: "Moatless (gpt-4o-2024-08-06)"
-    verification_date: 2024-10-30
-  - agent_name: "Agentless (o1-mini-2024-09-12)"
-    verification_date: 2024-10-30
-
-mlagentbench:
-  - agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-19
-
-
-corebench_easy:
-  - agent_name: "AutoGPT (GPT-4o)"
-    verification_date: 2024-09-28
-  - agent_name: "AutoGPT (GPT-4o-mini)"
-    verification_date: 2024-09-28
-  - agent_name: "CORE-Agent (GPT-4o)"
-    verification_date: 2024-09-28
-  - agent_name: "CORE-Agent (GPT-4o-mini)"
-    verification_date: 2024-09-28
-
-corebench_medium:
-  - agent_name: "AutoGPT (GPT-4o)"
-    verification_date: 2024-09-28
-  - agent_name: "AutoGPT (GPT-4o-mini)"
-    verification_date: 2024-09-28
-  - agent_name: "CORE-Agent (GPT-4o)"
-    verification_date: 2024-09-28
-  - agent_name: "CORE-Agent (GPT-4o-mini)"
-    verification_date: 2024-09-28
-
-corebench_hard:
-  - agent_name: "AutoGPT (GPT-4o)"
-    verification_date: 2024-09-28
-  - agent_name: "AutoGPT (GPT-4o-mini)"
-    verification_date: 2024-09-28
-  - agent_name: "CORE-Agent (GPT-4o)"
-    verification_date: 2024-09-28
-  - agent_name: "CORE-Agent (GPT-4o-mini)"
-    verification_date: 2024-09-28
-
-gaia:
-  - agent_name: "Inspect Default Agent (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-11-30
-  - agent_name: "Inspect Default Agent (gpt-4o-2024-11-20)"
-    verification_date: 2024-11-30
-  - agent_name: "Inspect Default Agent (claude-3-5-sonnet-20241022)"
-    verification_date: 2024-11-30
-  - agent_name: "Inspect Default Agent (Meta-Llama-3.1-405B-Instruct-Turbo)"
-    verification_date: 2024-11-30
-
-cybench:
-  - agent_name: "Inspect Default Agent (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-11-30
-  - agent_name: "Inspect Default Agent (gpt-4o-2024-11-20)"
-    verification_date: 2024-11-30
-  - agent_name: "Inspect Default Agent (claude-3-5-sonnet-20241022)"
-    verification_date: 2024-11-30
-  - agent_name: "Inspect Default Agent (o1-mini-2024-09-12)"
-    verification_date: 2024-11-30
-  - agent_name: "Inspect Default Agent (Meta-Llama-3.1-405B-Instruct-Turbo)"
-    verification_date: 2024-11-30