Spaces:

agent-evals
/

leaderboard

Running

App Files Files Community

benediktstroebl commited on Dec 3, 2024

Commit

56a86ce

1 Parent(s): 3a9a3a3

big update with dynamic pricing, agent metadata, about page on top, and new benchmarks

Browse files

Files changed (8) hide show

.gitignore +2 -0
agents_metadata.yaml +272 -0
app.py +0 -0
config.py +1 -0
css.css +4 -0
utils/db.py +395 -153
utils/viz.py +1 -1
verified_agents.yaml +0 -129

.gitignore CHANGED Viewed

@@ -3,3 +3,5 @@
 evals_upload/*
 evals_live/*
 evals_processed/*

 evals_upload/*
 evals_live/*
 evals_processed/*
+*.db
+.env

agents_metadata.yaml ADDED Viewed

	@@ -0,0 +1,272 @@

+# This file contains metadata about agents for different benchmarks.
+# Format:
+#   benchmark_name:
+#     - agent_name: "Name of the agent"
+#       verification_date: YYYY-MM-DD  # Optional verification date
+#       url: "https://..."  # Optional link to agent code/paper
+gaia:
+  - agent_name: "Inspect ReAct Agent (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
+  - agent_name: "Inspect ReAct Agent (gpt-4o-2024-11-20)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
+  - agent_name: "Inspect ReAct Agent (claude-3-5-sonnet-20241022)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
+  - agent_name: "Inspect ReAct Agent (Meta-Llama-3.1-405B-Instruct-Turbo)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
+  - agent_name: "Inspect ReAct Agent (o1-mini-2024-09-12)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
+cybench:
+  - agent_name: "Inspect ReAct Agent (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/cybench"
+  - agent_name: "Inspect ReAct Agent (gpt-4o-2024-11-20)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/cybench"
+  - agent_name: "Inspect ReAct Agent (claude-3-5-sonnet-20241022)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/cybench"
+  - agent_name: "Inspect ReAct Agent (o1-mini-2024-09-12)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/cybench"
+  - agent_name: "Inspect ReAct Agent (Meta-Llama-3.1-405B-Instruct-Turbo)"
+    verification_date: 2024-11-30
+    url: "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/cybench"
+swebench_verified:
+  - agent_name: "Moatless (gpt-4o-2024-08-06)"
+    verification_date: 2024-10-30
+    url: "https://github.com/aorwall/moatless-tools"
+  - agent_name: "Agentless (o1-mini-2024-09-12)"
+    verification_date: 2024-10-30
+    url: "https://github.com/OpenAutoCoder/Agentless"
+  - agent_name: "Moatless (claude-3-5-sonnet-20241022)"
+    verification_date: 2024-11-30
+    url: "https://github.com/aorwall/moatless-tools"
+  - agent_name: "Agentless (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-11-30
+    url: "https://github.com/OpenAutoCoder/Agentless"
+swebench_verified_mini:
+  - agent_name: "Agentless (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-17
+    url: "https://github.com/OpenAutoCoder/Agentless"
+  - agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1)"
+    verification_date: 2024-08-19
+    url: "https://github.com/princeton-nlp/SWE-agent"
+  - agent_name: "Moatless (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-10-30
+    url: "https://github.com/aorwall/moatless-tools"
+  - agent_name: "Moatless (gpt-4o-2024-08-06)"
+    verification_date: 2024-10-30
+    url: "https://github.com/aorwall/moatless-tools"
+  - agent_name: "Moatless (claude-3-5-sonnet-20241022)"
+    verification_date: 2024-10-30
+    url: "https://github.com/aorwall/moatless-tools"
+  - agent_name: "Agentless (o1-mini-2024-09-12)"
+    verification_date: 2024-10-30
+    url: "https://github.com/OpenAutoCoder/Agentless"
+corebench_easy:
+  - agent_name: "AutoGPT (GPT-4o)"
+    verification_date: 2024-09-28
+    url: "https://github.com/princeton-nlp/SWE-bench"
+  - agent_name: "AutoGPT (GPT-4o-mini)"
+    verification_date: 2024-09-28
+    url: "https://github.com/princeton-nlp/SWE-bench"
+  - agent_name: "CORE-Agent (GPT-4o)"
+    verification_date: 2024-09-28
+    url: "https://github.com/princeton-nlp/SWE-bench"
+  - agent_name: "CORE-Agent (GPT-4o-mini)"
+    verification_date: 2024-09-28
+    url: "https://github.com/princeton-nlp/SWE-bench"
+corebench_medium:
+  - agent_name: "AutoGPT (GPT-4o)"
+    verification_date: 2024-09-28
+    url: "https://github.com/Significant-Gravitas/AutoGPT"
+  - agent_name: "AutoGPT (GPT-4o-mini)"
+    verification_date: 2024-09-28
+    url: "https://github.com/Significant-Gravitas/AutoGPT"
+  - agent_name: "CORE-Agent (GPT-4o)"
+    verification_date: 2024-09-28
+    url: "https://github.com/siegelz/core-bench"
+  - agent_name: "CORE-Agent (GPT-4o-mini)"
+    verification_date: 2024-09-28
+    url: "https://github.com/siegelz/core-bench"
+corebench_hard:
+  - agent_name: "AutoGPT (GPT-4o)"
+    verification_date: 2024-09-28
+    url: "https://github.com/Significant-Gravitas/AutoGPT"
+  - agent_name: "AutoGPT (GPT-4o-mini)"
+    verification_date: 2024-09-28
+    url: "https://github.com/Significant-Gravitas/AutoGPT"
+  - agent_name: "CORE-Agent (GPT-4o)"
+    verification_date: 2024-09-28
+    url: "https://github.com/siegelz/core-bench"
+  - agent_name: "CORE-Agent (GPT-4o-mini)"
+    verification_date: 2024-09-28
+    url: "https://github.com/siegelz/core-bench"
+usaco:
+  - agent_name: "USACO Reflexion + Episodic (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-20
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-20
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Reflexion (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-20
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Episodic (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-12
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Reflexion + Semantic (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-20
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Zero-shot (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-11
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Semantic (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-12
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-2024-05-13)"
+    verification_date: 2024-08-25
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Reflexion + Episodic (gpt-4o-2024-05-13)"
+    verification_date: 2024-08-25
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Reflexion + Semantic (gpt-4o-2024-05-13)"
+    verification_date: 2024-08-25
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "Episodic Retrial (2x) (gpt-4o-2024-05-13)"
+    verification_date: 2024-08-25
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "Episodic Retrial (3x) (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-25
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "Episodic Retrial (2x) (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-25
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "Episodic Retrial (5x) (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-25
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "Episodic Warming (3 Steps) (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-24
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Episodic (gpt-4o-2024-05-13)"
+    verification_date: 2024-08-24
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Semantic (gpt-4o-2024-05-13)"
+    verification_date: 2024-08-24
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "Zero-shot Retrial (2x) (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-24
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "Zero-shot Retrial (3x) (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-24
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "Zero-shot Retrial (5x) (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-08-24
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Zero-shot (gpt-4o-2024-05-13)"
+    verification_date: 2024-08-24
+    url: "https://github.com/princeton-nlp/USACO"
+  - agent_name: "USACO Zero-Shot (claude-3-5-sonnet-20241022)"
+    verification_date: 2024-11-30
+    url: "https://github.com/princeton-nlp/USACO"
+appworld_test_normal:
+  - agent_name: "ReAct (gpt-4o-2024-05-13)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "PlanExec (gpt-4o-2024-05-13)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "FullCodeRefl (gpt-4o-2024-05-13)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "PlanExec (gpt-4-turbo-2024-04-09)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "IPFunCall (gpt-4o-2024-05-13)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "IPFunCall (gpt-4-turbo-2024-04-09)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "ReAct (gpt-4-turbo-2024-04-09)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "FullCodeRefl (gpt-4-turbo-2024-04-09)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "FullCodeRefl (meta-llama/Llama-3-70b-chat-hf)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "ReAct (meta-llama/Llama-3-70b-chat-hf)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "FullCodeRefl (deepseek-ai/deepseek-coder-33b-instruct)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "PlanExec (meta-llama/Llama-3-70b-chat-hf)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "ReAct (deepseek-ai/deepseek-coder-33b-instruct)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "PlanExec (deepseek-ai/deepseek-coder-33b-instruct)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+appworld_test_challenge:
+  - agent_name: "ReAct (gpt-4o-2024-05-13)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "PlanExec (gpt-4o-2024-05-13)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "FullCodeRefl (gpt-4o-2024-05-13)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "PlanExec (gpt-4-turbo-2024-04-09)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "IPFunCall (gpt-4o-2024-05-13)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "IPFunCall (gpt-4-turbo-2024-04-09)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "ReAct (gpt-4-turbo-2024-04-09)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "FullCodeRefl (gpt-4-turbo-2024-04-09)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "FullCodeRefl (meta-llama/Llama-3-70b-chat-hf)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "ReAct (meta-llama/Llama-3-70b-chat-hf)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "FullCodeRefl (deepseek-ai/deepseek-coder-33b-instruct)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "PlanExec (meta-llama/Llama-3-70b-chat-hf)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "ReAct (deepseek-ai/deepseek-coder-33b-instruct)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"
+  - agent_name: "PlanExec (deepseek-ai/deepseek-coder-33b-instruct)"
+    verification_date: 2024-12-03
+    url: "https://appworld.dev/"

app.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

config.py CHANGED Viewed

@@ -69,6 +69,7 @@ APPWORLD_ON_LOAD_COLUMNS = [
     "Accuracy",
     "Total Cost",
     "Runs",
 ]
 APPWORLD_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
 APPWORLD_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score', "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]

     "Accuracy",
     "Total Cost",
     "Runs",
+    "Scenario Goal Completion"
 ]
 APPWORLD_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
 APPWORLD_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score', "Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy"]

css.css CHANGED Viewed

@@ -25,6 +25,10 @@ cursor: pointer;
 transition: background-color 0.3s;
 }
 .tab-nav button:hover,
 .tab-nav button.selected {
 background-color: var(--primary-color);

 transition: background-color 0.3s;
 }
+a {
+	color: #000;
+}
 .tab-nav button:hover,
 .tab-nav button.selected {
 background-color: var(--primary-color);

utils/db.py CHANGED Viewed

@@ -10,18 +10,162 @@ from scipy import stats
 import yaml
 import numpy as np
 class TracePreprocessor:
-    def __init__(self, db_path='preprocessed_traces.db'):
-        self.db_path = db_path
         self.local = threading.local()
-    def get_conn(self):
-        if not hasattr(self.local, 'conn'):
-            self.local.conn = sqlite3.connect(self.db_path)
-        return self.local.conn
-    def create_tables(self):
-        with self.get_conn() as conn:
             conn.execute('''
                 CREATE TABLE IF NOT EXISTS preprocessed_traces (
                     benchmark_name TEXT,
@@ -43,37 +187,23 @@ class TracePreprocessor:
                 )
             ''')
             conn.execute('''
-                CREATE TABLE IF NOT EXISTS parsed_results (
                     benchmark_name TEXT,
                     agent_name TEXT,
-                    date TEXT,
                     run_id TEXT,
-                    successful_tasks TEXT,
-                    failed_tasks TEXT,
-                    total_cost REAL,
-                    accuracy REAL,
-                    precision REAL,
-                    recall REAL,
-                    f1_score REAL,
-                    auc REAL,
-                    overall_score REAL,
-                    vectorization_score REAL,
-                    fathomnet_score REAL,
-                    feedback_score REAL,
-                    house_price_score REAL,
-                    spaceship_titanic_score REAL,
-                    amp_parkinsons_disease_progression_prediction_score REAL,
-                    cifar10_score REAL,
-                    imdb_score REAL,
-                    level_1_accuracy REAL,
-                    level_2_accuracy REAL,
-                    level_3_accuracy REAL,
-                    PRIMARY KEY (benchmark_name, agent_name, run_id)
                 )
             ''')
     def preprocess_traces(self, processed_dir="evals_live"):
-        self.create_tables()
         processed_dir = Path(processed_dir)
         for file in processed_dir.glob('*.json'):
             with open(file, 'r') as f:
@@ -85,9 +215,12 @@ class TracePreprocessor:
                 date = data['config']['date']
                 config = data['config']
             try:
                 raw_logging_results = pickle.dumps(data['raw_logging_results'])
-                with self.get_conn() as conn:
                     conn.execute('''
                         INSERT OR REPLACE INTO preprocessed_traces
                         (benchmark_name, agent_name, date, run_id, raw_logging_results)
@@ -98,63 +231,76 @@ class TracePreprocessor:
             try:
                 failure_report = pickle.dumps(data['failure_report'])
-                with self.get_conn() as conn:
                     conn.execute('''
                         INSERT INTO failure_reports
                         (benchmark_name, agent_name, date, run_id, failure_report)
-                        VALUES (?, ?, ?, ? ,?)
                     ''', (benchmark_name, agent_name, date, config['run_id'], failure_report))
             except Exception as e:
                 print(f"Error preprocessing failure_report in {file}: {e}")
             try:
-                config = data['config']
                 results = data['results']
-                with self.get_conn() as conn:
-                    conn.execute('''
-                        INSERT INTO parsed_results
-                        (benchmark_name, agent_name, date, run_id, successful_tasks, failed_tasks, total_cost, accuracy, precision, recall, f1_score, auc, overall_score, vectorization_score, fathomnet_score, feedback_score, house_price_score, spaceship_titanic_score, amp_parkinsons_disease_progression_prediction_score, cifar10_score, imdb_score, level_1_accuracy, level_2_accuracy, level_3_accuracy)
-                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
-                    ''', (
                         benchmark_name,
                         agent_name,
                         config['date'],
-                        config['run_id'],
-                        str(results.get('successful_tasks')),
-                        str(results.get('failed_tasks')),
-                        results.get('total_cost'),
-                        results.get('accuracy'),
-                        results.get('precision'),
-                        results.get('recall'),
-                        results.get('f1_score'),
-                        results.get('auc'),
-                        results.get('overall_score'),
-                        results.get('vectorization_score'),
-                        results.get('fathomnet_score'),
-                        results.get('feedback_score'),
-                        results.get('house-price_score'),
-                        results.get('spaceship-titanic_score'),
-                        results.get('amp-parkinsons-disease-progression-prediction_score'),
-                        results.get('cifar10_score'),
-                        results.get('imdb_score'),
-                        results.get('level_1_accuracy'),
-                        results.get('level_2_accuracy'),
-                        results.get('level_3_accuracy')
-                    ))
             except Exception as e:
                 print(f"Error preprocessing parsed results in {file}: {e}")
     @lru_cache(maxsize=100)
     def get_analyzed_traces(self, agent_name, benchmark_name):
-        with self.get_conn() as conn:
             query = '''
                 SELECT agent_name, raw_logging_results, date FROM preprocessed_traces
                 WHERE benchmark_name = ? AND agent_name = ?
             '''
             df = pd.read_sql_query(query, conn, params=(benchmark_name, agent_name))
-        # check for each row if raw_logging_results is not None with pickle.loads because it is stored as a byte string
         df = df[df['raw_logging_results'].apply(lambda x: pickle.loads(x) is not None and x != 'None')]
         if len(df) == 0:
@@ -163,32 +309,26 @@ class TracePreprocessor:
         # select latest run
         df = df.sort_values('date', ascending=False).groupby('agent_name').first().reset_index()
         return pickle.loads(df['raw_logging_results'][0])
     @lru_cache(maxsize=100)
     def get_failure_report(self, agent_name, benchmark_name):
-        with self.get_conn() as conn:
             query = '''
                 SELECT agent_name, date, failure_report FROM failure_reports
                 WHERE benchmark_name = ? AND agent_name = ?
             '''
             df = pd.read_sql_query(query, conn, params=(benchmark_name, agent_name))
-        # Select only rows for which failure report is not None and None is a string
         df = df[df['failure_report'].apply(lambda x: pickle.loads(x) is not None and x != 'None')]
         if len(df) == 0:
             return None
-        # if there is multiple failure reports, take the last one
         df = df.sort_values('date', ascending=False).groupby('agent_name').first().reset_index()
-        # if there is a failure report, return the first one
         return pickle.loads(df['failure_report'][0])
     def _calculate_ci(self, data, confidence=0.95, type='minmax'):
         data = data[np.isfinite(data)]
@@ -209,7 +349,7 @@ class TracePreprocessor:
         return mean, ci[0], ci[1]
     def get_parsed_results(self, benchmark_name, aggregate=True):
-        with self.get_conn() as conn:
             query = '''
                 SELECT * FROM parsed_results
                 WHERE benchmark_name = ?
@@ -217,21 +357,44 @@ class TracePreprocessor:
             '''
             df = pd.read_sql_query(query, conn, params=(benchmark_name,))
-        # Load verified agents
-        verified_agents = self.load_verified_agents()
         # Add 'Verified' column
         df['Verified'] = df.apply(lambda row: '✓' if (benchmark_name, row['agent_name']) in verified_agents else '', axis=1)
         # Add column for how many times an agent_name appears in the DataFrame
         df['Runs'] = df.groupby('agent_name')['agent_name'].transform('count')
         # Compute the 95% confidence interval for accuracy and cost for agents that have been run more than once
-        df['acc_ci'] = None
         df['cost_ci'] = None
         for agent_name in df['agent_name'].unique():
             agent_df = df[df['agent_name'] == agent_name]
@@ -239,86 +402,28 @@ class TracePreprocessor:
                 accuracy_mean, accuracy_lower, accuracy_upper = self._calculate_ci(agent_df['accuracy'], type='minmax')
                 cost_mean, cost_lower, cost_upper = self._calculate_ci(agent_df['total_cost'], type='minmax')
-                # format the confidence interval with +/- sign
-                # accuracy_ci = f"± {abs(accuracy_mean - accuracy_lower):.3f}"
-                # cost_ci = f"± {abs(cost_mean - cost_lower):.3f}"
-                accuracy_ci = f"-{abs(accuracy_mean - accuracy_lower):.3f}/+{abs(accuracy_mean - accuracy_upper):.3f}"
-                cost_ci = f"-{abs(cost_mean - cost_lower):.3f}/+{abs(cost_mean - cost_upper):.3f}"
-                df.loc[df['agent_name'] == agent_name, 'acc_ci'] = accuracy_ci
                 df.loc[df['agent_name'] == agent_name, 'cost_ci'] = cost_ci
         df = df.drop(columns=['successful_tasks', 'failed_tasks', 'run_id'], axis=1)
         if aggregate:
-            # For agents that have been run more than once, compute the average accuracy and cost and use that as the value in the DataFrame
-            df = df.groupby('agent_name').agg({
-                'date': 'first',
-                'total_cost': 'mean',
-                'accuracy': 'mean',
-                'precision': 'mean',
-                'recall': 'mean',
-                'f1_score': 'mean',
-                'auc': 'mean',
-                'overall_score': 'mean',
-                'vectorization_score': 'mean',
-                'fathomnet_score': 'mean',
-                'feedback_score': 'mean',
-                'house_price_score': 'mean',
-                'spaceship_titanic_score': 'mean',
-                'amp_parkinsons_disease_progression_prediction_score': 'mean',
-                'cifar10_score': 'mean',
-                'imdb_score': 'mean',
-                'level_1_accuracy': 'mean',
-                'level_2_accuracy': 'mean',
-                'level_3_accuracy': 'mean',
-                'Verified': 'first',
-                'Runs': 'first',
-                'acc_ci': 'first',
-                'cost_ci': 'first'
-            }).reset_index()
-        # Round float columns to 3 decimal places
-        float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc', 'overall_score', 'vectorization_score', 'fathomnet_score', 'feedback_score', 'house-price_score', 'spaceship-titanic_score', 'amp-parkinsons-disease-progression-prediction_score', 'cifar10_score', 'imdb_score', 'level_1_accuracy', 'level_2_accuracy', 'level_3_accuracy']
-        for column in float_columns:
-            if column in df.columns:
-                df[column] = df[column].round(3)
-        # sort by accuracy
-        df = df.sort_values('accuracy', ascending=False)
-        # Rename columns
-        df = df.rename(columns={
-            'agent_name': 'Agent Name',
-            'date': 'Date',
-            'total_cost': 'Total Cost',
-            'accuracy': 'Accuracy',
-            'precision': 'Precision',
-            'recall': 'Recall',
-            'f1_score': 'F1 Score',
-            'auc': 'AUC',
-            'overall_score': 'Overall Score',
-            'vectorization_score': 'Vectorization Score',
-            'fathomnet_score': 'Fathomnet Score',
-            'feedback_score': 'Feedback Score',
-            'house_price_score': 'House Price Score',
-            'spaceship_titanic_score': 'Spaceship Titanic Score',
-            'amp_parkinsons_disease_progression_prediction_score': 'AMP Parkinsons Disease Progression Prediction Score',
-            'cifar10_score': 'CIFAR10 Score',
-            'imdb_score': 'IMDB Score',
-            'level_1_accuracy': 'Level 1 Accuracy',
-            'level_2_accuracy': 'Level 2 Accuracy',
-            'level_3_accuracy': 'Level 3 Accuracy',
-            'acc_ci': 'Accuracy CI',
-            'cost_ci': 'Total Cost CI'
-        })
         return df
     def get_task_success_data(self, benchmark_name):
-        with self.get_conn() as conn:
             query = '''
                 SELECT agent_name, accuracy, successful_tasks, failed_tasks
                 FROM parsed_results
@@ -359,17 +464,154 @@ class TracePreprocessor:
         return df
-    def load_verified_agents(self, file_path='verified_agents.yaml'):
         with open(file_path, 'r') as f:
-            verified_data = yaml.safe_load(f)
         verified_agents = set()
-        for benchmark, agents in verified_data.items():
             for agent in agents:
-                verified_agents.add((benchmark, agent['agent_name']))
         return verified_agents
 if __name__ == '__main__':
     preprocessor = TracePreprocessor()
     preprocessor.preprocess_traces()

 import yaml
 import numpy as np
+# Define column schemas
+PARSED_RESULTS_COLUMNS = {
+    'benchmark_name': 'TEXT',
+    'agent_name': 'TEXT',
+    'date': 'TEXT',
+    'run_id': 'TEXT',
+    'successful_tasks': 'TEXT',
+    'failed_tasks': 'TEXT',
+    'total_cost': 'REAL',
+    'accuracy': 'REAL',
+    'precision': 'REAL',
+    'recall': 'REAL',
+    'f1_score': 'REAL',
+    'auc': 'REAL',
+    'overall_score': 'REAL',
+    'vectorization_score': 'REAL',
+    'fathomnet_score': 'REAL',
+    'feedback_score': 'REAL',
+    'house_price_score': 'REAL',
+    'spaceship_titanic_score': 'REAL',
+    'amp_parkinsons_disease_progression_prediction_score': 'REAL',
+    'cifar10_score': 'REAL',
+    'imdb_score': 'REAL',
+    'level_1_accuracy': 'REAL',
+    'level_2_accuracy': 'REAL',
+    'level_3_accuracy': 'REAL',
+    'task_goal_completion': 'REAL',  # New column
+    'scenario_goal_completion': 'REAL',  # New column
+    'accuracy_ci': 'TEXT',  # Using TEXT since it stores formatted strings like "-0.123/+0.456"
+    'cost_ci': 'TEXT',
+}
+# Define which columns should be included in aggregation and how
+AGGREGATION_RULES = {
+    'date': 'first',
+    'total_cost': 'mean',
+    'accuracy': 'mean',
+    'precision': 'mean',
+    'recall': 'mean',
+    'f1_score': 'mean',
+    'auc': 'mean',
+    'overall_score': 'mean',
+    'vectorization_score': 'mean',
+    'fathomnet_score': 'mean',
+    'feedback_score': 'mean',
+    'house_price_score': 'mean',
+    'spaceship_titanic_score': 'mean',
+    'amp_parkinsons_disease_progression_prediction_score': 'mean',
+    'cifar10_score': 'mean',
+    'imdb_score': 'mean',
+    'level_1_accuracy': 'mean',
+    'level_2_accuracy': 'mean',
+    'level_3_accuracy': 'mean',
+    'task_goal_completion': 'mean',
+    'scenario_goal_completion': 'mean',
+    'Verified': 'first',
+    'Runs': 'first',
+    'accuracy_ci': 'first',
+    'cost_ci': 'first',
+}
+# Define column display names
+COLUMN_DISPLAY_NAMES = {
+    'agent_name': 'Agent Name',
+    'date': 'Date',
+    'total_cost': 'Total Cost',
+    'accuracy': 'Accuracy',
+    'precision': 'Precision',
+    'recall': 'Recall',
+    'f1_score': 'F1 Score',
+    'auc': 'AUC',
+    'overall_score': 'Overall Score',
+    'vectorization_score': 'Vectorization Score',
+    'fathomnet_score': 'Fathomnet Score',
+    'feedback_score': 'Feedback Score',
+    'house_price_score': 'House Price Score',
+    'spaceship_titanic_score': 'Spaceship Titanic Score',
+    'amp_parkinsons_disease_progression_prediction_score': 'AMP Parkinsons Disease Progression Prediction Score',
+    'cifar10_score': 'CIFAR10 Score',
+    'imdb_score': 'IMDB Score',
+    'level_1_accuracy': 'Level 1 Accuracy',
+    'level_2_accuracy': 'Level 2 Accuracy',
+    'level_3_accuracy': 'Level 3 Accuracy',
+    'task_goal_completion': 'Task Goal Completion',
+    'scenario_goal_completion': 'Scenario Goal Completion',
+    'accuracy_ci': 'Accuracy CI',
+    'cost_ci': 'Total Cost CI',
+}
+DEFAULT_PRICING = {
+    "text-embedding-3-small": {"prompt_tokens": 0.02, "completion_tokens": 0},
+                "text-embedding-3-large": {"prompt_tokens": 0.13, "completion_tokens": 0},
+                "gpt-4o-2024-05-13": {"prompt_tokens": 2.5, "completion_tokens": 10},
+                "gpt-4o-2024-08-06": {"prompt_tokens": 2.5, "completion_tokens": 10},
+                "gpt-3.5-turbo-0125": {"prompt_tokens": 0.5, "completion_tokens": 1.5},
+                "gpt-3.5-turbo": {"prompt_tokens": 0.5, "completion_tokens": 1.5},
+                "gpt-4-turbo-2024-04-09": {"prompt_tokens": 10, "completion_tokens": 30},
+                "gpt-4-turbo": {"prompt_tokens": 10, "completion_tokens": 30},
+                "gpt-4o-mini-2024-07-18": {"prompt_tokens": 0.15, "completion_tokens": 0.6},
+                "meta-llama/Meta-Llama-3.1-8B-Instruct": {"prompt_tokens": 0.18, "completion_tokens": 0.18},
+                "meta-llama/Meta-Llama-3.1-70B-Instruct": {"prompt_tokens": 0.88, "completion_tokens": 0.88},
+                "meta-llama/Meta-Llama-3.1-405B-Instruct": {"prompt_tokens": 5, "completion_tokens": 15},
+                "gpt-4o": {"prompt_tokens": 2.5, "completion_tokens": 10},
+                "o1-mini-2024-09-12": {"prompt_tokens": 3, "completion_tokens": 12},
+                "o1-preview-2024-09-12": {"prompt_tokens": 15, "completion_tokens": 60},
+                "claude-3-5-sonnet-20240620": {"prompt_tokens": 3, "completion_tokens": 15},
+                "claude-3-5-sonnet-20241022": {"prompt_tokens": 3, "completion_tokens": 15},
+                "us.anthropic.claude-3-5-sonnet-20240620-v1:0": {"prompt_tokens": 3, "completion_tokens": 15},
+                "us.anthropic.claude-3-5-sonnet-20241022-v2:0": {"prompt_tokens": 3, "completion_tokens": 15},
+                "openai/gpt-4o-2024-11-20": {"prompt_tokens": 2.5, "completion_tokens": 10},
+                "openai/gpt-4o-2024-08-06": {"prompt_tokens": 2.5, "completion_tokens": 10},
+                "openai/gpt-4o-mini-2024-07-18": {"prompt_tokens": 0.15, "completion_tokens": 0.6},
+                "openai/o1-mini-2024-09-12": {"prompt_tokens": 3, "completion_tokens": 12},
+                "openai/o1-preview-2024-09-12": {"prompt_tokens": 15, "completion_tokens": 60},
+                "anthropic/claude-3-5-sonnet-20240620": {"prompt_tokens": 3, "completion_tokens": 15},
+                "anthropic/claude-3-5-sonnet-20241022": {"prompt_tokens": 3, "completion_tokens": 15},
+                "google/gemini-1.5-pro": {"prompt_tokens": 1.25, "completion_tokens": 5},
+                "google/gemini-1.5-flash": {"prompt_tokens": 0.075, "completion_tokens": 0.3},
+                "together/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo": {"prompt_tokens": 3.5, "completion_tokens": 3.5},
+                "together/meta-llama/Meta-Llama-3.1-70B-Instruct": {"prompt_tokens": 0.88, "completion_tokens": 0.88},
+}
 class TracePreprocessor:
+    def __init__(self, db_dir='preprocessed_traces'):
+        self.db_dir = Path(db_dir)
+        self.db_dir.mkdir(exist_ok=True)
         self.local = threading.local()
+        self.connections = {}
+    def get_conn(self, benchmark_name):
+        # Sanitize benchmark name for filename
+        safe_name = benchmark_name.replace('/', '_').replace('\\', '_')
+        db_path = self.db_dir / f"{safe_name}.db"
+        # Get thread-specific connections dictionary
+        if not hasattr(self.local, 'connections'):
+            self.local.connections = {}
+        # Create new connection if not exists for this benchmark
+        if safe_name not in self.local.connections:
+            self.local.connections[safe_name] = sqlite3.connect(db_path)
+        return self.local.connections[safe_name]
+    def create_tables(self, benchmark_name):
+        with self.get_conn(benchmark_name) as conn:
+            # Create parsed_results table dynamically from schema
+            columns = [f"{col} {dtype}" for col, dtype in PARSED_RESULTS_COLUMNS.items()]
+            create_parsed_results = f'''
+                CREATE TABLE IF NOT EXISTS parsed_results (
+                    {', '.join(columns)},
+                    PRIMARY KEY (benchmark_name, agent_name, run_id)
+                )
+            '''
+            conn.execute(create_parsed_results)
             conn.execute('''
                 CREATE TABLE IF NOT EXISTS preprocessed_traces (
                     benchmark_name TEXT,
                 )
             ''')
             conn.execute('''
+                CREATE TABLE IF NOT EXISTS token_usage (
                     benchmark_name TEXT,
                     agent_name TEXT,
                     run_id TEXT,
+                    model_name TEXT,
+                    prompt_tokens INTEGER,
+                    completion_tokens INTEGER,
+                    input_tokens INTEGER,
+                    output_tokens INTEGER,
+                    total_tokens INTEGER,
+                    input_tokens_cache_write INTEGER,
+                    input_tokens_cache_read INTEGER,
+                    PRIMARY KEY (benchmark_name, agent_name, run_id, model_name)
                 )
             ''')
     def preprocess_traces(self, processed_dir="evals_live"):
         processed_dir = Path(processed_dir)
         for file in processed_dir.glob('*.json'):
             with open(file, 'r') as f:
                 date = data['config']['date']
                 config = data['config']
+            # Create tables for this benchmark if they don't exist
+            self.create_tables(benchmark_name)
             try:
                 raw_logging_results = pickle.dumps(data['raw_logging_results'])
+                with self.get_conn(benchmark_name) as conn:
                     conn.execute('''
                         INSERT OR REPLACE INTO preprocessed_traces
                         (benchmark_name, agent_name, date, run_id, raw_logging_results)
             try:
                 failure_report = pickle.dumps(data['failure_report'])
+                with self.get_conn(benchmark_name) as conn:
                     conn.execute('''
                         INSERT INTO failure_reports
                         (benchmark_name, agent_name, date, run_id, failure_report)
+                        VALUES (?, ?, ?, ?, ?)
                     ''', (benchmark_name, agent_name, date, config['run_id'], failure_report))
             except Exception as e:
                 print(f"Error preprocessing failure_report in {file}: {e}")
             try:
                 results = data['results']
+                with self.get_conn(benchmark_name) as conn:
+                    # Dynamically create placeholders and values list
+                    columns = [col for col in PARSED_RESULTS_COLUMNS.keys()
+                              if col not in ['benchmark_name', 'agent_name', 'date', 'run_id']]
+                    placeholders = ','.join(['?'] * (len(columns) + 4))  # +4 for benchmark_name, agent_name, date, run_id
+                    values = [
                         benchmark_name,
                         agent_name,
                         config['date'],
+                        config['run_id']
+                    ] + [str(results.get(col)) if col in ['successful_tasks', 'failed_tasks']
+                         else results.get(col) for col in columns]
+                    query = f'''
+                        INSERT INTO parsed_results
+                        ({', '.join(PARSED_RESULTS_COLUMNS.keys())})
+                        VALUES ({placeholders})
+                    '''
+                    conn.execute(query, values)
             except Exception as e:
                 print(f"Error preprocessing parsed results in {file}: {e}")
+            try:
+                total_usage = data.get('total_usage', {})
+                for model_name, usage in total_usage.items():
+                    with self.get_conn(benchmark_name) as conn:
+                        conn.execute('''
+                            INSERT INTO token_usage
+                            (benchmark_name, agent_name, run_id, model_name,
+                            prompt_tokens, completion_tokens, input_tokens, output_tokens, total_tokens,
+                            input_tokens_cache_write, input_tokens_cache_read)
+                            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                        ''', (
+                            benchmark_name,
+                            agent_name,
+                            config['run_id'],
+                            model_name,
+                            usage.get('prompt_tokens', 0),
+                            usage.get('completion_tokens', 0),
+                            usage.get('input_tokens', 0),
+                            usage.get('output_tokens', 0),
+                            usage.get('total_tokens', 0),
+                            usage.get('input_tokens_cache_write', 0),
+                            usage.get('input_tokens_cache_read', 0)
+                        ))
+            except Exception as e:
+                print(f"Error preprocessing token usage in {file}: {e}")
     @lru_cache(maxsize=100)
     def get_analyzed_traces(self, agent_name, benchmark_name):
+        with self.get_conn(benchmark_name) as conn:
             query = '''
                 SELECT agent_name, raw_logging_results, date FROM preprocessed_traces
                 WHERE benchmark_name = ? AND agent_name = ?
             '''
             df = pd.read_sql_query(query, conn, params=(benchmark_name, agent_name))
+        # check for each row if raw_logging_results is not None
         df = df[df['raw_logging_results'].apply(lambda x: pickle.loads(x) is not None and x != 'None')]
         if len(df) == 0:
         # select latest run
         df = df.sort_values('date', ascending=False).groupby('agent_name').first().reset_index()
         return pickle.loads(df['raw_logging_results'][0])
     @lru_cache(maxsize=100)
     def get_failure_report(self, agent_name, benchmark_name):
+        with self.get_conn(benchmark_name) as conn:
             query = '''
                 SELECT agent_name, date, failure_report FROM failure_reports
                 WHERE benchmark_name = ? AND agent_name = ?
             '''
             df = pd.read_sql_query(query, conn, params=(benchmark_name, agent_name))
         df = df[df['failure_report'].apply(lambda x: pickle.loads(x) is not None and x != 'None')]
         if len(df) == 0:
             return None
         df = df.sort_values('date', ascending=False).groupby('agent_name').first().reset_index()
         return pickle.loads(df['failure_report'][0])
     def _calculate_ci(self, data, confidence=0.95, type='minmax'):
         data = data[np.isfinite(data)]
         return mean, ci[0], ci[1]
     def get_parsed_results(self, benchmark_name, aggregate=True):
+        with self.get_conn(benchmark_name) as conn:
             query = '''
                 SELECT * FROM parsed_results
                 WHERE benchmark_name = ?
             '''
             df = pd.read_sql_query(query, conn, params=(benchmark_name,))
+        # Load metadata
+        with open('agents_metadata.yaml', 'r') as f:
+            metadata = yaml.safe_load(f)
+        # Create URL mapping
+        url_mapping = {}
+        if benchmark_name in metadata:
+            for agent in metadata[benchmark_name]:
+                if 'url' in agent and agent['url']:  # Only add if URL exists and is not empty
+                    url_mapping[agent['agent_name']] = agent['url']
         # Add 'Verified' column
+        verified_agents = self.load_verified_agents()
         df['Verified'] = df.apply(lambda row: '✓' if (benchmark_name, row['agent_name']) in verified_agents else '', axis=1)
+        # Add URLs to agent names if they exist
+        df['agent_name'] = df['agent_name'].apply(
+            lambda x: f'[{x}]({url_mapping[x]})' if x in url_mapping else x
+        )
         # Add column for how many times an agent_name appears in the DataFrame
         df['Runs'] = df.groupby('agent_name')['agent_name'].transform('count')
         # Compute the 95% confidence interval for accuracy and cost for agents that have been run more than once
+        df['accuracy_ci'] = None
         df['cost_ci'] = None
+        # Round float columns to 2 decimal places
+        float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc',
+                        'overall_score', 'vectorization_score', 'fathomnet_score', 'feedback_score',
+                        'house_price_score', 'spaceship_titanic_score',
+                        'amp_parkinsons_disease_progression_prediction_score', 'cifar10_score',
+                        'imdb_score', 'level_1_accuracy', 'level_2_accuracy', 'level_3_accuracy']
+        for column in float_columns:
+            if column in df.columns:
+                df[column] = df[column].round(2)
         for agent_name in df['agent_name'].unique():
             agent_df = df[df['agent_name'] == agent_name]
                 accuracy_mean, accuracy_lower, accuracy_upper = self._calculate_ci(agent_df['accuracy'], type='minmax')
                 cost_mean, cost_lower, cost_upper = self._calculate_ci(agent_df['total_cost'], type='minmax')
+                # Round CI values to 2 decimals
+                accuracy_ci = f"-{abs(accuracy_mean - accuracy_lower):.2f}/+{abs(accuracy_mean - accuracy_upper):.2f}"
+                cost_ci = f"-{abs(cost_mean - cost_lower):.2f}/+{abs(cost_mean - cost_upper):.2f}"
+                df.loc[df['agent_name'] == agent_name, 'accuracy_ci'] = accuracy_ci
                 df.loc[df['agent_name'] == agent_name, 'cost_ci'] = cost_ci
         df = df.drop(columns=['successful_tasks', 'failed_tasks', 'run_id'], axis=1)
         if aggregate:
+            df = df.groupby('agent_name').agg(AGGREGATION_RULES).reset_index()
+        # Rename columns using the display names mapping
+        df = df.rename(columns=COLUMN_DISPLAY_NAMES)
+        # Sort by Accuracy in descending order
+        df = df.sort_values('Accuracy', ascending=False)
         return df
     def get_task_success_data(self, benchmark_name):
+        with self.get_conn(benchmark_name) as conn:
             query = '''
                 SELECT agent_name, accuracy, successful_tasks, failed_tasks
                 FROM parsed_results
         return df
+    def load_verified_agents(self, file_path='agents_metadata.yaml'):
         with open(file_path, 'r') as f:
+            metadata = yaml.safe_load(f)
         verified_agents = set()
+        for benchmark, agents in metadata.items():
             for agent in agents:
+                if 'verification_date' in agent:  # Only add if verified
+                    verified_agents.add((benchmark, agent['agent_name']))
         return verified_agents
+    def get_token_usage_with_costs(self, benchmark_name, pricing_config=None):
+        """Get token usage data with configurable pricing"""
+        if pricing_config is None:
+            pricing_config = DEFAULT_PRICING
+        with self.get_conn(benchmark_name) as conn:
+            query = '''
+                SELECT agent_name, model_name,
+                SUM(prompt_tokens) as prompt_tokens,
+                SUM(completion_tokens) as completion_tokens,
+                SUM(input_tokens) as input_tokens,
+                SUM(output_tokens) as output_tokens,
+                SUM(total_tokens) as total_tokens,
+                SUM(input_tokens_cache_write) as input_tokens_cache_write,
+                SUM(input_tokens_cache_read) as input_tokens_cache_read
+                FROM token_usage
+                WHERE benchmark_name = ?
+                GROUP BY agent_name, model_name
+            '''
+            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
+        # Calculate costs based on pricing config (prices are per 1M tokens)
+        df['total_cost'] = 0.0
+        for model, prices in pricing_config.items():
+            mask = df['model_name'] == model
+            df.loc[mask, 'total_cost'] = (
+                df.loc[mask, 'input_tokens'] * prices['prompt_tokens'] / 1e6 +
+                df.loc[mask, 'output_tokens'] * prices['completion_tokens'] / 1e6 +
+                df.loc[mask, 'input_tokens_cache_read'] * prices['prompt_tokens'] / 1e6 +
+                df.loc[mask, 'input_tokens_cache_write'] * prices['prompt_tokens'] / 1e6 +
+                df.loc[mask, 'prompt_tokens'] * prices['prompt_tokens'] / 1e6 +
+                df.loc[mask, 'completion_tokens'] * prices['completion_tokens'] / 1e6
+            )
+        return df
+    def get_parsed_results_with_costs(self, benchmark_name, pricing_config=None, aggregate=True):
+        """Get parsed results with recalculated costs based on token usage"""
+        # Get base results
+        results_df = self.get_parsed_results(benchmark_name, aggregate=False)
+        # Get token usage with new costs
+        token_costs = self.get_token_usage_with_costs(benchmark_name, pricing_config)
+        # Group token costs by agent
+        agent_costs = token_costs.groupby('agent_name')['total_cost'].sum().reset_index()
+        agent_costs = agent_costs.rename(columns={
+            'agent_name': 'agent_name_temp',
+            'total_cost': 'Total Cost'
+        })
+        # Drop existing Total Cost column if it exists
+        if 'Total Cost' in results_df.columns:
+            results_df = results_df.drop('Total Cost', axis=1)
+        # create temp column that is whatever is in agent_name [x] because of url we added to agent_name
+        results_df['agent_name_temp'] = results_df['Agent Name'].apply(lambda x: x.split('[')[1].split(']')[0])
+        # Update costs in results
+        results_df = results_df.merge(agent_costs, on='agent_name_temp', how='left')
+        # Drop temp column
+        results_df = results_df.drop('agent_name_temp', axis=1)
+        # Fill any missing costs with 0
+        # results_df['Total Cost'] = results_df['Total Cost'].fillna(0)
+        if aggregate:
+            # Aggregate results
+            results_df = results_df.groupby('Agent Name').agg({
+                'Date': 'first',
+                'Total Cost': 'mean',
+                'Accuracy': 'mean',
+                'Precision': 'mean',
+                'Recall': 'mean',
+                'F1 Score': 'mean',
+                'AUC': 'mean',
+                'Overall Score': 'mean',
+                'Vectorization Score': 'mean',
+                'Fathomnet Score': 'mean',
+                'Feedback Score': 'mean',
+                'House Price Score': 'mean',
+                'Spaceship Titanic Score': 'mean',
+                'AMP Parkinsons Disease Progression Prediction Score': 'mean',
+                'CIFAR10 Score': 'mean',
+                'IMDB Score': 'mean',
+                'Level 1 Accuracy': 'mean',
+                'Level 2 Accuracy': 'mean',
+                'Level 3 Accuracy': 'mean',
+                'Verified': 'first',
+                'Runs': 'first',
+                'Accuracy CI': 'first',
+                'Total Cost CI': 'first'
+            }).reset_index()
+        # Round the cost values
+        results_df['Total Cost'] = results_df['Total Cost'].round(3)
+        # Sort by Accuracy in descending order
+        results_df = results_df.sort_values('Accuracy', ascending=False)
+        return results_df
+    def check_token_usage_data(self, benchmark_name):
+        """Debug helper to check token usage data"""
+        with self.get_conn(benchmark_name) as conn:
+            query = '''
+                SELECT * FROM token_usage
+                WHERE benchmark_name = ?
+            '''
+            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
+        return df
+    def get_models_for_benchmark(self, benchmark_name):
+        """Get list of unique model names used in a benchmark"""
+        with self.get_conn(benchmark_name) as conn:
+            query = '''
+                SELECT DISTINCT model_name
+                FROM token_usage
+                WHERE benchmark_name = ?
+            '''
+            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
+        return df['model_name'].tolist()
+    def get_all_agents(self, benchmark_name):
+        """Get list of all agent names for a benchmark"""
+        with self.get_conn(benchmark_name) as conn:
+            query = '''
+                SELECT DISTINCT agent_name
+                FROM parsed_results
+                WHERE benchmark_name = ?
+            '''
+            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
+        return df['agent_name'].tolist()
 if __name__ == '__main__':
     preprocessor = TracePreprocessor()
     preprocessor.preprocess_traces()

utils/viz.py CHANGED Viewed

@@ -19,7 +19,7 @@ def create_leaderboard(df, ci_metrics = None):
             # for rows in the df for which CI metric is not None, join the metric and CI columns by looping through the CI metrics columns
             for i, row in df.iterrows():
                 if str(row[CI_metric]) != 'None':
-                    df.at[i, metric] = str(row[metric]) + " (" + str(row[CI_metric]) + ")"
     return df

             # for rows in the df for which CI metric is not None, join the metric and CI columns by looping through the CI metrics columns
             for i, row in df.iterrows():
                 if str(row[CI_metric]) != 'None':
+                    df.at[i, metric] = str(round(float(row[metric]), 2)) + " (" + str(row[CI_metric]) + ")"
     return df

verified_agents.yaml DELETED Viewed

@@ -1,129 +0,0 @@
-# This file contains information about verified agent results for different benchmarks.
-# Format:
-#   benchmark_name:
-#     - agent_name: "Name of the agent"
-#       verification_date: YYYY-MM-DD
-usaco:
-  - agent_name: "USACO Reflexion + Episodic (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-20
-  - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-20
-  - agent_name: "USACO Reflexion (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-20
-  - agent_name: "USACO Episodic (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-12
-  - agent_name: "USACO Reflexion + Semantic (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-20
-  - agent_name: "USACO Zero-shot (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-11
-  - agent_name: "USACO Semantic (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-12
-  - agent_name: USACO Reflexion + Episodic + Semantic (gpt-4o-2024-05-13)
-    verification_date: 2024-08-25
-  - agent_name: USACO Reflexion + Episodic (gpt-4o-2024-05-13)
-    verification_date: 2024-08-25
-  - agent_name: USACO Reflexion + Semantic (gpt-4o-2024-05-13)
-    verification_date: 2024-08-25
-  - agent_name: Episodic Retrial (2x) (gpt-4o-2024-05-13)
-    verification_date: 2024-08-25
-  - agent_name: Episodic Retrial (3x) (gpt-4o-mini-2024-07-18)
-    verification_date: 2024-08-25
-  - agent_name: Episodic Retrial (2x) (gpt-4o-mini-2024-07-18)
-    verification_date: 2024-08-25
-  - agent_name: Episodic Retrial (5x) (gpt-4o-mini-2024-07-18)
-    verification_date: 2024-08-25
-  - agent_name: Episodic Warming (3 Steps) (gpt-4o-mini-2024-07-18)
-    verification_date: 2024-08-24
-  - agent_name: USACO Episodic (gpt-4o-2024-05-13)
-    verification_date: 2024-08-24
-  - agent_name: USACO Semantic (gpt-4o-2024-05-13)
-    verification_date: 2024-08-24
-  - agent_name: Zero-shot Retrial (2x) (gpt-4o-mini-2024-07-18)
-    verification_date: 2024-08-24
-  - agent_name: Zero-shot Retrial (3x) (gpt-4o-mini-2024-07-18)
-    verification_date: 2024-08-24
-  - agent_name: Zero-shot Retrial (5x) (gpt-4o-mini-2024-07-18)
-    verification_date: 2024-08-24
-  - agent_name: USACO Zero-shot (gpt-4o-2024-05-13)
-    verification_date: 2024-08-24
-swebench_verified_mini:
-  - agent_name: "Agentless (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-17
-  - agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1)"
-    verification_date: 2024-08-19
-  - agent_name: "Moatless (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-10-30
-  - agent_name: "Moatless (gpt-4o-2024-08-06)"
-    verification_date: 2024-10-30
-  - agent_name: "Moatless (claude-3-5-sonnet-20241022)"
-    verification_date: 2024-10-30
-  - agent_name: "Agentless (o1-mini-2024-09-12)"
-    verification_date: 2024-10-30
-swebench_verified:
-  - agent_name: "Moatless (gpt-4o-2024-08-06)"
-    verification_date: 2024-10-30
-  - agent_name: "Agentless (o1-mini-2024-09-12)"
-    verification_date: 2024-10-30
-mlagentbench:
-  - agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-08-19
-corebench_easy:
-  - agent_name: "AutoGPT (GPT-4o)"
-    verification_date: 2024-09-28
-  - agent_name: "AutoGPT (GPT-4o-mini)"
-    verification_date: 2024-09-28
-  - agent_name: "CORE-Agent (GPT-4o)"
-    verification_date: 2024-09-28
-  - agent_name: "CORE-Agent (GPT-4o-mini)"
-    verification_date: 2024-09-28
-corebench_medium:
-  - agent_name: "AutoGPT (GPT-4o)"
-    verification_date: 2024-09-28
-  - agent_name: "AutoGPT (GPT-4o-mini)"
-    verification_date: 2024-09-28
-  - agent_name: "CORE-Agent (GPT-4o)"
-    verification_date: 2024-09-28
-  - agent_name: "CORE-Agent (GPT-4o-mini)"
-    verification_date: 2024-09-28
-corebench_hard:
-  - agent_name: "AutoGPT (GPT-4o)"
-    verification_date: 2024-09-28
-  - agent_name: "AutoGPT (GPT-4o-mini)"
-    verification_date: 2024-09-28
-  - agent_name: "CORE-Agent (GPT-4o)"
-    verification_date: 2024-09-28
-  - agent_name: "CORE-Agent (GPT-4o-mini)"
-    verification_date: 2024-09-28
-gaia:
-  - agent_name: "Inspect Default Agent (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-11-30
-  - agent_name: "Inspect Default Agent (gpt-4o-2024-11-20)"
-    verification_date: 2024-11-30
-  - agent_name: "Inspect Default Agent (claude-3-5-sonnet-20241022)"
-    verification_date: 2024-11-30
-  - agent_name: "Inspect Default Agent (Meta-Llama-3.1-405B-Instruct-Turbo)"
-    verification_date: 2024-11-30
-cybench:
-  - agent_name: "Inspect Default Agent (gpt-4o-mini-2024-07-18)"
-    verification_date: 2024-11-30
-  - agent_name: "Inspect Default Agent (gpt-4o-2024-11-20)"
-    verification_date: 2024-11-30
-  - agent_name: "Inspect Default Agent (claude-3-5-sonnet-20241022)"
-    verification_date: 2024-11-30
-  - agent_name: "Inspect Default Agent (o1-mini-2024-09-12)"
-    verification_date: 2024-11-30
-  - agent_name: "Inspect Default Agent (Meta-Llama-3.1-405B-Instruct-Turbo)"
-    verification_date: 2024-11-30