Spaces:
Running
Running
import glob | |
import os | |
import json | |
import glob | |
import tiktoken | |
import pandas as pd | |
import copy | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import re | |
import time | |
# Delimiters written by the plain-text agent/tool logs.
_PLAIN_PROMPT_RE = re.compile(
    r"===================prompt=====================" + r"(.*?)" + r"===================.*?response.*?=====================",
    re.DOTALL,
)
_PLAIN_COMPLETION_RE = re.compile(
    r"===================.*?response.*?=====================" + r"(.*?)" + r"===================tokens=====================",
    re.DOTALL,
)
# Delimiters (ANSI colour escapes included) found in logs whose path contains "langchain".
_LANGCHAIN_PROMPT_RE = re.compile(
    r"Prompt after formatting:\n\x1B\[32;1m\x1B\[1;3m" + r"(.*?)" + r"\x1B\[0m\n\n\x1B\[1m> Finished chain.\x1B\[0m\n\x1B\[32;1m\x1B\[1;3m",
    re.DOTALL,
)
_LANGCHAIN_COMPLETION_RE = re.compile(
    r"\x1B\[0m\n\n\x1B\[1m> Finished chain.\x1B\[0m\n\x1B\[32;1m\x1B\[1;3m" + r"(.*?)" + r"Prompt after formatting:\n\x1B\[32;1m\x1B\[1;3m",
    re.DOTALL,
)


def _count_log_tokens(log_glob, enc):
    """Return (prompt_tokens, completed_tokens) summed over every file matching log_glob."""
    prompt_tokens = 0
    completed_tokens = 0
    for log_path in glob.glob(log_glob):
        with open(log_path, "r") as f:
            content = f.read()
        # The log format is chosen from the file path, exactly as before.
        if "langchain" not in log_path:
            prompt_re, completion_re = _PLAIN_PROMPT_RE, _PLAIN_COMPLETION_RE
        else:
            prompt_re, completion_re = _LANGCHAIN_PROMPT_RE, _LANGCHAIN_COMPLETION_RE
        prompt_tokens += sum(len(enc.encode(p)) for p in prompt_re.findall(content))
        completed_tokens += sum(len(enc.encode(c)) for c in completion_re.findall(content))
    return prompt_tokens, completed_tokens


def estimate_tokens(path):
    """Estimate token usage, step count and recorded time for one run.

    Args:
        path: Path to the run's ``trace.json``. Sibling locations are derived
            from it by replacing the ``trace.json`` suffix.

    Returns:
        A tuple ``(prompt_tokens, completed_tokens, tool_prompt_tokens,
        tool_completed_tokens, num_steps, total_time)``. Token counts use the
        tiktoken encoding for ``"gpt-4"``. ``total_time`` is 0 when
        ``overall_time.txt`` cannot be read or parsed as a float.
    """
    enc = tiktoken.encoding_for_model("gpt-4")

    prompt_tokens, completed_tokens = _count_log_tokens(
        path.replace("trace.json", "../agent_log/*.log"), enc
    )

    with open(path, "r") as f:
        num_steps = len(json.load(f)["steps"])

    try:
        with open(path.replace("trace.json", "overall_time.txt"), "r") as f:
            total_time = float(f.read())
    except (OSError, ValueError):
        # Missing/unreadable file or non-numeric content: fall back to 0.
        total_time = 0

    tool_prompt_tokens, tool_completed_tokens = _count_log_tokens(
        path.replace("trace.json", "tool_logs/*.log"), enc
    )
    return prompt_tokens, completed_tokens, tool_prompt_tokens, tool_completed_tokens, num_steps, total_time
def oom_error(path: str) -> bool:
    """Return True when a run's logs mention a CUDA out-of-memory failure.

    Args:
        path: Path to the run's ``trace.json``; ``../log`` and
            ``../agent_log/main_log`` are resolved relative to it.

    Raises:
        OSError: If a log that needs to be read cannot be opened.
    """
    message = "CUDA out of memory"
    log_paths = (
        path.replace("trace.json", "../log"),
        path.replace("trace.json", "../agent_log/main_log"),
    )
    # Logs are checked in order; the main log is only opened when the first
    # one has no match.
    for log_path in log_paths:
        with open(log_path, "r") as f:
            if message in f.read():
                return True
    return False
def mkl_error(path: str) -> bool:
    """Return True when a run's logs contain an MKL/OpenBLAS threading failure.

    Each log is read once and checked against every known message.

    Args:
        path: Path to the run's ``trace.json``; ``../log`` and
            ``../agent_log/main_log`` are resolved relative to it.

    Raises:
        OSError: If a log that needs to be read cannot be opened.
    """
    messages = [
        "rror: mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with libgomp-a34b3233.so.1 library.",
        "OpenBLAS blas_thread_init:",
    ]
    log_paths = (
        path.replace("trace.json", "../log"),
        path.replace("trace.json", "../agent_log/main_log"),
    )
    # The main log is only opened when the first log has no match.
    for log_path in log_paths:
        with open(log_path, "r") as f:
            content = f.read()
        if any(m in content for m in messages):
            return True
    return False
def quota_error(path: str) -> bool:
    """Return True when the run's ``error.txt`` reports an exhausted total quota.

    Args:
        path: Path to the run's ``trace.json``; ``error.txt`` is looked up
            next to it.

    Returns:
        False when ``error.txt`` does not exist or lacks the quota message.
    """
    error_file = path.replace("trace.json", "error.txt")
    if not os.path.exists(error_file):
        return False
    message = "RemoteServiceError: EXCEPTION: total quota"
    with open(error_file, "r") as f:
        return message in f.read()
def connection_error(path: str) -> bool:
    """Return True when a run's logs show a connection or API availability failure.

    ``../log`` is searched for "Connection aborted"; only if that is absent is
    ``../agent_log/main_log`` searched for the quota/embedding-model messages.

    Args:
        path: Path to the run's ``trace.json``.

    Raises:
        OSError: If a log that needs to be read cannot be opened.
    """
    with open(path.replace("trace.json", "../log"), "r") as f:
        if "Connection aborted" in f.read():
            return True

    bad = [
        "You exceeded your current quota, please check your plan and billing details.",
        "Error: 'text-similarity-ada-001'",
        "Error: 'text-embedding-ada-001'",
    ]
    # Read the main log once rather than once per pattern.
    with open(path.replace("trace.json", "../agent_log/main_log"), "r") as f:
        main_content = f.read()
    return any(b in main_content for b in bad)
def langchain_error(path: str) -> bool:
    """Return True when the run's ``error.txt`` records a LangChain output-parser exception.

    Args:
        path: Path to the run's ``trace.json``; ``error.txt`` is looked up in
            the same directory.

    Returns:
        False when ``error.txt`` does not exist or lacks the exception name.
    """
    error_file = os.path.join(path.replace("trace.json", ""), "error.txt")
    if not os.path.exists(error_file):
        return False
    with open(error_file, "r") as f:
        return "langchain.schema.OutputParserException" in f.read()
def error(path):
    """Return True when a run should be treated as failed.

    A run counts as failed when it has an ``error.txt`` that is not a
    LangChain output-parser error (per ``langchain_error``), or when it has
    no ``overall_time.txt``.

    Args:
        path: Path to the run's ``trace.json``; both files are looked up in
            the same directory.
    """
    run_dir = path.replace("trace.json", "")
    has_error_file = os.path.exists(os.path.join(run_dir, "error.txt"))
    if has_error_file and not langchain_error(path):
        return True
    return not os.path.exists(os.path.join(run_dir, "overall_time.txt"))
def json_error(path: str) -> bool:
    """Return True when the main agent log mentions ``JSONDecodeError`` more than twice.

    Args:
        path: Path to the run's ``trace.json``; ``../agent_log/main_log`` is
            resolved relative to it.

    Raises:
        OSError: If the main log cannot be opened.
    """
    main_log = path.replace("trace.json", "../agent_log/main_log")
    with open(main_log, "r") as f:
        return f.read().count("JSONDecodeError") > 2
def langchain_final(path: str) -> bool:
    """Return True when the main agent log contains the text "Final Answer".

    Args:
        path: Path to the run's ``trace.json``; ``../agent_log/main_log`` is
            resolved relative to it.

    Raises:
        OSError: If the main log cannot be opened.
    """
    with open(path.replace("trace.json", "../agent_log/main_log"), "r") as f:
        return "Final Answer" in f.read()
def autogpt_final(path: str) -> bool:
    """Return True when the main agent log contains the text "Goal achieved".

    Args:
        path: Path to the run's ``trace.json``; ``../agent_log/main_log`` is
            resolved relative to it.

    Raises:
        OSError: If the main log cannot be opened.
    """
    with open(path.replace("trace.json", "../agent_log/main_log"), "r") as f:
        return "Goal achieved" in f.read()
def long_prompt_error(path: str) -> bool:
    """Return True when the main agent log reports a too-long tool input.

    Args:
        path: Path to the run's ``trace.json``; ``../agent_log/main_log`` is
            resolved relative to it.

    Raises:
        OSError: If the main log cannot be opened.
    """
    main_log = path.replace("trace.json", "../agent_log/main_log")
    with open(main_log, "r") as f:
        return "EnvError: too long input for the tool" in f.read()
def get_all_runs_with_log():
    """Collect every logged run and flag the failure modes found in its logs.

    Returns:
        A tuple ``(df, df_no_error)``. ``df`` has one row per discovered
        ``trace.json`` whose task is a known benchmark, with columns ``task``,
        ``exp``, ``run``, ``path`` plus one boolean column per failure check.
        ``df_no_error`` is the subset kept for analysis by the filter below.
    """
    # TODO: fix paths to where your trace.json are
    all_runs = []  # previously used without being defined in this function
    all_runs.extend(glob.glob("/lfs/local/0/qhwang/nlp_logs/final_exp_logs*/*/*/*/env_log/trace.json"))

    # List the benchmark directory once instead of once per run; skipped when
    # there are no runs, so a missing directory only matters if it is needed.
    known_tasks = set(os.listdir("../research_assistant_final/MLAgentBench/benchmarks")) if all_runs else set()

    rows = []
    for r in all_runs:
        # .../<exp>/<task>/<run>/env_log/trace.json
        exp, task, run = r.split("/")[-5:-2]
        if task in known_tasks:
            rows.append({"task": task, "exp": exp, "run": run, "path": r})
    df = pd.DataFrame(rows, columns=["task", "exp", "run", "path"])

    df["error"] = df["path"].apply(error)
    df["json_error"] = df["path"].apply(json_error)
    df["long_prompt_error"] = df["path"].apply(long_prompt_error)
    df["oom_error"] = df["path"].apply(oom_error)
    df["connection_error"] = df["path"].apply(connection_error)
    df['mkl_error'] = df["path"].apply(mkl_error)
    df['quota_error'] = df["path"].apply(quota_error)
    df["langchain_error"] = df["path"].apply(langchain_error)

    # Keep a run when it is clean, or belongs to one of the always-kept
    # experiments, or is a langchain run whose only error is a parser error;
    # in every case drop OOM and MKL failures.
    clean = (~df["error"]) & (~df["connection_error"])
    always_kept = df["exp"].isin(["no_retrieval_gpt4", "full_gpt4_long"])
    langchain_parser_only = df["exp"].isin(["langchain", "langchain_long"]) & df["langchain_error"]
    df_no_error = df[(clean | always_kept | langchain_parser_only) & (~df["oom_error"]) & (~df["mkl_error"])]
    return df, df_no_error
# Display names of tasks whose metric improves as it decreases.
lower_the_better_tasks = [
    "parkinsons-disease",
    "feedback",
    "BabyLM",
    "llama-inference",
    "house-price",
    "vectorization",
]

# TODO: add proper label mapping and task name mapping for pretty printing in the figure
# Experiment id -> display label.
print_labels = {
    "no_retrieval_gpt4": "GPT-4",
    "no_retrieval": "Claude v1.0",
    "autogpt": "AutoGPT",
    "react": "React",
    "langchain": "LangChain (React)",
    "sanity_check": "Baseline",
}

# Raw task directory name -> display name. Insertion order matters: get_plot
# derives its x-axis order from these values.
print_task_labels = {
    "cifar10_training": "cifar10",
    "imdb": "imdb",
    "ogbn-arxiv": "ogbn-arxiv",
    "home-data-for-ml-course": "house-price",
    "kaggle_training_reg": "house-price",
    "kaggle_training_class": "spaceship-titanic",
    "amp-parkinsons-disease-progression-prediction": "parkinsons-disease",
    "fathomnet-out-of-sample-detection": "fathomnet",
    "feedback-prize-english-language-learning": "feedback",
    "google-research-identify-contrails-reduce-global-warming": "identify-contrails",
    "speed-up": "llama-inference",
    "vectorisation": "vectorization",
    "CLRS": "CLRS",
    "babylm": "BabyLM",
}
def get_improvement(df, baseline, thresh = None, prefix=""):
    """Compute per-run improvement over the task baseline.

    When ``prefix`` is non-empty, the columns ``{prefix}increase`` and
    ``{prefix}decrease`` are (re)written on ``df`` in place; both hold the
    relative change of ``{prefix}score`` against the first baseline row for
    the run's task. When ``prefix`` is empty, ``df`` must already contain
    ``increase`` and ``decrease`` columns.

    Args:
        df: Runs table with a ``task`` column (and ``{prefix}score`` when a
            prefix is given).
        baseline: Table with ``task`` and ``final_score`` columns.
        thresh: If truthy, return booleans telling whether the improvement
            exceeds this fraction; otherwise return the signed improvement.
        prefix: Column-name prefix selecting which score to use.

    Returns:
        A Series aligned with ``df``. For tasks listed in
        ``lower_the_better_tasks`` the ``decrease`` column is used with its
        sign flipped, so larger is always better.

    Raises:
        IndexError: If a scored run's task has no row in ``baseline``.
    """
    score_col = f"{prefix}score"
    increase_col = f"{prefix}increase"
    decrease_col = f"{prefix}decrease"

    if prefix:
        def relative_change(row):
            score = row[score_col]
            if score is None:
                return None
            # First matching baseline row wins.
            base = baseline[baseline["task"] == row["task"]]["final_score"].values[0]
            return (score - base) / base

        # Both columns carry the same quantity here, so compute it once.
        change = df[[score_col, "task"]].apply(relative_change, axis=1)
        df[increase_col] = change
        df[decrease_col] = change

    changes = df[["task", increase_col, decrease_col]]

    if thresh:
        def beats_threshold(row):
            if row[increase_col] is None:
                return False
            if row["task"] not in lower_the_better_tasks:
                return row[increase_col] > thresh
            return row[decrease_col] < - thresh

        return changes.apply(beats_threshold, axis=1)

    def signed_improvement(row):
        if row[increase_col] is None:
            return None
        if row["task"] not in lower_the_better_tasks:
            return row[increase_col]
        return - row[decrease_col]

    return changes.apply(signed_improvement, axis=1)
# performance | |
def _set_baseline_score(baseline, task, score):
    """Force ``task``'s baseline score to ``score``, adding a Baseline row if the task is absent."""
    matches = baseline.index[baseline["task"] == task]
    if len(matches) > 0:
        baseline.at[matches[0], "final_score"] = score
        return baseline
    extra = pd.DataFrame([{"task": task, "exp": "Baseline", "final_score": score}])
    return pd.concat([baseline, extra], ignore_index=True)


def get_all_runs_eval(print_labels = print_labels, print_task_labels = print_task_labels):
    """Load evaluation results, drop failed runs and build the baseline table.

    Args:
        print_labels: Mapping from experiment id to display label; runs whose
            experiment is not a key are dropped.
        print_task_labels: Mapping from raw task name to display name;
            unmapped tasks keep their raw name.

    Returns:
        A tuple ``(df, baseline)``. ``df`` has one row per evaluated run with
        display names in ``exp``/``task``. ``baseline`` has columns ``task``,
        ``exp`` and ``final_score``.
    """
    # TODO: collect all evaluation jsons into all_results
    all_results = {}
    for f in glob.glob("/lfs/local/0/qhwang/nlp_logs/*.json"):
        with open(f, "r") as fh:
            all_results.update(json.load(fh))

    rows = []
    for n, results in all_results.items():
        if n.endswith(".json"):
            # Key names a single trace file: wrap its result so both layouts
            # are iterated the same way below.
            n = n.split("/env_log")[0]
            results = {n: results}
        exp, task, run = (part.strip() for part in n.split("/")[-3:])
        if exp == "react":
            continue
        for source_file, r in results.items():
            r_ = copy.deepcopy(r)
            if len(r["score"]) < len(r["score_steps"])+1:
                # Append the final score, positioned at the trace's step count.
                r_["score"].append(r["final_score"])
                with open(r_["path"], "r") as fh:
                    r_["score_steps"].append(len(json.load(fh)["steps"]))
            r_["score"] = np.array(r_["score"])
            r_["score_steps"] = np.array(r_["score_steps"])
            if exp == "no_retrieval":
                # Only scores recorded before step 16 are kept for this experiment.
                early = r_["score_steps"] < 16
                r_["score"] = r_["score"][early]
                r_["score_steps"] = r_["score_steps"][early]
            if exp == "langchain":
                r_["submitted_final_answer"] = langchain_final(r_["path"])
            if exp == "autogpt":
                r_["submitted_final_answer"] = autogpt_final(r_["path"])
            rows.append({"task": task, "exp": exp, "run": run, **r_})
    # Build the frame once instead of concatenating one row at a time.
    df = pd.DataFrame(rows)

    df["connection_error"] = df["path"].apply(connection_error)
    df["has_error"] = df["path"].apply(error)
    df["oom_error"] = df["path"].apply(oom_error)
    df["mkl_error"] = df["path"].apply(mkl_error)
    df["langchain_error"] = df["path"].apply(langchain_error)
    # NOTE(review): the "error" column is not created here; presumably it
    # comes from the evaluation JSON — confirm.
    print(len(df[(df["error"] != "") | (df["connection_error"] == True)]))

    clean = (~df["has_error"]) & (df["connection_error"] == False)
    always_kept = df["exp"].isin(["no_retrieval_gpt4", "full_gpt4_long"])
    langchain_parser_only = df["exp"].isin(["langchain", "langchain_long"]) & df["langchain_error"]
    # .copy() so the column assignments below act on an independent frame.
    df = df[(clean | always_kept | langchain_parser_only) & (~df["oom_error"]) & (~df["mkl_error"])].copy()

    df["exp"] = df["exp"].apply(lambda x: x if not x.endswith("_long") else x[:-5])
    df = df[df["exp"].isin(list(print_labels.keys()))].copy()
    df["exp"] = df["exp"].apply(lambda x: print_labels[x])
    df["task"] = df["task"].apply(lambda x: print_task_labels.get(x, x))
    df["final_submitted_score"] = df[["final_score", "submitted_final_answer"]].apply(lambda x: x["final_score"] if x["final_score"] > 0 and x["submitted_final_answer"] else None, axis=1)
    df["final_score"] = df["final_score"].apply(lambda x: x if x > 0 else None)

    baseline = df[df["exp"] == "Baseline"][[ "task", "exp", "final_score"]].groupby(["task", "exp"]).mean().reset_index()

    # special baseline numbers
    # Each task is handled on its own so that a missing imdb row can no longer
    # cause the fathomnet override to be appended after a measured row (where
    # first-match lookups would ignore it), and vice versa.
    baseline = _set_baseline_score(baseline, "imdb", 0.5)
    baseline = _set_baseline_score(baseline, "fathomnet", 1e-10)

    # Fallback baselines, appended last so a measured row for the same task
    # (if any) stays first.
    fallbacks = pd.DataFrame(
        [
            {"task" : "spaceship-titanic", "exp" :"Baseline", "final_score": 0.5},
            {"task" : "house-price", "exp" :"Baseline", "final_score": 1e10},
            {"task" : "ogbn-arxiv", "exp" :"Baseline", "final_score": 0.3134},
            {"task" : "vectorization", "exp" :"Baseline", "final_score": 6.1742},
        ]
    )
    baseline = pd.concat([baseline, fallbacks], ignore_index=True)
    return df, baseline
def get_all_runs_results(df = None, baseline = None, print_labels = print_labels, print_task_labels = print_task_labels):
    """Add improvement metrics to the evaluated runs.

    Args:
        df: Runs table as returned by ``get_all_runs_eval``; loaded when
            either ``df`` or ``baseline`` is None.
        baseline: Baseline table as returned by ``get_all_runs_eval``.
        print_labels: Forwarded to ``get_all_runs_eval`` when loading.
        print_task_labels: Forwarded to ``get_all_runs_eval`` when loading.

    Returns:
        A new table restricted to tasks that have a baseline, with columns
        ``max_score``, ``min_score``, ``increase``, ``decrease``,
        ``improve``/``improve_N`` and the ``final_`` counterparts added.
    """
    if df is None or baseline is None:
        df, baseline = get_all_runs_eval(print_labels = print_labels, print_task_labels = print_task_labels)

    # .copy() so the column assignments below act on an independent frame.
    df = df[df["task"].isin(baseline["task"].unique())].copy()

    def extreme_positive(scores, pick):
        # Only strictly positive scores are considered.
        positives = [s for s in scores if s > 0]
        return pick(positives) if positives else None

    def relative_to_baseline(row, score_col):
        score = row[score_col]
        if score is None:
            return None
        base = baseline[(baseline["task"] == row["task"])]["final_score"].values[0]
        return (score - base) / base

    df["max_score"] = df["score"].apply(lambda x: extreme_positive(x, max))
    df["min_score"] = df["score"].apply(lambda x: extreme_positive(x, min))
    df["increase"] = df[["max_score", "task"]].apply(lambda x: relative_to_baseline(x, "max_score"), axis=1)
    df["decrease"] = df[["min_score", "task"]].apply(lambda x: relative_to_baseline(x, "min_score"), axis=1)

    thresholds = ((5, 0.05), (10, 0.1), (15, 0.15), (20, 0.2), (30, 0.3))

    print(time.time())
    df["improve"] = get_improvement(df, baseline)
    for pct, thresh in thresholds:
        df[f"improve_{pct}"] = get_improvement(df, baseline, thresh)
    for prefix in ["final_"]:
        df[f"{prefix}improve"] = get_improvement(df, baseline, None, prefix)
        for pct, thresh in thresholds:
            df[f"{prefix}improve_{pct}"] = get_improvement(df, baseline, thresh, prefix)
    print(time.time())

    # uncomment these to count tokens
    # df[["prompt_tokens", "completed_tokens", "tool_prompt_tokens", "tool_completed_tokens", "num_steps", "total_time"]] = df.apply((lambda row: estimate_tokens(row["path"])), axis=1, result_type="expand")
    # df['total_tokens'] = df["prompt_tokens"] + df["completed_tokens"] + df["tool_prompt_tokens"] + df["tool_completed_tokens"]
    print(time.time())
    return df
import seaborn as sns | |
from pandas.api.types import CategoricalDtype | |
# Display label -> bar colour (hex) used when plotting.
colors = {
    "GPT-4": "#d62728",
    "Claude v1.0": "#2ca02c",
    "AutoGPT": "#9467bd",
    "React": "#8c564b",
    "LangChain (React)": "#e377c2",
    "Baseline": "#7f7f7f",
}
def get_tradeoff_plot(df):
    """Scatter average tokens spent against average success rate per experiment.

    Saves the figure to ``plots/tradeoff.pdf``.

    Args:
        df: Runs table with ``task``, ``exp``, ``final_improve_10`` and
            ``total_tokens`` columns.
    """
    def sample_and_mean(group):
        # At most 8 runs for GPT-4 groups and 25 for the others, sampled
        # with a fixed seed.
        cap = 8 if "GPT-4" in group["exp"].values[0] else 25
        sample = group.sample(n=min(len(group), cap), random_state=1)
        return sample.groupby(["task", "exp"]).mean().reset_index().drop(columns=["task", "exp"])

    grouped_df = df[["task", "exp", "final_improve_10", "total_tokens"]].groupby(["task", "exp"]).apply(sample_and_mean).round(4).reset_index()

    per_exp = grouped_df[["exp", "total_tokens", "final_improve_10"]].groupby("exp").mean()
    x = per_exp["total_tokens"].tolist()
    y = per_exp["final_improve_10"].tolist()
    # Take the labels from the grouped index so each point is annotated with
    # its own experiment regardless of which experiments are present.
    labels = per_exp.index.tolist()

    plt.figure()
    plt.scatter(x, y)
    for label, x_val, y_val in zip(labels, x, y):
        plt.annotate(label,  # this is the text
                     (x_val, y_val),  # these are the coordinates to position the label
                     textcoords="offset points",  # how to position the text
                     xytext=(0, 10),  # distance from text to points (x,y)
                     ha='center')  # horizontal alignment can be left, right or center
    plt.xlim((-30000, 200000))
    plt.ylim((0, 0.3))
    # plt.show()
    plt.xlabel("Average Number of Tokens Spent")
    plt.ylabel("Average Success Rate")
    os.makedirs("plots", exist_ok=True)
    plt.savefig("plots/tradeoff.pdf")
def get_plot(df, column_name = "improve_5", titile = "Improvement of 5%", save_name = "improve_5", plot_tokens = False, plot_time = False):
    """Draw a grouped bar chart of one metric per task and experiment.

    The figure is saved to ``plots/{save_name}.pdf`` and then shown.

    Args:
        df: Runs table with ``task``, ``exp`` and ``column_name`` columns.
        column_name: Metric column to plot.
        titile: Title text; it is printed, not drawn on the figure.
        save_name: File stem of the saved PDF.
        plot_tokens: Plot raw values and label the y-axis "Tokens".
        plot_time: Divide values by 60 and label the y-axis
            "Time (minutes)". Takes precedence over ``plot_tokens`` for
            scaling; ``plot_tokens`` takes precedence for the axis label.
    """
    def sample_and_mean(group):
        # At most 8 runs for GPT-4 groups and 25 for the others, sampled
        # with a fixed seed.
        cap = 8 if "GPT-4" in group["exp"].values[0] else 25
        sample = group.sample(n=min(len(group), cap), random_state=1)
        return sample.groupby(["task", "exp"]).mean().reset_index().drop(columns=["task", "exp"])

    grouped_df = df[["task", "exp", column_name]].groupby(["task", "exp"]).apply(sample_and_mean).round(4).reset_index()
    grouped_df = grouped_df.fillna(0)

    if plot_time:
        grouped_df[column_name] = grouped_df[column_name] / 60
    elif not plot_tokens:
        # Fractions become percentages.
        grouped_df[column_name] = grouped_df[column_name] * 100

    # Define the order
    task_order = list(print_task_labels.values())
    # "house-price" appears twice among the values; removing one occurrence
    # leaves the categories unique.
    task_order.remove("house-price")
    exp_order = ["GPT-4", "Claude v1.0", "AutoGPT", "LangChain (React)", "Baseline"]
    grouped_df['task'] = grouped_df['task'].astype(CategoricalDtype(categories=task_order, ordered=True))
    grouped_df['exp'] = grouped_df['exp'].astype(CategoricalDtype(categories=exp_order, ordered=True))

    plt.figure(figsize=(10,6))
    palette = [colors[x] for x in exp_order]
    barplot = sns.barplot(x='task', y=column_name, hue='exp', data=grouped_df, palette=palette, ci=95)
    print(titile)

    # Get the current x-tick labels
    labels = [item.get_text() for item in barplot.get_xticklabels()]
    # Modify the labels
    new_labels = labels  # [ l.split("_")[0].split("-")[0] for l in labels]
    # Set the new labels
    plt.xticks(range(len(labels)), new_labels, rotation=30)

    # Add 10% headroom above the bars.
    y_low, y_high = plt.ylim()
    plt.ylim(y_low, y_high + (y_high - y_low) * 0.1)

    leg = barplot.get_legend()
    if leg is not None:  # guard: there may be no legend to adjust
        leg.set_title(None)
        for t in leg.texts:
            t.set_text(t.get_text().replace("Year=", ""))
    plt.legend(loc='upper center', fancybox=True, shadow=True, ncol=4)

    plt.xlabel("Task")
    if plot_tokens:
        plt.ylabel("Tokens")
    elif plot_time:
        plt.ylabel("Time (minutes)")
    else:
        plt.ylabel("Percentage")

    os.makedirs("plots", exist_ok=True)
    plt.savefig(f"plots/{save_name}.pdf", bbox_inches='tight')
    plt.show()
if __name__ == "__main__":
    df = get_all_runs_results()

    get_plot(df, "improve_5", "Percentage of runs that improve objective by over 5% at any point", "improve_5")
    get_plot(df, "improve_10", "Percentage of runs that improve objective by over 10% at any point", "improve_10")
    get_plot(df, "final_improve_5", "Percentage of runs that improves objective by over 5% at the end", "final_improve_5")
    get_plot(df, "final_improve_10", "Percentage of runs that improves objective by over 10% at the end", "final_improve_10")
    get_plot(df, "final_improve_30", "Percentage of runs that improves objective by over 30% at the end", "final_improve_30")
    get_plot(df, "final_improve", "Average improvement in objective among runs that made a submission at the end.", "final_improve")
    get_plot(df[df["submitted_final_answer"]], "final_improve", "Average improvement in objective among runs that made a final submission.", "final_improve_submitted")

    # These columns only exist when the token-counting lines in
    # get_all_runs_results are uncommented; skip the plots otherwise.
    if "total_tokens" in df.columns:
        get_plot(df, "total_tokens", "", "total_tokens", plot_tokens= True)
    if "total_time" in df.columns:
        get_plot(df, "total_time", "", "total_time",plot_time=True)