# NOTE(review): the following non-Python lines are Hugging Face file-viewer
# chrome (avatar caption, commit hash, "raw / history blame", file size) that
# leaked into the source during extraction; commented out so the module parses.
# StringChaos's picture
# question id and problem_idx
# 0df4202
# raw
# history blame
# 4.03 kB
import os
import random
import glob
import json
import numpy as np
from flask import Flask, render_template, request
# Flask app serving a browser UI for inspecting per-model evaluation results
# on a set of coding problems.
app = Flask(__name__)

# Problem metadata; each entry is a dict with at least "question_title",
# "difficulty", and "question_id" keys (see usage below).
with open("problems.json") as f:
    problems = json.load(f)
problem_choices = [q["question_title"] for q in problems]

# Display order of the problems. Shuffling is currently disabled, so this is
# the identity permutation; re-enable the two commented lines to randomize.
random_idxs = list(range(len(problems)))
# random.seed(42)
# random.shuffle(random_idxs)
problems = [problems[idx] for idx in random_idxs]

# Raw model outputs keyed by model name. Each model maps to a per-problem
# sequence whose entries carry "pass1_list", "code_list", and "metadata_list"
# (see usage below) — assumes entries are indexable by problem index;
# TODO confirm against the all_outputs.json schema.
with open("all_outputs.json") as f:
    all_outputs = json.load(f)
all_models = list(all_outputs.keys())
num_questions_filtered = len(problems)

# Mean pass@1 per problem per model, as a fraction in [0, 1].
all_correctness_by_problem = {
    idx: {model: np.mean(all_outputs[model][idx]["pass1_list"]) for model in all_models}
    for idx in random_idxs
}
def calculate_color(performance):
    """Map a pass@1 fraction in [0, 1] to an ``rgba()`` CSS color string.

    High scores render green and low scores red; in the two middle bands the
    alpha channel encodes how strong (or weak) the score is.

    Args:
        performance: mean pass@1 for one (problem, model) pair, in [0, 1].

    Returns:
        An ``rgba(r, g, b, a)`` string suitable for inline CSS.
    """
    if performance > 0.75:
        # Solid green for strong results (f-prefix removed: constant string).
        return "rgba(0, 150, 0, 0.5)"
    elif performance > 0.5:
        # Greenish; more opaque as the score rises.
        return f"rgba(50, 150, 0, {performance})"
    elif performance > 0.25:
        # Reddish; more opaque as the score falls.
        return f"rgba(150, 50, 0, {1-performance})"
    else:
        # Solid red for weak results.
        return "rgba(150, 0, 0, 0.5)"
# One row per problem, in display order:
# (display_index, {model: {"correctness", "correctness_color"}}, difficulty, question_id)
all_evaluations_by_problem_colored = []
for trueidx, idx in enumerate(random_idxs):
    per_model = {}
    for model in all_models:
        score = all_correctness_by_problem[idx][model]
        per_model[model] = {
            "correctness": f"{score*100:.1f}",
            "correctness_color": calculate_color(score),
        }
    all_evaluations_by_problem_colored.append(
        (
            trueidx,
            per_model,
            problems[idx]["difficulty"],
            problems[idx]["question_id"],
        )
    )
# For each model: a list (indexed by display position) of per-sample dicts
# pairing each generated solution with its pass@1 result and metadata.
all_data_for_view_formatted = {}
for model, resp in all_outputs.items():
    per_problem = []
    for idx in random_idxs:
        row = resp[idx]
        samples = [
            {"code": code, "pass1": passed, "metadata": meta}
            for code, passed, meta in zip(
                row["code_list"], row["pass1_list"], row["metadata_list"]
            )
        ]
        per_problem.append(samples)
    all_data_for_view_formatted[model] = per_problem
@app.route("/")
def home():
    """Render the overview grid: every problem crossed with every model."""
    print(all_models)  # debug: log the configured model names per request
    context = {
        "models": all_models,
        "problems": all_evaluations_by_problem_colored,
    }
    return render_template("index.html", **context)
@app.route("/problem/<int:problem_idx>")
def problem(problem_idx):
    """Render the detail page for one problem across all models."""
    row = all_evaluations_by_problem_colored[problem_idx]
    per_model_outputs = {
        model: all_data_for_view_formatted[model][problem_idx]
        for model in all_models
    }
    return render_template(
        "problem.html",
        problem_idx=problem_idx,
        question_id=row[3],  # original question id from problems.json
        evaluation=row[1],   # per-model correctness text + color
        models=all_models,
        question=problems[problem_idx],
        data=per_model_outputs,
    )
# Subset of model names shown on the "/mini" routes. Every entry must be a
# key of all_outputs.json (they index all_data_for_view_formatted below);
# commented entries are currently excluded from the mini view.
mini_models = [
    # "DeepSeek-V2",
    "DeepSeek-V3",
    "DeepSeek-R1-Preview",
    # "DSCoder-33b-Ins",
    # "GPT-4-Turbo-2024-04-09",
    "GPT-4O-2024-05-13",
    "Claude-3.5-Sonnet-20240620",
    "Gemini-Flash-2.0-Thinking",
    # "Gemini-Exp-1206",
    # "Claude-3-Sonnet",
    "O1-2024-12-17 (N=1) (High)",
    "QwQ-32B-Preview (N=1)",
]
@app.route("/mini")
def mini():
    """Render the overview grid restricted to the curated mini model set."""
    context = {
        "models": mini_models,
        "problems": all_evaluations_by_problem_colored,
    }
    return render_template("index_mini.html", **context)
@app.route("/problem_mini/<int:problem_idx>")
def problem_mini(problem_idx):
    """Render the single-problem detail page for the mini model set."""
    row = all_evaluations_by_problem_colored[problem_idx]
    per_model_outputs = {
        model: all_data_for_view_formatted[model][problem_idx]
        for model in mini_models
    }
    return render_template(
        "problem_mini.html",
        problem_idx=problem_idx,
        question_id=row[3],  # original question id from problems.json
        evaluation=row[1],   # per-model correctness text + color
        models=mini_models,
        question=problems[problem_idx],
        data=per_model_outputs,
    )
if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable from outside the host
    # (e.g. a container); port 7860 — presumably chosen for a hosted
    # deployment default, TODO confirm.
    app.run(host="0.0.0.0", port=7860)