|
import gradio as gr |
|
import pandas as pd |
|
import random |
|
import firebase_admin |
|
from firebase_admin import credentials |
|
from firebase_admin import firestore |
|
from trueskill import Rating |
|
import trueskill |
|
|
|
# Path of the CSV file read by list_models() and list_questions().
CSV_FILE_PATH = "qa_pairs.csv"

# Initialise the default Firebase app from the local credentials JSON file.
cred = credentials.Certificate("unlpboard_f.json")
firebase_admin.initialize_app(cred)
|
|
|
def list_models():
    """Return the distinct values of the ``model`` column of the QA CSV.

    The file at ``CSV_FILE_PATH`` is re-read on every call.

    Returns:
        list: unique model names, in order of first appearance.
    """
    qa_frame = pd.read_csv(CSV_FILE_PATH)
    model_column = qa_frame['model']
    return model_column.unique().tolist()
|
|
|
|
|
def list_questions():
    """Return the distinct values of the ``question`` column of the QA CSV.

    The file at ``CSV_FILE_PATH`` is re-read on every call.

    Returns:
        list: unique questions, in order of first appearance.
    """
    qa_frame = pd.read_csv(CSV_FILE_PATH)
    question_column = qa_frame['question']
    return question_column.unique().tolist()
|
|
|
def fetch_questions():
    """Load every document of the Firestore ``questions`` collection.

    Returns:
        list[dict]: one dict per streamed document, as given by ``to_dict()``.
    """
    snapshots = db.collection('questions').stream()
    return [snapshot.to_dict() for snapshot in snapshots]
|
|
|
|
|
def display_answers(question, model1, model2, df):
    """Look up the answers two models gave to one question.

    Args:
        question: Question text to match against ``df['question']``.
        model1: Name of the first model, matched against ``df['model']``.
        model2: Name of the second model, matched against ``df['model']``.
        df: DataFrame with ``question``, ``model`` and ``answer`` columns.

    Returns:
        tuple[str, str]: Markdown text for model 1 and model 2. Each is the
        first matching answer prefixed with ``**Answer:**``, or a
        "No answer available" placeholder when no row matches.
    """
    rendered = {
        model1: "No answer available for Model 1",
        model2: "No answer available for Model 2",
    }
    for name in (model1, model2):
        row_mask = (df['question'] == question) & (df['model'] == name)
        matches = df[row_mask]
        if matches.empty:
            continue
        # Only the first matching row is shown.
        rendered[name] = f"**Answer:**\n{matches['answer'].iloc[0]}"
    return rendered[model1], rendered[model2]
|
|
|
|
|
def update_b(q, m1, a1, m2, a2):
    """Start a voting session: draw the first round and enable voting.

    Wired to the "START!" button. Delegates to ``update_symbols`` for the new
    question/answers, then returns the three vote buttons as interactive and
    the START button as hidden.

    Args:
        q, m1, a1, m2, a2: Current values of the question, model-name and
            answer components; forwarded to ``update_symbols``.

    Returns:
        tuple: ``(q, m1, a1, m2, a2, b1, b2, b3, b4)`` component updates.
    """
    # Debug output of the module-level placeholders. The first line used to
    # print random_model2 under the "Model1" label.
    print('Model1: ', random_model1)
    print('Model2: ', random_model2)
    q, m1, a1, m2, a2 = update_symbols(q, m1, a1, m2, a2)
    b1 = gr.Button("Vote for Model 1", interactive=True)
    b2 = gr.Button("It’s a tie!", interactive=True)
    b3 = gr.Button("Vote for Model 2", interactive=True)
    b4 = gr.Button("START!", visible=False)
    return q, m1, a1, m2, a2, b1, b2, b3, b4
|
|
|
|
|
def update_symbols1(q, m1, a1, m2, a2):
    """Record a vote for Model 1 and serve the next comparison.

    Logs the vote, re-rates both models with TrueSkill (``m1`` as winner,
    ``m2`` as loser), increments the winner's ``win_count`` and the loser's
    ``loss_count`` in the ``votes`` collection, then draws a new round.

    Args:
        q: Question text currently shown.
        m1: Name of the first model (also its document id in ``votes``).
        a1: Answer shown for the first model.
        m2: Name of the second model (also its document id in ``votes``).
        a2: Answer shown for the second model.

    Returns:
        The five values produced by ``update_symbols`` for the next round.
    """
    print("Voted for Model 1")
    log_vote(
        model1=m1,
        model2=m2,
        question=q,
        output1=a1,
        output2=a2,
        outcome=m1,
    )

    votes_ref = db.collection('votes')
    winner_ref = votes_ref.document(m1)
    loser_ref = votes_ref.document(m2)

    # Read through to_dict() so that a missing document, or a document
    # without an 'elo_rating' field, yields None instead of raising;
    # Rating(None) then falls back to the TrueSkill default.
    winner_fields = winner_ref.get().to_dict() or {}
    loser_fields = loser_ref.get().to_dict() or {}
    winner_rating = Rating(winner_fields.get('elo_rating'))
    loser_rating = Rating(loser_fields.get('elo_rating'))

    # NOTE: only mu is persisted, so sigma restarts from the default on
    # every vote.
    # rate_1vs1 takes (winner, loser) and returns them in the same order.
    winner_rating, loser_rating = trueskill.rate_1vs1(winner_rating, loser_rating)

    # set(..., merge=True) creates the document when it is absent and leaves
    # unrelated fields untouched, so no exists-check is needed and a counter
    # write can no longer wipe the rating (or the reverse).
    winner_ref.set(
        {'win_count': firestore.Increment(1), 'elo_rating': winner_rating.mu},
        merge=True,
    )
    loser_ref.set(
        {'loss_count': firestore.Increment(1), 'elo_rating': loser_rating.mu},
        merge=True,
    )

    return update_symbols(q, m1, a1, m2, a2)
|
|
|
|
|
def update_symbols2(q, m1, a1, m2, a2):
    """Record a tie between the two models and serve the next comparison.

    Only the vote log is written; no counters or ratings are touched.

    Args:
        q: Question text currently shown.
        m1: Name of the first model.
        a1: Answer shown for the first model.
        m2: Name of the second model.
        a2: Answer shown for the second model.

    Returns:
        The five values produced by ``update_symbols`` for the next round.
    """
    print("Voted for Spare")
    tie_record = dict(
        model1=m1,
        model2=m2,
        question=q,
        output1=a1,
        output2=a2,
        outcome='tie',
    )
    log_vote(**tie_record)
    next_round = update_symbols(q, m1, a1, m2, a2)
    return next_round
|
|
|
def update_symbols3(q, m1, a1, m2, a2):
    """Record a vote for Model 2 and serve the next comparison.

    Logs the vote, re-rates both models with TrueSkill (``m2`` as winner,
    ``m1`` as loser), increments the winner's ``win_count`` and the loser's
    ``loss_count`` in the ``votes`` collection, then draws a new round.

    Args:
        q: Question text currently shown.
        m1: Name of the first model (also its document id in ``votes``).
        a1: Answer shown for the first model.
        m2: Name of the second model (also its document id in ``votes``).
        a2: Answer shown for the second model.

    Returns:
        The five values produced by ``update_symbols`` for the next round.
    """
    print("Voted for Model 2")
    log_vote(
        model1=m1,
        model2=m2,
        question=q,
        output1=a1,
        output2=a2,
        outcome=m2,
    )

    votes_ref = db.collection('votes')
    winner_ref = votes_ref.document(m2)
    loser_ref = votes_ref.document(m1)

    # Read through to_dict() so that a missing document, or a document
    # without an 'elo_rating' field, yields None instead of raising;
    # Rating(None) then falls back to the TrueSkill default.
    winner_fields = winner_ref.get().to_dict() or {}
    loser_fields = loser_ref.get().to_dict() or {}
    winner_rating = Rating(winner_fields.get('elo_rating'))
    loser_rating = Rating(loser_fields.get('elo_rating'))

    # rate_1vs1 takes (winner, loser) and returns them in the same order.
    # The result used to be unpacked the wrong way round, which stored the
    # loser's new rating on the winning model and vice versa.
    # NOTE: only mu is persisted, so sigma restarts from the default on
    # every vote.
    winner_rating, loser_rating = trueskill.rate_1vs1(winner_rating, loser_rating)

    # set(..., merge=True) creates the document when it is absent and leaves
    # unrelated fields untouched, so no exists-check is needed and a counter
    # write can no longer wipe the rating (or the reverse).
    winner_ref.set(
        {'win_count': firestore.Increment(1), 'elo_rating': winner_rating.mu},
        merge=True,
    )
    loser_ref.set(
        {'loss_count': firestore.Increment(1), 'elo_rating': loser_rating.mu},
        merge=True,
    )

    return update_symbols(q, m1, a1, m2, a2)
|
|
|
def update_symbols(q, m1, a1, m2, a2):
    """Draw a fresh random question and two distinct models for a new round.

    The incoming values are not read; the parameters exist so the function
    can be used with the same inputs as the vote handlers.

    Args:
        q, m1, a1, m2, a2: Current component values (ignored).

    Returns:
        tuple: Markdown components for the question, hidden model-1 name,
        model-1 answer, hidden model-2 name and model-2 answer.
    """
    next_question = random.choice(questions)
    left_model, right_model = random.sample(models, 2)
    left_answer, right_answer = display_answers(
        next_question, left_model, right_model, combined_df
    )

    # Model names travel in hidden Markdown components so the vote handlers
    # receive them back as inputs without revealing them to the voter.
    hidden_left = gr.Markdown(f"{left_model}", visible=False)
    left_panel = gr.Markdown(left_answer)
    question_panel = gr.Markdown(f"{next_question}")
    hidden_right = gr.Markdown(f"{right_model}", visible=False)
    right_panel = gr.Markdown(right_answer)

    return question_panel, hidden_left, left_panel, hidden_right, right_panel
|
|
|
def update_total_votes():
    """Add one to the ``count`` field of the ``votes/total`` document.

    Uses a server-side ``Increment`` with ``set(..., merge=True)``, which
    creates the document on first use and increments it afterwards in a
    single write. This replaces a read-then-write sequence in which two
    concurrent first votes could both see "missing" and both write ``1``.
    """
    total_ref = db.collection('votes').document('total')
    total_ref.set({'count': firestore.Increment(1)}, merge=True)
|
|
|
def log_vote(model1, model2, question, output1, output2, outcome):
    """Append one vote record to the Firestore ``votes_log`` collection.

    Args:
        model1: Name of the first model shown.
        model2: Name of the second model shown.
        question: Question text that was shown.
        output1: Answer shown for the first model.
        output2: Answer shown for the second model.
        outcome: Value stored as the vote result (the callers pass the
            winning model's name or ``'tie'``).
    """
    vote_data = dict(
        model1=model1,
        model2=model2,
        question=question,
        output1=output1,
        output2=output2,
        outcome=outcome,
        # Filled in by Firestore when the write is committed.
        timestamp=firestore.SERVER_TIMESTAMP,
    )
    db.collection('votes_log').add(vote_data)
|
|
|
|
|
def fetch_and_format_leaderboard():
    """Build the leaderboard table from the Firestore ``votes`` collection.

    Each document is one model; its win rate is
    ``win_count / (win_count + loss_count) * 100`` (0 when it has no
    matches). Rows are sorted by win rate, highest first, and ranked from 1.

    Returns:
        pandas.DataFrame: columns ``Rank``, ``model``, ``win_rate``,
        ``elo_rating``; empty (with those columns) when there are no models.
    """
    columns = ['Rank', 'model', 'win_rate', 'elo_rating']

    leaderboard = []
    for doc in db.collection('votes').stream():
        # 'total' is the aggregate counter written by update_total_votes(),
        # not a model, so it must not appear as a leaderboard row.
        if doc.id == 'total':
            continue

        model_data = doc.to_dict() or {}
        win_count = model_data.get('win_count', 0)
        loss_count = model_data.get('loss_count', 0)
        total_matches = win_count + loss_count
        win_rate = (win_count / total_matches) * 100 if total_matches > 0 else 0

        leaderboard.append({
            "model": doc.id,
            "win_rate": win_rate,
            "elo_rating": model_data.get('elo_rating', 0),
        })

    leaderboard.sort(key=lambda row: row['win_rate'], reverse=True)

    # Rank by position instead of a fixed [1..6] list, which raised
    # ValueError whenever the collection did not hold exactly six documents.
    for rank, row in enumerate(leaderboard, start=1):
        row['Rank'] = rank

    # Passing `columns` fixes the column order and keeps the frame well-formed
    # even when there are no rows.
    return pd.DataFrame(leaderboard, columns=columns)
|
|
|
|
|
|
|
|
|
|
|
# Firestore client used by every function in this module that reads or writes
# collections (questions, votes, votes_log and the per-model answer sets).
db = firestore.client()
|
|
|
def fetch_questions_c(collection):
    """Load every document of the named Firestore collection.

    Args:
        collection: Name of the Firestore collection to read.

    Returns:
        list[dict]: one dict per streamed document, as given by ``to_dict()``.
    """
    snapshots = db.collection(collection).stream()
    return [snapshot.to_dict() for snapshot in snapshots]
|
|
|
# Each model name doubles as the Firestore collection holding that model's
# answers and as the model's document id in the 'votes' collection.
models = ['codekobzar', 'gpt-4', 'llama-2-70b-chat', 'sherlock-no-rag', 'sherlock-rag', 'ukrainenow']

# Load every model's answers and tag each row with the model it came from.
model_frames = []
for model in models:
    frame = pd.DataFrame(fetch_questions_c(model))
    frame['model'] = model
    model_frames.append(frame)

combined_df = pd.concat(model_frames, ignore_index=True)
# 'input' is not used by the app; tolerate collections that do not carry it.
combined_df.drop('input', axis=1, inplace=True, errors='ignore')
combined_df.rename(columns={'instruction': 'question', 'output': 'answer'}, inplace=True)

# Make sure every model has a stats document before the first vote.
votes_ref = db.collection('votes')
for model in models:
    vote_doc = votes_ref.document(model).get()
    if vote_doc.exists:
        print("-------")
    else:
        # A single set() with all fields: consecutive set() calls each replace
        # the whole document, so only the last field would survive.
        votes_ref.document(model).set({'win_count': 0, 'loss_count': 0, 'elo_rating': 25})

# Placeholder content shown until the user presses START.
random_question = 'Click any button to start!'
random_model1, random_model2 = '1', '2'
answer1, answer2 = display_answers(random_question, random_model1, random_model2, combined_df)

# Pool of question texts that update_symbols() samples from.
questions = [question['question_text'] for question in fetch_questions()]
|
|
|
|
|
def create_app():
    """Build the Gradio Blocks UI for pairwise model voting.

    Lays out the question, two answer columns, three vote buttons, a START
    button and a leaderboard table with a refresh button, and wires each
    button to its handler.

    Returns:
        gr.Blocks: the assembled (not yet launched) application.
    """
    print('-----------------------')
    print(random_question)
    print(random_model1)
    print('-----!!!!!!!!!!!!!')

    # NOTE(review): Row/Column nesting below is the natural reading of the
    # component order (two answer columns side by side) — confirm the layout.
    with gr.Blocks() as app:
        q = gr.Markdown(f"### Question: {random_question}")

        with gr.Row():
            with gr.Column():
                m1 = gr.Markdown(f"{random_model1}", visible=False)
                a1 = gr.Markdown(answer1)

            with gr.Column():
                m2 = gr.Markdown(f"{random_model2}", visible=False)
                a2 = gr.Markdown(answer2)

        # Voting stays disabled until START has loaded a real question.
        with gr.Row():
            b1 = gr.Button("Vote for Model 1", interactive=False)
            b2 = gr.Button("It’s a tie!", interactive=False)
            b3 = gr.Button("Vote for Model 2", interactive=False)

        with gr.Row():
            b4 = gr.Button("START!", interactive=True)

        leaderboard_display = gr.Dataframe(
            value=fetch_and_format_leaderboard(),
            label="Leaderboard",
        )

        # Every handler takes and returns the same five round components;
        # START additionally updates the four buttons.
        round_fields = (q, m1, a1, m2, a2)
        b4.click(
            update_b,
            inputs=list(round_fields),
            outputs=[*round_fields, b1, b2, b3, b4],
        )
        vote_handlers = (
            (b1, update_symbols1),
            (b2, update_symbols2),
            (b3, update_symbols3),
        )
        for vote_button, handler in vote_handlers:
            vote_button.click(handler, inputs=list(round_fields), outputs=list(round_fields))

        leaderboard_button = gr.Button("Refresh Leaderboard")
        leaderboard_button.click(
            fn=fetch_and_format_leaderboard,
            inputs=[],
            outputs=leaderboard_display,
        )

    return app
|
|
|
# Build the UI and start the Gradio server when this module is executed.
app = create_app()
app.launch()
|
|