kaikaidai committed on
Commit
d4256bf
·
verified ·
1 Parent(s): 44387c3

13-14 Nov changes

Browse files
Files changed (1) hide show
  1. app.py +201 -299
app.py CHANGED
@@ -4,13 +4,19 @@ import random
4
  from collections import defaultdict
5
  from datetime import datetime, timezone
6
  import hashlib
 
7
 
8
  from dotenv import load_dotenv
9
 
10
  load_dotenv()
11
 
12
  import gradio as gr
13
- from gen_api_answer import get_model_response, parse_model_response, get_random_human_ai_pair
 
 
 
 
 
14
  from db import add_vote, create_db_connection, get_votes
15
  from utils import Vote
16
  from common import (
@@ -26,12 +32,16 @@ from common import (
26
  EVAL_DESCRIPTION,
27
  VOTING_HEADER,
28
  )
29
- from example_metrics import EXAMPLE_METRICS
 
 
 
 
 
 
 
30
 
31
 
32
- # Model and ELO score data
33
- DEFAULT_ELO = 1200 # Starting ELO for new models
34
- K_FACTOR = 32 # Standard chess K-factor, adjust as needed
35
  elo_scores = defaultdict(lambda: DEFAULT_ELO)
36
  vote_counts = defaultdict(int)
37
 
@@ -143,6 +153,30 @@ def get_ip(request: gr.Request) -> str:
143
  return hashlib.sha256(ip.encode()).hexdigest()[:16]
144
 
145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  def vote(
147
  choice,
148
  model_a,
@@ -192,16 +226,20 @@ def vote(
192
  store_vote_data(
193
  final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
194
  )
195
-
 
 
 
196
  # Return updates for UI components
197
  return [
198
- gr.update(visible=False), # vote_a
199
- gr.update(visible=False), # vote_b
200
- gr.update(visible=False), # tie_button_row
201
  gr.update(value=f"*Model: {model_a}*"), # model_name_a
202
  gr.update(value=f"*Model: {model_b}*"), # model_name_b
203
- gr.update(interactive=True, value="Run the evaluators", variant="primary"), # send_btn
204
- gr.update(visible=True), # spacing_div
 
205
  ]
206
 
207
 
@@ -210,150 +248,24 @@ def get_current_votes():
210
  return get_votes(db)
211
 
212
 
213
def get_leaderboard(show_preliminary=True):
    """Compute the leaderboard rows from the votes currently in the database.

    Every vote returned by ``get_current_votes()`` is replayed in order,
    updating a running ELO rating (starting at ``DEFAULT_ELO``, step size
    ``K_FACTOR``) for both models involved. Votes with a missing field, or
    naming a model that is not in ``model_data``, are ignored.

    Args:
        show_preliminary: When False, models with fewer than 500 votes are
            left out of the result.

    Returns:
        A list of dicts (keys ``Model``, ``ELO Score``, ``95% CI``,
        ``# Votes``, ``Organization``, ``License``), ordered by ELO score
        with the highest first.
    """
    all_votes = get_current_votes()
    print(f"Fetched {len(all_votes)} votes from database")  # Debug log

    # Running state while replaying the votes.
    elo_by_model = defaultdict(lambda: DEFAULT_ELO)
    votes_by_model = defaultdict(int)

    for record in all_votes:
        try:
            first = record.get("model_a")
            second = record.get("model_b")
            outcome = record.get("winner")

            # Only votes that are complete and refer to known models count.
            usable = (
                all((first, second, outcome))
                and first in model_data
                and second in model_data
            )
            if not usable:
                continue

            votes_by_model[first] += 1
            votes_by_model[second] += 1

            rating_first = elo_by_model[first]
            rating_second = elo_by_model[second]

            # Expected score of each side under the ELO model.
            expected_first = 1 / (1 + 10 ** ((rating_second - rating_first) / 400))
            expected_second = 1 - expected_first

            # Actual score: win = 1, loss = 0, anything else is a draw.
            if outcome == "A":
                actual_first = 1
            elif outcome == "B":
                actual_first = 0
            else:
                actual_first = 0.5
            actual_second = 1 - actual_first

            elo_by_model[first] += K_FACTOR * (actual_first - expected_first)
            elo_by_model[second] += K_FACTOR * (actual_second - expected_second)

        except Exception as e:
            # A malformed vote must not abort the whole computation.
            print(f"Error processing vote: {e}")
            continue

    rows = []
    for name in model_data:
        vote_total = votes_by_model[name]
        if vote_total < 500 and not show_preliminary:
            continue

        rating = elo_by_model[name]
        if vote_total > 0:
            interval = 1.96 * (400 / (vote_total + 1) ** 0.5)
        else:
            interval = 0

        rows.append(
            {
                "Model": name,
                "ELO Score": f"{int(rating)}",
                "95% CI": f"±{int(interval)}",
                "# Votes": vote_total,
                "Organization": model_data[name]["organization"],
                "License": model_data[name]["license"],
            }
        )

    # Highest (displayed) ELO score first.
    return sorted(rows, key=lambda row: float(row["ELO Score"]), reverse=True)
286
-
287
-
288
def calculate_elo_change(rating_a, rating_b, winner):
    """Return the ELO adjustments ``(change_a, change_b)`` for one match.

    Args:
        rating_a: Current rating of side A.
        rating_b: Current rating of side B.
        winner: ``"A"`` or ``"B"`` for a decisive result; any other value
            is scored as a tie.

    Returns:
        A 2-tuple with the amount to add to A's and to B's rating, each
        scaled by ``K_FACTOR``.
    """
    # Probability-style expected scores from the rating gap.
    anticipated_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    anticipated_b = 1 - anticipated_a

    # Realised scores: 1 for a win, 0 for a loss, 0.5 each for a tie.
    realised_a = 1 if winner == "A" else (0 if winner == "B" else 0.5)
    realised_b = 1 - realised_a

    return (
        K_FACTOR * (realised_a - anticipated_a),
        K_FACTOR * (realised_b - anticipated_b),
    )
304
-
305
-
306
def update_leaderboard():
    """Count how many stored votes each known model took part in, logging progress.

    Votes are fetched with ``get_current_votes()``. A vote is counted only
    when ``model_a``, ``model_b`` and ``winner`` are all present and both
    models exist in ``model_data``; every skipped vote is reported on stdout.

    NOTE(review): the tally is built and printed but never returned, so
    callers receive None.
    """
    stored_votes = get_current_votes()
    print(f"Found {len(stored_votes)} votes in database")
    appearances = defaultdict(int)

    for record in stored_votes:
        try:
            first = record.get("model_a")
            second = record.get("model_b")
            outcome = record.get("winner")

            print(f"Processing vote: {first} vs {second}, winner: {outcome}")

            # Reject incomplete votes before looking the models up.
            if not (first and second and outcome):
                print(f"Missing required fields in vote: {record}")
                continue

            if first not in model_data:
                print(f"Model A '{first}' not found in model_data")
                continue

            if second not in model_data:
                print(f"Model B '{second}' not found in model_data")
                continue

            appearances[first] += 1
            appearances[second] += 1
            print(
                f"Updated matches - {first}: {appearances[first]}, {second}: {appearances[second]}"
            )
        except Exception as e:
            # Keep going: one bad document should not stop the tally.
            print(f"Error processing vote: {e}")
            print(f"Problematic vote data: {record}")
346
-
347
-
348
- # Update the display_leaderboard function
349
def display_leaderboard():
    """Wrap the result of ``update_leaderboard()`` in a ``gr.DataFrame``.

    The table is given one more row than the data has, with a dynamic
    row count.

    NOTE(review): ``headers`` lists six columns while ``datatype`` lists
    seven entries — confirm which one is intended.
    """
    table = update_leaderboard()
    column_titles = ["Model", "ELO", "95% CI", "Matches", "Organization", "License"]
    column_types = ["str", "number", "str", "number", "str", "str", "str"]
    rows_shown = (len(table) + 1, "dynamic")
    return gr.DataFrame(
        value=table,
        headers=column_titles,
        datatype=column_types,
        row_count=rows_shown,
    )
357
 
358
 
359
  # Update the leaderboard table definition in the UI
@@ -363,63 +275,22 @@ leaderboard_table = gr.Dataframe(
363
  )
364
 
365
 
366
def get_leaderboard_stats():
    """Return a Markdown snippet summarising the leaderboard.

    The snippet reports the number of entries in ``model_data``, the number
    of votes returned by ``get_current_votes()``, and the current UTC time
    truncated to the start of the hour.
    """
    current_time = datetime.now(timezone.utc)
    vote_total = len(get_current_votes())
    model_total = len(model_data)

    # Only hour precision is shown, so drop minutes and below first.
    start_of_hour = current_time.replace(minute=0, second=0, microsecond=0)
    last_updated = start_of_hour.strftime("%B %d, %Y at %H:00 UTC")

    return f"""
### Leaderboard Stats
- **Total Models**: {model_total}
- **Total Votes**: {vote_total}
- **Last Updated**: {last_updated}
"""
381
-
382
-
383
- #def set_example_metric(metric_name):
384
- # if metric_name == "Custom":
385
- # variables = parse_variables(DEFAULT_EVAL_PROMPT)
386
- # variable_values = []
387
- # for var in variables:
388
- # if var == "input":
389
- # variable_values.append(DEFAULT_INPUT)
390
- # elif var == "response":
391
- # variable_values.append(DEFAULT_RESPONSE)
392
- # else:
393
- # variable_values.append("") # Default empty value
394
- # Pad variable_values to match the length of variable_rows
395
- # while len(variable_values) < len(variable_rows):
396
- # variable_values.append("")
397
- # return [DEFAULT_EVAL_PROMPT] + variable_values
398
-
399
- # metric_data = EXAMPLE_METRICS[metric_name]
400
- # variables = parse_variables(metric_data["prompt"])
401
- # variable_values = []
402
- # for var in variables:
403
- # value = metric_data.get(var, "") # Default to empty string if not found
404
- # variable_values.append(value)
405
- # Pad variable_values to match the length of variable_rows
406
- # while len(variable_values) < len(variable_rows):
407
- # variable_values.append("")
408
- # return [metric_data["prompt"]] + variable_values
409
-
410
-
411
- # Select random metric at startup
412
- # def get_random_metric():
413
- # metrics = list(EXAMPLE_METRICS.keys())
414
- # return set_example_metric(random.choice(metrics))
415
-
416
-
417
  def populate_random_example(request: gr.Request):
418
- """Generate a random human-AI conversation example."""
419
  human_msg, ai_msg = get_random_human_ai_pair()
420
  return [
421
  gr.update(value=human_msg),
422
- gr.update(value=ai_msg)
 
 
 
 
 
 
 
 
 
 
423
  ]
424
 
425
 
@@ -435,27 +306,35 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
435
 
436
  with gr.Tabs():
437
  with gr.TabItem("Judge Arena"):
438
- random_btn = gr.Button("🎲", scale=0)
439
  with gr.Row():
440
  # Left side - Input section
441
  with gr.Column(scale=1):
442
  with gr.Group():
443
  human_input = gr.TextArea(
444
  label="👩 Human Input",
445
- lines=13,
446
  placeholder="Enter the human message here..."
447
  )
 
 
 
 
 
 
448
 
449
  ai_response = gr.TextArea(
450
  label="🤖 AI Response",
451
- lines=13,
452
  placeholder="Enter the AI response here..."
453
  )
454
 
 
 
455
  send_btn = gr.Button(
456
- value="Run the evaluators",
457
  variant="primary",
458
- size="lg"
 
459
  )
460
 
461
  # Right side - Model outputs
@@ -466,17 +345,14 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
466
  with gr.Row():
467
  with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
468
  score_a = gr.Textbox(label="Score", lines=6, interactive=False)
469
- vote_a = gr.Button("Vote A", variant="primary", visible=False)
470
  with gr.Column(scale=9, min_width=400): # Wider width for critique
471
  critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
472
 
473
- # Spacing div that's visible only when tie button is hidden
474
- spacing_div = gr.HTML('<div style="height: 42px;"></div>', visible=True, elem_id="spacing-div")
475
-
476
  # Tie button row
477
- with gr.Row(visible=False) as tie_button_row:
478
  with gr.Column():
479
- vote_tie = gr.Button("Tie", variant="secondary")
480
 
481
 
482
  gr.Markdown("### 🧑‍⚖️ Judge B")
@@ -485,13 +361,17 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
485
  with gr.Row():
486
  with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
487
  score_b = gr.Textbox(label="Score", lines=6, interactive=False)
488
- vote_b = gr.Button("Vote B", variant="primary", visible=False)
489
  with gr.Column(scale=9, min_width=400): # Wider width for critique
490
  critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
491
  # Place Vote B button directly under Judge B
492
 
493
  gr.Markdown("<br>")
494
 
 
 
 
 
495
  # Add spacing and acknowledgements at the bottom
496
  gr.Markdown(ACKNOWLEDGEMENTS)
497
 
@@ -510,24 +390,6 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
510
  datatype=["str", "number", "str", "number", "str", "str", "str"],
511
  )
512
 
513
- # Update refresh_leaderboard to use the checkbox value
514
def refresh_leaderboard(show_preliminary):
    """Rebuild the leaderboard table rows and the stats text.

    Args:
        show_preliminary: Passed through to ``get_leaderboard``.

    Returns:
        Two ``gr.update`` objects: the table rows (one list per model, with
        the ELO score converted to ``float``) and the stats text from
        ``get_leaderboard_stats()``.
    """
    table_rows = []
    for item in get_leaderboard(show_preliminary):
        table_rows.append(
            [
                item["Model"],
                float(item["ELO Score"]),
                item["95% CI"],
                item["# Votes"],
                item["Organization"],
                item["License"],
            ]
        )

    summary = get_leaderboard_stats()
    return [gr.update(value=table_rows), gr.update(value=summary)]
530
-
531
  # Add change handler for checkbox
532
  show_preliminary.change(
533
  fn=refresh_leaderboard,
@@ -551,35 +413,35 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
551
  final_prompt_state = gr.State()
552
 
553
  # Update variable inputs based on the eval prompt
554
def update_variables(eval_prompt):
    """Build the visibility/label updates for the variable input rows.

    The prompt is parsed with ``parse_variables``. For each entry of
    ``variable_rows`` two updates are produced (row, then input): rows that
    line up with a parsed variable are shown and labelled with its name,
    the remaining rows are hidden and their input cleared.

    Returns:
        A flat list with two ``gr.update`` objects per entry of
        ``variable_rows``.
    """
    names = parse_variables(eval_prompt)
    result = []

    for position, (_row, _field) in enumerate(variable_rows):
        if position >= len(names):
            # No variable for this slot: hide it and wipe any stale value.
            result.append(gr.update(visible=False))
            result.append(gr.update(value="", visible=False))
            continue

        label = names[position]
        # The "response" variable gets a taller box than the others.
        height = 4 if label == "response" else 1
        result.append(gr.update(visible=True))
        result.append(gr.update(label=label, visible=True, lines=height))

    return result
583
 
584
  #eval_prompt.change(
585
  # fn=update_variables,
@@ -619,7 +481,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
619
  vote_a.click(
620
  fn=vote,
621
  inputs=[
622
- gr.State("A"), # Choice
623
  model_a_state,
624
  model_b_state,
625
  final_prompt_state,
@@ -631,18 +493,19 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
631
  outputs=[
632
  vote_a,
633
  vote_b,
634
- tie_button_row,
635
  model_name_a,
636
  model_name_b,
637
  send_btn,
638
- spacing_div,
 
639
  ],
640
  )
641
 
642
  vote_b.click(
643
  fn=vote,
644
  inputs=[
645
- gr.State("B"), # Choice
646
  model_a_state,
647
  model_b_state,
648
  final_prompt_state,
@@ -654,18 +517,19 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
654
  outputs=[
655
  vote_a,
656
  vote_b,
657
- tie_button_row,
658
  model_name_a,
659
  model_name_b,
660
  send_btn,
661
- spacing_div,
 
662
  ],
663
  )
664
 
665
  vote_tie.click(
666
  fn=vote,
667
  inputs=[
668
- gr.State("Tie"), # Choice
669
  model_a_state,
670
  model_b_state,
671
  final_prompt_state,
@@ -677,11 +541,12 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
677
  outputs=[
678
  vote_a,
679
  vote_b,
680
- tie_button_row,
681
  model_name_a,
682
  model_name_b,
683
  send_btn,
684
- spacing_div,
 
685
  ],
686
  )
687
 
@@ -717,21 +582,20 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
717
  critique_a,
718
  score_b,
719
  critique_b,
720
- gr.update(visible=True), # vote_a
721
- gr.update(visible=True), # vote_b
722
- gr.update(visible=True), # tie_button_row
723
  model_a,
724
  model_b,
725
- final_prompt, # Add final_prompt to state
726
  gr.update(value="*Model: Hidden*"),
727
  gr.update(value="*Model: Hidden*"),
728
- # Change the button to "Regenerate" mode after evaluation
729
  gr.update(
730
- value="Regenerate with different models",
731
  variant="secondary",
732
  interactive=True
733
  ),
734
- gr.update(visible=False), # spacing_div
735
  )
736
 
737
  send_btn.click(
@@ -744,29 +608,29 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
744
  critique_b,
745
  vote_a,
746
  vote_b,
747
- tie_button_row,
748
  model_a_state,
749
  model_b_state,
750
  final_prompt_state,
751
  model_name_a,
752
  model_name_b,
753
  send_btn,
754
- spacing_div,
755
  ],
756
  )
757
 
758
  # Update the input change handlers to also disable regenerate button
759
def handle_input_changes(prompt, *variables):
    """Decide button interactivity after the prompt or a variable changes.

    The current inputs are compared with ``last_submission.value``.

    Returns:
        Two ``gr.update`` objects: the first always sets
        ``interactive=True``; the second sets ``interactive`` to True only
        when the inputs are unchanged since the last submission.
    """
    previous = last_submission.value
    snapshot = {"prompt": prompt, "variables": variables}
    differs = previous != snapshot

    always_enabled = gr.update(interactive=True)
    enabled_if_unchanged = gr.update(interactive=not differs)
    return [always_enabled, enabled_if_unchanged]
770
 
771
  # Update the change handlers for prompt and variables
772
  #eval_prompt.change(
@@ -813,24 +677,62 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
813
  random_btn.click(
814
  fn=populate_random_example,
815
  inputs=[],
816
- outputs=[human_input, ai_response]
 
 
 
 
 
 
 
 
 
 
 
 
 
817
  )
818
 
819
  # Add new input change handlers
820
  def handle_input_change():
821
- return gr.update(value="Run the evaluators", variant="primary")
 
 
 
 
 
 
 
822
 
823
  # Update the change handlers for inputs
824
  human_input.change(
825
  fn=handle_input_change,
826
  inputs=[],
827
- outputs=[send_btn]
828
  )
829
 
830
  ai_response.change(
831
  fn=handle_input_change,
832
  inputs=[],
833
- outputs=[send_btn]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
834
  )
835
 
836
  # Update the demo.load to include the random example population
 
4
  from collections import defaultdict
5
  from datetime import datetime, timezone
6
  import hashlib
7
+ from typing import Dict, List
8
 
9
  from dotenv import load_dotenv
10
 
11
  load_dotenv()
12
 
13
  import gradio as gr
14
+ from gen_api_answer import (
15
+ get_model_response,
16
+ parse_model_response,
17
+ get_random_human_ai_pair,
18
+ generate_ai_response
19
+ )
20
  from db import add_vote, create_db_connection, get_votes
21
  from utils import Vote
22
  from common import (
 
32
  EVAL_DESCRIPTION,
33
  VOTING_HEADER,
34
  )
35
+ from leaderboard import (
36
+ get_leaderboard,
37
+ get_leaderboard_stats,
38
+ calculate_elo_change,
39
+ get_model_rankings,
40
+ DEFAULT_ELO,
41
+ K_FACTOR
42
+ )
43
 
44
 
 
 
 
45
  elo_scores = defaultdict(lambda: DEFAULT_ELO)
46
  vote_counts = defaultdict(int)
47
 
 
153
  return hashlib.sha256(ip.encode()).hexdigest()[:16]
154
 
155
 
156
def get_vote_message(choice: str, model_a: str, model_b: str) -> str:
    """Build the thank-you text shown after a vote, relating it to the leaderboard.

    The current leaderboard (including preliminary models) is recomputed from
    the stored votes and each model's position is looked up. A smaller
    position number is treated as "ahead".

    Args:
        choice: ``"A"``, ``"B"`` or ``"Tie"``. Any value other than ``"A"``
            or ``"Tie"`` is treated as a vote for model B.
        model_a: Name of the model behind judge A.
        model_b: Name of the model behind judge B.

    Returns:
        A message describing how the vote compares with the current
        rankings. If either model has no ranking, only the generic closing
        line is returned, so a meaningless position is never shown.
    """
    closing_line = "Your vote shapes the leaderboard, carry on voting responsibly :)"

    voting_data = get_current_votes()
    leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
    rankings = get_model_rankings(leaderboard)
    pos_a = rankings.get(model_a)
    pos_b = rankings.get(model_b)

    # A model missing from the rankings used to default to position 0, which
    # printed "#0" and made it look like it was ahead of every ranked model.
    if pos_a is None or pos_b is None:
        return closing_line

    if choice == "Tie":
        return (
            f"It's a tie! Currently, {model_a} ranks #{pos_a} and {model_b} ranks #{pos_b}. "
            f"\n{closing_line}"
        )

    # Resolve which side was picked once, then reason about chosen/rejected.
    if choice == "A":
        model_chosen, model_rejected = model_a, model_b
        pos_chosen, pos_rejected = pos_a, pos_b
    else:
        model_chosen, model_rejected = model_b, model_a
        pos_chosen, pos_rejected = pos_b, pos_a

    # The vote agrees with the leaderboard when the chosen model ranks ahead.
    if pos_chosen < pos_rejected:
        return (
            f"You're in touch with the community! {model_chosen} ranks #{pos_chosen} "
            f"ahead of {model_rejected} in #{pos_rejected}. \n{closing_line}"
        )
    return (
        f"You don't think like everyone else ;) {model_chosen} ranks #{pos_chosen} "
        f"which is behind {model_rejected} in #{pos_rejected}. \n{closing_line}"
    )
178
+
179
+
180
  def vote(
181
  choice,
182
  model_a,
 
226
  store_vote_data(
227
  final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
228
  )
229
+
230
+ # Generate vote message
231
+ message = get_vote_message(choice, model_a, model_b)
232
+
233
  # Return updates for UI components
234
  return [
235
+ gr.update(interactive=False, variant="primary" if choice == "A" else "secondary"), # vote_a
236
+ gr.update(interactive=False, variant="primary" if choice == "B" else "secondary"), # vote_b
237
+ gr.update(interactive=False, variant="primary" if choice == "Tie" else "secondary"), # vote_tie
238
  gr.update(value=f"*Model: {model_a}*"), # model_name_a
239
  gr.update(value=f"*Model: {model_b}*"), # model_name_b
240
+ gr.update(interactive=True, value="Regenerate judges", variant="secondary"), # send_btn
241
+ gr.update(value="🎲 New round", variant="primary"), # random_btn
242
+ gr.Info(message, title = "🥳 Thanks for your vote!"), # success message
243
  ]
244
 
245
 
 
248
  return get_votes(db)
249
 
250
 
251
+ # Update the refresh_leaderboard function
252
def refresh_leaderboard(show_preliminary):
    """Rebuild the leaderboard table rows and the stats text from fresh votes.

    Votes are fetched once with ``get_current_votes()`` and shared between
    ``get_leaderboard`` and ``get_leaderboard_stats``.

    Args:
        show_preliminary: Passed through to ``get_leaderboard``.

    Returns:
        Two ``gr.update`` objects: the table rows (one list per model, with
        the ELO score converted to ``float``) and the stats text.
    """
    votes = get_current_votes()

    table_rows = []
    for item in get_leaderboard(model_data, votes, show_preliminary):
        table_rows.append(
            [
                item["Model"],
                float(item["ELO Score"]),
                item["95% CI"],
                item["# Votes"],
                item["Organization"],
                item["License"],
            ]
        )

    summary = get_leaderboard_stats(model_data, votes)
    return [gr.update(value=table_rows), gr.update(value=summary)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
 
270
 
271
  # Update the leaderboard table definition in the UI
 
275
  )
276
 
277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  def populate_random_example(request: gr.Request):
279
+ """Generate a random human-AI conversation example and reset judge outputs."""
280
  human_msg, ai_msg = get_random_human_ai_pair()
281
  return [
282
  gr.update(value=human_msg),
283
+ gr.update(value=ai_msg),
284
+ gr.update(value="🎲", variant="secondary"), # Reset random button appearance
285
+ gr.update(value=""), # Clear score A
286
+ gr.update(value=""), # Clear critique A
287
+ gr.update(value=""), # Clear score B
288
+ gr.update(value=""), # Clear critique B
289
+ gr.update(interactive=False, variant="primary"), # Reset vote A
290
+ gr.update(interactive=False, variant="primary"), # Reset vote B
291
+ gr.update(interactive=False, variant="primary"), # Reset vote tie
292
+ gr.update(value="*Model: Hidden*"), # Reset model name A
293
+ gr.update(value="*Model: Hidden*"), # Reset model name B
294
  ]
295
 
296
 
 
306
 
307
  with gr.Tabs():
308
  with gr.TabItem("Judge Arena"):
 
309
  with gr.Row():
310
  # Left side - Input section
311
  with gr.Column(scale=1):
312
  with gr.Group():
313
  human_input = gr.TextArea(
314
  label="👩 Human Input",
315
+ lines=10,
316
  placeholder="Enter the human message here..."
317
  )
318
+ with gr.Row():
319
+ generate_btn = gr.Button(
320
+ "Generate AI Response",
321
+ size="sm",
322
+ interactive=False
323
+ )
324
 
325
  ai_response = gr.TextArea(
326
  label="🤖 AI Response",
327
+ lines=15,
328
  placeholder="Enter the AI response here..."
329
  )
330
 
331
+ with gr.Row():
332
+ random_btn = gr.Button("🎲", scale=2)
333
  send_btn = gr.Button(
334
+ value="Run judges",
335
  variant="primary",
336
+ size="lg",
337
+ scale=8
338
  )
339
 
340
  # Right side - Model outputs
 
345
  with gr.Row():
346
  with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
347
  score_a = gr.Textbox(label="Score", lines=6, interactive=False)
348
+ vote_a = gr.Button("Vote A", variant="primary", interactive=False)
349
  with gr.Column(scale=9, min_width=400): # Wider width for critique
350
  critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
351
 
 
 
 
352
  # Tie button row
353
+ with gr.Row() as tie_button_row:
354
  with gr.Column():
355
+ vote_tie = gr.Button("Tie", variant="primary", interactive=False)
356
 
357
 
358
  gr.Markdown("### 🧑‍⚖️ Judge B")
 
361
  with gr.Row():
362
  with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
363
  score_b = gr.Textbox(label="Score", lines=6, interactive=False)
364
+ vote_b = gr.Button("Vote B", variant="primary", interactive=False)
365
  with gr.Column(scale=9, min_width=400): # Wider width for critique
366
  critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
367
  # Place Vote B button directly under Judge B
368
 
369
  gr.Markdown("<br>")
370
 
371
+ # Add Evaluator Prompt Accordion
372
+ with gr.Accordion("📝 Evaluator Prompt", open=False):
373
+ gr.Markdown(f"```\n{DEFAULT_EVAL_PROMPT}\n```")
374
+
375
  # Add spacing and acknowledgements at the bottom
376
  gr.Markdown(ACKNOWLEDGEMENTS)
377
 
 
390
  datatype=["str", "number", "str", "number", "str", "str", "str"],
391
  )
392
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
  # Add change handler for checkbox
394
  show_preliminary.change(
395
  fn=refresh_leaderboard,
 
413
  final_prompt_state = gr.State()
414
 
415
  # Update variable inputs based on the eval prompt
416
+ #def update_variables(eval_prompt):
417
+ # variables = parse_variables(eval_prompt)
418
+ # updates = []
419
+
420
+ # for i in range(len(variable_rows)):
421
+ # var_row, var_input = variable_rows[i]
422
+ # if i < len(variables):
423
+ # var_name = variables[i]
424
+ # # Set the number of lines based on the variable name
425
+ # if var_name == "response":
426
+ # lines = 4 # Adjust this number as needed
427
+ # else:
428
+ # lines = 1 # Default to single line for other variables
429
+ # updates.extend(
430
+ # [
431
+ # gr.update(visible=True), # Show the variable row
432
+ # gr.update(
433
+ # label=var_name, visible=True, lines=lines
434
+ # ), # Update label and lines
435
+ # ]
436
+ # )
437
+ # else:
438
+ # updates.extend(
439
+ # [
440
+ # gr.update(visible=False), # Hide the variable row
441
+ # gr.update(value="", visible=False), # Clear value when hidden
442
+ # ]
443
+ # )
444
+ # return updates
445
 
446
  #eval_prompt.change(
447
  # fn=update_variables,
 
481
  vote_a.click(
482
  fn=vote,
483
  inputs=[
484
+ gr.State("A"),
485
  model_a_state,
486
  model_b_state,
487
  final_prompt_state,
 
493
  outputs=[
494
  vote_a,
495
  vote_b,
496
+ vote_tie,
497
  model_name_a,
498
  model_name_b,
499
  send_btn,
500
+ random_btn,
501
+ gr.State(), # placeholder for success message
502
  ],
503
  )
504
 
505
  vote_b.click(
506
  fn=vote,
507
  inputs=[
508
+ gr.State("B"),
509
  model_a_state,
510
  model_b_state,
511
  final_prompt_state,
 
517
  outputs=[
518
  vote_a,
519
  vote_b,
520
+ vote_tie,
521
  model_name_a,
522
  model_name_b,
523
  send_btn,
524
+ random_btn,
525
+ gr.State(), # placeholder for success message
526
  ],
527
  )
528
 
529
  vote_tie.click(
530
  fn=vote,
531
  inputs=[
532
+ gr.State("Tie"),
533
  model_a_state,
534
  model_b_state,
535
  final_prompt_state,
 
541
  outputs=[
542
  vote_a,
543
  vote_b,
544
+ vote_tie,
545
  model_name_a,
546
  model_name_b,
547
  send_btn,
548
+ random_btn,
549
+ gr.State(), # placeholder for success message
550
  ],
551
  )
552
 
 
582
  critique_a,
583
  score_b,
584
  critique_b,
585
+ gr.update(interactive=True, variant="primary"), # vote_a
586
+ gr.update(interactive=True, variant="primary"), # vote_b
587
+ gr.update(interactive=True, variant="primary"), # vote_tie
588
  model_a,
589
  model_b,
590
+ final_prompt,
591
  gr.update(value="*Model: Hidden*"),
592
  gr.update(value="*Model: Hidden*"),
 
593
  gr.update(
594
+ value="Regenerate judges",
595
  variant="secondary",
596
  interactive=True
597
  ),
598
+ gr.update(value="🎲"), # random_btn
599
  )
600
 
601
  send_btn.click(
 
608
  critique_b,
609
  vote_a,
610
  vote_b,
611
+ vote_tie,
612
  model_a_state,
613
  model_b_state,
614
  final_prompt_state,
615
  model_name_a,
616
  model_name_b,
617
  send_btn,
618
+ random_btn,
619
  ],
620
  )
621
 
622
  # Update the input change handlers to also disable regenerate button
623
+ # def handle_input_changes(prompt, *variables):
624
+ # """Enable send button and manage regenerate button based on input changes"""
625
+ # last_inputs = last_submission.value
626
+ # current_inputs = {"prompt": prompt, "variables": variables}
627
+ # inputs_changed = last_inputs != current_inputs
628
+ # return [
629
+ # gr.update(interactive=True), # send button always enabled
630
+ # gr.update(
631
+ # interactive=not inputs_changed
632
+ # ), # regenerate button disabled if inputs changed
633
+ # ]
634
 
635
  # Update the change handlers for prompt and variables
636
  #eval_prompt.change(
 
677
  random_btn.click(
678
  fn=populate_random_example,
679
  inputs=[],
680
+ outputs=[
681
+ human_input,
682
+ ai_response,
683
+ random_btn,
684
+ score_a,
685
+ critique_a,
686
+ score_b,
687
+ critique_b,
688
+ vote_a,
689
+ vote_b,
690
+ vote_tie,
691
+ model_name_a,
692
+ model_name_b,
693
+ ]
694
  )
695
 
696
  # Add new input change handlers
697
  def handle_input_change():
698
+ """Reset UI state when inputs are changed"""
699
+ return [
700
+ gr.update(interactive=False), # vote_a
701
+ gr.update(interactive=False), # vote_b
702
+ gr.update(interactive=False), # vote_tie
703
+ gr.update(value="Run judges", variant="primary"), # send_btn
704
+ gr.update(value="🎲", variant="secondary"), # random_btn
705
+ ]
706
 
707
  # Update the change handlers for inputs
708
  human_input.change(
709
  fn=handle_input_change,
710
  inputs=[],
711
+ outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
712
  )
713
 
714
  ai_response.change(
715
  fn=handle_input_change,
716
  inputs=[],
717
+ outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
718
+ )
719
+
720
+ generate_btn.click(
721
+ fn=lambda msg: (
722
+ generate_ai_response(msg)[0], # Only take the response text
723
+ gr.update(
724
+ value="Generate AI Response", # Keep the label
725
+ interactive=False # Disable the button
726
+ )
727
+ ),
728
+ inputs=[human_input],
729
+ outputs=[ai_response, generate_btn]
730
+ )
731
+
732
+ human_input.change(
733
+ fn=lambda x: gr.update(interactive=bool(x.strip())),
734
+ inputs=[human_input],
735
+ outputs=[generate_btn]
736
  )
737
 
738
  # Update the demo.load to include the random example population