natolambert committed on
Commit
91c5b22
·
1 Parent(s): 93916f2
Files changed (2) hide show
  1. src/md.py +3 -1
  2. src/utils.py +20 -7
src/md.py CHANGED
@@ -2,6 +2,8 @@ ABOUT_TEXT = """
2
  We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
3
  A win is when the score for the chosen response is higher than the score for the rejected response.
4
 
 
 
5
  ## Overview
6
 
7
  We average over 4 core sections (per prompt weighting):
@@ -93,5 +95,5 @@ For more details, see the [dataset](https://huggingface.co/datasets/allenai/rewa
93
  TOP_TEXT = """
94
  # RewardBench: Evaluating Reward Models
95
  ### Evaluating the capabilities, safety, and pitfalls of reward models
96
- [Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {}
97
  """
 
2
  We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
3
  A win is when the score for the chosen response is higher than the score for the rejected response.
4
 
5
+ Note: Models with (*) after the model name are independently submitted model scores which have not been verified by the RewardBench team.
6
+
7
  ## Overview
8
 
9
  We average over 4 core sections (per prompt weighting):
 
95
  TOP_TEXT = """
96
  # RewardBench: Evaluating Reward Models
97
  ### Evaluating the capabilities, safety, and pitfalls of reward models
98
+ [Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {} | * Unverified models
99
  """
src/utils.py CHANGED
@@ -5,24 +5,37 @@ import numpy as np
5
  import os
6
  import re
7
 
 
 
 
 
 
 
 
 
8
  # From Open LLM Leaderboard
9
  def model_hyperlink(link, model_name):
  # Build the leaderboard cell text for a model: an HTML <a> tag wrapping model_name, or plain text.
  #   link: URL used for the anchor when none of the special cases below applies.
  #   model_name: display name; names over 50 characters are shortened first.
  # Returns a string. "random" and names whose first "/"-separated part is "PoLL" come back
  # unlinked; "Cohere March 2024" and openai/Anthropic/google names link to a fixed
  # huggingface.co organization page instead of `link`.
  # NOTE(review): indentation is flattened in this view; every branch returns, so the final
  # return is presumably the function-level fallback for all other names — confirm in the file.
10
  # if model_name is above 50 characters, return first 47 characters and "..."
11
  if len(model_name) > 50:
12
  model_name = model_name[:47] + "..."
13
  if model_name == "random":
14
- return "random"
15
  elif model_name == "Cohere March 2024":
16
- return f'<a target="_blank" href="https://huggingface.co/Cohere" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
17
  elif "openai" == model_name.split("/")[0]:
18
- return f'<a target="_blank" href="https://huggingface.co/openai" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
19
  elif "Anthropic" == model_name.split("/")[0]:
20
- return f'<a target="_blank" href="https://huggingface.co/Anthropic" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
21
  elif "google" == model_name.split("/")[0]:
22
- return f'<a target="_blank" href="https://huggingface.co/google" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
23
  elif "PoLL" == model_name.split("/")[0]:
24
- return model_name
25
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
 
 
 
26
 
27
  def undo_hyperlink(html_string):
28
  # Regex pattern to match content inside > and <
 
5
  import os
6
  import re
7
 
8
# Model names whose scores were submitted independently; they are rendered
# with a trailing " *" marker by model_hyperlink().
UNVERIFIED_MODELS = [
    "nvidia/Nemotron-4-340B-Reward",
    "nvidia/Llama3-70B-SteerLM-RM",
    "Cohere May 2024",
    "google/gemini-1.5-pro-0514",
    "Cohere March 2024",
]


# From Open LLM Leaderboard
def model_hyperlink(link, model_name):
    """Return the display text for a model: an HTML anchor or plain text.

    Args:
        link: URL used for the anchor when no special case applies.
        model_name: Name to display. Names longer than 50 characters are
            shortened to their first 47 characters followed by "...".

    Returns:
        A string. "random" and names whose first "/"-separated part is
        "PoLL" are returned as plain text. "Cohere March 2024" and names
        under "openai", "Anthropic" or "google" link to the matching
        huggingface.co organization page. Every other name links to
        ``link``. Names listed in UNVERIFIED_MODELS get " *" appended.
    """
    # Decide on the marker before truncating, so a listed name longer than
    # 50 characters is still recognised.
    is_unverified = model_name in UNVERIFIED_MODELS

    # if model_name is above 50 characters, return first 47 characters and "..."
    if len(model_name) > 50:
        model_name = model_name[:47] + "..."

    org = model_name.split("/")[0]

    if model_name == "random" or org == "PoLL":
        # Baseline and PoLL entries have no page to link to.
        output = model_name
    else:
        if model_name == "Cohere March 2024":
            href = "https://huggingface.co/Cohere"
        elif org in ("openai", "Anthropic", "google"):
            href = f"https://huggingface.co/{org}"
        else:
            # Generic fallback: only reached when no special case matched,
            # so it can no longer overwrite the branches above.
            href = link
        output = f'<a target="_blank" href="{href}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'

    if is_unverified:
        return output + " *"
    return output
39
 
40
  def undo_hyperlink(html_string):
41
  # Regex pattern to match content inside > and <