edbeeching committed on
Commit
b2c063a
·
1 Parent(s): 59c748f

adds revision option

Browse files
Files changed (3) hide show
  1. .gitignore +2 -1
  2. app.py +23 -52
  3. utils.py +24 -16
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  evals/
2
- venv/
 
 
1
  evals/
2
+ venv/
3
+ __pycache__/
app.py CHANGED
@@ -8,7 +8,7 @@ import json
8
  from apscheduler.schedulers.background import BackgroundScheduler
9
  import pandas as pd
10
  import datetime
11
- from utils import get_eval_results_dicts, make_clickable_model
12
 
13
  # clone / pull the lmeh eval data
14
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
@@ -45,53 +45,16 @@ def load_results(model, benchmark, metric):
45
  mean_acc = np.mean(accs)
46
  return mean_acc, data["config"]["model_args"]
47
 
48
- def get_n_params(base_model):
49
-
50
- # config = AutoConfig.from_pretrained(model_name)
51
-
52
- # # Retrieve the number of parameters from the configuration
53
- # try:
54
- # num_params = config.n_parameters
55
- # except AttributeError:
56
- # print(f"Error: The number of parameters is not available in the config for the model '{model_name}'.")
57
- # return None
58
-
59
- # return num_params
60
-
61
- now = datetime.datetime.now()
62
- time_string = now.strftime("%Y-%m-%d %H:%M:%S")
63
- return time_string
64
 
65
- COLS = ["eval_name", "# params", "total ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthQA (0-shot) ⬆️", "base_model"]
66
- TYPES = ["str","str", "number", "number", "number", "number", "number","markdown", ]
67
 
68
- EVAL_COLS = ["model","# params", "private", "8bit_eval", "is_delta_weight", "status"]
69
- EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
70
  def get_leaderboard():
71
  if repo:
72
  print("pulling changes")
73
  repo.git_pull()
74
- # entries = [entry for entry in os.listdir("evals") if not (entry.startswith('.') or entry=="eval_requests" or entry=="evals")]
75
- # model_directories = [entry for entry in entries if os.path.isdir(os.path.join("evals", entry))]
76
- # all_data = []
77
- # for model in model_directories:
78
- # model_data = {"base_model": None, "eval_name": model}
79
-
80
- # for benchmark, metric in zip(BENCHMARKS, METRICS):
81
- # value, base_model = load_results(model, benchmark, metric)
82
- # model_data[BENCH_TO_NAME[benchmark]] = round(value,3)
83
- # if base_model is not None: # in case the last benchmark failed
84
- # model_data["base_model"] = base_model
85
-
86
- # model_data["total ⬆️"] = round(sum(model_data[benchmark] for benchmark in BENCH_TO_NAME.values()),3)
87
-
88
- # if model_data["base_model"] is not None:
89
- # model_data["base_model"] = make_clickable_model(model_data["base_model"])
90
-
91
- # model_data["# params"] = get_n_params(model_data["base_model"])
92
-
93
- # if model_data["base_model"] is not None:
94
- # all_data.append(model_data)
95
 
96
  all_data = get_eval_results_dicts()
97
  dataframe = pd.DataFrame.from_records(all_data)
@@ -116,6 +79,7 @@ def get_eval_table():
116
 
117
  data["# params"] = get_n_params(data["model"])
118
  data["model"] = make_clickable_model(data["model"])
 
119
 
120
 
121
  all_evals.append(data)
@@ -127,7 +91,7 @@ def get_eval_table():
127
  with open(file_path) as fp:
128
  data = json.load(fp)
129
 
130
- data["# params"] = get_n_params(data["model"])
131
  data["model"] = make_clickable_model(data["model"])
132
  all_evals.append(data)
133
 
@@ -139,9 +103,9 @@ def get_eval_table():
139
  leaderboard = get_leaderboard()
140
  eval_queue = get_eval_table()
141
 
142
- def is_model_on_hub(model_name) -> bool:
143
  try:
144
- config = AutoConfig.from_pretrained(model_name)
145
  return True
146
 
147
  except Exception as e:
@@ -151,15 +115,19 @@ def is_model_on_hub(model_name) -> bool:
151
 
152
 
153
 
154
- def add_new_eval(model:str, private:bool, is_8_bit_eval: bool, is_delta_weight:bool):
155
  # check the model actually exists before adding the eval
156
- if not is_model_on_hub(model):
 
 
 
157
  print(model, "not found on hub")
158
  return
159
  print("adding new eval")
160
 
161
  eval_entry = {
162
  "model" : model,
 
163
  "private" : private,
164
  "8bit_eval" : is_8_bit_eval,
165
  "is_delta_weight" : is_delta_weight,
@@ -227,14 +195,17 @@ with block:
227
  # with gr.Row():
228
  # gr.Markdown(f"""# Submit a new model for evaluation""")
229
  with gr.Row():
230
- model_name_textbox = gr.Textbox(label="model_name")
231
- is_8bit_toggle = gr.Checkbox(False, label="8 bit Eval?")
232
- private = gr.Checkbox(False, label="Private?")
233
- is_delta_weight = gr.Checkbox(False, label="Delta Weights?")
 
 
 
234
 
235
  with gr.Row():
236
  submit_button = gr.Button("Submit Eval")
237
- submit_button.click(add_new_eval, [model_name_textbox, is_8bit_toggle, private, is_delta_weight])
238
 
239
 
240
 
 
8
  from apscheduler.schedulers.background import BackgroundScheduler
9
  import pandas as pd
10
  import datetime
11
+ from utils import get_eval_results_dicts, make_clickable_model, get_n_params
12
 
13
  # clone / pull the lmeh eval data
14
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
 
45
  mean_acc = np.mean(accs)
46
  return mean_acc, data["config"]["model_args"]
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
+ COLS = ["eval_name", "total ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthQA (0-shot) ⬆️", "base_model"]
50
+ TYPES = ["str", "number", "number", "number", "number", "number","markdown", ]
51
 
52
+ EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
53
+ EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
54
  def get_leaderboard():
55
  if repo:
56
  print("pulling changes")
57
  repo.git_pull()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  all_data = get_eval_results_dicts()
60
  dataframe = pd.DataFrame.from_records(all_data)
 
79
 
80
  data["# params"] = get_n_params(data["model"])
81
  data["model"] = make_clickable_model(data["model"])
82
+ data["revision"] = data.get("revision", "main")
83
 
84
 
85
  all_evals.append(data)
 
91
  with open(file_path) as fp:
92
  data = json.load(fp)
93
 
94
+ #data["# params"] = get_n_params(data["model"])
95
  data["model"] = make_clickable_model(data["model"])
96
  all_evals.append(data)
97
 
 
103
  leaderboard = get_leaderboard()
104
  eval_queue = get_eval_table()
105
 
106
+ def is_model_on_hub(model_name, revision) -> bool:
107
  try:
108
+ config = AutoConfig.from_pretrained(model_name, revision=revision)
109
  return True
110
 
111
  except Exception as e:
 
115
 
116
 
117
 
118
+ def add_new_eval(model:str, revision:str, private:bool, is_8_bit_eval: bool, is_delta_weight:bool):
119
  # check the model actually exists before adding the eval
120
+ if revision == "":
121
+ revision = "main"
122
+ print("revision", revision)
123
+ if not is_model_on_hub(model, revision):
124
  print(model, "not found on hub")
125
  return
126
  print("adding new eval")
127
 
128
  eval_entry = {
129
  "model" : model,
130
+ "revision" : revision,
131
  "private" : private,
132
  "8bit_eval" : is_8_bit_eval,
133
  "is_delta_weight" : is_delta_weight,
 
195
  # with gr.Row():
196
  # gr.Markdown(f"""# Submit a new model for evaluation""")
197
  with gr.Row():
198
+ with gr.Column():
199
+ model_name_textbox = gr.Textbox(label="Model name")
200
+ revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
201
+ with gr.Column():
202
+ is_8bit_toggle = gr.Checkbox(False, label="8 bit eval")
203
+ private = gr.Checkbox(False, label="Private")
204
+ is_delta_weight = gr.Checkbox(False, label="Delta weights")
205
 
206
  with gr.Row():
207
  submit_button = gr.Button("Submit Eval")
208
+ submit_button.click(add_new_eval, [model_name_textbox, revision_name_textbox, is_8bit_toggle, private, is_delta_weight])
209
 
210
 
211
 
utils.py CHANGED
@@ -3,7 +3,7 @@ import shutil
3
  import numpy as np
4
  import gradio as gr
5
  from huggingface_hub import Repository, HfApi
6
- from transformers import AutoConfig
7
  import json
8
  from apscheduler.schedulers.background import BackgroundScheduler
9
  import pandas as pd
@@ -15,18 +15,6 @@ from typing import List, Tuple, Dict
15
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
16
  LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
17
 
18
- # repo=None
19
- # if H4_TOKEN:
20
- # print("pulling repo")
21
- # # try:
22
- # # shutil.rmtree("./evals/")
23
- # # except:
24
- # # pass
25
-
26
- # repo = Repository(
27
- # local_dir="./evals/", clone_from=LMEH_REPO, use_auth_token=H4_TOKEN, repo_type="dataset"
28
- # )
29
- # repo.git_pull()
30
  METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
31
  BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
32
  BENCH_TO_NAME = {
@@ -42,6 +30,21 @@ def make_clickable_model(model_name):
42
  link = "https://huggingface.co/" + model_name
43
  return f'<a target="_blank" href="{link}" style="color: blue; text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  @dataclass
46
  class EvalResult:
47
  eval_name : str
@@ -50,12 +53,17 @@ class EvalResult:
50
  is_8bit : bool
51
  results : dict
52
 
53
- def to_dict(self):
 
 
 
 
 
54
  data_dict = {}
55
  data_dict["eval_name"] = self.eval_name
56
- data_dict["base_model"] = make_clickable_model(f"{self.org}/{self.model}")
57
  data_dict["total ⬆️"] = round(sum([v for k,v in self.results.items()]),3)
58
- data_dict["# params"] = "unknown (todo)"
59
 
60
  for benchmark in BENCHMARKS:
61
  if not benchmark in self.results.keys():
 
3
  import numpy as np
4
  import gradio as gr
5
  from huggingface_hub import Repository, HfApi
6
+ from transformers import AutoConfig, AutoModel
7
  import json
8
  from apscheduler.schedulers.background import BackgroundScheduler
9
  import pandas as pd
 
15
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
16
  LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
17
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
19
  BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
20
  BENCH_TO_NAME = {
 
30
  link = "https://huggingface.co/" + model_name
31
  return f'<a target="_blank" href="{link}" style="color: blue; text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
32
 
33
+ def get_n_params(base_model):
34
+ return "unknown"
35
+
36
+ # WARNING: High memory usage
37
+
38
+ # Retrieve the number of parameters from the configuration
39
+ try:
40
+ config = AutoConfig.from_pretrained(base_model, use_auth_token=True, low_cpu_mem_usage=True)
41
+ n_params = AutoModel.from_config(config).num_parameters()
42
+ except Exception as e:
43
+ print(f"Error:{e} The number of parameters is not available in the config for the model '{base_model}'.")
44
+ return "unknown"
45
+
46
+ return str(n_params)
47
+
48
  @dataclass
49
  class EvalResult:
50
  eval_name : str
 
53
  is_8bit : bool
54
  results : dict
55
 
56
+ def to_dict(self):
57
+
58
+ if self.org is not None:
59
+ base_model =f"{self.org}/{self.model}"
60
+ else:
61
+ base_model =f"{self.model}"
62
  data_dict = {}
63
  data_dict["eval_name"] = self.eval_name
64
+ data_dict["base_model"] = make_clickable_model(base_model)
65
  data_dict["total ⬆️"] = round(sum([v for k,v in self.results.items()]),3)
66
+ data_dict["# params"] = get_n_params(base_model)
67
 
68
  for benchmark in BENCHMARKS:
69
  if not benchmark in self.results.keys():