lewtun HF staff committed on
Commit
6c14077
·
1 Parent(s): 32dc08d

Enable selection from all datasets

Browse files
Files changed (2) hide show
  1. app.py +134 -42
  2. utils.py +21 -9
app.py CHANGED
@@ -2,9 +2,11 @@ import os
2
  from pathlib import Path
3
 
4
  import streamlit as st
 
5
  from dotenv import load_dotenv
 
6
 
7
- from utils import get_compatible_models, get_metadata, http_post
8
 
9
  if Path(".env").is_file():
10
  load_dotenv(".env")
@@ -12,6 +14,7 @@ if Path(".env").is_file():
12
  HF_TOKEN = os.getenv("HF_TOKEN")
13
  AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
14
  AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
 
15
 
16
 
17
  TASK_TO_ID = {
@@ -25,8 +28,19 @@ TASK_TO_ID = {
25
  "single_column_regression": 10,
26
  }
27
 
 
 
 
 
 
 
 
 
 
 
 
28
  # TODO: remove this hardcorded logic and accept any dataset on the Hub
29
- DATASETS_TO_EVALUATE = ["emotion", "conll2003", "imdb", "squad", "xsum", "ncbi_disease", "go_emotions"]
30
 
31
  ###########
32
  ### APP ###
@@ -42,28 +56,59 @@ st.markdown(
42
  """
43
  )
44
 
45
- dataset_name = st.selectbox("Select a dataset", [f"lewtun/autoevaluate__{dset}" for dset in DATASETS_TO_EVALUATE])
 
 
46
 
47
  # TODO: remove this step once we select real datasets
48
  # Strip out original dataset name
49
- original_dataset_name = dataset_name.split("/")[-1].split("__")[-1]
50
 
51
  # In general this will be a list of multiple configs => need to generalise logic here
52
- metadata = get_metadata(dataset_name)
 
 
 
53
 
54
  with st.expander("Advanced configuration"):
55
-
56
- dataset_config = st.selectbox("Select a config", [metadata[0]["config"]])
57
-
58
- splits = metadata[0]["splits"]
59
- split_names = list(splits.values())
60
- eval_split = splits.get("eval_split", split_names[0])
61
-
62
- selected_split = st.selectbox("Select a split", split_names, index=split_names.index(eval_split))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  # TODO: add a function to handle the mapping task <--> column mapping
65
- col_mapping = metadata[0]["col_mapping"]
66
- col_names = list(col_mapping.keys())
67
 
68
  # TODO: figure out how to get all dataset column names (i.e. features) without download dataset itself
69
  st.markdown("**Map your data columns**")
@@ -71,6 +116,7 @@ with st.expander("Advanced configuration"):
71
 
72
  # TODO: find a better way to layout these items
73
  # TODO: propagate this information to payload
 
74
  with col1:
75
  st.markdown("`text` column")
76
  st.text("")
@@ -84,34 +130,80 @@ with st.expander("Advanced configuration"):
84
 
85
  with st.form(key="form"):
86
 
87
- compatible_models = get_compatible_models(metadata[0]["task"], original_dataset_name)
88
 
89
- selected_models = st.multiselect("Select the models you wish to evaluate", compatible_models, compatible_models[0])
 
 
 
90
 
91
  submit_button = st.form_submit_button("Make submission")
92
 
93
- if submit_button:
94
- for model in selected_models:
95
- payload = {
96
- "username": AUTOTRAIN_USERNAME,
97
- "task": TASK_TO_ID[metadata[0]["task_id"]],
98
- "model": model,
99
- "col_mapping": metadata[0]["col_mapping"],
100
- "split": selected_split,
101
- "dataset": original_dataset_name,
102
- "config": dataset_config,
103
- }
104
- json_resp = http_post(
105
- path="/evaluate/create", payload=payload, token=HF_TOKEN, domain=AUTOTRAIN_BACKEND_API
106
- ).json()
107
- if json_resp["status"] == 1:
108
- st.success(f"✅ Successfully submitted model {model} for evaluation with job ID {json_resp['id']}")
109
- st.markdown(
110
- f"""
111
- Evaluation takes appoximately 1 hour to complete, so grab a ☕ or 🍵 while you wait:
112
-
113
- * 📊 Click [here](https://huggingface.co/spaces/huggingface/leaderboards) to view the results from your submission
114
- """
115
- )
116
- else:
117
- st.error("🙈 Oh noes, there was an error submitting your submission!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from pathlib import Path
3
 
4
  import streamlit as st
5
+ from datasets import get_dataset_config_names
6
  from dotenv import load_dotenv
7
+ from huggingface_hub import list_datasets
8
 
9
+ from utils import get_compatible_models, get_metadata, http_get, http_post
10
 
11
  if Path(".env").is_file():
12
  load_dotenv(".env")
 
14
  HF_TOKEN = os.getenv("HF_TOKEN")
15
  AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
16
  AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
17
+ DATASETS_PREVIEW_API = os.getenv("DATASETS_PREVIEW_API")
18
 
19
 
20
  TASK_TO_ID = {
 
28
  "single_column_regression": 10,
29
  }
30
 
31
+ AUTOTRAIN_TASK_TO_HUB_TASK = {
32
+ "binary_classification": "text-classification",
33
+ "multi_class_classification": "text-classification",
34
+ "multi_label_classification": "text-classification",
35
+ "entity_extraction": "token-classification",
36
+ "extractive_question_answering": "question-answering",
37
+ "translation": "translation",
38
+ "summarization": "summarization",
39
+ "single_column_regression": 10,
40
+ }
41
+
42
  # TODO: remove this hardcorded logic and accept any dataset on the Hub
43
+ # DATASETS_TO_EVALUATE = ["emotion", "conll2003", "imdb", "squad", "xsum", "ncbi_disease", "go_emotions"]
44
 
45
  ###########
46
  ### APP ###
 
56
  """
57
  )
58
 
59
+ all_datasets = [d.id for d in list_datasets()]
60
+ selected_dataset = st.selectbox("Select a dataset", all_datasets)
61
+ print(f"Dataset name: {selected_dataset}")
62
 
63
  # TODO: remove this step once we select real datasets
64
  # Strip out original dataset name
65
+ # original_dataset_name = dataset_name.split("/")[-1].split("__")[-1]
66
 
67
  # In general this will be a list of multiple configs => need to generalise logic here
68
+ metadata = get_metadata(selected_dataset)
69
+ print(metadata)
70
+ if metadata is None:
71
+ st.warning("No evaluation metadata found. Please configure the evaluation job below.")
72
 
73
  with st.expander("Advanced configuration"):
74
+ ## Select task
75
+ selected_task = st.selectbox("Select a task", list(AUTOTRAIN_TASK_TO_HUB_TASK.values()))
76
+ ### Select config
77
+ configs = get_dataset_config_names(selected_dataset)
78
+ selected_config = st.selectbox("Select a config", configs)
79
+
80
+ ## Select splits
81
+ splits_resp = http_get(path="/splits", domain=DATASETS_PREVIEW_API, params={"dataset": selected_dataset})
82
+ if splits_resp.status_code == 200:
83
+ split_names = []
84
+ all_splits = splits_resp.json()
85
+ print(all_splits)
86
+ for split in all_splits["splits"]:
87
+ print(selected_config)
88
+ if split["config"] == selected_config:
89
+ split_names.append(split["split"])
90
+
91
+ selected_split = st.selectbox("Select a split", split_names) # , index=split_names.index(eval_split))
92
+
93
+ ## Show columns
94
+ rows_resp = http_get(
95
+ path="/rows",
96
+ domain="https://datasets-preview.huggingface.tech",
97
+ params={"dataset": selected_dataset, "config": selected_config, "split": selected_split},
98
+ ).json()
99
+ columns = rows_resp["columns"]
100
+ col_names = []
101
+ for c in columns:
102
+ col_names.append(c["column"]["name"])
103
+ # splits = metadata[0]["splits"]
104
+ # split_names = list(splits.values())
105
+ # eval_split = splits.get("eval_split", split_names[0])
106
+
107
+ # selected_split = st.selectbox("Select a split", split_names, index=split_names.index(eval_split))
108
 
109
  # TODO: add a function to handle the mapping task <--> column mapping
110
+ # col_mapping = metadata[0]["col_mapping"]
111
+ # col_names = list(col_mapping.keys())
112
 
113
  # TODO: figure out how to get all dataset column names (i.e. features) without download dataset itself
114
  st.markdown("**Map your data columns**")
 
116
 
117
  # TODO: find a better way to layout these items
118
  # TODO: propagate this information to payload
119
+ # TODO: make it task specific
120
  with col1:
121
  st.markdown("`text` column")
122
  st.text("")
 
130
 
131
  with st.form(key="form"):
132
 
133
+ compatible_models = get_compatible_models(selected_task, selected_dataset)
134
 
135
+ selected_models = st.multiselect(
136
+ "Select the models you wish to evaluate", compatible_models
137
+ ) # , compatible_models[0])
138
+ print(selected_models)
139
 
140
  submit_button = st.form_submit_button("Make submission")
141
 
142
+ # if submit_button:
143
+ # for model in selected_models:
144
+ # payload = {
145
+ # "username": AUTOTRAIN_USERNAME,
146
+ # "task": TASK_TO_ID[metadata[0]["task_id"]],
147
+ # "model": model,
148
+ # "col_mapping": metadata[0]["col_mapping"],
149
+ # "split": selected_split,
150
+ # "dataset": original_dataset_name,
151
+ # "config": selected_config,
152
+ # }
153
+ # json_resp = http_post(
154
+ # path="/evaluate/create", payload=payload, token=HF_TOKEN, domain=AUTOTRAIN_BACKEND_API
155
+ # ).json()
156
+ # if json_resp["status"] == 1:
157
+ # st.success(f"✅ Successfully submitted model {model} for evaluation with job ID {json_resp['id']}")
158
+ # st.markdown(
159
+ # f"""
160
+ # Evaluation takes appoximately 1 hour to complete, so grab a ☕ or 🍵 while you wait:
161
+
162
+ # * 📊 Click [here](https://huggingface.co/spaces/huggingface/leaderboards) to view the results from your submission
163
+ # """
164
+ # )
165
+ # else:
166
+ # st.error("🙈 Oh noes, there was an error submitting your submission!")
167
+
168
+ # st.write("Creating project!")
169
+ # payload = {
170
+ # "username": AUTOTRAIN_USERNAME,
171
+ # "proj_name": "my-eval-project-1",
172
+ # "task": TASK_TO_ID[metadata[0]["task_id"]],
173
+ # "config": {
174
+ # "language": "en",
175
+ # "max_models": 5,
176
+ # "instance": {
177
+ # "provider": "aws",
178
+ # "instance_type": "ml.g4dn.4xlarge",
179
+ # "max_runtime_seconds": 172800,
180
+ # "num_instances": 1,
181
+ # "disk_size_gb": 150,
182
+ # },
183
+ # },
184
+ # }
185
+ # json_resp = http_post(
186
+ # path="/projects/create", payload=payload, token=HF_TOKEN, domain=AUTOTRAIN_BACKEND_API
187
+ # ).json()
188
+ # # print(json_resp)
189
+
190
+ # # st.write("Uploading data")
191
+ # payload = {
192
+ # "split": 4,
193
+ # "col_mapping": metadata[0]["col_mapping"],
194
+ # "load_config": {"max_size_bytes": 0, "shuffle": False},
195
+ # }
196
+ # json_resp = http_post(
197
+ # path="/projects/522/data/emotion",
198
+ # payload=payload,
199
+ # token=HF_TOKEN,
200
+ # domain=AUTOTRAIN_BACKEND_API,
201
+ # params={"type": "dataset", "config_name": "default", "split_name": "train"},
202
+ # ).json()
203
+ # print(json_resp)
204
+
205
+ # st.write("Training")
206
+ # json_resp = http_get(
207
+ # path="/projects/522/data/start_process", token=HF_TOKEN, domain=AUTOTRAIN_BACKEND_API
208
+ # ).json()
209
+ # print(json_resp)
utils.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import requests
2
  from huggingface_hub import DatasetFilter, HfApi, ModelFilter
3
 
@@ -8,16 +10,23 @@ def get_auth_headers(token: str, prefix: str = "autonlp"):
8
  return {"Authorization": f"{prefix} {token}"}
9
 
10
 
11
- def http_post(
12
- path: str,
13
- token: str,
14
- payload=None,
15
- domain: str = None,
16
- ) -> requests.Response:
17
  """HTTP POST request to the AutoNLP API, raises UnreachableAPIError if the API cannot be reached"""
18
  try:
19
  response = requests.post(
20
- url=domain + path, json=payload, headers=get_auth_headers(token=token), allow_redirects=True
 
 
 
 
 
 
 
 
 
 
 
 
21
  )
22
  except requests.exceptions.ConnectionError:
23
  print("❌ Failed to reach AutoNLP API, check your internet connection")
@@ -25,10 +34,13 @@ def http_post(
25
  return response
26
 
27
 
28
- def get_metadata(dataset_name):
29
  filt = DatasetFilter(dataset_name=dataset_name)
30
  data = api.list_datasets(filter=filt, full=True)
31
- return data[0].cardData["train-eval-index"]
 
 
 
32
 
33
 
34
  def get_compatible_models(task, dataset_name):
 
1
+ from typing import Dict, Union
2
+
3
  import requests
4
  from huggingface_hub import DatasetFilter, HfApi, ModelFilter
5
 
 
10
  return {"Authorization": f"{prefix} {token}"}
11
 
12
 
13
+ def http_post(path: str, token: str, payload=None, domain: str = None, params=None) -> requests.Response:
 
 
 
 
 
14
  """HTTP POST request to the AutoNLP API, raises UnreachableAPIError if the API cannot be reached"""
15
  try:
16
  response = requests.post(
17
+ url=domain + path, json=payload, headers=get_auth_headers(token=token), allow_redirects=True, params=params
18
+ )
19
+ except requests.exceptions.ConnectionError:
20
+ print("❌ Failed to reach AutoNLP API, check your internet connection")
21
+ response.raise_for_status()
22
+ return response
23
+
24
+
25
+ def http_get(path: str, domain: str, token: str = None, params: dict = None) -> requests.Response:
26
+ """HTTP POST request to the AutoNLP API, raises UnreachableAPIError if the API cannot be reached"""
27
+ try:
28
+ response = requests.get(
29
+ url=domain + path, headers=get_auth_headers(token=token), allow_redirects=True, params=params
30
  )
31
  except requests.exceptions.ConnectionError:
32
  print("❌ Failed to reach AutoNLP API, check your internet connection")
 
34
  return response
35
 
36
 
37
+ def get_metadata(dataset_name: str) -> Union[Dict, None]:
38
  filt = DatasetFilter(dataset_name=dataset_name)
39
  data = api.list_datasets(filter=filt, full=True)
40
+ if data[0].cardData is not None and "train-eval-index" in data[0].cardData.keys():
41
+ return data[0].cardData["train-eval-index"]
42
+ else:
43
+ return None
44
 
45
 
46
  def get_compatible_models(task, dataset_name):