sdiazlor HF staff committed on
Commit
2b5c2e3
·
1 Parent(s): 5d91425

modify column width and typos

Browse files
src/distilabel_dataset_generator/apps/eval.py CHANGED
@@ -35,15 +35,15 @@ def get_valid_columns(df: pd.DataFrame):
35
  return valid_columns
36
 
37
 
38
- def load_dataset_from_hub(hub_repo_id: str, n_rows: int = 10):
39
  gr.Info(message="Loading dataset ...")
40
  if not hub_repo_id:
41
  raise gr.Error("Hub repo id is required")
42
  ds_dict = load_dataset(hub_repo_id)
43
  splits = list(ds_dict.keys())
44
  ds = ds_dict[splits[0]]
45
- if n_rows:
46
- ds = ds.select(range(n_rows))
47
  df = ds.to_pandas()
48
  # Get columns that contain either strings or lists of dictionaries
49
  valid_columns = get_valid_columns(df)
@@ -130,7 +130,7 @@ def apply_to_sample_dataset(
130
  prompt_template: str,
131
  structured_output: dict,
132
  ):
133
- df, _, _, _ = load_dataset_from_hub(repo_id, n_rows=10)
134
  df = _apply_to_dataset(
135
  df,
136
  eval_type,
@@ -150,7 +150,7 @@ def push_to_hub(
150
  org_name: str,
151
  repo_name: str,
152
  private: bool,
153
- n_rows: int,
154
  original_repo_id: str,
155
  eval_type: str,
156
  aspects_instruction: list[str],
@@ -162,7 +162,7 @@ def push_to_hub(
162
  prompt_template: str,
163
  structured_output: dict,
164
  ):
165
- df, _, _, _ = load_dataset_from_hub(original_repo_id, n_rows=n_rows)
166
  df = _apply_to_dataset(
167
  df,
168
  eval_type,
@@ -257,7 +257,7 @@ with gr.Blocks() as app:
257
  gr.HTML("<hr>")
258
  gr.Markdown("## 3. Generate your dataset")
259
  with gr.Row():
260
- with gr.Column(scale=1):
261
  org_name = get_org_dropdown()
262
  repo_name = gr.Textbox(
263
  label="Repo name",
@@ -265,7 +265,7 @@ with gr.Blocks() as app:
265
  value="my-distiset",
266
  interactive=True,
267
  )
268
- n_rows = gr.Number(
269
  label="Number of rows",
270
  value=10,
271
  interactive=True,
@@ -314,7 +314,7 @@ with gr.Blocks() as app:
314
  org_name,
315
  repo_name,
316
  private,
317
- n_rows,
318
  search_in,
319
  eval_type,
320
  aspects_instruction,
 
35
  return valid_columns
36
 
37
 
38
+ def load_dataset_from_hub(hub_repo_id: str, num_rows: int = 10):
39
  gr.Info(message="Loading dataset ...")
40
  if not hub_repo_id:
41
  raise gr.Error("Hub repo id is required")
42
  ds_dict = load_dataset(hub_repo_id)
43
  splits = list(ds_dict.keys())
44
  ds = ds_dict[splits[0]]
45
+ if num_rows:
46
+ ds = ds.select(range(num_rows))
47
  df = ds.to_pandas()
48
  # Get columns that contain either strings or lists of dictionaries
49
  valid_columns = get_valid_columns(df)
 
130
  prompt_template: str,
131
  structured_output: dict,
132
  ):
133
+ df, _, _, _ = load_dataset_from_hub(repo_id, num_rows=10)
134
  df = _apply_to_dataset(
135
  df,
136
  eval_type,
 
150
  org_name: str,
151
  repo_name: str,
152
  private: bool,
153
+ num_rows: int,
154
  original_repo_id: str,
155
  eval_type: str,
156
  aspects_instruction: list[str],
 
162
  prompt_template: str,
163
  structured_output: dict,
164
  ):
165
+ df, _, _, _ = load_dataset_from_hub(original_repo_id, num_rows=num_rows)
166
  df = _apply_to_dataset(
167
  df,
168
  eval_type,
 
257
  gr.HTML("<hr>")
258
  gr.Markdown("## 3. Generate your dataset")
259
  with gr.Row():
260
+ with gr.Column(scale=2):
261
  org_name = get_org_dropdown()
262
  repo_name = gr.Textbox(
263
  label="Repo name",
 
265
  value="my-distiset",
266
  interactive=True,
267
  )
268
+ num_rows = gr.Number(
269
  label="Number of rows",
270
  value=10,
271
  interactive=True,
 
314
  org_name,
315
  repo_name,
316
  private,
317
+ num_rows,
318
  search_in,
319
  eval_type,
320
  aspects_instruction,
src/distilabel_dataset_generator/apps/sft.py CHANGED
@@ -213,7 +213,7 @@ def push_dataset_to_argilla(
213
  repo_name: str,
214
  system_prompt: str,
215
  num_turns: int = 1,
216
- n_rows: int = 10,
217
  private: bool = False,
218
  oauth_token: Union[gr.OAuthToken, None] = None,
219
  progress=gr.Progress(),
@@ -221,7 +221,7 @@ def push_dataset_to_argilla(
221
  dataframe = generate_dataset(
222
  system_prompt=system_prompt,
223
  num_turns=num_turns,
224
- num_rows=n_rows,
225
  )
226
  push_dataset_to_hub(dataframe, org_name, repo_name, oauth_token, private)
227
  try:
@@ -352,20 +352,23 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
352
  with gr.Column() as main_ui:
353
  gr.Markdown(value="## 1. Describe the dataset you want")
354
  with gr.Row():
355
- with gr.Column(scale=1):
356
  dataset_description = gr.Textbox(
357
  label="Dataset description",
358
  placeholder="Give a precise description of your desired dataset.",
359
  )
 
 
 
 
 
360
  examples = gr.Examples(
361
  examples=DEFAULT_DATASET_DESCRIPTIONS,
362
  inputs=[dataset_description],
363
  cache_examples=False,
364
  label="Example descriptions",
365
  )
366
-
367
- load_btn = gr.Button("Load dataset")
368
- with gr.Column(scale=3):
369
  pass
370
 
371
  gr.HTML(value="<hr>")
@@ -392,7 +395,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
392
  gr.HTML(value="<hr>")
393
  gr.Markdown(value="## 3. Generate your dataset")
394
  with gr.Row():
395
- with gr.Column(scale=1):
396
  org_name = get_org_dropdown()
397
  repo_name = gr.Textbox(
398
  label="Repo name",
@@ -400,7 +403,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
400
  value=f"my-distiset-{str(uuid.uuid4())[:8]}",
401
  interactive=True,
402
  )
403
- n_rows = gr.Number(
404
  label="Number of rows",
405
  value=10,
406
  interactive=True,
@@ -417,7 +420,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
417
  success_message = gr.Markdown()
418
 
419
  pipeline_code = get_pipeline_code_ui(
420
- generate_pipeline_code(system_prompt.value, num_turns.value, n_rows.value)
421
  )
422
 
423
  gr.on(
@@ -454,7 +457,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
454
  repo_name,
455
  system_prompt,
456
  num_turns,
457
- n_rows,
458
  private,
459
  ],
460
  outputs=[success_message],
 
213
  repo_name: str,
214
  system_prompt: str,
215
  num_turns: int = 1,
216
+ num_rows: int = 10,
217
  private: bool = False,
218
  oauth_token: Union[gr.OAuthToken, None] = None,
219
  progress=gr.Progress(),
 
221
  dataframe = generate_dataset(
222
  system_prompt=system_prompt,
223
  num_turns=num_turns,
224
+ num_rows=num_rows,
225
  )
226
  push_dataset_to_hub(dataframe, org_name, repo_name, oauth_token, private)
227
  try:
 
352
  with gr.Column() as main_ui:
353
  gr.Markdown(value="## 1. Describe the dataset you want")
354
  with gr.Row():
355
+ with gr.Column(scale=2):
356
  dataset_description = gr.Textbox(
357
  label="Dataset description",
358
  placeholder="Give a precise description of your desired dataset.",
359
  )
360
+ load_btn = gr.Button(
361
+ "Load dataset",
362
+ variant="primary",
363
+ )
364
+ with gr.Column(scale=2):
365
  examples = gr.Examples(
366
  examples=DEFAULT_DATASET_DESCRIPTIONS,
367
  inputs=[dataset_description],
368
  cache_examples=False,
369
  label="Example descriptions",
370
  )
371
+ with gr.Column(scale=1):
 
 
372
  pass
373
 
374
  gr.HTML(value="<hr>")
 
395
  gr.HTML(value="<hr>")
396
  gr.Markdown(value="## 3. Generate your dataset")
397
  with gr.Row():
398
+ with gr.Column(scale=2):
399
  org_name = get_org_dropdown()
400
  repo_name = gr.Textbox(
401
  label="Repo name",
 
403
  value=f"my-distiset-{str(uuid.uuid4())[:8]}",
404
  interactive=True,
405
  )
406
+ num_rows = gr.Number(
407
  label="Number of rows",
408
  value=10,
409
  interactive=True,
 
420
  success_message = gr.Markdown()
421
 
422
  pipeline_code = get_pipeline_code_ui(
423
+ generate_pipeline_code(system_prompt.value, num_turns.value, num_rows.value)
424
  )
425
 
426
  gr.on(
 
457
  repo_name,
458
  system_prompt,
459
  num_turns,
460
+ num_rows,
461
  private,
462
  ],
463
  outputs=[success_message],
src/distilabel_dataset_generator/apps/textcat.py CHANGED
@@ -219,7 +219,7 @@ def push_dataset_to_argilla(
219
  difficulty: str,
220
  clarity: str,
221
  num_labels: int = 1,
222
- n_rows: int = 10,
223
  labels: List[str] = None,
224
  private: bool = False,
225
  oauth_token: Union[gr.OAuthToken, None] = None,
@@ -231,7 +231,7 @@ def push_dataset_to_argilla(
231
  clarity=clarity,
232
  num_labels=num_labels,
233
  labels=labels,
234
- num_rows=n_rows,
235
  )
236
  push_dataset_to_hub(
237
  dataframe, org_name, repo_name, num_labels, labels, oauth_token, private
@@ -361,19 +361,23 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
361
  with gr.Column() as main_ui:
362
  gr.Markdown("## 1. Describe the dataset you want")
363
  with gr.Row():
364
- with gr.Column(scale=1):
365
  dataset_description = gr.Textbox(
366
  label="Dataset description",
367
  placeholder="Give a precise description of your desired dataset.",
368
  )
 
 
 
 
 
369
  examples = gr.Examples(
370
  examples=DEFAULT_DATASET_DESCRIPTIONS,
371
  inputs=[dataset_description],
372
  cache_examples=False,
373
  label="Example descriptions",
374
  )
375
- load_btn = gr.Button("Load dataset")
376
- with gr.Column(scale=3):
377
  pass
378
 
379
  gr.HTML("<hr>")
@@ -435,7 +439,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
435
  gr.HTML("<hr>")
436
  gr.Markdown("## 3. Generate your dataset")
437
  with gr.Row():
438
- with gr.Column(scale=1):
439
  org_name = get_org_dropdown()
440
  repo_name = gr.Textbox(
441
  label="Repo name",
@@ -443,7 +447,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
443
  value=f"my-distiset-{str(uuid.uuid4())[:8]}",
444
  interactive=True,
445
  )
446
- n_rows = gr.Number(
447
  label="Number of rows",
448
  value=10,
449
  interactive=True,
@@ -466,7 +470,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
466
  clarity=clarity.value,
467
  labels=labels.value,
468
  num_labels=num_labels.value,
469
- num_rows=n_rows.value,
470
  )
471
  )
472
 
@@ -514,7 +518,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
514
  difficulty,
515
  clarity,
516
  num_labels,
517
- n_rows,
518
  labels,
519
  private,
520
  ],
 
219
  difficulty: str,
220
  clarity: str,
221
  num_labels: int = 1,
222
+ num_rows: int = 10,
223
  labels: List[str] = None,
224
  private: bool = False,
225
  oauth_token: Union[gr.OAuthToken, None] = None,
 
231
  clarity=clarity,
232
  num_labels=num_labels,
233
  labels=labels,
234
+ num_rows=num_rows,
235
  )
236
  push_dataset_to_hub(
237
  dataframe, org_name, repo_name, num_labels, labels, oauth_token, private
 
361
  with gr.Column() as main_ui:
362
  gr.Markdown("## 1. Describe the dataset you want")
363
  with gr.Row():
364
+ with gr.Column(scale=2):
365
  dataset_description = gr.Textbox(
366
  label="Dataset description",
367
  placeholder="Give a precise description of your desired dataset.",
368
  )
369
+ load_btn = gr.Button(
370
+ "Load dataset",
371
+ variant="primary",
372
+ )
373
+ with gr.Column(scale=2):
374
  examples = gr.Examples(
375
  examples=DEFAULT_DATASET_DESCRIPTIONS,
376
  inputs=[dataset_description],
377
  cache_examples=False,
378
  label="Example descriptions",
379
  )
380
+ with gr.Column(scale=1):
 
381
  pass
382
 
383
  gr.HTML("<hr>")
 
439
  gr.HTML("<hr>")
440
  gr.Markdown("## 3. Generate your dataset")
441
  with gr.Row():
442
+ with gr.Column(scale=2):
443
  org_name = get_org_dropdown()
444
  repo_name = gr.Textbox(
445
  label="Repo name",
 
447
  value=f"my-distiset-{str(uuid.uuid4())[:8]}",
448
  interactive=True,
449
  )
450
+ num_rows = gr.Number(
451
  label="Number of rows",
452
  value=10,
453
  interactive=True,
 
470
  clarity=clarity.value,
471
  labels=labels.value,
472
  num_labels=num_labels.value,
473
+ num_rows=num_rows.value,
474
  )
475
  )
476
 
 
518
  difficulty,
519
  clarity,
520
  num_labels,
521
+ num_rows,
522
  labels,
523
  private,
524
  ],
src/distilabel_dataset_generator/utils.py CHANGED
@@ -50,22 +50,22 @@ def list_orgs(oauth_token: OAuthToken = None):
50
  return []
51
  data = whoami(oauth_token.token)
52
  if data["auth"]["type"] == "oauth":
53
- organisations = [data["name"]] + [org["name"] for org in data["orgs"]]
54
  elif data["auth"]["type"] == "access_token":
55
- organisations = [org["name"] for org in data["orgs"]]
56
  else:
57
- organisations = [
58
  entry["entity"]["name"]
59
  for entry in data["auth"]["accessToken"]["fineGrained"]["scoped"]
60
  if "repo.write" in entry["permissions"]
61
  ]
62
- organisations = [org for org in organisations if org != data["name"]]
63
- organisations = [data["name"]] + organisations
64
  except Exception as e:
65
  raise gr.Error(
66
  f"Failed to get organizations: {e}. See if you are logged and connected: https://huggingface.co/settings/connected-applications."
67
  )
68
- return organisations
69
 
70
 
71
  def get_org_dropdown(oauth_token: OAuthToken = None):
 
50
  return []
51
  data = whoami(oauth_token.token)
52
  if data["auth"]["type"] == "oauth":
53
+ organizations = [data["name"]] + [org["name"] for org in data["orgs"]]
54
  elif data["auth"]["type"] == "access_token":
55
+ organizations = [org["name"] for org in data["orgs"]]
56
  else:
57
+ organizations = [
58
  entry["entity"]["name"]
59
  for entry in data["auth"]["accessToken"]["fineGrained"]["scoped"]
60
  if "repo.write" in entry["permissions"]
61
  ]
62
+ organizations = [org for org in organizations if org != data["name"]]
63
+ organizations = [data["name"]] + organizations
64
  except Exception as e:
65
  raise gr.Error(
66
  f"Failed to get organizations: {e}. See if you are logged and connected: https://huggingface.co/settings/connected-applications."
67
  )
68
+ return organizations
69
 
70
 
71
  def get_org_dropdown(oauth_token: OAuthToken = None):