davidberenstein1957 HF staff commited on
Commit
9dcfb8f
·
1 Parent(s): 8cf952e

update logo

Browse files
README.md CHANGED
@@ -17,8 +17,12 @@ hf_oauth_scopes:
17
  - manage-repos
18
  - inference-api
19
  ---
20
-
21
- <img src="https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/main/assets/logo-sdg.svg" alt="Synthetic Data Generator Logo" style="display: block; margin-left: auto; margin-right: auto; width: 75%; margin-bottom: -200px;"/>
 
 
 
 
22
 
23
  ![Synthetic Data Generator](https://huggingface.co/spaces/argilla/synthetic-data-generator/resolve/main/assets/ui-full.png)
24
 
 
17
  - manage-repos
18
  - inference-api
19
  ---
20
+ <p align="center">
21
+ <img src="https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/main/assets/logo.svg" alt="Synthetic Data Generator Logo" style="width: 80%;"/>
22
+ </p>
23
+ <p align="center">
24
+ <h3>Build datasets using natural language</h3>
25
+ </p>
26
 
27
  ![Synthetic Data Generator](https://huggingface.co/spaces/argilla/synthetic-data-generator/resolve/main/assets/ui-full.png)
28
 
assets/logo-sdg.svg DELETED
assets/logo.svg CHANGED
src/synthetic_dataset_generator/_tabbedinterface.py CHANGED
@@ -8,7 +8,6 @@ from collections.abc import Sequence
8
 
9
  import gradio as gr
10
  from gradio.blocks import Blocks
11
- from gradio.components import HTML
12
  from gradio.layouts import Tab, Tabs
13
  from gradio.themes import ThemeClass as Theme
14
  from gradio_client.documentation import document
@@ -61,16 +60,11 @@ class TabbedInterface(Blocks):
61
  tab_names = [f"Tab {i}" for i in range(len(interface_list))]
62
  with self:
63
  if title:
64
- HTML(value=title)
65
- with gr.Row():
66
- with gr.Column(scale=2):
67
- gr.Markdown("### Build datasets using natural language")
68
- with gr.Column(scale=3):
69
- pass
70
- with gr.Column(scale=2):
71
- gr.LoginButton(
72
- value="Sign in", variant="primary", scale=2
73
- )
74
  with Tabs():
75
  for interface, tab_name in zip(interface_list, tab_names, strict=False):
76
  with Tab(label=tab_name):
 
8
 
9
  import gradio as gr
10
  from gradio.blocks import Blocks
 
11
  from gradio.layouts import Tab, Tabs
12
  from gradio.themes import ThemeClass as Theme
13
  from gradio_client.documentation import document
 
60
  tab_names = [f"Tab {i}" for i in range(len(interface_list))]
61
  with self:
62
  if title:
63
+ gr.HTML(value=title)
64
+ gr.HTML(
65
+ "<div style='text-align: center;'><h3>Build datasets using natural language</h3></div>"
66
+ )
67
+ gr.LoginButton(value="Sign in", variant="primary", scale=2)
 
 
 
 
 
68
  with Tabs():
69
  for interface, tab_name in zip(interface_list, tab_names, strict=False):
70
  with Tab(label=tab_name):
src/synthetic_dataset_generator/app.py CHANGED
@@ -10,11 +10,13 @@ css = """
10
  .main_ui_logged_out{opacity: 0.3; pointer-events: none}
11
  """
12
 
 
 
13
  demo = TabbedInterface(
14
  [textcat_app, sft_app, eval_app, faq_app],
15
  ["Text Classification", "Supervised Fine-Tuning", "Evaluation", "FAQ"],
16
  css=css,
17
- title="Synthetic Data Generator",
18
  head="Synthetic Data Generator",
19
  theme=theme,
20
  )
 
10
  .main_ui_logged_out{opacity: 0.3; pointer-events: none}
11
  """
12
 
13
+ image = """<img src="https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/main/assets/logo-sdg.svg" alt="Synthetic Data Generator Logo" style="display: block; margin-left: auto; margin-right: auto; width: 75%; margin-bottom: -400px;"/>"""
14
+
15
  demo = TabbedInterface(
16
  [textcat_app, sft_app, eval_app, faq_app],
17
  ["Text Classification", "Supervised Fine-Tuning", "Evaluation", "FAQ"],
18
  css=css,
19
+ title=image,
20
  head="Synthetic Data Generator",
21
  theme=theme,
22
  )
src/synthetic_dataset_generator/apps/eval.py CHANGED
@@ -13,8 +13,9 @@ from datasets import (
13
  load_dataset,
14
  )
15
  from distilabel.distiset import Distiset
 
16
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
17
- from huggingface_hub import HfApi
18
 
19
  from synthetic_dataset_generator.apps.base import (
20
  hide_success_message,
@@ -45,7 +46,10 @@ from synthetic_dataset_generator.utils import (
45
 
46
  def get_iframe(hub_repo_id: str) -> str:
47
  if not hub_repo_id:
48
- raise gr.Error("Hub repository ID is required.")
 
 
 
49
 
50
  url = f"https://huggingface.co/datasets/{hub_repo_id}/embed/viewer"
51
  iframe = f"""
@@ -79,12 +83,14 @@ def get_valid_columns(dataframe: pd.DataFrame):
79
  return instruction_valid_columns, response_valid_columns
80
 
81
 
82
- def load_dataset_from_hub(repo_id: str, num_rows: int = 10):
 
 
83
  if not repo_id:
84
  raise gr.Error("Hub repo id is required")
85
- subsets = get_dataset_config_names(repo_id)
86
- ds_dict = load_dataset(repo_id, subsets[0])
87
- splits = get_dataset_split_names(repo_id, subsets[0])
88
  ds = ds_dict[splits[0]]
89
  if num_rows:
90
  ds = ds.select(range(num_rows))
@@ -601,7 +607,10 @@ with gr.Blocks() as app:
601
  search_type="dataset",
602
  sumbit_on_select=True,
603
  )
604
- load_btn = gr.Button("Load dataset", variant="primary")
 
 
 
605
  with gr.Column(scale=3):
606
  search_out = gr.HTML(label="Dataset preview")
607
 
@@ -666,9 +675,9 @@ with gr.Blocks() as app:
666
  inputs=[],
667
  outputs=[eval_type],
668
  )
669
- btn_apply_to_sample_dataset = gr.Button(
670
- "Refresh dataset", variant="primary"
671
- )
672
  with gr.Column(scale=3):
673
  dataframe = gr.Dataframe(
674
  headers=["prompt", "completion", "evaluation"],
@@ -724,7 +733,11 @@ with gr.Blocks() as app:
724
  label="Distilabel Pipeline Code",
725
  )
726
 
727
- search_in.submit(fn=get_iframe, inputs=search_in, outputs=search_out)
 
 
 
 
728
 
729
  load_btn.click(
730
  fn=load_dataset_from_hub,
@@ -793,12 +806,8 @@ with gr.Blocks() as app:
793
  fn=generate_pipeline_code,
794
  inputs=[
795
  search_in,
796
- aspects_instruction_response,
797
- instruction_instruction_response,
798
- response_instruction_response,
799
  prompt_template,
800
  structured_output,
801
- num_rows,
802
  eval_type,
803
  ],
804
  outputs=[pipeline_code],
@@ -808,5 +817,16 @@ with gr.Blocks() as app:
808
  outputs=[pipeline_code_ui],
809
  )
810
 
 
 
 
 
 
 
 
 
 
 
 
811
  app.load(fn=swap_visibility, outputs=main_ui)
812
  app.load(fn=get_org_dropdown, outputs=[org_name])
 
13
  load_dataset,
14
  )
15
  from distilabel.distiset import Distiset
16
+ from gradio.oauth import OAuthToken #
17
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
18
+ from huggingface_hub import HfApi, repo_exists
19
 
20
  from synthetic_dataset_generator.apps.base import (
21
  hide_success_message,
 
46
 
47
  def get_iframe(hub_repo_id: str) -> str:
48
  if not hub_repo_id:
49
+ return ""
50
+
51
+ if not repo_exists(repo_id=hub_repo_id, repo_type="dataset"):
52
+ return ""
53
 
54
  url = f"https://huggingface.co/datasets/{hub_repo_id}/embed/viewer"
55
  iframe = f"""
 
83
  return instruction_valid_columns, response_valid_columns
84
 
85
 
86
+ def load_dataset_from_hub(
87
+ repo_id: str, num_rows: int = 10, token: Union[OAuthToken, None] = None
88
+ ):
89
  if not repo_id:
90
  raise gr.Error("Hub repo id is required")
91
+ subsets = get_dataset_config_names(repo_id, token=token)
92
+ ds_dict = load_dataset(repo_id, subsets[0], token=token)
93
+ splits = get_dataset_split_names(repo_id, subsets[0], token=token)
94
  ds = ds_dict[splits[0]]
95
  if num_rows:
96
  ds = ds.select(range(num_rows))
 
607
  search_type="dataset",
608
  sumbit_on_select=True,
609
  )
610
+ with gr.Row():
611
+ load_btn = gr.Button("Load", variant="primary")
612
+ clear_btn_part = gr.Button("Clear", variant="secondary")
613
+
614
  with gr.Column(scale=3):
615
  search_out = gr.HTML(label="Dataset preview")
616
 
 
675
  inputs=[],
676
  outputs=[eval_type],
677
  )
678
+ with gr.Row():
679
+ btn_apply_to_sample_dataset = gr.Button("Save", variant="primary")
680
+ clear_btn_full = gr.Button("Clear", variant="secondary")
681
  with gr.Column(scale=3):
682
  dataframe = gr.Dataframe(
683
  headers=["prompt", "completion", "evaluation"],
 
733
  label="Distilabel Pipeline Code",
734
  )
735
 
736
+ search_in.submit(fn=get_iframe, inputs=search_in, outputs=search_out).then(
737
+ fn=lambda df: pd.DataFrame(columns=df.columns),
738
+ inputs=[dataframe],
739
+ outputs=[dataframe],
740
+ )
741
 
742
  load_btn.click(
743
  fn=load_dataset_from_hub,
 
806
  fn=generate_pipeline_code,
807
  inputs=[
808
  search_in,
 
 
 
809
  prompt_template,
810
  structured_output,
 
811
  eval_type,
812
  ],
813
  outputs=[pipeline_code],
 
817
  outputs=[pipeline_code_ui],
818
  )
819
 
820
+ clear_btn_part.click(fn=lambda x: "", inputs=[], outputs=[search_in])
821
+ clear_btn_full.click(
822
+ fn=lambda df: ("", "", pd.DataFrame(columns=df.columns)),
823
+ inputs=[dataframe],
824
+ outputs=[
825
+ search_in,
826
+ instruction_instruction_response,
827
+ response_instruction_response,
828
+ ],
829
+ )
830
+
831
  app.load(fn=swap_visibility, outputs=main_ui)
832
  app.load(fn=get_org_dropdown, outputs=[org_name])
src/synthetic_dataset_generator/apps/sft.py CHANGED
@@ -78,6 +78,15 @@ def generate_sample_dataset(system_prompt, num_turns, progress=gr.Progress()):
78
  return dataframe
79
 
80
 
 
 
 
 
 
 
 
 
 
81
  def generate_dataset(
82
  system_prompt: str,
83
  num_turns: int = 1,
@@ -368,7 +377,7 @@ with gr.Blocks() as app:
368
  "Create",
369
  variant="primary",
370
  )
371
- clear_btn = gr.Button(
372
  "Clear",
373
  variant="secondary",
374
  )
@@ -401,17 +410,12 @@ with gr.Blocks() as app:
401
  btn_apply_to_sample_dataset = gr.Button(
402
  "Save", variant="primary"
403
  )
404
- clear_btn = gr.Button(
405
  "Clear",
406
  variant="secondary",
407
  )
408
  with gr.Column(scale=3):
409
- dataframe = gr.Dataframe(
410
- headers=["prompt", "completion"],
411
- wrap=True,
412
- height=500,
413
- interactive=False,
414
- )
415
 
416
  gr.HTML(value="<hr>")
417
  gr.Markdown(value="## 3. Generate your dataset")
@@ -527,6 +531,12 @@ with gr.Blocks() as app:
527
  inputs=[],
528
  outputs=[pipeline_code_ui],
529
  )
 
 
 
 
 
 
530
 
531
  app.load(fn=swap_visibility, outputs=main_ui)
532
  app.load(fn=get_org_dropdown, outputs=[org_name])
 
78
  return dataframe
79
 
80
 
81
+ def _get_dataframe():
82
+ return gr.Dataframe(
83
+ headers=["prompt", "completion"],
84
+ wrap=True,
85
+ height=500,
86
+ interactive=False,
87
+ )
88
+
89
+
90
  def generate_dataset(
91
  system_prompt: str,
92
  num_turns: int = 1,
 
377
  "Create",
378
  variant="primary",
379
  )
380
+ clear_btn_part = gr.Button(
381
  "Clear",
382
  variant="secondary",
383
  )
 
410
  btn_apply_to_sample_dataset = gr.Button(
411
  "Save", variant="primary"
412
  )
413
+ clear_btn_full = gr.Button(
414
  "Clear",
415
  variant="secondary",
416
  )
417
  with gr.Column(scale=3):
418
+ dataframe = _get_dataframe()
 
 
 
 
 
419
 
420
  gr.HTML(value="<hr>")
421
  gr.Markdown(value="## 3. Generate your dataset")
 
531
  inputs=[],
532
  outputs=[pipeline_code_ui],
533
  )
534
+ gr.on(
535
+ triggers=[clear_btn_part.click, clear_btn_full.click],
536
+ fn=lambda _: ("", "", 1, _get_dataframe()),
537
+ inputs=[dataframe],
538
+ outputs=[dataset_description, system_prompt, num_turns, dataframe],
539
+ )
540
 
541
  app.load(fn=swap_visibility, outputs=main_ui)
542
  app.load(fn=get_org_dropdown, outputs=[org_name])
src/synthetic_dataset_generator/apps/textcat.py CHANGED
@@ -35,6 +35,12 @@ from src.synthetic_dataset_generator.utils import (
35
  from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE
36
 
37
 
 
 
 
 
 
 
38
  def generate_system_prompt(dataset_description, temperature, progress=gr.Progress()):
39
  progress(0.0, desc="Generating text classification task")
40
  progress(0.3, desc="Initializing text generation")
@@ -345,7 +351,7 @@ with gr.Blocks() as app:
345
  "Create",
346
  variant="primary",
347
  )
348
- clear_btn = gr.Button(
349
  "Clear",
350
  variant="secondary",
351
  )
@@ -411,11 +417,9 @@ with gr.Blocks() as app:
411
  )
412
  with gr.Row():
413
  btn_apply_to_sample_dataset = gr.Button("Save", variant="primary")
414
- clear_btn = gr.Button("Clear", variant="secondary")
415
  with gr.Column(scale=3):
416
- dataframe = gr.Dataframe(
417
- headers=["labels", "text"], wrap=True, height=500, interactive=False
418
- )
419
 
420
  gr.HTML("<hr>")
421
  gr.Markdown("## 3. Generate your dataset")
@@ -553,5 +557,17 @@ with gr.Blocks() as app:
553
  outputs=[pipeline_code_ui],
554
  )
555
 
 
 
 
 
 
 
 
 
 
 
 
 
556
  app.load(fn=swap_visibility, outputs=main_ui)
557
  app.load(fn=get_org_dropdown, outputs=[org_name])
 
35
  from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE
36
 
37
 
38
+ def _get_dataframe():
39
+ return gr.Dataframe(
40
+ headers=["labels", "text"], wrap=True, height=500, interactive=False
41
+ )
42
+
43
+
44
  def generate_system_prompt(dataset_description, temperature, progress=gr.Progress()):
45
  progress(0.0, desc="Generating text classification task")
46
  progress(0.3, desc="Initializing text generation")
 
351
  "Create",
352
  variant="primary",
353
  )
354
+ clear_btn_part = gr.Button(
355
  "Clear",
356
  variant="secondary",
357
  )
 
417
  )
418
  with gr.Row():
419
  btn_apply_to_sample_dataset = gr.Button("Save", variant="primary")
420
+ clear_btn_full = gr.Button("Clear", variant="secondary")
421
  with gr.Column(scale=3):
422
+ dataframe = _get_dataframe()
 
 
423
 
424
  gr.HTML("<hr>")
425
  gr.Markdown("## 3. Generate your dataset")
 
557
  outputs=[pipeline_code_ui],
558
  )
559
 
560
+ gr.on(
561
+ triggers=[clear_btn_part.click, clear_btn_full.click],
562
+ fn=lambda _: (
563
+ "",
564
+ "",
565
+ [],
566
+ _get_dataframe(),
567
+ ),
568
+ inputs=[dataframe],
569
+ outputs=[dataset_description, system_prompt, labels, dataframe],
570
+ )
571
+
572
  app.load(fn=swap_visibility, outputs=main_ui)
573
  app.load(fn=get_org_dropdown, outputs=[org_name])