Spaces:

argilla
/

synthetic-data-generator

Running

App Files Community

sdiazlor HF staff committed on Dec 1, 2024

Commit

2b5c2e3

1 Parent(s): 5d91425

modify column width and typos

Browse files

Files changed (4) hide show

src/distilabel_dataset_generator/apps/eval.py +9 -9
src/distilabel_dataset_generator/apps/sft.py +13 -10
src/distilabel_dataset_generator/apps/textcat.py +13 -9
src/distilabel_dataset_generator/utils.py +6 -6

src/distilabel_dataset_generator/apps/eval.py CHANGED Viewed

@@ -35,15 +35,15 @@ def get_valid_columns(df: pd.DataFrame):
     return valid_columns
-def load_dataset_from_hub(hub_repo_id: str, n_rows: int = 10):
     gr.Info(message="Loading dataset ...")
     if not hub_repo_id:
         raise gr.Error("Hub repo id is required")
     ds_dict = load_dataset(hub_repo_id)
     splits = list(ds_dict.keys())
     ds = ds_dict[splits[0]]
-    if n_rows:
-        ds = ds.select(range(n_rows))
     df = ds.to_pandas()
     # Get columns that contain either strings or lists of dictionaries
     valid_columns = get_valid_columns(df)
@@ -130,7 +130,7 @@ def apply_to_sample_dataset(
     prompt_template: str,
     structured_output: dict,
 ):
-    df, _, _, _ = load_dataset_from_hub(repo_id, n_rows=10)
     df = _apply_to_dataset(
         df,
         eval_type,
@@ -150,7 +150,7 @@ def push_to_hub(
     org_name: str,
     repo_name: str,
     private: bool,
-    n_rows: int,
     original_repo_id: str,
     eval_type: str,
     aspects_instruction: list[str],
@@ -162,7 +162,7 @@ def push_to_hub(
     prompt_template: str,
     structured_output: dict,
 ):
-    df, _, _, _ = load_dataset_from_hub(original_repo_id, n_rows=n_rows)
     df = _apply_to_dataset(
         df,
         eval_type,
@@ -257,7 +257,7 @@ with gr.Blocks() as app:
     gr.HTML("<hr>")
     gr.Markdown("## 3. Generate your dataset")
     with gr.Row():
-        with gr.Column(scale=1):
             org_name = get_org_dropdown()
             repo_name = gr.Textbox(
                 label="Repo name",
@@ -265,7 +265,7 @@ with gr.Blocks() as app:
                 value="my-distiset",
                 interactive=True,
             )
-            n_rows = gr.Number(
                 label="Number of rows",
                 value=10,
                 interactive=True,
@@ -314,7 +314,7 @@ with gr.Blocks() as app:
             org_name,
             repo_name,
             private,
-            n_rows,
             search_in,
             eval_type,
             aspects_instruction,

     return valid_columns
+def load_dataset_from_hub(hub_repo_id: str, num_rows: int = 10):
     gr.Info(message="Loading dataset ...")
     if not hub_repo_id:
         raise gr.Error("Hub repo id is required")
     ds_dict = load_dataset(hub_repo_id)
     splits = list(ds_dict.keys())
     ds = ds_dict[splits[0]]
+    if num_rows:
+        ds = ds.select(range(num_rows))
     df = ds.to_pandas()
     # Get columns that contain either strings or lists of dictionaries
     valid_columns = get_valid_columns(df)
     prompt_template: str,
     structured_output: dict,
 ):
+    df, _, _, _ = load_dataset_from_hub(repo_id, num_rows=10)
     df = _apply_to_dataset(
         df,
         eval_type,
     org_name: str,
     repo_name: str,
     private: bool,
+    num_rows: int,
     original_repo_id: str,
     eval_type: str,
     aspects_instruction: list[str],
     prompt_template: str,
     structured_output: dict,
 ):
+    df, _, _, _ = load_dataset_from_hub(original_repo_id, num_rows=num_rows)
     df = _apply_to_dataset(
         df,
         eval_type,
     gr.HTML("<hr>")
     gr.Markdown("## 3. Generate your dataset")
     with gr.Row():
+        with gr.Column(scale=2):
             org_name = get_org_dropdown()
             repo_name = gr.Textbox(
                 label="Repo name",
                 value="my-distiset",
                 interactive=True,
             )
+            num_rows = gr.Number(
                 label="Number of rows",
                 value=10,
                 interactive=True,
             org_name,
             repo_name,
             private,
+            num_rows,
             search_in,
             eval_type,
             aspects_instruction,

src/distilabel_dataset_generator/apps/sft.py CHANGED Viewed

@@ -213,7 +213,7 @@ def push_dataset_to_argilla(
     repo_name: str,
     system_prompt: str,
     num_turns: int = 1,
-    n_rows: int = 10,
     private: bool = False,
     oauth_token: Union[gr.OAuthToken, None] = None,
     progress=gr.Progress(),
@@ -221,7 +221,7 @@ def push_dataset_to_argilla(
     dataframe = generate_dataset(
         system_prompt=system_prompt,
         num_turns=num_turns,
-        num_rows=n_rows,
     )
     push_dataset_to_hub(dataframe, org_name, repo_name, oauth_token, private)
     try:
@@ -352,20 +352,23 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
     with gr.Column() as main_ui:
         gr.Markdown(value="## 1. Describe the dataset you want")
         with gr.Row():
-            with gr.Column(scale=1):
                 dataset_description = gr.Textbox(
                     label="Dataset description",
                     placeholder="Give a precise description of your desired dataset.",
                 )
                 examples = gr.Examples(
                     examples=DEFAULT_DATASET_DESCRIPTIONS,
                     inputs=[dataset_description],
                     cache_examples=False,
                     label="Example descriptions",
                 )
-                load_btn = gr.Button("Load dataset")
-            with gr.Column(scale=3):
                 pass
         gr.HTML(value="<hr>")
@@ -392,7 +395,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
         gr.HTML(value="<hr>")
         gr.Markdown(value="## 3. Generate your dataset")
         with gr.Row():
-            with gr.Column(scale=1):
                 org_name = get_org_dropdown()
                 repo_name = gr.Textbox(
                     label="Repo name",
@@ -400,7 +403,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
                     value=f"my-distiset-{str(uuid.uuid4())[:8]}",
                     interactive=True,
                 )
-                n_rows = gr.Number(
                     label="Number of rows",
                     value=10,
                     interactive=True,
@@ -417,7 +420,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
                 success_message = gr.Markdown()
         pipeline_code = get_pipeline_code_ui(
-            generate_pipeline_code(system_prompt.value, num_turns.value, n_rows.value)
         )
     gr.on(
@@ -454,7 +457,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
             repo_name,
             system_prompt,
             num_turns,
-            n_rows,
             private,
         ],
         outputs=[success_message],

     repo_name: str,
     system_prompt: str,
     num_turns: int = 1,
+    num_rows: int = 10,
     private: bool = False,
     oauth_token: Union[gr.OAuthToken, None] = None,
     progress=gr.Progress(),
     dataframe = generate_dataset(
         system_prompt=system_prompt,
         num_turns=num_turns,
+        num_rows=num_rows,
     )
     push_dataset_to_hub(dataframe, org_name, repo_name, oauth_token, private)
     try:
     with gr.Column() as main_ui:
         gr.Markdown(value="## 1. Describe the dataset you want")
         with gr.Row():
+            with gr.Column(scale=2):
                 dataset_description = gr.Textbox(
                     label="Dataset description",
                     placeholder="Give a precise description of your desired dataset.",
                 )
+                load_btn = gr.Button(
+                    "Load dataset",
+                    variant="primary",
+                )
+            with gr.Column(scale=2):
                 examples = gr.Examples(
                     examples=DEFAULT_DATASET_DESCRIPTIONS,
                     inputs=[dataset_description],
                     cache_examples=False,
                     label="Example descriptions",
                 )
+            with gr.Column(scale=1):
                 pass
         gr.HTML(value="<hr>")
         gr.HTML(value="<hr>")
         gr.Markdown(value="## 3. Generate your dataset")
         with gr.Row():
+            with gr.Column(scale=2):
                 org_name = get_org_dropdown()
                 repo_name = gr.Textbox(
                     label="Repo name",
                     value=f"my-distiset-{str(uuid.uuid4())[:8]}",
                     interactive=True,
                 )
+                num_rows = gr.Number(
                     label="Number of rows",
                     value=10,
                     interactive=True,
                 success_message = gr.Markdown()
         pipeline_code = get_pipeline_code_ui(
+            generate_pipeline_code(system_prompt.value, num_turns.value, num_rows.value)
         )
     gr.on(
             repo_name,
             system_prompt,
             num_turns,
+            num_rows,
             private,
         ],
         outputs=[success_message],

src/distilabel_dataset_generator/apps/textcat.py CHANGED Viewed

@@ -219,7 +219,7 @@ def push_dataset_to_argilla(
     difficulty: str,
     clarity: str,
     num_labels: int = 1,
-    n_rows: int = 10,
     labels: List[str] = None,
     private: bool = False,
     oauth_token: Union[gr.OAuthToken, None] = None,
@@ -231,7 +231,7 @@ def push_dataset_to_argilla(
         clarity=clarity,
         num_labels=num_labels,
         labels=labels,
-        num_rows=n_rows,
     )
     push_dataset_to_hub(
         dataframe, org_name, repo_name, num_labels, labels, oauth_token, private
@@ -361,19 +361,23 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
     with gr.Column() as main_ui:
         gr.Markdown("## 1. Describe the dataset you want")
         with gr.Row():
-            with gr.Column(scale=1):
                 dataset_description = gr.Textbox(
                     label="Dataset description",
                     placeholder="Give a precise description of your desired dataset.",
                 )
                 examples = gr.Examples(
                     examples=DEFAULT_DATASET_DESCRIPTIONS,
                     inputs=[dataset_description],
                     cache_examples=False,
                     label="Example descriptions",
                 )
-                load_btn = gr.Button("Load dataset")
-            with gr.Column(scale=3):
                 pass
         gr.HTML("<hr>")
@@ -435,7 +439,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
         gr.HTML("<hr>")
         gr.Markdown("## 3. Generate your dataset")
         with gr.Row():
-            with gr.Column(scale=1):
                 org_name = get_org_dropdown()
                 repo_name = gr.Textbox(
                     label="Repo name",
@@ -443,7 +447,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
                     value=f"my-distiset-{str(uuid.uuid4())[:8]}",
                     interactive=True,
                 )
-                n_rows = gr.Number(
                     label="Number of rows",
                     value=10,
                     interactive=True,
@@ -466,7 +470,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
                 clarity=clarity.value,
                 labels=labels.value,
                 num_labels=num_labels.value,
-                num_rows=n_rows.value,
             )
         )
@@ -514,7 +518,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
             difficulty,
             clarity,
             num_labels,
-            n_rows,
             labels,
             private,
         ],

     difficulty: str,
     clarity: str,
     num_labels: int = 1,
+    num_rows: int = 10,
     labels: List[str] = None,
     private: bool = False,
     oauth_token: Union[gr.OAuthToken, None] = None,
         clarity=clarity,
         num_labels=num_labels,
         labels=labels,
+        num_rows=num_rows,
     )
     push_dataset_to_hub(
         dataframe, org_name, repo_name, num_labels, labels, oauth_token, private
     with gr.Column() as main_ui:
         gr.Markdown("## 1. Describe the dataset you want")
         with gr.Row():
+            with gr.Column(scale=2):
                 dataset_description = gr.Textbox(
                     label="Dataset description",
                     placeholder="Give a precise description of your desired dataset.",
                 )
+                load_btn = gr.Button(
+                    "Load dataset",
+                    variant="primary",
+                )
+            with gr.Column(scale=2):
                 examples = gr.Examples(
                     examples=DEFAULT_DATASET_DESCRIPTIONS,
                     inputs=[dataset_description],
                     cache_examples=False,
                     label="Example descriptions",
                 )
+            with gr.Column(scale=1):
                 pass
         gr.HTML("<hr>")
         gr.HTML("<hr>")
         gr.Markdown("## 3. Generate your dataset")
         with gr.Row():
+            with gr.Column(scale=2):
                 org_name = get_org_dropdown()
                 repo_name = gr.Textbox(
                     label="Repo name",
                     value=f"my-distiset-{str(uuid.uuid4())[:8]}",
                     interactive=True,
                 )
+                num_rows = gr.Number(
                     label="Number of rows",
                     value=10,
                     interactive=True,
                 clarity=clarity.value,
                 labels=labels.value,
                 num_labels=num_labels.value,
+                num_rows=num_rows.value,
             )
         )
             difficulty,
             clarity,
             num_labels,
+            num_rows,
             labels,
             private,
         ],

src/distilabel_dataset_generator/utils.py CHANGED Viewed

@@ -50,22 +50,22 @@ def list_orgs(oauth_token: OAuthToken = None):
             return []
         data = whoami(oauth_token.token)
         if data["auth"]["type"] == "oauth":
-            organisations = [data["name"]] + [org["name"] for org in data["orgs"]]
         elif data["auth"]["type"] == "access_token":
-            organisations = [org["name"] for org in data["orgs"]]
         else:
-            organisations = [
                 entry["entity"]["name"]
                 for entry in data["auth"]["accessToken"]["fineGrained"]["scoped"]
                 if "repo.write" in entry["permissions"]
             ]
-            organisations = [org for org in organisations if org != data["name"]]
-            organisations = [data["name"]] + organisations
     except Exception as e:
         raise gr.Error(
             f"Failed to get organizations: {e}. See if you are logged and connected: https://huggingface.co/settings/connected-applications."
         )
-    return organisations
 def get_org_dropdown(oauth_token: OAuthToken = None):

             return []
         data = whoami(oauth_token.token)
         if data["auth"]["type"] == "oauth":
+            organizations = [data["name"]] + [org["name"] for org in data["orgs"]]
         elif data["auth"]["type"] == "access_token":
+            organizations = [org["name"] for org in data["orgs"]]
         else:
+            organizations = [
                 entry["entity"]["name"]
                 for entry in data["auth"]["accessToken"]["fineGrained"]["scoped"]
                 if "repo.write" in entry["permissions"]
             ]
+            organizations = [org for org in organizations if org != data["name"]]
+            organizations = [data["name"]] + organizations
     except Exception as e:
         raise gr.Error(
             f"Failed to get organizations: {e}. See if you are logged and connected: https://huggingface.co/settings/connected-applications."
         )
+    return organizations
 def get_org_dropdown(oauth_token: OAuthToken = None):