modify column width and typos
Browse files
src/distilabel_dataset_generator/apps/eval.py
CHANGED
@@ -35,15 +35,15 @@ def get_valid_columns(df: pd.DataFrame):
|
|
35 |
return valid_columns
|
36 |
|
37 |
|
38 |
-
def load_dataset_from_hub(hub_repo_id: str,
|
39 |
gr.Info(message="Loading dataset ...")
|
40 |
if not hub_repo_id:
|
41 |
raise gr.Error("Hub repo id is required")
|
42 |
ds_dict = load_dataset(hub_repo_id)
|
43 |
splits = list(ds_dict.keys())
|
44 |
ds = ds_dict[splits[0]]
|
45 |
-
if
|
46 |
-
ds = ds.select(range(
|
47 |
df = ds.to_pandas()
|
48 |
# Get columns that contain either strings or lists of dictionaries
|
49 |
valid_columns = get_valid_columns(df)
|
@@ -130,7 +130,7 @@ def apply_to_sample_dataset(
|
|
130 |
prompt_template: str,
|
131 |
structured_output: dict,
|
132 |
):
|
133 |
-
df, _, _, _ = load_dataset_from_hub(repo_id,
|
134 |
df = _apply_to_dataset(
|
135 |
df,
|
136 |
eval_type,
|
@@ -150,7 +150,7 @@ def push_to_hub(
|
|
150 |
org_name: str,
|
151 |
repo_name: str,
|
152 |
private: bool,
|
153 |
-
|
154 |
original_repo_id: str,
|
155 |
eval_type: str,
|
156 |
aspects_instruction: list[str],
|
@@ -162,7 +162,7 @@ def push_to_hub(
|
|
162 |
prompt_template: str,
|
163 |
structured_output: dict,
|
164 |
):
|
165 |
-
df, _, _, _ = load_dataset_from_hub(original_repo_id,
|
166 |
df = _apply_to_dataset(
|
167 |
df,
|
168 |
eval_type,
|
@@ -257,7 +257,7 @@ with gr.Blocks() as app:
|
|
257 |
gr.HTML("<hr>")
|
258 |
gr.Markdown("## 3. Generate your dataset")
|
259 |
with gr.Row():
|
260 |
-
with gr.Column(scale=
|
261 |
org_name = get_org_dropdown()
|
262 |
repo_name = gr.Textbox(
|
263 |
label="Repo name",
|
@@ -265,7 +265,7 @@ with gr.Blocks() as app:
|
|
265 |
value="my-distiset",
|
266 |
interactive=True,
|
267 |
)
|
268 |
-
|
269 |
label="Number of rows",
|
270 |
value=10,
|
271 |
interactive=True,
|
@@ -314,7 +314,7 @@ with gr.Blocks() as app:
|
|
314 |
org_name,
|
315 |
repo_name,
|
316 |
private,
|
317 |
-
|
318 |
search_in,
|
319 |
eval_type,
|
320 |
aspects_instruction,
|
|
|
35 |
return valid_columns
|
36 |
|
37 |
|
38 |
+
def load_dataset_from_hub(hub_repo_id: str, num_rows: int = 10):
|
39 |
gr.Info(message="Loading dataset ...")
|
40 |
if not hub_repo_id:
|
41 |
raise gr.Error("Hub repo id is required")
|
42 |
ds_dict = load_dataset(hub_repo_id)
|
43 |
splits = list(ds_dict.keys())
|
44 |
ds = ds_dict[splits[0]]
|
45 |
+
if num_rows:
|
46 |
+
ds = ds.select(range(num_rows))
|
47 |
df = ds.to_pandas()
|
48 |
# Get columns that contain either strings or lists of dictionaries
|
49 |
valid_columns = get_valid_columns(df)
|
|
|
130 |
prompt_template: str,
|
131 |
structured_output: dict,
|
132 |
):
|
133 |
+
df, _, _, _ = load_dataset_from_hub(repo_id, num_rows=10)
|
134 |
df = _apply_to_dataset(
|
135 |
df,
|
136 |
eval_type,
|
|
|
150 |
org_name: str,
|
151 |
repo_name: str,
|
152 |
private: bool,
|
153 |
+
num_rows: int,
|
154 |
original_repo_id: str,
|
155 |
eval_type: str,
|
156 |
aspects_instruction: list[str],
|
|
|
162 |
prompt_template: str,
|
163 |
structured_output: dict,
|
164 |
):
|
165 |
+
df, _, _, _ = load_dataset_from_hub(original_repo_id, num_rows=num_rows)
|
166 |
df = _apply_to_dataset(
|
167 |
df,
|
168 |
eval_type,
|
|
|
257 |
gr.HTML("<hr>")
|
258 |
gr.Markdown("## 3. Generate your dataset")
|
259 |
with gr.Row():
|
260 |
+
with gr.Column(scale=2):
|
261 |
org_name = get_org_dropdown()
|
262 |
repo_name = gr.Textbox(
|
263 |
label="Repo name",
|
|
|
265 |
value="my-distiset",
|
266 |
interactive=True,
|
267 |
)
|
268 |
+
num_rows = gr.Number(
|
269 |
label="Number of rows",
|
270 |
value=10,
|
271 |
interactive=True,
|
|
|
314 |
org_name,
|
315 |
repo_name,
|
316 |
private,
|
317 |
+
num_rows,
|
318 |
search_in,
|
319 |
eval_type,
|
320 |
aspects_instruction,
|
src/distilabel_dataset_generator/apps/sft.py
CHANGED
@@ -213,7 +213,7 @@ def push_dataset_to_argilla(
|
|
213 |
repo_name: str,
|
214 |
system_prompt: str,
|
215 |
num_turns: int = 1,
|
216 |
-
|
217 |
private: bool = False,
|
218 |
oauth_token: Union[gr.OAuthToken, None] = None,
|
219 |
progress=gr.Progress(),
|
@@ -221,7 +221,7 @@ def push_dataset_to_argilla(
|
|
221 |
dataframe = generate_dataset(
|
222 |
system_prompt=system_prompt,
|
223 |
num_turns=num_turns,
|
224 |
-
num_rows=
|
225 |
)
|
226 |
push_dataset_to_hub(dataframe, org_name, repo_name, oauth_token, private)
|
227 |
try:
|
@@ -352,20 +352,23 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
|
|
352 |
with gr.Column() as main_ui:
|
353 |
gr.Markdown(value="## 1. Describe the dataset you want")
|
354 |
with gr.Row():
|
355 |
-
with gr.Column(scale=
|
356 |
dataset_description = gr.Textbox(
|
357 |
label="Dataset description",
|
358 |
placeholder="Give a precise description of your desired dataset.",
|
359 |
)
|
|
|
|
|
|
|
|
|
|
|
360 |
examples = gr.Examples(
|
361 |
examples=DEFAULT_DATASET_DESCRIPTIONS,
|
362 |
inputs=[dataset_description],
|
363 |
cache_examples=False,
|
364 |
label="Example descriptions",
|
365 |
)
|
366 |
-
|
367 |
-
load_btn = gr.Button("Load dataset")
|
368 |
-
with gr.Column(scale=3):
|
369 |
pass
|
370 |
|
371 |
gr.HTML(value="<hr>")
|
@@ -392,7 +395,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
|
|
392 |
gr.HTML(value="<hr>")
|
393 |
gr.Markdown(value="## 3. Generate your dataset")
|
394 |
with gr.Row():
|
395 |
-
with gr.Column(scale=
|
396 |
org_name = get_org_dropdown()
|
397 |
repo_name = gr.Textbox(
|
398 |
label="Repo name",
|
@@ -400,7 +403,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
|
|
400 |
value=f"my-distiset-{str(uuid.uuid4())[:8]}",
|
401 |
interactive=True,
|
402 |
)
|
403 |
-
|
404 |
label="Number of rows",
|
405 |
value=10,
|
406 |
interactive=True,
|
@@ -417,7 +420,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
|
|
417 |
success_message = gr.Markdown()
|
418 |
|
419 |
pipeline_code = get_pipeline_code_ui(
|
420 |
-
generate_pipeline_code(system_prompt.value, num_turns.value,
|
421 |
)
|
422 |
|
423 |
gr.on(
|
@@ -454,7 +457,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
|
|
454 |
repo_name,
|
455 |
system_prompt,
|
456 |
num_turns,
|
457 |
-
|
458 |
private,
|
459 |
],
|
460 |
outputs=[success_message],
|
|
|
213 |
repo_name: str,
|
214 |
system_prompt: str,
|
215 |
num_turns: int = 1,
|
216 |
+
num_rows: int = 10,
|
217 |
private: bool = False,
|
218 |
oauth_token: Union[gr.OAuthToken, None] = None,
|
219 |
progress=gr.Progress(),
|
|
|
221 |
dataframe = generate_dataset(
|
222 |
system_prompt=system_prompt,
|
223 |
num_turns=num_turns,
|
224 |
+
num_rows=num_rows,
|
225 |
)
|
226 |
push_dataset_to_hub(dataframe, org_name, repo_name, oauth_token, private)
|
227 |
try:
|
|
|
352 |
with gr.Column() as main_ui:
|
353 |
gr.Markdown(value="## 1. Describe the dataset you want")
|
354 |
with gr.Row():
|
355 |
+
with gr.Column(scale=2):
|
356 |
dataset_description = gr.Textbox(
|
357 |
label="Dataset description",
|
358 |
placeholder="Give a precise description of your desired dataset.",
|
359 |
)
|
360 |
+
load_btn = gr.Button(
|
361 |
+
"Load dataset",
|
362 |
+
variant="primary",
|
363 |
+
)
|
364 |
+
with gr.Column(scale=2):
|
365 |
examples = gr.Examples(
|
366 |
examples=DEFAULT_DATASET_DESCRIPTIONS,
|
367 |
inputs=[dataset_description],
|
368 |
cache_examples=False,
|
369 |
label="Example descriptions",
|
370 |
)
|
371 |
+
with gr.Column(scale=1):
|
|
|
|
|
372 |
pass
|
373 |
|
374 |
gr.HTML(value="<hr>")
|
|
|
395 |
gr.HTML(value="<hr>")
|
396 |
gr.Markdown(value="## 3. Generate your dataset")
|
397 |
with gr.Row():
|
398 |
+
with gr.Column(scale=2):
|
399 |
org_name = get_org_dropdown()
|
400 |
repo_name = gr.Textbox(
|
401 |
label="Repo name",
|
|
|
403 |
value=f"my-distiset-{str(uuid.uuid4())[:8]}",
|
404 |
interactive=True,
|
405 |
)
|
406 |
+
num_rows = gr.Number(
|
407 |
label="Number of rows",
|
408 |
value=10,
|
409 |
interactive=True,
|
|
|
420 |
success_message = gr.Markdown()
|
421 |
|
422 |
pipeline_code = get_pipeline_code_ui(
|
423 |
+
generate_pipeline_code(system_prompt.value, num_turns.value, num_rows.value)
|
424 |
)
|
425 |
|
426 |
gr.on(
|
|
|
457 |
repo_name,
|
458 |
system_prompt,
|
459 |
num_turns,
|
460 |
+
num_rows,
|
461 |
private,
|
462 |
],
|
463 |
outputs=[success_message],
|
src/distilabel_dataset_generator/apps/textcat.py
CHANGED
@@ -219,7 +219,7 @@ def push_dataset_to_argilla(
|
|
219 |
difficulty: str,
|
220 |
clarity: str,
|
221 |
num_labels: int = 1,
|
222 |
-
|
223 |
labels: List[str] = None,
|
224 |
private: bool = False,
|
225 |
oauth_token: Union[gr.OAuthToken, None] = None,
|
@@ -231,7 +231,7 @@ def push_dataset_to_argilla(
|
|
231 |
clarity=clarity,
|
232 |
num_labels=num_labels,
|
233 |
labels=labels,
|
234 |
-
num_rows=
|
235 |
)
|
236 |
push_dataset_to_hub(
|
237 |
dataframe, org_name, repo_name, num_labels, labels, oauth_token, private
|
@@ -361,19 +361,23 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
|
|
361 |
with gr.Column() as main_ui:
|
362 |
gr.Markdown("## 1. Describe the dataset you want")
|
363 |
with gr.Row():
|
364 |
-
with gr.Column(scale=
|
365 |
dataset_description = gr.Textbox(
|
366 |
label="Dataset description",
|
367 |
placeholder="Give a precise description of your desired dataset.",
|
368 |
)
|
|
|
|
|
|
|
|
|
|
|
369 |
examples = gr.Examples(
|
370 |
examples=DEFAULT_DATASET_DESCRIPTIONS,
|
371 |
inputs=[dataset_description],
|
372 |
cache_examples=False,
|
373 |
label="Example descriptions",
|
374 |
)
|
375 |
-
|
376 |
-
with gr.Column(scale=3):
|
377 |
pass
|
378 |
|
379 |
gr.HTML("<hr>")
|
@@ -435,7 +439,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
|
|
435 |
gr.HTML("<hr>")
|
436 |
gr.Markdown("## 3. Generate your dataset")
|
437 |
with gr.Row():
|
438 |
-
with gr.Column(scale=
|
439 |
org_name = get_org_dropdown()
|
440 |
repo_name = gr.Textbox(
|
441 |
label="Repo name",
|
@@ -443,7 +447,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
|
|
443 |
value=f"my-distiset-{str(uuid.uuid4())[:8]}",
|
444 |
interactive=True,
|
445 |
)
|
446 |
-
|
447 |
label="Number of rows",
|
448 |
value=10,
|
449 |
interactive=True,
|
@@ -466,7 +470,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
|
|
466 |
clarity=clarity.value,
|
467 |
labels=labels.value,
|
468 |
num_labels=num_labels.value,
|
469 |
-
num_rows=
|
470 |
)
|
471 |
)
|
472 |
|
@@ -514,7 +518,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
|
|
514 |
difficulty,
|
515 |
clarity,
|
516 |
num_labels,
|
517 |
-
|
518 |
labels,
|
519 |
private,
|
520 |
],
|
|
|
219 |
difficulty: str,
|
220 |
clarity: str,
|
221 |
num_labels: int = 1,
|
222 |
+
num_rows: int = 10,
|
223 |
labels: List[str] = None,
|
224 |
private: bool = False,
|
225 |
oauth_token: Union[gr.OAuthToken, None] = None,
|
|
|
231 |
clarity=clarity,
|
232 |
num_labels=num_labels,
|
233 |
labels=labels,
|
234 |
+
num_rows=num_rows,
|
235 |
)
|
236 |
push_dataset_to_hub(
|
237 |
dataframe, org_name, repo_name, num_labels, labels, oauth_token, private
|
|
|
361 |
with gr.Column() as main_ui:
|
362 |
gr.Markdown("## 1. Describe the dataset you want")
|
363 |
with gr.Row():
|
364 |
+
with gr.Column(scale=2):
|
365 |
dataset_description = gr.Textbox(
|
366 |
label="Dataset description",
|
367 |
placeholder="Give a precise description of your desired dataset.",
|
368 |
)
|
369 |
+
load_btn = gr.Button(
|
370 |
+
"Load dataset",
|
371 |
+
variant="primary",
|
372 |
+
)
|
373 |
+
with gr.Column(scale=2):
|
374 |
examples = gr.Examples(
|
375 |
examples=DEFAULT_DATASET_DESCRIPTIONS,
|
376 |
inputs=[dataset_description],
|
377 |
cache_examples=False,
|
378 |
label="Example descriptions",
|
379 |
)
|
380 |
+
with gr.Column(scale=1):
|
|
|
381 |
pass
|
382 |
|
383 |
gr.HTML("<hr>")
|
|
|
439 |
gr.HTML("<hr>")
|
440 |
gr.Markdown("## 3. Generate your dataset")
|
441 |
with gr.Row():
|
442 |
+
with gr.Column(scale=2):
|
443 |
org_name = get_org_dropdown()
|
444 |
repo_name = gr.Textbox(
|
445 |
label="Repo name",
|
|
|
447 |
value=f"my-distiset-{str(uuid.uuid4())[:8]}",
|
448 |
interactive=True,
|
449 |
)
|
450 |
+
num_rows = gr.Number(
|
451 |
label="Number of rows",
|
452 |
value=10,
|
453 |
interactive=True,
|
|
|
470 |
clarity=clarity.value,
|
471 |
labels=labels.value,
|
472 |
num_labels=num_labels.value,
|
473 |
+
num_rows=num_rows.value,
|
474 |
)
|
475 |
)
|
476 |
|
|
|
518 |
difficulty,
|
519 |
clarity,
|
520 |
num_labels,
|
521 |
+
num_rows,
|
522 |
labels,
|
523 |
private,
|
524 |
],
|
src/distilabel_dataset_generator/utils.py
CHANGED
@@ -50,22 +50,22 @@ def list_orgs(oauth_token: OAuthToken = None):
|
|
50 |
return []
|
51 |
data = whoami(oauth_token.token)
|
52 |
if data["auth"]["type"] == "oauth":
|
53 |
-
|
54 |
elif data["auth"]["type"] == "access_token":
|
55 |
-
|
56 |
else:
|
57 |
-
|
58 |
entry["entity"]["name"]
|
59 |
for entry in data["auth"]["accessToken"]["fineGrained"]["scoped"]
|
60 |
if "repo.write" in entry["permissions"]
|
61 |
]
|
62 |
-
|
63 |
-
|
64 |
except Exception as e:
|
65 |
raise gr.Error(
|
66 |
f"Failed to get organizations: {e}. See if you are logged and connected: https://huggingface.co/settings/connected-applications."
|
67 |
)
|
68 |
-
return
|
69 |
|
70 |
|
71 |
def get_org_dropdown(oauth_token: OAuthToken = None):
|
|
|
50 |
return []
|
51 |
data = whoami(oauth_token.token)
|
52 |
if data["auth"]["type"] == "oauth":
|
53 |
+
organizations = [data["name"]] + [org["name"] for org in data["orgs"]]
|
54 |
elif data["auth"]["type"] == "access_token":
|
55 |
+
organizations = [org["name"] for org in data["orgs"]]
|
56 |
else:
|
57 |
+
organizations = [
|
58 |
entry["entity"]["name"]
|
59 |
for entry in data["auth"]["accessToken"]["fineGrained"]["scoped"]
|
60 |
if "repo.write" in entry["permissions"]
|
61 |
]
|
62 |
+
organizations = [org for org in organizations if org != data["name"]]
|
63 |
+
organizations = [data["name"]] + organizations
|
64 |
except Exception as e:
|
65 |
raise gr.Error(
|
66 |
f"Failed to get organizations: {e}. See if you are logged and connected: https://huggingface.co/settings/connected-applications."
|
67 |
)
|
68 |
+
return organizations
|
69 |
|
70 |
|
71 |
def get_org_dropdown(oauth_token: OAuthToken = None):
|