Commit c4435ca
1 Parent(s): 86f370f

fix examples for evaluation

pdm.lock
CHANGED

The diff for this file is too large to render. See raw diff.

pyproject.toml
CHANGED

@@ -1,6 +1,6 @@
 [project]
 name = "synthetic-dataset-generator"
-version = "0.1.
+version = "0.1.2"
 description = "Build datasets using natural language"
 authors = [
     {name = "davidberenstein1957", email = "[email protected]"},

src/synthetic_dataset_generator/app.py
CHANGED

@@ -15,6 +15,9 @@ button[role="tab"][aria-selected="true"]:hover {border-color: var(--button-prima
 #system_prompt_examples { color: var(--body-text-color) !important; background-color: var(--block-background-fill) !important;}
 .container {padding-inline: 0 !important}
 #sign_in_button { flex-grow: 0; width: 50% !important; display: flex; align-items: center; justify-content: center; margin: 0 auto; }
+.table-view .table-wrap {
+    max-height: 450px;
+}
 """

 image = """<br><img src="https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/main/assets/logo.svg" alt="Synthetic Data Generator Logo" style="display: block; margin-left: auto; margin-right: auto; width: clamp(50%, 400px, 100%)"/>"""
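
The new .table-view .table-wrap rule caps rendered tables at 450px; components opt in by carrying the table-view class, which the gr.Dataframe changes later in this commit do via elem_classes. A minimal standalone sketch of this CSS-scoping pattern (illustration only, not the repo's code):

    import gradio as gr

    # Custom CSS is passed to gr.Blocks; the selector only matches components
    # that carry the "table-view" class.
    css = """
    .table-view .table-wrap {
        max-height: 450px;
    }
    """

    with gr.Blocks(css=css) as demo:
        gr.Dataframe(
            headers=["prompt", "completion", "evaluation"],
            wrap=True,
            interactive=False,
            elem_classes="table-view",  # opts this table into the rule above
        )

    demo.launch()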

src/synthetic_dataset_generator/apps/eval.py
CHANGED

@@ -89,22 +89,72 @@ def load_dataset_from_hub(
     if not repo_id:
         raise gr.Error("Hub repo id is required")
     subsets = get_dataset_config_names(repo_id, token=token)
-    ds_dict = load_dataset(repo_id, subsets[0], token=token)
     splits = get_dataset_split_names(repo_id, subsets[0], token=token)
-    ds =
-
-
+    ds = load_dataset(repo_id, subsets[0], split=splits[0], token=token, streaming=True)
+    rows = []
+    for idx, row in enumerate(ds):
+        rows.append(row)
+        if idx == num_rows:
+            break
+    ds = Dataset.from_list(rows)
     dataframe = ds.to_pandas()
     instruction_valid_columns, response_valid_columns = get_valid_columns(dataframe)
+    col_instruction = instruction_valid_columns[0] if instruction_valid_columns else ""
+    col_response = "No valid response columns found."
+    for col in response_valid_columns:
+        if col != col_instruction:
+            col_response = col
+            break
+
+    prompt_template = gr.Code(
+        label="Prompt template",
+        value="\n".join(
+            [
+                "Evaluate the following text based on criteria.",
+                "Criteria: quality.",
+                "Score: between 1 and 10.",
+                "Text: {{" + col_response + "}}",
+            ]
+        ),
+        language="markdown",
+        interactive=True,
+    )
+    structured_output = gr.Code(
+        label="Structured output",
+        value=json.dumps(
+            {
+                "type": "object",
+                "properties": {"quality": {"type": "integer"}},
+                "required": ["quality"],
+            },
+            indent=4,
+        ),
+        language="json",
+        interactive=True,
+    )
     return (
         dataframe,
-        gr.Dropdown(
-
+        gr.Dropdown(
+            choices=instruction_valid_columns,
+            label="Instruction column",
+            value=col_instruction,
+            interactive=True,
+        ),
+        gr.Dropdown(
+            choices=response_valid_columns,
+            label="Response column",
+            value=col_response,
+            interactive=False
+            if col_response == "No valid response columns found."
+            else True,
+        ),
+        prompt_template,
+        structured_output,
     )


 def define_evaluation_aspects(task_type: str):
-    if task_type == "
+    if task_type == "chat-eval":
         return gr.Dropdown(
             value=["overall-rating"],
             choices=["helpfulness", "truthfulness", "overall-rating", "honesty"],
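
The loader now streams the first rows instead of materializing a full split with load_dataset. A minimal standalone sketch of this pattern (helper name and config are assumptions, not the repo's code):

    from itertools import islice

    from datasets import Dataset, load_dataset

    def head_of_dataset(repo_id: str, config: str, split: str, num_rows: int = 10) -> Dataset:
        # streaming=True avoids downloading the whole split; rows arrive lazily
        ds = load_dataset(repo_id, config, split=split, streaming=True)
        # islice keeps exactly num_rows items; note the commit's loop above breaks
        # only after appending index num_rows, so it keeps num_rows + 1 rows
        return Dataset.from_list(list(islice(ds, num_rows)))

    df = head_of_dataset("argilla/distilabel-intel-orca-dpo-pairs", "default", "train").to_pandas()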

@@ -251,7 +301,7 @@ def _evaluate_dataset(
     num_rows: int = 10,
     is_sample: bool = False,
 ):
-    if eval_type == "
+    if eval_type == "chat-eval":
         dataframe = evaluate_instruction_response(
             dataframe=dataframe,
             aspects=aspects_instruction_response,

@@ -280,7 +330,7 @@ def evaluate_sample_dataset(
     prompt_template: str,
     structured_output: dict,
 ):
-    dataframe, _, _ = load_dataset_from_hub(repo_id, num_rows=10)
+    dataframe, _, _, _, _ = load_dataset_from_hub(repo_id, num_rows=10)
     dataframe = _evaluate_dataset(
         dataframe=dataframe,
         eval_type=eval_type,

@@ -324,7 +374,7 @@ def push_dataset(
     oauth_token: Union[gr.OAuthToken, None] = None,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
-    dataframe, _, _ = load_dataset_from_hub(original_repo_id, num_rows=num_rows)
+    dataframe, _, _, _, _ = load_dataset_from_hub(original_repo_id, num_rows=num_rows)
     dataframe = _evaluate_dataset(
         dataframe=dataframe,
         eval_type=eval_type,

@@ -342,7 +392,7 @@ def push_dataset(
     client = get_argilla_client()
     if client is None:
         return ""
-    if eval_type == "
+    if eval_type == "chat-eval":
         num_generations = len((dataframe["generations"][0]))
         fields = [
             rg.ChatField(
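
load_dataset_from_hub now returns five values (the dataframe plus four Gradio component updates), so both call sites above unpack and discard the extras. A trivial usage sketch (repo id chosen for illustration):

    dataframe, _, _, _, _ = load_dataset_from_hub(
        "argilla/distilabel-intel-orca-dpo-pairs", num_rows=10
    )
    print(dataframe.head())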

@@ -612,7 +662,18 @@ with gr.Blocks() as app:
             load_btn = gr.Button("Load", variant="primary")

         with gr.Column(scale=3):
-
+            examples = gr.Examples(
+                examples=[
+                    "argilla/distilabel-sft-easy",
+                    "HuggingFaceFW/fineweb-edu",
+                    "argilla/distilabel-intel-orca-dpo-pairs",
+                ],
+                label="Example datasets",
+                fn=lambda x: x,
+                inputs=[search_in],
+                run_on_click=True,
+            )
+            search_out = gr.HTML(label="Dataset preview", visible=False)

     gr.HTML(value="<hr>")
     gr.Markdown(value="## 2. Configure your task")
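
With run_on_click=True, clicking one of these repo ids both fills search_in and runs fn immediately, so an example dataset loads with a single click. A minimal standalone sketch of the mechanism (toy handler and output, not the repo's code):

    import gradio as gr

    with gr.Blocks() as demo:
        search_in = gr.Textbox(label="Hub repo id")
        preview = gr.Markdown()
        gr.Examples(
            examples=["argilla/distilabel-intel-orca-dpo-pairs"],
            inputs=[search_in],
            outputs=[preview],
            fn=lambda repo_id: f"Selected: {repo_id}",
            run_on_click=True,  # fills the textbox and triggers fn on click
        )

    demo.launch()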

@@ -620,58 +681,54 @@ with gr.Blocks() as app:
         with gr.Column(scale=2):
             eval_type = gr.Dropdown(
                 label="Evaluation type",
-                choices=["
-                value="
+                choices=["chat-eval", "custom-eval"],
+                value="chat-eval",
                 multiselect=False,
                 visible=False,
             )
-            with gr.Tab("
+            with gr.Tab("Response Evaluation") as tab_instruction_response:
                 aspects_instruction_response = define_evaluation_aspects(
-                    "
+                    "chat-eval"
                 )
                 instruction_instruction_response = gr.Dropdown(
                     label="Instruction Column",
-
+                    info="Select the instruction column to evaluate",
+                    choices=["Load your data first in step 1."],
+                    value="Load your data first in step 1.",
+                    interactive=False,
                     multiselect=False,
                     allow_custom_value=False,
                 )
                 response_instruction_response = gr.Dropdown(
                     label="Response Column",
-
-
+                    info="Select the response column(s) to evaluate",
+                    choices=["Load your data first in step 1."],
+                    value="Load your data first in step 1.",
+                    interactive=False,
+                    multiselect=False,
                     allow_custom_value=False,
                 )
                 tab_instruction_response.select(
-                    fn=lambda: "
+                    fn=lambda: "chat-eval",
                     inputs=[],
                     outputs=[eval_type],
                 )
-            with gr.Tab("
-                aspects_custom = define_evaluation_aspects("custom")
+            with gr.Tab("Custom Evaluation Prompt") as tab_custom:
+                aspects_custom = define_evaluation_aspects("custom-eval")
                 prompt_template = gr.Code(
                     label="Prompt template",
-                    value="
+                    value="Load your data first in step 1.",
                     language="markdown",
-                    interactive=
+                    interactive=False,
                 )
                 structured_output = gr.Code(
                     label="Structured output",
-                    value=json.dumps(
-                        {
-                            "type": "object",
-                            "properties": {
-                                "quality": {"type": "integer"},
-                                "clarity": {"type": "integer"},
-                                "relevance": {"type": "integer"},
-                            },
-                        },
-                        indent=4,
-                    ),
+                    value="Load your data first in step 1.",
                     language="json",
-                    interactive=
+                    interactive=False,
                 )
                 tab_custom.select(
-                    fn=lambda: "custom",
+                    fn=lambda: "custom-eval",
                     inputs=[],
                     outputs=[eval_type],
                 )
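
The custom tab's prompt template uses "{{ column }}" placeholders that are filled from dataset columns at evaluation time. A sketch of that substitution, assuming Jinja2-style rendering (which distilabel's custom templates are based on); this is an illustration, not the library's internal code:

    from jinja2 import Template

    prompt_template = "\n".join(
        [
            "Evaluate the following text based on criteria.",
            "Criteria: quality.",
            "Score: between 1 and 10.",
            "Text: {{ response }}",
        ]
    )

    row = {"response": "The mitochondria is the powerhouse of the cell."}
    print(Template(prompt_template).render(**row))  # placeholder -> column value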

@@ -681,9 +738,10 @@ with gr.Blocks() as app:
         with gr.Column(scale=3):
             dataframe = gr.Dataframe(
                 headers=["prompt", "completion", "evaluation"],
-                wrap=
+                wrap=True,
                 height=500,
                 interactive=False,
+                elem_classes="table-view",
             )

     gr.HTML(value="<hr>")

@@ -746,6 +804,8 @@ with gr.Blocks() as app:
             dataframe,
             instruction_instruction_response,
             response_instruction_response,
+            prompt_template,
+            structured_output,
         ],
     )
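
The load event's outputs list now also carries prompt_template and structured_output, so one click on "Load" fills the column dropdowns and pre-populates the custom prompt. The general Gradio pattern, sketched with toy values (not the repo's code):

    import gradio as gr

    def load(repo_id: str):
        # return one update per component listed in outputs=
        return (
            [["hi", "hello", "good"]],  # dataframe rows
            gr.Dropdown(choices=["instruction"], value="instruction", interactive=True),
            gr.Dropdown(choices=["response"], value="response", interactive=True),
            gr.Code(value="Evaluate: {{ response }}", interactive=True),
            gr.Code(value='{"type": "object"}', interactive=True),
        )

    with gr.Blocks() as demo:
        repo = gr.Textbox(label="Hub repo id")
        df = gr.Dataframe(headers=["prompt", "completion", "evaluation"])
        instr = gr.Dropdown(label="Instruction column")
        resp = gr.Dropdown(label="Response column")
        tmpl = gr.Code(language="markdown")
        schema = gr.Code(language="json")
        gr.Button("Load").click(
            load, inputs=[repo], outputs=[df, instr, resp, tmpl, schema]
        )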

src/synthetic_dataset_generator/apps/sft.py
CHANGED

@@ -84,6 +84,7 @@ def _get_dataframe():
         wrap=True,
         height=500,
         interactive=False,
+        elem_classes="table-view",
     )

src/synthetic_dataset_generator/apps/textcat.py
CHANGED

@@ -37,7 +37,11 @@ from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE

 def _get_dataframe():
     return gr.Dataframe(
-        headers=["labels", "text"],
+        headers=["labels", "text"],
+        wrap=True,
+        height=500,
+        interactive=False,
+        elem_classes="table-view",
     )

src/synthetic_dataset_generator/pipelines/eval.py
CHANGED

@@ -18,7 +18,7 @@ def get_ultrafeedback_evaluator(aspect, is_sample):
         api_key=_get_next_api_key(),
         generation_kwargs={
             "temperature": 0.01,
-            "max_new_tokens":
+            "max_new_tokens": 2048 if not is_sample else 512,
         },
     ),
     aspect=aspect,

@@ -36,7 +36,7 @@ def get_custom_evaluator(prompt_template, structured_output, columns, is_sample)
         structured_output={"format": "json", "schema": structured_output},
         generation_kwargs={
             "temperature": 0.01,
-            "max_new_tokens":
+            "max_new_tokens": 2048 if not is_sample else 512,
         },
     ),
     template=prompt_template,