Commit c4435ca · 1 Parent(s): 86f370f
davidberenstein1957 committed

fix examples for evaluation

pdm.lock CHANGED
The diff for this file is too large to render.
 
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
 [project]
 name = "synthetic-dataset-generator"
-version = "0.1.1"
+version = "0.1.2"
 description = "Build datasets using natural language"
 authors = [
     {name = "davidberenstein1957", email = "[email protected]"},
src/synthetic_dataset_generator/app.py CHANGED
@@ -15,6 +15,9 @@ button[role="tab"][aria-selected="true"]:hover {border-color: var(--button-prima
 #system_prompt_examples { color: var(--body-text-color) !important; background-color: var(--block-background-fill) !important;}
 .container {padding-inline: 0 !important}
 #sign_in_button { flex-grow: 0; width: 50% !important; display: flex; align-items: center; justify-content: center; margin: 0 auto; }
+.table-view .table-wrap {
+    max-height: 450px;
+}
 """

 image = """<br><img src="https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/main/assets/logo.svg" alt="Synthetic Data Generator Logo" style="display: block; margin-left: auto; margin-right: auto; width: clamp(50%, 400px, 100%)"/>"""
src/synthetic_dataset_generator/apps/eval.py CHANGED
@@ -89,22 +89,72 @@ def load_dataset_from_hub(
     if not repo_id:
         raise gr.Error("Hub repo id is required")
     subsets = get_dataset_config_names(repo_id, token=token)
-    ds_dict = load_dataset(repo_id, subsets[0], token=token)
     splits = get_dataset_split_names(repo_id, subsets[0], token=token)
-    ds = ds_dict[splits[0]]
-    if num_rows:
-        ds = ds.select(range(num_rows))
+    ds = load_dataset(repo_id, subsets[0], split=splits[0], token=token, streaming=True)
+    rows = []
+    for idx, row in enumerate(ds):
+        rows.append(row)
+        if idx == num_rows:
+            break
+    ds = Dataset.from_list(rows)
     dataframe = ds.to_pandas()
     instruction_valid_columns, response_valid_columns = get_valid_columns(dataframe)
+    col_instruction = instruction_valid_columns[0] if instruction_valid_columns else ""
+    col_response = "No valid response columns found."
+    for col in response_valid_columns:
+        if col != col_instruction:
+            col_response = col
+            break
+
+    prompt_template = gr.Code(
+        label="Prompt template",
+        value="\n".join(
+            [
+                "Evaluate the following text based on criteria.",
+                "Criteria: quality.",
+                "Score: between 1 and 10.",
+                "Text: {{" + col_response + "}}",
+            ]
+        ),
+        language="markdown",
+        interactive=True,
+    )
+    structured_output = gr.Code(
+        label="Structured output",
+        value=json.dumps(
+            {
+                "type": "object",
+                "properties": {"quality": {"type": "integer"}},
+                "required": ["quality"],
+            },
+            indent=4,
+        ),
+        language="json",
+        interactive=True,
+    )
     return (
         dataframe,
-        gr.Dropdown(choices=instruction_valid_columns, label="Instruction column"),
-        gr.Dropdown(choices=response_valid_columns, label="Response column"),
+        gr.Dropdown(
+            choices=instruction_valid_columns,
+            label="Instruction column",
+            value=col_instruction,
+            interactive=True,
+        ),
+        gr.Dropdown(
+            choices=response_valid_columns,
+            label="Response column",
+            value=col_response,
+            interactive=False
+            if col_response == "No valid response columns found."
+            else True,
+        ),
+        prompt_template,
+        structured_output,
     )


 def define_evaluation_aspects(task_type: str):
-    if task_type == "ultrafeedback":
+    if task_type == "chat-eval":
         return gr.Dropdown(
             value=["overall-rating"],
             choices=["helpfulness", "truthfulness", "overall-rating", "honesty"],
@@ -251,7 +301,7 @@ def _evaluate_dataset(
     num_rows: int = 10,
     is_sample: bool = False,
 ):
-    if eval_type == "ultrafeedback":
+    if eval_type == "chat-eval":
         dataframe = evaluate_instruction_response(
             dataframe=dataframe,
             aspects=aspects_instruction_response,
@@ -280,7 +330,7 @@ def evaluate_sample_dataset(
     prompt_template: str,
     structured_output: dict,
 ):
-    dataframe, _, _ = load_dataset_from_hub(repo_id, num_rows=10)
+    dataframe, _, _, _, _ = load_dataset_from_hub(repo_id, num_rows=10)
     dataframe = _evaluate_dataset(
         dataframe=dataframe,
         eval_type=eval_type,
@@ -324,7 +374,7 @@ def push_dataset(
     oauth_token: Union[gr.OAuthToken, None] = None,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
-    dataframe, _, _ = load_dataset_from_hub(original_repo_id, num_rows=num_rows)
+    dataframe, _, _, _, _ = load_dataset_from_hub(original_repo_id, num_rows=num_rows)
     dataframe = _evaluate_dataset(
         dataframe=dataframe,
         eval_type=eval_type,
@@ -342,7 +392,7 @@ def push_dataset(
     client = get_argilla_client()
     if client is None:
         return ""
-    if eval_type == "ultrafeedback":
+    if eval_type == "chat-eval":
         num_generations = len((dataframe["generations"][0]))
         fields = [
             rg.ChatField(
@@ -612,7 +662,18 @@ with gr.Blocks() as app:
                 load_btn = gr.Button("Load", variant="primary")

             with gr.Column(scale=3):
-                search_out = gr.HTML(label="Dataset preview")
+                examples = gr.Examples(
+                    examples=[
+                        "argilla/distilabel-sft-easy",
+                        "HuggingFaceFW/fineweb-edu",
+                        "argilla/distilabel-intel-orca-dpo-pairs",
+                    ],
+                    label="Example datasets",
+                    fn=lambda x: x,
+                    inputs=[search_in],
+                    run_on_click=True,
+                )
+                search_out = gr.HTML(label="Dataset preview", visible=False)

     gr.HTML(value="<hr>")
     gr.Markdown(value="## 2. Configure your task")
@@ -620,58 +681,54 @@ with gr.Blocks() as app:
             with gr.Column(scale=2):
                 eval_type = gr.Dropdown(
                     label="Evaluation type",
-                    choices=["ultrafeedback", "custom"],
-                    value="ultrafeedback",
+                    choices=["chat-eval", "custom-eval"],
+                    value="chat-eval",
                     multiselect=False,
                     visible=False,
                 )
-                with gr.Tab("ultrafeedback") as tab_instruction_response:
+                with gr.Tab("Response Evaluation") as tab_instruction_response:
                     aspects_instruction_response = define_evaluation_aspects(
-                        "ultrafeedback"
+                        "chat-eval"
                     )
                     instruction_instruction_response = gr.Dropdown(
                         label="Instruction Column",
-                        interactive=True,
+                        info="Select the instruction column to evaluate",
+                        choices=["Load your data first in step 1."],
+                        value="Load your data first in step 1.",
+                        interactive=False,
                         multiselect=False,
                         allow_custom_value=False,
                     )
                     response_instruction_response = gr.Dropdown(
                         label="Response Column",
-                        interactive=True,
-                        multiselect=True,
+                        info="Select the response column(s) to evaluate",
+                        choices=["Load your data first in step 1."],
+                        value="Load your data first in step 1.",
+                        interactive=False,
+                        multiselect=False,
                         allow_custom_value=False,
                     )
                     tab_instruction_response.select(
-                        fn=lambda: "ultrafeedback",
+                        fn=lambda: "chat-eval",
                         inputs=[],
                         outputs=[eval_type],
                     )
-                with gr.Tab("custom") as tab_custom:
-                    aspects_custom = define_evaluation_aspects("custom")
+                with gr.Tab("Custom Evaluation Prompt") as tab_custom:
+                    aspects_custom = define_evaluation_aspects("custom-eval")
                     prompt_template = gr.Code(
                         label="Prompt template",
-                        value="Evaluate {{column_1}} based on {{column_2}}.",
+                        value="Load your data first in step 1.",
                         language="markdown",
-                        interactive=True,
+                        interactive=False,
                     )
                     structured_output = gr.Code(
                         label="Structured output",
-                        value=json.dumps(
-                            {
-                                "type": "object",
-                                "properties": {
-                                    "quality": {"type": "integer"},
-                                    "clarity": {"type": "integer"},
-                                    "relevance": {"type": "integer"},
-                                },
-                            },
-                            indent=4,
-                        ),
+                        value="Load your data first in step 1.",
                         language="json",
-                        interactive=True,
+                        interactive=False,
                     )
                     tab_custom.select(
-                        fn=lambda: "custom",
+                        fn=lambda: "custom-eval",
                         inputs=[],
                         outputs=[eval_type],
                     )
@@ -681,9 +738,10 @@ with gr.Blocks() as app:
             with gr.Column(scale=3):
                 dataframe = gr.Dataframe(
                     headers=["prompt", "completion", "evaluation"],
-                    wrap=False,
+                    wrap=True,
                     height=500,
                     interactive=False,
+                    elem_classes="table-view",
                 )

     gr.HTML(value="<hr>")
@@ -746,6 +804,8 @@ with gr.Blocks() as app:
            dataframe,
            instruction_instruction_response,
            response_instruction_response,
+           prompt_template,
+           structured_output,
        ],
    )

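The key change to load_dataset_from_hub above replaces a full dataset download with a streaming load that keeps only the first num_rows rows before handing the data to pandas. A standalone sketch of that pattern, assuming the Hugging Face datasets library (the repo id and split below are placeholders):

from datasets import Dataset, load_dataset

def load_first_rows(repo_id: str, num_rows: int = 10):
    # Stream the dataset so only the requested rows are downloaded.
    streamed = load_dataset(repo_id, split="train", streaming=True)
    rows = []
    for idx, row in enumerate(streamed):
        rows.append(row)
        if idx + 1 >= num_rows:
            break
    # Materialize the sample and return it as a pandas DataFrame.
    return Dataset.from_list(rows).to_pandas()

# Example (hypothetical): preview ten rows of a Hub dataset.
df = load_first_rows("argilla/distilabel-intel-orca-dpo-pairs", num_rows=10)
print(df.head())
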
src/synthetic_dataset_generator/apps/sft.py CHANGED
@@ -84,6 +84,7 @@ def _get_dataframe():
         wrap=True,
         height=500,
         interactive=False,
+        elem_classes="table-view",
     )


src/synthetic_dataset_generator/apps/textcat.py CHANGED
@@ -37,7 +37,11 @@ from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE

 def _get_dataframe():
     return gr.Dataframe(
-        headers=["labels", "text"], wrap=True, height=500, interactive=False
+        headers=["labels", "text"],
+        wrap=True,
+        height=500,
+        interactive=False,
+        elem_classes="table-view",
     )


src/synthetic_dataset_generator/pipelines/eval.py CHANGED
@@ -18,7 +18,7 @@ def get_ultrafeedback_evaluator(aspect, is_sample):
            api_key=_get_next_api_key(),
            generation_kwargs={
                "temperature": 0.01,
-               "max_new_tokens": 256 if is_sample else 2048,
+               "max_new_tokens": 2048 if not is_sample else 512,
            },
        ),
        aspect=aspect,
@@ -36,7 +36,7 @@ def get_custom_evaluator(prompt_template, structured_output, columns, is_sample)
            structured_output={"format": "json", "schema": structured_output},
            generation_kwargs={
                "temperature": 0.01,
-               "max_new_tokens": 256 if is_sample else 2048,
+               "max_new_tokens": 2048 if not is_sample else 512,
            },
        ),
        template=prompt_template,
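Both evaluators now allow 2048 new tokens on full runs and 512 on quick samples. A rough sketch of where that cap plugs in, assuming distilabel's InferenceEndpointsLLM and UltraFeedback task; the import paths and model id below are assumptions for illustration, not the values configured in this repo:

from distilabel.llms import InferenceEndpointsLLM
from distilabel.steps.tasks import UltraFeedback

def get_evaluator(aspect: str, is_sample: bool) -> UltraFeedback:
    # Mirror the commit's switch: short generations for samples, long otherwise.
    return UltraFeedback(
        llm=InferenceEndpointsLLM(
            model_id="meta-llama/Meta-Llama-3.1-8B-Instruct",  # placeholder model
            generation_kwargs={
                "temperature": 0.01,
                "max_new_tokens": 2048 if not is_sample else 512,
            },
        ),
        aspect=aspect,  # e.g. "overall-rating", as in the UI defaults above
    )

evaluator = get_evaluator("overall-rating", is_sample=True)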