sdiazlor HF staff committed on
Commit
34371d3
1 Parent(s): 49d5948

add evaluation task

Browse files
app.py CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
3
  from src.distilabel_dataset_generator._tabbedinterface import TabbedInterface
4
  from src.distilabel_dataset_generator.apps.faq import app as faq_app
5
  from src.distilabel_dataset_generator.apps.sft import app as sft_app
 
6
  from src.distilabel_dataset_generator.apps.textcat import app as textcat_app
7
 
8
  theme = gr.themes.Monochrome(
@@ -54,8 +55,8 @@ button[role="tab"][data-tab-id][aria-selected="true"] {
54
  """
55
 
56
  demo = TabbedInterface(
57
- [textcat_app, sft_app, faq_app],
58
- ["Text Classification", "Supervised Fine-Tuning", "FAQ"],
59
  css=css,
60
  title="""
61
  <h1>Synthetic Data Generator</h1>
 
3
  from src.distilabel_dataset_generator._tabbedinterface import TabbedInterface
4
  from src.distilabel_dataset_generator.apps.faq import app as faq_app
5
  from src.distilabel_dataset_generator.apps.sft import app as sft_app
6
+ from src.distilabel_dataset_generator.apps.eval import app as eval_app
7
  from src.distilabel_dataset_generator.apps.textcat import app as textcat_app
8
 
9
  theme = gr.themes.Monochrome(
 
55
  """
56
 
57
  demo = TabbedInterface(
58
+ [textcat_app, sft_app, eval_app, faq_app],
59
+ ["Text Classification", "Supervised Fine-Tuning", "Evaluation", "FAQ"],
60
  css=css,
61
  title="""
62
  <h1>Synthetic Data Generator</h1>
pyproject.toml CHANGED
@@ -6,7 +6,7 @@ authors = [
6
  {name = "davidberenstein1957", email = "[email protected]"},
7
  ]
8
  dependencies = [
9
- "distilabel[hf-inference-endpoints,argilla,outlines]>=1.4.1",
10
  "gradio[oauth]<5.0.0",
11
  "transformers>=4.44.2",
12
  "sentence-transformers>=3.2.0",
 
6
  {name = "davidberenstein1957", email = "[email protected]"},
7
  ]
8
  dependencies = [
9
+ "distilabel[hf-inference-endpoints,argilla,outlines,instructor]>=1.4.1",
10
  "gradio[oauth]<5.0.0",
11
  "transformers>=4.44.2",
12
  "sentence-transformers>=3.2.0",
src/distilabel_dataset_generator/apps/eval.py CHANGED
@@ -1,70 +1,106 @@
1
  import json
 
 
2
 
 
3
  import gradio as gr
 
4
  import pandas as pd
5
- from datasets import load_dataset
 
 
 
 
 
 
6
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
 
7
 
8
- from src.distilabel_dataset_generator.utils import get_org_dropdown
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
 
11
- def get_iframe(hub_repo_id) -> str:
12
  if not hub_repo_id:
13
- raise gr.Error("Hub repo id is required")
 
14
  url = f"https://huggingface.co/datasets/{hub_repo_id}/embed/viewer"
15
  iframe = f"""
16
  <iframe
17
- src="{url}"
18
- frameborder="0"
19
- width="100%"
20
- height="600px"
21
- ></iframe>
22
- """
23
  return iframe
24
 
25
 
26
- def get_valid_columns(df: pd.DataFrame):
27
- valid_columns = []
28
- for col in df.columns:
29
- sample_val = df[col].iloc[0]
 
 
30
  if isinstance(sample_val, str) or (
31
- isinstance(sample_val, list)
32
- and all(isinstance(item, dict) for item in sample_val)
33
  ):
34
- valid_columns.append(col)
35
- return valid_columns
 
 
 
 
36
 
 
37
 
38
- def load_dataset_from_hub(hub_repo_id: str, num_rows: int = 10):
39
- gr.Info(message="Loading dataset ...")
40
- if not hub_repo_id:
41
  raise gr.Error("Hub repo id is required")
42
- ds_dict = load_dataset(hub_repo_id)
43
- splits = list(ds_dict.keys())
 
44
  ds = ds_dict[splits[0]]
45
  if num_rows:
46
  ds = ds.select(range(num_rows))
47
- df = ds.to_pandas()
48
- # Get columns that contain either strings or lists of dictionaries
49
- valid_columns = get_valid_columns(df)
50
  return (
51
- df,
52
- gr.Dropdown(choices=valid_columns, label="Instruction Column"),
53
- gr.Dropdown(choices=valid_columns, label="Instruction Column"),
54
- gr.Dropdown(choices=valid_columns, label="Response Column"),
55
  )
56
 
57
 
58
  def define_evaluation_aspects(task_type: str):
59
- if task_type == "instruction":
60
- return gr.Dropdown(
61
- value=["overall-rating"],
62
- choices=["complexity", "quality"],
63
- label="Evaluation Aspects",
64
- multiselect=True,
65
- interactive=True,
66
- )
67
- elif task_type == "instruction-response":
68
  return gr.Dropdown(
69
  value=["overall-rating"],
70
  choices=["helpfulness", "truthfulness", "overall-rating", "honesty"],
@@ -76,106 +112,473 @@ def define_evaluation_aspects(task_type: str):
76
  return gr.Dropdown(interactive=False, visible=False)
77
 
78
 
79
- def evaluate_instruction(df: pd.DataFrame, aspects: list[str], instruction_column: str):
80
- pass
81
-
82
-
83
  def evaluate_instruction_response(
84
- df: pd.DataFrame, aspects: list[str], instruction_column: str, response_column: str
 
 
 
 
 
 
85
  ):
86
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
 
89
  def evaluate_custom(
90
- df: pd.DataFrame, aspects: list[str], prompt_template: str, structured_output: dict
 
 
 
 
 
91
  ):
92
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
 
 
 
94
 
95
- def _apply_to_dataset(
96
- df: pd.DataFrame,
 
 
 
 
 
 
 
 
 
 
 
97
  eval_type: str,
98
- aspects_instruction: list[str],
99
- instruction_column: str,
100
  aspects_instruction_response: list[str],
101
- instruction_column_response: str,
102
- response_column_response: str,
103
- aspects_custom: list[str],
104
  prompt_template: str,
105
  structured_output: dict,
 
 
106
  ):
107
- if eval_type == "instruction":
108
- df = evaluate_instruction(df, aspects_instruction, instruction_column)
109
- elif eval_type == "instruction-response":
110
- df = evaluate_instruction_response(
111
- df,
112
- aspects_instruction_response,
113
- instruction_column_response,
114
- response_column_response,
 
 
 
 
 
 
 
 
115
  )
116
- elif eval_type == "custom":
117
- df = evaluate_custom(df, aspects_custom, prompt_template, structured_output)
118
- return df
119
 
120
 
121
- def apply_to_sample_dataset(
122
  repo_id: str,
123
  eval_type: str,
124
- aspects_instruction: list[str],
125
  aspects_instruction_response: list[str],
126
- aspects_custom: list[str],
127
- instruction_instruction: str,
128
  instruction_instruction_response: str,
129
  response_instruction_response: str,
130
  prompt_template: str,
131
  structured_output: dict,
132
  ):
133
- df, _, _, _ = load_dataset_from_hub(repo_id, num_rows=10)
134
- df = _apply_to_dataset(
135
- df,
136
- eval_type,
137
- aspects_instruction,
138
- instruction_instruction,
139
- aspects_instruction_response,
140
- instruction_instruction_response,
141
- response_instruction_response,
142
- aspects_custom,
143
- prompt_template,
144
- structured_output,
145
  )
146
- return df
147
 
148
 
149
- def push_to_hub(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  org_name: str,
151
  repo_name: str,
152
  private: bool,
153
  num_rows: int,
154
  original_repo_id: str,
155
  eval_type: str,
156
- aspects_instruction: list[str],
157
  aspects_instruction_response: list[str],
158
- aspects_custom: list[str],
159
- instruction_instruction: str,
160
  instruction_instruction_response: str,
161
  response_instruction_response: str,
162
  prompt_template: str,
163
  structured_output: dict,
164
- ):
165
- df, _, _, _ = load_dataset_from_hub(original_repo_id, num_rows=num_rows)
166
- df = _apply_to_dataset(
167
- df,
168
- eval_type,
169
- aspects_instruction,
170
- instruction_instruction,
171
- aspects_instruction_response,
172
- instruction_instruction_response,
173
- response_instruction_response,
174
- aspects_custom,
175
- prompt_template,
176
- structured_output,
177
  )
178
- new_repo_id = f"{org_name}/{repo_name}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
 
181
  ######################
@@ -184,123 +587,157 @@ def push_to_hub(
184
 
185
 
186
  with gr.Blocks() as app:
187
- gr.Markdown("## 1. Select your input dataset")
188
- with gr.Row(equal_height=False):
189
- with gr.Column(scale=1):
190
- search_in = HuggingfaceHubSearch(
191
- label="Search",
192
- placeholder="Search for a Dataset",
193
- search_type="dataset",
194
- sumbit_on_select=True,
195
- )
196
- load_btn = gr.Button("Load dataset")
197
- with gr.Column(scale=3):
198
- search_out = gr.HTML(label="Dataset Preview")
199
-
200
- gr.HTML("<hr>")
201
- gr.Markdown("## 2. Configure your task")
202
- with gr.Row(equal_height=False):
203
- with gr.Column(scale=1):
204
- eval_type = gr.Dropdown(
205
- label="Evaluation Type",
206
- choices=["instruction", "instruction-response", "custom-template"],
207
- visible=False,
208
- )
209
- with gr.Tab("instruction") as tab_instruction:
210
- aspects_instruction = define_evaluation_aspects("instruction")
211
- instruction_instruction = gr.Dropdown(
212
- label="Instruction Column", interactive=True
213
  )
214
- tab_instruction.select(
215
- lambda: "instruction",
216
- inputs=[],
217
- outputs=[eval_type],
218
- )
219
- with gr.Tab("instruction-response") as tab_instruction_response:
220
- aspects_instruction_response = define_evaluation_aspects(
221
- "instruction-response"
222
- )
223
- instruction_instruction_response = gr.Dropdown(
224
- label="Instruction Column", interactive=True
 
 
 
225
  )
226
- response_instruction_response = gr.Dropdown(
227
- label="Response Column", interactive=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  )
229
- tab_instruction_response.select(
230
- lambda: "instruction-response",
231
- inputs=[],
232
- outputs=[eval_type],
 
 
233
  )
234
- with gr.Tab("custom") as tab_custom:
235
- aspects_custom = define_evaluation_aspects("custom")
236
- prompt_template = gr.Code(
237
- label="Prompt Template",
238
- value="{{column_1}} based on {{column_2}}",
239
- language="markdown",
 
 
 
 
240
  interactive=True,
241
  )
242
- structured_output = gr.Code(
243
- label="Structured Output",
244
- value=json.dumps({"eval_aspect": "str"}),
245
- language="json",
246
  interactive=True,
 
247
  )
248
- tab_custom.select(
249
- lambda: "custom-template",
250
- inputs=[],
251
- outputs=[eval_type],
 
252
  )
253
- btn_apply_to_sample_dataset = gr.Button("Refresh dataset")
254
- with gr.Column(scale=3):
255
- dataframe = gr.Dataframe(wrap=True, height=300)
256
-
257
- gr.HTML("<hr>")
258
- gr.Markdown("## 3. Generate your dataset")
259
- with gr.Row():
260
- with gr.Column(scale=2):
261
- org_name = get_org_dropdown()
262
- repo_name = gr.Textbox(
263
- label="Repo name",
264
- placeholder="dataset_name",
265
- value="my-distiset",
266
- interactive=True,
267
- )
268
- num_rows = gr.Number(
269
- label="Number of rows",
270
- value=10,
271
- interactive=True,
272
- scale=1,
273
- )
274
- private = gr.Checkbox(
275
- label="Private dataset",
276
- value=False,
277
- interactive=True,
278
- scale=1,
279
- )
280
- btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
281
- with gr.Column(scale=3):
282
- success_message = gr.Markdown(visible=False)
283
 
284
- search_in.submit(get_iframe, inputs=search_in, outputs=search_out)
285
  load_btn.click(
286
- load_dataset_from_hub,
287
  inputs=[search_in],
288
  outputs=[
289
  dataframe,
290
- instruction_instruction,
291
  instruction_instruction_response,
292
  response_instruction_response,
293
  ],
294
  )
 
295
  btn_apply_to_sample_dataset.click(
296
- apply_to_sample_dataset,
297
  inputs=[
298
  search_in,
299
  eval_type,
300
- aspects_instruction,
301
  aspects_instruction_response,
302
- aspects_custom,
303
- instruction_instruction,
304
  instruction_instruction_response,
305
  response_instruction_response,
306
  prompt_template,
@@ -308,8 +745,23 @@ with gr.Blocks() as app:
308
  ],
309
  outputs=dataframe,
310
  )
 
311
  btn_push_to_hub.click(
312
- push_to_hub,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
  inputs=[
314
  org_name,
315
  repo_name,
@@ -317,15 +769,36 @@ with gr.Blocks() as app:
317
  num_rows,
318
  search_in,
319
  eval_type,
320
- aspects_instruction,
321
  aspects_instruction_response,
322
- aspects_custom,
323
- instruction_instruction,
324
  instruction_instruction_response,
325
  response_instruction_response,
326
  prompt_template,
327
  structured_output,
328
  ],
329
- outputs=success_message,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
  )
 
 
331
  app.load(fn=get_org_dropdown, outputs=[org_name])
 
1
  import json
2
+ import uuid
3
+ from typing import Union
4
 
5
+ import argilla as rg
6
  import gradio as gr
7
+ import numpy as np
8
  import pandas as pd
9
+ from datasets import (
10
+ Dataset,
11
+ get_dataset_config_names,
12
+ get_dataset_split_names,
13
+ load_dataset,
14
+ )
15
+ from distilabel.distiset import Distiset
16
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
17
+ from huggingface_hub import HfApi
18
 
19
+ from src.distilabel_dataset_generator.apps.base import (
20
+ hide_success_message,
21
+ show_success_message,
22
+ validate_argilla_user_workspace_dataset,
23
+ validate_push_to_hub,
24
+ )
25
+ from src.distilabel_dataset_generator.pipelines.base import (
26
+ DEFAULT_BATCH_SIZE,
27
+ )
28
+ from src.distilabel_dataset_generator.pipelines.embeddings import (
29
+ get_embeddings,
30
+ get_sentence_embedding_dimensions,
31
+ )
32
+ from src.distilabel_dataset_generator.pipelines.eval import (
33
+ generate_pipeline_code,
34
+ get_custom_evaluator,
35
+ get_ultrafeedback_evaluator,
36
+ )
37
+ from src.distilabel_dataset_generator.utils import (
38
+ column_to_list,
39
+ extract_column_names,
40
+ get_argilla_client,
41
+ get_org_dropdown,
42
+ process_columns,
43
+ swap_visibility,
44
+ pad_or_truncate_list,
45
+ )
46
 
47
 
48
def get_iframe(hub_repo_id: str) -> str:
    """Build the HTML snippet that embeds the Hub dataset viewer for a repo.

    Args:
        hub_repo_id: Dataset repository id, interpolated into the viewer URL.

    Returns:
        An ``<iframe>`` element whose ``src`` is the dataset's embed viewer.

    Raises:
        gr.Error: If ``hub_repo_id`` is empty.
    """
    if hub_repo_id:
        url = f"https://huggingface.co/datasets/{hub_repo_id}/embed/viewer"
        return f"""
    <iframe
        src="{url}"
        frameborder="0"
        width="100%"
        height="600px"
    ></iframe>
    """

    raise gr.Error("Hub repository ID is required.")
62
 
63
 
64
def get_valid_columns(dataframe: pd.DataFrame) -> tuple[list[str], list[str]]:
    """Classify columns as usable instruction and/or response inputs.

    Only the first row of each column is inspected. A column qualifies as an
    instruction column when that value is a string or a list/array of dicts
    that each carry a ``"role"`` key. Response columns accept the same shapes
    plus a list/array of strings.

    Args:
        dataframe: Frame whose columns are inspected.

    Returns:
        ``(instruction_valid_columns, response_valid_columns)``, each in
        column order and without duplicates. Both are empty when the frame
        has no rows.
    """
    instruction_valid_columns: list[str] = []
    response_valid_columns: list[str] = []

    # Without rows there is no sample value to look at; `.iloc[0]` would raise.
    if dataframe.empty:
        return instruction_valid_columns, response_valid_columns

    for col in dataframe.columns:
        sample_val = dataframe[col].iloc[0]
        is_text = isinstance(sample_val, str)
        is_sequence = isinstance(sample_val, (list, np.ndarray))
        is_chat = is_sequence and all(
            isinstance(item, dict) and "role" in item for item in sample_val
        )
        is_text_list = is_sequence and all(
            isinstance(item, str) for item in sample_val
        )

        if is_text or is_chat:
            instruction_valid_columns.append(col)
        # A single membership test: an empty sequence satisfies both `all()`
        # checks vacuously and must not be listed twice.
        if is_text or is_chat or is_text_list:
            response_valid_columns.append(col)

    return instruction_valid_columns, response_valid_columns
82
 
83
+
84
def load_dataset_from_hub(repo_id: str, num_rows: int = 10):
    """Load the first split of the first subset of a Hub dataset.

    Args:
        repo_id: Dataset repository id on the Hub.
        num_rows: Maximum number of leading rows to keep; a falsy value keeps
            the whole split. Values larger than the split are capped.

    Returns:
        A tuple ``(dataframe, instruction_dropdown, response_dropdown)`` where
        the dropdowns list the columns accepted by ``get_valid_columns``.

    Raises:
        gr.Error: If ``repo_id`` is empty.
    """
    if not repo_id:
        raise gr.Error("Hub repo id is required")
    subsets = get_dataset_config_names(repo_id)
    ds_dict = load_dataset(repo_id, subsets[0])
    splits = get_dataset_split_names(repo_id, subsets[0])
    ds = ds_dict[splits[0]]
    if num_rows:
        # UI number inputs may deliver floats, and the split may be shorter
        # than requested; `range()`/`select()` would raise in either case.
        ds = ds.select(range(min(int(num_rows), len(ds))))
    dataframe = ds.to_pandas()
    instruction_valid_columns, response_valid_columns = get_valid_columns(dataframe)

    return (
        dataframe,
        gr.Dropdown(choices=instruction_valid_columns, label="Instruction column"),
        gr.Dropdown(choices=response_valid_columns, label="Response column"),
    )
100
 
101
 
102
  def define_evaluation_aspects(task_type: str):
103
+ if task_type == "ultrafeedback":
 
 
 
 
 
 
 
 
104
  return gr.Dropdown(
105
  value=["overall-rating"],
106
  choices=["helpfulness", "truthfulness", "overall-rating", "honesty"],
 
112
  return gr.Dropdown(interactive=False, visible=False)
113
 
114
 
 
 
 
 
115
def evaluate_instruction_response(
    dataframe: pd.DataFrame,
    aspects: list[str],
    instruction_column: str,
    response_columns: str,
    num_rows: int = 10,
    is_sample: bool = False,
    progress=gr.Progress(),
):
    """Rate instruction/response rows with one evaluator per aspect.

    Rows are built by ``process_columns`` and read here through their
    ``"instruction"`` and ``"generations"`` keys. For every aspect the first
    ``num_rows`` rows are sent to the evaluator in batches of at most
    ``DEFAULT_BATCH_SIZE``.

    Args:
        dataframe: Source data.
        aspects: Aspects to evaluate; "truthfulness" and "helpfulness"
            additionally produce type columns.
        instruction_column: Column holding the instruction.
        response_columns: Column(s) holding the responses, forwarded as-is to
            ``process_columns``.
        num_rows: Number of leading rows to evaluate; capped at the number of
            available rows.
        is_sample: Forwarded to ``get_ultrafeedback_evaluator``.
        progress: Gradio progress tracker.

    Returns:
        A DataFrame with one row per processed entry; rows that were not
        evaluated keep ``None`` in the rating columns.

    Raises:
        gr.Error: If there are no rows to evaluate.
    """
    progress(0.0, desc="Evaluating instructions and responses")
    data = process_columns(dataframe, instruction_column, response_columns)
    if len(data) == 0:
        raise gr.Error("The dataset has no rows to evaluate.")

    # UI number inputs may deliver floats; never ask for more rows than exist.
    num_rows = min(int(num_rows), len(data))
    num_generations = len(data[0]["generations"])
    typed_aspects = ("truthfulness", "helpfulness")

    evaluated_results = []
    for entry in data:
        result_row = {
            "instruction": entry["instruction"],
            "generations": entry["generations"],
        }
        for aspect in aspects:
            result_row[f"ratings_{aspect}"] = None
            result_row[f"rationale_for_ratings_{aspect}"] = None
            if aspect in typed_aspects:
                result_row[f"type_{aspect}"] = None
                result_row[f"rationale_for_type_{aspect}"] = None
        result_row["model_name"] = None
        evaluated_results.append(result_row)

    total_steps: int = len(aspects) * num_rows

    # evaluate instructions and responses
    for aspect_idx, aspect in enumerate(aspects):
        ultrafeedback_evaluator = get_ultrafeedback_evaluator(aspect, is_sample)
        n_processed = 0

        while n_processed < num_rows:
            # Cumulative across aspects so the bar does not restart per aspect.
            progress(
                (aspect_idx * num_rows + n_processed) / total_steps,
                total=total_steps,
                desc=f"Evaluating aspect: {aspect}",
            )

            # Computed per iteration: a short final batch must not shrink the
            # batch size used for the following aspects.
            batch_size = min(DEFAULT_BATCH_SIZE, num_rows - n_processed)
            inputs = data[n_processed : n_processed + batch_size]
            batch_results = list(ultrafeedback_evaluator.process(inputs=inputs))
            for j, result in enumerate(batch_results[0]):
                idx = n_processed + j
                evaluated_results[idx][f"ratings_{aspect}"] = pad_or_truncate_list(
                    result.get("ratings"), num_generations
                )
                evaluated_results[idx]["model_name"] = result.get("model_name")
                if aspect in typed_aspects:
                    evaluated_results[idx][f"type_{aspect}"] = pad_or_truncate_list(
                        result.get("types"), num_generations
                    )
                    evaluated_results[idx][f"rationale_for_type_{aspect}"] = (
                        pad_or_truncate_list(result.get("rationales"), num_generations)
                    )
                    evaluated_results[idx][f"rationale_for_ratings_{aspect}"] = (
                        pad_or_truncate_list(
                            result.get("rationales-for-ratings"), num_generations
                        )
                    )
                else:
                    evaluated_results[idx][f"rationale_for_ratings_{aspect}"] = (
                        pad_or_truncate_list(result.get("rationales"), num_generations)
                    )
            n_processed += batch_size

    # create final dataset
    dataframe = pd.DataFrame(evaluated_results)
    progress(1.0, desc="Dataset evaluation completed")
    return dataframe
189
 
190
 
191
def evaluate_custom(
    dataframe: pd.DataFrame,
    prompt_template: str,
    structured_output: dict,
    num_rows: int = 10,
    is_sample: bool = False,
    progress=gr.Progress(),
):
    """Evaluate rows with a custom prompt template and structured output.

    The columns referenced by ``prompt_template`` are extracted, and the first
    ``num_rows`` rows are sent to the custom evaluator in batches of at most
    ``DEFAULT_BATCH_SIZE``.

    Args:
        dataframe: Source data containing the columns named in the template.
        prompt_template: Template whose column names are extracted with
            ``extract_column_names``.
        structured_output: Output schema forwarded to ``get_custom_evaluator``.
        num_rows: Number of leading rows to evaluate; capped at the number of
            rows in ``dataframe``.
        is_sample: Forwarded to ``get_custom_evaluator``.
        progress: Gradio progress tracker.

    Returns:
        A DataFrame of the evaluator results without the
        ``"distilabel_metadata"`` key.
    """
    progress(0.0, desc="Evaluating dataset")
    columns = extract_column_names(prompt_template)
    input_columns = {column: column_to_list(dataframe, column) for column in columns}

    custom_evaluator = get_custom_evaluator(
        prompt_template, structured_output, columns, is_sample
    )

    # UI number inputs may deliver floats; never index past the available rows.
    num_rows = min(int(num_rows), len(dataframe))

    # evaluate the data
    n_processed = 0
    evaluation_results = []
    while n_processed < num_rows:
        progress(
            n_processed / num_rows,
            desc="Evaluating dataset",
        )
        batch_size = min(DEFAULT_BATCH_SIZE, num_rows - n_processed)

        inputs = [
            {column: values[idx] for column, values in input_columns.items()}
            for idx in range(n_processed, n_processed + batch_size)
        ]

        batch = list(custom_evaluator.process(inputs=inputs))
        evaluation_results.extend(batch[0])
        n_processed += batch_size

    # create final dataset
    distiset_results = [
        {key: result[key] for key in result if key != "distilabel_metadata"}
        for result in evaluation_results
    ]

    dataframe = pd.DataFrame(distiset_results)
    progress(1.0, desc="Dataset evaluation completed")
    return dataframe
237
+
238
+
239
def _evaluate_dataset(
    dataframe: pd.DataFrame,
    eval_type: str,
    aspects_instruction_response: list[str],
    instruction_instruction_response: str,
    response_instruction_response: str,
    prompt_template: str,
    structured_output: dict,
    num_rows: int = 10,
    is_sample: bool = False,
):
    """Dispatch to the evaluator matching ``eval_type``.

    ``"ultrafeedback"`` runs ``evaluate_instruction_response``; every other
    value runs ``evaluate_custom``. The evaluated DataFrame is returned.
    """
    # Anything that is not the ultrafeedback task is the custom-template path.
    if eval_type != "ultrafeedback":
        return evaluate_custom(
            dataframe=dataframe,
            prompt_template=prompt_template,
            structured_output=structured_output,
            num_rows=num_rows,
            is_sample=is_sample,
        )

    return evaluate_instruction_response(
        dataframe=dataframe,
        aspects=aspects_instruction_response,
        instruction_column=instruction_instruction_response,
        response_columns=response_instruction_response,
        num_rows=num_rows,
        is_sample=is_sample,
    )
 
 
268
 
269
 
270
def evaluate_sample_dataset(
    repo_id: str,
    eval_type: str,
    aspects_instruction_response: list[str],
    instruction_instruction_response: str,
    response_instruction_response: str,
    prompt_template: str,
    structured_output: dict,
):
    """Evaluate a 10-row preview of the dataset in sample mode.

    Loads the first ten rows of ``repo_id`` and returns the DataFrame produced
    by ``_evaluate_dataset`` with ``is_sample=True``.
    """
    # The dropdown updates returned by the loader are not needed here.
    sample_frame, _instruction_choices, _response_choices = load_dataset_from_hub(
        repo_id, num_rows=10
    )
    evaluation_kwargs = {
        "dataframe": sample_frame,
        "eval_type": eval_type,
        "aspects_instruction_response": aspects_instruction_response,
        "instruction_instruction_response": instruction_instruction_response,
        "response_instruction_response": response_instruction_response,
        "prompt_template": prompt_template,
        "structured_output": structured_output,
        "num_rows": 10,
        "is_sample": True,
    }
    return _evaluate_dataset(**evaluation_kwargs)
292
 
293
 
294
def push_dataset_to_hub(
    dataframe: pd.DataFrame, org_name: str, repo_name: str, oauth_token, private
):
    """Publish the evaluated DataFrame to the Hub as a single-subset Distiset.

    Args:
        dataframe: Evaluated data to publish.
        org_name: Organisation or user part of the target repo id.
        repo_name: Name part of the target repo id.
        oauth_token: OAuth token object; its ``token`` attribute authenticates
            the push.
        private: Whether the created repository is private.

    Raises:
        gr.Error: If no OAuth token is available.
    """
    # Callers default the token to None; fail with a readable message instead
    # of an AttributeError on `.token`.
    if oauth_token is None:
        raise gr.Error("Please sign in with Hugging Face to push the dataset.")

    repo_id = validate_push_to_hub(org_name, repo_name)
    distiset = Distiset({"default": Dataset.from_pandas(dataframe)})
    distiset.push_to_hub(
        repo_id=repo_id,
        private=private,
        include_script=False,
        token=oauth_token.token,
        create_pr=False,
    )
306
+
307
+
308
def push_dataset(
    org_name: str,
    repo_name: str,
    private: bool,
    num_rows: int,
    original_repo_id: str,
    eval_type: str,
    aspects_instruction_response: list[str],
    instruction_instruction_response: str,
    response_instruction_response: str,
    prompt_template: str,
    structured_output: dict,
    oauth_token: Union[gr.OAuthToken, None] = None,
    progress=gr.Progress(),
) -> str:
    """Evaluate a Hub dataset, publish it to the Hub and log it to Argilla.

    The first ``num_rows`` rows of ``original_repo_id`` are evaluated with the
    selected task, pushed to ``org_name/repo_name`` on the Hub, and then logged
    to an Argilla dataset named ``repo_name`` in the workspace of the
    authenticated Hub user.

    Args:
        org_name: Organisation or user part of the target repo id.
        repo_name: Target repo name; also used as the Argilla dataset name.
        private: Whether the Hub repository is private.
        num_rows: Number of rows to load and evaluate.
        original_repo_id: Hub dataset to evaluate.
        eval_type: ``"ultrafeedback"`` or anything else for the custom task.
        aspects_instruction_response: Aspects for the ultrafeedback task.
        instruction_instruction_response: Instruction column name.
        response_instruction_response: Response column(s).
        prompt_template: Template for the custom task.
        structured_output: Output schema for the custom task.
        oauth_token: OAuth token of the signed-in user.
        progress: Gradio progress tracker.

    Returns:
        An empty string.

    Raises:
        gr.Error: If no OAuth token is available or the Argilla push fails.
    """
    # Check before the evaluation so a missing login does not waste the run.
    if oauth_token is None:
        raise gr.Error("Please sign in with Hugging Face to push the dataset.")

    dataframe, _, _ = load_dataset_from_hub(original_repo_id, num_rows=num_rows)
    dataframe = _evaluate_dataset(
        dataframe=dataframe,
        eval_type=eval_type,
        aspects_instruction_response=aspects_instruction_response,
        instruction_instruction_response=instruction_instruction_response,
        response_instruction_response=response_instruction_response,
        prompt_template=prompt_template,
        structured_output=structured_output,
        num_rows=num_rows,
    )
    push_dataset_to_hub(dataframe, org_name, repo_name, oauth_token, private)
    try:
        progress(0.1, desc="Setting up user and workspace")
        client = get_argilla_client()
        hf_user = HfApi().whoami(token=oauth_token.token)["name"]
        if eval_type == "ultrafeedback":
            num_generations = len((dataframe["generations"][0]))
            fields = [
                rg.ChatField(
                    name=f"chat_{i}",
                    title=f"Chat {i+1}",
                    description=f"User and assistant conversation for generation {i+1}",
                )
                for i in range(num_generations)
            ]
            questions = []
            for i in range(num_generations):
                for aspect in aspects_instruction_response:
                    questions.append(
                        rg.RatingQuestion(
                            name=f"ratings_{aspect}_{i}",
                            values=list(range(11)),
                            title=f"Ratings for {aspect} for response {i+1}",
                            required=True,
                        )
                    )
                    questions.append(
                        rg.TextQuestion(
                            name=f"rationale_for_ratings_{aspect}_{i}",
                            title=f"Rationale for ratings for {aspect} for response {i+1}",
                            required=False,
                            use_markdown=True,
                        )
                    )
                    if aspect in ["truthfulness", "helpfulness"]:
                        questions.append(
                            rg.RatingQuestion(
                                name=f"type_{aspect}_{i}",
                                values=list(range(1, 6)),
                                title=f"The type of the response {i+1} for {aspect}",
                                required=True,
                            )
                        )
                        questions.append(
                            rg.TextQuestion(
                                name=f"rationale_for_type_{aspect}_{i}",
                                title=f"Rationale for type of the response {i+1} for {aspect}",
                                required=False,
                                use_markdown=True,
                            )
                        )
            metadata = [
                rg.IntegerMetadataProperty(
                    name="instruction_length", title="Instruction length"
                ),
            ]
            for i in range(num_generations):
                metadata.append(
                    rg.IntegerMetadataProperty(
                        name=f"response_{i}_length", title=f"Response {i+1} length"
                    )
                )
            vectors = [
                rg.VectorField(
                    name="instruction_embeddings",
                    dimensions=get_sentence_embedding_dimensions(),
                )
            ]
            settings = rg.Settings(
                fields=fields,
                questions=questions,
                metadata=metadata,
                vectors=vectors,
                guidelines="Please review the conversation and provide an evaluation.",
            )

            dataframe["instruction_length"] = dataframe["instruction"].apply(len)
            for i in range(num_generations):
                dataframe[f"response_{i}_length"] = dataframe["generations"].apply(
                    lambda gens: len(gens[i]) if i < len(gens) else 0
                )
            dataframe["instruction_embeddings"] = get_embeddings(
                dataframe["instruction"].to_list()
            )

            progress(0.5, desc="Creating dataset")
            rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
            if rg_dataset is None:
                rg_dataset = rg.Dataset(
                    name=repo_name,
                    workspace=hf_user,
                    settings=settings,
                    client=client,
                )
                rg_dataset = rg_dataset.create()

            progress(0.7, desc="Pushing dataset to Argilla")
            hf_dataset = Dataset.from_pandas(dataframe)
            records = []
            for sample in hf_dataset:
                # Per-record payloads; named apart from the settings lists above.
                record_fields = {}
                record_metadata = {
                    "instruction_length": sample.get("instruction_length", 0)
                }
                record_vectors = {
                    "instruction_embeddings": sample.get("instruction_embeddings", [])
                }
                suggestions = []
                generations = sample.get("generations", [])
                for i in range(num_generations):
                    record_fields[f"chat_{i}"] = [
                        {"role": "user", "content": sample.get("instruction", "")},
                        {"role": "assistant", "content": generations[i]},
                    ]
                    record_metadata[f"response_{i}_length"] = sample.get(
                        f"response_{i}_length", 0
                    )

                    for aspect in aspects_instruction_response:
                        ratings = sample.get(f"ratings_{aspect}", [])
                        # Must match the column written by the evaluator
                        # (single underscore before the aspect name).
                        rationales = sample.get(f"rationale_for_ratings_{aspect}", [])

                        rating_value = (
                            ratings[i]
                            if ratings and isinstance(ratings[i], int)
                            else None
                        )
                        rationale_value = (
                            rationales[i]
                            if rationales and isinstance(rationales[i], str)
                            else None
                        )

                        if rating_value is not None:
                            suggestions.append(
                                rg.Suggestion(
                                    question_name=f"ratings_{aspect}_{i}",
                                    value=rating_value,
                                )
                            )
                        if rationale_value is not None:
                            suggestions.append(
                                rg.Suggestion(
                                    question_name=f"rationale_for_ratings_{aspect}_{i}",
                                    value=rationale_value,
                                )
                            )

                        if aspect in ["truthfulness", "helpfulness"]:
                            types = sample.get(f"type_{aspect}", [])
                            rationale_types = sample.get(
                                f"rationale_for_type_{aspect}", []
                            )

                            type_value = (
                                types[i]
                                if types and isinstance(types[i], int)
                                else None
                            )
                            rationale_type_value = (
                                rationale_types[i]
                                if rationale_types
                                and isinstance(rationale_types[i], str)
                                else None
                            )
                            if type_value is not None:
                                suggestions.append(
                                    rg.Suggestion(
                                        question_name=f"type_{aspect}_{i}",
                                        value=type_value,
                                    )
                                )
                            if rationale_type_value is not None:
                                suggestions.append(
                                    rg.Suggestion(
                                        question_name=f"rationale_for_type_{aspect}_{i}",
                                        value=rationale_type_value,
                                    )
                                )
                records.append(
                    rg.Record(
                        fields=record_fields,
                        metadata=record_metadata,
                        vectors=record_vectors,
                        suggestions=suggestions,
                    )
                )
            rg_dataset.records.log(records=records)
            progress(1.0, desc="Dataset pushed to Argilla")
        else:
            columns = extract_column_names(prompt_template)
            settings = rg.Settings(
                fields=[
                    rg.TextField(
                        name=column,
                        title=column.capitalize(),
                        description="The column content",
                    )
                    for column in columns
                ],
                questions=[
                    rg.TextQuestion(
                        name="evaluation",
                        title="Evaluation",
                        description="The generated evaluation",
                        use_markdown=True,
                    ),
                ],
                metadata=[
                    rg.IntegerMetadataProperty(
                        name=f"{column}_length", title=f"{column.capitalize()} length"
                    )
                    for column in columns
                ],
                vectors=[
                    rg.VectorField(
                        name=f"{column}_embeddings",
                        dimensions=get_sentence_embedding_dimensions(),
                    )
                    for column in columns
                ],
                guidelines="Please review, correct and provide an accurate evaluation.",
            )
            for column in columns:
                dataframe[f"{column}_length"] = dataframe[column].apply(len)
                # Pass a plain list, as the ultrafeedback branch does.
                dataframe[f"{column}_embeddings"] = get_embeddings(
                    dataframe[column].to_list()
                )

            progress(0.5, desc="Creating dataset")
            rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
            if rg_dataset is None:
                rg_dataset = rg.Dataset(
                    name=repo_name,
                    workspace=hf_user,
                    settings=settings,
                    client=client,
                )
                rg_dataset = rg_dataset.create()
            progress(0.7, desc="Pushing dataset to Argilla")
            hf_dataset = Dataset.from_pandas(dataframe)
            rg_dataset.records.log(
                records=hf_dataset, mapping={"generation": "evaluation"}
            )
            progress(1.0, desc="Dataset pushed to Argilla")
    except Exception as e:
        # Surface the failure in the UI while keeping the original traceback.
        raise gr.Error(f"Error pushing dataset to Argilla: {e}") from e
    return ""
578
+
579
+
580
def update_pipeline_code_visibility():
    """Return a component update that makes ``pipeline_code_ui`` visible."""
    revealed_accordion = gr.Accordion(visible=True)
    return {pipeline_code_ui: revealed_accordion}
582
 
583
 
584
  ######################
 
587
 
588
 
589
  with gr.Blocks() as app:
590
+ with gr.Column() as main_ui:
591
+ gr.Markdown("## 1. Select your input dataset")
592
+ with gr.Row(equal_height=False):
593
+ with gr.Column(scale=1):
594
+ search_in = HuggingfaceHubSearch(
595
+ label="Search",
596
+ placeholder="Search for a dataset",
597
+ search_type="dataset",
598
+ sumbit_on_select=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
599
  )
600
+ load_btn = gr.Button("Load dataset", variant="primary")
601
+ with gr.Column(scale=3):
602
+ search_out = gr.HTML(label="Dataset preview")
603
+
604
+ gr.HTML(value="<hr>")
605
+ gr.Markdown(value="## 2. Configure your task")
606
+ with gr.Row(equal_height=False):
607
+ with gr.Column(scale=1):
608
+ eval_type = gr.Dropdown(
609
+ label="Evaluation type",
610
+ choices=["ultrafeedback", "custom"],
611
+ value="ultrafeedback",
612
+ multiselect=False,
613
+ visible=False,
614
  )
615
+ with gr.Tab("ultrafeedback") as tab_instruction_response:
616
+ aspects_instruction_response = define_evaluation_aspects(
617
+ "ultrafeedback"
618
+ )
619
+ instruction_instruction_response = gr.Dropdown(
620
+ label="Instruction Column",
621
+ interactive=True,
622
+ multiselect=False,
623
+ allow_custom_value=False,
624
+ )
625
+ response_instruction_response = gr.Dropdown(
626
+ label="Response Column",
627
+ interactive=True,
628
+ multiselect=True,
629
+ allow_custom_value=False,
630
+ )
631
+ tab_instruction_response.select(
632
+ fn=lambda: "ultrafeedback",
633
+ inputs=[],
634
+ outputs=[eval_type],
635
+ )
636
+ with gr.Tab("custom") as tab_custom:
637
+ aspects_custom = define_evaluation_aspects("custom")
638
+ prompt_template = gr.Code(
639
+ label="Prompt template",
640
+ value="Evaluate {{column_1}} based on {{column_2}}.",
641
+ language="markdown",
642
+ interactive=True,
643
+ )
644
+ structured_output = gr.Code(
645
+ label="Structured output",
646
+ value=json.dumps(
647
+ {
648
+ "type": "object",
649
+ "properties": {
650
+ "quality": {"type": "integer"},
651
+ "clarity": {"type": "integer"},
652
+ "relevance": {"type": "integer"},
653
+ },
654
+ },
655
+ indent=4,
656
+ ),
657
+ language="json",
658
+ interactive=True,
659
+ )
660
+ tab_custom.select(
661
+ fn=lambda: "custom",
662
+ inputs=[],
663
+ outputs=[eval_type],
664
+ )
665
+ btn_apply_to_sample_dataset = gr.Button(
666
+ "Refresh dataset", variant="secondary", size="sm"
667
  )
668
+ with gr.Column(scale=3):
669
+ dataframe = gr.Dataframe(
670
+ headers=["prompt", "completion", "evaluation"],
671
+ wrap=False,
672
+ height=500,
673
+ interactive=False,
674
  )
675
+
676
+ gr.HTML(value="<hr>")
677
+ gr.Markdown(value="## 3. Evaluate your dataset")
678
+ with gr.Row(equal_height=False):
679
+ with gr.Column(scale=2):
680
+ org_name = get_org_dropdown()
681
+ repo_name = gr.Textbox(
682
+ label="Repo name",
683
+ placeholder="dataset_name",
684
+ value=f"my-distiset-{str(uuid.uuid4())[:8]}",
685
  interactive=True,
686
  )
687
+ num_rows = gr.Number(
688
+ label="Number of rows",
689
+ value=10,
 
690
  interactive=True,
691
+ scale=1,
692
  )
693
+ private = gr.Checkbox(
694
+ label="Private dataset",
695
+ value=False,
696
+ interactive=True,
697
+ scale=1,
698
  )
699
+ btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
700
+ with gr.Column(scale=3):
701
+ success_message = gr.Markdown(visible=True)
702
+ with gr.Accordion(
703
+ "Do you want to go further? Customize and run with Distilabel",
704
+ open=False,
705
+ visible=False,
706
+ ) as pipeline_code_ui:
707
+ code = generate_pipeline_code(
708
+ repo_id=search_in.value,
709
+ aspects=aspects_instruction_response.value,
710
+ instruction_column=instruction_instruction_response,
711
+ response_columns=response_instruction_response,
712
+ prompt_template=prompt_template.value,
713
+ structured_output=structured_output.value,
714
+ num_rows=num_rows.value,
715
+ eval_type=eval_type.value,
716
+ )
717
+ pipeline_code = gr.Code(
718
+ value=code,
719
+ language="python",
720
+ label="Distilabel Pipeline Code",
721
+ )
722
+
723
+ search_in.submit(fn=get_iframe, inputs=search_in, outputs=search_out)
 
 
 
 
 
724
 
 
725
  load_btn.click(
726
+ fn=load_dataset_from_hub,
727
  inputs=[search_in],
728
  outputs=[
729
  dataframe,
 
730
  instruction_instruction_response,
731
  response_instruction_response,
732
  ],
733
  )
734
+
735
  btn_apply_to_sample_dataset.click(
736
+ fn=evaluate_sample_dataset,
737
  inputs=[
738
  search_in,
739
  eval_type,
 
740
  aspects_instruction_response,
 
 
741
  instruction_instruction_response,
742
  response_instruction_response,
743
  prompt_template,
 
745
  ],
746
  outputs=dataframe,
747
  )
748
+
749
  btn_push_to_hub.click(
750
+ fn=validate_argilla_user_workspace_dataset,
751
+ inputs=[repo_name],
752
+ outputs=[success_message],
753
+ show_progress=True,
754
+ ).then(
755
+ fn=validate_push_to_hub,
756
+ inputs=[org_name, repo_name],
757
+ outputs=[success_message],
758
+ show_progress=True,
759
+ ).success(
760
+ fn=hide_success_message,
761
+ outputs=[success_message],
762
+ show_progress=True,
763
+ ).success(
764
+ fn=push_dataset,
765
  inputs=[
766
  org_name,
767
  repo_name,
 
769
  num_rows,
770
  search_in,
771
  eval_type,
 
772
  aspects_instruction_response,
 
 
773
  instruction_instruction_response,
774
  response_instruction_response,
775
  prompt_template,
776
  structured_output,
777
  ],
778
+ outputs=[success_message],
779
+ show_progress=True,
780
+ ).success(
781
+ fn=show_success_message,
782
+ inputs=[org_name, repo_name],
783
+ outputs=[success_message],
784
+ ).success(
785
+ fn=generate_pipeline_code,
786
+ inputs=[
787
+ search_in,
788
+ aspects_instruction_response,
789
+ instruction_instruction_response,
790
+ response_instruction_response,
791
+ prompt_template,
792
+ structured_output,
793
+ num_rows,
794
+ eval_type,
795
+ ],
796
+ outputs=[pipeline_code],
797
+ ).success(
798
+ fn=update_pipeline_code_visibility,
799
+ inputs=[],
800
+ outputs=[pipeline_code_ui],
801
  )
802
+
803
+ app.load(fn=swap_visibility, outputs=main_ui)
804
  app.load(fn=get_org_dropdown, outputs=[org_name])
src/distilabel_dataset_generator/apps/sft.py CHANGED
@@ -499,6 +499,10 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
499
  fn=show_success_message,
500
  inputs=[org_name, repo_name],
501
  outputs=[success_message],
 
 
 
 
502
  ).success(
503
  fn=update_pipeline_code_visibility,
504
  inputs=[],
 
499
  fn=show_success_message,
500
  inputs=[org_name, repo_name],
501
  outputs=[success_message],
502
+ ).success(
503
+ fn=generate_pipeline_code,
504
+ inputs=[system_prompt, num_turns, num_rows],
505
+ outputs=[pipeline_code],
506
  ).success(
507
  fn=update_pipeline_code_visibility,
508
  inputs=[],
src/distilabel_dataset_generator/apps/textcat.py CHANGED
@@ -526,6 +526,17 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
526
  fn=show_success_message,
527
  inputs=[org_name, repo_name],
528
  outputs=[success_message],
 
 
 
 
 
 
 
 
 
 
 
529
  ).success(
530
  fn=update_pipeline_code_visibility,
531
  inputs=[],
 
526
  fn=show_success_message,
527
  inputs=[org_name, repo_name],
528
  outputs=[success_message],
529
+ ).success(
530
+ fn=generate_pipeline_code,
531
+ inputs=[
532
+ system_prompt,
533
+ difficulty,
534
+ clarity,
535
+ labels,
536
+ num_labels,
537
+ num_rows,
538
+ ],
539
+ outputs=[pipeline_code],
540
  ).success(
541
  fn=update_pipeline_code_visibility,
542
  inputs=[],
src/distilabel_dataset_generator/pipelines/eval.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from datasets import get_dataset_config_names, get_dataset_split_names
4
+ from distilabel.llms import InferenceEndpointsLLM
5
+ from distilabel.steps.tasks import (
6
+ UltraFeedback,
7
+ TextGeneration,
8
+ )
9
+
10
+ from src.distilabel_dataset_generator.pipelines.base import (
11
+ MODEL,
12
+ _get_next_api_key,
13
+ )
14
+ from src.distilabel_dataset_generator.utils import extract_column_names
15
+
16
+
17
def get_ultrafeedback_evaluator(aspect, is_sample):
    """Create and load an ``UltraFeedback`` task for one evaluation aspect.

    Args:
        aspect: Aspect forwarded unchanged to ``UltraFeedback``.
        is_sample: When truthy, generation is capped at 256 new tokens;
            otherwise at 2048.

    Returns:
        The ``UltraFeedback`` task after ``load()`` has been called on it.
    """
    token_budget = 256 if is_sample else 2048
    llm = InferenceEndpointsLLM(
        model_id=MODEL,
        tokenizer_id=MODEL,
        api_key=_get_next_api_key(),
        generation_kwargs={"temperature": 0.7, "max_new_tokens": token_budget},
    )
    evaluator = UltraFeedback(llm=llm, aspect=aspect)
    evaluator.load()
    return evaluator
32
+
33
+
34
def get_custom_evaluator(prompt_template, structured_output, columns, is_sample):
    """Create and load a ``TextGeneration`` task that evaluates with a custom prompt.

    Args:
        prompt_template: Template passed to ``TextGeneration`` as ``template``.
        structured_output: Schema placed under the ``"schema"`` key of the
            LLM's ``structured_output`` setting (with ``"format": "json"``).
        columns: Column names passed to ``TextGeneration`` as ``columns``.
        is_sample: When truthy, generation is capped at 256 new tokens;
            otherwise at 2048.

    Returns:
        The ``TextGeneration`` task after ``load()`` has been called on it.
    """
    token_budget = 256 if is_sample else 2048
    llm = InferenceEndpointsLLM(
        model_id=MODEL,
        tokenizer_id=MODEL,
        api_key=_get_next_api_key(),
        structured_output={"format": "json", "schema": structured_output},
        generation_kwargs={"temperature": 0.7, "max_new_tokens": token_budget},
    )
    evaluator = TextGeneration(llm=llm, template=prompt_template, columns=columns)
    evaluator.load()
    return evaluator
51
+
52
+
53
def generate_ultrafeedback_pipeline_code(
    repo_id, subset, split, aspects, instruction_column, response_columns, num_rows
):
    """Render a standalone distilabel script that runs UltraFeedback evaluation.

    With exactly one aspect the script contains a single ``UltraFeedback``
    task; otherwise it creates one task per aspect and merges their results
    with ``CombineOutputs``.

    Args:
        repo_id: Hub dataset id embedded in the ``load_dataset`` call.
        subset: Dataset configuration name.
        split: Dataset split name; the script loads its first ``num_rows`` rows.
        aspects: Aspects to evaluate. ``None`` is treated as an empty list.
        instruction_column: Name of the column holding the instruction.
        response_columns: Name(s) of the column(s) holding the responses.
        num_rows: Number of rows the generated script should load.

    Returns:
        The generated Python source as a string.
    """
    # Tolerate a missing selection instead of failing on len(None).
    aspects = list(aspects or [])
    if len(aspects) == 1:
        # The generated script has no `aspect` variable, so the chosen aspect
        # is inlined as a literal.
        code = f"""
# Requirements: `pip install distilabel[hf-inference-endpoints]`
import os
from datasets import load_dataset
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts
from distilabel.steps.tasks import UltraFeedback
from distilabel.llms import InferenceEndpointsLLM

MODEL = "{MODEL}"
os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained

hf_ds = load_dataset("{repo_id}", "{subset}", split="{split}[:{num_rows}]")
# `preprocess_data` is a helper you provide: it must return a list of dictionaries with `instruction` and `generations` keys
data = preprocess_data(hf_ds, "{instruction_column}", {response_columns!r})

with Pipeline(name="ultrafeedback") as pipeline:

    load_the_dataset = LoadDataFromDicts(
        data = data,
    )

    ultrafeedback_evaluator = UltraFeedback(
        llm=InferenceEndpointsLLM(
            model_id=MODEL,
            tokenizer_id=MODEL,
            api_key=os.environ["HF_TOKEN"],
            generation_kwargs={{
                "temperature": 0.7,
                "max_new_tokens": 2048,
            }},
        ),
        aspect={aspects[0]!r},
    )

    load_the_dataset >> ultrafeedback_evaluator

if __name__ == "__main__":
    distiset = pipeline.run()
"""
    else:
        # The aspect list is emitted into the script so its `for` loop has
        # something to iterate over; `output_mappings` belongs to the task,
        # so the LLM call is closed before it.
        code = f"""
# Requirements: `pip install distilabel[hf-inference-endpoints]`
import os
from datasets import load_dataset
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts, CombineOutputs
from distilabel.steps.tasks import UltraFeedback
from distilabel.llms import InferenceEndpointsLLM

MODEL = "{MODEL}"
os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained

aspects = {aspects!r}

hf_ds = load_dataset("{repo_id}", "{subset}", split="{split}[:{num_rows}]")
# `preprocess_data` is a helper you provide: it must return a list of dictionaries with `instruction` and `generations` keys
data = preprocess_data(hf_ds, "{instruction_column}", {response_columns!r})

with Pipeline(name="ultrafeedback") as pipeline:

    load_the_dataset = LoadDataFromDicts(
        data = data,
    )

    tasks = []
    for aspect in aspects:
        evaluate_responses = UltraFeedback(
            name=f"evaluate-responses-{{aspect}}",
            aspect=aspect,
            llm=InferenceEndpointsLLM(
                model_id=MODEL,
                tokenizer_id=MODEL,
                api_key=os.environ["HF_TOKEN"],
                generation_kwargs={{
                    "temperature": 0.7,
                    "max_new_tokens": 2048,
                }},
            ),
            output_mappings={{
                "ratings": f"ratings_{{aspect}}",
                "types": f"type_{{aspect}}",
                "rationales": f"rationales_for_types_{{aspect}}",
                "rationales-for-ratings": f"rationales_for_ratings_{{aspect}}",
            }} if aspect in ["truthfulness", "helpfulness"] else {{"rationales": f"rationales_{{aspect}}", "ratings": f"ratings_{{aspect}}"}},
        )
        tasks.append(evaluate_responses)

    combine_outputs = CombineOutputs()

    load_the_dataset >> tasks >> combine_outputs

if __name__ == "__main__":
    distiset = pipeline.run()
"""
    return code
147
+
148
+
149
def generate_custom_pipeline_code(
    repo_id, subset, split, prompt_template, structured_output, num_rows
):
    """Render a standalone distilabel script that runs a custom-prompt evaluation.

    Args:
        repo_id: Hub dataset id passed to ``LoadDataFromHub``.
        subset: Dataset configuration name.
        split: Dataset split name.
        prompt_template: Template for the ``TextGeneration`` task; its
            undeclared variables become the task's ``columns``.
        structured_output: JSON schema text inserted verbatim as the
            ``"schema"`` value in the generated script.
        num_rows: Number of examples the generated script should load.

    Returns:
        The generated Python source as a string.
    """
    # The input columns are the variables of the prompt template, not of the
    # JSON schema describing the output.
    columns = extract_column_names(prompt_template)
    # `!r` emits the template as a valid Python string literal even when it
    # contains quotes or newlines.
    code = f"""
# Requirements: `pip install "distilabel[hf-inference-endpoints,instructor]"`
import os
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromHub
from distilabel.steps.tasks import TextGeneration
from distilabel.llms import InferenceEndpointsLLM

MODEL = "{MODEL}"
CUSTOM_TEMPLATE = {prompt_template!r}
os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained

with Pipeline(name="custom-evaluation") as pipeline:
    load_the_dataset = LoadDataFromHub(
        repo_id="{repo_id}",
        config="{subset}",
        split="{split}",
        num_examples={num_rows},
        batch_size=2
    )
    custom_evaluator = TextGeneration(
        llm=InferenceEndpointsLLM(
            model_id=MODEL,
            tokenizer_id=MODEL,
            api_key=os.environ["HF_TOKEN"],
            structured_output={{"format": "json", "schema": {structured_output}}},
            generation_kwargs={{
                "temperature": 0.7,
                "max_new_tokens": 2048,
            }},
        ),
        template=CUSTOM_TEMPLATE,
        columns={columns}
    )

    load_the_dataset >> custom_evaluator

if __name__ == "__main__":
    distiset = pipeline.run()
"""
    return code
194
+
195
+
196
def generate_pipeline_code(
    repo_id,
    aspects,
    instruction_column,
    response_columns,
    prompt_template,
    structured_output,
    num_rows,
    eval_type,
):
    """Render the distilabel pipeline script matching the selected evaluation type.

    When no dataset is selected, the placeholders ``"default"`` / ``"train"``
    are used for subset and split; otherwise the first configuration and the
    first split of ``repo_id`` are looked up.

    Args:
        repo_id: Hub dataset id, or ``None`` / empty when nothing is selected.
        aspects: Aspects for the ``"ultrafeedback"`` evaluation.
        instruction_column: Instruction column for ``"ultrafeedback"``.
        response_columns: Response column(s) for ``"ultrafeedback"``.
        prompt_template: Prompt template for the custom evaluation.
        structured_output: JSON schema text for the custom evaluation.
        num_rows: Number of rows the generated script should process.
        eval_type: ``"ultrafeedback"`` selects the UltraFeedback script; any
            other value selects the custom one.

    Returns:
        The generated Python source as a string.
    """
    # An empty string means "nothing selected" just like None; neither is a
    # dataset id that can be looked up.
    if not repo_id:
        subset, split = "default", "train"
    else:
        subset = get_dataset_config_names(repo_id)[0]
        split = get_dataset_split_names(repo_id, subset)[0]

    if eval_type == "ultrafeedback":
        return generate_ultrafeedback_pipeline_code(
            repo_id,
            subset,
            split,
            aspects,
            instruction_column,
            response_columns,
            num_rows,
        )
    return generate_custom_pipeline_code(
        repo_id, subset, split, prompt_template, structured_output, num_rows
    )
src/distilabel_dataset_generator/utils.py CHANGED
@@ -1,8 +1,11 @@
 
1
  import os
2
  from typing import List, Optional, Union
3
 
4
  import argilla as rg
5
  import gradio as gr
 
 
6
  from gradio.oauth import (
7
  OAUTH_CLIENT_ID,
8
  OAUTH_CLIENT_SECRET,
@@ -11,6 +14,7 @@ from gradio.oauth import (
11
  get_space,
12
  )
13
  from huggingface_hub import whoami
 
14
 
15
  _LOGGED_OUT_CSS = ".main_ui_logged_out{opacity: 0.3; pointer-events: none}"
16
 
@@ -132,6 +136,91 @@ def get_argilla_client() -> Union[rg.Argilla, None]:
132
  except Exception:
133
  return None
134
 
135
-
136
  def get_preprocess_labels(labels: Optional[List[str]]) -> List[str]:
137
  return list(set([label.lower().strip() for label in labels])) if labels else []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
  import os
3
  from typing import List, Optional, Union
4
 
5
  import argilla as rg
6
  import gradio as gr
7
+ import numpy as np
8
+ import pandas as pd
9
  from gradio.oauth import (
10
  OAUTH_CLIENT_ID,
11
  OAUTH_CLIENT_SECRET,
 
14
  get_space,
15
  )
16
  from huggingface_hub import whoami
17
+ from jinja2 import Environment, meta
18
 
19
  _LOGGED_OUT_CSS = ".main_ui_logged_out{opacity: 0.3; pointer-events: none}"
20
 
 
136
  except Exception:
137
  return None
138
 
 
139
def get_preprocess_labels(labels: Optional[List[str]]) -> List[str]:
    """Normalize labels to lowercase, stripped, de-duplicated form.

    Duplicates are dropped while keeping the first occurrence's position, so
    the result is the same on every run (a ``set`` round-trip is not).

    Args:
        labels: Raw labels, or ``None`` / empty for no labels.

    Returns:
        The normalized unique labels in first-seen order; ``[]`` when
        ``labels`` is falsy.
    """
    if not labels:
        return []
    # dict keys keep insertion order, giving order-preserving de-duplication.
    return list(dict.fromkeys(label.lower().strip() for label in labels))
141
+
142
+
143
def column_to_list(dataframe: pd.DataFrame, column_name: str) -> List[str]:
    """Return the values of one dataframe column as a plain list.

    Args:
        dataframe: Dataframe to read from.
        column_name: Name of the column to extract.

    Returns:
        The column's values, in row order.

    Raises:
        ValueError: If ``column_name`` is not a column of ``dataframe``.
    """
    if column_name not in dataframe.columns:
        raise ValueError(f"Column '{column_name}' does not exist.")
    selected = dataframe[column_name]
    return selected.tolist()
148
+
149
+
150
def _parse_chat_messages(value: str):
    """Decode *value* as a JSON list of message dicts, or return ``None``."""
    try:
        parsed = json.loads(value)
    except json.JSONDecodeError:
        return None
    # Valid JSON that is not a list of objects ("42", "null", '{"a": 1}', ...)
    # is ordinary text, not a serialized conversation.
    if isinstance(parsed, list) and all(isinstance(item, dict) for item in parsed):
        return parsed
    return None


def _contents_for_role(messages, role: str) -> list:
    """Collect the ``content`` of every message dict whose ``role`` matches."""
    return [
        message["content"]
        for message in messages
        if isinstance(message, dict)
        and message.get("role") == role
        and "content" in message
    ]


def _extract_instruction(value) -> str:
    """Return the instruction text held by one cell (last user turn for chats)."""
    if isinstance(value, (list, np.ndarray)):
        user_contents = _contents_for_role(value, "user")
        return user_contents[-1] if user_contents else ""
    if isinstance(value, str):
        messages = _parse_chat_messages(value)
        if messages is None:
            return value
        user_contents = _contents_for_role(messages, "user")
        return user_contents[-1] if user_contents else ""
    return ""


def _extract_generations(value) -> list:
    """Return the generation(s) held by one cell (last assistant turn for chats)."""
    if isinstance(value, (list, np.ndarray)):
        if all(isinstance(item, dict) and "role" in item for item in value):
            return _contents_for_role(value, "assistant")[-1:]
        # A plain list of responses contributes each element as a generation.
        return list(value)
    if isinstance(value, str):
        messages = _parse_chat_messages(value)
        if messages is None:
            return [value]
        return _contents_for_role(messages, "assistant")[-1:]
    return []


def process_columns(
    dataframe,
    instruction_column: str,
    response_columns: Union[str, List[str]],
) -> List[dict]:
    """Convert dataframe rows into ``instruction`` / ``generations`` records.

    Each cell may be plain text, a list of chat messages (dicts with ``role``
    and ``content``), a JSON string encoding such a list, or, for response
    columns, a plain list of responses. For chat data the last ``user``
    message becomes the instruction and the last ``assistant`` message becomes
    a generation. Strings that are not serialized chat messages are used
    as-is, even when they happen to be valid JSON.

    Args:
        dataframe: Dataframe whose rows are converted.
        instruction_column: Name of the column holding the instruction.
        response_columns: One column name or a list of column names holding
            responses; ``None`` is treated as no columns.

    Returns:
        One ``{"instruction": ..., "generations": [...]}`` dict per row.
    """
    if isinstance(response_columns, str):
        response_columns = [response_columns]
    response_columns = response_columns or []

    data = []
    for _, row in dataframe.iterrows():
        generations = []
        for col in response_columns:
            generations.extend(_extract_generations(row[col]))
        data.append(
            {
                "instruction": _extract_instruction(row[instruction_column]),
                "generations": generations,
            }
        )
    return data
211
+
212
+
213
def extract_column_names(prompt_template: str) -> List[str]:
    """Return the undeclared variable names used in a Jinja2 template.

    Args:
        prompt_template: Jinja2 template source, e.g.
            ``"Evaluate {{column_1}} based on {{column_2}}."``.

    Returns:
        The variable names in sorted order. Jinja2 reports them as a set, so
        sorting keeps the result identical from one run to the next.
    """
    env = Environment()
    parsed_content = env.parse(prompt_template)
    variables = meta.find_undeclared_variables(parsed_content)
    return sorted(variables)
218
+
219
+
220
def pad_or_truncate_list(lst, target_length):
    """Return a list of exactly ``target_length`` items built from ``lst``.

    Longer inputs keep only their last ``target_length`` items; shorter inputs
    are right-padded with ``None``.

    Args:
        lst: Input sequence, or ``None`` for an empty one.
        target_length: Desired length. Zero or a negative value yields ``[]``.

    Returns:
        A new list of length ``max(target_length, 0)``.
    """
    # Compare against None explicitly: truth-testing a multi-element numpy
    # array raises, and list() gives uniform slicing and concatenation.
    items = [] if lst is None else list(lst)
    if target_length <= 0:
        # `items[-0:]` would return the whole list rather than nothing.
        return []
    surplus = len(items) - target_length
    if surplus >= 0:
        return items[surplus:]
    return items + [None] * (-surplus)