import json import gradio as gr import pandas as pd from datasets import load_dataset from gradio_huggingfacehub_search import HuggingfaceHubSearch from src.distilabel_dataset_generator.utils import get_org_dropdown def get_iframe(hub_repo_id) -> str: if not hub_repo_id: raise gr.Error("Hub repo id is required") url = f"https://huggingface.co/datasets/{hub_repo_id}/embed/viewer" iframe = f""" """ return iframe def get_valid_columns(df: pd.DataFrame): valid_columns = [] for col in df.columns: sample_val = df[col].iloc[0] if isinstance(sample_val, str) or ( isinstance(sample_val, list) and all(isinstance(item, dict) for item in sample_val) ): valid_columns.append(col) return valid_columns def load_dataset_from_hub(hub_repo_id: str, num_rows: int = 10): gr.Info(message="Loading dataset ...") if not hub_repo_id: raise gr.Error("Hub repo id is required") ds_dict = load_dataset(hub_repo_id) splits = list(ds_dict.keys()) ds = ds_dict[splits[0]] if num_rows: ds = ds.select(range(num_rows)) df = ds.to_pandas() # Get columns that contain either strings or lists of dictionaries valid_columns = get_valid_columns(df) return ( df, gr.Dropdown(choices=valid_columns, label="Instruction Column"), gr.Dropdown(choices=valid_columns, label="Instruction Column"), gr.Dropdown(choices=valid_columns, label="Response Column"), ) def define_evaluation_aspects(task_type: str): if task_type == "instruction": return gr.Dropdown( value=["overall-rating"], choices=["complexity", "quality"], label="Evaluation Aspects", multiselect=True, interactive=True, ) elif task_type == "instruction-response": return gr.Dropdown( value=["overall-rating"], choices=["helpfulness", "truthfulness", "overall-rating", "honesty"], label="Evaluation Aspects", multiselect=True, interactive=True, ) else: return gr.Dropdown(interactive=False, visible=False) def evaluate_instruction(df: pd.DataFrame, aspects: list[str], instruction_column: str): pass def evaluate_instruction_response( df: pd.DataFrame, aspects: list[str], instruction_column: str, response_column: str ): pass def evaluate_custom( df: pd.DataFrame, aspects: list[str], prompt_template: str, structured_output: dict ): pass def _apply_to_dataset( df: pd.DataFrame, eval_type: str, aspects_instruction: list[str], instruction_column: str, aspects_instruction_response: list[str], instruction_column_response: str, response_column_response: str, aspects_custom: list[str], prompt_template: str, structured_output: dict, ): if eval_type == "instruction": df = evaluate_instruction(df, aspects_instruction, instruction_column) elif eval_type == "instruction-response": df = evaluate_instruction_response( df, aspects_instruction_response, instruction_column_response, response_column_response, ) elif eval_type == "custom": df = evaluate_custom(df, aspects_custom, prompt_template, structured_output) return df def apply_to_sample_dataset( repo_id: str, eval_type: str, aspects_instruction: list[str], aspects_instruction_response: list[str], aspects_custom: list[str], instruction_instruction: str, instruction_instruction_response: str, response_instruction_response: str, prompt_template: str, structured_output: dict, ): df, _, _, _ = load_dataset_from_hub(repo_id, num_rows=10) df = _apply_to_dataset( df, eval_type, aspects_instruction, instruction_instruction, aspects_instruction_response, instruction_instruction_response, response_instruction_response, aspects_custom, prompt_template, structured_output, ) return df def push_to_hub( org_name: str, repo_name: str, private: bool, num_rows: int, original_repo_id: str, eval_type: str, aspects_instruction: list[str], aspects_instruction_response: list[str], aspects_custom: list[str], instruction_instruction: str, instruction_instruction_response: str, response_instruction_response: str, prompt_template: str, structured_output: dict, ): df, _, _, _ = load_dataset_from_hub(original_repo_id, num_rows=num_rows) df = _apply_to_dataset( df, eval_type, aspects_instruction, instruction_instruction, aspects_instruction_response, instruction_instruction_response, response_instruction_response, aspects_custom, prompt_template, structured_output, ) new_repo_id = f"{org_name}/{repo_name}" ###################### # Gradio UI ###################### with gr.Blocks() as app: gr.Markdown("## 1. Select your input dataset") with gr.Row(equal_height=False): with gr.Column(scale=1): search_in = HuggingfaceHubSearch( label="Search", placeholder="Search for a Dataset", search_type="dataset", sumbit_on_select=True, ) load_btn = gr.Button("Load dataset") with gr.Column(scale=3): search_out = gr.HTML(label="Dataset Preview") gr.HTML("