import gradio as gr from huggingface_hub import whoami from datetime import datetime, timezone from dataset_uploader import ParquetScheduler ########## # Setup # ########## # only show an info the first time uploading to the hub show_info = True every = 1 # we push once every 1 minute (use 5 if there are lots of people using the same HF token) choices = ["sharegpt","standard"] # schedulers schedulers = { "sft-sharegpt": ParquetScheduler(repo_id="phxia/sft-sharegpt", every=every), "sft-standard": ParquetScheduler(repo_id="phxia/sft-standard", every=every), "dpo-sharegpt": ParquetScheduler(repo_id="phxia/dpo-sharegpt", every=every), "dpo-standard": ParquetScheduler(repo_id="phxia/dpo-standard", every=every), } ########## # Utils # ########## def chat_message(role, content, prompt_type=None): """ A function that transforms the chat content into a chat message Args: role: A string, either "user" or "assistant" content: A string, the content of the message prompt_type: A string, either "standard" or "sharegpt" Returns: A dictionary, the message to be sent to the chatbot. """ if prompt_type == "sharegpt": if role == "user": role = "human" elif role == "assistant": role = "gpt" # sharegpt chat format return {"from": role, "value": content} else: return {"role": role, "content": content} def chat(prompt: str, history=[]): """ A function that generates a response to a given prompt. Args: prompt: A string, the prompt to be sent to the chatbot. history: A list of dictionaries, each dictionary being a message from the user or the assistant. Returns: A generator in the form of a single updated list of dictionaries, being a list of messages from the user and assistant """ if history == [] or (len(history) > 1 and history[-1]["role"] == "assistant"): history.append(chat_message("user", prompt)) else: history.append(chat_message("assistant", prompt)) return history def clear_textbox_field(): """ A function that clears the textbox field. """ return None def clear_both_fields(): """ A function that clears both the textbox and the chatbot. """ return None, None def clear_3_fields(): """ A function that clears both the textbox and the chatbot. """ return None, None, None def setup_submission(system_prompt="", history=[], chat_format="sharegpt"): # removes the extra metadata field from the chat history and format sharegpt accordingly for i in range(len(history)): sample = history[i] history[i] = chat_message( sample["role"], sample["content"], prompt_type=chat_format ) # add system prompt if provided system_prompt = system_prompt.strip() if system_prompt != "": sys = chat_message("system", system_prompt, prompt_type=chat_format) history.insert(0, sys) return history def save_sft_data(system_prompt="", history=[], sft_chat_format="sharegpt", oauth_token: gr.OAuthToken | None = None): """ A function that pushes the data to the hub. """ contributor_username = whoami(oauth_token.token)["name"] # setup the info message to only show once global show_info scheduler = schedulers[f"sft-{sft_chat_format}"] # case user clicked submit and did not have any chat history if history == []: raise gr.Error("you need to setup a chat first") # case history ends with user prompt if history[-1]["role"] == "user": raise gr.Error("history needs to end with assistant prompt") history = setup_submission(system_prompt, history, sft_chat_format) # preparing the submission data = {"contributor": contributor_username} data["timestamp"] = str(datetime.now(timezone.utc)) data["chat_format"] = sft_chat_format data["conversations"] = history # submitting the data scheduler.append(data) # show the info message only once if show_info: gr.Info("Data has been saved successfully (this message is only shown once)") gr.Info( "The scheduler may take up to 1 minute to push the data, please wait 🤗" ) show_info = False def save_dpo_data( system_prompt="", history=[], chosen="", rejected="", dpo_chat_format="sharegpt", oauth_token: gr.OAuthToken | None = None ): """ A function that pushes the data to the hub. """ contributor_username = whoami(oauth_token.token)["name"] # setup the info message to only show once global show_info scheduler = schedulers[f"dpo-{dpo_chat_format}"] # case user clicked submit and did not have any chat history if history == []: raise gr.Error("you need to setup a chat first") # case history ends with user prompt if history[-1]["role"] == "assistant": raise gr.Error("history needs to end with user prompt") # case chosen and rejected are not full chosen, rejected = chosen.strip(), rejected.strip() if chosen == "" or rejected == "": raise gr.Error( "both chosen and rejected need to have a text when you click the submit button" ) history = setup_submission(system_prompt, history, dpo_chat_format) chosen_chat, rejected_chat = history.copy(), history.copy() chosen_chat.append(chat_message("user", chosen, dpo_chat_format)) rejected_chat.append(chat_message("user", rejected, dpo_chat_format)) # preparing the submission data = {"contributor": contributor_username} data["timestamp"] = str(datetime.now(timezone.utc)) data["chat_format"] = dpo_chat_format data["prompt"] = history data["chosen"] = chosen_chat data["rejected"] = rejected_chat # submitting the data scheduler.append(data) # show the info message only once if show_info: gr.Info("Data has been saved successfully (this message is only shown once)") gr.Info( "The scheduler may take up to 1 minute to push the data, please wait 🤗" ) show_info = False def undo_chat(history): return history[:-2] ############## # Interface # ############## with gr.Blocks() as interface: gr.Markdown("