"""Gradio chat UI that streams replies from a Hugging Face hosted chat model."""

import os
from collections.abc import Iterator

import gradio as gr
import requests  # not referenced below; kept because other code may rely on it
from huggingface_hub import InferenceClient

# Set up the inference API client. The token is read from the HF_TOKEN
# environment variable; os.getenv returns None when it is unset.
hf_client = InferenceClient(
    "meta-llama/Meta-Llama-3.1-70B-Instruct",
    token=os.getenv("HF_TOKEN"),
)

# Language policy that is prepended to whatever system prompt the user supplies.
SYSTEM_PREFIX = (
    " If the input language is Korean, respond in Korean. "
    "If it's English, respond in English. "
    "Do not output in both languages simultaneously. "
    "Always respond in Korean to Korean questions "
    "and in English to English questions. "
)


def respond(
    message: str,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
) -> Iterator[str]:
    """Stream the assistant's reply to ``message``.

    Args:
        message: The user's current input.
        history: Earlier turns as ``(user_msg, assistant_msg)`` pairs. A falsy
            ``assistant_msg`` is skipped, so only the user half is replayed.
        system_message: System prompt from the UI; appended after
            ``SYSTEM_PREFIX``.
        max_tokens: Passed to ``chat_completion`` as ``max_tokens``.
        temperature: Passed to ``chat_completion`` as ``temperature``.
        top_p: Passed to ``chat_completion`` as ``top_p``.

    Yields:
        The reply accumulated so far, once per streamed token that carries
        content. If the request raises, a single
        ``"An error occurred: ..."`` string is yielded instead.
    """
    messages = [
        {"role": "system", "content": f"{SYSTEM_PREFIX} {system_message}"}
    ]

    # Replay the conversation so far as user/assistant messages.
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:  # Only add the assistant message if it exists
            messages.append({"role": "assistant", "content": assistant_msg})

    # Add the current user message
    messages.append({"role": "user", "content": message})

    response = ""
    try:
        # The loop variable is deliberately NOT called `message`, so the
        # function parameter is not shadowed.
        for chunk in hf_client.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            if not chunk.choices:
                # Skip chunks that carry no choices instead of failing on
                # the [0] index below.
                continue
            token = chunk.choices[0].delta.content
            if token is not None:
                response += token
                yield response
    except Exception as e:
        # UI boundary: surface the failure in the chat window rather than
        # letting the exception propagate out of the generator.
        yield f"An error occurred: {str(e)}"


theme = "Nymbo/Nymbo_Theme"

# Hide the page footer.
css = " footer { visibility: hidden; } "

# The additional inputs are passed to `respond` positionally after
# (message, history), so their order must match its signature:
# system_message, max_tokens, temperature, top_p.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value=" You are an AI assistant. ", label="System Prompt"),
        gr.Slider(minimum=1, maximum=2000, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    theme=theme,  # Apply theme
    css=css,  # Apply CSS
)

if __name__ == "__main__":
    demo.launch()