"""Gradio front end for an English-practice voice/chat assistant.

The script builds a small web UI (text box, microphone input, chat window),
answers each user turn through ``conv_model`` and, when that answer looks like
a generic "I don't know" reply, falls back to an OpenAI chat completion that
role-plays a hotel attendant.
"""
import os
import nltk
import openai
import time
import gradio as gr
from threading import Thread  # thread used for the idle timer
from assets.char_poses_base64 import (  # character poses
    CHAR_IDLE_HTML, CHAR_THINKING_HTML, CHAR_TALKING_HTML)
from app_utils import (
    get_chat_history, initialize_knowledge_base,
    text_to_speech_gen, logging, buzz_user)

# Call counter used by update_img() to alternate the character pose.
FUNC_CALL = 0

# Seconds idle_timer() waits between buzzes; get_response() raises it to 80.
BUZZ_TIMEOUT = 60

# Substrings that mark a conv_model answer as generic, triggering the
# ChatCompletion fallback in get_response().
GENERAL_RSPONSE_TRIGGERS = ["I don't understand the question.", "I don't know", "Hello, my name is", "mentioned in the context provided"]

# Running conversation sent to the ChatCompletion fallback; get_response()
# appends every fallback question/answer pair to it.
MESSAGES = [{"role": "system", "content": "You are a helpful assistant.You accompany me to practice English and engage in scene dialogue. As a hotel attendant, I am checking in. You introduce the hotel to me and recommend hotel services to me. After receiving my needs, arrange for the service personnel to work. Please remember, my English is not very good. Please have a conversation with me in simple English. After you ask questions in English, please give me some English prompts so that I know how to answer you. Let's start the conversation. \nYou first say hello to me."}]

LOGGER = logging.getLogger('voice_agent')  # logger
AUDIO_HTML = ''

# Uncomment If this is your first Run:
nltk.download('averaged_perceptron_tagger')  # download the tagger corpus
conv_model, voice_model = initialize_knowledge_base()  # initialise the knowledge base


def idle_timer():
    """Call ``buzz_user()`` forever, once per ``BUZZ_TIMEOUT`` seconds.

    When the timeout is found at 80 (the value ``get_response`` sets after a
    user turn), the loop sleeps one extra period and resets it to 60.
    Never returns; intended to run in a background thread.
    """
    global BUZZ_TIMEOUT

    while True:
        time.sleep(BUZZ_TIMEOUT)
        buzz_user()

        if BUZZ_TIMEOUT == 80:
            time.sleep(BUZZ_TIMEOUT)
            BUZZ_TIMEOUT = 60


def update_img():
    """Return the next character pose, alternating on every call.

    Increments ``FUNC_CALL`` and returns ``CHAR_TALKING_HTML`` on even counts,
    ``CHAR_THINKING_HTML`` on odd counts.
    """
    global FUNC_CALL
    FUNC_CALL += 1

    if FUNC_CALL % 2 == 0:
        return CHAR_TALKING_HTML
    return CHAR_THINKING_HTML


def get_response(history, audio_input):
    """Answer the latest chat turn and produce its speech output.

    Args:
        history: Chat history as a list of ``[user, assistant]`` pairs; the
            last pair holds the pending user message in slot 0 and receives
            the answer in slot 1.
        audio_input: Path of the recorded audio file, or a falsy value when
            nothing was recorded. Only used when the text message is empty.

    Returns:
        ``(history, audio_html)`` with the answer filled in, or
        ``(None, None)`` when there is neither text nor audio to answer.

    Side effects:
        Sets ``BUZZ_TIMEOUT`` to 80, renames the audio file to ``*.wav``
        before transcription, and appends to ``MESSAGES`` when the
        ChatCompletion fallback is used.
    """
    global BUZZ_TIMEOUT

    query_type = 'text'
    question = history[-1][0]
    BUZZ_TIMEOUT = 80

    if not question:
        if not audio_input:
            return None, None

        query_type = 'audio'
        wav_path = audio_input + '.wav'
        os.rename(audio_input, wav_path)
        # Context manager closes the handle; the original left it open.
        with open(wav_path, "rb") as audio_file:
            transcript = openai.Audio.transcribe("whisper-1", audio_file)
        question = transcript['text']

    LOGGER.info("\nquery_type: %s", query_type)
    LOGGER.info("query_text: %s", question)
    print('\nquery_type:', query_type)
    print('\nquery_text:', question)

    if question.lower().strip() == 'hi':
        question = 'hello'

    answer = conv_model.run(question)
    LOGGER.info("\ndocument_response: %s", answer)
    print('\ndocument_response:', answer)

    # Fall back to the chat model at most once per turn. The original looped
    # over the triggers and kept testing the *replaced* answer, so a fallback
    # reply containing another trigger caused a duplicate question in
    # MESSAGES and a second API call.
    if any(trigger in answer for trigger in GENERAL_RSPONSE_TRIGGERS):
        MESSAGES.append({"role": "user", "content": question})
        chat = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=MESSAGES,
            temperature=0.7,
            # Only choices[0] is read, so request a single completion
            # (the original asked for 128 and discarded the rest).
            n=1,
            stop="\n"
        )
        answer = chat.choices[0].message.content
        MESSAGES.append({"role": "assistant", "content": answer})
        LOGGER.info("general_response: %s", answer)
        print('\ngeneral_response:', answer)

    audio_html = text_to_speech_gen(answer)
    history[-1][1] = answer

    return history, audio_html


# buzz_usr_proc = Thread(target=idle_timer)

with gr.Blocks(css="""#col_image{width:800px; height:800px; margin-left: auto; margin-right: auto;}""") as demo:

    with gr.Column():
        # Hidden HTML component that receives the generated speech markup.
        output_html = gr.HTML(label="Felix's Voice", value=AUDIO_HTML)
        output_html.visible = False
        image1 = gr.Image("assets/NPCtest1.png", shape=(800, 800), elem_id="col_image")
        #assistant_character = gr.HTML(label=None, value=CHAR_IDLE_HTML, show_label=False)
        #with gr.Column(scale=0.1):
        chatbot = gr.Chatbot(label='Send a text or a voice input').style(height=285)
        with gr.Row():
            msg = gr.Textbox(placeholder='Write a chat & press Enter.', show_label=False).style(container=False)
            with gr.Column(scale=0.5):
                audio_input = gr.Audio(source="microphone", type='filepath', show_label=False).style(container=False)
                button = gr.Button(value="Send")

    # Enter in the text box and the Send button share the same pipeline:
    # record the user turn in the chat, then compute and display the answer.
    msg.submit(get_chat_history, [msg, chatbot], [msg, chatbot]
               ).then(get_response, [chatbot, audio_input], [chatbot, output_html]
                      )

    button.click(get_chat_history, [msg, chatbot], [msg, chatbot]
                 ).then(get_response, [chatbot, audio_input], [chatbot, output_html]
                        )

# buzz_usr_proc.start()

demo.launch(debug=False, favicon_path='assets/favicon.png', show_api=False, share=False)