YingxuHe committed
Commit aba3925 · 1 Parent(s): aea1886
src/content/agent.py CHANGED
@@ -131,6 +131,54 @@ def bottom_input_section():
     st.session_state.new_prompt = chat_input
 
 
+def _prepare_final_prompt_with_ui(one_time_prompt):
+    relevant_query_indices = retrieve_relevant_docs(one_time_prompt, STANDARD_QUERIES)
+    if len(st.session_state.ag_messages) <= 2:
+        relevant_query_indices.append(0)
+
+    relevant_query_indices = list(
+        set(relevant_query_indices).difference(st.session_state.ag_visited_query_indices)
+    )
+
+    st.session_state.ag_visited_query_indices.extend(relevant_query_indices)
+
+    if not relevant_query_indices:
+        return LLM_PROMPT_TEMPLATE.format(
+            user_question=one_time_prompt,
+            audio_information_prompt=""
+        )
+
+    audio_info = []
+    with st.status("Thought process...", expanded=True) as status:
+        for idx in relevant_query_indices:
+            error_msg, warnings, response = retrive_response_with_ui(
+                model_name=MODEL_NAMES["with_lora"]["vllm_name"],
+                prompt=STANDARD_QUERIES[idx]["query_text"],
+                array_audio=st.session_state.ag_audio_array,
+                base64_audio=st.session_state.ag_audio_base64,
+                prefix=f"**{STANDARD_QUERIES[idx]['ui_text']}** :speech_balloon: : ",
+                stream=True
+            )
+            audio_info.append(STANDARD_QUERIES[idx]["response_prefix_text"] + response)
+
+            st.session_state.ag_messages[-1]["process"].append({
+                "error": error_msg,
+                "warnings": warnings,
+                "content": response
+            })
+
+        status.update(state="complete")
+
+    audio_information_prompt = AUDIO_INFO_TEMPLATE.format(
+        audio_information="\n".join(audio_info)
+    )
+
+    return LLM_PROMPT_TEMPLATE.format(
+        user_question=one_time_prompt,
+        audio_information_prompt=audio_information_prompt
+    )
+
+
 def conversation_section():
     chat_message_container = st.container(height=480)
     if st.session_state.ag_audio_array.size:
@@ -170,49 +218,12 @@ def conversation_section():
         with chat_message_container.chat_message("assistant"):
             assistant_message = {"role": "assistant", "process": []}
             st.session_state.ag_messages.append(assistant_message)
-
-            relevant_query_indices = retrieve_relevant_docs(one_time_prompt, STANDARD_QUERIES)
-            if len(st.session_state.ag_messages) <= 2:
-                relevant_query_indices.append(0)
-
-            relevant_query_indices = list(set(relevant_query_indices).difference(st.session_state.ag_visited_query_indices))
-
-            audio_info = []
-            if relevant_query_indices:
-                with st.status("Thought process...", expanded=True) as status:
-                    for idx in relevant_query_indices:
-                        error_msg, warnings, response = retrive_response_with_ui(
-                            model_name=MODEL_NAMES["with_lora"]["vllm_name"],
-                            prompt=STANDARD_QUERIES[idx]["query_text"],
-                            array_audio=st.session_state.ag_audio_array,
-                            base64_audio=st.session_state.ag_audio_base64,
-                            prefix=f"**{STANDARD_QUERIES[idx]['ui_text']}** :speech_balloon: : ",
-                            stream=True
-                        )
-                        audio_info.append(STANDARD_QUERIES[idx]["response_prefix_text"] + response)
-
-                        assistant_message["process"].append({
-                            "error": error_msg,
-                            "warnings": warnings,
-                            "content": response
-                        })
-
-                    status.update(state="complete")
-
-            audio_information_prompt = ""
-            if audio_info:
-                audio_information_prompt = AUDIO_INFO_TEMPLATE.format(
-                    audio_information="\n".join(audio_info)
-                )
-
-            prompt = LLM_PROMPT_TEMPLATE.format(
-                user_question=one_time_prompt,
-                audio_information_prompt=audio_information_prompt
-            )
+
+            final_prompt = _prepare_final_prompt_with_ui(one_time_prompt)
 
             error_msg, warnings, response = retrive_response_with_ui(
                 model_name=MODEL_NAMES["wo_lora"]["vllm_name"],
-                prompt=prompt,
+                prompt=final_prompt,
                 array_audio=st.session_state.ag_audio_array,
                 base64_audio="",
                 stream=True,
@@ -221,7 +232,7 @@ def conversation_section():
 
             assistant_message.update({"error": error_msg, "warnings": warnings, "content": response})
             st.session_state.ag_model_messages.extend([
-                {"role": "user", "content": prompt},
+                {"role": "user", "content": final_prompt},
                 {"role": "assistant", "content": response}
             ])
 
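Note on the refactor above: `_prepare_final_prompt_with_ui` moves the prompt assembly out of `conversation_section` without changing what it produces. The actual `LLM_PROMPT_TEMPLATE` and `AUDIO_INFO_TEMPLATE` strings are defined elsewhere in the repository and are not part of this diff, so the sketch below uses hypothetical stand-in templates purely to show how the `.format()` plumbing composes the final prompt.

```python
# Hypothetical stand-ins for the real templates (defined elsewhere in the repo);
# only the .format() plumbing mirrors what _prepare_final_prompt_with_ui returns.
AUDIO_INFO_TEMPLATE = "Here is some information about the audio clip:\n{audio_information}\n\n"
LLM_PROMPT_TEMPLATE = "{audio_information_prompt}Please answer the user's question: {user_question}"


def compose_final_prompt(user_question, audio_info):
    """Fold the per-query findings (if any) into one prompt for the wo_lora model."""
    audio_information_prompt = ""
    if audio_info:
        audio_information_prompt = AUDIO_INFO_TEMPLATE.format(
            audio_information="\n".join(audio_info)
        )
    return LLM_PROMPT_TEMPLATE.format(
        user_question=user_question,
        audio_information_prompt=audio_information_prompt,
    )


print(compose_final_prompt(
    "What are the speakers talking about?",
    ["By analyzing pitch, formants, harmonics, and prosody features ...: likely one female speaker."],
))
```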
src/content/common.py CHANGED
@@ -317,19 +317,19 @@ STANDARD_QUERIES = [
     },
     {
         "query_text": "May I know the gender of the speakers",
-        "doc_text": "Please identify speaker gender by analyzing pitch, formants, harmonics, and prosody features, which reflect physiological and speech pattern differences between genders.",
+        "doc_text": "Please identify the gender of the speaker. For instance, whether is the speaker male or female.",
         "response_prefix_text": "By analyzing pitch, formants, harmonics, and prosody features, which reflect physiological and speech pattern differences between genders: ",
         "ui_text": "gender recognition"
     },
     {
         "query_text": "May I know the nationality of the speakers",
-        "doc_text": "Discover speakers' nationality, country, or the place he is coming from. Analyze speakers' accent, pronunciation patterns, intonation, rhythm, phoneme usage, and language-specific speech features influenced by cultural and linguistic backgrounds.",
+        "doc_text": "Discover speakers' nationality, country, or the place he is coming from, from his/her accent, pronunciation patterns, and other language-specific speech features influenced by cultural and linguistic backgrounds.",
         "response_prefix_text": "By analyzing accent, pronunciation patterns, intonation, rhythm, phoneme usage, and language-specific speech features influenced by cultural and linguistic backgrounds: ",
         "ui_text": "accent recognition"
     },
     {
         "query_text": "Can you guess which ethnic group this person is from based on their accent.",
-        "doc_text": "Discover speakers' ethnic group, home country, or the place he is coming from, from speech features like accent, tone, intonation, phoneme variations, and vocal characteristics influenced by cultural, regional, and linguistic factors.",
+        "doc_text": "Discover speakers' ethnic group, home country, or the place he is coming from, from his/her accent, tone, and other vocal characteristics influenced by cultural, regional, and linguistic factors.",
         "response_prefix_text": "By analyzing speech features like accent, tone, intonation, phoneme variations, and vocal characteristics influenced by cultural, regional, and linguistic factors: ",
         "ui_text": "accent recognition"
     },
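For readers skimming the `doc_text` edits above: each `STANDARD_QUERIES` entry plays a different role in `src/content/agent.py` and `src/retrieval.py`. `doc_text` is what `retrieve_relevant_docs` scores against the user question, `query_text` is what is actually sent to the `with_lora` model, `response_prefix_text` is prepended to that model's answer before it is folded into the final prompt, and `ui_text` labels the step inside the "Thought process..." status box. A shortened illustration (the model answer below is hypothetical):

```python
# Shortened illustration of one STANDARD_QUERIES entry and where each field is used.
entry = {
    "query_text": "May I know the gender of the speakers",          # sent to the with_lora model
    "doc_text": "Please identify the gender of the speaker. ...",   # scored against the user question
    "response_prefix_text": "By analyzing pitch, formants, ...: ",  # prepended to the model's answer
    "ui_text": "gender recognition",                                # labels the st.status step
}

model_answer = "the speaker is most likely female"  # hypothetical with_lora output
audio_info_line = entry["response_prefix_text"] + model_answer
print(audio_info_line)  # "By analyzing pitch, formants, ...: the speaker is most likely female"
```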
src/retrieval.py CHANGED
@@ -15,6 +15,6 @@ def load_retriever():
 def retrieve_relevant_docs(user_question, docs: List[Dict]) -> List[int]:
     scores = st.session_state.retriever.compute_score([[user_question, d["doc_text"]] for d in docs], normalize=True)
     normalized_scores = np.array(scores) / np.sum(scores)
-
-    selected_indices = np.where((np.array(scores) > 0.02) & (normalized_scores > 0.3))[0]
+
+    selected_indices = np.where((np.array(scores) > 0.2) & (normalized_scores > 0.3))[0]
     return selected_indices.tolist()
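Raising the absolute score threshold from 0.02 to 0.2 adds a meaningful floor on top of the existing relative cut-off (`normalized_scores > 0.3`): a standard query that merely dominates a set of uniformly weak scores is no longer selected. A standalone sketch with made-up scores, assuming the same selection rule as above:

```python
import numpy as np

# Made-up similarity scores for three STANDARD_QUERIES candidates.
scores = np.array([0.10, 0.01, 0.01])
normalized_scores = scores / scores.sum()   # -> [0.833, 0.083, 0.083]

# Old rule: a dominant match passes even when its absolute score is tiny.
old = np.where((scores > 0.02) & (normalized_scores > 0.3))[0]
# New rule: the match must also clear the 0.2 absolute floor.
new = np.where((scores > 0.2) & (normalized_scores > 0.3))[0]

print(old.tolist())  # [0] -> selected under the old 0.02 threshold
print(new.tolist())  # []  -> filtered out under the new 0.2 threshold
```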
style/app_style.css CHANGED
@@ -1,6 +1,19 @@
 div[data-testid="stMainBlockContainer"] {
     padding-top: 2rem;
     padding-bottom: 1rem;
+    height: 100%;
+}
+
+div[data-testid="stMainBlockContainer"]>div[data-testid="stVerticalBlockBorderWrapper"] {
+    height: 100%;
+}
+
+div[data-testid="stMainBlockContainer"]>div[data-testid="stVerticalBlockBorderWrapper"]>div {
+    height: 100%;
+}
+
+div[data-testid="stMainBlockContainer"]>div[data-testid="stVerticalBlockBorderWrapper"]>div>div {
+    height: 100%;
 }
 
 div[data-testid="stMainBlockContainer"] div[data-testid="stAudioInput"]>div {
@@ -25,6 +38,11 @@ div[data-testid="stChatMessage"]:has(> div[data-testid="stChatMessageAvatarUser"
     text-align: right;
 }
 
+div[height="480"][data-testid="stVerticalBlockBorderWrapper"] {
+    height: 100%;
+    min-height: 380px;
+}
+
 /* audio quick actions */
 
 div[data-testid="stChatMessage"] div[data-testid="stVerticalBlock"]:has( audio[data-testid="stAudio"]) {
style/normal_window.css CHANGED
@@ -1,7 +1,7 @@
-@media(min-width: 576px) {
+@media(min-width: 800px) {
     div[data-testid="stMainBlockContainer"] {
         padding-left: 5rem;
-        padding-bottom: 5rem;
+        padding-right: 5rem;
     }
 
     div[data-testid="stBottomBlockContainer"] {
style/small_window.css CHANGED
@@ -1,7 +1,7 @@
-@media(max-width: 576px) {
+@media(max-width: 800px) {
     div[data-testid="stMainBlockContainer"] {
         padding-left: 1rem;
-        padding-bottom: 1rem;
+        padding-right: 1rem;
    }
 
     div[data-testid="stMainBlockContainer"] div[data-testid="stVerticalBlock"]>div[data-testid="stElementContainer"]:has( div[data-testid="stHeadingWithActionElements"]) {
@@ -15,10 +15,4 @@
     div[data-testid="stSidebarCollapsedControl"] button[data-testid="stBaseButton-headerNoPadding"]::after {
         content: "More Use Cases"
     }
-}
-
-@media (max-width: 916px) and (max-height: 958px) {
-    div[height="480"][data-testid="stVerticalBlockBorderWrapper"] {
-        height: 380px;
-    }
 }