change retriever
- src/content/agent.py +34 -19
- src/content/common.py +1 -41
- src/retrieval.py +103 -10
src/content/agent.py
CHANGED
@@ -4,12 +4,11 @@ import base64
 import streamlit as st

 from src.generation import MAX_AUDIO_LENGTH
-from src.retrieval import retrieve_relevant_docs
+from src.retrieval import STANDARD_QUERIES, retrieve_relevant_docs
 from src.utils import bytes_to_array, array_to_bytes
 from src.content.common import (
     MODEL_NAMES,
     AUDIO_SAMPLES_W_INSTRUCT,
-    STANDARD_QUERIES,
     DEFAULT_DIALOGUE_STATES,
     init_state_section,
     header_section,
@@ -20,21 +19,25 @@ from src.content.common import (

 LLM_PROMPT_TEMPLATE = """User asked a question about the audio clip.

-## User
+## User Question
 {user_question}

-{audio_information_prompt}Please reply
+{audio_information_prompt}Please reply to user's question with a friendly, accurate, and helpful answer."""
+

 AUDIO_INFO_TEMPLATE = """Here are some information about this audio clip.

 ## Audio Information
 {audio_information}

-
+However, the audio analysis may or may not contain relevant information to the user question, please only reply the user with the relevant information.

 """


+AUDIO_ANALYSIS_STATUS = "Analyzing audio..."
+
+
 def _update_audio(audio_bytes):
     origin_audio_array = bytes_to_array(audio_bytes)
     truncated_audio_array = origin_audio_array[: MAX_AUDIO_LENGTH*16000]
@@ -46,7 +49,7 @@ def _update_audio(audio_bytes):

 @st.fragment
 def successful_example_section():
-    audio_sample_names = [
+    audio_sample_names = [name for name in AUDIO_SAMPLES_W_INSTRUCT.keys() if "Paralinguistic" in name]

     st.markdown(":fire: **Successful Tasks and Examples**")

@@ -61,6 +64,7 @@ def successful_example_section():
             on_select=True,
             ag_messages=[],
             ag_model_messages=[],
+            ag_visited_query_indices=[],
             disprompt=True
         ),
         key='select')
@@ -83,7 +87,12 @@ def audio_attach_dialogue():
         label="**Upload Audio:**",
         label_visibility="collapsed",
         type=['wav', 'mp3'],
-        on_change=lambda: st.session_state.update(
+        on_change=lambda: st.session_state.update(
+            on_upload=True,
+            ag_messages=[],
+            ag_model_messages=[],
+            ag_visited_query_indices=[]
+        ),
         key='upload'
     )

@@ -98,7 +107,12 @@ def audio_attach_dialogue():
     uploaded_file = st.audio_input(
         label="**Record Audio:**",
         label_visibility="collapsed",
-        on_change=lambda: st.session_state.update(
+        on_change=lambda: st.session_state.update(
+            on_record=True,
+            ag_messages=[],
+            ag_model_messages=[],
+            ag_visited_query_indices=[]
+        ),
         key='record'
     )

@@ -132,15 +146,16 @@ def bottom_input_section():


 def _prepare_final_prompt_with_ui(one_time_prompt):
-
-
-
+    with st.spinner("Searching appropriate querys..."):
+        relevant_query_indices = retrieve_relevant_docs(one_time_prompt)
+        if len(st.session_state.ag_messages) <= 2:
+            relevant_query_indices.append(0)

-
-
-
-
-
+    relevant_query_indices = list(
+        set(relevant_query_indices).difference(st.session_state.ag_visited_query_indices)
+    )
+
+    st.session_state.ag_visited_query_indices.extend(relevant_query_indices)

     if not relevant_query_indices:
         return LLM_PROMPT_TEMPLATE.format(
@@ -149,7 +164,7 @@ def _prepare_final_prompt_with_ui(one_time_prompt):
         )

     audio_info = []
-    with st.status(
+    with st.status(AUDIO_ANALYSIS_STATUS, expanded=True) as status:
         for idx in relevant_query_indices:
             error_msg, warnings, response = retrive_response_with_ui(
                 model_name=MODEL_NAMES["with_lora"]["vllm_name"],
@@ -194,7 +209,7 @@ def conversation_section():
             for warning_msg in message.get("warnings", []):
                 st.warning(warning_msg)
             if process := message.get("process", []):
-                with st.status(
+                with st.status(AUDIO_ANALYSIS_STATUS, expanded=True, state="complete"):
                     for proc in process:
                         if proc.get("error"):
                             st.error(proc["error"])
@@ -242,7 +257,7 @@ def conversation_section():

 def agent_page():
     init_state_section()
-    header_section(component_name="
+    header_section(component_name="Chatbot", icon="👥")

     with st.sidebar:
         sidebar_fragment()
src/content/common.py
CHANGED
@@ -302,46 +302,6 @@ AUDIO_SAMPLES_W_INSTRUCT = {
 }


-STANDARD_QUERIES = [
-    {
-        "query_text": "Please transcribe this speech.",
-        "doc_text": "Listen to a speech and write down exactly what is being said in text form. It's essentially converting spoken words into written words. Provide the exact transcription of the given audio. Record whatever the speaker has said into written text.",
-        "response_prefix_text": "The transcription of the speech is: ",
-        "ui_text": "speech trancription"
-    },
-    {
-        "query_text": "Please describe what happended in this audio",
-        "doc_text": "Text captions describing the sound events and environments in the audio clips, describing the events and actions happened in the audio.",
-        "response_prefix_text": "Events in this audio clip: ",
-        "ui_text": "audio caption"
-    },
-    {
-        "query_text": "May I know the gender of the speakers",
-        "doc_text": "Please identify the gender of the speaker. For instance, whether is the speaker male or female.",
-        "response_prefix_text": "By analyzing pitch, formants, harmonics, and prosody features, which reflect physiological and speech pattern differences between genders: ",
-        "ui_text": "gender recognition"
-    },
-    {
-        "query_text": "May I know the nationality of the speakers",
-        "doc_text": "Discover speakers' nationality, country, or the place he is coming from, from his/her accent, pronunciation patterns, and other language-specific speech features influenced by cultural and linguistic backgrounds.",
-        "response_prefix_text": "By analyzing accent, pronunciation patterns, intonation, rhythm, phoneme usage, and language-specific speech features influenced by cultural and linguistic backgrounds: ",
-        "ui_text": "accent recognition"
-    },
-    {
-        "query_text": "Can you guess which ethnic group this person is from based on their accent.",
-        "doc_text": "Discover speakers' ethnic group, home country, or the place he is coming from, from his/her accent, tone, and other vocal characteristics influenced by cultural, regional, and linguistic factors.",
-        "response_prefix_text": "By analyzing speech features like accent, tone, intonation, phoneme variations, and vocal characteristics influenced by cultural, regional, and linguistic factors: ",
-        "ui_text": "accent recognition"
-    },
-    {
-        "query_text": "What do you think the speakers are feeling.",
-        "doc_text": "What do you think the speakers are feeling. Please identify speakers' emotions by analyzing vocal features like pitch, tone, volume, speech rate, rhythm, and spectral energy, which reflect emotional states such as happiness, anger, sadness, or fear.",
-        "response_prefix_text": "By analyzing vocal features like pitch, tone, volume, speech rate, rhythm, and spectral energy: ",
-        "ui_text": "emotion recognition"
-    },
-]
-
-
 def init_state_section():
     st.set_page_config(page_title='MERaLiON-AudioLLM', page_icon = "🔥", layout='wide')

@@ -414,7 +374,7 @@ def header_section(component_name="Playground", icon="🤖"):
 def sidebar_fragment():
     with st.container(height=256, border=False):
         st.page_link("pages/playground.py", disabled=st.session_state.disprompt, label="🚀 Playground")
-        st.page_link("pages/agent.py", disabled=st.session_state.disprompt, label="👥
+        st.page_link("pages/agent.py", disabled=st.session_state.disprompt, label="👥 Chatbot")
         st.page_link("pages/voice_chat.py", disabled=st.session_state.disprompt, label="🗣️ Voice Chat (experimental)")

         st.divider()
src/retrieval.py
CHANGED
@@ -1,20 +1,113 @@
-from typing import
+from typing import List

 import numpy as np
 import streamlit as st
-from FlagEmbedding import
+from FlagEmbedding import BGEM3FlagModel
+
+
+STANDARD_QUERIES = [
+    {
+        "query_text": "Please transcribe this speech.",
+        "doc_text": "Listen to a speech and write down exactly what is being said in text form. It's essentially converting spoken words into written words. Provide the exact transcription of the given audio. Record whatever the speaker has said into written text.",
+        "response_prefix_text": "The transcription of the speech is: ",
+        "ui_text": "speech trancription"
+    },
+    {
+        "query_text": "Please describe what happended in this audio",
+        "doc_text": "Text captions describing the sound events and environments in the audio clips, describing the events and actions happened in the audio.",
+        "response_prefix_text": "Events in this audio clip: ",
+        "ui_text": "audio caption"
+    },
+    {
+        "query_text": "May I know the gender of the speakers",
+        "doc_text": "Identify the gender, male or female, based on pitch, formants, harmonics, and prosody features, and other speech pattern differences between genders.",
+        "response_prefix_text": "By analyzing pitch, formants, harmonics, and prosody features, which reflect physiological and speech pattern differences between genders: ",
+        "ui_text": "gender recognition"
+    },
+    {
+        "query_text": "May I know the nationality of the speakers",
+        "doc_text": "Discover speakers' nationality, country, or the place he is coming from, from his/her accent, pronunciation patterns, and other language-specific speech features influenced by cultural and linguistic backgrounds.",
+        "response_prefix_text": "By analyzing accent, pronunciation patterns, intonation, rhythm, phoneme usage, and language-specific speech features influenced by cultural and linguistic backgrounds: ",
+        "ui_text": "natinoality recognition"
+    },
+    {
+        "query_text": "Can you guess which ethnic group this person is from based on their accent.",
+        "doc_text": "Discover speakers' ethnic group, home country, or the place he is coming from, from his/her accent, tone, and other vocal characteristics influenced by cultural, regional, and linguistic factors.",
+        "response_prefix_text": "By analyzing speech features like accent, tone, intonation, phoneme variations, and vocal characteristics influenced by cultural, regional, and linguistic factors: ",
+        "ui_text": "ethnic group recognition"
+    },
+    {
+        "query_text": "What do you think the speakers are feeling.",
+        "doc_text": "What do you think the speakers are feeling. Please identify speakers' emotions by analyzing vocal features like pitch, tone, volume, speech rate, rhythm, and spectral energy, which reflect emotional states such as happiness, anger, sadness, or fear.",
+        "response_prefix_text": "By analyzing vocal features like pitch, tone, volume, speech rate, rhythm, and spectral energy: ",
+        "ui_text": "emotion recognition"
+    },
+]
+
+
+def _colbert_score(q_reps, p_reps):
+    """Compute colbert scores of input queries and passages.
+
+    Args:
+        q_reps (np.ndarray): Multi-vector embeddings for queries.
+        p_reps (np.ndarray): Multi-vector embeddings for passages/corpus.
+
+    Returns:
+        torch.Tensor: Computed colbert scores.
+    """
+    # q_reps, p_reps = torch.from_numpy(q_reps), torch.from_numpy(p_reps)
+    token_scores = np.einsum('in,jn->ij', q_reps, p_reps)
+    scores = token_scores.max(-1)
+    scores = np.sum(scores) / q_reps.shape[0]
+    return scores
+
+
+class QueryRetriever:
+    def __init__(self, docs):
+        self.model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
+        self.docs = docs
+        self.doc_vectors = self.model.encode(
+            [d["doc_text"] for d in self.docs],
+            return_sparse=True,
+            return_colbert_vecs=True
+        )
+        self.scorer_attrs = {
+            "lexical_weights": {
+                "method": self.model.compute_lexical_matching_score,
+                "weight": 0.2
+            },
+            "colbert_vecs": {
+                "method": _colbert_score,
+                "weight": 0.8
+            },
+        }
+
+    def get_relevant_doc_indices(self, prompt, normalize=False) -> np.ndarray:
+        scores = np.zeros(len(self.docs))
+
+        if not prompt:
+            return scores
+
+        prompt_vector = self.model.encode(
+            prompt,
+            return_sparse=True,
+            return_colbert_vecs=True
+        )
+
+        for scorer_name, scorer_attrs in self.scorer_attrs.items():
+            for i, doc_vec in enumerate(self.doc_vectors[scorer_name]):
+                scores[i] += scorer_attrs["method"](prompt_vector[scorer_name], doc_vec)
+
+        if normalize:
+            scores = scores / np.sum(scores)
+        return scores


 @st.cache_resource()
 def load_retriever():
-
-    reranker.compute_score([["test", "test"]], normalize=True)
-    return reranker
+    return QueryRetriever(docs=STANDARD_QUERIES)


-def retrieve_relevant_docs(user_question
-    scores = st.session_state.retriever.
-
-
-    selected_indices = np.where((np.array(scores) > 0.2) & (normalized_scores > 0.3))[0]
+def retrieve_relevant_docs(user_question: str) -> List[int]:
+    scores = st.session_state.retriever.get_relevant_doc_indices(user_question, normalize=True)
+    selected_indices = np.where(scores > 0.2)[0]
     return selected_indices.tolist()