TheoLvs committed on
Commit
481f3b1
·
1 Parent(s): caf1faa

First commit CQA with Agents

Browse files
.gitignore CHANGED
@@ -5,3 +5,6 @@ __pycache__/utils.cpython-38.pyc
5
 
6
  notebooks/
7
  *.pyc
 
 
 
 
5
 
6
  notebooks/
7
  *.pyc
8
+
9
+ **/.ipynb_checkpoints/
10
+ **/.flashrank_cache/
app.py CHANGED
@@ -29,16 +29,16 @@ from utils import create_user_id
29
 
30
  # ClimateQ&A imports
31
  from climateqa.engine.llm import get_llm
32
- from climateqa.engine.rag import make_rag_chain
33
  from climateqa.engine.vectorstore import get_pinecone_vectorstore
34
  from climateqa.engine.retriever import ClimateQARetriever
35
  from climateqa.engine.embeddings import get_embeddings_function
36
- from climateqa.engine.prompts import audience_prompts
37
  from climateqa.sample_questions import QUESTIONS
38
  from climateqa.constants import POSSIBLE_REPORTS
39
  from climateqa.utils import get_image_from_azure_blob_storage
40
  from climateqa.engine.keywords import make_keywords_chain
41
- from climateqa.engine.rag import make_rag_papers_chain
42
 
43
  # Load environment variables in local mode
44
  try:
 
29
 
30
  # ClimateQ&A imports
31
  from climateqa.engine.llm import get_llm
32
+ from climateqa.engine.chains.answer_rag import make_rag_chain
33
  from climateqa.engine.vectorstore import get_pinecone_vectorstore
34
  from climateqa.engine.retriever import ClimateQARetriever
35
  from climateqa.engine.embeddings import get_embeddings_function
36
+ from climateqa.engine.chains.prompts import audience_prompts
37
  from climateqa.sample_questions import QUESTIONS
38
  from climateqa.constants import POSSIBLE_REPORTS
39
  from climateqa.utils import get_image_from_azure_blob_storage
40
  from climateqa.engine.keywords import make_keywords_chain
41
+ from climateqa.engine.chains.answer_rag import make_rag_papers_chain
42
 
43
  # Load environment variables in local mode
44
  try:
climateqa/engine/chains/__init__.py ADDED
File without changes
climateqa/engine/chains/answer_ai_impact.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.prompts import ChatPromptTemplate
2
+ from langchain_core.output_parsers import StrOutputParser
3
+
4
+
5
+ prompt_template = """
6
+ You are ClimateQ&A, an helpful AI Assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports.
7
+ Always stay true to climate and nature science and do not make up information.
8
+ If you do not know the answer, just say you do not know.
9
+
10
+ ## Guidelines
11
+ - Explain that the environmental impact of AI is not covered by the IPCC or IPBES reports, but you can recommend info based on the sources below
12
+ - Answer the question in the original language of the question
13
+
14
+ ## Sources
15
+ - You can propose to visit this page https://climateqa.com/docs/carbon-footprint/ to learn more about ClimateQ&A's own carbon footprint
16
+ - You can recommend looking at the work of the AI & climate expert scientist Sasha Luccioni, in particular these papers
17
+ - Power Hungry Processing: Watts Driving the Cost of AI Deployment? - https://arxiv.org/abs/2311.16863 - about the carbon footprint at the inference stage of AI models
18
+ - Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning - https://arxiv.org/abs/2302.08476
19
+ - Estimating the Carbon Footprint of BLOOM, a 176B Parameter Language Model - https://arxiv.org/abs/2211.02001 - about the carbon footprint of training a large language model
20
+ - You can also recommend the following tools to calculate the carbon footprint of AI models
21
+ - CodeCarbon - https://github.com/mlco2/codecarbon to measure the carbon footprint of your code
22
+ - Ecologits - https://ecologits.ai/ to measure the carbon footprint of using LLM APIs
23
+ """
24
+
25
+
26
def make_ai_impact_chain(llm):
    """Build the chain that answers questions about the environmental impact of AI.

    The module-level ``prompt_template`` is the system message and the user's
    question fills the ``{question}`` placeholder. The prompt is piped into
    ``llm`` and the model output into a ``StrOutputParser``.

    Args:
        llm: the model runnable placed after the prompt in the pipeline.

    Returns:
        The composed chain, configured with the run name ``"ai_impact_chain"``.
    """
    messages = [
        ("system", prompt_template),
        ("user", "{question}"),
    ]
    pipeline = ChatPromptTemplate.from_messages(messages) | llm | StrOutputParser()
    return pipeline.with_config({"run_name":"ai_impact_chain"})
37
+
38
def make_ai_impact_node(llm):
    """Create the async graph node answering questions about the impact of AI.

    Args:
        llm: the model runnable handed to ``make_ai_impact_chain``.

    Returns:
        An async callable ``answer_ai_impact(state, config)`` that sends
        ``state["user_input"]`` through the chain and returns
        ``{"answer": <chain output>}``.
    """
    chain = make_ai_impact_chain(llm)

    async def answer_ai_impact(state,config):
        # The raw user input is used as the question; `config` is forwarded to the chain.
        payload = {"question":state["user_input"]}
        return {"answer":await chain.ainvoke(payload,config)}

    return answer_ai_impact
climateqa/engine/chains/answer_chitchat.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.prompts import ChatPromptTemplate
2
+ from langchain_core.output_parsers import StrOutputParser
3
+
4
+
5
+ chitchat_prompt_template = """
6
+ You are ClimateQ&A, an helpful AI Assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports.
7
+ Always stay true to climate and nature science and do not make up information.
8
+ If you do not know the answer, just say you do not know.
9
+
10
+ ## Guidelines
11
+ - If it's a conversational question, you can normally chat with the user
12
+ - If the question is not related to any topic about the environment, refuse to answer and politely ask the user to ask another question about the environment
13
+ - If the user ask if you speak any language, you can say you speak all languages :)
14
+ - If the user ask about the bot itself "ClimateQ&A", you can explain that you are an AI assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports and propose to visit the website here https://climateqa.com/docs/intro/ for more information
15
+ - If the question is about ESG regulations, standards, or frameworks like the CSRD, TCFD, SASB, GRI, CDP, etc., you can explain that this is not a topic covered by the IPCC or IPBES reports.
16
+ - Precise that you are specialized in finding trustworthy information from the scientific reports of the IPCC and IPBES and other scientific litterature
17
+ - If relevant you can propose up to 3 example of questions they could ask from the IPCC or IPBES reports from the examples below
18
+ - Always answer in the original language of the question
19
+
20
+ ## Examples of questions you can suggest (in the original language of the question)
21
+ "What evidence do we have of climate change?",
22
+ "Are human activities causing global warming?",
23
+ "What are the impacts of climate change?",
24
+ "Can climate change be reversed?",
25
+ "What is the difference between climate change and global warming?",
26
+ """
27
+
28
+
29
def make_chitchat_chain(llm):
    """Build the chain that handles conversational messages.

    The module-level ``chitchat_prompt_template`` is the system message and
    the user's message fills the ``{question}`` placeholder. The prompt is
    piped into ``llm`` and the model output into a ``StrOutputParser``.

    Args:
        llm: the model runnable placed after the prompt in the pipeline.

    Returns:
        The composed chain, configured with the run name ``"chitchat_chain"``.
    """
    messages = [
        ("system", chitchat_prompt_template),
        ("user", "{question}"),
    ]
    pipeline = ChatPromptTemplate.from_messages(messages) | llm | StrOutputParser()
    return pipeline.with_config({"run_name":"chitchat_chain"})
40
+
41
+
42
+
43
def make_chitchat_node(llm):
    """Create the async graph node that answers conversational messages.

    Args:
        llm: the model runnable handed to ``make_chitchat_chain``.

    Returns:
        An async callable ``answer_chitchat(state, config)`` that sends
        ``state["user_input"]`` through the chain and returns
        ``{"answer": <chain output>}``.
    """
    chain = make_chitchat_chain(llm)

    async def answer_chitchat(state,config):
        # The raw user input is used as the question; `config` is forwarded to the chain.
        payload = {"question":state["user_input"]}
        return {"answer":await chain.ainvoke(payload,config)}

    return answer_chitchat
52
+
climateqa/engine/{rag.py → chains/answer_rag.py} RENAMED
@@ -6,7 +6,7 @@ from langchain_core.runnables import RunnablePassthrough, RunnableLambda, Runnab
6
  from langchain_core.prompts.prompt import PromptTemplate
7
  from langchain_core.prompts.base import format_document
8
 
9
- from climateqa.engine.reformulation import make_reformulation_chain
10
  from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
11
  from climateqa.engine.prompts import papers_prompt_template
12
  from climateqa.engine.utils import pass_values, flatten_dict,prepare_chain,rename_chain
@@ -131,4 +131,14 @@ def make_illustration_chain(llm):
131
  }
132
 
133
  illustration_chain = input_description_images | prompt_with_images | llm | StrOutputParser()
134
- return illustration_chain
 
 
 
 
 
 
 
 
 
 
 
6
  from langchain_core.prompts.prompt import PromptTemplate
7
  from langchain_core.prompts.base import format_document
8
 
9
+ from climateqa.engine.chains.reformulation import make_reformulation_chain
10
  from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
11
  from climateqa.engine.prompts import papers_prompt_template
12
  from climateqa.engine.utils import pass_values, flatten_dict,prepare_chain,rename_chain
 
131
  }
132
 
133
  illustration_chain = input_description_images | prompt_with_images | llm | StrOutputParser()
134
+ return illustration_chain
135
+
136
+
137
def make_answer_rag_node(llm):
    """Create the graph node that produces the answer for the RAG branch.

    The node does not call a model: it returns the ``"question"`` value of
    every entry of ``state["questions"]``, joined with newlines.

    Args:
        llm: accepted for signature parity with the other node factories;
            it is not used by this implementation.

    Returns:
        A callable ``answer_rag(state)`` returning ``{"answer": <joined questions>}``.
        A state without ``"questions"`` (or with an empty list) yields an
        empty answer instead of raising ``KeyError``.
    """

    def answer_rag(state):
        # Tolerate a state where the query-transform step did not run or found nothing.
        questions = state.get("questions") or []
        answer = "\n".join(x["question"] for x in questions)
        return {"answer":answer}

    return answer_rag
climateqa/engine/chains/intent_routing.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from langchain_core.pydantic_v1 import BaseModel, Field
3
+ from typing import List
4
+ from typing import Literal
5
+ from langchain.prompts import ChatPromptTemplate
6
+ from langchain_core.utils.function_calling import convert_to_openai_function
7
+ from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
8
+
9
+
10
+ class IntentRouter(BaseModel):
11
+ """Analyzing the user message input"""
12
+
13
+ language: str = Field(
14
+ description="Find the language of the message input in full words (ex: French, English, Spanish, ...), defaults to English",
15
+ default="English",
16
+ )
17
+ intent: str = Field(
18
+ enum=[
19
+ "ai_impact",
20
+ "geo_info",
21
+ "esg",
22
+ "search",
23
+ "chitchat",
24
+ ],
25
+ description="""
26
+ Categorize the user input in one of the following category
27
+ Any question
28
+
29
+ Examples:
30
+ - ai_impact = Environmental impacts of AI: "What are the environmental impacts of AI", "How does AI affect the environment"
31
+ - geo_info = Geolocated info about climate change: Any question where the user wants to know localized impacts of climate change, eg: "What will be the temperature in Marseille in 2050"
32
+ - esg = Any question about the ESG regulation, frameworks and standards like the CSRD, TCFD, SASB, GRI, CDP, etc.
33
+ - search = Searching for any question about climate change, energy, biodiversity, nature, and everything we can find in the IPCC or IPBES reports or scientific papers,
34
+ - chitchat = Any general question that is not related to the environment or climate change or just conversational, or if you don't think searching the IPCC or IPBES reports would be relevant
35
+ """,
36
+ )
37
+
38
+
39
+
40
def make_intent_router_chain(llm):
    """Build the chain that classifies a user message with the ``IntentRouter`` schema.

    ``IntentRouter`` is converted to an OpenAI function description and bound
    to ``llm`` with ``function_call`` naming that function. The function-call
    output is parsed by ``JsonOutputFunctionsParser``.

    Args:
        llm: the model runnable; it must accept ``functions`` / ``function_call``
            through ``bind``.

    Returns:
        A chain taking ``{"input": <message>}``.
    """
    router_function = convert_to_openai_function(IntentRouter)
    llm_with_router = llm.bind(
        functions = [router_function],
        function_call = {"name":"IntentRouter"},
    )

    messages = [
        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
        ("user", "input: {input}"),
    ]

    return ChatPromptTemplate.from_messages(messages) | llm_with_router | JsonOutputFunctionsParser()
52
+
53
+
54
def make_intent_router_node(llm):
    """Create the graph node that routes a user message.

    Args:
        llm: the model runnable handed to ``make_intent_router_chain``.

    Returns:
        A callable ``route_input_message(state)`` that runs the router chain on
        ``state["user_input"]`` and returns the chain output, with
        ``"language"`` defaulted to ``"English"`` when missing and ``"query"``
        set to the raw user input.
    """
    chain = make_intent_router_chain(llm)

    def route_input_message(state):
        user_input = state["user_input"]
        routed = chain.invoke({"input":user_input})
        # The model may omit the language; fall back to English.
        routed.setdefault("language","English")
        routed["query"] = user_input
        return routed

    return route_input_message
65
+
66
+
67
+
68
+
69
+ # SAMPLE_QUESTIONS = [
70
+ # "Est-ce que l'IA a un impact sur l'environnement ?",
71
+ # "Que dit le GIEC sur l'impact de l'IA",
72
+ # "Qui sont les membres du GIEC",
73
+ # "What is the impact of El Nino ?",
74
+ # "Yo",
75
+ # "Hello ça va bien ?",
76
+ # "Par qui as tu été créé ?",
77
+ # "What role do cloud formations play in modulating the Earth's radiative balance, and how are they represented in current climate models?",
78
+ # "Which industries have the highest GHG emissions?",
79
+ # "What are invasive alien species and how do they threaten biodiversity and ecosystems?",
80
+ # "Are human activities causing global warming?",
81
+ # "What is the motivation behind mining the deep seabed?",
82
+ # "Tu peux m'écrire un poème sur le changement climatique ?",
83
+ # "Tu peux m'écrire un poème sur les bonbons ?",
84
+ # "What will be the temperature in 2100 in Strasbourg?",
85
+ # "C'est quoi le lien entre biodiversity and changement climatique ?",
86
+ # ]
climateqa/engine/{prompts.py → chains/prompts.py} RENAMED
File without changes
climateqa/engine/chains/query_transform.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ from langchain_core.pydantic_v1 import BaseModel, Field
4
+ from typing import List
5
+ from typing import Literal
6
+ from langchain.prompts import ChatPromptTemplate
7
+ from langchain_core.utils.function_calling import convert_to_openai_function
8
+ from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
9
+
10
+
11
+ # Prompt from the original paper https://arxiv.org/pdf/2305.14283
12
+ # Query Rewriting for Retrieval-Augmented Large Language Models
13
+ class QueryDecomposition(BaseModel):
14
+ """
15
+ Decompose the user query into smaller parts to think step by step to answer this question
16
+ Act as a simple planning agent
17
+ """
18
+
19
+ questions: List[str] = Field(
20
+ description="""
21
+ Think step by step to answer this question, and provide one or several search engine questions in English for knowledge that you need.
22
+ Suppose that the user is looking for information about climate change, energy, biodiversity, nature, and everything we can find the IPCC reports and scientific literature
23
+ - If it's already a standalone question, you don't need to provide more questions, just reformulate it if relevant as a better question for a search engine
24
+ - If you need to decompose the question, output a list of maximum 3 questions
25
+ """
26
+ )
27
+
28
+
29
class Location(BaseModel):
    # Structured place reference: a country plus a more specific place.
    # Both fields use `...` as default, i.e. they are required.
    # NOTE(review): in this module the only reference to Location is a
    # commented-out `location:Location` field on QueryAnalysis, so the model
    # looks unused here - confirm before relying on it.
    country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, adresses), ex: France, USA, ...")
    location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
32
+
33
+ class QueryAnalysis(BaseModel):
34
+ """
35
+ Analyzing the user query to extract topics, sources and date
36
+ Also do query expansion to get alternative search queries
37
+ Also provide simple keywords to feed a search engine
38
+ """
39
+
40
+ # keywords: List[str] = Field(
41
+ # description="""
42
+ # Extract the keywords from the user query to feed a search engine as a list
43
+ # Maximum 3 keywords
44
+
45
+ # Examples:
46
+ # - "What is the impact of deep sea mining ?" -> deep sea mining
47
+ # - "How will El Nino be impacted by climate change" -> el nino;climate change
48
+ # - "Is climate change a hoax" -> climate change;hoax
49
+ # """
50
+ # )
51
+
52
+ # alternative_queries: List[str] = Field(
53
+ # description="""
54
+ # Generate alternative search questions from the user query to feed a search engine
55
+ # """
56
+ # )
57
+
58
+ # step_back_question: str = Field(
59
+ # description="""
60
+ # You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer.
61
+ # This questions should help you get more context and information about the user query
62
+ # """
63
+ # )
64
+
65
+ sources: List[Literal["IPCC", "IPBES", "IPOS","OpenAlex"]] = Field(
66
+ ...,
67
+ description="""
68
+ Given a user question choose which documents would be most relevant for answering their question,
69
+ - IPCC is for questions about climate change, energy, impacts, and everything we can find the IPCC reports
70
+ - IPBES is for questions about biodiversity and nature
71
+ - IPOS is for questions about the ocean and deep sea mining
72
+ - OpenAlex is for any other questions that are not in the previous categories but could be found in the scientific litterature
73
+ """,
74
+ )
75
+ # topics: List[Literal[
76
+ # "Climate change",
77
+ # "Biodiversity",
78
+ # "Energy",
79
+ # "Decarbonization",
80
+ # "Climate science",
81
+ # "Nature",
82
+ # "Climate policy and justice",
83
+ # "Oceans",
84
+ # "Deep sea mining",
85
+ # "ESG and regulations",
86
+ # "CSRD",
87
+ # ]] = Field(
88
+ # ...,
89
+ # description = """
90
+ # Choose the topics that are most relevant to the user query, ex: Climate change, Energy, Biodiversity, ...
91
+ # """,
92
+ # )
93
+ # date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
94
+ # location:Location
95
+
96
+
97
def make_query_decomposition_chain(llm):
    """Build the chain that decomposes a query with the ``QueryDecomposition`` schema.

    ``QueryDecomposition`` is converted to an OpenAI function description and
    bound to ``llm`` with ``function_call`` naming that function. The
    function-call output is parsed by ``JsonOutputFunctionsParser``.

    Args:
        llm: the model runnable; it must accept ``functions`` / ``function_call``
            through ``bind``.

    Returns:
        A chain taking ``{"input": <query>}``.
    """
    decomposition_function = convert_to_openai_function(QueryDecomposition)
    llm_with_functions = llm.bind(
        functions = [decomposition_function],
        function_call = {"name":"QueryDecomposition"},
    )

    messages = [
        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
        ("user", "input: {input}"),
    ]

    return ChatPromptTemplate.from_messages(messages) | llm_with_functions | JsonOutputFunctionsParser()
109
+
110
+
111
def make_query_rewriter_chain(llm):
    """Build the chain that analyses a question with the ``QueryAnalysis`` schema.

    ``QueryAnalysis`` is converted to an OpenAI function description and bound
    to ``llm`` with ``function_call`` naming that function. The function-call
    output is parsed by ``JsonOutputFunctionsParser``.

    Args:
        llm: the model runnable; it must accept ``functions`` / ``function_call``
            through ``bind``.

    Returns:
        A chain taking ``{"input": <question>}``.
    """
    analysis_function = convert_to_openai_function(QueryAnalysis)
    llm_with_functions = llm.bind(
        functions = [analysis_function],
        function_call = {"name":"QueryAnalysis"},
    )

    messages = [
        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
        ("user", "input: {input}"),
    ]

    return ChatPromptTemplate.from_messages(messages) | llm_with_functions | JsonOutputFunctionsParser()
126
+
127
+
128
def make_query_transform_node(llm):
    """Create the graph node that decomposes the query and analyses each sub-question.

    Args:
        llm: the model runnable handed to both ``make_query_decomposition_chain``
            and ``make_query_rewriter_chain``.

    Returns:
        A callable ``transform_query(state)`` that reads ``state["query"]`` and
        returns the decomposition output with ``"questions"`` replaced by a list
        of dicts, each holding ``"question"`` plus whatever the analysis chain
        returned for that question.
    """

    decomposition_chain = make_query_decomposition_chain(llm)
    rewriter_chain = make_query_rewriter_chain(llm)

    def transform_query(state):

        query = state["query"]
        new_state = {}

        # Decomposition
        decomposition_output = decomposition_chain.invoke({"input":query})
        new_state.update(decomposition_output)

        # The schema tells the model it need not add questions for a standalone
        # query, so the list may be missing or empty. Fall back to the query
        # itself so downstream nodes always get at least one question.
        sub_questions = new_state.get("questions") or [query]

        # Query Analysis
        questions = []
        for question in sub_questions:
            question_state = {"question":question}
            analysis_output = rewriter_chain.invoke({"input":question})
            question_state.update(analysis_output)
            questions.append(question_state)
        new_state["questions"] = questions

        return new_state

    return transform_query
climateqa/engine/{reformulation.py → chains/reformulation.py} RENAMED
@@ -3,7 +3,7 @@ from langchain.output_parsers.structured import StructuredOutputParser, Response
3
  from langchain_core.prompts import PromptTemplate
4
  from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
5
 
6
- from climateqa.engine.prompts import reformulation_prompt_template
7
  from climateqa.engine.utils import pass_values, flatten_dict
8
 
9
 
 
3
  from langchain_core.prompts import PromptTemplate
4
  from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
5
 
6
+ from climateqa.engine.chains.prompts import reformulation_prompt_template
7
  from climateqa.engine.utils import pass_values, flatten_dict
8
 
9
 
climateqa/engine/chains/retriever.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ from contextlib import contextmanager
4
+
5
+ from ..reranker import rerank_docs
6
+ from ..retriever import ClimateQARetriever
7
+
8
+
9
+
10
def divide_into_parts(target, parts):
    """Split ``target`` into ``parts`` integers that differ by at most one.

    The first ``target % parts`` entries get ``target // parts + 1`` and the
    rest get ``target // parts``, so the entries sum to ``target``.

    Args:
        target: the integer total to distribute.
        parts: how many entries to produce.

    Returns:
        A list of ``parts`` integers; an empty list when ``parts`` is zero or
        negative (instead of raising ``ZeroDivisionError`` for zero).

    >>> divide_into_parts(15, 4)
    [4, 4, 4, 3]
    """
    if parts <= 0:
        return []

    base, remainder = divmod(target, parts)
    return [base + 1 if i < remainder else base for i in range(parts)]
27
+
28
+
29
@contextmanager
def suppress_output():
    """Context manager that silences ``sys.stdout`` and ``sys.stderr``.

    Inside the ``with`` block both streams point at the null device. The
    previous stream objects are restored on exit, including when the block
    raises. Only the Python-level ``sys.stdout`` / ``sys.stderr`` objects are
    swapped, and the swap is process-wide.
    """
    # Local import keeps this block self-contained; the redirect helpers
    # perform the save/replace/restore of the stream objects.
    from contextlib import redirect_stderr, redirect_stdout

    with open(os.devnull, 'w') as devnull:
        with redirect_stdout(devnull), redirect_stderr(devnull):
            yield
45
+
46
+
47
+
48
def make_retriever_node(vectorstore,reranker):
    """Create the graph node that retrieves and reranks documents for every sub-question.

    Args:
        vectorstore: the vector store handed to each ``ClimateQARetriever``.
        reranker: the reranker handed to ``rerank_docs``.

    Returns:
        A callable ``retrieve_documents(state)``. It reads ``state["questions"]``
        (dicts with a ``"question"`` key and, in auto mode, a ``"sources"`` key)
        and the optional ``state["sources_input"]``. It returns
        ``{"documents": docs}`` with at most ``k_final`` documents sorted by
        descending ``rerank_score``. An empty question list yields
        ``{"documents": []}``.
    """

    POSSIBLE_SOURCES = ["IPCC","IPBES","IPOS","OpenAlex"]

    # Constants
    k_final = 15
    k_before_reranking = 100
    k_summary = 5
    rerank_by_question = True

    def retrieve_documents(state):

        questions = state["questions"]

        # Nothing to search for; this also avoids splitting k_final into zero parts.
        if not questions:
            return {"documents":[]}

        # Use sources from the user input or from the LLM detection
        sources_input = state["sources_input"] if "sources_input" in state else ["auto"]
        # A bare string would be substring-matched by `in` and forwarded as a
        # string of characters; treat it as a single source name.
        if isinstance(sources_input, str):
            sources_input = [sources_input]
        auto_mode = "auto" in sources_input

        # There are several options to get the final top k
        # Option 1 - Get 100 documents by question and rerank by question
        # Option 2 - Get 100/n documents by question and rerank the total
        if rerank_by_question:
            k_by_question = divide_into_parts(k_final,len(questions))

        docs = []

        for i,q in enumerate(questions):

            question = q["question"]

            # If auto mode, we use the sources detected by the LLM,
            # restricted to the ones we know about
            if auto_mode:
                sources = [x for x in q["sources"] if x in POSSIBLE_SOURCES]

            # Otherwise, we use the config (the detected sources are not needed)
            else:
                sources = sources_input

            # Search the document store using the retriever
            # Configure high top k for further reranking step
            retriever = ClimateQARetriever(
                vectorstore=vectorstore,
                sources = sources,
                # reports = ias_reports,
                min_size = 200,
                k_summary = k_summary,k_total = k_before_reranking,
                threshold = 0.5,
            )
            docs_question = retriever.get_relevant_documents(question)

            # Rerank
            with suppress_output():
                docs_question = rerank_docs(reranker,docs_question,question)

            # If rerank by question we select the top documents for each question
            if rerank_by_question:
                docs_question = docs_question[:k_by_question[i]]

            # Add sources used in the metadata
            for doc in docs_question:
                doc.metadata["sources_used"] = sources

            # Add to the list of docs
            docs.extend(docs_question)

        # Sorting the list in descending order by rerank_score
        # Then select the top k
        docs = sorted(docs, key=lambda x: x.metadata["rerank_score"], reverse=True)
        docs = docs[:k_final]

        new_state = {"documents":docs}
        return new_state

    return retrieve_documents
122
+
climateqa/engine/chains/sample_router.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # from typing import List
3
+ # from typing import Literal
4
+ # from langchain.prompts import ChatPromptTemplate
5
+ # from langchain_core.utils.function_calling import convert_to_openai_function
6
+ # from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
7
+
8
+ # # https://livingdatalab.com/posts/2023-11-05-openai-function-calling-with-langchain.html
9
+
10
+ # class Location(BaseModel):
11
+ # country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, adresses), ex: France, USA, ...")
12
+ # location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
13
+
14
+ # class QueryAnalysis(BaseModel):
15
+ # """Analyzing the user query"""
16
+
17
+ # language: str = Field(
18
+ # description="Find the language of the query in full words (ex: French, English, Spanish, ...), defaults to English"
19
+ # )
20
+ # intent: str = Field(
21
+ # enum=[
22
+ # "Environmental impacts of AI",
23
+ # "Geolocated info about climate change",
24
+ # "Climate change",
25
+ # "Biodiversity",
26
+ # "Deep sea mining",
27
+ # "Chitchat",
28
+ # ],
29
+ # description="""
30
+ # Categorize the user query in one of the following category,
31
+
32
+ # Examples:
33
+ # - Geolocated info about climate change: "What will be the temperature in Marseille in 2050"
34
+ # - Climate change: "What is radiative forcing", "How much will
35
+ # """,
36
+ # )
37
+ # sources: List[Literal["IPCC", "IPBES", "IPOS"]] = Field(
38
+ # ...,
39
+ # description="""
40
+ # Given a user question choose which documents would be most relevant for answering their question,
41
+ # - IPCC is for questions about climate change, energy, impacts, and everything we can find the IPCC reports
42
+ # - IPBES is for questions about biodiversity and nature
43
+ # - IPOS is for questions about the ocean and deep sea mining
44
+
45
+ # """,
46
+ # )
47
+ # date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
48
+ # location:Location
49
+ # # query: str = Field(
50
+ # # description = """
51
+ # # Translate to english and reformulate the following user message to be a short standalone question, in the context of an educational discussion about climate change.
52
+ # # The reformulated question will used in a search engine
53
+ # # By default, assume that the user is asking information about the last century,
54
+ # # Use the following examples
55
+
56
+ # # ### Examples:
57
+ # # La technologie nous sauvera-t-elle ? -> Can technology help humanity mitigate the effects of climate change?
58
+ # # what are our reserves in fossil fuel? -> What are the current reserves of fossil fuels and how long will they last?
59
+ # # what are the main causes of climate change? -> What are the main causes of climate change in the last century?
60
+
61
+ # # Question in English:
62
+ # # """
63
+ # # )
64
+
65
+ # openai_functions = [convert_to_openai_function(QueryAnalysis)]
66
+ # llm2 = llm.bind(functions = openai_functions,function_call={"name":"QueryAnalysis"})
climateqa/engine/chains/translation.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from langchain_core.pydantic_v1 import BaseModel, Field
3
+ from typing import List
4
+ from typing import Literal
5
+ from langchain.prompts import ChatPromptTemplate
6
+ from langchain_core.utils.function_calling import convert_to_openai_function
7
+ from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
8
+
9
+
10
class Translation(BaseModel):
    """Analyzing the user message input"""
    # Schema with a single string field, `translation`, described as the
    # English translation of the message. make_translation_chain converts this
    # class with convert_to_openai_function and forces the model to call it.
    # NOTE(review): the docstring above is presumably forwarded to the model as
    # the function description and reads like a copy of IntentRouter's -
    # confirm whether it should mention translation instead.

    translation: str = Field(
        description="Translate the message input to English",
    )
16
+
17
+
18
def make_translation_chain(llm):
    """Build the chain that translates a user message with the ``Translation`` schema.

    ``Translation`` is converted to an OpenAI function description and bound to
    ``llm`` with ``function_call`` naming that function. The function-call
    output is parsed by ``JsonOutputFunctionsParser``.

    Args:
        llm: the model runnable; it must accept ``functions`` / ``function_call``
            through ``bind``.

    Returns:
        A chain taking ``{"input": <message>}``.
    """
    translation_function = convert_to_openai_function(Translation)
    llm_with_functions = llm.bind(
        functions = [translation_function],
        function_call = {"name":"Translation"},
    )

    messages = [
        ("system", "You are a helpful assistant, you will translate the user input message to English using the function provided"),
        ("user", "input: {input}"),
    ]

    return ChatPromptTemplate.from_messages(messages) | llm_with_functions | JsonOutputFunctionsParser()
30
+
31
+
32
def make_translation_node(llm):
    """Create the graph node that translates the user input.

    Args:
        llm: the model runnable handed to ``make_translation_chain``.

    Returns:
        A callable ``translate_query(state)`` that runs the translation chain on
        ``state["user_input"]`` and returns ``{"query": <"translation" value>}``.
    """
    chain = make_translation_chain(llm)

    def translate_query(state):
        result = chain.invoke({"input":state["user_input"]})
        return {"query":result["translation"]}

    return translate_query
climateqa/engine/embeddings.py CHANGED
@@ -2,7 +2,7 @@
2
  from langchain_community.embeddings import HuggingFaceBgeEmbeddings
3
  from langchain_community.embeddings import HuggingFaceEmbeddings
4
 
5
- def get_embeddings_function(version = "v1.2"):
6
 
7
  if version == "v1.2":
8
 
@@ -10,12 +10,12 @@ def get_embeddings_function(version = "v1.2"):
10
  # Best embedding model at a reasonable size at the moment (2023-11-22)
11
 
12
  model_name = "BAAI/bge-base-en-v1.5"
13
- encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
14
  print("Loading embeddings model: ", model_name)
15
  embeddings_function = HuggingFaceBgeEmbeddings(
16
  model_name=model_name,
17
  encode_kwargs=encode_kwargs,
18
- query_instruction="Represent this sentence for searching relevant passages: "
19
  )
20
 
21
  else:
@@ -23,3 +23,6 @@ def get_embeddings_function(version = "v1.2"):
23
  embeddings_function = HuggingFaceEmbeddings(model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1")
24
 
25
  return embeddings_function
 
 
 
 
2
  from langchain_community.embeddings import HuggingFaceBgeEmbeddings
3
  from langchain_community.embeddings import HuggingFaceEmbeddings
4
 
5
+ def get_embeddings_function(version = "v1.2",query_instruction = "Represent this sentence for searching relevant passages: "):
6
 
7
  if version == "v1.2":
8
 
 
10
  # Best embedding model at a reasonable size at the moment (2023-11-22)
11
 
12
  model_name = "BAAI/bge-base-en-v1.5"
13
+ encode_kwargs = {'normalize_embeddings': True,"show_progress_bar":False} # set True to compute cosine similarity
14
  print("Loading embeddings model: ", model_name)
15
  embeddings_function = HuggingFaceBgeEmbeddings(
16
  model_name=model_name,
17
  encode_kwargs=encode_kwargs,
18
+ query_instruction=query_instruction,
19
  )
20
 
21
  else:
 
23
  embeddings_function = HuggingFaceEmbeddings(model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1")
24
 
25
  return embeddings_function
26
+
27
+
28
+
climateqa/engine/graph.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ from contextlib import contextmanager
4
+
5
+ from langchain.schema import Document
6
+ from langgraph.graph import END, StateGraph
7
+ from typing_extensions import TypedDict
8
+ from typing import List
9
+
10
+ from .chains.answer_chitchat import make_chitchat_node
11
+ from .chains.answer_ai_impact import make_ai_impact_node
12
+ from .chains.query_transform import make_query_transform_node
13
+ from .chains.translation import make_translation_node
14
+ from .chains.intent_routing import make_intent_router_node
15
+
16
+
17
class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Each node reads some of these keys and returns a dict with the keys it
    updates.
    """
    # Raw message typed by the user; read by the routing, translation,
    # chitchat and AI-impact nodes.
    user_input : str
    # Language of the message; the intent router defaults it to "English".
    language : str
    # Intent category from the intent router's schema.
    intent : str
    # Query used for retrieval: the raw input (intent router) or its English
    # translation (translation node).
    query: str
    # One dict per sub-question, written by the query-transform node:
    # "question" plus the analysis output for that question.
    questions : List[dict]
    # Final text returned to the user, written by the answer nodes.
    answer: str
    # NOTE(review): not read by any node visible in this commit - presumably
    # the target audience for the answer prompt; confirm against the caller.
    audience: str
    # Source names selected by the user; the retriever tests membership with
    # `"auto" in sources_input` and defaults to ["auto"] when the key is absent,
    # so this is a list of names rather than a single string.
    sources_input: List[str]
    # Documents kept by the retriever node after reranking.
    documents: List[Document]
30
+
31
def search(state):
    """Graph node that leaves the state untouched: it returns an empty update."""
    no_update = {}
    return no_update
climateqa/engine/reranker.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from scipy.special import expit, logit
3
+ from rerankers import Reranker
4
+
5
+
6
def get_reranker(model = "nano",cohere_api_key = None):
    """Instantiate a ``rerankers.Reranker`` for the requested size.

    Args:
        model: one of ``"nano"``, ``"tiny"``, ``"small"`` or ``"large"``.
            ``"nano"`` and ``"tiny"`` use flashrank models, ``"small"`` a
            cross-encoder and ``"large"`` the Cohere API.
        cohere_api_key: key for the ``"large"`` model; when ``None`` it is read
            from the ``COHERE_API_KEY`` environment variable.

    Returns:
        The configured ``Reranker`` instance.

    Raises:
        ValueError: if ``model`` is not one of the supported names. This is an
            explicit check rather than an ``assert`` so it still runs under
            ``python -O``.
        KeyError: if ``model == "large"``, no key is passed and
            ``COHERE_API_KEY`` is not set.
    """
    supported_models = ("nano","tiny","small","large")
    if model not in supported_models:
        raise ValueError(
            f"Unknown reranker model {model!r}, expected one of {supported_models}"
        )

    if model == "nano":
        reranker = Reranker('ms-marco-TinyBERT-L-2-v2', model_type='flashrank')
    elif model == "tiny":
        reranker = Reranker('ms-marco-MiniLM-L-12-v2', model_type='flashrank')
    elif model == "small":
        reranker = Reranker("mixedbread-ai/mxbai-rerank-xsmall-v1", model_type='cross-encoder')
    else:
        # model == "large"
        if cohere_api_key is None:
            cohere_api_key = os.environ["COHERE_API_KEY"]
        reranker = Reranker("cohere", lang='en', api_key = cohere_api_key)
    return reranker
21
+
22
+
23
+
24
def rerank_docs(reranker,docs,query):
    """Rerank documents against ``query`` and annotate them with their score.

    Args:
        reranker: object exposing ``rank(query=..., docs=...)`` whose result has
            a ``results`` list; each entry carries ``document.doc_id`` (used as
            an index into ``docs``) and ``score``.
        docs: documents exposing ``page_content`` and a mutable ``metadata`` dict.
        query: the text the documents are ranked against.

    Returns:
        The same document objects in the order given by the reranker. Each
        document's ``metadata`` is updated in place with ``"rerank_score"`` and
        ``"query_used_for_retrieval"``. An empty ``docs`` returns ``[]`` without
        calling the reranker.
    """

    # Nothing to rank: skip the model call entirely.
    if not docs:
        return []

    # Get a list of texts from langchain docs
    input_docs = [x.page_content for x in docs]

    # Rerank using rerankers library
    results = reranker.rank(query=query, docs=input_docs)

    # Prepare langchain list of docs
    docs_reranked = []
    for result in results.results:
        # doc_id is the position of the text in `input_docs`, hence in `docs`.
        doc = docs[result.document.doc_id]
        doc.metadata["rerank_score"] = result.score
        doc.metadata["query_used_for_retrieval"] = query
        docs_reranked.append(doc)
    return docs_reranked
requirements.txt CHANGED
@@ -10,4 +10,5 @@ huggingface-hub
10
  msal
11
  pyalex==0.13
12
  networkx==3.2.1
13
- pyvis==0.3.2
 
 
10
  msal
11
  pyalex==0.13
12
  networkx==3.2.1
13
+ pyvis==0.3.2
14
+ flashrank==0.2.5
sandbox/20240310 - CQA - Semantic Routing 1.ipynb ADDED
The diff for this file is too large to render. See raw diff