Vasanth committed on
Commit
4d8deb8
·
1 Parent(s): 445f5b9

Researcher Done

Browse files
Files changed (5) hide show
  1. .env +3 -0
  2. app.py +37 -0
  3. config.py +18 -0
  4. requirements.txt +122 -0
  5. researcher.py +93 -0
.env ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# SECURITY: real API keys must never be committed to version control.
# Keep this file as a template only: put the real values in a local,
# git-ignored .env (or in the hosting platform's secret store), and rotate
# any key that has already been pushed, since it remains in the history.
GROQ_API_KEY="<your-groq-api-key>"
SERPER_API_KEY="<your-serper-api-key>"
app.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from streamlit_chat import message
3
+ from researcher import Researcher
4
+ from dotenv import find_dotenv, load_dotenv
5
# Load variables from the nearest .env file before anything reads the
# environment (the Researcher reads its API keys via os.getenv).
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Page-wide layout setting, issued before any other Streamlit call in this script.
st.set_page_config(layout="wide")

# Flag checked further down before the page body is rendered; it is set
# unconditionally on every script run.
st.session_state["clicked"] = True
8
+
9
@st.cache_resource(show_spinner=True)
def create_researcher():
    """Return a ready-to-use Researcher.

    Wrapped in ``st.cache_resource`` so the (expensive) construction is not
    repeated on every Streamlit rerun; a spinner is shown while it is built.
    """
    return Researcher()


# Module-level handle used by the page body below.
research_apprentice = create_researcher()
14
+
15
def display_conversation(history):
    """Render the chat transcript as alternating user / apprentice bubbles.

    Args:
        history: Mapping with two parallel lists under the keys ``"user"``
            and ``"apprentice"``; element ``i`` of each list forms one turn.

    Turns are paired with ``zip`` so a transient length mismatch between the
    two lists renders the complete turns instead of raising ``IndexError``.
    Widget keys are unchanged: ``"<i>_user"`` for the user bubble and
    ``"<i>"`` for the apprentice bubble.
    """
    turns = zip(history["user"], history["apprentice"])
    for i, (user_text, apprentice_text) in enumerate(turns):
        message(user_text, is_user=True, key=str(i) + "_user")
        message(apprentice_text, key=str(i))
19
+
20
if st.session_state.clicked:
    st.title("InfoGenie - Your 24/7 AI Research Apprentice 🧑‍💻")
    st.subheader("An AI apprentice who can serve you 24/7 by researching on a given question in realtime over Internet and provide you answers accurately within a blink of an eye.")

    # Seed both transcript lists once per session so the chat opens with a
    # greeting exchange; the two lists are always appended to in pairs.
    if "apprentice" not in st.session_state:
        st.session_state["apprentice"] = ["Hello. How can I help you?"]
    if "user" not in st.session_state:
        st.session_state["user"] = ["Hey InfoGenie!"]

    with st.expander("Command InfoGenie"):
        # Label typo fixed ("Resarch" -> "Research"); it is user-facing text.
        research_query_input = st.text_input("Research Query")
        if st.button("Send"):
            query = research_query_input.strip()
            if not query:
                # Do not spend a search/LLM call (or add an empty turn to the
                # transcript) when nothing was typed.
                st.warning("Please enter a research query first.")
            else:
                robowiz_output = research_apprentice.research(query)

                st.session_state["user"].append(query)
                st.session_state["apprentice"].append(robowiz_output)

    # Render the transcript whenever there is at least one apprentice message.
    if st.session_state["apprentice"]:
        display_conversation(st.session_state)
config.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Prompt sent to the LLM. The {context} and {question} placeholders must stay
# in sync with INPUT_VARIABLES below.
PROMPT_TEMPLATE = """
You are a great researcher. With the information provided understand in deep and try to answer the question.
If you cant answer the question based on the information either say you cant find an answer or unable to find an answer.
So try to understand in depth about the context and answer only based on the information provided. Dont generate irrelevant answers.

Context: {context}
Question: {question}
Do provide only helpful answers

Answer:
"""
INPUT_VARIABLES = ["context", "question"]

# Text-splitting settings. SEPARATORS is a list because the splitter's
# `separators` argument is a list of candidate separators; a bare string only
# worked because iterating a one-character string yields that character.
SEPARATORS = ["\n"]
CHUNK_SIZE = 10000      # passed to the splitter as chunk_size
CHUNK_OVERLAP = 1000    # passed to the splitter as chunk_overlap

# Sentence-embedding model name used to index the scraped pages.
EMBEDDER = "BAAI/bge-base-en-v1.5"

# RetrievalQA settings: chain type and retriever search arguments
# (k = number of chunks retrieved per question).
CHAIN_TYPE = "stuff"
SEARCH_KWARGS = {'k': 3}
requirements.txt ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.9.3
2
+ aiosignal==1.3.1
3
+ altair==5.2.0
4
+ annotated-types==0.6.0
5
+ anyio==4.3.0
6
+ attrs==23.2.0
7
+ backoff==2.2.1
8
+ beautifulsoup4==4.12.3
9
+ blinker==1.7.0
10
+ cachetools==5.3.3
11
+ certifi==2024.2.2
12
+ chardet==5.2.0
13
+ charset-normalizer==3.3.2
14
+ click==8.1.7
15
+ colorama==0.4.6
16
+ contourpy==1.2.0
17
+ cycler==0.12.1
18
+ dataclasses-json==0.6.4
19
+ distro==1.9.0
20
+ emoji==2.10.1
21
+ faiss-cpu==1.8.0
22
+ filelock==3.9.0
23
+ filetype==1.2.0
24
+ fonttools==4.49.0
25
+ frozenlist==1.4.1
26
+ fsspec==2024.2.0
27
+ gitdb==4.0.11
28
+ GitPython==3.1.42
29
+ greenlet==3.0.3
30
+ groq==0.4.2
31
+ h11==0.14.0
32
+ httpcore==1.0.4
33
+ httpx==0.27.0
34
+ huggingface-hub==0.21.3
35
+ idna==3.6
36
+ importlib-metadata==7.0.1
37
+ Jinja2==3.1.2
38
+ joblib==1.3.2
39
+ jsonpatch==1.33
40
+ jsonpath-python==1.0.6
41
+ jsonpointer==2.4
42
+ jsonschema==4.21.1
43
+ jsonschema-specifications==2023.12.1
44
+ kiwisolver==1.4.5
45
+ langchain==0.1.10
46
+ langchain-community==0.0.25
47
+ langchain-core==0.1.28
48
+ langchain-groq==0.0.1
49
+ langchain-text-splitters==0.0.1
50
+ langdetect==1.0.9
51
+ langsmith==0.1.14
52
+ lxml==5.1.0
53
+ markdown-it-py==3.0.0
54
+ MarkupSafe==2.1.3
55
+ marshmallow==3.21.0
56
+ matplotlib==3.8.3
57
+ mdurl==0.1.2
58
+ mpmath==1.3.0
59
+ multidict==6.0.5
60
+ mypy-extensions==1.0.0
61
+ networkx==3.2.1
62
+ nltk==3.8.1
63
+ numpy==1.26.4
64
+ orjson==3.9.15
65
+ packaging==23.2
66
+ pandas==2.2.1
67
+ pillow==10.2.0
68
+ protobuf==4.25.3
69
+ pyarrow==15.0.0
70
+ pydantic==2.6.3
71
+ pydantic_core==2.16.3
72
+ pydeck==0.8.1b0
73
+ Pygments==2.17.2
74
+ pyparsing==3.1.1
75
+ python-dateutil==2.9.0.post0
76
+ python-dotenv==1.0.1
77
+ python-iso639==2024.2.7
78
+ pytz==2024.1
79
+ PyYAML==6.0.1
80
+ rapidfuzz==3.6.1
81
+ referencing==0.33.0
82
+ regex==2023.12.25
83
+ requests==2.31.0
84
+ rich==13.7.1
85
+ rpds-py==0.18.0
86
+ safetensors==0.4.2
87
+ scikit-learn==1.4.1.post1
88
+ scipy==1.12.0
89
+ seaborn==0.13.2
90
+ sentence-transformers==2.5.1
91
+ six==1.16.0
92
+ smmap==5.0.1
93
+ sniffio==1.3.1
94
+ soupsieve==2.5
95
+ SQLAlchemy==2.0.27
96
+ streamlit==1.31.1
97
+ streamlit-chat==0.1.1
98
+ sympy==1.12
99
+ tabulate==0.9.0
100
+ tenacity==8.2.3
101
+ threadpoolctl==3.3.0
102
+ tokenizers==0.15.2
103
+ toml==0.10.2
104
+ toolz==0.12.1
105
+ torch==2.2.1
106
+ torchaudio==2.2.1
107
+ torchvision==0.17.1
108
+ tornado==6.4
109
+ tqdm==4.66.2
110
+ transformers==4.38.2
111
+ typing-inspect==0.9.0
112
+ typing_extensions==4.8.0
113
+ tzdata==2024.1
114
+ tzlocal==5.2
115
+ unstructured==0.11.8
116
+ unstructured-client==0.21.0
117
+ urllib3==2.2.1
118
+ validators==0.22.0
119
+ watchdog==4.0.0
120
+ wrapt==1.16.0
121
+ yarl==1.9.4
122
+ zipp==3.17.0
researcher.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from config import *
2
+ import os
3
+ from dotenv import load_dotenv, find_dotenv
4
+ import json
5
+ import requests
6
+ from langchain_groq import ChatGroq
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain.chains import RetrievalQA
9
+ from langchain.prompts import PromptTemplate
10
+ from langchain.document_loaders.url import UnstructuredURLLoader
11
+ from langchain.vectorstores.faiss import FAISS
12
+ from langchain_community.embeddings import HuggingFaceEmbeddings
13
+ import os
14
load_dotenv(find_dotenv())
from langchain.globals import set_debug

# Debug output stays on by default (as before), but can now be switched off
# without a code change by setting RESEARCHER_DEBUG to 0/false/no/off in the
# environment or in .env (which has just been loaded above).
set_debug(os.getenv("RESEARCHER_DEBUG", "true").strip().lower() not in {"0", "false", "no", "off"})
18
+
19
class Researcher:
    """Answer a question by searching the web and running retrieval QA on the hits.

    ``research`` chains the steps: Serper search -> pick result URLs ->
    download page content -> split and embed into an in-memory FAISS index ->
    RetrievalQA over that index with the Groq-hosted LLM.
    """

    # Upper bound (seconds) for the search HTTP call so a stalled connection
    # cannot hang the caller indefinitely.
    _REQUEST_TIMEOUT_SECONDS = 30
    # Number of organic search results whose pages are downloaded.
    _MAX_ORGANIC_RESULTS = 3
    # Returned when no page content could be loaded for a query.
    _NO_CONTENT_ANSWER = "I was unable to find any readable content to answer this question."

    def __init__(self):
        """Read API keys from the environment and build the reusable components."""
        self.serper_api_key = os.getenv("SERPER_API_KEY")
        self.groq_api_key = os.getenv("GROQ_API_KEY")
        self.prompt_template = PromptTemplate(
            template=PROMPT_TEMPLATE,
            input_variables=INPUT_VARIABLES,
        )
        # The splitter expects a list of separators; accept a bare string from
        # the config as a single separator instead of relying on string
        # iteration.
        separators = [SEPARATORS] if isinstance(SEPARATORS, str) else list(SEPARATORS)
        self.text_splitter = RecursiveCharacterTextSplitter(
            separators=separators,
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
        )
        self.llm = ChatGroq(temperature=0.5, model_name="mixtral-8x7b-32768", groq_api_key=self.groq_api_key)
        self.hfembeddings = HuggingFaceEmbeddings(
            model_name=EMBEDDER,
            model_kwargs={'device': 'cpu'},
        )

    def search_articles(self, query):
        """Run a Serper web search and return the decoded JSON response.

        Args:
            query: Search string sent as the ``q`` field.

        Returns:
            The parsed JSON body of the response.

        Raises:
            requests.HTTPError: If the API answers with an error status
                (previously the error body was returned and failed later
                with an unrelated ``KeyError``).
            requests.Timeout: If no response arrives within the timeout.
        """
        url = "https://google.serper.dev/search"
        payload = json.dumps({"q": query})
        headers = {
            'X-API-KEY': self.serper_api_key,
            'Content-Type': 'application/json',
        }

        response = requests.post(
            url,
            headers=headers,
            data=payload,
            timeout=self._REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        return response.json()

    def research_answerer(self, db=None):
        """Build the RetrievalQA chain over a vector store.

        Args:
            db: Vector store to retrieve from. Defaults to ``self.db`` (the
                store built by the most recent ``research_given_query``), which
                keeps the original zero-argument call working.

        Returns:
            A ``RetrievalQA`` chain configured with the project prompt.

        Raises:
            AttributeError: If ``db`` is omitted and no store has been built yet.
        """
        store = self.db if db is None else db
        return RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type=CHAIN_TYPE,
            retriever=store.as_retriever(search_kwargs=SEARCH_KWARGS),
            return_source_documents=True,
            verbose=True,
            chain_type_kwargs={"prompt": self.prompt_template},
        )

    def get_urls(self, articles):
        """Pick the URLs worth reading from a search response.

        Args:
            articles: Decoded search response; the optional ``answerBox`` entry
                and the ``organic`` result list are consulted.

        Returns:
            The answer-box link (if any) followed by the links of the first
            three organic results, without duplicates and in that order.
            Missing keys yield fewer URLs instead of an exception.
        """
        urls = []

        answer_box = articles.get("answerBox")
        if isinstance(answer_box, dict) and answer_box.get("link"):
            urls.append(answer_box["link"])

        organic = articles.get("organic") or []
        for result in organic[:self._MAX_ORGANIC_RESULTS]:
            link = result.get("link")
            # The answer box can point at the same page as an organic hit;
            # loading it twice would only put duplicate chunks in the index.
            if link and link not in urls:
                urls.append(link)
        return urls

    def get_content_from_urls(self, urls):
        """Download the given pages and return them as loaded documents.

        Returns an empty list when ``urls`` is empty.
        """
        if not urls:
            return []
        loader = UnstructuredURLLoader(urls=urls)
        return loader.load()

    def research_given_query(self, research_objective, research_content):
        """Answer ``research_objective`` using only ``research_content``.

        Args:
            research_objective: The question to answer.
            research_content: Documents returned by ``get_content_from_urls``.

        Returns:
            The chain's ``"result"`` text, or a fixed fallback sentence when
            there is no content to index.
        """
        docs = self.text_splitter.split_documents(research_content)
        if not docs:
            # Building a FAISS index from zero documents fails, so answer
            # explicitly instead of crashing.
            return self._NO_CONTENT_ANSWER

        db = FAISS.from_documents(documents=docs, embedding=self.hfembeddings)
        # Still published on the instance for callers that read ``self.db``,
        # but the chain below is given the local index so that concurrent
        # calls on a shared instance cannot swap each other's index.
        self.db = db
        bot = self.research_answerer(db)
        # ``invoke`` replaces the deprecated direct call ``bot({...})``.
        research_out = bot.invoke({"query": research_objective})
        return research_out["result"]

    def research(self, query):
        """Search the web for ``query`` and return an answer grounded in the hits."""
        search_results = self.search_articles(query)
        urls = self.get_urls(search_results)
        research_content = self.get_content_from_urls(urls)
        return self.research_given_query(query, research_content)