Upload 2 files
Browse files
- app.py +20 -16
- requirements.txt +2 -1
app.py
CHANGED
@@ -9,11 +9,10 @@ from bs4 import BeautifulSoup
|
|
9 |
from langchain.chains import ConversationalRetrievalChain
|
10 |
from langchain.docstore.document import Document
|
11 |
from langchain.document_loaders import PDFMinerPDFasHTMLLoader, WebBaseLoader
|
12 |
-
from
|
13 |
-
from langchain_openai import ChatOpenAI, OpenAI
|
14 |
-
from langchain.vectorstores import Chroma
|
15 |
from langchain.retrievers.multi_query import MultiQueryRetriever
|
16 |
from ragatouille import RAGPretrainedModel
|
|
|
17 |
|
18 |
|
19 |
st.set_page_config(layout="wide")
|
@@ -55,7 +54,10 @@ def query_llm(retriever, query):
|
|
55 |
|
56 |
def input_fields():
|
57 |
st.session_state.source_doc_urls = [
|
58 |
-
url.strip()
|
|
|
|
|
|
|
59 |
]
|
60 |
|
61 |
|
@@ -68,9 +70,9 @@ def process_documents():
|
|
68 |
else:
|
69 |
snippets.extend(process_web(url))
|
70 |
st.session_state.retriever = embeddings_on_local_vectordb(snippets)
|
71 |
-
st.session_state.headers =
|
72 |
-
|
73 |
-
|
74 |
except Exception as e:
|
75 |
st.error(f"An error occurred: {e}")
|
76 |
|
@@ -88,12 +90,13 @@ def process_pdf(url):
|
|
88 |
Document(
|
89 |
page_content=deep_strip(snip[1]["header_text"]) + " " + deep_strip(snip[0]),
|
90 |
metadata={
|
91 |
-
"header":
|
92 |
"source_url": url,
|
93 |
"source_type": "pdf",
|
|
|
94 |
},
|
95 |
)
|
96 |
-
for snip in semantic_snippets
|
97 |
]
|
98 |
return document_snippets
|
99 |
|
@@ -196,12 +199,13 @@ def process_web(url):
|
|
196 |
|
197 |
def boot():
|
198 |
st.title("Xi Chatbot")
|
|
|
199 |
input_fields()
|
200 |
col1, col2 = st.columns([4, 1])
|
201 |
st.sidebar.button("Submit Documents", on_click=process_documents)
|
202 |
if "headers" in st.session_state:
|
203 |
-
|
204 |
-
|
205 |
if "messages" not in st.session_state:
|
206 |
st.session_state.messages = []
|
207 |
for message in st.session_state.messages:
|
@@ -210,11 +214,11 @@ def boot():
|
|
210 |
if query := col1.chat_input():
|
211 |
col1.chat_message("human").write(query)
|
212 |
references, response = query_llm(st.session_state.retriever, query)
|
213 |
-
for
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
|
219 |
|
220 |
if __name__ == "__main__":
|
|
|
9 |
from langchain.chains import ConversationalRetrievalChain
|
10 |
from langchain.docstore.document import Document
|
11 |
from langchain.document_loaders import PDFMinerPDFasHTMLLoader, WebBaseLoader
|
12 |
+
from langchain_openai import ChatOpenAI
|
|
|
|
|
13 |
from langchain.retrievers.multi_query import MultiQueryRetriever
|
14 |
from ragatouille import RAGPretrainedModel
|
15 |
+
import pandas as pd
|
16 |
|
17 |
|
18 |
st.set_page_config(layout="wide")
|
|
|
54 |
|
55 |
def input_fields():
|
56 |
st.session_state.source_doc_urls = [
|
57 |
+
url.strip()
|
58 |
+
for url in st.sidebar.text_area(
|
59 |
+
"Source Document URLs\n(New line separated)", height=200
|
60 |
+
).split("\n")
|
61 |
]
|
62 |
|
63 |
|
|
|
70 |
else:
|
71 |
snippets.extend(process_web(url))
|
72 |
st.session_state.retriever = embeddings_on_local_vectordb(snippets)
|
73 |
+
st.session_state.headers = pd.Series(
|
74 |
+
[snip.metadata["header"] for snip in snippets], name="references"
|
75 |
+
)
|
76 |
except Exception as e:
|
77 |
st.error(f"An error occurred: {e}")
|
78 |
|
|
|
90 |
Document(
|
91 |
page_content=deep_strip(snip[1]["header_text"]) + " " + deep_strip(snip[0]),
|
92 |
metadata={
|
93 |
+
"header": " ".join(snip[1]["header_text"].split()[:10]),
|
94 |
"source_url": url,
|
95 |
"source_type": "pdf",
|
96 |
+
"chunk_id": i,
|
97 |
},
|
98 |
)
|
99 |
+
for i, snip in enumerate(semantic_snippets)
|
100 |
]
|
101 |
return document_snippets
|
102 |
|
|
|
199 |
|
200 |
def boot():
|
201 |
st.title("Xi Chatbot")
|
202 |
+
st.sidebar.title("Input Documents")
|
203 |
input_fields()
|
204 |
col1, col2 = st.columns([4, 1])
|
205 |
st.sidebar.button("Submit Documents", on_click=process_documents)
|
206 |
if "headers" in st.session_state:
|
207 |
+
col2.write("### References")
|
208 |
+
col2.write(st.session_state.headers)
|
209 |
if "messages" not in st.session_state:
|
210 |
st.session_state.messages = []
|
211 |
for message in st.session_state.messages:
|
|
|
214 |
if query := col1.chat_input():
|
215 |
col1.chat_message("human").write(query)
|
216 |
references, response = query_llm(st.session_state.retriever, query)
|
217 |
+
sorted_references = sorted([ref.metadata["chunk_id"] for ref in references])
|
218 |
+
references_str = " ".join([f"[{ref}]" for ref in sorted_references])
|
219 |
+
col1.chat_message("ai").write(
|
220 |
+
response + "\n\n---\nReferences:" + references_str
|
221 |
+
)
|
222 |
|
223 |
|
224 |
if __name__ == "__main__":
|
requirements.txt
CHANGED
@@ -8,4 +8,5 @@ chromadb==0.4.22
|
|
8 |
tiktoken==0.5.2
|
9 |
pdfminer.six==20231228
|
10 |
beautifulsoup4==4.12.3
|
11 |
-
RAGatouille==0.0.7.post7
|
|
|
|
8 |
tiktoken==0.5.2
|
9 |
pdfminer.six==20231228
|
10 |
beautifulsoup4==4.12.3
|
11 |
+
RAGatouille==0.0.7.post7
|
12 |
+
pandas==2.2.1
|