Ritvik19 committed
Commit 89588e0 · verified · 1 Parent(s): 10f7511

Upload 2 files

Files changed (2)
  1. app.py +20 -16
  2. requirements.txt +2 -1
app.py CHANGED
@@ -9,11 +9,10 @@ from bs4 import BeautifulSoup
 from langchain.chains import ConversationalRetrievalChain
 from langchain.docstore.document import Document
 from langchain.document_loaders import PDFMinerPDFasHTMLLoader, WebBaseLoader
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain_openai import ChatOpenAI, OpenAI
-from langchain.vectorstores import Chroma
+from langchain_openai import ChatOpenAI
 from langchain.retrievers.multi_query import MultiQueryRetriever
 from ragatouille import RAGPretrainedModel
+import pandas as pd
 
 
 st.set_page_config(layout="wide")
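The import hunk above narrows the OpenAI surface to the single `ChatOpenAI` class from `langchain_openai` and brings in `pandas` for the new references table. A minimal sketch of how the retained import is typically used; the model name, temperature, and prompt are assumptions rather than values from app.py, and an `OPENAI_API_KEY` must be set in the environment:

```python
# Sketch only; model name and temperature are assumed, not taken from app.py.
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
reply = llm.invoke("In one word, what file format does PDFMiner parse?")
print(reply.content)  # the assistant's text answer
```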
@@ -55,7 +54,10 @@ def query_llm(retriever, query):
 
 def input_fields():
     st.session_state.source_doc_urls = [
-        url.strip() for url in st.sidebar.text_input("Source Document URLs").split(",")
+        url.strip()
+        for url in st.sidebar.text_area(
+            "Source Document URLs\n(New line separated)", height=200
+        ).split("\n")
     ]
 
 
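The sidebar input moves from a single comma-separated `text_input` to a taller, newline-separated `text_area`. A standalone sketch of the new parsing, where `raw` stands in for the string returned by `st.sidebar.text_area(...)` and the URLs are made up:

```python
# "raw" stands in for the value of st.sidebar.text_area(...).
raw = "https://example.com/paper.pdf\nhttps://example.com/blog-post"
source_doc_urls = [url.strip() for url in raw.split("\n")]
print(source_doc_urls)
# ['https://example.com/paper.pdf', 'https://example.com/blog-post']
```

As written, a trailing blank line in the text area would still yield an empty string in the list; the diff does not filter those out.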
@@ -68,9 +70,9 @@ def process_documents():
         else:
             snippets.extend(process_web(url))
         st.session_state.retriever = embeddings_on_local_vectordb(snippets)
-        st.session_state.headers = [
-            " ".join(snip.metadata["header"].split()[:10]) for snip in snippets
-        ]
+        st.session_state.headers = pd.Series(
+            [snip.metadata["header"] for snip in snippets], name="references"
+        )
     except Exception as e:
         st.error(f"An error occurred: {e}")
 
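`st.session_state.headers` changes from a plain list of truncated strings to a named `pandas.Series`, which is what lets `boot()` later render the references as one compact table. A toy sketch of the new shape; the snippet contents are invented:

```python
import pandas as pd
from langchain.docstore.document import Document

# Invented stand-ins for the snippets produced by process_pdf / process_web.
snippets = [
    Document(page_content="1 Introduction ...", metadata={"header": "1 Introduction", "chunk_id": 0}),
    Document(page_content="2 Method ...", metadata={"header": "2 Method", "chunk_id": 1}),
]
headers = pd.Series([snip.metadata["header"] for snip in snippets], name="references")
print(headers)  # an indexed two-row Series named "references"
```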
@@ -88,12 +90,13 @@ def process_pdf(url):
         Document(
             page_content=deep_strip(snip[1]["header_text"]) + " " + deep_strip(snip[0]),
             metadata={
-                "header": deep_strip(snip[1]["header_text"]),
+                "header": " ".join(snip[1]["header_text"].split()[:10]),
                 "source_url": url,
                 "source_type": "pdf",
+                "chunk_id": i,
             },
         )
-        for snip in semantic_snippets
+        for i, snip in enumerate(semantic_snippets)
     ]
     return document_snippets
 
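Two metadata changes in the hunk above: the stored `header` is now capped at its first ten words (the truncation that previously lived in the display code), and every snippet records its position as `chunk_id` so answers can cite it by number. A small illustration with made-up `semantic_snippets`:

```python
# Made-up stand-ins for the (content, attrs) pairs process_pdf derives from PDFMiner output.
semantic_snippets = [
    ("Body of the first section ...", {"header_text": "1 Introduction"}),
    ("Body of the second section ...", {"header_text": "2 A Rather Long Section Title That Keeps Going Well Past Ten Words"}),
]
for i, snip in enumerate(semantic_snippets):
    metadata = {
        "header": " ".join(snip[1]["header_text"].split()[:10]),  # first 10 words only
        "source_type": "pdf",
        "chunk_id": i,  # positional id, later used for citations
    }
    print(metadata)
```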
@@ -196,12 +199,13 @@ def process_web(url):
 
 def boot():
     st.title("Xi Chatbot")
+    st.sidebar.title("Input Documents")
     input_fields()
     col1, col2 = st.columns([4, 1])
     st.sidebar.button("Submit Documents", on_click=process_documents)
     if "headers" in st.session_state:
-        for header in st.session_state.headers:
-            col2.info(header)
+        col2.write("### References")
+        col2.write(st.session_state.headers)
     if "messages" not in st.session_state:
         st.session_state.messages = []
     for message in st.session_state.messages:
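In `boot()`, the per-header `st.info` boxes are replaced by writing the whole `references` Series into the narrow right-hand column, and the sidebar gets an "Input Documents" title. A minimal sketch of the new panel, runnable with `streamlit run`; the sample headers are invented:

```python
# Minimal sketch of the references column (run with: streamlit run sketch.py).
import pandas as pd
import streamlit as st

col1, col2 = st.columns([4, 1])
headers = pd.Series(["1 Introduction", "2 Method", "3 Results"], name="references")
col2.write("### References")
col2.write(headers)  # Streamlit renders the Series as a small table rather than one box per row
```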
@@ -210,11 +214,11 @@ def boot():
     if query := col1.chat_input():
         col1.chat_message("human").write(query)
         references, response = query_llm(st.session_state.retriever, query)
-        for snip in references:
-            st.sidebar.success(
-                f'Section {" ".join(snip.metadata["header"].split()[:10])}'
-            )
-        col1.chat_message("ai").write(response)
+        sorted_references = sorted([ref.metadata["chunk_id"] for ref in references])
+        references_str = " ".join([f"[{ref}]" for ref in sorted_references])
+        col1.chat_message("ai").write(
+            response + "\n\n---\nReferences:" + references_str
+        )
 
 
 if __name__ == "__main__":
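Retrieved sections are no longer surfaced as sidebar success boxes; instead the answer itself carries a sorted list of `chunk_id` citations that line up with the numbered rows in the references column. A standalone sketch of the formatting, using invented `Document` references and response text:

```python
from langchain.docstore.document import Document

# Invented retrieved documents; only the chunk_id metadata matters here.
references = [
    Document(page_content="...", metadata={"chunk_id": 3}),
    Document(page_content="...", metadata={"chunk_id": 1}),
]
response = "The method improves recall by roughly four points."
sorted_references = sorted([ref.metadata["chunk_id"] for ref in references])
references_str = " ".join([f"[{ref}]" for ref in sorted_references])
print(response + "\n\n---\nReferences:" + references_str)
# The method improves recall by roughly four points.
#
# ---
# References:[1] [3]
```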
 
requirements.txt CHANGED
@@ -8,4 +8,5 @@ chromadb==0.4.22
 tiktoken==0.5.2
 pdfminer.six==20231228
 beautifulsoup4==4.12.3
-RAGatouille==0.0.7.post7
+RAGatouille==0.0.7.post7
+pandas==2.2.1