Added streamlit app and requirements
Browse files- app.py +183 -0
- requirements.txt +9 -0
app.py
ADDED
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import tempfile
|
2 |
+
import fitz
|
3 |
+
import streamlit as st
|
4 |
+
from langchain.chains import ConversationalRetrievalChain
|
5 |
+
from langchain.document_loaders import PyPDFLoader
|
6 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
7 |
+
from langchain.llms import CTransformers
|
8 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
9 |
+
from langchain.vectorstores import Chroma
|
10 |
+
from PIL import Image
|
11 |
+
from streamlit_chat import message
|
12 |
+
|
13 |
+
# --- Page configuration and static sidebar content ---
# Wide layout so the chat column and the PDF preview column fit side by side.
st.set_page_config(
    page_title="AskBot",
    page_icon=":robot_face:",
    layout="wide"
)

# Sidebar: app title and usage notes (static, rendered on every rerun).
st.sidebar.title("""
Ask A Bot :robot_face: \n Talk with your PDFs
""")

st.sidebar.write("""
###### A Q&A chatbot for you to talk with your PDFs.
###### Upload the PDF you want to talk to and start asking questions. The display will show the page where the answer was found.
###### When you upload a new PDF, the chat history is reset for you to start fresh.
###### The chatbot is based on Langchain and the Llama language model, which is a large language model trained on the Common Crawl dataset. Obtained from [here](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML).
###### The performance of this bot is limited due to its size. For better performance, a larger LLM should be used.
###### :warning: Sometimes the Streamlit app will not re-run and refresh the PDF. If this happens, refresh the page.
###### Developed by [Carlos Pereira](https://linkedin.com/in/carlos-miguel-pereira/).
""")
32 |
+
|
33 |
+
# One-time per-session state initialisation (Streamlit reruns the whole
# script on every interaction, so only set keys that are missing).
#   pdf_page     - page index shown in the PDF preview column
#   chat_history - (question, answer) tuples fed back to the chain
#   generated    - bot answers, in order
#   past         - user questions, in order
for _key, _default in (('pdf_page', 0),
                       ('chat_history', []),
                       ('generated', []),
                       ('past', [])):
    if _key not in st.session_state:
        st.session_state[_key] = _default
44 |
+
|
45 |
+
def update_state():
    """Reset all conversation state when a new PDF is uploaded.

    Clears the rendered page index and every piece of chat bookkeeping so
    the conversation starts fresh for the new document.
    """
    st.session_state['pdf_page'] = 0
    for key in ('chat_history', 'generated', 'past'):
        st.session_state[key] = []
|
53 |
+
|
54 |
+
@st.cache_resource(show_spinner=False)
def load_llm():
    """Load the quantised Llama-2 chat model (cached across reruns).

    Returns:
        A CTransformers LLM wrapper around the local GGML model file.
    """
    model_config = {
        "model": "llama-2-13b-chat.ggmlv3.q3_K_L.bin",
        "model_type": "llama",
        "max_new_tokens": 150,
        "temperature": 0.2,
    }
    return CTransformers(**model_config)
|
66 |
+
|
67 |
+
@st.cache_resource(show_spinner=False)
def gen_embeddings():
    """Build the sentence-transformer embedding model (cached across reruns).

    Returns:
        A HuggingFaceEmbeddings instance running on CPU.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={'device': 'cpu'},
    )
|
76 |
+
|
77 |
+
def load_pdf(file):
    """Load an uploaded PDF and index it for retrieval.

    Args:
        file: the Streamlit UploadedFile holding the PDF bytes.

    Returns:
        (pdf_search, pdf_file): a Chroma vector store over the document
        chunks, and a fitz (PyMuPDF) document used for page rendering.
    """
    pdf_bytes = bytes(file.getbuffer())

    # PyPDFLoader needs a real path, so spill the upload to a named temp
    # file. flush() before handing out the name — the write is buffered,
    # and without it the loader could read a truncated file.
    with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_file:
        temp_file.write(pdf_bytes)
        temp_file.flush()
        loader = PyPDFLoader(temp_file.name)
        documents = loader.load()

    # Open the rendering copy from the in-memory bytes rather than the
    # temp file, so it stays valid after the temp file is deleted.
    pdf_file = fitz.open(stream=pdf_bytes, filetype="pdf")

    # Split the document into ~1000-char chunks for embedding.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)

    # Embed the chunks and build the vector index.
    embeddings = gen_embeddings()
    pdf_search = Chroma.from_documents(texts, embeddings)

    return pdf_search, pdf_file
|
98 |
+
|
99 |
+
def generate_chain(pdf_vector, llm):
    """Build the conversational retrieval chain over the PDF index.

    Args:
        pdf_vector: Chroma vector store for the uploaded PDF.
        llm: the loaded language model.

    Returns:
        A ConversationalRetrievalChain that also returns source documents
        (needed to show the page where the answer was found).
    """
    # k=1: retrieve only the single best-matching chunk.
    retriever = pdf_vector.as_retriever(search_kwargs={"k": 1})
    return ConversationalRetrievalChain.from_llm(
        llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )
|
109 |
+
|
110 |
+
def get_answer(chain, query, chat_history):
    """Run one question through the retrieval chain.

    Args:
        chain: the ConversationalRetrievalChain.
        query: the user's question string.
        chat_history: list of (question, answer) tuples.

    Returns:
        The answer string. As a side effect, stores the source page of the
        best-matching chunk in st.session_state.pdf_page.
    """
    result = chain({"question": query, 'chat_history': chat_history}, return_only_outputs=True)
    answer = result["answer"]
    # if you want history uncomment the line below
    # st.session_state.chat_history += [(query, answer)]

    # Read the page number directly from the Document metadata instead of
    # the original list(...)-indexing hack, which silently depended on the
    # field order of the Document model.
    source_docs = result.get('source_documents') or []
    if source_docs:
        st.session_state.pdf_page = source_docs[0].metadata['page']

    return answer
|
121 |
+
|
122 |
+
def render_page_file(file, page):
    """Render one page of a PDF as a PIL image.

    Args:
        file: an open fitz (PyMuPDF) document.
        page: zero-based page index to render.

    Returns:
        A PIL.Image of the page rendered at 300 DPI.
    """
    try:
        pdf_page = file[page]
    except (IndexError, ValueError):
        # Out-of-range or invalid page index: fall back to the first page
        # and keep the stored page index in sync. (Narrowed from the
        # original bare `except`, which hid real errors.)
        pdf_page = file[0]
        st.session_state.pdf_page = 0

    # Render the PDF page as an image; 300/72 scales from PDF points
    # (72 dpi) up to 300 dpi.
    pix = pdf_page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
    image = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)

    return image
|
137 |
+
|
138 |
+
# PDF upload widget; on_change resets all chat state so a newly uploaded
# document starts with a fresh conversation.
uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"],
                                 accept_multiple_files=False,
                                 on_change=update_state)
|
141 |
+
|
142 |
+
def app():
    """Main Streamlit app: load models, answer questions, preview the PDF.

    Everything runs only once a PDF has been uploaded. The left column
    holds the question form and the chat transcript; the right column
    shows the PDF page where the last answer was found.
    """
    if uploaded_file:
        # Load LLM (cached by st.cache_resource, so cheap after first run).
        with st.spinner('Loading LLM...'):
            llm = load_llm()
        # Load and process the uploaded PDF file.
        with st.spinner('Loading PDF...'):
            pdf_vector, pdf_file = load_pdf(uploaded_file)
        with st.spinner('Generating chain...'):
            chain = generate_chain(pdf_vector, llm)

        col1, col2 = st.columns(2)
        with col1:
            # Question and answering; clear_on_submit empties the box
            # after each question.
            with st.form(key='question_form', clear_on_submit=True):
                question = st.text_input('Enter your question:', value="", key='text_value')
                submit_question = st.form_submit_button(label="Enter")

            # Skip blank submissions so the LLM is never queried with an
            # empty question (the original sent "" straight to the chain).
            if submit_question and question.strip():
                with st.spinner('Getting answer...'):
                    answer = get_answer(chain, question,
                                        st.session_state.chat_history)
                st.session_state.past.append(question)
                st.session_state.generated.append(answer)

            # Render the transcript, newest exchange first.
            if st.session_state['generated']:
                for i in range(len(st.session_state['generated'])-1, -1, -1):
                    message(st.session_state["generated"][i], is_user=False,
                            avatar_style="bottts", key=str(i))
                    message(st.session_state['past'][i], is_user=True,
                            avatar_style="adventurer", key=str(i) + '_user')

        with col2:
            # Render the PDF page tracked by get_answer().
            if pdf_file:
                st.image(render_page_file(pdf_file, st.session_state.pdf_page))

if __name__ == "__main__":
    app()
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
chromadb==0.4.6
|
2 |
+
ctransformers==0.2.22
|
3 |
+
faiss-cpu==1.7.4
|
4 |
+
langchain==0.0.266
|
5 |
+
PyMuPDF==1.22.5
|
6 |
+
pypdf==3.15.1
|
7 |
+
sentence-transformers==2.2.2
|
8 |
+
streamlit==1.25.0
|
9 |
+
streamlit-chat==0.1.1
|