cpereira committed on
Commit
3a2813b
·
1 Parent(s): b6888d5

Added streamlit app and requirements

Browse files
Files changed (2) hide show
  1. app.py +183 -0
  2. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import fitz
3
+ import streamlit as st
4
+ from langchain.chains import ConversationalRetrievalChain
5
+ from langchain.document_loaders import PyPDFLoader
6
+ from langchain.embeddings import HuggingFaceEmbeddings
7
+ from langchain.llms import CTransformers
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ from langchain.vectorstores import Chroma
10
+ from PIL import Image
11
+ from streamlit_chat import message
12
+
13
# --- Page chrome and sidebar copy -------------------------------------------
st.set_page_config(
    page_title="AskBot",
    page_icon=":robot_face:",
    layout="wide",
)

st.sidebar.title("""
Ask A Bot :robot_face: \n Talk with your PDFs
""")

st.sidebar.write("""
###### A Q&A chatbot for you to talk with your PDFs.
###### Upload the PDF you want to talk to and start asking questions. The display will show the page where the answer was found.
###### When you upload a new PDF, the chat history is reset for you to start fresh.
###### The chatbot is based on Langchain and the Llama language model, which is a large language model trained on the Common Crawl dataset. Obtained from [here](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML).
###### The performance of this bot is limited due to its size. For better performance, a larger LLM should be used.
###### :warning: Sometimes the Streamlit app will not re-run and refresh the PDF. If this happens, refresh the page.
###### Developed by [Carlos Pereira](https://linkedin.com/in/carlos-miguel-pereira/).
""")

# --- One-time session-state defaults ----------------------------------------
# Factories (not shared literals) so every session starts with fresh objects.
for _key, _factory in (
    ('pdf_page', int),        # page index currently shown in the preview pane
    ('chat_history', []. __class__),  # (question, answer) pairs fed to the chain
    ('generated', list),      # bot answers, in order
    ('past', list),           # user questions, in order
):
    if _key not in st.session_state:
        st.session_state[_key] = _factory()
45
def update_state():
    """Reset chat history, answers, and the PDF page when a new PDF is uploaded."""
    st.session_state['generated'] = []
    st.session_state['past'] = []
    st.session_state['chat_history'] = []
    st.session_state['pdf_page'] = 0
54
@st.cache_resource(show_spinner=False)
def load_llm():
    """Build (and cache for the process lifetime) the quantized Llama-2 chat model."""
    return CTransformers(
        model="llama-2-13b-chat.ggmlv3.q3_K_L.bin",
        model_type="llama",
        max_new_tokens=150,   # keep answers short; model is small and CPU-bound
        temperature=0.2,      # low temperature for factual Q&A
    )
67
@st.cache_resource(show_spinner=False)
def gen_embeddings():
    """Build (and cache) the sentence-transformers embedding model, on CPU."""
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={'device': 'cpu'},
    )
77
def load_pdf(file):
    """
    Index an uploaded PDF for retrieval and open it for page rendering.

    Parameters:
        file: a Streamlit UploadedFile holding the PDF bytes.

    Returns:
        (pdf_search, pdf_file): a Chroma vector store over the chunked text,
        and a fitz (PyMuPDF) Document used to render pages.
    """
    data = bytes(file.getbuffer())

    # PyPDFLoader only accepts a filesystem path, so stage the upload in a
    # temp file. flush() before loading: without it the buffered bytes may
    # not be on disk when the loader re-opens the path (bug in original).
    with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_file:
        temp_file.write(data)
        temp_file.flush()
        documents = PyPDFLoader(temp_file.name).load()

    # Open the rendering copy from memory: the temp file is deleted on close,
    # so a path-backed fitz document would dangle (and fails outright on
    # Windows, where an open NamedTemporaryFile cannot be re-opened).
    pdf_file = fitz.open(stream=data, filetype="pdf")

    # Split the document into retrieval-sized chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)

    # Embed the chunks and build the vector store.
    embeddings = gen_embeddings()
    pdf_search = Chroma.from_documents(texts, embeddings)

    return pdf_search, pdf_file
99
def generate_chain(pdf_vector, llm):
    """
    Build a ConversationalRetrievalChain over the PDF vector store.

    Retrieves only the single best-matching chunk (k=1) and returns the
    source documents so the UI can show the page the answer came from.
    """
    retriever = pdf_vector.as_retriever(search_kwargs={"k": 1})
    return ConversationalRetrievalChain.from_llm(
        llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )
110
def get_answer(chain, query, chat_history):
    """
    Run the retrieval chain for one question and record the source page.

    Parameters:
        chain: the ConversationalRetrievalChain to query.
        query: the user's question string.
        chat_history: list of (question, answer) pairs for conversational context.

    Returns:
        The chain's answer string. Side effect: stores the page number of the
        top source document in st.session_state.pdf_page for the preview pane.
    """
    result = chain({"question": query, 'chat_history': chat_history}, return_only_outputs=True)
    answer = result["answer"]
    # if you want history uncomment the line below
    # st.session_state.chat_history += [(query, answer)]
    # Read the page number directly from the Document's metadata instead of
    # the original list(doc)[1][1] hack, which depended on pydantic's
    # field-iteration order (page_content first, metadata second).
    st.session_state.pdf_page = result['source_documents'][0].metadata['page']

    return answer
122
def render_page_file(file, page):
    """
    Render one page of a fitz (PyMuPDF) document as a PIL Image.

    Falls back to the first page — and resets the stored page index — when
    the requested index is invalid.

    Parameters:
        file: an open fitz Document.
        page: 0-based page index to render.

    Returns:
        A PIL.Image of the page rendered at 300 DPI.
    """
    try:
        pdf_page = file[page]
    except (IndexError, ValueError, TypeError):
        # Narrowed from the original bare `except:` (its own TODO): PyMuPDF
        # raises IndexError/ValueError for an out-of-range page, TypeError
        # for a non-integer index. A bare except would also swallow
        # KeyboardInterrupt and real bugs.
        pdf_page = file[0]
        st.session_state.pdf_page = 0

    # Render at 300 DPI (PDF native resolution is 72 DPI, hence 300/72).
    pix = pdf_page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))
    return Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
138
# Single-PDF uploader; changing the selection resets all chat state.
uploaded_file = st.file_uploader(
    "Upload your PDF",
    type=["pdf"],
    accept_multiple_files=False,
    on_change=update_state,
)
142
def app():
    """
    Main Streamlit entry point.

    Left column: question form plus the chat transcript (newest first).
    Right column: image of the PDF page the last answer came from.
    """
    if not uploaded_file:
        return

    with st.spinner('Loading LLM...'):
        llm = load_llm()
    with st.spinner('Loading PDF...'):
        pdf_vector, pdf_file = load_pdf(uploaded_file)
    with st.spinner('Generating chain...'):
        chain = generate_chain(pdf_vector, llm)

    col1, col2 = st.columns(2)

    with col1:
        # Question form; clear_on_submit empties the box after each ask.
        with st.form(key='question_form', clear_on_submit=True):
            question = st.text_input('Enter your question:', value="", key='text_value')
            submit_question = st.form_submit_button(label="Enter")

        if submit_question:
            with st.spinner('Getting answer...'):
                answer = get_answer(chain, question, st.session_state.chat_history)
                st.session_state.past.append(question)
                st.session_state.generated.append(answer)

        # Replay the transcript, most recent exchange at the top.
        if st.session_state['generated']:
            for i in reversed(range(len(st.session_state['generated']))):
                message(st.session_state["generated"][i], is_user=False,
                        avatar_style="bottts", key=str(i))
                message(st.session_state['past'][i], is_user=True,
                        avatar_style="adventurer", key=str(i) + '_user')

    with col2:
        # Show the page the last answer was sourced from.
        if pdf_file:
            st.image(render_page_file(pdf_file, st.session_state.pdf_page))

if __name__ == "__main__":
    app()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ chromadb==0.4.6
2
+ ctransformers==0.2.22
3
+ faiss-cpu==1.7.4
4
+ langchain==0.0.266
5
+ PyMuPDF==1.22.5
6
+ pypdf==3.15.1
7
+ sentence-transformers==2.2.2
8
+ streamlit==1.25.0
9
+ streamlit-chat==0.1.1