ahsannawazch committed
Commit 28c1ebd · 1 Parent(s): 3e6bf20

Hugging Face deployment

Files changed (4):
  1. .dockerignore +10 -0
  2. Dockerfile +11 -0
  3. app.py +227 -0
  4. requirements.txt +0 -0
.dockerignore ADDED
@@ -0,0 +1,10 @@
+ venv/
+ .env
+ keys.txt
+ bill.pdf
+ cause_list.pdf
+ ploomber.py
+ pluto.py
+ app.ipynb
+ .git/
+ .gitignore
Dockerfile ADDED
@@ -0,0 +1,11 @@
+ FROM python:3.12.2
+
+ COPY . /app
+
+ WORKDIR /app
+
+ RUN pip install -r requirements.txt
+
+ EXPOSE 8000
+
+ CMD ["chainlit", "run", "app.py"]
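
A deployment note (a sketch, not part of this commit): depending on the Chainlit version, chainlit run may bind to localhost only, and Hugging Face Docker Spaces route traffic to port 7860 unless app_port is set in the Space's README metadata. Assuming the Chainlit CLI's --host and --port flags, a CMD that listens on all interfaces on the exposed port would look like:

CMD ["chainlit", "run", "app.py", "--host", "0.0.0.0", "--port", "8000"]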
app.py ADDED
@@ -0,0 +1,227 @@
+ import os
+ import chainlit as cl
+ from langchain.schema.runnable.config import RunnableConfig
+ from chainlit.types import AskFileResponse
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter  # TODO: 1. implement semantic chunking  2. LlamaIndex document knowledge graph
+ #from langchain_openai import OpenAIEmbeddings
+ #from langchain_pinecone import PineconeVectorStore
+ #from langchain_openai import ChatOpenAI
+ from langchain_cohere import ChatCohere, CohereEmbeddings, CohereRagRetriever
+ from langchain_community.vectorstores import FAISS
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.runnables import RunnablePassthrough
+ from langchain.chains import create_history_aware_retriever, create_retrieval_chain
+ from langchain.prompts import ChatPromptTemplate
+ from langchain_core.prompts import MessagesPlaceholder
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+ from langchain_core.chat_history import BaseChatMessageHistory
+ from langchain_core.runnables.history import RunnableWithMessageHistory
+ from langchain_community.chat_message_histories import ChatMessageHistory
+
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ # OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+ # PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
+ # PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
+ COHERE_API_KEY = os.getenv("COHERE_API_KEY")
+
+ # Loading the PDF
+ def file_loader(file: AskFileResponse):
+     loader = PyPDFLoader(file.path)
+     pages = loader.load_and_split()
+     return pages
+
+ # Splitting the docs
+ def doc_splitter(pages):
+     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=70)  # play with these values
+     chunks = splitter.split_documents(pages)
+
+     for i, doc in enumerate(chunks):
+         doc.metadata["source"] = f"source_{i}"
+
+     return chunks
+
+ # Storing Embeddings
+ def store_embeddings(chunks):
+     embeddings = CohereEmbeddings()
+     vectorstore = FAISS.from_documents(chunks, embeddings)
+     return vectorstore
+
+
+ # If the data is already in Pinecone, don't add duplicate content. Check later:
+ # How to clear an index and add new data to it?
+ # How to append data to the same index?
+ # Should I add multiple books to the same index?
+ # (See the FAISS persistence sketch after this file for one way to avoid re-embedding.)
+
+ # Model
+ model = ChatCohere(cohere_api_key=COHERE_API_KEY)
+
+
+ @cl.on_chat_start
+ async def on_start_chat():
+     elements = [
+         cl.Image(name="image1", display="inline", path="llama.jpg")
+     ]
+     await cl.Message(content="Hello! How can I be of assistance?", elements=elements).send()
+
+     files = None
+
+     # Wait for the user to upload a file
+     while files is None:
+         files = await cl.AskFileMessage(
+             content="Please upload a PDF file to begin!\n"
+                     "Processing the file may take a few moments or minutes to complete.",
+             accept=["text/plain", "application/pdf"],
+             max_size_mb=100,
+             timeout=180,
+         ).send()
+
+     file = files[0]
+
+     msg = cl.Message(content=f"Processing `{file.name}`...", disable_feedback=True)
+     await msg.send()
+
+     # Process the file and return pages
+     pages = file_loader(file)
+
+     # Split pages into chunks
+     chunks = doc_splitter(pages)
+
+     # Store embeddings
+     vectordb = store_embeddings(chunks)
+
+     # Set vectorstore as retriever
+     retriever = vectordb.as_retriever()  # Play with top_k and returning source docs later
+
+     msg.content = f"Creating embeddings for `{file.name}`..."
+     await msg.update()
+
+
+     #model = ChatOpenAI(model="gpt-3.5-turbo")
+
+     contextualize_query_system_message = """Given a chat history and the latest user question \
+     which might reference context in the chat history, formulate a standalone question \
+     which can be understood without the chat history. Do NOT answer the question, \
+     just reformulate it if needed and otherwise return it as is."""
+     contextualize_query_prompt = ChatPromptTemplate.from_messages(
+         [
+             ("system", contextualize_query_system_message),
+             MessagesPlaceholder("chat_history"),
+             ("human", "{input}")
+         ]
+     )
+     history_aware_retriever = create_history_aware_retriever(model, retriever, contextualize_query_prompt)
+
+     qa_system_message = """You are an assistant for question-answering tasks. \
+     Use the following pieces of retrieved context to answer the question. \
+     If you don't know the answer, just say that you don't know. \
+     Use three sentences maximum and keep the answer concise.
+
+     {context}"""
+     qa_prompt = ChatPromptTemplate.from_messages(
+         [
+             ("system", qa_system_message),
+             MessagesPlaceholder("chat_history"),
+             ("human", "{input}")
+         ]
+     )
+     question_answer_chain = create_stuff_documents_chain(llm=model, prompt=qa_prompt)
+
+     rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
+
+     # Statefully tracking history
+     store = {}
+
+     def get_session_history(session_id: str) -> BaseChatMessageHistory:
+         if session_id not in store:
+             store[session_id] = ChatMessageHistory()
+         return store[session_id]
+
+     conversational_rag_chain = RunnableWithMessageHistory(
+         rag_chain,
+         get_session_history,
+         input_messages_key="input",
+         history_messages_key="chat_history",
+         output_messages_key="answer",
+     )
+
+     cl.user_session.set("conversational_rag_chain", conversational_rag_chain)  # Might need to change the quoted key to "chain" later
+
+     msg.content = f"`{file.name}` processed. You can now ask questions!"
+     await msg.update()
+
+
+ ##########################
+
+ @cl.on_message
+ async def on_message(message: cl.Message):
+
+     conversational_rag_chain = cl.user_session.get("conversational_rag_chain")
+
+     #msg = cl.Message(content="")
+
+     # conversational_rag_chain.invoke(
+     #     {"input": "Who is Ibn e Khaldoon?"},
+     #     config={
+     #         "configurable": {"session_id": "abc123"}
+     #     },  # constructs a key "abc123" in `store`.
+     # )["answer"]
+
+     response = await conversational_rag_chain.ainvoke(
+         {"input": message.content},
+         config={"configurable": {"session_id": "abc123"},
+                 "callbacks": [cl.AsyncLangchainCallbackHandler()]},
+     )
+     answer = response["answer"]
+
+     source_documents = response["context"]
+     text_elements = []
+     unique_pages = set()
+
+     if source_documents:
+         for source_idx, source_doc in enumerate(source_documents):
+             source_name = f"source_{source_idx+1}"  # unused; page labels are used as element names instead
+             page_number = source_doc.metadata['page']
+             #page_number = source_doc.metadata.get('page', "NA")  # "NA" or any default value
+             page = f"Page {page_number}"
+             text_element_content = source_doc.page_content
+             if page not in unique_pages:
+                 unique_pages.add(page)
+                 text_elements.append(cl.Text(content=text_element_content, name=page))
+         source_names = [text_el.name for text_el in text_elements]
+
+         if source_names:
+             answer += f"\n\nSources: {', '.join(source_names)}"
+         else:
+             answer += "\n\nNo sources found"
+
+     await cl.Message(content=answer, elements=text_elements).send()
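
Two follow-ups on the open questions in app.py, sketched under stated assumptions rather than taken from this commit.

First, the comments above store_embeddings ask how to avoid re-adding the same data. With the FAISS vectorstore used here, one option is to persist the index to disk and reload it on later runs; the folder name index_dir is illustrative:

import os
from langchain_cohere import CohereEmbeddings
from langchain_community.vectorstores import FAISS

def load_or_build_vectorstore(chunks, index_dir="faiss_index"):
    embeddings = CohereEmbeddings()
    if os.path.isdir(index_dir):
        # Reuse the saved index instead of re-embedding.
        # Recent langchain versions require allow_dangerous_deserialization
        # because FAISS indexes are pickled; only load indexes you created.
        return FAISS.load_local(index_dir, embeddings, allow_dangerous_deserialization=True)
    vectorstore = FAISS.from_documents(chunks, embeddings)
    vectorstore.save_local(index_dir)
    return vectorstore

Second, on_message passes the hard-coded session_id "abc123", so every visitor to the deployed Space would share one chat history. Chainlit exposes a per-connection id in the user session; assuming that "id" key, the call could become:

session_id = cl.user_session.get("id")  # unique per Chainlit session (assumed key)
response = await conversational_rag_chain.ainvoke(
    {"input": message.content},
    config={"configurable": {"session_id": session_id},
            "callbacks": [cl.AsyncLangchainCallbackHandler()]},
)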
requirements.txt ADDED
Binary file (11.3 kB).