Mehmet Emin Aydin committed on
Commit
afb7b3e
·
unverified ·
1 Parent(s): 0cf5ec1

File management avoided

Browse files
Files changed (1) hide show
  1. app.py +59 -106
app.py CHANGED
@@ -7,7 +7,6 @@ import signal
7
  import os
8
  import PyPDF2
9
  from docx import Document
10
- from fastapi import UploadFile, FastAPI, File, Form, UploadFile, HTTPException
11
  from langchain.text_splitter import CharacterTextSplitter
12
  from langchain_community.embeddings import HuggingFaceEmbeddings
13
  from langchain_community.vectorstores import FAISS
@@ -16,119 +15,96 @@ import pickle
16
  from datetime import datetime
17
  import io
18
  from dotenv import load_dotenv
 
 
 
 
19
  class User:
20
  def __init__(self, username):
21
  self.username = username
22
  self.llm = "gemini-pro"
23
  self.embedder = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
24
 
25
- async def upload_documents(user: User, files: list[UploadFile]) -> tuple[str, int]:
26
- text = await _extract_text_from_document(files)
27
- chunks = await _chunk_text(text)
28
- pkl_name, status_code = await _create_embeddings_and_save(user, chunks)
 
29
  if status_code == 200:
30
  return "Document uploaded successfully.", 200
31
  else:
32
  return "Failed to upload document.", 500
33
 
34
- async def _extract_text_from_document(files: list[UploadFile]) -> str:
 
35
  text = ""
36
  for file in files:
37
- byte_object = await file.read()
38
- file_name = file.filename
39
  file_extension = os.path.splitext(file_name)[1]
40
  if file_extension == '.txt':
41
- text += byte_object.decode('utf-8')
42
  elif file_extension == '.pdf':
43
- pdf_reader = PyPDF2.PdfReader(io.BytesIO(byte_object))
44
  for page_number in range(len(pdf_reader.pages)):
45
  page = pdf_reader.pages[page_number]
46
  text += page.extract_text()
47
  elif file_extension == '.docx':
48
- doc = Document(io.BytesIO(byte_object))
49
  for paragraph in doc.paragraphs:
50
  text += paragraph.text + "\n"
51
  return text
52
 
53
- async def _chunk_text(text: str) -> list[str]:
54
- chunks = None
55
  text_splitter = CharacterTextSplitter(
56
  separator="\n",
57
  chunk_size=512,
58
  chunk_overlap=10,
59
  length_function=len
60
  )
61
- chunks = text_splitter.split_text(text)
62
- return chunks
63
 
64
- async def _create_embeddings_and_save(user: User, chunks: any) -> FAISS:
 
65
  embeddings = HuggingFaceEmbeddings(model_name=user.embedder)
66
- pkl_name = os.path.join(user.username + ".pkl")
67
- vector_store = FAISS.from_texts(chunks, embeddings, metadatas=[{"source": f"{pkl_name}:{i}"} for i in range(len(chunks))])
68
- with open(pkl_name, "wb") as f:
69
- pickle.dump(vector_store, f)
70
- return vector_store
71
-
72
- async def ask_question(user: User, question: str, api_key: str) -> tuple[str, int]:
73
- username = user.username
74
- vector_store = await _get_vector_file(username)
75
- if vector_store is None:
76
- return "Document not found.", 400
77
-
78
- if api_key is not None:
79
  os.environ["GOOGLE_API_KEY"] = api_key
80
  else:
81
  is_loaded = load_dotenv()
82
- if is_loaded == False:
83
  return "API key not found.", 400
84
-
85
- llm = ChatGoogleGenerativeAI(model=user.llm, temperature=0, max_output_tokens=256, top_k = 40, top_p = 0.8)
86
  docs = vector_store.similarity_search(question)
87
  retrieved_chunks = docs[0].page_content + docs[1].page_content + docs[2].page_content
88
- system_message="Figure out the answer of the question by the given information pieces. ALWAYS answer with the language of the question."
89
  prompt = system_message + "Question: " + question + " Context: " + retrieved_chunks
90
  try:
91
  response = llm.invoke(prompt)
92
  except Exception:
93
  return "Wrong API key.", 400
 
94
  answer = response.content + " **<Most Related Chunk>** " + retrieved_chunks
95
- await _log(user, question, system_message, retrieved_chunks, response.content)
96
  return answer, 200
97
 
98
- async def _get_vector_file(username: str)-> any:
99
- with open(username+".pkl", "rb") as f:
100
- vector_store = pickle.load(f)
101
- return vector_store
102
 
103
- async def _log(user: User, question: str, system_message: str, retrieved_chunks: str, answer: str):
104
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
105
  log_message = (
106
  f"{timestamp}, Username: {user.username}, Question: {question}, "
107
  f"LLM: {user.llm}, Embedder: {user.embedder}, System Message: {system_message}, "
108
  f"Retrieved Texts: {retrieved_chunks}, Answer: {answer}\n"
109
  )
110
- with open("log.txt", "a", encoding="utf-8") as file:
111
- file.write(log_message)
112
-
113
-
114
- app = FastAPI()
115
- @app.post("/document-uploader")
116
- async def document_uploader(username: str = Form(...), files: list[UploadFile] = File(...)):
117
- user = User(username=username)
118
- response, status_code = await upload_documents(user, files)
119
- if status_code == 200:
120
- return {response}
121
- else:
122
- raise HTTPException(status_code=status_code, detail=response)
123
-
124
- @app.post("/question-answerer")
125
- async def question_answerer(username: str = Form(...), question: str = Form(...), api_key = File(None)):
126
- user = User(username=username)
127
- response, status_code = await ask_question(user, question, api_key)
128
- if status_code == 200:
129
- return {response}
130
- else:
131
- raise HTTPException(status_code=status_code, detail=response)
132
 
133
 
134
  def main():
@@ -136,76 +112,53 @@ def main():
136
 
137
  tabs = ["Upload Document", "Ask Question"]
138
  active_tab = st.radio("Upload documents first, ask questions later:", tabs)
139
-
140
  if active_tab == "Upload Document":
141
  upload_document()
142
  elif active_tab == "Ask Question":
143
- ask_question()
 
144
 
145
  def upload_document():
146
  st.write("Several files can be uploaded, each upload crushes the old one. Depending on the number and size of files, the upload process may take a long time.")
147
 
148
  username = st.text_input("Enter a username (just something that represents you):")
149
- uploaded_files = st.file_uploader("Upload your documents (for now it only works with files that has .txt, .pdf or .docx extension):", accept_multiple_files=True)
150
 
151
- if uploaded_files:
152
  st.write("Number of uploaded files:", len(uploaded_files))
153
-
154
  for uploaded_file in uploaded_files:
155
  file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type, "FileSize": uploaded_file.size}
156
  st.write(file_details)
157
-
158
- files = [("files", (uploaded_file.name, uploaded_file, uploaded_file.type)) for uploaded_file in uploaded_files]
159
-
160
- payload = {'username': username}
161
-
162
- with st.spinner('Loading...'):
163
- response = requests.post("http://localhost:8000/document-uploader/", files=files, data=payload)
164
-
165
- if response.status_code == 200:
166
- st.success(response.text)
167
  else:
168
- st.error("Error:", response.text)
169
 
170
 
171
- def ask_question():
172
- username = st.text_input("Enter a username (just something that represents you):")
173
  api_key = st.text_input("Add your Google API key. It is free. Key acquisition video: [https://www.youtube.com/watch?v=brCkpzAD0gc]: (If you do not trust you can download and use the app in your local too)", type="password")
174
- question = st.text_area("Enter the question you want to ask in your document (the more detailed your question, the more accurate an answer you will get): ")
175
-
176
  if st.button("Ask"):
177
  if not question:
178
  st.warning("Please enter a question.")
179
  elif not username:
180
  st.warning("Please enter a username.")
181
  else:
182
- payload = {'username': username, 'question': question, 'api_key': api_key}
183
-
184
- with st.spinner('Question is getting answered...'):
185
- response = requests.post("http://localhost:8000/question-answerer/", data=payload)
186
 
187
- if response.status_code == 200:
188
- st.success("Answer: " + response.text)
189
  else:
190
- print(response)
191
- st.error("Error:", response.text)
192
-
193
- uvicorn_process = None
194
-
195
- def run_fastapi():
196
- global uvicorn_process
197
- if uvicorn_process is None:
198
- uvicorn_process = subprocess.Popen(["uvicorn", "app:app", "--host", "127.0.0.1", "--port", "8000"])
199
- print("FastAPI server has been started.")
200
-
201
- def cleanup():
202
- global uvicorn_process
203
- if uvicorn_process:
204
- os.kill(uvicorn_process.pid, signal.SIGTERM)
205
- uvicorn_process.wait()
206
- print("FastAPI server has been closed.")
207
 
208
  if __name__ == "__main__":
209
- run_fastapi()
210
- atexit.register(cleanup)
211
  main()
 
7
  import os
8
  import PyPDF2
9
  from docx import Document
 
10
  from langchain.text_splitter import CharacterTextSplitter
11
  from langchain_community.embeddings import HuggingFaceEmbeddings
12
  from langchain_community.vectorstores import FAISS
 
15
  from datetime import datetime
16
  import io
17
  from dotenv import load_dotenv
18
+
19
# In-memory list of formatted log lines; _log() appends one entry per answered question.
log_data = []
20
+
21
+
22
class User:
    """Settings attached to one user of the question-answering app.

    Attributes:
        username: Name the user typed into the UI.
        llm: Identifier of the chat model used to answer questions.
        embedder: Identifier of the sentence-embedding model used to index
            and search the uploaded documents.
    """

    DEFAULT_LLM = "gemini-pro"
    DEFAULT_EMBEDDER = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

    def __init__(self, username, llm=DEFAULT_LLM, embedder=DEFAULT_EMBEDDER):
        """Store the user name and the model identifiers.

        Args:
            username: Name the user typed into the UI.
            llm: Chat model identifier; defaults to ``"gemini-pro"``.
            embedder: Embedding model identifier; defaults to the
                multilingual MiniLM sentence-transformers model.
        """
        self.username = username
        self.llm = llm
        self.embedder = embedder
27
 
28
+
29
def upload_documents(user: User, files) -> tuple[str, int]:
    """Extract, chunk and index the uploaded files for ``user``.

    Args:
        user: Owner of the upload; forwarded to the embedding step.
        files: Iterable of uploaded file objects, forwarded unchanged to
            ``_extract_text_from_document``.

    Returns:
        A ``(message, status_code)`` pair: status ``200`` with a success
        message, or ``500`` with a failure message.
    """
    text = _extract_text_from_document(files)
    chunks = _chunk_text(text)

    # No chunks means none of the files yielded any text (unsupported
    # extension, empty file, ...). There is nothing to embed, so report the
    # failure instead of handing an empty list to the vector store builder.
    if not chunks:
        return "Failed to upload document.", 500

    status_code = _create_embeddings_and_save(user, chunks)
    if status_code == 200:
        return "Document uploaded successfully.", 200
    return "Failed to upload document.", 500
37
 
38
+
39
def _extract_text_from_document(files) -> str:
    """Concatenate the text content of every supported file in ``files``.

    Supported extensions are ``.txt`` (decoded as UTF-8), ``.pdf`` and
    ``.docx``; the extension is matched case-insensitively. Files with any
    other extension are skipped.

    Args:
        files: Iterable of file-like objects exposing a ``name`` attribute and
            a ``read()`` method that returns the file's bytes.

    Returns:
        The extracted text of all files, in input order. Each ``.docx``
        paragraph is followed by a newline; nothing else is inserted between
        pieces.

    Raises:
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
    """
    parts = []
    for file in files:
        # Lower-case so that "REPORT.PDF" is treated like "report.pdf".
        extension = os.path.splitext(file.name)[1].lower()
        if extension == '.txt':
            parts.append(file.read().decode('utf-8'))
        elif extension == '.pdf':
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file.read()))
            # Guard against a page yielding no text (e.g. scanned images).
            parts.extend(page.extract_text() or "" for page in pdf_reader.pages)
        elif extension == '.docx':
            doc = Document(io.BytesIO(file.read()))
            parts.extend(paragraph.text + "\n" for paragraph in doc.paragraphs)
    return "".join(parts)
56
 
57
+
58
def _chunk_text(text: str, chunk_size: int = 512, chunk_overlap: int = 10) -> list[str]:
    """Split ``text`` into newline-delimited chunks for embedding.

    Args:
        text: The full extracted document text.
        chunk_size: Target maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared between adjacent chunks.

    Returns:
        The list of chunks; empty when ``text`` is empty or only whitespace.
    """
    # Nothing to split: skip building the splitter altogether.
    if not text or not text.strip():
        return []

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)
 
66
 
67
+
68
def _create_embeddings_and_save(user: User, chunks: list[str]) -> int:
    """Embed ``chunks`` and keep the resulting FAISS index in the session.

    The index is stored as ``st.session_state.vector_store``. Each chunk gets
    a ``source`` metadata entry of the form ``"<username>:<chunk index>"``.

    Args:
        user: Supplies the embedding model name and the username used in the
            chunk metadata.
        chunks: Text chunks to index.

    Returns:
        ``200`` once the index is stored, ``500`` when ``chunks`` is empty and
        no index was built.
    """
    # An index cannot be built from zero texts; signal failure so the caller
    # can show its error message instead of crashing.
    if not chunks:
        return 500

    embeddings = HuggingFaceEmbeddings(model_name=user.embedder)
    metadatas = [{"source": f"{user.username}:{i}"} for i in range(len(chunks))]
    st.session_state.vector_store = FAISS.from_texts(chunks, embeddings, metadatas=metadatas)
    return 200
73
+
74
+
75
def ask_question(user: User, question: str, api_key: str, vector_store: FAISS) -> tuple[str, int]:
    """Answer ``question`` from the indexed documents using the user's LLM.

    The up-to-three most similar chunks are retrieved from ``vector_store``,
    appended to a fixed instruction and the question, and sent to the chat
    model. A successful exchange is recorded through ``_log``.

    Args:
        user: Supplies the chat model name and is passed on to the log.
        question: The question typed by the user.
        api_key: Google API key. When empty, the key is looked up in the
            environment after loading a ``.env`` file, if one exists.
        vector_store: Index built from the uploaded documents, or ``None``
            when nothing has been uploaded yet.

    Returns:
        A ``(message, status_code)`` pair. On success the message is the model
        answer followed by the retrieved chunks and the status is ``200``;
        otherwise the message describes the problem and the status is ``400``.
    """
    if vector_store is None:
        return "Document not found.", 400

    if api_key:
        os.environ["GOOGLE_API_KEY"] = api_key
    else:
        # load_dotenv() only tells whether a .env file contributed variables;
        # what matters is whether the key is actually available afterwards.
        load_dotenv()
        if not os.environ.get("GOOGLE_API_KEY"):
            return "API key not found.", 400

    llm = ChatGoogleGenerativeAI(model=user.llm, temperature=0, max_output_tokens=256, top_k=40, top_p=0.8)

    # Small documents may produce fewer than three chunks, so use whatever the
    # search returns instead of indexing docs[0..2] unconditionally.
    docs = vector_store.similarity_search(question, k=3)
    if not docs:
        return "Document not found.", 400
    retrieved_chunks = "".join(doc.page_content for doc in docs)

    system_message = "Figure out the answer of the question by the given information pieces. ALWAYS answer with the language of the question."
    prompt = system_message + "Question: " + question + " Context: " + retrieved_chunks
    try:
        response = llm.invoke(prompt)
    except Exception:
        # NOTE(review): every model failure (network, quota, ...) is reported
        # as a bad key; kept because callers display this message verbatim.
        return "Wrong API key.", 400

    answer = response.content + " **<Most Related Chunk>** " + retrieved_chunks
    _log(user, question, system_message, retrieved_chunks, response.content)
    return answer, 200
98
 
 
 
 
 
99
 
100
def _log(user: User, question: str, system_message: str, retrieved_chunks: str, answer: str):
    """Append one record of a question/answer exchange to ``log_data``.

    The record is a single comma-separated line, terminated by a newline,
    holding the current local timestamp, the username, the question, the
    model and embedder names, the system message, the retrieved text and the
    answer.
    """
    fields = (
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        f"Username: {user.username}",
        f"Question: {question}",
        f"LLM: {user.llm}",
        f"Embedder: {user.embedder}",
        f"System Message: {system_message}",
        f"Retrieved Texts: {retrieved_chunks}",
        f"Answer: {answer}",
    )
    log_data.append(", ".join(fields) + "\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
 
110
  def main():
 
112
 
113
  tabs = ["Upload Document", "Ask Question"]
114
  active_tab = st.radio("Upload documents first, ask questions later:", tabs)
115
+
116
  if active_tab == "Upload Document":
117
  upload_document()
118
  elif active_tab == "Ask Question":
119
+ ask_question_ui(st.session_state.vector_store)
120
+
121
 
122
def upload_document():
    """Render the "Upload Document" tab and index the uploaded files.

    Once both a username and at least one file are provided, the file details
    are listed, the files are passed to ``upload_documents`` and the outcome
    is shown as a success or error message.
    """
    st.write("Several files can be uploaded, each upload crushes the old one. Depending on the number and size of files, the upload process may take a long time.")

    username = st.text_input("Enter a username (just something that represents you):")
    uploaded_files = st.file_uploader("Upload your documents (for now it only works with files that have .txt, .pdf or .docx extension):", accept_multiple_files=True)

    if uploaded_files and username:
        st.write("Number of uploaded files:", len(uploaded_files))

        for uploaded_file in uploaded_files:
            file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type, "FileSize": uploaded_file.size}
            st.write(file_details)

        user = User(username=username)
        response, status_code = upload_documents(user, uploaded_files)

        if status_code == 200:
            st.success(response)
        else:
            # st.error takes a single message body, so build one string
            # rather than passing the detail as a second positional argument.
            st.error("Error: " + response)
    elif uploaded_files:
        # Same hint the question tab gives when the username is missing.
        st.warning("Please enter a username.")
142
 
143
 
144
def ask_question_ui(vector_store: FAISS):
    """Render the "Ask Question" tab.

    Collects a username, an API key and a question. When "Ask" is pressed and
    both question and username are filled in, the question is forwarded to
    ``ask_question`` together with ``vector_store`` and the result is shown as
    a success or error message.
    """
    username = st.text_input("Enter a username (just something that represents you):")
    api_key = st.text_input("Add your Google API key. It is free. Key acquisition video: [https://www.youtube.com/watch?v=brCkpzAD0gc]: (If you do not trust you can download and use the app in your local too)", type="password")
    question = st.text_area("Enter the question you want to ask in your document (the more detailed your question, the more accurate an answer you will get):")

    # Nothing happens until the button is pressed.
    if not st.button("Ask"):
        return

    # Validate inputs in the same order as before: question first, then name.
    if not question:
        st.warning("Please enter a question.")
        return
    if not username:
        st.warning("Please enter a username.")
        return

    answer, status_code = ask_question(User(username=username), question, api_key, vector_store)
    show = st.success if status_code == 200 else st.error
    prefix = "Answer: " if status_code == 200 else "Error: "
    show(prefix + answer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
# Script entry point: build the UI only when this file is executed directly.
if __name__ == "__main__":
    main()