Ritvik19 committed on
Commit
7e4014b
·
verified ·
1 Parent(s): 20c0b83

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +90 -210
  2. command_center.py +44 -0
  3. embed_documents.py +18 -0
  4. process_documents.py +141 -0
  5. requirements.txt +2 -2
app.py CHANGED
@@ -1,37 +1,57 @@
1
- import math
2
  import os
3
- import re
4
- from pathlib import Path
5
- from statistics import median
6
- import json
7
  import pandas as pd
8
- import streamlit as st
9
- from bs4 import BeautifulSoup
 
 
10
  from langchain.callbacks import get_openai_callback
11
  from langchain.chains import ConversationalRetrievalChain
12
- from langchain.docstore.document import Document
13
- from langchain.document_loaders import PDFMinerPDFasHTMLLoader, WebBaseLoader
14
- from langchain.retrievers.multi_query import MultiQueryRetriever
15
  from langchain_openai import ChatOpenAI
16
- from ragatouille import RAGPretrainedModel
17
 
18
  st.set_page_config(layout="wide")
19
  os.environ["OPENAI_API_KEY"] = "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"
20
 
21
- LOCAL_VECTOR_STORE_DIR = Path(__file__).resolve().parent.joinpath("vector_store")
22
-
23
- deep_strip = lambda text: re.sub(r"\s+", " ", text or "").strip()
24
-
25
  get_references = lambda relevant_docs: " ".join(
26
  [f"[{ref}]" for ref in sorted([ref.metadata["chunk_id"] for ref in relevant_docs])]
27
  )
28
  session_state_2_llm_chat_history = lambda session_state: [
29
- ss[:2] for ss in session_state
30
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
 
33
- def get_conversation_history():
34
- return json.dumps(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  {
36
  "document_urls": (
37
  st.session_state.source_doc_urls
@@ -39,7 +59,7 @@ def get_conversation_history():
39
  else []
40
  ),
41
  "document_snippets": (
42
- st.session_state.headers.to_list()
43
  if "headers" in st.session_state
44
  else []
45
  ),
@@ -60,38 +80,28 @@ def get_conversation_history():
60
  ),
61
  }
62
  )
63
-
64
-
65
- ai_message_format = lambda message, references: f"{message}\n\n---\n\n{references}"
66
-
67
-
68
- def embeddings_on_local_vectordb(texts):
69
- colbert = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv1.9")
70
- colbert.index(
71
- collection=[chunk.page_content for chunk in texts],
72
- split_documents=False,
73
- document_metadatas=[chunk.metadata for chunk in texts],
74
- index_name="vector_store",
75
- )
76
- retriever = colbert.as_langchain_retriever(k=5)
77
- retriever = MultiQueryRetriever.from_llm(
78
- retriever=retriever, llm=ChatOpenAI(temperature=0)
79
  )
80
- return retriever
81
 
82
 
83
- def query_llm(retriever, query):
 
84
  qa_chain = ConversationalRetrievalChain.from_llm(
85
  llm=ChatOpenAI(model="gpt-4-0125-preview", temperature=0),
86
  retriever=retriever,
87
  return_source_documents=True,
88
  chain_type="stuff",
89
  )
90
- relevant_docs = retriever.get_relevant_documents(query)
91
  with get_openai_callback() as cb:
92
  result = qa_chain(
93
  {
94
- "question": query,
95
  "chat_history": session_state_2_llm_chat_history(
96
  st.session_state.messages
97
  ),
@@ -100,192 +110,62 @@ def query_llm(retriever, query):
100
  stats = cb
101
  result = result["answer"]
102
  references = get_references(relevant_docs)
103
- st.session_state.messages.append((query, result, references))
104
- return result, references, stats
105
-
106
-
107
- def input_fields():
108
- st.session_state.source_doc_urls = [
109
- url.strip()
110
- for url in st.sidebar.text_area(
111
- "Source Document URLs\n(New line separated)", height=50
112
- ).split("\n")
113
- ]
114
-
115
-
116
- def process_documents():
117
- try:
118
- snippets = []
119
- for url in st.session_state.source_doc_urls:
120
- if url.endswith(".pdf"):
121
- snippets.extend(process_pdf(url))
122
- else:
123
- snippets.extend(process_web(url))
124
- st.session_state.retriever = embeddings_on_local_vectordb(snippets)
125
- st.session_state.headers = pd.Series(
126
- [snip.metadata["header"] for snip in snippets], name="references"
127
- )
128
- except Exception as e:
129
- st.error(f"An error occurred: {e}")
130
-
131
-
132
- def process_pdf(url):
133
- data = PDFMinerPDFasHTMLLoader(url).load()[0]
134
- content = BeautifulSoup(data.page_content, "html.parser").find_all("div")
135
- snippets = get_pdf_snippets(content)
136
- filtered_snippets = filter_pdf_snippets(snippets, new_line_threshold_ratio=0.4)
137
- median_font_size = math.ceil(
138
- median([font_size for _, font_size in filtered_snippets])
139
- )
140
- semantic_snippets = get_pdf_semantic_snippets(filtered_snippets, median_font_size)
141
- document_snippets = [
142
- Document(
143
- page_content=deep_strip(snip[1]["header_text"]) + " " + deep_strip(snip[0]),
144
- metadata={
145
- "header": " ".join(snip[1]["header_text"].split()[:10]),
146
- "source_url": url,
147
- "source_type": "pdf",
148
- "chunk_id": i,
149
- },
150
- )
151
- for i, snip in enumerate(semantic_snippets)
152
- ]
153
- return document_snippets
154
-
155
-
156
- def get_pdf_snippets(content):
157
- current_font_size = None
158
- current_text = ""
159
- snippets = []
160
- for cntnt in content:
161
- span = cntnt.find("span")
162
- if not span:
163
- continue
164
- style = span.get("style")
165
- if not style:
166
- continue
167
- font_size = re.findall("font-size:(\d+)px", style)
168
- if not font_size:
169
- continue
170
- font_size = int(font_size[0])
171
-
172
- if not current_font_size:
173
- current_font_size = font_size
174
- if font_size == current_font_size:
175
- current_text += cntnt.text
176
- else:
177
- snippets.append((current_text, current_font_size))
178
- current_font_size = font_size
179
- current_text = cntnt.text
180
- snippets.append((current_text, current_font_size))
181
- return snippets
182
-
183
-
184
- def filter_pdf_snippets(content_list, new_line_threshold_ratio):
185
- filtered_list = []
186
- for e, (content, font_size) in enumerate(content_list):
187
- newline_count = content.count("\n")
188
- total_chars = len(content)
189
- ratio = newline_count / total_chars
190
- if ratio <= new_line_threshold_ratio:
191
- filtered_list.append((content, font_size))
192
- return filtered_list
193
-
194
-
195
- def get_pdf_semantic_snippets(filtered_snippets, median_font_size):
196
- semantic_snippets = []
197
- current_header = None
198
- current_content = []
199
- header_font_size = None
200
- content_font_sizes = []
201
-
202
- for content, font_size in filtered_snippets:
203
- if font_size > median_font_size:
204
- if current_header is not None:
205
- metadata = {
206
- "header_font_size": header_font_size,
207
- "content_font_size": (
208
- median(content_font_sizes) if content_font_sizes else None
209
- ),
210
- "header_text": current_header,
211
- }
212
- semantic_snippets.append((current_content, metadata))
213
- current_content = []
214
- content_font_sizes = []
215
-
216
- current_header = content
217
- header_font_size = font_size
218
- else:
219
- content_font_sizes.append(font_size)
220
- if current_content:
221
- current_content += " " + content
222
- else:
223
- current_content = content
224
-
225
- if current_header is not None:
226
- metadata = {
227
- "header_font_size": header_font_size,
228
- "content_font_size": (
229
- median(content_font_sizes) if content_font_sizes else None
230
- ),
231
- "header_text": current_header,
232
  }
233
- semantic_snippets.append((current_content, metadata))
234
- return semantic_snippets
235
-
236
-
237
- def process_web(url):
238
- data = WebBaseLoader(url).load()[0]
239
- document_snippets = [
240
- Document(
241
- page_content=deep_strip(data.page_content),
242
- metadata={
243
- "header": data.metadata["title"],
244
- "source_url": url,
245
- "source_type": "web",
246
- },
247
- )
248
- ]
249
- return document_snippets
250
 
251
 
252
- def boot():
253
  st.title("Agent Xi - An ArXiv Chatbot")
254
- st.sidebar.title("Input Documents")
255
- input_fields()
256
- st.sidebar.button("Submit Documents", on_click=process_documents)
257
- if "headers" in st.session_state:
258
- st.sidebar.write("### References")
259
- st.sidebar.write(st.session_state.headers)
260
  if "costing" not in st.session_state:
261
  st.session_state.costing = []
262
  if "messages" not in st.session_state:
263
  st.session_state.messages = []
264
-
265
  for message in st.session_state.messages:
266
  st.chat_message("human").write(message[0])
267
  st.chat_message("ai").write(ai_message_format(message[1], message[2]))
268
  if query := st.chat_input():
269
  st.chat_message("human").write(query)
270
- response, references, stats = query_llm(st.session_state.retriever, query)
271
- st.chat_message("ai").write(ai_message_format(response, references))
 
 
 
 
 
 
 
272
 
273
- st.session_state.costing.append(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  {
275
- "prompt tokens": stats.prompt_tokens,
276
- "completion tokens": stats.completion_tokens,
277
- "cost": stats.total_cost,
278
  }
279
  )
280
- stats_df = pd.DataFrame(st.session_state.costing)
281
- stats_df.loc["total"] = stats_df.sum()
282
- st.sidebar.write(stats_df)
283
- st.sidebar.download_button(
284
- "Download Conversation",
285
- get_conversation_history(),
286
- "conversation.json",
287
  )
288
-
289
-
290
- if __name__ == "__main__":
291
- boot()
 
 
 
1
+ import streamlit as st
2
  import os
 
 
 
 
3
  import pandas as pd
4
+ from command_center import CommandCenter
5
+ from process_documents import process_documents
6
+ from embed_documents import create_retriever
7
+ import json
8
  from langchain.callbacks import get_openai_callback
9
  from langchain.chains import ConversationalRetrievalChain
 
 
 
10
  from langchain_openai import ChatOpenAI
 
11
 
12
st.set_page_config(layout="wide")
# SECURITY: the OpenAI key must be supplied by the deployment environment
# (for example a Space secret exposed as an environment variable). A key that
# is committed to the repository is public and has to be revoked and rotated.
if not os.environ.get("OPENAI_API_KEY"):
    st.error(
        "OPENAI_API_KEY is not set. "
        "Provide it as an environment variable or Space secret."
    )
    # Halt this script run: every LLM call below would fail without a key.
    st.stop()
14
 
 
 
 
 
15
def get_references(relevant_docs):
    """Render the chunk ids of ``relevant_docs`` as ``"[1] [2] ..."``.

    Each document must expose ``metadata["chunk_id"]``; ids are emitted in
    ascending order, separated by single spaces.
    """
    chunk_ids = sorted(doc.metadata["chunk_id"] for doc in relevant_docs)
    return " ".join(f"[{chunk_id}]" for chunk_id in chunk_ids)


def session_state_2_llm_chat_history(session_state):
    """Convert stored chat entries into the history passed to the LLM chain.

    Entries are sequences whose first two items are the user input and the
    reply. Entries whose input starts with ``/`` are slash commands and are
    left out of the history.
    """
    history = []
    for entry in session_state:
        if entry[0].startswith("/"):
            continue
        history.append(entry[:2])
    return history


def ai_message_format(message, references):
    """Return ``message``, followed by a rule and ``references`` when present."""
    if references == "":
        return message
    return f"{message}\n\n---\n\n{references}"
24
+
25
+
26
def process_documents_wrapper(inputs):
    """Handle ``/upload``: process the given URLs and build the retriever.

    Args:
        inputs: list of URL strings parsed from the chat command.

    Returns:
        The status message that is also recorded in the chat history.

    Side effects:
        On success stores ``retriever``, ``source_doc_urls`` and ``index``
        (the snippet headers) in ``st.session_state``. Always appends one
        entry to ``st.session_state.messages``.
    """
    # Repeated spaces in the chat input produce empty tokens; drop them so
    # they are not handed to the document loaders as URLs.
    urls = [url.strip() for url in inputs if url.strip()]
    if not urls:
        response = "No document URLs provided. Usage: /upload <url> [<url> ...]"
        st.session_state.messages.append((f"/upload {inputs}", response, ""))
        return response

    snippets = process_documents(urls)
    st.session_state.retriever = create_retriever(snippets)
    st.session_state.source_doc_urls = urls
    st.session_state.index = [snip.metadata["header"] for snip in snippets]
    response = f"Uploaded and processed documents {urls}"
    st.session_state.messages.append((f"/upload {urls}", response, ""))
    return response
34
 
35
 
36
def index_documents_wrapper(inputs=None):
    """Handle ``/index``: list the headers of the processed snippets.

    Args:
        inputs: unused; accepted so every command handler shares one call shape.

    Returns:
        A markdown table of snippet headers, or a notice when no documents
        have been processed in this session. The same text is appended to
        ``st.session_state.messages``.
    """
    # ``index`` is only written by the upload handler, so it is absent until
    # the first successful upload.
    if "index" in st.session_state:
        response = pd.Series(st.session_state.index, name="references").to_markdown()
    else:
        response = "No documents processed yet"
    st.session_state.messages.append(("/index", response, ""))
    return response
40
+
41
+
42
def calculate_cost_wrapper(inputs=None):
    """Handle ``/cost``: tabulate the recorded token usage and cost.

    Args:
        inputs: unused; accepted so every command handler shares one call shape.

    Returns:
        A markdown table with one row per recorded entry plus a ``total``
        row, or a fallback notice. The same text is appended to
        ``st.session_state.messages``.
    """
    try:
        cost_table = pd.DataFrame(st.session_state.costing)
        cost_table.loc["total"] = cost_table.sum()
        response = cost_table.to_markdown()
    except ValueError:
        # NOTE(review): presumably raised when no costs have been recorded
        # yet and the frame is empty - confirm against pandas behaviour.
        response = "No costing incurred yet"
    chat_entry = ("/cost", response, "")
    st.session_state.messages.append(chat_entry)
    return response
51
+
52
+
53
+ def download_conversation_wrapper(inputs=None):
54
+ conversation_data = json.dumps(
55
  {
56
  "document_urls": (
57
  st.session_state.source_doc_urls
 
59
  else []
60
  ),
61
  "document_snippets": (
62
+ st.session_state.index.to_list()
63
  if "headers" in st.session_state
64
  else []
65
  ),
 
80
  ),
81
  }
82
  )
83
+ st.sidebar.download_button(
84
+ "Download Conversation",
85
+ conversation_data,
86
+ file_name="conversation_data.json",
87
+ mime="application/json",
 
 
 
 
 
 
 
 
 
 
 
88
  )
89
+ st.session_state.messages.append(("/download", "Conversation data downloaded", ""))
90
 
91
 
92
+ def query_llm_wrapper(inputs):
93
+ retriever = st.session_state.retriever
94
  qa_chain = ConversationalRetrievalChain.from_llm(
95
  llm=ChatOpenAI(model="gpt-4-0125-preview", temperature=0),
96
  retriever=retriever,
97
  return_source_documents=True,
98
  chain_type="stuff",
99
  )
100
+ relevant_docs = retriever.get_relevant_documents(inputs)
101
  with get_openai_callback() as cb:
102
  result = qa_chain(
103
  {
104
+ "question": inputs,
105
  "chat_history": session_state_2_llm_chat_history(
106
  st.session_state.messages
107
  ),
 
110
  stats = cb
111
  result = result["answer"]
112
  references = get_references(relevant_docs)
113
+ st.session_state.messages.append((inputs, result, references))
114
+ st.session_state.costing.append(
115
+ {
116
+ "prompt tokens": stats.prompt_tokens,
117
+ "completion tokens": stats.completion_tokens,
118
+ "cost": stats.total_cost,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  }
120
+ )
121
+ return result, references
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
 
124
def boot(command_center):
    """Render the chat page and dispatch one user input per script run.

    Args:
        command_center: object whose ``execute_command(text)`` returns
            ``None`` (nothing to show), a ``(result, references)`` tuple, or
            any other value that can be written to the chat directly.
    """
    st.title("Agent Xi - An ArXiv Chatbot")
    if "costing" not in st.session_state:
        st.session_state.costing = []
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Replay the stored conversation; each entry is (input, reply, references).
    for message in st.session_state.messages:
        st.chat_message("human").write(message[0])
        st.chat_message("ai").write(ai_message_format(message[1], message[2]))

    if query := st.chat_input():
        st.chat_message("human").write(query)
        try:
            response = command_center.execute_command(query)
        except Exception as error:  # UI boundary: report instead of crashing the page
            st.error(f"An error occurred: {error}")
            return
        if response is None:
            return
        if isinstance(response, tuple):
            result, references = response
            st.chat_message("ai").write(ai_message_format(result, references))
        else:
            st.chat_message("ai").write(response)
143
+
144
 
145
if __name__ == "__main__":
    # Each entry: (command, argument type, handler, description for the menu).
    all_commands = [
        ("/upload", list, process_documents_wrapper, "Upload and process documents"),
        ("/index", None, index_documents_wrapper, "View index of processed documents"),
        ("/cost", None, calculate_cost_wrapper, "Calculate cost of conversation"),
        (
            "/download",
            None,
            download_conversation_wrapper,
            "Download conversation data",
        ),
    ]

    # Sidebar menu listing every command next to its description.
    commands_menu = pd.DataFrame(
        {
            "Command": [name for name, _, _, _ in all_commands],
            "Description": [description for _, _, _, description in all_commands],
        }
    )
    st.sidebar.title("Commands Menu")
    st.sidebar.write(commands_menu)

    # Input that does not start with "/" is routed to the LLM query handler.
    command_center = CommandCenter(
        default_input_type=str,
        default_function=query_llm_wrapper,
        all_commands=[
            (name, input_type, handler)
            for name, input_type, handler, _ in all_commands
        ],
    )
    boot(command_center)
command_center.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class CommandCenter:
    """Registry that maps chat input to handler functions.

    Input starting with ``/`` is treated as ``<command> <argument...>``;
    anything else is routed to the ``/default`` command with the whole input
    as its argument. Each command declares an input type that controls how
    the argument text is converted before the handler is called.
    """

    # Input types a command may declare; ``None`` means "takes no argument".
    _SUPPORTED_INPUT_TYPES = (None, str, int, float, bool, list)
    _TRUE_WORDS = frozenset({"1", "true", "yes", "y", "on"})
    _FALSE_WORDS = frozenset({"", "0", "false", "no", "n", "off"})

    def __init__(self, default_input_type, default_function=None, all_commands=None):
        """Register the default handler and any additional commands.

        Args:
            default_input_type: input type for text that is not a slash command.
            default_function: handler for text that is not a slash command.
            all_commands: optional iterable of
                ``(command, input_type, function)`` triples.
        """
        self.commands = {}
        self.add_command("/default", default_input_type, default_function)
        if all_commands:
            for command, input_type, function in all_commands:
                self.add_command(command, input_type, function)

    def add_command(self, command, input_type, function=None):
        """Register ``command`` with its argument type and handler.

        Raises:
            ValueError: if ``input_type`` is not one of the supported types.
        """
        # Raise explicitly: an ``assert`` is stripped when Python runs with -O.
        if input_type not in self._SUPPORTED_INPUT_TYPES:
            raise ValueError("Invalid input type")
        self.commands[command] = {"input_type": input_type, "function": function}

    def parse_command(self, input_string):
        """Split ``input_string`` into ``(command, argument)``.

        The argument is converted according to the command's input type.
        For an unregistered command the argument is ``None`` so that
        ``execute_command`` can report it instead of failing on a lookup.

        Raises:
            ValueError: if the argument text cannot be converted to the
                declared ``int``, ``float`` or ``bool`` type.
        """
        if input_string.startswith("/"):
            command, _, raw_argument = input_string.partition(" ")
        else:
            command, raw_argument = "/default", input_string

        spec = self.commands.get(command)
        if spec is None:
            return command, None
        return command, self._coerce(raw_argument, spec["input_type"])

    def _coerce(self, raw_argument, input_type):
        """Convert the raw argument text to ``input_type``."""
        if input_type is None:
            return None
        if input_type is str:
            return raw_argument
        if input_type is list:
            # split() without a separator drops the empty tokens that
            # repeated spaces would otherwise produce.
            return raw_argument.split()
        if input_type is bool:
            # bool("false") is True, so map the usual words explicitly.
            word = raw_argument.strip().lower()
            if word in self._TRUE_WORDS:
                return True
            if word in self._FALSE_WORDS:
                return False
            raise ValueError(f"Cannot interpret {raw_argument!r} as a boolean")
        return input_type(raw_argument)

    def execute_command(self, input_string):
        """Parse ``input_string`` and call the matching handler.

        Returns:
            The handler's return value, or ``"Invalid command"`` when the
            command is not registered or has no handler attached.
        """
        command, argument = self.parse_command(input_string)
        spec = self.commands.get(command)
        if spec is None or spec["function"] is None:
            return "Invalid command"
        return spec["function"](argument)
embed_documents.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.retrievers.multi_query import MultiQueryRetriever
2
+ from langchain_openai import ChatOpenAI
3
+ from ragatouille import RAGPretrainedModel
4
+
5
+
6
def create_retriever(texts):
    """Index ``texts`` with ColBERT and wrap the result in a multi-query retriever.

    Args:
        texts: iterable of chunks exposing ``page_content`` and ``metadata``.

    Returns:
        A ``MultiQueryRetriever`` built on the ColBERT LangChain retriever
        (``k=5``), using ``ChatOpenAI(temperature=0)`` as its LLM.
    """
    colbert = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv1.9")

    page_contents = [chunk.page_content for chunk in texts]
    metadatas = [chunk.metadata for chunk in texts]
    # Chunks are passed with split_documents=False and indexed under the
    # fixed name "vector_store".
    colbert.index(
        collection=page_contents,
        split_documents=False,
        document_metadatas=metadatas,
        index_name="vector_store",
    )

    base_retriever = colbert.as_langchain_retriever(k=5)
    query_llm = ChatOpenAI(temperature=0)
    return MultiQueryRetriever.from_llm(retriever=base_retriever, llm=query_llm)
process_documents.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import re
3
+ from statistics import median
4
+ from bs4 import BeautifulSoup
5
+ from langchain.docstore.document import Document
6
+ from langchain.document_loaders import PDFMinerPDFasHTMLLoader, WebBaseLoader
7
+
8
def deep_strip(text):
    """Collapse every run of whitespace in ``text`` to one space and trim the ends.

    Falsy input (``None``, ``""``) yields an empty string.
    """
    collapsed = re.sub(r"\s+", " ", text or "")
    return collapsed.strip()
9
+
10
+
11
def process_documents(urls):
    """Load every URL and return all resulting snippets with global chunk ids.

    URLs ending in ``.pdf`` go through ``process_pdf``; everything else goes
    through ``process_web``. After loading, ``metadata["chunk_id"]`` of each
    snippet is overwritten with its position in the combined list.
    """
    snippets = []
    for url in urls:
        loader = process_pdf if url.endswith(".pdf") else process_web
        snippets.extend(loader(url))

    for chunk_id, snippet in enumerate(snippets):
        snippet.metadata["chunk_id"] = chunk_id
    return snippets
21
+
22
+
23
def process_pdf(url):
    """Load the PDF at ``url`` and split it into header-led ``Document`` chunks.

    The PDF is rendered to HTML, its ``div`` elements are grouped by font
    size, newline-heavy groups are filtered out, and groups whose font size
    exceeds the rounded-up median are treated as section headers.

    Returns:
        A list of ``Document`` objects; ``page_content`` is the whitespace-
        normalised header followed by the section text, and ``metadata``
        carries ``header`` (first ten words of the header), ``source_url``,
        ``source_type`` and a per-document ``chunk_id``.
    """
    html = PDFMinerPDFasHTMLLoader(url).load()[0].page_content
    divs = BeautifulSoup(html, "html.parser").find_all("div")

    filtered_snippets = filter_pdf_snippets(
        get_pdf_snippets(divs), new_line_threshold_ratio=0.4
    )
    font_sizes = [font_size for _, font_size in filtered_snippets]
    median_font_size = math.ceil(median(font_sizes))

    document_snippets = []
    sections = get_pdf_semantic_snippets(filtered_snippets, median_font_size)
    for i, (body, info) in enumerate(sections):
        header_text = info["header_text"]
        document_snippets.append(
            Document(
                page_content=deep_strip(header_text) + " " + deep_strip(body),
                metadata={
                    "header": " ".join(header_text.split()[:10]),
                    "source_url": url,
                    "source_type": "pdf",
                    "chunk_id": i,
                },
            )
        )
    return document_snippets
45
+
46
+
47
def get_pdf_snippets(content):
    """Group consecutive elements that share a font size into text snippets.

    Args:
        content: iterable of parsed HTML elements. Each must offer
            ``find("span")`` and ``text``; the span's ``style`` attribute is
            searched for ``font-size:<N>px``.

    Returns:
        A list of ``(text, font_size)`` tuples in document order. Elements
        without a span, a style, or a pixel font size are skipped. The list
        is empty when no element carries a font size.
    """
    snippets = []
    current_font_size = None
    current_text = ""
    for element in content:
        span = element.find("span")
        if span is None:
            continue
        style = span.get("style")
        if not style:
            continue
        match = re.search(r"font-size:(\d+)px", style)
        if match is None:
            continue
        font_size = int(match.group(1))

        # Compare against None so a font size of 0 is not mistaken for "unset".
        if current_font_size is None:
            current_font_size = font_size
        if font_size == current_font_size:
            current_text += element.text
        else:
            snippets.append((current_text, current_font_size))
            current_font_size = font_size
            current_text = element.text

    # Flush the trailing group, but only if at least one element was accepted;
    # otherwise an ("", None) placeholder would leak into later stages.
    if current_font_size is not None:
        snippets.append((current_text, current_font_size))
    return snippets
73
+
74
+
75
def filter_pdf_snippets(content_list, new_line_threshold_ratio):
    """Drop snippets that consist mostly of newlines.

    Args:
        content_list: iterable of ``(text, font_size)`` tuples.
        new_line_threshold_ratio: maximum allowed share of newline characters
            in a snippet's text (newline count divided by text length).

    Returns:
        The tuples whose newline ratio is at most the threshold, in the
        original order. Snippets with empty text are dropped.
    """
    filtered_list = []
    for content, font_size in content_list:
        # Empty text has no ratio (division by zero) and carries no content.
        if not content:
            continue
        ratio = content.count("\n") / len(content)
        if ratio <= new_line_threshold_ratio:
            filtered_list.append((content, font_size))
    return filtered_list
84
+
85
+
86
def get_pdf_semantic_snippets(filtered_snippets, median_font_size):
    """Group snippets into sections, each led by a header snippet.

    A snippet whose font size is strictly greater than ``median_font_size``
    starts a new section and becomes its header; every other snippet is
    appended, space-separated, to the current section's text.

    Args:
        filtered_snippets: iterable of ``(text, font_size)`` tuples.
        median_font_size: font size above which a snippet counts as a header.

    Returns:
        A list of ``(content, metadata)`` tuples. ``content`` is always a
        string (empty for a header with no body text). ``metadata`` holds
        ``header_font_size``, ``content_font_size`` (median of the body font
        sizes, or ``None`` without body text) and ``header_text``. Text seen
        before the first header is folded into the first section; if no
        header is ever seen the result is empty.
    """
    semantic_snippets = []
    current_header = None
    header_font_size = None
    current_content = ""
    content_font_sizes = []

    def close_section():
        # Emit the section collected so far under the current header.
        metadata = {
            "header_font_size": header_font_size,
            "content_font_size": (
                median(content_font_sizes) if content_font_sizes else None
            ),
            "header_text": current_header,
        }
        semantic_snippets.append((current_content, metadata))

    for content, font_size in filtered_snippets:
        if font_size > median_font_size:
            if current_header is not None:
                close_section()
                current_content = ""
                content_font_sizes = []
            current_header = content
            header_font_size = font_size
        else:
            content_font_sizes.append(font_size)
            if current_content:
                current_content += " " + content
            else:
                current_content = content

    if current_header is not None:
        close_section()
    return semantic_snippets
126
+
127
+
128
def process_web(url):
    """Load the web page at ``url`` as a single ``Document`` chunk.

    Returns:
        A one-element list. The document's ``page_content`` is the page text
        with whitespace normalised; ``metadata`` carries ``header`` (the page
        title, or the URL when the loader reports no title), ``source_url``,
        ``source_type`` and ``chunk_id`` 0.
    """
    data = WebBaseLoader(url).load()[0]
    # Not every page has a <title>; fall back to the URL instead of raising
    # KeyError so one untitled page does not abort the whole upload.
    header = data.metadata.get("title") or url
    document_snippets = [
        Document(
            page_content=deep_strip(data.page_content),
            metadata={
                "header": header,
                "source_url": url,
                "source_type": "web",
                "chunk_id": 0,
            },
        )
    ]
    return document_snippets
requirements.txt CHANGED
@@ -4,9 +4,9 @@ langchain-community==0.0.24
4
  langchain-core==0.1.27
5
  langchain-experimental==0.0.49
6
  langchain-openai==0.0.8
7
- chromadb==0.4.22
8
  tiktoken==0.5.2
9
  pdfminer.six==20231228
10
  beautifulsoup4==4.12.3
11
  RAGatouille==0.0.7.post7
12
- pandas==2.2.1
 
 
4
  langchain-core==0.1.27
5
  langchain-experimental==0.0.49
6
  langchain-openai==0.0.8
 
7
  tiktoken==0.5.2
8
  pdfminer.six==20231228
9
  beautifulsoup4==4.12.3
10
  RAGatouille==0.0.7.post7
11
+ pandas==2.2.1
12
+ tabulate==0.9.0