Farid Karimli committed on
Commit
c26167a
·
1 Parent(s): 49a1201

Chunking method selector and patches

Browse files
code/main.py CHANGED
@@ -66,16 +66,19 @@ class Chatbot:
66
  async def setup_llm(self):
67
  """
68
  Set up the LLM with the provided settings. Update the configuration and initialize the LLM tutor.
 
 
69
  """
70
  start_time = time.time()
71
 
72
  llm_settings = cl.user_session.get("llm_settings", {})
73
- chat_profile, retriever_method, memory_window, llm_style, generate_follow_up = (
74
  llm_settings.get("chat_model"),
75
  llm_settings.get("retriever_method"),
76
  llm_settings.get("memory_window"),
77
  llm_settings.get("llm_style"),
78
  llm_settings.get("follow_up_questions"),
 
79
  )
80
 
81
  chain = cl.user_session.get("chain")
@@ -95,6 +98,7 @@ class Chatbot:
95
  self.config["llm_params"]["llm_style"] = llm_style
96
  self.config["llm_params"]["llm_loader"] = chat_profile
97
  self.config["llm_params"]["generate_follow_up"] = generate_follow_up
 
98
 
99
  self.llm_tutor.update_llm(
100
  old_config, self.config
@@ -172,6 +176,12 @@ class Chatbot:
172
  label="Stream response",
173
  initial=config["llm_params"]["stream"],
174
  ),
 
 
 
 
 
 
175
  cl.input_widget.Switch(
176
  id="follow_up_questions",
177
  label="Generate follow up questions",
 
66
  async def setup_llm(self):
67
  """
68
  Set up the LLM with the provided settings. Update the configuration and initialize the LLM tutor.
69
+
70
+ #TODO: Clean this up.
71
  """
72
  start_time = time.time()
73
 
74
  llm_settings = cl.user_session.get("llm_settings", {})
75
+ chat_profile, retriever_method, memory_window, llm_style, generate_follow_up, chunking_mode = (
76
  llm_settings.get("chat_model"),
77
  llm_settings.get("retriever_method"),
78
  llm_settings.get("memory_window"),
79
  llm_settings.get("llm_style"),
80
  llm_settings.get("follow_up_questions"),
81
+ llm_settings.get("chunking_mode"),
82
  )
83
 
84
  chain = cl.user_session.get("chain")
 
98
  self.config["llm_params"]["llm_style"] = llm_style
99
  self.config["llm_params"]["llm_loader"] = chat_profile
100
  self.config["llm_params"]["generate_follow_up"] = generate_follow_up
101
+ self.config["splitter_options"]["chunking_mode"] = chunking_mode
102
 
103
  self.llm_tutor.update_llm(
104
  old_config, self.config
 
176
  label="Stream response",
177
  initial=config["llm_params"]["stream"],
178
  ),
179
+ cl.input_widget.Select(
180
+ id="chunking_mode",
181
+ label="Chunking mode",
182
+ values=['fixed', 'semantic'],
183
+ initial_index=1,
184
+ ),
185
  cl.input_widget.Switch(
186
  id="follow_up_questions",
187
  label="Generate follow up questions",
code/modules/dataloader/data_loader.py CHANGED
@@ -202,10 +202,11 @@ class ChunkProcessor:
202
  def process_chunks(
203
  self, documents, file_type="txt", source="", page=0, metadata={}
204
  ):
205
- if file_type == "pdf":
 
 
206
  document_chunks = documents
207
  else:
208
- documents = [Document(page_content=documents, source=source, page=page)]
209
  document_chunks = self.splitter.split_documents(documents)
210
 
211
  # add the source and page number back to the metadata
 
202
  def process_chunks(
203
  self, documents, file_type="txt", source="", page=0, metadata={}
204
  ):
205
+ # TODO: Clear up this pipeline of re-adding metadata
206
+ documents = [Document(page_content=documents, source=source, page=page)]
207
+ if file_type == "pdf" and self.config["splitter_options"]["chunking_mode"] == "fixed":
208
  document_chunks = documents
209
  else:
 
210
  document_chunks = self.splitter.split_documents(documents)
211
 
212
  # add the source and page number back to the metadata
code/modules/dataloader/pdf_readers/gpt.py CHANGED
@@ -23,7 +23,7 @@ class GPTParser:
23
  The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
24
  The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
25
  For images, give a description and if you can, a source. Separate each page with '---'.
26
- Just respond with the markdown.
27
  """
28
 
29
  def parse(self, pdf_path):
 
23
  The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
24
  The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
25
  For images, give a description and if you can, a source. Separate each page with '---'.
26
+ Just respond with the markdown. Do not include page numbers or any other metadata. Do not try to provide titles. Strictly the content.
27
  """
28
 
29
  def parse(self, pdf_path):