Commit c26167a · Farid Karimli
Parent(s): 49a1201

Chunking method selector and patches
code/main.py

@@ -66,16 +66,19 @@ class Chatbot:
     async def setup_llm(self):
         """
         Set up the LLM with the provided settings. Update the configuration and initialize the LLM tutor.
+
+        #TODO: Clean this up.
         """
         start_time = time.time()

         llm_settings = cl.user_session.get("llm_settings", {})
-        chat_profile, retriever_method, memory_window, llm_style, generate_follow_up = (
+        chat_profile, retriever_method, memory_window, llm_style, generate_follow_up, chunking_mode = (
             llm_settings.get("chat_model"),
             llm_settings.get("retriever_method"),
             llm_settings.get("memory_window"),
             llm_settings.get("llm_style"),
             llm_settings.get("follow_up_questions"),
+            llm_settings.get("chunking_mode"),
         )

         chain = cl.user_session.get("chain")
@@ -95,6 +98,7 @@ class Chatbot:
         self.config["llm_params"]["llm_style"] = llm_style
         self.config["llm_params"]["llm_loader"] = chat_profile
         self.config["llm_params"]["generate_follow_up"] = generate_follow_up
+        self.config["splitter_options"]["chunking_mode"] = chunking_mode

         self.llm_tutor.update_llm(
             old_config, self.config
@@ -172,6 +176,12 @@ class Chatbot:
                 label="Stream response",
                 initial=config["llm_params"]["stream"],
             ),
+            cl.input_widget.Select(
+                id="chunking_mode",
+                label="Chunking mode",
+                values=['fixed', 'semantic'],
+                initial_index=1,
+            ),
             cl.input_widget.Switch(
                 id="follow_up_questions",
                 label="Generate follow up questions",
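For orientation, here is a minimal sketch of how the new chunking_mode selector could be wired end to end in a Chainlit app. Only the Select widget and the splitter_options assignment come from this commit; the surrounding ChatSettings call, the on_settings_update handler, and the config shape below are assumptions added for illustration.

    # Illustrative wiring only; not the repository's actual main.py.
    import chainlit as cl

    config = {"splitter_options": {"chunking_mode": "semantic"}}  # assumed config shape


    @cl.on_chat_start
    async def start():
        # Render the settings panel; the Select mirrors the widget added in this commit.
        settings = await cl.ChatSettings(
            [
                cl.input_widget.Select(
                    id="chunking_mode",
                    label="Chunking mode",
                    values=["fixed", "semantic"],
                    initial_index=1,  # defaults to 'semantic'
                ),
            ]
        ).send()
        cl.user_session.set("llm_settings", settings)


    @cl.on_settings_update
    async def update_settings(settings):
        # Persist the selection and push it into the splitter config,
        # as setup_llm() does in the diff above.
        cl.user_session.set("llm_settings", settings)
        config["splitter_options"]["chunking_mode"] = settings.get("chunking_mode")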
code/modules/dataloader/data_loader.py

@@ -202,10 +202,11 @@ class ChunkProcessor:
     def process_chunks(
         self, documents, file_type="txt", source="", page=0, metadata={}
     ):
-        if file_type == "pdf":
+        # TODO: Clear up this pipeline of re-adding metadata
+        documents = [Document(page_content=documents, source=source, page=page)]
+        if file_type == "pdf" and self.config["splitter_options"]["chunking_mode"] == "fixed":
             document_chunks = documents
         else:
-            documents = [Document(page_content=documents, source=source, page=page)]
             document_chunks = self.splitter.split_documents(documents)

         # add the source and page number back to the metadata
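With this patch, a PDF processed in 'fixed' mode bypasses the splitter and keeps each incoming page as a single chunk, while every other combination still goes through self.splitter.split_documents. The commit only threads chunking_mode into splitter_options; how the splitter itself is constructed is outside this diff. A plausible sketch of that selection, assuming a LangChain-style pipeline (the class names, config keys, and defaults below are illustrative, not taken from the repository):

    # Illustrative only: choosing a splitter from config["splitter_options"]["chunking_mode"].
    from langchain_experimental.text_splitter import SemanticChunker
    from langchain_openai import OpenAIEmbeddings
    from langchain_text_splitters import RecursiveCharacterTextSplitter


    def build_splitter(config):
        opts = config["splitter_options"]
        if opts["chunking_mode"] == "fixed":
            # Fixed mode: size-based chunks with overlap.
            return RecursiveCharacterTextSplitter(
                chunk_size=opts.get("chunk_size", 1000),
                chunk_overlap=opts.get("chunk_overlap", 100),
            )
        # Semantic mode: break where embedding similarity between adjacent
        # sentences drops, so chunk boundaries follow topic shifts.
        return SemanticChunker(OpenAIEmbeddings())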
code/modules/dataloader/pdf_readers/gpt.py

@@ -23,7 +23,7 @@ class GPTParser:
     The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
     The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
     For images, give a description and if you can, a source. Separate each page with '---'.
-    Just respond with the markdown.
+    Just respond with the markdown. Do not include page numbers or any other metadata. Do not try to provide titles. Strictly the content.
     """

     def parse(self, pdf_path):
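The prompt tweak above tightens what the vision model returns (no page numbers, no invented titles, content only). For context, a hedged sketch of how a parser like GPTParser might send that system prompt along with a rendered page image; the model name, image encoding, and client setup here are assumptions, not taken from the repository.

    # Illustrative only: one way a page image could be sent with the parser's prompt.
    import base64

    from openai import OpenAI

    client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment


    def parse_page(prompt: str, png_bytes: bytes) -> str:
        # Encode the rendered PDF page as a data URL and ask the model for markdown.
        image_b64 = base64.b64encode(png_bytes).decode("utf-8")
        response = client.chat.completions.create(
            model="gpt-4o-mini",  # assumed model choice
            messages=[
                {"role": "system", "content": prompt},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                        }
                    ],
                },
            ],
        )
        return response.choices[0].message.content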