XThomasBU
committed on
Commit
·
4308a1a
1
Parent(s):
e17a5d0
remove hard coded values
Browse files
code/modules/config/{user_config.yml → project_config.yml}
RENAMED
@@ -1,3 +1,7 @@
|
|
1 |
retriever:
|
2 |
retriever_hf_paths:
|
3 |
RAGatouille: "XThomasBU/Colbert_Index"
|
|
|
|
|
|
|
|
|
|
1 |
retriever:
|
2 |
retriever_hf_paths:
|
3 |
RAGatouille: "XThomasBU/Colbert_Index"
|
4 |
+
|
5 |
+
metadata:
|
6 |
+
metada_links: ["https://dl4ds.github.io/sp2024/lectures/", "https://dl4ds.github.io/sp2024/schedule/"]
|
7 |
+
slide_base_link: "https://dl4ds.github.io"
|
code/modules/dataloader/data_loader.py
CHANGED
@@ -222,8 +222,7 @@ class ChunkProcessor:
|
|
222 |
|
223 |
def chunk_docs(self, file_reader, uploaded_files, weblinks):
|
224 |
addl_metadata = get_metadata(
|
225 |
-
|
226 |
-
"https://dl4ds.github.io/sp2024/schedule/",
|
227 |
) # For any additional metadata
|
228 |
|
229 |
# remove already processed files if reparse_files is False
|
@@ -426,6 +425,12 @@ if __name__ == "__main__":
|
|
426 |
with open("../code/modules/config/config.yml", "r") as f:
|
427 |
config = yaml.safe_load(f)
|
428 |
|
|
|
|
|
|
|
|
|
|
|
|
|
429 |
STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
|
430 |
uploaded_files = [
|
431 |
os.path.join(STORAGE_DIR, file)
|
@@ -434,6 +439,7 @@ if __name__ == "__main__":
|
|
434 |
]
|
435 |
|
436 |
data_loader = DataLoader(config, logger=logger)
|
|
|
437 |
document_chunks, document_names, documents, document_metadata = (
|
438 |
data_loader.get_chunks(
|
439 |
[
|
|
|
222 |
|
223 |
def chunk_docs(self, file_reader, uploaded_files, weblinks):
|
224 |
addl_metadata = get_metadata(
|
225 |
+
*self.config["metadata"]["metada_links"], self.config
|
|
|
226 |
) # For any additional metadata
|
227 |
|
228 |
# remove already processed files if reparse_files is False
|
|
|
425 |
with open("../code/modules/config/config.yml", "r") as f:
|
426 |
config = yaml.safe_load(f)
|
427 |
|
428 |
+
with open("../code/modules/config/project_config.yml", "r") as f:
|
429 |
+
project_config = yaml.safe_load(f)
|
430 |
+
|
431 |
+
# Combine project config with the main config
|
432 |
+
config.update(project_config)
|
433 |
+
|
434 |
STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
|
435 |
uploaded_files = [
|
436 |
os.path.join(STORAGE_DIR, file)
|
|
|
439 |
]
|
440 |
|
441 |
data_loader = DataLoader(config, logger=logger)
|
442 |
+
# Just for testing
|
443 |
document_chunks, document_names, documents, document_metadata = (
|
444 |
data_loader.get_chunks(
|
445 |
[
|
code/modules/dataloader/helpers.py
CHANGED
@@ -21,7 +21,8 @@ def get_base_url(url):
|
|
21 |
return base_url
|
22 |
|
23 |
|
24 |
-
|
|
|
25 |
"""
|
26 |
Function to get the lecture metadata from the lectures and schedule URLs.
|
27 |
"""
|
@@ -50,7 +51,9 @@ def get_metadata(lectures_url, schedule_url):
|
|
50 |
slides_link_tag = description_div.find("a", title="Download slides")
|
51 |
slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
|
52 |
slides_link = (
|
53 |
-
f"
|
|
|
|
|
54 |
)
|
55 |
if slides_link:
|
56 |
date_mapping[slides_link] = date
|
@@ -70,7 +73,9 @@ def get_metadata(lectures_url, schedule_url):
|
|
70 |
slides_link_tag = block.find("a", title="Download slides")
|
71 |
slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
|
72 |
slides_link = (
|
73 |
-
f"
|
|
|
|
|
74 |
)
|
75 |
|
76 |
# Extract the link to the lecture recording
|
|
|
21 |
return base_url
|
22 |
|
23 |
|
24 |
+
### THIS FUNCTION IS NOT GENERALIZABLE.. IT IS SPECIFIC TO THE COURSE WEBSITE ###
|
25 |
+
def get_metadata(lectures_url, schedule_url, config):
|
26 |
"""
|
27 |
Function to get the lecture metadata from the lectures and schedule URLs.
|
28 |
"""
|
|
|
51 |
slides_link_tag = description_div.find("a", title="Download slides")
|
52 |
slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
|
53 |
slides_link = (
|
54 |
+
f"{config['metadata']['slide_base_link']}{slides_link}"
|
55 |
+
if slides_link
|
56 |
+
else None
|
57 |
)
|
58 |
if slides_link:
|
59 |
date_mapping[slides_link] = date
|
|
|
73 |
slides_link_tag = block.find("a", title="Download slides")
|
74 |
slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
|
75 |
slides_link = (
|
76 |
+
f"{config['metadata']['slide_base_link']}{slides_link}"
|
77 |
+
if slides_link
|
78 |
+
else None
|
79 |
)
|
80 |
|
81 |
# Extract the link to the lecture recording
|
code/modules/vectorstore/store_manager.py
CHANGED
@@ -168,19 +168,21 @@ if __name__ == "__main__":
|
|
168 |
|
169 |
with open("modules/config/config.yml", "r") as f:
|
170 |
config = yaml.safe_load(f)
|
171 |
-
with open("modules/config/
|
172 |
-
|
|
|
|
|
|
|
173 |
print(config)
|
174 |
-
print(user_config)
|
175 |
print(f"Trying to create database with config: {config}")
|
176 |
vector_db = VectorStoreManager(config)
|
177 |
if config["vectorstore"]["load_from_HF"]:
|
178 |
if (
|
179 |
config["vectorstore"]["db_option"]
|
180 |
-
in
|
181 |
):
|
182 |
vector_db.load_from_HF(
|
183 |
-
HF_PATH=
|
184 |
config["vectorstore"]["db_option"]
|
185 |
]
|
186 |
)
|
|
|
168 |
|
169 |
with open("modules/config/config.yml", "r") as f:
|
170 |
config = yaml.safe_load(f)
|
171 |
+
with open("modules/config/project_config.yml", "r") as f:
|
172 |
+
project_config = yaml.safe_load(f)
|
173 |
+
|
174 |
+
# combine the two configs
|
175 |
+
config.update(project_config)
|
176 |
print(config)
|
|
|
177 |
print(f"Trying to create database with config: {config}")
|
178 |
vector_db = VectorStoreManager(config)
|
179 |
if config["vectorstore"]["load_from_HF"]:
|
180 |
if (
|
181 |
config["vectorstore"]["db_option"]
|
182 |
+
in config["retriever"]["retriever_hf_paths"]
|
183 |
):
|
184 |
vector_db.load_from_HF(
|
185 |
+
HF_PATH=config["retriever"]["retriever_hf_paths"][
|
186 |
config["vectorstore"]["db_option"]
|
187 |
]
|
188 |
)
|
docs/setup.md
CHANGED
@@ -124,4 +124,4 @@ CHAINLIT_URL=<your_chainlit_url>
|
|
124 |
# Configuration
|
125 |
|
126 |
The configuration file `code/modules/config/config.yml` contains the parameters that control the behaviour of your app.
|
127 |
-
The configuration file `code/modules/
|
|
|
124 |
# Configuration
|
125 |
|
126 |
The configuration file `code/modules/config/config.yml` contains the parameters that control the behaviour of your app.
|
127 |
+
The configuration file `code/modules/config/project_config.yml` contains project-specific parameters.
|