XThomasBU
committed on
Commit
·
9a7da99
1
Parent(s):
7a98bd3
improvements
Browse files- README.md +8 -2
- code/main.py +2 -9
- code/modules/__init__.py +0 -2
- code/modules/chat/__init__.py +0 -2
- code/modules/chat/llm_tutor.py +1 -1
- code/modules/config/__init__.py +0 -1
- code/modules/config/config.yml +1 -1
- code/modules/dataloader/__init__.py +0 -2
- code/modules/dataloader/data_loader.py +4 -1
- code/modules/retriever/__init__.py +0 -5
- code/modules/vectorstore/store_manager.py +16 -17
README.md
CHANGED
@@ -15,7 +15,13 @@ You can find an implementation of the Tutor at [DL4DS Tutor on Hugging Face](htt
|
|
15 |
- Add URLs in the `urls.txt` file.
|
16 |
- Add other PDF files in the `storage/data` directory.
|
17 |
|
18 |
-
3. **Create the Vector Database**
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
```bash
|
20 |
cd code
|
21 |
python -m modules.vectorstore.store_manager
|
@@ -23,7 +29,7 @@ You can find an implementation of the Tutor at [DL4DS Tutor on Hugging Face](htt
|
|
23 |
- Note: You need to run the above command when you add new data to the `storage/data` directory, or if the `storage/data/urls.txt` file is updated.
|
24 |
- Alternatively, you can set `["vectorstore"]["embedd_files"]` to `True` in the `code/modules/config/config.yml` file, which will embed files from the storage directory every time you run the chainlit command below.
|
25 |
|
26 |
-
|
27 |
```bash
|
28 |
chainlit run main.py
|
29 |
```
|
|
|
15 |
- Add URLs in the `urls.txt` file.
|
16 |
- Add other PDF files in the `storage/data` directory.
|
17 |
|
18 |
+
3. **To test Data Loading (Optional)**
|
19 |
+
```bash
|
20 |
+
cd code
|
21 |
+
python -m modules.dataloader.data_loader
|
22 |
+
```
|
23 |
+
|
24 |
+
4. **Create the Vector Database**
|
25 |
```bash
|
26 |
cd code
|
27 |
python -m modules.vectorstore.store_manager
|
|
|
29 |
- Note: You need to run the above command when you add new data to the `storage/data` directory, or if the `storage/data/urls.txt` file is updated.
|
30 |
- Alternatively, you can set `["vectorstore"]["embedd_files"]` to `True` in the `code/modules/config/config.yml` file, which will embed files from the storage directory every time you run the chainlit command below.
|
31 |
|
32 |
+
5. **Run the Chainlit App**
|
33 |
```bash
|
34 |
chainlit run main.py
|
35 |
```
|
code/main.py
CHANGED
@@ -10,27 +10,20 @@ import yaml
|
|
10 |
import logging
|
11 |
from dotenv import load_dotenv
|
12 |
|
13 |
-
import os
|
14 |
-
import sys
|
15 |
-
|
16 |
-
# Add the 'code' directory to the Python path
|
17 |
-
current_dir = os.path.dirname(os.path.abspath(__file__))
|
18 |
-
sys.path.append(current_dir)
|
19 |
-
|
20 |
from modules.chat.llm_tutor import LLMTutor
|
21 |
from modules.config.constants import *
|
22 |
from modules.chat.helpers import get_sources
|
23 |
from modules.chat_processor.chat_processor import ChatProcessor
|
24 |
|
25 |
global logger
|
|
|
26 |
logger = logging.getLogger(__name__)
|
27 |
logger.setLevel(logging.INFO)
|
28 |
-
|
29 |
|
30 |
# Console Handler
|
31 |
console_handler = logging.StreamHandler()
|
32 |
console_handler.setLevel(logging.INFO)
|
33 |
-
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
|
34 |
console_handler.setFormatter(formatter)
|
35 |
logger.addHandler(console_handler)
|
36 |
|
|
|
10 |
import logging
|
11 |
from dotenv import load_dotenv
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
from modules.chat.llm_tutor import LLMTutor
|
14 |
from modules.config.constants import *
|
15 |
from modules.chat.helpers import get_sources
|
16 |
from modules.chat_processor.chat_processor import ChatProcessor
|
17 |
|
18 |
global logger
|
19 |
+
# Initialize logger
|
20 |
logger = logging.getLogger(__name__)
|
21 |
logger.setLevel(logging.INFO)
|
22 |
+
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
|
23 |
|
24 |
# Console Handler
|
25 |
console_handler = logging.StreamHandler()
|
26 |
console_handler.setLevel(logging.INFO)
|
|
|
27 |
console_handler.setFormatter(formatter)
|
28 |
logger.addHandler(console_handler)
|
29 |
|
code/modules/__init__.py
CHANGED
@@ -1,2 +0,0 @@
|
|
1 |
-
from . import vectorstore
|
2 |
-
from . import dataloader
|
|
|
|
|
|
code/modules/chat/__init__.py
CHANGED
@@ -1,2 +0,0 @@
|
|
1 |
-
from .llm_tutor import LLMTutor
|
2 |
-
from .chat_model_loader import ChatModelLoader
|
|
|
|
|
|
code/modules/chat/llm_tutor.py
CHANGED
@@ -10,7 +10,7 @@ from modules.chat.helpers import get_prompt
|
|
10 |
from modules.chat.chat_model_loader import ChatModelLoader
|
11 |
from modules.vectorstore.store_manager import VectorStoreManager
|
12 |
|
13 |
-
from modules.retriever import Retriever
|
14 |
|
15 |
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
|
16 |
from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
|
|
|
10 |
from modules.chat.chat_model_loader import ChatModelLoader
|
11 |
from modules.vectorstore.store_manager import VectorStoreManager
|
12 |
|
13 |
+
from modules.retriever.retriever import Retriever
|
14 |
|
15 |
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
|
16 |
from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
|
code/modules/config/__init__.py
CHANGED
@@ -1 +0,0 @@
|
|
1 |
-
from .constants import *
|
|
|
|
code/modules/config/config.yml
CHANGED
@@ -7,7 +7,7 @@ vectorstore:
|
|
7 |
data_path: '../storage/data' # str
|
8 |
url_file_path: '../storage/data/urls.txt' # str
|
9 |
expand_urls: True # bool
|
10 |
-
db_option : '
|
11 |
db_path : '../vectorstores' # str
|
12 |
model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002]
|
13 |
search_top_k : 3 # int
|
|
|
7 |
data_path: '../storage/data' # str
|
8 |
url_file_path: '../storage/data/urls.txt' # str
|
9 |
expand_urls: True # bool
|
10 |
+
db_option : 'FAISS' # str [FAISS, Chroma, RAGatouille, RAPTOR]
|
11 |
db_path : '../vectorstores' # str
|
12 |
model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002]
|
13 |
search_top_k : 3 # int
|
code/modules/dataloader/__init__.py
CHANGED
@@ -1,2 +0,0 @@
|
|
1 |
-
from .webpage_crawler import WebpageCrawler
|
2 |
-
from .data_loader import DataLoader
|
|
|
|
|
|
code/modules/dataloader/data_loader.py
CHANGED
@@ -345,7 +345,7 @@ if __name__ == "__main__":
|
|
345 |
logger = logging.getLogger(__name__)
|
346 |
logger.setLevel(logging.INFO)
|
347 |
|
348 |
-
with open("../code/config.yml", "r") as f:
|
349 |
config = yaml.safe_load(f)
|
350 |
|
351 |
data_loader = DataLoader(config, logger=logger)
|
@@ -355,3 +355,6 @@ if __name__ == "__main__":
|
|
355 |
["https://dl4ds.github.io/sp2024/"],
|
356 |
)
|
357 |
)
|
|
|
|
|
|
|
|
345 |
logger = logging.getLogger(__name__)
|
346 |
logger.setLevel(logging.INFO)
|
347 |
|
348 |
+
with open("../code/modules/config/config.yml", "r") as f:
|
349 |
config = yaml.safe_load(f)
|
350 |
|
351 |
data_loader = DataLoader(config, logger=logger)
|
|
|
355 |
["https://dl4ds.github.io/sp2024/"],
|
356 |
)
|
357 |
)
|
358 |
+
|
359 |
+
print(document_names)
|
360 |
+
print(len(document_chunks))
|
code/modules/retriever/__init__.py
CHANGED
@@ -1,5 +0,0 @@
|
|
1 |
-
from .faiss_retriever import FaissRetriever
|
2 |
-
from .chroma_retriever import ChromaRetriever
|
3 |
-
from .colbert_retriever import ColbertRetriever
|
4 |
-
from .raptor_retriever import RaptorRetriever
|
5 |
-
from .retriever import Retriever
|
|
|
|
|
|
|
|
|
|
|
|
code/modules/vectorstore/store_manager.py
CHANGED
@@ -16,37 +16,36 @@ class VectorStoreManager:
|
|
16 |
self.document_names = None
|
17 |
|
18 |
# Set up logging to both console and a file
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
# Console Handler
|
25 |
console_handler = logging.StreamHandler()
|
26 |
console_handler.setLevel(logging.INFO)
|
27 |
-
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
|
28 |
console_handler.setFormatter(formatter)
|
29 |
-
|
30 |
|
31 |
# Ensure log directory exists
|
32 |
log_directory = self.config["log_dir"]
|
33 |
-
|
34 |
-
os.makedirs(log_directory)
|
35 |
|
36 |
# File Handler
|
37 |
-
log_file_path =
|
38 |
file_handler = logging.FileHandler(log_file_path, mode="w")
|
39 |
file_handler.setLevel(logging.INFO)
|
40 |
file_handler.setFormatter(formatter)
|
41 |
-
|
42 |
-
else:
|
43 |
-
self.logger = logger
|
44 |
-
|
45 |
-
self.webpage_crawler = WebpageCrawler()
|
46 |
|
47 |
-
|
48 |
-
|
49 |
-
self.logger.info("VectorDB instance instantiated")
|
50 |
|
51 |
def load_files(self):
|
52 |
|
|
|
16 |
self.document_names = None
|
17 |
|
18 |
# Set up logging to both console and a file
|
19 |
+
self.logger = logger or self._setup_logging()
|
20 |
+
self.webpage_crawler = WebpageCrawler()
|
21 |
+
self.vector_db = VectorStore(self.config)
|
22 |
+
|
23 |
+
self.logger.info("VectorDB instance instantiated")
|
24 |
+
|
25 |
+
def _setup_logging(self):
|
26 |
+
logger = logging.getLogger(__name__)
|
27 |
+
if not logger.hasHandlers():
|
28 |
+
logger.setLevel(logging.INFO)
|
29 |
+
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
|
30 |
|
31 |
# Console Handler
|
32 |
console_handler = logging.StreamHandler()
|
33 |
console_handler.setLevel(logging.INFO)
|
|
|
34 |
console_handler.setFormatter(formatter)
|
35 |
+
logger.addHandler(console_handler)
|
36 |
|
37 |
# Ensure log directory exists
|
38 |
log_directory = self.config["log_dir"]
|
39 |
+
os.makedirs(log_directory, exist_ok=True)
|
|
|
40 |
|
41 |
# File Handler
|
42 |
+
log_file_path = os.path.join(log_directory, "vector_db.log")
|
43 |
file_handler = logging.FileHandler(log_file_path, mode="w")
|
44 |
file_handler.setLevel(logging.INFO)
|
45 |
file_handler.setFormatter(formatter)
|
46 |
+
logger.addHandler(file_handler)
|
|
|
|
|
|
|
|
|
47 |
|
48 |
+
return logger
|
|
|
|
|
49 |
|
50 |
def load_files(self):
|
51 |
|