XThomasBU committed on
Commit
9a7da99
·
1 Parent(s): 7a98bd3

improvements

Browse files
README.md CHANGED
@@ -15,7 +15,13 @@ You can find an implementation of the Tutor at [DL4DS Tutor on Hugging Face](htt
15
  - Add URLs in the `urls.txt` file.
16
  - Add other PDF files in the `storage/data` directory.
17
 
18
- 3. **Create the Vector Database**
 
 
 
 
 
 
19
  ```bash
20
  cd code
21
  python -m modules.vectorstore.store_manager
@@ -23,7 +29,7 @@ You can find an implementation of the Tutor at [DL4DS Tutor on Hugging Face](htt
23
  - Note: You need to run the above command when you add new data to the `storage/data` directory, or if the `storage/data/urls.txt` file is updated.
24
  - Alternatively, you can set `["vectorstore"]["embedd_files"]` to `True` in the `code/modules/config/config.yml` file, which will embed files from the storage directory every time you run the Chainlit command below.
25
 
26
- 4. **Run the Chainlit App**
27
  ```bash
28
  chainlit run main.py
29
  ```
 
15
  - Add URLs in the `urls.txt` file.
16
  - Add other PDF files in the `storage/data` directory.
17
 
18
+ 3. **Test Data Loading (Optional)**
19
+ ```bash
20
+ cd code
21
+ python -m modules.dataloader.data_loader
22
+ ```
23
+
24
+ 4. **Create the Vector Database**
25
  ```bash
26
  cd code
27
  python -m modules.vectorstore.store_manager
 
29
  - Note: You need to run the above command when you add new data to the `storage/data` directory, or if the `storage/data/urls.txt` file is updated.
30
  - Alternatively, you can set `["vectorstore"]["embedd_files"]` to `True` in the `code/modules/config/config.yml` file, which will embed files from the storage directory every time you run the Chainlit command below.
31
 
32
+ 5. **Run the Chainlit App**
33
  ```bash
34
  chainlit run main.py
35
  ```
code/main.py CHANGED
@@ -10,27 +10,20 @@ import yaml
10
  import logging
11
  from dotenv import load_dotenv
12
 
13
- import os
14
- import sys
15
-
16
- # Add the 'code' directory to the Python path
17
- current_dir = os.path.dirname(os.path.abspath(__file__))
18
- sys.path.append(current_dir)
19
-
20
  from modules.chat.llm_tutor import LLMTutor
21
  from modules.config.constants import *
22
  from modules.chat.helpers import get_sources
23
  from modules.chat_processor.chat_processor import ChatProcessor
24
 
25
  global logger
 
26
  logger = logging.getLogger(__name__)
27
  logger.setLevel(logging.INFO)
28
- logger.propagate = False
29
 
30
  # Console Handler
31
  console_handler = logging.StreamHandler()
32
  console_handler.setLevel(logging.INFO)
33
- formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
34
  console_handler.setFormatter(formatter)
35
  logger.addHandler(console_handler)
36
 
 
10
  import logging
11
  from dotenv import load_dotenv
12
 
 
 
 
 
 
 
 
13
  from modules.chat.llm_tutor import LLMTutor
14
  from modules.config.constants import *
15
  from modules.chat.helpers import get_sources
16
  from modules.chat_processor.chat_processor import ChatProcessor
17
 
18
  global logger
19
+ # Initialize logger
20
  logger = logging.getLogger(__name__)
21
  logger.setLevel(logging.INFO)
22
+ formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
23
 
24
  # Console Handler
25
  console_handler = logging.StreamHandler()
26
  console_handler.setLevel(logging.INFO)
 
27
  console_handler.setFormatter(formatter)
28
  logger.addHandler(console_handler)
29
 
code/modules/__init__.py CHANGED
@@ -1,2 +0,0 @@
1
- from . import vectorstore
2
- from . import dataloader
 
 
 
code/modules/chat/__init__.py CHANGED
@@ -1,2 +0,0 @@
1
- from .llm_tutor import LLMTutor
2
- from .chat_model_loader import ChatModelLoader
 
 
 
code/modules/chat/llm_tutor.py CHANGED
@@ -10,7 +10,7 @@ from modules.chat.helpers import get_prompt
10
  from modules.chat.chat_model_loader import ChatModelLoader
11
  from modules.vectorstore.store_manager import VectorStoreManager
12
 
13
- from modules.retriever import Retriever
14
 
15
  from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
16
  from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
 
10
  from modules.chat.chat_model_loader import ChatModelLoader
11
  from modules.vectorstore.store_manager import VectorStoreManager
12
 
13
+ from modules.retriever.retriever import Retriever
14
 
15
  from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
16
  from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
code/modules/config/__init__.py CHANGED
@@ -1 +0,0 @@
1
- from .constants import *
 
 
code/modules/config/config.yml CHANGED
@@ -7,7 +7,7 @@ vectorstore:
7
  data_path: '../storage/data' # str
8
  url_file_path: '../storage/data/urls.txt' # str
9
  expand_urls: True # bool
10
- db_option : 'RAGatouille' # str [FAISS, Chroma, RAGatouille, RAPTOR]
11
  db_path : '../vectorstores' # str
12
  model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002]
13
  search_top_k : 3 # int
 
7
  data_path: '../storage/data' # str
8
  url_file_path: '../storage/data/urls.txt' # str
9
  expand_urls: True # bool
10
+ db_option : 'FAISS' # str [FAISS, Chroma, RAGatouille, RAPTOR]
11
  db_path : '../vectorstores' # str
12
  model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002]
13
  search_top_k : 3 # int
code/modules/dataloader/__init__.py CHANGED
@@ -1,2 +0,0 @@
1
- from .webpage_crawler import WebpageCrawler
2
- from .data_loader import DataLoader
 
 
 
code/modules/dataloader/data_loader.py CHANGED
@@ -345,7 +345,7 @@ if __name__ == "__main__":
345
  logger = logging.getLogger(__name__)
346
  logger.setLevel(logging.INFO)
347
 
348
- with open("../code/config.yml", "r") as f:
349
  config = yaml.safe_load(f)
350
 
351
  data_loader = DataLoader(config, logger=logger)
@@ -355,3 +355,6 @@ if __name__ == "__main__":
355
  ["https://dl4ds.github.io/sp2024/"],
356
  )
357
  )
 
 
 
 
345
  logger = logging.getLogger(__name__)
346
  logger.setLevel(logging.INFO)
347
 
348
+ with open("../code/modules/config/config.yml", "r") as f:
349
  config = yaml.safe_load(f)
350
 
351
  data_loader = DataLoader(config, logger=logger)
 
355
  ["https://dl4ds.github.io/sp2024/"],
356
  )
357
  )
358
+
359
+ print(document_names)
360
+ print(len(document_chunks))
code/modules/retriever/__init__.py CHANGED
@@ -1,5 +0,0 @@
1
- from .faiss_retriever import FaissRetriever
2
- from .chroma_retriever import ChromaRetriever
3
- from .colbert_retriever import ColbertRetriever
4
- from .raptor_retriever import RaptorRetriever
5
- from .retriever import Retriever
 
 
 
 
 
 
code/modules/vectorstore/store_manager.py CHANGED
@@ -16,37 +16,36 @@ class VectorStoreManager:
16
  self.document_names = None
17
 
18
  # Set up logging to both console and a file
19
- if logger is None:
20
- self.logger = logging.getLogger(__name__)
21
- self.logger.setLevel(logging.INFO)
22
- self.logger.propagate = False
 
 
 
 
 
 
 
23
 
24
  # Console Handler
25
  console_handler = logging.StreamHandler()
26
  console_handler.setLevel(logging.INFO)
27
- formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
28
  console_handler.setFormatter(formatter)
29
- self.logger.addHandler(console_handler)
30
 
31
  # Ensure log directory exists
32
  log_directory = self.config["log_dir"]
33
- if not os.path.exists(log_directory):
34
- os.makedirs(log_directory)
35
 
36
  # File Handler
37
- log_file_path = f"{log_directory}/vector_db.log" # Change this to your desired log file path
38
  file_handler = logging.FileHandler(log_file_path, mode="w")
39
  file_handler.setLevel(logging.INFO)
40
  file_handler.setFormatter(formatter)
41
- self.logger.addHandler(file_handler)
42
- else:
43
- self.logger = logger
44
-
45
- self.webpage_crawler = WebpageCrawler()
46
 
47
- self.vector_db = VectorStore(self.config)
48
-
49
- self.logger.info("VectorDB instance instantiated")
50
 
51
  def load_files(self):
52
 
 
16
  self.document_names = None
17
 
18
  # Set up logging to both console and a file
19
+ self.logger = logger or self._setup_logging()
20
+ self.webpage_crawler = WebpageCrawler()
21
+ self.vector_db = VectorStore(self.config)
22
+
23
+ self.logger.info("VectorDB instance instantiated")
24
+
25
+ def _setup_logging(self):
26
+ logger = logging.getLogger(__name__)
27
+ if not logger.hasHandlers():
28
+ logger.setLevel(logging.INFO)
29
+ formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
30
 
31
  # Console Handler
32
  console_handler = logging.StreamHandler()
33
  console_handler.setLevel(logging.INFO)
 
34
  console_handler.setFormatter(formatter)
35
+ logger.addHandler(console_handler)
36
 
37
  # Ensure log directory exists
38
  log_directory = self.config["log_dir"]
39
+ os.makedirs(log_directory, exist_ok=True)
 
40
 
41
  # File Handler
42
+ log_file_path = os.path.join(log_directory, "vector_db.log")
43
  file_handler = logging.FileHandler(log_file_path, mode="w")
44
  file_handler.setLevel(logging.INFO)
45
  file_handler.setFormatter(formatter)
46
+ logger.addHandler(file_handler)
 
 
 
 
47
 
48
+ return logger
 
 
49
 
50
  def load_files(self):
51