Spaces:
Build error
Build error
XThomasBU
committed on
Commit
·
a2ac5f7
1
Parent(s):
e5cd1d3
more modularization for vectorstore and retriever
Browse files- code/modules/chat/llm_tutor.py +2 -11
- code/modules/config/config.yml +1 -1
- code/modules/retriever/__init__.py +2 -0
- code/modules/retriever/colbert_retriever.py +10 -0
- code/modules/retriever/retriever.py +24 -0
- code/modules/vectorstore/store_manager.py +20 -25
- code/modules/vectorstore/vectorstore.py +50 -0
code/modules/chat/llm_tutor.py
CHANGED
@@ -10,7 +10,7 @@ from modules.chat.helpers import get_prompt
|
|
10 |
from modules.chat.chat_model_loader import ChatModelLoader
|
11 |
from modules.vectorstore.store_manager import VectorStoreManager
|
12 |
|
13 |
-
from modules.retriever import
|
14 |
|
15 |
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
|
16 |
from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
|
@@ -159,16 +159,7 @@ class LLMTutor:
|
|
159 |
# Retrieval QA Chain
|
160 |
def retrieval_qa_chain(self, llm, prompt, db):
|
161 |
|
162 |
-
|
163 |
-
retriever = FaissRetriever().return_retriever(db, self.config)
|
164 |
-
|
165 |
-
elif self.config["vectorstore"]["db_option"] == "Chroma":
|
166 |
-
retriever = ChromaRetriever().return_retriever(db, self.config)
|
167 |
-
|
168 |
-
elif self.config["vectorstore"]["db_option"] == "RAGatouille":
|
169 |
-
retriever = db.as_langchain_retriever(
|
170 |
-
k=self.config["vectorstore"]["search_top_k"]
|
171 |
-
)
|
172 |
|
173 |
if self.config["llm_params"]["use_history"]:
|
174 |
memory = ConversationBufferWindowMemory(
|
|
|
10 |
from modules.chat.chat_model_loader import ChatModelLoader
|
11 |
from modules.vectorstore.store_manager import VectorStoreManager
|
12 |
|
13 |
+
from modules.retriever import Retriever
|
14 |
|
15 |
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
|
16 |
from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
|
|
|
159 |
# Retrieval QA Chain
|
160 |
def retrieval_qa_chain(self, llm, prompt, db):
|
161 |
|
162 |
+
retriever = Retriever(self.config)._return_retriever(db)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
|
164 |
if self.config["llm_params"]["use_history"]:
|
165 |
memory = ConversationBufferWindowMemory(
|
code/modules/config/config.yml
CHANGED
@@ -7,7 +7,7 @@ vectorstore:
|
|
7 |
data_path: '../storage/data' # str
|
8 |
url_file_path: '../storage/data/urls.txt' # str
|
9 |
expand_urls: False # bool
|
10 |
-
db_option : '
|
11 |
db_path : '../vectorstores' # str
|
12 |
model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
|
13 |
search_top_k : 3 # int
|
|
|
7 |
data_path: '../storage/data' # str
|
8 |
url_file_path: '../storage/data/urls.txt' # str
|
9 |
expand_urls: False # bool
|
10 |
+
db_option : 'Chroma' # str [FAISS, Chroma, RAGatouille]
|
11 |
db_path : '../vectorstores' # str
|
12 |
model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
|
13 |
search_top_k : 3 # int
|
code/modules/retriever/__init__.py
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
from .faiss_retriever import FaissRetriever
|
2 |
from .chroma_retriever import ChromaRetriever
|
|
|
|
|
|
1 |
from .faiss_retriever import FaissRetriever
|
2 |
from .chroma_retriever import ChromaRetriever
|
3 |
+
from .colbert_retriever import ColbertRetriever
|
4 |
+
from .retriever import Retriever
|
code/modules/retriever/colbert_retriever.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .base import BaseRetriever
|
2 |
+
|
3 |
+
|
4 |
+
class ColbertRetriever(BaseRetriever):
    """Builds a retriever from a loaded ColBERT (RAGatouille) index."""

    def __init__(self):
        pass

    def return_retriever(self, db, config):
        """Return a retriever backed by ``db``.

        Args:
            db: The loaded index object. It is expected to expose either
                ``as_langchain_retriever(k=...)`` or ``as_retriever()``.
            config: Application configuration; the number of results is read
                from ``config["vectorstore"]["search_top_k"]`` when the
                top-k aware constructor is available.

        Returns:
            The retriever object produced by ``db``.
        """
        # The RAGatouille path previously built its retriever with
        # ``as_langchain_retriever(k=search_top_k)``; prefer that entry point
        # so the configured top-k is honoured instead of silently dropped.
        make_langchain_retriever = getattr(db, "as_langchain_retriever", None)
        if callable(make_langchain_retriever):
            return make_langchain_retriever(
                k=config["vectorstore"]["search_top_k"]
            )

        # Fall back to the generic constructor for wrappers that only expose
        # ``as_retriever()``.
        return db.as_retriever()
|
code/modules/retriever/retriever.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from modules.retriever.faiss_retriever import FaissRetriever
|
2 |
+
from modules.retriever.chroma_retriever import ChromaRetriever
|
3 |
+
from modules.retriever.colbert_retriever import ColbertRetriever
|
4 |
+
|
5 |
+
|
6 |
+
class Retriever:
    """Facade that picks the retriever backend named in the configuration.

    The backend is chosen from ``config["vectorstore"]["db_option"]``, which
    must be one of ``"FAISS"``, ``"Chroma"`` or ``"RAGatouille"``.
    """

    def __init__(self, config):
        """Store ``config`` and instantiate the matching backend.

        Raises:
            ValueError: If ``db_option`` is not a supported backend name.
        """
        self.config = config
        self._create_retriever()

    def _create_retriever(self):
        """Instantiate the backend retriever and keep it on ``self.retriever``."""
        # Read the option once; every branch and the error message use it.
        db_option = self.config["vectorstore"]["db_option"]
        if db_option == "FAISS":
            self.retriever = FaissRetriever()
        elif db_option == "Chroma":
            self.retriever = ChromaRetriever()
        elif db_option == "RAGatouille":
            self.retriever = ColbertRetriever()
        else:
            raise ValueError("Invalid db_option: {}".format(db_option))

    def return_retriever(self, db):
        """Return the backend's retriever for ``db``.

        Public counterpart of the backends' own ``return_retriever`` method;
        the stored configuration is forwarded to the backend.
        """
        return self.retriever.return_retriever(db, self.config)

    def _return_retriever(self, db):
        """Backward-compatible alias for :meth:`return_retriever`."""
        return self.return_retriever(db)
|
code/modules/vectorstore/store_manager.py
CHANGED
@@ -1,6 +1,4 @@
|
|
1 |
-
from modules.vectorstore.
|
2 |
-
from modules.vectorstore.chroma import ChromaVectorStore
|
3 |
-
from modules.vectorstore.colbert import ColbertVectorStore
|
4 |
from modules.vectorstore.helpers import *
|
5 |
from modules.dataloader.webpage_crawler import WebpageCrawler
|
6 |
from modules.dataloader.data_loader import DataLoader
|
@@ -15,7 +13,6 @@ import asyncio
|
|
15 |
class VectorStoreManager:
|
16 |
def __init__(self, config, logger=None):
|
17 |
self.config = config
|
18 |
-
self.db_option = config["vectorstore"]["db_option"]
|
19 |
self.document_names = None
|
20 |
|
21 |
# Set up logging to both console and a file
|
@@ -47,9 +44,12 @@ class VectorStoreManager:
|
|
47 |
|
48 |
self.webpage_crawler = WebpageCrawler()
|
49 |
|
|
|
|
|
50 |
self.logger.info("VectorDB instance instantiated")
|
51 |
|
52 |
def load_files(self):
|
|
|
53 |
files = os.listdir(self.config["vectorstore"]["data_path"])
|
54 |
files = [
|
55 |
os.path.join(self.config["vectorstore"]["data_path"], file)
|
@@ -71,6 +71,7 @@ class VectorStoreManager:
|
|
71 |
return files, urls
|
72 |
|
73 |
def create_embedding_model(self):
|
|
|
74 |
self.logger.info("Creating embedding function")
|
75 |
embedding_model_loader = EmbeddingModelLoader(self.config)
|
76 |
embedding_model = embedding_model_loader.load_embedding_model()
|
@@ -83,22 +84,23 @@ class VectorStoreManager:
|
|
83 |
documents: list,
|
84 |
document_metadata: list,
|
85 |
):
|
86 |
-
if self.db_option in ["FAISS", "Chroma"]:
|
87 |
self.embedding_model = self.create_embedding_model()
|
88 |
|
89 |
self.logger.info("Initializing vector_db")
|
90 |
-
self.logger.info(
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
self.
|
99 |
-
|
100 |
|
101 |
def create_database(self):
|
|
|
102 |
start_time = time.time() # Start time for creating database
|
103 |
data_loader = DataLoader(self.config, self.logger)
|
104 |
self.logger.info("Loading data")
|
@@ -126,18 +128,11 @@ class VectorStoreManager:
|
|
126 |
)
|
127 |
|
128 |
def load_database(self):
|
|
|
129 |
start_time = time.time() # Start time for loading database
|
130 |
-
if self.db_option in ["FAISS", "Chroma"]:
|
131 |
self.embedding_model = self.create_embedding_model()
|
132 |
-
|
133 |
-
self.vector_db = FaissVectorStore(self.config)
|
134 |
-
self.loaded_vector_db = self.vector_db.load_database(self.embedding_model)
|
135 |
-
elif self.db_option == "Chroma":
|
136 |
-
self.vector_db = ChromaVectorStore(self.config)
|
137 |
-
self.loaded_vector_db = self.vector_db.load_database(self.embedding_model)
|
138 |
-
elif self.db_option == "RAGatouille":
|
139 |
-
self.vector_db = ColbertVectorStore(self.config)
|
140 |
-
self.loaded_vector_db = self.vector_db.load_database()
|
141 |
end_time = time.time() # End time for loading database
|
142 |
self.logger.info(
|
143 |
f"Time taken to load database: {end_time - start_time} seconds"
|
|
|
1 |
+
from modules.vectorstore.vectorstore import VectorStore
|
|
|
|
|
2 |
from modules.vectorstore.helpers import *
|
3 |
from modules.dataloader.webpage_crawler import WebpageCrawler
|
4 |
from modules.dataloader.data_loader import DataLoader
|
|
|
13 |
class VectorStoreManager:
|
14 |
def __init__(self, config, logger=None):
|
15 |
self.config = config
|
|
|
16 |
self.document_names = None
|
17 |
|
18 |
# Set up logging to both console and a file
|
|
|
44 |
|
45 |
self.webpage_crawler = WebpageCrawler()
|
46 |
|
47 |
+
self.vector_db = VectorStore(self.config)
|
48 |
+
|
49 |
self.logger.info("VectorDB instance instantiated")
|
50 |
|
51 |
def load_files(self):
|
52 |
+
|
53 |
files = os.listdir(self.config["vectorstore"]["data_path"])
|
54 |
files = [
|
55 |
os.path.join(self.config["vectorstore"]["data_path"], file)
|
|
|
71 |
return files, urls
|
72 |
|
73 |
def create_embedding_model(self):
|
74 |
+
|
75 |
self.logger.info("Creating embedding function")
|
76 |
embedding_model_loader = EmbeddingModelLoader(self.config)
|
77 |
embedding_model = embedding_model_loader.load_embedding_model()
|
|
|
84 |
documents: list,
|
85 |
document_metadata: list,
|
86 |
):
|
87 |
+
if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma"]:
|
88 |
self.embedding_model = self.create_embedding_model()
|
89 |
|
90 |
self.logger.info("Initializing vector_db")
|
91 |
+
self.logger.info(
|
92 |
+
"\tUsing {} as db_option".format(self.config["vectorstore"]["db_option"])
|
93 |
+
)
|
94 |
+
self.vector_db._create_database(
|
95 |
+
document_chunks,
|
96 |
+
document_names,
|
97 |
+
documents,
|
98 |
+
document_metadata,
|
99 |
+
self.embedding_model,
|
100 |
+
)
|
101 |
|
102 |
def create_database(self):
|
103 |
+
|
104 |
start_time = time.time() # Start time for creating database
|
105 |
data_loader = DataLoader(self.config, self.logger)
|
106 |
self.logger.info("Loading data")
|
|
|
128 |
)
|
129 |
|
130 |
def load_database(self):
|
131 |
+
|
132 |
start_time = time.time() # Start time for loading database
|
133 |
+
if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma"]:
|
134 |
self.embedding_model = self.create_embedding_model()
|
135 |
+
self.loaded_vector_db = self.vector_db._load_database(self.embedding_model)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
end_time = time.time() # End time for loading database
|
137 |
self.logger.info(
|
138 |
f"Time taken to load database: {end_time - start_time} seconds"
|
code/modules/vectorstore/vectorstore.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from modules.vectorstore.faiss import FaissVectorStore
|
2 |
+
from modules.vectorstore.chroma import ChromaVectorStore
|
3 |
+
from modules.vectorstore.colbert import ColbertVectorStore
|
4 |
+
|
5 |
+
|
6 |
+
class VectorStore:
    """Facade over the FAISS, Chroma and ColBERT (RAGatouille) vector stores.

    The backend is chosen from ``config["vectorstore"]["db_option"]`` each
    time a database is created or loaded, and the resulting backend object is
    kept on ``self.vectorstore``.
    """

    def __init__(self, config):
        self.config = config
        # Populated by _create_database / _load_database.
        self.vectorstore = None

    def _build_vectorstore(self):
        """Instantiate the backend store named by ``db_option``.

        Raises:
            ValueError: If ``db_option`` is not a supported backend name.
        """
        db_option = self.config["vectorstore"]["db_option"]
        if db_option == "FAISS":
            return FaissVectorStore(self.config)
        if db_option == "Chroma":
            return ChromaVectorStore(self.config)
        if db_option == "RAGatouille":
            return ColbertVectorStore(self.config)
        raise ValueError("Invalid db_option: {}".format(db_option))

    def _create_database(
        self,
        document_chunks,
        document_names,
        documents,
        document_metadata,
        embedding_model,
    ):
        """Create a new database with the configured backend.

        FAISS and Chroma are built from ``document_chunks`` and
        ``embedding_model``; RAGatouille is built from ``documents``,
        ``document_names`` and ``document_metadata``.

        Raises:
            ValueError: If ``db_option`` is not a supported backend name.
        """
        self.vectorstore = self._build_vectorstore()
        if self.config["vectorstore"]["db_option"] == "RAGatouille":
            self.vectorstore.create_database(
                documents, document_names, document_metadata
            )
        else:
            self.vectorstore.create_database(document_chunks, embedding_model)

    def _load_database(self, embedding_model):
        """Load an existing database with the configured backend.

        ``embedding_model`` is passed to FAISS and Chroma; the RAGatouille
        backend is loaded without it.

        Returns:
            Whatever the backend's ``load_database`` returns.

        Raises:
            ValueError: If ``db_option`` is not a supported backend name
                (previously this case silently returned ``None``).
        """
        self.vectorstore = self._build_vectorstore()
        if self.config["vectorstore"]["db_option"] == "RAGatouille":
            return self.vectorstore.load_database()
        return self.vectorstore.load_database(embedding_model)

    def _as_retriever(self):
        """Return ``as_retriever()`` of the current backend store."""
        return self.vectorstore.as_retriever()

    def _get_vectorstore(self):
        """Return the current backend store, or ``None`` if none was built."""
        return self.vectorstore
|