XThomasBU committed on
Commit
e5cd1d3
·
1 Parent(s): f51bb92

RAGatouille added

Browse files
code/modules/config/config.yml CHANGED
@@ -7,7 +7,7 @@ vectorstore:
7
  data_path: '../storage/data' # str
8
  url_file_path: '../storage/data/urls.txt' # str
9
  expand_urls: False # bool
10
- db_option : 'Chroma' # str [FAISS, Chroma, RAGatouille]
11
  db_path : '../vectorstores' # str
12
  model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
13
  search_top_k : 3 # int
@@ -20,6 +20,9 @@ vectorstore:
20
  index_nlist: 100 # int
21
  index_nprobe: 10 # int
22
 
 
 
 
23
  llm_params:
24
  use_history: True # bool
25
  memory_window: 3 # int
 
7
  data_path: '../storage/data' # str
8
  url_file_path: '../storage/data/urls.txt' # str
9
  expand_urls: False # bool
10
+ db_option : 'RAGatouille' # str [FAISS, Chroma, RAGatouille]
11
  db_path : '../vectorstores' # str
12
  model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
13
  search_top_k : 3 # int
 
20
  index_nlist: 100 # int
21
  index_nprobe: 10 # int
22
 
23
+ colbert_params:
24
+ index_name: "new_idx" # str
25
+
26
  llm_params:
27
  use_history: True # bool
28
  memory_window: 3 # int
code/modules/vectorstore/colbert.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ragatouille import RAGPretrainedModel
2
+ from modules.vectorstore.base import VectorStoreBase
3
+ import os
4
+
5
+
6
class ColbertVectorStore(VectorStoreBase):
    """Vector store backed by a RAGatouille ColBERT index.

    The index lives under ``<db_path>/db_<db_option>`` (both values read from
    ``config["vectorstore"]``).  The index name is taken from the
    ``colbert_params.index_name`` config entry when present and falls back to
    ``"new_idx"``, the name this class previously hard-coded.
    """

    # Name used when the config does not provide ``colbert_params.index_name``.
    DEFAULT_INDEX_NAME = "new_idx"

    def __init__(self, config):
        """Store *config* and load the pretrained ColBERT model.

        Args:
            config: Parsed configuration mapping; must contain
                ``config["vectorstore"]["db_path"]`` and
                ``config["vectorstore"]["db_option"]``.
        """
        self.config = config
        # Populated by create_database() / load_database(); as_retriever()
        # loads the persisted index lazily when it is still None.
        self.vectorstore = None
        self._init_vector_db()

    def _index_root(self):
        """Return the directory under which the ColBERT index is stored."""
        vectorstore_cfg = self.config["vectorstore"]
        return os.path.join(
            vectorstore_cfg["db_path"],
            "db_" + vectorstore_cfg["db_option"],
        )

    def _index_name(self):
        """Return the configured index name, or the default if unset.

        NOTE(review): the config file adds a ``colbert_params`` section, but
        its nesting level is not certain from here, so both
        ``config["vectorstore"]["colbert_params"]`` and
        ``config["colbert_params"]`` are checked -- confirm and simplify.
        Assumes the config sections are plain dicts (supports ``.get``).
        """
        for section in (self.config["vectorstore"], self.config):
            params = section.get("colbert_params") or {}
            name = params.get("index_name")
            if name:
                return name
        return self.DEFAULT_INDEX_NAME

    def _init_vector_db(self):
        """Instantiate the pretrained ColBERT model rooted at the index dir."""
        self.colbert = RAGPretrainedModel.from_pretrained(
            "colbert-ir/colbertv2.0",
            index_root=self._index_root(),
        )

    def create_database(self, documents, document_names, document_metadata):
        """Build the ColBERT index from the given documents.

        Args:
            documents: Collection of document texts to index.
            document_names: Identifiers passed as ``document_ids``.
            document_metadata: Per-document metadata passed as
                ``document_metadatas``.

        Returns:
            Whatever ``RAGPretrainedModel.index`` returns (previously this
            value was bound to an unused local and discarded).
        """
        index_path = self.colbert.index(
            index_name=self._index_name(),
            collection=documents,
            document_ids=document_names,
            document_metadatas=document_metadata,
        )
        # The model that just built the index can serve queries directly, so
        # as_retriever() works without a separate load_database() call.
        self.vectorstore = self.colbert
        return index_path

    def load_database(self):
        """Load the persisted index from disk and return the loaded model."""
        # Same layout the original hard-coded path used:
        # <index_root>/colbert/indexes/<index_name>
        index_dir = os.path.join(
            self._index_root(), "colbert", "indexes", self._index_name()
        )
        self.vectorstore = RAGPretrainedModel.from_index(index_dir)
        return self.vectorstore

    def as_retriever(self):
        """Return a retriever over the index, loading it first if needed."""
        if self.vectorstore is None:
            # Previously this was an AttributeError; load the persisted index
            # instead so a fresh instance is usable.
            self.load_database()
        store = self.vectorstore
        # Keep the original call when the object supports it; otherwise use
        # RAGatouille's LangChain adapter.
        # NOTE(review): RAGPretrainedModel appears to expose only
        # ``as_langchain_retriever`` -- confirm against the installed version.
        if hasattr(store, "as_retriever"):
            return store.as_retriever()
        return store.as_langchain_retriever()
code/modules/vectorstore/store_manager.py CHANGED
@@ -1,5 +1,6 @@
1
  from modules.vectorstore.faiss import FaissVectorStore
2
  from modules.vectorstore.chroma import ChromaVectorStore
 
3
  from modules.vectorstore.helpers import *
4
  from modules.dataloader.webpage_crawler import WebpageCrawler
5
  from modules.dataloader.data_loader import DataLoader
@@ -94,31 +95,8 @@ class VectorStoreManager:
94
  self.vector_db = ChromaVectorStore(self.config)
95
  self.vector_db.create_database(document_chunks, self.embedding_model)
96
  elif self.db_option == "RAGatouille":
97
- self.RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
98
- # index_path = self.RAG.index(
99
- # index_name="new_idx",
100
- # collection=documents,
101
- # document_ids=document_names,
102
- # document_metadatas=document_metadata,
103
- # )
104
- batch_size = 32
105
- for i in range(0, len(documents), batch_size):
106
- if i == 0:
107
- self.RAG.index(
108
- index_name="new_idx",
109
- collection=documents[i : i + batch_size],
110
- document_ids=document_names[i : i + batch_size],
111
- document_metadatas=document_metadata[i : i + batch_size],
112
- )
113
- else:
114
- self.RAG = RAGPretrainedModel.from_index(
115
- ".ragatouille/colbert/indexes/new_idx"
116
- )
117
- self.RAG.add_to_index(
118
- new_collection=documents[i : i + batch_size],
119
- new_document_ids=document_names[i : i + batch_size],
120
- new_document_metadatas=document_metadata[i : i + batch_size],
121
- )
122
 
123
  def create_database(self):
124
  start_time = time.time() # Start time for creating database
@@ -147,30 +125,6 @@ class VectorStoreManager:
147
  f"Time taken to create database: {end_time - start_time} seconds"
148
  )
149
 
150
- # def save_database(self):
151
- # start_time = time.time() # Start time for saving database
152
- # if self.db_option == "FAISS":
153
- # self.vector_db.save_local(
154
- # os.path.join(
155
- # self.config["vectorstore"]["db_path"],
156
- # "db_"
157
- # + self.config["vectorstore"]["db_option"]
158
- # + "_"
159
- # + self.config["vectorstore"]["model"],
160
- # )
161
- # )
162
- # elif self.db_option == "Chroma":
163
- # # db is saved in the persist directory during initialization
164
- # pass
165
- # elif self.db_option == "RAGatouille":
166
- # # index is saved during initialization
167
- # pass
168
- # self.logger.info("Saved database")
169
- # end_time = time.time() # End time for saving database
170
- # self.logger.info(
171
- # f"Time taken to save database: {end_time - start_time} seconds"
172
- # )
173
-
174
  def load_database(self):
175
  start_time = time.time() # Start time for loading database
176
  if self.db_option in ["FAISS", "Chroma"]:
@@ -181,33 +135,9 @@ class VectorStoreManager:
181
  elif self.db_option == "Chroma":
182
  self.vector_db = ChromaVectorStore(self.config)
183
  self.loaded_vector_db = self.vector_db.load_database(self.embedding_model)
184
- # if self.db_option == "FAISS":
185
- # self.vector_db = FAISS.load_local(
186
- # os.path.join(
187
- # self.config["vectorstore"]["db_path"],
188
- # "db_"
189
- # + self.config["vectorstore"]["db_option"]
190
- # + "_"
191
- # + self.config["vectorstore"]["model"],
192
- # ),
193
- # self.embedding_model,
194
- # allow_dangerous_deserialization=True,
195
- # )
196
- # elif self.db_option == "Chroma":
197
- # self.vector_db = Chroma(
198
- # persist_directory=os.path.join(
199
- # self.config["embedding_options"]["db_path"],
200
- # "db_"
201
- # + self.config["embedding_options"]["db_option"]
202
- # + "_"
203
- # + self.config["embedding_options"]["model"],
204
- # ),
205
- # embedding_function=self.embedding_model,
206
- # )
207
- # elif self.db_option == "RAGatouille":
208
- # self.vector_db = RAGPretrainedModel.from_index(
209
- # ".ragatouille/colbert/indexes/new_idx"
210
- # )
211
  end_time = time.time() # End time for loading database
212
  self.logger.info(
213
  f"Time taken to load database: {end_time - start_time} seconds"
 
1
  from modules.vectorstore.faiss import FaissVectorStore
2
  from modules.vectorstore.chroma import ChromaVectorStore
3
+ from modules.vectorstore.colbert import ColbertVectorStore
4
  from modules.vectorstore.helpers import *
5
  from modules.dataloader.webpage_crawler import WebpageCrawler
6
  from modules.dataloader.data_loader import DataLoader
 
95
  self.vector_db = ChromaVectorStore(self.config)
96
  self.vector_db.create_database(document_chunks, self.embedding_model)
97
  elif self.db_option == "RAGatouille":
98
+ self.vector_db = ColbertVectorStore(self.config)
99
+ self.vector_db.create_database(documents, document_names, document_metadata)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  def create_database(self):
102
  start_time = time.time() # Start time for creating database
 
125
  f"Time taken to create database: {end_time - start_time} seconds"
126
  )
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  def load_database(self):
129
  start_time = time.time() # Start time for loading database
130
  if self.db_option in ["FAISS", "Chroma"]:
 
135
  elif self.db_option == "Chroma":
136
  self.vector_db = ChromaVectorStore(self.config)
137
  self.loaded_vector_db = self.vector_db.load_database(self.embedding_model)
138
+ elif self.db_option == "RAGatouille":
139
+ self.vector_db = ColbertVectorStore(self.config)
140
+ self.loaded_vector_db = self.vector_db.load_database()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  end_time = time.time() # End time for loading database
142
  self.logger.info(
143
  f"Time taken to load database: {end_time - start_time} seconds"