XThomasBU committed on
Commit
f2daaee
·
1 Parent(s): ea7b686

added raptor and literalai

Browse files
Dockerfile.dev CHANGED
@@ -10,7 +10,8 @@ RUN pip install --no-cache-dir -r /code/requirements.txt
10
 
11
  COPY . /code
12
 
13
- RUN ls -R
 
14
 
15
  # Change permissions to allow writing to the directory
16
  RUN chmod -R 777 /code
@@ -21,7 +22,10 @@ RUN mkdir /code/logs && chmod 777 /code/logs
21
  # Create a cache directory within the application's working directory
22
  RUN mkdir /.cache && chmod -R 777 /.cache
23
 
 
 
24
  # Expose the port the app runs on
25
  EXPOSE 8051
26
 
27
- CMD python code/modules/vector_db.py && chainlit run code/main.py --port 8051
 
 
10
 
11
  COPY . /code
12
 
13
+ # List the contents of the /code directory to verify files are copied correctly
14
+ RUN ls -R /code
15
 
16
  # Change permissions to allow writing to the directory
17
  RUN chmod -R 777 /code
 
22
  # Create a cache directory within the application's working directory
23
  RUN mkdir /.cache && chmod -R 777 /.cache
24
 
25
+ WORKDIR /code/code
26
+
27
  # Expose the port the app runs on
28
  EXPOSE 8051
29
 
30
+ # Default command to run the application
31
+ CMD ["sh", "-c", "python -m modules.vectorstore.store_manager && chainlit run main.py --host 0.0.0.0 --port 8051"]
code/main.py CHANGED
@@ -1,5 +1,5 @@
1
  from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
2
- from langchain import PromptTemplate
3
  from langchain_community.embeddings import HuggingFaceEmbeddings
4
  from langchain_community.vectorstores import FAISS
5
  from langchain.chains import RetrievalQA
@@ -20,7 +20,7 @@ sys.path.append(current_dir)
20
  from modules.chat.llm_tutor import LLMTutor
21
  from modules.config.constants import *
22
  from modules.chat.helpers import get_sources
23
-
24
 
25
  global logger
26
  logger = logging.getLogger(__name__)
@@ -113,7 +113,16 @@ async def start():
113
  msg.content = opening_message
114
  await msg.update()
115
 
 
 
116
  cl.user_session.set("chain", chain)
 
 
 
 
 
 
 
117
 
118
 
119
  @cl.on_message
@@ -121,15 +130,28 @@ async def main(message):
121
  global logger
122
  user = cl.user_session.get("user")
123
  chain = cl.user_session.get("chain")
 
 
 
 
 
 
 
 
 
 
 
124
  cb = cl.AsyncLangchainCallbackHandler() # TODO: fix streaming here
125
  cb.answer_reached = True
126
- res = await chain.acall(message.content, callbacks=[cb])
127
- # res = await chain.acall(message.content)
 
128
  try:
129
  answer = res["answer"]
130
  except:
131
  answer = res["result"]
132
 
133
- answer_with_sources, source_elements = get_sources(res, answer)
 
134
 
135
  await cl.Message(content=answer_with_sources, elements=source_elements).send()
 
1
  from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
2
+ from langchain_core.prompts import PromptTemplate
3
  from langchain_community.embeddings import HuggingFaceEmbeddings
4
  from langchain_community.vectorstores import FAISS
5
  from langchain.chains import RetrievalQA
 
20
  from modules.chat.llm_tutor import LLMTutor
21
  from modules.config.constants import *
22
  from modules.chat.helpers import get_sources
23
+ from modules.chat_processor.chat_processor import ChatProcessor
24
 
25
  global logger
26
  logger = logging.getLogger(__name__)
 
113
  msg.content = opening_message
114
  await msg.update()
115
 
116
+ tags = [chat_profile, config["vectorstore"]["db_option"]]
117
+ chat_processor = ChatProcessor(config["chat_logging"]["platform"], tags=tags)
118
  cl.user_session.set("chain", chain)
119
+ cl.user_session.set("counter", 0)
120
+ cl.user_session.set("chat_processor", chat_processor)
121
+
122
+
123
@cl.on_chat_end
async def on_chat_end():
    """Send a farewell message to the user when the chat session ends."""
    farewell = cl.Message(content="Sorry, I have to go now. Goodbye!")
    await farewell.send()
126
 
127
 
128
  @cl.on_message
 
130
  global logger
131
  user = cl.user_session.get("user")
132
  chain = cl.user_session.get("chain")
133
+
134
+ counter = cl.user_session.get("counter")
135
+ counter += 1
136
+ cl.user_session.set("counter", counter)
137
+
138
+ # if counter >= 3: # Ensure the counter condition is checked
139
+ # await cl.Message(content="Your credits are up!").send()
140
+ # await on_chat_end() # Call the on_chat_end function to handle the end of the chat
141
+ # return # Exit the function to stop further processing
142
+ # else:
143
+
144
  cb = cl.AsyncLangchainCallbackHandler() # TODO: fix streaming here
145
  cb.answer_reached = True
146
+
147
+ processor = cl.user_session.get("chat_processor")
148
+ res = await processor.rag(message.content, chain, cb)
149
  try:
150
  answer = res["answer"]
151
  except:
152
  answer = res["result"]
153
 
154
+ answer_with_sources, source_elements, sources_dict = get_sources(res, answer)
155
+ processor._process(message.content, answer, sources_dict)
156
 
157
  await cl.Message(content=answer_with_sources, elements=source_elements).send()
code/modules/chat/helpers.py CHANGED
@@ -9,7 +9,7 @@ def get_sources(res, answer):
9
 
10
  for idx, source in enumerate(res["source_documents"]):
11
  source_metadata = source.metadata
12
- url = source_metadata["source"]
13
  score = source_metadata.get("score", "N/A")
14
  page = source_metadata.get("page", 1)
15
 
@@ -75,7 +75,7 @@ def get_sources(res, answer):
75
  )
76
  )
77
 
78
- return full_answer, source_elements
79
 
80
 
81
  def get_prompt(config):
 
9
 
10
  for idx, source in enumerate(res["source_documents"]):
11
  source_metadata = source.metadata
12
+ url = source_metadata.get("source", "N/A")
13
  score = source_metadata.get("score", "N/A")
14
  page = source_metadata.get("page", 1)
15
 
 
75
  )
76
  )
77
 
78
+ return full_answer, source_elements, source_dict
79
 
80
 
81
  def get_prompt(config):
code/modules/chat/llm_tutor.py CHANGED
@@ -102,7 +102,7 @@ class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
102
  # Prepare the final prompt with metadata
103
  context = "\n\n".join(
104
  [
105
- f"Context {idx+1}: \n(Document content: {doc.page_content}\nMetadata: (source_file: {doc.metadata['source']}))"
106
  for idx, doc in enumerate(docs)
107
  ]
108
  )
 
102
  # Prepare the final prompt with metadata
103
  context = "\n\n".join(
104
  [
105
+ f"Context {idx+1}: \n(Document content: {doc.page_content}\nMetadata: (source_file: {doc.metadata['source'] if 'source' in doc.metadata else 'unknown'}))"
106
  for idx, doc in enumerate(docs)
107
  ]
108
  )
code/modules/chat_processor/__init__.py ADDED
File without changes
code/modules/chat_processor/base.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
class ChatProcessorBase:
    """Base class for chat processors (backends that record chat exchanges).

    Subclasses are expected to override `process`.
    """

    def __init__(self, config):
        """Store the backend configuration.

        Parameters:
        - config: Backend-specific configuration object; kept as `self.config`.
        """
        self.config = config

    def process(self, message, assistant_message=None, source_dict=None):
        """Record one chat exchange; must be overridden by subclasses.

        The two optional parameters mirror how processors are invoked
        (user message, assistant message, sources), so that calling an
        un-overridden `process` with three arguments fails with the intended
        `NotImplementedError` rather than a `TypeError`.

        Parameters:
        - message: The user message.
        - assistant_message: Optional; the assistant's reply.
        - source_dict: Optional; sources associated with the reply.

        Raises:
        - NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError("process method not implemented")
code/modules/chat_processor/chat_processor.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from modules.chat_processor.literal_ai import LiteralaiChatProcessor
2
+
3
+
4
class ChatProcessor:
    """Facade that picks a chat-logging backend by name and forwards calls to it.

    Only the "literalai" backend is supported; any other name raises
    `ValueError` at construction time.
    """

    def __init__(self, chat_processor_type, tags=None):
        """Create the facade and instantiate the selected backend.

        Parameters:
        - chat_processor_type: Name of the backend (currently only "literalai").
        - tags: Optional tags handed to the backend.

        Raises:
        - ValueError: if `chat_processor_type` is not supported.
        """
        self.chat_processor_type = chat_processor_type
        self.tags = tags
        self._init_processor()

    def _init_processor(self):
        """Instantiate `self.processor` for the configured backend name."""
        if self.chat_processor_type == "literalai":
            self.processor = LiteralaiChatProcessor(self.tags)
        else:
            raise ValueError(
                f"Chat processor type {self.chat_processor_type} not supported"
            )

    def _process(self, user_message, assistant_message, source_dict):
        """Forward one completed exchange to the backend for recording."""
        self.processor.process(user_message, assistant_message, source_dict)

    async def rag(self, user_query: str, chain, cb):
        """Run the chain through the backend, falling back to a direct call.

        Parameters:
        - user_query: The user's question.
        - chain: Object exposing an awaitable `acall(query, callbacks=[...])`.
        - cb: Callback handler passed to the chain.

        Returns:
        - Whatever the backend's `rag` (or, on failure, `chain.acall`) returns.
        """
        try:
            return await self.processor.rag(user_query, chain, cb)
        except Exception:
            # Best-effort: a logging-backend failure must not break the chat, so
            # answer without it. `Exception` (not a bare except) lets task
            # cancellation, KeyboardInterrupt and SystemExit propagate.
            # NOTE: if the failure came from the chain itself, this retries it once.
            import logging

            logging.getLogger(__name__).exception(
                "Chat processor rag() failed; calling the chain directly"
            )
            return await chain.acall(user_query, callbacks=[cb])
code/modules/chat_processor/literal_ai.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from literalai import LiteralClient
2
+ import os
3
+ from .base import ChatProcessorBase
4
+
5
+
6
class LiteralaiChatProcessor(ChatProcessorBase):
    """Chat processor that records exchanges through a `LiteralClient`.

    A thread is opened at construction time and its id is reused for every
    later message and retrieval step.
    """

    def __init__(self, tags=None):
        """Create the client and open the thread used for this conversation.

        The API key is read from the `LITERAL_API_KEY` environment variable.

        Parameters:
        - tags: Optional list of tags to attach to the thread; values that are
          not a list are ignored.
        """
        # This backend takes no config, but the base initializer is still called
        # so the `config` attribute defined by the base class exists.
        super().__init__(config=None)
        self.literal_client = LiteralClient(api_key=os.getenv("LITERAL_API_KEY"))
        self.literal_client.reset_context()
        with self.literal_client.thread(name="TEST") as thread:
            self.thread_id = thread.id
            self.thread = thread
            if isinstance(tags, list):
                self.thread.tags = tags
        # Print the id itself; the label promises an id, not the thread object.
        print(f"Thread ID: {self.thread_id}")

    def process(self, user_message, assistant_message, source_dict):
        """Record one user/assistant exchange on the conversation thread.

        Parameters:
        - user_message: Text sent by the user.
        - assistant_message: Text of the assistant's answer.
        - source_dict: Accepted for interface compatibility; not recorded here.
        """
        with self.literal_client.thread(thread_id=self.thread_id):
            self.literal_client.message(
                content=user_message,
                type="user_message",
                name="User",
            )
            self.literal_client.message(
                content=assistant_message,
                type="assistant_message",
                name="AI_Tutor",
            )

    async def rag(self, user_query: str, chain, cb):
        """Run the chain inside a "retrieval" step attached to the thread.

        Parameters:
        - user_query: The user's question; stored as the step input.
        - chain: Object exposing an awaitable `acall(query, callbacks=[...])`.
        - cb: Callback handler passed to the chain.

        Returns:
        - The chain result, which is also stored as the step output.
        """
        with self.literal_client.step(
            type="retrieval", name="RAG", thread_id=self.thread_id
        ) as step:
            step.input = {"question": user_query}
            res = await chain.acall(user_query, callbacks=[cb])
            step.output = res
        return res
code/modules/config/config.yml CHANGED
@@ -6,8 +6,8 @@ vectorstore:
6
  embedd_files: False # bool
7
  data_path: '../storage/data' # str
8
  url_file_path: '../storage/data/urls.txt' # str
9
- expand_urls: False # bool
10
- db_option : 'Chroma' # str [FAISS, Chroma, RAGatouille]
11
  db_path : '../vectorstores' # str
12
  model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
13
  search_top_k : 3 # int
@@ -29,6 +29,13 @@ llm_params:
29
  llm_loader: 'openai' # str [local_llm, openai]
30
  openai_params:
31
  model: 'gpt-3.5-turbo-1106' # str [gpt-3.5-turbo-1106, gpt-4]
 
 
 
 
 
 
 
32
 
33
  splitter_options:
34
  use_splitter: True # bool
 
6
  embedd_files: False # bool
7
  data_path: '../storage/data' # str
8
  url_file_path: '../storage/data/urls.txt' # str
9
+ expand_urls: True # bool
10
+ db_option : 'RAGatouille' # str [FAISS, Chroma, RAGatouille, RAPTOR]
11
  db_path : '../vectorstores' # str
12
  model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
13
  search_top_k : 3 # int
 
29
  llm_loader: 'openai' # str [local_llm, openai]
30
  openai_params:
31
  model: 'gpt-3.5-turbo-1106' # str [gpt-3.5-turbo-1106, gpt-4]
32
+ local_llm_params:
33
+ model: 'tiny-llama'
34
+ temperature: 0.7
35
+
36
+ chat_logging:
37
+ log_chat: True # bool
38
+ platform: 'literalai'
39
 
40
  splitter_options:
41
  use_splitter: True # bool
code/modules/config/constants.py CHANGED
@@ -77,5 +77,5 @@ Question: {question}
77
 
78
  # Model Paths
79
 
80
- LLAMA_PATH = "storage/models/tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf"
81
  MISTRAL_PATH = "storage/models/mistral-7b-v0.1.Q4_K_M.gguf"
 
77
 
78
  # Model Paths
79
 
80
+ LLAMA_PATH = "../storage/models/tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf"
81
  MISTRAL_PATH = "storage/models/mistral-7b-v0.1.Q4_K_M.gguf"
code/modules/retriever/__init__.py CHANGED
@@ -1,4 +1,5 @@
1
  from .faiss_retriever import FaissRetriever
2
  from .chroma_retriever import ChromaRetriever
3
  from .colbert_retriever import ColbertRetriever
 
4
  from .retriever import Retriever
 
1
  from .faiss_retriever import FaissRetriever
2
  from .chroma_retriever import ChromaRetriever
3
  from .colbert_retriever import ColbertRetriever
4
+ from .raptor_retriever import RaptorRetriever
5
  from .retriever import Retriever
code/modules/retriever/colbert_retriever.py CHANGED
@@ -6,5 +6,5 @@ class ColbertRetriever(BaseRetriever):
6
  pass
7
 
8
  def return_retriever(self, db, config):
9
- retriever = db.as_retriever()
10
  return retriever
 
6
  pass
7
 
8
  def return_retriever(self, db, config):
9
+ retriever = db.as_langchain_retriever(k=config["vectorstore"]["search_top_k"])
10
  return retriever
code/modules/retriever/raptor_retriever.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .helpers import VectorStoreRetrieverScore
2
+ from .base import BaseRetriever
3
+
4
+
5
class RaptorRetriever(BaseRetriever):
    """Builds the retriever used with the RAPTOR vector store option."""

    def __init__(self):
        pass

    def return_retriever(self, db, config):
        """Wrap `db` in a `VectorStoreRetrieverScore`.

        Parameters:
        - db: The vector store to retrieve from.
        - config: Configuration mapping; `config["vectorstore"]["search_top_k"]`
          is passed as the `k` search argument.

        Returns:
        - The configured `VectorStoreRetrieverScore` instance.
        """
        top_k = config["vectorstore"]["search_top_k"]
        return VectorStoreRetrieverScore(
            vectorstore=db,
            search_kwargs={"k": top_k},
        )
code/modules/retriever/retriever.py CHANGED
@@ -1,6 +1,7 @@
1
  from modules.retriever.faiss_retriever import FaissRetriever
2
  from modules.retriever.chroma_retriever import ChromaRetriever
3
  from modules.retriever.colbert_retriever import ColbertRetriever
 
4
 
5
 
6
  class Retriever:
@@ -10,6 +11,7 @@ class Retriever:
10
  "FAISS": FaissRetriever,
11
  "Chroma": ChromaRetriever,
12
  "RAGatouille": ColbertRetriever,
 
13
  }
14
  self._create_retriever()
15
 
 
1
  from modules.retriever.faiss_retriever import FaissRetriever
2
  from modules.retriever.chroma_retriever import ChromaRetriever
3
  from modules.retriever.colbert_retriever import ColbertRetriever
4
+ from modules.retriever.raptor_retriever import RaptorRetriever
5
 
6
 
7
  class Retriever:
 
11
  "FAISS": FaissRetriever,
12
  "Chroma": ChromaRetriever,
13
  "RAGatouille": ColbertRetriever,
14
+ "RAPTOR": RaptorRetriever,
15
  }
16
  self._create_retriever()
17
 
code/modules/vectorstore/__init__.py CHANGED
@@ -1,2 +0,0 @@
1
- from .base import VectorStoreBase
2
- from .faiss import FAISS
 
 
 
code/modules/vectorstore/raptor.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # code modified from https://github.com/langchain-ai/langchain/blob/master/cookbook/RAPTOR.ipynb
2
+
3
+ from typing import Dict, List, Optional, Tuple
4
+ import os
5
+ import numpy as np
6
+ import pandas as pd
7
+ import umap
8
+ from langchain_core.prompts import ChatPromptTemplate
9
+ from langchain_core.output_parsers import StrOutputParser
10
+ from sklearn.mixture import GaussianMixture
11
+ from langchain_community.chat_models import ChatOpenAI
12
+ from langchain_community.vectorstores import FAISS
13
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
14
+ from modules.vectorstore.base import VectorStoreBase
15
+
16
+ RANDOM_SEED = 42
17
+
18
+
19
class RAPTORVectoreStore(VectorStoreBase):
    """Vector store built with the RAPTOR recipe on top of FAISS.

    Documents are concatenated and split into leaf chunks. The chunks are
    embedded, clustered (UMAP reduction + Gaussian mixture models) and each
    cluster is summarized by a chat model; the summaries are then clustered and
    summarized again, recursively. Leaf chunks and all summaries are indexed
    together in a single FAISS store.
    """

    def __init__(self, config, documents=None, text_splitter=None, embedding_model=None):
        """
        Parameters:
        - config: Configuration mapping; `splitter_options` and `vectorstore`
          entries are read by this class.
        - documents: Optional list of documents to index.
        - text_splitter: Accepted for signature compatibility but not used; a
          tiktoken-based recursive splitter is always built from `config`.
        - embedding_model: Object exposing `embed_documents(texts)`.
        """
        # Use a fresh list per instance: a shared `[]` default would be mutated
        # by add_documents() and leak documents from one instance to the next.
        self.documents = [] if documents is None else documents
        self.config = config
        self.text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=self.config["splitter_options"]["chunk_size"],
            chunk_overlap=self.config["splitter_options"]["chunk_overlap"],
            separators=self.config["splitter_options"]["chunk_separators"],
            disallowed_special=(),
        )
        self.embd = embedding_model
        self.model = ChatOpenAI(
            model="gpt-3.5-turbo",
        )

    def concat_documents(self, documents):
        """Join all document contents into one string.

        Documents are ordered by their `source` metadata, reversed, and joined
        with a visible separator.
        """
        d_sorted = sorted(documents, key=lambda x: x.metadata["source"])
        d_reversed = list(reversed(d_sorted))
        concatenated_content = "\n\n\n --- \n\n\n".join(
            [doc.page_content for doc in d_reversed]
        )
        return concatenated_content

    def split_documents(self, documents):
        """Concatenate `documents` and split the result into leaf text chunks."""
        concatenated_content = self.concat_documents(documents)
        texts_split = self.text_splitter.split_text(concatenated_content)
        return texts_split

    def add_documents(self, documents):
        """Append `documents` to the list held by this store."""
        self.documents.extend(documents)

    def global_cluster_embeddings(
        self,
        embeddings: np.ndarray,
        dim: int,
        n_neighbors: Optional[int] = None,
        metric: str = "cosine",
    ) -> np.ndarray:
        """
        Perform global dimensionality reduction on the embeddings using UMAP.

        Parameters:
        - embeddings: The input embeddings as a numpy array.
        - dim: The target dimensionality for the reduced space.
        - n_neighbors: Optional; the number of neighbors to consider for each point.
                       If not provided, it defaults to the square root of
                       (number of embeddings - 1), truncated to an integer.
        - metric: The distance metric to use for UMAP.

        Returns:
        - A numpy array of the embeddings reduced to the specified dimensionality.
        """
        if n_neighbors is None:
            n_neighbors = int((len(embeddings) - 1) ** 0.5)
        return umap.UMAP(
            n_neighbors=n_neighbors, n_components=dim, metric=metric
        ).fit_transform(embeddings)

    def local_cluster_embeddings(
        self,
        embeddings: np.ndarray,
        dim: int,
        num_neighbors: int = 10,
        metric: str = "cosine",
    ) -> np.ndarray:
        """
        Perform local dimensionality reduction on the embeddings using UMAP, typically after global clustering.

        Parameters:
        - embeddings: The input embeddings as a numpy array.
        - dim: The target dimensionality for the reduced space.
        - num_neighbors: The number of neighbors to consider for each point.
        - metric: The distance metric to use for UMAP.

        Returns:
        - A numpy array of the embeddings reduced to the specified dimensionality.
        """
        return umap.UMAP(
            n_neighbors=num_neighbors, n_components=dim, metric=metric
        ).fit_transform(embeddings)

    def get_optimal_clusters(
        self,
        embeddings: np.ndarray,
        max_clusters: int = 50,
        random_state: int = RANDOM_SEED,
    ) -> int:
        """
        Determine the optimal number of clusters using the Bayesian Information Criterion (BIC) with a Gaussian Mixture Model.

        Parameters:
        - embeddings: The input embeddings as a numpy array.
        - max_clusters: The maximum number of clusters to consider (exclusive,
          and capped at the number of embeddings).
        - random_state: Seed for reproducibility.

        Returns:
        - An integer representing the optimal number of clusters found
          (1 when there are too few embeddings to compare candidates).
        """
        max_clusters = min(max_clusters, len(embeddings))
        n_clusters = np.arange(1, max_clusters)
        if n_clusters.size == 0:
            # Fewer than two embeddings: there is no candidate to score, and
            # np.argmin would raise on an empty list of BIC values.
            return 1
        bics = []
        for n in n_clusters:
            gm = GaussianMixture(n_components=n, random_state=random_state)
            gm.fit(embeddings)
            bics.append(gm.bic(embeddings))
        return int(n_clusters[np.argmin(bics)])

    def GMM_cluster(
        self, embeddings: np.ndarray, threshold: float, random_state: int = 0
    ):
        """
        Cluster embeddings using a Gaussian Mixture Model (GMM) based on a probability threshold.

        Parameters:
        - embeddings: The input embeddings as a numpy array.
        - threshold: The probability threshold for assigning an embedding to a cluster.
        - random_state: Seed for reproducibility.

        Returns:
        - A tuple containing the cluster labels (one array of cluster indices
          per embedding; an embedding may belong to several clusters or none)
          and the number of clusters determined.
        """
        n_clusters = self.get_optimal_clusters(embeddings)
        gm = GaussianMixture(n_components=n_clusters, random_state=random_state)
        gm.fit(embeddings)
        probs = gm.predict_proba(embeddings)
        labels = [np.where(prob > threshold)[0] for prob in probs]
        return labels, n_clusters

    def perform_clustering(
        self,
        embeddings: np.ndarray,
        dim: int,
        threshold: float,
    ) -> List[np.ndarray]:
        """
        Perform clustering on the embeddings by first reducing their dimensionality globally, then clustering
        using a Gaussian Mixture Model, and finally performing local clustering within each global cluster.

        Parameters:
        - embeddings: The input embeddings as a numpy array.
        - dim: The target dimensionality for UMAP reduction.
        - threshold: The probability threshold for assigning an embedding to a cluster in GMM.

        Returns:
        - A list of numpy arrays, where each array contains the cluster IDs for each embedding.
        """
        if len(embeddings) <= dim + 1:
            # Avoid clustering when there's insufficient data
            return [np.array([0]) for _ in range(len(embeddings))]

        # Global dimensionality reduction
        reduced_embeddings_global = self.global_cluster_embeddings(embeddings, dim)
        # Global clustering
        global_clusters, n_global_clusters = self.GMM_cluster(
            reduced_embeddings_global, threshold
        )

        all_local_clusters = [np.array([]) for _ in range(len(embeddings))]
        total_clusters = 0

        # Iterate through each global cluster to perform local clustering
        for i in range(n_global_clusters):
            # Row positions (in `embeddings`) of the members of this global
            # cluster. Tracking positions instead of matching rows by value
            # keeps this linear and avoids duplicate labels when two chunks
            # have identical embeddings.
            global_mask = np.array([i in gc for gc in global_clusters], dtype=bool)
            member_indices = np.flatnonzero(global_mask)
            global_cluster_embeddings_ = embeddings[member_indices]

            if len(global_cluster_embeddings_) == 0:
                continue
            if len(global_cluster_embeddings_) <= dim + 1:
                # Handle small clusters with direct assignment
                local_clusters = [np.array([0]) for _ in global_cluster_embeddings_]
                n_local_clusters = 1
            else:
                # Local dimensionality reduction and clustering
                reduced_embeddings_local = self.local_cluster_embeddings(
                    global_cluster_embeddings_, dim
                )
                local_clusters, n_local_clusters = self.GMM_cluster(
                    reduced_embeddings_local, threshold
                )

            # Assign local cluster IDs, adjusting for total clusters already processed
            for j in range(n_local_clusters):
                local_mask = np.array([j in lc for lc in local_clusters], dtype=bool)
                for idx in member_indices[local_mask]:
                    all_local_clusters[idx] = np.append(
                        all_local_clusters[idx], j + total_clusters
                    )

            total_clusters += n_local_clusters

        return all_local_clusters

    def embed(self, texts):
        """
        Generate embeddings for a list of text documents.

        Uses `self.embd.embed_documents`, which takes a list of texts and
        returns their embeddings.

        Parameters:
        - texts: List[str], a list of text documents to be embedded.

        Returns:
        - numpy.ndarray: An array of embeddings for the given text documents.
        """
        text_embeddings = self.embd.embed_documents(texts)
        text_embeddings_np = np.array(text_embeddings)
        return text_embeddings_np

    def embed_cluster_texts(self, texts):
        """
        Embeds a list of texts and clusters them, returning a DataFrame with texts, their embeddings, and cluster labels.

        Parameters:
        - texts: List[str], a list of text documents to be processed.

        Returns:
        - pandas.DataFrame: A DataFrame containing the original texts, their embeddings, and the assigned cluster labels.
        """
        text_embeddings_np = self.embed(texts)  # Generate embeddings
        cluster_labels = self.perform_clustering(
            text_embeddings_np, 10, 0.1
        )  # Perform clustering on the embeddings
        df = pd.DataFrame()  # Initialize a DataFrame to store the results
        df["text"] = texts  # Store original texts
        df["embd"] = list(
            text_embeddings_np
        )  # Store embeddings as a list in the DataFrame
        df["cluster"] = cluster_labels  # Store cluster labels
        return df

    def fmt_txt(self, df: pd.DataFrame) -> str:
        """
        Formats the text documents in a DataFrame into a single string.

        Parameters:
        - df: DataFrame containing the 'text' column with text documents to format.

        Returns:
        - A single string where all text documents are joined by a specific delimiter.
        """
        unique_txt = df["text"].tolist()
        return "--- --- \n --- --- ".join(unique_txt)

    def embed_cluster_summarize_texts(
        self, texts: List[str], level: int
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Embeds, clusters, and summarizes a list of texts. This function first generates embeddings for the texts,
        clusters them based on similarity, expands the cluster assignments for easier processing, and then summarizes
        the content within each cluster.

        Parameters:
        - texts: A list of text documents to be processed.
        - level: The recursion level; stored alongside each summary.

        Returns:
        - Tuple containing two DataFrames:
          1. The first DataFrame (`df_clusters`) includes the original texts, their embeddings, and cluster assignments.
          2. The second DataFrame (`df_summary`) contains summaries for each cluster, the level,
             and the cluster identifiers.
        """

        # Embed and cluster the texts, resulting in a DataFrame with 'text', 'embd', and 'cluster' columns
        df_clusters = self.embed_cluster_texts(texts)

        # Expand DataFrame entries to document-cluster pairings for straightforward processing
        expanded_list = []
        for text, embd, clusters in zip(
            df_clusters["text"], df_clusters["embd"], df_clusters["cluster"]
        ):
            for cluster in clusters:
                expanded_list.append({"text": text, "embd": embd, "cluster": cluster})

        # Explicit columns keep the frame well-formed (and the lookups below
        # valid) even when no text received any cluster assignment.
        expanded_df = pd.DataFrame(expanded_list, columns=["text", "embd", "cluster"])

        # Retrieve unique cluster identifiers for processing
        all_clusters = expanded_df["cluster"].unique()

        print(f"--Generated {len(all_clusters)} clusters--")

        # Summarization
        template = """Here is content from the course DS598: Deep Learning for Data Science.

        The content may be form webapge about the course, or lecture content, or any other relevant information.
        If the content is in bullet points (from pdf lectre slides), you can summarize the bullet points.

        Give a detailed summary of the content below.

        Documentation:
        {context}
        """
        prompt = ChatPromptTemplate.from_template(template)
        chain = prompt | self.model | StrOutputParser()

        # Format text within each cluster for summarization
        summaries = []
        for i in all_clusters:
            df_cluster = expanded_df[expanded_df["cluster"] == i]
            formatted_txt = self.fmt_txt(df_cluster)
            summaries.append(chain.invoke({"context": formatted_txt}))

        # Create a DataFrame to store summaries with their corresponding cluster and level
        df_summary = pd.DataFrame(
            {
                "summaries": summaries,
                "level": [level] * len(summaries),
                "cluster": list(all_clusters),
            }
        )

        return df_clusters, df_summary

    def recursive_embed_cluster_summarize(
        self, texts: List[str], level: int = 1, n_levels: int = 3
    ) -> Dict[int, Tuple[pd.DataFrame, pd.DataFrame]]:
        """
        Recursively embeds, clusters, and summarizes texts up to a specified level or until
        the number of unique clusters becomes 1, storing the results at each level.

        Parameters:
        - texts: List[str], texts to be processed.
        - level: int, current recursion level (starts at 1).
        - n_levels: int, maximum depth of recursion.

        Returns:
        - Dict[int, Tuple[pd.DataFrame, pd.DataFrame]], a dictionary where keys are the recursion
          levels and values are tuples containing the clusters DataFrame and summaries DataFrame at that level.
        """
        results = {}  # Dictionary to store results at each level

        # Perform embedding, clustering, and summarization for the current level
        df_clusters, df_summary = self.embed_cluster_summarize_texts(texts, level)

        # Store the results of the current level
        results[level] = (df_clusters, df_summary)

        # Determine if further recursion is possible and meaningful
        unique_clusters = df_summary["cluster"].nunique()
        if level < n_levels and unique_clusters > 1:
            # Use summaries as the input texts for the next level of recursion
            new_texts = df_summary["summaries"].tolist()
            next_level_results = self.recursive_embed_cluster_summarize(
                new_texts, level + 1, n_levels
            )

            # Merge the results from the next level into the current results dictionary
            results.update(next_level_results)

        return results

    def get_vector_db(self):
        """
        Build the FAISS vector store from `self.documents`.

        The documents are split into leaf chunks, summarized recursively (up to
        10 levels), and the leaf chunks plus all summaries are embedded with
        `self.embd`.

        Returns:
        - The FAISS vector store containing leaf chunks and summaries.
        """
        leaf_texts = self.split_documents(self.documents)
        results = self.recursive_embed_cluster_summarize(
            leaf_texts, level=1, n_levels=10
        )

        all_texts = leaf_texts.copy()
        # Iterate through the results to extract summaries from each level and add them to all_texts
        for level in sorted(results.keys()):
            # Extract summaries from the current level's DataFrame
            summaries = results[level][1]["summaries"].tolist()
            # Extend all_texts with the summaries from the current level
            all_texts.extend(summaries)

        # Now, use all_texts to build the vectorstore
        vectorstore = FAISS.from_texts(texts=all_texts, embedding=self.embd)
        return vectorstore

    def _db_dir(self):
        """Return the on-disk directory for this store, derived from the config."""
        return os.path.join(
            self.config["vectorstore"]["db_path"],
            "db_"
            + self.config["vectorstore"]["db_option"]
            + "_"
            + self.config["vectorstore"]["model"],
        )

    def create_database(self, documents, embedding_model):
        """Build the vector store from `documents` and save it to disk.

        Parameters:
        - documents: Documents to index; replaces any previously held list.
        - embedding_model: Embedding model used for clustering and indexing.
        """
        self.documents = documents
        self.embd = embedding_model
        self.vectorstore = self.get_vector_db()
        self.vectorstore.save_local(self._db_dir())

    def load_database(self, embedding_model):
        """Load the previously saved FAISS store from disk and return it.

        Parameters:
        - embedding_model: Embedding model to attach to the loaded store.
        """
        # NOTE(review): allow_dangerous_deserialization=True unpickles the saved
        # index; only load database directories this application wrote itself.
        self.vectorstore = FAISS.load_local(
            self._db_dir(),
            embedding_model,
            allow_dangerous_deserialization=True,
        )
        return self.vectorstore

    def as_retriever(self):
        """Return a retriever over the loaded or created vector store."""
        return self.vectorstore.as_retriever()
code/modules/vectorstore/store_manager.py CHANGED
@@ -84,7 +84,7 @@ class VectorStoreManager:
84
  documents: list,
85
  document_metadata: list,
86
  ):
87
- if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma"]:
88
  self.embedding_model = self.create_embedding_model()
89
  else:
90
  self.embedding_model = None
@@ -132,7 +132,7 @@ class VectorStoreManager:
132
  def load_database(self):
133
 
134
  start_time = time.time() # Start time for loading database
135
- if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma"]:
136
  self.embedding_model = self.create_embedding_model()
137
  else:
138
  self.embedding_model = None
 
84
  documents: list,
85
  document_metadata: list,
86
  ):
87
+ if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma", "RAPTOR"]:
88
  self.embedding_model = self.create_embedding_model()
89
  else:
90
  self.embedding_model = None
 
132
  def load_database(self):
133
 
134
  start_time = time.time() # Start time for loading database
135
+ if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma", "RAPTOR"]:
136
  self.embedding_model = self.create_embedding_model()
137
  else:
138
  self.embedding_model = None
code/modules/vectorstore/vectorstore.py CHANGED
@@ -1,6 +1,7 @@
1
  from modules.vectorstore.faiss import FaissVectorStore
2
  from modules.vectorstore.chroma import ChromaVectorStore
3
  from modules.vectorstore.colbert import ColbertVectorStore
 
4
 
5
 
6
  class VectorStore:
@@ -11,6 +12,7 @@ class VectorStore:
11
  "FAISS": FaissVectorStore,
12
  "Chroma": ChromaVectorStore,
13
  "RAGatouille": ColbertVectorStore,
 
14
  }
15
 
16
  def _create_database(
 
1
  from modules.vectorstore.faiss import FaissVectorStore
2
  from modules.vectorstore.chroma import ChromaVectorStore
3
  from modules.vectorstore.colbert import ColbertVectorStore
4
+ from modules.vectorstore.raptor import RAPTORVectoreStore
5
 
6
 
7
  class VectorStore:
 
12
  "FAISS": FaissVectorStore,
13
  "Chroma": ChromaVectorStore,
14
  "RAGatouille": ColbertVectorStore,
15
+ "RAPTOR": RAPTORVectoreStore,
16
  }
17
 
18
  def _create_database(
{public → code/public}/logo_dark.png RENAMED
File without changes
{public → code/public}/logo_light.png RENAMED
File without changes
{public → code/public}/test.css RENAMED
File without changes
requirements.txt CHANGED
@@ -18,3 +18,5 @@ llama-cpp-python==0.2.77
18
  fake_useragent==1.5.1
19
  chromadb==0.5.0
20
  pymupdf==1.24.5
 
 
 
18
  fake_useragent==1.5.1
19
  chromadb==0.5.0
20
  pymupdf==1.24.5
21
+ literalai==0.0.601
22
+ umap-learn==0.5.6