Tonic committed on
Commit
be9cd13
·
1 Parent(s): 70c5bc9

Revert "chroma langchain fix 1"

Browse files

This reverts commit 70c5bc93401cd8a8a030ca08155fcfeb5906751a.

Files changed (2) hide show
  1. app.py +16 -35
  2. requirements.txt +0 -1
app.py CHANGED
@@ -113,13 +113,9 @@ from langchain_community.document_loaders import UnstructuredFileLoader
113
  from chromadb import Documents, EmbeddingFunction, Embeddings
114
  from chromadb.config import Settings
115
  from chromadb import HttpClient
116
- from langchain_chroma import Chroma
117
  from utils import load_env_variables, parse_and_route
118
  from globalvars import API_BASE, intention_prompt, tasks, system_message, model_name
119
- from langchain_core.embeddings import Embeddings
120
- from chromadb.api.types import EmbeddingFunction, Documents
121
 
122
-
123
  os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:30'
124
  os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
125
  os.environ['CUDA_CACHE_DISABLE'] = '1'
@@ -182,23 +178,6 @@ class EmbeddingGenerator:
182
  self.clear_cuda_cache()
183
  return embeddings_list
184
 
185
- class ChromaEmbeddingsAdapter(Embeddings):
186
- def __init__(self, ef: EmbeddingFunction):
187
- self.ef = ef
188
-
189
- def embed_documents(self, texts):
190
- return self.ef(texts)
191
-
192
- def embed_query(self, query):
193
- return self.ef([query])[0]
194
-
195
- class LangChainEmbeddingAdapter(EmbeddingFunction[Documents]):
196
- def __init__(self, ef: Embeddings):
197
- self.ef = ef
198
-
199
- def __call__(self, input: Documents) -> Embeddings:
200
- return self.ef.embed_documents(input)
201
-
202
  class MyEmbeddingFunction(EmbeddingFunction):
203
  def __init__(self, embedding_generator: EmbeddingGenerator):
204
  self.embedding_generator = embedding_generator
@@ -214,22 +193,25 @@ def load_documents(file_path: str, mode: str = "elements"):
214
  return [doc.page_content for doc in docs]
215
 
216
  def initialize_chroma(collection_name: str, embedding_function: MyEmbeddingFunction):
217
- client = Chroma.from_documents([], ChromaEmbeddingsAdapter(embedding_function)) # Initialize with no documents
218
- return client
 
 
219
 
220
- def add_documents_to_chroma(client, documents: list, embedding_function: MyEmbeddingFunction):
221
  for doc in documents:
222
- client.add(ids=[str(uuid.uuid1())], documents=[doc], embeddings=embedding_function([doc]))
223
 
224
- def query_chroma(client, query_text: str):
225
- result_docs = client.similarity_search(query_text)
 
226
  return result_docs
227
-
228
  # Initialize clients
229
  intention_client = OpenAI(api_key=yi_token, base_url=API_BASE)
230
  embedding_generator = EmbeddingGenerator(model_name=model_name, token=hf_token, intention_client=intention_client)
231
  embedding_function = MyEmbeddingFunction(embedding_generator=embedding_generator)
232
- chroma_client = initialize_chroma(collection_name="Tonic-instruct", embedding_function=embedding_function)
233
 
234
  def respond(
235
  message,
@@ -261,15 +243,14 @@ def respond(
261
 
262
  def upload_documents(files):
263
  for file in files:
264
- loader = UnstructuredFileLoader(file.name)
265
- documents = loader.load()
266
- add_documents_to_chroma(chroma_client, documents, embedding_function)
267
  return "Documents uploaded and processed successfully!"
268
 
269
-
270
  def query_documents(query):
271
- results = query_chroma(chroma_client, query)
272
- return "\n\n".join([result.page_content for result in results])
273
 
274
  with gr.Blocks() as demo:
275
  with gr.Tab("Upload Documents"):
 
113
  from chromadb import Documents, EmbeddingFunction, Embeddings
114
  from chromadb.config import Settings
115
  from chromadb import HttpClient
 
116
  from utils import load_env_variables, parse_and_route
117
  from globalvars import API_BASE, intention_prompt, tasks, system_message, model_name
 
 
118
 
 
119
  os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:30'
120
  os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
121
  os.environ['CUDA_CACHE_DISABLE'] = '1'
 
178
  self.clear_cuda_cache()
179
  return embeddings_list
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  class MyEmbeddingFunction(EmbeddingFunction):
182
  def __init__(self, embedding_generator: EmbeddingGenerator):
183
  self.embedding_generator = embedding_generator
 
193
  return [doc.page_content for doc in docs]
194
 
195
  def initialize_chroma(collection_name: str, embedding_function: MyEmbeddingFunction):
196
+ client = chromadb.HttpClient(host='localhost', port=8000, settings=Settings(allow_reset=True, anonymized_telemetry=False))
197
+ client.reset() # resets the database
198
+ collection = client.create_collection(collection_name)
199
+ return client, collection
200
 
201
+ def add_documents_to_chroma(client, collection, documents: list, embedding_function: MyEmbeddingFunction):
202
  for doc in documents:
203
+ collection.add(ids=[str(uuid.uuid1())], documents=[doc], embeddings=embedding_function([doc]))
204
 
205
+ def query_chroma(client, collection_name: str, query_text: str, embedding_function: MyEmbeddingFunction):
206
+ db = Chroma(client=client, collection_name=collection_name, embedding_function=embedding_function)
207
+ result_docs = db.similarity_search(query_text)
208
  return result_docs
209
+
210
  # Initialize clients
211
  intention_client = OpenAI(api_key=yi_token, base_url=API_BASE)
212
  embedding_generator = EmbeddingGenerator(model_name=model_name, token=hf_token, intention_client=intention_client)
213
  embedding_function = MyEmbeddingFunction(embedding_generator=embedding_generator)
214
+ chroma_client, chroma_collection = initialize_chroma(collection_name="Tonic-instruct", embedding_function=embedding_function)
215
 
216
  def respond(
217
  message,
 
243
 
244
  def upload_documents(files):
245
  for file in files:
246
+ loader = DocumentLoader(file.name)
247
+ documents = loader.load_documents()
248
+ chroma_manager.add_documents(documents)
249
  return "Documents uploaded and processed successfully!"
250
 
 
251
  def query_documents(query):
252
+ results = chroma_manager.query(query)
253
+ return "\n\n".join([result.content for result in results])
254
 
255
  with gr.Blocks() as demo:
256
  with gr.Tab("Upload Documents"):
requirements.txt CHANGED
@@ -7,7 +7,6 @@ openai
7
  python-dotenv
8
  chromadb
9
  langchain-community
10
- langchain-chroma
11
  unstructured[all-docs]
12
  libmagic
13
  # poppler
 
7
  python-dotenv
8
  chromadb
9
  langchain-community
 
10
  unstructured[all-docs]
11
  libmagic
12
  # poppler