andreasmartin committed on
Commit
31d4f49
·
1 Parent(s): b7c1815

deepnote update

Browse files
Files changed (2) hide show
  1. faq.py +31 -16
  2. requirements.txt +1 -0
faq.py CHANGED
@@ -1,20 +1,22 @@
1
  import pandas as pd
2
  from langchain.document_loaders import DataFrameLoader
3
  from langchain.embeddings import HuggingFaceEmbeddings
4
- from langchain.vectorstores import AwaDB
5
  from typing import List, Tuple
6
  from langchain.docstore.document import Document
7
  from langchain.embeddings.base import Embeddings
8
  from langchain.vectorstores.base import VectorStore
9
  import os
10
  import shutil
 
11
 
12
  SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
13
  SHEET_URL_Y = "/edit#gid="
14
  SHEET_URL_Y_EXPORT = "/export?gid="
15
- CACHE_FOLDER = ".embedding-model"
16
  VECTORDB_FOLDER = ".vectordb"
17
  EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
 
18
 
19
 
20
  def faq_id(sheet_url: str) -> str:
@@ -41,26 +43,39 @@ def define_embedding_function(model_name: str) -> HuggingFaceEmbeddings:
41
  return HuggingFaceEmbeddings(
42
  model_name=model_name,
43
  encode_kwargs={"normalize_embeddings": True},
44
- cache_folder=CACHE_FOLDER,
45
  )
46
 
47
 
48
  def get_vectordb(
49
- faq_id: str, embedding_function: Embeddings, documents: List[Document] = None
50
  ) -> VectorStore:
51
  vectordb = None
52
- if documents is None:
53
- vectordb = AwaDB(embedding=embedding_function, log_and_data_dir=VECTORDB_FOLDER)
54
- success = vectordb.load_local(table_name=faq_id)
55
- if not success:
56
- raise Exception("faq_id may not exists")
57
- else:
58
- vectordb = AwaDB.from_documents(
59
- documents=documents,
60
- embedding=embedding_function,
61
- table_name=faq_id,
62
- log_and_data_dir=VECTORDB_FOLDER,
63
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  return vectordb
65
 
66
 
 
1
  import pandas as pd
2
  from langchain.document_loaders import DataFrameLoader
3
  from langchain.embeddings import HuggingFaceEmbeddings
4
+ from langchain.vectorstores import AwaDB, Chroma
5
  from typing import List, Tuple
6
  from langchain.docstore.document import Document
7
  from langchain.embeddings.base import Embeddings
8
  from langchain.vectorstores.base import VectorStore
9
  import os
10
  import shutil
11
+ from enum import Enum
12
 
13
  SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
14
  SHEET_URL_Y = "/edit#gid="
15
  SHEET_URL_Y_EXPORT = "/export?gid="
16
+ EMBEDDING_MODEL_FOLDER = ".embedding-model"
17
  VECTORDB_FOLDER = ".vectordb"
18
  EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
19
+ VECTORDB_TYPE = Enum("VECTORDB_TYPE", ["AwaDB", "Chroma"])
20
 
21
 
22
  def faq_id(sheet_url: str) -> str:
 
43
  return HuggingFaceEmbeddings(
44
  model_name=model_name,
45
  encode_kwargs={"normalize_embeddings": True},
46
+ cache_folder=EMBEDDING_MODEL_FOLDER,
47
  )
48
 
49
 
50
  def get_vectordb(
51
+ faq_id: str, embedding_function: Embeddings, documents: List[Document] = None, vectordb_type: str = VECTORDB_TYPE.AwaDB
52
  ) -> VectorStore:
53
  vectordb = None
54
+
55
+ if vectordb_type is VECTORDB_TYPE.AwaDB:
56
+ if documents is None:
57
+ vectordb = AwaDB(embedding=embedding_function, log_and_data_dir=VECTORDB_FOLDER)
58
+ if not vectordb.load_local(table_name=faq_id):
59
+ raise Exception("faq_id may not exists")
60
+ else:
61
+ vectordb = AwaDB.from_documents(
62
+ documents=documents,
63
+ embedding=embedding_function,
64
+ table_name=faq_id,
65
+ log_and_data_dir=VECTORDB_FOLDER,
66
+ )
67
+ if vectordb_type is VECTORDB_TYPE.Chroma:
68
+ if documents is None:
69
+ vectordb = Chroma(collection_name=faq_id, embedding_function=embedding_function, persist_directory=VECTORDB_FOLDER)
70
+ if not vectordb.get()["ids"]:
71
+ raise Exception("faq_id may not exists")
72
+ else:
73
+ vectordb = Chroma.from_documents(
74
+ documents=documents,
75
+ embedding=embedding_function,
76
+ collection_name=faq_id,
77
+ persist_directory=VECTORDB_FOLDER,
78
+ )
79
  return vectordb
80
 
81
 
requirements.txt CHANGED
@@ -2,6 +2,7 @@ openpyxl
2
  langchain
3
  sentence_transformers
4
  awadb
 
5
  fastapi
6
  uvicorn
7
  gradio==3.35.2
 
2
  langchain
3
  sentence_transformers
4
  awadb
5
+ chromadb
6
  fastapi
7
  uvicorn
8
  gradio==3.35.2