Spaces:
Sleeping
Sleeping
File size: 4,336 Bytes
4dc1d14 4cdba7b 5005601 31d4f49 5005601 6323bc8 31d4f49 5005601 31d4f49 8c2f0ba 6323bc8 78aafcc 1435c22 5005601 6017dce 5005601 31d4f49 5005601 6017dce 718e159 78aafcc 5005601 31d4f49 78aafcc 31d4f49 78aafcc 718e159 31d4f49 718e159 31d4f49 78aafcc 31d4f49 78aafcc 718e159 78aafcc 31d4f49 718e159 31d4f49 718e159 31d4f49 718e159 5005601 8c2f0ba 6017dce 8c2f0ba 5005601 8c2f0ba 6017dce 6323bc8 718e159 6323bc8 6017dce 718e159 6017dce 78aafcc 718e159 6323bc8 718e159 6323bc8 718e159 6323bc8 718e159 6323bc8 6017dce 4dc1d14 6323bc8 718e159 6323bc8 718e159 1435c22 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import os
import shutil
from enum import Enum
from typing import List, Optional, Tuple

import pandas as pd
from langchain.docstore.document import Document
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import AwaDB, Chroma
from langchain.vectorstores.base import VectorStore

import util as util
# Folder handed to HuggingFaceEmbeddings as its cache_folder.
EMBEDDING_MODEL_FOLDER = ".embedding-model"
# Folder in which the vector store backends keep their data.
VECTORDB_FOLDER = ".vectordb"
# Default sentence-transformers model used to embed documents and queries.
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"


class VECTORDB_TYPES(Enum):
    """Vector store backends this module knows how to open."""

    AwaDB = 1
    Chroma = 2


# Backend used whenever a caller does not pick one explicitly.
VECTORDB_TYPE = VECTORDB_TYPES.Chroma
def create_documents(df: pd.DataFrame, page_content_column: str) -> List[Document]:
    """Turn a DataFrame into the documents produced by ``DataFrameLoader``.

    Args:
        df: Source table.
        page_content_column: Name of the column passed to the loader as
            ``page_content_column``.

    Returns:
        Whatever ``DataFrameLoader.load()`` yields; elsewhere in this module
        the result is passed to ``get_vectordb`` as a list of ``Document``.
    """
    loader = DataFrameLoader(df, page_content_column=page_content_column)
    return loader.load()
def define_embedding_function(model_name: str) -> HuggingFaceEmbeddings:
    """Build the HuggingFace embedding object for ``model_name``.

    Embeddings are requested with ``normalize_embeddings=True`` and the model
    cache is directed to ``EMBEDDING_MODEL_FOLDER``.
    """
    embedding_settings = {
        "model_name": model_name,
        "encode_kwargs": {"normalize_embeddings": True},
        "cache_folder": EMBEDDING_MODEL_FOLDER,
    }
    return HuggingFaceEmbeddings(**embedding_settings)
def get_vectordb(
    collection_id: str,
    embedding_function: Embeddings,
    documents: Optional[List[Document]] = None,
    vectordb_type: VECTORDB_TYPES = VECTORDB_TYPE,
) -> VectorStore:
    """Open an existing collection, or build a new one from ``documents``.

    Args:
        collection_id: Collection name (Chroma) or table name (AwaDB).
        embedding_function: Embedding object handed to the backend.
        documents: When given, a new collection is created from these
            documents. When ``None``, an existing collection is opened from
            ``VECTORDB_FOLDER``.
        vectordb_type: Backend to use; a member of ``VECTORDB_TYPES``.

    Returns:
        The opened or newly created vector store.

    Raises:
        ValueError: If ``vectordb_type`` is not a supported backend
            (previously this case silently returned ``None``).
        Exception: If ``documents`` is ``None`` and the AwaDB table cannot be
            loaded, or the Chroma collection contains no ids.
    """
    if vectordb_type is VECTORDB_TYPES.AwaDB:
        if documents is not None:
            return AwaDB.from_documents(
                documents=documents,
                embedding=embedding_function,
                table_name=collection_id,
                log_and_data_dir=VECTORDB_FOLDER,
            )
        vectordb = AwaDB(
            embedding=embedding_function, log_and_data_dir=VECTORDB_FOLDER
        )
        if not vectordb.load_local(table_name=collection_id):
            raise Exception("collection_id may not exists")
        return vectordb

    if vectordb_type is VECTORDB_TYPES.Chroma:
        if documents is not None:
            vectordb = Chroma.from_documents(
                documents=documents,
                embedding=embedding_function,
                collection_name=collection_id,
                persist_directory=VECTORDB_FOLDER,
            )
            # Persist the freshly built collection.
            vectordb.persist()
            return vectordb
        vectordb = Chroma(
            collection_name=collection_id,
            embedding_function=embedding_function,
            persist_directory=VECTORDB_FOLDER,
        )
        # Constructing Chroma does not fail for an unknown name, so an empty
        # id list is treated as "collection does not exist".
        if not vectordb.get()["ids"]:
            raise Exception("collection_id may not exists")
        return vectordb

    # Fail loudly instead of handing callers a None they will trip over later.
    raise ValueError(f"Unsupported vectordb_type: {vectordb_type!r}")
def similarity_search(
    vectordb: VectorStore, query: str, k: int = 3
) -> List[Tuple[Document, float]]:
    """Return the ``k`` best matches for ``query`` with their relevance scores.

    Sets the ``TOKENIZERS_PARALLELISM`` environment variable to ``"true"``
    for the current process before querying the store.
    """
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    scored_matches = vectordb.similarity_search_with_relevance_scores(
        query=query, k=k
    )
    return scored_matches
def load_vectordb_id(
    collection_id: str,
    page_content_column: str,
    embedding_function_name: str = EMBEDDING_MODEL,
) -> VectorStore:
    """Open the collection ``collection_id``, creating it if opening fails.

    Any exception raised while opening the existing collection is printed and
    the collection is then built via ``create_vectordb_id``.
    """
    embedder = define_embedding_function(embedding_function_name)
    try:
        return get_vectordb(
            collection_id=collection_id, embedding_function=embedder
        )
    except Exception as load_error:
        print(load_error)
        # Fall back to building the collection from its source data.
        return create_vectordb_id(collection_id, page_content_column, embedder)
def create_vectordb_id(
    collection_id: str,
    page_content_column: str,
    embedding_function: Optional[Embeddings] = None,
) -> VectorStore:
    """Build a new vector store collection for ``collection_id``.

    Args:
        collection_id: Identifier used both to locate the source data (via
            ``util.xlsx_url``) and to name the resulting collection.
        page_content_column: Column passed to ``util.read_df`` and used as the
            document page content.
        embedding_function: Embedding object to use. When ``None``, one is
            created for ``EMBEDDING_MODEL``.

    Returns:
        The vector store created by ``get_vectordb`` from the loaded documents.
    """
    if embedding_function is None:
        embedding_function = define_embedding_function(EMBEDDING_MODEL)
    # NOTE(review): util.read_df / util.xlsx_url are defined outside this
    # file; presumably they fetch the spreadsheet as a DataFrame - confirm.
    df = util.read_df(util.xlsx_url(collection_id), page_content_column)
    documents = create_documents(df, page_content_column)
    return get_vectordb(
        collection_id=collection_id,
        embedding_function=embedding_function,
        documents=documents,
    )
def load_vectordb(sheet_url: str, page_content_column: str) -> VectorStore:
    """Open (or create) the vector store for the sheet at ``sheet_url``.

    The collection id is derived from the URL with ``util.get_id``.
    """
    collection_id = util.get_id(sheet_url)
    return load_vectordb_id(collection_id, page_content_column)
def delete_vectordb() -> None:
    """Remove the whole ``VECTORDB_FOLDER`` tree, ignoring removal errors."""
    shutil.rmtree(path=VECTORDB_FOLDER, ignore_errors=True)
def delete_vectordb_current_collection(vectordb: VectorStore) -> None:
    """Delete the data behind ``vectordb``.

    The backend is taken from the instance itself, so a store opened with a
    non-default ``vectordb_type`` is still removed the right way. Objects that
    are neither ``Chroma`` nor ``AwaDB`` fall back to the module default
    ``VECTORDB_TYPE``, which is what was used for every instance before.

    Args:
        vectordb: The store whose collection should be removed.
    """
    if isinstance(vectordb, Chroma):
        backend = VECTORDB_TYPES.Chroma
    elif isinstance(vectordb, AwaDB):
        backend = VECTORDB_TYPES.AwaDB
    else:
        backend = VECTORDB_TYPE

    if backend is VECTORDB_TYPES.Chroma:
        vectordb.delete_collection()
        vectordb.persist()
    elif backend is VECTORDB_TYPES.AwaDB:
        # No per-table delete is used here: the whole folder is removed.
        delete_vectordb()
|