File size: 4,336 Bytes
4dc1d14
4cdba7b
5005601
 
31d4f49
5005601
 
 
 
 
6323bc8
31d4f49
5005601
31d4f49
8c2f0ba
6323bc8
78aafcc
1435c22
5005601
 
 
 
 
 
 
6017dce
5005601
 
 
31d4f49
5005601
 
 
6017dce
718e159
78aafcc
 
 
5005601
 
31d4f49
78aafcc
31d4f49
78aafcc
 
 
718e159
 
31d4f49
 
 
 
718e159
31d4f49
 
78aafcc
31d4f49
78aafcc
718e159
78aafcc
 
 
31d4f49
718e159
31d4f49
 
 
 
718e159
31d4f49
 
718e159
5005601
 
 
8c2f0ba
6017dce
8c2f0ba
5005601
8c2f0ba
6017dce
 
6323bc8
718e159
6323bc8
 
 
 
6017dce
 
718e159
6017dce
78aafcc
718e159
6323bc8
 
 
 
 
718e159
6323bc8
 
 
 
 
 
718e159
6323bc8
 
718e159
6323bc8
6017dce
 
 
 
4dc1d14
6323bc8
 
718e159
6323bc8
718e159
 
 
1435c22
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import util as util
import pandas as pd
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import AwaDB, Chroma
from typing import List, Tuple
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VectorStore
import os
import shutil
from enum import Enum

# Local cache folder where downloaded embedding-model weights are stored.
EMBEDDING_MODEL_FOLDER = ".embedding-model"
# On-disk persistence folder shared by both vector-store backends (AwaDB/Chroma).
VECTORDB_FOLDER = ".vectordb"
# Default sentence-transformers model used to embed documents and queries.
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
# Closed set of supported vector-store backends.
VECTORDB_TYPES = Enum("VECTORDB_TYPES", ["AwaDB", "Chroma"])
# Backend selected for this module; switch here to change storage engine.
VECTORDB_TYPE = VECTORDB_TYPES.Chroma


def create_documents(df: pd.DataFrame, page_content_column: str) -> List[Document]:
    """Convert a DataFrame into LangChain ``Document`` objects.

    Each row becomes one Document: ``page_content`` is taken from
    *page_content_column* and the remaining columns become metadata.

    Args:
        df: Source data, one row per document.
        page_content_column: Column whose values become document text.

    Returns:
        The loaded documents. (Fixes the previous ``-> pd.DataFrame``
        annotation, which did not match what ``loader.load()`` returns.)
    """
    loader = DataFrameLoader(df, page_content_column=page_content_column)
    return loader.load()


def define_embedding_function(model_name: str) -> HuggingFaceEmbeddings:
    """Build a HuggingFace embedding function that L2-normalizes its vectors.

    Model weights are cached under ``EMBEDDING_MODEL_FOLDER`` so repeated
    runs avoid re-downloading.
    """
    encode_options = {"normalize_embeddings": True}
    embedder = HuggingFaceEmbeddings(
        model_name=model_name,
        encode_kwargs=encode_options,
        cache_folder=EMBEDDING_MODEL_FOLDER,
    )
    return embedder


def get_vectordb(
    collection_id: str,
    embedding_function: Embeddings,
    documents: List[Document] = None,
    vectordb_type: VECTORDB_TYPES = VECTORDB_TYPE,
) -> VectorStore:
    """Load an existing collection, or create one from *documents*.

    Args:
        collection_id: Name of the collection/table to open or create.
        embedding_function: Embedding used for documents and queries.
        documents: When ``None``, an existing collection is loaded and an
            error is raised if it appears missing/empty; otherwise a new
            collection is built from these documents.
        vectordb_type: Backend to use. (Previously annotated ``str``,
            but Enum members are what the ``is`` comparisons expect.)

    Returns:
        The ready vector store, or ``None`` if *vectordb_type* matches
        no known backend.

    Raises:
        Exception: When loading a collection that does not seem to exist.
    """
    vectordb = None

    if vectordb_type is VECTORDB_TYPES.AwaDB:
        if documents is None:
            vectordb = AwaDB(
                embedding=embedding_function, log_and_data_dir=VECTORDB_FOLDER
            )
            # load_local returns a falsy value when the table is unknown.
            if not vectordb.load_local(table_name=collection_id):
                raise Exception("collection_id may not exist")
        else:
            vectordb = AwaDB.from_documents(
                documents=documents,
                embedding=embedding_function,
                table_name=collection_id,
                log_and_data_dir=VECTORDB_FOLDER,
            )
    elif vectordb_type is VECTORDB_TYPES.Chroma:
        if documents is None:
            vectordb = Chroma(
                collection_name=collection_id,
                embedding_function=embedding_function,
                persist_directory=VECTORDB_FOLDER,
            )
            # An empty id list means the collection holds no data.
            if not vectordb.get()["ids"]:
                raise Exception("collection_id may not exist")
        else:
            vectordb = Chroma.from_documents(
                documents=documents,
                embedding=embedding_function,
                collection_name=collection_id,
                persist_directory=VECTORDB_FOLDER,
            )
            # Flush the new collection to disk immediately.
            vectordb.persist()
    return vectordb


def similarity_search(
    vectordb: VectorStore, query: str, k: int = 3
) -> List[Tuple[Document, float]]:
    """Return the top-*k* documents for *query*, each with a relevance score."""
    # HuggingFace tokenizers complain about forking unless this is set explicitly.
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    hits = vectordb.similarity_search_with_relevance_scores(query=query, k=k)
    return hits


def load_vectordb_id(
    collection_id: str,
    page_content_column: str,
    embedding_function_name: str = EMBEDDING_MODEL,
) -> VectorStore:
    """Fetch the persisted store for *collection_id*, building it on demand.

    First tries to load an existing collection; if that fails (e.g. it was
    never created), falls back to building it from the source spreadsheet.
    """
    embedder = define_embedding_function(embedding_function_name)
    try:
        return get_vectordb(
            collection_id=collection_id, embedding_function=embedder
        )
    except Exception as err:
        # Best-effort recovery: report the load failure, then rebuild.
        print(err)
        return create_vectordb_id(collection_id, page_content_column, embedder)


def create_vectordb_id(
    collection_id: str,
    page_content_column: str,
    embedding_function: HuggingFaceEmbeddings = None,
) -> VectorStore:
    """Build a fresh vector store from the spreadsheet behind *collection_id*.

    When *embedding_function* is omitted, the module default model is used.
    """
    embedder = (
        define_embedding_function(EMBEDDING_MODEL)
        if embedding_function is None
        else embedding_function
    )

    frame = util.read_df(util.xlsx_url(collection_id), page_content_column)
    docs = create_documents(frame, page_content_column)
    return get_vectordb(
        collection_id=collection_id,
        embedding_function=embedder,
        documents=docs,
    )


def load_vectordb(sheet_url: str, page_content_column: str) -> VectorStore:
    """Resolve *sheet_url* to a collection id and load (or build) its store."""
    collection_id = util.get_id(sheet_url)
    return load_vectordb_id(collection_id, page_content_column)


def delete_vectordb() -> None:
    """Remove the on-disk vector store folder entirely, ignoring any errors."""
    shutil.rmtree(VECTORDB_FOLDER, ignore_errors=True)


def delete_vectordb_current_collection(vectordb: VectorStore) -> None:
    """Drop the currently loaded collection from the configured backend."""
    if VECTORDB_TYPE is VECTORDB_TYPES.Chroma:
        # Chroma can drop a single collection and persist the change in place.
        vectordb.delete_collection()
        vectordb.persist()
    elif VECTORDB_TYPE is VECTORDB_TYPES.AwaDB:
        # No per-collection delete is used here for AwaDB; wipe the whole store.
        delete_vectordb()