Spaces:
Sleeping
Sleeping
andreasmartin
committed on
Commit
·
8c2f0ba
1
Parent(s):
96db48f
deepnote update
Browse files
faq.py
CHANGED
@@ -8,22 +8,22 @@ from langchain.embeddings.base import Embeddings
|
|
8 |
from langchain.vectorstores.base import VectorStore
|
9 |
import os
|
10 |
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
|
17 |
|
18 |
def faq_id(sheet_url: str) -> str:
|
19 |
-
x = sheet_url.find(
|
20 |
-
y = sheet_url.find(
|
21 |
-
return sheet_url[x + len(
|
22 |
|
23 |
|
24 |
def xlsx_url(faq_id: str) -> str:
|
25 |
y = faq_id.rfind("-")
|
26 |
-
return
|
27 |
|
28 |
|
29 |
def read_df(xlsx_url: str) -> pd.DataFrame:
|
@@ -39,21 +39,16 @@ def embedding_function(model_name: str) -> HuggingFaceEmbeddings:
|
|
39 |
return HuggingFaceEmbeddings(
|
40 |
model_name=model_name,
|
41 |
encode_kwargs={"normalize_embeddings": True},
|
42 |
-
cache_folder=
|
43 |
)
|
44 |
|
45 |
|
46 |
def vectordb(
|
47 |
-
faq_id: str,
|
48 |
-
embedding_function: Embeddings,
|
49 |
-
documents: List[Document] = None
|
50 |
) -> VectorStore:
|
51 |
vectordb = None
|
52 |
if documents is None:
|
53 |
-
vectordb = AwaDB(
|
54 |
-
embedding=embedding_function,
|
55 |
-
log_and_data_dir=dir_vectordb
|
56 |
-
)
|
57 |
success = vectordb.load_local(table_name=faq_id)
|
58 |
if not success:
|
59 |
raise Exception("faq_id may not exists")
|
@@ -62,11 +57,13 @@ def vectordb(
|
|
62 |
documents=documents,
|
63 |
embedding=embedding_function,
|
64 |
table_name=faq_id,
|
65 |
-
log_and_data_dir=
|
66 |
)
|
67 |
return vectordb
|
68 |
|
69 |
|
70 |
-
def similarity_search(
|
|
|
|
|
71 |
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
72 |
-
return vectordb.similarity_search_with_relevance_scores(query=query, k=k)
|
|
|
8 |
from langchain.vectorstores.base import VectorStore
|
9 |
import os
|
10 |
|
11 |
+
# Markers used to take apart / rebuild Google Sheets URLs.
SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"  # prefix before the document part
SHEET_URL_Y = "/edit#gid="  # separator found in the edit URL
SHEET_URL_Y_EXPORT = "/export?gid="  # separator used when building the export URL

# Local folders passed as `cache_folder` / `log_and_data_dir` further down.
CACHE_FOLDER = ".embedding-model"
VECTORDB_FOLDER = ".vectordb"
|
16 |
|
17 |
|
18 |
def faq_id(sheet_url: str) -> str:
    """Derive a FAQ identifier from a sheet URL.

    The identifier is the text found between ``SHEET_URL_X`` and
    ``SHEET_URL_Y`` in ``sheet_url``, followed by ``"-"`` and everything
    that comes after ``SHEET_URL_Y``.

    Args:
        sheet_url: URL containing ``SHEET_URL_X`` and, later on,
            ``SHEET_URL_Y``.

    Returns:
        The two extracted parts joined by a single ``"-"``.

    Raises:
        ValueError: If either marker is missing from ``sheet_url`` or the
            markers appear in the wrong order.
    """
    x = sheet_url.find(SHEET_URL_X)
    y = sheet_url.find(SHEET_URL_Y)
    # str.find returns -1 on a miss; slicing with that index would silently
    # produce a meaningless identifier, so reject such URLs explicitly.
    if x == -1 or y == -1:
        raise ValueError(f"unsupported sheet URL: {sheet_url!r}")
    start = x + len(SHEET_URL_X)
    if y < start:
        raise ValueError(f"unsupported sheet URL: {sheet_url!r}")
    return sheet_url[start:y] + "-" + sheet_url[y + len(SHEET_URL_Y):]
|
22 |
|
23 |
|
24 |
def xlsx_url(faq_id: str) -> str:
    """Build an export URL from a FAQ identifier.

    The identifier is split at its last ``"-"``: the text before it is
    placed after ``SHEET_URL_X`` and the text after it is placed after
    ``SHEET_URL_Y_EXPORT``.

    Args:
        faq_id: Identifier whose last ``"-"`` separates the two parts.

    Returns:
        The concatenation ``SHEET_URL_X + <head> + SHEET_URL_Y_EXPORT + <tail>``.
    """
    # Use the last dash so that dashes inside the head part are preserved.
    dash = faq_id.rfind("-")
    head = faq_id[0:dash]
    tail = faq_id[dash + 1:]
    return f"{SHEET_URL_X}{head}{SHEET_URL_Y_EXPORT}{tail}"
|
27 |
|
28 |
|
29 |
def read_df(xlsx_url: str) -> pd.DataFrame:
|
|
|
39 |
return HuggingFaceEmbeddings(
|
40 |
model_name=model_name,
|
41 |
encode_kwargs={"normalize_embeddings": True},
|
42 |
+
cache_folder=CACHE_FOLDER,
|
43 |
)
|
44 |
|
45 |
|
46 |
def vectordb(
|
47 |
+
faq_id: str, embedding_function: Embeddings, documents: List[Document] = None
|
|
|
|
|
48 |
) -> VectorStore:
|
49 |
vectordb = None
|
50 |
if documents is None:
|
51 |
+
vectordb = AwaDB(embedding=embedding_function, log_and_data_dir=VECTORDB_FOLDER)
|
|
|
|
|
|
|
52 |
success = vectordb.load_local(table_name=faq_id)
|
53 |
if not success:
|
54 |
raise Exception("faq_id may not exists")
|
|
|
57 |
documents=documents,
|
58 |
embedding=embedding_function,
|
59 |
table_name=faq_id,
|
60 |
+
log_and_data_dir=VECTORDB_FOLDER,
|
61 |
)
|
62 |
return vectordb
|
63 |
|
64 |
|
65 |
+
def similarity_search(
    vectordb: VectorStore, query: str, k: int
) -> List[Tuple[Document, float]]:
    """Run a relevance-scored similarity search on ``vectordb``.

    Sets the ``TOKENIZERS_PARALLELISM`` environment variable to ``"true"``
    and then delegates to
    ``vectordb.similarity_search_with_relevance_scores``.

    Args:
        vectordb: Store to query.
        query: Text forwarded as ``query``.
        k: Value forwarded as ``k``.

    Returns:
        Whatever ``similarity_search_with_relevance_scores`` returns for the
        given ``query`` and ``k``.
    """
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    scored_documents = vectordb.similarity_search_with_relevance_scores(
        query=query, k=k
    )
    return scored_documents
|