andreasmartin committed on
Commit
5005601
·
1 Parent(s): 4cdba7b

deepnote update

Browse files
Files changed (2) hide show
  1. faq.py +74 -0
  2. requirements.txt +4 -1
faq.py CHANGED
@@ -1 +1,75 @@
1
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pandas as pd
2
+ from langchain.document_loaders import DataFrameLoader
3
+ from langchain.embeddings import HuggingFaceEmbeddings
4
+ from langchain.vectorstores import AwaDB
5
+ from typing import List, Tuple
6
+ from langchain.docstore.document import Document
7
+ from langchain.embeddings.base import Embeddings
8
+ from langchain.vectorstores.base import VectorStore
9
+ import os
10
+
11
# A Google Sheets edit URL is laid out as
#   <sheet_url_x><spreadsheet id><sheet_url_y><gid>
sheet_url_x = "https://docs.google.com/spreadsheets/d/"
sheet_url_y = "/edit#gid="
# Substituted for sheet_url_y to turn an edit URL into an export URL.
sheet_url_y_exp = "/export?gid="

# Handed to HuggingFaceEmbeddings as its cache_folder.
cache_folder = ".embedding-model"
# Handed to AwaDB as its log_and_data_dir.
dir_vectordb = ".vectordb"
16
+
17
+
18
def faq_id(sheet_url: str) -> str:
    """Derive a FAQ identifier from a Google Sheets edit URL.

    The identifier is ``<spreadsheet id>-<gid>``: the text found between
    ``sheet_url_x`` and ``sheet_url_y``, a dash, and everything that
    follows ``sheet_url_y``.

    Args:
        sheet_url: URL containing ``sheet_url_x`` followed later by
            ``sheet_url_y``.

    Returns:
        The identifier string.

    Raises:
        ValueError: If either marker is missing, or ``sheet_url_y`` does
            not come after ``sheet_url_x``.
    """
    prefix_at = sheet_url.find(sheet_url_x)
    suffix_at = sheet_url.find(sheet_url_y)
    id_start = prefix_at + len(sheet_url_x)
    # str.find signals "not found" with -1; slicing with that value would
    # silently yield a meaningless identifier, so reject the URL instead.
    # A missing suffix (-1) is also caught by the ordering comparison.
    if prefix_at == -1 or suffix_at < id_start:
        raise ValueError(f"not a Google Sheets edit URL: {sheet_url!r}")
    spreadsheet_id = sheet_url[id_start:suffix_at]
    gid = sheet_url[suffix_at + len(sheet_url_y):]
    return spreadsheet_id + "-" + gid
22
+
23
+
24
def xlsx_url(sheet_url: str) -> str:
    """Return ``sheet_url`` with the edit marker swapped for the export marker.

    Every occurrence of ``sheet_url_y`` is replaced by ``sheet_url_y_exp``;
    a URL that does not contain the marker is returned unchanged.
    """
    export_url = sheet_url.replace(sheet_url_y, sheet_url_y_exp)
    return export_url
26
+
27
+
28
def xlsx_url_faq_id(faq_id: str) -> str:
    """Rebuild the export URL from an identifier produced by ``faq_id``.

    The identifier is split at its last dash: the part before it is placed
    after ``sheet_url_x`` and the part after it is placed after
    ``sheet_url_y_exp``.
    """
    # Split on the LAST dash, so dashes earlier in the identifier stay
    # with the first part.
    dash = faq_id.rfind("-")
    spreadsheet_id = faq_id[:dash]
    gid = faq_id[dash + 1:]
    return "".join((sheet_url_x, spreadsheet_id, sheet_url_y_exp, gid))
31
+
32
+
33
def read_df(xlsx_url: str) -> pd.DataFrame:
    """Load the spreadsheet at ``xlsx_url`` into a DataFrame.

    The first row supplies the column labels, and pandas' default set of
    NaN markers is not applied while parsing.
    """
    read_options = {
        "header": 0,
        "keep_default_na": False,
    }
    return pd.read_excel(xlsx_url, **read_options)
35
+
36
+
37
def create_documents(df: pd.DataFrame, page_content_column: str) -> List[Document]:
    """Convert the rows of ``df`` into LangChain documents.

    Args:
        df: Source data.
        page_content_column: Name of the column passed to
            ``DataFrameLoader`` as ``page_content_column``.

    Returns:
        Whatever ``DataFrameLoader.load()`` yields: a list of ``Document``
        objects (not a DataFrame, as the previous annotation claimed).
    """
    loader = DataFrameLoader(df, page_content_column=page_content_column)
    return loader.load()
40
+
41
+
42
def embedding_function(model_name: str) -> HuggingFaceEmbeddings:
    """Create the HuggingFace embedding wrapper for ``model_name``.

    The wrapper is configured with ``normalize_embeddings`` enabled in its
    encode options and with the module-level ``cache_folder``.
    """
    settings = {
        "model_name": model_name,
        "encode_kwargs": {"normalize_embeddings": True},
        "cache_folder": cache_folder,
    }
    return HuggingFaceEmbeddings(**settings)
48
+
49
+
50
def vectordb(
    faq_id: str,
    documents: List[Document],
    embedding_function: Embeddings,
    init: bool = False,
) -> VectorStore:
    """Return the AwaDB store for the table named ``faq_id``.

    Args:
        faq_id: Used as the AwaDB table name.
        documents: Documents to index; only read when ``init`` is true.
        embedding_function: Embedding model handed to AwaDB.
        init: When true, build the table from ``documents``; otherwise
            open a store and load the existing table.

    Returns:
        The AwaDB instance, using ``dir_vectordb`` as its data directory.
    """
    if not init:
        # Existing table: open the store first, then point it at the table.
        store = AwaDB(
            embedding=embedding_function,
            log_and_data_dir=dir_vectordb,
        )
        store.load_local(table_name=faq_id)
        return store

    return AwaDB.from_documents(
        documents=documents,
        embedding=embedding_function,
        table_name=faq_id,
        log_and_data_dir=dir_vectordb,
    )
71
+
72
+
73
def similarity_search(vectordb: VectorStore, query: str, k: int) -> List[Tuple[Document, float]]:
    """Run a relevance-scored similarity search for ``query``.

    Sets the ``TOKENIZERS_PARALLELISM`` environment variable to ``"true"``
    on every call, then delegates to
    ``vectordb.similarity_search_with_relevance_scores`` with ``query``
    and ``k``.
    """
    # NOTE(review): presumably set to silence the HF tokenizers
    # fork/parallelism warning — confirm.
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    scored_hits = vectordb.similarity_search_with_relevance_scores(
        query=query,
        k=k,
    )
    return scored_hits
requirements.txt CHANGED
@@ -1 +1,4 @@
1
-
 
 
 
 
1
+ openpyxl
2
+ langchain
3
+ sentence_transformers
4
+ awadb