Upload helper.py
Browse files
helper.py
CHANGED
@@ -42,8 +42,8 @@ llm = HuggingFacePipeline(pipeline=pipe)
|
|
42 |
# # Initialize instructor embeddings using the Hugging Face model
|
43 |
# instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="C:/Users/arasu/Workspace/Projects/GenAI/embeddings/hkunlp_instructor-large")
|
44 |
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
|
|
|
45 |
vector_db = ""
|
46 |
-
|
47 |
def create_vector_db():
|
48 |
# Load data from pdf
|
49 |
raw_text = ""
|
@@ -53,14 +53,13 @@ def create_vector_db():
|
|
53 |
chunk_overlap = 100,
|
54 |
length_function = len,
|
55 |
)
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
raw_text += content
|
64 |
texts = text_splitter.split_text(raw_text)
|
65 |
|
66 |
# Create a vector database from 'text'
|
|
|
42 |
# # Initialize instructor embeddings using the Hugging Face model
|
43 |
# instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="C:/Users/arasu/Workspace/Projects/GenAI/embeddings/hkunlp_instructor-large")
|
44 |
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
|
45 |
+
# db_path = "vector_db"
|
46 |
vector_db = ""
|
|
|
47 |
def create_vector_db():
|
48 |
# Load data from pdf
|
49 |
raw_text = ""
|
|
|
53 |
chunk_overlap = 100,
|
54 |
length_function = len,
|
55 |
)
|
56 |
+
from PyPDF2 import PdfReader
|
57 |
+
pdf = PdfReader("employment-agreement2018.pdf")
|
58 |
+
raw_text = ""
|
59 |
+
for i, page in enumerate(pdf.pages):
|
60 |
+
content = page.extract_text()
|
61 |
+
if content:
|
62 |
+
raw_text += content
|
|
|
63 |
texts = text_splitter.split_text(raw_text)
|
64 |
|
65 |
# Create a vector database from 'text'
|