File size: 965 Bytes
2c9f2c4 861cd81 2c9f2c4 9f6b354 2c9f2c4 9f6b354 2c9f2c4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader, UnstructuredPDFLoader
from langchain_community.vectorstores.pgvector import PGVector
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
from app.config import EMBEDDING_MODEL, PG_COLLECTION_NAME
# Ingestion script: load every PDF under ../source_docs, split the text into
# semantic chunks, and (re)build the pgvector collection named by
# PG_COLLECTION_NAME from those chunks.

# Load variables from a .env file into the process environment.
load_dotenv()

# Validate the database URL up front. os.getenv() returns None when the
# variable is unset; checking here fails fast with a clear message instead of
# failing at the very end, after the PDFs have been loaded and chunked.
connection_string = os.getenv("POSTGRES_URL")
if not connection_string:
    raise RuntimeError(
        "POSTGRES_URL is not set; define it in the environment or in .env "
        "before running the ingestion script."
    )

# NOTE: os.path.abspath() resolves "../source_docs" against the current
# working directory, not this file's location, so the script has to be
# launched from the directory it was written to run in.
loader = DirectoryLoader(
    os.path.abspath("../source_docs"),
    glob="**/*.pdf",
    use_multithreading=True,
    show_progress=True,
    max_concurrency=50,
    loader_cls=UnstructuredPDFLoader,
)
docs = loader.load()

# The same embeddings object is handed to both the splitter and the vector
# store.
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
text_splitter = SemanticChunker(embeddings=embeddings)
chunks = text_splitter.split_documents(docs)

# pre_delete_collection=True is requested below, so writing an empty chunk
# list would presumably leave the collection empty (confirm against the
# PGVector docs). Refuse to proceed rather than discard existing data.
if not chunks:
    raise RuntimeError(
        "No document chunks were produced from ../source_docs; refusing to "
        "replace the existing collection with nothing."
    )

PGVector.from_documents(
    documents=chunks,
    embedding=embeddings,
    collection_name=PG_COLLECTION_NAME,
    connection_string=connection_string,
    pre_delete_collection=True,
)
|