"""Ingest PDF documents into a PGVector collection.

Loads every PDF under ``source_docs`` (sibling of this script's parent
directory), splits them into semantically coherent chunks, embeds the
chunks with OpenAI, and (re)builds the PGVector collection named by
``PG_COLLECTION_NAME``.

Requires the ``POSTGRES_URL`` environment variable (loaded from ``.env``
via python-dotenv) and OpenAI credentials in the environment.
"""

import os

from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader, UnstructuredPDFLoader
from langchain_community.vectorstores.pgvector import PGVector
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

from app.config import EMBEDDING_MODEL, PG_COLLECTION_NAME


def main() -> None:
    """Run the full ingest pipeline: load -> chunk -> embed -> store."""
    load_dotenv()

    # Fail fast with a clear message instead of an opaque driver error
    # deep inside PGVector when the env var is missing.
    connection_string = os.getenv("POSTGRES_URL")
    if not connection_string:
        raise RuntimeError(
            "POSTGRES_URL environment variable is not set; "
            "add it to your environment or .env file."
        )

    # Resolve the docs directory relative to this file, not the CWD,
    # so the script works regardless of where it is invoked from.
    docs_dir = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "..", "source_docs")
    )

    loader = DirectoryLoader(
        docs_dir,
        glob="**/*.pdf",
        use_multithreading=True,
        show_progress=True,
        max_concurrency=50,
        loader_cls=UnstructuredPDFLoader,
    )
    docs = loader.load()

    embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

    # SemanticChunker splits on embedding-similarity breakpoints rather
    # than fixed character counts, so chunks follow topic boundaries.
    text_splitter = SemanticChunker(embeddings=embeddings)
    chunks = text_splitter.split_documents(docs)

    # pre_delete_collection=True drops any existing collection of the
    # same name first — this is a full rebuild, not an incremental load.
    PGVector.from_documents(
        documents=chunks,
        embedding=embeddings,
        collection_name=PG_COLLECTION_NAME,
        connection_string=connection_string,
        pre_delete_collection=True,
    )


# Guard the entry point: importing this module must not trigger a
# destructive re-ingestion (network calls + collection drop).
if __name__ == "__main__":
    main()