File size: 965 Bytes
2c9f2c4
 
 
 
 
 
 
 
861cd81
2c9f2c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f6b354
2c9f2c4
 
 
 
 
 
 
 
9f6b354
2c9f2c4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import os

from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader, UnstructuredPDFLoader
from langchain_community.vectorstores.pgvector import PGVector
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

from app.config import EMBEDDING_MODEL, PG_COLLECTION_NAME

# Import script: read the PDFs under ../source_docs, split them into chunks
# and store the embedded chunks in the PGVector collection PG_COLLECTION_NAME.

# Load settings from a .env file into the process environment before any of
# them are read below.
load_dotenv()

# Validate the database setting up front. os.getenv returns None when the
# variable is missing; without this guard that None would only be handed to
# PGVector in the last statement, after every document had already been
# loaded, chunked and embedded.
connection_string = os.getenv("POSTGRES_URL")
if not connection_string:
    raise RuntimeError(
        "POSTGRES_URL is not set; define it in the environment or in a .env "
        "file before running the import."
    )

# Collect every PDF (recursively, via the "**/*.pdf" glob) below
# ../source_docs. The relative path is resolved by os.path.abspath against
# the current working directory, not against this file's location, so the
# script has to be started from the directory it was written for.
loader = DirectoryLoader(
    os.path.abspath("../source_docs"),
    glob="**/*.pdf",
    use_multithreading=True,
    show_progress=True,
    max_concurrency=50,
    loader_cls=UnstructuredPDFLoader,
)
docs = loader.load()

# The same embeddings object is passed to the splitter and to the vector
# store below.
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

text_splitter = SemanticChunker(embeddings=embeddings)

chunks = text_splitter.split_documents(docs)

# NOTE(review): pre_delete_collection=True presumably drops any existing
# collection with this name before inserting, making each run a full
# rebuild -- confirm against the PGVector documentation.
PGVector.from_documents(
    documents=chunks,
    embedding=embeddings,
    collection_name=PG_COLLECTION_NAME,
    connection_string=connection_string,
    pre_delete_collection=True,
)