import glob
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from transformers import AutoTokenizer
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant

path_to_data = "./data/"


def process_pdf() -> dict:
    """Load the configured PDFs, chunk them, and index each in Qdrant.

    Each PDF listed in ``files`` is loaded with ``PyMuPDFLoader``, split into
    token-bounded chunks, tagged with its short key in ``metadata["source"]``,
    and embedded into its own in-memory Qdrant collection named after that key.

    A PDF that fails to load is reported on stdout and skipped; the remaining
    files are still processed.

    Returns:
        A dict mapping each successfully loaded file key (e.g. ``'ABC'``) to
        the ``Qdrant`` vector store built from that file's chunks.
    """
    files = {
        'ABC': './data/MWTS2021.pdf',
        'XYZ': './data/Consolidated2021.pdf',
    }

    docs = {}
    for file, value in files.items():
        # Best-effort loading: one unreadable PDF must not abort the others.
        try:
            docs[file] = PyMuPDFLoader(value).load()
        except Exception as e:
            print("Exception: ", e)

    # text splitter based on the tokenizer of a model of your choosing
    # to make texts fit exactly a transformer's context window size
    # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
    # NOTE(review): chunk lengths are measured with the bge-small tokenizer,
    # while the embeddings below use all-mpnet-base-v2 -- confirm the
    # mismatch is intentional.
    chunk_size = 256
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n"],
    )

    all_documents = {}
    for file, value in docs.items():
        doc_processed = text_splitter.split_documents(value)
        # Replace the loader-provided source with the short file key so that
        # every chunk can be traced back to its logical document.
        for doc in doc_processed:
            doc.metadata["source"] = file
        all_documents[file] = doc_processed

    print(all_documents.keys())
    # .get() rather than ['ABC']: that file may have been skipped above.
    print(all_documents.get('ABC'))

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )

    qdrant_collections = {}
    # .items() is required: iterating the dict directly yields only keys, and
    # unpacking a key string into (file, value) raises ValueError.
    for file, value in all_documents.items():
        qdrant_collections[file] = Qdrant.from_documents(
            value,
            embeddings,
            location=":memory:",
            collection_name=file,
        )

    return qdrant_collections