import os

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import download_and_extract_zip


def gen_splits(chunk_size=7500, chunk_overlap=500):
    """Download the document archive and return its PDFs as text chunks.

    The archive location is read from the ``URL`` environment variable and
    handed to ``download_and_extract_zip.download_and_extract_zip`` together
    with the current working directory. Every ``.pdf`` file (matched
    case-insensitively) found directly inside ``Model_TS_Full`` under that
    directory is then loaded with ``PyPDFLoader`` and split with
    ``RecursiveCharacterTextSplitter``.

    Args:
        chunk_size: Value passed to the splitter as ``chunk_size``.
            Defaults to 7500.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 500.

    Returns:
        The list produced by ``split_documents`` over all loaded PDFs,
        with files processed in sorted path order.

    Raises:
        ValueError: If the ``URL`` environment variable is unset or empty.
    """
    url = os.getenv("URL")
    if not url:
        # Fail early with a clear message instead of handing None to the
        # downloader and surfacing an opaque error from deep inside it.
        raise ValueError("The 'URL' environment variable must be set.")

    destination_folder = os.getcwd()
    # NOTE(review): presumably this extracts the archive into
    # destination_folder and creates 'Model_TS_Full' there -- confirm
    # against the helper; the listing below depends on that directory.
    download_and_extract_zip.download_and_extract_zip(url, destination_folder)

    pdf_dir = os.path.join(destination_folder, "Model_TS_Full")
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting chunk order reproducible across runs and machines.
    pdf_paths = sorted(
        os.path.join(pdf_dir, name)
        for name in os.listdir(pdf_dir)
        if name.lower().endswith(".pdf")
    )

    docs = []
    for pdf_path in pdf_paths:
        docs.extend(PyPDFLoader(pdf_path).load())

    # Splitting documents
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)