Engg-SS_ChatBOT / gen_splits.py
abhivsh's picture
Update gen_splits.py
ca1c8f9 verified
raw
history blame
888 Bytes
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import download_and_extract_zip
def gen_splits(chunk_size=7500, chunk_overlap=500):
    """Download the document archive, load its PDFs, and split them into chunks.

    The archive location is read from the ``URL`` environment variable and is
    handed to ``download_and_extract_zip`` together with the current working
    directory. Every ``.pdf`` file (case-insensitive) found directly inside the
    ``Model_TS_Full`` folder of the working directory is then loaded with
    ``PyPDFLoader`` and split with ``RecursiveCharacterTextSplitter``.

    Args:
        chunk_size: ``chunk_size`` passed to the text splitter. Defaults to
            7500, the value that was previously hard-coded.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter. Defaults
            to 500, the value that was previously hard-coded.

    Returns:
        The value returned by ``RecursiveCharacterTextSplitter.split_documents``
        for all loaded PDF documents.

    Raises:
        RuntimeError: If the ``URL`` environment variable is unset or empty.
    """
    url = os.getenv('URL')
    if not url:
        # Fail early with an actionable message instead of handing None (or an
        # empty string) to the downloader.
        raise RuntimeError(
            "Environment variable 'URL' is not set; "
            "cannot download the document archive."
        )

    destination_folder = os.getcwd()
    download_and_extract_zip.download_and_extract_zip(url, destination_folder)

    # NOTE(review): assumes the downloaded archive extracts to a
    # 'Model_TS_Full' folder inside the working directory - confirm against
    # download_and_extract_zip.
    pdf_dir = os.path.join(destination_folder, 'Model_TS_Full')

    docs = []
    # os.listdir returns entries in arbitrary order; sorting keeps the order of
    # the loaded documents (and therefore of the chunks) reproducible.
    for name in sorted(os.listdir(pdf_dir)):
        path = os.path.join(pdf_dir, name)
        # Skip non-PDF entries and anything that is not a regular file (for
        # example a sub-directory whose name happens to end in ".pdf").
        if name.lower().endswith(".pdf") and os.path.isfile(path):
            docs.extend(PyPDFLoader(path).load())

    # Splitting Documents
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)