Spaces:
Sleeping
Sleeping
from langchain_community.document_loaders import PyPDFLoader | |
from langchain_text_splitters import RecursiveCharacterTextSplitter | |
import os | |
import download_and_extract_zip | |
def gen_splits(chunk_size: int = 7500, chunk_overlap: int = 500):
    """Download the PDF archive, load every PDF in it, and split the text.

    The archive location is read from the ``URL`` environment variable and
    handed to ``download_and_extract_zip`` together with the current working
    directory as the destination. Every file directly inside the
    ``Model_TS_Full`` folder whose name ends in ``.pdf`` (case-insensitive)
    is loaded with ``PyPDFLoader``; the loaded documents are then split with
    ``RecursiveCharacterTextSplitter``.

    Args:
        chunk_size: ``chunk_size`` passed to the text splitter.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter.

    Returns:
        The chunks produced by ``split_documents`` for all loaded PDFs.
    """
    # NOTE(review): os.getenv returns None when URL is unset, and that None
    # is forwarded as-is to the download helper -- confirm the helper copes.
    archive_url = os.getenv('URL')
    destination_folder = os.getcwd()
    download_and_extract_zip.download_and_extract_zip(
        archive_url, destination_folder
    )

    # Resolve the folder against the destination we asked the helper to use,
    # so listing and path building always refer to the same directory.
    pdf_dir = os.path.join(destination_folder, "Model_TS_Full")

    # os.listdir yields entries in arbitrary order; sorting keeps the order
    # of the documents (and therefore of the chunks) stable across runs.
    pdf_paths = [
        os.path.join(pdf_dir, entry)
        for entry in sorted(os.listdir(pdf_dir))
        if entry.lower().endswith(".pdf")
    ]

    docs = []
    for pdf_path in pdf_paths:
        docs.extend(PyPDFLoader(pdf_path).load())

    # Splitting Documents
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(docs)