Wagner / data_loader.py
SalehAhmad's picture
Upload 6 files
10d250d verified
import os
from docx import Document
import PyPDF2
class ChatbotDataLoader:
def __init__(self):
pass
def read_docx(self, file_path):
"""
Reads content from a .docx file.
"""
doc = Document(file_path)
content = "\n".join([para.text for para in doc.paragraphs])
return content
def read_pdf(self, file_path):
"""
Reads content from a .pdf file.
"""
with open(file_path, "rb") as file:
reader = PyPDF2.PdfReader(file)
content = ""
for page in range(len(reader.pages)):
content += reader.pages[page].extract_text()
return content
def load_file(self, file_path):
"""
Reads content from a .docx or .pdf file based on the file extension.
"""
if file_path.endswith(".docx"):
return self.read_docx(file_path)
elif file_path.endswith(".pdf"):
return self.read_pdf(file_path)
else:
raise ValueError(f"Unsupported file type: {file_path}")
def load_directory(self, dir_path):
"""
Iterates through the directory, loads all .docx and .pdf files, and returns their content.
"""
file_contents = {}
for root, _, files in os.walk(dir_path):
for file in files:
file_path = os.path.join(root, file)
if file.endswith((".docx", ".pdf")):
try:
content = self.load_file(file_path)
file_contents[file_path] = content
except Exception as e:
print(f"Failed to load {file_path}: {str(e)}")
return file_contents