Spaces:
Runtime error
Runtime error
import os | |
from docx import Document | |
import PyPDF2 | |
class ChatbotDataLoader: | |
def __init__(self): | |
pass | |
def read_docx(self, file_path): | |
""" | |
Reads content from a .docx file. | |
""" | |
doc = Document(file_path) | |
content = "\n".join([para.text for para in doc.paragraphs]) | |
return content | |
def read_pdf(self, file_path): | |
""" | |
Reads content from a .pdf file. | |
""" | |
with open(file_path, "rb") as file: | |
reader = PyPDF2.PdfReader(file) | |
content = "" | |
for page in range(len(reader.pages)): | |
content += reader.pages[page].extract_text() | |
return content | |
def load_file(self, file_path): | |
""" | |
Reads content from a .docx or .pdf file based on the file extension. | |
""" | |
if file_path.endswith(".docx"): | |
return self.read_docx(file_path) | |
elif file_path.endswith(".pdf"): | |
return self.read_pdf(file_path) | |
else: | |
raise ValueError(f"Unsupported file type: {file_path}") | |
def load_directory(self, dir_path): | |
""" | |
Iterates through the directory, loads all .docx and .pdf files, and returns their content. | |
""" | |
file_contents = {} | |
for root, _, files in os.walk(dir_path): | |
for file in files: | |
file_path = os.path.join(root, file) | |
if file.endswith((".docx", ".pdf")): | |
try: | |
content = self.load_file(file_path) | |
file_contents[file_path] = content | |
except Exception as e: | |
print(f"Failed to load {file_path}: {str(e)}") | |
return file_contents | |