Spaces:

SalehAhmad
/

Wagner

Runtime error

Wagner / data_loader.py

Upload 6 files

10d250d verified 4 months ago

1.74 kB

	import os
	from docx import Document
	import PyPDF2

	class ChatbotDataLoader:
	    """Extract plain text from .docx and .pdf files, one at a time or per directory."""

	    # Lower-case extensions that load_file / load_directory know how to read.
	    _SUPPORTED_EXTENSIONS = (".docx", ".pdf")

	    def __init__(self):
	        """Create a loader; the loader keeps no state."""
	        pass

	    def read_docx(self, file_path) -> str:
	        """
	        Reads content from a .docx file.

	        Args:
	            file_path: Path of the .docx file, passed straight to ``Document``.

	        Returns:
	            The text of every paragraph in the document, joined with newlines.
	        """
	        doc = Document(file_path)
	        return "\n".join(para.text for para in doc.paragraphs)

	    def read_pdf(self, file_path) -> str:
	        """
	        Reads content from a .pdf file.

	        Args:
	            file_path: Path of the .pdf file; it is opened in binary mode.

	        Returns:
	            The extracted text of all pages concatenated in page order, with
	            no separator inserted between pages.
	        """
	        with open(file_path, "rb") as pdf_file:
	            reader = PyPDF2.PdfReader(pdf_file)
	            # Extraction has to happen while the file is still open.
	            # ``or ""`` guards against extract_text() yielding None for a
	            # page without a text layer, which would otherwise break the
	            # concatenation with a TypeError.
	            return "".join(page.extract_text() or "" for page in reader.pages)

	    def load_file(self, file_path) -> str:
	        """
	        Reads content from a .docx or .pdf file based on the file extension.

	        The extension is matched case-insensitively, so ``REPORT.PDF`` is
	        handled the same way as ``report.pdf``.

	        Args:
	            file_path: Path of the file to read (``str`` or ``os.PathLike``).

	        Returns:
	            The text extracted by ``read_docx`` or ``read_pdf``.

	        Raises:
	            ValueError: If the path ends in neither ``.docx`` nor ``.pdf``.
	        """
	        path_text = os.fspath(file_path)
	        lowered = path_text.lower()
	        if lowered.endswith(".docx"):
	            return self.read_docx(path_text)
	        if lowered.endswith(".pdf"):
	            return self.read_pdf(path_text)
	        raise ValueError(f"Unsupported file type: {file_path}")

	    def load_directory(self, dir_path) -> dict:
	        """
	        Iterates through the directory, loads all .docx and .pdf files, and returns their content.

	        The directory is walked recursively with ``os.walk``. Loading is
	        best-effort: a file that fails to load is reported on stdout and
	        left out of the result instead of aborting the whole walk.

	        Args:
	            dir_path: Root directory to scan.

	        Returns:
	            A dict mapping each successfully loaded file's path (``root``
	            joined with the file name) to its extracted text.
	        """
	        file_contents = {}
	        for root, _, files in os.walk(dir_path):
	            for name in files:
	                # Case-insensitive match so upper-case extensions are not skipped.
	                if not name.lower().endswith(self._SUPPORTED_EXTENSIONS):
	                    continue
	                file_path = os.path.join(root, name)
	                try:
	                    file_contents[file_path] = self.load_file(file_path)
	                except Exception as exc:
	                    # Deliberately broad: one unreadable or corrupt document
	                    # must not prevent the remaining files from loading.
	                    print(f"Failed to load {file_path}: {exc}")
	        return file_contents