Spaces:

bardicreels
/

rag

Running

rag / pdf_processor.py

user

modifications for remote development using Hugging Face resources

b5553ae 5 months ago

972 Bytes

	import fitz # PyMuPDF
	import numpy as np
	from transformers import AutoTokenizer, AutoModel
	import faiss

	def extract_text_from_pdf(file_path):
	    """Return the concatenated text of every page in the PDF at *file_path*.

	    The document is opened with PyMuPDF (``fitz.open``) as a context
	    manager, ``get_text()`` is called on each page in iteration order, and
	    the results are joined with no separator added between pages.
	    """
	    with fitz.open(file_path) as document:
	        page_texts = [page.get_text() for page in document]
	    return "".join(page_texts)

	def process_pdf(pdf_text):
	    """Chunk *pdf_text*, embed every chunk, and build a FAISS L2 index.

	    The text is cut into consecutive, non-overlapping slices of 512
	    characters. Each slice is tokenized (truncated to at most 512 tokens)
	    and passed through the ``sentence-transformers/all-MiniLM-L6-v2`` model;
	    the chunk embedding is the mean of ``last_hidden_state`` over dim 1
	    (presumably the token axis -- confirm against the model docs).

	    Args:
	        pdf_text: The full text to index.

	    Returns:
	        A ``(chunks, index)`` tuple: the list of text chunks and a
	        ``faiss.IndexFlatL2`` holding one vector per chunk, added in the
	        same order as ``chunks``. For empty ``pdf_text`` the list is empty
	        and the index contains no vectors.
	    """
	    # Imported locally so the fix does not depend on the file's import block.
	    import torch

	    model_name = "sentence-transformers/all-MiniLM-L6-v2"
	    chunk_size = 512  # characters per chunk

	    chunks = [
	        pdf_text[start:start + chunk_size]
	        for start in range(0, len(pdf_text), chunk_size)
	    ]

	    tokenizer = AutoTokenizer.from_pretrained(model_name)
	    model = AutoModel.from_pretrained(model_name)

	    if not chunks:
	        # Nothing to embed: np.array([]) would be 1-D and shape[1] would
	        # raise IndexError, so size the empty index from the model config.
	        return chunks, faiss.IndexFlatL2(model.config.hidden_size)

	    embeddings = []
	    # Without no_grad the output tensor requires grad and .numpy() raises
	    # RuntimeError; disabling autograd also avoids building a graph.
	    with torch.no_grad():
	        for chunk in chunks:
	            inputs = tokenizer(
	                chunk,
	                padding=True,
	                truncation=True,
	                max_length=512,
	                return_tensors="pt",
	            )
	            outputs = model(**inputs)
	            # One sequence per call, so drop the leading size-1 dim.
	            pooled = outputs.last_hidden_state.mean(dim=1).squeeze(0)
	            embeddings.append(pooled.numpy())

	    # The original cast to float32 before index.add; build it that way directly.
	    matrix = np.asarray(embeddings, dtype="float32")
	    index = faiss.IndexFlatL2(matrix.shape[1])
	    index.add(matrix)
	    return chunks, index