Spaces:

bardicreels
/

rag

Running

rag / pdf_processor.py

user

upload to hf

176bc9a 4 months ago

818 Bytes

	import fitz
	import faiss
	import numpy as np
	import torch
	from model_loader import load_model

	def extract_text_from_pdf(file_path):
	    """Return the text of every page in the PDF at *file_path*, space-joined.

	    The document is opened with ``fitz.open`` and released when the ``with``
	    block exits. Each page contributes the result of ``page.get_text()``,
	    and the per-page strings are joined with a single space.
	    """
	    with fitz.open(file_path) as pdf:
	        # Collect the text while the document is still open.
	        page_texts = [page.get_text() for page in pdf]
	    return " ".join(page_texts)

	def process_pdf(pdf_text, chunk_size=512):
	    """Split text into fixed-width chunks, embed each chunk, and index them.

	    Every chunk is tokenized and run through the model returned by
	    ``load_model()``. A chunk's embedding is the mean of the model's
	    ``last_hidden_state`` over the sequence axis. All embeddings are added,
	    in chunk order, to a ``faiss.IndexFlatL2``.

	    Args:
	        pdf_text: The text to chunk and index.
	        chunk_size: Number of characters per chunk; the final chunk may be
	            shorter. Defaults to 512. Tokenization still truncates each
	            chunk to 512 tokens, so much larger chunks may lose their tail.

	    Returns:
	        A ``(chunks, index)`` tuple: the list of chunk strings and the FAISS
	        index holding one float32 vector per chunk, added in the same order.

	    Raises:
	        ValueError: If ``chunk_size`` is not positive, or if ``pdf_text`` is
	            empty (previously this surfaced as an opaque ``IndexError``
	            after the model had already been loaded).
	    """
	    if chunk_size <= 0:
	        raise ValueError(
	            f"chunk_size must be a positive integer, got {chunk_size!r}"
	        )

	    chunks = [
	        pdf_text[start:start + chunk_size]
	        for start in range(0, len(pdf_text), chunk_size)
	    ]
	    # Fail fast, before loading the model: with no chunks there is no
	    # embedding to take the index dimensionality from.
	    if not chunks:
	        raise ValueError("pdf_text is empty; there is nothing to embed or index")

	    tokenizer, model = load_model()
	    embeddings = []
	    with torch.no_grad():  # inference only, no autograd bookkeeping needed
	        for chunk in chunks:
	            inputs = tokenizer(
	                chunk,
	                return_tensors="pt",
	                padding=True,
	                truncation=True,
	                max_length=512,
	            )
	            outputs = model(**inputs)
	            # Mean-pool over the sequence axis, then drop only the batch
	            # axis (one chunk per call) so the result is a 1-D vector.
	            pooled = outputs.last_hidden_state.mean(dim=1).squeeze(0)
	            embeddings.append(pooled.numpy())

	    # The index is fed a single 2-D float32 matrix, one row per chunk.
	    matrix = np.asarray(embeddings, dtype="float32")
	    index = faiss.IndexFlatL2(matrix.shape[1])
	    index.add(matrix)
	    return chunks, index