import fitz  # PyMuPDF
import faiss
import numpy as np
import torch

from model_loader import load_model


def extract_text_from_pdf(file_path):
    """Extract the text of every page in a PDF and join it into a single string."""
    with fitz.open(file_path) as doc:
        return " ".join(page.get_text() for page in doc)


def process_pdf(pdf_text):
    """Split the text into fixed-size chunks, embed each chunk, and build a FAISS index."""
    # Split the raw text into 512-character chunks.
    chunks = [pdf_text[i:i + 512] for i in range(0, len(pdf_text), 512)]

    tokenizer, model = load_model()

    # Embed each chunk as the mean of the model's last hidden states.
    embeddings = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True,
                           truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())

    embeddings = np.array(embeddings)

    # Build an exact L2 index over the chunk embeddings.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings.astype("float32"))
    return chunks, index
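

# Illustrative usage sketch, not part of the original module: it assumes
# model_loader.load_model() returns a (tokenizer, model) pair as used above,
# and "document.pdf" is a hypothetical placeholder path. The query is embedded
# the same way as the chunks so its vector is comparable in the L2 index.
if __name__ == "__main__":
    text = extract_text_from_pdf("document.pdf")
    chunks, index = process_pdf(text)

    # Embed a query with the same model and retrieve the 3 nearest chunks.
    tokenizer, model = load_model()
    query = "What does the document say about indexing?"
    inputs = tokenizer(query, return_tensors="pt", padding=True,
                       truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Keep the (1, dim) shape: FAISS expects a 2-D float32 array of queries.
    query_vec = outputs.last_hidden_state.mean(dim=1).numpy().astype("float32")

    distances, ids = index.search(query_vec, 3)
    for dist, idx in zip(distances[0], ids[0]):
        print(f"distance={dist:.3f}  chunk={chunks[idx][:80]!r}")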