rag / pdf_processor.py
user
upload to hf
176bc9a
raw
history blame
818 Bytes
import fitz
import faiss
import numpy as np
import torch
from model_loader import load_model
def extract_text_from_pdf(file_path):
    """Return the text of every page in the PDF at *file_path*, joined by spaces."""
    doc = fitz.open(file_path)
    try:
        pages = []
        for page in doc:
            pages.append(page.get_text())
        return " ".join(pages)
    finally:
        # fitz.Document is a context manager in the original; closing
        # explicitly here is the equivalent cleanup.
        doc.close()
def process_pdf(pdf_text, chunk_size=512):
    """Split *pdf_text* into fixed-size chunks, embed each, and build a FAISS index.

    Args:
        pdf_text: Raw text extracted from a PDF.
        chunk_size: Number of characters per chunk (default 512, matching the
            tokenizer's ``max_length`` so most chunks fit without truncation).

    Returns:
        A ``(chunks, index)`` tuple: the list of text chunks and a
        ``faiss.IndexFlatL2`` over their mean-pooled embeddings, in the same
        order (so a FAISS hit at position ``i`` maps to ``chunks[i]``).

    Raises:
        ValueError: If *pdf_text* is empty — previously this surfaced as an
            opaque ``IndexError`` on ``embeddings.shape[1]``.
    """
    if not pdf_text:
        raise ValueError("pdf_text is empty; nothing to index")

    chunks = [pdf_text[i:i + chunk_size] for i in range(0, len(pdf_text), chunk_size)]
    tokenizer, model = load_model()

    embeddings = []
    # Hoist no_grad around the whole loop instead of re-entering it per chunk.
    with torch.no_grad():
        for chunk in chunks:
            inputs = tokenizer(chunk, return_tensors="pt", padding=True,
                               truncation=True, max_length=512)
            outputs = model(**inputs)
            # Mean-pool over the sequence dimension; squeeze only the batch
            # axis, and move to CPU before .numpy() in case the model runs on
            # an accelerator.
            vec = outputs.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
            embeddings.append(vec)

    # FAISS requires float32; convert once instead of per-add.
    matrix = np.asarray(embeddings, dtype="float32")
    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)
    return chunks, index