Spaces:
Running
Running
import fitz | |
import faiss | |
import numpy as np | |
import torch | |
from model_loader import load_model | |
def extract_text_from_pdf(file_path):
    """Return the text of every page in a PDF as one space-separated string.

    Args:
        file_path: Location of the document, passed straight to ``fitz.open``.

    Returns:
        The ``get_text()`` output of each page, in page order, joined with a
        single space.
    """
    # The document is only needed while reading pages; the join itself can
    # happen after the context manager has closed it.
    with fitz.open(file_path) as document:
        page_texts = [page.get_text() for page in document]
    return " ".join(page_texts)
def process_pdf(pdf_text, chunk_size=512):
    """Split text into fixed-size chunks and build a FAISS index of embeddings.

    The text is cut into consecutive, non-overlapping slices of
    ``chunk_size`` characters. Each slice is passed through the tokenizer and
    model returned by ``load_model()``; the model's ``last_hidden_state`` is
    averaged over dimension 1 to give one vector per chunk, and the vectors
    are added, in chunk order, to a ``faiss.IndexFlatL2`` index.

    Args:
        pdf_text: The text to index. Must be non-empty.
        chunk_size: Number of characters per chunk. Defaults to 512, the
            width this function has always used.

    Returns:
        A ``(chunks, index)`` tuple: the list of text slices and the FAISS
        index holding one float32 vector per slice, added in the same order.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``pdf_text`` is
            empty (there would be no vectors from which to derive the index
            dimension).
    """
    # Validate before loading the model so bad input fails fast and with a
    # clear message instead of an IndexError on ``embeddings.shape[1]``.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")
    if not pdf_text:
        raise ValueError("pdf_text is empty; there is nothing to index")

    chunks = [
        pdf_text[start:start + chunk_size]
        for start in range(0, len(pdf_text), chunk_size)
    ]

    tokenizer, model = load_model()

    vectors = []
    for chunk in chunks:
        # truncation/max_length cap the tokenized chunk at 512 tokens; this
        # limit is independent of the character-based chunk_size.
        inputs = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        with torch.no_grad():
            outputs = model(**inputs)
        # One chunk per forward pass, so the leading axis has length 1;
        # squeeze(0) removes only that axis and leaves the vector intact.
        pooled = outputs.last_hidden_state.mean(dim=1).squeeze(0)
        vectors.append(pooled.numpy())

    embeddings = np.array(vectors).astype('float32')
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return chunks, index