Spaces:
Running
Running
import fitz # PyMuPDF | |
import numpy as np | |
from transformers import AutoTokenizer, AutoModel | |
import faiss | |
def extract_text_from_pdf(file_path):
    """Return the text of every page in the document at *file_path*.

    The document is opened with PyMuPDF (``fitz``) and closed again as soon
    as all pages have been read. Page texts are concatenated in page order
    with no separator inserted between them.
    """
    with fitz.open(file_path) as pdf:
        # Collect each page's text first, then join once at the end.
        page_texts = [pg.get_text() for pg in pdf]
    return "".join(page_texts)
def process_pdf(pdf_text, chunk_size=512,
                model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Split *pdf_text* into chunks, embed each chunk, and index the embeddings.

    The text is cut into consecutive, non-overlapping slices of
    ``chunk_size`` characters. Each slice is tokenized and passed through the
    transformer named by ``model_name``; the mean of the last hidden state
    over the token axis is used as the chunk embedding. All embeddings are
    added to a FAISS ``IndexFlatL2`` in chunk order, so the position of a
    vector in the index equals the position of its chunk in the returned list.

    Args:
        pdf_text: The full text to index.
        chunk_size: Number of characters per chunk. Must be positive.
        model_name: Hugging Face model identifier used for both the
            tokenizer and the encoder.

    Returns:
        A ``(chunks, index)`` tuple: the list of text chunks and the FAISS
        index holding one float32 vector per chunk. For empty input the list
        is empty and the index contains no vectors.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Imported locally so this function does not depend on a module-level
    # torch import; torch is already required by return_tensors="pt".
    import torch

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = [
        pdf_text[start:start + chunk_size]
        for start in range(0, len(pdf_text), chunk_size)
    ]

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    if not chunks:
        # No vectors to infer the dimension from; take it from the model
        # config so callers still receive a usable (empty) index.
        return chunks, faiss.IndexFlatL2(model.config.hidden_size)

    embeddings = []
    # Inference only: without no_grad() the output tensor requires grad and
    # Tensor.numpy() raises RuntimeError.
    with torch.no_grad():
        for chunk in chunks:
            inputs = tokenizer(
                chunk,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )
            outputs = model(**inputs)
            # Mean over the token axis gives (1, hidden); drop only the batch
            # axis so the result is always a 1-D vector.
            pooled = outputs.last_hidden_state.mean(dim=1).squeeze(0)
            embeddings.append(pooled.numpy())

    # FAISS expects a 2-D float32 matrix of shape (n_vectors, dim).
    matrix = np.stack(embeddings).astype("float32")
    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)
    return chunks, index