rag / pdf_processor.py
user
modifications for remote development using Hugging Face resources
b5553ae
raw
history blame
972 Bytes
import fitz # PyMuPDF
import numpy as np
from transformers import AutoTokenizer, AutoModel
import faiss
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        file_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        One string made of ``page.get_text()`` for each page, in iteration
        order, with no separator inserted between pages.
    """
    with fitz.open(file_path) as doc:
        # A single join avoids re-copying the accumulated text once per page,
        # which is what repeated ``text += ...`` does.
        return "".join(page.get_text() for page in doc)
def process_pdf(pdf_text, chunk_size=512,
                model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Split text into fixed-size chunks, embed each one, and index them.

    Each chunk is tokenized and run through the transformer on its own; its
    embedding is the mean of the last hidden state over the token axis.

    Args:
        pdf_text: The text to index.
        chunk_size: Chunk length in characters (not tokens). Defaults to 512.
        model_name: Identifier handed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.

    Returns:
        A ``(chunks, index)`` tuple: ``chunks`` is the list of text slices and
        ``index`` is a ``faiss.IndexFlatL2`` holding one float32 vector per
        chunk, in the same order. For empty ``pdf_text`` the list is empty and
        the index contains no vectors.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    # Imported here because the module's import block does not bring in torch;
    # it is needed only for the no_grad() context below.
    import torch

    chunks = [
        pdf_text[start:start + chunk_size]
        for start in range(0, len(pdf_text), chunk_size)
    ]

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    vectors = []
    # Without no_grad() the outputs are attached to the autograd graph and
    # Tensor.numpy() raises RuntimeError on tensors that require grad.
    with torch.no_grad():
        for chunk in chunks:
            inputs = tokenizer(
                chunk,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )
            outputs = model(**inputs)
            # One sequence per call, so the batch axis has length 1: average
            # over tokens (dim=1) and take that single row.
            pooled = outputs.last_hidden_state.mean(dim=1)
            vectors.append(pooled[0].numpy())

    if vectors:
        embeddings = np.stack(vectors).astype("float32")
    else:
        # No chunks to infer the width from; fall back to the model config.
        # NOTE(review): assumes the config exposes ``hidden_size`` -- confirm
        # for any non-default model_name.
        embeddings = np.empty((0, model.config.hidden_size), dtype="float32")

    index = faiss.IndexFlatL2(embeddings.shape[1])
    if len(embeddings):
        index.add(embeddings)
    return chunks, index