Spaces:
Running
Running
user
committed on
Commit
·
1591490
1
Parent(s):
176bc9a
updates
Browse files
- README.md +1 -2
- Untitled-2 +0 -23
README.md
CHANGED
@@ -1,8 +1,7 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
emoji: 🤖
|
4 |
colorFrom: blue
|
5 |
colorTo: red
|
6 |
sdk: static
|
7 |
-
app_port: 7860
|
8 |
---
|
|
|
1 |
---
|
2 |
+
title: rag
|
3 |
emoji: 🤖
|
4 |
colorFrom: blue
|
5 |
colorTo: red
|
6 |
sdk: static
|
|
|
7 |
---
|
Untitled-2
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
import fitz
|
2 |
-
import faiss
|
3 |
-
import numpy as np
|
4 |
-
import torch
|
5 |
-
from model_loader import load_model
|
6 |
-
|
7 |
-
def extract_text_from_pdf(file_path):
|
8 |
-
with fitz.open(file_path) as doc:
|
9 |
-
return " ".join(page.get_text() for page in doc)
|
10 |
-
|
11 |
-
def process_pdf(pdf_text):
|
12 |
-
chunks = [pdf_text[i:i+512] for i in range(0, len(pdf_text), 512)]
|
13 |
-
tokenizer, model = load_model()
|
14 |
-
embeddings = []
|
15 |
-
for chunk in chunks:
|
16 |
-
inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
|
17 |
-
with torch.no_grad():
|
18 |
-
outputs = model(**inputs)
|
19 |
-
embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
|
20 |
-
embeddings = np.array(embeddings)
|
21 |
-
index = faiss.IndexFlatL2(embeddings.shape[1])
|
22 |
-
index.add(embeddings.astype('float32'))
|
23 |
-
return chunks, index
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|