Commit 176bc9a: upload to hf

Files changed:
- README.md +8 -0
- Untitled-2 +23 -0
- app.py +37 -0
- data/example.pdf +0 -0
- index.html +61 -0
- model_loader.py +9 -0
- pdf_processor.py +23 -0
- project_ouline.md +28 -0
- requirements.txt +6 -0
README.md
ADDED
@@ -0,0 +1,8 @@
---
title: Rag PDF Chatbot
emoji: 🤖
colorFrom: blue
colorTo: red
sdk: static
app_port: 7860
---
Untitled-2
ADDED
@@ -0,0 +1,23 @@
(contents identical to pdf_processor.py, reproduced below)
app.py
ADDED
@@ -0,0 +1,37 @@
from flask import Flask, request, jsonify, send_from_directory
from pdf_processor import extract_text_from_pdf, process_pdf
from model_loader import load_model, load_generator
import torch
import numpy as np

app = Flask(__name__)

# Load data and models
pdf_text = extract_text_from_pdf("data/example.pdf")
chunks, index = process_pdf(pdf_text)
tokenizer, model = load_model()
generator = load_generator()

@app.route('/')
def home():
    return send_from_directory('.', 'index.html')

@app.route('/ask', methods=['POST'])
def ask():
    prompt = request.json['question']

    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        query_vector = model(**inputs).last_hidden_state.mean(dim=1).squeeze().numpy()

    k = 3
    distances, indices = index.search(query_vector.astype('float32').reshape(1, -1), k)
    context = " ".join([chunks[i] for i in indices[0]])

    input_text = f"Context: {context}\n\nQuestion: {prompt}\n\nAnswer:"
    response = generator(input_text, max_length=150, num_return_sequences=1)[0]['generated_text']

    return jsonify({'response': response})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)
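For local testing, the /ask route can be exercised with a short client script. This is a sketch, not part of the commit; it assumes the server is already running on localhost:7860, and it uses the requests package, which is not in requirements.txt:

# Hypothetical smoke test for the /ask endpoint (requests is an extra dev dependency).
import requests

resp = requests.post(
    "http://localhost:7860/ask",
    json={"question": "What is this document about?"},
    timeout=120,  # generation on CPU can be slow
)
print(resp.json()["response"])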
data/example.pdf
ADDED
Binary file (64.9 kB)
index.html
ADDED
@@ -0,0 +1,61 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>PDF RAG Chatbot</title>
    <style>
        body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
        #chat-container { border: 1px solid #ddd; height: 400px; overflow-y: scroll; padding: 10px; margin-bottom: 10px; }
        #user-input { width: 70%; padding: 5px; }
        #send-button { padding: 5px 10px; }
    </style>
</head>
<body>
    <h1>PDF RAG Chatbot</h1>
    <div id="chat-container"></div>
    <input type="text" id="user-input" placeholder="Ask a question...">
    <button id="send-button">Send</button>

    <script>
        const chatContainer = document.getElementById('chat-container');
        const userInput = document.getElementById('user-input');
        const sendButton = document.getElementById('send-button');

        function addMessage(role, content) {
            const messageDiv = document.createElement('div');
            messageDiv.innerHTML = `<strong>${role}:</strong> ${content}`;
            chatContainer.appendChild(messageDiv);
            chatContainer.scrollTop = chatContainer.scrollHeight;
        }

        async function sendMessage() {
            const question = userInput.value.trim();
            if (question) {
                addMessage('User', question);
                userInput.value = '';

                try {
                    const response = await fetch('/ask', {
                        method: 'POST',
                        headers: {
                            'Content-Type': 'application/json',
                        },
                        body: JSON.stringify({ question }),
                    });
                    const data = await response.json();
                    addMessage('Bot', data.response);
                } catch (error) {
                    console.error('Error:', error);
                    addMessage('Bot', 'Sorry, there was an error processing your request.');
                }
            }
        }

        sendButton.addEventListener('click', sendMessage);
        userInput.addEventListener('keypress', (e) => {
            if (e.key === 'Enter') sendMessage();
        });
    </script>
</body>
</html>
model_loader.py
ADDED
@@ -0,0 +1,9 @@
from transformers import AutoTokenizer, AutoModel, pipeline

def load_model():
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    return tokenizer, model

def load_generator():
    return pipeline('text2text-generation', model='google/flan-t5-base')
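Note that load_model() is called once at startup in app.py and again inside process_pdf(), so the MiniLM weights are loaded twice. A minimal sketch of one fix, assuming a memoized wrapper (load_model_cached is a hypothetical name, not in the commit):

# Sketch: memoize the loader so app.py and process_pdf() share one
# tokenizer/model pair instead of loading the weights twice.
from functools import lru_cache
from model_loader import load_model

@lru_cache(maxsize=1)
def load_model_cached():
    return load_model()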
pdf_processor.py
ADDED
@@ -0,0 +1,23 @@
import fitz
import faiss
import numpy as np
import torch
from model_loader import load_model

def extract_text_from_pdf(file_path):
    with fitz.open(file_path) as doc:
        return " ".join(page.get_text() for page in doc)

def process_pdf(pdf_text):
    chunks = [pdf_text[i:i+512] for i in range(0, len(pdf_text), 512)]
    tokenizer, model = load_model()
    embeddings = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    embeddings = np.array(embeddings)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings.astype('float32'))
    return chunks, index
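process_pdf() mean-pools last_hidden_state over every token position. With one chunk per call the tokenizer adds no padding, so this is harmless here, but if the chunks are ever batched the average would include pad tokens; the usual recipe for sentence-transformers models weights the mean by the attention mask. A sketch of that variant (masked_mean_pool is a hypothetical helper, not what the commit uses):

# Sketch: attention-mask-aware mean pooling, relevant if chunks are batched.
import torch

def masked_mean_pool(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)  # (B, T, 1)
    summed = (last_hidden_state * mask).sum(dim=1)                  # (B, H)
    counts = mask.sum(dim=1).clamp(min=1e-9)                        # (B, 1)
    return summed / counts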
project_ouline.md
ADDED
@@ -0,0 +1,28 @@
Goal: Create a PDF RAG chatbot that runs on Hugging Face Spaces.

Key points:

- Use Hugging Face Spaces to host the chatbot.
- Use static HTML for the frontend on the Space.
- Use Flask locally to test out features.

References:
@huggingface.co/docs
@huggingface.co/docs/hub/spaces-sdks-python
@hugging face static docs

- Create a PDF-based RAG (Retrieval-Augmented Generation) chatbot with a PDF file as the knowledge base.
- Implement character-based interactions: the chatbot embodies a persona based on the PDF content, and users interact with it as though it were a living version of the data.
- Deploy the chatbot on Hugging Face Spaces using a static HTML frontend and a Flask backend.
- Develop a local Flask setup for testing and development.
- Implement efficient PDF processing, including text extraction and chunking.
- Use Hugging Face models for text embedding and generation.
- Create a user-friendly web interface for interacting with the chatbot.
- Ensure the chatbot provides contextually relevant responses based on the PDF content.
- Focus on PDF functionality for now (VTT and JSON are stretch goals).
- Store the PDF file in a data/ folder within the project structure.
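The outline calls for the chatbot to embody a persona drawn from the PDF, which app.py does not yet implement. A minimal sketch, assuming a hypothetical PERSONA string and the existing flan-t5 generator; context and prompt are the same variables ask() already builds:

# Sketch: persona-flavored prompt for ask(); PERSONA and build_prompt are
# hypothetical, not part of the commit.
PERSONA = "You are the narrator of this document; answer in the first person."

def build_prompt(context: str, prompt: str) -> str:
    return f"{PERSONA}\n\nContext: {context}\n\nQuestion: {prompt}\n\nAnswer:"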
requirements.txt
ADDED
@@ -0,0 +1,6 @@
flask
transformers
torch
PyMuPDF
faiss-cpu
numpy
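Two entries install under a different name than they import: PyMuPDF provides the fitz module and faiss-cpu provides faiss. A quick sanity check after pip install -r requirements.txt:

# Verify every requirement resolves; note the renamed imports.
import fitz        # installed by PyMuPDF
import faiss       # installed by faiss-cpu
import flask, numpy, torch, transformers
print("all requirements import cleanly")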