user committed on
Commit
176bc9a
·
0 Parent(s):

upload to hf

Browse files
Files changed (9) hide show
  1. README.md +8 -0
  2. Untitled-2 +23 -0
  3. app.py +37 -0
  4. data/example.pdf +0 -0
  5. index.html +61 -0
  6. model_loader.py +9 -0
  7. pdf_processor.py +23 -0
  8. project_ouline.md +28 -0
  9. requirements.txt +6 -0
README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Rag PDF Chatbot
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: red
6
+ sdk: static
7
+ app_port: 7860
8
+ ---
Untitled-2 ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz
2
+ import faiss
3
+ import numpy as np
4
+ import torch
5
+ from model_loader import load_model
6
+
7
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path`` as one string.

    The document is opened with ``fitz.open`` inside a ``with`` block, so it
    is closed again before the function returns. Page texts are joined with
    a single space, in iteration order.
    """
    page_texts = []
    with fitz.open(file_path) as pdf:
        for page in pdf:
            page_texts.append(page.get_text())
    return " ".join(page_texts)
10
+
11
def process_pdf(pdf_text, chunk_size=512):
    """Split ``pdf_text`` into chunks, embed each chunk, and index the embeddings.

    Args:
        pdf_text: Full document text to index.
        chunk_size: Number of characters per chunk. Defaults to 512, the
            value that used to be hard-coded.

    Returns:
        A ``(chunks, index)`` tuple: the list of text chunks and a
        ``faiss.IndexFlatL2`` holding one embedding per chunk, added in the
        same order as ``chunks``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``pdf_text`` is
            empty and there is therefore nothing to embed.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = [
        pdf_text[start:start + chunk_size]
        for start in range(0, len(pdf_text), chunk_size)
    ]
    if not chunks:
        # Fail early with a clear message: without this guard the model is
        # loaded first and the call then dies with an opaque IndexError when
        # the embedding dimension is read from an empty array.
        raise ValueError("pdf_text is empty; there is nothing to index")

    tokenizer, model = load_model()
    embeddings = []
    for chunk in chunks:
        inputs = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        # Inference only, so skip gradient tracking.
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean-pool over dim 1 of the last hidden state to get one vector
        # per chunk, then drop the singleton batch dimension.
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())

    # The index is fed float32 vectors; stack the per-chunk vectors into one 2-D array.
    matrix = np.stack(embeddings).astype('float32')
    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)
    return chunks, index
app.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify, send_from_directory
2
+ from pdf_processor import extract_text_from_pdf, process_pdf
3
+ from model_loader import load_model, load_generator
4
+ import torch
5
+ import numpy as np
6
+
7
+ app = Flask(__name__)  # application object that the route decorators below register on
8
+
9
+ # Load data and models once, at import time, so every request reuses them.
10
+ pdf_text = extract_text_from_pdf("data/example.pdf")  # full text of the bundled PDF
11
+ chunks, index = process_pdf(pdf_text)  # text chunks plus the FAISS index built over their embeddings
12
+ tokenizer, model = load_model()  # embedding model for incoming questions; process_pdf() also calls load_model() internally
13
+ generator = load_generator()  # text2text-generation pipeline used to produce the answer
14
+
15
@app.route('/')
def home():
    """Serve the chat page by sending ``index.html`` from the ``'.'`` directory."""
    page = send_from_directory('.', 'index.html')
    return page
18
+
19
@app.route('/ask', methods=['POST'])
def ask():
    """Answer a question using the PDF chunks closest to it in embedding space.

    Expects a JSON request body of the form ``{"question": "<text>"}``. The
    question is embedded with the module-level ``tokenizer``/``model``, the
    nearest chunks are looked up in ``index``, and the module-level
    ``generator`` produces the answer from a prompt built out of those
    chunks and the question.

    Returns:
        A JSON response ``{"response": <generated text>}`` on success, or a
        JSON ``{"error": <message>}`` with HTTP status 400 when the body is
        not JSON or has no non-empty ``question`` string.
    """
    # silent=True returns None instead of raising on a missing or malformed
    # body, so bad requests get a clean 400 rather than a 500.
    payload = request.get_json(silent=True)
    prompt = payload.get('question') if isinstance(payload, dict) else None
    if not isinstance(prompt, str) or not prompt.strip():
        return jsonify({'error': "Request body must be JSON with a non-empty 'question' string."}), 400

    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        query_vector = model(**inputs).last_hidden_state.mean(dim=1).squeeze().numpy()

    k = 3
    _, indices = index.search(query_vector.astype('float32').reshape(1, -1), k)
    # FAISS pads the result with -1 when the index holds fewer than k
    # vectors; chunks[-1] would silently repeat the last chunk, so skip those.
    context = " ".join(chunks[i] for i in indices[0] if i >= 0)

    input_text = f"Context: {context}\n\nQuestion: {prompt}\n\nAnswer:"
    response = generator(input_text, max_length=150, num_return_sequences=1)[0]['generated_text']

    return jsonify({'response': response})
35
+
36
+ if __name__ == '__main__':  # start the server only when run as a script, not on import
37
+ app.run(host='0.0.0.0', port=7860)  # listen on all interfaces; 7860 matches app_port in README.md
data/example.pdf ADDED
Binary file (64.9 kB). View file
 
index.html ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>PDF RAG Chatbot</title>
7
+ <style>
8
+ body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
9
+ #chat-container { border: 1px solid #ddd; height: 400px; overflow-y: scroll; padding: 10px; margin-bottom: 10px; }
10
+ #user-input { width: 70%; padding: 5px; }
11
+ #send-button { padding: 5px 10px; }
12
+ </style>
13
+ </head>
14
+ <body>
15
+ <h1>PDF RAG Chatbot</h1>
16
+ <div id="chat-container"></div>
17
+ <input type="text" id="user-input" placeholder="Ask a question...">
18
+ <button id="send-button">Send</button>
19
+
20
+ <script>
21
+ const chatContainer = document.getElementById('chat-container'); // scrollable transcript <div>
22
+ const userInput = document.getElementById('user-input'); // question text box
23
+ const sendButton = document.getElementById('send-button'); // "Send" button
24
+
25
/**
 * Append one chat line to the transcript and scroll it into view.
 *
 * @param {string} role    Speaker label shown in bold (e.g. 'User', 'Bot').
 * @param {string} content Message text; rendered as plain text, never as HTML.
 */
function addMessage(role, content) {
    const messageDiv = document.createElement('div');

    const label = document.createElement('strong');
    label.textContent = `${role}:`;

    // Build the line from text nodes instead of innerHTML: both the user's
    // question and the model's answer are untrusted, and interpolating them
    // into markup would let them inject HTML or scripts into the page.
    messageDiv.append(label, ` ${content}`);

    chatContainer.appendChild(messageDiv);
    // Keep the newest message visible.
    chatContainer.scrollTop = chatContainer.scrollHeight;
}
31
+
32
/**
 * Send the current input to the /ask endpoint and show the reply.
 *
 * Does nothing when the input is empty after trimming. On any failure
 * (network error, non-2xx status, unparsable body) the error is logged and
 * a generic apology is shown in the transcript.
 */
async function sendMessage() {
    const question = userInput.value.trim();
    if (!question) {
        return;
    }

    addMessage('User', question);
    userInput.value = '';

    try {
        const response = await fetch('/ask', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({ question }),
        });
        // fetch() only rejects on network failure; an HTTP error status
        // still resolves, so check it explicitly instead of rendering a
        // missing `response` field as "undefined".
        if (!response.ok) {
            throw new Error(`Request failed with status ${response.status}`);
        }
        const data = await response.json();
        addMessage('Bot', data.response);
    } catch (error) {
        console.error('Error:', error);
        addMessage('Bot', 'Sorry, there was an error processing your request.');
    }
}
54
+
55
// Send on button click.
sendButton.addEventListener('click', sendMessage);

// Send on Enter. 'keydown' replaces the deprecated 'keypress' event; the
// isComposing check keeps an Enter that merely confirms an IME composition
// from submitting a half-typed question.
userInput.addEventListener('keydown', (e) => {
    if (e.key === 'Enter' && !e.isComposing) sendMessage();
});
59
+ </script>
60
+ </body>
61
+ </html>
model_loader.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModel, pipeline
2
+
3
def load_model():
    """Load the embedding tokenizer and model.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the
        ``sentence-transformers/all-MiniLM-L6-v2`` checkpoint.
    """
    checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
    return AutoTokenizer.from_pretrained(checkpoint), AutoModel.from_pretrained(checkpoint)
7
+
8
def load_generator():
    """Build the ``text2text-generation`` pipeline backed by ``google/flan-t5-base``."""
    generator = pipeline('text2text-generation', model='google/flan-t5-base')
    return generator
pdf_processor.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz
2
+ import faiss
3
+ import numpy as np
4
+ import torch
5
+ from model_loader import load_model
6
+
7
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path`` as one string.

    The document is opened with ``fitz.open`` inside a ``with`` block, so it
    is closed again before the function returns. Page texts are joined with
    a single space, in iteration order.
    """
    page_texts = []
    with fitz.open(file_path) as pdf:
        for page in pdf:
            page_texts.append(page.get_text())
    return " ".join(page_texts)
10
+
11
def process_pdf(pdf_text, chunk_size=512):
    """Split ``pdf_text`` into chunks, embed each chunk, and index the embeddings.

    Args:
        pdf_text: Full document text to index.
        chunk_size: Number of characters per chunk. Defaults to 512, the
            value that used to be hard-coded.

    Returns:
        A ``(chunks, index)`` tuple: the list of text chunks and a
        ``faiss.IndexFlatL2`` holding one embedding per chunk, added in the
        same order as ``chunks``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``pdf_text`` is
            empty and there is therefore nothing to embed.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = [
        pdf_text[start:start + chunk_size]
        for start in range(0, len(pdf_text), chunk_size)
    ]
    if not chunks:
        # Fail early with a clear message: without this guard the model is
        # loaded first and the call then dies with an opaque IndexError when
        # the embedding dimension is read from an empty array.
        raise ValueError("pdf_text is empty; there is nothing to index")

    tokenizer, model = load_model()
    embeddings = []
    for chunk in chunks:
        inputs = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        # Inference only, so skip gradient tracking.
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean-pool over dim 1 of the last hidden state to get one vector
        # per chunk, then drop the singleton batch dimension.
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())

    # The index is fed float32 vectors; stack the per-chunk vectors into one 2-D array.
    matrix = np.stack(embeddings).astype('float32')
    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)
    return chunks, index
project_ouline.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Goal: Create a PDF RAG chatbot to work on Hugging Face Spaces.
2
+ Key points:
3
+
4
+ Using Hugging Face Spaces to host the chatbot.
5
+ Using HTML on the Space
6
+ Using Flask locally to test out features
7
+
8
+ @huggingface.co/docs
9
+ @huggingface.co/docs/hub/spaces-sdks-python
10
+ @hugging face static docs
11
+
12
+ Create a PDF-based RAG (Retrieval-Augmented Generation) chatbot.
13
+ Implement character-based interactions, where the chatbot embodies a persona based on the PDF content.
14
+ Deploy the chatbot on Hugging Face Spaces using a static HTML frontend and Flask backend.
15
+ Develop a local Flask setup for testing and development purposes.
16
+ Implement efficient PDF processing, including text extraction and chunking.
17
+ Utilize Hugging Face models for text embedding and generation.
18
+ Create a user-friendly web interface for interacting with the chatbot.
19
+ Ensure the chatbot provides contextually relevant responses based on the PDF content.
20
+ Create a RAG (Retrieval-Augmented Generation) chatbot
21
+ Use a PDF file as the knowledge base
22
+ Have the chatbot take on the role of a character
23
+ Users will interact with it as though it were a living version of the data
24
+ Deploy the project on Hugging Face Spaces
25
+ Use static HTML for the frontend on Hugging Face Spaces
26
+ Use Flask locally to test out features
27
+ Focus on PDF functionality for now (VTT and JSON are stretch goals)
28
+ Store the PDF file in a 'data/' folder within the project structure
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ flask
2
+ transformers
3
+ torch
4
+ PyMuPDF
5
+ faiss-cpu
6
+ numpy