Commit 176bc9a: upload to hf

Files changed:
- README.md +8 -0
- Untitled-2 +23 -0
- app.py +37 -0
- data/example.pdf +0 -0
- index.html +61 -0
- model_loader.py +9 -0
- pdf_processor.py +23 -0
- project_ouline.md +28 -0
- requirements.txt +6 -0
README.md
ADDED
@@ -0,0 +1,8 @@
---
title: Rag PDF Chatbot
emoji: 🤖
colorFrom: blue
colorTo: red
sdk: static
app_port: 7860
---
Untitled-2
ADDED
@@ -0,0 +1,23 @@
(contents identical to pdf_processor.py, reproduced below)
app.py
ADDED
@@ -0,0 +1,37 @@
from flask import Flask, request, jsonify, send_from_directory
from pdf_processor import extract_text_from_pdf, process_pdf
from model_loader import load_model, load_generator
import torch
import numpy as np

app = Flask(__name__)

# Load data and models
pdf_text = extract_text_from_pdf("data/example.pdf")
chunks, index = process_pdf(pdf_text)
tokenizer, model = load_model()
generator = load_generator()

@app.route('/')
def home():
    return send_from_directory('.', 'index.html')

@app.route('/ask', methods=['POST'])
def ask():
    prompt = request.json['question']

    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        query_vector = model(**inputs).last_hidden_state.mean(dim=1).squeeze().numpy()

    k = 3
    distances, indices = index.search(query_vector.astype('float32').reshape(1, -1), k)
    context = " ".join([chunks[i] for i in indices[0]])

    input_text = f"Context: {context}\n\nQuestion: {prompt}\n\nAnswer:"
    response = generator(input_text, max_length=150, num_return_sequences=1)[0]['generated_text']

    return jsonify({'response': response})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)
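For local testing, the /ask route can be exercised with a short client script. This is a sketch, not part of the commit; it assumes the server is already running on localhost:7860, and it uses the requests package, which is not in requirements.txt:

# Hypothetical smoke test for the /ask endpoint (requests is an extra dev dependency).
import requests

resp = requests.post(
    "http://localhost:7860/ask",
    json={"question": "What is this document about?"},
    timeout=120,  # generation on CPU can be slow
)
print(resp.json()["response"])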
data/example.pdf
ADDED
Binary file (64.9 kB)
index.html
ADDED
@@ -0,0 +1,61 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>PDF RAG Chatbot</title>
    <style>
        body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
        #chat-container { border: 1px solid #ddd; height: 400px; overflow-y: scroll; padding: 10px; margin-bottom: 10px; }
        #user-input { width: 70%; padding: 5px; }
        #send-button { padding: 5px 10px; }
    </style>
</head>
<body>
    <h1>PDF RAG Chatbot</h1>
    <div id="chat-container"></div>
    <input type="text" id="user-input" placeholder="Ask a question...">
    <button id="send-button">Send</button>

    <script>
        const chatContainer = document.getElementById('chat-container');
        const userInput = document.getElementById('user-input');
        const sendButton = document.getElementById('send-button');

        function addMessage(role, content) {
            const messageDiv = document.createElement('div');
            messageDiv.innerHTML = `<strong>${role}:</strong> ${content}`;
            chatContainer.appendChild(messageDiv);
            chatContainer.scrollTop = chatContainer.scrollHeight;
        }

        async function sendMessage() {
            const question = userInput.value.trim();
            if (question) {
                addMessage('User', question);
                userInput.value = '';

                try {
                    const response = await fetch('/ask', {
                        method: 'POST',
                        headers: {
                            'Content-Type': 'application/json',
                        },
                        body: JSON.stringify({ question }),
                    });
                    const data = await response.json();
                    addMessage('Bot', data.response);
                } catch (error) {
                    console.error('Error:', error);
                    addMessage('Bot', 'Sorry, there was an error processing your request.');
                }
            }
        }

        sendButton.addEventListener('click', sendMessage);
        userInput.addEventListener('keypress', (e) => {
            if (e.key === 'Enter') sendMessage();
        });
    </script>
</body>
</html>
model_loader.py
ADDED
@@ -0,0 +1,9 @@
from transformers import AutoTokenizer, AutoModel, pipeline

def load_model():
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    return tokenizer, model

def load_generator():
    return pipeline('text2text-generation', model='google/flan-t5-base')
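Note that load_model() is called once at startup in app.py and again inside process_pdf(), so the MiniLM weights are loaded twice. A minimal sketch of one fix, assuming a memoized wrapper (load_model_cached is a hypothetical name, not in the commit):

# Sketch: memoize the loader so app.py and process_pdf() share one
# tokenizer/model pair instead of loading the weights twice.
from functools import lru_cache
from model_loader import load_model

@lru_cache(maxsize=1)
def load_model_cached():
    return load_model()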
pdf_processor.py
ADDED
@@ -0,0 +1,23 @@
import fitz
import faiss
import numpy as np
import torch
from model_loader import load_model

def extract_text_from_pdf(file_path):
    with fitz.open(file_path) as doc:
        return " ".join(page.get_text() for page in doc)

def process_pdf(pdf_text):
    chunks = [pdf_text[i:i+512] for i in range(0, len(pdf_text), 512)]
    tokenizer, model = load_model()
    embeddings = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    embeddings = np.array(embeddings)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings.astype('float32'))
    return chunks, index
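process_pdf() mean-pools last_hidden_state over every token position. With one chunk per call the tokenizer adds no padding, so this is harmless here, but if the chunks are ever batched the average would include pad tokens; the usual recipe for sentence-transformers models weights the mean by the attention mask. A sketch of that variant (masked_mean_pool is a hypothetical helper, not what the commit uses):

# Sketch: attention-mask-aware mean pooling, relevant if chunks are batched.
import torch

def masked_mean_pool(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)  # (B, T, 1)
    summed = (last_hidden_state * mask).sum(dim=1)                  # (B, H)
    counts = mask.sum(dim=1).clamp(min=1e-9)                        # (B, 1)
    return summed / counts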
project_ouline.md
ADDED
@@ -0,0 +1,28 @@
Goal: Create a PDF RAG chatbot that runs on Hugging Face Spaces.

Key points:

- Use Hugging Face Spaces to host the chatbot.
- Use static HTML for the frontend on the Space.
- Use Flask locally to test out features.

References:
@huggingface.co/docs
@huggingface.co/docs/hub/spaces-sdks-python
@hugging face static docs

- Create a PDF-based RAG (Retrieval-Augmented Generation) chatbot with a PDF file as the knowledge base.
- Implement character-based interactions: the chatbot embodies a persona based on the PDF content, and users interact with it as though it were a living version of the data.
- Deploy the chatbot on Hugging Face Spaces using a static HTML frontend and a Flask backend.
- Develop a local Flask setup for testing and development.
- Implement efficient PDF processing, including text extraction and chunking.
- Use Hugging Face models for text embedding and generation.
- Create a user-friendly web interface for interacting with the chatbot.
- Ensure the chatbot provides contextually relevant responses based on the PDF content.
- Focus on PDF functionality for now (VTT and JSON are stretch goals).
- Store the PDF file in a data/ folder within the project structure.
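The outline calls for the chatbot to embody a persona drawn from the PDF, which app.py does not yet implement. A minimal sketch, assuming a hypothetical PERSONA string and the existing flan-t5 generator; context and prompt are the same variables ask() already builds:

# Sketch: persona-flavored prompt for ask(); PERSONA and build_prompt are
# hypothetical, not part of the commit.
PERSONA = "You are the narrator of this document; answer in the first person."

def build_prompt(context: str, prompt: str) -> str:
    return f"{PERSONA}\n\nContext: {context}\n\nQuestion: {prompt}\n\nAnswer:"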
requirements.txt
ADDED
@@ -0,0 +1,6 @@
flask
transformers
torch
PyMuPDF
faiss-cpu
numpy
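Two entries install under a different name than they import: PyMuPDF provides the fitz module and faiss-cpu provides faiss. A quick sanity check after pip install -r requirements.txt:

# Verify every requirement resolves; note the renamed imports.
import fitz        # installed by PyMuPDF
import faiss       # installed by faiss-cpu
import flask, numpy, torch, transformers
print("all requirements import cleanly")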