import streamlit as st
import os
from groq import Groq
import fitz # PyMuPDF for PDF parsing
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer # Hugging Face transformer
# Initialize the Hugging Face model and Groq API client
model = SentenceTransformer('all-MiniLM-L6-v2') # Model for generating embeddings
# Read the Groq API key from the environment rather than hardcoding the secret in the source
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=GROQ_API_KEY)
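
# Note: Streamlit re-runs this script on every interaction, which would reload
# the embedding model each time. A minimal sketch of one way to cache it,
# assuming a Streamlit version that provides st.cache_resource:
#
#     @st.cache_resource
#     def load_embedding_model():
#         return SentenceTransformer('all-MiniLM-L6-v2')
#
#     model = load_embedding_model()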
# Function to extract text from a PDF
def extract_text_from_pdf(file):
    doc = fitz.open(stream=file.read(), filetype="pdf")  # Open from the uploaded byte stream
    text = ""
    for page in doc:
        text += page.get_text()
    return text
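
# For page-aware answers (e.g., quoting page numbers), the same loop could
# collect per-page text instead of one long string. A sketch using the same
# PyMuPDF API (extract_pages_from_pdf is a hypothetical helper, not used below):
#
#     def extract_pages_from_pdf(file):
#         doc = fitz.open(stream=file.read(), filetype="pdf")
#         return [page.get_text() for page in doc]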
# Function to generate embeddings using Hugging Face model (for text retrieval)
def generate_huggingface_embeddings(text):
    embeddings = model.encode(text)  # Encode with the SentenceTransformer model
    return embeddings
# Function to get relevant chunks from the document using FAISS similarity search
def get_relevant_chunks(query, top_k=5):
    query_embedding = generate_huggingface_embeddings(query)  # Get query embedding
    query_embedding = np.array(query_embedding).reshape(1, -1).astype("float32")  # FAISS expects float32, shape (1, dim)
    # Perform similarity search in FAISS
    distances, indices = index.search(query_embedding, top_k)
    relevant_chunks = [document_chunks[i] for i in indices[0]]
    return relevant_chunks
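
# The distances returned by index.search could also be used to discard weak
# matches before building the prompt. A sketch (the 1.5 threshold is an
# illustrative value, not tuned):
#
#     relevant_chunks = [document_chunks[i]
#                        for d, i in zip(distances[0], indices[0])
#                        if d < 1.5]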
# Function to generate an answer based on retrieved context and Groq's model
def generate_answer(query):
    relevant_chunks = get_relevant_chunks(query)
    context = " ".join(relevant_chunks)  # Combine the most relevant chunks
    # Generate the response with Groq's chat model; the prompt must include
    # the user's question as well as the retrieved context
    chat_completion = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": f"Answer the question using this context:\n{context}\n\nQuestion: {query}"
        }],
        model="llama3-8b-8192",  # Adjust with the appropriate Groq model
        stream=False
    )
    return chat_completion.choices[0].message.content
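
# A system message is another common way to constrain the model to the
# retrieved context. A sketch of the same call with one added, using the
# standard chat roles:
#
#     messages=[
#         {"role": "system", "content": "Answer only from the provided context."},
#         {"role": "user", "content": f"Context: {context}\n\nQuestion: {query}"},
#     ]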
# Streamlit app interface
st.title("Knowledge-Based Assistant")
st.write("Upload a PDF to generate answers based on its content.")
# Upload PDF file
pdf_file = st.file_uploader("Choose a PDF file", type="pdf")
if pdf_file is not None:
    # Extract the text content from the uploaded PDF
    document_text = extract_text_from_pdf(pdf_file)

    # Split the document into fixed-size chunks (adjust chunk size as needed)
    chunk_size = 1000  # Number of characters per chunk of text for embedding
    document_chunks = [document_text[i:i + chunk_size] for i in range(0, len(document_text), chunk_size)]
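
    # Fixed-size slicing can split sentences mid-chunk. Overlapping chunks are
    # a common mitigation; a sketch (the 200-character overlap is an example
    # value, not tuned):
    #
    #     overlap = 200
    #     document_chunks = [document_text[i:i + chunk_size]
    #                        for i in range(0, len(document_text), chunk_size - overlap)]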
    # Generate embeddings for each chunk and store them
    embeddings = [generate_huggingface_embeddings(chunk) for chunk in document_chunks]

    # Convert embeddings to a float32 numpy array, as FAISS requires
    embeddings_array = np.array(embeddings).astype("float32")

    # Initialize a FAISS index with L2 (Euclidean) distance and add the chunk embeddings
    index = faiss.IndexFlatL2(embeddings_array.shape[1])
    index.add(embeddings_array)
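
    # IndexFlatL2 performs exact nearest-neighbour search by Euclidean distance.
    # Cosine similarity is another common choice; a sketch using FAISS's
    # inner-product index over L2-normalized vectors:
    #
    #     faiss.normalize_L2(embeddings_array)
    #     index = faiss.IndexFlatIP(embeddings_array.shape[1])
    #     index.add(embeddings_array)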
    # Query input from user
    query = st.text_input("Ask a question about the document:")
    if query:
        # Generate the answer based on the query
        answer = generate_answer(query)
        st.write("Answer: ", answer)