Spaces:
Runtime error
Runtime error
import streamlit as st | |
import os | |
from groq import Groq | |
import fitz # PyMuPDF for PDF parsing | |
import numpy as np | |
import faiss | |
from sentence_transformers import SentenceTransformer # Hugging Face transformer | |
from io import BytesIO # To handle file upload correctly | |
# Initialize the Hugging Face model and Groq API client

# Sentence-embedding model shared by document chunks and user queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

# SECURITY: never commit the API key to source control. It is read from the
# environment instead (e.g. a repository secret when hosted on a platform
# such as Hugging Face Spaces). Any key that was previously hard-coded in this
# file must be treated as compromised and revoked.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Fail early with an actionable message rather than an opaque auth error
    # on the first chat request.
    raise RuntimeError(
        "GROQ_API_KEY is not set; define it as an environment variable "
        "(or hosting secret) before starting the app."
    )
client = Groq(api_key=GROQ_API_KEY)
# Function to extract text from a PDF
def extract_text_from_pdf(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: A binary file-like object exposing ``read()`` that yields the
            raw PDF bytes (the app passes the Streamlit upload object).

    Returns:
        str: The text of all pages joined in page order. Empty if no page
        yields any text.
    """
    # Open from the in-memory bytes; the context manager closes the document
    # so its resources are released even if a page fails to parse.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        # join() avoids repeated string reallocation on long documents.
        return "".join(page.get_text() for page in doc)
# Embedding helper backed by the module-level SentenceTransformer model
def generate_huggingface_embeddings(text):
    """Encode ``text`` with the shared ``model`` and return the embedding(s).

    The argument is forwarded unchanged to ``model.encode``.
    """
    return model.encode(text)
# Function to get relevant chunks from the document using FAISS similarity search
def get_relevant_chunks(query, top_k=5):
    """Return the document chunks most similar to ``query``.

    Relies on the module-level ``index`` (FAISS index) and ``document_chunks``
    (list of chunk strings) that the app builds after a PDF is uploaded.

    Args:
        query: The user's question.
        top_k: Maximum number of chunks to return.

    Returns:
        list[str]: Up to ``top_k`` chunks, nearest first. Empty when there are
        no chunks or ``top_k`` is not positive.
    """
    chunk_count = len(document_chunks)
    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, which as a Python index would silently pick the last
    # chunk again.
    k = min(top_k, chunk_count)
    if k <= 0:
        return []

    # FAISS expects a 2-D float32 array of shape (n_queries, dim).
    query_vector = np.asarray(
        generate_huggingface_embeddings(query), dtype=np.float32
    ).reshape(1, -1)

    _distances, indices = index.search(query_vector, k)

    # Defensive filter: skip any padding (-1) or out-of-range ids.
    return [document_chunks[i] for i in indices[0] if 0 <= i < chunk_count]
# Function to generate an answer based on retrieved context and Groq's model
def generate_answer(query):
    """Answer ``query`` using retrieved document chunks as context.

    Args:
        query: The user's question about the uploaded document.

    Returns:
        str: The text content of the first choice returned by the chat model.
    """
    relevant_chunks = get_relevant_chunks(query)
    context = " ".join(relevant_chunks)  # Combine the most relevant chunks

    # The question itself must be part of the prompt; with only the context
    # the model has no way to know what is being asked.
    prompt = (
        "Answer the question using only the context below.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}"
    )

    # Generate the response with Groq's chat model
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        # NOTE(review): confirm this model id is still served by Groq.
        model="llama3-8b-8192",
        stream=False,
    )
    return chat_completion.choices[0].message.content
# Streamlit app interface
st.title("Knowledge-Based Assistant")
st.write("Upload a PDF to generate answers based on its content.")

# Upload PDF file
pdf_file = st.file_uploader("Choose a PDF file", type="pdf")

if pdf_file is not None:
    # Extract the text content from the uploaded PDF
    document_text = extract_text_from_pdf(pdf_file)

    if not document_text.strip():
        # Without text there is nothing to embed; building the index from an
        # empty array would fail on embeddings_array.shape[1].
        st.warning("No extractable text was found in this PDF.")
    else:
        # Split the document into fixed-size character chunks
        # (adjust chunk size as needed).
        chunk_size = 1000
        document_chunks = [
            document_text[start:start + chunk_size]
            for start in range(0, len(document_text), chunk_size)
        ]

        # Embed all chunks in a single batched call; FAISS needs a 2-D
        # float32 array of shape (n_chunks, dim).
        embeddings_array = np.asarray(
            generate_huggingface_embeddings(document_chunks), dtype=np.float32
        )

        # Build an exact L2-distance index over the chunk embeddings.
        index = faiss.IndexFlatL2(embeddings_array.shape[1])
        index.add(embeddings_array)

        # Query input from user. Kept inside this branch because answering
        # depends on `index` and `document_chunks`, which only exist once a
        # document has been processed.
        query = st.text_input("Ask a question about the document:")
        if query:
            # Generate the answer based on the query
            answer = generate_answer(query)
            st.write("Answer: ", answer)