Spaces:

Shahabmoin
/

Knowledge-Assistant

Runtime error

App Files Files Community

Knowledge-Assistant / app.py

Shahabmoin

Update app.py

8f62e8c verified about 1 month ago

raw

history blame contribute delete

3.29 kB

	import streamlit as st
	import os
	from groq import Groq
	import fitz # PyMuPDF for PDF parsing
	import numpy as np
	import faiss
	from sentence_transformers import SentenceTransformer # Hugging Face transformer
	from io import BytesIO # To handle file upload correctly

	# Initialize the Hugging Face embedding model and the Groq API client.
	model = SentenceTransformer('all-MiniLM-L6-v2')  # Sentence-embedding model used for retrieval

	# Read the Groq credential from the environment instead of embedding it in
	# source control. NOTE: any key that was previously committed in this file
	# should be treated as compromised and revoked.
	GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
	if not GROQ_API_KEY:
	    # Fail early with a readable message rather than an opaque client error.
	    st.error("GROQ_API_KEY is not set. Add it to the environment (or the Space's secrets) and restart.")
	    st.stop()
	client = Groq(api_key=GROQ_API_KEY)

	# Function to extract text from a PDF
	def extract_text_from_pdf(file):
	    """Return the concatenated text of every page in a PDF.

	    Args:
	        file: A binary file-like object exposing ``read()`` whose bytes are
	            a PDF document (the app passes the Streamlit upload object).

	    Returns:
	        The text of all pages joined in page order, as a single string.
	    """
	    # Open from an in-memory byte stream; the context manager closes the
	    # document even if text extraction raises part-way through.
	    with fitz.open(stream=file.read(), filetype="pdf") as doc:
	        return "".join(page.get_text() for page in doc)

	# Function to generate embeddings using Hugging Face model (for text retrieval)
	def generate_huggingface_embeddings(text):
	    """Encode ``text`` with the module-level SentenceTransformer ``model``.

	    Args:
	        text: Input forwarded unchanged to ``model.encode``.

	    Returns:
	        Whatever ``model.encode`` returns for ``text``.
	    """
	    return model.encode(text)

	# Function to get relevant chunks from the document using FAISS similarity search
	def get_relevant_chunks(query, top_k=5):
	    """Return the document chunks most similar to ``query``.

	    Relies on the module-level ``index`` (FAISS index) and
	    ``document_chunks`` (list of chunk strings) that the app builds after a
	    PDF has been uploaded.

	    Args:
	        query: The user's question.
	        top_k: Maximum number of chunks to return.

	    Returns:
	        A list of at most ``top_k`` chunk strings, in the order the FAISS
	        search returns them. Empty when the index holds no vectors.
	    """
	    # FAISS expects a 2-D float32 array of shape (n_queries, dim).
	    query_embedding = np.asarray(
	        generate_huggingface_embeddings(query), dtype="float32"
	    ).reshape(1, -1)

	    # Never ask for more neighbours than the index actually stores.
	    k = min(top_k, index.ntotal)
	    if k <= 0:
	        return []

	    _, indices = index.search(query_embedding, k)

	    # FAISS pads missing results with -1; without the filter a -1 would
	    # silently select the *last* chunk via Python's negative indexing.
	    return [document_chunks[i] for i in indices[0] if i >= 0]

	# Function to generate an answer based on retrieved context and Groq's model
	def generate_answer(query):
	    """Answer ``query`` using retrieved document chunks as context.

	    Retrieves the most relevant chunks via ``get_relevant_chunks``, joins
	    them into one context string, and sends the context together with the
	    question to the Groq chat-completion endpoint through the module-level
	    ``client``.

	    Args:
	        query: The user's question about the uploaded document.

	    Returns:
	        The text content of the first completion choice.
	    """
	    relevant_chunks = get_relevant_chunks(query)
	    context = " ".join(relevant_chunks)  # Combine the most relevant chunks

	    # The question itself must be part of the prompt; previously only the
	    # context was sent, so the model never saw what was being asked.
	    prompt = f"Answer based on this: {context}\n\nQuestion: {query}"

	    chat_completion = client.chat.completions.create(
	        messages=[{"role": "user", "content": prompt}],
	        model="llama3-8b-8192",  # Adjust with the appropriate Groq model
	        stream=False,
	    )
	    return chat_completion.choices[0].message.content

	# Streamlit app interface
	st.title("Knowledge-Based Assistant")
	st.write("Upload a PDF to generate answers based on its content.")

	# Upload PDF file
	pdf_file = st.file_uploader("Choose a PDF file", type="pdf")

	if pdf_file is not None:
	    # Extract the text content from the uploaded PDF
	    document_text = extract_text_from_pdf(pdf_file)

	    if not document_text.strip():
	        # With no text there are no chunks, and building the FAISS index
	        # from an empty embedding array fails on ``shape[1]`` (e.g. for an
	        # image-only PDF). Tell the user instead of crashing.
	        st.warning("No extractable text was found in this PDF.")
	    else:
	        # Split the document into fixed-size character chunks
	        chunk_size = 1000  # Size of each chunk of text for embedding
	        document_chunks = [
	            document_text[i:i + chunk_size]
	            for i in range(0, len(document_text), chunk_size)
	        ]

	        # Generate one embedding per chunk
	        embeddings = [generate_huggingface_embeddings(chunk) for chunk in document_chunks]

	        # FAISS requires a 2-D float32 array; make the dtype explicit
	        embeddings_array = np.array(embeddings, dtype="float32")

	        # Build an exact L2-distance index sized to the embedding dimension
	        index = faiss.IndexFlatL2(embeddings_array.shape[1])
	        index.add(embeddings_array)

	        # Query input from user
	        query = st.text_input("Ask a question about the document:")

	        if query:
	            # Generate the answer based on the query
	            answer = generate_answer(query)
	            st.write("Answer: ", answer)