# Requires: pip install gradio PyPDF2 nltk scikit-learn
import gradio as gr
import os
import PyPDF2  # PyPDF2 for PDF text extraction
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Download the NLTK resources used for tokenization and stopword removal
# (newer NLTK releases may also require: nltk.download('punkt_tab'))
nltk.download('punkt')
nltk.download('stopwords')
# Function to extract text from PDFs using PyPDF2 (the PdfReader API; the old
# PdfFileReader/getPage/extractText methods were removed in PyPDF2 3.x)
def extract_text_from_pdf(pdf_path):
    pdf_text = ""
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            pdf_text += page.extract_text() or ""  # extract_text() may return None
    return pdf_text
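# Usage sketch (hypothetical file path, not part of the app flow):
#   text = extract_text_from_pdf("contracts/sample_nda.pdf")
#   print(text[:200])
# Extraction quality varies by PDF; scanned documents with no text layer come
# back empty, since PyPDF2 does not perform OCR.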
# Function to lowercase, tokenize, and strip punctuation/stopwords from raw text
STOPWORDS = set(stopwords.words('english'))  # build the set once, not per token

def clean_and_tokenize(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum() and word not in STOPWORDS]
    return ' '.join(tokens)
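# Example of the cleaning step (a sketch; the exact output depends on NLTK's
# English stopword list):
#   clean_and_tokenize("The plaintiff's motion was DENIED.")
#   -> 'plaintiff motion denied'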
# Function to read and clean every PDF in the specified directory
def preprocess_documents(dataset_dir):
    documents = []
    for filename in os.listdir(dataset_dir):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(dataset_dir, filename)
            pdf_text = extract_text_from_pdf(pdf_path)
            clean_text = clean_and_tokenize(pdf_text)
            documents.append(clean_text)
    return documents
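# Usage sketch (hypothetical directory): preprocess_documents("./legal_corpus")
# returns one cleaned string per PDF, in os.listdir() order, which is also the
# order the similarity scores below are reported in.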
# Function to rank the dataset documents against the query (plus any uploaded
# files) and return the top N matches. Note the plain positional signature:
# Gradio passes one argument per input component, so a *uploaded_files splat
# would swallow the dataset directory argument.
def perform_relevance_matching(query, uploaded_files, dataset_dir):
    # Preprocess the documents in the specified dataset directory
    documents = preprocess_documents(dataset_dir)
    # Clean the user-uploaded files so they share the dataset's TF-IDF vocabulary
    uploaded_documents = []
    for file in uploaded_files or []:
        # Gradio may hand back a tempfile object or a plain path, depending on version
        path = file.name if hasattr(file, 'name') else file
        uploaded_text = extract_text_from_pdf(path)
        uploaded_documents.append(clean_and_tokenize(uploaded_text))
    # Combine the uploaded documents and the (cleaned) query
    combined_documents = uploaded_documents + [clean_and_tokenize(query)]
    # Vectorize the dataset and the combined documents together
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents + combined_documents)
    # Cosine similarities between the combined documents (rows) and the dataset (columns)
    cosine_similarities = cosine_similarity(tfidf_matrix[-len(combined_documents):], tfidf_matrix[:-len(combined_documents)])
    # Rank dataset documents by similarity to the query, which is the last row
    # of the similarity matrix (row 0 would be the first uploaded file instead)
    document_scores = list(enumerate(cosine_similarities[-1]))
    sorted_documents = sorted(document_scores, key=lambda x: x[1], reverse=True)
    # Extract the top N relevant documents
    top_n = 5
    top_documents = []
    for i in range(min(top_n, len(sorted_documents))):
        doc_index, score = sorted_documents[i]
        document_text = documents[doc_index][:500]  # first 500 characters as a preview
        top_documents.append((f"Document {doc_index + 1} (Similarity Score: {score:.4f})", document_text))
    return top_documents
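# A minimal, self-contained sketch of the ranking math above, using in-memory
# strings instead of PDFs. _demo_ranking is a hypothetical helper for
# experimentation only (exact scores depend on scikit-learn's TF-IDF defaults);
# it is defined here but never called by the app.
def _demo_ranking():
    corpus = [
        "breach of contract damages awarded to the plaintiff",
        "patent infringement claim dismissed on appeal",
        "commercial lease agreement renewal terms",
    ]
    query = "damages for breach of contract"
    matrix = TfidfVectorizer().fit_transform(corpus + [query])
    # Last row is the query; earlier rows are the corpus, exactly as above
    scores = cosine_similarity(matrix[-1:], matrix[:-1])[0]
    for idx in scores.argsort()[::-1]:  # highest score should be the contract document
        print(f"doc {idx}: {scores[idx]:.4f}")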
# Create a Gradio interface
iface = gr.Interface(
    fn=perform_relevance_matching,
    inputs=[
        gr.Textbox(label="Legal query"),
        gr.File(file_count="multiple"),  # gr.File takes file_count, not multiple=
        gr.Textbox(label="Dataset directory"),
    ],
    # gr.Table is not a Gradio component; Dataframe renders the (label, excerpt) rows
    outputs=gr.Dataframe(headers=["Document", "Excerpt"]),
    # live=True is dropped: it would re-run the function on every keystroke,
    # before a valid dataset directory has been entered
    title="Legal Research Assistant",
    description="Enter your legal query, upload files, and specify the dataset directory.",
)
# Launch the Gradio interface
iface.launch()
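# To expose a temporary public URL when running outside a hosted Space, Gradio
# supports: iface.launch(share=True)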