# Install dependencies (PyPDF2 for PDF parsing; gradio, nltk, scikit-learn for the app)
!pip install PyPDF2 gradio nltk scikit-learn

import os

import gradio as gr
import nltk
import PyPDF2  # PDF text extraction
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK resources used below
nltk.download('punkt')
nltk.download('punkt_tab')  # required by newer NLTK releases for word_tokenize
nltk.download('stopwords')

# Function to extract text from PDFs using PyPDF2's PdfReader API
# (the old PdfFileReader/getNumPages/getPage/extractText interface was removed in PyPDF2 3.0)
def extract_text_from_pdf(pdf_path):
    pdf_text = ""
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            pdf_text += page.extract_text() or ""  # extract_text() can return None
    return pdf_text

# Function to clean and tokenize text: lowercase, drop punctuation and English stopwords
def clean_and_tokenize(text):
    stop_words = set(stopwords.words('english'))  # build the set once, not per token
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(tokens)

# Function to preprocess every PDF in the specified directory
def preprocess_documents(dataset_dir):
    documents = []
    for filename in os.listdir(dataset_dir):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(dataset_dir, filename)
            pdf_text = extract_text_from_pdf(pdf_path)
            documents.append(clean_and_tokenize(pdf_text))
    return documents

# Function to perform relevance matching and return the top N documents.
# Gradio passes the inputs positionally, so all three are plain parameters.
def perform_relevance_matching(query, uploaded_files, dataset_dir):
    # Preprocess the documents in the specified dataset directory
    documents = preprocess_documents(dataset_dir)

    # Extract text from the user-uploaded files
    uploaded_documents = []
    for file in uploaded_files or []:
        # Gradio may hand back file objects or plain paths depending on version
        path = getattr(file, 'name', file)
        uploaded_documents.append(extract_text_from_pdf(path))

    # The query and any uploads together form the search side of the comparison
    combined_documents = uploaded_documents + [query]

    # Vectorize the dataset and the search-side documents in one TF-IDF space
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents + combined_documents)

    # Cosine similarity of each search-side document against each dataset document
    cosine_similarities = cosine_similarity(
        tfidf_matrix[-len(combined_documents):],
        tfidf_matrix[:-len(combined_documents)]
    )

    # Average across the search-side rows so the query and every upload contribute
    mean_similarities = cosine_similarities.mean(axis=0)

    # Rank dataset documents by similarity score
    document_scores = list(enumerate(mean_similarities))
    sorted_documents = sorted(document_scores, key=lambda x: x[1], reverse=True)

    # Return the top N documents with a 500-character excerpt of each
    top_n = 5
    top_documents = []
    for doc_index, score in sorted_documents[:top_n]:
        excerpt = documents[doc_index][:500]
        top_documents.append(
            [f"Document {doc_index + 1} (Similarity Score: {score:.4f})", excerpt]
        )
    return top_documents

# Create a Gradio interface
iface = gr.Interface(
    fn=perform_relevance_matching,
    inputs=[
        gr.Textbox(label="Query"),                            # legal query
        gr.File(file_count="multiple", file_types=[".pdf"]),  # allow multiple PDF uploads
        gr.Textbox(label="Dataset directory"),                # folder of PDFs to search
    ],
    outputs=gr.Dataframe(headers=["Document", "Excerpt"]),
    title="Legal Research Assistant",
    description="Enter your legal query, upload files, and specify the dataset directory.",
)

# Launch the Gradio interface
iface.launch()
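
# Optional sanity check: a minimal sketch of calling the ranking function
# directly, without the Gradio UI. "./legal_pdfs" is a hypothetical folder of
# PDFs used only for illustration; point it at a real directory before running.
#
# results = perform_relevance_matching("breach of contract damages", [], "./legal_pdfs")
# for title, excerpt in results:
#     print(title)
#     print(excerpt[:120])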