ZeeAI1 committed on
Commit
d097b1f
·
verified ·
1 Parent(s): 497ecc7

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -0
app.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ import pdfplumber
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from langchain.embeddings import HuggingFaceEmbeddings
7
+ from langchain.vectorstores import FAISS
8
+ from transformers import pipeline
9
+
10
+ # Set up the page configuration
11
# Page-level settings: browser tab title, tab icon, and a centered layout.
st.set_page_config(
    page_title="RAG-based PDF Chat",
    page_icon="📄",
    layout="centered",
)
12
+
13
+ # Load the summarization pipeline model
14
@st.cache_resource
def load_summarization_pipeline():
    """Build the summarization pipeline backed by ``facebook/bart-large-cnn``.

    The ``st.cache_resource`` decorator lets Streamlit keep the returned
    pipeline object and hand it back on later calls instead of rebuilding it.

    Returns:
        The object returned by ``transformers.pipeline("summarization", ...)``.
    """
    return pipeline("summarization", model="facebook/bart-large-cnn")
18
+
19
# Module-level handle to the cached summarization pipeline; it is read by
# generate_summary_with_huggingface().
summarizer = load_summarization_pipeline()
20
+
21
+ # Split text into manageable chunks
22
@st.cache_data
def get_text_chunks(text):
    """Split ``text`` into overlapping chunks.

    The splitter is a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=10000`` and ``chunk_overlap=1000``.

    Args:
        text: The full text to split.

    Returns:
        The chunks produced by the splitter's ``split_text``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=1000,
    ).split_text(text)
27
+
28
+ # Initialize embedding function
29
@st.cache_resource
def _load_embedding_function():
    """Build the sentence-transformers embedding model.

    Cached with ``st.cache_resource`` (the same way the summarization
    pipeline is) so the model is not constructed again each time Streamlit
    re-executes this script.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# Module-level embedding function used when building the FAISS vector store.
embedding_function = _load_embedding_function()
30
+
31
+ # Create a FAISS vector store with embeddings, checking for empty chunks
32
@st.cache_resource
def load_or_create_vector_store(text_chunks):
    """Create a FAISS vector store from ``text_chunks``.

    Args:
        text_chunks: The texts to embed with the module-level
            ``embedding_function``.

    Returns:
        The store built by ``FAISS.from_texts``, or ``None`` (after showing
        an error in the UI) when ``text_chunks`` is empty.
    """
    if text_chunks:
        return FAISS.from_texts(text_chunks, embedding=embedding_function)

    # Nothing to index: tell the user and signal the failure with None.
    st.error("No valid text chunks found to create a vector store. Please check your PDF files.")
    return None
39
+
40
+ # Helper function to process a single PDF
41
def process_single_pdf(file_path):
    """Extract the text of every page of one PDF.

    Args:
        file_path: Path of the PDF to open with ``pdfplumber``.

    Returns:
        The text of all pages joined with newlines. Pages for which
        ``extract_text()`` returns nothing are skipped. If reading fails
        part-way, the failure is reported with ``st.error`` and the text
        collected so far (possibly ``""``) is returned.
    """
    page_texts = []
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
    except Exception as e:
        # Deliberately best-effort: one unreadable PDF must not abort the
        # whole load, so report it and keep whatever was extracted.
        st.error(f"Failed to read PDF: {file_path} - {e}")
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page.
    return "\n".join(page_texts)
52
+
53
+ # Function to load PDFs with progress display
54
def load_pdfs_with_progress(folder_path):
    """Load every PDF in ``folder_path`` and build the vector store.

    Shows a progress bar while the files are read, then stores the result in
    ``st.session_state['vector_store']`` (``None`` when nothing usable was
    found) and sets ``st.session_state['loading']`` to ``False``.

    Args:
        folder_path: Directory to scan (non-recursively) for PDF files.

    Returns:
        None. Results are communicated through ``st.session_state``.
    """
    try:
        # Sorted so the documents are processed in a stable order;
        # os.listdir() returns entries in arbitrary order.
        filenames = sorted(os.listdir(folder_path))
    except OSError as e:
        # Missing or unreadable folder: report it and finish loading instead
        # of crashing with 'loading' left set to True.
        st.error(f"Could not read folder: {folder_path} - {e}")
        st.session_state['vector_store'] = None
        st.session_state['loading'] = False
        return

    pdf_files = [
        os.path.join(folder_path, filename)
        for filename in filenames
        if filename.lower().endswith('.pdf')  # also accept ".PDF"
    ]
    num_files = len(pdf_files)

    if num_files == 0:
        st.error("No PDF files found in the specified folder.")
        st.session_state['vector_store'] = None
        st.session_state['loading'] = False
        return

    # Title for the progress bar
    st.markdown("### Loading data...")
    progress_bar = st.progress(0)
    status_text = st.empty()

    documents = []
    for processed_count, file_path in enumerate(pdf_files, start=1):
        document_text = process_single_pdf(file_path)
        if document_text:
            documents.append(document_text)
        fraction_done = processed_count / num_files
        progress_bar.progress(fraction_done)
        status_text.text(f"Loading documents: {int(fraction_done * 100)}% completed")

    progress_bar.empty()  # Remove the progress bar when done
    status_text.text("Document loading completed!")  # Show completion message

    # Separate documents with a newline so text from adjacent files does not
    # run together.
    all_text = "\n".join(documents)
    if all_text:
        text_chunks = get_text_chunks(all_text)
        st.session_state['vector_store'] = load_or_create_vector_store(text_chunks)
    else:
        st.session_state['vector_store'] = None

    st.session_state['loading'] = False  # Mark loading as complete
91
+
92
+ # Generate summary based on the retrieved text
93
def generate_summary_with_huggingface(query, retrieved_text):
    """Summarize the retrieved text in the context of the user's query.

    Args:
        query: The user's question; placed at the start of the model input.
        retrieved_text: Text retrieved for the query.

    Returns:
        The ``summary_text`` of the first result returned by the
        module-level ``summarizer`` pipeline.
    """
    summarization_input = f"{query} Related information:{retrieved_text}"
    # Let the tokenizer truncate to the model's input limit. The previous
    # code sliced the string to 1024 *characters*, which presumably was meant
    # as a token limit: it discarded most of the retrieved context and did
    # not bound the token count for text where one character maps to several
    # tokens.
    summary = summarizer(
        summarization_input,
        max_length=500,
        min_length=50,
        do_sample=False,
        truncation=True,
    )
    return summary[0]["summary_text"]
99
+
100
+ # Generate response for user query
101
def user_input(user_question):
    """Answer ``user_question`` from the loaded documents.

    Args:
        user_question: The question typed by the user.

    Returns:
        A summary generated from the documents returned by the vector
        store's similarity search, or a fixed explanatory message when
        ``st.session_state`` holds no vector store.
    """
    store = st.session_state.get('vector_store')
    if store is None:
        return "The app is still loading documents or no documents were successfully loaded."

    matches = store.similarity_search(user_question)
    context_text = " ".join(match.page_content for match in matches)
    return generate_summary_with_huggingface(user_question, context_text)
108
+
109
+ # Main function to run the Streamlit app
110
def main():
    """Render the page: title, document loading, question box, and answer."""
    # Use HTML to style the title with a larger font size
    st.markdown(
        """
        <h1 style="font-size:30px; text-align: center;">
        📄 JusticeCompass: Your AI-Powered Legal Navigator for Swift, Accurate Guidance.
        </h1>
        """,
        unsafe_allow_html=True
    )

    # Load the documents unless an earlier run already marked loading as done
    # (a missing 'loading' key counts as "not loaded yet").
    if st.session_state.get('loading', True):
        st.session_state['loading'] = True
        load_pdfs_with_progress('documents1')

    user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")

    if st.session_state.get('loading', True):
        st.info("The app is loading documents in the background. You can type your question now and submit once loading is complete.")

    # Nothing more to render until the button is pressed.
    if not st.button("Get Response"):
        return

    if not user_question:
        st.warning("Please enter a question before submitting.")
        return

    with st.spinner("Generating response..."):
        answer = user_input(user_question)
        st.markdown(f"**🤖 AI:** {answer}")
138
+
139
# Entry point: build the page only when this file is run as the main script.
if __name__ == "__main__":
    main()