sadickam committed on
Commit 71a06ed · verified · 1 Parent(s): e230b99

Create app.py

Files changed (1)
app.py +848 -0
app.py ADDED
import gradio as gr
import os
import re
import torch
import pandas as pd
import plotly.express as px
import plotly.io as pio
import nltk
import tempfile
from io import BytesIO
import base64
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from nltk.tokenize import sent_tokenize
from docx.shared import Inches
from docx import Document
import numpy as np
# Needed for Hugging Face Spaces GPU access
import spaces

from styles import custom_css  # Custom CSS for the Gradio interface

# Sentence tokenizer data for NLTK
nltk.download('punkt')

# PyPDFLoader for PDF processing
from langchain_community.document_loaders import PyPDFLoader

# Model checkpoint for sdgBERT
checkpoint = "sadickam/sdgBERT"

# Define device first so the model loader below can use it
# (uses the GPU when available, e.g. on Hugging Face Spaces GPU hardware)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Text cleaning function
def clean_text(text):
    """
    Cleans the extracted text by removing irrelevant characters while retaining
    currency symbols and sentence punctuation.
    """
    text = text.strip()
    # Define the allowed characters (including currency symbols)
    allowed_chars = r'[^a-zA-Z0-9\s\.,!?$€£¥₹¢₩]'
    text = re.sub(allowed_chars, '', text)
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text

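# Illustrative example (not part of the app's flow): currency symbols and
# sentence punctuation survive cleaning, while other symbols are dropped, e.g.
#   clean_text("Net cost:\n $1,200 (approx.)")  ->  "Net cost $1,200 approx."
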
# Preprocessing function for text
def prep_text(text):
    clean_sents = []
    sent_tokens = sent_tokenize(str(text))
    for sent_token in sent_tokens:
        word_tokens = [str(word_token).strip().lower() for word_token in sent_token.split()]
        clean_sents.append(' '.join(word_tokens))
    joined = ' '.join(clean_sents).strip()
    return re.sub(r'`|"', "", joined)

# Load the tokenizer and model onto the selected device
def load_model_and_tokenizer():
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return model, tokenizer

# SDG labels (sdgBERT predicts the first 16 SDGs)
label_list = [
    'SDG1_No Poverty', 'SDG2_Zero Hunger', 'SDG3_Good Health and Well-being', 'SDG4_Quality Education',
    'SDG5_Gender Equality', 'SDG6_Clean Water and Sanitation', 'SDG7_Affordable and Clean Energy',
    'SDG8_Decent Work and Economic Growth', 'SDG9_Industry, Innovation and Infrastructure',
    'SDG10_Reduced Inequality', 'SDG11_Sustainable Cities and Communities',
    'SDG12_Responsible Consumption and Production', 'SDG13_Climate Action',
    'SDG14_Life Below Water', 'SDG15_Life on Land', 'SDG16_Peace, Justice and Strong Institutions'
]

# Predict SDG probabilities for a batch of text inputs
def predict_sdg_labels_batch(texts, model, tokenizer):
    tokenized_texts = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    model.eval()
    with torch.no_grad():
        text_logits = model(**tokenized_texts).logits
    predictions = torch.softmax(text_logits, dim=1).tolist()
    return predictions

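# Illustrative usage (hypothetical input text): each row of the returned list
# is a softmax distribution over the 16 SDG labels, so it sums to ~1.0:
#   model, tokenizer = load_model_and_tokenizer()
#   probs = predict_sdg_labels_batch(["Access to clean water reduces disease."], model, tokenizer)
#   assert len(probs[0]) == len(label_list)
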
# Page-level predictions with batch processing
def predict_pages(page_df, batch_size=32):
    model, tokenizer = load_model_and_tokenizer()
    df_results = page_df.copy()
    num_rows = len(page_df)
    all_predicted_labels = [[] for _ in range(16)]
    all_prediction_scores = [[] for _ in range(16)]

    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        df_chunk = page_df.iloc[start:end]
        # Clean and preprocess the text before prediction
        texts = df_chunk['Text'].apply(clean_text).apply(prep_text).tolist()
        predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
        for predictions in predictions_batch:
            sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
            for i, (label, score) in enumerate(sorted_preds):
                all_predicted_labels[i].append(label)
                all_prediction_scores[i].append(score)

    # Add columns to the DataFrame in the desired order (pred1, score1, pred2, score2, ...)
    for i in range(16):
        df_results[f'pred{i + 1}'] = all_predicted_labels[i]
        df_results[f'score{i + 1}'] = all_prediction_scores[i]

    # Reorder columns so preds and scores are interleaved in the correct order
    reordered_columns = []
    for i in range(16):
        reordered_columns.append(f'pred{i + 1}')
        reordered_columns.append(f'score{i + 1}')
    other_columns = [col for col in df_results.columns if col not in reordered_columns]
    df_results = df_results[other_columns + reordered_columns]

    return df_results

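# The returned frame is therefore laid out (schematically) as:
#   Document | Page | Text | pred1 | score1 | pred2 | score2 | ... | pred16 | score16
# where pred1/score1 is the highest-probability SDG for each page.
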
# Sentence-level predictions with batch processing
def predict_sentences(sentence_df, batch_size=32):
    model, tokenizer = load_model_and_tokenizer()
    df_combined_sentences = sentence_df.copy()

    num_rows = len(sentence_df)
    all_predicted_labels = [[] for _ in range(16)]
    all_prediction_scores = [[] for _ in range(16)]

    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        df_chunk = sentence_df.iloc[start:end]
        # Clean and preprocess the text before prediction
        texts = df_chunk['Sentence'].apply(clean_text).apply(prep_text).tolist()
        predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
        for predictions in predictions_batch:
            sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
            for i, (label, score) in enumerate(sorted_preds):
                all_predicted_labels[i].append(label)
                all_prediction_scores[i].append(round(score, 3))

    # Add predictions and scores to the DataFrame
    for i in range(16):
        df_combined_sentences[f'pred{i + 1}'] = all_predicted_labels[i]
        df_combined_sentences[f'score{i + 1}'] = all_prediction_scores[i]

    # Reorder columns so preds and scores are interleaved
    reordered_columns = []
    for i in range(16):
        reordered_columns.append(f'pred{i + 1}')
        reordered_columns.append(f'score{i + 1}')
    other_columns = [col for col in df_combined_sentences.columns if col not in reordered_columns]
    df_combined_sentences = df_combined_sentences[other_columns + reordered_columns]

    return df_combined_sentences

# Define unique colors for each SDG
sdg_colors = {
    "SDG1_No Poverty": "#E5243B",
    "SDG2_Zero Hunger": "#DDA63A",
    "SDG3_Good Health and Well-being": "#4C9F38",
    "SDG4_Quality Education": "#C5192D",
    "SDG5_Gender Equality": "#FF3A21",
    "SDG6_Clean Water and Sanitation": "#26BDE2",
    "SDG7_Affordable and Clean Energy": "#FCC30B",
    "SDG8_Decent Work and Economic Growth": "#A21942",
    "SDG9_Industry, Innovation and Infrastructure": "#FD6925",
    "SDG10_Reduced Inequality": "#DD1367",
    "SDG11_Sustainable Cities and Communities": "#FD9D24",
    "SDG12_Responsible Consumption and Production": "#BF8B2E",
    "SDG13_Climate Action": "#3F7E44",
    "SDG14_Life Below Water": "#0A97D9",
    "SDG15_Life on Land": "#56C02B",
    "SDG16_Peace, Justice and Strong Institutions": "#00689D"
}

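# These hex values follow the official UN SDG colour palette, so the bars in
# the graphs below match the familiar SDG branding.
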
# Plot SDG bar graphs using Plotly, overlaying the icon of the top SDG
def plot_sdg(df, title, pred_column, icons_folder='assets/icons/'):
    """
    Plots a horizontal bar graph of SDG predictions and superimposes the icon
    of the most frequent SDG.

    Args:
        df (pd.DataFrame): DataFrame containing SDG predictions.
        title (str): Title of the plot.
        pred_column (str): Column name to use for plotting (e.g., 'pred1').
        icons_folder (str): Path to the folder containing SDG icons.

    Returns:
        plotly.graph_objs._figure.Figure: The Plotly figure object.
    """
    df_filtered = df[df[pred_column].notna()]
    labels = df_filtered[pred_column].value_counts().sort_values(ascending=False)
    total = labels.sum()
    percentages = (labels / total) * 100

    # Create a horizontal bar plot with Plotly
    fig = px.bar(
        percentages.rename_axis('SDG Label').reset_index(name='Percentage'),
        y='SDG Label',
        x='Percentage',
        orientation='h',
        title=title,
        color='SDG Label',
        color_discrete_map=sdg_colors  # Use the defined unique colors for each SDG
    )

    # Show y-axis tick labels
    fig.update_yaxes(showticklabels=True)

    # Add percentage labels to the bars
    fig.update_traces(
        texttemplate='%{x:.2f}%',
        textposition='auto',
        textfont=dict(size=10)
    )

    # Adjust layout for better visibility
    fig.update_layout(
        title=dict(
            text=title, font=dict(size=14)
        ),
        yaxis=dict(
            automargin=True,
            title=None,
            tickfont=dict(size=12)
        ),
        margin=dict(l=20, r=30, t=100, b=20),  # Extra top/right margin for the icon
        height=600,
        showlegend=False,
        template="simple_white",
        xaxis=dict(
            tickfont=dict(size=12)
        ),
    )

    # Identify the most frequent SDG and overlay its icon
    if not percentages.empty:
        top_sdg_label = percentages.index[0]  # e.g., 'SDG1_No Poverty'

        # Map the SDG label to an icon filename, assuming the naming
        # convention 'SDG1.png', 'SDG2.png', etc.
        sdg_number = top_sdg_label.split('_')[0]  # Extract 'SDG1'
        icon_filename = f"{sdg_number}.png"
        icon_path = os.path.join(icons_folder, icon_filename)

        # Check if the icon file exists
        if os.path.exists(icon_path):
            # Read and base64-encode the image
            with open(icon_path, 'rb') as image_file:
                encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

            # Add the icon as an image in the Plotly figure
            fig.add_layout_image(
                dict(
                    source='data:image/png;base64,' + encoded_image,
                    xref="paper", yref="paper",
                    x=0.4, y=1.2,  # Position: slightly right of centre, above the plot
                    sizex=0.2, sizey=0.2,  # Size of the icon
                    xanchor="left",
                    yanchor="top",
                    layer="above"  # Keep the icon above other plot elements
                )
            )
        else:
            print(f"Icon file '{icon_path}' not found. Skipping icon overlay.")

    return fig

def save_figure_as_jpeg(fig, filename):
    """Saves the Plotly figure as a high-resolution JPEG."""
    pio.write_image(fig, filename, format='jpeg', width=1200, height=600, scale=6)

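# Note: static image export in Plotly (write_image / pio.write_image) renders
# via the kaleido package, which must be installed in the environment.
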
# Generate reports (page and sentence levels)
def generate_page_report(df_pages, report_file_name):
    doc = Document()
    doc.add_heading("Page-Level SDG Analysis Report", 0)

    doc.add_heading("📋 General Notes", level=2)
    doc.add_paragraph(
        'This app conducts page-level analysis of the uploaded document. Each page is processed by the sdgBERT AI model trained to predict the first 16 '
        'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
        'representing the likelihood that the text is aligned with particular SDGs. This page-level '
        'analysis provides high-level insight into SDG alignment.'
        '\n\n'
        'Given that a page may align with more than one SDG, this app focuses on the top two SDG predictions '
        '(Primary and Secondary) for each page with a probability score greater than zero.'
    )

    doc.add_heading("Primary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph displays the top SDG the AI model associates with each page. The bars '
        'represent the percentage of pages most strongly aligned with each SDG. This offers insight into the dominant '
        'sustainable development theme within the document.'
    )

    doc.add_heading("Secondary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph shows the second most relevant SDG for each page. Although these SDGs are '
        'not the primary focus, the text has some relevance to these goals.'
    )

    for doc_name in df_pages['Document'].unique():
        # Sanitize doc_name for use in file names
        sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])

        doc.add_heading(f"📄 Document: {doc_name}", level=2)
        df_doc = df_pages[df_pages['Document'] == doc_name]

        # Generate and save graphs
        first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_page.jpeg"
        second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_page.jpeg"

        plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
            first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
        plot_sdg(df_doc, "Secondary SDGs", 'pred2').write_image(
            second_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")

        # Add plots to the Word document
        doc.add_picture(first_sdg_plot_path, width=Inches(6))
        doc.add_picture(second_sdg_plot_path, width=Inches(6))

    doc.save(report_file_name)
    return report_file_name

def generate_sentence_report(df_sentences, report_file_name):
    doc = Document()
    doc.add_heading("Sentence-Level SDG Analysis Report", 0)

    doc.add_heading("📋 General Notes", level=2)
    doc.add_paragraph(
        'This app splits documents into sentences using a natural language processing algorithm. '
        'Each sentence is processed by the sdgBERT AI model trained to predict the first 16 '
        'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
        'representing the likelihood that the text is aligned with particular SDGs. This sentence-level '
        'analysis provides deeper insight into SDG alignment.'
        '\n\n'
        'Given that a sentence may align with more than one SDG, this app focuses on the top two SDG predictions '
        '(Primary and Secondary) for each sentence with a probability score greater than zero.'
    )

    doc.add_heading("Primary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph displays the top SDG the AI model associates with each sentence. The bars '
        'represent the percentage of sentences most strongly aligned with each SDG. This offers deeper insight '
        'into the dominant sustainable development theme within the document.'
    )

    doc.add_heading("Secondary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph shows the second most relevant SDG for each sentence. Although these SDGs are not '
        'the primary focus, the text has some relevance to these goals.'
    )

    for doc_name in df_sentences['Document'].unique():
        # Sanitize doc_name for use in file names
        sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])

        doc.add_heading(f"📄 Document: {doc_name}", level=2)
        df_doc = df_sentences[df_sentences['Document'] == doc_name]

        # Generate and save graphs
        first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_sentence.jpeg"
        second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_sentence.jpeg"

        plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
            first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
        plot_sdg(df_doc, "Secondary SDGs", 'pred2').write_image(
            second_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")

        # Add plots to the Word document
        doc.add_picture(first_sdg_plot_path, width=Inches(6))
        doc.add_picture(second_sdg_plot_path, width=Inches(6))

    doc.save(report_file_name)
    return report_file_name

# Text extraction with text cleaning and line joining
def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
    """
    Extract text from a PDF page by page using LangChain's PyPDFLoader.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        start_page (int, optional): The starting page number for extraction (1-based index).
        end_page (int, optional): The ending page number for extraction (1-based index).

    Returns:
        tuple:
            - page_df (pd.DataFrame): DataFrame containing Document, Page, and Text.
            - sentence_df (pd.DataFrame): DataFrame containing Document, Page, and Sentence.
    """
    try:
        # Initialize the loader; load() returns one Document per page, which
        # keeps the page numbering below accurate (load_and_split() would
        # re-chunk the text and break the page correspondence)
        loader = PyPDFLoader(pdf_file_path)
        documents = loader.load()

        total_pages = len(documents)
        doc_name = os.path.basename(pdf_file_path)  # Extract document name

        # Validate and adjust the page range
        if start_page is not None and end_page is not None:
            # Convert to integers to avoid slicing issues
            start_page = int(start_page)
            end_page = int(end_page)

            # Clamp to the valid range
            if start_page < 1:
                start_page = 1
            if end_page > total_pages:
                end_page = total_pages
            if start_page > end_page:
                start_page, end_page = end_page, start_page  # Swap if out of order

            # Select the subset of pages based on user input
            selected_docs = documents[start_page - 1:end_page]
        else:
            selected_docs = documents
            start_page = 1
            end_page = total_pages

        # Initialize lists to store data
        page_data = []
        sentence_data = []

        for idx, doc in enumerate(selected_docs, start=start_page):
            page_num = idx
            text = doc.page_content.strip()

            # Join lines that belong to the same sentence
            lines = text.split('\n')
            joined_text = ' '.join(line.strip() for line in lines if line.strip())

            # Clean text
            cleaned_text = clean_text(joined_text)

            # Append page-wise data
            page_data.append({
                "Document": doc_name,
                "Page": page_num,
                "Text": cleaned_text
            })

            # Sentence tokenization; sentences of 70 characters or fewer are
            # skipped as likely headings or fragments
            sentences = sent_tokenize(cleaned_text)
            for sentence in sentences:
                sentence = sentence.strip()
                if sentence and len(sentence) > 70:
                    sentence_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Sentence": sentence
                    })

        # Create DataFrames
        page_df = pd.DataFrame(page_data)
        sentence_df = pd.DataFrame(sentence_data)

        return page_df, sentence_df

    except Exception as e:
        raise RuntimeError(f"Error during PDF extraction: {e}")

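# Illustrative call (hypothetical file name), showing the two frames returned:
#   page_df, sentence_df = extract_text_with_py_pdf_loader("report.pdf", start_page=2, end_page=5)
#   list(page_df.columns)      ->  ['Document', 'Page', 'Text']
#   list(sentence_df.columns)  ->  ['Document', 'Page', 'Sentence']
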
def df_to_csv_bytes(df):
    """
    Convert a DataFrame to CSV data in bytes.

    Args:
        df (pd.DataFrame): The DataFrame to convert.

    Returns:
        bytes: CSV data in bytes.
    """
    try:
        buffer = BytesIO()
        df.to_csv(buffer, index=False)
        csv_data = buffer.getvalue()
        buffer.close()
        return csv_data
    except Exception as e:
        raise RuntimeError(f"Error during CSV conversion: {e}")

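# Illustrative usage (this helper is not currently wired into the interface):
#   csv_bytes = df_to_csv_bytes(pd.DataFrame({"Page": [1], "Text": ["..."]}))
#   with open("predictions.csv", "wb") as f:
#       f.write(csv_bytes)
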
def launch_interface():
    with gr.Blocks(css=custom_css) as demo:

        # Title as a visible heading at the top of the page, with an icon
        gr.Markdown(
            """
            # 🌍 SDG Document Analysis App
            Analyze documents to map Sustainable Development Goals (SDGs) at both page and sentence levels.
            """
        )

        # Shared PDF file input for both analyses
        gr.Markdown("## Upload PDF File")
        with gr.Row():
            file_input = gr.File(
                label="📁 Upload PDF File for Analysis", file_types=[".pdf"]
            )

        # Extraction mode selection with explanatory text
        gr.Markdown(
            """
            ## PDF Text Extraction Mode
            Choose whether to analyze all pages or a specific range of pages. To exclude certain pages from the analysis, select
            "Range of Pages" and specify the start and end pages.
            """
        )
        with gr.Row():
            extraction_mode = gr.Radio(
                choices=["All Pages", "Range of Pages"],
                value="All Pages",
                label="Extraction Mode"
            )

        with gr.Row():
            start_page = gr.Number(value=1, label="🔢 Start Page", visible=False, info="The cover page is page 1")
            end_page = gr.Number(value=1, label="🔢 End Page", visible=False)

        # Show the start/end page inputs only in "Range of Pages" mode
        def update_page_inputs(extraction_mode):
            if extraction_mode == "Range of Pages":
                return gr.update(visible=True), gr.update(visible=True)
            else:
                return gr.update(visible=False), gr.update(visible=False)

        extraction_mode.change(
            update_page_inputs,
            inputs=extraction_mode,
            outputs=[start_page, end_page]
        )

        # Main Tabs for Page-Level and Sentence-Level Analysis
        gr.Markdown("## SDG Analysis Type")

        with gr.Tab("📄 Page-Level Analysis"):
            gr.Markdown(
                """
                ### Page-Level SDG Analysis
                This section conducts Sustainable Development Goals (SDG) mapping
                of documents using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
                It provides **high-level SDG mapping** of documents at the page level.
                """
            )

            with gr.Row():
                page_button = gr.Button("🏃‍♂️ Run Page-Level Analysis")
                reset_page_button = gr.Button("🔄 Reset Page-Level Analysis", elem_classes="reset-button")

            # Nested Tabs for Primary and Secondary SDGs
            with gr.Tabs():
                with gr.TabItem("📊 Primary SDGs"):
                    with gr.Row():
                        primary_page_plot = gr.Plot(label="📊 Primary SDGs Graph [Page-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Primary SDGs bar graph on the left will show " +
                            "the percentage of pages that strongly align with each SDG. The icon for the most frequent " +
                            "SDG will be highlighted above the graph. Download the Page Predictions CSV for further details.",
                            label="Note", container=True
                        )

                    gr.Markdown("##### Download Results")
                    with gr.Row():
                        page_csv = gr.File(label="📊 Download Page Predictions CSV")
                        page_docx = gr.File(label="📄 Download Page Report DOCX")
                        page_jpeg1 = gr.File(label="🖼️ Download Primary SDGs JPEG")

                with gr.TabItem("📈 Secondary SDGs"):
                    with gr.Row():
                        secondary_page_plot = gr.Plot(label="📈 Secondary SDGs Graph [Page-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Secondary SDGs bar graph on the left will show " +
                            "SDGs that are not the primary focus of the pages analysed. These SDGs are second to the " +
                            "Primary SDGs. Download the Page Predictions CSV for further details.",
                            label="Note", container=True
                        )

                    gr.Markdown("##### Download Results")
                    with gr.Row():
                        page_csv_secondary = gr.File(label="📊 Download Page Predictions CSV")
                        page_report_file_secondary = gr.File(label="📄 Download Page Report DOCX")
                        secondary_page_jpeg = gr.File(label="🖼️ Download Secondary SDGs JPEG")

        with gr.Tab("✍️ Sentence-Level Analysis"):
            gr.Markdown(
                """
                ### Sentence-Level SDG Analysis
                This section conducts Sustainable Development Goals (SDG) mapping
                using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
                It provides **detailed SDG mapping** at the sentence level.
                """
            )

            with gr.Row():
                sentence_button = gr.Button("🏃‍♂️ Run Sentence-Level Analysis")
                reset_sentence_button = gr.Button("🔄 Reset Sentence-Level Analysis", elem_classes="reset-button")

            # Nested Tabs for Primary and Secondary SDGs
            with gr.Tabs():
                with gr.TabItem("📊 Primary SDGs"):
                    with gr.Row():
                        primary_sentence_plot = gr.Plot(label="📊 Primary SDGs Graph [Sentence-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Primary SDGs bar graph on the left will show " +
                            "the percentage of sentences that strongly align with each SDG. The icon for the most frequent " +
                            "SDG will be highlighted above the graph. Download the Sentence Predictions CSV for further details.",
                            label="Note", container=True
                        )

                    gr.Markdown("##### Download Results")
                    with gr.Row():
                        sentence_csv = gr.File(label="📊 Download Sentence Predictions CSV")
                        sentence_docx = gr.File(label="📄 Download Sentence Report DOCX")
                        sentence_jpeg1 = gr.File(label="🖼️ Download Primary SDGs JPEG")

                with gr.TabItem("📈 Secondary SDGs"):
                    with gr.Row():
                        secondary_sentence_plot = gr.Plot(label="📈 Secondary SDGs Graph [Sentence-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Secondary SDGs bar graph on the left will show " +
                            "SDGs that are not the primary focus of the sentences analysed. These SDGs are second to the " +
                            "Primary SDGs. Download the Sentence Predictions CSV for further details.",
                            label="Note", container=True
                        )

                    gr.Markdown("##### Download Results")
                    with gr.Row():
                        sentence_csv_secondary = gr.File(label="📊 Download Sentence Predictions CSV")
                        sentence_report_file_secondary = gr.File(label="📄 Download Sentence Report DOCX")
                        secondary_sentence_jpeg = gr.File(label="🖼️ Download Secondary SDGs JPEG")

        # Process page-level analysis
        @spaces.GPU
        def process_pages(file, extraction_mode, start_page, end_page):
            if not file:
                # Return None for each output component
                return [None, None, None, None, None, None, None, None]

            try:
                if hasattr(file, 'name'):
                    pdf_file_path = file.name
                    original_file_name = os.path.basename(file.name)
                else:
                    # Save the file to a temporary location
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                        temp_pdf.write(file.read())
                        pdf_file_path = temp_pdf.name
                    original_file_name = 'uploaded_document'

                # Sanitize the file name for use in output file names
                sanitized_file_name = os.path.splitext(original_file_name)[0]
                sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)

                # Determine the page range based on extraction_mode
                if extraction_mode == "All Pages":
                    selected_start = None
                    selected_end = None
                else:
                    selected_start = int(start_page)
                    selected_end = int(end_page)

                # Extract text and create DataFrames
                page_df, _ = extract_text_with_py_pdf_loader(
                    pdf_file_path,
                    start_page=selected_start,
                    end_page=selected_end
                )

                # Predict SDGs at page level
                df_page_predictions = predict_pages(page_df)

                # Generate plots with icon overlay
                first_plot = plot_sdg(
                    df_page_predictions, "📊 Primary SDGs", 'pred1'
                )
                second_plot = plot_sdg(
                    df_page_predictions, "📈 Secondary SDGs", 'pred2'
                )

                # Define output file names; the Primary and Secondary tabs share
                # the same CSV and DOCX report, so each is generated once
                page_csv_file = f"{sanitized_file_name}_SDG-Page_predictions.csv"
                page_report_file = f"{sanitized_file_name}_SDG-Page_report.docx"
                primary_page_jpeg = f"{sanitized_file_name}_SDG-Page_primary_graph.jpeg"
                secondary_page_jpeg = f"{sanitized_file_name}_SDG-Page_secondary_graph.jpeg"

                # Save CSV and report
                df_page_predictions.to_csv(page_csv_file, index=False)
                generate_page_report(df_page_predictions, page_report_file)

                # Save figures as JPEG
                save_figure_as_jpeg(first_plot, primary_page_jpeg)
                save_figure_as_jpeg(second_plot, secondary_page_jpeg)

                return (
                    first_plot, second_plot,
                    page_csv_file, page_report_file, primary_page_jpeg,
                    page_csv_file, page_report_file, secondary_page_jpeg
                )

            except Exception as e:
                print(f"Error: {e}")
                return [None, None, None, None, None, None, None, None]

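        # Note: the 8-tuple returned by process_pages must match, in order, the
        # `outputs` list wired to page_button.click below; process_sentences
        # follows the same pattern for sentence_button.click.
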
        # Process sentence-level analysis
        @spaces.GPU
        def process_sentences(file, extraction_mode, start_page, end_page):
            if not file:
                # Return None for each output component
                return [None, None, None, None, None, None, None, None]

            try:
                if hasattr(file, 'name'):
                    pdf_file_path = file.name
                    original_file_name = os.path.basename(file.name)
                else:
                    # Save the file to a temporary location
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                        temp_pdf.write(file.read())
                        pdf_file_path = temp_pdf.name
                    original_file_name = 'uploaded_document'

                # Sanitize the file name for use in output file names
                sanitized_file_name = os.path.splitext(original_file_name)[0]
                sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)

                # Determine the page range based on extraction_mode
                if extraction_mode == "All Pages":
                    selected_start = None
                    selected_end = None
                else:
                    selected_start = int(start_page)
                    selected_end = int(end_page)

                # Extract text and create DataFrames
                _, sentence_df = extract_text_with_py_pdf_loader(
                    pdf_file_path,
                    start_page=selected_start,
                    end_page=selected_end
                )

                # Predict SDGs at sentence level
                df_sentence_predictions = predict_sentences(sentence_df)

                # Generate plots with icon overlay
                first_plot = plot_sdg(
                    df_sentence_predictions, "📊 Primary SDGs", 'pred1'
                )
                second_plot = plot_sdg(
                    df_sentence_predictions, "📈 Secondary SDGs", 'pred2'
                )

                # Define output file names; the Primary and Secondary tabs share
                # the same CSV and DOCX report, so each is generated once
                sentence_csv_file = f"{sanitized_file_name}_SDG-Sentence_predictions.csv"
                sentence_report_file = f"{sanitized_file_name}_SDG-Sentence_report.docx"
                primary_sentence_jpeg = f"{sanitized_file_name}_SDG-Sentence_primary_graph.jpeg"
                secondary_sentence_jpeg = f"{sanitized_file_name}_SDG-Sentence_secondary_graph.jpeg"

                # Save CSV and report
                df_sentence_predictions.to_csv(sentence_csv_file, index=False)
                generate_sentence_report(df_sentence_predictions, sentence_report_file)

                # Save figures as JPEG
                save_figure_as_jpeg(first_plot, primary_sentence_jpeg)
                save_figure_as_jpeg(second_plot, secondary_sentence_jpeg)

                return (
                    first_plot, second_plot,
                    sentence_csv_file, sentence_report_file, primary_sentence_jpeg,
                    sentence_csv_file, sentence_report_file, secondary_sentence_jpeg
                )

            except Exception as e:
                print(f"Error: {e}")
                return [None, None, None, None, None, None, None, None]

        # Reset functions to clear the outputs
        def reset_page_outputs():
            return [None, None, None, None, None, None, None, None]

        def reset_sentence_outputs():
            return [None, None, None, None, None, None, None, None]

        # Button actions for Page-Level Analysis
        page_button.click(
            process_pages,
            inputs=[file_input, extraction_mode, start_page, end_page],
            outputs=[
                primary_page_plot,           # 📊 Primary SDGs [Page-Level]
                secondary_page_plot,         # 📈 Secondary SDGs [Page-Level]
                page_csv,                    # 📊 Download Page Predictions CSV
                page_docx,                   # 📄 Download Page Report DOCX
                page_jpeg1,                  # 🖼️ Download Primary SDGs JPEG
                page_csv_secondary,          # 📊 Download Page Predictions CSV
                page_report_file_secondary,  # 📄 Download Page Report DOCX
                secondary_page_jpeg          # 🖼️ Download Secondary SDGs JPEG
            ]
        )

        reset_page_button.click(
            reset_page_outputs,
            outputs=[
                primary_page_plot,
                secondary_page_plot,
                page_csv,
                page_docx,
                page_jpeg1,
                page_csv_secondary,
                page_report_file_secondary,
                secondary_page_jpeg
            ]
        )

        # Button actions for Sentence-Level Analysis
        sentence_button.click(
            process_sentences,
            inputs=[file_input, extraction_mode, start_page, end_page],
            outputs=[
                primary_sentence_plot,           # 📊 Primary SDGs [Sentence-Level]
                secondary_sentence_plot,         # 📈 Secondary SDGs [Sentence-Level]
                sentence_csv,                    # 📊 Download Sentence Predictions CSV
                sentence_docx,                   # 📄 Download Sentence Report DOCX
                sentence_jpeg1,                  # 🖼️ Download Primary SDGs JPEG
                sentence_csv_secondary,          # 📊 Download Sentence Predictions CSV
                sentence_report_file_secondary,  # 📄 Download Sentence Report DOCX
                secondary_sentence_jpeg          # 🖼️ Download Secondary SDGs JPEG
            ]
        )

        reset_sentence_button.click(
            reset_sentence_outputs,
            outputs=[
                primary_sentence_plot,
                secondary_sentence_plot,
                sentence_csv,
                sentence_docx,
                sentence_jpeg1,
                sentence_csv_secondary,
                sentence_report_file_secondary,
                secondary_sentence_jpeg
            ]
        )

    demo.queue().launch()

launch_interface()