Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,848 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
import torch
|
5 |
+
import pandas as pd
|
6 |
+
import plotly.express as px
|
7 |
+
import plotly.io as pio
|
8 |
+
import nltk
|
9 |
+
import tempfile
|
10 |
+
from io import BytesIO
|
11 |
+
import base64
|
12 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
13 |
+
from nltk.tokenize import sent_tokenize
|
14 |
+
from docx.shared import Inches
|
15 |
+
from docx import Document
|
16 |
+
import numpy as np
|
17 |
+
# Needed for HF GPU access
|
18 |
+
import spaces
|
19 |
+
|
20 |
+
from styles import custom_css # Importing custom CSS
|
21 |
+
|
22 |
+
# Executed at import time: requests the 'punkt' resource from NLTK before any sent_tokenize call runs.
# NOTE(review): presumably newer NLTK releases expect 'punkt_tab' instead - confirm against the installed version.
nltk.download('punkt')
|
23 |
+
|
24 |
+
# Import PyPDFLoader for PDF processing
|
25 |
+
from langchain_community.document_loaders import PyPDFLoader
|
26 |
+
|
27 |
+
# Model checkpoint for SDG BERT
|
28 |
+
# Identifier handed to from_pretrained() for both the classifier and its tokenizer.
checkpoint = "sadickam/sdgBERT"
|
29 |
+
|
30 |
+
# Text cleaning function
|
31 |
+
def clean_text(text):
    """
    Remove characters that are irrelevant for classification while keeping currency symbols.

    Args:
        text (str): Raw text, for example the content extracted from a PDF page.

    Returns:
        str: The text with leading/trailing whitespace stripped, every character outside
        the allow-list removed, and each run of whitespace collapsed to a single space.
    """
    text = text.strip()
    # Allow-list: ASCII letters and digits, whitespace, basic punctuation, and the
    # dollar, euro, pound, yen, rupee, cent and won signs. The non-ASCII symbols are
    # spelled as \u escapes (understood by the re module) so the pattern cannot be
    # corrupted by a mis-decoded source file; the previous literal had turned into
    # mojibake, which stripped the real symbols and let unrelated characters through.
    disallowed_chars = r'[^a-zA-Z0-9\s.,!?$\u20ac\u00a3\u00a5\u20b9\u00a2\u20a9]'
    text = re.sub(disallowed_chars, '', text)
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs, including those left by removals
    return text
|
41 |
+
|
42 |
+
# Preprocessing function for text
|
43 |
+
def prep_text(text):
    """
    Normalise text before it is handed to the tokenizer.

    The input is converted to ``str``, split into sentences, and every sentence is
    lower-cased with its words re-joined by single spaces. Backticks and double
    quotes are removed from the final string.

    Args:
        text: Text to normalise (any object; it is passed through ``str``).

    Returns:
        str: The normalised text.
    """
    normalised_sentences = (
        ' '.join(str(word).strip().lower() for word in sentence.split())
        for sentence in sent_tokenize(str(text))
    )
    flattened = ' '.join(normalised_sentences).strip()
    return re.sub(r'`|"', "", flattened)
|
51 |
+
|
52 |
+
# Load the tokenizer and model with GPU support
|
53 |
+
def load_model_and_tokenizer():
    """
    Load the sequence-classification model and tokenizer for `checkpoint`.

    Returns:
        tuple: ``(model, tokenizer)``; the model has been moved to the module-level `device`.
    """
    sdg_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    sdg_model = sdg_model.to(device)
    sdg_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return sdg_model, sdg_tokenizer
|
57 |
+
|
58 |
+
# Define device (ensure usage of GPU if available in Hugging Face Spaces)
|
59 |
+
# Chosen once at import time: CUDA when torch reports it as available, otherwise CPU.
# The model and the tokenized inputs are both moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
60 |
+
|
61 |
+
# SDG labels
|
62 |
+
# Class labels, zipped position-by-position with the model's softmax output.
label_list = [
    'SDG1_No Poverty',
    'SDG2_Zero Hunger',
    'SDG3_Good Health and Well-being',
    'SDG4_Quality Education',
    'SDG5_Gender Equality',
    'SDG6_Clean Water and Sanitation',
    'SDG7_Affordable and Clean Energy',
    'SDG8_Decent Work and Economic Growth',
    'SDG9_Industry, Innovation and Infrastructure',
    'SDG10_Reduced Inequality',
    'SDG11_Sustainable Cities and Communities',
    'SDG12_Responsible Consumption and Production',
    'SDG13_Climate Action',
    'SDG14_Life Below Water',
    'SDG15_Life on Land',
    'SDG16_Peace, Justice and Strong Institutions',
]
|
70 |
+
|
71 |
+
# Function to predict SDGs for a batch of text inputs
|
72 |
+
def predict_sdg_labels_batch(texts, model, tokenizer):
    """
    Score a batch of texts against the SDG classes.

    Args:
        texts (list[str]): Texts to classify.
        model: Classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer callable that returns PyTorch tensors for the model.

    Returns:
        list[list[float]]: For each input text, the softmax probabilities over the classes.
    """
    # Tokenize with padding and truncation (max_length=512), then move the batch to `device`.
    encoded_batch = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    ).to(device)

    model.eval()
    with torch.no_grad():
        logits = model(**encoded_batch).logits

    return torch.softmax(logits, dim=1).tolist()
|
79 |
+
|
80 |
+
# Page-level predictions with batch processing
|
81 |
+
def predict_pages(page_df, batch_size=32):
    """
    Classify every page and attach the ranked SDG predictions.

    Args:
        page_df (pd.DataFrame): Frame with a 'Text' column, one page per row.
        batch_size (int): Number of rows sent to the model per call.

    Returns:
        pd.DataFrame: Copy of `page_df` followed by the columns pred1, score1, ...,
        pred16, score16, where rank 1 is the highest-scoring label for the row.
    """
    model, tokenizer = load_model_and_tokenizer()
    results = page_df.copy()
    total_rows = len(page_df)

    # labels_by_rank[k] / scores_by_rank[k] collect, row by row, the (k+1)-th best label and its score.
    labels_by_rank = [[] for _ in range(16)]
    scores_by_rank = [[] for _ in range(16)]

    for offset in range(0, total_rows, batch_size):
        batch = page_df.iloc[offset:min(offset + batch_size, total_rows)]
        # Clean text
        batch_texts = batch['Text'].apply(clean_text).apply(prep_text).tolist()
        for probabilities in predict_sdg_labels_batch(batch_texts, model, tokenizer):
            ranking = sorted(zip(label_list, probabilities), key=lambda pair: pair[1], reverse=True)
            for rank, (label, score) in enumerate(ranking):
                labels_by_rank[rank].append(label)
                scores_by_rank[rank].append(score)

    # Attach the columns in interleaved order (pred1, score1, pred2, score2, ...).
    prediction_columns = []
    for rank in range(16):
        pred_name = f'pred{rank + 1}'
        score_name = f'score{rank + 1}'
        results[pred_name] = labels_by_rank[rank]
        results[score_name] = scores_by_rank[rank]
        prediction_columns.extend((pred_name, score_name))

    # Input columns stay first; prediction columns always come last, even if the
    # input frame already carried columns with the same names.
    passthrough_columns = [col for col in results.columns if col not in prediction_columns]
    return results[passthrough_columns + prediction_columns]
|
114 |
+
|
115 |
+
# Sentence-level predictions with batch processing
|
116 |
+
def predict_sentences(sentence_df, batch_size=32):
    """
    Classify every sentence and attach the ranked SDG predictions.

    Args:
        sentence_df (pd.DataFrame): Frame with a 'Sentence' column, one sentence per row.
        batch_size (int): Number of rows sent to the model per call.

    Returns:
        pd.DataFrame: Copy of `sentence_df` followed by the columns pred1, score1, ...,
        pred16, score16, where rank 1 is the highest-scoring label for the row.
        Scores are rounded to three decimals.
    """
    model, tokenizer = load_model_and_tokenizer()
    results = sentence_df.copy()
    total_rows = len(sentence_df)

    # labels_by_rank[k] / scores_by_rank[k] collect, row by row, the (k+1)-th best label and its score.
    labels_by_rank = [[] for _ in range(16)]
    scores_by_rank = [[] for _ in range(16)]

    for offset in range(0, total_rows, batch_size):
        batch = sentence_df.iloc[offset:min(offset + batch_size, total_rows)]
        # Clean text
        batch_texts = batch['Sentence'].apply(clean_text).apply(prep_text).tolist()
        for probabilities in predict_sdg_labels_batch(batch_texts, model, tokenizer):
            ranking = sorted(zip(label_list, probabilities), key=lambda pair: pair[1], reverse=True)
            for rank, (label, score) in enumerate(ranking):
                labels_by_rank[rank].append(label)
                scores_by_rank[rank].append(round(score, 3))

    # Attach the columns in interleaved order (pred1, score1, pred2, score2, ...).
    prediction_columns = []
    for rank in range(16):
        pred_name = f'pred{rank + 1}'
        score_name = f'score{rank + 1}'
        results[pred_name] = labels_by_rank[rank]
        results[score_name] = scores_by_rank[rank]
        prediction_columns.extend((pred_name, score_name))

    # Input columns stay first; prediction columns always come last.
    passthrough_columns = [col for col in results.columns if col not in prediction_columns]
    return results[passthrough_columns + prediction_columns]
|
150 |
+
|
151 |
+
# Define unique colors for each SDG
|
152 |
+
# Bar colour for each SDG label; passed to Plotly as the discrete colour map.
sdg_colors = {
    'SDG1_No Poverty': '#E5243B',
    'SDG2_Zero Hunger': '#DDA63A',
    'SDG3_Good Health and Well-being': '#4C9F38',
    'SDG4_Quality Education': '#C5192D',
    'SDG5_Gender Equality': '#FF3A21',
    'SDG6_Clean Water and Sanitation': '#26BDE2',
    'SDG7_Affordable and Clean Energy': '#FCC30B',
    'SDG8_Decent Work and Economic Growth': '#A21942',
    'SDG9_Industry, Innovation and Infrastructure': '#FD6925',
    'SDG10_Reduced Inequality': '#DD1367',
    'SDG11_Sustainable Cities and Communities': '#FD9D24',
    'SDG12_Responsible Consumption and Production': '#BF8B2E',
    'SDG13_Climate Action': '#3F7E44',
    'SDG14_Life Below Water': '#0A97D9',
    'SDG15_Life on Land': '#56C02B',
    'SDG16_Peace, Justice and Strong Institutions': '#00689D',
}
|
170 |
+
|
171 |
+
# Function to plot SDG dominant bar graphs using Plotly
|
172 |
+
# Function to plot SDG dominant bar graphs using Plotly
|
173 |
+
def plot_sdg(df, title, pred_column, icons_folder='assets/icons/'):
    """
    Draw a horizontal bar chart of SDG shares and overlay the icon of the most frequent SDG.

    Args:
        df (pd.DataFrame): DataFrame containing SDG predictions.
        title (str): Title of the plot.
        pred_column (str): Prediction column to count (e.g., 'pred1').
        icons_folder (str): Folder holding the icons, looked up as '<SDGn>.png'.

    Returns:
        plotly.graph_objs._figure.Figure: The Plotly figure object.
    """
    # Share of rows per label, largest first; rows without a prediction are ignored.
    present = df[df[pred_column].notna()]
    counts = present[pred_column].value_counts().sort_values(ascending=False)
    percentages = (counts / counts.sum()) * 100
    chart_data = percentages.rename_axis('SDG Label').reset_index(name='Percentage')

    fig = px.bar(
        chart_data,
        y='SDG Label',
        x='Percentage',
        orientation='h',
        title=title,
        color='SDG Label',
        color_discrete_map=sdg_colors,  # One fixed colour per SDG
    )

    # Keep the SDG names visible on the y-axis.
    fig.update_yaxes(showticklabels=True)

    # Print the percentage on each bar.
    fig.update_traces(
        texttemplate='%{x:.2f}%',
        textposition='auto',
        textfont={'size': 10},
    )

    fig.update_layout(
        title={'text': title, 'font': {'size': 14}},
        yaxis={'automargin': True, 'title': None, 'tickfont': {'size': 12}},
        margin={'l': 20, 'r': 30, 't': 100, 'b': 20},  # Tall top margin leaves room for the icon
        height=600,
        showlegend=False,
        template="simple_white",
        xaxis={'tickfont': {'size': 12}},
    )

    # Nothing to highlight when there are no predictions.
    if percentages.empty:
        return fig

    # The label prefix before '_' (e.g. 'SDG1') doubles as the icon file stem.
    top_sdg_label = percentages.index[0]
    sdg_number = top_sdg_label.split('_')[0]
    icon_path = os.path.join(icons_folder, f"{sdg_number}.png")

    if not os.path.exists(icon_path):
        print(f"Icon file '{icon_path}' not found. Skipping icon overlay.")
        return fig

    # Embed the icon as a base64 data URI so the figure carries the image itself.
    with open(icon_path, 'rb') as icon_file:
        encoded_icon = base64.b64encode(icon_file.read()).decode('utf-8')

    fig.add_layout_image(
        {
            'source': 'data:image/png;base64,' + encoded_icon,
            'xref': "paper",
            'yref': "paper",
            'x': 0.4,  # Paper coordinates: y > 1 places the icon above the plotting area
            'y': 1.2,
            'sizex': 0.2,
            'sizey': 0.2,
            'xanchor': "left",
            'yanchor': "top",
            'layer': "above",  # Draw on top of other plot elements
        }
    )
    return fig
|
264 |
+
|
265 |
+
def save_figure_as_jpeg(fig, filename):
    """Write `fig` to `filename` as a JPEG at 1200x600 with a scale factor of 6."""
    export_options = {'format': 'jpeg', 'width': 1200, 'height': 600, 'scale': 6}
    pio.write_image(fig, filename, **export_options)
|
268 |
+
|
269 |
+
# Generate reports (page and sentence levels)
|
270 |
+
def generate_page_report(df_pages, report_file_name):
    """
    Build the page-level Word report and save it.

    The report opens with three explanatory sections, then adds, for every distinct
    value of the 'Document' column, a heading plus the Primary ('pred1') and
    Secondary ('pred2') bar charts.

    Args:
        df_pages (pd.DataFrame): Page predictions with 'Document', 'pred1' and 'pred2' columns.
        report_file_name (str): Path the .docx file is saved to.

    Returns:
        str: `report_file_name`, unchanged.
    """
    report = Document()
    report.add_heading("Page-Level SDG Analysis Report", 0)

    # (heading text, heading level, paragraph text) for the introductory sections.
    intro_sections = (
        (
            "π General Notes",
            2,
            'This app conducts page-level analysis of the uploaded document. Each page is processed by the sdgBERT AI model trained to predict the first 16 '
            'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
            'representing the likelihood that the text is aligned with particular SDGs. This page-level '
            'analysis provides high-level insight into SDG alignment.'
            '\n\n'
            'Given that a page may align with more than one SDG, this app focuses on the top two SDG predictions '
            '(Primary and Secondary) for each page with a probability score greater than zero.',
        ),
        (
            "Primary SDGs Bar Graph",
            3,
            'This graph displays the most essential SDG the AI model associates with pages. The bars '
            'represent the percentage of pages most strongly aligned with each SDG. This offers insight into the dominant '
            'sustainable development theme within the document.',
        ),
        (
            "Secondary SDGs Bar Graph",
            3,
            'This graph shows the second most relevant SDGs for pages. Although these SDGs are '
            'not the primary focus, the text has some relevance to these goals.',
        ),
    )
    for heading_text, heading_level, paragraph_text in intro_sections:
        report.add_heading(heading_text, level=heading_level)
        report.add_paragraph(paragraph_text)

    for doc_name in df_pages['Document'].unique():
        # File-name-safe stem: extension dropped, anything but word characters and '-' becomes '_'.
        file_stem = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])

        report.add_heading(f"π Document: {doc_name}", level=2)
        doc_rows = df_pages[df_pages['Document'] == doc_name]

        charts = (
            ("Primary SDGs", 'pred1', f"{file_stem}_first_sdg_page.jpeg"),
            ("Secondary SDGs", 'pred2', f"{file_stem}_second_sdg_page.jpeg"),
        )

        # Render both charts to JPEG first, then embed them in the same order.
        # NOTE(review): the JPEGs are written to relative paths and are not removed here.
        for chart_title, column, image_path in charts:
            plot_sdg(doc_rows, chart_title, column).write_image(
                image_path, format='jpeg', scale=7, engine="kaleido")
        for _, _, image_path in charts:
            report.add_picture(image_path, width=Inches(6))

    report.save(report_file_name)
    return report_file_name
|
320 |
+
|
321 |
+
def generate_sentence_report(df_sentences, report_file_name):
    """
    Build the sentence-level Word report and save it.

    The report opens with three explanatory sections, then adds, for every distinct
    value of the 'Document' column, a heading plus the Primary ('pred1') and
    Secondary ('pred2') bar charts.

    Args:
        df_sentences (pd.DataFrame): Sentence predictions with 'Document', 'pred1' and 'pred2' columns.
        report_file_name (str): Path the .docx file is saved to.

    Returns:
        str: `report_file_name`, unchanged.
    """
    report = Document()
    report.add_heading("Sentence-Level SDG Analysis Report", 0)

    # (heading text, heading level, paragraph text) for the introductory sections.
    intro_sections = (
        (
            "π General Notes",
            2,
            'This app splits documents into sentences using a natural language processing algorithm. '
            'Each sentence is processed by the sdgBERT AI model trained to predict the first 16 '
            'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
            'representing the likelihood that the text is aligned with particular SDGs. This sentence-level '
            'analysis provides deeper insight into SDG alignment.'
            '\n\n'
            'Given that a sentence may align with more than one SDG, this app focuses on the top two SDG predictions '
            '(Primary and Secondary) for each sentence with a probability score greater than zero.',
        ),
        (
            "Primary SDGs Bar Graph",
            3,
            'This graph displays the most essential SDG the AI model associates with sentences. The bars '
            'represent the percentage of sentences most strongly aligned with each SDG. This offers more profound insight '
            'into the dominant sustainable development theme within the document.',
        ),
        (
            "Secondary SDGs Bar Graph",
            3,
            'This graph shows the second most relevant SDGs for sentences. Although these SDGs are not '
            'the primary focus, the text has some relevance to these goals.',
        ),
    )
    for heading_text, heading_level, paragraph_text in intro_sections:
        report.add_heading(heading_text, level=heading_level)
        report.add_paragraph(paragraph_text)

    for doc_name in df_sentences['Document'].unique():
        # File-name-safe stem: extension dropped, anything but word characters and '-' becomes '_'.
        file_stem = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])

        report.add_heading(f"π Document: {doc_name}", level=2)
        doc_rows = df_sentences[df_sentences['Document'] == doc_name]

        charts = (
            ("Primary SDGs", 'pred1', f"{file_stem}_first_sdg_sentence.jpeg"),
            ("Secondary SDGs", 'pred2', f"{file_stem}_second_sdg_sentence.jpeg"),
        )

        # Render both charts to JPEG first, then embed them in the same order.
        # NOTE(review): the JPEGs are written to relative paths and are not removed here.
        for chart_title, column, image_path in charts:
            plot_sdg(doc_rows, chart_title, column).write_image(
                image_path, format='jpeg', scale=7, engine="kaleido")
        for _, _, image_path in charts:
            report.add_picture(image_path, width=Inches(6))

    report.save(report_file_name)
    return report_file_name
|
372 |
+
|
373 |
+
# New text extraction functions with text cleaning and line joining
|
374 |
+
def _resolve_page_range(start_page, end_page, total_pages):
    """Return an ordered 1-based (first, last) page pair clamped to the document's pages."""
    first, last = sorted((int(start_page), int(end_page)))
    upper = max(total_pages, 1)  # Keeps the pair valid (1, 1) even for a document with no pages
    first = min(max(first, 1), upper)
    last = min(max(last, 1), upper)
    return first, last


def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
    """
    Extract text from a PDF page by page using LangChain's PyPDFLoader.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        start_page (int, optional): The starting page number for extraction (1-based index).
        end_page (int, optional): The ending page number for extraction (1-based index).
            The range is only applied when both bounds are given; they may be passed in
            either order and are clamped to the pages that exist.

    Returns:
        tuple:
            - page_df (pd.DataFrame): Columns Document, Page, Text (one row per page).
            - sentence_df (pd.DataFrame): Columns Document, Page, Sentence (sentences
              longer than 70 characters only).
            Both frames carry their columns even when they have no rows.

    Raises:
        RuntimeError: If loading or processing the PDF fails; the underlying
            exception is chained as ``__cause__``.
    """
    try:
        loader = PyPDFLoader(pdf_file_path)
        # load() returns one Document per PDF page. load_and_split() additionally runs
        # a text splitter, so a long page can come back as several chunks and the
        # enumerate-based page numbers below would no longer match the PDF.
        documents = loader.load()

        total_pages = len(documents)
        doc_name = os.path.basename(pdf_file_path)  # Extract document name

        if start_page is not None and end_page is not None:
            # Order and clamp both bounds so a zero/negative or out-of-range value
            # can never turn into a negative slice index.
            first_page, last_page = _resolve_page_range(start_page, end_page, total_pages)
            selected_docs = documents[first_page - 1:last_page]
        else:
            selected_docs = documents
            first_page = 1

        page_data = []
        sentence_data = []

        for page_num, doc in enumerate(selected_docs, start=first_page):
            text = doc.page_content.strip()

            # Join lines that belong to the same sentence, skipping blank lines
            joined_text = ' '.join(line.strip() for line in text.split('\n') if line.strip())

            cleaned_text = clean_text(joined_text)

            page_data.append({
                "Document": doc_name,
                "Page": page_num,
                "Text": cleaned_text
            })

            # Keep only sentences longer than 70 characters
            for sentence in sent_tokenize(cleaned_text):
                sentence = sentence.strip()
                if len(sentence) > 70:
                    sentence_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Sentence": sentence
                    })

        # Explicit columns keep the schema stable when nothing was extracted, so
        # column lookups on an empty result do not fail with KeyError.
        page_df = pd.DataFrame(page_data, columns=["Document", "Page", "Text"])
        sentence_df = pd.DataFrame(sentence_data, columns=["Document", "Page", "Sentence"])

        return page_df, sentence_df

    except Exception as e:
        raise RuntimeError(f"Error during PDF extraction: {e}") from e
|
456 |
+
|
457 |
+
def df_to_csv_bytes(df):
    """
    Convert DataFrame to CSV in bytes.

    Args:
        df (pd.DataFrame): The DataFrame to convert.

    Returns:
        bytes: CSV data in bytes, written without the index column.

    Raises:
        RuntimeError: If the conversion fails; the underlying exception is chained
            as ``__cause__``.
    """
    try:
        # The context manager closes the buffer even when to_csv raises.
        with BytesIO() as buffer:
            df.to_csv(buffer, index=False)
            return buffer.getvalue()
    except Exception as e:
        raise RuntimeError(f"Error during CSV conversion: {e}") from e
|
473 |
+
|
474 |
+
def launch_interface():
|
475 |
+
with gr.Blocks(css=custom_css) as demo:
|
476 |
+
|
477 |
+
# Title as a visible heading at the top of the page with an icon
|
478 |
+
gr.Markdown(
|
479 |
+
"""
|
480 |
+
# π SDG Document Analysis App
|
481 |
+
Analyze documents to map Sustainable Development Goals (SDGs) at both page and sentence levels.
|
482 |
+
"""
|
483 |
+
)
|
484 |
+
|
485 |
+
# Shared PDF file input for both analyses
|
486 |
+
gr.Markdown("## Upload PDF File")
|
487 |
+
with gr.Row():
|
488 |
+
file_input = gr.File(
|
489 |
+
label="π Upload PDF File for Analysis", file_types=[".pdf"]
|
490 |
+
)
|
491 |
+
|
492 |
+
# Extraction mode selection with explanatory text
|
493 |
+
gr.Markdown(
|
494 |
+
"""
|
495 |
+
## PDF Text Extraction Mode
|
496 |
+
Choose whether to analyze all pages or a specific range of pages. If you want to exclude certain pages from the analysis, select
|
497 |
+
"Range of Pages" and specify the start and end pages.
|
498 |
+
"""
|
499 |
+
)
|
500 |
+
with gr.Row():
|
501 |
+
extraction_mode = gr.Radio(
|
502 |
+
choices=["All Pages", "Range of Pages"],
|
503 |
+
value="All Pages",
|
504 |
+
label="Extraction Mode"
|
505 |
+
)
|
506 |
+
|
507 |
+
with gr.Row():
|
508 |
+
start_page = gr.Number(value=1, label="π’ Start Page", visible=False, info="The cover page is page 1")
|
509 |
+
end_page = gr.Number(value=1, label="π’ End Page", visible=False)
|
510 |
+
|
511 |
+
# Function to update visibility of start_page and end_page
|
512 |
+
def update_page_inputs(extraction_mode):
|
513 |
+
if extraction_mode == "Range of Pages":
|
514 |
+
return gr.update(visible=True), gr.update(visible=True)
|
515 |
+
else:
|
516 |
+
return gr.update(visible=False), gr.update(visible=False)
|
517 |
+
|
518 |
+
extraction_mode.change(
|
519 |
+
update_page_inputs,
|
520 |
+
inputs=extraction_mode,
|
521 |
+
outputs=[start_page, end_page]
|
522 |
+
)
|
523 |
+
|
524 |
+
# Main Tabs for Page-Level and Sentence-Level Analysis
|
525 |
+
gr.Markdown("## SDG Analysis Type")
|
526 |
+
|
527 |
+
with gr.Tab("π Page-Level Analysis"):
|
528 |
+
gr.Markdown(
|
529 |
+
"""
|
530 |
+
### Page-Level SDG Analysis
|
531 |
+
This section conducts Sustainable Development Goals (SDG) mapping
|
532 |
+
of documents using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
|
533 |
+
It provides **high-level SDG mapping** of documents at the page level.
|
534 |
+
"""
|
535 |
+
)
|
536 |
+
|
537 |
+
with gr.Row():
|
538 |
+
page_button = gr.Button("πββοΈ Run Page-Level Analysis")
|
539 |
+
reset_page_button = gr.Button("π Reset Page-Level Analysis", elem_classes="reset-button")
|
540 |
+
|
541 |
+
# Nested Tabs for Primary and Secondary SDGs
|
542 |
+
with gr.Tabs():
|
543 |
+
with gr.TabItem("π Primary SDGs"):
|
544 |
+
with gr.Row():
|
545 |
+
primary_page_plot = gr.Plot(label="π Primary SDGs Graph [Page-Level]", scale=2)
|
546 |
+
gr.Markdown(
|
547 |
+
"When the analysis is done, the Primary SDGs bar graph on the left will show "+
|
548 |
+
"the percentage of pages that strongly align with each SDG. The icon for the most frequent "+
|
549 |
+
"SDG will be highlighted above the graph. Download the Page Predictions CVS for further details.",
|
550 |
+
label = "Note", container=True
|
551 |
+
)
|
552 |
+
|
553 |
+
gr.Markdown("##### Download Results")
|
554 |
+
with gr.Row():
|
555 |
+
page_csv = gr.File(label="π Download Page Predictions CSV")
|
556 |
+
page_docx = gr.File(label="π Download Page Report DOCX")
|
557 |
+
page_jpeg1 = gr.File(label="πΌοΈ Download Primary SDGs JPEG")
|
558 |
+
|
559 |
+
with gr.TabItem("π Secondary SDGs"):
|
560 |
+
with gr.Row():
|
561 |
+
secondary_page_plot = gr.Plot(label="π Secondary SDGs Graph [Page-Level]", scale=2)
|
562 |
+
gr.Markdown(
|
563 |
+
"When the analysis is done, the Secondary SDGs bar graph on the left will show "+
|
564 |
+
"SDGs that are not the primary focus of the pages analysed. These SDGs are second to the "+
|
565 |
+
"Primary SDGs. Download the Sentence Predictions CVS for further details",
|
566 |
+
label = "Note", container=True
|
567 |
+
)
|
568 |
+
|
569 |
+
gr.Markdown("##### Download Results")
|
570 |
+
with gr.Row():
|
571 |
+
page_csv_secondary = gr.File(label="π Download Page Predictions CSV")
|
572 |
+
page_report_file_secondary = gr.File(label="π Download Page Report DOCX")
|
573 |
+
secondary_page_jpeg = gr.File(label="πΌοΈ Download Secondary SDGs JPEG")
|
574 |
+
|
575 |
+
with gr.Tab("βοΈ Sentence-Level Analysis"):
|
576 |
+
gr.Markdown(
|
577 |
+
"""
|
578 |
+
### Sentence-Level SDG Analysis
|
579 |
+
This section conducts Sustainable Development Goals (SDG) mapping
|
580 |
+
using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
|
581 |
+
It provides **detailed SDG mapping** at the sentence level.
|
582 |
+
"""
|
583 |
+
)
|
584 |
+
|
585 |
+
with gr.Row():
|
586 |
+
sentence_button = gr.Button("πββοΈ Run Sentence-Level Analysis")
|
587 |
+
reset_sentence_button = gr.Button("π Reset Sentence-Level Analysis", elem_classes="reset-button")
|
588 |
+
|
589 |
+
# Nested Tabs for Primary and Secondary SDGs
|
590 |
+
with gr.Tabs():
|
591 |
+
with gr.TabItem("π Primary SDGs"):
|
592 |
+
with gr.Row():
|
593 |
+
primary_sentence_plot = gr.Plot(label="π Primary SDGs Graph [Sentence-Level]", scale=2)
|
594 |
+
gr.Markdown(
|
595 |
+
"When the analysis is done, the Primary SDGs bar graph on the left will show "+
|
596 |
+
"the percentage of sentences that strongly align with each SDG. The icon for the most frequent "+
|
597 |
+
"SDG will be highlighted above the graph. Download the Sentence Predictions CVS for further details.",
|
598 |
+
label = "Note", container=True
|
599 |
+
)
|
600 |
+
|
601 |
+
gr.Markdown("##### Download Results")
|
602 |
+
with gr.Row():
|
603 |
+
sentence_csv = gr.File(label="π Download Sentence Predictions CSV")
|
604 |
+
sentence_docx = gr.File(label="π Download Sentence Report DOCX")
|
605 |
+
sentence_jpeg1 = gr.File(label="πΌοΈ Download Primary SDGs JPEG")
|
606 |
+
|
607 |
+
with gr.TabItem("π Secondary SDGs"):
|
608 |
+
with gr.Row():
|
609 |
+
secondary_sentence_plot = gr.Plot(label="π Secondary SDGs Graph [Sentence-Level]", scale=2)
|
610 |
+
gr.Markdown(
|
611 |
+
"When the analysis is done, the Secondary SDGs bar graph on the left will show "+
|
612 |
+
"SDGs that are not the primary focus of the sentences analysed. These SDGs are second to the "+
|
613 |
+
"Primary SDGs. Download the Sentence Predictions CVS for further details",
|
614 |
+
label = "Note", container=True
|
615 |
+
)
|
616 |
+
|
617 |
+
gr.Markdown("##### Download Results")
|
618 |
+
with gr.Row():
|
619 |
+
sentence_csv_secondary = gr.File(label="π Download Sentence Predictions CSV")
|
620 |
+
sentence_report_file_secondary = gr.File(label="π Download Sentence Report DOCX")
|
621 |
+
secondary_sentence_jpeg = gr.File(label="πΌοΈ Download Secondary SDGs JPEG")
|
622 |
+
|
623 |
+
# Function to process page-level analysis
|
624 |
+
@spaces.GPU
def process_pages(file, extraction_mode, start_page, end_page):
    """Run the page-level SDG analysis for an uploaded PDF.

    Args:
        file: The uploaded PDF. Either an object exposing a ``name``
            attribute that holds its path on disk, or a readable
            file-like object whose bytes are copied to a temporary PDF.
        extraction_mode: ``"All Pages"`` analyses the whole document; any
            other value restricts extraction to ``start_page``/``end_page``.
        start_page: Start of the page range, converted with ``int``.
            Ignored in "All Pages" mode.
        end_page: End of the page range, converted with ``int``.
            Ignored in "All Pages" mode.

    Returns:
        Eight values, in the order of the page-level output components:
        primary plot, secondary plot, CSV path, DOCX path, primary JPEG
        path, then CSV path, DOCX path and secondary JPEG path for the
        secondary tab. A list of eight ``None`` values is returned when no
        file is supplied or when any processing step raises.
    """
    if not file:
        # One None per output component clears the whole panel.
        return [None] * 8

    # Path of the temporary copy we create ourselves (if any); it must be
    # removed once processing is over, unlike a path owned by the caller.
    temp_pdf_path = None

    try:
        if hasattr(file, 'name'):
            pdf_file_path = file.name
            original_file_name = os.path.basename(file.name)
        else:
            # No path available: persist the stream so the loader can open it.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                temp_pdf_path = temp_pdf.name
                temp_pdf.write(file.read())
            pdf_file_path = temp_pdf_path
            original_file_name = 'uploaded_document'

        # Drop the extension and replace anything that is not a word
        # character or hyphen, so the stem is safe inside output file names.
        sanitized_file_name = os.path.splitext(original_file_name)[0]
        sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)

        # None for both bounds asks the loader for the whole document.
        if extraction_mode == "All Pages":
            selected_start = None
            selected_end = None
        else:
            selected_start = int(start_page)
            selected_end = int(end_page)

        # Only the page-level DataFrame is needed here.
        page_df, _ = extract_text_with_py_pdf_loader(
            pdf_file_path,
            start_page=selected_start,
            end_page=selected_end
        )

        df_page_predictions = predict_pages(page_df)

        first_plot = plot_sdg(
            df_page_predictions, "π Primary SDGs", 'pred1'
        )
        second_plot = plot_sdg(
            df_page_predictions, "π Secondary SDGs", 'pred2'
        )

        # The CSV and DOCX are shared by the primary and secondary tabs
        # (both tabs always used the very same file names), so each is
        # written once and its path is returned twice.
        page_csv_file = f"{sanitized_file_name}_SDG-Page_predictions.csv"
        page_report_file = f"{sanitized_file_name}_SDG-Page_report.docx"
        primary_page_jpeg = f"{sanitized_file_name}_SDG-Page_primary_graph.jpeg"
        secondary_page_jpeg = f"{sanitized_file_name}_SDG-Page_secondary_graph.jpeg"

        df_page_predictions.to_csv(page_csv_file, index=False)
        generate_page_report(df_page_predictions, page_report_file)

        save_figure_as_jpeg(first_plot, primary_page_jpeg)
        save_figure_as_jpeg(second_plot, secondary_page_jpeg)

        return (
            first_plot, second_plot,
            page_csv_file, page_report_file, primary_page_jpeg,
            page_csv_file, page_report_file, secondary_page_jpeg
        )

    except Exception as e:
        # Best effort for the UI: report the problem and clear the outputs
        # instead of propagating the exception to the event handler.
        print(f"Error: {e}")
        return [None] * 8

    finally:
        if temp_pdf_path is not None:
            try:
                os.remove(temp_pdf_path)
            except OSError:
                # Cleanup is best effort; a leftover temp file must not
                # turn a successful analysis into a failure.
                pass
|
700 |
+
|
701 |
+
# Function to process sentence-level analysis
|
702 |
+
@spaces.GPU
def process_sentences(file, extraction_mode, start_page, end_page):
    """Run the sentence-level SDG analysis for an uploaded PDF.

    Args:
        file: The uploaded PDF. Either an object exposing a ``name``
            attribute that holds its path on disk, or a readable
            file-like object whose bytes are copied to a temporary PDF.
        extraction_mode: ``"All Pages"`` analyses the whole document; any
            other value restricts extraction to ``start_page``/``end_page``.
        start_page: Start of the page range, converted with ``int``.
            Ignored in "All Pages" mode.
        end_page: End of the page range, converted with ``int``.
            Ignored in "All Pages" mode.

    Returns:
        Eight values, in the order of the sentence-level output components:
        primary plot, secondary plot, CSV path, DOCX path, primary JPEG
        path, then CSV path, DOCX path and secondary JPEG path for the
        secondary tab. A list of eight ``None`` values is returned when no
        file is supplied or when any processing step raises.
    """
    if not file:
        # One None per output component clears the whole panel.
        return [None] * 8

    # Path of the temporary copy we create ourselves (if any); it must be
    # removed once processing is over, unlike a path owned by the caller.
    temp_pdf_path = None

    try:
        if hasattr(file, 'name'):
            pdf_file_path = file.name
            original_file_name = os.path.basename(file.name)
        else:
            # No path available: persist the stream so the loader can open it.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                temp_pdf_path = temp_pdf.name
                temp_pdf.write(file.read())
            pdf_file_path = temp_pdf_path
            original_file_name = 'uploaded_document'

        # Drop the extension and replace anything that is not a word
        # character or hyphen, so the stem is safe inside output file names.
        sanitized_file_name = os.path.splitext(original_file_name)[0]
        sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)

        # None for both bounds asks the loader for the whole document.
        if extraction_mode == "All Pages":
            selected_start = None
            selected_end = None
        else:
            selected_start = int(start_page)
            selected_end = int(end_page)

        # Only the sentence-level DataFrame is needed here.
        _, sentence_df = extract_text_with_py_pdf_loader(
            pdf_file_path,
            start_page=selected_start,
            end_page=selected_end
        )

        df_sentence_predictions = predict_sentences(sentence_df)

        first_plot = plot_sdg(
            df_sentence_predictions, "π Primary SDGs", 'pred1'
        )
        second_plot = plot_sdg(
            df_sentence_predictions, "π Secondary SDGs", 'pred2'
        )

        # The CSV and DOCX are shared by the primary and secondary tabs
        # (both tabs always used the very same file names), so each is
        # written once and its path is returned twice.
        sentence_csv_file = f"{sanitized_file_name}_SDG-Sentence_predictions.csv"
        sentence_report_file = f"{sanitized_file_name}_SDG-Sentence_report.docx"
        primary_sentence_jpeg = f"{sanitized_file_name}_SDG-Sentence_primary_graph.jpeg"
        secondary_sentence_jpeg = f"{sanitized_file_name}_SDG-Sentence_secondary_graph.jpeg"

        df_sentence_predictions.to_csv(sentence_csv_file, index=False)
        generate_sentence_report(df_sentence_predictions, sentence_report_file)

        save_figure_as_jpeg(first_plot, primary_sentence_jpeg)
        save_figure_as_jpeg(second_plot, secondary_sentence_jpeg)

        return (
            first_plot, second_plot,
            sentence_csv_file, sentence_report_file, primary_sentence_jpeg,
            sentence_csv_file, sentence_report_file, secondary_sentence_jpeg
        )

    except Exception as e:
        # Best effort for the UI: report the problem and clear the outputs
        # instead of propagating the exception to the event handler.
        print(f"Error: {e}")
        return [None] * 8

    finally:
        if temp_pdf_path is not None:
            try:
                os.remove(temp_pdf_path)
            except OSError:
                # Cleanup is best effort; a leftover temp file must not
                # turn a successful analysis into a failure.
                pass
|
778 |
+
|
779 |
+
# Reset functions to clear the outputs
|
780 |
+
def reset_page_outputs():
    """Return eight ``None`` values, one per page-level output component."""
    # Two plots plus three download files for each of the two tabs.
    cleared_outputs = [None] * 8
    return cleared_outputs
|
782 |
+
|
783 |
+
def reset_sentence_outputs():
    """Return eight ``None`` values, one per sentence-level output component."""
    # Two plots plus three download files for each of the two tabs.
    cleared_outputs = [None] * 8
    return cleared_outputs
|
785 |
+
|
786 |
+
# Button actions for Page-Level Analysis
|
787 |
+
page_button.click(
|
788 |
+
process_pages,
|
789 |
+
inputs=[file_input, extraction_mode, start_page, end_page],
|
790 |
+
outputs=[
|
791 |
+
primary_page_plot, # π Primary SDGs [Page-Level]
|
792 |
+
secondary_page_plot, # π Secondary SDGs [Page-Level]
|
793 |
+
page_csv, # π Download Page Predictions CSV
|
794 |
+
page_docx, # π Download Page Report DOCX
|
795 |
+
page_jpeg1, # πΌοΈ Download Primary SDGs JPEG
|
796 |
+
page_csv_secondary, # π Download Page Predictions CSV
|
797 |
+
page_report_file_secondary, # π Download Page Report DOCX
|
798 |
+
secondary_page_jpeg # πΌοΈ Download Secondary SDGs JPEG
|
799 |
+
]
|
800 |
+
)
|
801 |
+
|
802 |
+
reset_page_button.click(
|
803 |
+
reset_page_outputs,
|
804 |
+
outputs=[
|
805 |
+
primary_page_plot,
|
806 |
+
secondary_page_plot,
|
807 |
+
page_csv,
|
808 |
+
page_docx,
|
809 |
+
page_jpeg1,
|
810 |
+
page_csv_secondary,
|
811 |
+
page_report_file_secondary,
|
812 |
+
secondary_page_jpeg
|
813 |
+
]
|
814 |
+
)
|
815 |
+
|
816 |
+
# Button actions for Sentence-Level Analysis
|
817 |
+
sentence_button.click(
|
818 |
+
process_sentences,
|
819 |
+
inputs=[file_input, extraction_mode, start_page, end_page],
|
820 |
+
outputs=[
|
821 |
+
primary_sentence_plot, # π Primary SDGs [Sentence-Level]
|
822 |
+
secondary_sentence_plot, # π Secondary SDGs [Sentence-Level]
|
823 |
+
sentence_csv, # π Download Sentence Predictions CSV
|
824 |
+
sentence_docx, # π Download Sentence Report DOCX
|
825 |
+
sentence_jpeg1, # πΌοΈ Download Primary SDGs JPEG
|
826 |
+
sentence_csv_secondary, # π Download Sentence Predictions CSV
|
827 |
+
sentence_report_file_secondary, # π Download Sentence Report DOCX
|
828 |
+
secondary_sentence_jpeg # πΌοΈ Download Secondary SDGs JPEG
|
829 |
+
]
|
830 |
+
)
|
831 |
+
|
832 |
+
reset_sentence_button.click(
|
833 |
+
reset_sentence_outputs,
|
834 |
+
outputs=[
|
835 |
+
primary_sentence_plot,
|
836 |
+
secondary_sentence_plot,
|
837 |
+
sentence_csv,
|
838 |
+
sentence_docx,
|
839 |
+
sentence_jpeg1,
|
840 |
+
sentence_csv_secondary,
|
841 |
+
sentence_report_file_secondary,
|
842 |
+
secondary_sentence_jpeg
|
843 |
+
]
|
844 |
+
)
|
845 |
+
|
846 |
+
demo.queue().launch()
|
847 |
+
|
848 |
+
launch_interface()
|