sadickam committed on
Commit 71a06ed · verified · 1 Parent(s): e230b99

Create app.py

Files changed (1)
app.py +848 -0
app.py ADDED
import gradio as gr
import os
import re
import torch
import pandas as pd
import plotly.express as px
import plotly.io as pio
import nltk
import tempfile
from io import BytesIO
import base64
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from nltk.tokenize import sent_tokenize
from docx.shared import Inches
from docx import Document
import numpy as np
# Needed for Hugging Face Spaces GPU access
import spaces

from styles import custom_css  # Custom CSS for the Gradio interface

# Sentence tokenizer data for NLTK
nltk.download('punkt')

# PyPDFLoader for PDF processing
from langchain_community.document_loaders import PyPDFLoader

# Model checkpoint for sdgBERT
checkpoint = "sadickam/sdgBERT"

# Define device first so the model loader below can use it
# (uses the GPU when available, e.g. on Hugging Face Spaces GPU hardware)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Text cleaning function
def clean_text(text):
    """
    Cleans the extracted text by removing irrelevant characters while retaining
    currency symbols and sentence punctuation.
    """
    text = text.strip()
    # Define the allowed characters (including currency symbols)
    allowed_chars = r'[^a-zA-Z0-9\s\.,!?$€£¥₹¢₩]'
    text = re.sub(allowed_chars, '', text)
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text

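# Illustrative example (not part of the app's flow): currency symbols and
# sentence punctuation survive cleaning, while other symbols are dropped, e.g.
#   clean_text("Net cost:\n $1,200 (approx.)")  ->  "Net cost $1,200 approx."
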
# Preprocessing function for text
def prep_text(text):
    clean_sents = []
    sent_tokens = sent_tokenize(str(text))
    for sent_token in sent_tokens:
        word_tokens = [str(word_token).strip().lower() for word_token in sent_token.split()]
        clean_sents.append(' '.join(word_tokens))
    joined = ' '.join(clean_sents).strip()
    return re.sub(r'`|"', "", joined)

# Load the tokenizer and model onto the selected device
def load_model_and_tokenizer():
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return model, tokenizer

# SDG labels (sdgBERT predicts the first 16 SDGs)
label_list = [
    'SDG1_No Poverty', 'SDG2_Zero Hunger', 'SDG3_Good Health and Well-being', 'SDG4_Quality Education',
    'SDG5_Gender Equality', 'SDG6_Clean Water and Sanitation', 'SDG7_Affordable and Clean Energy',
    'SDG8_Decent Work and Economic Growth', 'SDG9_Industry, Innovation and Infrastructure',
    'SDG10_Reduced Inequality', 'SDG11_Sustainable Cities and Communities',
    'SDG12_Responsible Consumption and Production', 'SDG13_Climate Action',
    'SDG14_Life Below Water', 'SDG15_Life on Land', 'SDG16_Peace, Justice and Strong Institutions'
]

# Predict SDG probabilities for a batch of text inputs
def predict_sdg_labels_batch(texts, model, tokenizer):
    tokenized_texts = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    model.eval()
    with torch.no_grad():
        text_logits = model(**tokenized_texts).logits
    predictions = torch.softmax(text_logits, dim=1).tolist()
    return predictions

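# Illustrative usage (hypothetical input text): each row of the returned list
# is a softmax distribution over the 16 SDG labels, so it sums to ~1.0:
#   model, tokenizer = load_model_and_tokenizer()
#   probs = predict_sdg_labels_batch(["Access to clean water reduces disease."], model, tokenizer)
#   assert len(probs[0]) == len(label_list)
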
# Page-level predictions with batch processing
def predict_pages(page_df, batch_size=32):
    model, tokenizer = load_model_and_tokenizer()
    df_results = page_df.copy()
    num_rows = len(page_df)
    all_predicted_labels = [[] for _ in range(16)]
    all_prediction_scores = [[] for _ in range(16)]

    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        df_chunk = page_df.iloc[start:end]
        # Clean and preprocess the text before prediction
        texts = df_chunk['Text'].apply(clean_text).apply(prep_text).tolist()
        predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
        for predictions in predictions_batch:
            sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
            for i, (label, score) in enumerate(sorted_preds):
                all_predicted_labels[i].append(label)
                all_prediction_scores[i].append(score)

    # Add columns to the DataFrame in the desired order (pred1, score1, pred2, score2, ...)
    for i in range(16):
        df_results[f'pred{i + 1}'] = all_predicted_labels[i]
        df_results[f'score{i + 1}'] = all_prediction_scores[i]

    # Reorder columns so preds and scores are interleaved in the correct order
    reordered_columns = []
    for i in range(16):
        reordered_columns.append(f'pred{i + 1}')
        reordered_columns.append(f'score{i + 1}')
    other_columns = [col for col in df_results.columns if col not in reordered_columns]
    df_results = df_results[other_columns + reordered_columns]

    return df_results

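# The returned frame is therefore laid out (schematically) as:
#   Document | Page | Text | pred1 | score1 | pred2 | score2 | ... | pred16 | score16
# where pred1/score1 is the highest-probability SDG for each page.
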
# Sentence-level predictions with batch processing
def predict_sentences(sentence_df, batch_size=32):
    model, tokenizer = load_model_and_tokenizer()
    df_combined_sentences = sentence_df.copy()

    num_rows = len(sentence_df)
    all_predicted_labels = [[] for _ in range(16)]
    all_prediction_scores = [[] for _ in range(16)]

    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        df_chunk = sentence_df.iloc[start:end]
        # Clean and preprocess the text before prediction
        texts = df_chunk['Sentence'].apply(clean_text).apply(prep_text).tolist()
        predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
        for predictions in predictions_batch:
            sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
            for i, (label, score) in enumerate(sorted_preds):
                all_predicted_labels[i].append(label)
                all_prediction_scores[i].append(round(score, 3))

    # Add predictions and scores to the DataFrame
    for i in range(16):
        df_combined_sentences[f'pred{i + 1}'] = all_predicted_labels[i]
        df_combined_sentences[f'score{i + 1}'] = all_prediction_scores[i]

    # Reorder columns so preds and scores are interleaved
    reordered_columns = []
    for i in range(16):
        reordered_columns.append(f'pred{i + 1}')
        reordered_columns.append(f'score{i + 1}')
    other_columns = [col for col in df_combined_sentences.columns if col not in reordered_columns]
    df_combined_sentences = df_combined_sentences[other_columns + reordered_columns]

    return df_combined_sentences

# Define unique colors for each SDG
sdg_colors = {
    "SDG1_No Poverty": "#E5243B",
    "SDG2_Zero Hunger": "#DDA63A",
    "SDG3_Good Health and Well-being": "#4C9F38",
    "SDG4_Quality Education": "#C5192D",
    "SDG5_Gender Equality": "#FF3A21",
    "SDG6_Clean Water and Sanitation": "#26BDE2",
    "SDG7_Affordable and Clean Energy": "#FCC30B",
    "SDG8_Decent Work and Economic Growth": "#A21942",
    "SDG9_Industry, Innovation and Infrastructure": "#FD6925",
    "SDG10_Reduced Inequality": "#DD1367",
    "SDG11_Sustainable Cities and Communities": "#FD9D24",
    "SDG12_Responsible Consumption and Production": "#BF8B2E",
    "SDG13_Climate Action": "#3F7E44",
    "SDG14_Life Below Water": "#0A97D9",
    "SDG15_Life on Land": "#56C02B",
    "SDG16_Peace, Justice and Strong Institutions": "#00689D"
}

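# These hex values follow the official UN SDG colour palette, so the bars in
# the graphs below match the familiar SDG branding.
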
# Plot SDG bar graphs using Plotly, overlaying the icon of the top SDG
def plot_sdg(df, title, pred_column, icons_folder='assets/icons/'):
    """
    Plots a horizontal bar graph of SDG predictions and superimposes the icon
    of the most frequent SDG.

    Args:
        df (pd.DataFrame): DataFrame containing SDG predictions.
        title (str): Title of the plot.
        pred_column (str): Column name to use for plotting (e.g., 'pred1').
        icons_folder (str): Path to the folder containing SDG icons.

    Returns:
        plotly.graph_objs._figure.Figure: The Plotly figure object.
    """
    df_filtered = df[df[pred_column].notna()]
    labels = df_filtered[pred_column].value_counts().sort_values(ascending=False)
    total = labels.sum()
    percentages = (labels / total) * 100

    # Create a horizontal bar plot with Plotly
    fig = px.bar(
        percentages.rename_axis('SDG Label').reset_index(name='Percentage'),
        y='SDG Label',
        x='Percentage',
        orientation='h',
        title=title,
        color='SDG Label',
        color_discrete_map=sdg_colors  # Use the defined unique colors for each SDG
    )

    # Show y-axis tick labels
    fig.update_yaxes(showticklabels=True)

    # Add percentage labels to the bars
    fig.update_traces(
        texttemplate='%{x:.2f}%',
        textposition='auto',
        textfont=dict(size=10)
    )

    # Adjust layout for better visibility
    fig.update_layout(
        title=dict(
            text=title, font=dict(size=14)
        ),
        yaxis=dict(
            automargin=True,
            title=None,
            tickfont=dict(size=12)
        ),
        margin=dict(l=20, r=30, t=100, b=20),  # Extra top/right margin for the icon
        height=600,
        showlegend=False,
        template="simple_white",
        xaxis=dict(
            tickfont=dict(size=12)
        ),
    )

    # Identify the most frequent SDG and overlay its icon
    if not percentages.empty:
        top_sdg_label = percentages.index[0]  # e.g., 'SDG1_No Poverty'

        # Map the SDG label to an icon filename, assuming the naming
        # convention 'SDG1.png', 'SDG2.png', etc.
        sdg_number = top_sdg_label.split('_')[0]  # Extract 'SDG1'
        icon_filename = f"{sdg_number}.png"
        icon_path = os.path.join(icons_folder, icon_filename)

        # Check if the icon file exists
        if os.path.exists(icon_path):
            # Read and base64-encode the image
            with open(icon_path, 'rb') as image_file:
                encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

            # Add the icon as an image in the Plotly figure
            fig.add_layout_image(
                dict(
                    source='data:image/png;base64,' + encoded_image,
                    xref="paper", yref="paper",
                    x=0.4, y=1.2,  # Position: slightly right of centre, above the plot
                    sizex=0.2, sizey=0.2,  # Size of the icon
                    xanchor="left",
                    yanchor="top",
                    layer="above"  # Keep the icon above other plot elements
                )
            )
        else:
            print(f"Icon file '{icon_path}' not found. Skipping icon overlay.")

    return fig

def save_figure_as_jpeg(fig, filename):
    """Saves the Plotly figure as a high-resolution JPEG."""
    pio.write_image(fig, filename, format='jpeg', width=1200, height=600, scale=6)

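# Note: static image export in Plotly (write_image / pio.write_image) renders
# via the kaleido package, which must be installed in the environment.
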
# Generate reports (page and sentence levels)
def generate_page_report(df_pages, report_file_name):
    doc = Document()
    doc.add_heading("Page-Level SDG Analysis Report", 0)

    doc.add_heading("📋 General Notes", level=2)
    doc.add_paragraph(
        'This app conducts page-level analysis of the uploaded document. Each page is processed by the sdgBERT AI model trained to predict the first 16 '
        'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
        'representing the likelihood that the text is aligned with particular SDGs. This page-level '
        'analysis provides high-level insight into SDG alignment.'
        '\n\n'
        'Given that a page may align with more than one SDG, this app focuses on the top two SDG predictions '
        '(Primary and Secondary) for each page with a probability score greater than zero.'
    )

    doc.add_heading("Primary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph displays the top SDG the AI model associates with each page. The bars '
        'represent the percentage of pages most strongly aligned with each SDG. This offers insight into the dominant '
        'sustainable development theme within the document.'
    )

    doc.add_heading("Secondary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph shows the second most relevant SDG for each page. Although these SDGs are '
        'not the primary focus, the text has some relevance to these goals.'
    )

    for doc_name in df_pages['Document'].unique():
        # Sanitize doc_name for use in file names
        sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])

        doc.add_heading(f"📄 Document: {doc_name}", level=2)
        df_doc = df_pages[df_pages['Document'] == doc_name]

        # Generate and save graphs
        first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_page.jpeg"
        second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_page.jpeg"

        plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
            first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
        plot_sdg(df_doc, "Secondary SDGs", 'pred2').write_image(
            second_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")

        # Add plots to the Word document
        doc.add_picture(first_sdg_plot_path, width=Inches(6))
        doc.add_picture(second_sdg_plot_path, width=Inches(6))

    doc.save(report_file_name)
    return report_file_name

def generate_sentence_report(df_sentences, report_file_name):
    doc = Document()
    doc.add_heading("Sentence-Level SDG Analysis Report", 0)

    doc.add_heading("📋 General Notes", level=2)
    doc.add_paragraph(
        'This app splits documents into sentences using a natural language processing algorithm. '
        'Each sentence is processed by the sdgBERT AI model trained to predict the first 16 '
        'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
        'representing the likelihood that the text is aligned with particular SDGs. This sentence-level '
        'analysis provides deeper insight into SDG alignment.'
        '\n\n'
        'Given that a sentence may align with more than one SDG, this app focuses on the top two SDG predictions '
        '(Primary and Secondary) for each sentence with a probability score greater than zero.'
    )

    doc.add_heading("Primary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph displays the top SDG the AI model associates with each sentence. The bars '
        'represent the percentage of sentences most strongly aligned with each SDG. This offers deeper insight '
        'into the dominant sustainable development theme within the document.'
    )

    doc.add_heading("Secondary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph shows the second most relevant SDG for each sentence. Although these SDGs are not '
        'the primary focus, the text has some relevance to these goals.'
    )

    for doc_name in df_sentences['Document'].unique():
        # Sanitize doc_name for use in file names
        sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])

        doc.add_heading(f"📄 Document: {doc_name}", level=2)
        df_doc = df_sentences[df_sentences['Document'] == doc_name]

        # Generate and save graphs
        first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_sentence.jpeg"
        second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_sentence.jpeg"

        plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
            first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
        plot_sdg(df_doc, "Secondary SDGs", 'pred2').write_image(
            second_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")

        # Add plots to the Word document
        doc.add_picture(first_sdg_plot_path, width=Inches(6))
        doc.add_picture(second_sdg_plot_path, width=Inches(6))

    doc.save(report_file_name)
    return report_file_name

# Text extraction with text cleaning and line joining
def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
    """
    Extract text from a PDF page by page using LangChain's PyPDFLoader.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        start_page (int, optional): The starting page number for extraction (1-based index).
        end_page (int, optional): The ending page number for extraction (1-based index).

    Returns:
        tuple:
            - page_df (pd.DataFrame): DataFrame containing Document, Page, and Text.
            - sentence_df (pd.DataFrame): DataFrame containing Document, Page, and Sentence.
    """
    try:
        # Initialize the loader; load() returns one Document per page, which
        # keeps the page numbering below accurate (load_and_split() would
        # re-chunk the text and break the page correspondence)
        loader = PyPDFLoader(pdf_file_path)
        documents = loader.load()

        total_pages = len(documents)
        doc_name = os.path.basename(pdf_file_path)  # Extract document name

        # Validate and adjust the page range
        if start_page is not None and end_page is not None:
            # Convert to integers to avoid slicing issues
            start_page = int(start_page)
            end_page = int(end_page)

            # Clamp to the valid range
            if start_page < 1:
                start_page = 1
            if end_page > total_pages:
                end_page = total_pages
            if start_page > end_page:
                start_page, end_page = end_page, start_page  # Swap if out of order

            # Select the subset of pages based on user input
            selected_docs = documents[start_page - 1:end_page]
        else:
            selected_docs = documents
            start_page = 1
            end_page = total_pages

        # Initialize lists to store data
        page_data = []
        sentence_data = []

        for idx, doc in enumerate(selected_docs, start=start_page):
            page_num = idx
            text = doc.page_content.strip()

            # Join lines that belong to the same sentence
            lines = text.split('\n')
            joined_text = ' '.join(line.strip() for line in lines if line.strip())

            # Clean text
            cleaned_text = clean_text(joined_text)

            # Append page-wise data
            page_data.append({
                "Document": doc_name,
                "Page": page_num,
                "Text": cleaned_text
            })

            # Sentence tokenization; sentences of 70 characters or fewer are
            # skipped as likely headings or fragments
            sentences = sent_tokenize(cleaned_text)
            for sentence in sentences:
                sentence = sentence.strip()
                if sentence and len(sentence) > 70:
                    sentence_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Sentence": sentence
                    })

        # Create DataFrames
        page_df = pd.DataFrame(page_data)
        sentence_df = pd.DataFrame(sentence_data)

        return page_df, sentence_df

    except Exception as e:
        raise RuntimeError(f"Error during PDF extraction: {e}")

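# Illustrative call (hypothetical file name), showing the two frames returned:
#   page_df, sentence_df = extract_text_with_py_pdf_loader("report.pdf", start_page=2, end_page=5)
#   list(page_df.columns)      ->  ['Document', 'Page', 'Text']
#   list(sentence_df.columns)  ->  ['Document', 'Page', 'Sentence']
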
def df_to_csv_bytes(df):
    """
    Convert a DataFrame to CSV data in bytes.

    Args:
        df (pd.DataFrame): The DataFrame to convert.

    Returns:
        bytes: CSV data in bytes.
    """
    try:
        buffer = BytesIO()
        df.to_csv(buffer, index=False)
        csv_data = buffer.getvalue()
        buffer.close()
        return csv_data
    except Exception as e:
        raise RuntimeError(f"Error during CSV conversion: {e}")

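# Illustrative usage (this helper is not currently wired into the interface):
#   csv_bytes = df_to_csv_bytes(pd.DataFrame({"Page": [1], "Text": ["..."]}))
#   with open("predictions.csv", "wb") as f:
#       f.write(csv_bytes)
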
def launch_interface():
    with gr.Blocks(css=custom_css) as demo:

        # Title as a visible heading at the top of the page, with an icon
        gr.Markdown(
            """
            # 🌍 SDG Document Analysis App
            Analyze documents to map Sustainable Development Goals (SDGs) at both page and sentence levels.
            """
        )

        # Shared PDF file input for both analyses
        gr.Markdown("## Upload PDF File")
        with gr.Row():
            file_input = gr.File(
                label="📁 Upload PDF File for Analysis", file_types=[".pdf"]
            )

        # Extraction mode selection with explanatory text
        gr.Markdown(
            """
            ## PDF Text Extraction Mode
            Choose whether to analyze all pages or a specific range of pages. To exclude certain pages from the analysis, select
            "Range of Pages" and specify the start and end pages.
            """
        )
        with gr.Row():
            extraction_mode = gr.Radio(
                choices=["All Pages", "Range of Pages"],
                value="All Pages",
                label="Extraction Mode"
            )

        with gr.Row():
            start_page = gr.Number(value=1, label="🔢 Start Page", visible=False, info="The cover page is page 1")
            end_page = gr.Number(value=1, label="🔢 End Page", visible=False)

        # Show the start/end page inputs only in "Range of Pages" mode
        def update_page_inputs(extraction_mode):
            if extraction_mode == "Range of Pages":
                return gr.update(visible=True), gr.update(visible=True)
            else:
                return gr.update(visible=False), gr.update(visible=False)

        extraction_mode.change(
            update_page_inputs,
            inputs=extraction_mode,
            outputs=[start_page, end_page]
        )

        # Main Tabs for Page-Level and Sentence-Level Analysis
        gr.Markdown("## SDG Analysis Type")

        with gr.Tab("📄 Page-Level Analysis"):
            gr.Markdown(
                """
                ### Page-Level SDG Analysis
                This section conducts Sustainable Development Goals (SDG) mapping
                of documents using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
                It provides **high-level SDG mapping** of documents at the page level.
                """
            )

            with gr.Row():
                page_button = gr.Button("🏃‍♂️ Run Page-Level Analysis")
                reset_page_button = gr.Button("🔄 Reset Page-Level Analysis", elem_classes="reset-button")

            # Nested Tabs for Primary and Secondary SDGs
            with gr.Tabs():
                with gr.TabItem("📊 Primary SDGs"):
                    with gr.Row():
                        primary_page_plot = gr.Plot(label="📊 Primary SDGs Graph [Page-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Primary SDGs bar graph on the left will show " +
                            "the percentage of pages that strongly align with each SDG. The icon for the most frequent " +
                            "SDG will be highlighted above the graph. Download the Page Predictions CSV for further details.",
                            label="Note", container=True
                        )

                    gr.Markdown("##### Download Results")
                    with gr.Row():
                        page_csv = gr.File(label="📊 Download Page Predictions CSV")
                        page_docx = gr.File(label="📄 Download Page Report DOCX")
                        page_jpeg1 = gr.File(label="🖼️ Download Primary SDGs JPEG")

                with gr.TabItem("📈 Secondary SDGs"):
                    with gr.Row():
                        secondary_page_plot = gr.Plot(label="📈 Secondary SDGs Graph [Page-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Secondary SDGs bar graph on the left will show " +
                            "SDGs that are not the primary focus of the pages analysed. These SDGs are second to the " +
                            "Primary SDGs. Download the Page Predictions CSV for further details.",
                            label="Note", container=True
                        )

                    gr.Markdown("##### Download Results")
                    with gr.Row():
                        page_csv_secondary = gr.File(label="📊 Download Page Predictions CSV")
                        page_report_file_secondary = gr.File(label="📄 Download Page Report DOCX")
                        secondary_page_jpeg = gr.File(label="🖼️ Download Secondary SDGs JPEG")

        with gr.Tab("✍️ Sentence-Level Analysis"):
            gr.Markdown(
                """
                ### Sentence-Level SDG Analysis
                This section conducts Sustainable Development Goals (SDG) mapping
                using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
                It provides **detailed SDG mapping** at the sentence level.
                """
            )

            with gr.Row():
                sentence_button = gr.Button("🏃‍♂️ Run Sentence-Level Analysis")
                reset_sentence_button = gr.Button("🔄 Reset Sentence-Level Analysis", elem_classes="reset-button")

            # Nested Tabs for Primary and Secondary SDGs
            with gr.Tabs():
                with gr.TabItem("📊 Primary SDGs"):
                    with gr.Row():
                        primary_sentence_plot = gr.Plot(label="📊 Primary SDGs Graph [Sentence-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Primary SDGs bar graph on the left will show " +
                            "the percentage of sentences that strongly align with each SDG. The icon for the most frequent " +
                            "SDG will be highlighted above the graph. Download the Sentence Predictions CSV for further details.",
                            label="Note", container=True
                        )

                    gr.Markdown("##### Download Results")
                    with gr.Row():
                        sentence_csv = gr.File(label="📊 Download Sentence Predictions CSV")
                        sentence_docx = gr.File(label="📄 Download Sentence Report DOCX")
                        sentence_jpeg1 = gr.File(label="🖼️ Download Primary SDGs JPEG")

                with gr.TabItem("📈 Secondary SDGs"):
                    with gr.Row():
                        secondary_sentence_plot = gr.Plot(label="📈 Secondary SDGs Graph [Sentence-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Secondary SDGs bar graph on the left will show " +
                            "SDGs that are not the primary focus of the sentences analysed. These SDGs are second to the " +
                            "Primary SDGs. Download the Sentence Predictions CSV for further details.",
                            label="Note", container=True
                        )

                    gr.Markdown("##### Download Results")
                    with gr.Row():
                        sentence_csv_secondary = gr.File(label="📊 Download Sentence Predictions CSV")
                        sentence_report_file_secondary = gr.File(label="📄 Download Sentence Report DOCX")
                        secondary_sentence_jpeg = gr.File(label="🖼️ Download Secondary SDGs JPEG")

        # Process page-level analysis
        @spaces.GPU
        def process_pages(file, extraction_mode, start_page, end_page):
            if not file:
                # Return None for each output component
                return [None, None, None, None, None, None, None, None]

            try:
                if hasattr(file, 'name'):
                    pdf_file_path = file.name
                    original_file_name = os.path.basename(file.name)
                else:
                    # Save the file to a temporary location
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                        temp_pdf.write(file.read())
                        pdf_file_path = temp_pdf.name
                    original_file_name = 'uploaded_document'

                # Sanitize the file name for use in output file names
                sanitized_file_name = os.path.splitext(original_file_name)[0]
                sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)

                # Determine the page range based on extraction_mode
                if extraction_mode == "All Pages":
                    selected_start = None
                    selected_end = None
                else:
                    selected_start = int(start_page)
                    selected_end = int(end_page)

                # Extract text and create DataFrames
                page_df, _ = extract_text_with_py_pdf_loader(
                    pdf_file_path,
                    start_page=selected_start,
                    end_page=selected_end
                )

                # Predict SDGs at page level
                df_page_predictions = predict_pages(page_df)

                # Generate plots with icon overlay
                first_plot = plot_sdg(
                    df_page_predictions, "📊 Primary SDGs", 'pred1'
                )
                second_plot = plot_sdg(
                    df_page_predictions, "📈 Secondary SDGs", 'pred2'
                )

                # Define output file names; the Primary and Secondary tabs share
                # the same CSV and DOCX report, so each is generated once
                page_csv_file = f"{sanitized_file_name}_SDG-Page_predictions.csv"
                page_report_file = f"{sanitized_file_name}_SDG-Page_report.docx"
                primary_page_jpeg = f"{sanitized_file_name}_SDG-Page_primary_graph.jpeg"
                secondary_page_jpeg = f"{sanitized_file_name}_SDG-Page_secondary_graph.jpeg"

                # Save CSV and report
                df_page_predictions.to_csv(page_csv_file, index=False)
                generate_page_report(df_page_predictions, page_report_file)

                # Save figures as JPEG
                save_figure_as_jpeg(first_plot, primary_page_jpeg)
                save_figure_as_jpeg(second_plot, secondary_page_jpeg)

                return (
                    first_plot, second_plot,
                    page_csv_file, page_report_file, primary_page_jpeg,
                    page_csv_file, page_report_file, secondary_page_jpeg
                )

            except Exception as e:
                print(f"Error: {e}")
                return [None, None, None, None, None, None, None, None]

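        # Note: the 8-tuple returned by process_pages must match, in order, the
        # `outputs` list wired to page_button.click below; process_sentences
        # follows the same pattern for sentence_button.click.
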
        # Process sentence-level analysis
        @spaces.GPU
        def process_sentences(file, extraction_mode, start_page, end_page):
            if not file:
                # Return None for each output component
                return [None, None, None, None, None, None, None, None]

            try:
                if hasattr(file, 'name'):
                    pdf_file_path = file.name
                    original_file_name = os.path.basename(file.name)
                else:
                    # Save the file to a temporary location
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                        temp_pdf.write(file.read())
                        pdf_file_path = temp_pdf.name
                    original_file_name = 'uploaded_document'

                # Sanitize the file name for use in output file names
                sanitized_file_name = os.path.splitext(original_file_name)[0]
                sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)

                # Determine the page range based on extraction_mode
                if extraction_mode == "All Pages":
                    selected_start = None
                    selected_end = None
                else:
                    selected_start = int(start_page)
                    selected_end = int(end_page)

                # Extract text and create DataFrames
                _, sentence_df = extract_text_with_py_pdf_loader(
                    pdf_file_path,
                    start_page=selected_start,
                    end_page=selected_end
                )

                # Predict SDGs at sentence level
                df_sentence_predictions = predict_sentences(sentence_df)

                # Generate plots with icon overlay
                first_plot = plot_sdg(
                    df_sentence_predictions, "📊 Primary SDGs", 'pred1'
                )
                second_plot = plot_sdg(
                    df_sentence_predictions, "📈 Secondary SDGs", 'pred2'
                )

                # Define output file names; the Primary and Secondary tabs share
                # the same CSV and DOCX report, so each is generated once
                sentence_csv_file = f"{sanitized_file_name}_SDG-Sentence_predictions.csv"
                sentence_report_file = f"{sanitized_file_name}_SDG-Sentence_report.docx"
                primary_sentence_jpeg = f"{sanitized_file_name}_SDG-Sentence_primary_graph.jpeg"
                secondary_sentence_jpeg = f"{sanitized_file_name}_SDG-Sentence_secondary_graph.jpeg"

                # Save CSV and report
                df_sentence_predictions.to_csv(sentence_csv_file, index=False)
                generate_sentence_report(df_sentence_predictions, sentence_report_file)

                # Save figures as JPEG
                save_figure_as_jpeg(first_plot, primary_sentence_jpeg)
                save_figure_as_jpeg(second_plot, secondary_sentence_jpeg)

                return (
                    first_plot, second_plot,
                    sentence_csv_file, sentence_report_file, primary_sentence_jpeg,
                    sentence_csv_file, sentence_report_file, secondary_sentence_jpeg
                )

            except Exception as e:
                print(f"Error: {e}")
                return [None, None, None, None, None, None, None, None]

        # Reset functions to clear the outputs
        def reset_page_outputs():
            return [None, None, None, None, None, None, None, None]

        def reset_sentence_outputs():
            return [None, None, None, None, None, None, None, None]

        # Button actions for Page-Level Analysis
        page_button.click(
            process_pages,
            inputs=[file_input, extraction_mode, start_page, end_page],
            outputs=[
                primary_page_plot,           # 📊 Primary SDGs [Page-Level]
                secondary_page_plot,         # 📈 Secondary SDGs [Page-Level]
                page_csv,                    # 📊 Download Page Predictions CSV
                page_docx,                   # 📄 Download Page Report DOCX
                page_jpeg1,                  # 🖼️ Download Primary SDGs JPEG
                page_csv_secondary,          # 📊 Download Page Predictions CSV
                page_report_file_secondary,  # 📄 Download Page Report DOCX
                secondary_page_jpeg          # 🖼️ Download Secondary SDGs JPEG
            ]
        )

        reset_page_button.click(
            reset_page_outputs,
            outputs=[
                primary_page_plot,
                secondary_page_plot,
                page_csv,
                page_docx,
                page_jpeg1,
                page_csv_secondary,
                page_report_file_secondary,
                secondary_page_jpeg
            ]
        )

        # Button actions for Sentence-Level Analysis
        sentence_button.click(
            process_sentences,
            inputs=[file_input, extraction_mode, start_page, end_page],
            outputs=[
                primary_sentence_plot,           # 📊 Primary SDGs [Sentence-Level]
                secondary_sentence_plot,         # 📈 Secondary SDGs [Sentence-Level]
                sentence_csv,                    # 📊 Download Sentence Predictions CSV
                sentence_docx,                   # 📄 Download Sentence Report DOCX
                sentence_jpeg1,                  # 🖼️ Download Primary SDGs JPEG
                sentence_csv_secondary,          # 📊 Download Sentence Predictions CSV
                sentence_report_file_secondary,  # 📄 Download Sentence Report DOCX
                secondary_sentence_jpeg          # 🖼️ Download Secondary SDGs JPEG
            ]
        )

        reset_sentence_button.click(
            reset_sentence_outputs,
            outputs=[
                primary_sentence_plot,
                secondary_sentence_plot,
                sentence_csv,
                sentence_docx,
                sentence_jpeg1,
                sentence_csv_secondary,
                sentence_report_file_secondary,
                secondary_sentence_jpeg
            ]
        )

    demo.queue().launch()

launch_interface()