File size: 4,823 Bytes
4354680
fe021fb
cf53b75
d97bcce
4df3ec6
4b21134
 
e9ee3ed
 
 
f3505bb
462dc3c
f3505bb
cf53b75
6514554
ee790d1
cf53b75
e36f01a
 
 
f39343a
2fdb3df
 
 
 
 
4b21134
 
 
e2a202c
 
ea60296
 
 
 
8381f6e
 
e2a202c
 
 
ea60296
8381f6e
 
e2a202c
 
121b578
32ff21e
4354680
e7fc023
 
d97bcce
bcdeb67
32ff21e
121b578
f39343a
fe021fb
0f104d5
727bc47
0f104d5
 
79d5beb
5e2bb2b
79d5beb
 
fe021fb
 
 
ee790d1
462dc3c
ee790d1
 
fe021fb
ee790d1
4b21134
4df3ec6
 
4354680
ee790d1
4354680
ee790d1
4df3ec6
f39343a
4df3ec6
 
 
b916752
ee790d1
cf93567
ee790d1
4df3ec6
4b21134
 
 
 
6f0c363
48200ac
4df3ec6
4b21134
 
 
 
ee790d1
890cbac
 
c099517
890cbac
097245e
0c2753a
f3505bb
ee790d1
f3505bb
af880ff
32ff21e
 
 
 
 
 
4354680
121b578
4b21134
 
4df3ec6
 
ee790d1
 
6514554
24d1c8f
 
6514554
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import nltk
import validators
import streamlit as st
from transformers import AutoTokenizer, pipeline

# local modules
from extractive_summarizer.model_processors import Summarizer
from utils import (
    clean_text,
    fetch_article_text,
    preprocess_text_for_abstractive_summarization,
    read_text_from_file,
)

from rouge import Rouge

if __name__ == "__main__":
    # ---------------------------------
    # Main Application
    # ---------------------------------
    # Streamlit app: takes raw text, a URL, or an uploaded file, produces an
    # extractive or abstractive summary, and reports ROUGE scores of the
    # summary against the input text.
    st.title("Text Summarizer 📝")

    st.markdown("Creator: [Atharva Ingle](https://github.com/Gladiator07)")
    st.markdown(
        "Source code: [GitHub Repository](https://github.com/Gladiator07/Text-Summarizer)"
    )
    summarize_type = st.sidebar.selectbox(
        "Summarization type", options=["Extractive", "Abstractive"]
    )

    st.markdown(
        "Enter a text or a url to get a concise summary of the article while conserving the overall meaning. This app supports text in the following formats:"
    )
    st.markdown(
        """- Raw text in text box 
- URL of article/news to be summarized 
- .txt, .pdf, .docx file formats"""
    )
    st.markdown(
        """This app supports two type of summarization:

1. **Extractive Summarization**: The extractive approach involves picking up the most important phrases and lines from the documents. It then combines all the important lines to create the summary. So, in this case, every line and word of the summary actually belongs to the original document which is summarized.
2. **Abstractive Summarization**: The abstractive approach involves rephrasing the complete document while capturing the complete meaning of the document. This type of summarization provides more human-like summary"""
    )
    st.markdown("---")
    # ---------------------------
    # SETUP & Constants
    nltk.download("punkt")
    abs_tokenizer_name = "facebook/bart-large-cnn"
    abs_model_name = "facebook/bart-large-cnn"
    abs_tokenizer = AutoTokenizer.from_pretrained(abs_tokenizer_name)
    abs_max_length = 90
    abs_min_length = 30
    # ---------------------------

    inp_text = st.text_input("Enter text or a url here")
    st.markdown(
        "<h3 style='text-align: center; color: green;'>OR</h3>",
        unsafe_allow_html=True,
    )
    uploaded_file = st.file_uploader(
        "Upload a .txt, .pdf, .docx file for summarization"
    )

    # `validators.url` returns True for a valid URL and a *falsy failure
    # object* (not the literal `False`) otherwise. Normalise to a real bool so
    # later truthiness/identity checks behave as intended.
    is_url = bool(validators.url(inp_text))
    if is_url:
        # complete text, chunks to summarize (list of sentences for long docs)
        _article_text, cleaned_txt = fetch_article_text(url=inp_text)
    elif uploaded_file:
        cleaned_txt = clean_text(read_text_from_file(uploaded_file))
    else:
        cleaned_txt = clean_text(inp_text)

    # view input text (expander)
    with st.expander("View input text"):
        if is_url:
            # NOTE(review): only the first chunk of a fetched article is shown
            # here (and used as the ROUGE reference below) -- confirm this is
            # intended rather than the whole article.
            # Guard against an article that produced no chunks at all.
            st.write(cleaned_txt[0] if cleaned_txt else "")
        else:
            st.write(cleaned_txt)
    summarize = st.button("Summarize")

    # called when the [Summarize] button is clicked
    if summarize:
        if not cleaned_txt:
            # Nothing to summarize: avoid running the models (and ROUGE, which
            # raises on empty input) against an empty string / empty chunk list.
            st.warning(
                "Please enter some text, a URL, or upload a file before summarizing."
            )
        else:
            if summarize_type == "Extractive":
                # URL input arrives as a list of chunks; the extractive model
                # is called on a single string.
                if is_url:
                    text_to_summarize = " ".join(cleaned_txt)
                else:
                    text_to_summarize = cleaned_txt

                with st.spinner(
                    text="Creating extractive summary. This might take a few seconds ..."
                ):
                    ext_model = Summarizer()
                    summarized_text = ext_model(text_to_summarize, num_sentences=5)

            elif summarize_type == "Abstractive":
                with st.spinner(
                    text="Creating abstractive summary. This might take a few seconds ..."
                ):
                    # Reuse the tokenizer loaded above instead of loading it a
                    # second time by name.
                    abs_summarizer = pipeline(
                        "summarization", model=abs_model_name, tokenizer=abs_tokenizer
                    )

                    if is_url:
                        # already split into chunks by fetch_article_text
                        text_to_summarize = cleaned_txt
                    else:
                        # Raw/uploaded text must be split into a list of chunks
                        # before summarization. (The original `is_url is False`
                        # test never matched the failure object returned by
                        # `validators.url`, so this step was silently skipped.)
                        text_to_summarize = (
                            preprocess_text_for_abstractive_summarization(
                                tokenizer=abs_tokenizer, text=cleaned_txt
                            )
                        )

                    tmp_sum = abs_summarizer(
                        text_to_summarize,
                        max_length=abs_max_length,
                        min_length=abs_min_length,
                        do_sample=False,
                    )

                    # one summary per chunk -> single summary string
                    summarized_text = " ".join(
                        summ["summary_text"] for summ in tmp_sum
                    )

            # final summarized output
            st.subheader("Summarized text")
            st.info(summarized_text)

            st.subheader("ROUGE Scores")
            ground_truth = cleaned_txt[0] if is_url else cleaned_txt
            if summarized_text and ground_truth:
                rouge_sc = Rouge()
                score = rouge_sc.get_scores(summarized_text, ground_truth, avg=True)
                st.code(score)
            else:
                # `Rouge.get_scores` raises on an empty hypothesis/reference.
                st.warning("ROUGE scores are unavailable for an empty summary.")