import time

import pandas as pd
import streamlit as st
from sklearn.metrics import classification_report


import models as md
from utils import plot_dual_bar_chart, examples_load, example_long_text_load

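# Load bundled example inputs: sample text, its license notice, candidate
# labels, and ground-truth labels used to pre-fill the form below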
ex_text, ex_license, ex_labels, ex_glabels = examples_load()
ex_long_text = example_long_text_load()


st.header("Summarization & Multi-label Classification for Long Text")
st.write("This app summarizes and then classifies your long text with multiple labels.")
st.write("__Inputs__: User enters their own custom text and labels.")
st.write("__Outputs__: A summary of the text, likelihood percentages for each label and a downloadable csv of the results. \
    Option to evaluate results against a list of ground truth labels, if available.")

with st.form(key='my_form'):
    example_text = ex_long_text  # alternatively: ex_text
    display_text = "[Excerpt from Project Gutenberg: Frankenstein]\n" + example_text + "\n\n" + ex_license
    text_input = st.text_area("Input any text you want to summarize & classify here (keep in mind very long text will take a while to process):", display_text)

    if text_input == display_text:
        text_input = example_text

    gen_keywords = st.radio(
        "Generate keywords from text?",
        ('Yes', 'No')
        )

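    # Parse comma-separated labels: strip whitespace, drop empty entries, and
    # dedupe with set() (note: set() does not preserve input order)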
    labels = st.text_input('Enter possible labels (comma-separated):',ex_labels, max_chars=1000)
    labels = list(set([x.strip() for x in labels.strip().split(',') if len(x.strip()) > 0]))
    
    glabels = st.text_input('If available, enter ground truth labels to evaluate results, otherwise leave blank (comma-separated):',ex_glabels, max_chars=1000)
    glabels = list(set([x.strip() for x in glabels.strip().split(',') if len(x.strip()) > 0]))

    threshold_value = st.slider(
         'Select a threshold cutoff for matching percentage (used for ground truth label evaluation)',
         0.0, 1.0, (0.5))

    submit_button = st.form_submit_button(label='Submit')



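# Load the three models up front. Without caching, these reload on every rerun
# of the script. A cached loader is a possible refactor (hypothetical sketch,
# assuming a Streamlit version that provides st.cache_resource):
#
#   @st.cache_resource
#   def get_models():
#       return md.load_summary_model(), md.load_model(), md.load_keyword_model()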
with st.spinner('Loading pretrained models...'):
    start = time.time()
    summarizer = md.load_summary_model()
    s_time = round(time.time() - start, 4)

    start = time.time()
    classifier = md.load_model()
    c_time = round(time.time() - start, 4)

    start = time.time()
    kw_model = md.load_keyword_model()
    k_time = round(time.time() - start, 4)

    st.success(f'Time taken to load BART summarizer model: {s_time}s, BART MNLI classifier model: {c_time}s, KeyBERT model: {k_time}s')

if submit_button:
    if len(text_input) == 0:
        st.write("Enter some text to generate a summary")
    else:
        with st.spinner('Breaking up text into more reasonable chunks (transformers cannot exceed a 1024 token max)...'):
            # Split the document into nested lists of sentences, each list fitting
            # within the transformer's 1024-token limit
            nested_sentences = md.create_nest_sentences(document=text_input, token_max_length=1024)

        # Join each nested sentence list back into a single text chunk
        text_chunks = [" ".join(map(str, chunk)) for chunk in nested_sentences]

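        # Optional keyword extraction with the KeyBERT model loaded above;
        # each chunk is processed separately and results are merged afterwards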
        if gen_keywords == 'Yes':
            st.markdown("### Top Keywords")
            with st.spinner("Generating keywords from text..."):

                # Collect keyword DataFrames per chunk, then concatenate once
                # (DataFrame.append was removed in pandas 2.0)
                chunk_dfs = []
                for text_chunk in text_chunks:
                    keywords_list = md.keyword_gen(kw_model, text_chunk)
                    chunk_dfs.append(pd.DataFrame(keywords_list))
                kw_df = pd.concat(chunk_dfs, ignore_index=True)
                kw_df.columns = ['keyword', 'score']
                # Keep each keyword's best score across all chunks
                top_kw_df = kw_df.groupby('keyword')['score'].max().reset_index()

                top_kw_df = top_kw_df.sort_values('score', ascending=False).reset_index(drop=True)
                st.dataframe(top_kw_df.head(10))
 
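        # Map-reduce style summarization: summarize each sub-1024-token chunk
        # independently, then concatenate the partial summaries at the end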
        st.markdown("### Text Chunk & Summaries")
        with st.spinner('Generating summaries for text chunks...'):

            my_expander = st.expander(label='Expand to see intermediate summary generation details')
            with my_expander:
                summary = []
                
                st.markdown("_The original text is broken into chunks with complete sentences totaling \
                    fewer than 1024 tokens, a requirement for the summarizer. Each block of text is then summarized separately \
                    and then combined at the very end to generate the final summary._")

                for num_chunk, text_chunk in enumerate(text_chunks):
                    st.markdown(f"###### Original Text Chunk {num_chunk+1}/{len(text_chunks)}" )
                    st.markdown(text_chunk)

                    chunk_summary = md.summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens = 300, minimum_tokens = 20)
                    summary.append(chunk_summary) 
                    st.markdown(f"###### Partial Summary {num_chunk+1}/{len(text_chunks)}")
                    st.markdown(chunk_summary)

                # Combine the partial summaries into one document (join once,
                # after the loop, instead of on every iteration)
                final_summary = " \n\n".join(summary)

            st.markdown("### Combined Summary")
            st.markdown(final_summary)

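    # Zero-shot classification: with multi_class=True each label is scored
    # independently, so the scores do not need to sum to 1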
    if len(text_input) == 0 or len(labels) == 0:
        st.write('Enter some text and at least one possible topic to see predictions.')
    else:
        st.markdown("### Top Label Predictions on Summary & Full Text")
        with st.spinner('Matching labels...'):
            topics, scores = md.classifier_zero(classifier, sequence=final_summary, labels=labels, multi_class=True)
            # st.markdown("### Top Label Predictions: Combined Summary")
            # plot_result(topics[::-1][:], scores[::-1][:])
            # st.markdown("### Download Data")
            data = pd.DataFrame({'label': topics, 'scores_from_summary': scores})
            # st.dataframe(data)
            # coded_data = base64.b64encode(data.to_csv(index = False). encode ()).decode()
            # st.markdown(
            #     f'<a href="data:file/csv;base64, {coded_data}" download = "data.csv">Download Data</a>',
            #     unsafe_allow_html = True
            #     )

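            # Score the same labels against the original full text so summary-based
            # and full-text predictions can be compared side by side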
            topics_ex_text, scores_ex_text = md.classifier_zero(classifier, sequence=example_text, labels=labels, multi_class=True)
            plot_dual_bar_chart(topics, scores, topics_ex_text, scores_ex_text)

            data_ex_text = pd.DataFrame({'label': topics_ex_text, 'scores_from_full_text': scores_ex_text})
            
            data2 = pd.merge(data, data_ex_text, on = ['label'])

            if len(glabels) > 0:
                # Flag ground-truth labels, then left-join so each predicted label
                # gets is_true_label = 1 if it is a ground-truth label, else 0
                gdata = pd.DataFrame({'label': glabels})
                gdata['is_true_label'] = 1

                data2 = pd.merge(data2, gdata, how='left', on=['label'])
                data2['is_true_label'] = data2['is_true_label'].fillna(0)

            st.markdown("### Data Table")
            with st.spinner('Generating a table of results and a download link...'):
                st.dataframe(data2)

                @st.cache
                def convert_df(df):
                    # IMPORTANT: cache the conversion to prevent recomputation on every
                    # rerun (newer Streamlit versions replace st.cache with st.cache_data)
                    return df.to_csv(index=False).encode('utf-8')

                csv = convert_df(data2)
                st.download_button(
                    label="Download data as CSV",
                    data=csv,
                    file_name='text_labels.csv',
                    mime='text/csv',
                )

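            # Evaluation: binarize each score column at the chosen threshold and
            # compare against is_true_label with sklearn's classification_report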
            if len(glabels) > 0:
                st.markdown("### Evaluation Metrics")
                with st.spinner('Evaluating output against ground truth...'):

                    section_header_description = ['Summary Label Performance', 'Original Full Text Label Performance']
                    data_headers = ['scores_from_summary', 'scores_from_full_text']
                    for description, col in zip(section_header_description, data_headers):
                        st.markdown(f"###### {description}")
                        report = classification_report(
                            y_true=data2[['is_true_label']],
                            y_pred=(data2[[col]] >= threshold_value) * 1.0,
                            output_dict=True)
                        df_report = pd.DataFrame(report).transpose()
                        st.markdown(f"Threshold set for: {threshold_value}")
                        st.dataframe(df_report)

        st.success('All done!')
        st.balloons()