Spaces:
Build error
Build error
sdhanabal1
committed on
Commit
·
ce42613
1
Parent(s):
91466d8
Address PR review comments
Browse files
- Summarizer.py +1 -0
- app.py +88 -87
- requirements.txt +3 -3
Summarizer.py
CHANGED
@@ -12,6 +12,7 @@ from transformers import Pipeline
|
|
12 |
|
13 |
class Summarizer:
|
14 |
DEFAULT_LANGUAGE = "english"
|
|
|
15 |
|
16 |
def __init__(self, pipeline: Pipeline):
|
17 |
self.pipeline = pipeline
|
|
|
12 |
|
13 |
class Summarizer:
|
14 |
DEFAULT_LANGUAGE = "english"
|
15 |
+
DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
|
16 |
|
17 |
def __init__(self, pipeline: Pipeline):
|
18 |
self.pipeline = pipeline
|
app.py
CHANGED
@@ -6,99 +6,100 @@ from validators import ValidationFailure
|
|
6 |
|
7 |
from Summarizer import Summarizer
|
8 |
|
9 |
-
nltk.download('punkt')
|
10 |
-
|
11 |
-
DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
|
12 |
-
|
13 |
-
st.markdown('# Terms & conditions summarization :pencil:')
|
14 |
-
st.write('Do you also take the time out of your day to thoroughly read every word of the Terms & Conditions before signing up for a new app? :thinking_face: \nNo?'
|
15 |
-
'Well have we got a demo for you!'
|
16 |
-
'Just copy-paste the lengthy Terms & Conditions text or provide a URL to the text and let our fancy NLP algorithm do the rest!'
|
17 |
-
'You will see both an extractive summary (the most important sentences will be highlighted) and an abstractive summary (an actual summary)'
|
18 |
-
'The abstractive summarization is preceded by LSA (Latent Semantic Analysis) extractive summarization')
|
19 |
-
st.write('Want to find out more?'
|
20 |
-
'For information about the extractive summarization :point_right: https://en.wikipedia.org/wiki/Latent_semantic_analysis'
|
21 |
-
'For information about the abstractive summarization :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr')
|
22 |
-
|
23 |
-
st.markdown("""
|
24 |
-
To use this:
|
25 |
-
- Number of sentences to be extracted is configurable
|
26 |
-
- Specify an URL to extract contents OR copy terms & conditions content and hit 'Summarize'
|
27 |
-
""")
|
28 |
-
|
29 |
-
|
30 |
-
@st.cache(allow_output_mutation=True,
|
31 |
-
suppress_st_warning=True,
|
32 |
-
show_spinner=False)
|
33 |
-
def create_pipeline():
|
34 |
-
with st.spinner('Please wait for the model to load...'):
|
35 |
-
terms_and_conditions_pipeline = pipeline(
|
36 |
-
task='summarization',
|
37 |
-
model='ml6team/distilbart-tos-summarizer-tosdr',
|
38 |
-
tokenizer='ml6team/distilbart-tos-summarizer-tosdr'
|
39 |
-
)
|
40 |
-
return terms_and_conditions_pipeline
|
41 |
-
|
42 |
-
|
43 |
-
def display_abstractive_summary(summary) -> None:
|
44 |
-
st.subheader("Abstractive Summary")
|
45 |
-
st.markdown('#####')
|
46 |
-
st.markdown(summary)
|
47 |
-
|
48 |
-
|
49 |
-
def display_extractive_summary(terms_and_conditions_sentences: list, summary_sentences: list) -> None:
|
50 |
-
st.subheader("Extractive Summary")
|
51 |
-
st.markdown('#####')
|
52 |
-
terms_and_conditions = " ".join(sentence for sentence in terms_and_conditions_sentences)
|
53 |
-
replaced_text = terms_and_conditions
|
54 |
-
for sentence in summary_sentences:
|
55 |
-
replaced_text = replaced_text.replace(sentence, f"<span style='background-color: #FFFF00'>{sentence}</span>")
|
56 |
-
st.write(replaced_text, unsafe_allow_html=True)
|
57 |
-
|
58 |
|
59 |
-
def
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
-
|
72 |
-
st.session_state['sentences_length'] = DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH
|
73 |
|
74 |
-
|
75 |
-
st.header("Input")
|
76 |
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
tc_text_input = st.text_area(
|
84 |
-
value=st.session_state.tc_text,
|
85 |
-
label='Terms & conditions content or specify an URL:',
|
86 |
-
height=240
|
87 |
-
)
|
88 |
|
89 |
-
|
|
|
90 |
|
91 |
-
|
|
|
92 |
|
93 |
-
if is_valid_url(tc_text_input):
|
94 |
-
(all_sentences, extract_summary_sentences) = summarizer.extractive_summary_from_url(tc_text_input,
|
95 |
-
sentences_length_input)
|
96 |
-
else:
|
97 |
-
(all_sentences, extract_summary_sentences) = summarizer.extractive_summary_from_text(tc_text_input,
|
98 |
-
sentences_length_input)
|
99 |
|
100 |
-
|
101 |
-
|
102 |
|
103 |
-
display_extractive_summary(all_sentences, extract_summary_sentences)
|
104 |
-
display_abstractive_summary(abstract_summary)
|
|
|
6 |
|
7 |
from Summarizer import Summarizer
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
+
def main() -> None:
|
11 |
+
nltk.download('punkt')
|
12 |
+
|
13 |
+
st.markdown('# Terms & conditions summarization :pencil:')
|
14 |
+
st.write('Do you also take the time out of your day to thoroughly read every word of the Terms & Conditions before signing up for a new app? :thinking_face: \nNo?'
|
15 |
+
'Well have we got a demo for you!'
|
16 |
+
'Just copy-paste the lengthy Terms & Conditions text or provide a URL to the text and let our fancy NLP algorithm do the rest!'
|
17 |
+
'You will see both an extractive summary (the most important sentences will be highlighted) and an abstractive summary (an actual summary)'
|
18 |
+
'The abstractive summarization is preceded by LSA (Latent Semantic Analysis) extractive summarization')
|
19 |
+
st.write('Want to find out more?'
|
20 |
+
'For information about the extractive summarization :point_right: https://en.wikipedia.org/wiki/Latent_semantic_analysis'
|
21 |
+
'For information about the abstractive summarization :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr')
|
22 |
+
|
23 |
+
st.markdown("""
|
24 |
+
To use this:
|
25 |
+
- Number of sentences to be extracted is configurable
|
26 |
+
- Specify an URL to extract contents OR copy terms & conditions content and hit 'Summarize'
|
27 |
+
""")
|
28 |
+
|
29 |
+
@st.cache(allow_output_mutation=True,
|
30 |
+
suppress_st_warning=True,
|
31 |
+
show_spinner=False)
|
32 |
+
def create_pipeline():
|
33 |
+
with st.spinner('Please wait for the model to load...'):
|
34 |
+
terms_and_conditions_pipeline = pipeline(
|
35 |
+
task='summarization',
|
36 |
+
model='ml6team/distilbart-tos-summarizer-tosdr',
|
37 |
+
tokenizer='ml6team/distilbart-tos-summarizer-tosdr'
|
38 |
+
)
|
39 |
+
return terms_and_conditions_pipeline
|
40 |
+
|
41 |
+
def display_abstractive_summary(summary) -> None:
|
42 |
+
st.subheader("Abstractive Summary")
|
43 |
+
st.markdown('#####')
|
44 |
+
st.markdown(summary)
|
45 |
+
|
46 |
+
def display_extractive_summary(terms_and_conditions_sentences: list, summary_sentences: list) -> None:
|
47 |
+
st.subheader("Extractive Summary")
|
48 |
+
st.markdown('#####')
|
49 |
+
terms_and_conditions = " ".join(sentence for sentence in terms_and_conditions_sentences)
|
50 |
+
replaced_text = terms_and_conditions
|
51 |
+
for sentence in summary_sentences:
|
52 |
+
replaced_text = replaced_text.replace(sentence,
|
53 |
+
f"<span style='background-color: #FFFF00'>{sentence}</span>")
|
54 |
+
st.write(replaced_text, unsafe_allow_html=True)
|
55 |
+
|
56 |
+
def is_valid_url(url: str) -> bool:
|
57 |
+
result = validators.url(url)
|
58 |
+
if isinstance(result, ValidationFailure):
|
59 |
+
return False
|
60 |
+
return True
|
61 |
+
|
62 |
+
summarizer: Summarizer = Summarizer(create_pipeline())
|
63 |
+
|
64 |
+
if 'tc_text' not in st.session_state:
|
65 |
+
st.session_state['tc_text'] = ''
|
66 |
+
|
67 |
+
if 'sentences_length' not in st.session_state:
|
68 |
+
st.session_state['sentences_length'] = Summarizer.DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH
|
69 |
+
|
70 |
+
st.write('<style>div.row-widget.stRadio > div{flex-direction:row;}</style>', unsafe_allow_html=True)
|
71 |
+
st.header("Input")
|
72 |
+
|
73 |
+
with st.form(key='terms-and-conditions'):
|
74 |
+
sentences_length_input = st.number_input(
|
75 |
+
label='Number of sentences to be extracted:',
|
76 |
+
min_value=1,
|
77 |
+
value=st.session_state.sentences_length
|
78 |
+
)
|
79 |
+
tc_text_input = st.text_area(
|
80 |
+
value=st.session_state.tc_text,
|
81 |
+
label='Terms & conditions content or specify an URL:',
|
82 |
+
height=240
|
83 |
+
)
|
84 |
|
85 |
+
submit_button = st.form_submit_button(label='Summarize')
|
|
|
86 |
|
87 |
+
if submit_button:
|
|
|
88 |
|
89 |
+
if is_valid_url(tc_text_input):
|
90 |
+
(all_sentences, extract_summary_sentences) = summarizer.extractive_summary_from_url(tc_text_input,
|
91 |
+
sentences_length_input)
|
92 |
+
else:
|
93 |
+
(all_sentences, extract_summary_sentences) = summarizer.extractive_summary_from_text(tc_text_input,
|
94 |
+
sentences_length_input)
|
|
|
|
|
|
|
|
|
|
|
95 |
|
96 |
+
extract_summary = " ".join([sentence for sentence in extract_summary_sentences])
|
97 |
+
abstract_summary = summarizer.abstractive_summary(extract_summary)
|
98 |
|
99 |
+
display_extractive_summary(all_sentences, extract_summary_sentences)
|
100 |
+
display_abstractive_summary(abstract_summary)
|
101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
+
if __name__ == "__main__":
|
104 |
+
main()
|
105 |
|
|
|
|
requirements.txt
CHANGED
@@ -2,7 +2,7 @@ nlpaug==1.1.7
|
|
2 |
streamlit
|
3 |
torch==1.9.1
|
4 |
torchvision==0.10.1
|
5 |
-
transformers
|
6 |
sumy==0.9.0
|
7 |
-
nltk
|
8 |
-
validators
|
|
|
2 |
streamlit
|
3 |
torch==1.9.1
|
4 |
torchvision==0.10.1
|
5 |
+
transformers==4.10.3
|
6 |
sumy==0.9.0
|
7 |
+
nltk==3.6.7
|
8 |
+
validators==0.18.2
|