Commit f60d1c6
Paula Leonova committed
Parent(s): 51fcc5c

Add keyBERT in order to generate top keywords
app.py
CHANGED
@@ -51,7 +51,7 @@ with st.form(key='my_form'):
 
 
 
-with st.spinner('Loading pretrained summarizer and classifier mnli model...'):
+with st.spinner('Loading pretrained models...'):
     start = time.time()
     summarizer = md.load_summary_model()
     s_time = round(time.time() - start,4)
@@ -60,13 +60,11 @@ with st.spinner('Loading pretrained summarizer and classifier mnli model...'):
     classifier = md.load_model()
     c_time = round(time.time() - start,4)
 
-
-
-
-    # start = time.time()
-    # classifier = md.load_model()
-    # st.success(f'Time taken to load classifier mnli model: {round(time.time() - start,4)} seconds')
+    start = time.time()
+    kw_model = md.load_keyword_model()
+    k_time = round(time.time() - start,4)
 
+    st.success(f'Time taken to load BART summarizer mnli model: {s_time}s & BART classifier mnli model: {c_time}s & KeyBERT model: {k_time}s')
 
 if submit_button:
     if len(text_input) == 0:
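The timing block above assumes md.load_summary_model() and md.load_model() return cached Hugging Face pipelines. A minimal sketch of what those loaders might look like, assuming transformers pipelines and the BART checkpoints implied by the st.success message (the checkpoint names are assumptions, not shown in this diff):

import streamlit as st
from transformers import pipeline

@st.cache(allow_output_mutation=True)
def load_summary_model():
    # Assumed checkpoint; the diff only says "BART summarizer"
    return pipeline("summarization", model="facebook/bart-large-cnn")

@st.cache(allow_output_mutation=True)
def load_model():
    # Assumed checkpoint; the diff only says "BART classifier mnli"
    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

The st.cache decorator mirrors the one on load_keyword_model in models.py below, so each model loads once per session rather than on every Streamlit rerun.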
@@ -80,22 +78,31 @@ if submit_button:
         for n in range(0, len(nested_sentences)):
             tc = " ".join(map(str, nested_sentences[n]))
             text_chunks.append(tc)
+
+        if gen_keywords == 'Yes':
+            st.markdown("### Top Keywords")
+            with st.spinner("Generating keywords from text..."):
+
+                kw_df = pd.DataFrame()
+                for text_chunk in text_chunks:
+                    keywords_list = md.keyword_gen(kw_model, text_chunk)
+                    kw_df = kw_df.append(pd.DataFrame(keywords_list))
+                kw_df.columns = ['keyword', 'score']
+                top_kw_df = kw_df.groupby('keyword')['score'].max().reset_index()
+
+                top_kw_df = top_kw_df.sort_values('score', ascending = False).reset_index().drop(['index'], axis=1)
+                st.dataframe(top_kw_df.head(10))
 
+        st.markdown("### Text Chunk & Summaries")
         with st.spinner('Generating summaries for text chunks...'):
 
-            my_expander = st.expander(label='Expand to see summary generation details')
+            my_expander = st.expander(label='Expand to see intermediate summary generation details')
             with my_expander:
                 summary = []
-
-
-
-
-
-                # # For each chunk of sentences (within the token max), generate a summary
-                # for n in range(0, len(nested_sentences)):
-                #     text_chunk = " ".join(map(str, nested_sentences[n]))
-                #     st.markdown(f"###### Original Text Chunk {n+1}/{len(nested_sentences)}" )
-                #     st.markdown(text_chunk)
+
+                st.markdown("_The original text is broken into chunks with complete sentences totaling \
+                fewer than 1024 tokens, a requirement for the summarizer. Each block of text is then summarized separately \
+                and then combined at the very end to generate the final summary._")
 
             for num_chunk, text_chunk in enumerate(text_chunks):
                 st.markdown(f"###### Original Text Chunk {num_chunk+1}/{len(text_chunks)}" )
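One caveat about the keyword-aggregation loop added in this hunk: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so on current pandas the same aggregation needs pd.concat. A sketch of the equivalent logic, assuming md.keyword_gen returns (keyword, score) tuples as KeyBERT's extract_keywords does:

import pandas as pd

# One DataFrame per chunk, concatenated once instead of repeated append
frames = [pd.DataFrame(md.keyword_gen(kw_model, chunk), columns=['keyword', 'score'])
          for chunk in text_chunks]
kw_df = pd.concat(frames, ignore_index=True)

# Keep each keyword's best score across chunks, highest first
top_kw_df = (kw_df.groupby('keyword', as_index=False)['score'].max()
                  .sort_values('score', ascending=False, ignore_index=True))
st.dataframe(top_kw_df.head(10))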
@@ -108,15 +115,9 @@ if submit_button:
                 # Combine all the summaries into a list and compress into one document, again
                 final_summary = " \n\n".join(list(summary))
 
-                # final_summary = md.summarizer_gen(summarizer, sequence=text_input, maximum_tokens = 30, minimum_tokens = 100)
                 st.markdown("### Combined Summary")
                 st.markdown(final_summary)
 
-    # if gen_keywords == 'Yes':
-    #     st.markdown("### Top Keywords")
-    #     with st.spinner("Generating keywords from text...")
-    #         keywords =
-
     if len(text_input) == 0 or len(labels) == 0:
         st.write('Enter some text and at least one possible topic to see predictions.')
     else:
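The expander note added above says the input is first broken into sentence chunks of fewer than 1024 tokens. The chunker itself, create_nest_sentences, is not touched by this commit (it only appears as hunk context in models.py below), but a hypothetical sketch of that approach, using whitespace token counts as a rough stand-in for the model tokenizer, could look like:

import re

def create_nest_sentences(document: str, token_max_length: int = 1024):
    # Group whole sentences into chunks that stay under the token budget
    nested, current, length = [], [], 0
    for sentence in re.split(r'(?<=[.!?])\s+', document.strip()):
        n_tokens = len(sentence.split())  # crude proxy for real token counts
        if length + n_tokens > token_max_length and current:
            nested.append(current)
            current, length = [], 0
        current.append(sentence)
        length += n_tokens
    if current:
        nested.append(current)
    return nested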
models.py
CHANGED
@@ -34,9 +34,9 @@ def create_nest_sentences(document:str, token_max_length = 1024):
 @st.cache(allow_output_mutation=True)
 def load_keyword_model():
     kw_model = KeyBERT()
-    return
+    return kw_model
 
-def keyword_gen(sequence:str):
+def keyword_gen(kw_model, sequence:str):
     keywords = kw_model.extract_keywords(sequence,
                                          keyphrase_ngram_range=(1, 1),
                                          stop_words='english',
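For reference, the KeyBERT call that keyword_gen wraps can be exercised standalone. extract_keywords returns a list of (keyword, score) tuples, which is why app.py can pass the result directly to pd.DataFrame; top_n below is an illustrative choice, not a value from this diff:

from keybert import KeyBERT

kw_model = KeyBERT()
doc = "Streamlit turns data scripts into shareable web apps in pure Python."
keywords = kw_model.extract_keywords(doc,
                                     keyphrase_ngram_range=(1, 1),
                                     stop_words='english',
                                     top_n=10)
print(keywords)  # e.g. [('streamlit', 0.62), ('apps', 0.45), ...]; higher score = more representative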