Spaces:
Build error
Build error
sdhanabal1
committed on
Commit
·
99b1da3
1
Parent(s):
338f4fe
Use tokenizer to split sentences
Browse files- Summarizer.py +35 -4
- app.py +2 -1
- requirements.txt +2 -1
- test_summarizer.py +26 -0
Summarizer.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
|
2 |
|
3 |
from sumy.parsers import DocumentParser
|
4 |
from sumy.parsers.html import HtmlParser
|
@@ -7,12 +7,14 @@ from sumy.nlp.tokenizers import Tokenizer
|
|
7 |
from sumy.nlp.stemmers import Stemmer
|
8 |
from sumy.summarizers.lsa import LsaSummarizer
|
9 |
from sumy.utils import get_stop_words
|
10 |
-
from transformers import Pipeline
|
11 |
|
12 |
|
13 |
class Summarizer:
|
14 |
DEFAULT_LANGUAGE = "english"
|
15 |
DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
|
|
|
|
|
16 |
|
17 |
def __init__(self, pipeline: Pipeline):
|
18 |
self.pipeline = pipeline
|
@@ -27,6 +29,30 @@ class Summarizer:
|
|
27 |
summarized_list.append(sentence._text)
|
28 |
return summarized_list
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
def __extractive_summary(self, parser: DocumentParser, sentences_count) -> list:
|
31 |
summarized_sentences = self.lsa_summarizer(parser.document, sentences_count)
|
32 |
summarized_list = Summarizer.sentence_list(summarized_sentences)
|
@@ -41,8 +67,13 @@ class Summarizer:
|
|
41 |
return self.__extractive_summary(parser, sentences_count)
|
42 |
|
43 |
def abstractive_summary(self, extract_summary_sentences: list) -> list:
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
46 |
abstractive_summary_list = []
|
47 |
for result in self.pipeline(wrapped_sentences, min_length=5, max_length=512):
|
48 |
abstractive_summary_list.append(result['summary_text'])
|
|
|
1 |
+
import string
|
2 |
|
3 |
from sumy.parsers import DocumentParser
|
4 |
from sumy.parsers.html import HtmlParser
|
|
|
7 |
from sumy.nlp.stemmers import Stemmer
|
8 |
from sumy.summarizers.lsa import LsaSummarizer
|
9 |
from sumy.utils import get_stop_words
|
10 |
+
from transformers import Pipeline, BertTokenizer
|
11 |
|
12 |
|
13 |
class Summarizer:
|
14 |
DEFAULT_LANGUAGE = "english"
|
15 |
DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
|
16 |
+
TOKENIZER = BertTokenizer.from_pretrained('bert-base-cased')
|
17 |
+
STOP_WORDS = list(get_stop_words(language=DEFAULT_LANGUAGE)) + list(string.punctuation)
|
18 |
|
19 |
def __init__(self, pipeline: Pipeline):
|
20 |
self.pipeline = pipeline
|
|
|
29 |
summarized_list.append(sentence._text)
|
30 |
return summarized_list
|
31 |
|
32 |
+
@staticmethod
def join_sentences(summary_sentences: list) -> str:
    """
    Concatenate sentences into a single string separated by single spaces.

    :param summary_sentences: Sentences (strings) to combine, in order
    :return: The sentences joined with " "; an empty string for an empty list
    """
    # str.join consumes the list directly; no intermediate copy is needed.
    return " ".join(summary_sentences)
|
35 |
+
|
36 |
+
@staticmethod
def split_sentences_by_token_length(summary_sentences: list, max_token_length: int) -> list:
    """
    Group consecutive sentences into chunks according to a token budget.

    Each sentence is tokenized with Summarizer.TOKENIZER; only tokens whose
    lower-cased form is not in Summarizer.STOP_WORDS are counted.

    A sentence is added to the current chunk before the budget is checked, so
    the sentence that reaches the budget stays in the chunk it closes. The
    counted total of a chunk can therefore reach or exceed max_token_length.

    :param summary_sentences: Sentences (strings) to group, in order
    :param max_token_length: Counted-token total at which the current chunk is closed
    :return: List of strings, each the space-joined sentences of one chunk
    """
    # Build the lookup once per call: set membership is O(1), whereas the
    # STOP_WORDS list would be scanned for every single token.
    ignored_tokens = set(Summarizer.STOP_WORDS)

    chunks = []
    pending_sentences = []
    pending_token_count = 0
    for sentence in summary_sentences:
        pending_sentences.append(sentence)
        counted = sum(
            1
            for token in Summarizer.TOKENIZER.tokenize(sentence)
            if token.lower() not in ignored_tokens
        )
        if pending_token_count + counted >= max_token_length:
            # Budget reached: emit the chunk (including this sentence) and start over.
            chunks.append(Summarizer.join_sentences(pending_sentences))
            pending_sentences = []
            pending_token_count = 0
        else:
            pending_token_count += counted

    # Emit whatever is left over after the last sentence.
    if pending_sentences:
        chunks.append(Summarizer.join_sentences(pending_sentences))
    return chunks
|
55 |
+
|
56 |
def __extractive_summary(self, parser: DocumentParser, sentences_count) -> list:
|
57 |
summarized_sentences = self.lsa_summarizer(parser.document, sentences_count)
|
58 |
summarized_list = Summarizer.sentence_list(summarized_sentences)
|
|
|
67 |
return self.__extractive_summary(parser, sentences_count)
|
68 |
|
69 |
def abstractive_summary(self, extract_summary_sentences: list) -> list:
|
70 |
+
"""
|
71 |
+
:param extract_summary_sentences: Extractive summary of sentences after Latent semantic analysis
|
72 |
+
:return: List of abstractive summaries produced by running the distilbart-tos-summarizer-tosdr summarization pipeline
|
73 |
+
"""
|
74 |
+
wrapped_sentences = Summarizer.split_sentences_by_token_length(extract_summary_sentences,
|
75 |
+
max_token_length=1000)
|
76 |
+
# The ml6team/distilbart-tos-summarizer-tosdr tokenizer supports a max of 1024 tokens per input
|
77 |
abstractive_summary_list = []
|
78 |
for result in self.pipeline(wrapped_sentences, min_length=5, max_length=512):
|
79 |
abstractive_summary_list.append(result['summary_text'])
|
app.py
CHANGED
@@ -87,7 +87,8 @@ def main() -> None:
|
|
87 |
|
88 |
sentences_length = st.number_input(
|
89 |
label='Number of sentences to be extracted:',
|
90 |
-
min_value=
|
|
|
91 |
value=st.session_state.sentences_length
|
92 |
)
|
93 |
sample_choice = st.selectbox(
|
|
|
87 |
|
88 |
sentences_length = st.number_input(
|
89 |
label='Number of sentences to be extracted:',
|
90 |
+
min_value=5,
|
91 |
+
max_value=15,
|
92 |
value=st.session_state.sentences_length
|
93 |
)
|
94 |
sample_choice = st.selectbox(
|
requirements.txt
CHANGED
@@ -5,4 +5,5 @@ torchvision==0.10.1
|
|
5 |
transformers==4.10.3
|
6 |
sumy==0.9.0
|
7 |
nltk==3.6.7
|
8 |
-
validators==0.18.2
|
|
|
|
5 |
transformers==4.10.3
|
6 |
sumy==0.9.0
|
7 |
nltk==3.6.7
|
8 |
+
validators==0.18.2
|
9 |
+
pytest==6.2.5
|
test_summarizer.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from Summarizer import Summarizer
|
2 |
+
|
3 |
+
|
4 |
+
def test_split_sentences_by_token_length():
    """Check how three fixed sentences are grouped for token budgets of 3, 5 and 10."""
    summary_sentences = [
        'Python is a programming language.',
        'Memory allocation.',
        'Free.',
    ]

    # (max_token_length, expected chunks) pairs, checked in order.
    expectations = (
        (3, ['Python is a programming language.', 'Memory allocation. Free.']),
        (5, ['Python is a programming language. Memory allocation.', 'Free.']),
        (10, ['Python is a programming language. Memory allocation. Free.']),
    )

    for budget, expected_chunks in expectations:
        split_sentences = Summarizer.split_sentences_by_token_length(
            summary_sentences, max_token_length=budget
        )
        assert split_sentences == expected_chunks
|