sdhanabal1 committed on
Commit
99b1da3
·
1 Parent(s): 338f4fe

Use tokenizer to split sentences

Browse files
Files changed (4) hide show
  1. Summarizer.py +35 -4
  2. app.py +2 -1
  3. requirements.txt +2 -1
  4. test_summarizer.py +26 -0
Summarizer.py CHANGED
@@ -1,4 +1,4 @@
1
- from textwrap import wrap
2
 
3
  from sumy.parsers import DocumentParser
4
  from sumy.parsers.html import HtmlParser
@@ -7,12 +7,14 @@ from sumy.nlp.tokenizers import Tokenizer
7
  from sumy.nlp.stemmers import Stemmer
8
  from sumy.summarizers.lsa import LsaSummarizer
9
  from sumy.utils import get_stop_words
10
- from transformers import Pipeline
11
 
12
 
13
  class Summarizer:
14
  DEFAULT_LANGUAGE = "english"
15
  DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
 
 
16
 
17
  def __init__(self, pipeline: Pipeline):
18
  self.pipeline = pipeline
@@ -27,6 +29,30 @@ class Summarizer:
27
  summarized_list.append(sentence._text)
28
  return summarized_list
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def __extractive_summary(self, parser: DocumentParser, sentences_count) -> list:
31
  summarized_sentences = self.lsa_summarizer(parser.document, sentences_count)
32
  summarized_list = Summarizer.sentence_list(summarized_sentences)
@@ -41,8 +67,13 @@ class Summarizer:
41
  return self.__extractive_summary(parser, sentences_count)
42
 
43
  def abstractive_summary(self, extract_summary_sentences: list) -> list:
44
- extract_summary = " ".join([sentence for sentence in extract_summary_sentences])
45
- wrapped_sentences = wrap(extract_summary, 2048)
 
 
 
 
 
46
  abstractive_summary_list = []
47
  for result in self.pipeline(wrapped_sentences, min_length=5, max_length=512):
48
  abstractive_summary_list.append(result['summary_text'])
 
1
+ import string
2
 
3
  from sumy.parsers import DocumentParser
4
  from sumy.parsers.html import HtmlParser
 
7
  from sumy.nlp.stemmers import Stemmer
8
  from sumy.summarizers.lsa import LsaSummarizer
9
  from sumy.utils import get_stop_words
10
+ from transformers import Pipeline, BertTokenizer
11
 
12
 
13
  class Summarizer:
14
  DEFAULT_LANGUAGE = "english"
15
  DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
16
+ TOKENIZER = BertTokenizer.from_pretrained('bert-base-cased')
17
+ STOP_WORDS = list(get_stop_words(language=DEFAULT_LANGUAGE)) + list(string.punctuation)
18
 
19
  def __init__(self, pipeline: Pipeline):
20
  self.pipeline = pipeline
 
29
  summarized_list.append(sentence._text)
30
  return summarized_list
31
 
32
+ @staticmethod
33
+ def join_sentences(summary_sentences: list) -> str:
34
+ return " ".join([sentence for sentence in summary_sentences])
35
+
36
+ @staticmethod
37
+ def split_sentences_by_token_length(summary_sentences: list, max_token_length: int) -> list:
38
+ accumulated_lists = []
39
+ result_list = []
40
+ cumulative_token_length = 0
41
+ for sentence in summary_sentences:
42
+ result_list.append(sentence)
43
+ token_list = Summarizer.TOKENIZER.tokenize(sentence)
44
+ token_words = [token for token in token_list if token.lower() not in Summarizer.STOP_WORDS]
45
+ token_length = len(token_words)
46
+ if token_length + cumulative_token_length >= max_token_length:
47
+ accumulated_lists.append(Summarizer.join_sentences(result_list))
48
+ result_list = []
49
+ cumulative_token_length = 0
50
+ else:
51
+ cumulative_token_length += token_length
52
+ if result_list:
53
+ accumulated_lists.append(Summarizer.join_sentences(result_list))
54
+ return accumulated_lists
55
+
56
  def __extractive_summary(self, parser: DocumentParser, sentences_count) -> list:
57
  summarized_sentences = self.lsa_summarizer(parser.document, sentences_count)
58
  summarized_list = Summarizer.sentence_list(summarized_sentences)
 
67
  return self.__extractive_summary(parser, sentences_count)
68
 
69
  def abstractive_summary(self, extract_summary_sentences: list) -> list:
70
+ """
71
+ :param extract_summary_sentences: Extractive summary of sentences after Latent semantic analysis
72
+ :return: List of abstractive summary of sentences after calling distilbart-tos-summarizer-tosdr tokenizer
73
+ """
74
+ wrapped_sentences = Summarizer.split_sentences_by_token_length(extract_summary_sentences,
75
+ max_token_length=1000)
76
+ # The ml6team/distilbart-tos-summarizer-tosdr tokenizer supports a max of 1024 tokens per input
77
  abstractive_summary_list = []
78
  for result in self.pipeline(wrapped_sentences, min_length=5, max_length=512):
79
  abstractive_summary_list.append(result['summary_text'])
app.py CHANGED
@@ -87,7 +87,8 @@ def main() -> None:
87
 
88
  sentences_length = st.number_input(
89
  label='Number of sentences to be extracted:',
90
- min_value=1,
 
91
  value=st.session_state.sentences_length
92
  )
93
  sample_choice = st.selectbox(
 
87
 
88
  sentences_length = st.number_input(
89
  label='Number of sentences to be extracted:',
90
+ min_value=5,
91
+ max_value=15,
92
  value=st.session_state.sentences_length
93
  )
94
  sample_choice = st.selectbox(
requirements.txt CHANGED
@@ -5,4 +5,5 @@ torchvision==0.10.1
5
  transformers==4.10.3
6
  sumy==0.9.0
7
  nltk==3.6.7
8
- validators==0.18.2
 
 
5
  transformers==4.10.3
6
  sumy==0.9.0
7
  nltk==3.6.7
8
+ validators==0.18.2
9
+ pytest==6.2.5
test_summarizer.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from Summarizer import Summarizer
2
+
3
+
4
+ def test_split_sentences_by_token_length():
5
+ summary_sentences = [
6
+ 'Python is a programming language.',
7
+ 'Memory allocation.',
8
+ 'Free.'
9
+ ]
10
+
11
+ split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=3)
12
+ assert split_sentences == [
13
+ 'Python is a programming language.',
14
+ 'Memory allocation. Free.'
15
+ ]
16
+
17
+ split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=5)
18
+ assert split_sentences == [
19
+ 'Python is a programming language. Memory allocation.',
20
+ 'Free.'
21
+ ]
22
+
23
+ split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=10)
24
+ assert split_sentences == [
25
+ 'Python is a programming language. Memory allocation. Free.'
26
+ ]