# wldmr's picture
# init
# 68d26c9
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import metrics
import os
import nltk
def _ensure_punkt_available():
    """Download the NLTK 'punkt' tokenizer data unless the expected zip file exists."""
    nltk_file = '/home/user/nltk_data/tokenizers/punkt.zip'
    if os.path.exists(nltk_file):
        print('nltk punkt file exists in ', nltk_file)
    else:
        print("downloading punkt file")
        nltk.download('punkt')


def _drop_short_sentences(text):
    """Keep only the '.'-separated sentences of `text` that have more than 10 words."""
    long_sentences = [
        sentence for sentence in text.split('.') if len(sentence.split()) > 10
    ]
    return '.'.join(long_sentences) + '.'


def summarize(in_text):
    """Build a LexRank summary of `in_text` that stays under a token budget.

    Sentences with 10 words or fewer are discarded first. The summary is then
    grown one sentence at a time; the longest summary whose token count (as
    reported by ``metrics.num_tokens``) is still below 1024 is returned.

    Args:
        in_text: The text to summarize.

    Returns:
        A 5-tuple ``(out_text, n_words, n_sents, n_chars, n_tokens)`` where the
        counts are computed by the ``metrics`` module on ``out_text``.
        For empty input the 2-tuple ``('Error: No text provided', None)`` is
        returned instead (kept as-is for backward compatibility).
    """
    if not in_text:
        return 'Error: No text provided', None

    _ensure_punkt_available()
    in_text = _drop_short_sentences(in_text)

    # The size of the summary is limited to 1024 tokens, but LexRank only
    # accepts a number of sentences as its limit. So start with one sentence
    # and keep adding sentences while the summary stays below the budget.
    target_tokens = 1024
    in_sents = metrics.num_sentences(in_text)

    out_text = get_Summary(in_text, 1)
    n_tokens = metrics.num_tokens(out_text)

    # NOTE(review): the upper bound is exclusive, as in the original code, so
    # a summary made of *all* input sentences is never tried -- confirm intent.
    for nr_sentences in range(2, in_sents):
        if n_tokens >= target_tokens:
            # Even the current summary is at/over budget (only possible for
            # the one-sentence summary); there is nothing shorter to return.
            break
        candidate = get_Summary(in_text, nr_sentences)
        candidate_tokens = metrics.num_tokens(candidate)
        if candidate_tokens >= target_tokens:
            # Adding this sentence would break the budget: keep the previous
            # summary so the returned text and its token count stay in sync.
            break
        out_text, n_tokens = candidate, candidate_tokens

    n_sents = metrics.num_sentences(out_text)
    n_words = metrics.num_words(out_text)
    n_chars = metrics.num_chars(out_text)
    return out_text, n_words, n_sents, n_chars, n_tokens
def get_Summary(in_text, nr_sentences):
    """Return a LexRank summary of `in_text` with one sentence per line.

    Args:
        in_text: The text to summarize.
        nr_sentences: Number of sentences requested from LexRank.

    Returns:
        The summary as a single string, every sentence terminated by a full
        stop and separated from the next by a newline. When `nr_sentences`
        is 0 the tuple ``('Error: No sentences available', None)`` is
        returned instead.
    """
    if nr_sentences == 0:
        return 'Error: No sentences available', None

    ranked_items = get_Lexrank(in_text, nr_sentences)

    # A single LexRank item can contain several real sentences separated by
    # full stops, which prevents matching it to a timestamp later on. Strip
    # the full stops inside every item so that the only ones left are the
    # separators inserted between items.
    without_stops = []
    for ranked in ranked_items:
        without_stops.append(str(ranked).replace('.', ''))
    summary = '. '.join(without_stops)

    # Removes the literal two-character sequence backslash + 'n',
    # not an actual newline character.
    summary = summary.replace('\\n', '')

    # Put each sentence on its own line and close the last one.
    summary = summary.replace('. ', '.\n')
    return summary + '.'
def get_Lexrank(text, nr_sentences):
    """Run sumy's LexRank summarizer on `text`.

    Args:
        text: Plain text to summarize; parsed and tokenized as English.
        nr_sentences: Number of sentences passed to the summarizer as the
            requested summary length.

    Returns:
        A list of the items produced by the summarizer (callers convert them
        with ``str()``).
    """
    language = "english"

    parser = PlaintextParser.from_string(text, Tokenizer(language))
    lexrank = LexRankSummarizer(Stemmer(language))
    lexrank.stop_words = get_stop_words(language)

    return list(lexrank(parser.document, nr_sentences))