from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import metrics
import nltk


def summarize(in_text):
    if len(in_text) == 0:
        return 'Error: No text provided', None

    # Make sure the NLTK punkt tokenizer is available; download it once if it is missing.
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        print('downloading punkt tokenizer')
        nltk.download('punkt')

    # Discard all sentences that have fewer than 10 words in them.
    in_longtext = []
    in_text_sentences = in_text.split('.')
    for sen in in_text_sentences:
        if len(sen.split()) > 10:
            in_longtext.append(sen)
    in_text = '.'.join(in_longtext) + '.'

    # The size of the summary is limited to 1024 tokens.
    # The LexRank algorithm accepts only a sentence count as a limit,
    # so we start with one sentence and check the token size, then
    # increase the number of sentences until adding the next sentence
    # would exceed the limit.
    target_tokens = 1024
    in_sents = metrics.num_sentences(in_text)
    out_text = get_Summary(in_text, 1)
    n_tokens = metrics.num_tokens(out_text)
    prev_out_text = out_text
    prev_n_tokens = n_tokens
    for nr_sentences in range(2, in_sents):
        if n_tokens >= target_tokens:
            # Fall back to the last summary that was still within the limit.
            out_text = prev_out_text
            n_tokens = prev_n_tokens
            break
        prev_out_text = out_text
        prev_n_tokens = n_tokens
        out_text = get_Summary(in_text, nr_sentences)
        n_tokens = metrics.num_tokens(out_text)

    n_sents = metrics.num_sentences(out_text)
    n_words = metrics.num_words(out_text)
    n_chars = metrics.num_chars(out_text)
    return out_text, n_words, n_sents, n_chars, n_tokens


def get_Summary(in_text, nr_sentences):
    if nr_sentences == 0:
        return 'Error: No sentences available', None
    list_summary = get_Lexrank(in_text, nr_sentences)
    # It can happen that for LexRank a "sentence" consists of multiple actual
    # sentences separated by full stops; then the corresponding timestamp
    # cannot be found. Therefore all items from the LexRank summary are
    # concatenated and split up again by full stops.
    concat_list_summary = '. '.join(str(item).replace('.', '') for item in list_summary)
    concat_list_summary = concat_list_summary.replace('\\n', '')
    concat_list_summary = concat_list_summary.replace('. ', '.\n') + '.'
    return concat_list_summary


def get_Lexrank(text, nr_sentences):
    # Run sumy's LexRank summarizer and return the selected sentences.
    LANGUAGE = "english"
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    summary = []
    for sentence in summarizer(parser.document, nr_sentences):
        summary.append(sentence)
    return summary
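

# --- Usage sketch (illustrative only, assumptions noted) ---------------------
# Assumes the local `metrics` helper module imported above is available and
# exposes num_sentences, num_tokens, num_words and num_chars as used in
# summarize(). The sample text below is made up; real input would typically
# be a much longer transcript.
if __name__ == '__main__':
    sample_text = (
        "The quick brown fox jumps over the lazy dog while the farmer watches "
        "quietly from the old wooden porch near the barn. "
        "Later that afternoon the same fox returns to the meadow and chases "
        "three rabbits across the field until the sun finally sets. "
        "A summary of a longer transcript would normally contain many more "
        "sentences than this small illustrative example provides here."
    )
    result = summarize(sample_text)
    if result[1] is None:
        # Error case: summarize() returned ('Error: ...', None).
        print(result[0])
    else:
        out_text, n_words, n_sents, n_chars, n_tokens = result
        print(out_text)
        print(f'words={n_words} sentences={n_sents} chars={n_chars} tokens={n_tokens}')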