from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import metrics
import os
import nltk
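
# Note: `metrics` is assumed to be a small local helper module that provides
# num_sentences(), num_tokens(), num_words() and num_chars(); only these four
# functions are used in this file.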

def summarize(in_text):

    if len(in_text) == 0:
        return 'Error: No text provided', None, None, None, None

    # Make sure the nltk punkt tokenizer data is available (download it once if missing)
    nltk_file = os.path.join(os.path.expanduser('~'), 'nltk_data', 'tokenizers', 'punkt.zip')
    if os.path.exists(nltk_file):
        print('nltk punkt file exists at', nltk_file)
    else:
        print("downloading punkt file")
        nltk.download('punkt')

    # Discard all sentences that have 10 or fewer words in them
    in_longtext = []
    in_text_sentences = in_text.split('.')

    for sen in in_text_sentences:
        if len(sen.split()) > 10:
            in_longtext.append(sen)
    in_text = '.'.join(in_longtext) + '.'
    
    # The size of the summary is limited to 1024 tokens.
    # The LexRank algorithm accepts only a number of sentences as a limit,
    # so we start with one sentence and check the token count, then keep
    # increasing the number of sentences until the next summary would
    # exceed the token limit.
    target_tokens = 1024

    in_sents = metrics.num_sentences(in_text)

    out_text = get_Summary(in_text, 1)
    n_tokens = metrics.num_tokens(out_text)
    prev_out_text = out_text
    prev_n_tokens = n_tokens
    for sen in range(2, in_sents):
        if n_tokens >= target_tokens:
            # The current summary already exceeds the limit,
            # so fall back to the previous (shorter) one.
            out_text = prev_out_text
            n_tokens = prev_n_tokens
            break
        prev_out_text = out_text
        prev_n_tokens = n_tokens
        out_text = get_Summary(in_text, sen)
        n_tokens = metrics.num_tokens(out_text)
    
    n_sents = metrics.num_sentences(out_text)
    n_words = metrics.num_words(out_text)
    n_chars = metrics.num_chars(out_text)

    return out_text, n_words, n_sents, n_chars, n_tokens

def get_Summary(in_text, nr_sentences):

    if nr_sentences == 0:
        return 'Error: No sentences available'
    list_summary = get_Lexrank(in_text,nr_sentences)
    # It can happen that a LexRank "sentence" consists of multiple actual sentences
    # separated by full stops; in that case the corresponding timestamp cannot be
    # found. Therefore all items of the LexRank summary are concatenated and split
    # up again at the full stops.
    concat_list_summary = '. '.join([str(item).replace('.', '') for item in list_summary])
    concat_list_summary = concat_list_summary.replace('\\n','')
    concat_list_summary = concat_list_summary.replace('. ','.\n')+'.'
    
    return concat_list_summary

def get_Lexrank(text, nr_sentences):
    # Run sumy's LexRank summarizer and return the selected sentence objects
    summary = []
    LANGUAGE = "english"
    SENTENCES_COUNT = nr_sentences
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summary.append(sentence)

    return summary
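

# Minimal usage sketch (assumption: the local `metrics` helper module is
# importable and the nltk punkt data can be downloaded). This block is
# illustrative only and simply shows the return contract of summarize().
if __name__ == '__main__':
    sample_text = (
        "This is a sample transcript sentence that is long enough to survive the ten word filter. "
        "Here is another reasonably long sentence that also contains clearly more than ten words in it. "
        "Short sentences are dropped."
    )
    out_text, n_words, n_sents, n_chars, n_tokens = summarize(sample_text)
    print(out_text)
    print('words:', n_words, 'sentences:', n_sents, 'chars:', n_chars, 'tokens:', n_tokens)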