import os

import nltk

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
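# `metrics` is a small local helper module that belongs to this Space but is
# not shown in this file. The fallback below is a hypothetical sketch of the
# four counters it is assumed to expose, inferred only from how they are
# called in summarize(); the real module may differ (num_tokens in particular
# may count model tokens with a transformer tokenizer rather than nltk word
# tokens).
try:
    import metrics
except ImportError:
    import types

    metrics = types.SimpleNamespace(
        # Sentence count via nltk's Punkt sentence splitter.
        num_sentences=lambda text: len(nltk.sent_tokenize(text)),
        # Token count, assumed here to mean nltk word tokens.
        num_tokens=lambda text: len(nltk.word_tokenize(text)),
        # Whitespace-separated word count.
        num_words=lambda text: len(text.split()),
        # Raw character count.
        num_chars=len,
    )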
def summarize(in_text):
    if len(in_text) == 0:
        return 'Error: No text provided', None, None, None, None
    # Make sure the nltk punkt tokenizer data is available.
    nltk_file = '/home/user/nltk_data/tokenizers/punkt.zip'
    if os.path.exists(nltk_file):
        print('nltk punkt file exists in', nltk_file)
    else:
        print('downloading punkt file')
        nltk.download('punkt')
    # Discard all sentences that have 10 words or fewer.
    in_longtext = []
    for sen in in_text.split('.'):
        if len(sen.split()) > 10:
            in_longtext.append(sen)
    in_text = '.'.join(in_longtext) + '.'
    # The size of the summary is limited to 1024 tokens.
    # The LexRank algorithm accepts only a number of sentences as a limit,
    # so we start with one sentence and check the token count, then keep
    # increasing the number of sentences until the next summary would
    # exceed the limit.
    target_tokens = 1024
    in_sents = metrics.num_sentences(in_text)
    out_text = get_Summary(in_text, 1)
    n_tokens = metrics.num_tokens(out_text)
    prev_out_text = out_text
    prev_n_tokens = n_tokens
    for sen in range(2, in_sents + 1):
        if n_tokens >= target_tokens:
            # The latest summary went over the limit; fall back to the
            # previous one so the text and its token count stay in sync.
            out_text = prev_out_text
            n_tokens = prev_n_tokens
            break
        prev_out_text = out_text
        prev_n_tokens = n_tokens
        out_text = get_Summary(in_text, sen)
        n_tokens = metrics.num_tokens(out_text)
    n_sents = metrics.num_sentences(out_text)
    n_words = metrics.num_words(out_text)
    n_chars = metrics.num_chars(out_text)
    return out_text, n_words, n_sents, n_chars, n_tokens
def get_Summary(in_text, nr_sentences):
    if nr_sentences == 0:
        return 'Error: No sentences available'
    list_summary = get_Lexrank(in_text, nr_sentences)
    # A LexRank "sentence" can consist of multiple actual sentences that are
    # separated by full stops; the corresponding timestamp can then not be
    # found. Therefore all items of the LexRank summary are concatenated and
    # split up again at full stops, one sentence per line. For example, the
    # two items "Foo bar. Baz" and "Qux quux" become "Foo bar Baz.\nQux quux."
    concat_list_summary = '. '.join([str(item).replace('.', '') for item in list_summary])
    concat_list_summary = concat_list_summary.replace('\n', '')
    concat_list_summary = concat_list_summary.replace('. ', '.\n') + '.'
    return concat_list_summary
def get_Lexrank(text, nr_sentences):
    # Run sumy's LexRank summarizer and return the top-ranked sentences.
    LANGUAGE = 'english'
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return list(summarizer(parser.document, nr_sentences))
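# Minimal usage sketch (an assumption: the actual Space presumably wires
# summarize() into a web UI instead). Feed in plain text and print the
# summary together with the counters summarize() returns.
if __name__ == '__main__':
    sample = (
        'This is a deliberately long first sentence that clearly has more '
        'than ten words so it survives the length filter. Short one. '
        'Here is another deliberately long sentence with easily more than '
        'ten words so that LexRank has at least two candidates to rank.'
    )
    out_text, n_words, n_sents, n_chars, n_tokens = summarize(sample)
    print(out_text)
    print(f'{n_sents} sentences, {n_words} words, {n_chars} chars, {n_tokens} tokens')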