|
import torch |
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline |
|
|
|
import spacy |
|
nlp = spacy.load('en_core_web_sm') |
|
|
|
|
|
def create_nest_sentences(document: str, token_max_length=1024):
    """Group the sentences of ``document`` into chunks bounded by a token budget.

    The text is split into sentences with the module-level spaCy pipeline
    ``nlp``. Each sentence is measured with the 'facebook/bart-large-mnli'
    tokenizer, and consecutive sentences are packed into the same chunk while
    the chunk's running token total stays strictly below ``token_max_length``.

    Every sentence ends up in exactly one chunk: a sentence that would push
    the current chunk to the limit starts the next chunk instead of being
    discarded.

    Args:
        document: Raw text to split.
        token_max_length: Exclusive upper bound on the summed token count of
            the sentences placed in one chunk.

    Returns:
        A list of chunks in document order. Each chunk is a non-empty list of
        the sentence objects yielded by ``nlp(document).sents``. A document
        that yields no sentences gives an empty list. A single sentence whose
        own token count reaches the limit is emitted as a one-sentence chunk
        (it cannot be split here), which is the only case where a chunk may
        reach or exceed ``token_max_length``.
    """
    # NOTE(review): the tokenizer is re-created on every call; consider
    # caching it at the call site if this function is invoked repeatedly.
    tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
    doc = nlp(document)

    nested = []
    chunk = []
    chunk_length = 0

    for sentence in doc.sents:
        # Index 0 selects the encoding of the single input string; its length
        # is used as the sentence's token count.
        # NOTE(review): integer indexing presumably requires a fast tokenizer
        # - confirm against the installed transformers version.
        encoded = tokenizer(str(sentence), truncation=False, padding=False)[0]
        sentence_length = len(encoded)

        # Close the current chunk *before* adding a sentence that would take
        # it to the limit, so the sentence opens the next chunk rather than
        # being lost. The `chunk` guard avoids emitting an empty chunk when
        # the very first sentence is already over budget.
        if chunk and chunk_length + sentence_length >= token_max_length:
            nested.append(chunk)
            chunk = []
            chunk_length = 0

        chunk.append(sentence)
        chunk_length += sentence_length

    # Flush whatever is left after the last sentence.
    if chunk:
        nested.append(chunk)
    return nested
|
|
|
|
|
def load_summary_model():
    """Build and return a ``summarization`` pipeline.

    The pipeline is created from the "facebook/bart-large-mnli" checkpoint.

    NOTE(review): the checkpoint name suggests an MNLI fine-tune rather than
    a summarization one - confirm this choice is intentional.
    """
    return pipeline(task='summarization', model="facebook/bart-large-mnli")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def summarizer_gen(summarizer, sequence: str, maximum_tokens: int, minimum_tokens: int):
    """Summarize ``sequence`` with ``summarizer`` and return the summary text.

    Args:
        summarizer: Callable invoked as ``summarizer(sequence, **options)``;
            it must return an indexable result whose first element has a
            ``get`` method (e.g. a list of dicts).
        sequence: Text to summarize.
        maximum_tokens: Forwarded to the summarizer as ``max_length``.
        minimum_tokens: Forwarded to the summarizer as ``min_length``.

    Returns:
        The ``'summary_text'`` entry of the first result, or ``None`` if that
        key is absent.
    """
    # Fixed decoding settings: 4 beams, sampling disabled.
    generation_options = {
        'num_beams': 4,
        'max_length': maximum_tokens,
        'min_length': minimum_tokens,
        'do_sample': False,
    }
    results = summarizer(sequence, **generation_options)
    first_result = results[0]
    return first_result.get('summary_text')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_model():
    """Build and return a ``zero-shot-classification`` pipeline.

    Both the tokenizer and the sequence-classification model are loaded from
    the "facebook/bart-large-mnli" checkpoint, and the pipeline is created
    with the ``'pt'`` framework.
    """
    checkpoint = "facebook/bart-large-mnli"
    # Load the tokenizer first, then the model, then wire both into the pipeline.
    components = {
        'tokenizer': AutoTokenizer.from_pretrained(checkpoint),
        'model': AutoModelForSequenceClassification.from_pretrained(checkpoint),
    }
    return pipeline(task='zero-shot-classification', framework='pt', **components)
|
|
|
def classifier_zero(classifier, sequence: str, labels: list, multi_class: bool):
    """Classify ``sequence`` against ``labels`` and return labels with scores.

    Args:
        classifier: Callable invoked as
            ``classifier(sequence, labels, multi_label=multi_class)``; its
            result must support lookup by the keys ``'labels'`` and
            ``'scores'``.
        sequence: Text to classify.
        labels: Candidate labels passed through to the classifier.
        multi_class: Forwarded to the classifier as ``multi_label``.

    Returns:
        A 2-tuple ``(labels, scores)`` taken from the classifier's result.
    """
    prediction = classifier(sequence, labels, multi_label=multi_class)
    return tuple(prediction[field] for field in ('labels', 'scores'))
|
|
|
|