Gladiator committed on
Commit
f3505bb
·
1 Parent(s): 4354680

Add abstractive summarization for both URL and plain-text input

Browse files
Files changed (2) hide show
  1. app.py +12 -2
  2. src/abstractive_summarizer.py +11 -16
app.py CHANGED
@@ -8,7 +8,10 @@ from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
8
  # local modules
9
  from extractive_summarizer.model_processors import Summarizer
10
  from src.utils import clean_text, fetch_article_text
11
- from src.abstractive_summarizer import abstractive_summarizer
 
 
 
12
 
13
  # abstractive summarizer model
14
  @st.cache()
@@ -62,8 +65,15 @@ if __name__ == "__main__":
62
  with st.spinner(
63
  text="Creating abstractive summary. This might take a few seconds ..."
64
  ):
 
 
65
  if not is_url:
66
- text_to_summarize = sent_tokenize(clean_txt)
 
 
 
 
 
67
 
68
  # abs_tokenizer, abs_model = load_abs_model()
69
  # summarized_text = abstractive_summarizer(
 
8
  # local modules
9
  from extractive_summarizer.model_processors import Summarizer
10
  from src.utils import clean_text, fetch_article_text
11
+ from src.abstractive_summarizer import (
12
+ abstractive_summarizer,
13
+ preprocess_text_for_abstractive_summarization,
14
+ )
15
 
16
  # abstractive summarizer model
17
  @st.cache()
 
65
  with st.spinner(
66
  text="Creating abstractive summary. This might take a few seconds ..."
67
  ):
68
+ text_to_summarize = clean_txt
69
+ abs_tokenizer, abs_model = load_abs_model()
70
  if not is_url:
71
+ text_to_summarize = preprocess_text_for_abstractive_summarization(
72
+ tokenizer=abs_tokenizer, text=clean_txt
73
+ )
74
+ summarized_text = abstractive_summarizer(
75
+ abs_tokenizer, abs_model, text_to_summarize
76
+ )
77
 
78
  # abs_tokenizer, abs_model = load_abs_model()
79
  # summarized_text = abstractive_summarizer(
src/abstractive_summarizer.py CHANGED
@@ -4,22 +4,17 @@ from transformers import T5Tokenizer
4
 
5
 
6
  def abstractive_summarizer(tokenizer, model, text):
7
- device = torch.device("cpu")
8
- preprocess_text = text.strip().replace("\n", "")
9
- t5_prepared_text = "summarize: " + preprocess_text
10
- tokenized_text = tokenizer.encode(t5_prepared_text, return_tensors="pt").to(device)
11
-
12
- # summmarize
13
- summary_ids = model.generate(
14
- tokenized_text,
15
- num_beams=4,
16
- no_repeat_ngram_size=2,
17
- min_length=30,
18
- max_length=300,
19
- early_stopping=True,
20
- )
21
- abs_summarized_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
22
-
23
  return abs_summarized_text
24
 
25
 
 
4
 
5
 
6
  def abstractive_summarizer(tokenizer, model, text):
7
+ # inputs to the model
8
+ inputs = [
9
+ tokenizer.encode(f"summarize: {chunk}", return_tensors="pt") for chunk in text
10
+ ]
11
+ abs_summarized_text = []
12
+ for input in inputs:
13
+ output = model.generate(**input)
14
+ tmp_sum = tokenizer.decode(*output, skip_special_tokens=True)
15
+ abs_summarized_text.append(tmp_sum)
16
+
17
+ abs_summarized_text = " ".join([summ for summ in abs_summarized_text])
 
 
 
 
 
18
  return abs_summarized_text
19
 
20