Spaces:
Runtime error
Runtime error
File size: 4,823 Bytes
4354680 fe021fb cf53b75 d97bcce 4df3ec6 4b21134 e9ee3ed f3505bb 462dc3c f3505bb cf53b75 6514554 ee790d1 cf53b75 e36f01a f39343a 2fdb3df 4b21134 e2a202c ea60296 8381f6e e2a202c ea60296 8381f6e e2a202c 121b578 32ff21e 4354680 e7fc023 d97bcce bcdeb67 32ff21e 121b578 f39343a fe021fb 0f104d5 727bc47 0f104d5 79d5beb 5e2bb2b 79d5beb fe021fb ee790d1 462dc3c ee790d1 fe021fb ee790d1 4b21134 4df3ec6 4354680 ee790d1 4354680 ee790d1 4df3ec6 f39343a 4df3ec6 b916752 ee790d1 cf93567 ee790d1 4df3ec6 4b21134 6f0c363 48200ac 4df3ec6 4b21134 ee790d1 890cbac c099517 890cbac 097245e 0c2753a f3505bb ee790d1 f3505bb af880ff 32ff21e 4354680 121b578 4b21134 4df3ec6 ee790d1 6514554 24d1c8f 6514554 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import nltk
import validators
import streamlit as st
from transformers import AutoTokenizer, pipeline
# local modules
from extractive_summarizer.model_processors import Summarizer
from utils import (
clean_text,
fetch_article_text,
preprocess_text_for_abstractive_summarization,
read_text_from_file,
)
from rouge import Rouge
if __name__ == "__main__":
    # Streamlit entry point for the text summarizer app.
    #
    # Flow of one script run:
    #   1. Render the static header / help text.
    #   2. Read the input (text box, URL, or uploaded file) and clean it.
    #   3. Show the cleaned input inside an expander.
    #   4. When "Summarize" is pressed, run the selected summarizer
    #      (extractive or abstractive) and display the summary together
    #      with its ROUGE scores against the input text.

    # ---------------------------------
    # Main Application
    # ---------------------------------
    st.title("Text Summarizer π")
    st.markdown("Creator: [Atharva Ingle](https://github.com/Gladiator07)")
    st.markdown(
        "Source code: [GitHub Repository](https://github.com/Gladiator07/Text-Summarizer)"
    )
    summarize_type = st.sidebar.selectbox(
        "Summarization type", options=["Extractive", "Abstractive"]
    )

    st.markdown(
        "Enter a text or a url to get a concise summary of the article while conserving the overall meaning. This app supports text in the following formats:"
    )
    # NOTE: continuation lines of the markdown literals below are kept at
    # column 0 on purpose so the text inside the strings is unchanged.
    st.markdown(
        """- Raw text in text box
- URL of article/news to be summarized
- .txt, .pdf, .docx file formats"""
    )
    st.markdown(
        """This app supports two type of summarization:
1. **Extractive Summarization**: The extractive approach involves picking up the most important phrases and lines from the documents. It then combines all the important lines to create the summary. So, in this case, every line and word of the summary actually belongs to the original document which is summarized.
2. **Abstractive Summarization**: The abstractive approach involves rephrasing the complete document while capturing the complete meaning of the document. This type of summarization provides more human-like summary"""
    )
    st.markdown("---")

    # ---------------------------
    # SETUP & Constants
    nltk.download("punkt")
    abs_tokenizer_name = "facebook/bart-large-cnn"
    abs_model_name = "facebook/bart-large-cnn"
    # NOTE(review): this runs on every script execution; consider caching
    # the tokenizer if start-up latency becomes a problem.
    abs_tokenizer = AutoTokenizer.from_pretrained(abs_tokenizer_name)
    # Length bounds passed to the abstractive pipeline for each summary.
    abs_max_length = 90
    abs_min_length = 30
    # ---------------------------

    inp_text = st.text_input("Enter text or a url here")
    st.markdown(
        "<h3 style='text-align: center; color: green;'>OR</h3>",
        unsafe_allow_html=True,
    )
    uploaded_file = st.file_uploader(
        "Upload a .txt, .pdf, .docx file for summarization"
    )

    # validators.url() returns True for a valid URL and a *falsy failure
    # object* (not the literal False) otherwise. Coerce to a real bool so
    # later checks cannot be defeated by an identity comparison with False.
    is_url = bool(validators.url(inp_text))

    if is_url:
        # complete text, chunks to summarize (list of sentences for long docs)
        _full_text, cleaned_txt = fetch_article_text(url=inp_text)
    elif uploaded_file:
        cleaned_txt = read_text_from_file(uploaded_file)
        cleaned_txt = clean_text(cleaned_txt)
    else:
        cleaned_txt = clean_text(inp_text)

    # view input text (expander)
    with st.expander("View input text"):
        if is_url:
            # NOTE(review): only the first chunk of a fetched article is
            # displayed here -- confirm this is intended.
            st.write(cleaned_txt[0])
        else:
            st.write(cleaned_txt)

    summarize = st.button("Summarize")

    # called on toggle button [summarize]
    if summarize and not cleaned_txt:
        # Nothing to summarize: avoid feeding empty input to the models and
        # to the ROUGE scorer.
        st.warning("Please enter some text, a url, or upload a file to summarize.")
    elif summarize:
        if summarize_type == "Extractive":
            if is_url:
                # URL input arrives as a list of chunks; the extractive
                # model is given a single string.
                text_to_summarize = " ".join(cleaned_txt)
            else:
                text_to_summarize = cleaned_txt
            # extractive summarizer
            with st.spinner(
                text="Creating extractive summary. This might take a few seconds ..."
            ):
                ext_model = Summarizer()
                summarized_text = ext_model(text_to_summarize, num_sentences=5)
        else:  # "Abstractive" (the select box offers only these two options)
            with st.spinner(
                text="Creating abstractive summary. This might take a few seconds ..."
            ):
                text_to_summarize = cleaned_txt
                abs_summarizer = pipeline(
                    "summarization", model=abs_model_name, tokenizer=abs_tokenizer_name
                )
                if not is_url:
                    # Raw / uploaded text has not been chunked yet (URL input
                    # already is): split it into a list of chunks first.
                    text_to_summarize = preprocess_text_for_abstractive_summarization(
                        tokenizer=abs_tokenizer, text=cleaned_txt
                    )
                tmp_sum = abs_summarizer(
                    text_to_summarize,
                    max_length=abs_max_length,
                    min_length=abs_min_length,
                    do_sample=False,
                )
                # One summary dict per chunk; stitch them into one text.
                summarized_text = " ".join([summ["summary_text"] for summ in tmp_sum])

        # final summarized output
        st.subheader("Summarized text")
        st.info(summarized_text)

        st.subheader("ROUGE Scores")
        rouge_sc = Rouge()
        # NOTE(review): for URL input only the first chunk is used as the
        # reference text -- confirm this is intended.
        ground_truth = cleaned_txt[0] if is_url else cleaned_txt
        score = rouge_sc.get_scores(summarized_text, ground_truth, avg=True)
        st.code(score)
|