import re
import time
import base64
import warnings
from io import StringIO

import requests
import validators
import docx2txt
import streamlit as st
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from transformers import pipeline
from PyPDF2 import PdfFileReader

warnings.filterwarnings("ignore")

time_str = time.strftime("%d%m%Y-%H%M%S")


def article_text_extractor(url: str):

    '''Extract text from a url and divide it into chunks of at most ~600 words'''

    ua = UserAgent()
    headers = {'User-Agent': str(ua.chrome)}

    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")

    title_text = soup.find_all(["h1"])
    para_text = soup.find_all(["p"])
    article_text = [result.text for result in para_text]
    article_header = [result.text for result in title_text][0]
    article = " ".join(article_text)

    # Mark sentence boundaries so the text can be split without losing punctuation.
    article = article.replace(".", ".<eos>")
    article = article.replace("!", "!<eos>")
    article = article.replace("?", "?<eos>")
    sentences = article.split("<eos>")

    current_chunk = 0
    chunks = []

    for sentence in sentences:
        if len(chunks) == current_chunk + 1:
            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 600:
                chunks[current_chunk].extend(sentence.split(" "))
            else:
                current_chunk += 1
                chunks.append(sentence.split(" "))
        else:
            chunks.append(sentence.split(" "))

    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = " ".join(chunks[chunk_id])

    return article_header, chunks
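
# Illustrative usage (hypothetical URL), assuming the page exposes an <h1> title
# and <p> paragraphs:
#   title, chunks = article_text_extractor("https://example.com/some-article")
#   # title  -> text of the first <h1> on the page
#   # chunks -> list of strings, each at most ~600 words, ready for the summarizer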


def preprocess_plain_text(x):

    x = x.encode("ascii", "ignore").decode()    # drop non-ascii characters
    x = re.sub(r"https*\S+", " ", x)            # remove urls
    x = re.sub(r"@\S+", " ", x)                 # remove mentions
    x = re.sub(r"#\S+", " ", x)                 # remove hashtags
    x = re.sub(r"\s{2,}", " ", x)               # collapse repeated whitespace
    x = re.sub("[^.,!?A-Za-z0-9]+", " ", x)     # keep only basic punctuation and alphanumerics

    return x
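
# Illustrative example (made-up input):
#   preprocess_plain_text("Read more at https://example.com  #news @user")
#   -> "Read more at "   (url, hashtag and mention stripped, whitespace collapsed)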


def extract_pdf(file):

    '''Extract text from PDF file'''

    pdfReader = PdfFileReader(file)
    count = pdfReader.numPages
    all_text = ""

    for i in range(count):
        page = pdfReader.getPage(i)
        all_text += page.extractText()

    return all_text
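
# Note: PdfFileReader / numPages / getPage / extractText belong to the legacy
# PyPDF2 API; PyPDF2 3.x replaces them with PdfReader, reader.pages and
# page.extract_text(), so this assumes an older PyPDF2 release (pre-3.0) is installed.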


def extract_text_from_file(file):

    '''Extract text from an uploaded file'''

    # Default to an empty string so unsupported file types do not raise a NameError.
    file_text = ""

    if file.type == "text/plain":
        stringio = StringIO(file.getvalue().decode("utf-8"))
        file_text = stringio.read()

    elif file.type == "application/pdf":
        file_text = extract_pdf(file)

    elif (
        file.type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        file_text = docx2txt.process(file)

    return file_text
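
# The three branches above cover the MIME types typically reported for
# .txt, .pdf and .docx uploads; anything else falls through to the empty-string default.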


def summary_downloader(raw_text):

    b64 = base64.b64encode(raw_text.encode()).decode()
    new_filename = "new_text_file_{}_.txt".format(time_str)
    st.markdown("#### Download Summary as a File")
    href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!</a>'
    st.markdown(href, unsafe_allow_html=True)
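
# The summary is embedded in the href as a base64 data URI, so the download link
# works without writing a temporary file on the server.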


@st.cache(allow_output_mutation=True)
def facebook_model():

    summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
    return summarizer


@st.cache(allow_output_mutation=True)
def sshleifer_model():

    summarizer = pipeline('summarization', model='sshleifer/distilbart-cnn-12-6')
    return summarizer
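
# st.cache(allow_output_mutation=True) is the older Streamlit caching API;
# recent Streamlit releases deprecate it in favor of st.cache_resource /
# st.cache_data, so this assumes a version where st.cache is still available.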


st.title("Article Text and Link Extractive Summarizer 📝")

model_type = st.sidebar.selectbox(
    "Model type", options=["Facebook-Bart", "Sshleifer-DistilBart"]
)

st.markdown(
    "Model Source: [Facebook-Bart-large-CNN](https://huggingface.co/facebook/bart-large-cnn) and [Sshleifer-distilbart-cnn-12-6](https://huggingface.co/sshleifer/distilbart-cnn-12-6)"
)

st.markdown(
    """The app supports extractive summarization, which aims to identify the salient information in a document and group it together to form a concise summary.
    For articles extracted from a URL, text longer than roughly 600 words is divided into chunks and each chunk is summarized separately.
    There are two models available to choose from:"""
)

st.markdown("""
- Facebook-Bart, trained on large [CNN and Daily Mail](https://huggingface.co/datasets/cnn_dailymail) news articles.
- Sshleifer-Distilbart, a distilled (smaller) version of the large Bart model."""
)

st.markdown("""Please note that the model will take longer to generate summaries for longer documents.""")

st.markdown(
    "The app accepts the following input formats for the summarization task:"
)
st.markdown(
    """- Raw text entered in the text box.
- URL of an article to be summarized.
- Documents with .txt, .pdf or .docx file formats."""
)

st.markdown("---")

url_text = st.text_input("Please enter a URL here")

st.markdown(
    "<h3 style='text-align: center; color: red;'>OR</h3>",
    unsafe_allow_html=True,
)

plain_text = st.text_input("Please paste/enter plain text here")

st.markdown(
    "<h3 style='text-align: center; color: red;'>OR</h3>",
    unsafe_allow_html=True,
)

upload_doc = st.file_uploader(
    "Upload a .txt, .pdf or .docx file for summarization"
)

is_url = validators.url(url_text)
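
# Input precedence: a valid URL wins over an uploaded document, which in turn
# wins over pasted plain text.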

if is_url:
    article_title, chunks = article_text_extractor(url=url_text)

elif upload_doc:
    clean_text = preprocess_plain_text(extract_text_from_file(upload_doc))

else:
    clean_text = preprocess_plain_text(plain_text)

summarize = st.button("Summarize")


if summarize:

    # Chunked article text for a URL input, cleaned plain text otherwise.
    text_to_summarize = chunks if is_url else clean_text

    if model_type == "Facebook-Bart":
        model_loader = facebook_model
        model_name = "Facebook-Bart"
    else:
        model_loader = sshleifer_model
        model_name = "Sshleifer-DistilBart"

    with st.spinner(
        text=f"Loading {model_name} model and extracting the summary. This might take a few seconds depending on the length of your text..."
    ):
        summarizer_model = model_loader()
        # The summarization pipeline returns a list of {'summary_text': ...} dicts,
        # one per input chunk, which are joined into a single summary string.
        summarized_text = summarizer_model(text_to_summarize, max_length=100, min_length=30)
        summarized_text = ' '.join([summ['summary_text'] for summ in summarized_text])

    st.subheader("Summarized text")

    if is_url:
        st.markdown(f"Article title: {article_title}")

    st.write(summarized_text)

    summary_downloader(summarized_text)