articlesummary / textractor.py
davidmezzetti's picture
Create textractor.py
a4aec71
raw
history blame
588 Bytes
"""
Textractor module
"""
import requests
from bs4 import BeautifulSoup
from txtai.pipeline.segmentation import Segmentation
class Textractor(Segmentation):
    """
    Extracts text from web pages.

    Overrides the text method of Segmentation so that the input is treated as a URL: the page
    is downloaded and the text content of its HTML is returned.
    """

    # Seconds to wait on the server before giving up. Without an explicit timeout,
    # requests waits indefinitely and a stalled server would hang the caller.
    TIMEOUT = 30

    def __init__(self, sentences=False, lines=False, paragraphs=False, minlength=None, join=False):
        """
        Creates a new Textractor.

        All arguments are forwarded unchanged, in this order, to the Segmentation constructor.

        Args:
            sentences: passed through to Segmentation
            lines: passed through to Segmentation
            paragraphs: passed through to Segmentation
            minlength: passed through to Segmentation
            join: passed through to Segmentation
        """

        super().__init__(sentences, lines, paragraphs, minlength, join)

    def text(self, text):
        """
        Downloads a web page and returns its text content.

        Args:
            text: url of the page to download (the parameter name is kept for
                  compatibility with the parent class method signature)

        Returns:
            text of the page with HTML markup removed

        Raises:
            requests.exceptions.RequestException: if the request fails, times out or the
                                                  server responds with an HTTP error status
        """

        # text is a url
        response = requests.get(text, timeout=self.TIMEOUT)

        # Fail loudly on 4xx/5xx responses rather than returning the text of an error page
        response.raise_for_status()

        # Parse with the html.parser backend and keep only the text nodes
        soup = BeautifulSoup(response.text, features="html.parser")
        return soup.get_text()