Spaces:

NeuML
/

articlesummary

Running

articlesummary / textractor.py

Create textractor.py

a4aec71 about 3 years ago

588 Bytes

	"""
	Textractor module
	"""

	import requests

	from bs4 import BeautifulSoup

	from txtai.pipeline.segmentation import Segmentation

	class Textractor(Segmentation):
	    """
	    Extracts text from a web page.

	    The input passed to text() is treated as a URL: the page is downloaded with requests
	    and its markup is stripped with BeautifulSoup's html.parser.
	    """

	    def __init__(self, sentences=False, lines=False, paragraphs=False, minlength=None, join=False, timeout=30):
	        """
	        Creates a new Textractor.

	        Args:
	            sentences: forwarded unchanged to Segmentation
	            lines: forwarded unchanged to Segmentation
	            paragraphs: forwarded unchanged to Segmentation
	            minlength: forwarded unchanged to Segmentation
	            join: forwarded unchanged to Segmentation
	            timeout: seconds passed to requests.get as its timeout; None waits forever,
	                     which is what happened before this parameter existed
	        """

	        super().__init__(sentences, lines, paragraphs, minlength, join)

	        # Stored on the instance so text() can bound every download
	        self.timeout = timeout

	    def text(self, text):
	        """
	        Downloads the page at a URL and returns its text content.

	        Args:
	            text: URL of the page to retrieve

	        Returns:
	            result of BeautifulSoup.get_text() on the downloaded HTML

	        Raises:
	            requests.exceptions.RequestException: propagated from requests.get on connection
	                                                  errors, including timeouts
	        """

	        # text is a url
	        # Requests applies no timeout by default, a stalled server would block this call forever
	        response = requests.get(text, timeout=self.timeout)

	        # Non-2xx responses are not rejected here, whatever body came back is parsed
	        html = response.text

	        soup = BeautifulSoup(html, features="html.parser")
	        return soup.get_text()