import json
import uuid
from os import makedirs
from os.path import dirname, exists

import requests
from bs4 import BeautifulSoup
'''
To do:
- Error handling.
- Check whether an alternative to the main tag is needed. Provide an error message if the main tag is not found.
- Menus are li tags with a tags within them.
- li tags with both text and child tags should be exported.
- Find divs that have text or p tags, and maybe other tags like divs.
- Export the text.
'''
# Get a list of strings from a page based on its URL.
def get_url_content( url ):
    file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
    # Create the cache directory if it doesn't exist.
    makedirs( dirname( file_path ), exist_ok=True )
    # If a cache file exists, get the content from the cache.
    if exists( file_path ):
        with open( file_path, 'r' ) as file:
            strings = json.load( file )
    else:
        strings = extract_strings( url )
        # Write the strings to the cache.
        with open( file_path, 'w' ) as file:
            json.dump( strings, file )
    return strings
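# Note: uuid5 is deterministic, so the same URL always hashes to the same
# cache file name. For example (placeholder URL, result stable across runs):
#   uuid.uuid5( uuid.NAMESPACE_URL, 'https://example.com' ).hex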
# Extract text from a page based on its URL.
def extract_strings( url ):
    # Parse the HTML content using BeautifulSoup.
    soup = get_soup( url )
    if soup is None:
        raise Exception( 'No HTML content found.' )
    # Remove scripts and styles.
    for script in soup( [ "script", "style" ] ):
        script.decompose()
    # Get the main content of the HTML page.
    content = get_main_content( soup )
    if content is None:
        raise Exception( 'No main content found.' )
    # Extract strings from the main content, based on the allowed tags.
    strings = get_tags_text( content )
    if not strings:
        raise Exception( 'No text found.' )
    return strings
# Make a request and get the HTML content.
def get_soup( url ):
    file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
    # Create the cache directory if it doesn't exist.
    makedirs( dirname( file_path ), exist_ok=True )
    # If a cache file exists, get the content from the cache.
    if exists( file_path ):
        with open( file_path, 'r' ) as web_page:
            html = web_page.read()
    else:
        # Add a user agent header so the request looks more like a real browser.
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'}
        response = requests.get( url, headers=headers )
        # Raise an exception if the response is not 200.
        response.raise_for_status()
        if not response.text:
            raise Exception( 'HTML empty.' )
        html = response.text
        # Save the HTML to the cache.
        with open( file_path, 'w' ) as file:
            file.write( html )
    return BeautifulSoup( html, 'html.parser' )
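# Design note: 'html.parser' is Python's built-in parser, so no extra
# dependency is needed. If lxml is installed, BeautifulSoup( html, 'lxml' )
# is a faster drop-in alternative.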
# Find the main content of an HTML page based on a list of rules, in priority order.
def get_main_content( soup ):
    # Divs with known content classes, most specific first.
    class_names = [
        'post-body',
        'article-content',
        'blog-post-content',
        'region-content',
        'entry-content',
        'region--content',
        'article',
        'article-inner_html',
    ]
    for class_name in class_names:
        content = soup.find( "div", { "class": class_name } )
        if content is not None:
            return content
    # Divs with known content ids.
    for id_name in [ 'bmdDetail-Content', 'main' ]:
        content = soup.find( "div", { "id": id_name } )
        if content is not None:
            return content
    # Fall back to semantic tags, then the whole body.
    for tag_name in [ 'main', 'article', 'body' ]:
        content = soup.find( tag_name )
        if content is not None:
            return content
    return None
# Extract text from the allowed tags.
def get_tags_text( soup ):
    text = []
    # Find all tags that are allowed.
    tags = soup.find_all( allowed_tags )
    # Loop through the tags and extract their text.
    for tag in tags:
        if tag.name == 'div':
            # For div tags, only take direct string children, so text inside
            # nested tags isn't collected twice.
            found_strings = [ child.strip() for child in tag.find_all( string=True, recursive=False ) ]
        else:
            found_strings = [ tag.get_text( ' ', strip=True ) ]
        for found_text in found_strings:
            if found_text != '':
                # Replace newlines and carriage returns with spaces.
                found_text = found_text.replace( '\n', ' ' ).replace( '\r', ' ' )
                text.append( found_text )
    return text
# Filter for the allowed tags.
def allowed_tags( tag ):
    return tag.name in ( 'li', 'p', 'h1', 'h2', 'h3', 'span', 'div' )
## To be deleted.
# -------------------------------------- #
# Extract content from the main tag.
def get_main( soup ):
    return soup.main

def get_deepest_divs( tag ):
    # Get all the divs that contain no other divs.
    return [ div for div in tag.find_all( 'div' ) if not div.find( 'div' ) ]

def get_tag_text( tags ):
    text = ''
    for tag in tags:
        print( tag.find_all( 'li' ) )
        # text += [ p.get_text() for p in tag.find_all( 'p' ) ]
    return text
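# A minimal usage sketch; the URL below is a placeholder assumption, not part
# of the original script.
if __name__ == '__main__':
    try:
        strings = get_url_content( 'https://example.com/' )
    except Exception as exception:
        print( 'Failed to get page content:', exception )
    else:
        for string in strings:
            print( string )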