import json
import uuid
from os import makedirs
from os.path import dirname, exists

import requests
from bs4 import BeautifulSoup
'''
To do:
- Error handling.
- Check whether an alternative to the main tag is needed. Provide an error message if the main tag is not found.
- Menus are li tags with a tags within them.
- li tags with both text and child tags should be exported.
- Find divs that have text or p tags, and maybe other tags like divs.
- Export the text.
'''
# Get a list of strings from a page based on its URL.
def get_url_content( url ):
    file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
    # Create the cache directory if it doesn't exist.
    makedirs( dirname( file_path ), exist_ok=True )
    # If a cache file exists, get the content from the cache.
    if exists( file_path ):
        with open( file_path, 'r' ) as file:
            strings = json.load( file )
    else:
        strings = extract_strings( url )
        # Write the strings to the cache.
        with open( file_path, 'w' ) as file:
            json.dump( strings, file )
    return strings
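# Note: uuid5 is deterministic, so the same URL always hashes to the same
# cache file name. For example (placeholder URL, result stable across runs):
#   uuid.uuid5( uuid.NAMESPACE_URL, 'https://example.com' ).hex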
# Extract text from a page based on its URL.
def extract_strings( url ):
    # Parse the HTML content using BeautifulSoup.
    soup = get_soup( url )
    if soup is None:
        raise Exception( 'No HTML content found.' )
    # Remove scripts and styles.
    for script in soup( [ "script", "style" ] ):
        script.decompose()
    # Get the main content of the HTML page.
    content = get_main_content( soup )
    if content is None:
        raise Exception( 'No main content found.' )
    # Extract strings from the main content, based on the allowed tags.
    strings = get_tags_text( content )
    if not strings:
        raise Exception( 'No text found.' )
    return strings
# Make a request and get the HTML content.
def get_soup( url ):
    file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
    # Create the cache directory if it doesn't exist.
    makedirs( dirname( file_path ), exist_ok=True )
    # If a cache file exists, get the content from the cache.
    if exists( file_path ):
        with open( file_path, 'r' ) as web_page:
            html = web_page.read()
    else:
        # Add a user agent header so the request looks more like a real browser.
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'}
        response = requests.get( url, headers=headers )
        # Raise an exception if the response is not 200.
        response.raise_for_status()
        if not response.text:
            raise Exception( 'HTML empty.' )
        html = response.text
        # Save the HTML to the cache.
        with open( file_path, 'w' ) as file:
            file.write( html )
    return BeautifulSoup( html, 'html.parser' )
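# Design note: 'html.parser' is Python's built-in parser, so no extra
# dependency is needed. If lxml is installed, BeautifulSoup( html, 'lxml' )
# is a faster drop-in alternative.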
# Find the main content of an HTML page based on a list of rules, in priority order.
def get_main_content( soup ):
    # Divs with known content classes, most specific first.
    class_names = [
        'post-body',
        'article-content',
        'blog-post-content',
        'region-content',
        'entry-content',
        'region--content',
        'article',
        'article-inner_html',
    ]
    for class_name in class_names:
        content = soup.find( "div", { "class": class_name } )
        if content is not None:
            return content
    # Divs with known content ids.
    for id_name in [ 'bmdDetail-Content', 'main' ]:
        content = soup.find( "div", { "id": id_name } )
        if content is not None:
            return content
    # Fall back to semantic tags, then the whole body.
    for tag_name in [ 'main', 'article', 'body' ]:
        content = soup.find( tag_name )
        if content is not None:
            return content
    return None
# Extract text from the allowed tags.
def get_tags_text( soup ):
    text = []
    # Find all tags that are allowed.
    tags = soup.find_all( allowed_tags )
    # Loop through the tags and extract their text.
    for tag in tags:
        if tag.name == 'div':
            # For div tags, only take direct string children, so text inside
            # nested tags isn't collected twice.
            found_strings = [ child.strip() for child in tag.find_all( string=True, recursive=False ) ]
        else:
            found_strings = [ tag.get_text( ' ', strip=True ) ]
        for found_text in found_strings:
            if found_text != '':
                # Replace newlines and carriage returns with spaces.
                found_text = found_text.replace( '\n', ' ' ).replace( '\r', ' ' )
                text.append( found_text )
    return text
# Filter for the allowed tags.
def allowed_tags( tag ):
    return tag.name in ( 'li', 'p', 'h1', 'h2', 'h3', 'span', 'div' )
## To be deleted.
# -------------------------------------- #
# Extract content from the main tag.
def get_main( soup ):
    return soup.main

def get_deepest_divs( tag ):
    # Get all the divs that contain no other divs.
    return [ div for div in tag.find_all( 'div' ) if not div.find( 'div' ) ]

def get_tag_text( tags ):
    text = ''
    for tag in tags:
        print( tag.find_all( 'li' ) )
        # text += [ p.get_text() for p in tag.find_all( 'p' ) ]
    return text
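# A minimal usage sketch; the URL below is a placeholder assumption, not part
# of the original script.
if __name__ == '__main__':
    try:
        strings = get_url_content( 'https://example.com/' )
    except Exception as exception:
        print( 'Failed to get page content:', exception )
    else:
        for string in strings:
            print( string )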