import asyncio
import re
import urllib.parse

from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler


def process_url(url, sub_url):
    # Resolve a possibly-relative sub_url against the page's base URL.
    return urllib.parse.urljoin(url, sub_url)
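
# Example (illustrative values; standard urljoin behavior):
# process_url("https://2024.aclweb.org/", "venue/") -> "https://2024.aclweb.org/venue/"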


def clean_markdown(res):
    # Strip inline markdown links of the form [text](url).
    pattern = r'\[.*?\]\(.*?\)'
    try:
        result = re.sub(pattern, '', res)
        # Remove any bare http(s):// URLs left behind.
        url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        result = re.sub(url_pattern, '', result)
        # Drop empty list bullets and collapse runs of blank lines.
        result = result.replace("* \n", "")
        result = re.sub(r"\n\n+", "\n", result)
        return result
    except Exception:
        # Fall back to the raw markdown if cleaning fails.
        return res
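
# Illustrative example (hypothetical input):
# clean_markdown("See the [schedule](https://2024.aclweb.org/program/).")
# returns "See the ." since both markdown links and bare URLs are stripped.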


async def get_info(url, screenshot=True) -> tuple:
    # Crawl the page once; return its raw HTML, cleaned markdown, and
    # (when requested) a screenshot.
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url, screenshot=screenshot)
        if screenshot:
            return result.html, clean_markdown(result.markdown), result.screenshot
        return result.html, clean_markdown(result.markdown)
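
# A minimal sketch tying the otherwise-unused BeautifulSoup import to
# process_url: collect absolute links from a crawled page. extract_links is
# illustrative and not part of the original flow.
def extract_links(base_url, html):
    # Resolve every anchor's href against the page's base URL.
    soup = BeautifulSoup(html, "html.parser")
    return [process_url(base_url, a["href"]) for a in soup.find_all("a", href=True)]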


if __name__ == "__main__":
    # Smoke test: crawl the ACL 2024 site and unpack the results.
    html, markdown, screenshot = asyncio.run(get_info("https://2024.aclweb.org/"))
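    # Hedged usage sketch: preview the markdown and save the screenshot.
    # Assumes crawl4ai returns the screenshot as a base64-encoded string
    # (verify for your version); "acl2024.png" is an illustrative filename.
    import base64

    print(markdown[:500])
    if screenshot:
        with open("acl2024.png", "wb") as f:
            f.write(base64.b64decode(screenshot))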