# WebWalker / utils.py
# Source-page residue (Hugging Face file viewer): uploaded by callanwu,
# commit "update" (60058de), file size 1.22 kB.
import urllib.parse
from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler
import re
import asyncio
def process_url(url, sub_url):
    """Resolve ``sub_url`` against the base ``url``.

    Delegates to ``urllib.parse.urljoin``: a relative ``sub_url`` is
    combined with ``url``, while an absolute ``sub_url`` is returned as is.

    Args:
        url: Base URL used as the reference point.
        sub_url: Relative or absolute link to resolve.

    Returns:
        The joined URL string.
    """
    resolved = urllib.parse.urljoin(base=url, url=sub_url)
    return resolved
# Inline markdown links of the form ``[text](target)``.
_MD_LINK_RE = re.compile(r'\[.*?\]\(.*?\)')
# Bare http/https URLs left in the text after the links are removed.
_BARE_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
# Runs of two or more newlines, collapsed to a single newline.
_BLANK_LINES_RE = re.compile(r"\n\n+")


def clean_markdown(res):
    """Strip links and bare URLs from markdown text and tidy blank lines.

    Processing steps, in order:
      1. remove every inline markdown link ``[text](target)`` entirely;
      2. remove remaining bare ``http://`` / ``https://`` URLs;
      3. drop list bullets left empty by the removals (``"* \\n"``);
      4. collapse consecutive newlines into one.

    Args:
        res: Markdown text to clean.

    Returns:
        The cleaned string. If ``res`` is not a ``str`` (for example
        ``None``), it is returned unchanged rather than raising.
    """
    # Explicit guard instead of a blanket ``except Exception``: a non-string
    # input is the case the regex calls cannot handle, and it is passed
    # through untouched.
    if not isinstance(res, str):
        return res

    # Replace every matched link / URL with the empty string.
    result = _MD_LINK_RE.sub('', res)
    result = _BARE_URL_RE.sub('', result)
    result = result.replace("* \n", "")
    return _BLANK_LINES_RE.sub("\n", result)
async def get_info(url, screentshot = True) -> tuple:
    """Crawl ``url`` and return its HTML, cleaned markdown and optional screenshot.

    Opens an ``AsyncWebCrawler`` for the duration of the call, runs it on
    ``url`` and passes the resulting markdown through ``clean_markdown``.

    Args:
        url: Address of the page to crawl.
        screentshot: Forwarded to ``crawler.arun`` as ``screenshot``; when
            truthy the crawl result's ``screenshot`` attribute is appended
            to the returned tuple. (The misspelled name is kept so existing
            keyword callers keep working.)

    Returns:
        ``(html, cleaned_markdown, screenshot)`` when ``screentshot`` is
        truthy, otherwise ``(html, cleaned_markdown)``.
    """
    async with AsyncWebCrawler() as crawler:
        # The crawl call is the same in both modes; only the shape of the
        # returned tuple depends on the flag.
        result = await crawler.arun(url, screenshot=screentshot)
        page = (result.html, clean_markdown(result.markdown))
        if screentshot:
            return (*page, result.screenshot)
        return page
if __name__ == "__main__":
    # Manual smoke run: crawl a sample page when the module is executed
    # directly. The result is discarded.
    sample_url = "https://2024.aclweb.org/"
    asyncio.run(get_info(sample_url))