import asyncio
import re
import urllib.parse

from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler


def process_url(url, sub_url):
    # Resolve a possibly-relative sub_url against the page's base URL.
    return urllib.parse.urljoin(url, sub_url)
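
# Example (illustrative values; standard urljoin behavior):
# process_url("https://2024.aclweb.org/", "venue/") -> "https://2024.aclweb.org/venue/"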


def clean_markdown(res):
    # Strip inline markdown links of the form [text](url).
    pattern = r'\[.*?\]\(.*?\)'
    try:
        result = re.sub(pattern, '', res)
        # Remove any bare http(s):// URLs left behind.
        url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        result = re.sub(url_pattern, '', result)
        # Drop empty list bullets and collapse runs of blank lines.
        result = result.replace("* \n", "")
        result = re.sub(r"\n\n+", "\n", result)
        return result
    except Exception:
        # Fall back to the raw markdown if cleaning fails.
        return res
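
# Illustrative example (hypothetical input):
# clean_markdown("See the [schedule](https://2024.aclweb.org/program/).")
# returns "See the ." since both markdown links and bare URLs are stripped.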


async def get_info(url, screenshot=True) -> tuple:
    # Crawl the page once; return its raw HTML, cleaned markdown, and
    # (when requested) a screenshot.
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url, screenshot=screenshot)
        if screenshot:
            return result.html, clean_markdown(result.markdown), result.screenshot
        return result.html, clean_markdown(result.markdown)
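
# A minimal sketch tying the otherwise-unused BeautifulSoup import to
# process_url: collect absolute links from a crawled page. extract_links is
# illustrative and not part of the original flow.
def extract_links(base_url, html):
    # Resolve every anchor's href against the page's base URL.
    soup = BeautifulSoup(html, "html.parser")
    return [process_url(base_url, a["href"]) for a in soup.find_all("a", href=True)]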


if __name__ == "__main__":
    # Smoke test: crawl the ACL 2024 site and unpack the results.
    html, markdown, screenshot = asyncio.run(get_info("https://2024.aclweb.org/"))
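    # Hedged usage sketch: preview the markdown and save the screenshot.
    # Assumes crawl4ai returns the screenshot as a base64-encoded string
    # (verify for your version); "acl2024.png" is an illustrative filename.
    import base64

    print(markdown[:500])
    if screenshot:
        with open("acl2024.png", "wb") as f:
            f.write(base64.b64decode(screenshot))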