import streamlit as st
import os
import json5
from agents import Seeker
from qwen_agent.tools.base import BaseTool, register_tool
import re
import json
import asyncio
from utils import *
import base64
from bs4 import BeautifulSoup
from PIL import Image
import subprocess


def run_command(command):
    try:
        # Run the shell command
        result = subprocess.run(command, shell=True, text=True, capture_output=True)
        # Print the output of the command
        print("Command Output:")
        print(result.stdout)
        # Print the error, if any
        if result.stderr:
            print("Command Error:")
            print(result.stderr)
    except Exception as e:
        print(f"An error occurred: {e}")


# Run crawl4ai-setup
def init_crawl4ai():
    try:
        # Install the Chromium browser for Playwright via subprocess
        result = subprocess.run(
            ["python", "-m", "playwright", "install", "--with-deps", "chromium"],
            check=True,           # raise CalledProcessError if the command fails
            text=True,            # return output as text
            capture_output=True,  # capture stdout/stderr
        )
        print("Success!")
        print(result.stdout)
    except subprocess.CalledProcessError as e:
        print("Error!")
        print(e.stderr)


model = "qwen-max"
llm_cfg = {
    'model': model,
    'api_key': os.getenv('API_KEY'),
    'model_server': "https://dashscope.aliyuncs.com/compatible-mode/v1",
    'generate_cfg': {
        'top_p': 0.8,
        'max_input_tokens': 120000,
        'max_retries': 20,
    },
}


def extract_links_with_text(html):
    with open("ROOT_URL.txt", "r") as f:
        ROOT_URL = f.read().strip()  # strip the trailing newline so the startswith() checks below work
    soup = BeautifulSoup(html, 'html.parser')
    links = []

    # Regular <a href> tags
    for a_tag in soup.find_all('a', href=True):
        url = a_tag['href']
        text = ''.join(a_tag.stripped_strings)
        # Filter out image, PDF, and javascript links
        if text and "javascript" not in url and not url.endswith(('.jpg', '.png', '.gif', '.jpeg', '.pdf')):
            full_url = process_url(ROOT_URL, url)
            if full_url.startswith(ROOT_URL):
                links.append({'url': full_url, 'text': text})

    # Special case 1: links navigated via an onclick handler
    for a_tag in soup.find_all('a', onclick=True):
        onclick_text = a_tag['onclick']
        text = ''.join(a_tag.stripped_strings)
        # Extract the URL with a regular expression
        match = re.search(r"window\.location\.href='([^']*)'", onclick_text)
        if match:
            url = match.group(1)
            if url and text and not url.endswith(('.jpg', '.png', '.gif', '.jpeg', '.pdf')):
                full_url = process_url(ROOT_URL, url)
                if full_url.startswith(ROOT_URL):
                    links.append({'url': full_url, 'text': text})

    # Special case 2: links stored in a data-url attribute
    for a_tag in soup.find_all('a', attrs={'data-url': True}):
        url = a_tag['data-url']
        text = ''.join(a_tag.stripped_strings)
        if url and text and not url.endswith(('.jpg', '.png', '.gif', '.jpeg', '.pdf')):
            full_url = process_url(ROOT_URL, url)
            if full_url.startswith(ROOT_URL):
                links.append({'url': full_url, 'text': text})

    # Special case 3: links with class "herf-mask" (the misspelled class name is the target site's own)
    for a_tag in soup.find_all('a', class_='herf-mask'):
        url = a_tag.get('href')
        text = a_tag.get('title') or ''.join(a_tag.stripped_strings)
        if url and text and not url.endswith(('.jpg', '.png', '.gif', '.jpeg', '.pdf')):
            full_url = process_url(ROOT_URL, url)
            if full_url.startswith(ROOT_URL):
                links.append({'url': full_url, 'text': text})

    # Special case 4: