2025-03-28 18:23:30 +08:00
|
|
|
|
import argparse
|
2025-04-03 22:11:20 +08:00
|
|
|
|
import asyncio
|
2025-03-28 18:23:30 +08:00
|
|
|
|
import sys
|
|
|
|
|
|
import os
|
2025-03-28 23:19:42 +08:00
|
|
|
|
import time
|
2025-04-03 22:11:20 +08:00
|
|
|
|
import signal
|
2025-03-28 18:23:30 +08:00
|
|
|
|
|
2025-03-30 22:49:37 +08:00
|
|
|
|
from app.engines.report_engine import Reporter
|
2025-03-28 18:23:30 +08:00
|
|
|
|
|
|
|
|
|
|
from .config import load_config, AppConfig
|
2025-04-01 22:53:32 +08:00
|
|
|
|
from .config.config import AppCtx
|
2025-03-28 23:19:42 +08:00
|
|
|
|
from .engines.crawl_engine import CrawlEngine
|
2025-03-30 16:04:34 +08:00
|
|
|
|
from .engines.evidence_engine import EvidenceEngine
|
2025-03-28 18:23:30 +08:00
|
|
|
|
from .models.base import connect_db, create_database
|
|
|
|
|
|
|
|
|
|
|
|
from loguru import logger
|
|
|
|
|
|
import sqlalchemy.exc
|
|
|
|
|
|
|
2025-04-03 22:11:20 +08:00
|
|
|
|
from .web.web import WebApp
|
|
|
|
|
|
|
2025-03-28 18:23:30 +08:00
|
|
|
|
|
|
|
|
|
|
class MainApp:
    """Application entry point: parses the CLI, loads config, connects the DB
    and then dispatches to either the web server or one of the CLI engines."""

    # Report modes accepted by ``--report``; also the default when none given.
    VALID_REPORT_MODES = ("pc", "site", "wap")

    def __init__(self):
        # Populated by parse_args() / run(); None until then.
        self.args = None
        self.config: "AppConfig | None" = None
        self.db_engine = None

    @staticmethod
    def _build_parser():
        """Create the argument parser with every supported option registered."""
        parser = argparse.ArgumentParser(description="Baidu Reporter")

        # Path of the configuration file.
        parser.add_argument(
            "-c",
            "--config",
            default="./config.local.toml",
            help="指定配置文件路径,默认为 ./config.local.toml",
        )

        # Crawl mode: domains given inline or through a file.
        parser.add_argument(
            "--crawl",
            help="采集模式,根据域名批量采集 SURL,多个域名可使用英文逗号分割,也可通过 --crawl-file 传入文件",
        )
        parser.add_argument(
            "--crawl-file", help="目标域名文件,批量传入待采集的域名,每行一个"
        )

        # Evidence mode.
        parser.add_argument(
            "--evidence",
            help="收集证据模式,对数据库内的 SURL 获取证据",
            action="store_true",
        )

        # Report mode; a bare ``--report`` selects every mode via ``const``.
        parser.add_argument(
            "--report",
            const=",".join(MainApp.VALID_REPORT_MODES),
            nargs="?",
            help="指定运行模式:pc/site/wap,不指定则运行所有模式,多个模式使用英文逗号分隔",
        )

        # Web server mode.
        parser.add_argument(
            "--web", action="store_true", help="启动 web 服务器,启动后将忽略其他选项"
        )

        # Run Chrome with a visible window instead of headless.
        parser.add_argument(
            "-s",
            action="store_true",
            help="当设置此选项的时候,将以正常模式启动Chrome(非headless模式),方便调试与观察运行情况",
        )
        return parser

    @staticmethod
    def _parse_report_modes(raw):
        """Split a comma-separated ``--report`` value.

        Args:
            raw: The raw option value, or None/"" when the option was omitted.

        Returns:
            A ``(modes, invalid)`` tuple of lists. ``modes`` holds the valid
            modes in first-seen order without duplicates, falling back to all
            modes when nothing was requested. ``invalid`` holds unknown names.
        """
        # Strip whitespace, drop empty tokens (e.g. a trailing comma) and
        # de-duplicate while keeping the order the user typed.
        tokens = (piece.strip() for piece in (raw or "").split(","))
        requested = list(dict.fromkeys(tok for tok in tokens if tok))

        invalid = [m for m in requested if m not in MainApp.VALID_REPORT_MODES]
        modes = [m for m in requested if m in MainApp.VALID_REPORT_MODES]
        if not requested:
            modes = list(MainApp.VALID_REPORT_MODES)
        return modes, invalid

    def parse_args(self):
        """Parse and validate command-line arguments, storing them on ``self.args``.

        Prints the help text and exits with status 0 when the program is run
        without any argument. Exits through ``parser.error`` when a report mode
        is unknown or when the crawl file / config file does not exist.
        """
        parser = self._build_parser()

        # No arguments at all: show usage instead of silently doing something.
        if len(sys.argv) == 1:
            parser.print_help()
            sys.exit(0)

        args = parser.parse_args()
        logger.debug(f"{args=}")

        # Normalise --report from a comma-separated string into a list.
        modes, invalid_modes = self._parse_report_modes(args.report)
        if invalid_modes:
            parser.error(f'无效的运行模式: {", ".join(invalid_modes)}')
        args.report = modes

        # The domain list file must exist when it is supplied.
        if args.crawl_file and not os.path.exists(args.crawl_file):
            parser.error(f"--crawl-file 指定的文件 {args.crawl_file} 不存在")

        # The configuration file must exist.
        if not os.path.exists(args.config):
            parser.error(f"配置文件不存在: {args.config}")

        self.args = args

    def start_cli(self):
        """Run exactly one CLI engine, chosen in the order crawl > evidence > report."""
        if self.args.crawl or self.args.crawl_file:
            crawl = CrawlEngine()
            crawl.cli_start(self.args.crawl, self.args.crawl_file)
            crawl.cli_wait()
            crawl.stop()
        elif self.args.evidence:
            evidence = EvidenceEngine()
            evidence.cli_start()
            evidence.stop()
        elif self.args.report:
            reporter = Reporter(self.args.report)
            reporter.cli_start()
            reporter.stop()
        else:
            # Defensive only: parse_args() always leaves a non-empty
            # args.report, so this branch is not expected to run.
            logger.error("模式错误!")
            return

    def start_web(self):
        """Install the Ctrl+C handler and run the web application until it returns."""
        # Register the SIGINT handler intended to shut the engines down cleanly.
        signal.signal(signal.SIGINT, self.exit_handler)

        # Start the web page.
        web_app = WebApp()
        asyncio.run(web_app.start())

        logger.info("web stop.")

    def _connect_database(self):
        """Connect to the database, creating it first when it does not exist.

        Exits the process with status 1 when the connection cannot be
        established, including when creating the database or reconnecting
        afterwards fails.
        """
        try:
            self.db_engine = connect_db(self.config)
        except sqlalchemy.exc.OperationalError as e:
            # Error code 1049 is treated as "database does not exist"
            # (presumably MySQL's unknown-database code); anything else is fatal.
            if "1049" not in str(e):
                logger.error(f"连接数据库失败,请检查配置文件或数据库服务是否正常: {e}")
                sys.exit(1)

            logger.info("数据库不存在,尝试初始化数据库")
            try:
                create_database(self.config)
                logger.info("数据库初始化成功,尝试连接数据库")
                self.db_engine = connect_db(self.config)
            except sqlalchemy.exc.SQLAlchemyError as init_err:
                # Report a failed initialisation/retry the same way as a
                # failed first attempt instead of leaking a traceback.
                logger.error(
                    f"连接数据库失败,请检查配置文件或数据库服务是否正常: {init_err}"
                )
                sys.exit(1)

        logger.info(f"连接数据库 {self.config.database.database} 成功")

    def run(self):
        """Run the application: parse args, load config, connect the DB, dispatch.

        Returns:
            Whatever ``start_web()`` or ``start_cli()`` returns.
        """
        # Parse command-line arguments.
        self.parse_args()

        # Load the configuration file.
        self.config = load_config(self.args.config)
        logger.info(f"加载配置文件 {self.args.config} 成功")

        # Chrome runs headless unless -s was given.
        AppCtx.g_app_config.headless_chrome = not self.args.s

        # Connect to the database (initialising it when missing).
        self._connect_database()

        # --web takes precedence over every other mode option.
        if self.args.web:
            logger.info("启动 Web 模式")
            return self.start_web()

        logger.info("启动 CLI 模式")
        return self.start_cli()

    def exit_handler(self, signum, frame):
        """SIGINT handler; currently it only prints a notice.

        Args:
            signum: Signal number passed by the ``signal`` module.
            frame: Current stack frame passed by the ``signal`` module.
        """
        # TODO: stop each engine here.
        print("CTRL+C called.")
|