Make the WAP screenshot optional

xhy 2025-04-01 21:54:01 +08:00
parent cdc51a6eac
commit de563198ac
8 changed files with 57 additions and 23 deletions

View File

@@ -6,6 +6,15 @@
 # Crawl mode: collect the URL list for the specified keyword and store it directly in the database
 python main.py --crawl www.yunzhiju.net
+
+# Batch crawl mode: separate multiple domains with commas
+python main.py --crawl www.yunzhiju.net,www.yunzhiju.net
+
+# Alternative batch crawl mode: read domains from a file, one per line
+python main.py --crawl-file ./domains.txt
+
+# The two crawl modes can be used together
+python main.py --crawl www.yunzhiju.net,www.yunzhiju.net --crawl-file ./domains.txt

 # Evidence mode: collect screenshots and tokens for all links awaiting report
 python main.py --evidence
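
For --crawl-file, the input file simply lists one domain per line. A minimal sketch of ./domains.txt, reusing the README's placeholder domain:

    www.yunzhiju.net
    www.yunzhiju.net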

View File

@@ -90,7 +90,7 @@ class MainApp:
     def start_cli(self):
         """Start CLI mode"""
-        if self.args.crawl:
+        if self.args.crawl or self.args.crawl_file:
             crawl = CrawlEngine()
             crawl.cli_start(self.args.crawl, self.args.crawl_file)
             crawl.cli_wait()
@@ -103,6 +103,9 @@
             reporter = Reporter(self.args.report)
             reporter.cli_start()
             reporter.stop()
+        else:
+            logger.error("Invalid mode!")
+            return

     def start_web(self):
         """Start Web mode"""

View File

@@ -2,15 +2,6 @@ from dataclasses import dataclass

 import toml

-class AppCtx:
-    # Global variables
-    # Configuration
-    g_app_config = None
-    # Database connection
-    g_db_engine = None

 @dataclass
 class DatabaseConfig:
     """Database configuration"""
@@ -35,6 +26,8 @@ class AppConfig:
     """Top-level config; if a Web service is added in the future, just add a WebConfig"""

     debug: bool
+    wap_screenshot: bool
     database: DatabaseConfig
     chrome: ChromeConfig
@@ -51,3 +44,12 @@ def load_config(config_path: str) -> AppConfig:
-        debug=config_dict["debug"], database=database_config, chrome=chrome_config
+        debug=config_dict["debug"],
+        wap_screenshot=config_dict.get("wap_screenshot", False),
+        database=database_config,
+        chrome=chrome_config,
     )
     return AppCtx.g_app_config
+
+class AppCtx:
+    # Global variables
+    # Configuration
+    g_app_config: AppConfig = None
+    # Database connection
+    g_db_engine = None
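
AppCtx moves below AppConfig so that the new g_app_config: AppConfig annotation can reference the class directly, and the config_dict.get("wap_screenshot", False) default keeps config files written before this change loadable. A usage sketch, assuming the module is importable as app_ctx (the module path is not shown in the diff):

    from app_ctx import AppCtx, load_config

    load_config("./config.toml")              # fills AppCtx.g_app_config
    if AppCtx.g_app_config.wap_screenshot:    # the new optional flag
        print("WAP screenshots enabled")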

View File

@@ -26,7 +26,7 @@ class CrawlEngine:
         # Thread pool
         self.pool: list[threading.Thread] = []
-        self.worker_count = 4
+        self.worker_count = 2
         # Work queue
         self.target_queue = queue.Queue(1024)
@@ -104,7 +104,9 @@ class CrawlEngine:
     def add_domain(self, input_domains: str, input_domain_filepath: str) -> list[str]:
         """Store the input domains in the database"""
         # Build the full list of domains to crawl
-        domains = [d.strip() for d in input_domains.split(",") if d.strip()]
+        domains = []
+        if input_domains:
+            domains.extend([d.strip() for d in input_domains.split(",") if d.strip()])
         if input_domain_filepath:
             with open(input_domain_filepath, "r") as fp:
                 for line in fp:
@@ -130,7 +132,7 @@ class CrawlEngine:
         return domains

     def stop(self):
-        """Stop the crawler"""
+        """Stop the crawler (generic, shared by both modes)"""
         self.ev.set()
         self.worker_status = 0
         self.dp_engine.browser.quit()
@@ -143,7 +145,7 @@ class CrawlEngine:
             thread.start()

     def worker(self):
-        """The actual worker function"""
+        """The actual worker function; Web-mode startup will go through this later"""
         logger.info("crawl worker start!")
         while self.worker_status == 1:
             # Check the database for domains that need crawling
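
add_domain now concatenates the comma-separated argument with the file contents; nothing in the visible hunks deduplicates, so a domain passed through both inputs would be stored twice. A hedged sketch of the combined logic with an order-preserving dedupe (the dedupe is my addition, not shown in the diff):

    def merge_domains(input_domains: str, input_domain_filepath: str) -> list[str]:
        # Combine both input sources, then drop duplicates while keeping order.
        domains: list[str] = []
        if input_domains:
            domains.extend(d.strip() for d in input_domains.split(",") if d.strip())
        if input_domain_filepath:
            with open(input_domain_filepath, "r") as fp:
                domains.extend(line.strip() for line in fp if line.strip())
        return list(dict.fromkeys(domains))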

View File

@@ -92,9 +92,10 @@ class EvidenceEngine:
             return None

         # Part 2: take a screenshot of the surl itself
-        logger.debug(f"Fetching screenshot of {surl}")
-        img_path, wap_tab = self.get_wap_screenshot(target)
-        wap_tab.close()
+        if AppCtx.g_app_config.wap_screenshot:
+            logger.debug(f"Fetching screenshot of {surl}")
+            img_path, wap_tab = self.get_wap_screenshot(target)
+            wap_tab.close()

         # Part 3: fetch the report link
         logger.debug(f"Fetching report link of {surl}")

View File

@@ -73,19 +73,28 @@ class PcReporter(BaseReporter):
             # Check that the screenshots exist
             img_path = f"./imgs/{domain}/{md5(surl)}.png"
-            wap_img_path = f"./imgs/{domain}/{md5(surl)}-wap.png"
-            if not all((os.path.exists(img_path), os.path.exists(wap_img_path))):
-                logger.debug(f"Image {img_path} or {wap_img_path} does not exist")
-                continue
+            if not os.path.exists(img_path):
+                logger.warning(f"Image {img_path} does not exist")
+                continue
+            wap_img_path = f"./imgs/{domain}/{md5(surl)}-wap.png"
+            if AppCtx.g_app_config.wap_screenshot:
+                if not os.path.exists(wap_img_path):
+                    logger.warning(f"Image {wap_img_path} does not exist")
+                    continue

             # Upload the images
             img_filename = self.upload_report_pic(img_path)
-            wap_img_filename = self.upload_report_pic(wap_img_path)
+            wap_img_filename = None
             logger.debug(f"{img_filename=}, {wap_img_filename=}")
-            if not all((img_filename, wap_img_filename)):
-                logger.debug(f"Image {img_path} or {wap_img_path} failed to upload")
-                continue
+            if not img_filename:
+                logger.warning(f"Image {img_path} failed to upload")
+                continue
+            if AppCtx.g_app_config.wap_screenshot:
+                wap_img_filename = self.upload_report_pic(wap_img_path)
+                if not wap_img_filename:
+                    logger.warning(f"Image {wap_img_path} failed to upload")
+                    continue

             # Submit the report
             retry = 0
             while retry < 3:
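
The exists-then-upload-then-check sequence now appears once per screenshot; a hedged consolidation sketch (the helper name _prepare_pic is mine, not from the codebase):

    def _prepare_pic(self, path: str) -> str | None:
        # Return the uploaded filename, or None if the file is missing
        # or the upload fails.
        if not os.path.exists(path):
            logger.warning(f"Image {path} does not exist")
            return None
        filename = self.upload_report_pic(path)
        if not filename:
            logger.warning(f"Image {path} failed to upload")
        return filename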

View File

@@ -1,5 +1,11 @@
+# Whether to enable debug mode; debug mode prints every executed SQL statement
+# (there was too much SQL, so it is disabled directly in the code)
 debug = true
+# Whether to also capture a screenshot of the target URL itself
+wap_screenshot = false

+# Database settings
 [database]
 host = "localhost"
 port = 3306
@@ -7,6 +13,7 @@ user = "root"
 password = "123456"
 database = "baidu_reporter"

+# Chrome settings
 [chrome]
 proxy = "http://127.0.0.1:7890"
 browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"
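
The key names in both TOML files must match AppConfig's field names exactly, or load_config will silently fall back to the .get default. A hedged parity-check sketch; the app_ctx import path is an assumption:

    import dataclasses
    import toml

    from app_ctx import AppConfig  # module path assumed

    def missing_keys(path: str) -> list[str]:
        # List AppConfig fields that the given TOML file does not define.
        cfg = toml.load(path)
        return [f.name for f in dataclasses.fields(AppConfig) if f.name not in cfg]

    # missing_keys("./config.example.toml") -> [] once wap_screenshot is present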

View File

@@ -1,4 +1,5 @@
 debug = false
+wap_screenshot = false

 [database]
 host = "localhost"