wap截图改为可选项
This commit is contained in:
parent
cdc51a6eac
commit
de563198ac
@ -6,6 +6,15 @@
|
||||
# 采集模式,采集指定关键字的URL列表,直接存入数据库
|
||||
python main.py --crawl www.yunzhiju.net
|
||||
|
||||
# 批量采集模式,使用英文逗号分割多个域名
|
||||
python main.py --crawl www.yunzhiju.net,www.yunzhiju.net
|
||||
|
||||
# 另外一种批量采集模式,从文件中读取域名,每行一个
|
||||
python main.py --crawl-file ./domains.txt
|
||||
|
||||
# 这两种采集模式可以一起使用
|
||||
python main.py --crawl www.yunzhiju.net,www.yunzhiju.net --crawl-file ./domains.txt
|
||||
|
||||
# 收集模式,收集所有待举报的链接的截图与 Token
|
||||
python main.py --evidence
|
||||
|
||||
|
||||
@ -90,7 +90,7 @@ class MainApp:
|
||||
|
||||
def start_cli(self):
|
||||
"""开启 CLI 模式"""
|
||||
if self.args.crawl:
|
||||
if self.args.crawl or self.args.crawl_file:
|
||||
crawl = CrawlEngine()
|
||||
crawl.cli_start(self.args.crawl, self.args.crawl_file)
|
||||
crawl.cli_wait()
|
||||
@ -103,6 +103,9 @@ class MainApp:
|
||||
reporter = Reporter(self.args.report)
|
||||
reporter.cli_start()
|
||||
reporter.stop()
|
||||
else:
|
||||
logger.error("模式错误!")
|
||||
return
|
||||
|
||||
def start_web(self):
|
||||
"""开启 Web 模式"""
|
||||
|
||||
@ -2,15 +2,6 @@ from dataclasses import dataclass
|
||||
import toml
|
||||
|
||||
|
||||
class AppCtx:
|
||||
# 全局变量
|
||||
# 配置信息
|
||||
g_app_config = None
|
||||
|
||||
# 数据库连接
|
||||
g_db_engine = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class DatabaseConfig:
|
||||
"""数据库配置"""
|
||||
@ -35,6 +26,8 @@ class AppConfig:
|
||||
"""总配置,如果未来增加 Web 服务,添加 WebConfig 即可"""
|
||||
|
||||
debug: bool
|
||||
wap_screenshot: bool
|
||||
|
||||
database: DatabaseConfig
|
||||
chrome: ChromeConfig
|
||||
|
||||
@ -51,3 +44,12 @@ def load_config(config_path: str) -> AppConfig:
|
||||
debug=config_dict["debug"], database=database_config, chrome=chrome_config
|
||||
)
|
||||
return AppCtx.g_app_config
|
||||
|
||||
|
||||
class AppCtx:
|
||||
# 全局变量
|
||||
# 配置信息
|
||||
g_app_config: AppConfig = None
|
||||
|
||||
# 数据库连接
|
||||
g_db_engine = None
|
||||
|
||||
@ -26,7 +26,7 @@ class CrawlEngine:
|
||||
|
||||
# 线程池
|
||||
self.pool: list[threading.Thread] = []
|
||||
self.worker_count = 4
|
||||
self.worker_count = 2
|
||||
|
||||
# 工作队列
|
||||
self.target_queue = queue.Queue(1024)
|
||||
@ -104,7 +104,9 @@ class CrawlEngine:
|
||||
def add_domain(self, input_domains: str, input_domain_filepath: str) -> list[str]:
|
||||
"""把输入的域名存到库里"""
|
||||
# 生成所有待采集的域名列表
|
||||
domains = [d.strip() for d in input_domains.split(",") if d.strip()]
|
||||
domains = []
|
||||
if input_domains:
|
||||
domains.extend([d.strip() for d in input_domains.split(",") if d.strip()])
|
||||
if input_domain_filepath:
|
||||
with open(input_domain_filepath, "r") as fp:
|
||||
for line in fp:
|
||||
@ -130,7 +132,7 @@ class CrawlEngine:
|
||||
return domains
|
||||
|
||||
def stop(self):
|
||||
"""停止采集器"""
|
||||
"""停止采集器,通用的"""
|
||||
self.ev.set()
|
||||
self.worker_status = 0
|
||||
self.dp_engine.browser.quit()
|
||||
@ -143,7 +145,7 @@ class CrawlEngine:
|
||||
thread.start()
|
||||
|
||||
def worker(self):
|
||||
"""真正的工作函数"""
|
||||
"""真正的工作函数,后续以Web模式启动的时候,走这个"""
|
||||
logger.info("crawl worker start!")
|
||||
while self.worker_status == 1:
|
||||
# 检查数据库,从中获取需要爬取的域名
|
||||
|
||||
@ -92,9 +92,10 @@ class EvidenceEngine:
|
||||
return None
|
||||
|
||||
# Part2 截一张surl本身的图
|
||||
logger.debug(f"开始获取 {surl} 的截图")
|
||||
img_path, wap_tab = self.get_wap_screenshot(target)
|
||||
wap_tab.close()
|
||||
if AppCtx.g_app_config.wap_screenshot:
|
||||
logger.debug(f"开始获取 {surl} 的截图")
|
||||
img_path, wap_tab = self.get_wap_screenshot(target)
|
||||
wap_tab.close()
|
||||
|
||||
# Part3 获取举报链接
|
||||
logger.debug(f"开始获取 {surl} 的举报链接")
|
||||
|
||||
@ -73,19 +73,28 @@ class PcReporter(BaseReporter):
|
||||
|
||||
# 检查图片是否存在
|
||||
img_path = f"./imgs/{domain}/{md5(surl)}.png"
|
||||
wap_img_path = f"./imgs/{domain}/{md5(surl)}-wap.png"
|
||||
if not all((os.path.exists(img_path), os.path.exists(wap_img_path))):
|
||||
logger.debug(f"图片{img_path} 或 {wap_img_path} 不存在")
|
||||
if not os.path.exists(img_path):
|
||||
logger.warning(f"图片{img_path}不存在")
|
||||
continue
|
||||
|
||||
wap_img_path = f"./imgs/{domain}/{md5(surl)}-wap.png"
|
||||
if AppCtx.g_app_config.wap_screenshot:
|
||||
if not os.path.exists(wap_img_path):
|
||||
logger.warning(f"图片{wap_img_path}不存在")
|
||||
continue
|
||||
|
||||
# 上传图片
|
||||
img_filename = self.upload_report_pic(img_path)
|
||||
wap_img_filename = self.upload_report_pic(wap_img_path)
|
||||
logger.debug(f"{img_filename=}, {wap_img_filename=}")
|
||||
if not all((img_filename, wap_img_filename)):
|
||||
logger.debug(f"图片 {img_path} 或 {wap_img_path} 上传失败")
|
||||
if not img_filename:
|
||||
logger.warning(f"图片 {img_path} 上传失败")
|
||||
continue
|
||||
|
||||
if AppCtx.g_app_config.wap_screenshot:
|
||||
wap_img_filename = self.upload_report_pic(wap_img_path)
|
||||
if not wap_img_filename:
|
||||
logger.warning(f"图片 {wap_img_filename} 上传失败")
|
||||
|
||||
# 提交举报
|
||||
retry = 0
|
||||
while retry < 3:
|
||||
|
||||
@ -1,5 +1,11 @@
|
||||
# 是否开启 debug 模式,debug模式下会打印执行的SQL语句
|
||||
# 但是因为 SQL 太多了,直接在代码里屏蔽掉了
|
||||
debug = true
|
||||
|
||||
# 是否截取目标URL本身的图片
|
||||
wap_screenshot = false
|
||||
|
||||
# 数据库配置
|
||||
[database]
|
||||
host = "localhost"
|
||||
port = 3306
|
||||
@ -7,6 +13,7 @@ user = "root"
|
||||
password = "123456"
|
||||
database = "baidu_reporter"
|
||||
|
||||
# chrome 配置
|
||||
[chrome]
|
||||
proxy = "http://127.0.0.1:7890"
|
||||
browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"
|
||||
@ -1,4 +1,5 @@
|
||||
debug = false
|
||||
wap_screenshot = false
|
||||
|
||||
[database]
|
||||
host = "localhost"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user