wap截图改为可选项
This commit is contained in:
parent
cdc51a6eac
commit
de563198ac
@ -6,6 +6,15 @@
|
|||||||
# 采集模式,采集指定关键字的URL列表,直接存入数据库
|
# 采集模式,采集指定关键字的URL列表,直接存入数据库
|
||||||
python main.py --crawl www.yunzhiju.net
|
python main.py --crawl www.yunzhiju.net
|
||||||
|
|
||||||
|
# 批量采集模式,使用英文逗号分割多个域名
|
||||||
|
python main.py --crawl www.yunzhiju.net,www.example.com
|
||||||
|
|
||||||
|
# 另外一种批量采集模式,从文件中读取域名,每行一个
|
||||||
|
python main.py --crawl-file ./domains.txt
|
||||||
|
|
||||||
|
# 这两种采集模式可以一起使用
|
||||||
|
python main.py --crawl www.yunzhiju.net,www.example.com --crawl-file ./domains.txt
|
||||||
|
|
||||||
# 收集模式,收集所有待举报的链接的截图与 Token
|
# 收集模式,收集所有待举报的链接的截图与 Token
|
||||||
python main.py --evidence
|
python main.py --evidence
|
||||||
|
|
||||||
|
|||||||
@ -90,7 +90,7 @@ class MainApp:
|
|||||||
|
|
||||||
def start_cli(self):
|
def start_cli(self):
|
||||||
"""开启 CLI 模式"""
|
"""开启 CLI 模式"""
|
||||||
if self.args.crawl:
|
if self.args.crawl or self.args.crawl_file:
|
||||||
crawl = CrawlEngine()
|
crawl = CrawlEngine()
|
||||||
crawl.cli_start(self.args.crawl, self.args.crawl_file)
|
crawl.cli_start(self.args.crawl, self.args.crawl_file)
|
||||||
crawl.cli_wait()
|
crawl.cli_wait()
|
||||||
@ -103,6 +103,9 @@ class MainApp:
|
|||||||
reporter = Reporter(self.args.report)
|
reporter = Reporter(self.args.report)
|
||||||
reporter.cli_start()
|
reporter.cli_start()
|
||||||
reporter.stop()
|
reporter.stop()
|
||||||
|
else:
|
||||||
|
logger.error("模式错误!")
|
||||||
|
return
|
||||||
|
|
||||||
def start_web(self):
|
def start_web(self):
|
||||||
"""开启 Web 模式"""
|
"""开启 Web 模式"""
|
||||||
|
|||||||
@ -2,15 +2,6 @@ from dataclasses import dataclass
|
|||||||
import toml
|
import toml
|
||||||
|
|
||||||
|
|
||||||
class AppCtx:
|
|
||||||
# 全局变量
|
|
||||||
# 配置信息
|
|
||||||
g_app_config = None
|
|
||||||
|
|
||||||
# 数据库连接
|
|
||||||
g_db_engine = None
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class DatabaseConfig:
|
class DatabaseConfig:
|
||||||
"""数据库配置"""
|
"""数据库配置"""
|
||||||
@ -35,6 +26,8 @@ class AppConfig:
|
|||||||
"""总配置,如果未来增加 Web 服务,添加 WebConfig 即可"""
|
"""总配置,如果未来增加 Web 服务,添加 WebConfig 即可"""
|
||||||
|
|
||||||
debug: bool
|
debug: bool
|
||||||
|
wap_screenshot: bool
|
||||||
|
|
||||||
database: DatabaseConfig
|
database: DatabaseConfig
|
||||||
chrome: ChromeConfig
|
chrome: ChromeConfig
|
||||||
|
|
||||||
@ -51,3 +44,12 @@ def load_config(config_path: str) -> AppConfig:
|
|||||||
debug=config_dict["debug"], database=database_config, chrome=chrome_config
|
debug=config_dict["debug"], database=database_config, chrome=chrome_config
|
||||||
)
|
)
|
||||||
return AppCtx.g_app_config
|
return AppCtx.g_app_config
|
||||||
|
|
||||||
|
|
||||||
|
class AppCtx:
|
||||||
|
# 全局变量
|
||||||
|
# 配置信息
|
||||||
|
g_app_config: AppConfig = None
|
||||||
|
|
||||||
|
# 数据库连接
|
||||||
|
g_db_engine = None
|
||||||
|
|||||||
@ -26,7 +26,7 @@ class CrawlEngine:
|
|||||||
|
|
||||||
# 线程池
|
# 线程池
|
||||||
self.pool: list[threading.Thread] = []
|
self.pool: list[threading.Thread] = []
|
||||||
self.worker_count = 4
|
self.worker_count = 2
|
||||||
|
|
||||||
# 工作队列
|
# 工作队列
|
||||||
self.target_queue = queue.Queue(1024)
|
self.target_queue = queue.Queue(1024)
|
||||||
@ -104,7 +104,9 @@ class CrawlEngine:
|
|||||||
def add_domain(self, input_domains: str, input_domain_filepath: str) -> list[str]:
|
def add_domain(self, input_domains: str, input_domain_filepath: str) -> list[str]:
|
||||||
"""把输入的域名存到库里"""
|
"""把输入的域名存到库里"""
|
||||||
# 生成所有待采集的域名列表
|
# 生成所有待采集的域名列表
|
||||||
domains = [d.strip() for d in input_domains.split(",") if d.strip()]
|
domains = []
|
||||||
|
if input_domains:
|
||||||
|
domains.extend([d.strip() for d in input_domains.split(",") if d.strip()])
|
||||||
if input_domain_filepath:
|
if input_domain_filepath:
|
||||||
with open(input_domain_filepath, "r") as fp:
|
with open(input_domain_filepath, "r") as fp:
|
||||||
for line in fp:
|
for line in fp:
|
||||||
@ -130,7 +132,7 @@ class CrawlEngine:
|
|||||||
return domains
|
return domains
|
||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
"""停止采集器"""
|
"""停止采集器,通用的"""
|
||||||
self.ev.set()
|
self.ev.set()
|
||||||
self.worker_status = 0
|
self.worker_status = 0
|
||||||
self.dp_engine.browser.quit()
|
self.dp_engine.browser.quit()
|
||||||
@ -143,7 +145,7 @@ class CrawlEngine:
|
|||||||
thread.start()
|
thread.start()
|
||||||
|
|
||||||
def worker(self):
|
def worker(self):
|
||||||
"""真正的工作函数"""
|
"""真正的工作函数,后续以Web模式启动的时候,走这个"""
|
||||||
logger.info("crawl worker start!")
|
logger.info("crawl worker start!")
|
||||||
while self.worker_status == 1:
|
while self.worker_status == 1:
|
||||||
# 检查数据库,从中获取需要爬取的域名
|
# 检查数据库,从中获取需要爬取的域名
|
||||||
|
|||||||
@ -92,9 +92,10 @@ class EvidenceEngine:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
# Part2 截一张surl本身的图
|
# Part2 截一张surl本身的图
|
||||||
logger.debug(f"开始获取 {surl} 的截图")
|
if AppCtx.g_app_config.wap_screenshot:
|
||||||
img_path, wap_tab = self.get_wap_screenshot(target)
|
logger.debug(f"开始获取 {surl} 的截图")
|
||||||
wap_tab.close()
|
img_path, wap_tab = self.get_wap_screenshot(target)
|
||||||
|
wap_tab.close()
|
||||||
|
|
||||||
# Part3 获取举报链接
|
# Part3 获取举报链接
|
||||||
logger.debug(f"开始获取 {surl} 的举报链接")
|
logger.debug(f"开始获取 {surl} 的举报链接")
|
||||||
|
|||||||
@ -73,19 +73,28 @@ class PcReporter(BaseReporter):
|
|||||||
|
|
||||||
# 检查图片是否存在
|
# 检查图片是否存在
|
||||||
img_path = f"./imgs/{domain}/{md5(surl)}.png"
|
img_path = f"./imgs/{domain}/{md5(surl)}.png"
|
||||||
wap_img_path = f"./imgs/{domain}/{md5(surl)}-wap.png"
|
if not os.path.exists(img_path):
|
||||||
if not all((os.path.exists(img_path), os.path.exists(wap_img_path))):
|
logger.warning(f"图片{img_path}不存在")
|
||||||
logger.debug(f"图片{img_path} 或 {wap_img_path} 不存在")
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
wap_img_path = f"./imgs/{domain}/{md5(surl)}-wap.png"
|
||||||
|
if AppCtx.g_app_config.wap_screenshot:
|
||||||
|
if not os.path.exists(wap_img_path):
|
||||||
|
logger.warning(f"图片{wap_img_path}不存在")
|
||||||
|
continue
|
||||||
|
|
||||||
# 上传图片
|
# 上传图片
|
||||||
img_filename = self.upload_report_pic(img_path)
|
img_filename = self.upload_report_pic(img_path)
|
||||||
wap_img_filename = self.upload_report_pic(wap_img_path)
|
|
||||||
logger.debug(f"{img_filename=}, {wap_img_filename=}")
|
logger.debug(f"{img_filename=}, {wap_img_filename=}")
|
||||||
if not all((img_filename, wap_img_filename)):
|
if not img_filename:
|
||||||
logger.debug(f"图片 {img_path} 或 {wap_img_path} 上传失败")
|
logger.warning(f"图片 {img_path} 上传失败")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if AppCtx.g_app_config.wap_screenshot:
|
||||||
|
wap_img_filename = self.upload_report_pic(wap_img_path)
|
||||||
|
if not wap_img_filename:
|
||||||
|
logger.warning(f"图片 {wap_img_filename} 上传失败")
|
||||||
|
|
||||||
# 提交举报
|
# 提交举报
|
||||||
retry = 0
|
retry = 0
|
||||||
while retry < 3:
|
while retry < 3:
|
||||||
|
|||||||
@ -1,5 +1,11 @@
|
|||||||
|
# 是否开启 debug 模式,debug模式下会打印执行的SQL语句
|
||||||
|
# 但是因为 SQL 太多了,直接在代码里屏蔽掉了
|
||||||
debug = true
|
debug = true
|
||||||
|
|
||||||
|
# 是否截取目标URL本身的图片
|
||||||
|
wap_screenshot = false
|
||||||
|
|
||||||
|
# 数据库配置
|
||||||
[database]
|
[database]
|
||||||
host = "localhost"
|
host = "localhost"
|
||||||
port = 3306
|
port = 3306
|
||||||
@ -7,6 +13,7 @@ user = "root"
|
|||||||
password = "123456"
|
password = "123456"
|
||||||
database = "baidu_reporter"
|
database = "baidu_reporter"
|
||||||
|
|
||||||
|
# chrome 配置
|
||||||
[chrome]
|
[chrome]
|
||||||
proxy = "http://127.0.0.1:7890"
|
proxy = "http://127.0.0.1:7890"
|
||||||
browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"
|
browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"
|
||||||
@ -1,4 +1,5 @@
|
|||||||
debug = false
|
debug = false
|
||||||
|
wap_screenshot = false
|
||||||
|
|
||||||
[database]
|
[database]
|
||||||
host = "localhost"
|
host = "localhost"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user