diff --git a/README.md b/README.md
index bd1c61a..25734c9 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,15 @@
 # Crawl mode: crawl the URL list for the given keyword and store it directly in the database
 python main.py --crawl www.yunzhiju.net
 
+# Batch crawl mode: separate multiple domains with commas
+python main.py --crawl www.yunzhiju.net,www.yunzhiju.net
+
+# Alternative batch crawl mode: read domains from a file, one per line
+python main.py --crawl-file ./domains.txt
+
+# The two crawl modes can be combined
+python main.py --crawl www.yunzhiju.net,www.yunzhiju.net --crawl-file ./domains.txt
+
 # Evidence mode: collect the screenshot and Token for every link awaiting report
 python main.py --evidence
 
diff --git a/app/app.py b/app/app.py
index 4d36c83..347ea3a 100644
--- a/app/app.py
+++ b/app/app.py
@@ -90,7 +90,7 @@ class MainApp:
 
     def start_cli(self):
         """Start CLI mode"""
-        if self.args.crawl:
+        if self.args.crawl or self.args.crawl_file:
             crawl = CrawlEngine()
             crawl.cli_start(self.args.crawl, self.args.crawl_file)
             crawl.cli_wait()
@@ -103,6 +103,9 @@ class MainApp:
             reporter = Reporter(self.args.report)
             reporter.cli_start()
             reporter.stop()
+        else:
+            logger.error("Invalid mode!")
+            return
 
     def start_web(self):
         """Start Web mode"""
diff --git a/app/config/config.py b/app/config/config.py
index cb15b62..cf426e3 100644
--- a/app/config/config.py
+++ b/app/config/config.py
@@ -2,15 +2,6 @@ from dataclasses import dataclass
 import toml
 
 
-class AppCtx:
-    # Global variables
-    # Configuration
-    g_app_config = None
-
-    # Database connection
-    g_db_engine = None
-
-
 @dataclass
 class DatabaseConfig:
     """Database configuration"""
@@ -35,6 +26,8 @@ class AppConfig:
     """Overall config; if a Web service is added in the future, just add a WebConfig"""
 
     debug: bool
+    wap_screenshot: bool
+
     database: DatabaseConfig
     chrome: ChromeConfig
 
@@ -51,3 +44,15 @@ def load_config(config_path: str) -> AppConfig:
-        debug=config_dict["debug"], database=database_config, chrome=chrome_config
+        debug=config_dict["debug"],
+        wap_screenshot=config_dict.get("wap_screenshot", False),
+        database=database_config,
+        chrome=chrome_config,
     )
     return AppCtx.g_app_config
+
+
+class AppCtx:
+    # Global variables
+    # Configuration
+    g_app_config: AppConfig = None
+
+    # Database connection
+    g_db_engine = None
diff --git a/app/engines/crawl_engine.py b/app/engines/crawl_engine.py
index a6eb225..6c49b36 100644
--- a/app/engines/crawl_engine.py
+++ b/app/engines/crawl_engine.py
@@ -26,7 +26,7 @@ class CrawlEngine:
 
         # Thread pool
         self.pool: list[threading.Thread] = []
-        self.worker_count = 4
+        self.worker_count = 2
 
         # Work queue
         self.target_queue = queue.Queue(1024)
@@ -104,7 +104,9 @@ def add_domain(self, input_domains: str, input_domain_filepath: str) -> list[str]:
         """Store the input domains into the database"""
         # Build the full list of domains to crawl
-        domains = [d.strip() for d in input_domains.split(",") if d.strip()]
+        domains = []
+        if input_domains:
+            domains.extend([d.strip() for d in input_domains.split(",") if d.strip()])
         if input_domain_filepath:
             with open(input_domain_filepath, "r") as fp:
                 for line in fp:
@@ -130,7 +132,7 @@ class CrawlEngine:
         return domains
 
     def stop(self):
-        """Stop the crawler"""
+        """Stop the crawler; shared by all modes"""
         self.ev.set()
         self.worker_status = 0
         self.dp_engine.browser.quit()
@@ -143,7 +145,7 @@ class CrawlEngine:
             thread.start()
 
     def worker(self):
-        """The actual worker function"""
+        """The actual worker function; Web mode will also go through this later"""
         logger.info("crawl worker start!")
         while self.worker_status == 1:
             # Check the database for domains that need crawling
diff --git a/app/engines/evidence_engine.py b/app/engines/evidence_engine.py
index b81fd19..9cc4d00 100644
--- a/app/engines/evidence_engine.py
+++ b/app/engines/evidence_engine.py
@@ -92,9 +92,10 @@ class EvidenceEngine:
             return None
 
         # Part2: take a screenshot of the surl itself
-        logger.debug(f"Fetching the screenshot of {surl}")
-        img_path, wap_tab = self.get_wap_screenshot(target)
-        wap_tab.close()
+        if AppCtx.g_app_config.wap_screenshot:
+            logger.debug(f"Fetching the screenshot of {surl}")
+            img_path, wap_tab = self.get_wap_screenshot(target)
+            wap_tab.close()
 
         # Part3: fetch the report link
         logger.debug(f"Fetching the report link of {surl}")
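Aside on the combined --crawl / --crawl-file handling above: the merging in add_domain is easy to exercise on its own. A minimal sketch, reproducing only the parsing visible in the crawl_engine.py hunk; merge_domains is an illustrative name, not part of the codebase, and the database persistence that follows in add_domain is omitted.

def merge_domains(input_domains: str | None, input_domain_filepath: str | None) -> list[str]:
    domains: list[str] = []
    if input_domains:
        # --crawl takes a comma-separated list; empty fragments are dropped
        domains.extend(d.strip() for d in input_domains.split(",") if d.strip())
    if input_domain_filepath:
        # --crawl-file reads one domain per line; blank lines are skipped
        with open(input_domain_filepath, "r") as fp:
            domains.extend(line.strip() for line in fp if line.strip())
    return domains

print(merge_domains("a.example.com, b.example.com", None))
# -> ['a.example.com', 'b.example.com']

Guarding both inputs with `if` is what lets either flag be omitted: the old one-liner crashed on `None.split(",")` whenever only --crawl-file was given.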
diff --git a/app/engines/reporters/pc_reporter.py b/app/engines/reporters/pc_reporter.py
index 26869e1..33a5325 100644
--- a/app/engines/reporters/pc_reporter.py
+++ b/app/engines/reporters/pc_reporter.py
@@ -73,19 +73,29 @@ class PcReporter(BaseReporter):
 
             # Check whether the screenshots exist
             img_path = f"./imgs/{domain}/{md5(surl)}.png"
-            wap_img_path = f"./imgs/{domain}/{md5(surl)}-wap.png"
-            if not all((os.path.exists(img_path), os.path.exists(wap_img_path))):
-                logger.debug(f"Image {img_path} or {wap_img_path} does not exist")
+            if not os.path.exists(img_path):
+                logger.warning(f"Image {img_path} does not exist")
                 continue
 
+            wap_img_path = f"./imgs/{domain}/{md5(surl)}-wap.png"
+            if AppCtx.g_app_config.wap_screenshot:
+                if not os.path.exists(wap_img_path):
+                    logger.warning(f"Image {wap_img_path} does not exist")
+                    continue
+
             # Upload the screenshots
             img_filename = self.upload_report_pic(img_path)
-            wap_img_filename = self.upload_report_pic(wap_img_path)
-            logger.debug(f"{img_filename=}, {wap_img_filename=}")
-            if not all((img_filename, wap_img_filename)):
-                logger.debug(f"Image {img_path} or {wap_img_path} failed to upload")
+            logger.debug(f"{img_filename=}")
+            if not img_filename:
+                logger.warning(f"Image {img_path} failed to upload")
                 continue
 
+            if AppCtx.g_app_config.wap_screenshot:
+                wap_img_filename = self.upload_report_pic(wap_img_path)
+                if not wap_img_filename:
+                    logger.warning(f"Image {wap_img_path} failed to upload")
+                    continue
+
             # Submit the report
             retry = 0
             while retry < 3:
diff --git a/config.local.toml b/config.local.toml
index ac29e81..3d48548 100644
--- a/config.local.toml
+++ b/config.local.toml
@@ -1,5 +1,11 @@
+# Whether to enable debug mode; debug mode prints the executed SQL statements
+# (but since there is far too much SQL, this is disabled directly in the code)
 debug = true
 
+# Whether to take a screenshot of the target URL itself
+wap_screenshot = false
+
+# Database configuration
 [database]
 host = "localhost"
 port = 3306
@@ -7,6 +13,7 @@ user = "root"
 password = "123456"
 database = "baidu_reporter"
 
+# Chrome configuration
 [chrome]
 proxy = "http://127.0.0.1:7890"
 browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"
\ No newline at end of file
diff --git a/config.prod.toml b/config.prod.toml
index 292dcf1..982b9f1 100644
--- a/config.prod.toml
+++ b/config.prod.toml
@@ -1,4 +1,5 @@
 debug = false
+wap_screenshot = false
 
 [database]
 host = "localhost"
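To sanity-check the new wap_screenshot switch end to end, here is a minimal sketch assuming only the load_config code shown above; the inline TOML string stands in for config.local.toml and is purely illustrative.

import toml

raw = """
debug = false
wap_screenshot = false
"""

config_dict = toml.loads(raw)

# Reading with .get() and a False default keeps configs written before
# this key existed loadable, matching the conservative prod default.
wap_screenshot = config_dict.get("wap_screenshot", False)
print(f"{wap_screenshot=}")  # -> wap_screenshot=False

With the flag off, the evidence engine skips the wap screenshot entirely and the PC reporter neither requires nor uploads the -wap.png file, so both sides of the pipeline have to read the same key.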