From 4329045cc68b432e120f2dfead68127cec61796e Mon Sep 17 00:00:00 2001 From: xhy Date: Tue, 1 Apr 2025 22:53:32 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E4=BA=86=E4=B8=80=E5=A0=86bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/app.py | 13 +++++ app/config/config.py | 3 + app/engines/crawl_engine.py | 81 +++++++++++++------------- app/engines/reporters/pc_reporter.py | 3 +- app/engines/reporters/site_reporter.py | 1 + app/engines/reporters/wap_reporter.py | 11 +++- app/utils/common.py | 4 ++ app/utils/dp.py | 4 +- config.local.toml | 4 +- 9 files changed, 78 insertions(+), 46 deletions(-) diff --git a/app/app.py b/app/app.py index 347ea3a..86007a2 100644 --- a/app/app.py +++ b/app/app.py @@ -6,6 +6,7 @@ import time from app.engines.report_engine import Reporter from .config import load_config, AppConfig +from .config.config import AppCtx from .engines.crawl_engine import CrawlEngine from .engines.evidence_engine import EvidenceEngine from .models.base import connect_db, create_database @@ -59,6 +60,12 @@ class MainApp: "--web", action="store_true", help="启动 web 服务器,启动后将忽略其他选项" ) + parser.add_argument( + "-s", + action="store_true", + help="当设置此选项的时候,将以正常模式启动Chrome(非headless模式),方便调试与观察运行情况" + ) + # 如果没有传入任何参数,显示帮助信息 if len(sys.argv) == 1: parser.print_help() @@ -121,6 +128,12 @@ class MainApp: self.config = load_config(self.args.config) logger.info(f"加载配置文件 {self.args.config} 成功") + # 设置 chrome 模式 + if self.args.s: + AppCtx.g_app_config.headless_chrome = False + else: + AppCtx.g_app_config.headless_chrome = True + # 连接数据库 try: self.db_engine = connect_db(self.config) diff --git a/app/config/config.py b/app/config/config.py index 676e618..f03dfd4 100644 --- a/app/config/config.py +++ b/app/config/config.py @@ -28,6 +28,8 @@ class AppConfig: debug: bool wap_screenshot: bool + headless_chrome: bool + database: DatabaseConfig chrome: ChromeConfig @@ -43,6 +45,7 @@ def load_config(config_path: str) -> AppConfig: AppCtx.g_app_config = AppConfig( debug=config_dict["debug"], wap_screenshot=config_dict["wap_screenshot"], + headless_chrome=True, database=database_config, chrome=chrome_config ) diff --git a/app/engines/crawl_engine.py b/app/engines/crawl_engine.py index 6c49b36..8c1c45c 100644 --- a/app/engines/crawl_engine.py +++ b/app/engines/crawl_engine.py @@ -2,6 +2,7 @@ import queue import threading import time +from DrissionPage.errors import ElementNotFoundError from loguru import logger from sqlmodel import Session, select @@ -85,7 +86,7 @@ class CrawlEngine: domain = self.target_queue.get_nowait() surl = self.crawl(domain) if not surl: - logger.debug(f"{threading.current_thread().name} 爬取 {domain} 异常,开始处理下一个") + logger.debug(f"{threading.current_thread().name} 爬取 {domain} 无结果,开始处理下一个") continue # 存入数据库 @@ -206,49 +207,51 @@ class CrawlEngine: tab.wait.eles_loaded(["#container", ".content_none", "#content_left"], any_one=True) while True: + try: + # 增加页码 + current_page += 1 + logger.debug(f"{threading.current_thread().name} 爬取 {domain} 的第 {current_page} 页数据") + # 直接访问 URL 会触发验证码 + # tab.get( + # f"https://www.baidu.com/s?wd=site%3A{domain}&gpc=stf%3D{start_time}%2C{end_time}%7Cstftype%3D1&pn={(current_page - 1) * 10}") + # tab.get(f"https://www.baidu.com/s?wd=site%3A{domain}&pn={(current_page - 1) * 10}") - # 增加页码 - current_page += 1 - logger.debug(f"{threading.current_thread().name} 爬取 {domain} 的第 {current_page} 页数据") - # 直接访问 URL 会触发验证码 - # tab.get( - # f"https://www.baidu.com/s?wd=site%3A{domain}&gpc=stf%3D{start_time}%2C{end_time}%7Cstftype%3D1&pn={(current_page - 1) * 10}") - # tab.get(f"https://www.baidu.com/s?wd=site%3A{domain}&pn={(current_page - 1) * 10}") + # 终止条件 + if current_page > max_page and max_page: + logger.debug(f"{threading.current_thread().name} 达到指定页码,退出") + break - # 终止条件 - if current_page > max_page and max_page: - logger.debug(f"{threading.current_thread().name} 达到指定页码,退出") + # logger.debug(f"tab.html: {tab.html}") + self.ev.wait(0.3) + if "未找到相关结果" in tab.html: + logger.debug(f"{threading.current_thread().name} 未找到结果,退出") + break + + # 获取数据 + tab.wait.eles_loaded("@id=content_left") + results = tab.ele("@id=content_left").eles("@class:result") + # temp = [result.attr("mu") for result in results if result.attr("mu") is not None] + for result in results: + surl = result.attr("mu") + if not surl: + continue + logger.debug(f"{threading.current_thread().name} 找到 URL : {surl}") + surl_set.add(surl) + + # 翻页的时候等一下,别太快了 + self.ev.wait(0.3) + # 如果没有下一页了,这个地方会找不到元素,有 10 秒的 timeout + next_btn = tab.ele("t:a@@text():下一页") + if not next_btn: + logger.debug(f"{threading.current_thread().name} 没有下一页了") + break + next_btn.click(True) + except ElementNotFoundError as e: + logger.error(f"没有找到 HTML 元素,跳过,详细信息: {e}") break - - # logger.debug(f"tab.html: {tab.html}") - self.ev.wait(0.3) - if "未找到相关结果" in tab.html: - logger.debug(f"{threading.current_thread().name} 未找到结果,退出") - break - - # 获取数据 - tab.wait.eles_loaded("@id=content_left") - results = tab.ele("@id=content_left").eles("@class:result") - # temp = [result.attr("mu") for result in results if result.attr("mu") is not None] - for result in results: - surl = result.attr("mu") - if not surl: - continue - logger.debug(f"{threading.current_thread().name} 找到 URL : {surl}") - surl_set.add(surl) - - # 翻页的时候等一下,别太快了 - self.ev.wait(0.3) - # 如果没有下一页了,这个地方会找不到元素,有 10 秒的 timeout - next_btn = tab.ele("t:a@@text():下一页") - if not next_btn: - logger.debug(f"{threading.current_thread().name} 没有下一页了") - break - next_btn.click(True) - return surl_set except Exception as e: - logger.error(f"{threading.current_thread().name} 爬取{domain}发生错误:{e}") + logger.error(f"{threading.current_thread().name} 爬取 {domain} 发生错误:{e}") import traceback traceback.print_exc() finally: diff --git a/app/engines/reporters/pc_reporter.py b/app/engines/reporters/pc_reporter.py index 33a5325..f77913a 100644 --- a/app/engines/reporters/pc_reporter.py +++ b/app/engines/reporters/pc_reporter.py @@ -85,13 +85,14 @@ class PcReporter(BaseReporter): # 上传图片 img_filename = self.upload_report_pic(img_path) - logger.debug(f"{img_filename=}, {wap_img_filename=}") + logger.debug(f"{img_filename=}") if not img_filename: logger.warning(f"图片 {img_path} 上传失败") continue if AppCtx.g_app_config.wap_screenshot: wap_img_filename = self.upload_report_pic(wap_img_path) + logger.debug(f"{wap_img_filename=}") if not wap_img_filename: logger.warning(f"图片 {wap_img_filename} 上传失败") diff --git a/app/engines/reporters/site_reporter.py b/app/engines/reporters/site_reporter.py index 05a3883..874828b 100644 --- a/app/engines/reporters/site_reporter.py +++ b/app/engines/reporters/site_reporter.py @@ -64,6 +64,7 @@ class SiteReporter(BaseReporter): cookie = random.choice(self.all_cookies) report_site_cookie = GenCookie.run(cookie) self.headers["Cookie"] = report_site_cookie + self.headers["User-Agent"] = random_ua() logger.debug(f"设置 cookie 为:{report_site_cookie}") # 先上传图片 diff --git a/app/engines/reporters/wap_reporter.py b/app/engines/reporters/wap_reporter.py index ab3c6f4..4a92dab 100644 --- a/app/engines/reporters/wap_reporter.py +++ b/app/engines/reporters/wap_reporter.py @@ -12,6 +12,7 @@ from .base import BaseReporter from ...config.config import AppCtx from ...models.report_urls import ReportUrlModel from ...utils.common import get_proxies, get_all_cookies, md5 +from ...utils.gen_cookie import GenCookie from ...utils.ua import random_ua @@ -51,7 +52,9 @@ class WapReporter(BaseReporter): # 选个 cookie report_cookie = random.choice(get_all_cookies()) - self.headers["Cookie"] = report_cookie + report_site_cookie = GenCookie.run(report_cookie) + self.headers["Cookie"] = report_site_cookie + self.headers["User-Agent"] = random_ua() logger.debug(f"{report_cookie=}") # 获取用户信息 @@ -79,13 +82,14 @@ class WapReporter(BaseReporter): # wapUserAgent = random.choice(self.wapUserAgent) response = self.request.get( "https://ufosdk.baidu.com/api?m=Web&a=getUserInfo&appid=293852", - headers=self.headers, proxies=self.proxies, allow_redirects=False, timeout=5 + headers=self.headers, proxies=self.proxies, allow_redirects=False, timeout=10, verify=False ) json_data = response.json() uid = json_data['result']['uid'] un = json_data['result']['un'] userinfo["uid"] = uid userinfo["un"] = un + logger.debug(f"{userinfo=}") return userinfo except Exception as e: logger.error(f"[{self.engine_name}]获取用户信息错误: {e}") @@ -135,7 +139,8 @@ class WapReporter(BaseReporter): headers=self.headers, proxies=self.proxies, allow_redirects=False, - timeout=5 + timeout=10, + verify=False ) # logger.debug(req.json()) logger.debug(response.json()) diff --git a/app/utils/common.py b/app/utils/common.py index 554ec18..2b2655f 100644 --- a/app/utils/common.py +++ b/app/utils/common.py @@ -16,6 +16,10 @@ def get_proxies(): "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}, "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel} } + # proxies = { + # "http": "http://127.0.0.1:8080", + # "https": "http://127.0.0.1:8080" + # } return proxies diff --git a/app/utils/dp.py b/app/utils/dp.py index 92b0d7c..33110be 100644 --- a/app/utils/dp.py +++ b/app/utils/dp.py @@ -1,4 +1,5 @@ from DrissionPage import Chromium, ChromiumOptions +from loguru import logger from .ua import random_ua from ..config.config import AppCtx @@ -10,7 +11,8 @@ class DPEngine: def __init__(self, is_wap: bool = False, no_img: bool = True): chrome_opts = ChromiumOptions() chrome_opts.mute(True) # 静音 - chrome_opts.headless(False) # 无头模式 + logger.debug(f"{AppCtx.g_app_config.headless_chrome=}") + chrome_opts.headless(AppCtx.g_app_config.headless_chrome) # 无头模式 chrome_opts.no_imgs(no_img) # 不加载图片 chrome_opts.set_argument("--disable-gpu") # 禁用GPU chrome_opts.set_argument('--ignore-certificate-errors') # 忽略证书错误 diff --git a/config.local.toml b/config.local.toml index 3d48548..f28c651 100644 --- a/config.local.toml +++ b/config.local.toml @@ -3,7 +3,7 @@ debug = true # 是否截取目标URL本身的图片 -wap_snapshot = false +wap_screenshot = false # 数据库配置 [database] @@ -15,5 +15,5 @@ database = "baidu_reporter" # chrome 配置 [chrome] -proxy = "http://127.0.0.1:7890" +proxy = "http://127.0.0.1:8080" browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe" \ No newline at end of file