Fixed a bunch of bugs

xhy 2025-04-01 22:53:32 +08:00
parent e5456ef67c
commit 4329045cc6
9 changed files with 78 additions and 46 deletions

View File

@@ -6,6 +6,7 @@ import time
 from app.engines.report_engine import Reporter
 from .config import load_config, AppConfig
+from .config.config import AppCtx
 from .engines.crawl_engine import CrawlEngine
 from .engines.evidence_engine import EvidenceEngine
 from .models.base import connect_db, create_database
@@ -59,6 +60,12 @@ class MainApp:
             "--web", action="store_true", help="start the web server; other options are ignored once it is running"
         )
+        parser.add_argument(
+            "-s",
+            action="store_true",
+            help="launch Chrome in normal (non-headless) mode, for easier debugging and observation"
+        )
         # Show the help message if no arguments were passed
         if len(sys.argv) == 1:
             parser.print_help()
@@ -121,6 +128,12 @@ class MainApp:
         self.config = load_config(self.args.config)
         logger.info(f"Loaded config file {self.args.config} successfully")
+        # Set the Chrome mode
+        if self.args.s:
+            AppCtx.g_app_config.headless_chrome = False
+        else:
+            AppCtx.g_app_config.headless_chrome = True
         # Connect to the database
         try:
             self.db_engine = connect_db(self.config)
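
Note: the new if/else over self.args.s collapses to one boolean assignment; a minimal equivalent sketch (not part of the commit):

    # Headless unless -s was passed on the command line.
    AppCtx.g_app_config.headless_chrome = not self.args.s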

View File

@@ -28,6 +28,8 @@ class AppConfig:
     debug: bool
     wap_screenshot: bool
+    headless_chrome: bool
+
     database: DatabaseConfig
     chrome: ChromeConfig
@@ -43,6 +45,7 @@ def load_config(config_path: str) -> AppConfig:
     AppCtx.g_app_config = AppConfig(
         debug=config_dict["debug"],
         wap_screenshot=config_dict["wap_screenshot"],
+        headless_chrome=True,
         database=database_config,
         chrome=chrome_config
     )
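
Note: load_config hardcodes headless_chrome=True and relies on the later CLI check to flip it; if the value should also be settable from the config file, a hedged alternative (assuming a headless_chrome key, which the shipped config.toml does not define):

    # A sketch: read from the config when present, default to headless.
    headless_chrome = config_dict.get("headless_chrome", True)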

View File

@@ -2,6 +2,7 @@ import queue
 import threading
 import time
+from DrissionPage.errors import ElementNotFoundError
 from loguru import logger
 from sqlmodel import Session, select
@@ -85,7 +86,7 @@ class CrawlEngine:
                 domain = self.target_queue.get_nowait()
                 surl = self.crawl(domain)
                 if not surl:
-                    logger.debug(f"{threading.current_thread().name} crawl of {domain} errored, moving on to the next")
+                    logger.debug(f"{threading.current_thread().name} crawl of {domain} returned no results, moving on to the next")
                     continue
                 # Save to the database
@@ -206,7 +207,7 @@ class CrawlEngine:
             tab.wait.eles_loaded(["#container", ".content_none", "#content_left"], any_one=True)
             while True:
+                try:
                     # Advance the page number
                     current_page += 1
                     logger.debug(f"{threading.current_thread().name} crawling page {current_page} of {domain}")
@@ -245,7 +246,9 @@ class CrawlEngine:
                         logger.debug(f"{threading.current_thread().name} no more pages")
                         break
                     next_btn.click(True)
+                except ElementNotFoundError as e:
+                    logger.error(f"HTML element not found, skipping; details: {e}")
+                    break
             return surl_set
         except Exception as e:
             logger.error(f"{threading.current_thread().name} error while crawling {domain}: {e}")
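
Note: DrissionPage raises ElementNotFoundError when an expected element cannot be found or used, so the new inner try/except lets a missing element end pagination for the current domain instead of tripping the outer handler. Condensed shape of the loop after this change (result collection and the next-button lookup elided):

    while True:
        try:
            current_page += 1
            # ... collect result URLs for this page into surl_set ...
            next_btn.click(True)  # assumes next_btn was located above
        except ElementNotFoundError as e:
            logger.error(f"HTML element not found, skipping; details: {e}")
            break
    return surl_set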

View File

@@ -85,13 +85,14 @@ class PcReporter(BaseReporter):
             # Upload the image
             img_filename = self.upload_report_pic(img_path)
-            logger.debug(f"{img_filename=}, {wap_img_filename=}")
+            logger.debug(f"{img_filename=}")
             if not img_filename:
                 logger.warning(f"Failed to upload image {img_path}")
                 continue
             if AppCtx.g_app_config.wap_screenshot:
                 wap_img_filename = self.upload_report_pic(wap_img_path)
+                logger.debug(f"{wap_img_filename=}")
                 if not wap_img_filename:
                     logger.warning(f"Failed to upload image {wap_img_filename}")

View File

@@ -64,6 +64,7 @@ class SiteReporter(BaseReporter):
         cookie = random.choice(self.all_cookies)
         report_site_cookie = GenCookie.run(cookie)
         self.headers["Cookie"] = report_site_cookie
+        self.headers["User-Agent"] = random_ua()
         logger.debug(f"Cookie set to: {report_site_cookie}")
         # Upload the image first

View File

@@ -12,6 +12,7 @@ from .base import BaseReporter
 from ...config.config import AppCtx
 from ...models.report_urls import ReportUrlModel
 from ...utils.common import get_proxies, get_all_cookies, md5
+from ...utils.gen_cookie import GenCookie
 from ...utils.ua import random_ua
@@ -51,7 +52,9 @@ class WapReporter(BaseReporter):
         # Pick a cookie
         report_cookie = random.choice(get_all_cookies())
-        self.headers["Cookie"] = report_cookie
+        report_site_cookie = GenCookie.run(report_cookie)
+        self.headers["Cookie"] = report_site_cookie
+        self.headers["User-Agent"] = random_ua()
         logger.debug(f"{report_cookie=}")
         # Fetch the user info
@@ -79,13 +82,14 @@ class WapReporter(BaseReporter):
             # wapUserAgent = random.choice(self.wapUserAgent)
             response = self.request.get(
                 "https://ufosdk.baidu.com/api?m=Web&a=getUserInfo&appid=293852",
-                headers=self.headers, proxies=self.proxies, allow_redirects=False, timeout=5
+                headers=self.headers, proxies=self.proxies, allow_redirects=False, timeout=10, verify=False
             )
             json_data = response.json()
             uid = json_data['result']['uid']
             un = json_data['result']['un']
             userinfo["uid"] = uid
             userinfo["un"] = un
+            logger.debug(f"{userinfo=}")
             return userinfo
         except Exception as e:
             logger.error(f"[{self.engine_name}] error fetching user info: {e}")
@@ -135,7 +139,8 @@ class WapReporter(BaseReporter):
                 headers=self.headers,
                 proxies=self.proxies,
                 allow_redirects=False,
-                timeout=5
+                timeout=10,
+                verify=False
             )
             # logger.debug(req.json())
             logger.debug(response.json())
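
Note: with verify=False, requests skips TLS certificate verification and emits an InsecureRequestWarning on every call; if the log noise matters, it can be silenced globally (a sketch, not part of this commit):

    import urllib3
    # Suppress the per-request InsecureRequestWarning triggered by verify=False.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)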

View File

@@ -16,6 +16,10 @@ def get_proxies():
         "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
         "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
     }
+    # proxies = {
+    #     "http": "http://127.0.0.1:8080",
+    #     "https": "http://127.0.0.1:8080"
+    # }
     return proxies
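
Note: the mapping returned here plugs straight into requests; a typical call site (hypothetical URL and timeout):

    import requests
    # Route a request through the tunnel proxy built above.
    resp = requests.get("https://example.com", proxies=get_proxies(), timeout=10)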

View File

@@ -1,4 +1,5 @@
 from DrissionPage import Chromium, ChromiumOptions
+from loguru import logger
 from .ua import random_ua
 from ..config.config import AppCtx
@@ -10,7 +11,8 @@ class DPEngine:
     def __init__(self, is_wap: bool = False, no_img: bool = True):
         chrome_opts = ChromiumOptions()
         chrome_opts.mute(True)  # mute audio
-        chrome_opts.headless(False)  # headless mode
+        logger.debug(f"{AppCtx.g_app_config.headless_chrome=}")
+        chrome_opts.headless(AppCtx.g_app_config.headless_chrome)  # headless mode
         chrome_opts.no_imgs(no_img)  # don't load images
         chrome_opts.set_argument("--disable-gpu")  # disable the GPU
         chrome_opts.set_argument('--ignore-certificate-errors')  # ignore certificate errors

View File

@@ -3,7 +3,7 @@
 debug = true
 # Whether to also take a screenshot of the target URL itself
-wap_snapshot = false
+wap_screenshot = false
 # Database config
 [database]
@@ -15,5 +15,5 @@ database = "baidu_reporter"
 # Chrome config
 [chrome]
-proxy = "http://127.0.0.1:7890"
+proxy = "http://127.0.0.1:8080"
 browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"
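
Note: the wap_snapshot -> wap_screenshot rename matters because load_config indexes the parsed dict directly (config_dict["wap_screenshot"], per the diff above), so the stale key would raise KeyError at startup. A quick check, assuming the file is parsed with tomllib (Python 3.11+):

    import tomllib
    # Verify the renamed key exists before AppConfig construction fails on it.
    with open("config.toml", "rb") as f:
        config_dict = tomllib.load(f)
    assert "wap_screenshot" in config_dict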