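修了一堆bug:新增 -s 选项以非 headless 模式启动 Chrome;爬取翻页时捕获 ElementNotFoundError;上报请求改用 GenCookie 生成的 Cookie 和随机 User-Agent,超时由 5 秒调整为 10 秒并关闭证书校验;配置项 wap_snapshot 更名为 wap_screenshot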

This commit is contained in:
xhy 2025-04-01 22:53:32 +08:00
parent e5456ef67c
commit 4329045cc6
9 changed files with 78 additions and 46 deletions

View File

@ -6,6 +6,7 @@ import time
from app.engines.report_engine import Reporter
from .config import load_config, AppConfig
from .config.config import AppCtx
from .engines.crawl_engine import CrawlEngine
from .engines.evidence_engine import EvidenceEngine
from .models.base import connect_db, create_database
@ -59,6 +60,12 @@ class MainApp:
"--web", action="store_true", help="启动 web 服务器,启动后将忽略其他选项"
)
parser.add_argument(
"-s",
action="store_true",
help="当设置此选项的时候将以正常模式启动Chrome(非headless模式),方便调试与观察运行情况"
)
# 如果没有传入任何参数,显示帮助信息
if len(sys.argv) == 1:
parser.print_help()
@ -121,6 +128,12 @@ class MainApp:
self.config = load_config(self.args.config)
logger.info(f"加载配置文件 {self.args.config} 成功")
# 设置 chrome 模式
if self.args.s:
AppCtx.g_app_config.headless_chrome = False
else:
AppCtx.g_app_config.headless_chrome = True
# 连接数据库
try:
self.db_engine = connect_db(self.config)

View File

@ -28,6 +28,8 @@ class AppConfig:
debug: bool
wap_screenshot: bool
headless_chrome: bool
database: DatabaseConfig
chrome: ChromeConfig
@ -43,6 +45,7 @@ def load_config(config_path: str) -> AppConfig:
AppCtx.g_app_config = AppConfig(
debug=config_dict["debug"],
wap_screenshot=config_dict["wap_screenshot"],
headless_chrome=True,
database=database_config,
chrome=chrome_config
)

View File

@ -2,6 +2,7 @@ import queue
import threading
import time
from DrissionPage.errors import ElementNotFoundError
from loguru import logger
from sqlmodel import Session, select
@ -85,7 +86,7 @@ class CrawlEngine:
domain = self.target_queue.get_nowait()
surl = self.crawl(domain)
if not surl:
logger.debug(f"{threading.current_thread().name} 爬取 {domain} 异常,开始处理下一个")
logger.debug(f"{threading.current_thread().name} 爬取 {domain} 无结果,开始处理下一个")
continue
# 存入数据库
@ -206,49 +207,51 @@ class CrawlEngine:
tab.wait.eles_loaded(["#container", ".content_none", "#content_left"], any_one=True)
while True:
try:
# 增加页码
current_page += 1
logger.debug(f"{threading.current_thread().name} 爬取 {domain} 的第 {current_page} 页数据")
# 直接访问 URL 会触发验证码
# tab.get(
# f"https://www.baidu.com/s?wd=site%3A{domain}&gpc=stf%3D{start_time}%2C{end_time}%7Cstftype%3D1&pn={(current_page - 1) * 10}")
# tab.get(f"https://www.baidu.com/s?wd=site%3A{domain}&pn={(current_page - 1) * 10}")
# 增加页码
current_page += 1
logger.debug(f"{threading.current_thread().name} 爬取 {domain} 的第 {current_page} 页数据")
# 直接访问 URL 会触发验证码
# tab.get(
# f"https://www.baidu.com/s?wd=site%3A{domain}&gpc=stf%3D{start_time}%2C{end_time}%7Cstftype%3D1&pn={(current_page - 1) * 10}")
# tab.get(f"https://www.baidu.com/s?wd=site%3A{domain}&pn={(current_page - 1) * 10}")
# 终止条件
if current_page > max_page and max_page:
logger.debug(f"{threading.current_thread().name} 达到指定页码,退出")
break
# 终止条件
if current_page > max_page and max_page:
logger.debug(f"{threading.current_thread().name} 达到指定页码,退出")
# logger.debug(f"tab.html: {tab.html}")
self.ev.wait(0.3)
if "未找到相关结果" in tab.html:
logger.debug(f"{threading.current_thread().name} 未找到结果,退出")
break
# 获取数据
tab.wait.eles_loaded("@id=content_left")
results = tab.ele("@id=content_left").eles("@class:result")
# temp = [result.attr("mu") for result in results if result.attr("mu") is not None]
for result in results:
surl = result.attr("mu")
if not surl:
continue
logger.debug(f"{threading.current_thread().name} 找到 URL : {surl}")
surl_set.add(surl)
# 翻页的时候等一下,别太快了
self.ev.wait(0.3)
# 如果没有下一页了,这个地方会找不到元素,有 10 秒的 timeout
next_btn = tab.ele("t:a@@text():下一页")
if not next_btn:
logger.debug(f"{threading.current_thread().name} 没有下一页了")
break
next_btn.click(True)
except ElementNotFoundError as e:
logger.error(f"没有找到 HTML 元素,跳过,详细信息: {e}")
break
# logger.debug(f"tab.html: {tab.html}")
self.ev.wait(0.3)
if "未找到相关结果" in tab.html:
logger.debug(f"{threading.current_thread().name} 未找到结果,退出")
break
# 获取数据
tab.wait.eles_loaded("@id=content_left")
results = tab.ele("@id=content_left").eles("@class:result")
# temp = [result.attr("mu") for result in results if result.attr("mu") is not None]
for result in results:
surl = result.attr("mu")
if not surl:
continue
logger.debug(f"{threading.current_thread().name} 找到 URL : {surl}")
surl_set.add(surl)
# 翻页的时候等一下,别太快了
self.ev.wait(0.3)
# 如果没有下一页了,这个地方会找不到元素,有 10 秒的 timeout
next_btn = tab.ele("t:a@@text():下一页")
if not next_btn:
logger.debug(f"{threading.current_thread().name} 没有下一页了")
break
next_btn.click(True)
return surl_set
except Exception as e:
logger.error(f"{threading.current_thread().name} 爬取{domain}发生错误:{e}")
logger.error(f"{threading.current_thread().name} 爬取 {domain} 发生错误:{e}")
import traceback
traceback.print_exc()
finally:

View File

@ -85,13 +85,14 @@ class PcReporter(BaseReporter):
# 上传图片
img_filename = self.upload_report_pic(img_path)
logger.debug(f"{img_filename=}, {wap_img_filename=}")
logger.debug(f"{img_filename=}")
if not img_filename:
logger.warning(f"图片 {img_path} 上传失败")
continue
if AppCtx.g_app_config.wap_screenshot:
wap_img_filename = self.upload_report_pic(wap_img_path)
logger.debug(f"{wap_img_filename=}")
if not wap_img_filename:
logger.warning(f"图片 {wap_img_filename} 上传失败")

View File

@ -64,6 +64,7 @@ class SiteReporter(BaseReporter):
cookie = random.choice(self.all_cookies)
report_site_cookie = GenCookie.run(cookie)
self.headers["Cookie"] = report_site_cookie
self.headers["User-Agent"] = random_ua()
logger.debug(f"设置 cookie 为:{report_site_cookie}")
# 先上传图片

View File

@ -12,6 +12,7 @@ from .base import BaseReporter
from ...config.config import AppCtx
from ...models.report_urls import ReportUrlModel
from ...utils.common import get_proxies, get_all_cookies, md5
from ...utils.gen_cookie import GenCookie
from ...utils.ua import random_ua
@ -51,7 +52,9 @@ class WapReporter(BaseReporter):
# 选个 cookie
report_cookie = random.choice(get_all_cookies())
self.headers["Cookie"] = report_cookie
report_site_cookie = GenCookie.run(report_cookie)
self.headers["Cookie"] = report_site_cookie
self.headers["User-Agent"] = random_ua()
logger.debug(f"{report_cookie=}")
# 获取用户信息
@ -79,13 +82,14 @@ class WapReporter(BaseReporter):
# wapUserAgent = random.choice(self.wapUserAgent)
response = self.request.get(
"https://ufosdk.baidu.com/api?m=Web&a=getUserInfo&appid=293852",
headers=self.headers, proxies=self.proxies, allow_redirects=False, timeout=5
headers=self.headers, proxies=self.proxies, allow_redirects=False, timeout=10, verify=False
)
json_data = response.json()
uid = json_data['result']['uid']
un = json_data['result']['un']
userinfo["uid"] = uid
userinfo["un"] = un
logger.debug(f"{userinfo=}")
return userinfo
except Exception as e:
logger.error(f"[{self.engine_name}]获取用户信息错误: {e}")
@ -135,7 +139,8 @@ class WapReporter(BaseReporter):
headers=self.headers,
proxies=self.proxies,
allow_redirects=False,
timeout=5
timeout=10,
verify=False
)
# logger.debug(req.json())
logger.debug(response.json())

View File

@ -16,6 +16,10 @@ def get_proxies():
"http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
"https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
}
# proxies = {
# "http": "http://127.0.0.1:8080",
# "https": "http://127.0.0.1:8080"
# }
return proxies

View File

@ -1,4 +1,5 @@
from DrissionPage import Chromium, ChromiumOptions
from loguru import logger
from .ua import random_ua
from ..config.config import AppCtx
@ -10,7 +11,8 @@ class DPEngine:
def __init__(self, is_wap: bool = False, no_img: bool = True):
chrome_opts = ChromiumOptions()
chrome_opts.mute(True) # 静音
chrome_opts.headless(False) # 无头模式
logger.debug(f"{AppCtx.g_app_config.headless_chrome=}")
chrome_opts.headless(AppCtx.g_app_config.headless_chrome) # 无头模式
chrome_opts.no_imgs(no_img) # 不加载图片
chrome_opts.set_argument("--disable-gpu") # 禁用GPU
chrome_opts.set_argument('--ignore-certificate-errors') # 忽略证书错误

View File

@ -3,7 +3,7 @@
debug = true
# 是否截取目标URL本身的图片
wap_snapshot = false
wap_screenshot = false
# 数据库配置
[database]
@ -15,5 +15,5 @@ database = "baidu_reporter"
# chrome 配置
[chrome]
proxy = "http://127.0.0.1:7890"
proxy = "http://127.0.0.1:8080"
browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"