修了一堆bug
This commit is contained in:
parent
e5456ef67c
commit
4329045cc6
13
app/app.py
13
app/app.py
@ -6,6 +6,7 @@ import time
|
|||||||
from app.engines.report_engine import Reporter
|
from app.engines.report_engine import Reporter
|
||||||
|
|
||||||
from .config import load_config, AppConfig
|
from .config import load_config, AppConfig
|
||||||
|
from .config.config import AppCtx
|
||||||
from .engines.crawl_engine import CrawlEngine
|
from .engines.crawl_engine import CrawlEngine
|
||||||
from .engines.evidence_engine import EvidenceEngine
|
from .engines.evidence_engine import EvidenceEngine
|
||||||
from .models.base import connect_db, create_database
|
from .models.base import connect_db, create_database
|
||||||
@ -59,6 +60,12 @@ class MainApp:
|
|||||||
"--web", action="store_true", help="启动 web 服务器,启动后将忽略其他选项"
|
"--web", action="store_true", help="启动 web 服务器,启动后将忽略其他选项"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-s",
|
||||||
|
action="store_true",
|
||||||
|
help="当设置此选项的时候,将以正常模式启动Chrome(非headless模式),方便调试与观察运行情况"
|
||||||
|
)
|
||||||
|
|
||||||
# 如果没有传入任何参数,显示帮助信息
|
# 如果没有传入任何参数,显示帮助信息
|
||||||
if len(sys.argv) == 1:
|
if len(sys.argv) == 1:
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
@ -121,6 +128,12 @@ class MainApp:
|
|||||||
self.config = load_config(self.args.config)
|
self.config = load_config(self.args.config)
|
||||||
logger.info(f"加载配置文件 {self.args.config} 成功")
|
logger.info(f"加载配置文件 {self.args.config} 成功")
|
||||||
|
|
||||||
|
# 设置 chrome 模式
|
||||||
|
if self.args.s:
|
||||||
|
AppCtx.g_app_config.headless_chrome = False
|
||||||
|
else:
|
||||||
|
AppCtx.g_app_config.headless_chrome = True
|
||||||
|
|
||||||
# 连接数据库
|
# 连接数据库
|
||||||
try:
|
try:
|
||||||
self.db_engine = connect_db(self.config)
|
self.db_engine = connect_db(self.config)
|
||||||
|
|||||||
@ -28,6 +28,8 @@ class AppConfig:
|
|||||||
debug: bool
|
debug: bool
|
||||||
wap_screenshot: bool
|
wap_screenshot: bool
|
||||||
|
|
||||||
|
headless_chrome: bool
|
||||||
|
|
||||||
database: DatabaseConfig
|
database: DatabaseConfig
|
||||||
chrome: ChromeConfig
|
chrome: ChromeConfig
|
||||||
|
|
||||||
@ -43,6 +45,7 @@ def load_config(config_path: str) -> AppConfig:
|
|||||||
AppCtx.g_app_config = AppConfig(
|
AppCtx.g_app_config = AppConfig(
|
||||||
debug=config_dict["debug"],
|
debug=config_dict["debug"],
|
||||||
wap_screenshot=config_dict["wap_screenshot"],
|
wap_screenshot=config_dict["wap_screenshot"],
|
||||||
|
headless_chrome=True,
|
||||||
database=database_config,
|
database=database_config,
|
||||||
chrome=chrome_config
|
chrome=chrome_config
|
||||||
)
|
)
|
||||||
|
|||||||
@ -2,6 +2,7 @@ import queue
|
|||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
from DrissionPage.errors import ElementNotFoundError
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from sqlmodel import Session, select
|
from sqlmodel import Session, select
|
||||||
|
|
||||||
@ -85,7 +86,7 @@ class CrawlEngine:
|
|||||||
domain = self.target_queue.get_nowait()
|
domain = self.target_queue.get_nowait()
|
||||||
surl = self.crawl(domain)
|
surl = self.crawl(domain)
|
||||||
if not surl:
|
if not surl:
|
||||||
logger.debug(f"{threading.current_thread().name} 爬取 {domain} 异常,开始处理下一个")
|
logger.debug(f"{threading.current_thread().name} 爬取 {domain} 无结果,开始处理下一个")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 存入数据库
|
# 存入数据库
|
||||||
@ -206,7 +207,7 @@ class CrawlEngine:
|
|||||||
tab.wait.eles_loaded(["#container", ".content_none", "#content_left"], any_one=True)
|
tab.wait.eles_loaded(["#container", ".content_none", "#content_left"], any_one=True)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
try:
|
||||||
# 增加页码
|
# 增加页码
|
||||||
current_page += 1
|
current_page += 1
|
||||||
logger.debug(f"{threading.current_thread().name} 爬取 {domain} 的第 {current_page} 页数据")
|
logger.debug(f"{threading.current_thread().name} 爬取 {domain} 的第 {current_page} 页数据")
|
||||||
@ -245,7 +246,9 @@ class CrawlEngine:
|
|||||||
logger.debug(f"{threading.current_thread().name} 没有下一页了")
|
logger.debug(f"{threading.current_thread().name} 没有下一页了")
|
||||||
break
|
break
|
||||||
next_btn.click(True)
|
next_btn.click(True)
|
||||||
|
except ElementNotFoundError as e:
|
||||||
|
logger.error(f"没有找到 HTML 元素,跳过,详细信息: {e}")
|
||||||
|
break
|
||||||
return surl_set
|
return surl_set
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"{threading.current_thread().name} 爬取 {domain} 发生错误:{e}")
|
logger.error(f"{threading.current_thread().name} 爬取 {domain} 发生错误:{e}")
|
||||||
|
|||||||
@ -85,13 +85,14 @@ class PcReporter(BaseReporter):
|
|||||||
|
|
||||||
# 上传图片
|
# 上传图片
|
||||||
img_filename = self.upload_report_pic(img_path)
|
img_filename = self.upload_report_pic(img_path)
|
||||||
logger.debug(f"{img_filename=}, {wap_img_filename=}")
|
logger.debug(f"{img_filename=}")
|
||||||
if not img_filename:
|
if not img_filename:
|
||||||
logger.warning(f"图片 {img_path} 上传失败")
|
logger.warning(f"图片 {img_path} 上传失败")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if AppCtx.g_app_config.wap_screenshot:
|
if AppCtx.g_app_config.wap_screenshot:
|
||||||
wap_img_filename = self.upload_report_pic(wap_img_path)
|
wap_img_filename = self.upload_report_pic(wap_img_path)
|
||||||
|
logger.debug(f"{wap_img_filename=}")
|
||||||
if not wap_img_filename:
|
if not wap_img_filename:
|
||||||
logger.warning(f"图片 {wap_img_filename} 上传失败")
|
logger.warning(f"图片 {wap_img_filename} 上传失败")
|
||||||
|
|
||||||
|
|||||||
@ -64,6 +64,7 @@ class SiteReporter(BaseReporter):
|
|||||||
cookie = random.choice(self.all_cookies)
|
cookie = random.choice(self.all_cookies)
|
||||||
report_site_cookie = GenCookie.run(cookie)
|
report_site_cookie = GenCookie.run(cookie)
|
||||||
self.headers["Cookie"] = report_site_cookie
|
self.headers["Cookie"] = report_site_cookie
|
||||||
|
self.headers["User-Agent"] = random_ua()
|
||||||
logger.debug(f"设置 cookie 为:{report_site_cookie}")
|
logger.debug(f"设置 cookie 为:{report_site_cookie}")
|
||||||
|
|
||||||
# 先上传图片
|
# 先上传图片
|
||||||
|
|||||||
@ -12,6 +12,7 @@ from .base import BaseReporter
|
|||||||
from ...config.config import AppCtx
|
from ...config.config import AppCtx
|
||||||
from ...models.report_urls import ReportUrlModel
|
from ...models.report_urls import ReportUrlModel
|
||||||
from ...utils.common import get_proxies, get_all_cookies, md5
|
from ...utils.common import get_proxies, get_all_cookies, md5
|
||||||
|
from ...utils.gen_cookie import GenCookie
|
||||||
from ...utils.ua import random_ua
|
from ...utils.ua import random_ua
|
||||||
|
|
||||||
|
|
||||||
@ -51,7 +52,9 @@ class WapReporter(BaseReporter):
|
|||||||
|
|
||||||
# 选个 cookie
|
# 选个 cookie
|
||||||
report_cookie = random.choice(get_all_cookies())
|
report_cookie = random.choice(get_all_cookies())
|
||||||
self.headers["Cookie"] = report_cookie
|
report_site_cookie = GenCookie.run(report_cookie)
|
||||||
|
self.headers["Cookie"] = report_site_cookie
|
||||||
|
self.headers["User-Agent"] = random_ua()
|
||||||
logger.debug(f"{report_cookie=}")
|
logger.debug(f"{report_cookie=}")
|
||||||
|
|
||||||
# 获取用户信息
|
# 获取用户信息
|
||||||
@ -79,13 +82,14 @@ class WapReporter(BaseReporter):
|
|||||||
# wapUserAgent = random.choice(self.wapUserAgent)
|
# wapUserAgent = random.choice(self.wapUserAgent)
|
||||||
response = self.request.get(
|
response = self.request.get(
|
||||||
"https://ufosdk.baidu.com/api?m=Web&a=getUserInfo&appid=293852",
|
"https://ufosdk.baidu.com/api?m=Web&a=getUserInfo&appid=293852",
|
||||||
headers=self.headers, proxies=self.proxies, allow_redirects=False, timeout=5
|
headers=self.headers, proxies=self.proxies, allow_redirects=False, timeout=10, verify=False
|
||||||
)
|
)
|
||||||
json_data = response.json()
|
json_data = response.json()
|
||||||
uid = json_data['result']['uid']
|
uid = json_data['result']['uid']
|
||||||
un = json_data['result']['un']
|
un = json_data['result']['un']
|
||||||
userinfo["uid"] = uid
|
userinfo["uid"] = uid
|
||||||
userinfo["un"] = un
|
userinfo["un"] = un
|
||||||
|
logger.debug(f"{userinfo=}")
|
||||||
return userinfo
|
return userinfo
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"[{self.engine_name}]获取用户信息错误: {e}")
|
logger.error(f"[{self.engine_name}]获取用户信息错误: {e}")
|
||||||
@ -135,7 +139,8 @@ class WapReporter(BaseReporter):
|
|||||||
headers=self.headers,
|
headers=self.headers,
|
||||||
proxies=self.proxies,
|
proxies=self.proxies,
|
||||||
allow_redirects=False,
|
allow_redirects=False,
|
||||||
timeout=5
|
timeout=10,
|
||||||
|
verify=False
|
||||||
)
|
)
|
||||||
# logger.debug(req.json())
|
# logger.debug(req.json())
|
||||||
logger.debug(response.json())
|
logger.debug(response.json())
|
||||||
|
|||||||
@ -16,6 +16,10 @@ def get_proxies():
|
|||||||
"http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
|
"http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
|
||||||
"https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
|
"https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
|
||||||
}
|
}
|
||||||
|
# proxies = {
|
||||||
|
# "http": "http://127.0.0.1:8080",
|
||||||
|
# "https": "http://127.0.0.1:8080"
|
||||||
|
# }
|
||||||
return proxies
|
return proxies
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
from DrissionPage import Chromium, ChromiumOptions
|
from DrissionPage import Chromium, ChromiumOptions
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
from .ua import random_ua
|
from .ua import random_ua
|
||||||
from ..config.config import AppCtx
|
from ..config.config import AppCtx
|
||||||
@ -10,7 +11,8 @@ class DPEngine:
|
|||||||
def __init__(self, is_wap: bool = False, no_img: bool = True):
|
def __init__(self, is_wap: bool = False, no_img: bool = True):
|
||||||
chrome_opts = ChromiumOptions()
|
chrome_opts = ChromiumOptions()
|
||||||
chrome_opts.mute(True) # 静音
|
chrome_opts.mute(True) # 静音
|
||||||
chrome_opts.headless(False) # 无头模式
|
logger.debug(f"{AppCtx.g_app_config.headless_chrome=}")
|
||||||
|
chrome_opts.headless(AppCtx.g_app_config.headless_chrome) # 无头模式
|
||||||
chrome_opts.no_imgs(no_img) # 不加载图片
|
chrome_opts.no_imgs(no_img) # 不加载图片
|
||||||
chrome_opts.set_argument("--disable-gpu") # 禁用GPU
|
chrome_opts.set_argument("--disable-gpu") # 禁用GPU
|
||||||
chrome_opts.set_argument('--ignore-certificate-errors') # 忽略证书错误
|
chrome_opts.set_argument('--ignore-certificate-errors') # 忽略证书错误
|
||||||
|
|||||||
@ -3,7 +3,7 @@
|
|||||||
debug = true
|
debug = true
|
||||||
|
|
||||||
# 是否截取目标URL本身的图片
|
# 是否截取目标URL本身的图片
|
||||||
wap_snapshot = false
|
wap_screenshot = false
|
||||||
|
|
||||||
# 数据库配置
|
# 数据库配置
|
||||||
[database]
|
[database]
|
||||||
@ -15,5 +15,5 @@ database = "baidu_reporter"
|
|||||||
|
|
||||||
# chrome 配置
|
# chrome 配置
|
||||||
[chrome]
|
[chrome]
|
||||||
proxy = "http://127.0.0.1:7890"
|
proxy = "http://127.0.0.1:8080"
|
||||||
browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"
|
browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"
|
||||||
Loading…
x
Reference in New Issue
Block a user