import os
import threading
import urllib.parse

import requests
from DrissionPage._pages.mix_tab import MixTab
from loguru import logger
from sqlmodel import Session, select

from app.config.config import AppCtx
from app.models.report_urls import ReportUrlModel
from app.utils.common import md5
from app.utils.dp import DPEngine

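
# Pipeline overview: get_surl_from_db pulls URLs that still lack evidence,
# get_screenshot captures the Baidu results page for each one, get_report_link
# digs the complaint URL out of the result's tools menu, and resolve_report_link
# reads token/title/q/surl from its redirect target before the record is marked
# has_evidence=True.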
class EvidenceEngine:
    """Secure evidence against porn sites: search each URL, take screenshots, then generate a report link and store it in the database."""

    def __init__(self):
        # Open two browser windows: one desktop, one mobile (WAP)
        self.dp_engine = DPEngine()
        self.wap_dp_engine = DPEngine(is_wap=True)

        # Run-state controls
        self.ev = threading.Event()
        self.status = 1

        # Worker thread
        self.worker_thread = None

        # Database connection
        self.database = AppCtx.g_db_engine

    def start(self):
        """Start the worker thread."""
        self.worker_thread = threading.Thread(target=self.worker, name="evidence_engine", daemon=True)
        self.worker_thread.start()

    def cli_start(self):
        """Run in CLI mode: process the queue once instead of looping."""
        # Fetch every URL that still needs evidence from the database
        targets = self.get_surl_from_db()
        logger.debug(f"Fetched {len(targets)} pending records")

        # Process them one by one
        for target in targets:
            logger.debug(f"Collecting report data for {target['surl']}")
            self.get_screenshot_and_report_link(target)

    def worker(self):
        """Worker loop."""
        while self.status:
            # Fetch every URL that still needs evidence from the database
            targets = self.get_surl_from_db()

            # Process them one by one
            for target in targets:
                logger.debug(f"Collecting report data for {target['surl']}")
                self.get_screenshot_and_report_link(target)

            # Run once every 10 seconds
            self.ev.wait(10)

    def stop(self):
        """Stop the worker and close the browsers."""
        self.status = 0
        self.ev.set()
        self.dp_engine.close()
        self.wap_dp_engine.close()

    def wait(self):
        """Block until the worker thread exits."""
        self.worker_thread.join()

    def get_surl_from_db(self):
        """Fetch records that still need evidence from the database."""
        result: list = []
        with Session(self.database) as session:
            # == False (not `is False`) is required so SQLAlchemy builds the SQL expression
            stmt = select(ReportUrlModel).where(ReportUrlModel.has_evidence == False)
            surl = session.exec(stmt).all()
            for url in surl:
                result.append({"id": url.id, "surl": url.surl, "domain": url.domain})

        return result

    def get_screenshot_and_report_link(self, target: dict):
        """Take the evidence screenshots and extract the report link."""
        try:
            surl = target["surl"]

            # Part 1: screenshot the Baidu search results for the URL
            logger.debug(f"Taking a screenshot of {surl} in Baidu search results")
            img_path, tab = self.get_screenshot(target)
            if not img_path:
                # No results, so the tab is no longer needed
                tab.close()
                return None

            # Part 2: also screenshot the surl itself
            if AppCtx.g_app_config.wap_screenshot:
                logger.debug(f"Taking a screenshot of {surl} itself")
                wap_img_path, wap_tab = self.get_wap_screenshot(target)
                wap_tab.close()

            # Part 3: extract the report link from the results page
            logger.debug(f"Extracting the report link for {surl}")
            report_link = self.get_report_link(tab)
            logger.debug(f"Report link: {report_link}")
            # The search tab has served its purpose once the link is read
            tab.close()
            if not report_link:
                return None

            # Part 4: resolve the report link's query parameters
            logger.debug("Resolving the report link's parameters")
            params = self.resolve_report_link(report_link)
            if not params:
                logger.error(f"Failed to resolve the report link, surl: {surl}")
                return None

            token = params["token"][0]
            title = params["title"][0]
            q = params["q"][0]
            surl = params["surl"][0]

            # Update the database
            with Session(self.database) as session:
                stmt = select(ReportUrlModel).where(ReportUrlModel.id == target["id"])
                model: ReportUrlModel = session.exec(stmt).first()
                if not model:
                    logger.error(f"Record {target['id']} does not exist, skipping...")
                    return None
                # Update the record
                model.token = token
                model.title = title
                model.q = q
                model.has_evidence = True
                session.add(model)
                session.commit()
                logger.debug(f"{surl} processed")
        except Exception as e:
            logger.error(f"Failed to collect the screenshot and report link: {e}")

    def get_screenshot(self, target: dict) -> tuple[str | None, MixTab]:
        """Screenshot the search results page; returns (img_path, tab)."""
        # str.lstrip strips a set of characters, not a prefix, so use removeprefix here
        search_keyword = target["surl"].removeprefix("https://").removeprefix("http://")
        tab = self.dp_engine.browser.new_tab()
        tab.get("https://www.baidu.com")
        tab.ele("#kw").input(f"{search_keyword}\n", clear=True)
        tab.wait.eles_loaded([".content_none", "#content_left"], any_one=True)

        # "未找到相关结果" is Baidu's literal "no results found" text
        if "未找到相关结果" in tab.html:
            logger.info(f"No results for {search_keyword}")
            return None, tab

        # Where to store the image.
        # Do not close the tab after the screenshot; it is still needed elsewhere.
        img_path = f"./imgs/{target['domain']}/{md5(target['surl'])}.png"
        return self.do_screenshot(tab, img_path)

    def get_wap_screenshot(self, target: dict) -> tuple[str | None, MixTab]:
        """Use the WAP browser to take an extra screenshot of the surl itself."""
        tab = self.wap_dp_engine.browser.new_tab()
        tab.get(target["surl"])
        tab.wait(5)  # Hard wait: the surl's page structure is unknown, so there is nothing precise to wait for

        img_path = f"./imgs/{target['domain']}/{md5(target['surl'])}-wap.png"
        return self.do_screenshot(tab, img_path)

    @staticmethod
    def do_screenshot(tab: MixTab, img_path: str, force=False) -> tuple[str | None, MixTab]:
        """Take the screenshot."""
        if os.path.exists(img_path):
            if force:
                os.remove(img_path)
            else:
                logger.debug(f"Screenshot {img_path} already exists, skipping")
                return img_path, tab

        # Make sure the target folder exists, in case the browser does not create it
        os.makedirs(os.path.dirname(img_path), exist_ok=True)
        tab.get_screenshot(path=img_path)
        logger.debug(f"Screenshot saved: {img_path}")

        return img_path, tab

    @staticmethod
    def get_report_link(tab: MixTab):
        """Extract the report link; the tab should still be on the search results page."""
        tools = tab.eles(".:c-tools")
        tab.wait(0.5)
        if tools:
            # Hover over and click the first result's tools icon to open its menu
            tool = tools[0]
            tool.hover(0, 0)
            tool.click(True)

            tips = tab.eles(".c-tip-menu")
            if tips:
                tip = tips[0]
                # "举报" is the literal "Report" entry in Baidu's tooltip menu
                report = tip.ele("t:a@@text()=举报")
                if report:
                    return report.attr("href")

        return None

    @staticmethod
    def resolve_report_link(report_link):
        """Read the report link's redirect target and parse its query parameters."""
        try:
            proxy_link = AppCtx.g_app_config.chrome.proxy
            proxies = {
                "http": proxy_link,
                "https": proxy_link,
            }
            # allow_redirects=False keeps the 30x response so its Location header is readable
            response = requests.get(report_link, proxies=proxies, timeout=5, allow_redirects=False)
            location = response.headers.get("Location")
            if not location:
                logger.warning("The report link returned no Location header")
                return None

            parsed_url = urllib.parse.urlparse(location)
            query_params = urllib.parse.parse_qs(parsed_url.query)
            decoded_params = {
                key: [urllib.parse.unquote(value) for value in values] for key, values in query_params.items()
            }
            if len(decoded_params) == 0:
                return None
            return decoded_params
        except Exception as e:
            logger.error(f"Failed to resolve the report link: {e}")
            return None
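

# A minimal usage sketch, not part of the original module: it assumes that
# AppCtx.g_db_engine and AppCtx.g_app_config have been initialized elsewhere
# before EvidenceEngine is constructed.
if __name__ == "__main__":
    engine = EvidenceEngine()
    try:
        # One-shot CLI run; use engine.start() + engine.wait() for the looping mode
        engine.cli_start()
    finally:
        # Closes both browser windows
        engine.stop()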