from loguru import logger
from sqlalchemy import Engine
from sqlmodel import Session, select

from app.utils.dp import DPEngine

from .reporters.pc_reporter import PcReporter
from .reporters.site_reporter import SiteReporter
from .reporters.wap_reporter import WapReporter
from ..models.report_urls import ReportUrlModel


class Reporter:
    """Reporter. Currently there are three channels; more can be added later."""

    def __init__(self, urls_file: str, mode: list[str], db_engine: Engine):
        self.urls_file = urls_file
        self.mode = mode
        self.db_engine = db_engine

        # Initialize the per-channel reporters
        self.reporters = {
            "pc": PcReporter(),
            "wap": WapReporter(),
            "site": SiteReporter(),
        }

        # TODO: initialize shared resources the reporters need,
        # such as a headless Chrome instance
        # self.baseDP = DPEngine(is_wap=False, no_img=True)
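
        # A sketch of what that shared setup might look like; the
        # `set_engine` hook below is hypothetical (the reporter classes
        # do not expose it yet):
        #
        # self.base_dp = DPEngine(is_wap=False, no_img=True)
        # for reporter in self.reporters.values():
        #     reporter.set_engine(self.base_dp)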

    def run(self):
        """Start the site takedown run."""
        self.get_reports_data()

    def get_reports_data(self):
        """Fetch the report data (page screenshot, report link, etc.) and store it in the database."""
        urls = self.read_urls()
        logger.info(f"Read {len(urls)} URLs from file {self.urls_file}")

        # For each URL: skip it if it is already in the database; otherwise
        # fetch the page screenshot and report link, insert a row on success,
        # and log the failure otherwise.
        dp = DPEngine(is_wap=False, no_img=True)  # Browser instance used for screenshots
        with Session(self.db_engine) as session:
            for url in urls:
                # Skip the URL if it already exists in the database
                stmt = select(ReportUrlModel).where(ReportUrlModel.surl == url)
                report_url = session.exec(stmt).one_or_none()
                logger.debug(f"Lookup result for {url}: {report_url}")
                if report_url:
                    continue

                # Fetch the page screenshot and report link:
                # 1. Open the Baidu search URL
                tab = dp.browser.new_tab(url)
                tab.wait(5)
                if "未找到相关结果" in tab.html:  # Baidu's "no results found" marker
                    logger.info(f"SURL {url} returned no search results")
                    continue

                # Take a screenshot
                # img_path = f"./imgs/{report_domain}/{md5_hash(report_url)}.png"
                # tab.get_screenshot()

                # 2. Click the report button and grab the report link
                # 3. Open the URL and take a screenshot
                # 4. Save everything to the database
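
                # A sketch of how steps 2-4 might look with the DrissionPage
                # tab API already used above; the "举报" text locator, the
                # img_path from the commented-out line above, and the exact
                # ReportUrlModel columns are assumptions, not the actual
                # implementation:
                #
                # tab.ele("text:举报").click()            # 2. open the report flow
                # tab.get_screenshot(path=img_path)       # 3. save the page screenshot
                # session.add(ReportUrlModel(surl=url))   # 4. persist the row
                # session.commit()
                # tab.close()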

    def read_urls(self) -> list[str]:
        """Read the URLs file, one URL per line, skipping blank lines."""
        urls: list[str] = []
        with open(self.urls_file, "r") as fp:
            for url in fp:
                url = url.strip()
                if not url:
                    continue
                urls.append(url)
        return urls
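
# A minimal usage sketch (assumes a SQLite engine and a local urls.txt file;
# the real entry point elsewhere in the app may differ):
#
# from sqlmodel import create_engine
# engine = create_engine("sqlite:///reports.db")
# Reporter("urls.txt", mode=["pc", "wap", "site"], db_engine=engine).run()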