baidu-reporter/app/engines/reporter.py
2025-03-28 18:23:30 +08:00

73 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from sqlalchemy import Engine, select
from sqlmodel import Session
from app.utils.dp import DPEngine
from .reporters.pc_reporter import PcReporter
from .reporters.wap_reporter import WapReporter
from .reporters.site_reporter import SiteReporter
from ..models.report_urls import ReportUrlModel
from loguru import logger
class Reporter:
    """Reporter that drives the reporting workflow for a list of URLs.

    Three channel reporters are registered (``pc``, ``wap`` and ``site``);
    more channels can be added later.
    """

    def __init__(self, urls_file: str, mode: list[str], db_engine: "Engine"):
        """Store the configuration and instantiate the per-channel reporters.

        Args:
            urls_file: Path of the text file read by ``read_urls`` (one URL
                per line).
            mode: Stored as given and not used elsewhere in this class yet;
                presumably the channel names to run -- TODO confirm with callers.
            db_engine: Engine used to open database sessions.
        """
        self.urls_file = urls_file
        self.mode = mode
        self.db_engine = db_engine

        # One reporter instance per channel, keyed by channel name.
        self.reporters = {
            "pc": PcReporter(),
            "wap": WapReporter(),
            "site": SiteReporter(),
        }

        # TODO: initialise the data shared by the reporters (e.g. headless Chrome).
        # self.baseDP = DPEngine(is_wap=False, no_img=True)

    def run(self):
        """Start the reporting run; currently this only collects the report data."""
        self.get_reports_data()

    def get_reports_data(self):
        """Collect report data (page screenshots, report links, ...) for new URLs.

        URLs that already have a row in the database are skipped. For the
        remaining ones the planned flow is: fetch the screenshots and the
        report link, insert a row on success and log on failure. So far only
        opening the URL in a new browser tab is implemented.
        """
        urls = self.read_urls()
        logger.info(f"从文件 {self.urls_file} 读取到 {len(urls)} 个 URL")
        if not urls:
            # Nothing to process: do not create a browser engine or a DB session.
            return

        # Browser instance used for taking screenshots.
        # NOTE(review): it is never closed in this method -- confirm whether
        # DPEngine needs an explicit teardown.
        dp = DPEngine(is_wap=False, no_img=True)
        with Session(self.db_engine) as session:
            for url in urls:
                # NOTE(review): if `select` is sqlalchemy's (not sqlmodel's),
                # `exec` presumably yields a Row rather than a ReportUrlModel;
                # only its truthiness is used below -- confirm before reading
                # fields from it.
                stmt = select(ReportUrlModel).where(ReportUrlModel.surl == url)
                report_url = session.exec(stmt).one_or_none()
                logger.debug(f"查询 {url} 的结果: {report_url}")
                if report_url:
                    # Already recorded in the database.
                    continue

                # Planned steps for a new URL:
                # 1. Search the URL on Baidu; screenshot the result page if
                #    there is a hit, otherwise skip.
                # 2. Click the report button and grab the report link.
                # 3. Open the URL itself and take a screenshot.
                # 4. Save everything to the database.
                dp.browser.new_tab(url)

    def read_urls(self) -> list[str]:
        """Read the URLs file and return its non-blank lines, stripped.

        The file is decoded as UTF-8 regardless of the platform locale; the
        ``utf-8-sig`` codec also drops a leading byte-order mark so it does
        not end up glued to the first URL.

        Returns:
            The URLs in file order, without surrounding whitespace.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        with open(self.urls_file, "r", encoding="utf-8-sig") as fp:
            stripped_lines = (line.strip() for line in fp)
            return [url for url in stripped_lines if url]