from loguru import logger
from sqlalchemy import Engine, select
from sqlmodel import Session

from app.utils.dp import DPEngine

from .reporters.pc_reporter import PcReporter
from .reporters.site_reporter import SiteReporter
from .reporters.wap_reporter import WapReporter
from ..models.report_urls import ReportUrlModel


class Reporter:
    """Report dispatcher.

    Currently supports three channels ("pc", "wap", "site"); more can be
    added by registering additional reporter implementations.
    """

    def __init__(self, urls_file: str, mode: list[str], db_engine: Engine):
        """
        Args:
            urls_file: Path to a text file with one URL per line.
            mode: Which reporter channels to use (e.g. ["pc", "wap"]).
            db_engine: SQLAlchemy engine for the report-URL database.
        """
        self.urls_file = urls_file
        self.mode = mode
        self.db_engine = db_engine

        # Channel name -> reporter implementation.
        self.reporters = {
            "pc": PcReporter(),
            "wap": WapReporter(),
            "site": SiteReporter(),
        }

        # TODO: initialize shared resources the reporters need,
        # e.g. a headless Chrome instance.
        # self.baseDP = DPEngine(is_wap=False, no_img=True)

    def run(self):
        """Entry point: start the reporting pipeline."""
        self.get_reports_data()

    def get_reports_data(self):
        """Collect report data (page screenshots, report links) and persist it.

        For each URL read from ``urls_file``:
        - skip it if a row already exists in the database;
        - otherwise open it in the browser and (eventually) capture a
          screenshot and the report link, then insert a row.
        Failures are only logged; processing continues with the next URL.
        """
        urls = self.read_urls()
        logger.info(f"从文件 {self.urls_file} 读取到 {len(urls)} 个 URL")

        # Browser instance used for taking screenshots.
        dp = DPEngine(is_wap=False, no_img=True)

        with Session(self.db_engine) as session:
            for url in urls:
                # Skip URLs that are already recorded in the database.
                stmt = select(ReportUrlModel).where(ReportUrlModel.surl == url)
                # Fix: the original called session.exec() with a plain
                # sqlalchemy select, which yields Row tuples rather than
                # model instances; scalars() returns the model directly.
                report_url = session.scalars(stmt).one_or_none()
                logger.debug(f"查询 {url} 的结果: {report_url}")
                if report_url:
                    continue

                # Open the Baidu search page for this URL.
                tab = dp.browser.new_tab(url)
                try:
                    tab.wait(5)
                    if "未找到相关结果" in tab.html:
                        logger.info(f"SURL {url} 搜索结果空")
                        continue

                    # 1. Take a screenshot of the page.
                    # img_path = f"./imgs/{report_domain}/{md5_hash(report_url)}.png"
                    # tab.get_screenshot()
                    # 2. Click the report button and obtain the report link.
                    # 3. Open the report URL and take a screenshot.
                    # 4. Persist everything to the database.
                finally:
                    # Fix: close each tab so a long URL list does not leak
                    # browser tabs. NOTE(review): assumes DrissionPage's
                    # tab.close() — confirm against DPEngine's browser type.
                    tab.close()

    def read_urls(self) -> list[str]:
        """Read the URLs file and return its non-empty, stripped lines.

        Returns:
            URLs in file order; blank lines are skipped, duplicates kept.
        """
        urls: list[str] = []
        # Fix: explicit encoding instead of the platform default.
        with open(self.urls_file, "r", encoding="utf-8") as fp:
            for line in fp:
                url = line.strip()
                if url:
                    urls.append(url)
        return urls