diff --git a/app/engines/crawl_engine.py b/app/engines/crawl_engine.py
index 8c1c45c..9a24000 100644
--- a/app/engines/crawl_engine.py
+++ b/app/engines/crawl_engine.py
@@ -231,12 +231,19 @@ class CrawlEngine:
             tab.wait.eles_loaded("@id=content_left")
             results = tab.ele("@id=content_left").eles("@class:result")
             # temp = [result.attr("mu") for result in results if result.attr("mu") is not None]
+            # logger.debug(f"{len(results)=}")
             for result in results:
+                # logger.debug(f"{result=}")
                 surl = result.attr("mu")
                 if not surl:
                     continue
-                logger.debug(f"{threading.current_thread().name} Found URL: {surl}")
-                surl_set.add(surl)
+
+                # When adding a result, also check whether the scraped surl is related to the target domain
+                if domain not in surl:
+                    logger.debug(f"{threading.current_thread().name} URL {surl} is unrelated to the target domain {domain}, skipping")
+                else:
+                    surl_set.add(surl)
+                    logger.debug(f"{threading.current_thread().name} Found {surl}")
 
             # Wait a bit when paging, don't go too fast
             self.ev.wait(0.3)
@@ -261,6 +268,12 @@ class CrawlEngine:
     def save_surl(session: Session, domain: str, surl_set: set[str]):
         """Save the collected URLs"""
         for surl in surl_set:
+
+            # Simple check that the surl contains the target domain
+            if domain not in surl:
+                logger.debug(f"Skipping save of {surl} because it does not match the target domain {domain}")
+                continue
+
             # First check whether it already exists
             stmt = select(ReportUrlModel).where(ReportUrlModel.surl == surl)
             exist = session.exec(stmt).first()
diff --git a/config.local.toml b/config.local.toml
index f28c651..a722d61 100644
--- a/config.local.toml
+++ b/config.local.toml
@@ -15,5 +15,5 @@ database = "baidu_reporter"
 
 # chrome configuration
 [chrome]
-proxy = "http://127.0.0.1:8080"
+proxy = "http://127.0.0.1:7890"
 browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"
\ No newline at end of file