When crawling, filter out URLs unrelated to the target domain

xhy 2025-04-02 11:58:18 +08:00
parent 4329045cc6
commit 9ff7c18743
2 changed files with 16 additions and 3 deletions

@@ -231,12 +231,19 @@ class CrawlEngine:
tab.wait.eles_loaded("@id=content_left")
results = tab.ele("@id=content_left").eles("@class:result")
# temp = [result.attr("mu") for result in results if result.attr("mu") is not None]
# logger.debug(f"{len(results)=}")
for result in results:
# logger.debug(f"{result=}")
surl = result.attr("mu")
if not surl:
continue
logger.debug(f"{threading.current_thread().name} 找到 URL : {surl}")
surl_set.add(surl)
# When adding a result, also check whether the captured surl relates to the target domain
if domain not in surl:
logger.debug(f"{threading.current_thread().name} URL {surl} 与目标域名 {domain} 无关,跳过")
else:
surl_set.add(surl)
logger.debug(f"{threading.current_thread().name} 找到 {surl}")
# 翻页的时候等一下,别太快了
self.ev.wait(0.3)
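
Note: the filter above is a plain substring test (`domain not in surl`), which can over-match when the domain string merely appears in another URL's path or query. A stricter, hostname-based check is sketched below; it is not part of this commit, only an illustrative alternative using the Python standard library (the helper name matches_domain is made up):

from urllib.parse import urlparse

def matches_domain(surl: str, domain: str) -> bool:
    # Compare the URL's hostname to the target domain, accepting subdomains too.
    host = urlparse(surl).hostname or ""
    return host == domain or host.endswith("." + domain)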
@@ -261,6 +268,12 @@ class CrawlEngine:
def save_surl(session: Session, domain: str, surl_set: set[str]):
"""保存采集到的URL"""
for surl in surl_set:
# Quick check that surl contains the target domain
if domain not in surl:
logger.debug(f"Skipping save of {surl} because it does not match target domain {domain}")
continue
# First check whether it already exists
stmt = select(ReportUrlModel).where(ReportUrlModel.surl == surl)
exist = session.exec(stmt).first()
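
For context, the select/exec pattern above is SQLModel's query API; the rest of the save loop is not shown in this hunk. Below is a self-contained sketch of the check-then-insert flow, with a simplified ReportUrlModel assumed here purely for illustration (the real model lives elsewhere in the repo):

from sqlmodel import Field, Session, SQLModel, create_engine, select

class ReportUrlModel(SQLModel, table=True):
    id: int | None = Field(default=None, primary_key=True)
    surl: str

engine = create_engine("sqlite:///:memory:")
SQLModel.metadata.create_all(engine)

with Session(engine) as session:
    surl = "https://www.example.com/page"
    stmt = select(ReportUrlModel).where(ReportUrlModel.surl == surl)
    exist = session.exec(stmt).first()  # None when no row matches
    if exist is None:
        session.add(ReportUrlModel(surl=surl))
        session.commit()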

@@ -15,5 +15,5 @@ database = "baidu_reporter"
# chrome configuration
[chrome]
proxy = "http://127.0.0.1:8080"
proxy = "http://127.0.0.1:7890"
browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"
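
For reference, the [chrome] section above can be read with Python's standard tomllib (3.11+). A minimal sketch follows; the config file name is an assumption, and wiring the values into the browser engine is omitted:

import tomllib

# Load the crawler configuration (file name assumed for illustration)
with open("config.toml", "rb") as f:
    config = tomllib.load(f)

proxy = config["chrome"]["proxy"]                # e.g. "http://127.0.0.1:7890"
browser_path = config["chrome"]["browser_path"]  # path to the Edge binary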