爬取的时候，过滤掉与域名无关的 URL
This commit is contained in:
parent
4329045cc6
commit
9ff7c18743
@ -231,12 +231,19 @@ class CrawlEngine:
|
|||||||
tab.wait.eles_loaded("@id=content_left")
|
tab.wait.eles_loaded("@id=content_left")
|
||||||
results = tab.ele("@id=content_left").eles("@class:result")
|
results = tab.ele("@id=content_left").eles("@class:result")
|
||||||
# temp = [result.attr("mu") for result in results if result.attr("mu") is not None]
|
# temp = [result.attr("mu") for result in results if result.attr("mu") is not None]
|
||||||
|
# logger.debug(f"{len(results)=}")
|
||||||
for result in results:
|
for result in results:
|
||||||
|
# logger.debug(f"{result=}")
|
||||||
surl = result.attr("mu")
|
surl = result.attr("mu")
|
||||||
if not surl:
|
if not surl:
|
||||||
continue
|
continue
|
||||||
logger.debug(f"{threading.current_thread().name} 找到 URL : {surl}")
|
|
||||||
surl_set.add(surl)
|
# 添加结果的时候,也检查一下抓到的 surl 是否和目标域名有关
|
||||||
|
if domain not in surl:
|
||||||
|
logger.debug(f"{threading.current_thread().name} URL {surl} 与目标域名 {domain} 无关,跳过")
|
||||||
|
else:
|
||||||
|
surl_set.add(surl)
|
||||||
|
logger.debug(f"{threading.current_thread().name} 找到 {surl}")
|
||||||
|
|
||||||
# 翻页的时候等一下,别太快了
|
# 翻页的时候等一下,别太快了
|
||||||
self.ev.wait(0.3)
|
self.ev.wait(0.3)
|
||||||
@ -261,6 +268,12 @@ class CrawlEngine:
|
|||||||
def save_surl(session: Session, domain: str, surl_set: set[str]):
|
def save_surl(session: Session, domain: str, surl_set: set[str]):
|
||||||
"""保存采集到的URL"""
|
"""保存采集到的URL"""
|
||||||
for surl in surl_set:
|
for surl in surl_set:
|
||||||
|
|
||||||
|
# 简单的判断一下 surl 中是否包含目标域名
|
||||||
|
if domain not in surl:
|
||||||
|
logger.debug(f"跳过保存 {surl} 因为与目标域名 {domain} 不符合")
|
||||||
|
continue
|
||||||
|
|
||||||
# 先检查是否存在
|
# 先检查是否存在
|
||||||
stmt = select(ReportUrlModel).where(ReportUrlModel.surl == surl)
|
stmt = select(ReportUrlModel).where(ReportUrlModel.surl == surl)
|
||||||
exist = session.exec(stmt).first()
|
exist = session.exec(stmt).first()
|
||||||
|
|||||||
@ -15,5 +15,5 @@ database = "baidu_reporter"
|
|||||||
|
|
||||||
# chrome 配置
|
# chrome 配置
|
||||||
[chrome]
|
[chrome]
|
||||||
proxy = "http://127.0.0.1:8080"
|
proxy = "http://127.0.0.1:7890"
|
||||||
browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"
|
browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"
|
||||||
Loading…
x
Reference in New Issue
Block a user