采集器增加验证码打码功能
This commit is contained in:
parent
b7f545dab8
commit
19607428f8
@ -1,7 +1,10 @@
|
|||||||
import queue
|
import queue
|
||||||
|
import re
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
import execjs
|
||||||
|
import requests
|
||||||
from DrissionPage.errors import ElementNotFoundError
|
from DrissionPage.errors import ElementNotFoundError
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from sqlmodel import Session, select, or_, and_
|
from sqlmodel import Session, select, or_, and_
|
||||||
@ -10,7 +13,9 @@ from app.config.config import AppCtx
|
|||||||
from app.constants.domain import DomainStatus
|
from app.constants.domain import DomainStatus
|
||||||
from app.models.domain import DomainModel
|
from app.models.domain import DomainModel
|
||||||
from app.models.report_urls import ReportUrlModel
|
from app.models.report_urls import ReportUrlModel
|
||||||
|
from app.utils.common import get_proxies
|
||||||
from app.utils.dp import DPEngine
|
from app.utils.dp import DPEngine
|
||||||
|
from app.utils.ydm_verify import YdmVerify
|
||||||
|
|
||||||
|
|
||||||
class CrawlEngine:
|
class CrawlEngine:
|
||||||
@ -36,8 +41,6 @@ class CrawlEngine:
|
|||||||
# 创建一个浏览器
|
# 创建一个浏览器
|
||||||
self.dp_engine = DPEngine()
|
self.dp_engine = DPEngine()
|
||||||
|
|
||||||
self.database = AppCtx.g_db_engine
|
|
||||||
|
|
||||||
def cli_start(self, target_domains: str, target_domain_filepath: str):
|
def cli_start(self, target_domains: str, target_domain_filepath: str):
|
||||||
"""CLI 模式启动
|
"""CLI 模式启动
|
||||||
target_domains: 英文逗号分割的字符串
|
target_domains: 英文逗号分割的字符串
|
||||||
@ -91,7 +94,7 @@ class CrawlEngine:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# 存入数据库
|
# 存入数据库
|
||||||
with Session(self.database) as session:
|
with Session(AppCtx.g_db_engine) as session:
|
||||||
self.save_surl(session, domain, surl)
|
self.save_surl(session, domain, surl)
|
||||||
except queue.Empty:
|
except queue.Empty:
|
||||||
# 队列空了,等1秒再取一次
|
# 队列空了,等1秒再取一次
|
||||||
@ -120,7 +123,7 @@ class CrawlEngine:
|
|||||||
|
|
||||||
# 检查在数据库中是否有重复的
|
# 检查在数据库中是否有重复的
|
||||||
for domain in domains:
|
for domain in domains:
|
||||||
with Session(self.database) as session:
|
with Session(AppCtx.g_db_engine) as session:
|
||||||
stmt = select(DomainModel).where(DomainModel.domain == domain)
|
stmt = select(DomainModel).where(DomainModel.domain == domain)
|
||||||
result = session.exec(stmt).first()
|
result = session.exec(stmt).first()
|
||||||
if not result:
|
if not result:
|
||||||
@ -163,10 +166,6 @@ class CrawlEngine:
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# stmt = select(DomainModel).where(
|
|
||||||
# DomainModel.latest_crawl_time + DomainModel.crawl_interval * 60 <= current_timestamp
|
|
||||||
# )
|
|
||||||
domains = session.exec(stmt).all()
|
domains = session.exec(stmt).all()
|
||||||
|
|
||||||
for domain_model in domains:
|
for domain_model in domains:
|
||||||
@ -234,6 +233,26 @@ class CrawlEngine:
|
|||||||
# f"https://www.baidu.com/s?wd=site%3A{domain}&gpc=stf%3D{start_time}%2C{end_time}%7Cstftype%3D1&pn={(current_page - 1) * 10}")
|
# f"https://www.baidu.com/s?wd=site%3A{domain}&gpc=stf%3D{start_time}%2C{end_time}%7Cstftype%3D1&pn={(current_page - 1) * 10}")
|
||||||
# tab.get(f"https://www.baidu.com/s?wd=site%3A{domain}&pn={(current_page - 1) * 10}")
|
# tab.get(f"https://www.baidu.com/s?wd=site%3A{domain}&pn={(current_page - 1) * 10}")
|
||||||
|
|
||||||
|
# 检查一下当前的URL是不是跳到验证码的页面
|
||||||
|
if "//wappass.baidu.com/static/captcha/tuxing_v2.html" in tab.url:
|
||||||
|
logger.warning("触发验证码了,尝试识别")
|
||||||
|
idx = 0
|
||||||
|
while idx < 3:
|
||||||
|
idx += 1
|
||||||
|
logger.debug(f"开始第{idx}次识别...")
|
||||||
|
captcha_result = self.verify_captcha(tab.url)
|
||||||
|
if not captcha_result:
|
||||||
|
tab.refresh()
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
tab.get(captcha_result)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
logger.error("验证码打码失败,放弃本次采集,等待3分钟后继续")
|
||||||
|
self.ev.wait(180)
|
||||||
|
break
|
||||||
|
|
||||||
|
|
||||||
# 终止条件
|
# 终止条件
|
||||||
if current_page > max_page and max_page:
|
if current_page > max_page and max_page:
|
||||||
logger.debug(f"{threading.current_thread().name} 达到指定页码,退出")
|
logger.debug(f"{threading.current_thread().name} 达到指定页码,退出")
|
||||||
@ -314,3 +333,169 @@ class CrawlEngine:
|
|||||||
)
|
)
|
||||||
session.add(example)
|
session.add(example)
|
||||||
session.commit()
|
session.commit()
|
||||||
|
|
||||||
|
# def captcha_listener(self):
|
||||||
|
# for pkg in self.tab.listen.steps():
|
||||||
|
# if "/cap/init" in pkg.url:
|
||||||
|
# self.captcha_data["init"] = pkg.response.body
|
||||||
|
# if "/cap/style" in pkg.url:
|
||||||
|
# self.captcha_data["style"] = pkg.response.body
|
||||||
|
# self.captcha_data["referer"] = pkg.request.headers.get("Referer")
|
||||||
|
# logger.debug(f"触发验证码的 referer: {self.captcha_data["referer"]}")
|
||||||
|
#
|
||||||
|
# self.captcha_data["cookie"] = pkg.request.headers.get("Cookie")
|
||||||
|
# logger.debug(f"触发验证码的 cookie: {self.captcha_data['cookie']}")
|
||||||
|
# if "/cap/log" in pkg.url:
|
||||||
|
# self.captcha_data["log"] = pkg.response.body
|
||||||
|
|
||||||
|
def verify_captcha(self, current_url: str, timeout: float = 15.0):
    """Try to solve the Baidu rotation ("spin") captcha shown at ``current_url``.

    The logic differs slightly from the one in ``pc_reporter``, so it is
    implemented separately here.

    The flow is:

    1. ``POST /cap/init``  -> obtain the ``as`` / ``tk`` values.
    2. ``POST /cap/style`` -> obtain ``backstr`` and the captcha image link.
    3. Download the image and ask ``YdmVerify.rotate`` for the rotation.
    4. ``POST /cap/log``   -> submit the signed answer (``fs``), get ``ds``/``op``/``tk``.
    5. ``POST /cap/c``     -> obtain the URL to navigate to after verification.

    Args:
        current_url: URL of the captcha page; sent as ``Referer`` / ``refer``.
        timeout: Per-request timeout in seconds for every HTTP call.

    Returns:
        The decoded redirect URL on success, or ``None`` when any step fails
        (network error, unexpected response shape, recognition failure,
        rejected answer). The caller is expected to retry on ``None``.
    """
    ak = "c27bbc89afca0463650ac9bde68ebe06"
    # NOTE(review): the 'sec-ch-ua_wap*' header names look like the result of
    # an accidental search/replace on 'sec-ch-ua*' - confirm before changing,
    # they are kept byte-identical here.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-US;q=0.7,zh-TW;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': current_url,
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.0.0",
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua_wap': '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
        'sec-ch-ua_wap-mobile': '?0',
        'sec-ch-ua_wap-platform': '"Windows"',
        # "Cookie": self.captcha_data["cookie"],
    }

    ts = time.time()
    ts1 = int(ts)
    ts2 = int(ts * 1000)
    # The init/style calls send the page URL with its timestamp refreshed.
    refer = re.sub(r'timestamp=\d+', f'timestamp={ts1}', current_url)

    try:
        # Step 1: resolve AS / TK.
        init_data = requests.post(
            "https://passport.baidu.com/cap/init",
            data={
                "_": ts2,
                "refer": refer,
                "ak": ak,
                "ver": "2",
                "scene": "",
                "ds": "",
                "tk": "",
                "as": "",
                "reinit": 0,
            },
            headers=headers,
            proxies=get_proxies(),
            timeout=timeout,
        ).json()["data"]
        as_value = init_data["as"]
        tk_value = init_data["tk"]

        # Step 2: resolve the captcha style (backstr + image link).
        response = requests.post(
            "https://passport.baidu.com/cap/style",
            data={
                "_": int(time.time() * 1000),
                "refer": refer,
                "ak": ak,
                "tk": tk_value,
                "scene": "",
                "isios": "0",
                "type": "spin",
                "ver": "2",
            },
            headers=headers,
            proxies=get_proxies(),
            timeout=timeout,
        )
        logger.debug(f"{response.content=}")
        style_data = response.json()["data"]
        backstr = style_data["backstr"]
        captcha_link = style_data["captchalist"][0]["source"]["back"]["path"]

        # Step 3: download the captcha image. The bytes are kept in memory
        # instead of going through a shared "captcha.png", which would be
        # clobbered when several crawler threads hit a captcha at once.
        picture = requests.get(
            captcha_link,
            headers=headers,
            proxies=get_proxies(),
            timeout=timeout,
        ).content
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as exc:
        # Network failure, non-JSON body, or an unexpected response shape:
        # report it as a normal failure so the caller's retry loop kicks in.
        logger.error(f"获取验证码数据失败: {exc!r}")
        return None
    logger.debug(f"downloaded captcha image, {len(picture)} bytes")

    # Recognise the rotation needed for the captcha image.
    slide_distance = YdmVerify().rotate(picture)
    logger.debug(f"{slide_distance=}")
    if not slide_distance:
        logger.error("识别验证码失败")
        return None

    # The signing script takes the rotation as a fraction of a full turn.
    rotate_angle_rate = round(slide_distance / 360, 2)
    logger.debug(f"{rotate_angle_rate=}")
    if not rotate_angle_rate:
        logger.debug("识别验证码失败")
        return None

    # Step 4: sign the answer with the bundled JS and submit it.
    time_log = str(int(time.time() * 1000))
    with open("./js/mkd_v2_link_submit.js", 'r', encoding='utf-8') as f:
        ds_js = f.read()
    fs = execjs.compile(ds_js).call('getFs2', backstr, rotate_angle_rate, as_value)
    data = {
        "_": time_log,
        "refer": current_url,
        "ak": ak,
        "as": as_value,
        "scene": "",
        "tk": tk_value,
        "ver": "2",
        "cv": "submit",
        "typeid": "spin-0",
        "fuid": "FOCoIC3q5fKa8fgJnwzbE67EJ49BGJeplOzf+4l4EOvDuu2RXBRv6R3A1AZMa49I27C0gDDLrJyxcIIeAeEhD8JYsoLTpBiaCXhLqvzbzmvy3SeAW17tKgNq/Xx+RgOdb8TWCFe62MVrDTY6lMf2GrfqL8c87KLF2qFER3obJGnfaJTn/Ne60I9LwR04t6XmGEimjy3MrXEpSuItnI4KD0FJKzTbw1AN69fBnzR2FuvMmmQZ+1zgJ72wdcVU+mcQxiE2ir0+TEYgjPJt1Qa3K1mLi+P4IWJeag2lvxB4yJ/GgLbz7OSojK1zRbqBESR5Pdk2R9IA3lxxOVzA+Iw1TWLSgWjlFVG9Xmh1+20oPSbrzvDjYtVPmZ+9/6evcXmhcO1Y58MgLozKnaQIaLfWRPAn9I0uOqAMff6fuUeWcH1mRYoTw2Nhr4J4agZi377iM/izL6cVCGRy2F8c0VpEvM5FjnYxYstXg/9EfB3EVmKAfzNRIeToJ5YV9twMcgdmlV1Uhhp5FAe6gNJIUptp7EMAaXYKm11G+JVPszQFdp9AJLcm4YSsYUXkaPI2Tl66J246cmjWQDTahAOINR5rXR5r/7VVI1RMZ8gb40q7az7vCK56XLooKT5a+rsFrf5Zu0yyCiiagElhrTEOtNdBJJq8eHwEHuFBni9ahSwpC7lbKkUwaKH69tf0DFV7hJROiLETSFloIVkHdy3+I2JUr1LsplAz0hMkWt/tE4tXVUV7QcTDTZWS/2mCoS/GV3N9awQ6iM6hs/BWjlgnEa1+5iP7WSc7RJ34FaE5PsyGXyoCWdXwNRGSZPSvVtB/Ea6w5FKazXcZ/j40FJv+iLGBn3nkkgHlne61I8I7KhtQgIkmBMJIjPMkS/L051MeqdGScsKYTJuSucgI5c3+79eVH+y2TvbOTuuHv1uGxwXFb2atIU1ZYPbmmXculmizKcKI/s44qf8uM8iBZLGkKeVyL74aPyLkg7Gk359g98BIGN/ZzJR/h+Y6AyFx+HlMoYJnS06dVmqFbvlCtSdGylKQ5f8eWtxPkJGqOFtWjIVteQYMsH/AaSJonqw+WLiZvGjYfm9p0alEyujapoTy77HzDcUoU1wUSXa5xS/Z6hXEr2OnLi0LdPVcGjz8lpLcdVeSfm9p0alEyujapoTy77HzDWf5PERRSTFqLd9BTUHLyY4Ji3EQLGQPaM1aeHxG1bJZH0s1Si/KwzTaTYzu6ziQiqwcr2kaYUiH+fMOxn69/BhNJVMhpQkhprc1KZuJRvXjppq0gKweencPxgS/jd0rjw==",
        "fs": fs,
    }
    try:
        response = requests.post(
            "https://passport.baidu.com/cap/log",
            headers=headers,
            data=data,
            proxies=get_proxies(),
            timeout=timeout,
        ).json()
    except (requests.RequestException, ValueError) as exc:
        logger.error(f"提交验证码结果失败: {exc!r}")
        return None

    try:
        result = {
            "ds": response["data"]["ds"],
            "op": response["data"]["op"],
            "tk": response["data"]["tk"],
        }
    except (KeyError, TypeError):
        logger.error(f"验证码没转成功, response: {response=}")
        time.sleep(1)
        return None
    logger.debug(f"{result=}")

    # op == 1 is the only value treated as an accepted answer.
    if result["op"] != 1:
        logger.error("op != 1, 重试")
        return None

    # Step 5: ask /cap/c for the URL to jump to after verification.
    try:
        verdict = requests.post(
            "https://passport.baidu.com/cap/c?ak=" + ak,
            headers=headers,
            json={
                "tk": result["tk"],
                "ds": result["ds"],
                "qrsign": "",
                "refer": current_url,
            },
            proxies=get_proxies(),
            timeout=timeout,
        ).json()["data"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        logger.error(f"获取跳转URL失败: {exc!r}")
        return None

    if verdict.get("f"):
        logger.error(f"验证码失败: {verdict.get('f')}")
        return None

    success = verdict.get("s")
    if not success:
        # Neither a failure nor a success payload: nothing to navigate to.
        return None

    url = success.get("url")
    if not url:
        logger.error("验证成功但响应中没有URL")
        return None
    logger.debug("验证成功,URL:" + url)

    # Resolve literal backslash escapes in the URL. Encoding via latin-1 with
    # 'backslashreplace' keeps non-ASCII characters intact; a utf-8 round trip
    # through 'unicode-escape' would turn them into mojibake.
    url = url.encode("latin-1", "backslashreplace").decode("unicode-escape")
    logger.success("解码后的URL:" + url)
    return url
|
||||||
|
|||||||
@ -7,7 +7,7 @@ from loguru import logger
|
|||||||
|
|
||||||
class YdmVerify(object):
|
class YdmVerify(object):
|
||||||
_custom_url = "https://www.jfbym.com/api/YmServer/customApi"
|
_custom_url = "https://www.jfbym.com/api/YmServer/customApi"
|
||||||
_token = "HhUGwpI6AtQGoux36i1ZpsDv7hwGSbr1hQ0RX-HXSZE"
|
_token = "2HNCDBee_JFmXAZZanQm9I7x1sqQln9BggF1xaGtMX0"
|
||||||
_headers = {
|
_headers = {
|
||||||
'Content-Type': 'application/json'
|
'Content-Type': 'application/json'
|
||||||
}
|
}
|
||||||
|
|||||||
@ -78,7 +78,7 @@ class DomainService:
|
|||||||
domain=x,
|
domain=x,
|
||||||
status=DomainStatus.READY.value,
|
status=DomainStatus.READY.value,
|
||||||
crawl_interval=interval,
|
crawl_interval=interval,
|
||||||
latest_crawl_time=0 if not crawl_now else int(time.time())
|
latest_crawl_time=0 if crawl_now else int(time.time())
|
||||||
) for x in domains
|
) for x in domains
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
0
tests/test_unicode.py
Normal file
0
tests/test_unicode.py
Normal file
Loading…
x
Reference in New Issue
Block a user