采集器增加验证码打码功能

This commit is contained in:
xhy 2025-04-09 22:58:25 +08:00
parent b7f545dab8
commit 19607428f8
4 changed files with 195 additions and 10 deletions

View File

@ -1,7 +1,10 @@
import queue import queue
import re
import threading import threading
import time import time
import execjs
import requests
from DrissionPage.errors import ElementNotFoundError from DrissionPage.errors import ElementNotFoundError
from loguru import logger from loguru import logger
from sqlmodel import Session, select, or_, and_ from sqlmodel import Session, select, or_, and_
@ -10,7 +13,9 @@ from app.config.config import AppCtx
from app.constants.domain import DomainStatus from app.constants.domain import DomainStatus
from app.models.domain import DomainModel from app.models.domain import DomainModel
from app.models.report_urls import ReportUrlModel from app.models.report_urls import ReportUrlModel
from app.utils.common import get_proxies
from app.utils.dp import DPEngine from app.utils.dp import DPEngine
from app.utils.ydm_verify import YdmVerify
class CrawlEngine: class CrawlEngine:
@ -36,8 +41,6 @@ class CrawlEngine:
# 创建一个浏览器 # 创建一个浏览器
self.dp_engine = DPEngine() self.dp_engine = DPEngine()
self.database = AppCtx.g_db_engine
def cli_start(self, target_domains: str, target_domain_filepath: str): def cli_start(self, target_domains: str, target_domain_filepath: str):
"""CLI 模式启动 """CLI 模式启动
target_domains: 英文逗号分割的字符串 target_domains: 英文逗号分割的字符串
@ -91,7 +94,7 @@ class CrawlEngine:
continue continue
# 存入数据库 # 存入数据库
with Session(self.database) as session: with Session(AppCtx.g_db_engine) as session:
self.save_surl(session, domain, surl) self.save_surl(session, domain, surl)
except queue.Empty: except queue.Empty:
# 队列空了等1秒再取一次 # 队列空了等1秒再取一次
@ -120,7 +123,7 @@ class CrawlEngine:
# 检查在数据库中是否有重复的 # 检查在数据库中是否有重复的
for domain in domains: for domain in domains:
with Session(self.database) as session: with Session(AppCtx.g_db_engine) as session:
stmt = select(DomainModel).where(DomainModel.domain == domain) stmt = select(DomainModel).where(DomainModel.domain == domain)
result = session.exec(stmt).first() result = session.exec(stmt).first()
if not result: if not result:
@ -163,10 +166,6 @@ class CrawlEngine:
) )
) )
) )
# stmt = select(DomainModel).where(
# DomainModel.latest_crawl_time + DomainModel.crawl_interval * 60 <= current_timestamp
# )
domains = session.exec(stmt).all() domains = session.exec(stmt).all()
for domain_model in domains: for domain_model in domains:
@ -234,6 +233,26 @@ class CrawlEngine:
# f"https://www.baidu.com/s?wd=site%3A{domain}&gpc=stf%3D{start_time}%2C{end_time}%7Cstftype%3D1&pn={(current_page - 1) * 10}") # f"https://www.baidu.com/s?wd=site%3A{domain}&gpc=stf%3D{start_time}%2C{end_time}%7Cstftype%3D1&pn={(current_page - 1) * 10}")
# tab.get(f"https://www.baidu.com/s?wd=site%3A{domain}&pn={(current_page - 1) * 10}") # tab.get(f"https://www.baidu.com/s?wd=site%3A{domain}&pn={(current_page - 1) * 10}")
# 检查一下当前的URL是不是跳到验证码的页面
if "//wappass.baidu.com/static/captcha/tuxing_v2.html" in tab.url:
logger.warning("触发验证码了,尝试识别")
idx = 0
while idx < 3:
idx += 1
logger.debug(f"开始第{idx}次识别...")
captcha_result = self.verify_captcha(tab.url)
if not captcha_result:
tab.refresh()
continue
else:
tab.get(captcha_result)
break
else:
logger.error("验证码打码失败放弃本次采集等待3分钟后继续")
self.ev.wait(180)
break
# 终止条件 # 终止条件
if current_page > max_page and max_page: if current_page > max_page and max_page:
logger.debug(f"{threading.current_thread().name} 达到指定页码,退出") logger.debug(f"{threading.current_thread().name} 达到指定页码,退出")
@ -314,3 +333,169 @@ class CrawlEngine:
) )
session.add(example) session.add(example)
session.commit() session.commit()
# def captcha_listener(self):
# for pkg in self.tab.listen.steps():
# if "/cap/init" in pkg.url:
# self.captcha_data["init"] = pkg.response.body
# if "/cap/style" in pkg.url:
# self.captcha_data["style"] = pkg.response.body
# self.captcha_data["referer"] = pkg.request.headers.get("Referer")
# logger.debug(f"触发验证码的 referer: {self.captcha_data["referer"]}")
#
# self.captcha_data["cookie"] = pkg.request.headers.get("Cookie")
# logger.debug(f"触发验证码的 cookie: {self.captcha_data['cookie']}")
# if "/cap/log" in pkg.url:
# self.captcha_data["log"] = pkg.response.body
def verify_captcha(self, current_url: str):
    """Try to solve the Baidu rotation captcha shown at ``current_url``.

    Kept separate from the pc_reporter implementation because the flow
    differs slightly. The steps are:

    1. POST ``/cap/init`` to obtain the ``as`` and ``tk`` values.
    2. POST ``/cap/style`` to obtain ``backstr`` and the captcha image link.
    3. Download the image and hand its bytes to ``YdmVerify.rotate``.
    4. Compute the ``fs`` field with the ``getFs2`` function from
       ``./js/mkd_v2_link_submit.js`` and POST it to ``/cap/log``.
    5. POST ``/cap/c`` to exchange the result for the redirect URL.

    Args:
        current_url: URL of the captcha page; sent as ``Referer`` and as the
            ``refer`` form field.

    Returns:
        The decoded URL to navigate to on success, otherwise ``None``.
        Transport errors, non-JSON bodies and unexpected response shapes are
        logged and reported as ``None`` so the caller's retry loop can run.

    Raises:
        OSError: if the JavaScript helper file cannot be read.
    """
    # Value sent as the ``ak`` field/query parameter of every captcha request.
    ak = "c27bbc89afca0463650ac9bde68ebe06"
    # Seconds to wait for each HTTP call; without a timeout a stalled proxy
    # would block the calling thread forever.
    request_timeout = 15

    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-US;q=0.7,zh-TW;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': current_url,
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.0.0",
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua_wap': '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
        'sec-ch-ua_wap-mobile': '?0',
        'sec-ch-ua_wap-platform': '"Windows"',
    }

    ts = time.time()
    # Same page URL with its ``timestamp`` query value replaced by "now";
    # used by both the init and the style request.
    refreshed_refer = re.sub(r'timestamp=\d+', f'timestamp={int(ts)}', current_url)

    try:
        # Step 1: obtain the "as" / "tk" values.
        response = requests.post(
            "https://passport.baidu.com/cap/init",
            data={
                "_": int(ts * 1000),
                "refer": refreshed_refer,
                "ak": ak,
                "ver": "2",
                "scene": "",
                "ds": "",
                "tk": "",
                "as": "",
                "reinit": 0,
            },
            headers=headers,
            proxies=get_proxies(),
            timeout=request_timeout,
        ).json()
        as_value = response["data"]["as"]
        tk_value = response["data"]["tk"]

        # Step 2: obtain the style data (backstr + image link).
        response = requests.post(
            "https://passport.baidu.com/cap/style",
            data={
                "_": int(time.time() * 1000),
                "refer": refreshed_refer,
                "ak": ak,
                "tk": tk_value,
                "scene": "",
                "isios": "0",
                "type": "spin",
                "ver": "2",
            },
            headers=headers,
            proxies=get_proxies(),
            timeout=request_timeout,
        )
        logger.debug(f"{response.content=}")
        response = response.json()
        backstr = response["data"]["backstr"]
        captcha_link = response["data"]["captchalist"][0]["source"]["back"]["path"]

        # Step 3: download the image and recognise it. The bytes are used
        # directly instead of going through a shared "captcha.png" file, which
        # concurrent crawl threads would overwrite.
        image_response = requests.get(
            captcha_link,
            headers=headers,
            proxies=get_proxies(),
            timeout=request_timeout,
        )
        # Do not forward an HTTP error page to the solving service.
        image_response.raise_for_status()
        logger.debug("download captcha.png")

        slide_distance = YdmVerify().rotate(image_response.content)
        logger.debug(f"{slide_distance=}")
        if not slide_distance:
            logger.error("识别验证码失败")
            return None

        # Recognised value expressed as a fraction of a full 360 turn.
        rotate_angle_rate = round(slide_distance / 360, 2)
        logger.debug(f"{rotate_angle_rate=}")
        if not rotate_angle_rate:
            logger.debug("识别验证码失败")
            return None

        # Step 4: submit the answer.
        time_log = str(int(time.time() * 1000))
        with open("./js/mkd_v2_link_submit.js", 'r', encoding='utf-8') as f:
            ds_js = f.read()
        fs = execjs.compile(ds_js).call('getFs2', backstr, rotate_angle_rate, as_value)

        response = requests.post(
            "https://passport.baidu.com/cap/log",
            headers=headers,
            data={
                "_": time_log,
                "refer": current_url,
                "ak": ak,
                "as": as_value,
                "scene": "",
                "tk": tk_value,
                "ver": "2",
                "cv": "submit",
                "typeid": "spin-0",
                "fuid": "FOCoIC3q5fKa8fgJnwzbE67EJ49BGJeplOzf+4l4EOvDuu2RXBRv6R3A1AZMa49I27C0gDDLrJyxcIIeAeEhD8JYsoLTpBiaCXhLqvzbzmvy3SeAW17tKgNq/Xx+RgOdb8TWCFe62MVrDTY6lMf2GrfqL8c87KLF2qFER3obJGnfaJTn/Ne60I9LwR04t6XmGEimjy3MrXEpSuItnI4KD0FJKzTbw1AN69fBnzR2FuvMmmQZ+1zgJ72wdcVU+mcQxiE2ir0+TEYgjPJt1Qa3K1mLi+P4IWJeag2lvxB4yJ/GgLbz7OSojK1zRbqBESR5Pdk2R9IA3lxxOVzA+Iw1TWLSgWjlFVG9Xmh1+20oPSbrzvDjYtVPmZ+9/6evcXmhcO1Y58MgLozKnaQIaLfWRPAn9I0uOqAMff6fuUeWcH1mRYoTw2Nhr4J4agZi377iM/izL6cVCGRy2F8c0VpEvM5FjnYxYstXg/9EfB3EVmKAfzNRIeToJ5YV9twMcgdmlV1Uhhp5FAe6gNJIUptp7EMAaXYKm11G+JVPszQFdp9AJLcm4YSsYUXkaPI2Tl66J246cmjWQDTahAOINR5rXR5r/7VVI1RMZ8gb40q7az7vCK56XLooKT5a+rsFrf5Zu0yyCiiagElhrTEOtNdBJJq8eHwEHuFBni9ahSwpC7lbKkUwaKH69tf0DFV7hJROiLETSFloIVkHdy3+I2JUr1LsplAz0hMkWt/tE4tXVUV7QcTDTZWS/2mCoS/GV3N9awQ6iM6hs/BWjlgnEa1+5iP7WSc7RJ34FaE5PsyGXyoCWdXwNRGSZPSvVtB/Ea6w5FKazXcZ/j40FJv+iLGBn3nkkgHlne61I8I7KhtQgIkmBMJIjPMkS/L051MeqdGScsKYTJuSucgI5c3+79eVH+y2TvbOTuuHv1uGxwXFb2atIU1ZYPbmmXculmizKcKI/s44qf8uM8iBZLGkKeVyL74aPyLkg7Gk359g98BIGN/ZzJR/h+Y6AyFx+HlMoYJnS06dVmqFbvlCtSdGylKQ5f8eWtxPkJGqOFtWjIVteQYMsH/AaSJonqw+WLiZvGjYfm9p0alEyujapoTy77HzDcUoU1wUSXa5xS/Z6hXEr2OnLi0LdPVcGjz8lpLcdVeSfm9p0alEyujapoTy77HzDWf5PERRSTFqLd9BTUHLyY4Ji3EQLGQPaM1aeHxG1bJZH0s1Si/KwzTaTYzu6ziQiqwcr2kaYUiH+fMOxn69/BhNJVMhpQkhprc1KZuJRvXjppq0gKweencPxgS/jd0rjw==",
                "fs": fs,
            },
            proxies=get_proxies(),
            timeout=request_timeout,
        ).json()
        try:
            result = {
                "ds": response["data"]["ds"],
                "op": response["data"]["op"],
                "tk": response["data"]["tk"],
            }
        except KeyError:
            logger.error(f"验证码没转成功, response: {response=}")
            time.sleep(1)
            return None
        logger.debug(f"{result=}")

        # An "op" other than 1 is treated as a rejected answer.
        if result["op"] != 1:
            logger.error("op != 1, 重试")
            return None

        # Step 5: exchange the accepted result for the URL to jump to.
        data = requests.post(
            "https://passport.baidu.com/cap/c?ak=" + ak,
            headers=headers,
            json={
                "tk": result["tk"],
                "ds": result["ds"],
                "qrsign": "",
                "refer": current_url,
            },
            proxies=get_proxies(),
            timeout=request_timeout,
        ).json()

        failure = data["data"].get("f")
        if failure:
            logger.error(f"验证码失败: {failure}")
            return None

        success = data["data"].get("s")
        if success:
            url = success.get("url")
            logger.debug("验证成功URL" + url)
            # The URL arrives with backslash escapes; turn them into
            # plain characters before handing it to the browser.
            url = url.encode("utf-8").decode("unicode-escape")
            logger.success("解码后的URL" + url)
            return url

        # Neither a failure nor a success section: report it explicitly
        # instead of silently falling off the end of the function.
        logger.error(f"验证码响应无法识别: {data=}")
        return None
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as exc:
        # Network trouble, a non-JSON body, or an unexpected response shape.
        # The caller retries on a falsy result, so report failure rather
        # than letting the exception kill the crawl thread.
        logger.error(f"验证码请求或响应解析失败: {exc!r}")
        return None

View File

@ -7,7 +7,7 @@ from loguru import logger
class YdmVerify(object): class YdmVerify(object):
_custom_url = "https://www.jfbym.com/api/YmServer/customApi" _custom_url = "https://www.jfbym.com/api/YmServer/customApi"
_token = "HhUGwpI6AtQGoux36i1ZpsDv7hwGSbr1hQ0RX-HXSZE" _token = "2HNCDBee_JFmXAZZanQm9I7x1sqQln9BggF1xaGtMX0"
_headers = { _headers = {
'Content-Type': 'application/json' 'Content-Type': 'application/json'
} }

View File

@ -78,7 +78,7 @@ class DomainService:
domain=x, domain=x,
status=DomainStatus.READY.value, status=DomainStatus.READY.value,
crawl_interval=interval, crawl_interval=interval,
latest_crawl_time=0 if not crawl_now else int(time.time()) latest_crawl_time=0 if crawl_now else int(time.time())
) for x in domains ) for x in domains
] ]

0
tests/test_unicode.py Normal file
View File