diff --git a/app/engines/crawl_engine.py b/app/engines/crawl_engine.py index 34c3ab8..32ee0c1 100644 --- a/app/engines/crawl_engine.py +++ b/app/engines/crawl_engine.py @@ -1,7 +1,10 @@ import queue +import re import threading import time +import execjs +import requests from DrissionPage.errors import ElementNotFoundError from loguru import logger from sqlmodel import Session, select, or_, and_ @@ -10,7 +13,9 @@ from app.config.config import AppCtx from app.constants.domain import DomainStatus from app.models.domain import DomainModel from app.models.report_urls import ReportUrlModel +from app.utils.common import get_proxies from app.utils.dp import DPEngine +from app.utils.ydm_verify import YdmVerify class CrawlEngine: @@ -36,8 +41,6 @@ class CrawlEngine: # 创建一个浏览器 self.dp_engine = DPEngine() - self.database = AppCtx.g_db_engine - def cli_start(self, target_domains: str, target_domain_filepath: str): """CLI 模式启动 target_domains: 英文逗号分割的字符串 @@ -91,7 +94,7 @@ class CrawlEngine: continue # 存入数据库 - with Session(self.database) as session: + with Session(AppCtx.g_db_engine) as session: self.save_surl(session, domain, surl) except queue.Empty: # 队列空了,等1秒再取一次 @@ -120,7 +123,7 @@ class CrawlEngine: # 检查在数据库中是否有重复的 for domain in domains: - with Session(self.database) as session: + with Session(AppCtx.g_db_engine) as session: stmt = select(DomainModel).where(DomainModel.domain == domain) result = session.exec(stmt).first() if not result: @@ -163,10 +166,6 @@ class CrawlEngine: ) ) ) - - # stmt = select(DomainModel).where( - # DomainModel.latest_crawl_time + DomainModel.crawl_interval * 60 <= current_timestamp - # ) domains = session.exec(stmt).all() for domain_model in domains: @@ -234,6 +233,26 @@ class CrawlEngine: # f"https://www.baidu.com/s?wd=site%3A{domain}&gpc=stf%3D{start_time}%2C{end_time}%7Cstftype%3D1&pn={(current_page - 1) * 10}") # tab.get(f"https://www.baidu.com/s?wd=site%3A{domain}&pn={(current_page - 1) * 10}") + # 检查一下当前的URL是不是跳到验证码的页面 + if 
def verify_captcha(self, current_url: str, timeout: float = 15):
    """Try to solve Baidu's rotate ("spin") captcha for ``current_url``.

    Implemented separately from pc_reporter because the flow differs
    slightly from the one used there.

    The flow is: ``/cap/init`` (obtain ``as``/``tk``) -> ``/cap/style``
    (obtain ``backstr`` and the image link) -> download the image -> ask
    the recognition service for the rotation -> ``/cap/log`` (submit the
    answer) -> ``/cap/c`` (exchange the ticket for the redirect URL).

    Args:
        current_url: URL of the captcha page the browser tab landed on.
            It is sent as ``Referer`` and as the ``refer`` form field.
        timeout: Per-request timeout in seconds passed to ``requests``.

    Returns:
        The URL to navigate to once the captcha is accepted, or ``None``
        when any step fails (network error, unexpected response shape,
        recognition failure, rejected answer). Callers treat a falsy
        result as "retry".
    """
    ak = "c27bbc89afca0463650ac9bde68ebe06"
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-US;q=0.7,zh-TW;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': current_url,
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.0.0",
        'X-Requested-With': 'XMLHttpRequest',
        # NOTE(review): the "_wap" suffix in these three header names looks
        # like a search-and-replace accident ("sec-ch-ua") - confirm.
        'sec-ch-ua_wap': '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
        'sec-ch-ua_wap-mobile': '?0',
        'sec-ch-ua_wap-platform': '"Windows"',
    }
    # Failures that mean "this attempt did not work": transport errors,
    # a non-JSON body, or a JSON body without the expected fields.
    attempt_errors = (requests.RequestException, ValueError, KeyError, IndexError, TypeError)

    ts = time.time()
    # /cap/init and /cap/style both send the page URL with its timestamp
    # query parameter rewritten to the current second.
    refer = re.sub(r'timestamp=\d+', f'timestamp={int(ts)}', current_url)

    try:
        # Step 1: obtain the "as" / "tk" values.
        response = requests.post(
            "https://passport.baidu.com/cap/init",
            data={
                "_": int(ts * 1000),
                "refer": refer,
                "ak": ak,
                "ver": "2",
                "scene": "",
                "ds": "",
                "tk": "",
                "as": "",
                "reinit": 0,
            },
            headers=headers,
            proxies=get_proxies(),
            timeout=timeout,
        ).json()
        as_value = response["data"]["as"]
        tk_value = response["data"]["tk"]

        # Step 2: obtain "backstr" and the captcha image link.
        response = requests.post(
            "https://passport.baidu.com/cap/style",
            data={
                "_": int(time.time() * 1000),
                "refer": refer,
                "ak": ak,
                "tk": tk_value,
                "scene": "",
                "isios": "0",
                "type": "spin",
                "ver": "2",
            },
            headers=headers,
            proxies=get_proxies(),
            timeout=timeout,
        )
        logger.debug(f"{response.content=}")
        response = response.json()
        backstr = response["data"]["backstr"]
        captcha_link = response["data"]["captchalist"][0]["source"]["back"]["path"]

        # Step 3: download the captcha image. The bytes stay in memory; a
        # fixed "captcha.png" in the working directory would be overwritten
        # by any other thread solving a captcha at the same time.
        picture = requests.get(
            captcha_link,
            headers=headers,
            proxies=get_proxies(),
            timeout=timeout,
        ).content
    except attempt_errors as exc:
        logger.error(f"获取验证码失败: {exc!r}")
        return None

    # Step 4: ask the recognition service for the rotation.
    slide_distance = YdmVerify().rotate(picture)
    logger.debug(f"{slide_distance=}")
    if not slide_distance:
        logger.error("识别验证码失败")
        return None
    # The submit script takes the rotation as a fraction of a full turn.
    rotate_angle_rate = round(slide_distance / 360, 2)
    logger.debug(f"{rotate_angle_rate=}")
    if not rotate_angle_rate:
        logger.debug("识别验证码失败")
        return None

    # Step 5: compute the "fs" field with the bundled JS and submit.
    time_log = str(int(time.time() * 1000))
    with open("./js/mkd_v2_link_submit.js", 'r', encoding='utf-8') as f:
        ds_js = f.read()
    fs = execjs.compile(ds_js).call('getFs2', backstr, rotate_angle_rate, as_value)
    data = {
        "_": time_log,
        "refer": current_url,
        "ak": ak,
        "as": as_value,
        "scene": "",
        "tk": tk_value,
        "ver": "2",
        "cv": "submit",
        "typeid": "spin-0",
        "fuid": "FOCoIC3q5fKa8fgJnwzbE67EJ49BGJeplOzf+4l4EOvDuu2RXBRv6R3A1AZMa49I27C0gDDLrJyxcIIeAeEhD8JYsoLTpBiaCXhLqvzbzmvy3SeAW17tKgNq/Xx+RgOdb8TWCFe62MVrDTY6lMf2GrfqL8c87KLF2qFER3obJGnfaJTn/Ne60I9LwR04t6XmGEimjy3MrXEpSuItnI4KD0FJKzTbw1AN69fBnzR2FuvMmmQZ+1zgJ72wdcVU+mcQxiE2ir0+TEYgjPJt1Qa3K1mLi+P4IWJeag2lvxB4yJ/GgLbz7OSojK1zRbqBESR5Pdk2R9IA3lxxOVzA+Iw1TWLSgWjlFVG9Xmh1+20oPSbrzvDjYtVPmZ+9/6evcXmhcO1Y58MgLozKnaQIaLfWRPAn9I0uOqAMff6fuUeWcH1mRYoTw2Nhr4J4agZi377iM/izL6cVCGRy2F8c0VpEvM5FjnYxYstXg/9EfB3EVmKAfzNRIeToJ5YV9twMcgdmlV1Uhhp5FAe6gNJIUptp7EMAaXYKm11G+JVPszQFdp9AJLcm4YSsYUXkaPI2Tl66J246cmjWQDTahAOINR5rXR5r/7VVI1RMZ8gb40q7az7vCK56XLooKT5a+rsFrf5Zu0yyCiiagElhrTEOtNdBJJq8eHwEHuFBni9ahSwpC7lbKkUwaKH69tf0DFV7hJROiLETSFloIVkHdy3+I2JUr1LsplAz0hMkWt/tE4tXVUV7QcTDTZWS/2mCoS/GV3N9awQ6iM6hs/BWjlgnEa1+5iP7WSc7RJ34FaE5PsyGXyoCWdXwNRGSZPSvVtB/Ea6w5FKazXcZ/j40FJv+iLGBn3nkkgHlne61I8I7KhtQgIkmBMJIjPMkS/L051MeqdGScsKYTJuSucgI5c3+79eVH+y2TvbOTuuHv1uGxwXFb2atIU1ZYPbmmXculmizKcKI/s44qf8uM8iBZLGkKeVyL74aPyLkg7Gk359g98BIGN/ZzJR/h+Y6AyFx+HlMoYJnS06dVmqFbvlCtSdGylKQ5f8eWtxPkJGqOFtWjIVteQYMsH/AaSJonqw+WLiZvGjYfm9p0alEyujapoTy77HzDcUoU1wUSXa5xS/Z6hXEr2OnLi0LdPVcGjz8lpLcdVeSfm9p0alEyujapoTy77HzDWf5PERRSTFqLd9BTUHLyY4Ji3EQLGQPaM1aeHxG1bJZH0s1Si/KwzTaTYzu6ziQiqwcr2kaYUiH+fMOxn69/BhNJVMhpQkhprc1KZuJRvXjppq0gKweencPxgS/jd0rjw==",
        "fs": fs,
    }
    try:
        response = requests.post(
            "https://passport.baidu.com/cap/log",
            headers=headers,
            data=data,
            proxies=get_proxies(),
            timeout=timeout,
        ).json()
    except (requests.RequestException, ValueError) as exc:
        logger.error(f"提交验证码失败: {exc!r}")
        return None
    try:
        result = {
            "ds": response["data"]["ds"],
            "op": response["data"]["op"],
            "tk": response["data"]["tk"],
        }
    except (KeyError, TypeError):
        logger.error(f"验证码没转成功, response: {response=}")
        time.sleep(1)
        return None
    logger.debug(f"{result=}")

    # The code proceeds only when "op" equals 1; anything else is a retry.
    if result["op"] != 1:
        logger.error("op != 1, 重试")
        return None

    # Step 6: exchange the accepted ticket for the redirect URL.
    try:
        data = requests.post(
            f"https://passport.baidu.com/cap/c?ak={ak}",
            headers=headers,
            json={
                "tk": result["tk"],
                "ds": result["ds"],
                "qrsign": "",
                "refer": current_url,
            },
            proxies=get_proxies(),
            timeout=timeout,
        ).json()
        verdict = data["data"]
        failure = verdict.get("f")
        success = verdict.get("s")
    except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError) as exc:
        logger.error(f"获取跳转URL失败: {exc!r}")
        return None

    if failure:
        logger.error(f"验证码失败: {failure}")
        return None
    url = success.get("url") if isinstance(success, dict) else None
    if not url:
        return None

    logger.debug("验证成功,URL:" + url)
    # Resolve literal backslash escapes (e.g. "\u0026") left in the URL.
    # latin-1 + backslashreplace keeps non-ASCII characters intact, whereas
    # a utf-8 round trip through unicode-escape would turn them into mojibake.
    url = url.encode("latin-1", "backslashreplace").decode("unicode-escape")
    logger.success("解码后的URL:" + url)
    return url