Technical notice: the content of this article is for technical research and learning purposes only. Any data requests should comply with the target site's robots.txt and terms of service.
While building a daily-task push plugin for Sky: Children of the Light (光遇), I ran into a thorny problem: the original approach relied on users manually filling in their Weibo cookies (SUB and XSRF-TOKEN), which brought two pain points.
A direct request to the Weibo API (such as /api/container/getIndex) comes back with HTTP 432, which shows that the server enforces strict session management even for "visitor" traffic.
Technical goal: reverse engineer Weibo's H5 visitor authentication protocol and automate the acquisition of temporary visitor credentials, so the plugin can read Weibo data without asking users for any cookie.
Packet captures in Edge DevTools, compared before and after clearing cookies, confirmed that Weibo's H5 visitor authentication is a closed loop of three cross-domain stages: request genvisitor2 to obtain a SUB token, visit m.weibo.cn to activate the session and receive the XSRF-TOKEN, then call the data API with both.
Endpoint: POST https://visitor.passport.weibo.cn/visitor/genvisitor2
Key request parameters: cb (the JSONP callback name), tid, and new_tid.
Technical difficulty: the response body is visitor_gray_callback({...}) rather than standard JSON, so the payload has to be extracted with a regex (the extraction snippet is in the code section below).
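For reference, the wrapped response body looks roughly like this (the values are illustrative; retcode, data.sub and data.subp are the fields the extraction code relies on):
visitor_gray_callback({"retcode": 20000000, "data": {"sub": "<SUB>", "subp": "<SUBP>", ...}})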
Endpoint: GET https://m.weibo.cn/
Purpose: visit the main site carrying the SUB cookie, which activates the session and sets the XSRF-TOKEN cookie.
Key points:
Security mechanism: the Double Submit Cookie pattern is used to defend against CSRF.
Client requirement: requests must send an x-xsrf-token header, which the server validates against the XSRF-TOKEN cookie:
The complete proof-of-concept code is listed in the code section below.
Integrating the PoC into production brought several challenges, so I designed a smart fallback mechanism. Workflow: prefer the cookie-based PC API whenever the user has configured a cookie; if that path fails for any reason, switch automatically to the cookie-free visitor flow (see the Spider.fetch sketch below).
PC API (/ajax/statuses/mymblog): each post carries plain text in text_raw and images under pic_infos.
Mobile API (/api/container/getIndex): posts are nested under cards → card_group → mblog, the text field contains HTML, and images sit under pics.
Adaptation: normalize the mobile response by converting <br> tags to newlines and stripping the remaining HTML tags (see _parse_mobile_mblogs below).
The text returned by the mobile API is truncated for long posts, so a separate long-text API call is needed (see fetch_long_text below).
Problem: every data source performed its own visitor authentication, which drove the request count up; with two data sources, the three-stage handshake used to run once per source.
Optimization: authenticate once, keep the HTTP client open, and reuse it (together with its cookies) for both the post list and the long-text request, so the handshake runs only once per update (see the client-reuse sketch below).
Problem: after the mobile API's HTML is cleaned up, the super-topic tag loses its markup (compare the original HTML with the cleaned-up text in the samples below), so the old regex no longer matches.
Solution: change the regex from depending on that specific tag format to plain keyword matching (old and new patterns are shown below).
Work logic:
cookies.enabled = true and a cookie filled in → use the PC API
cookies.enabled = false or the cookie left empty → automatically use the cookie-free scheme
The plugin configuration, the retry handling, and sample log output are all included in the code section below.
This approach suits similar read-only scenarios where no user login is required.
Project: GitHub - Sky Daily Plugin
The full code from this article is open source; Stars and PRs are welcome, and questions can be raised in the Issues.
data = {
    'cb': 'visitor_gray_callback',
    'tid': '',
    'new_tid': 'null'
}
resp = session.post(url, headers=headers, data=data)

match = re.search(r'visitor_gray_callback\((.*)\)', resp.text)
json_data = json.loads(match.group(1))
sub = json_data['data']['sub']
session.cookies.set('SUB', sub, domain='.weibo.cn')
Header['x-xsrf-token'] == Cookie['XSRF-TOKEN']
# -*- coding: utf-8 -*-
import requests
import json
import re


class WeiboH5VisitorAuth:
    def __init__(self):
        self.session = requests.Session()
        self._init_headers()
        self.sub = None
        self.xsrf_token = None

    def _init_headers(self):
        """Configure the browser-fingerprint headers."""
        headers_base = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'sec-ch-ua': '"Chromium";v="142", "Microsoft Edge";v="142"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'dnt': '1',
        }
        self.session.headers.update(headers_base)

    def step1_obtain_identity_token(self):
        """Stage 1: obtain the SUB visitor token."""
        url = 'https://visitor.passport.weibo.cn/visitor/genvisitor2'
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Origin': 'https://visitor.passport.weibo.cn',
        }
        data = {'cb': 'visitor_gray_callback', 'tid': '', 'new_tid': 'null'}
        resp = self.session.post(url, headers=headers, data=data)
        match = re.search(r'visitor_gray_callback\((.*)\)', resp.text)
        json_data = json.loads(match.group(1))
        if json_data.get('retcode') == 20000000:
            self.sub = json_data['data']['sub']
            self.session.cookies.set('SUB', self.sub, domain='.weibo.cn')
            self.session.cookies.set('SUBP', json_data['data']['subp'], domain='.weibo.cn')

    def step2_initialize_session(self):
        """Stage 2: activate the session and obtain the XSRF-TOKEN."""
        url = 'https://m.weibo.cn'
        headers = {'Referer': 'https://visitor.passport.weibo.cn/'}
        self.session.get(url, headers=headers)
        self.xsrf_token = self.session.cookies.get('XSRF-TOKEN')

    def step3_access_api(self, container_id):
        """Stage 3: call the data API."""
        api_url = 'https://m.weibo.cn/api/container/getIndex'
        params = {'containerid': container_id, 'page': 1, 'count': 10}
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'x-xsrf-token': self.xsrf_token,  # Double Submit
            'Referer': f'https://m.weibo.cn/u/{container_id}'
        }
        resp = self.session.get(api_url, params=params, headers=headers)
        return resp.json()

    def run(self, target_id):
        self.step1_obtain_identity_token()
        self.step2_initialize_session()
        return self.step3_access_api(target_id)
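A minimal usage sketch for the PoC class (illustrative only; for m.weibo.cn user feeds the containerid is typically the uid prefixed with 107603, but treat the exact value as an assumption):
if __name__ == '__main__':
    auth = WeiboH5VisitorAuth()
    # Hypothetical containerid built from the uid used in the plugin config further below
    result = auth.run('1076037360748659')
    print(result.get('ok'), len(result.get('data', {}).get('cards', [])))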
class Auth:
    def __init__(self, config):
        self.use_cookie = config.get("cookies", {}).get("enabled", False)
        self._visitor_cookies = {}

    async def init_visitor_auth(self, session):
        """Cookie-free scheme: initialize visitor authentication."""
        # Run the three-stage handshake (see the async sketch below)
        pass
class Spider:
    async def fetch(self, page=0):
        # Try the cookie-based scheme first
        if self.auth.use_cookie and self.auth._get_cookie():
            try:
                return await self._fetch_with_cookie(page)
            except Exception as e:
                logger.warning(f"Cookie scheme failed: {e}, switching to the cookie-free scheme")
        # Fall back to the cookie-free scheme
        return await self._fetch_without_cookie(page)
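For illustration, a minimal sketch of what Auth.init_visitor_auth could look like, ported from the synchronous PoC above to httpx (same endpoints and fields; headers, retries and error handling omitted). This is an assumption about the implementation, not the plugin's actual code:
import json
import re

import httpx


class Auth:
    ...
    async def init_visitor_auth(self, session: httpx.AsyncClient):
        # Stage 1: genvisitor2 -> SUB / SUBP (JSONP-wrapped response)
        resp = await session.post(
            "https://visitor.passport.weibo.cn/visitor/genvisitor2",
            data={"cb": "visitor_gray_callback", "tid": "", "new_tid": "null"},
        )
        payload = json.loads(re.search(r"visitor_gray_callback\((.*)\)", resp.text).group(1))
        session.cookies.set("SUB", payload["data"]["sub"], domain=".weibo.cn")
        session.cookies.set("SUBP", payload["data"]["subp"], domain=".weibo.cn")
        # Stage 2: visit the main site so it sets the XSRF-TOKEN cookie
        await session.get("https://m.weibo.cn",
                          headers={"Referer": "https://visitor.passport.weibo.cn/"})
        # Keep the visitor cookies around for later requests
        self._visitor_cookies = dict(session.cookies)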
{
  "ok": 1,
  "data": {
    "list": [{
      "mblogid": "xxx",
      "text_raw": "纯文本",
      "pic_infos": {...}
    }]
  }
}
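With the PC response, the text can presumably be used as-is; a small illustrative loop (field names taken from the sample above, and data assumed to be the full decoded response):
for item in data.get("data", {}).get("list", []):
    mblogid = item["mblogid"]
    text = item["text_raw"]  # already plain text, no HTML cleanup needed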
{
  "ok": 1,
  "data": {
    "cards": [{
      "card_group": [{
        "mblog": {
          "mid": "xxx",
          "text": "<br />HTML文本<br />",
          "pics": [...]
        }
      }]
    }]
  }
}
def _parse_mobile_mblogs(self, data):
    for card in data.get("cards", []):
        for item in card.get("card_group", []):
            mblog = item["mblog"]
            # 1. Convert <br> tags to newlines
            text_raw = re.sub(r'<br\s*/?>', '\n', mblog.get("text", ""))
            # 2. Strip the remaining HTML tags
            text_raw = re.sub(r'<[^>]+>', '', text_raw).strip()
            self._results.append(Blog(
                mblogid=mblog.get("mid"),
                text_raw=text_raw,
                is_long_text=mblog.get("isLongText", False),
                use_mobile_api=True
            ))
async def fetch_long_text(self, client, auth):
    if not self.is_long_text:
        return self.text_raw
    if self.use_mobile_api:
        api = "https://m.weibo.cn/statuses/extend"
    else:
        api = "https://weibo.com/ajax/statuses/longtext"
    response = await client.get(api, params={"id": self.mblogid})
    long_text = response.json()["data"]["longTextContent"]
    # Strip HTML while keeping line breaks
    long_text = re.sub(r'<br\s*/?>', '\n', long_text)
    return re.sub(r'<[^>]+>', '', long_text).strip()
class Spider:
    def __init__(self):
        self._client = None  # keep a reference to the client

    async def _fetch_without_cookie(self, page):
        self._client = httpx.AsyncClient()
        await self.auth.init_visitor_auth(self._client)
        # fetch the post list...
        return self  # the client stays open


# Inside SkyDaily.get_daily_data
try:
    await spider.fetch()
    blog = spider.filter_by_regex(pattern).one()
    # Reuse the spider's client to fetch the long text
    if spider._client:
        text = await blog.fetch_long_text(spider._client, auth=auth)
finally:
    # Make sure the client is closed
    if spider._client:
        await spider._client.aclose()
<span class="surl-text">sky光遇超话</span> 11.22 | 每日任务
sky光遇超话 11.22 | 每日任务
# Old regex (relies on the #...# tag format)
pattern = r"^#[^#]*光遇[^#]*超话]#\s*\d{1,2}\.\d{1,2}\s*"
# New regex (keyword matching)
pattern = r".*sky光遇.*每日任务.*"
{
  "cookies": {
    "enabled": false,
    "sub": "",
    "xsrf_token": ""
  },
  "data_sources": [
    "7360748659:.*sky光遇.*每日任务.*:今天游离翻车了吗",
    "5539106873:^【国服·每日任务攻略】:陈陈努力不鸽"
  ]
}
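Each data_sources entry appears to pack three fields (uid, title regex, display name). A minimal parsing sketch under that assumption; the field layout is inferred from the sample entries, not taken from the plugin code:
def parse_data_source(entry: str):
    # Assumed layout "<uid>:<title regex>:<display name>"; the regex part must
    # not itself contain a colon for this simple split to hold.
    uid, pattern, name = entry.split(":", 2)
    return uid, pattern, name

uid, pattern, name = parse_data_source("7360748659:.*sky光遇.*每日任务.*:今天游离翻车了吗")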
async def fetch(self, page=0, max_attempts=3, retry_delay=2):
    for attempt in range(max_attempts):
        try:
            # Try to fetch the data
            pass
        except (httpx.HTTPStatusError, httpx.TimeoutException) as e:
            if attempt < max_attempts - 1:
                logger.warning(f"Request failed, retrying in {retry_delay}s...")
                await asyncio.sleep(retry_delay)
                continue
            else:
                raise
        finally:
            # Make sure resources are cleaned up
            if self._client:
                await self._client.aclose()
[INFO] Fetching Weibo data with the cookie-free scheme
[INFO] Visitor authentication initialized successfully
[INFO] Fetched 1 post from 今天游离翻车了吗 today
[INFO] Today's CN-server guide lookup finished: 2/2 data sources fetched successfully