【Python爬虫】之百度翻译sign解密
作者:互联网
目录
Fiddler配合谷歌浏览器抓取数据
首先发现百度翻译是先对输入的词进行查询语言类别,通过这个地址https://fanyi.baidu.com/langdetect
其次还发现了对提交的数据进行了url编码,输入的“我是超人”,查询返回的是"lan":"zh",到目前为止还没有发现有加密的地方,咱们继续往下看真正发送请求的链接
谷歌浏览器调试
https://fanyi.baidu.com/v2transapi?from=zh&to=en 这个链接才是真正发送请求的链接。一眼就看到了一个sign参数,不用想这肯定是加密了(因为我搜过这个值,没有 = =!);其次token这个参数其实是服务器的返回值,从历史返回里就可以找到。主要解决了这个sign就能成:在谷歌浏览器里切换到【Sources】,然后按Ctrl + Shift + F 搜sign
发现居然有56个,太多了,我们加个冒号试试,注意是英文冒号。
这次只有14个,很好,我们全部下断点(点进去,搜sign: 然后在行号的前面点一下),然后重新输入要翻译的词,看看会不会断下来
发现成功断下来了,是把L(e) 的值赋给了sign,然后我们在控制台打印下L(e)的值确定是我们想要的值。 而e的值就是“你好”。我们跟进去看下这个函数(按F11进入函数)
看了一下,发现没有什么特别的加密,就只有这一段JS而已,JS代码如下
// Stand-in for the browser's global object: e() below looks up window["gtk"] on it.
window = {};
// Cache slot for the key string; e() assigns it from window["gtk"] (or "") on first use.
var i = null;
/**
 * Apply a sequence of mixing steps to the 32-bit value `r`.
 *
 * `o` is read three characters at a time:
 *   [0] combine op : "+" adds the shifted value (result forced to 32 bits),
 *                    anything else XORs it in
 *   [1] shift op   : "+" is an unsigned right shift, anything else a left shift
 *   [2] shift count: a digit, or a letter where "a" means 10, "b" 11, ...
 *
 * Trailing characters that do not form a full triple are ignored.
 * Returns the mixed value.
 */
function n(r, o) {
    var pos = 0;
    while (pos + 2 < o.length) {
        var combineOp = o.charAt(pos);
        var shiftOp = o.charAt(pos + 1);
        var amountChar = o.charAt(pos + 2);

        var amount;
        if (amountChar >= "a") {
            amount = amountChar.charCodeAt(0) - 87;
        } else {
            amount = Number(amountChar);
        }

        var shifted;
        if (shiftOp === "+") {
            shifted = r >>> amount;
        } else {
            shifted = r << amount;
        }

        if (combineOp === "+") {
            r = (r + shifted) & 4294967295;
        } else {
            r = r ^ shifted;
        }

        pos += 3;
    }
    return r;
}
/**
 * Compute the `sign` value for the query string `r`.
 *
 * Steps:
 *   1. If the text is longer than 30 characters, keep only its first 10,
 *      middle 10 and last 10 characters (a surrogate pair counts as one
 *      character in the branch that handles them).
 *   2. Encode the (possibly shortened) text as UTF-8 bytes.
 *   3. Fold the bytes into a 32-bit hash with n(), seeded by the first half
 *      of the key string and finally XORed with the second half.
 *
 * Relies on the globals `window`, `i` and the helper `n`.
 * Returns a string of the form "<hash % 1e6>.<(hash % 1e6) ^ seed>".
 */
function e(r) {
    // Characters outside the BMP show up as UTF-16 surrogate pairs.
    var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    if (null === o) {
        var t = r.length;
        t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
    } else {
        // Rebuild the text as a list of characters in which each surrogate
        // pair is a single element, so the 10/10/10 cut never splits a pair.
        // The original listing wrapped the split in a helper `a(...)` that is
        // not defined anywhere here, so this branch threw a ReferenceError;
        // String.prototype.split already returns an array, so it is pushed
        // directly.
        for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
            "" !== e[C] && f.push.apply(f, e[C].split("")),
            C !== h - 1 && f.push(o[C]);
        var g = f.length;
        g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
    }
    var u = void 0,
        // Spells "gtk", the name of the key property on window.
        l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
    u = null !== i ? i : (i = window[l] || "") || "";
    // The key is hard-coded here, overriding whatever was read above.
    u = '320305.131321201';
    // m = part of the key before the dot (hash seed), s = part after it.
    // The loop encodes r as UTF-8 bytes into S.
    for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
        var A = r.charCodeAt(v);
        128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
            S[c++] = A >> 18 | 240,
            S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
            S[c++] = A >> 6 & 63 | 128),
            S[c++] = 63 & A | 128)
    }
    // F spells "+-a^+6" and D spells "+-3^+b+-f": the step programs fed to n().
    for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
        p += S[b],
        p = n(p, F);
    return p = n(p, D),
        p ^= s,
        // Reinterpret a negative 32-bit result as its unsigned value.
        0 > p && (p = (2147483647 & p) + 2147483648),
        p %= 1e6,
        p.toString() + "." + (p ^ m)
}
有一点需要注意的是,js里是根据 i 的值来取 u 的值的,在调试的时候发现 i 的值固定是 320305.131321201,所以我在js里直接写死 u = '320305.131321201',再在python里执行一下就成功了
Python代码
import requests
from urllib.parse import urlencode
import execjs
import time
import json
# Build the request headers
def getHeaders(cookies):
    """Return the HTTP header dict sent with every request.

    Args:
        cookies: Value placed verbatim in the ``Cookie`` header.

    Returns:
        A new dict of header names to values.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/90.0.4430.93 Safari/537.36'
    )
    header_pairs = [
        ('Host', 'fanyi.baidu.com'),
        ('Connection', 'keep-alive'),
        ('Accept', '*/*'),
        ('X-Requested-With', 'XMLHttpRequest'),
        ('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8'),
        ('Accept-Language', 'zh-CN,zh;q=0.9'),
        ('User-Agent', user_agent),
        ('Cookie', cookies),
    ]
    return dict(header_pairs)
# Build the cookie string
def getCookies(t):
    """Return the ``Cookie`` header value for timestamp string *t*.

    Args:
        t: Timestamp as a string; it is used for both the ``Hm_lvt_*`` and
            ``Hm_lpvt_*`` cookie values.

    Returns:
        The cookie pairs joined with ``"; "`` and terminated by ``";"``.
    """
    pairs = [
        'BAIDUID=2798F941BEE3BAD44CC9E6225279FF4A:FG=1',
        'BDORZ=B490B5EBF6F3CD402E515D22BCDA1598',
        'Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=' + t,
        'Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=' + t,
    ]
    return '; '.join(pairs) + ';'
# Compute the sign parameter (uses the execjs module)
def getSign(wd):
    """Return the ``sign`` request parameter for the query text *wd*.

    The JavaScript hashing routine is compiled with ``execjs`` and its
    function ``e`` is called with *wd*.

    Args:
        wd: The text to be translated.

    Returns:
        The value produced by the JavaScript function ``e`` (a string of the
        form ``"<number>.<number>"``).
    """
    # The source is a raw string so that the \uD800-style escapes in the
    # regular expressions reach the JavaScript engine as written. In a normal
    # Python string they would turn into lone surrogate code points, which
    # cannot be encoded as UTF-8 when the source is handed to the runtime.
    #
    # The surrogate-pair branch used to call a helper `a(...)` that is not
    # defined in this script, so text containing emoji raised a
    # ReferenceError; the split result is already an array and is pushed
    # directly.
    ctx = execjs.compile(r"""
    window = {};
    var i = null;
    function n(r, o) {
        for (var t = 0; t < o.length - 2; t += 3) {
            var a = o.charAt(t + 2);
            a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a),
            a = "+" === o.charAt(t + 1) ? r >>> a : r << a,
            r = "+" === o.charAt(t) ? r + a & 4294967295 : r ^ a
        }
        return r
    }
    function e(r) {
        var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
        if (null === o) {
            var t = r.length;
            t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
        } else {
            for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
                "" !== e[C] && f.push.apply(f, e[C].split("")),
                C !== h - 1 && f.push(o[C]);
            var g = f.length;
            g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
        }
        var u = void 0,
            l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
        u = null !== i ? i : (i = window[l] || "") || "";
        u = '320305.131321201';
        for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
            var A = r.charCodeAt(v);
            128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
                S[c++] = A >> 18 | 240,
                S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
                S[c++] = A >> 6 & 63 | 128),
                S[c++] = 63 & A | 128)
        }
        for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
            p += S[b],
            p = n(p, F);
        return p = n(p, D),
            p ^= s,
            0 > p && (p = (2147483647 & p) + 2147483648),
            p %= 1e6,
            p.toString() + "." + (p ^ m)
    }
    """)
    return ctx.call("e", wd)
# Detect the language of the input text
def getLan(wd, headers, timeout=10):
    """Ask the langdetect endpoint which language *wd* is written in.

    Args:
        wd: The text to be translated.
        headers: HTTP headers to send with the request.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without it a stalled connection would block
            forever.

    Returns:
        The language code from the response's ``lan`` field when the response
        has status 200 and ``msg == "success"``; otherwise ``''`` (the raw
        response body is printed to help diagnose the failure).
    """
    url = "https://fanyi.baidu.com/langdetect"
    # URL-encode the POST form fields before sending
    payload = urlencode({'query': wd}).encode("utf-8")
    resp = requests.post(url, data=payload, headers=headers, timeout=timeout)

    # Status code 200 means the request itself went through
    if resp.status_code == 200:
        try:
            data_json = json.loads(resp.text)
        except ValueError:
            # Body was not JSON; fall through to the failure report below.
            data_json = None
        if isinstance(data_json, dict) and data_json.get('msg') == "success":
            return data_json.get('lan', '')

    print(resp.text)
    return ''
def fanyi(wd, lan, sign, token, headers, timeout=10):
    """Send the translation request and return the raw response body.

    Args:
        wd: The text to be translated.
        lan: Source language code.
        sign: Signature computed for *wd*.
        token: Token value sent in the ``token`` form field.
        headers: HTTP headers to send with the request.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error.

    Returns:
        The response text encoded as UTF-8 ``bytes`` when the status code is
        200; otherwise the empty string ``""``.
    """
    # English is translated to Chinese; every other source language
    # (including Chinese) is translated to English.
    language = "zh" if lan == 'en' else "en"
    url = "https://fanyi.baidu.com/v2transapi?from=" + lan + "&to=" + language
    data = {
        'from': lan,
        'to': language,
        'query': wd,
        'transtype': 'realtime',
        'simple_means_flag': '3',
        'sign': sign,
        'token': token,
        'domain': 'common',
    }
    resp = requests.post(
        url=url,
        data=urlencode(data).encode("utf-8"),
        headers=headers,
        timeout=timeout,
    )
    if resp.status_code != 200:
        return ""
    print("请求成功")
    return resp.text.encode("utf-8")
def getToken(headers, timeout=10):
    """Fetch the translate page and print its HTML.

    The token is not parsed out here; it has to be read manually from the
    printed page.

    Args:
        headers: HTTP headers to send with the request.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error.
    """
    url = "https://fanyi.baidu.com/translate?aldtype=16047&query=&keyfrom=baidu&smartresult=dict&lang=auto2zh"
    resp = requests.get(url, headers=headers, timeout=timeout)
    print(resp.text)
if __name__ == "__main__":
    # 10-digit Unix timestamp (seconds), used for the cookie values
    t = round(time.time())
    cookies = getCookies(str(t))
    headers = getHeaders(cookies)
    # To look up a fresh token, call getToken(headers) and read it from the
    # printed page; the token below is hard-coded.
    wd = input("请输入要翻译的内容:")
    lan = getLan(wd, headers)
    sign = getSign(wd)
    raw = fanyi(wd, lan, sign, '97f41ef953422689ecd99065d10c7775', headers)
    if raw:
        # json.loads accepts UTF-8 bytes directly, so no decode step is needed.
        json_data = json.loads(raw)
        print(json_data)
    else:
        # fanyi returns "" on a non-200 response; bytes("") would raise
        # TypeError, so report the failure instead of crashing.
        print("请求失败")
就到这,好了天黑了,该溜了!
标签:10,String,Python,爬虫,headers,sgin,&&,var,fromCharCode 来源: https://blog.csdn.net/qq_33516409/article/details/119120218