编程语言
首页 > 编程语言 > 【Python爬虫】之百度翻译sign解密

【Python爬虫】之百度翻译sign解密

作者:互联网

目录

Fiddler配合谷歌浏览器抓取数据

谷歌浏览器调试

 Python代码


Fiddler配合谷歌浏览器抓取数据

 首先发现百度翻译会先通过 https://fanyi.baidu.com/langdetect 这个地址查询输入内容的语言类别。

 其次还发现了对提交的数据进行了url编码,输入的“我是超人”,查询返回的是"lan":"zh",到目前为止还没有发现有加密的地方,咱们继续往下看真正发送请求的链接

谷歌浏览器调试

https://fanyi.baidu.com/v2transapi?from=zh&to=en  这个链接才是真正发送翻译请求的链接。一眼就看到了一个sign参数,不用想,这肯定是加密生成的(因为我搜过这个值,没有搜到 = =!)。其次token这个参数其实是服务器的返回值,从历史响应里就可以找到。只要解决了这个sign就能成:在谷歌浏览器里切换到【Sources】,然后按Ctrl + Shift + F 搜sign

发现居然有56个,太多了,我们加个冒号试试,注意是英文冒号。

 这次只有14个,很好,我们全部下断点(点进去,搜sign:  然后在行数的前面点一下),然后重新输入要翻译的词,看看会不会断下来

发现成功断下来了,是把L(e) 的值赋给了sign,然后我们在控制台打印下L(e)的值确定是我们想要的值。 而e的值就是“你好”。我们跟进去看下这个函数(按F11进入函数)

 跟进去看了下,发现没有什么特别的加密,就只有这一段JS而已,JS代码如下:

// Empty stand-in for the browser global that e() reads via window[...];
// presumably needed because the script is evaluated outside a browser.
window = {};
// Cache slot: e() stores the value it looks up on `window` here on first use.
var i = null;

/**
 * Mix the 32-bit value `r` according to the instruction string `o`.
 *
 * `o` is read three characters at a time:
 *   - 1st char: "+" adds the shifted value (wrapped to a signed 32-bit int),
 *               anything else XORs it in;
 *   - 2nd char: "+" shifts right (unsigned), anything else shifts left;
 *   - 3rd char: shift amount, a digit or a letter ("a" is 10, "b" is 11, ...).
 * Trailing characters that do not form a full triple are ignored.
 */
function n(r, o) {
    var acc = r;
    for (var pos = 0; pos + 2 < o.length; pos += 3) {
        var combineOp = o.charAt(pos);
        var shiftOp = o.charAt(pos + 1);
        var amountChar = o.charAt(pos + 2);

        // Letters encode amounts >= 10 ("a".charCodeAt(0) - 87 === 10).
        var amount;
        if (amountChar >= "a") {
            amount = amountChar.charCodeAt(0) - 87;
        } else {
            amount = Number(amountChar);
        }

        var shifted;
        if (shiftOp === "+") {
            shifted = acc >>> amount;
        } else {
            shifted = acc << amount;
        }

        if (combineOp === "+") {
            // The mask forces the sum back into signed 32-bit range.
            acc = (acc + shifted) & 4294967295;
        } else {
            acc = acc ^ shifted;
        }
    }
    return acc;
}

/**
 * Compute the `sign` string for the query text `r`.
 *
 * Steps:
 *   1. Text longer than 30 characters is sampled down to its first 10,
 *      middle 10 and last 10 characters (surrogate pairs count as one).
 *   2. The text is encoded as UTF-8 bytes.
 *   3. Starting from the integer part of the key, each byte is added and the
 *      running value is mixed with n(); a final mix, an XOR with the second
 *      key part and a modulo 1e6 produce `p`.
 *
 * Returns the string `p + "." + (p ^ m)`, where `m` is the integer part of the key.
 */
function e(r) {
    // Surrogate pairs (e.g. emoji) found in the input, or null if there are none.
    var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    if (null === o) {
        var t = r.length;
        // Long input: keep first 10 + middle 10 + last 10 UTF-16 units.
        t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
    } else {
        // Build `f`, a list of whole characters: each plain segment between
        // surrogate pairs is split per unit, and each matched pair is re-inserted
        // as a single element. split("") already yields a fresh array, so it is
        // handed to apply() directly (no array-copy helper exists in this script).
        for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++) "" !== e[C] && f.push.apply(f, e[C].split("")),
        C !== h - 1 && f.push(o[C]);
        var g = f.length;
        // Same 10 + 10 + 10 sampling, but counted in whole characters.
        g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
    }
    // `l` spells "gtk"; the key is looked up as window.gtk and cached in `i`.
    var u = void 0,
        l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
    u = null !== i ? i : (i = window[l] || "") || "";
    // The lookup result is overridden with a fixed key ("<m>.<s>").
    u = '320305.131321201';
    // d = key parts, m / s = their numeric values; the loop fills S with the
    // UTF-8 bytes of `r` (1-4 bytes per code point, pairs combined first).
    for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
        var A = r.charCodeAt(v);
        128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
        S[c++] = A >> 18 | 240,
        S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
        S[c++] = A >> 6 & 63 | 128),
        S[c++] = 63 & A | 128)
    }
    // F is "+-a^+6" and D is "+-3^+b+-f": instruction strings for n().
    // Each byte is added to p, then p is mixed with F.
    for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
    p += S[b],
    p = n(p, F);
    // Final mix with D, XOR with s, convert a negative result to its unsigned
    // 32-bit value, reduce modulo 1e6, then format as "<p>.<p ^ m>".
    return p = n(p, D),
    p ^= s,
    0 > p && (p = (2147483647 & p) + 2147483648),
    p %= 1e6,
    p.toString() + "." + (p ^ m)
}

有一点需要注意的是,js里会根据 i 的值来取 u 的值,在调试的时候发现 i 的值是固定的320305.131321201,所以我在js里直接写死了 u = '320305.131321201',再放到python里执行就成功了。

 Python代码

import requests
from urllib.parse import urlencode
import execjs
import time
import json

#Build the request headers
def getHeaders(cookies):
    """Return the HTTP header dict for fanyi.baidu.com requests.

    The fixed browser-like headers come first; the caller-supplied
    ``cookies`` string is attached last under the ``Cookie`` key.
    """
    headers = {
        'Host': 'fanyi.baidu.com',
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    }
    headers['Cookie'] = cookies
    return headers
#Build the Cookie header value
def getCookies(t):
    """Return the cookie string with timestamp ``t`` filled in.

    ``t`` must be a string; it is inserted as the value of both the
    ``Hm_lvt_...`` and ``Hm_lpvt_...`` cookies.
    """
    pieces = (
        'BAIDUID=2798F941BEE3BAD44CC9E6225279FF4A:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=',
        t,
        '; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=',
        t,
        ';',
    )
    return ''.join(pieces)

#Compute the sign parameter; relies on the execjs module
def getSign(wd):
    """Return the ``sign`` string for the query text ``wd``.

    The signing routine is kept as JavaScript and evaluated through
    ``execjs``: the script is compiled and its function ``e`` is called
    with ``wd``. The result has the form ``"<p>.<q>"``.
    """
    # Raw string: the \uXXXX sequences below are JavaScript regex escapes and
    # must reach the JS engine verbatim. In a normal Python string they would be
    # decoded into lone surrogate characters first.
    #
    # Inside e(), text segments between surrogate pairs are pushed with
    # f.push.apply(f, segment.split("")). The script defines no array-copy
    # helper, and split() already returns an array, so it is passed directly;
    # calling an undefined helper there raises a ReferenceError for inputs that
    # mix plain text with emoji.
    ctx = execjs.compile(r"""
    window = {};
    var i = null;
    function n(r, o) {
        for (var t = 0; t < o.length - 2; t += 3) {
            var a = o.charAt(t + 2);
            a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a),
            a = "+" === o.charAt(t + 1) ? r >>> a : r << a,
            r = "+" === o.charAt(t) ? r + a & 4294967295 : r ^ a
        }
        return r
    }

    function e(r) {
        var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
        if (null === o) {
            var t = r.length;
            t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
        } else {
            for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++) "" !== e[C] && f.push.apply(f, e[C].split("")),
            C !== h - 1 && f.push(o[C]);
            var g = f.length;
            g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
        }
        var u = void 0,
            l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
        u = null !== i ? i : (i = window[l] || "") || "";
        u = '320305.131321201';
        for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
            var A = r.charCodeAt(v);
            128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
            S[c++] = A >> 18 | 240,
            S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
            S[c++] = A >> 6 & 63 | 128),
            S[c++] = 63 & A | 128)
        }
        for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
        p += S[b],
        p = n(p, F);
        return p = n(p, D),
        p ^= s,
        0 > p && (p = (2147483647 & p) + 2147483648),
        p %= 1e6,
        p.toString() + "." + (p ^ m)
    }
    """)
    return ctx.call("e", wd)
#Detect the language of the input text
def getLan(wd,headers):
    """Ask the langdetect endpoint which language ``wd`` is written in.

    Returns the ``lan`` field of the JSON reply when its ``msg`` field is
    ``"success"``. Returns ``''`` when ``msg`` has another value or when the
    HTTP status is not 200 (the response body is printed in that case).
    """
    endpoint = "https://fanyi.baidu.com/langdetect"
    # The form is URL-encoded by hand and sent as UTF-8 bytes.
    form_body = urlencode({'query': wd}).encode("utf-8")
    resp = requests.post(endpoint, data=form_body, headers=headers)

    if resp.status_code != 200:
        print(resp.text)
        return ''

    reply = json.loads(resp.text)
    if reply['msg'] == "success":
        return reply['lan']
    return ''

def fanyi(wd,lan,sign,token,headers):
    """Send the translation request for ``wd`` and return the raw reply.

    ``lan`` is the source language. English is translated to Chinese; every
    other source language (including Chinese) is translated to English.

    Returns the response body encoded as UTF-8 bytes when the HTTP status is
    200, otherwise the empty string ``""``.
    """
    target = "zh" if lan == 'en' else "en"

    url = "".join(("https://fanyi.baidu.com/v2transapi?from=", lan, "&to=", target))
    form = {
        'from': lan,
        'to': target,
        'query': wd,
        'transtype': 'realtime',
        'simple_means_flag': '3',
        'sign': sign,
        'token': token,
        'domain': 'common',
    }
    body = urlencode(form).encode("utf-8")
    resp = requests.post(url=url, data=body, headers=headers)

    if resp.status_code != 200:
        return ""
    print("请求成功")
    return resp.text.encode("utf-8")
def getToken(headers):
    """Fetch the translate page and print its HTML.

    The token is not extracted automatically; it has to be read from the
    printed page by hand. Nothing is returned.
    """
    url = "https://fanyi.baidu.com/translate?aldtype=16047&query=&keyfrom=baidu&smartresult=dict&lang=auto2zh"
    print(requests.get(url, headers=headers).text)

if __name__=="__main__":
    # The cookies carry a 10-digit Unix timestamp (whole seconds).
    t = round(time.time())
    cookies = getCookies(str(t))
    headers = getHeaders(cookies)

    # Token copied by hand from the translate page; getToken(headers) prints
    # that page so a fresh value can be looked up.
    token = '97f41ef953422689ecd99065d10c7775'

    wd = input("请输入要翻译的内容:")
    lan = getLan(wd,headers)
    sign = getSign(wd)
    json_data = fanyi(wd,lan,sign,token,headers)

    # fanyi() returns "" (a str, not bytes) when the HTTP status is not 200.
    # Passing that to bytes() raises a TypeError, so stop with a clear message.
    if not json_data:
        raise SystemExit("翻译请求失败")

    json_data = json.loads(json_data.decode("utf-8"))

    print(json_data)

就到这,好了天黑了,该溜了!

标签:10,String,Python,爬虫,headers,sgin,&&,var,fromCharCode
来源: https://blog.csdn.net/qq_33516409/article/details/119120218