今日头条反反爬思路总结
作者:互联网
一、目标网站
今日头条 (www.toutiao.com)
二、分析步骤
- 首先使用浏览器访问 www.toutiao.com 进入头条主页, 在向下滚动窗口查看更多内容时, 发现新内容是动态加载的, 利用快捷键 F12 打开浏览器开发者工具监控 Elements 面板下的 DOM 树, 发现动态生成的标签, 进一步验证得知头条主页内容采用异步请求动态加载
- 接下来分析网络请求
- 快捷键 Ctrl + F5 重新加载页面资源, 切换到 Network 面板下 XHR 选项卡查看所有的 XHR 类型的请求
- 初始的所有 XHR 类型请求
- 向下滚动窗口, 直到加载新的内容停止滚动, 此时出现新的 XHR 类型请求
- 发现两个很相似的请求
- 点击查看其详细信息
- Headers 中包含了 Request Headers(请求头), Query String Parameters(查询字符串参数)
- ?min_behot_time=156955... 如下
- ?max_behot_time=0.... 如下
- 对比后发现这两个请求只有部分请求参数是变动的
- min_behot_time / max_behot_time
- as
- cp
- 接下来的分析着重考虑这三个参数的生成机制
- Preview (预览响应内容) 查看响应数据和其结构
- ?min_behot_time=0.... 如下
- 发现 min_behot_time=... 这个请求获得的响应中包含 ?max_behot_time=... 这个请求需要的max_behot_time 请求参数(next: {max_behot_time: 1569556156})
- 分析 min_behot_time=... 这个请求可能为获取到初始的动态加载内容
- ?min_behot_time=0.... 如下
- ?max_behot_time=0.... 如下
- 利用正则全局搜索参数 as 和 cp
- 发现一个名为index.d337d64118bf9b864485.js的文件中存在匹配项
- 继而发现自定义的加密算法
-
接下来采用 Debug 调试 JS 代码, 了解上述参数的具体生成机制
-
首先找到 index.d337d64118bf9b864485.js 文件
-
- 打开文件, 接着找到加密函数 a() 添加断点 (了解其机制)
- 接下来要找到 ?min_behot_time=... 和 ?max_behot_time=... 这两个异步请求是如何发起的
- Initiator 标记请求是由哪个对象或进程发起的 (请求源)
-
-
1.跳转到 Sources 面板:
-
2.查看格式化后的代码, 发现发起请求的外层函数 l(t) , 添加断点进行调试(了解其机制)
-
3.清空所有 XHR 类型的请求:
-
-
清空后:
-
-
4.快捷键 Ctrl + F5 重新加载页面, 在第一个断点处暂停
-
- 5. 这时查看所有的 XHR 类型的请求
- 证明了上面的分析结果: "分析 min_behot_time=... 这个请求可能为获取到初始的动态加载内容"
-
6. 现在开始 Debug
- 获取请求路径
-
获取动态参数as和cp
- 变量 i 赋值时 (0, o.default)(t) 发生跳转, 相当于执行了 k(t)
-
调用 k(t) 后一系列连锁调用
-
分析上面的代码发现其非常类似 MD5 信息摘要算法
- 可查询md5.js进行比对
-
// Reference listing of md5.js (Paul Johnston, v2.1), quoted here so the site's minified routine can be compared against it.
// Entry points: hex_md5 / b64_md5 / str_md5 hash a string and return the digest as hex, base-64 or a raw string;
// hex_hmac_md5 / b64_hmac_md5 / str_hmac_md5 are the HMAC-MD5 counterparts taking (key, data).
// core_md5 works on an array of little-endian 32-bit words; str2binl / binl2str / binl2hex / binl2b64 convert to and from it.
/* * A JavaScript implementation of the RSA Data Security, Inc. MD5 Message * Digest Algorithm, as defined in RFC 1321. * Version 2.1 Copyright (C) Paul Johnston 1999 - 2002. * Other contributors: Greg Holt, Andrew Kepert, Ydnar, Lostinet * Distributed under the BSD License * See http://pajhome.org.uk/crypt/md5 for more info. */ /* * Configurable variables. You may need to tweak these to be compatible with * the server-side, but the defaults work in most cases. */ var hexcase = 0; /* hex output format. 0 - lowercase; 1 - uppercase */ var b64pad = ""; /* base-64 pad character. "=" for strict RFC compliance */ var chrsz = 8; /* bits per input character. 8 - ASCII; 16 - Unicode */ /* * These are the functions you'll usually want to call * They take string arguments and return either hex or base-64 encoded strings */ function hex_md5(s){ return binl2hex(core_md5(str2binl(s), s.length * chrsz));} function b64_md5(s){ return binl2b64(core_md5(str2binl(s), s.length * chrsz));} function str_md5(s){ return binl2str(core_md5(str2binl(s), s.length * chrsz));} function hex_hmac_md5(key, data) { return binl2hex(core_hmac_md5(key, data)); } function b64_hmac_md5(key, data) { return binl2b64(core_hmac_md5(key, data)); } function str_hmac_md5(key, data) { return binl2str(core_hmac_md5(key, data)); } /* * Perform a simple self-test to see if the VM is working */ function md5_vm_test() { return hex_md5("abc") == "900150983cd24fb0d6963f7d28e17f72"; } /* * Calculate the MD5 of an array of little-endian words, and a bit length */ function core_md5(x, len) { /* append padding */ x[len >> 5] |= 0x80 << ((len) % 32); x[(((len + 64) >>> 9) << 4) + 14] = len; var a = 1732584193; var b = -271733879; var c = -1732584194; var d = 271733878; for(var i = 0; i < x.length; i += 16) { var olda = a; var oldb = b; var oldc = c; var oldd = d; a = md5_ff(a, b, c, d, x[i+ 0], 7 , -680876936); d = md5_ff(d, a, b, c, x[i+ 1], 12, -389564586); c = md5_ff(c, d, a, b, x[i+ 2], 17, 606105819); b = md5_ff(b, 
c, d, a, x[i+ 3], 22, -1044525330); a = md5_ff(a, b, c, d, x[i+ 4], 7 , -176418897); d = md5_ff(d, a, b, c, x[i+ 5], 12, 1200080426); c = md5_ff(c, d, a, b, x[i+ 6], 17, -1473231341); b = md5_ff(b, c, d, a, x[i+ 7], 22, -45705983); a = md5_ff(a, b, c, d, x[i+ 8], 7 , 1770035416); d = md5_ff(d, a, b, c, x[i+ 9], 12, -1958414417); c = md5_ff(c, d, a, b, x[i+10], 17, -42063); b = md5_ff(b, c, d, a, x[i+11], 22, -1990404162); a = md5_ff(a, b, c, d, x[i+12], 7 , 1804603682); d = md5_ff(d, a, b, c, x[i+13], 12, -40341101); c = md5_ff(c, d, a, b, x[i+14], 17, -1502002290); b = md5_ff(b, c, d, a, x[i+15], 22, 1236535329); a = md5_gg(a, b, c, d, x[i+ 1], 5 , -165796510); d = md5_gg(d, a, b, c, x[i+ 6], 9 , -1069501632); c = md5_gg(c, d, a, b, x[i+11], 14, 643717713); b = md5_gg(b, c, d, a, x[i+ 0], 20, -373897302); a = md5_gg(a, b, c, d, x[i+ 5], 5 , -701558691); d = md5_gg(d, a, b, c, x[i+10], 9 , 38016083); c = md5_gg(c, d, a, b, x[i+15], 14, -660478335); b = md5_gg(b, c, d, a, x[i+ 4], 20, -405537848); a = md5_gg(a, b, c, d, x[i+ 9], 5 , 568446438); d = md5_gg(d, a, b, c, x[i+14], 9 , -1019803690); c = md5_gg(c, d, a, b, x[i+ 3], 14, -187363961); b = md5_gg(b, c, d, a, x[i+ 8], 20, 1163531501); a = md5_gg(a, b, c, d, x[i+13], 5 , -1444681467); d = md5_gg(d, a, b, c, x[i+ 2], 9 , -51403784); c = md5_gg(c, d, a, b, x[i+ 7], 14, 1735328473); b = md5_gg(b, c, d, a, x[i+12], 20, -1926607734); a = md5_hh(a, b, c, d, x[i+ 5], 4 , -378558); d = md5_hh(d, a, b, c, x[i+ 8], 11, -2022574463); c = md5_hh(c, d, a, b, x[i+11], 16, 1839030562); b = md5_hh(b, c, d, a, x[i+14], 23, -35309556); a = md5_hh(a, b, c, d, x[i+ 1], 4 , -1530992060); d = md5_hh(d, a, b, c, x[i+ 4], 11, 1272893353); c = md5_hh(c, d, a, b, x[i+ 7], 16, -155497632); b = md5_hh(b, c, d, a, x[i+10], 23, -1094730640); a = md5_hh(a, b, c, d, x[i+13], 4 , 681279174); d = md5_hh(d, a, b, c, x[i+ 0], 11, -358537222); c = md5_hh(c, d, a, b, x[i+ 3], 16, -722521979); b = md5_hh(b, c, d, a, x[i+ 6], 23, 76029189); a = 
md5_hh(a, b, c, d, x[i+ 9], 4 , -640364487); d = md5_hh(d, a, b, c, x[i+12], 11, -421815835); c = md5_hh(c, d, a, b, x[i+15], 16, 530742520); b = md5_hh(b, c, d, a, x[i+ 2], 23, -995338651); a = md5_ii(a, b, c, d, x[i+ 0], 6 , -198630844); d = md5_ii(d, a, b, c, x[i+ 7], 10, 1126891415); c = md5_ii(c, d, a, b, x[i+14], 15, -1416354905); b = md5_ii(b, c, d, a, x[i+ 5], 21, -57434055); a = md5_ii(a, b, c, d, x[i+12], 6 , 1700485571); d = md5_ii(d, a, b, c, x[i+ 3], 10, -1894986606); c = md5_ii(c, d, a, b, x[i+10], 15, -1051523); b = md5_ii(b, c, d, a, x[i+ 1], 21, -2054922799); a = md5_ii(a, b, c, d, x[i+ 8], 6 , 1873313359); d = md5_ii(d, a, b, c, x[i+15], 10, -30611744); c = md5_ii(c, d, a, b, x[i+ 6], 15, -1560198380); b = md5_ii(b, c, d, a, x[i+13], 21, 1309151649); a = md5_ii(a, b, c, d, x[i+ 4], 6 , -145523070); d = md5_ii(d, a, b, c, x[i+11], 10, -1120210379); c = md5_ii(c, d, a, b, x[i+ 2], 15, 718787259); b = md5_ii(b, c, d, a, x[i+ 9], 21, -343485551); a = safe_add(a, olda); b = safe_add(b, oldb); c = safe_add(c, oldc); d = safe_add(d, oldd); } return Array(a, b, c, d); } /* * These functions implement the four basic operations the algorithm uses. 
*/ function md5_cmn(q, a, b, x, s, t) { return safe_add(bit_rol(safe_add(safe_add(a, q), safe_add(x, t)), s),b); } function md5_ff(a, b, c, d, x, s, t) { return md5_cmn((b & c) | ((~b) & d), a, b, x, s, t); } function md5_gg(a, b, c, d, x, s, t) { return md5_cmn((b & d) | (c & (~d)), a, b, x, s, t); } function md5_hh(a, b, c, d, x, s, t) { return md5_cmn(b ^ c ^ d, a, b, x, s, t); } function md5_ii(a, b, c, d, x, s, t) { return md5_cmn(c ^ (b | (~d)), a, b, x, s, t); } /* * Calculate the HMAC-MD5, of a key and some data */ function core_hmac_md5(key, data) { var bkey = str2binl(key); if(bkey.length > 16) bkey = core_md5(bkey, key.length * chrsz); var ipad = Array(16), opad = Array(16); for(var i = 0; i < 16; i++) { ipad[i] = bkey[i] ^ 0x36363636; opad[i] = bkey[i] ^ 0x5C5C5C5C; } var hash = core_md5(ipad.concat(str2binl(data)), 512 + data.length * chrsz); return core_md5(opad.concat(hash), 512 + 128); } /* * Add integers, wrapping at 2^32. This uses 16-bit operations internally * to work around bugs in some JS interpreters. */ function safe_add(x, y) { var lsw = (x & 0xFFFF) + (y & 0xFFFF); var msw = (x >> 16) + (y >> 16) + (lsw >> 16); return (msw << 16) | (lsw & 0xFFFF); } /* * Bitwise rotate a 32-bit number to the left. */ function bit_rol(num, cnt) { return (num << cnt) | (num >>> (32 - cnt)); } /* * Convert a string to an array of little-endian words * If chrsz is ASCII, characters >255 have their hi-byte silently ignored. */ function str2binl(str) { var bin = Array(); var mask = (1 << chrsz) - 1; for(var i = 0; i < str.length * chrsz; i += chrsz) bin[i>>5] |= (str.charCodeAt(i / chrsz) & mask) << (i%32); return bin; } /* * Convert an array of little-endian words to a string */ function binl2str(bin) { var str = ""; var mask = (1 << chrsz) - 1; for(var i = 0; i < bin.length * 32; i += chrsz) str += String.fromCharCode((bin[i>>5] >>> (i % 32)) & mask); return str; } /* * Convert an array of little-endian words to a hex string. 
*/ function binl2hex(binarray) { var hex_tab = hexcase ? "0123456789ABCDEF" : "0123456789abcdef"; var str = ""; for(var i = 0; i < binarray.length * 4; i++) { str += hex_tab.charAt((binarray[i>>2] >> ((i%4)*8+4)) & 0xF) + hex_tab.charAt((binarray[i>>2] >> ((i%4)*8 )) & 0xF); } return str; } /* * Convert an array of little-endian words to a base-64 string */ function binl2b64(binarray) { var tab = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; var str = ""; for(var i = 0; i < binarray.length * 4; i += 3) { var triplet = (((binarray[i >> 2] >> 8 * ( i %4)) & 0xFF) << 16) | (((binarray[i+1 >> 2] >> 8 * ((i+1)%4)) & 0xFF) << 8 ) | ((binarray[i+2 >> 2] >> 8 * ((i+2)%4)) & 0xFF); for(var j = 0; j < 4; j++) { if(i * 8 + j * 6 > binarray.length * 32) str += b64pad; else str += tab.charAt((triplet >> 6*(3-j)) & 0x3F); } } return str; }
md5.js 源代码 - 获得结果值
- 利用Python 标准库 hashlib 验证结果是否与以上分析相吻合
- 结果相同, 证明上述分析正确
- 接下来开始拼接查询参数
- 开始构建异步请求
- 设置请求头
- 发起请求后回到最初断点处
- 完成上述步骤后, 在 Network 面板下 XHR 选项卡可以看到 ?min_behot_time=0 请求已完成, 其中查询字符串参数完全符合上述步骤中所生成的动态参数
-
- 到此分析过程结束
三、部分代码展示
# 导入相关模块
import time import math import datetime import json import hashlib from urllib.parse import urlencode import execjs import requests import xlsxwriter from pymongo import MongoClient
# 基本配置
# Module-level configuration shared by the functions below.
# NOTE: besides defining constants, this section creates a MongoClient and
# issues one HTTP GET so that the session holds the site's cookies.

# Create the database connection.
client = MongoClient("localhost", 27017)

# Select the database.
db = client["Toutiaopro"]

# Target site.
url = "https://www.toutiao.com/"

# Request headers.
# Adjacent string literals are concatenated verbatim, so every fragment
# except the last must end with an explicit space; without it the
# User-Agent collapses into "...x64)AppleWebKit/...Gecko)Chrome/...".
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/79.0.3907.0 Safari/537.36 Edg/79.0.279.0"
    )
}

# Session object reused for every request.
session = requests.Session()

# Visit the home page once so the session picks up cookies.
session.get(url=url, headers=headers)

# Feed URL template; the placeholders are, in order: the paging parameter
# ("min_behot_time=..." or "max_behot_time=..."), "as" and "cp".
start_url = "https://www.toutiao.com/api/pc/feed/?{}&category=__all__&utm_source=toutiao&widen=1&tadrequire=true&as={}&cp={}"
# 定义加密函数(方式一)
def get_md5(_str):
    """Return the lowercase hexadecimal MD5 digest of ``_str``.

    The string is encoded with ``str.encode()`` (UTF-8) before hashing.
    """
    return hashlib.md5(_str.encode()).hexdigest()


def get_params(timestamp=None):
    """Build the ``as`` and ``cp`` query parameters for the feed request.

    Args:
        timestamp: Unix time in seconds (int or float). Defaults to the
            current time; pass a value to get a reproducible result.

    Returns:
        A dict with the keys ``"as"`` and ``"cp"``. When the hexadecimal
        form of the timestamp is not exactly 8 digits long, a fixed
        fallback pair is returned.
    """
    if timestamp is None:
        timestamp = time.time()
    t = str(math.floor(timestamp))
    # The JavaScript port of this routine upper-cases the hexadecimal
    # timestamp (``toString(16).toUpperCase()``); ``hex()`` would produce
    # lowercase digits and therefore different parameter values.
    e = format(int(t), "X")
    i = get_md5(t).upper()
    if len(e) != 8:
        return {
            "as": "479BB4B7254C150",
            "cp": "7E0AC8874BB0985"
        }
    # "as": first five digest chars interleaved with the first five hex digits.
    s = "".join(md5_char + hex_char for md5_char, hex_char in zip(i[:5], e[:5]))
    # "cp": hex digits 3..7 interleaved with the last five digest chars.
    l = "".join(hex_char + md5_char for hex_char, md5_char in zip(e[3:8], i[-5:]))
    return {
        "as": "A1" + s + e[-3:],
        "cp": e[:3] + l + "E1"
    }
# 定义加密函数(方式二)
# 基于以上分析修改 js 文件如下 jsCode.js
// jsCode.js - standalone copy of the site's parameter-signing code.
// m(n): returns k(n, 0, 0); with both flags falsy that is y(n), i.e. the lowercase hex encoding (g) of the raw digest (p)
//       of the UTF-8 encoded input (v). The digest core h() uses the same initial values and round constants as the
//       reference md5.js core_md5, so this appears to be a minified MD5 implementation.
// o(s): parses s with parseInt, takes t = the value in upper-case hex and i = m(value) upper-cased.
//       If t is not exactly 8 characters long a fixed { as, cp } pair is returned; otherwise
//       as = "A1" + (first five chars of i interleaved with t[0..4]) + last three chars of t, and
//       cp = first three chars of t + (t[3..7] interleaved with the last five chars of i) + "E1".
//       Note: the first for-loop re-declares the parameter name s as its counter; the argument was already consumed by parseInt.
function m (n) { function s(t, e) { var i = (65535 & t) + (65535 & e), n = (t >> 16) + (e >> 16) + (i >> 16); return n << 16 | 65535 & i } function o(t, e) { return t << e | t >>> 32 - e } function r(t, e, i, n, a, r) { return s(o(s(s(e, t), s(n, r)), a), i) } function l(t, e, i, n, a, s, o) { return r(e & i | ~e & n, t, e, a, s, o) } function u(t, e, i, n, a, s, o) { return r(e & n | i & ~n, t, e, a, s, o) } function c(t, e, i, n, a, s, o) { return r(e ^ i ^ n, t, e, a, s, o) } function d(t, e, i, n, a, s, o) { return r(i ^ (e | ~n), t, e, a, s, o) } function h(t, e) { t[e >> 5] |= 128 << e % 32, t[(e + 64 >>> 9 << 4) + 14] = e; var i, n, a, o, r, h = 1732584193, _ = -271733879, m = -1732584194, p = 271733878; for (i = 0; i < t.length; i += 16) n = h, a = _, o = m, r = p, h = l(h, _, m, p, t[i], 7, -680876936), p = l(p, h, _, m, t[i + 1], 12, -389564586), m = l(m, p, h, _, t[i + 2], 17, 606105819), _ = l(_, m, p, h, t[i + 3], 22, -1044525330), h = l(h, _, m, p, t[i + 4], 7, -176418897), p = l(p, h, _, m, t[i + 5], 12, 1200080426), m = l(m, p, h, _, t[i + 6], 17, -1473231341), _ = l(_, m, p, h, t[i + 7], 22, -45705983), h = l(h, _, m, p, t[i + 8], 7, 1770035416), p = l(p, h, _, m, t[i + 9], 12, -1958414417), m = l(m, p, h, _, t[i + 10], 17, -42063), _ = l(_, m, p, h, t[i + 11], 22, -1990404162), h = l(h, _, m, p, t[i + 12], 7, 1804603682), p = l(p, h, _, m, t[i + 13], 12, -40341101), m = l(m, p, h, _, t[i + 14], 17, -1502002290), _ = l(_, m, p, h, t[i + 15], 22, 1236535329), h = u(h, _, m, p, t[i + 1], 5, -165796510), p = u(p, h, _, m, t[i + 6], 9, -1069501632), m = u(m, p, h, _, t[i + 11], 14, 643717713), _ = u(_, m, p, h, t[i], 20, -373897302), h = u(h, _, m, p, t[i + 5], 5, -701558691), p = u(p, h, _, m, t[i + 10], 9, 38016083), m = u(m, p, h, _, t[i + 15], 14, -660478335), _ = u(_, m, p, h, t[i + 4], 20, -405537848), h = u(h, _, m, p, t[i + 9], 5, 568446438), p = u(p, h, _, m, t[i + 14], 9, -1019803690), m = u(m, p, h, _, t[i + 3], 14, -187363961), _ = u(_, m, 
p, h, t[i + 8], 20, 1163531501), h = u(h, _, m, p, t[i + 13], 5, -1444681467), p = u(p, h, _, m, t[i + 2], 9, -51403784), m = u(m, p, h, _, t[i + 7], 14, 1735328473), _ = u(_, m, p, h, t[i + 12], 20, -1926607734), h = c(h, _, m, p, t[i + 5], 4, -378558), p = c(p, h, _, m, t[i + 8], 11, -2022574463), m = c(m, p, h, _, t[i + 11], 16, 1839030562), _ = c(_, m, p, h, t[i + 14], 23, -35309556), h = c(h, _, m, p, t[i + 1], 4, -1530992060), p = c(p, h, _, m, t[i + 4], 11, 1272893353), m = c(m, p, h, _, t[i + 7], 16, -155497632), _ = c(_, m, p, h, t[i + 10], 23, -1094730640), h = c(h, _, m, p, t[i + 13], 4, 681279174), p = c(p, h, _, m, t[i], 11, -358537222), m = c(m, p, h, _, t[i + 3], 16, -722521979), _ = c(_, m, p, h, t[i + 6], 23, 76029189), h = c(h, _, m, p, t[i + 9], 4, -640364487), p = c(p, h, _, m, t[i + 12], 11, -421815835), m = c(m, p, h, _, t[i + 15], 16, 530742520), _ = c(_, m, p, h, t[i + 2], 23, -995338651), h = d(h, _, m, p, t[i], 6, -198630844), p = d(p, h, _, m, t[i + 7], 10, 1126891415), m = d(m, p, h, _, t[i + 14], 15, -1416354905), _ = d(_, m, p, h, t[i + 5], 21, -57434055), h = d(h, _, m, p, t[i + 12], 6, 1700485571), p = d(p, h, _, m, t[i + 3], 10, -1894986606), m = d(m, p, h, _, t[i + 10], 15, -1051523), _ = d(_, m, p, h, t[i + 1], 21, -2054922799), h = d(h, _, m, p, t[i + 8], 6, 1873313359), p = d(p, h, _, m, t[i + 15], 10, -30611744), m = d(m, p, h, _, t[i + 6], 15, -1560198380), _ = d(_, m, p, h, t[i + 13], 21, 1309151649), h = d(h, _, m, p, t[i + 4], 6, -145523070), p = d(p, h, _, m, t[i + 11], 10, -1120210379), m = d(m, p, h, _, t[i + 2], 15, 718787259), _ = d(_, m, p, h, t[i + 9], 21, -343485551), h = s(h, n), _ = s(_, a), m = s(m, o), p = s(p, r); return [h, _, m, p] } function _(t) { var e, i = ""; for (e = 0; e < 32 * t.length; e += 8) i += String.fromCharCode(t[e >> 5] >>> e % 32 & 255); return i } function m(t) { var e, i = []; for (i[(t.length >> 2) - 1] = void 0, e = 0; e < i.length; e += 1) i[e] = 0; for (e = 0; e < 8 * t.length; e += 8) 
i[e >> 5] |= (255 & t.charCodeAt(e / 8)) << e % 32; return i } function p(t) { return _(h(m(t), 8 * t.length)) } function f(t, e) { var i, n, a = m(t), s = [], o = []; for (s[15] = o[15] = void 0, a.length > 16 && (a = h(a, 8 * t.length)), i = 0; i < 16; i += 1) s[i] = 909522486 ^ a[i], o[i] = 1549556828 ^ a[i]; return n = h(s.concat(m(e)), 512 + 8 * e.length), _(h(o.concat(n), 640)) } function g(t) { var e, i, n = "0123456789abcdef", a = ""; for (i = 0; i < t.length; i += 1) e = t.charCodeAt(i), a += n.charAt(e >>> 4 & 15) + n.charAt(15 & e); return a } function v(t) { return unescape(encodeURIComponent(t)) } function w(t) { return p(v(t)) } function y(t) { return g(w(t)) } function b(t, e) { return f(v(t), v(e)) } function x(t, e) { return g(b(t, e)) } function k(t, e, i) { return e ? i ? b(e, t) : x(e, t) : i ? w(t) : y(t) } return k(n, 0,0); } function o(s) { var e = parseInt(s) , t = e.toString(16).toUpperCase() , i = m(e).toString().toUpperCase(); if (8 != t.length) return { as: "479BB4B7254C150", cp: "7E0AC8874BB0985" }; for (var n = i.slice(0, 5), o = i.slice(-5), r = "", s = 0; s < 5; s++) r += n[s] + t[s]; for (var l = "", c = 0; c < 5; c++) l += t[c + 3] + o[c]; return { as: "A1" + r + t.slice(-3), cp: t.slice(0, 3) + l + "E1" } }
# 定义加密函数
def get_params():
    """Compute the ``as`` / ``cp`` parameters by running ``jsCode.js``.

    The current Unix time (whole seconds, as a string) is passed to the
    JavaScript function ``o`` defined in ``./jsCode.js``; whatever that
    call returns is handed back to the caller unchanged.
    """
    now_seconds = math.floor(time.time())
    js_argument = str(now_seconds)

    # The script is read and compiled on every call.
    with open("./jsCode.js", 'r', encoding="utf-8") as script_file:
        source = script_file.read()

    context = execjs.compile(source)
    return context.call("o", js_argument)
# 定义函数生成 xlsx 文件
def data2xls(data_list):
    """Write feed items to ``<today's date>.xlsx`` in the working directory.

    Row 0 holds the bold column headers; each item of ``data_list`` then
    fills one row, using ``item.get(field)`` for every column so a missing
    field becomes an empty cell.

    Args:
        data_list: Iterable of mapping-like objects (anything with
            ``.get``), one per feed item.
    """
    # (field name, column width) in sheet order.
    columns = (
        ("chinese_tag", 20),
        ("media_avatar_url", 65),
        ("title", 70),
        ("abstract", 255),
        ("tag", 25),
        ("source_url", 30),
        ("source", 20),
        ("media_url", 75),
    )

    # The context manager closes the workbook even when a write raises,
    # so the file handle is never left dangling.
    with xlsxwriter.Workbook('{}.xlsx'.format(datetime.date.today())) as workbook:
        cell_format = workbook.add_format({
            'border': 1,
            'text_wrap': 1
        })
        merge_format = workbook.add_format({
            'bold': True,
            'border': 1,
            'text_wrap': 1
        })
        worksheet = workbook.add_worksheet("首页新闻")

        for col, (field, width) in enumerate(columns):
            worksheet.write(0, col, field, merge_format)
            worksheet.set_column(col, col, width)

        for row, data in enumerate(data_list, start=1):
            for col, (field, _width) in enumerate(columns):
                worksheet.write(row, col, data.get(field), cell_format)
# 定义 main 函数
def main(timeparam, timeout=10):
    """Fetch one page of the feed and return its items and paging cursor.

    Args:
        timeparam: URL-encoded paging parameter that fills the first
            placeholder of ``start_url``, e.g. ``"min_behot_time=0"``.
        timeout: Seconds to wait for the server before giving up; without
            a timeout a stalled connection would block forever.

    Returns:
        A dict with ``"data"`` (the response's ``data`` field) and
        ``"next_timestamp"`` (the response's ``next`` field).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the JSON body lacks ``data`` or ``next``.
    """
    params_date = get_params()
    new_url = start_url.format(timeparam, params_date["as"], params_date["cp"])
    response = session.get(url=new_url, headers=headers, timeout=timeout)
    # Fail with the HTTP status instead of a confusing JSON/KeyError later.
    response.raise_for_status()
    result = response.json()
    return {
        "data": result["data"],
        "next_timestamp": result["next"]
    }
# 执行 main 函数
if __name__ == "__main__":
    # Crawl ten consecutive pages of the feed, then export and store them.
    data_list = []

    # The first request asks for the initial feed; every response carries
    # the paging parameter to use for the following request.
    timeparam = "min_behot_time=0"
    for _ in range(10):
        result = main(timeparam)
        data_list.extend(result["data"])
        timeparam = urlencode(result["next_timestamp"])

    data2xls(data_list)

    # insert_many() raises on an empty document list, so skip the write
    # when nothing was collected.
    if data_list:
        db.content.insert_many(data_list)
四、部分数据展示
标签:function,return,反反,worksheet,var,思路,md5,data,头条 来源: https://www.cnblogs.com/dmcs95/p/11667817.html