JS爬虫 利用axios和cheerio爬取好大夫病历并生成xlsx
作者:互联网
/**
 * Crawler for haodf.com consultation records.
 *
 * Phase 1 (getUrls): walks the paginated listing and collects record links.
 * Phase 2 (getItem): fetches every collected link, extracts three text fields
 * and finally writes all rows to a spreadsheet file via node-xlsx.
 *
 * Requests are throttled with setInterval and rotate through a pool of
 * User-Agent strings.
 */
const axios = require("axios");
const cheerio = require("cheerio");
const xlsx = require('node-xlsx');
const fs = require('fs');

// User-Agent strings rotated across requests.
var userAgentPool = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
];

// Index of the next User-Agent to hand out.
var userAgentFlag = 0;
// Record links collected in phase 1.
var urls = [];
// Phase 1: next listing page to request (1-based). Phase 2: next index into `urls`.
var num = 1;
// Workbook passed to xlsx.build(); the first row is the header.
var data = [
    {
        name : 'sheet1',
        data : [
            [
                '',
                '疾病描述',
                '疾病',
                '病历概要'
            ]
        ]
    }
];

// Last listing page to crawl.
var MAX_PAGE = 100;
// Rows produced in phase 2, stored by url index so output order matches `urls`.
var rows = [];
// Number of HTTP requests currently in flight.
var pendingRequests = 0;
// Callback to run once every in-flight request has settled.
var onIdle = null;

let si1 = setInterval(getUrls, 1000);
console.log('开始爬取病症链接');

/**
 * Returns the next User-Agent from the pool and advances the rotation,
 * wrapping around at the end of the pool.
 *
 * @returns {string} a User-Agent header value
 */
function nextUserAgent() {
    var agent = userAgentPool[userAgentFlag % userAgentPool.length];
    userAgentFlag = (userAgentFlag + 1) % userAgentPool.length;
    return agent;
}

/**
 * Registers an in-flight request. Failures are logged instead of becoming
 * unhandled rejections, and the pending `onIdle` callback is run when the
 * last outstanding request settles.
 *
 * @param {Promise} request - promise of the request (including its handlers)
 * @param {string} label - what was requested, used in the failure log
 */
function trackRequest(request, label) {
    pendingRequests += 1;
    request
        .catch(function (err) {
            console.error('请求失败:', label, err && err.message);
        })
        .then(function () {
            pendingRequests -= 1;
            if (pendingRequests === 0 && onIdle) {
                var callback = onIdle;
                onIdle = null;
                callback();
            }
        });
}

/**
 * Runs `callback` immediately when no request is in flight, otherwise defers
 * it until the last in-flight request has settled.
 *
 * @param {Function} callback - continuation for the next stage
 */
function whenIdle(callback) {
    if (pendingRequests === 0) {
        callback();
        return;
    }
    onIdle = callback;
}

/**
 * Interval tick of phase 1: requests one listing page and appends every
 * record link found on it to `urls`. After the last page has been dispatched
 * the interval is stopped and phase 2 starts once all page responses are in.
 */
function getUrls() {
    if (num > MAX_PAGE) {
        clearInterval(si1);
        // Wait for pages still in flight so their links are not lost.
        whenIdle(function () {
            console.log('病症链接爬取完成,爬取数量:' + urls.length);
            getDatas();
        });
        return;
    }

    // Capture the page number now: `num` keeps advancing while the request is pending.
    var page = num++;
    var request = axios.get("https://zixun.haodf.com/dispatched/45001000.htm?p=" + page, {
        headers: {
            'User-Agent': nextUserAgent()
        }
    }).then(resp => {
        var $ = cheerio.load(resp.data);
        var lis = $('.clearfix li');
        for (var i = 0; i < lis.length; i++) {
            // NOTE(review): the href is used as-is; confirm the site emits absolute URLs.
            var href = lis.eq(i).find(".fl a").attr("href");
            if (href) {
                urls.push(href);
            }
        }
        console.log('已爬取第', page, '页', '总爬取数量:', 'urls:', urls.length, '该页末位链接:', urls[urls.length - 1]);
    });
    trackRequest(request, '第' + page + '页');
}

/**
 * Switches from phase 1 to phase 2: resets the cursor and starts the
 * per-record interval.
 */
function getDatas() {
    console.log('开始爬取具体数据');
    num = 0;
    si1 = setInterval(getItem, 100);
}

/**
 * Interval tick of phase 2: requests one record page and stores the extracted
 * row. Each url is dispatched exactly once; after the last one has been
 * dispatched the interval is stopped and the workbook is written once all
 * responses are in.
 */
function getItem() {
    if (num >= urls.length) {
        clearInterval(si1);
        whenIdle(writeWorkbook);
        return;
    }

    // Advance the cursor synchronously so the next tick never re-requests this url.
    var index = num++;
    var request = axios.get(urls[index], {
        headers: {
            'User-Agent': nextUserAgent()
        }
    }).then(resp => {
        var $ = cheerio.load(resp.data);
        var section = $('.bccard section').eq(0).find('.info3-value p');
        rows[index] = [
            index + 1,
            section.eq(0).text().trim(),
            section.eq(2).text().trim(),
            $('.suggestions-content .suggestions-text-value').text().trim()
        ];
        console.log('爬取数据:', rows[index]);
    });
    trackRequest(request, String(urls[index]));
}

/**
 * Appends the collected rows to the sheet in url order and writes the
 * workbook to disk.
 */
function writeWorkbook() {
    for (var i = 0; i < rows.length; i++) {
        // Failed requests leave holes in `rows`; skip them.
        if (rows[i]) {
            data[0].data.push(rows[i]);
        }
    }
    var buffer = xlsx.build(data);
    // NOTE(review): the output path carries an .xls extension although the
    // buffer comes from node-xlsx; path kept unchanged for compatibility.
    fs.writeFile('./res.xls', buffer, function (err) {
        if (err) throw err;
        console.log('Write to xls has finished');
    });
}
标签:5.0,xlsx,axios,Windows,data,Mozilla,cheerio,var,NT 来源: https://www.cnblogs.com/shiningmage/p/14774711.html