uniCloud爬虫获取网页数据
作者:互联网
'use strict'; let request = require('request') let cheerio = require('cheerio'); //爬虫 let iconv = require('iconv-lite'); //处理gbk编码的网页 let Entities = require('html-entities').XmlEntities; let entities = new Entities(); const host = 'http://www.quanshuwang.com/shuku/' const db = uniCloud.database() const collection = db.collection('repiles-book') const dbCom = db.command exports.main = async (event, context) => { // 开始抓取首页链接 let indexArr = [] //发送请求获取页面内容 var body=await requestFn() var $ = cheerio.load(body); //兼容网页编码格式 if($('meta[charset]').attr('charset')=='utf-8'){//如果网页是utf-8的编码 }else{//如果网页是gbk的编码 body = iconv.decode(body,'gbk');//转换gbk编码的页面内容 $ = cheerio.load(body); } //处理网页数据 获取排行列表 let list = $('.yd-book-content .tab-item').find('.yd-book-item') for (var i = 0; i < list.length; i++) { let href = list.eq(i).find('a').attr("href") //获取书的id let index = href.indexOf('_') + 1 let index2 = href.lastIndexOf('.') let _id = href.slice(index, index2) //书ID //获取书的封面 let bookImageSrc = list.eq(i).find('img').attr("src") //获取书的标题 注意使用html-entities解码 let bookTitle = entities.decode(list.eq(i).find('h2').html()) //获取书的作者 注意使用html-entities解码 let bookAuthor = entities.decode(list.eq(i).find('.dl-horizontal-inline p').html()) console.log('书的封面:' + bookImageSrc); console.log('书的标题:' + bookTitle); console.log('书的作者:' + bookAuthor); console.log('书的id:' + _id); } console.log('新增文章数量:', indexArr.length); // 循环抓取每个新文章详情页 // if (indexArr.length > 0) { // for (let i = 0; i < indexArr.length; i++) { // let href = list.eq(indexArr[i]).attr("href") // let imgSrc = list.eq(indexArr[i]).find('img').attr('src') // let title = list.eq(indexArr[i]).find('.title').text() // await saveArticle(href, title, imgSrc) // } // } //返回数据给客户端 return event }; function requestFn() { return new Promise((resolve, reject) => { request({ url: host, encoding: null //设置抓取页面时不要对数据做任何转换 }, function(err, res, body) { if (err){ reject(err) }else{ resolve(body) } }); }) }
标签:网页,indexArr,list,爬虫,uniCloud,href,let,eq,find 来源: https://www.cnblogs.com/lizhao123/p/13738449.html