首页 > 其他分享 > uniCloud爬虫获取网页数据
uniCloud爬虫获取网页数据

2020-09-27 11:31:34 作者：互联网
'use strict';
let request = require('request') // HTTP client used by requestFn() to download the page
let cheerio = require('cheerio'); // HTML parser with jQuery-style selectors, used for the scraping
let iconv = require('iconv-lite'); // decodes pages that are GBK-encoded
let Entities = require('html-entities').XmlEntities;
let entities = new Entities(); // decodes HTML entities in the scraped title/author markup

// URL of the listing page that gets scraped.
const host = 'http://www.quanshuwang.com/shuku/'

// uniCloud database handle, a collection reference and the database command object.
// NOTE(review): `collection` and `dbCom` are not used by the code visible here, and the
// collection name is spelled 'repiles-book' — confirm it matches the real collection.
const db = uniCloud.database()
const collection = db.collection('repiles-book')
const dbCom = db.command

exports.main = async (event, context) => {

    // 开始抓取首页链接
    let indexArr = []
    //发送请求获取页面内容
    var body=await requestFn()
    var $ = cheerio.load(body);
    //兼容网页编码格式
    if($('meta[charset]').attr('charset')=='utf-8'){//如果网页是utf-8的编码
        
    }else{//如果网页是gbk的编码
        body = iconv.decode(body,'gbk');//转换gbk编码的页面内容
        $ = cheerio.load(body);
    }
    //处理网页数据 获取排行列表
    let list = $('.yd-book-content .tab-item').find('.yd-book-item')
    for (var i = 0; i < list.length; i++) {
        let href = list.eq(i).find('a').attr("href")
        //获取书的id
        let index = href.indexOf('_') + 1
        let index2 = href.lastIndexOf('.')
        let _id = href.slice(index, index2) //书ID
        //获取书的封面
        let bookImageSrc = list.eq(i).find('img').attr("src")
        //获取书的标题 注意使用html-entities解码
        let bookTitle = entities.decode(list.eq(i).find('h2').html())
        //获取书的作者 注意使用html-entities解码
        let bookAuthor = entities.decode(list.eq(i).find('.dl-horizontal-inline p').html())

        console.log('书的封面：' + bookImageSrc);
        console.log('书的标题：' + bookTitle);
        console.log('书的作者：' + bookAuthor);
        console.log('书的id：' + _id);
    }

    console.log('新增文章数量：', indexArr.length);

    // 循环抓取每个新文章详情页
    // if (indexArr.length > 0) {
    //     for (let i = 0; i < indexArr.length; i++) {
    //         let href = list.eq(indexArr[i]).attr("href")
    //         let imgSrc = list.eq(indexArr[i]).find('img').attr('src')
    //         let title = list.eq(indexArr[i]).find('.title').text()
    //         await saveArticle(href, title, imgSrc)
    //     }
    // }

    //返回数据给客户端
    return event
};


/**
 * Downloads the page at `host` and resolves with its raw, undecoded body.
 *
 * @returns {Promise<*>} Resolves with the body exactly as delivered by `request`
 *   (requested with `encoding: null`, i.e. without any string conversion).
 * @throws Rejects with the transport error reported by `request`, or with an
 *   `Error` when the server answers with a non-2xx HTTP status.
 */
function requestFn() {
    return new Promise((resolve, reject) => {
        request({
            url: host,
            encoding: null // do not convert the downloaded data; the caller picks the decoding
        }, (err, res, body) => {
            if (err) {
                reject(err);
                return;
            }

            // An error page must not be parsed as if it were the listing.
            const status = res && res.statusCode;
            if (typeof status === 'number' && (status < 200 || status >= 300)) {
                reject(new Error('Request to ' + host + ' failed with HTTP status ' + status));
                return;
            }

            resolve(body);
        });
    });
}
标签：网页,indexArr,list,爬虫,uniCloud,href,let,eq,find
来源： https://www.cnblogs.com/lizhao123/p/13738449.html