其他分享
首页 > 其他分享> > 豌豆荚爬虫

豌豆荚爬虫

作者:互联网

由于详情页数据无法破解,
模拟请求详情页数据时会出现一个滑块验证,手动滑动也不能通过,
在这里插入图片描述
结果
结果

只能拿到列表页上面包含的信息

import requests.sessions
from lxml import etree
from openpyxl import workbook
import time
import json

def get_category_url_list(url, session, headers, sheet):
    """Fetch the entry page and crawl a fixed slice of its top-level categories.

    Args:
        url: Address of the page that carries the ``parent-menu`` list.
        session: Object exposing a requests-style ``get(url, headers=...)``.
        headers: HTTP headers sent with this one request.
        sheet: Worksheet-like object passed down to the row writer.

    Returns:
        The category URLs selected for crawling, or an empty list when the
        page could not be fetched or parsed.
    """
    # Bound before the ``try`` so the final ``return`` never touches an
    # unassigned name when the request or the parse fails.
    category_url_list = []
    try:
        # Kept from the original, whose comment read "close surplus connections".
        # NOTE(review): requests does not document a ``keep_alive`` attribute
        # on Session, so this is probably a no-op -- confirm.
        session.keep_alive = False
        res = session.get(url, headers=headers)
        html = etree.HTML(res.text)
        menu_links = html.xpath('//ul[@class="parent-menu clearfix"]/li/a/@href')
        # Only the second and third menu entries are crawled.
        category_url_list = menu_links[1:3]
        for category_url in category_url_list:
            get_second_category_url_list(category_url, session, sheet)
    except Exception as e:
        # Best effort: report the problem and return what was collected.
        print(e)
    return category_url_list


def get_second_category_url_list(category_url, session, sheet):
    """Fetch one category page and crawl each sub-category linked from it.

    Args:
        category_url: Address of a top-level category page.
        session: Object exposing a requests-style ``get(url)``.
        sheet: Worksheet-like object passed down to the row writer.

    Returns:
        The sub-category URLs found on the page, or an empty list when the
        page could not be fetched or parsed.
    """
    # Bound before the ``try`` so the final ``return`` never touches an
    # unassigned name when the request or the parse fails.
    second_category_url_list = []
    try:
        # Kept from the original, whose comment read "close surplus connections".
        # NOTE(review): requests does not document a ``keep_alive`` attribute
        # on Session, so this is probably a no-op -- confirm.
        session.keep_alive = False
        res = session.get(category_url)
        html = etree.HTML(res.text)
        second_category_url_list = html.xpath('//li[@class="parent-cate"]/div/a/@href')
        for sub_category_url in second_category_url_list:
            page_url_list = get_page_url_list(sub_category_url, session, sheet)
            print(page_url_list)
    except Exception as e:
        # Best effort: report the problem and return what was collected.
        print(e)
    return second_category_url_list


def get_load_more(url, session):
    """Probe a listing URL for evidence that more content is available.

    Args:
        url: A plain page URL, or an API URL (recognised by a ``?``).
        session: Object exposing a requests-style ``get(url)``.

    Returns:
        For a plain page URL, the elements matching the ``j-refresh-btn``
        anchor. For an API URL, the value stored under ``data.content`` in
        the JSON response. An empty list when the request or parse fails, so
        callers can always take ``len()`` of the result.
    """
    # Bound before the ``try`` so a failure yields an empty result instead
    # of an UnboundLocalError at the ``return``.
    load_more = []
    try:
        res = session.get(url)
        if "?" in url:
            # API endpoint: the payload is JSON wrapping the page content.
            load_more = json.loads(res.text)["data"]["content"]
        else:
            # Regular page: look for the refresh/load-more anchor.
            load_more = etree.HTML(res.text).xpath('//a[@id="j-refresh-btn"]')
    except Exception as e:
        print(e)
    return load_more


def get_page_url_list(second_category_url, session, sheet):
    """Collect every page URL of a sub-category and crawl each of them.

    The first page is the sub-category URL itself. Further pages are API
    URLs built from the two ids in the URL, probed one by one until
    ``get_load_more`` reports nothing more.

    Args:
        second_category_url: Sub-category URL whose third ``/``-separated
            segment (after dropping ``https://``) is ``<catId>_<subCatId>``.
        session: Object exposing a requests-style ``get(url)``.
        sheet: Worksheet-like object passed down to the row writer.

    Returns:
        The page URLs collected so far; always contains at least
        ``second_category_url``, even when an error cut the crawl short.
    """
    # Bound before the ``try`` so the final ``return`` is always valid.
    page_url_list = [second_category_url]
    try:
        # Kept from the original, whose comment read "close surplus connections".
        # NOTE(review): requests does not document a ``keep_alive`` attribute
        # on Session, so this is probably a no-op -- confirm.
        session.keep_alive = False
        # Parse the "<catId>_<subCatId>" segment once instead of twice.
        id_parts = second_category_url.replace("https://", "").split("/")[2].split("_")
        cat_id = id_parts[0]
        print(cat_id)
        sub_cat_id = id_parts[1]
        print(sub_cat_id)
        load_more = get_load_more(second_category_url, session)
        basic_url = "https://www.wandoujia.com/wdjweb/api/category/more?catId={}&subCatId={}&page={}&ctoken=ZgezDf1STPfLGGW6HFDbrwmV"
        page_number = 1
        # Keep requesting the next page for as long as the previous probe
        # returned something. Only non-empty pages are recorded.
        while len(load_more) > 0:
            page_number += 1
            url = basic_url.format(cat_id, sub_cat_id, page_number)
            load_more = get_load_more(url, session)
            if len(load_more) > 0:
                page_url_list.append(url)
        for page_url in page_url_list:
            get_detail_url_list(page_url, session, sheet)
    except Exception as e:
        # Best effort: report the problem and return what was collected.
        print(e)
    return page_url_list


def get_detail_url_list(page_url, session, sheet):
    """Scrape one listing page and write a row per app to ``sheet``.

    Plain page URLs are parsed as HTML directly. API URLs (recognised by a
    ``?``) return JSON whose ``data.content`` holds the HTML fragment.

    Reads the module-level ``sleep_time`` (set by ``main``) to pause before
    each request.

    Args:
        page_url: Listing page or API URL to scrape.
        session: Object exposing a requests-style ``get(url)``.
        sheet: Worksheet-like object passed to ``write_to_excel``.

    Returns:
        The detail-page links found on the page, or an empty list when the
        page could not be fetched or parsed.
    """
    # Bound before the ``try`` so the final ``return`` is always valid.
    detail_url_list = []
    try:
        time.sleep(sleep_time)
        # Kept from the original, whose comment read "close surplus connections".
        # NOTE(review): requests does not document a ``keep_alive`` attribute
        # on Session, so this is probably a no-op -- confirm.
        session.keep_alive = False
        res = session.get(page_url)
        # Choose the markup by URL kind. The original then re-parsed every
        # response as JSON unconditionally, which raised for plain HTML
        # pages and dropped all of their rows.
        if "?" not in page_url:
            markup = res.text
        else:
            markup = json.loads(res.text)["data"]["content"]
        html = etree.HTML(markup)
        detail_url_list = html.xpath('//div[@class="icon-wrap"]/a/@href')
        title_list = html.xpath('//h2//text()')
        category_list = html.xpath('//a[@class="tag-link"]/text()')
        install_num_list = [
            text.replace("安装", "")
            for text in html.xpath('//span[@class="install-count"]/text()')
        ]
        store_mermory_list = html.xpath('//span[@title]/text()')
        columns = (title_list, category_list, install_num_list, store_mermory_list, detail_url_list)
        # The columns are matched purely by position, so differing lengths
        # mean rows may be misaligned; report it rather than hide it.
        if len({len(column) for column in columns}) > 1:
            print("column length mismatch on", page_url, [len(column) for column in columns])
        # zip stops at the shortest column, which is where the original
        # index-based loop stopped as well (via IndexError).
        for fields in zip(*columns):
            record = list(fields)
            print(record)
            write_to_excel(sheet, record)
    except Exception as e:
        # Best effort: report the problem and return what was collected.
        print(e)
    return detail_url_list




# Build the name of the output workbook file.
def get_file_name():
    """Return the workbook file name stamped with the local date (YYYYMMDD)."""
    date_stamp = time.strftime("%Y%m%d", time.localtime())
    return "".join(("豌豆荚APP数据", date_stamp, ".xlsx"))


def write_to_excel(sheet, record):
    """Append ``record`` to ``sheet`` as one new row (passed as a list)."""
    append_row = sheet.append
    append_row(record)


def main():
    """Crawl the site from its home page and save the rows to an .xlsx file.

    Sets the module-level ``sleep_time`` read by ``get_detail_url_list``,
    creates a workbook with a header row, runs the crawl, and saves the
    workbook under the name returned by ``get_file_name``.
    """
    global sleep_time
    # Seconds to pause before each listing-page request.
    sleep_time = 1
    # NOTE(review): these headers are passed only to the home-page request;
    # the later requests in this file call ``session.get`` without them.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
    }
    url = "https://www.wandoujia.com/"
    wb = workbook.Workbook()  # create the Excel workbook
    sheet = wb.active  # the worksheet rows are written to
    sheet.append(
        ['APP名称', '分类', '安装人数', "APP大小", '链接'])
    file_name = get_file_name()
    try:
        # The context manager closes the session when the crawl ends.
        with requests.Session() as session:
            get_category_url_list(url, session, headers, sheet)
    finally:
        # Save even if the crawl is interrupted, so rows collected so far
        # are not lost.
        wb.save(file_name)

# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

标签:category,get,url,list,爬虫,session,html,豌豆荚
来源: https://blog.csdn.net/weixin_44826979/article/details/119209442