豌豆荚爬虫
作者:互联网
由于详情页数据无法破解:
模拟请求详情页数据时会出现一个滑块验证,即使手动滑动也无法通过,
结果
只能拿到列表页上面包含的信息。
import requests.sessions
from lxml import etree
from openpyxl import workbook
import time
import json
def get_category_url_list(url, session, headers, sheet):
    """Fetch ``url`` and crawl a fixed slice of its top-level category links.

    Args:
        url: Address of the page whose ``parent-menu`` navigation is parsed.
        session: ``requests``-style session used for the GET request.
        headers: HTTP headers sent with this request.
        sheet: Worksheet-like object handed on to
            ``get_second_category_url_list`` for every selected category.

    Returns:
        The selected category URLs (menu links at index 1 and 2), or an empty
        list when the page could not be fetched or parsed.
    """
    # Bound before the ``try`` so the final ``return`` cannot hit an unbound
    # local when the request or the parsing fails.
    category_url_list = []
    try:
        # Intended to close surplus connections.
        # NOTE(review): ``keep_alive`` is not a documented ``requests.Session``
        # attribute, so this assignment probably has no effect - confirm.
        session.keep_alive = False
        res = session.get(url, headers=headers)
        html = etree.HTML(res.text)
        menu_links = html.xpath('//ul[@class="parent-menu clearfix"]/li/a/@href')
        # Only the second and third menu entries are crawled.
        category_url_list = menu_links[1:3]
        for category_url in category_url_list:
            get_second_category_url_list(category_url, session, sheet)
    except Exception as e:
        # Best-effort crawl: report the problem and return what we have.
        print(e)
    return category_url_list
def get_second_category_url_list(category_url, session, sheet):
    """Fetch a category page and crawl each sub-category linked from it.

    Args:
        category_url: Address of the category page to parse.
        session: ``requests``-style session used for the GET request.
        sheet: Worksheet-like object handed on to ``get_page_url_list``.

    Returns:
        The sub-category URLs found under ``li.parent-cate``, or an empty list
        when the page could not be fetched or parsed.
    """
    # Bound before the ``try`` so the final ``return`` cannot hit an unbound
    # local when the request or the parsing fails.
    second_category_url_list = []
    try:
        # Intended to close surplus connections.
        # NOTE(review): ``keep_alive`` is not a documented ``requests.Session``
        # attribute, so this assignment probably has no effect - confirm.
        session.keep_alive = False
        res = session.get(category_url)
        html = etree.HTML(res.text)
        second_category_url_list = html.xpath('//li[@class="parent-cate"]/div/a/@href')
        for sub_category_url in second_category_url_list:
            page_url_list = get_page_url_list(sub_category_url, session, sheet)
            print(page_url_list)
    except Exception as e:
        # Best-effort crawl: report the problem and return what we have.
        print(e)
    return second_category_url_list
def get_load_more(url, session):
    """Probe ``url`` and return a value whose length tells whether more data exists.

    For a URL without a query string the response is parsed as HTML and the
    list of ``a#j-refresh-btn`` elements is returned. For a URL with a query
    string the response is parsed as JSON and ``data.content`` is returned.

    Args:
        url: Address to request.
        session: ``requests``-style session used for the GET request.

    Returns:
        The matched elements or the JSON ``content`` value; an empty list when
        the request or the parsing fails, so callers can treat a failure as
        "nothing more to load".
    """
    # Bound before the ``try`` so a failed request yields an empty result
    # instead of an UnboundLocalError at the ``return``.
    load_more = []
    try:
        res = session.get(url)
        if "?" not in url:
            html = etree.HTML(res.text)
            load_more = html.xpath('//a[@id="j-refresh-btn"]')
        else:
            payload = json.loads(res.text)
            load_more = payload["data"]["content"]
    except Exception as e:
        # Best-effort crawl: report the problem and signal "no more data".
        print(e)
    return load_more
def get_page_url_list(second_category_url, session, sheet):
    """Collect every page URL of a sub-category and crawl each of them.

    The first page is ``second_category_url`` itself. Further pages are built
    from the ``category/more`` API template and are added for as long as
    ``get_load_more`` returns a non-empty result.

    Args:
        second_category_url: Sub-category address. The third ``/``-separated
            segment after the scheme must look like ``<catId>_<subCatId>``.
        session: ``requests``-style session used for all requests.
        sheet: Worksheet-like object handed on to ``get_detail_url_list``.

    Returns:
        The list of page URLs collected so far. It always starts with
        ``second_category_url``, including when a later step fails.
    """
    # Bound before the ``try`` so the final ``return`` cannot hit an unbound
    # local when an early statement fails.
    page_url_list = [second_category_url]
    try:
        # Intended to close surplus connections.
        # NOTE(review): ``keep_alive`` is not a documented ``requests.Session``
        # attribute, so this assignment probably has no effect - confirm.
        session.keep_alive = False
        # Split once instead of repeating the same expression per id.
        id_pair = second_category_url.replace("https://", "").split("/")[2].split("_")
        catId = id_pair[0]
        print(catId)
        subCatId = id_pair[1]
        print(subCatId)
        load_more = get_load_more(second_category_url, session)
        basic_url = "https://www.wandoujia.com/wdjweb/api/category/more?catId={}&subCatId={}&page={}&ctoken=ZgezDf1STPfLGGW6HFDbrwmV"
        page = 1
        # Keep adding the next page for as long as the previous probe found data.
        while load_more:
            page += 1
            url = basic_url.format(catId, subCatId, page)
            load_more = get_load_more(url, session)
            if load_more:
                page_url_list.append(url)
        for page_url in page_url_list:
            get_detail_url_list(page_url, session, sheet)
    except Exception as e:
        # Best-effort crawl: report the problem and return what we have.
        print(e)
    return page_url_list
def get_detail_url_list(page_url, session, sheet):
    """Scrape one list page and append one row per app to ``sheet``.

    A URL without a query string is parsed as an HTML page. A URL with a query
    string is parsed as JSON whose ``data.content`` holds the HTML fragment.

    Args:
        page_url: Address of the list page or of the ``more`` API page.
        session: ``requests``-style session used for the GET request.
        sheet: Worksheet-like object passed to ``write_to_excel``.

    Returns:
        The detail-page links found on the page, or an empty list when the
        page could not be fetched or parsed.
    """
    # Bound before the ``try`` so the final ``return`` cannot hit an unbound
    # local when the request or the parsing fails.
    detail_url_list = []
    try:
        # Throttle requests; ``sleep_time`` is a module-level global set by ``main``.
        time.sleep(sleep_time)
        # Intended to close surplus connections.
        # NOTE(review): ``keep_alive`` is not a documented ``requests.Session``
        # attribute, so this assignment probably has no effect - confirm.
        session.keep_alive = False
        res = session.get(page_url)
        # Pick the markup once according to the page type. The previous
        # unconditional second ``json.loads`` was redundant for API pages and
        # cannot succeed on a plain HTML page.
        if "?" not in page_url:
            markup = res.text
        else:
            markup = json.loads(res.text)["data"]["content"]
        html = etree.HTML(markup)
        detail_url_list = html.xpath('//div[@class="icon-wrap"]/a/@href')
        title_list = html.xpath('//h2//text()')
        category_list = html.xpath('//a[@class="tag-link"]/text()')
        install_num_list = [
            text.replace("安装", "")
            for text in html.xpath('//span[@class="install-count"]/text()')
        ]
        store_memory_list = html.xpath('//span[@title]/text()')
        # ``zip`` stops at the shortest column, so a page with a missing field
        # no longer raises IndexError part-way through.
        for fields in zip(title_list, category_list, install_num_list,
                          store_memory_list, detail_url_list):
            record = list(fields)
            print(record)
            write_to_excel(sheet, record)
    except Exception as e:
        # Best-effort crawl: report the problem and return what we have.
        print(e)
    return detail_url_list
def get_file_name():
    """Return the output workbook name: fixed prefix + local date (YYYYMMDD) + ``.xlsx``."""
    date_stamp = time.strftime("%Y%m%d", time.localtime())
    return "".join(["豌豆荚APP数据", date_stamp, ".xlsx"])
def write_to_excel(sheet, record):
    """Append ``record`` (a list of cell values) to ``sheet`` as one new row."""
    append_row = sheet.append
    append_row(record)
def main():
    """Crawl the site from its home page and save the collected rows to an Excel file.

    Sets the module-level ``sleep_time`` global (seconds to wait before each
    list-page request), creates a workbook with a header row, runs the crawl
    and writes the workbook to the name returned by ``get_file_name``.
    """
    global sleep_time
    sleep_time = 1
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
    }
    url = "https://www.wandoujia.com/"
    wb = workbook.Workbook()  # create the Excel workbook object
    sheet = wb.active  # the worksheet currently being written to
    sheet.append(
        ['APP名称', '分类', '安装人数', "APP大小", '链接'])
    file_name = get_file_name()
    # The context manager closes the session's connections when the crawl ends.
    with requests.Session() as session:
        try:
            get_category_url_list(url, session, headers, sheet)
        finally:
            # Save even if the crawl is interrupted, so rows already collected
            # are not lost.
            wb.save(file_name)
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
标签:category,get,url,list,爬虫,session,html,豌豆荚 来源: https://blog.csdn.net/weixin_44826979/article/details/119209442