# Crawler: Xiachufang (下厨房) recipe site
# Author: collected from the internet (互联网)
from selenium import webdriver
import time,json
from lxml import etree
def request_html(url, executable_path=r'F:\老师发的文件\系统班第四阶段\day923\phantomjs-2.1.1-windows\bin\phantomjs.exe'):
    """Load *url* in a PhantomJS browser and return the parsed page.

    Args:
        url: Address of the page to load.
        executable_path: Location of the PhantomJS binary. Defaults to the
            path that was previously hard-coded, so existing callers keep
            working; pass another path to run on a different machine.

    Returns:
        The result of ``etree.HTML`` applied to the browser's page source.
    """
    driver = webdriver.PhantomJS(executable_path=executable_path)
    try:
        driver.get(url=url)
        return etree.HTML(driver.page_source)
    finally:
        # A new browser is started on every call; shut it down even when the
        # page load or parsing fails so browser processes do not pile up
        # over the course of a crawl.
        driver.quit()
def _first(nodes, default=''):
    """Return the first XPath match in *nodes*, or *default* if there is none."""
    return nodes[0] if nodes else default


def _flatten_text(fragments):
    """Join text fragments, remove every space, and turn newlines into spaces."""
    return ''.join(fragments).replace(' ', '').replace('\n', ' ')


def _extract_recipe(tree3):
    """Read the recipe fields from a parsed recipe detail page into a dict."""
    return {
        # Dish name
        '菜名': _first(tree3.xpath('//h1[@class="page-title"]/text()')).strip(),
        # Cover image URL
        '封面路由': _first(tree3.xpath('//div[@class="cover image expandable block-negative-margin"]/img/@src')),
        # Rating; pages without a rating get the placeholder text
        '评分': _first(tree3.xpath('//div[@class="score float-left"]/span[@class="number"]/text()'), '暂无'),
        # Number of people who have cooked this recipe
        '做过人数': _first(tree3.xpath('//div[@class ="cooked float-left"]/span[@class="number"]/text()')),
        # Author nickname
        '作者昵称': _first(tree3.xpath('//div[@class="author"]//span/text()')),
        # Description
        '描述': _flatten_text(tree3.xpath('//div[@class="desc mt30"]//text()')),
        # Ingredients
        '用料': _flatten_text(tree3.xpath('//div[@class="ings"]//tr//text()')),
        # Cooking steps
        '详细步骤': _flatten_text(tree3.xpath('//div[@class="steps"]/ol//text()')),
    }


def parse_html(tree):
    """Crawl every category linked from the index page and save its recipes.

    For each category link found in *tree*, the category listing is fetched
    with ``request_html`` and followed page by page through its "next" link.
    Every recipe linked from a listing page is fetched, its fields are
    extracted, and the record is appended to ``下厨房.json`` as one JSON
    object per line.

    A field whose element is absent from a page is stored as an empty string
    (the rating uses ``'暂无'``) rather than aborting the crawl.

    Args:
        tree: Parsed category index page; only its ``xpath`` method is used.

    Returns:
        None. Results are written to ``下厨房.json`` in the working directory.
    """
    base_url = 'http://www.xiachufang.com'
    for href in tree.xpath('//div[@class="block-bg p40 font16"]//a/@href'):
        url_category = base_url + href
        # Fetch the first listing page of this category.
        tree2 = request_html(url_category)
        while True:
            name_category = _first(tree2.xpath('//h1[@class="page-title"]/text()'))
            detail_hrefs = tree2.xpath('//div[@class="normal-recipe-list"]//p[@class="name"]/a/@href')
            for href_detail in detail_hrefs:
                url_detail = base_url + href_detail
                tree3 = request_html(url_detail)
                record = {
                    '分类路由': url_category,
                    '分类名字': name_category,
                    '详情路由': url_detail,
                }
                record.update(_extract_recipe(tree3))
                print(record['菜名'])
                # Append after every recipe so an interrupted crawl keeps
                # what it has collected. One JSON document per line keeps
                # the file machine-readable as it grows.
                with open('下厨房.json', 'a', encoding='utf-8') as f:
                    f.write(json.dumps(record, ensure_ascii=False) + '\n')
            next_url_list = tree2.xpath('//a[@class="next"]/@href')
            if not next_url_list:
                # No "next" link: this was the last page of the category.
                break
            tree2 = request_html(base_url + next_url_list[0])
if __name__ == '__main__':
    # Script entry point: load the category index page and crawl from it.
    parse_html(request_html('http://www.xiachufang.com/category/'))
# Tags: xpath, 下厨房 (Xiachufang), url, list, 爬虫 (crawler), tree3, href, class
# Source: https://blog.csdn.net/weixin_42766128/article/details/101305679