# 爬虫-下厨房 (Web scraper for xiachufang.com)
#
# 2019-09-24 19:38:35 作者：互联网 (author: from the internet)
from selenium import webdriver
import time,json
from lxml import etree
def request_html(url, executable_path=None):
    """Load *url* in a headless PhantomJS browser and return the parsed page.

    Args:
        url: Address of the page to render.
        executable_path: Location of the PhantomJS binary. When omitted,
            the original hard-coded Windows path is used.

    Returns:
        The result of ``etree.HTML`` applied to the rendered page source.
    """
    if executable_path is None:
        executable_path = r'F:\老师发的文件\系统班第四阶段\day923\phantomjs-2.1.1-windows\bin\phantomjs.exe'
    driver = webdriver.PhantomJS(executable_path=executable_path)
    try:
        driver.get(url=url)
        page_source = driver.page_source
    finally:
        # A new browser is started on every call; always shut it down so
        # PhantomJS processes do not pile up while crawling many pages.
        driver.quit()
    return etree.HTML(page_source)

def _first(nodes, default=''):
    """Return the first XPath match, or *default* when nothing matched."""
    return nodes[0] if nodes else default


def _flatten_text(parts):
    """Join text nodes, drop spaces and turn newlines into single spaces."""
    return ''.join(parts).replace(' ', '').replace('\n', ' ')


def _parse_recipe(tree3):
    """Extract the recipe fields from a recipe detail page tree as a dict."""
    # Recipe title
    food_name = _first(tree3.xpath('//h1[@class="page-title"]/text()')).strip()
    # Cover image
    img = _first(tree3.xpath('//div[@class="cover image expandable block-negative-margin"]/img/@src'))
    # Rating; recipes without one are marked with the placeholder '暂无'
    score = _first(tree3.xpath('//div[@class="score float-left"]/span[@class="number"]/text()'), '暂无')
    # Number of people who cooked this recipe
    num = _first(tree3.xpath('//div[@class ="cooked float-left"]/span[@class="number"]/text()'))
    # Author name
    author = _first(tree3.xpath('//div[@class="author"]//span/text()'))
    # Description, ingredients and detailed steps are flattened to one line each
    desc = _flatten_text(tree3.xpath('//div[@class="desc mt30"]//text()'))
    yongliao = _flatten_text(tree3.xpath('//div[@class="ings"]//tr//text()'))
    step = _flatten_text(tree3.xpath('//div[@class="steps"]/ol//text()'))
    return {
        '菜名': food_name,
        '封面路由': img,
        '评分': score,
        '做过人数': num,
        '作者昵称': author,
        '描述': desc,
        '用料': yongliao,
        '详细步骤': step,
    }


def parse_html(tree):
    """Crawl every category linked from *tree* and save each recipe found.

    For each category link the listing pages are followed through their
    "next" links; every recipe detail page is fetched with ``request_html``
    and its fields are appended to ``下厨房.json`` as one JSON object per
    line.

    Fields missing from a page fall back to an empty string (or '暂无' for
    the rating) instead of raising ``IndexError`` and aborting the crawl.

    Args:
        tree: Parsed category index page, as returned by ``request_html``.

    Returns:
        None. Results are written to ``下厨房.json``.
    """
    href_list = tree.xpath('//div[@class="block-bg p40 font16"]//a/@href')
    base_url = 'http://www.xiachufang.com'
    for href in href_list:
        url_category = base_url + href
        # Fetch the first listing page of this category
        tree2 = request_html(url_category)
        while True:
            name_category = _first(tree2.xpath('//h1[@class="page-title"]/text()'))
            href_detail_list = tree2.xpath('//div[@class="normal-recipe-list"]//p[@class="name"]/a/@href')
            for href_detail in href_detail_list:
                url_detail = base_url + href_detail
                tree3 = request_html(url_detail)
                dic = {
                    '分类路由': url_category,
                    '分类名字': name_category,
                    '详情路由': url_detail,
                }
                dic.update(_parse_recipe(tree3))
                print(dic['菜名'])
                # Append per record so already-scraped data survives a crash.
                # json.dumps (not str) keeps every line valid JSON.
                with open('下厨房.json', 'a', encoding='utf-8') as f:
                    f.write(json.dumps(dic, ensure_ascii=False) + '\n')
            next_url_list = tree2.xpath('//a[@class="next"]/@href')
            if not next_url_list:
                break
            tree2 = request_html(base_url + next_url_list[0])


if __name__ == '__main__':
    # Script entry point: render the category index page, then crawl
    # every category and recipe reachable from it.
    url = 'http://www.xiachufang.com/category/'
    parse_html(request_html(url))
# 标签 (tags)：xpath,下厨房,url,list,爬虫,tree3,href,class
# 来源 (source)： https://blog.csdn.net/weixin_42766128/article/details/101305679