
Python crawler example: scraping free resume templates


Target site: https://sc.chinaz.com/jianli/free.html

Approach

1. Request each list page with a spoofed User-Agent (page 1 uses https://sc.chinaz.com/jianli/free.html; later pages follow the free_%d.html pattern).
2. Parse the response with lxml's etree and use XPath to pull each template's title and detail-page link.
3. Request each detail page and extract its download link.
4. Fetch the archive from that link and save it locally as <title>.zip.

With the approach worked out, here's the code:

# -*- coding: utf-8 -*-
# @Time : 2021/7/20 10:13
# @Author : ArthurHuang
# @File : 10_xpath解析案例_站长素材中免费简历模板爬取.py
# @Software : PyCharm
import os

import requests
from lxml import html

etree = html.etree  # newer lxml versions expose etree this way

if __name__ == "__main__":

    url = 'http://sc.chinaz.com/jianli/free_%d.html'
    # UA spoofing: wrap a browser User-Agent in a headers dict
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
    }

    # create a folder to hold the downloaded templates
    if not os.path.exists('./jianliLibs'):
        os.mkdir('./jianliLibs')

    for page in range(1, 6):  # first 5 list pages, 20 templates per page
        if page == 1:  # page 1's URL differs from the rest, so handle it separately
            new_url = 'http://sc.chinaz.com/jianli/free.html'
        else:
            new_url = url % page

        page_text = requests.get(url=new_url, headers=headers).text
        # instantiate an etree object from the list page
        tree = etree.HTML(page_text)

        a_list = tree.xpath('//div[@id="container"]/div/a')
        for a in a_list:
            # template title, used as the local file name
            all_titles = a.xpath('./img/@alt')[0] + '.zip'
            # generic fix for garbled Chinese: requests decoded the page as ISO-8859-1
            all_titles = all_titles.encode('iso-8859-1').decode('utf-8')

            # detail page for this template
            all_href = 'https:' + a.xpath('./@href')[0]
            response = requests.get(url=all_href, headers=headers)
            resume_data = response.text
            resumetree = etree.HTML(resume_data)
            resume_download_list = resumetree.xpath('//div[@id="down"]/div[2]/ul/li[1]')
            # the "click to download" link for this template
            for download in resume_download_list:
                all_downloads = download.xpath('./a/@href')[0]
                # request the download URL and keep the raw archive bytes
                resume_rar_page = requests.get(url=all_downloads, headers=headers).content
            resume_path = 'jianliLibs/' + all_titles
            with open(resume_path, 'wb') as fp:
                fp.write(resume_rar_page)
                print(all_titles, "downloaded successfully!")
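
A note on the encode('iso-8859-1').decode('utf-8') line above: requests falls back to ISO-8859-1 when a response's Content-Type header declares no charset, so UTF-8 Chinese text comes back garbled, and re-encoding to Latin-1 then decoding as UTF-8 undoes the damage. A more direct fix, sketched below with illustrative variable names, is to set the response encoding before reading .text:

import requests

resp = requests.get('https://sc.chinaz.com/jianli/free.html')
resp.encoding = 'utf-8'  # declare the real charset before .text decodes the body
page_text = resp.text    # decoded correctly; no re-encode round trip needed

Either approach works; the per-string fix only repairs the title, while setting resp.encoding fixes the whole document at once.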

Successfully retrieved.
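
If everything ran, the jianliLibs folder should hold roughly 100 archives (5 pages, 20 templates each). A quick sanity check, as a hypothetical follow-up snippet:

import os

# count and list the saved resume archives
files = [f for f in os.listdir('jianliLibs') if f.endswith('.zip')]
print(len(files), "templates saved")
for name in sorted(files):
    print(' ', name)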

 

Source: https://blog.csdn.net/weixin_42436236/article/details/118937089