
Using a Scraper to Keep Multiple "Who Viewed My Resume" Records on 51job


By default, 51job shows only the most recent "who viewed my resume" record; viewing older records requires a paid account.

The goal of this post: poll the page with a scraper on a timed loop and append each new record to a text file, so a free history accumulates over time.

import requests
from bs4 import BeautifulSoup

class www_51job_com(object):
    def __init__(self):
        # Personal-center page that lists who viewed your resume.
        self.url = "https://i.51job.com/userset/resume_browsed.php?lang=c"

        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
        }

        # Paste the raw Cookie header copied from a logged-in browser session.
        self.cookies = "replace with your cookie"

    def get_url(self, url):
        # Convert the raw "k1=v1; k2=v2" header string into a dict for requests.
        # split("=", 1) keeps cookie values that themselves contain "=" intact.
        cookies_dict = {}
        for item in self.cookies.split("; "):
            key, value = item.split("=", 1)
            cookies_dict[key] = value

        response = requests.get(url=url, headers=self.headers, cookies=cookies_dict)
        # 51job serves this page as GBK-encoded HTML, hence the explicit decode.
        return response.content.decode('gbk')

    def soup(self, data):
        # Extract the latest record: the company name and the time it viewed
        # the resume, located via the page's CSS selector paths.
        soup_data = BeautifulSoup(data, 'lxml')
        company = soup_data.select("body > div.content > div.exrt > div.lmsg > div.e > div.txt > div.li.l1 > p > a")[0].get_text()
        care_time = soup_data.select("body > div.content > div.exrt > div.lmsg > div.e > div.txt > div.li.l3 > div.f12 > span")[0].get_text()
        return company + ' ' + care_time

    def save_file(self, company_caretime):
        # Append the record, but skip it if it is identical to the last line
        # already in the file (the page only ever shows the newest record).
        with open('www_51job_com.txt', 'a+', encoding='utf-8') as f:
            f.seek(0)
            lines = f.readlines()
            try:
                if lines[-1] != company_caretime + '\n':
                    f.write(company_caretime + '\n')
            except IndexError:
                # The file was empty: write the first record.
                f.write(company_caretime + '\n')

    def run(self):
        response = self.get_url(self.url)
        result = self.soup(response)
        self.save_file(result)

if __name__ == '__main__':
    personal_center = www_51job_com()
    personal_center.run()
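
As an aside, the manual splitting in get_url assumes every cookie is well-formed "key=value" with "; " separators. The standard library's http.cookies.SimpleCookie can do the same parsing more robustly. A minimal sketch, using a hypothetical cookie string purely for illustration:

from http.cookies import SimpleCookie

# Parse a raw "k1=v1; k2=v2" Cookie header into a plain dict for requests.
raw = "guid=abc123; slife=lastlogonctime"  # hypothetical values for illustration
cookie = SimpleCookie()
cookie.load(raw)
cookies_dict = {key: morsel.value for key, morsel in cookie.items()}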
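
run() takes a single snapshot, so something has to call it on a schedule for the file to accumulate the multiple records the title promises. A minimal sketch of the timed loop, replacing the one-shot entry point above and assuming a 30-minute polling interval (the original post does not specify one):

import time

if __name__ == '__main__':
    personal_center = www_51job_com()
    # Poll forever: each pass appends at most one new record, and
    # save_file() already skips an exact duplicate of the last line.
    while True:
        personal_center.run()
        time.sleep(30 * 60)  # the interval is an illustrative choice

Note that the login cookie eventually expires; when it does, the selectors in soup() will likely stop matching and raise an IndexError, so the cookie string has to be refreshed by hand.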


Source: https://www.cnblogs.com/vpandaxjl/p/11064037.html