Scraping 51job: capturing multiple "Who Viewed My Resume" records
By default, 51job shows only the most recent "who viewed my resume" record; seeing more requires a paid account.
The goal of this post: use a crawler to fetch the record on a timed loop and append each new entry to a text file.
import requests
from bs4 import BeautifulSoup


class www_51job_com(object):
    def __init__(self):
        self.url = "https://i.51job.com/userset/resume_browsed.php?lang=c"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
        }
        self.cookies = "paste your cookie string here"  # copy the logged-in cookie from your browser

    def get_url(self, url):
        # Turn the raw "k1=v1; k2=v2" cookie string into the dict requests expects
        cookies_dict = {}
        for pair in self.cookies.split("; "):
            key, value = pair.split("=", 1)  # split on the first '=' only; values may contain '='
            cookies_dict[key] = value
        response = requests.get(url=url, headers=self.headers, cookies=cookies_dict)
        return response.content.decode('gbk')  # the page is served in GBK

    def soup(self, data):
        # Pull the company name and the viewing time out of the newest record
        soup_data = BeautifulSoup(data, 'lxml')
        company = soup_data.select("body > div.content > div.exrt > div.lmsg > div.e > div.txt > div.li.l1 > p > a")[0].get_text()
        care_time = soup_data.select("body > div.content > div.exrt > div.lmsg > div.e > div.txt > div.li.l3 > div.f12 > span")[0].get_text()
        return company + ' ' + care_time

    def save_file(self, company_caretime):
        # Append the record only if it differs from the last line already saved
        with open('www_51job_com.txt', 'a+', encoding='utf-8') as f:
            f.seek(0)
            lines = f.readlines()
            try:
                if lines[-1] != company_caretime + '\n':
                    f.write(company_caretime + '\n')
            except IndexError:
                # The file is empty: write the first record
                f.write(company_caretime + '\n')

    def run(self):
        response = self.get_url(self.url)
        result = self.soup(response)
        self.save_file(result)


if __name__ == '__main__':
    personal_center = www_51job_com()
    personal_center.run()
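run() above fetches only once, while the goal stated earlier is a timed loop. Below is a minimal sketch of one way to poll on a schedule; the 30-minute interval and the keep-going-on-error behaviour are my own assumptions, and an external scheduler (cron, Windows Task Scheduler) invoking the script would work just as well:

import time

if __name__ == '__main__':
    personal_center = www_51job_com()
    while True:
        try:
            personal_center.run()
        except Exception as e:
            # A network hiccup or an expired cookie should not kill the loop
            print('fetch failed:', e)
        time.sleep(30 * 60)  # wait 30 minutes between fetches (assumed interval)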
Source: https://www.cnblogs.com/vpandaxjl/p/11064037.html