One-click scraping of job listings from Lagou and Boss Zhipin (plain requests only, no Selenium)
Author: 互联网
A note from the author: for sites like these, information transparency and timeliness are everything, and accordingly their anti-scraping defenses are solid. The usual countermeasures you run into boil down to checks on the Cookie and Referer headers, and dynamically loaded cookies are especially nasty; cracking those without Selenium is quite an ordeal... (a certain recruiting site even bans your IP outright; roughly three requests and you're done...). As for proxy IPs: the free ones (high-anonymity proxies, I mean) are mostly dead on arrival or expire absurdly fast. Once this busy stretch is over (CET-6 and the first round of postgraduate-exam review), I'll carve out time to properly play with Selenium and Scrapy, but not now. The method here is dumb, but it beats nothing. And to deal with copy-paste freeloaders, there's a little surprise hidden in the code~~
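Since the post leans on the claim that free high-anonymity proxies mostly don't work, one quick way to see that for yourself is to filter a candidate list by actually making a request through each one. This is a minimal sketch, not part of the original scraper; the proxy addresses are placeholders, and httpbin.org is just a convenient echo service.

import requests

# Hypothetical candidate proxies -- replace with addresses from any free list
CANDIDATES = ["http://1.2.3.4:8080", "http://5.6.7.8:3128"]

def alive(proxy, timeout=5):
    """Return True if a simple request routed through `proxy` succeeds."""
    try:
        r = requests.get("https://httpbin.org/ip",
                         proxies={"http": proxy, "https": proxy},
                         timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False

working = [p for p in CANDIDATES if alive(p)]
print("usable proxies:", working)  # with free lists, expect this to be short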
First, let's take a look at Lagou:
'''
Target: https://www.lagou.com/. Scrape job listings for "data mining engineer"
(数据挖掘工程师): position, company full name, city, salary, education,
work experience and position advantage, for at least 50 companies,
and save them to an Excel file with the .xlsx extension.
'''
import requests
import time
import json
import csv


class LG():
    def __init__(self):
        self.s = requests.Session()
        self.headers = {
            # "So that this baby's IP lives long, figure these out yourselves~"
            # (the request headers are deliberately left out; supply your own
            # User-Agent / Referer here)
        }
        self.lst = []
        self.info = [["positionName", "companyFullName", "city", "salary",
                      "education", "workYear", "positionAdvantage"]]

    # Fetch the cookies: visiting the list page first lets the Session pick up
    # Lagou's dynamically issued cookies before we call the JSON API
    def Get_Cookie(self):
        url = "https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98%E5%B7%A5%E7%A8%8B%E5%B8%88?labelWords=&fromSearch=true&suginput="
        res = self.s.get(url=url, headers=self.headers)
        # print(res.text)
        # return res.cookies
        self.Limit_page()

    # Fetch the detail data from the positions API, page by page
    def Get_info(self, pages):
        url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
        for page in pages:
            data = {
                "first": "true",
                "pn": str(page),
                "kd": "数据挖掘工程师",  # search keyword: "data mining engineer"
            }
            headers = {
                # deliberately omitted again -- supply your own headers
            }
            res = self.s.post(url=url, headers=headers, data=data)
            res = json.loads(res.text)
            print("Page {} of data loaded!!!".format(page))
            self.lst.append(res['content']['positionResult']['result'])
            time.sleep(1.5)  # pause between requests to avoid the ban hammer

    # Cap the number of pages requested
    def Limit_page(self):
        pages = [page for page in range(1, 5)]
        self.Get_info(pages)

    # Extract the fields: position, company full name, city, salary,
    # education, work experience and position advantage
    def Get_postion(self):
        self.Get_Cookie()
        for result_page in self.lst:
            for job in result_page:
                self.info.append([job['positionName'], job['companyFullName'],
                                  job['city'], job['salary'], job['education'],
                                  job['workYear'], job['positionAdvantage']])
        self.Save_info()

    # Save the scraped records to a CSV file
    def Save_info(self):
        f = open('./Lg.csv', 'a', encoding='utf-8', newline="")
        writer = csv.writer(f)
        writer.writerows(self.info)
        f.close()


if __name__ == '__main__':
    lg = LG()
    lg.Get_postion()
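One small wrinkle: the task statement asks for an .xlsx file, but Save_info writes CSV. If you do need a real Excel workbook, here is a minimal sketch using openpyxl (my substitution, not something the original post uses) to dump the same self.info rows:

from openpyxl import Workbook

def save_xlsx(rows, path="./Lg.xlsx"):
    """Write a list of row-lists (header row first) into an .xlsx workbook."""
    wb = Workbook()
    ws = wb.active
    ws.title = "Lagou"
    for row in rows:
        ws.append(row)  # each inner list becomes one spreadsheet row
    wb.save(path)

# usage, after lg.Get_postion() has filled lg.info:
# save_xlsx(lg.info)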
- Boss Zhipin and sites like it have always had obnoxious anti-scraping: their cookies are short-lived and the number of requests per IP is capped.
- What I give here is a rather dumb but very effective method: fetch the page once, cache the HTML locally, and parse offline (those who know, know; if you don't, I can't help you...). Honestly though, this only pays off when the page count is small.
- On top of this you could batch things by setting up a cookie pool, but that means buying proxy IPs, and a broke student doesn't deserve those... (free proxy IPs, what a treat!!!) A minimal sketch of the rotation idea follows this list.
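For what it's worth, the cookie-pool idea from the last bullet can be sketched even without proxies: keep several primed Session objects and rotate through them so no single cookie carries every request. This is only an illustration of the rotation pattern using the Lagou URLs from above; without paid proxies, every session still shares your IP, so this spreads cookie wear, not IP load.

import itertools
import requests

def make_session(headers):
    """Prime one Session: hitting the list page issues fresh cookies."""
    s = requests.Session()
    s.get("https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98%E5%B7%A5%E7%A8%8B%E5%B8%88?labelWords=&fromSearch=true&suginput=",
          headers=headers)
    return s

headers = {}  # as in the post, supply your own headers here

# three primed sessions, cycled round-robin -- a toy "cookie pool"
pool = itertools.cycle([make_session(headers) for _ in range(3)])

api = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
for page in range(1, 5):
    s = next(pool)  # each page goes out on a different cookie jar
    res = s.post(api, headers=headers,
                 data={"first": "true", "pn": str(page), "kd": "数据挖掘工程师"})
    print(page, res.status_code)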
Now for the slightly more troublesome Boss Zhipin:
'''
Target: https://www.zhipin.com/. Scrape job listings for "data analyst"
(数据分析师): position, company full name, city, salary, education,
work experience and job description, for at least 30 companies,
and save them to a .csv file.
'''
import requests
import csv
import xlwt
from lxml import etree


class Boss:
    def __init__(self):
        self.url = "https://www.zhipin.com/job_detail/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&city=100010000&industry=&position="
        self.headers = {
            # deliberately omitted again -- supply your own User-Agent / Cookie
        }
        self.info = [["Position", "Full_Company", "City", "Salary",
                      "Eduction", "WorkYear", "PositionDescribe"]]

    # Save the current page's HTML locally, so repeated parsing runs don't
    # hammer the site with requests and get the IP banned
    def get_html(self):
        res = requests.get(url=self.url, headers=self.headers)
        with open("./boss1.html", "w", encoding="utf-8") as f:
            f.write(res.text)

    # Read the local HTML copy and pull out the fields we need
    def get_info(self):
        f = open("./boss1.html", "r", encoding="utf-8")
        html = f.read()
        f.close()
        html = etree.HTML(html)
        Position = html.xpath('//ul/li/div[@class="job-primary"]/div[1]/div[1]/div/div[1]/span[1]/a/@title')
        Full_Company = html.xpath('//ul/li/div/div[1]/div[2]/div/h3/a/@title')
        City = html.xpath('//ul/li/div/div[1]/div[1]/div/div[1]/span[2]/span/text()')
        Salary = html.xpath('//ul/li/div/div[1]/div[1]/div/div[2]/span/text()')
        PositionDescribe = html.xpath('//ul/li/div/div[2]/div[2]/text()')
        WorkYear_Eduction = html.xpath('//ul/li/div/div[1]/div[1]/div/div[2]/p')
        WorkYear = []
        Eduction = []
        for i in WorkYear_Eduction:
            # the <p> node holds two text chunks: work experience, then education
            WorkYear.append(i.xpath('./text()')[0])
            Eduction.append(i.xpath('./text()')[1])
        for j in range(len(Position)):
            self.info.append([Position[j], Full_Company[j], City[j], Salary[j],
                              Eduction[j], WorkYear[j], PositionDescribe[j]])
        # print(Position, Full_Company, City, Salary, Eduction, WorkYear, PositionDescribe)

    # Write the records to a spreadsheet; note that xlwt produces the legacy
    # .xls format, so the file should be named .xls, not .xlsx
    def save_excel_info(self):
        book = xlwt.Workbook()
        sheet = book.add_sheet(sheetname="Boss_01")
        for i in range(len(self.info)):
            for j in range(len(self.info[i])):
                sheet.write(i, j, self.info[i][j])
        book.save("./Boss_01.xls")

    # Write the same records to a CSV file
    def save_csv_info(self):
        f = open('./Boss_01.csv', 'a', encoding='utf-8', newline="")
        writer = csv.writer(f)
        writer.writerows(self.info)
        f.close()


if __name__ == '__main__':
    boss = Boss()
    boss.get_html()
    boss.get_info()
    boss.save_excel_info()
    boss.save_csv_info()
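The local-relay trick above only caches a single listing page. Extending it to a few pages is straightforward if the listing URL accepts a page number; note that the "page" query parameter below is my assumption about the site's pagination, not something verified in the original post, and the headers are left for you to fill in as before.

import time
import requests

BASE = "https://www.zhipin.com/job_detail/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&city=100010000&industry=&position="
headers = {}  # supply your own, as in the post

for page in range(1, 4):
    # NOTE: "page" is an assumed parameter -- adjust to the site's real pagination
    res = requests.get(BASE + "&page={}".format(page), headers=headers)
    with open("./boss{}.html".format(page), "w", encoding="utf-8") as f:
        f.write(res.text)  # parse the local copies later, offline
    time.sleep(3)  # be gentle: the site bans IPs after a few quick requests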