One-click scraping of job listings from Lagou and Boss Zhipin (plain requests only, no Selenium)
Author: 互联网
A note from the author: for sites like these, information transparency and timeliness are everything, and accordingly their anti-scraping defenses are solid. The usual countermeasures you run into boil down to checks on the Cookie and Referer headers, and dynamically loaded cookies are especially nasty; cracking those without Selenium is quite an ordeal... (a certain recruiting site even bans your IP outright; roughly three requests and you're done...). As for proxy IPs: the free ones (high-anonymity proxies, I mean) are mostly dead on arrival or expire absurdly fast. Once this busy stretch is over (CET-6 and the first round of postgraduate-exam review), I'll carve out time to properly play with Selenium and Scrapy, but not now. The method here is dumb, but it beats nothing. And to deal with copy-paste freeloaders, there's a little surprise hidden in the code~~
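Since the post leans on the claim that free high-anonymity proxies mostly don't work, one quick way to see that for yourself is to filter a candidate list by actually making a request through each one. This is a minimal sketch, not part of the original scraper; the proxy addresses are placeholders, and httpbin.org is just a convenient echo service.

import requests

# Hypothetical candidate proxies -- replace with addresses from any free list
CANDIDATES = ["http://1.2.3.4:8080", "http://5.6.7.8:3128"]

def alive(proxy, timeout=5):
    """Return True if a simple request routed through `proxy` succeeds."""
    try:
        r = requests.get("https://httpbin.org/ip",
                         proxies={"http": proxy, "https": proxy},
                         timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False

working = [p for p in CANDIDATES if alive(p)]
print("usable proxies:", working)  # with free lists, expect this to be short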
First, let's take a look at Lagou:
'''
Target: https://www.lagou.com/. Scrape job listings for "data mining engineer"
(数据挖掘工程师): position, company full name, city, salary, education,
work experience and position advantage, for at least 50 companies,
and save them to an Excel file with the .xlsx extension.
'''
import requests
import time
import json
import csv


class LG():
    def __init__(self):
        self.s = requests.Session()
        self.headers = {
            # "So that this baby's IP lives long, figure these out yourselves~"
            # (the request headers are deliberately left out; supply your own
            # User-Agent / Referer here)
        }
        self.lst = []
        self.info = [["positionName", "companyFullName", "city", "salary",
                      "education", "workYear", "positionAdvantage"]]

    # Fetch the cookies: visiting the list page first lets the Session pick up
    # Lagou's dynamically issued cookies before we call the JSON API
    def Get_Cookie(self):
        url = "https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98%E5%B7%A5%E7%A8%8B%E5%B8%88?labelWords=&fromSearch=true&suginput="
        res = self.s.get(url=url, headers=self.headers)
        # print(res.text)
        # return res.cookies
        self.Limit_page()

    # Fetch the detail data from the positions API, page by page
    def Get_info(self, pages):
        url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
        for page in pages:
            data = {
                "first": "true",
                "pn": str(page),
                "kd": "数据挖掘工程师",  # search keyword: "data mining engineer"
            }
            headers = {
                # deliberately omitted again -- supply your own headers
            }
            res = self.s.post(url=url, headers=headers, data=data)
            res = json.loads(res.text)
            print("Page {} of data loaded!!!".format(page))
            self.lst.append(res['content']['positionResult']['result'])
            time.sleep(1.5)  # pause between requests to avoid the ban hammer

    # Cap the number of pages requested
    def Limit_page(self):
        pages = [page for page in range(1, 5)]
        self.Get_info(pages)

    # Extract the fields: position, company full name, city, salary,
    # education, work experience and position advantage
    def Get_postion(self):
        self.Get_Cookie()
        for result_page in self.lst:
            for job in result_page:
                self.info.append([job['positionName'], job['companyFullName'],
                                  job['city'], job['salary'], job['education'],
                                  job['workYear'], job['positionAdvantage']])
        self.Save_info()

    # Save the scraped records to a CSV file
    def Save_info(self):
        f = open('./Lg.csv', 'a', encoding='utf-8', newline="")
        writer = csv.writer(f)
        writer.writerows(self.info)
        f.close()


if __name__ == '__main__':
    lg = LG()
    lg.Get_postion()
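One small wrinkle: the task statement asks for an .xlsx file, but Save_info writes CSV. If you do need a real Excel workbook, here is a minimal sketch using openpyxl (my substitution, not something the original post uses) to dump the same self.info rows:

from openpyxl import Workbook

def save_xlsx(rows, path="./Lg.xlsx"):
    """Write a list of row-lists (header row first) into an .xlsx workbook."""
    wb = Workbook()
    ws = wb.active
    ws.title = "Lagou"
    for row in rows:
        ws.append(row)  # each inner list becomes one spreadsheet row
    wb.save(path)

# usage, after lg.Get_postion() has filled lg.info:
# save_xlsx(lg.info)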
- Boss Zhipin and sites like it have always had obnoxious anti-scraping: their cookies are short-lived and the number of requests per IP is capped.
- What I give here is a rather dumb but very effective method: fetch the page once, cache the HTML locally, and parse offline (those who know, know; if you don't, I can't help you...). Honestly though, this only pays off when the page count is small.
- On top of this you could batch things by setting up a cookie pool, but that means buying proxy IPs, and a broke student doesn't deserve those... (free proxy IPs, what a treat!!!) A minimal sketch of the rotation idea follows this list.
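For what it's worth, the cookie-pool idea from the last bullet can be sketched even without proxies: keep several primed Session objects and rotate through them so no single cookie carries every request. This is only an illustration of the rotation pattern using the Lagou URLs from above; without paid proxies, every session still shares your IP, so this spreads cookie wear, not IP load.

import itertools
import requests

def make_session(headers):
    """Prime one Session: hitting the list page issues fresh cookies."""
    s = requests.Session()
    s.get("https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98%E5%B7%A5%E7%A8%8B%E5%B8%88?labelWords=&fromSearch=true&suginput=",
          headers=headers)
    return s

headers = {}  # as in the post, supply your own headers here

# three primed sessions, cycled round-robin -- a toy "cookie pool"
pool = itertools.cycle([make_session(headers) for _ in range(3)])

api = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
for page in range(1, 5):
    s = next(pool)  # each page goes out on a different cookie jar
    res = s.post(api, headers=headers,
                 data={"first": "true", "pn": str(page), "kd": "数据挖掘工程师"})
    print(page, res.status_code)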
Now for the slightly more troublesome Boss Zhipin:
'''
Target: https://www.zhipin.com/. Scrape job listings for "data analyst"
(数据分析师): position, company full name, city, salary, education,
work experience and job description, for at least 30 companies,
and save them to a .csv file.
'''
import requests
import csv
import xlwt
from lxml import etree


class Boss:
    def __init__(self):
        self.url = "https://www.zhipin.com/job_detail/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&city=100010000&industry=&position="
        self.headers = {
            # deliberately omitted again -- supply your own User-Agent / Cookie
        }
        self.info = [["Position", "Full_Company", "City", "Salary",
                      "Eduction", "WorkYear", "PositionDescribe"]]

    # Save the current page's HTML locally, so repeated parsing runs don't
    # hammer the site with requests and get the IP banned
    def get_html(self):
        res = requests.get(url=self.url, headers=self.headers)
        with open("./boss1.html", "w", encoding="utf-8") as f:
            f.write(res.text)

    # Read the local HTML copy and pull out the fields we need
    def get_info(self):
        f = open("./boss1.html", "r", encoding="utf-8")
        html = f.read()
        f.close()
        html = etree.HTML(html)
        Position = html.xpath('//ul/li/div[@class="job-primary"]/div[1]/div[1]/div/div[1]/span[1]/a/@title')
        Full_Company = html.xpath('//ul/li/div/div[1]/div[2]/div/h3/a/@title')
        City = html.xpath('//ul/li/div/div[1]/div[1]/div/div[1]/span[2]/span/text()')
        Salary = html.xpath('//ul/li/div/div[1]/div[1]/div/div[2]/span/text()')
        PositionDescribe = html.xpath('//ul/li/div/div[2]/div[2]/text()')
        WorkYear_Eduction = html.xpath('//ul/li/div/div[1]/div[1]/div/div[2]/p')
        WorkYear = []
        Eduction = []
        for i in WorkYear_Eduction:
            # the <p> node holds two text chunks: work experience, then education
            WorkYear.append(i.xpath('./text()')[0])
            Eduction.append(i.xpath('./text()')[1])
        for j in range(len(Position)):
            self.info.append([Position[j], Full_Company[j], City[j], Salary[j],
                              Eduction[j], WorkYear[j], PositionDescribe[j]])
        # print(Position, Full_Company, City, Salary, Eduction, WorkYear, PositionDescribe)

    # Write the records to a spreadsheet; note that xlwt produces the legacy
    # .xls format, so the file should be named .xls, not .xlsx
    def save_excel_info(self):
        book = xlwt.Workbook()
        sheet = book.add_sheet(sheetname="Boss_01")
        for i in range(len(self.info)):
            for j in range(len(self.info[i])):
                sheet.write(i, j, self.info[i][j])
        book.save("./Boss_01.xls")

    # Write the same records to a CSV file
    def save_csv_info(self):
        f = open('./Boss_01.csv', 'a', encoding='utf-8', newline="")
        writer = csv.writer(f)
        writer.writerows(self.info)
        f.close()


if __name__ == '__main__':
    boss = Boss()
    boss.get_html()
    boss.get_info()
    boss.save_excel_info()
    boss.save_csv_info()
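The local-relay trick above only caches a single listing page. Extending it to a few pages is straightforward if the listing URL accepts a page number; note that the "page" query parameter below is my assumption about the site's pagination, not something verified in the original post, and the headers are left for you to fill in as before.

import time
import requests

BASE = "https://www.zhipin.com/job_detail/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&city=100010000&industry=&position="
headers = {}  # supply your own, as in the post

for page in range(1, 4):
    # NOTE: "page" is an assumed parameter -- adjust to the site's real pagination
    res = requests.get(BASE + "&page={}".format(page), headers=headers)
    with open("./boss{}.html".format(page), "w", encoding="utf-8") as f:
        f.write(res.text)  # parse the local copies later, offline
    time.sleep(3)  # be gentle: the site bans IPs after a few quick requests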