
One-Click Scraping of Job Listings from Lagou and Boss Zhipin (the plain way, no Selenium involved)


A note from the author: for most e-commerce-style sites, the transparency and timeliness of their data speak for themselves, and so do their anti-scraping defenses. The common countermeasures you run into boil down to the cookie and Referer fields; dynamically issued cookies are especially nasty, and cracking them without Selenium is quite an ordeal. (A certain job-listing site also bans your IP on the spot once you exceed roughly three requests…) As for proxy IPs, most free ones (meaning high-anonymity proxies) either don't work or die astonishingly fast. Once this stretch is over (after my CET-6 exam and my first round of postgraduate-entrance review), I'll make time to properly play with Selenium and Scrapy; for now this will have to do. The approach is crude, but better than nothing. And to deal with copy-paste freeloaders, there's a little surprise hidden in the code~~
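Since free high-anonymity proxies die so quickly, one common workaround is to probe each candidate before trusting it. Here is a minimal sketch, not part of the original code; the proxy addresses and test URL below are placeholders:

import requests

CANDIDATES = ["http://1.2.3.4:8080", "http://5.6.7.8:3128"]  # hypothetical proxies

def working_proxies(candidates, test_url="https://httpbin.org/ip", timeout=3):
    # Keep only the proxies that answer a simple request within the timeout.
    alive = []
    for proxy in candidates:
        try:
            requests.get(test_url, proxies={"http": proxy, "https": proxy}, timeout=timeout)
            alive.append(proxy)
        except requests.RequestException:
            pass  # dead or too slow, as most free proxies are
    return alive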

First, let's take a look at the Lagou one:
'''
Target site: https://www.lagou.com/. Scrape the listings for 数据挖掘工程师
(data mining engineer): position, full company name, city, monthly salary,
education, work experience, and position advantages. Collect at least 50
companies and save them to an Excel file with the .xlsx suffix.
'''

import requests
import time
import json
import csv

class LG():
    def __init__(self):
        self.s = requests.Session()
        self.headers = {
            # Deliberately left blank by the author ("to keep my precious IP
            # alive, work these out yourselves~"); at minimum a realistic
            # User-Agent and Referer are needed.
        }
        self.lst = []
        self.info = [["positionName", "companyFullName", "city", "salary",
                      "education", "workYear", "positionAdvantage"]]
        
    # Fetch the cookies: hit the listing page first so the session picks up
    # Lagou's dynamically issued cookies, then move on to the paging logic.
    def Get_Cookie(self):
        url = "https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98%E5%B7%A5%E7%A8%8B%E5%B8%88?labelWords=&fromSearch=true&suginput="
        self.s.get(url=url, headers=self.headers)
        self.Limit_page()
    
    # Fetch the detail data: POST the Ajax endpoint page by page and collect
    # the JSON results. The same (elided) headers are reused here.
    def Get_info(self, pages):
        url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
        for page in pages:
            data = {
                "first": "true",
                "pn": str(page),
                "kd": "数据挖掘工程师",
            }
            res = self.s.post(url=url, headers=self.headers, data=data)
            res = json.loads(res.text)
            print("Page {} has been loaded.".format(page))
            self.lst.append(res['content']['positionResult']['result'])
            time.sleep(1.5)  # throttle requests to dodge the per-IP ban
        
    # Cap the number of pages to fetch.
    def Limit_page(self):
        pages = list(range(1, 5))
        self.Get_info(pages)
        
    # Extract the required fields: position, full company name, city, monthly
    # salary, education, work experience, and position advantages.
    def Get_position(self):
        self.Get_Cookie()
        for result_page in self.lst:
            for job in result_page:
                self.info.append([
                    job['positionName'], job['companyFullName'], job['city'],
                    job['salary'], job['education'], job['workYear'],
                    job['positionAdvantage'],
                ])
        self.Save_info()
        
    # Save the scraped rows to a CSV file.
    def Save_info(self):
        with open('./Lg.csv', 'a', encoding='utf-8', newline="") as f:
            csv.writer(f).writerows(self.info)
        
if __name__ == '__main__':
    lg = LG()
    lg.Get_position()
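The assignment asks for an .xlsx file, yet Save_info writes a CSV. A one-line conversion sketch (not part of the original script; assumes pandas and openpyxl are installed):

import pandas as pd
# Convert the scraped CSV into the .xlsx file the assignment calls for.
pd.read_csv("./Lg.csv").to_excel("./Lg.xlsx", index=False)

Now for the Boss Zhipin version: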
import requests
import csv
import xlwt
from lxml import etree
class Boss:
    def __init__(self):
        self.url = "https://www.zhipin.com/job_detail/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&city=100010000&industry=&position="
        self.headers = {
            # Deliberately left blank by the author ("work these out
            # yourselves~"); at minimum a realistic User-Agent is needed.
        }
        self.info = [["Position", "Full_Company", "City", "Salary",
                      "Education", "WorkYear", "PositionDescribe"]]
    
    # Fetch the page once and cache the HTML locally, so repeated parsing
    # does not rack up requests and get the IP banned.
    def get_html(self):
        res = requests.get(url=self.url, headers=self.headers)
        with open("./boss1.html", "w", encoding="utf-8") as f:
            f.write(res.text)
    
    # Read the cached HTML and pull out the required fields via XPath.
    # These paths are tied to the page layout at the time of writing and
    # will break if Boss Zhipin changes its markup.
    def get_info(self):
        with open("./boss1.html", "r", encoding="utf-8") as f:
            html = etree.HTML(f.read())
        Position = html.xpath('//ul/li/div[@class="job-primary"]/div[1]/div[1]/div/div[1]/span[1]/a/@title')
        Full_Company = html.xpath('//ul/li/div/div[1]/div[2]/div/h3/a/@title')
        City = html.xpath('//ul/li/div/div[1]/div[1]/div/div[1]/span[2]/span/text()')
        Salary = html.xpath('//ul/li/div/div[1]/div[1]/div/div[2]/span/text()')
        PositionDescribe = html.xpath('//ul/li/div/div[2]/div[2]/text()')
        WorkYear_Education = html.xpath('//ul/li/div/div[1]/div[1]/div/div[2]/p')
        WorkYear = []
        Education = []
        for p in WorkYear_Education:
            # Each <p> carries two text nodes: work experience, then education.
            WorkYear.append(p.xpath('./text()')[0])
            Education.append(p.xpath('./text()')[1])
        for j in range(len(Position)):
            self.info.append([Position[j], Full_Company[j], City[j], Salary[j],
                              Education[j], WorkYear[j], PositionDescribe[j]])
        
    # Write the rows to an Excel workbook. Note: xlwt only produces the
    # legacy .xls format, so Excel may warn about the .xlsx suffix used here.
    def save_excel_info(self):
        book = xlwt.Workbook()
        sheet = book.add_sheet(sheetname="Boss_01")
        for i, row in enumerate(self.info):
            for j, value in enumerate(row):
                sheet.write(i, j, value)
        book.save("./Boss_01.xlsx")
        
    # Dump the same rows to a CSV file as well.
    def save_csv_info(self):
        with open('./Boss_01.csv', 'a', encoding='utf-8', newline="") as f:
            csv.writer(f).writerows(self.info)
        
if __name__ == '__main__':
    boss=Boss()
    boss.get_html()
    boss.get_info() 
    boss.save_excel_info()
    boss.save_csv_info()
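As noted in save_excel_info, xlwt only writes the legacy .xls format, so the file saved above is not a true .xlsx despite its suffix. A minimal replacement sketch using openpyxl (an assumption, not part of the original post):

from openpyxl import Workbook

def save_xlsx(info, path="./Boss_01.xlsx"):
    # Write the same rows into a genuine .xlsx workbook.
    wb = Workbook()
    ws = wb.active
    ws.title = "Boss_01"
    for row in info:
        ws.append(row)  # each row is a list of cell values
    wb.save(path)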

Source: https://blog.csdn.net/Tianxuancsdn/article/details/110181465