Lagou Job Crawler
Author: Internet (reposted)
The script below posts a keyword search to Lagou's positionAjax.json endpoint and writes the matching job postings to an Excel file with xlwt:

# -*- coding: utf-8 -*-
'''
FileName: Lagou job-posting scraper
CreateTime: 2018-4-10
Author: ___dx___
FileDescript:
'''
import requests
import xlwt
import ssl

ssl._create_default_https_context = ssl._create_unverified_context  # skip HTTPS certificate verification


class Lagou_job(object):
    def __init__(self):
        self.url = 'https://www.lagou.com/jobs/positionAjax.json?px=new&needAddtionalResult=false'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
            'Referer': 'https://www.lagou.com/jobs/list_%E6%B5%8B%E8%AF%95?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Origin': 'https://www.lagou.com',
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': 'None',
            'X-Requested-With': 'XMLHttpRequest'
        }

    # Fetch one page of postings from the Ajax interface
    def getJobList(self, page):
        self.data = {
            'first': 'true',
            'pn': page,
            'kd': '测试'  # search keyword: "测试" (software testing)
        }
        session = requests.Session()
        res = session.post(self.url, data=self.data, headers=self.headers)
        result = res.json()
        print(result)            # debug
        print(res.status_code)   # debug
        jobs = result['content']['positionResult']['result']
        return jobs

    # Write the scraped results to an Excel file
    def saveExcel(self):
        excelTabel = xlwt.Workbook()  # create the workbook
        # Writing the same cell twice raises:
        #   Exception: Attempt to overwrite cell:
        #   sheetname=u'sheet 1' rowx=0 colx=0
        # so the sheet is opened with cell_overwrite_ok=True.
        sheet_1 = excelTabel.add_sheet('daixiang', cell_overwrite_ok=True)  # create the sheet
        sheet_1.write(0, 0, u'公司全名')   # company full name
        sheet_1.write(0, 1, u'公司简称')   # company short name
        sheet_1.write(0, 2, u'城市')       # city
        sheet_1.write(0, 3, u'区域')       # district
        sheet_1.write(0, 4, u'工作性质')   # job nature (full-time / part-time)
        sheet_1.write(0, 5, u'职位名称')   # position name
        sheet_1.write(0, 6, u'薪资范围')   # salary range
        sheet_1.write(0, 7, u'职位')       # job category (secondType)
        sheet_1.write(0, 8, u'工作年限')   # required work experience
        sheet_1.write(0, 9, u'公司规模')   # company size
        sheet_1.write(0, 10, u'学历要求')  # education requirement
        n = 1
        for page in range(1, 2):  # only page 1 here; raise the upper bound to crawl more pages
            for job in self.getJobList(page=page):
                # Keep full-time (全职) positions in Shenzhen (深圳); extend the condition
                # with workYear/education checks to filter further.
                if u'全职' in job['jobNature'] and u'深圳' in job['city']:
                    sheet_1.write(n, 0, job['companyFullName'])
                    sheet_1.write(n, 1, job['companyShortName'])
                    sheet_1.write(n, 2, job['city'])
                    sheet_1.write(n, 3, job['district'])
                    sheet_1.write(n, 4, job['jobNature'])
                    sheet_1.write(n, 5, job['positionName'])
                    sheet_1.write(n, 6, job['salary'])
                    sheet_1.write(n, 7, job['secondType'])
                    sheet_1.write(n, 8, job['workYear'])
                    sheet_1.write(n, 9, job['companySize'])
                    sheet_1.write(n, 10, job['education'])
                    n += 1
                    print(job['companyShortName'], job['salary'])
        # Save the workbook to disk
        excelTabel.save("深圳测试_By_dx.xls")


if __name__ == '__main__':
    lagou_job = Lagou_job()
    lagou_job.saveExcel()
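One practical caveat the script glosses over: getJobList opens a fresh Session for every call and POSTs straight to positionAjax.json, and Lagou's Ajax endpoint commonly rejects such cookie-less requests with a "操作太频繁" (too frequent) message. Below is a minimal sketch, not part of the original post, of warming the session up on the HTML list page first and throttling between pages; the page range, the keyword default, and the 3-second delay are assumptions.

# A minimal sketch (assumptions: visiting the list page first supplies the cookies
# the Ajax endpoint checks, and a few seconds between requests is enough of a delay).
# URLs and headers are copied from the script above.
import time
import requests

LIST_URL = ('https://www.lagou.com/jobs/list_%E6%B5%8B%E8%AF%95'
            '?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=')
AJAX_URL = 'https://www.lagou.com/jobs/positionAjax.json?px=new&needAddtionalResult=false'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Referer': LIST_URL,
}

def fetch_page(session, page, keyword=u'测试'):
    # Warm-up GET so the session carries the list-page cookies,
    # then POST the actual query with the same session.
    session.get(LIST_URL, headers=HEADERS)
    res = session.post(AJAX_URL,
                       data={'first': 'true', 'pn': page, 'kd': keyword},
                       headers=HEADERS)
    return res.json()['content']['positionResult']['result']

if __name__ == '__main__':
    with requests.Session() as s:
        for page in range(1, 3):            # two pages as an example
            for job in fetch_page(s, page):
                print(job['companyShortName'], job['salary'])
            time.sleep(3)                   # throttle between pages (assumed interval)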
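To sanity-check the output, the saved .xls can be read back with xlrd. A small verification sketch, assuming the xlrd package is installed; the file name matches the one hard-coded in saveExcel.

# Verification sketch (assumes xlrd is installed; .xls files are supported).
import xlrd

book = xlrd.open_workbook(u'深圳测试_By_dx.xls')
sheet = book.sheet_by_index(0)            # the 'daixiang' sheet created above
print(sheet.nrows - 1, 'rows of job data')
for r in range(1, min(sheet.nrows, 6)):   # skip the header row, show a few records
    print(sheet.row_values(r))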