前程无忧职位信息爬取
作者:互联网
前程无忧职位信息爬取
# coding=UTF-8
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import requests
import bs4
import json
import xlwt
# HTTP headers sent with every request (a desktop Chrome User-Agent string).
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'}

# Search-result URL; ``{}`` receives the page number.
# The query string must contain ``&degreefrom=99``: an earlier copy of this
# script had ``&deg`` decoded as an HTML entity ("°"), corrupting the URL.
SEARCH_URL_TEMPLATE = (
    "https://search.51job.com/list/080200,000000,0000,00,9,99,"
    "%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,{}.html"
    "?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
    "&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
)

# JavaScript assignment that precedes the JSON payload in the search page.
RESULT_MARKER = 'window.__SEARCH_RESULT__ = '

# Spreadsheet header row, in column order.
COLUMN_TITLES = ('公司名称', '职务名称', '薪资', '求职招聘网址', '岗位要求')

# Keys of each search-result entry, written to columns 0..3 in this order.
RESULT_FIELDS = ('company_name', 'job_name', 'providesalary_text', 'job_href')

# Seconds to wait on a single HTTP request before giving up.
REQUEST_TIMEOUT = 30

# Cap on the length of one text cell.
# NOTE(review): xlwt is believed to reject strings longer than 32767
# characters -- confirm against the installed xlwt version.
XLS_MAX_CELL_CHARS = 32767


def build_search_url(page):
    """Return the search-result URL for the given page number."""
    return SEARCH_URL_TEMPLATE.format(page)


def parse_search_results(script_text):
    """Extract the job entries from the text of one ``<script>`` element.

    Returns the list stored under ``engine_search_result`` in the JSON that
    follows ``window.__SEARCH_RESULT__ = ``. Returns an empty list when the
    script does not contain that assignment or when the key is missing or
    null. Raises ``ValueError`` (``json.JSONDecodeError``) if the assignment
    is present but the payload is not valid JSON.
    """
    _, marker, payload = script_text.partition(RESULT_MARKER)
    if not marker:
        # Unrelated script: feeding it to json.loads would raise.
        return []
    data = json.loads(payload.strip().rstrip(';'))
    return data.get('engine_search_result') or []


def fetch_soup(url):
    """Download *url* and return it parsed with BeautifulSoup.

    Network errors propagate to the caller. The response is always closed.
    """
    request = Request(url, headers=header)
    with urlopen(request, timeout=REQUEST_TIMEOUT) as response:
        # Raw bytes are passed on so BeautifulSoup detects the encoding.
        return bs4.BeautifulSoup(response.read(), 'html.parser')


def fetch_job_description(url):
    """Return the whitespace-stripped job description found at *url*.

    Best effort: returns ``' '`` when the page cannot be fetched or does not
    contain the description ``<div>``, so one bad link does not abort the run.
    """
    try:
        soup = fetch_soup(url)
    except (OSError, ValueError):
        # OSError covers URLError/HTTPError/timeouts; ValueError covers an
        # empty or malformed URL rejected by Request.
        return ' '
    node = soup.find('div', {"class": 'bmsg job_msg inbox'})
    if node is None:
        return ' '
    description = "".join(node.get_text().split())
    return description[:XLS_MAX_CELL_CHARS]


def scrape_page(page):
    """Scrape one search-result page and save it as ``前程无忧<page>.xls``."""
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('My Worksheet')
    for column, title in enumerate(COLUMN_TITLES):
        worksheet.write(0, column, label=title)

    soup = fetch_soup(build_search_url(page))
    jobs = []
    for script in soup.find_all('script', type='text/javascript'):
        jobs = parse_search_results(script.get_text())
        if jobs:
            # Only one script carries the results; stop at the first hit so
            # no cell is ever written twice.
            break

    for row, job in enumerate(jobs, start=1):
        for column, field in enumerate(RESULT_FIELDS):
            worksheet.write(row, column, label=job.get(field, ''))
        description = fetch_job_description(job.get('job_href', ''))
        worksheet.write(row, 4, label=description)

    workbook.save('前程无忧{}.xls'.format(page))


def main():
    """Scrape search-result pages 1 through 5, one workbook per page."""
    for page in range(1, 6):
        scrape_page(page)


if __name__ == "__main__":
    main()
标签:write,keys,worksheet,职位,label,爬取,job,前程无忧,data 来源: https://blog.csdn.net/weixin_47149258/article/details/112393396