爬取招聘网站符合关键字的网址
作者:互联网
爬取华为招聘网站符合关键字的网址
# -*- coding:utf-8 -*-
import requests
import re
from http.cookiejar import CookieJar
import json
from bs4 import BeautifulSoup
from urllib import request,parse
from http import cookiejar
# Shared HTTP session; login() issues every request through it.
s = requests.session()
# Accumulates the job page URLs that login() selects.
urlall = []
# Endpoints queried by login(). Query parameters are passed via ``params=``
# instead of also being baked into the URL, so each one is sent exactly once.
_JOB_LIST_URL = "http://career.huawei.com/socRecruitment/services/portal3/portalnew/getJobList/page/15/{0}"
_JOB_DETAIL_URL = "http://career.huawei.com/socRecruitment/services/portal/portalpub/getJobDetail"
# Result URL template, reproduced byte-for-byte from the original script
# (including its repeated ``keywords=linux``).
_JOB_PAGE_URL = "http://career.huawei.com/socRecruitment/soc_index.html#soc/pages/job/jobterminal.html?jobId={0}&language=cn&keywords=linux&keywords=linux"

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10

# Compiled once; raw strings so that ``\+`` and ``\s`` are regex escapes and
# not (invalid) Python string escapes.
_JOB_ID_RE = re.compile(r'"jobId":\s*[0-9]+')
_SCRIPTING_RE = re.compile(r"shell|python", re.I)
_LINUX_RE = re.compile(r"linux", re.I)
_CPP_RE = re.compile(r"c\+\+", re.I)


def _is_wanted(detail_text):
    """Return True if the text mentions linux and shell/python but not C++.

    ``search`` is used rather than ``match('.*...')`` so that keywords on any
    line of a multi-line response are found, not only those on the first line.
    """
    return bool(
        _SCRIPTING_RE.search(detail_text)
        and _LINUX_RE.search(detail_text)
        and not _CPP_RE.search(detail_text)
    )


def login():
    """Collect job page URLs whose job detail text passes the keyword filter.

    Requests job-list pages 1 through 12 for the keyword "linux" through the
    module-level session ``s``, extracts every ``"jobId":<digits>`` occurrence
    from each response body, fetches the detail of each job and, when the
    detail text passes ``_is_wanted``, appends the job page URL to the
    module-level list ``urlall``.

    The intermediate values are printed exactly as in the original script:
    the raw job id matches per page, each job id, each detail response, and
    finally ``urlall``.

    NOTE(review): the original indentation was lost; the final
    ``print(urlall)`` is assumed to run once, after all pages - confirm.

    Returns:
        None. Results are accumulated in ``urlall``.

    Raises:
        requests.exceptions.RequestException: if a request fails or exceeds
            ``_REQUEST_TIMEOUT`` seconds.
    """
    list_params = {
        "jobType": 1,
        "keywords": "linux",
        "orderBy": "P_COUNT_DESC",
    }
    for page in range(1, 13):
        # NOTE(review): verify=False is kept from the original; it disables
        # certificate checks if the request ends up on HTTPS - confirm intent.
        listing = s.get(
            _JOB_LIST_URL.format(page),
            params=list_params,
            verify=False,
            timeout=_REQUEST_TIMEOUT,
        ).text
        jobids = _JOB_ID_RE.findall(listing)
        print(jobids)
        for raw_match in jobids:
            # raw_match looks like '"jobId":12345'; keep only the digits.
            jobid = raw_match.split(":", 1)[1].strip()
            print(jobid)
            response2 = s.get(
                _JOB_DETAIL_URL,
                params={"jobId": jobid},
                verify=False,
                timeout=_REQUEST_TIMEOUT,
            ).text
            print(response2)
            if _is_wanted(response2):
                urlall.append(_JOB_PAGE_URL.format(jobid))
    print(urlall)
# Start the crawl as soon as the module is loaded.
login()
标签:response2,re,http,jobid,爬取,关键字,网址,linux,import 来源: https://blog.csdn.net/weixin_39833509/article/details/94358971