前程无忧——数据分析岗位爬取
作者:互联网
本文介绍如何从前程无忧上爬取数据分析岗位的职位信息,共提取五个字段:职位名称、公司名称、工作地点、薪资和发布时间,并将爬取到的数据保存到 MySQL 数据库中。
import requests
import pymysql
import re


# Fixed part of the 51job search-result URL; the page number and ".html"
# are appended in getHTMLText(). The search keyword is embedded in its
# already percent-encoded form and must not be re-encoded.
_LIST_URL_PREFIX = (
    "https://search.51job.com/list/080200,000000,0000,00,9,99,"
    "%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,"
)

# One match per job row; the five capture groups are, in order, the contents
# of the t1 title attribute, the t2 title attribute, and the t3, t4 and t5
# spans. Compiled once at import time instead of on every get() call.
# NOTE(review): the literal spaces before the t2 and t5 spans are kept exactly
# as published -- confirm against the live page markup.
_JOB_ROW_RE = re.compile(
    r'class="t1 ">.*?<a target="_blank" title="(.*?)".*? '
    r'<span class="t2"><a target="_blank" title="(.*?)".*?'
    r'<span class="t3">(.*?)</span>.*?'
    r'<span class="t4">(.*?)</span>.*? '
    r'<span class="t5">(.*?)</span>',
    re.S,
)

# Connect to the database and (re)create the result table.
# Keyword arguments are required: PyMySQL >= 1.0 no longer accepts the
# connection parameters positionally. utf8mb4 is requested explicitly so the
# scraped Chinese text can be encoded regardless of the driver's default.
db = pymysql.connect(
    host='localhost',
    user='root',
    password='password',
    database='lookforjob',
    charset='utf8mb4',
)
cursor = db.cursor()
cursor.execute('drop table if exists DataAnalyst')
# The "#..." fragments inside this string are MySQL end-of-line comments and
# are sent to the server as part of the statement.
sql = """
create table DataAnalyst
(
    PositionName VARCHAR(40),   #职位名称
    CompanyName VARCHAR(40),    #公司名称
    WorkingPlace VARCHAR(40),   #工作地点
    Salary VARCHAR(40),         #薪资
    ReleaseTime VARCHAR(40)     #发布时间
)
"""

cursor.execute(sql)


def getHTMLText(page):
    """Download one search-result page and return its HTML text.

    Args:
        page: Page number; it is converted with ``str()`` and inserted into
            the listing URL.

    Returns:
        The decoded response body, or ``""`` when the request fails, times
        out (30 s) or returns an HTTP error status.
    """
    url = _LIST_URL_PREFIX + str(page) + '.html'
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode with the encoding detected from the body rather than the
        # one guessed from the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best effort: a failed page yields no rows instead of aborting the
        # run. Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # genuine programming errors propagate.
        return ""


def get(html):
    """Extract job rows from *html* and insert them into ``DataAnalyst``.

    Every regex match is a 5-tuple that is inserted as one table row and
    echoed to stdout. The transaction is committed once after all rows of
    the page have been inserted. An empty *html* simply inserts nothing.

    Args:
        html: HTML text of one search-result page (may be ``""``).
    """
    items = _JOB_ROW_RE.findall(html)

    # The table name uses the same case as the CREATE TABLE statement;
    # table names are case-sensitive on MySQL servers running on
    # case-sensitive file systems.
    insert_sql = """
    INSERT INTO lookforjob.DataAnalyst values(%s,%s,%s,%s,%s)
    """
    for item in items:
        cursor.execute(insert_sql, item)
        print(item)
    db.commit()


if __name__ == "__main__":
    try:
        # Scrape result pages 1 through 6.
        for each in range(1, 7):
            get(getHTMLText(each))
    finally:
        # Release the cursor and the connection even if a page fails.
        cursor.close()
        db.close()
标签:数据分析,cursor,VARCHAR,40,前程无忧,爬取,re,html,__ 来源: https://www.cnblogs.com/lsyb-python/p/11838393.html