Scraping free classrooms from the academic affairs system with Python
Author: Internet
# coding=utf-8
from bs4 import BeautifulSoup        # HTML parsing
import re                            # regular expressions for text matching
import urllib.request, urllib.error  # build requests, fetch page data
import urllib.parse
import xlwt                          # Excel output
import parsel                        # XPath extraction
import pymysql                       # MySQL access
from datetime import datetime
import math

# 1. fetch the pages
# 2. parse the data
# 3. save the data

# building names: 基教/一教/二教/三教 on the main campus (本部), 龙山A-D on the Longshan campus
roomnamelist = ['基教', '一教', '二教', '三教', '龙山A', '龙山B', '龙山C', '龙山D']


def jiaoshi(roomlist):
    # one group of five queries per building, in roomnamelist order
    for i in range(8):
        name1 = roomnamelist[i]
        item1 = roomlist[i]
        main(item1, name1)


def main(item1, name1):
    for i in range(5):
        print("Fetching from the academic affairs system...")
        baseurl = "https://tiedao.vatuu.com/vatuu/CourseAction"
        datalist = getData(baseurl, i, item1)
        savepath = f"{name1}教务系统{i}课表.xls"
        saveData(datalist, savepath)
        print("Saved " + savepath)
        savedatasql(datalist)


# field extractors, keyed to the markup of the query result table
findXh   = re.compile(r'<td>(\d{1,2})</td>')                 # serial number
findX    = re.compile(r'<font color="#000080">(.*?)</font>') # campus
findJxl  = re.compile(r'<td>(.*)</td>')                      # teaching building
findJsmc = re.compile(r'<font color="#0000FF">(.*)</font>')  # room name
findJslx = re.compile(r'<td>(多媒体)</td>')                  # room type ("multimedia")
findRl   = re.compile(r'<td>(\d*)</td>')                     # capacity
findZt   = re.compile(r'<font color="blue">(空闲)</font>')   # status ("free")
findSfkj = re.compile(r'(可借|不可借)')                      # bookable or not
findZc   = re.compile(r'<td>(第.*周)</td>')                  # weeks
findXq   = re.compile(r'(星期.)')                            # day of week
findJc   = re.compile(r'<td>(第.*节)</td>')                  # class periods

# each entry: [day_time_text period mask, school_area_code, building code]
jiaoshilist = [
    [["0000000000011", "1", "13"], ["0000000011100", "1", "13"], ["0000001100000", "1", "13"], ["0000110000000", "1", "13"], ["1111000000000", "1", "13"]],  # 基教
    [["0000000000011", "1", "31"], ["0000000011100", "1", "31"], ["0000001100000", "1", "31"], ["0000110000000", "1", "31"], ["1111000000000", "1", "31"]],  # 一教
    [["0000000000011", "1", "7"],  ["0000000011100", "1", "7"],  ["0000001100000", "1", "7"],  ["0000110000000", "1", "7"],  ["1111000000000", "1", "7"]],   # 二教
    [["0000000000011", "1", "21"], ["0000000011100", "1", "21"], ["0000001100000", "1", "21"], ["0000110000000", "1", "21"], ["1111000000000", "1", "21"]],  # 三教
    [["0000000000011", "2", "36"], ["0000000011100", "2", "36"], ["0000001100000", "2", "36"], ["0000110000000", "2", "36"], ["1111000000000", "2", "36"]],  # 龙山A
    [["0000000000011", "2", "37"], ["0000000011100", "2", "37"], ["0000001100000", "2", "37"], ["0000110000000", "2", "37"], ["1111000000000", "2", "37"]],  # 龙山B
    [["0000000000011", "2", "38"], ["0000000011100", "2", "38"], ["0000001100000", "2", "38"], ["0000110000000", "2", "38"], ["1111000000000", "2", "38"]],  # 龙山C
    [["0000000000011", "2", "39"], ["0000000011100", "2", "39"], ["0000001100000", "2", "39"], ["0000110000000", "2", "39"], ["1111000000000", "2", "39"]],  # 龙山D
]


def getData(baseurl, i, item1):
    # fetch one query page and parse every table row into a field list
    datalist = []
    html = askURL(baseurl, item1[i])           # page source for this query
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.select('tr'):             # one <tr> per classroom row
        data = []                              # all fields for one classroom
        item = str(item)
        xh = re.findall(findXh, item)
        if len(xh) == 2:
            data.append(xh[0])                 # serial number
        else:
            data.append(xh)
        x = re.findall(findX, item)
        data.append(x)                         # campus
        jxl = re.findall(findJxl, item)
        if len(jxl) > 1:
            data.append(jxl[1])                # teaching building
        jsmc = re.findall(findJsmc, item)
        data.append(jsmc)                      # room name
        jslx = re.findall(findJslx, item)
        data.append(jslx)                      # room type
        rl = re.findall(findRl, item)
        if len(rl) == 2:
            data.append(rl[1])                 # capacity
        zt = re.findall(findZt, item)
        data.append(zt)                        # status
        fkj = re.findall(findSfkj, item)
        data.append(fkj)                       # bookable or not
        zc = re.findall(findZc, item)
        data.append(zc)                        # weeks
        xq = re.findall(findXq, item)
        data.append(xq)                        # day of week
        jc = re.findall(findJc, item)
        data.append(jc)                        # class periods
        datalist.append(data)                  # one parsed classroom row
    return datalist
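# ----------------------------------------------------------------------
# Not part of the original script: a minimal sketch of what the regexes
# above pull out of one result row. The sample <tr> is hypothetical
# (room "13-101" is made up) but mimics the shape of the site's markup.
def _demo_parse_row():
    sample = """<tr>
<td>1</td>
<td><font color="#000080">本部</font></td>
<td>基础楼</td>
<td><font color="#0000FF">13-101</font></td>
<td>多媒体</td>
<td>120</td>
<td><font color="blue">空闲</font></td>
<td>可借</td>
<td>第1-16周</td>
<td>星期一</td>
<td>第1-2节</td>
</tr>"""
    print(re.findall(findXh, sample))    # ['1']       serial number
    print(re.findall(findJsmc, sample))  # ['13-101']  room name
    print(re.findall(findZt, sample))    # ['空闲']    status: free
    print(re.findall(findSfkj, sample))  # ['可借']    bookable
# ----------------------------------------------------------------------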
def saveData(datalist, savepath):
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('教务系统课表', cell_overwrite_ok=True)
    col = ("序号", "校区", "教学楼", "教室名称", "教室类型", "容量",
           "状态", "是否可借", "周次", "星期", "节次")
    for i in range(0, 11):
        sheet.write(0, i, col[i])              # header row
    for i in range(3, len(datalist)):          # the first three rows are table chrome
        print("row %d" % i)
        data = datalist[i]
        for j in range(min(11, len(data))):
            if len(data[j]) == 0:              # stop at the first empty field
                break
            # xlwt needs scalars, so unwrap single-element match lists
            value = data[j][0] if isinstance(data[j], list) else data[j]
            sheet.write(i - 2, j, value)       # data rows start at sheet row 1
    book.save(savepath)


def week():
    # scrape the current teaching week number from the query page
    def askURL(url):
        head = {  # spoof a browser request header
            "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/99.0.4844.51 Mobile Safari/537.36"
        }
        request = urllib.request.Request(url, headers=head)
        html = ""
        try:
            response = urllib.request.urlopen(request)
            html = response.read().decode('utf-8')
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        return html

    html = askURL("https://tiedao.vatuu.com/vatuu/CourseAction?setAction=classroomQuery")
    selector = parsel.Selector(html)
    text = selector.xpath('//*[@id="table2"]/tr[1]/td/text()[2]').get().strip()
    return re.search(r'第(\d*)周', text).group(1)   # e.g. "第7周" -> "7"


def askURL(url, query):
    week_no = int(math.pow(2, int(week()) - 1))          # one-hot week mask: 2**(week-1)
    day_no = int(math.pow(2, datetime.now().weekday()))  # 2**weekday, Monday = 0
    # building codes used by the site's form:
    #   基础楼-本部 = 13, 一教-本部 = 31, 二教-本部 = 7, 三教-本部 = 21,
    #   新教A-龙山 = 36, 新教B-龙山 = 37, 新教C-龙山 = 38, 新教D-龙山 = 39
    data = bytes(urllib.parse.urlencode({
        "setAction": "classroomQuery",
        "PageAction": "Query",
        "day_time_text": query[0],
        "school_area_code": query[1],
        "building": query[2],
        "week_no": f"{week_no}",
        "day_no": f"{day_no}",
        "day_time1": "ON",
        "B1": "查询",                # the form's submit-button value ("query")
    }), encoding="utf-8")
    headers = {  # user agent: tells the server what kind of client we are
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) "
                      "Gecko/20100101 Firefox/99.0"
    }
    request = urllib.request.Request(url, headers=headers, data=data, method="POST")
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def savedatasql(datalist):
    conn = pymysql.connect(host='localhost', user='root', password='root',
                           database='test', cursorclass=pymysql.cursors.DictCursor)
    cursor = conn.cursor()
    sql = ('insert into vatuu(xh,area,Jxl,Jsmc,Jslx,Rl,Zt,Sfkj,Zc,Xq,Jc) '
           'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
    for i in range(3, len(datalist)):          # skip the three header rows
        # pymysql needs scalar parameters, so unwrap single-element match lists
        row = tuple((v[0] if v else None) if isinstance(v, list) else v
                    for v in datalist[i])
        try:
            cursor.execute(sql, row)
            conn.commit()
        except Exception as e:
            print('insert failed:', e)
            conn.rollback()
    cursor.close()
    conn.close()


if __name__ == "__main__":
    jiaoshi(jiaoshilist)
    print("Imported into the database")
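The POST body above packs all of the time information into bitmasks: week_no is 2**(week-1), day_no is 2**weekday with Monday as 0, and day_time_text is a 13-character 0/1 string selecting class periods. The five masks per building in jiaoshilist partition all 13 periods, which is why main() issues five queries per building. The sketch below is standalone and not from the original post; the helper names are mine, and treating the leftmost character of day_time_text as period 1 is an unverified assumption.

from datetime import datetime

def encode_week_no(week: int) -> int:
    # teaching week 1 -> 1, week 2 -> 2, week 3 -> 4, ... i.e. 2**(week-1)
    return 1 << (week - 1)

def encode_day_no(now: datetime) -> int:
    # Monday=0 ... Sunday=6, so Monday -> 1, Wednesday -> 4, Sunday -> 64
    return 1 << now.weekday()

def encode_periods(periods, total=13):
    # build a mask like "0000000000011"; the script never documents the bit
    # order, so leftmost-is-period-1 here is an assumption
    bits = ['0'] * total
    for p in periods:
        bits[p - 1] = '1'
    return ''.join(bits)

print(encode_week_no(7))                      # 64
print(encode_day_no(datetime(2022, 5, 25)))   # 4 (that date is a Wednesday)
print(encode_periods([1, 2]))                 # 1100000000000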
Source: https://www.cnblogs.com/ljq20204136/p/16315860.html