编程语言
首页 > 编程语言 > python爬取教务处空闲教室

python爬取教务处空闲教室

作者:互联网

# coding=utf-8
from bs4 import BeautifulSoup #网页解析获取数据
import re#正则表达式。进行文字匹配
import urllib.request,urllib.error#制定URL ,获取网页数据
import xlwt#进行excle操作
import sqlite3#进行数据库操作
import urllib.request
import urllib.parse
import parsel as parsel
import pymysql#导入数据库函数
from datetime import datetime, date
import math
# Pipeline overview:
#   1. fetch the web page
#   2. parse the data
#   3. save the data

# Display names of the teaching buildings; used as the prefix of each
# exported workbook's file name.
roomnamelist = [
    '基教',
    '一教',
    '二教',
    '三教',
    '龙山A',
    '龙山B',
    '龙山C',
    '龙山D',
]
def jiaoshi(jiaoshi):
    """Run the scrape-and-save pipeline once per teaching building.

    Args:
        jiaoshi: Sequence of per-building query presets, in the same order as
            ``roomnamelist``. Each entry is handed unchanged to ``main``.

    Names and presets are paired positionally. If the two sequences differ in
    length, the surplus entries of the longer one are ignored rather than
    raising ``IndexError``.
    """
    # NOTE: the parameter shadows the function name inside this body; the
    # name is kept so existing callers are unaffected.
    for name1, item1 in zip(roomnamelist, jiaoshi):
        main(item1, name1)

def main(item1, name1):
    """Scrape every query preset of one building and persist each result.

    For each preset the result rows are written to an ``.xls`` workbook named
    ``"{name1}教务系统{i}课表.xls"`` and then inserted into the database.

    Args:
        item1: Sequence of query presets for one building; ``getData`` is
            given this sequence together with the index of the preset to use.
        name1: Building display name, used as the workbook file-name prefix.
    """
    baseurl = "https://tiedao.vatuu.com/vatuu/CourseAction"
    # Follow the actual number of presets instead of a hard-coded count.
    for i in range(len(item1)):
        print("开始爬取教务系统网站......")
        datalist = getData(baseurl, i, item1)
        savepath = f"{name1}教务系统{i}课表.xls"
        savaData(datalist, savepath)
        # Report success only after the workbook has actually been written.
        print("保存成功" + savepath)
        savedatasql(datalist)


# Patterns applied to the HTML text of each <tr> of the query result page.
findXh = re.compile(r'<td>(\d{1,2})</td>')                    # sequence number
findX = re.compile(r'<font color="#000080">(.*?)</font>')     # campus
findJxl = re.compile(r'<td>(.*)</td>')                        # teaching building
findJsmc = re.compile(r'<font color="#0000FF">(.*)</font>')   # classroom name
findJslx = re.compile(r'<td>(多媒体)</td>')                    # classroom type
findRl = re.compile(r'<td>(\d*)</td>')                        # capacity
findZt = re.compile(r'<font color="blue">(空闲)</font>')       # status
findSfkj = re.compile(r'(可借|不可借)')                         # borrowable or not
findZc = re.compile(r'<td>(第.*周)</td>')                      # week
findXq = re.compile(r'(星期.)')                                # day of week
findJc = re.compile(r'<td>(第.*节)</td>')                      # class periods

# Values sent as the "day_time_text" form field, one per query.
# NOTE(review): presumably one character per class period -- confirm.
_SLOT_MASKS = (
    "0000000000011",
    "0000000011100",
    "0000001100000",
    "0000110000000",
    "1111000000000",
)

# (school_area_code, building) pairs, in the same order as roomnamelist.
_BUILDINGS = (
    ("1", "13"),
    ("1", "31"),
    ("1", "7"),
    ("1", "21"),
    ("2", "36"),
    ("2", "37"),
    ("2", "38"),
    ("2", "39"),
)

# Presets for area "1", building "13" alone: [day_time_text, area, building].
jijiao = [[mask, "1", "13"] for mask in _SLOT_MASKS]

# One list of presets per building; every building gets every slot mask.
jiaoshilist = [
    [[mask, area, building] for mask in _SLOT_MASKS]
    for area, building in _BUILDINGS
]

def getData(baseurl, i, item1):
    """Fetch one classroom query result page and parse every table row.

    Args:
        baseurl: URL the query is POSTed to.
        i: Index into ``item1`` selecting which query preset to send.
        item1: Sequence of query presets; ``item1[i]`` is passed to ``askURL``.

    Returns:
        list: One entry per ``<tr>`` element in the response. Each entry is a
        list holding, in order: sequence number, campus, teaching building,
        classroom name, classroom type, capacity, status, borrowable flag,
        week, day of week and class periods. Most fields are the raw
        ``re.findall`` result (a list of strings). The building and capacity
        fields are appended only when their pattern matched twice, so a row
        may contain fewer than 11 fields.
    """
    datalist = []
    html = askURL(baseurl, item1[i])  # raw HTML of the result page
    soup = BeautifulSoup(html, "html.parser")
    for row in soup.select('tr'):
        row_html = str(row)
        data = []  # all fields extracted from this table row

        # Sequence number: a plain string when exactly two short numeric
        # cells matched, otherwise the raw match list.
        xh = re.findall(findXh, row_html)
        data.append(xh[0] if len(xh) == 2 else xh)

        data.append(re.findall(findX, row_html))  # campus

        # Teaching building is the second plain <td> match. Require at least
        # two matches so a row with a single match cannot raise IndexError.
        jxl = re.findall(findJxl, row_html)
        if len(jxl) > 1:
            data.append(jxl[1])

        data.append(re.findall(findJsmc, row_html))  # classroom name
        data.append(re.findall(findJslx, row_html))  # classroom type

        # Capacity is the second numeric cell (the first is the sequence number).
        rl = re.findall(findRl, row_html)
        if len(rl) == 2:
            data.append(rl[1])

        data.append(re.findall(findZt, row_html))    # status
        data.append(re.findall(findSfkj, row_html))  # borrowable or not
        data.append(re.findall(findZc, row_html))    # week
        data.append(re.findall(findXq, row_html))    # day of week
        data.append(re.findall(findJc, row_html))    # class periods

        datalist.append(data)
    return datalist
def savaData(datalist, savapath):
    """Write the parsed rows to an ``.xls`` workbook.

    Args:
        datalist: Rows as produced by ``getData``. The first three entries
            are skipped; each remaining row is written starting at sheet
            row 1, directly below the header.
        savapath: Destination path of the workbook.

    For each row, cells are written left to right until the first empty field
    is met; the rest of that row is left blank. Rows with fewer fields than
    there are columns are written as far as they go instead of raising
    ``IndexError``.
    """
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('教务系统课表', cell_overwrite_ok=True)
    col = ("序号", "校区", "教学楼", "教室名称", "教室类型", "容量", "状态", "是否可借", "周次", "星期", "节次")
    for i, title in enumerate(col):
        sheet.write(0, i, title)  # header row
    for i in range(3, len(datalist)):
        print("第%d条" % i)
        data = datalist[i]
        # A row can hold fewer fields than there are columns, so bound the
        # loop by the row length as well.
        for j in range(min(len(col), len(data))):
            if len(data[j]) == 0:
                break
            sheet.write(i + 1 - 3, j, data[j])

    book.save(savapath)
#得到指定一个url的网页内容
def week():
    """Return the week number shown on the classroom query page.

    The page is fetched with a GET request, a text node of the first row of
    the element with id ``table2`` is extracted, and the digits of the first
    ``第N周`` occurrence in it are returned.

    Returns:
        str: The week number as a string of digits.

    Raises:
        ValueError: If the page could not be fetched, or no week number is
            present in the expected place.
    """
    def askURL(url):
        """GET ``url`` and return the decoded body, or "" on a URL error."""
        head = {   # browser-like request header
           "User-Agent":" Mozilla / 5.0(Linux;Android6.0;Nexus5 Build / MRA58N) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 99.0.4844.51Mobile Safari / 537.36"
        }
        request = urllib.request.Request(url, headers=head)
        html = ""
        try:
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(request) as response:
                html = response.read().decode('utf-8')
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        return html

    html = askURL("https://tiedao.vatuu.com/vatuu/CourseAction?setAction=classroomQuery")
    if not html:
        raise ValueError("classroom query page could not be fetched")

    selector = parsel.Selector(html)
    text = selector.xpath('//*[@id="table2"]/tr[1]/td/text()[2]').get()
    if text is None:
        raise ValueError("week banner not found in classroom query page")

    # Take the captured digits directly instead of stringifying the match
    # list and stripping brackets and quotes from it.
    match = re.search(r'第(\d*)周', text.strip())
    if match is None or not match.group(1):
        raise ValueError("no week number found in %r" % text)
    return match.group(1)
def askURL(url, list):
    """POST a classroom query and return the response HTML.

    Args:
        url: Endpoint the form is POSTed to.
        list: Query preset ``[day_time_text, school_area_code, building]``.
            NOTE: the name shadows the built-in ``list``; it is kept so the
            signature stays unchanged.

    Returns:
        str: The response body decoded as UTF-8, or "" when the request
        fails with a ``URLError`` (the error code/reason is printed).
    """
    # week_no is 2**(week - 1) and day_no is 2**weekday (Monday == 0), i.e.
    # a value with a single bit set.
    # NOTE(review): presumably the server reads these as bit masks -- confirm.
    week_no = int(2 ** (int(week()) - 1))
    day_no = 2 ** datetime.now().weekday()

    # Building codes, taken from the site's own script:
    #   main campus:     13 = basic building, 31 = building 1,
    #                    7 = building 2, 21 = building 3
    #   Longshan campus: 36 = new A, 37 = new B, 38 = new C, 39 = new D
    form = {
        "setAction": "classroomQuery",
        "PageAction": "Query",
        "day_time_text": f"{list[0]}",
        "school_area_code": f"{list[1]}",
        "building": f"{list[2]}",
        "week_no": f"{week_no}",
        "day_no": f"{day_no}",
        "day_time1": "ON",
        "B1": "查询",
    }
    data = bytes(urllib.parse.urlencode(form), encoding="utf-8")
    headers = {  # browser-like User-Agent sent with the request
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0"
    }
    request = urllib.request.Request(url, headers=headers, data=data, method="POST")
    html = ""
    try:
        # The context manager closes the HTTP response once it is read.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def savedatasql(datalist):
    """Insert the parsed rows into the ``vatuu`` table of the MySQL database.

    Args:
        datalist: Rows as produced by ``getData``. The first three entries
            are skipped; each remaining row is inserted as one record.

    Each row is committed on its own. A row that fails to insert is reported
    and rolled back, and processing continues with the next row. The cursor
    and the connection are always closed before returning.
    """
    # NOTE(review): connection settings and credentials are hard-coded;
    # consider loading them from configuration.
    conn = pymysql.connect(host='localhost',
                           user='root',
                           password='root',
                           database='test',
                           cursorclass=pymysql.cursors.DictCursor)
    cursor = conn.cursor()
    sql = 'insert into vatuu(xh,area,Jxl,Jsmc,Jslx,Rl,Zt,Sfkj,Zc,Xq,Jc) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    try:
        for row in datalist[3:]:
            data1 = tuple(row)
            try:
                cursor.execute(sql, data1)
                conn.commit()
            except Exception as e:
                # Best effort: report the failing row and carry on.
                print('插入数据失败', e)
                conn.rollback()
    finally:
        # Release the cursor and the connection even if something unexpected
        # escapes the loop.
        cursor.close()
        conn.close()
if __name__ == "__main__":
    # Script entry point: process every building preset, then report completion.
    jiaoshi(jiaoshilist)
    print("成功导入到数据库")

  

标签:week,item,python,爬取,re,html,print,教务处,data
来源: https://www.cnblogs.com/ljq20204136/p/16315860.html