A Hands-On 电影天堂 (dytt8.net) Scraper
#!/usr/bin/python3
# -*- coding:utf-8 -*-
# @Time: 2021/8/28 22:38
# @author: Mrwhite
# @File: 电影天堂爬虫.py
# @DESC:
import re
import urllib.request, urllib.error   # build the request and fetch the page
import xlwt
from bs4 import BeautifulSoup


def main():
    # 电影天堂 home page
    baseurl = "https://dy.dytt8.net/index.htm"
    # 1. Crawl the home page for movie titles, detail-page links and update dates,
    #    then open each detail page for director / cast / Douban score / magnet links / synopsis
    datalist = getData(baseurl)
    # 2. Save the data to an Excel workbook
    saveData(datalist, "电影天堂电影.xls")


# Compile the regular expressions that describe the extraction patterns
findLink = re.compile(r'・\[<a href="/html/gndy/.*<a href="(.*?)">')        # detail-page link
findMovieName = re.compile(r'・\[<a href="/html/gndy/.*">(.*?)</a><br/>')   # movie title
findUpDateTime = re.compile(r'<td class="inddline" width="15%"><font color="#FF0000">(.*?)</font></td>')  # update date
findDirect = re.compile(r'<br />◎导 演 (.*?)<br />')                    # director
findActor = re.compile(r'<br />◎主 演 (.*?)<br /><br />◎标 签')        # cast
findScore = re.compile(r'<br />◎豆瓣评分 (.*?) from')                      # Douban score
findDownloadLink = re.compile(r'<a target="_blank" href="(.*?)">')           # download (magnet) link
findInfo = re.compile(r'◎简 介<br /><br /> (.*?)<br />')                 # synopsis


def getData(baseurl):
    titles, links, updateTimes, directs, actors, scores, downloadLinks, infos = [], [], [], [], [], [], [], []
    # 1. Fetch the index page
    html = askURl(baseurl)
    #print(html)
    # 2. Parse it; note that nth-child has to be written as nth-of-type here
    soup = BeautifulSoup(html, "html.parser")
    item = soup.select("div:nth-of-type(2) > div:nth-of-type(1) > div > div > div.co_content8")
    item = str(item)
    #print(item)
    titles = re.findall(findMovieName, item)        # movie titles
    #links = f'https://dy.dytt8.net/{re.findall(findLink, html)}'
    linksUnSet = re.findall(findLink, item)         # relative links; prepend the site root
    for link in linksUnSet:
        links.append(f'https://dy.dytt8.net{link}')
    updateTimes = re.findall(findUpDateTime, item)  # update dates
    # 3. Visit each detail page for director / cast / Douban score / magnet links / synopsis
    for link in links:
        #print(link)
        html = askURl(link)
        #print(html)
        directUnSet = re.findall(findDirect, html)      # director
        if directUnSet == []:
            directs.append("")
        else:
            direct = directUnSet[0].replace(" ", "").replace("&middot;", "·")
            directs.append(direct)
        actorsUnset = re.findall(findActor, html)       # cast: keep the first three names
        if actorsUnset == []:
            actors.append("")
        else:
            actorList = actorsUnset[0].replace("&middot;", "·").replace(" ", "").replace("\u3000", "").split("<br />")[0:3]
            actor = "/".join(actorList)
            actors.append(actor)
        scoresUnset = re.findall(findScore, html)       # Douban score
        if scoresUnset == []:
            scores.append("无评分")
        else:
            score = scoresUnset[0].split("/")[0]
            scores.append(score)
        downloadLink = re.findall(findDownloadLink, html)   # download links
        downloadLinks.append("\n".join(downloadLink))       # join them so the cell holds a plain string
        infosUnSet = re.findall(findInfo, html)         # synopsis
        if infosUnSet == []:
            infos.append("")
        else:
            info = infosUnSet[0].replace("&middot;", "·").replace(" ", "").replace("“", "")
            infos.append(info)
    dataList = [titles, updateTimes, directs, actors, scores, downloadLinks, infos]
    #print(len(titles), len(updateTimes), len(links), len(directs), len(actors), len(scores), len(downloadLinks), len(infos))
    return dataList


# Fetch the HTML of a single URL
def askURl(url):
    #head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"}
    #request = urllib.request.Request(url, headers=head)
    request = urllib.request.Request(url)
    html = ""   # returned unchanged if the request fails
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("gb2312", errors='ignore')   # the site is GB2312-encoded
        #print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
    return html


# Save the data to an .xls workbook
def saveData(datalist, savepath):
    print("save......")
    book = xlwt.Workbook(encoding="utf8", style_compression=0)
    sheet = book.add_sheet("from电影天堂", cell_overwrite_ok=True)
    col = ('标题', "更新时间", "导演", "主演", "豆瓣评分", "磁力链接", "简介")
    try:
        for j in range(7):                              # j is the column, i the row
            sheet.write(0, j, col[j])                   # header row
            for i in range(1, len(datalist[0]) + 1):    # data rows start at row 1
                sheet.write(i, j, datalist[j][i - 1])
                print("datalist的", i, "行", j, "列的数据为:", datalist[j][i - 1], "成功写入")
        book.save(savepath)                             # save the workbook
    except Exception as e:
        print("datalist的", i, "行", j, "列的数据为:", datalist[j][i - 1], "写入失败")
        print(e)


if __name__ == "__main__":
    main()
    print("爬取完毕")
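To confirm that the workbook was written correctly, it can be read back and spot-checked. The snippet below is a minimal sketch and not part of the original post; it assumes the xlrd package is installed and that the filename matches the one passed to saveData() in main():

import xlrd

# Open the workbook produced by the scraper and print its first few rows
book = xlrd.open_workbook("电影天堂电影.xls")
sheet = book.sheet_by_index(0)
print("rows:", sheet.nrows, "cols:", sheet.ncols)
for i in range(min(sheet.nrows, 4)):   # header row plus the first data rows
    print(sheet.row_values(i))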
The resulting spreadsheet has one row per movie with the columns listed above (the original post showed a screenshot here). The crawler's efficiency can still be improved, for example as sketched below.
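One way to speed it up, shown here as an illustrative sketch rather than part of the original script, is to fetch the per-movie detail pages concurrently instead of one at a time inside getData(). The fetch_all() helper is hypothetical; it reuses askURl() and the links list built above, and the pool size of 8 is an arbitrary choice:

from concurrent.futures import ThreadPoolExecutor

def fetch_all(links, workers=8):
    # Fetch every detail page in a thread pool; map() preserves the order of
    # links, so the per-movie parsing still lines up index by index.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(askURl, links))

# Usage inside getData(): replace the sequential loop over links with
#   pages = fetch_all(links)
#   for html in pages:
#       ...same regex parsing as before...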
Source: https://www.cnblogs.com/mrwhite2020/p/15203355.html