豆瓣top250影视剧爬虫(含完整代码)
作者:互联网
目录
流程
graph LR A(模拟发送请求) --> B(获取并解析数据) B --> C(创建数据库) C --> D(存储数据)
目标网站:https://movie.douban.com/top250?start=
模拟发送请求
调用urllib库
URL(Uniform Resource Locator):统一资源定位符,即网页文件在网络上的存放地址
head:请求头,用来告诉目标网站这个请求是谁发出的,也就是我们的浏览器/设备信息
Google浏览器->快捷键F12(开发者模式)
请求头内容在下图蓝色方框所示位置
import urllib.request  # build the request and fetch the page
import urllib.error    # URLError / HTTPError handling


# Fetch the page source for one URL.
def askURL(url):
    """Fetch *url* and return its source decoded as UTF-8 ("" on failure).

    BUG FIX: the original placeholder ``head = {" "}`` is a *set*, not a
    dict -- ``urllib.request.Request(headers=...)`` requires a mapping, so
    the original crashed.  A browser-like User-Agent is supplied so the
    site does not reject the request as a bot.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/100.0.4896.75 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries .code; a plain URLError only has .reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
获取并解析数据
- 调用BeautifulSoup库和re库
- BeautifulSoup:“靓汤”,把我们爬取的信息资源熬成靓汤
- re(regular expression):正则表达式,能够根据关键词或句式,对文本信息进行搜索匹配
from bs4 import BeautifulSoup  # HTML parsing (split the page apart)
import re                      # regular expressions (extract the fields)


# Crawl and parse all ten list pages.
def getData(baseurl):
    """Crawl the 10 pages of the Top-250 list and parse every movie entry.

    Parameters
    ----------
    baseurl : str
        List URL ending in "start=".  The page offset (0, 25, ... 225)
        is appended for each request.

    Returns
    -------
    list[list[str]]
        One row per movie: [link, image, Chinese title, foreign title,
        score, vote count, one-line quote, staff/summary info].
    """
    datalist = []
    for page in range(10):  # 10 pages x 25 movies = Top 250
        html = askURL(baseurl + str(page * 25))
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            data.append(re.findall(findLink, item)[0])    # detail-page link
            data.append(re.findall(findImgSrc, item)[0])  # poster image
            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                # Chinese title + foreign title; drop the "/" separator
                # and the &nbsp; (\xa0) padding around it.
                data.append(titles[0])
                data.append(re.sub(r'\xa0', "", titles[1].replace("/", "")))
            else:
                data.append(titles[0])
                data.append(' ')  # placeholder: no foreign title
            data.append(re.findall(findRating, item)[0])  # score
            data.append(re.findall(findJudge, item)[0])   # number of raters
            inq = re.findall(findInq, item)
            # Some entries have no quote; strip the trailing full stop.
            data.append(inq[0].replace("。", "") if inq else " ")
            bd = re.findall(findBd, item)[0]
            # BUG FIX: the <br/> pattern was a non-raw string, so "\s" is an
            # invalid escape sequence (SyntaxWarning on Python 3.12+).
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)
            bd = re.sub('/', " ", bd)
            bd = re.sub(r'\xa0', "", bd)
            data.append(bd.strip())  # strip() removes head/tail whitespace
            datalist.append(data)
    return datalist
- 调出获取到的html文件,根据文本特点,设计正则表达式
# html文件 <div class="item"> <div class="pic"> <em class="">1</em> <a href="https://movie.douban.com/subject/1292052/"> <img alt="肖申克的救赎" class="" src="https://img2.doubanio.com/view/photo/s_ratio_poster/public/p480747492.jpg" width="100"/> </a> </div> <div class="info"> <div class="hd"> <a class="" href="https://movie.douban.com/subject/1292052/"> <span class="title">肖申克的救赎</span> <span class="title"> / The Shawshank Redemption</span> <span class="other"> / 月黑高飞(港) / 刺激1995(台)</span> </a> <span class="playable">[可播放]</span> </div> <div class="bd"> <p class=""> 导演: 弗兰克·德拉邦特 Frank Darabont 主演: 蒂姆·罗宾斯 Tim Robbins /...<br/> 1994 / 美国 / 犯罪 剧情 </p> <div class="star"> <span class="rating5-t"></span> <span class="rating_num" property="v:average">9.7</span> <span content="10.0" property="v:best"></span> <span>2591411人评价</span> </div> <p class="quote"> <span class="inq">希望让人自由。</span> </p> </div> </div> </div>
# Regular expressions for scraping each <div class="item"> block.
# NOTE: in the original article this whole section was collapsed onto one
# commented-out line, so none of the patterns were actually defined.
findLink = re.compile(r'<a href="(.*?)">')  # detail-page link; (.*?) is lazy (stops at first match)
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)  # poster URL; re.S lets . match newlines
findTitle = re.compile(r'<span class="title">(.*)</span>')  # Chinese / foreign title spans
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')  # score
findJudge = re.compile(r'<span>(\d*)人评价</span>')  # rating count (digits before "人评价")
findInq = re.compile(r'<span class="inq">(.*)</span>')  # one-line quote
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)  # staff / year / genre paragraph (spans lines)
创建数据库
调用sqlite3数据库
import sqlite3  # SQLite database access


# Create the database table.
def init_db(dbpath):
    """Create the SQLite file *dbpath* (if needed) with the movie250 table.

    Safe to call repeatedly: ``create table if not exists`` is a no-op when
    the table already exists.

    BUG FIX: the original embedded a ``# 创建数据表`` comment *inside* the
    SQL string; SQLite has no ``#`` comment syntax, so that SQL raised a
    syntax error.  The comment now lives outside the string.
    """
    sql = '''
        create table if not exists movie250
        (
        id integer primary key autoincrement,
        info_link text,
        pic_link text,
        cname varchar,
        ename varchar,
        score numeric,
        rated numeric,
        instroduction text,
        info text
        )
    '''  # NOTE: column name "instroduction" (sic) is kept -- inserts use it
    conn = sqlite3.connect(dbpath)  # open or create the database file
    try:
        conn.execute(sql)  # implicit cursor
        conn.commit()
    finally:
        conn.close()  # always release the file handle
保存数据
# Persist the parsed rows.
def saveData2DB(datalist, dbpath):
    """Insert every parsed movie row into the movie250 table of *dbpath*.

    Creates the database/table first via ``init_db()``.

    BUG FIX: the original built the SQL by string concatenation, so any
    title containing a double quote broke the statement (and the code was
    SQL-injectable).  Parameterized queries fix both, and *datalist* is no
    longer mutated in place.
    """
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    try:
        # NOTE: column name "instroduction" (sic) matches the schema.
        conn.executemany(
            '''insert into movie250(
               info_link,pic_link,cname,ename,score,rated,instroduction,info)
               values(?,?,?,?,?,?,?,?)''',
            datalist)
        conn.commit()
    finally:
        conn.close()
完整代码
# -*- coding = utf-8 -*-
# @Time : 2022-04-06 上午 12:15
# @Author : SYSUer
# @File : crawler.py
# @Software : PyCharm
# 学习参考 https://www.bilibili.com/video/BV12E411A7ZQ?p=15
from bs4 import BeautifulSoup # 网页解析, 获取数据(拆分)
import re # 正则表达式,进行文字匹配(提炼)
import urllib.request, urllib.error # 指定URL,获取网页数据
import sqlite3 # SQLite数据库操作
def main():
    """Entry point: crawl the Douban Top-250 list and store it in SQLite."""
    baseurl = "https://movie.douban.com/top250?start="
    # 1. Crawl and parse all ten pages.
    datalist = getData(baseurl)
    # 2. Persist the rows (the database/table is created on demand).
    dbpath = "movie.db"
    saveData2DB(datalist, dbpath)
findLink = re.compile(r'<a href="(.*?)">') # detail-page link; (.*?) is lazy -- stops at the first '">'
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S) # poster URL; re.S lets . match newlines
findTitle = re.compile(r'<span class="title">(.*)</span>') # movie title span(s): Chinese and optional foreign title
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>') # score, e.g. "9.7"
findJudge = re.compile(r'<span>(\d*)人评价</span>') # rating count: the digits before "人评价" ("people rated")
findInq = re.compile(r'<span class="inq">(.*)</span>') # one-line quote
findBd = re.compile(r'<p class="">(.*?)</p>', re.S) # staff / year / genre paragraph; re.S because it spans lines
# Crawl and parse the list pages.
def getData(baseurl):
    """Crawl the 10 pages of the Top-250 list and parse every movie entry.

    Parameters
    ----------
    baseurl : str
        List URL ending in "start=".  The page offset (0, 25, ... 225)
        is appended for each request.

    Returns
    -------
    list[list[str]]
        One row per movie: [link, image, Chinese title, foreign title,
        score, vote count, one-line quote, staff/summary info].
    """
    datalist = []
    for page in range(10):  # 10 pages x 25 movies = Top 250
        html = askURL(baseurl + str(page * 25))
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            data.append(re.findall(findLink, item)[0])    # detail-page link
            data.append(re.findall(findImgSrc, item)[0])  # poster image
            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                # Chinese title + foreign title; drop the "/" separator
                # and the &nbsp; (\xa0) padding around it.
                data.append(titles[0])
                data.append(re.sub(r'\xa0', "", titles[1].replace("/", "")))
            else:
                data.append(titles[0])
                data.append(' ')  # placeholder: no foreign title
            data.append(re.findall(findRating, item)[0])  # score
            data.append(re.findall(findJudge, item)[0])   # number of raters
            inq = re.findall(findInq, item)
            # Some entries have no quote; strip the trailing full stop.
            data.append(inq[0].replace("。", "") if inq else " ")
            bd = re.findall(findBd, item)[0]
            # BUG FIX: the <br/> pattern was a non-raw string, so "\s" is an
            # invalid escape sequence (SyntaxWarning on Python 3.12+).
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)
            bd = re.sub('/', " ", bd)
            bd = re.sub(r'\xa0', "", bd)
            data.append(bd.strip())  # strip() removes head/tail whitespace
            datalist.append(data)
    return datalist
def askURL(url):
    """Fetch *url* and return the page source decoded as UTF-8.

    Returns an empty string when the request fails (the error is printed).

    BUG FIX: the original used ``head = {" "}``, which is a *set*, not a
    dict -- ``urllib.request.Request(headers=...)`` requires a mapping and
    crashed with AttributeError.  A browser-like User-Agent is sent so
    douban.com does not reject the request as a bot.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/100.0.4896.75 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries .code; a plain URLError only has .reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Persist the parsed rows.
def saveData2DB(datalist, dbpath):
    """Insert every parsed movie row into the movie250 table of *dbpath*.

    Creates the database/table first via ``init_db()``.

    BUG FIX: the original built the SQL by string concatenation, so any
    title containing a double quote broke the statement (and the code was
    SQL-injectable).  Parameterized queries fix both, and *datalist* is no
    longer mutated in place.
    """
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    try:
        # NOTE: column name "instroduction" (sic) matches init_db's schema.
        conn.executemany(
            '''insert into movie250(
               info_link,pic_link,cname,ename,score,rated,instroduction,info)
               values(?,?,?,?,?,?,?,?)''',
            datalist)
        conn.commit()
    finally:
        conn.close()
# Create the database table.
def init_db(dbpath):
    """Create the SQLite file *dbpath* (if needed) with the movie250 table.

    Safe to call repeatedly: ``create table if not exists`` is a no-op when
    the table already exists.  The connection is now released via
    try/finally even when execution fails (the original leaked the handle
    on error and never closed its cursor).
    """
    sql = '''
        create table if not exists movie250
        (
        id integer primary key autoincrement,
        info_link text,
        pic_link text,
        cname varchar,
        ename varchar,
        score numeric,
        rated numeric,
        instroduction text,
        info text
        )
    '''  # NOTE: column name "instroduction" (sic) is kept -- inserts use it
    conn = sqlite3.connect(dbpath)  # open or create the database file
    try:
        conn.execute(sql)  # Connection.execute opens an implicit cursor
        conn.commit()
    finally:
        conn.close()  # always release the file handle
if __name__ == '__main__': # run only when executed as a script
    # NOTE(review): init_db("movietest.db") creates a separate test database
    # that main() never touches (main() writes to "movie.db") -- this looks
    # like leftover debug code; confirm before removing.
    init_db("movietest.db")
    main()
效果展示
参考
标签:bd,item,data,爬虫,re,豆瓣,top250,html,append 来源: https://www.cnblogs.com/Code--geass/p/16128131.html