爬虫学习
作者:互联网
记录一下爬虫学习
最近在学习爬虫,学了一点就赶紧记录下来:爬取豆瓣 Top250 的部分内容,并把结果保存到 xls 表格中的一个小实现。
import urllib.request
from bs4 import BeautifulSoup
import re
import xlwt
# Base URL of the Douban Top-250 movie list; the page offset (0, 25, 50, ...) is appended by the caller.
url="https://movie.douban.com/top250?start="
findlink =re.compile(r'<a href="(.*?)">') # regex: movie detail-page link
findimg =re.compile(r'<img alt=".*" class="" src="(.*?)" width="100"/>') # regex: poster image URL
findname =re.compile(r'<span class="title">(.*?)</span>') # regex: movie title
findcontent=re.compile(r'<p class="">(.*?)</p>',re.S) # regex: description paragraph; re.S lets '.' span newlines
def askurl(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Sends a desktop-browser User-Agent header so Douban does not reject
    the request as a bot.
    """
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"}
    request = urllib.request.Request(url=url, headers=head)
    # 'with' ensures the HTTP response is closed (the original leaked it).
    with urllib.request.urlopen(request) as respond:
        return respond.read().decode('utf-8')
def getdata(alldata, baseurl):
    """Scrape all 10 pages of the Douban Top-250 list.

    For each movie item found, appends [detail_link, image_url, title]
    to *alldata* (mutated in place; nothing is returned).
    *baseurl* is the list URL without the page offset.
    """
    for page in range(10):  # 10 pages x 25 movies = 250 entries
        html = askurl(baseurl + str(page * 25))
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            item = str(item)
            links = re.findall(findlink, item)
            images = re.findall(findimg, item)
            names = re.findall(findname, item)
            # Guard against malformed items: the original indexed [0]
            # unconditionally and would crash on a non-matching block.
            if not (links and images and names):
                continue
            # NOTE: the original also extracted findcontent but never
            # stored it anywhere; that dead work has been removed.
            alldata.append([links[0], images[0], names[0]])
def storeData(alldata):
    """Save the scraped rows to '111.xls', one spreadsheet row per movie.

    Each element of *alldata* is a list of cell values (link, image, title).
    """
    book = xlwt.Workbook(encoding='utf-8')
    word = book.add_sheet('hellp', cell_overwrite_ok=True)
    # Iterate the actual data instead of a hardcoded range(250): the
    # original raised IndexError whenever fewer than 250 rows were scraped.
    for i, record in enumerate(alldata):
        for j, value in enumerate(record):
            word.write(i, j, value)
    book.save('111.xls')
def main():
    """Entry point: scrape the Top-250 list and write it to an .xls file."""
    rows = []
    getdata(rows, url)
    storeData(rows)


main()
标签:item,url,request,爬虫,学习,re,data,alldata 来源: https://blog.csdn.net/qq_46540840/article/details/114991849