首页 > 其他分享 > 爬某豆瓣读书 Top 250

爬某豆瓣读书 Top 250

2022-08-31 19:00:28 作者：互联网

import re
import time

import requests
from lxml import etree

# Scrape the Douban Books Top 250 list and print one summary line per book.

# Browser-like User-Agent sent with every request (the value a browser shows
# on its about://version page).
HEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.70'}

BASE_URL = 'https://book.douban.com/top250?start='
PAGE_SIZE = 25        # the `start` offset advances by 25 per page
TOTAL_BOOKS = 250     # Top 250 => offsets 0, 25, ..., 225 (10 pages)
REQUEST_TIMEOUT = 10  # seconds; avoids hanging forever on a stalled connection
PAGE_DELAY = 1        # seconds to pause between page requests

# Field XPaths, evaluated relative to one book row.
ROW_XPATH = '//tr[.//*[@class="pl2"]]'
TITLE_XPATH = './/*[@class="pl2"]/a/@title'                    # book title
INFO_XPATH = './/*[@valign="top"]/p/text()'                    # author, publisher, date
RATING_XPATH = './/*[@class="star clearfix"]/span[2]/text()'   # rating score
VOTES_XPATH = './/*[@class="star clearfix"]/span[3]/text()'    # number of ratings
QUOTE_XPATH = './/*[@class="inq"]/text()'                      # one-line quote
IMAGE_XPATH = './/*[@class="nbg"]/img/@src'                    # cover image URL


def page_offsets(total=TOTAL_BOOKS, page_size=PAGE_SIZE):
    """Return the `start` offsets of every list page: 0, 25, ..., 225 by default."""
    return list(range(0, total, page_size))


def squash_whitespace(text):
    """Return *text* with all whitespace removed.

    The rating count arrives spread over several lines, e.g.
    "(\\n      66963人评价\\n   )"; this collapses it to "(66963人评价)".
    """
    return ''.join(str(text).split())


def first_text(values, default=''):
    """Return the first non-blank item of *values*, stripped, else *default*."""
    for value in values:
        cleaned = str(value).strip()
        if cleaned:
            return cleaned
    return default


def parse_books(html):
    """Parse one list page and return a list of dicts, one per book.

    Every field is looked up relative to the book's own row, so a book that
    has no quote yields an empty string instead of shifting the quotes of
    all following books (which is what zipping page-wide lists did).
    """
    tree = etree.HTML(html)
    if tree is None:
        return []
    books = []
    # NOTE(review): assumes each book sits in its own <tr> that contains the
    # "pl2" title block -- confirm against the live page markup.
    for row in tree.xpath(ROW_XPATH):
        books.append({
            'title': first_text(row.xpath(TITLE_XPATH)),
            'info': first_text(row.xpath(INFO_XPATH)),
            'rating': first_text(row.xpath(RATING_XPATH)),
            'votes': squash_whitespace(first_text(row.xpath(VOTES_XPATH))),
            'quote': first_text(row.xpath(QUOTE_XPATH)),
            'image': first_text(row.xpath(IMAGE_XPATH)),
        })
    return books


def format_book(book):
    """Return the printable summary line for one parsed *book* dict."""
    return (
        f"书名:{book['title']},作者,出版社,日期:{book['info']},"
        f"评分:{book['rating']} ,评价{book['votes']},"
        f"名句:{book['quote']} 图片地址:{book['image']}"
    )


def fetch_page(offset):
    """Download the list page starting at *offset* and return its HTML text.

    Raises requests.HTTPError on a non-2xx response so that a blocked or
    failed request is reported instead of being parsed as an empty page.
    """
    response = requests.get(BASE_URL + str(offset), headers=HEADERS,
                            timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def main():
    """Fetch every page of the Top 250 list and print one line per book."""
    offsets = page_offsets()
    for index, offset in enumerate(offsets):
        for book in parse_books(fetch_page(offset)):
            print(format_book(book))
        # Pause between requests, but not after the final page.
        if index < len(offsets) - 1:
            time.sleep(PAGE_DELAY)


if __name__ == '__main__':
    main()

标签：xpath,lp,text,Top,class,豆瓣,str,250,qwe
来源： https://www.cnblogs.com/xxh12/p/16644230.html