
Scraping Dangdang (当当网) with XPath



import requests
from lxml import etree
#from itertools import chain
#import json # read a JSON file returned by an API
import time
'''
Scrape the Dangdang book bestseller list with XPath
'''

def main():
	#r = []
	d = {}

	for i in range(1,26):
		resp = requests.get('http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-recent7-0-0-1-'+str(i))
		res_xpath = etree.HTML(resp.text)
		title = res_xpath.xpath('//div[@class="name"]/a/text()') # book title
		author = res_xpath.xpath('//div[@class="publisher_info"]/a[@title]/text()') # author
		price = res_xpath.xpath('//div[@class="price"]/p/span[@class="price_n"]/text()') # price
		#publisher = res_xpath.xpath('//div[@class="publisher_info"]/a[@href]/text()') # publisher
		star = res_xpath.xpath('//div[@class="star"]/span[@class="tuijian"]/text()') # recommendation rate
		date = res_xpath.xpath('//div[@class="publisher_info"]/span/text()') # publication date
		d.setdefault('_title',[]).append(title) # collect each page's result lists into the dict
		d.setdefault('_author',[]).append(author)
		d.setdefault('_price',[]).append(price)
		d.setdefault('_star',[]).append(star)
		d.setdefault('_date',[]).append(date)
		print('Page %s scraped!' % i)

		#r = list(chain(r,rst))
		time.sleep(1)
	print(d)
	# with open('D:/r/book4.txt','a+',encoding='utf-8') as f:
	# 	for i in range(0,len(r)):
	# 		f.write(str(i+1)+'. '+r[i]+'\n')
	# 	# data_model = json.loads(i)
	# 	# for news in data_model:
	# 	# 	print(news)
	# 	# #print(rst)

	# 	print('All results written!')
		


if __name__ == '__main__':
    main()
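
The commented-out block above hints at writing the results to a text file, but it relies on the unused list r. Below is a minimal post-processing sketch, not part of the original post: it flattens the per-page lists collected in d into one line per book and writes them out. The function name write_results and the output path books.txt are placeholders of my own, and the sketch assumes the title, author, and price lists stay aligned index by index.

from itertools import chain

def write_results(d, path='books.txt'):
	# Flatten the list-of-lists built by main(): one inner list per page.
	titles = list(chain.from_iterable(d['_title']))
	authors = list(chain.from_iterable(d['_author']))
	prices = list(chain.from_iterable(d['_price']))
	with open(path, 'w', encoding='utf-8') as f:
		# zip() stops at the shortest list, so entries missing a field are dropped.
		for n, (t, a, p) in enumerate(zip(titles, authors, prices), start=1):
			f.write('%d. %s | %s | %s\n' % (n, t.strip(), a.strip(), p.strip()))
	print('All results written!')

To use it, have main() return d and call write_results(main()), since main() currently only prints the dict.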

Tags: xpath, setdefault, res, Dangdang, scraping, text, div, class
Source: https://blog.csdn.net/weixin_44200553/article/details/120169825