Scrapy Crawler: Scraping Book Data from Dangdang
Author: Internet
1. Create the project
scrapy startproject scrapy_dangdang
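startproject generates the standard Scrapy project skeleton; the layout should look roughly like this:

scrapy_dangdang/
    scrapy.cfg                  # deploy configuration
    scrapy_dangdang/
        __init__.py
        items.py                # item (data structure) definitions
        middlewares.py          # spider / downloader middlewares
        pipelines.py            # item pipelines
        settings.py             # project settings
        spiders/
            __init__.py         # spider files (dang.py below) go here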
2. Create a spider file
Target URL: http://category.dangdang.com/cp01.01.02.00.00.00.html
scrapy genspider dang category.dangdang.com

Note: genspider takes the spider name and a domain rather than the full URL; the start_urls entry in the generated spider is then edited by hand to point at the category page above.
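The command creates scrapy_dangdang/spiders/dang.py with a skeleton roughly like the following (exact template output varies slightly by Scrapy version):

import scrapy

class DangSpider(scrapy.Spider):
    name = 'dang'
    allowed_domains = ['category.dangdang.com']
    start_urls = ['http://category.dangdang.com/']

    def parse(self, response):
        pass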
3. Code for each part
The dang.py file
import scrapy
from scrapy_dangdang.items import ScrapyDangdangItem


class DangSpider(scrapy.Spider):
    name = 'dang'
    # For multi-page crawls, allowed_domains normally lists only the domain
    allowed_domains = ['category.dangdang.com']
    start_urls = ['http://category.dangdang.com/cp01.01.02.00.00.00.html']
    base_url = 'http://category.dangdang.com/pg'
    page = 1

    def parse(self, response):
        # pipelines: persist/download the data
        # items: define the data structure
        # src   = '//ul[@id="component_59"]/li//img/@src'
        # alt   = '//ul[@id="component_59"]/li//img/@alt'
        # price = '//ul[@id="component_59"]/li//p[@class="price"]/span[1]/text()'
        # Every Selector object can call .xpath() again
        li_list = response.xpath('//ul[@id="component_59"]/li')
        print('===================== fetching data ================================')
        for li in li_list:
            # The images are lazy-loaded: the first image's tag attributes
            # differ from the rest. Its src is directly usable, while the
            # other images keep the real URL in data-original
            src = li.xpath('.//img/@data-original').extract_first()
            if not src:
                src = li.xpath('.//img/@src').extract_first()
            name = li.xpath('.//img/@alt').extract_first()
            price = li.xpath('.//p[@class="price"]/span[1]/text()').extract_first()
            book = ScrapyDangdangItem(src=src, name=name, price=price)
            # Yield each book so it is handed to the pipelines
            yield book

        # Every page follows the same scraping logic, so we only need to
        # request the next page and let parse handle it again
        if self.page < 10:
            self.page = self.page + 1
            url = self.base_url + str(self.page) + '-cp01.01.02.00.00.00.html'
            # scrapy.Request issues a GET request; callback re-enters parse
            yield scrapy.Request(url=url, callback=self.parse)
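The XPath expressions above can be verified interactively with scrapy shell before running the full crawl, for example (assuming the page still serves this markup):

scrapy shell http://category.dangdang.com/cp01.01.02.00.00.00.html
>>> li_list = response.xpath('//ul[@id="component_59"]/li')
>>> li_list[0].xpath('.//img/@alt').extract_first()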
The items.py file
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapyDangdangItem(scrapy.Item):
    # Define the fields for your item here, i.e. the data you want to scrape
    # Cover image URL
    src = scrapy.Field()
    # Book title
    name = scrapy.Field()
    # Price
    price = scrapy.Field()
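A scrapy.Item behaves like a dict keyed by the declared fields, which is why the spider can construct it with keyword arguments and the pipeline can read values with item.get(). A minimal sketch (the sample values are made up):

from scrapy_dangdang.items import ScrapyDangdangItem

# hypothetical values, for illustration only
book = ScrapyDangdangItem(src='//img.example.com/cover.jpg',
                          name='example-title', price='19.90')
print(book['name'])    # dict-style field access
print(dict(book))      # convert to a plain dict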
The pipelines.py file
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import os
import urllib.request

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


# To use a pipeline, it must be enabled in settings.py (ITEM_PIPELINES)
class ScrapyDangdangPipeline:
    # Runs once, before the spider starts
    def open_spider(self, spider):
        self.fp = open('book.json', 'w', encoding='utf-8')

    # item is the book object yielded by the spider
    def process_item(self, item, spider):
        # The pattern below is NOT recommended: it reopens the file for every
        # item that comes through, which is far too frequent
        # (1) write() takes a string, not an arbitrary object
        # (2) 'w' mode would truncate the file on every open, so 'a' mode
        #     would be needed instead
        # with open('book.json', 'a', encoding='utf-8') as fp:
        #     fp.write(str(item))
        self.fp.write(str(item))
        return item

    # Runs once, after the spider finishes
    def close_spider(self, spider):
        self.fp.close()


# Multiple pipelines:
# (1) define the pipeline class (here: downloading the cover images)
# (2) enable it in settings.py
class DangDangDownloadPipeline:
    def process_item(self, item, spider):
        url = 'http:' + item.get('src')
        # urlretrieve does not create missing directories, so make sure
        # the target folder exists
        os.makedirs('./books', exist_ok=True)
        filename = './books/' + item.get('name') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item
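Note that str(item) writes a Python repr, so book.json is not actually valid JSON. A sketch of an alternative (the class name ScrapyDangdangJsonPipeline is mine, not part of the original project) that writes one JSON object per line instead:

import json
from itemadapter import ItemAdapter


class ScrapyDangdangJsonPipeline:
    def open_spider(self, spider):
        self.fp = open('book.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ItemAdapter.asdict() turns the Item into a plain dict,
        # which json.dumps can serialize
        line = json.dumps(ItemAdapter(item).asdict(), ensure_ascii=False)
        self.fp.write(line + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()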
The settings.py file
# Scrapy settings for scrapy_dangdang project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapy_dangdang'

SPIDER_MODULES = ['scrapy_dangdang.spiders']
NEWSPIDER_MODULE = 'scrapy_dangdang.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapy_dangdang (+http://www.yourdomain.com)'

# Obey robots.txt rules (left commented out, so robots.txt is ignored)
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapy_dangdang.middlewares.ScrapyDangdangSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scrapy_dangdang.middlewares.ScrapyDangdangDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Enable the pipelines
ITEM_PIPELINES = {
    # There can be many pipelines; each has a priority in the range 1 to
    # 1000, and the smaller the value, the higher the priority
    'scrapy_dangdang.pipelines.ScrapyDangdangPipeline': 300,
    'scrapy_dangdang.pipelines.DangDangDownloadPipeline': 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
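With the pipelines enabled, the crawl is started from inside the project directory. Scrapy can also serialize the items itself through its feed exports, which avoids the hand-written file pipeline entirely:

cd scrapy_dangdang
scrapy crawl dang
# or let Scrapy export the items directly:
scrapy crawl dang -o books.json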
Full code: https://gitee.com/heating-cloud/python_spider.git
Source: https://www.cnblogs.com/ckfuture/p/16327206.html