其他分享
首页 > 其他分享> > scrapy_分布式_reids

scrapy_分布式_reids

作者:互联网

进阶式练习

settings.py
设置这里一如既往常规

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
ITEM_PIPELINES = {
   'moviePro.pipelines.MovieproPipeline': 300,
}

movie.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from redis import Redis
from moviePro.items import MovieproItem
class MovieSpider(CrawlSpider):
    name = 'movie'
    # allowed_domains = ['www.ccc.com']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']

    rules = (
        Rule(LinkExtractor(allow=r'/frim/index1-\d+\.html'), callback='parse_item', follow=True),
    )
    # 创建redis链接对象
    conn = Redis(host='127.0.0.1', port=6379)

    #用于解析每一个页码对应页面中的电影详情页的url
    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            # 获取详情页的url
            detail_url = 'https://www.4567tv.tv' + li.xpath('./div/a/@href').extract_first()

            # 将详情页的url存入redis的set中
            ex = self.conn.sadd('urls', detail_url)
            if ex == 1:
                print('该url没有被爬取过,可以进行数据的爬取')
                yield scrapy.Request(url=detail_url, callback=self.parst_detail)
            else:
                print('数据还没有更新,暂无新数据可爬取!')

    # 解析详情页中的电影名称和类型,进行持久化存储
    def parst_detail(self, response):
        item = MovieproItem()
        item['name'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
        item['desc'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]//text()').extract()
        item['desc'] = ''.join(item['desc'])
        yield item

items.py

import scrapy


class MovieproItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    desc = scrapy.Field()
    # pass

piplines.py

from redis import Redis
class MovieproPipeline(object):
    conn = None
    def open_spider(self,spider):
        self.conn = spider.conn
    def process_item(self, item, spider):
        dic = {
            'name':item['name'],
            'desc':item['desc']
        }
        # print(dic)
        self.conn.lpush('movieData',dic)
        return item

标签:url,reids,redis,item,scrapy,详情页,div,分布式
来源: https://blog.csdn.net/qq_39799322/article/details/115803945