Building Distributed Scrapy Spiders with Redis

RedisSpider Overview

RedisSpider extends Scrapy's base Spider class to enable distributed crawling. Instead of using a static start_urls list, the spider reads its start URLs from a Redis list identified by redis_key, so any number of spider processes can share the same queue.

Key Differences from Standard Spider

The main modifications involve imports, inheritance, and replacing the static URL list with a Redis key:

from scrapy_redis.spiders import RedisSpider

class BookSpider(RedisSpider):
    """Reads URLs from Redis queue."""
    name = 'book_spider'
    redis_key = 'book:start_urls'
    allowed_domains = ['dangdang.com', 'p.3.cn']

    def parse(self, response):
        pass
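
With this template, start URLs go into the Redis list named by redis_key, so the spider is seeded with a command such as the following (the URL here is only a placeholder):

redis-cli lpush book:start_urls http://example.com/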

Required Settings Configuration

Add the following to settings.py to enable distributed scheduling:

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
REDIS_URL = "redis://127.0.0.1:6379"

ITEM_PIPELINES = {
    'book.pipelines.BookPipeline': 300,
}
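
The ITEM_PIPELINES entry points at the project's own pipeline, book.pipelines.BookPipeline. Its contents depend on where you want to store the data; a minimal sketch (an illustration, not the original implementation) that simply stamps each item before it is stored might look like this:

# book/pipelines.py
from datetime import datetime

class BookPipeline:
    """Example pipeline: annotate each scraped item with crawl metadata."""

    def process_item(self, item, spider):
        item["crawled_at"] = datetime.utcnow().isoformat()
        item["spider"] = spider.name
        return item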

Complete RedisSpider Example: Crawling Dangdang Books

Project Setup

scrapy startproject book
cd book
scrapy genspider dangdang dangdang.com

Spider Implementation

# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy
from scrapy_redis.spiders import RedisSpider

class DangdangSpider(RedisSpider):
    name = 'dangdang'
    allowed_domains = ['dangdang.com']
    redis_key = "dangdang"

    def parse(self, response):
        category_divs = response.xpath("//div[@class='con flq_body']/div")
        for div in category_divs:
            item = {}
            item["top_category"] = div.xpath("./dl//a/text()").extract()
            dl_elements = div.xpath("./div//dl[@class='inner_dl']")
            for dl in dl_elements:
                item["mid_category"] = dl.xpath("./dt/a/text()").extract_first()
                links = dl.xpath("./dd/a")
                for link in links:
                    item["sub_category"] = link.xpath("./@title").extract_first()
                    item["sub_category_url"] = link.xpath("./@href").extract_first()
                    if item["sub_category_url"]:
                        yield scrapy.Request(
                            item["sub_category_url"],
                            callback=self.parse_book_list,
                            meta={"item": deepcopy(item)}
                        )

    def parse_book_list(self, response):
        parent_item = response.meta["item"]
        book_items = response.xpath("//ul[@class='bigimg']/li")

        for book in book_items:
            # Start from a fresh copy so each book yields its own item dict
            # instead of mutating and re-yielding the same object.
            item = deepcopy(parent_item)
            item["cover_image"] = book.xpath("./a/img/@data-original").extract_first()
            if item["cover_image"] is None:
                item["cover_image"] = book.xpath("./a/img/@src").extract_first()

            item["book_url"] = book.xpath("./a/@href").extract_first()
            item["title"] = book.xpath("./p[@class='name']/a/@title").extract_first()
            item["description"] = book.xpath("./p[@class='detail']/text()").extract_first()
            item["price"] = book.xpath(".//span[@class='search_now_price']/text()").extract_first()
            item["authors"] = book.xpath("./p[@class='search_book_author']/span[1]/a/text()").extract()
            item["publish_date"] = book.xpath("./p[@class='search_book_author']/span[2]/text()").extract_first()
            item["publisher"] = book.xpath("./p[@class='search_book_author']/span[3]/a/text()").extract_first()
            yield item

        next_page = response.xpath("//li[@class='next']/a/@href").extract_first()
        if next_page:
            # response.urljoin resolves relative hrefs against the current page URL
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse_book_list,
                meta={"item": response.meta["item"]}
            )

Running the Distributed Spider

Start the spider in multiple terminals, or on multiple machines. Each process will block, waiting until URLs are added to Redis. Note that every instance must point REDIS_URL at the same Redis server; the 127.0.0.1 address above only works when all processes run on one host:

scrapy crawl dangdang

Push the starting URL to Redis:

redis-cli lpush dangdang http://book.dangdang.com/

Once URLs are in Redis, all running instances will begin crawling simultaneously, distributing the workload across machines.
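
The queue can also be seeded from Python using the redis client library; a short sketch, assuming Redis runs on 127.0.0.1:6379 as configured in settings.py:

import redis

# Connect to the same Redis instance the spiders read from.
client = redis.Redis(host="127.0.0.1", port=6379)

# Push one or more start URLs onto the list named by redis_key.
client.lpush("dangdang", "http://book.dangdang.com/")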

RedisCrawlSpider Overview

RedisCrawlSpider extends CrawlSpider with Redis-based URL distribution: it reads its start URLs from Redis like RedisSpider, while also supporting automatic link following through Rule definitions. It relies on the same settings.py configuration shown above.

Basic Template

from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from scrapy_redis.spiders import RedisCrawlSpider

class MyCrawler(RedisCrawlSpider):
    name = 'mycrawler_redis'
    redis_key = 'mycrawler:start_urls'
    allowed_domains = ['example.com']

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        pass

Complete RedisCrawlSpider Example: Crawling Amazon Books

Project Setup

scrapy genspider -t crawl amazon amazon.cn

Spider Implementation

# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from scrapy_redis.spiders import RedisCrawlSpider

class AmazonSpider(RedisCrawlSpider):
    name = 'amazon'
    allowed_domains = ['amazon.cn']
    redis_key = "amazon"

    rules = (
        Rule(LinkExtractor(
            restrict_xpaths=["//div[@class='categoryRefinementsSection']/ul/li"]
        ), follow=True),
        Rule(LinkExtractor(
            restrict_xpaths=["//div[@id='mainResults']//h2/.."]
        ), callback="parse_book_detail")
    )

    def parse_book_detail(self, response):
        item = {}
        page_title = response.xpath("//title/text()").extract_first()
        item["is_ebook"] = "Kindle" in page_title if page_title else False
        item["title"] = response.xpath("//span[contains(@id,'productTitle')]/text()").extract_first()
        item["publish_date"] = response.xpath("//h1[@id='title']/span[3]/text()").extract_first()
        item["authors"] = response.xpath("//div[@id='byline']/span/a/text()").extract()
        item["price"] = response.xpath("//div[@id='soldByThirdParty']/span/text()").extract()
        
        if item["is_ebook"]:
            item["price"] = response.xpath("//tr[@class='kindle-price']/td/text()").extract()
        
        item["publisher"] = response.xpath("//b[text()='出版社:']/../text()").extract_first()
        item["categories"] = response.xpath("//ul[@class='zg_hrsr']/li[1]/span[2]//a/text()").extract()
        yield item

Running the Spider

scrapy crawl amazon

Push the starting URL to Redis:

redis-cli lpush amazon https://www.amazon.cn/s/ref=lp_658390051_nr_n_4/462-2558471-4466339
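
To confirm the URL was queued, and later to watch the queue drain as spiders pick up work, check the list length:

redis-cli llen amazon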

Understanding restrict_xpaths

The restrict_xpaths parameter narrows link extraction to specific regions of the page: the LinkExtractor only collects URLs found inside the elements matched by the XPath expressions:

rules = (
    Rule(LinkExtractor(
        restrict_xpaths=["//div[@class='categoryRefinementsSection']/ul/li"]
    ), follow=True),
    Rule(LinkExtractor(
        restrict_xpaths=["//div[@id='mainResults']//h2/.."]
    ), callback="parse_book_detail")
)

The first rule follows all links inside the category navigation elements, traversing from top-level categories through subcategories. The second rule extracts links inside the search results and passes each matched page to parse_book_detail. Because that rule specifies a callback and omits follow, Scrapy defaults follow to False, so crawling stops at the detail pages.

Tags: scrapy-redis distributed crawling Redis python web scraping

Posted on Sun, 17 May 2026 03:26:31 +0000 by glassroof