RedisSpider Overview
RedisSpider extends Scrapy's base Spider class to enable distributed crawling. Instead of using a static start_urls list, this spider reads URLs from a Redis queue.
Key Differences from Standard Spider
The changes from a standard spider are small: import RedisSpider, inherit from it, and replace the static start_urls list with a redis_key naming a Redis list:
from scrapy_redis.spiders import RedisSpider

class BookSpider(RedisSpider):
    """Reads URLs from a Redis queue instead of start_urls."""
    name = 'book_spider'
    redis_key = 'book:start_urls'
    allowed_domains = ['dangdang.com', 'p.3.cn']

    def parse(self, response):
        pass
Required Settings Configuration
Add the following to settings.py to enable distributed scheduling:
# Share one duplicate filter: request fingerprints are stored in Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Schedule requests through a shared Redis queue
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the queue and dupefilter in Redis when the spider closes,
# so crawls can be paused and resumed
SCHEDULER_PERSIST = True
REDIS_URL = "redis://127.0.0.1:6379"

ITEM_PIPELINES = {
    'book.pipelines.BookPipeline': 300,
}
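The pipeline referenced above is project-specific. A minimal sketch of what book/pipelines.py might contain is shown below; the logging body is an assumption, not a prescribed implementation:

# book/pipelines.py -- minimal sketch; the processing logic is an assumption
class BookPipeline:
    def process_item(self, item, spider):
        # Clean fields, persist to a database, or simply log the item here
        spider.logger.info("Scraped: %s", item.get("title"))
        return item

scrapy-redis also ships scrapy_redis.pipelines.RedisPipeline, which serializes scraped items back into Redis; add it to ITEM_PIPELINES if you want an item store shared across workers.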
Complete RedisSpider Example: Crawling Dangdang Books
Project Setup
scrapy startproject book
cd book
scrapy genspider dangdang dangdang.com
Spider Implementation
# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy
from scrapy_redis.spiders import RedisSpider


class DangdangSpider(RedisSpider):
    name = 'dangdang'
    allowed_domains = ['dangdang.com']
    redis_key = "dangdang"

    def parse(self, response):
        category_divs = response.xpath("//div[@class='con flq_body']/div")
        for div in category_divs:
            item = {}
            item["top_category"] = div.xpath("./dl//a/text()").extract()
            dl_elements = div.xpath("./div//dl[@class='inner_dl']")
            for dl in dl_elements:
                item["mid_category"] = dl.xpath("./dt/a/text()").extract_first()
                links = dl.xpath("./dd/a")
                for link in links:
                    item["sub_category"] = link.xpath("./@title").extract_first()
                    item["sub_category_url"] = link.xpath("./@href").extract_first()
                    if item["sub_category_url"]:
                        # deepcopy so each request carries its own snapshot;
                        # the same dict is mutated on every loop iteration
                        yield scrapy.Request(
                            item["sub_category_url"],
                            callback=self.parse_book_list,
                            meta={"item": deepcopy(item)}
                        )

    def parse_book_list(self, response):
        book_items = response.xpath("//ul[@class='bigimg']/li")
        for book in book_items:
            # Copy per book so items already yielded are not mutated later
            item = deepcopy(response.meta["item"])
            item["cover_image"] = book.xpath("./a/img/@data-original").extract_first()
            if item["cover_image"] is None:
                item["cover_image"] = book.xpath("./a/img/@src").extract_first()
            item["book_url"] = book.xpath("./a/@href").extract_first()
            item["title"] = book.xpath("./p[@class='name']/a/@title").extract_first()
            item["description"] = book.xpath("./p[@class='detail']/text()").extract_first()
            item["price"] = book.xpath(".//span[@class='search_now_price']/text()").extract_first()
            item["authors"] = book.xpath("./p[@class='search_book_author']/span[1]/a/text()").extract()
            item["publish_date"] = book.xpath("./p[@class='search_book_author']/span[2]/text()").extract_first()
            item["publisher"] = book.xpath("./p[@class='search_book_author']/span[3]/a/text()").extract_first()
            yield item

        next_page = response.xpath("//li[@class='next']/a/@href").extract_first()
        if next_page:
            # urljoin handles both relative and absolute hrefs safely
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse_book_list,
                meta={"item": response.meta["item"]}
            )
Running the Distributed Spider
Start the spider in multiple terminals (or on multiple machines). Each process will idle until URLs are added to Redis:
scrapy crawl dangdang
Push the starting URL to Redis:
redis-cli lpush dangdang http://book.dangdang.com/
Once URLs are in Redis, all running instances will begin crawling simultaneously, distributing the workload across machines.
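The queue can also be seeded programmatically. A short sketch using the redis-py client, assuming the same local Redis instance as REDIS_URL above:

import redis

# Connect to the Redis instance the spiders share (matches REDIS_URL)
r = redis.Redis(host="127.0.0.1", port=6379)
# Push onto the key named by the spider's redis_key attribute
r.lpush("dangdang", "http://book.dangdang.com/")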
RedisCrawlSpider Overview
RedisCrawlSpider extends CrawlSpider with Redis-based URL distribution. It supports automatic link following through Rule definitions.
Basic Template
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from scrapy_redis.spiders import RedisCrawlSpider


class MyCrawler(RedisCrawlSpider):
    name = 'mycrawler_redis'
    redis_key = 'mycrawler:start_urls'
    allowed_domains = ['example.com']

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        pass
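As with RedisSpider, the crawler idles until a start URL is pushed to the key named by redis_key, for example:

redis-cli lpush mycrawler:start_urls http://example.com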
Complete RedisCrawlSpider Example: Crawling Amazon Books
Project Setup
scrapy genspider -t crawl amazon amazon.cn
Spider Implementation
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from scrapy_redis.spiders import RedisCrawlSpider


class AmazonSpider(RedisCrawlSpider):
    name = 'amazon'
    allowed_domains = ['amazon.cn']
    redis_key = "amazon"

    rules = (
        # Follow category-refinement links (no callback, traversal only)
        Rule(LinkExtractor(
            restrict_xpaths=["//div[@class='categoryRefinementsSection']/ul/li"]
        ), follow=True),
        # Extract book detail links from search results and parse them
        Rule(LinkExtractor(
            restrict_xpaths=["//div[@id='mainResults']//h2/.."]
        ), callback="parse_book_detail"),
    )

    def parse_book_detail(self, response):
        item = {}
        page_title = response.xpath("//title/text()").extract_first()
        item["is_ebook"] = "Kindle" in page_title if page_title else False
        item["title"] = response.xpath("//span[contains(@id,'productTitle')]/text()").extract_first()
        item["publish_date"] = response.xpath("//h1[@id='title']/span[3]/text()").extract_first()
        item["authors"] = response.xpath("//div[@id='byline']/span/a/text()").extract()
        item["price"] = response.xpath("//div[@id='soldByThirdParty']/span/text()").extract()
        if item["is_ebook"]:
            item["price"] = response.xpath("//tr[@class='kindle-price']/td/text()").extract()
        # '出版社' means 'Publisher' on the amazon.cn detail page
        item["publisher"] = response.xpath("//b[text()='出版社:']/../text()").extract_first()
        item["categories"] = response.xpath("//ul[@class='zg_hrsr']/li[1]/span[2]//a/text()").extract()
        yield item
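Amazon's markup changes often, so selectors like these are worth verifying interactively before a full crawl. scrapy shell loads a single page for experimentation; the URL below is a placeholder for any book detail page:

scrapy shell "https://www.amazon.cn/<book-detail-page>"
>>> response.xpath("//span[contains(@id,'productTitle')]/text()").extract_first()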
Running the Spider
scrapy crawl amazon
Push the starting URL to Redis:
redis-cli lpush amazon https://www.amazon.cn/s/ref=lp_658390051_nr_n_4/462-2558471-4466339
Understanding restrict_xpaths
The restrict_xpaths parameter narrows link extraction to specific page regions. LinkExtractors then collect all URLs within the matched elements:
rules = (
    Rule(LinkExtractor(
        restrict_xpaths=["//div[@class='categoryRefinementsSection']/ul/li"]
    ), follow=True),
    Rule(LinkExtractor(
        restrict_xpaths=["//div[@id='mainResults']//h2/.."]
    ), callback="parse_book_detail"),
)
The first rule follows all links within category navigation elements to traverse from top-level categories through subcategories. The second rule extracts links within search results and passes them to the callback for detail extraction.
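To see exactly which links a rule will schedule, you can run its LinkExtractor by hand in scrapy shell. A sketch, assuming a search-results page is loaded as response:

from scrapy.linkextractors import LinkExtractor

le = LinkExtractor(restrict_xpaths=["//div[@id='mainResults']//h2/.."])
# extract_links returns the Link objects the second rule would schedule
for link in le.extract_links(response):
    print(link.url)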