Understanding Scrapy Start URLs and Downloader Middleware Configuration

How Scrapy Processes Start URLs

The Scrapy engine handles initial URLs through the following sequence:

  1. Invokes start_requests and collects its return value
  2. Wraps the return value in an iterator
  3. Repeatedly calls __next__() on that iterator to pull requests one at a time
  4. Places each yielded Request object into the scheduler (a simplified sketch follows)
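
For intuition, the loop can be sketched roughly as follows. This is a simplified illustration, not Scrapy's actual engine code; feed_start_requests and schedule_request are stand-ins for the engine and scheduler interaction:

def feed_start_requests(spider, schedule_request):
    # Wrap whatever start_requests returned (a list or a generator) in an iterator
    pending = iter(spider.start_requests())
    while True:
        try:
            request = next(pending)   # pull requests one at a time
        except StopIteration:
            break
        schedule_request(request)     # hand each Request to the scheduler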

Source Implementation

def start_requests(self):
    cls = self.__class__
    if method_is_overridden(cls, Spider, 'make_requests_from_url'):
        warnings.warn(
            "Spider.make_requests_from_url method is deprecated; it "
            "won't be called in future Scrapy releases. Please "
            "override Spider.start_requests method instead (see %s.%s)." % (
                cls.__module__, cls.__name__
            ),
        )
        for url in self.start_urls:
            yield self.make_requests_from_url(url)
    else:
        for url in self.start_urls:
            yield Request(url, dont_filter=True)

Custom Start Requests

You can override start_requests to implement custom URL generation logic:

def start_requests(self):
    seed_url = 'https://news.example.com/'
    # 'cookiejar' in meta selects the cookie session used by CookiesMiddleware
    yield Request(
        url=seed_url,
        callback=self.handle_login,
        meta={'cookiejar': True}
    )

A practical extension involves fetching initial URLs from external sources like Redis:

def start_requests(self):
    # Assumes a Redis client created elsewhere, e.g.
    # redis_client = redis.StrictRedis(host='localhost', port=6379, db=0)
    initial_urls = redis_client.smembers('crawler:pending_urls')
    for url in initial_urls:
        # smembers returns bytes values; decode before building the Request
        yield Request(url=url.decode('utf-8'), callback=self.parse)

Depth Management and Priority

Depth Tracking

  • Requests generated from start_urls begin at depth 0
  • Each request yielded from a callback is assigned the parent response's depth plus one (see the example below)
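
As an illustration, a callback can read the depth that DepthMiddleware has recorded in response.meta; the parse method below is a sketch, not part of the middleware:

def parse(self, response):
    # 'depth' is set by DepthMiddleware; start requests are at depth 0
    depth = response.meta.get('depth', 0)
    self.logger.info("Parsed %s at depth %d", response.url, depth)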

Configuration Options

Setting               Purpose
DEPTH_LIMIT           Maximum crawl depth (0 disables the limit)
DEPTH_PRIORITY        Amount subtracted from request priority per depth level
DEPTH_STATS_VERBOSE   Collect per-depth request counts in the stats

The priority calculation:

request.priority -= depth * DEPTH_PRIORITY
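
For example, a settings.py fragment that caps crawls at three levels and deprioritizes deeper requests (the values are illustrative):

# settings.py
DEPTH_LIMIT = 3             # drop requests more than 3 levels deep
DEPTH_PRIORITY = 1          # subtract 1 priority point per depth level
DEPTH_STATS_VERBOSE = True  # record per-depth request counts in the stats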

DepthMiddleware Implementation

import logging

from scrapy.http import Request

logger = logging.getLogger(__name__)


class DepthMiddleware(object):

    def __init__(self, maxdepth, stats=None, verbose_stats=False, prio=1):
        self.maxdepth = maxdepth
        self.stats = stats
        self.verbose_stats = verbose_stats
        self.prio = prio

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        maxdepth = settings.getint('DEPTH_LIMIT')
        verbose = settings.getbool('DEPTH_STATS_VERBOSE')
        prio = settings.getint('DEPTH_PRIORITY')
        return cls(maxdepth, crawler.stats, verbose, prio)

    def process_spider_output(self, response, result, spider):
        def _filter(request):
            if isinstance(request, Request):
                current_depth = response.meta['depth'] + 1
                request.meta['depth'] = current_depth
                
                if self.prio:
                    request.priority -= current_depth * self.prio
                
                if self.maxdepth and current_depth > self.maxdepth:
                    logger.debug(
                        "Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
                        {'maxdepth': self.maxdepth, 'requrl': request.url},
                        extra={'spider': spider}
                    )
                    return False
                elif self.stats:
                    if self.verbose_stats:
                        self.stats.inc_value('request_depth_count/%s' % current_depth,
                                             spider=spider)
                    self.stats.max_value('request_depth_max', current_depth,
                                         spider=spider)
            return True

        if self.stats and 'depth' not in response.meta:
            response.meta['depth'] = 0
            if self.verbose_stats:
                self.stats.inc_value('request_depth_count/0', spider=spider)

        return (r for r in result or () if _filter(r))
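
Note that this is a spider middleware, so a subclass would be enabled through SPIDER_MIDDLEWARES rather than DOWNLOADER_MIDDLEWARES. A sketch, where CustomDepthMiddleware is a hypothetical subclass of your own:

SPIDER_MIDDLEWARES = {
    # disable the built-in entry (registered at order 900)
    'scrapy.spidermiddlewares.depth.DepthMiddleware': None,
    'mypackage.middleware.CustomDepthMiddleware': 900,
}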

The response's meta dictionary is simply the meta of the request that produced the response, as the Response source shows:

from scrapy.http.headers import Headers
from scrapy.utils.trackref import object_ref


class Response(object_ref):

    def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None):
        self.headers = Headers(headers or {})
        self.status = int(status)
        self._set_body(body)
        self._set_url(url)
        self.request = request
        self.flags = [] if flags is None else list(flags)

    @property
    def meta(self):
        try:
            return self.request.meta
        except AttributeError:
            raise AttributeError(
                "Response.meta not available, this response "
                "is not tied to any request"
            )

Proxy Configuration

Method 1: Environment Variables

Set the standard proxy environment variables before the crawl starts; the built-in HttpProxyMiddleware reads them when it is initialized:

import os

import scrapy
from scrapy import Request

# HttpProxyMiddleware reads these variables when it is initialized,
# so set them before the crawl starts (here, at module import time)
os.environ['HTTPS_PROXY'] = 'http://user:pass123@10.0.0.1:8080/'
os.environ['HTTP_PROXY'] = 'http://10.0.0.2:3128'


class NewsSpider(scrapy.Spider):
    name = 'news'
    allowed_domains = ['example.com']
    start_urls = ['https://example.com/']

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)

Method 2: Request Meta

Include proxy information directly in request metadata:

def start_requests(self):
    proxy = 'http://user:pass123@10.0.0.1:8080/'
    for url in self.start_urls:
        yield Request(
            url=url, 
            callback=self.parse,
            meta={'proxy': proxy}
        )

Custom Proxy Middleware

ProxyRotationMiddleware:

import base64
import random
from urllib.parse import unquote, urlunparse
from urllib.request import _parse_proxy

from scrapy.utils.python import to_bytes

class ProxyRotationMiddleware:

    def _encode_credentials(self, username, password):
        credentials = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding='latin-1'
        )
        return base64.b64encode(credentials).strip()

    def process_request(self, request, spider):
        proxy_pool = [
            "http://user1:pass1@192.168.1.10:8080/",
            "http://user2:pass2@192.168.1.11:8080/",
            "http://user3:pass3@192.168.1.12:8080/",
        ]
        selected_proxy = random.choice(proxy_pool)

        # Strip credentials from the proxy URL; they belong in the
        # Proxy-Authorization header rather than in meta['proxy']
        proxy_type, user, password, hostport = _parse_proxy(selected_proxy)
        clean_proxy_url = urlunparse((proxy_type or 'http', hostport, '', '', '', ''))

        auth_header = None
        if user:
            auth_header = self._encode_credentials(user, password)

        request.meta['proxy'] = clean_proxy_url
        if auth_header:
            request.headers['Proxy-Authorization'] = b'Basic ' + auth_header

SimpleProxyMiddleware:

class SimpleProxyMiddleware:
    def process_request(self, request, spider):
        proxy_pool = [
            {'ip_port': '111.11.228.75:80', 'user_pass': ''},
            {'ip_port': '120.198.243.22:80', 'user_pass': ''},
            {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
            {'ip_port': '101.71.27.120:80', 'user_pass': ''},
        ]
        selected = random.choice(proxy_pool)
        # meta['proxy'] expects a plain string URL, not bytes
        request.meta['proxy'] = "http://%s" % selected['ip_port']

        if selected['user_pass']:
            encoded = base64.b64encode(to_bytes(selected['user_pass']))
            request.headers['Proxy-Authorization'] = b'Basic ' + encoded

Enable in settings:

DOWNLOADER_MIDDLEWARES = {
    # 751 places the middleware just after the built-in HttpProxyMiddleware (750);
    # enable only one of the two proxy middlewares at a time
    'mypackage.middleware.ProxyRotationMiddleware': 751,
    # 'mypackage.middleware.SimpleProxyMiddleware': 751,
}

Using Selectors Directly

You can leverage Scrapy's selector functionality without a full crawl:

html_content = """
<html>
    <head lang="en">
        <meta charset="UTF-8">
        <title>Crawler Demo</title>
    </head>
    <body>
        <ul>
            <li class="item-"><a id='i1' href="link1.html">First item</a></li>
            <li class="item-0"><a id='i2' href="link2.html">Second item</a></li>
            <li class="item-1"><a href="link3.html">Third item<span>extra</span></a></li>
        </ul>
        <div><a href="link4.html">Another link</a></div>
    </body>
</html>
"""

from scrapy.http import HtmlResponse
from scrapy.selector import Selector

response_obj = HtmlResponse(
    url='http://example.com', 
    body=html_content,
    encoding='utf-8'
)

selector = Selector(response_obj)
links = selector.xpath('//a/@href').extract()
print(links)
# ['link1.html', 'link2.html', 'link3.html', 'link4.html']
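
A Selector can also be built directly from a string, skipping the response object entirely:

from scrapy.selector import Selector

selector = Selector(text=html_content)
print(selector.xpath('//a/@href').extract())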

Creating Custom Commands

Step 1: Create Commands Directory

Add a commands package (a directory containing an __init__.py) alongside your spiders folder; the module name of each command file becomes the command name.
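
A possible layout; the file name runall.py is an assumption chosen to match the scrapy runall invocation in Step 4:

myproject/
    __init__.py
    settings.py
    spiders/
        ...
    commands/
        __init__.py
        runall.py    # contains the command class from Step 2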

Step 2: Implement the Command

from scrapy.commands import ScrapyCommand


class RunAllSpidersCommand(ScrapyCommand):

    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs every spider in the project'

    def run(self, args, opts):
        # spider_loader knows every spider registered in the project
        available_spiders = self.crawler_process.spider_loader.list()
        for spider_name in available_spiders:
            self.crawler_process.crawl(spider_name, **opts.__dict__)
        self.crawler_process.start()

Step 3: Configure Settings

COMMANDS_MODULE = "myproject.commands"

Step 4: Execute

scrapy runall

Running Individual Spiders Programmatically

from scrapy.cmdline import execute

if __name__ == '__main__':
    # Equivalent to running "scrapy crawl myspider --nolog" from the shell
    execute(["scrapy", "crawl", "myspider", "--nolog"])

This approach allows you to launch specific spiders from Python code with configurable arguments.
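
For reference, the CrawlerProcess API offers another way to do this without going through the command-line parser:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('myspider')   # spider name, as registered in the project
process.start()             # blocks until the crawl finishes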

Tags: scrapy spider middleware start-urls depth

Posted on Sat, 09 May 2026 12:51:31 +0000 by not_skeletor