How Scrapy Processes Start URLs
The Scrapy engine handles initial URLs through the following sequence:
- Invokes `start_requests` and collects its return value
- Creates an iterator from the return value
- Iterates through the results, calling `__next__()` on each item
- Places the generated request objects into the scheduler
Source Implementation
def start_requests(self):
cls = self.__class__
if method_is_overridden(cls, Spider, 'make_requests_from_url'):
warnings.warn(
"Spider.make_requests_from_url method is deprecated; it "
"won't be called in future Scrapy releases. Please "
"override Spider.start_requests method instead (see %s.%s)." % (
cls.__module__, cls.__name__
),
)
for url in self.start_urls:
yield self.make_requests_from_url(url)
else:
for url in self.start_urls:
yield Request(url, dont_filter=True)
Custom Start Requests
You can override `start_requests` to implement custom URL generation logic:
def start_requests(self):
seed_url = 'https://news.example.com/'
yield Request(
url=seed_url,
callback=self.handle_login,
meta={'cookiejar': True}
)
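The handle_login callback is left to the spider author; a minimal sketch (the form field names, credentials, and the after_login callback are hypothetical) might look like this:
from scrapy import FormRequest

def handle_login(self, response):
    # Hypothetical form fields; reusing the cookiejar meta key keeps the session cookies together
    return FormRequest.from_response(
        response,
        formdata={'username': 'user', 'password': 'secret'},
        callback=self.after_login,
        meta={'cookiejar': response.meta['cookiejar']},
    )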
A practical extension involves fetching initial URLs from external sources like Redis:
import redis

redis_client = redis.Redis()  # connection details depend on your environment

def start_requests(self):
    initial_urls = redis_client.smembers('crawler:pending_urls')
    for url in initial_urls:
        # smembers returns bytes by default, so decode before building the Request
        yield Request(url=url.decode('utf-8'), callback=self.parse)
Depth Management and Priority
Depth Tracking
- The initial requests start at depth `0`
- Each request yielded from a callback is assigned its parent request's depth plus one (see the sketch below)
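As a quick illustration, a callback can read the current depth from the response meta (it is maintained by the DepthMiddleware described below):
def parse(self, response):
    depth = response.meta.get('depth', 0)
    self.logger.info("parsed %s at depth %d", response.url, depth)
    for href in response.css('a::attr(href)').extract():
        yield response.follow(href, callback=self.parse)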
Configuration Options
| Setting | Purpose |
|---|---|
| `DEPTH_LIMIT` | Maximum crawl depth |
| `DEPTH_PRIORITY` | Priority adjustment factor applied per depth level |
The priority calculation:
request.priority -= depth * DEPTH_PRIORITY
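For example, with the illustrative settings below, a request at depth 3 has its priority lowered by 3, so shallower pages tend to be scheduled first, and anything more than five links away from the start URLs is dropped:
DEPTH_LIMIT = 5      # requests deeper than 5 levels are ignored
DEPTH_PRIORITY = 1   # positive values lower priority as depth grows (breadth-first tendency)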
DepthMiddleware Implementation
# Simplified from scrapy/spidermiddlewares/depth.py
import logging

from scrapy.http import Request

logger = logging.getLogger(__name__)

class DepthMiddleware(object):
def __init__(self, maxdepth, stats=None, verbose_stats=False, prio=1):
self.maxdepth = maxdepth
self.stats = stats
self.verbose_stats = verbose_stats
self.prio = prio
@classmethod
def from_crawler(cls, crawler):
settings = crawler.settings
maxdepth = settings.getint('DEPTH_LIMIT')
verbose = settings.getbool('DEPTH_STATS_VERBOSE')
prio = settings.getint('DEPTH_PRIORITY')
return cls(maxdepth, crawler.stats, verbose, prio)
def process_spider_output(self, response, result, spider):
def _filter(request):
if isinstance(request, Request):
current_depth = response.meta['depth'] + 1
request.meta['depth'] = current_depth
if self.prio:
request.priority -= current_depth * self.prio
if self.maxdepth and current_depth > self.maxdepth:
logger.debug(
"Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
{'maxdepth': self.maxdepth, 'requrl': request.url},
extra={'spider': spider}
)
return False
elif self.stats:
if self.verbose_stats:
self.stats.inc_value('request_depth_count/%s' % current_depth,
spider=spider)
self.stats.max_value('request_depth_max', current_depth,
spider=spider)
return True
if self.stats and 'depth' not in response.meta:
response.meta['depth'] = 0
if self.verbose_stats:
self.stats.inc_value('request_depth_count/0', spider=spider)
return (r for r in result or () if _filter(r))
The response's meta dictionary inherits from the original request:
# Simplified from scrapy/http/response/__init__.py
from scrapy.http.headers import Headers
from scrapy.utils.trackref import object_ref

class Response(object_ref):
def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None):
self.headers = Headers(headers or {})
self.status = int(status)
self._set_body(body)
self._set_url(url)
self.request = request
self.flags = [] if flags is None else list(flags)
@property
def meta(self):
        try:
            return self.request.meta
        except AttributeError:
            raise AttributeError(
                "Response.meta not available, this response "
                "is not tied to any request"
            )
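A small sketch of this pass-through (the page_type key is arbitrary):
from scrapy.http import Request

def start_requests(self):
    yield Request(
        'https://example.com/',
        callback=self.parse,
        meta={'page_type': 'home'},
    )

def parse(self, response):
    # response.meta is simply self.request.meta, so values set on the Request carry over
    assert response.meta['page_type'] == 'home'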
Proxy Configuration
Method 1: Environment Variables
Set proxy values in environment variables before the spider starts. Note that the built-in HttpProxyMiddleware reads these variables once when it is initialized, so setting them as early as possible (in the shell or at module level rather than inside a callback) is the safer option:
class NewsSpider(scrapy.Spider):
name = 'news'
allowed_domains = ['example.com']
start_urls = ['https://example.com/']
def start_requests(self):
import os
os.environ['HTTPS_PROXY'] = "http://user:pass123@10.0.0.1:8080/"
os.environ['HTTP_PROXY'] = '10.0.0.2:3128'
for url in self.start_urls:
yield Request(url=url, callback=self.parse)
Method 2: Request Meta
Include proxy information directly in request metadata:
def start_requests(self):
proxy = 'http://user:pass123@10.0.0.1:8080/'
for url in self.start_urls:
yield Request(
url=url,
callback=self.parse,
meta={'proxy': proxy}
)
Custom Proxy Middleware
ProxyRotationMiddleware:
import base64
import random
from urllib.parse import unquote, urlunparse
from urllib.request import _parse_proxy

from scrapy.utils.python import to_bytes
class ProxyRotationMiddleware:
def _encode_credentials(self, username, password):
credentials = to_bytes(
'%s:%s' % (unquote(username), unquote(password)),
encoding='latin-1'
)
return base64.b64encode(credentials).strip()
def process_request(self, request, spider):
proxy_pool = [
"http://user1:pass1@192.168.1.10:8080/",
"http://user2:pass2@192.168.1.11:8080/",
"http://user3:pass3@192.168.1.12:8080/",
]
selected_proxy = random.choice(proxy_pool)
orig_type = ""
proxy_type, user, password, hostport = _parse_proxy(selected_proxy)
clean_proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
auth_header = None
if user:
auth_header = self._encode_credentials(user, password)
request.meta['proxy'] = clean_proxy_url
if auth_header:
request.headers['Proxy-Authorization'] = b'Basic ' + auth_header
SimpleProxyMiddleware:
class SimpleProxyMiddleware:
def process_request(self, request, spider):
proxy_pool = [
{'ip_port': '111.11.228.75:80', 'user_pass': ''},
{'ip_port': '120.198.243.22:80', 'user_pass': ''},
{'ip_port': '111.8.60.9:8123', 'user_pass': ''},
{'ip_port': '101.71.27.120:80', 'user_pass': ''},
]
selected = random.choice(proxy_pool)
        # meta['proxy'] expects a plain URL string
        request.meta['proxy'] = "http://%s" % selected['ip_port']
        if selected['user_pass']:
            encoded = base64.b64encode(to_bytes(selected['user_pass']))
            request.headers['Proxy-Authorization'] = b'Basic ' + encoded
Enable whichever middleware you use in settings (register only one; each of them overwrites meta['proxy'], so enabling both is redundant):
DOWNLOADER_MIDDLEWARES = {
    'mypackage.middleware.ProxyRotationMiddleware': 751,
    # 'mypackage.middleware.SimpleProxyMiddleware': 751,
}
Using Selectors Directly
You can leverage Scrapy's selector functionality without a full crawl:
html_content = """
<html>
<head lang="en">
<meta charset="UTF-8">
<title>Crawler Demo</title>
</head>
<body>
<ul>
<li class="item-"><a id='i1' href="link1.html">First item</a></li>
<li class="item-0"><a id='i2' href="link2.html">Second item</a></li>
<li class="item-1"><a href="link3.html">Third item<span>extra</span></a></li>
</ul>
<div><a href="link4.html">Another link</a></div>
</body>
</html>
"""
from scrapy.http import HtmlResponse
from scrapy.selector import Selector
response_obj = HtmlResponse(
url='http://example.com',
body=html_content,
encoding='utf-8'
)
selector = Selector(response_obj)
links = selector.xpath('//a/@href').extract()
print(links)
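If you do not need a response object at all, a Selector can also be built directly from a string of markup:
from scrapy.selector import Selector

sel = Selector(text=html_content)
print(sel.xpath('//a[@id="i2"]/@href').extract())  # ['link2.html']
print(sel.css('div a::attr(href)').extract())      # ['link4.html']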
Creating Custom Commands
Step 1: Create Commands Directory
Add a commands directory alongside your spiders folder.
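A possible layout (module and spider names here are placeholders; the commands directory must be a package, i.e. contain an __init__.py):
myproject/
    scrapy.cfg
    myproject/
        __init__.py
        settings.py
        commands/
            __init__.py
            runall.py
        spiders/
            __init__.py
            news.py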
Step 2: Implement the Command
Save the command class in a module named after the command itself (here `runall.py`), since Scrapy derives the command name from the module name:
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings
class RunAllSpidersCommand(ScrapyCommand):
requires_project = True
def syntax(self):
return '[options]'
def short_desc(self):
return 'Executes all project spiders sequentially'
def run(self, args, opts):
        available_spiders = self.crawler_process.spider_loader.list()
for spider_name in available_spiders:
self.crawler_process.crawl(spider_name, **opts.__dict__)
self.crawler_process.start()
Step 3: Configure Settings
COMMANDS_MODULE = "myproject.commands"
Step 4: Execute
scrapy runall
Running Individual Spiders Programmatically
from scrapy.cmdline import execute

if __name__ == '__main__':
    execute(["scrapy", "crawl", "myspider", "--nolog"])
This approach allows you to launch specific spiders from Python code with configurable arguments.
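Alternatively, the same thing can be achieved without going through the command-line layer by driving CrawlerProcess directly (the spider name and keyword arguments are illustrative):
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('myspider', category='news')  # keyword arguments become spider attributes
process.start()  # blocks until the crawl finishes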