Here are examples of the Python API scrapy.exceptions.IgnoreRequest, taken from open source projects.
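IgnoreRequest can be raised by the scheduler or by any downloader middleware to signal that a request should be dropped; Scrapy catches it, calls the request's errback (if any) with the exception, and does not log it as an error. As a minimal sketch, a custom downloader middleware might look like this (the class name and blocklist below are illustrative, not part of Scrapy):

from scrapy.exceptions import IgnoreRequest

class BlocklistMiddleware(object):
    """Hypothetical downloader middleware that drops requests to blocked hosts."""

    BLOCKED_HOSTS = ('ads.example.com',)  # assumed blocklist, for illustration only

    def process_request(self, request, spider):
        # Raising IgnoreRequest aborts the request before it is downloaded;
        # the request's errback (if defined) receives this exception.
        if any(host in request.url for host in self.BLOCKED_HOSTS):
            raise IgnoreRequest("blocked host in %s" % request.url)
        return None  # let other middlewares and the downloader proceed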
8 Examples
Example 1
def _redirect(self, redirected, request, spider, reason):
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    redirects = request.meta.get('redirect_times', 0) + 1

    if ttl and redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                     {'reason': reason, 'redirected': redirected, 'request': request},
                     extra={'spider': spider})
        return redirected
    else:
        logger.debug("Discarding %(request)s: max redirections reached",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest("max redirections reached")
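Note that this version of the middleware reads an optional per-request budget from request.meta['redirect_ttl'] (via the setdefault call above), so a spider could presumably cap redirects for an individual request. An illustrative snippet, assuming the usual scrapy import and spider context:

def start_requests(self):
    # Illustrative: allow at most 2 redirects for this request, via the
    # redirect_ttl meta key consumed by the middleware above.
    yield scrapy.Request('http://example.com/', meta={'redirect_ttl': 2})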
Example 2
Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
def test_ignore_robotstxt_request(self):
    self.crawler.settings.set('ROBOTSTXT_OBEY', True)

    def ignore_request(request, spider):
        deferred = Deferred()
        reactor.callFromThread(deferred.errback, failure.Failure(IgnoreRequest()))
        return deferred

    self.crawler.engine.download.side_effect = ignore_request

    middleware = RobotsTxtMiddleware(self.crawler)
    mw_module_logger.error = mock.MagicMock()
    d = self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
    d.addCallback(lambda _: self.assertFalse(mw_module_logger.error.called))
    return d
Example 3
def _process_requests(self, items_or_requests, start=False):
    """Acquire the webdriver manager when it's available for requests."""
    error_msg = "WebdriverRequests from start_requests can't be in-page."
    for request in iter(items_or_requests):
        if isinstance(request, WebdriverRequest):
            if start and isinstance(request, WebdriverActionRequest):
                raise IgnoreRequest(error_msg)
            request = self.manager.acquire(request)
            if request is WebdriverRequest.WAITING:
                continue  # Request has been enqueued, so drop it.
        yield request
Example 4
def process_response(self, request, response, spider):
    if 'x-ignore-response' in request.url:
        raise IgnoreRequest()
    elif 'x-error-response' in request.url:
        _ = 1 / 0  # deliberately raise ZeroDivisionError to simulate a middleware error
    else:
        return response
Example 5
def process_request(self, request, spider):
    if request.meta.get('dont_cache', False):
        return

    # Skip uncacheable requests
    if not self.policy.should_cache_request(request):
        request.meta['_dont_cache'] = True  # flag as uncacheable
        return

    # Look for cached response and check if expired
    cachedresponse = self.storage.retrieve_response(spider, request)
    if cachedresponse is None:
        self.stats.inc_value('httpcache/miss', spider=spider)
        if self.ignore_missing:
            self.stats.inc_value('httpcache/ignore', spider=spider)
            raise IgnoreRequest("Ignored request not in cache: %s" % request)
        return  # first time request

    # Return cached response only if not expired
    cachedresponse.flags.append('cached')
    if self.policy.is_cached_response_fresh(cachedresponse, request):
        self.stats.inc_value('httpcache/hit', spider=spider)
        return cachedresponse

    # Keep a reference to cached response to avoid a second cache lookup on
    # process_response hook
    request.meta['cached_response'] = cachedresponse
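This appears to be the process_request hook of Scrapy's HttpCacheMiddleware. The ignore_missing branch corresponds to the HTTPCACHE_IGNORE_MISSING setting: when enabled, any request not already in the cache is ignored instead of downloaded, which is useful for replaying a crawl offline against a previously populated cache. In settings.py:

# Replay strictly from the local cache; uncached requests raise IgnoreRequest.
HTTPCACHE_ENABLED = True
HTTPCACHE_IGNORE_MISSING = True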
Example 6
Project: scrapy Source File: robotstxt.py
def process_request_2(self, rp, request, spider):
    if rp is not None and not rp.can_fetch(self._useragent, request.url):
        logger.debug("Forbidden by robots.txt: %(request)s",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest()
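Here RobotsTxtMiddleware raises a bare IgnoreRequest for any URL that the parsed robots.txt rules (rp) disallow for the configured user agent. The middleware is activated with the standard setting:

# settings.py: filter requests through robots.txt; disallowed ones are ignored.
ROBOTSTXT_OBEY = True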
Example 7
def process_request(self, request, spider):
    if 'x-ignore-request' in request.url:
        raise IgnoreRequest()
    elif 'x-error-request' in request.url:
        _ = 1 / 0  # deliberately raise ZeroDivisionError to simulate a middleware error
Example 8
def process_response(self, request, response, spider):
    # Scrapy applies start_urls and stop_urls before creating the request,
    # so if the URL gets redirected, the redirected URL is crawled even when
    # it is not allowed. Check that the final URL is allowed as well.
    if spider.remove_get_params:
        o = urlparse(response.url)
        url_without_params = o.scheme + "://" + o.netloc + o.path
        response = response.replace(url=url_without_params)

    for rule in spider._rules:
        if not spider.strict_redirect:
            if rule.link_extractor._link_allowed(response):
                continue
            if rule.link_extractor._link_allowed(request):
                # replace() returns a new response object; keep it so the
                # later checks see the request URL.
                response = response.replace(url=request.url)
                continue
        else:
            if rule.link_extractor._link_allowed(response) and rule.link_extractor._link_allowed(request):
                continue
        if request.url in spider.start_urls and spider.scrap_start_urls is False:
            continue
        if not (spider.scrap_start_urls and response.url in spider.start_urls):
            print("\033[94m> Ignored:\033[0m " + response.url)
            raise IgnoreRequest()

    return response
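In all of these examples, a raised IgnoreRequest silently ends the request's processing rather than being logged as an error. If the request defines an errback, the exception reaches it wrapped in a twisted Failure, so a spider can observe ignored requests like this (the errback name is illustrative):

from scrapy.exceptions import IgnoreRequest

def on_ignored(self, failure):
    # Illustrative errback: failure.request is the original request,
    # attached by Scrapy to download failures.
    if failure.check(IgnoreRequest):
        self.logger.info("Ignored: %s", failure.request.url)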