scrapy.exceptions.IgnoreRequest

Here are examples of the Python API scrapy.exceptions.IgnoreRequest, taken from open source projects.
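
Before the project examples, a minimal self-contained sketch of the usual pattern may help (not taken from any project below; the middleware, spider, and blocklist names are hypothetical): a downloader middleware raises IgnoreRequest to drop a request, and the spider can observe the drop through the request's errback.

    import scrapy
    from scrapy.exceptions import IgnoreRequest

    class BlocklistMiddleware(object):
        """Hypothetical downloader middleware that drops requests to blocked hosts."""
        blocked_hosts = ('ads.example.com',)

        def process_request(self, request, spider):
            if any(host in request.url for host in self.blocked_hosts):
                # Raising IgnoreRequest aborts the request: no further
                # middleware runs, nothing is downloaded, and the
                # request's errback (if set) receives the failure.
                raise IgnoreRequest("blocked host in %s" % request.url)

    class ExampleSpider(scrapy.Spider):
        name = 'example'

        def start_requests(self):
            yield scrapy.Request('http://ads.example.com/banner',
                                 errback=self.on_ignored)

        def on_ignored(self, failure):
            # check() returns the matched exception class, or None
            if failure.check(IgnoreRequest):
                self.logger.info('request was dropped by a middleware')

The middleware would be enabled through the DOWNLOADER_MIDDLEWARES setting.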

8 Examples

Example 1

Project: scrapy Source File: redirect.py
Function: _redirect
    def _redirect(self, redirected, request, spider, reason):
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1

        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
                [request.url]
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                         {'reason': reason, 'redirected': redirected, 'request': request},
                         extra={'spider': spider})
            return redirected
        else:
            logger.debug("Discarding %(request)s: max redirections reached",
                         {'request': request}, extra={'spider': spider})
            raise IgnoreRequest("max redirections reached")
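
The self.max_redirect_times ceiling above comes from the REDIRECT_MAX_TIMES setting (20 by default), so the point at which RedirectMiddleware gives up and raises IgnoreRequest is tunable per spider; a sketch with a hypothetical spider:

    import scrapy

    class NewsSpider(scrapy.Spider):
        name = 'news'
        custom_settings = {
            # After 5 redirects RedirectMiddleware raises
            # IgnoreRequest("max redirections reached").
            'REDIRECT_MAX_TIMES': 5,
        }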

Example 2

Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
    def test_ignore_robotstxt_request(self):
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)
        def ignore_request(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.errback, failure.Failure(IgnoreRequest()))
            return deferred
        self.crawler.engine.download.side_effect = ignore_request

        middleware = RobotsTxtMiddleware(self.crawler)
        mw_module_logger.error = mock.MagicMock()

        d = self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
        d.addCallback(lambda _: self.assertFalse(mw_module_logger.error.called))
        return d
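
The stubbed engine.download mirrors how the downloader reports a dropped request: the Deferred fails with an IgnoreRequest wrapped in a Twisted Failure, and the middleware is expected to swallow that failure without logging an error. The same failure pattern can be reproduced standalone (an illustration, not part of the test suite):

    from twisted.internet.defer import Deferred
    from twisted.python import failure
    from scrapy.exceptions import IgnoreRequest

    def on_error(f):
        # check() returns the matched exception class, or None
        print('ignored? %s' % (f.check(IgnoreRequest) is not None))

    d = Deferred()
    d.addErrback(on_error)
    d.errback(failure.Failure(IgnoreRequest()))  # prints: ignored? True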

Example 3

Project: scrapy-webdriver Source File: middlewares.py
Function: _process_requests
    def _process_requests(self, items_or_requests, start=False):
        """Acquire the webdriver manager when it's available for requests."""
        error_msg = "WebdriverRequests from start_requests can't be in-page."
        for request in iter(items_or_requests):
            if isinstance(request, WebdriverRequest):
                if start and isinstance(request, WebdriverActionRequest):
                    raise IgnoreRequest(error_msg)
                request = self.manager.acquire(request)
                if request is WebdriverRequest.WAITING:
                    continue  # Request has been enqueued, so drop it.
            yield request
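
Here IgnoreRequest serves as a misuse guard rather than a filter: a WebdriverActionRequest emitted from start_requests would be an in-page action with no page loaded yet, so the middleware drops it with an explanatory message.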

Example 4

Project: sozlukcrawler Source File: middleware.py
Function: process_response
    def process_response(self, request, response, spider):
        if 'x-ignore-response' in request.url:
            raise IgnoreRequest()
        elif 'x-error-response' in request.url:
            _ = 1 / 0
        else:
            return response
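
Note that this test middleware raises IgnoreRequest from process_response: the already-downloaded response is discarded instead of being passed to the spider, and the request's errback is called. Example 7 below is its process_request counterpart.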

Example 5

Project: scrapy Source File: httpcache.py
Function: process_request
    def process_request(self, request, spider):
        if request.meta.get('dont_cache', False):
            return

        # Skip uncacheable requests
        if not self.policy.should_cache_request(request):
            request.meta['_dont_cache'] = True  # flag as uncacheable
            return

        # Look for cached response and check if expired
        cachedresponse = self.storage.retrieve_response(spider, request)
        if cachedresponse is None:
            self.stats.inc_value('httpcache/miss', spider=spider)
            if self.ignore_missing:
                self.stats.inc_value('httpcache/ignore', spider=spider)
                raise IgnoreRequest("Ignored request not in cache: %s" % request)
            return  # first time request

        # Return cached response only if not expired
        cachedresponse.flags.append('cached')
        if self.policy.is_cached_response_fresh(cachedresponse, request):
            self.stats.inc_value('httpcache/hit', spider=spider)
            return cachedresponse

        # Keep a reference to cached response to avoid a second cache lookup on
        # process_response hook
        request.meta['cached_response'] = cachedresponse
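
The self.ignore_missing flag maps to the HTTPCACHE_IGNORE_MISSING setting: when enabled, any request without a cached response is dropped with IgnoreRequest instead of being downloaded, which suits offline re-runs against a previously populated cache. A sketch with a hypothetical spider:

    import scrapy

    class OfflineSpider(scrapy.Spider):
        name = 'offline'
        custom_settings = {
            'HTTPCACHE_ENABLED': True,
            # Requests not found in the cache raise IgnoreRequest
            # instead of hitting the network.
            'HTTPCACHE_IGNORE_MISSING': True,
        }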

Example 6

Project: scrapy Source File: robotstxt.py
    def process_request_2(self, rp, request, spider):
        if rp is not None and not rp.can_fetch(self._useragent, request.url):
            logger.debug("Forbidden by robots.txt: %(request)s",
                         {'request': request}, extra={'spider': spider})
            raise IgnoreRequest()
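
This check only runs when the robots.txt middleware is active, which it is once ROBOTSTXT_OBEY is enabled in the project settings:

    # settings.py
    ROBOTSTXT_OBEY = True

The IgnoreRequest is raised without a message; the debug log entry above is the only trace of the dropped request.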

Example 7

Project: sozlukcrawler Source File: middleware.py
Function: process_request
    def process_request(self, request, spider):
        if 'x-ignore-request' in request.url:
            raise IgnoreRequest()
        elif 'x-error-request' in request.url:
            _ = 1 / 0
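
This is the process_request counterpart of Example 4: raising IgnoreRequest before the download skips the network round-trip entirely, whereas raising it in process_response discards a response that was already fetched.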

Example 8

Project: docsearch-scraper Source File: custom_middleware.py
Function: process_response
    def process_response(self, request, response, spider):
        # Scrapy applies start_urls and stop_urls before creating the request,
        # so if the URL gets redirected, the final URL may be crawled even if
        # it is not allowed; check whether the final URL is allowed here.

        if spider.remove_get_params:
            o = urlparse(response.url)
            url_without_params = o.scheme + "://" + o.netloc + o.path
            response = response.replace(url=url_without_params)

        for rule in spider._rules:
            if not spider.strict_redirect:
                if rule.link_extractor._link_allowed(response):
                    continue

                if rule.link_extractor._link_allowed(request):
                    response = response.replace(url=request.url)  # replace() returns a new response
                    continue
            else:
                if rule.link_extractor._link_allowed(response) and rule.link_extractor._link_allowed(request):
                    continue

            if request.url in spider.start_urls and spider.scrap_start_urls is False:
                continue

            if not (spider.scrap_start_urls and response.url in spider.start_urls):
                print("\033[94m> Ignored:\033[0m " + response.url)
                raise IgnoreRequest()

        return response
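
Because the IgnoreRequest is raised from process_response, the redirected response has already been fetched but is thrown away before it can reach the spider's callbacks; unless the request carries an errback, the colored print above is the only visible trace of the drop.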