scrapy.exceptions.IgnoreRequest

Here are examples of the Python API scrapy.exceptions.IgnoreRequest taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.
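
Before the project examples, here is a minimal self-contained sketch of how IgnoreRequest is typically raised from a downloader middleware's process_request; the class name and the blocked-domain tuple are illustrative placeholders, not taken from any of the projects below.

from scrapy.exceptions import IgnoreRequest


class BlockedDomainDownloaderMiddleware(object):
    # Illustrative blocklist; replace with your own domains.
    blocked_domains = ('ads.example.com', 'tracker.example.net')

    def process_request(self, request, spider):
        if any(domain in request.url for domain in self.blocked_domains):
            # Raising IgnoreRequest drops the request: the engine stops
            # processing it and process_exception() of the remaining
            # downloader middlewares is called with this exception.
            raise IgnoreRequest('Blocked domain in URL: %s' % request.url)
        return None  # continue normal processing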

15 Examples

3 Source : robotstxt.py
with MIT License
from autofelix

    def process_request_2(self, rp, request, spider):
        if rp is None:
            return
        if not rp.can_fetch(to_native_str(self._useragent), request.url):
            logger.debug("Forbidden by robots.txt: %(request)s",
                         {'request': request}, extra={'spider': spider})
            self.crawler.stats.inc_value('robotstxt/forbidden')
            raise IgnoreRequest("Forbidden by robots.txt")

    def robot_parser(self, request, spider):

3 Source : controllers.py
with MIT License
from crawlerflow

    def process_request(self, request, spider):
        current_request_traversal_id = request.meta.get("current_request_traversal_id")
        current_traversal_max_count = request.meta.get("current_traversal_max_count")
        if current_request_traversal_id != "init":
            current_request_traversal_count = spider.crawler.stats.get_value(
                'crawlerflow-stats/traversals/{}/requests_count'.format(current_request_traversal_id), spider=spider)
            if current_request_traversal_count and current_request_traversal_count > current_traversal_max_count:
                raise IgnoreRequest(
                    reason="max traversals for traversal_id: {} achieved".format(current_request_traversal_id))


class SpiderRequestsBasedStopController(object):

3 Source : controllers.py
with MIT License
from crawlerflow

    def process_request(self, request, spider):
        spider_id = spider.spider_config.get("spider_id")

        current_spider_requests_count = spider.crawler.stats.get_value(
            'crawlerflow-stats/spiders/{}/requests_count'.format(spider_id),
            spider=spider)

        current_spider_max_requests_count = spider.spider_config.get("stop_criteria", {}).get("max_requests")
        if current_spider_max_requests_count and current_spider_requests_count:
            if current_spider_requests_count > current_spider_max_requests_count:
                raise IgnoreRequest(
                    reason="max requests {} for spider: {} achieved."
                           "Ignoring the rest of this spider requests".format(current_spider_max_requests_count,
                                                                              spider_id))


class SpiderResponsesBasedStopController(object):

3 Source : controllers.py
with MIT License
from crawlerflow

    def process_request(self, request, spider):
        spider_id = spider.spider_config.get("spider_id")

        current_spider_responses_count = spider.crawler.stats.get_value(
            'crawlerflow-stats/spiders/{}/responses_count'.format(spider_id),
            spider=spider)

        current_spider_max_responses_count = spider.spider_config.get("stop_criteria", {}).get("max_responses")
        if current_spider_max_responses_count and current_spider_responses_count:
            if current_spider_responses_count > current_spider_max_responses_count:
                raise IgnoreRequest(
                    reason="max responses {} for spider: {} achieved."
                           "Ignoring the rest of this spider requests ".format(current_spider_max_responses_count,
                                                                               spider_id))

3 Source : robotstxt.py
with Apache License 2.0
from lynings

    def process_request_2(self, rp, request, spider):
        if rp is not None and not rp.can_fetch(
                 to_native_str(self._useragent), request.url):
            logger.debug("Forbidden by robots.txt: %(request)s",
                         {'request': request}, extra={'spider': spider})
            raise IgnoreRequest()

    def robot_parser(self, request, spider):

3 Source : downloadermiddlewares.py
with BSD 3-Clause "New" or "Revised" License
from open-contracting

    def process_request(self, request, spider):
        if 'auth' in request.meta and request.meta['auth'] is False:
            return
        if spider.auth_failed:
            spider.crawler.engine.close_spider(spider, 'auth_failed')
            raise IgnoreRequest("Max attempts to get an access token reached. Stopping crawl...")
        request.headers['Authorization'] = spider.access_token
        if self._expires_soon(spider):
            # SAVE the last request to continue after getting the token
            spider.last_requests.append(request)
            spider.logger.info('Saving request for after getting the token: %s', request.url)
            # spider MUST implement the request_access_token method
            return spider.request_access_token()

    def process_response(self, request, response, spider):

3 Source : anti_spider.py
with MIT License
from zhanghe06

    def process_request(self, request, spider):
        # Handle WeChat anti-crawling (anti-spider mechanism 1, sogou)
        if spider.name in ['weixin'] and 'antispider' in request.url:
            # Get the referring URLs
            redirect_urls = request.meta['redirect_urls']

            # Clear the invalidated cookies
            cookies_id = request.meta['cookiejar']
            del_cookies(spider.name, cookies_id)

            # spider.log(message='AntiSpider cookies_id: %s; url: %s' % (cookies_id, redirect_urls[0]))
            raise IgnoreRequest(
                'Spider: %s, AntiSpider cookies_id: %s; url: %s' % (spider.name, cookies_id, redirect_urls[0]))

    def process_response(self, request, response, spider):

3 Source : de_duplication_request.py
with MIT License
from zhanghe06

    def process_request(self, request, spider):
        if not request.url:
            return None
        channel_id = request.meta.get('channel_id', 0)
        # Handle detail pages (ignore list pages); works together with the pipeline
        if is_dup_detail(request.url, spider.name, channel_id):
            raise IgnoreRequest("Spider: %s, DeDuplicationRequest: %s" % (spider.name, request.url))

0 Source : httpcache.py
with MIT License
from autofelix

    def process_request(self, request, spider):
        if request.meta.get('dont_cache', False):
            return

        # Skip uncacheable requests
        if not self.policy.should_cache_request(request):
            request.meta['_dont_cache'] = True  # flag as uncacheable
            return

        # Look for cached response and check if expired
        cachedresponse = self.storage.retrieve_response(spider, request)
        if cachedresponse is None:
            self.stats.inc_value('httpcache/miss', spider=spider)
            if self.ignore_missing:
                self.stats.inc_value('httpcache/ignore', spider=spider)
                raise IgnoreRequest("Ignored request not in cache: %s" % request)
            return  # first time request

        # Return cached response only if not expired
        cachedresponse.flags.append('cached')
        if self.policy.is_cached_response_fresh(cachedresponse, request):
            self.stats.inc_value('httpcache/hit', spider=spider)
            return cachedresponse

        # Keep a reference to cached response to avoid a second cache lookup on
        # process_response hook
        request.meta['cached_response'] = cachedresponse

    def process_response(self, request, response, spider):

0 Source : redirect.py
with MIT License
from autofelix

    def _redirect(self, redirected, request, spider, reason):
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1

        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
                [request.url]
            redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + \
                [reason]
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                         {'reason': reason, 'redirected': redirected, 'request': request},
                         extra={'spider': spider})
            return redirected
        else:
            logger.debug("Discarding %(request)s: max redirections reached",
                         {'request': request}, extra={'spider': spider})
            raise IgnoreRequest("max redirections reached")

    def _redirect_request_using_get(self, request, redirect_url):

0 Source : middlewares.py
with Apache License 2.0
from cat9

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest

        if (request.meta.get('dont_redirect', False) or
                response.status in getattr(spider, 'handle_httpstatus_list', []) or
                response.status in request.meta.get('handle_httpstatus_list', []) or
                request.meta.get('handle_httpstatus_all', False)):
            return response

        allowed_status = (301, 302, 303, 307, 308)
        if 'Location' not in response.headers or response.status not in allowed_status:
            return response

        location = safe_url_string(response.headers['location'])

        if response.status == 302 and (location.startswith('https://sec.aliexpress.com') or location.startswith('https://login.aliexpress.com')):
            redirects = request.meta.get('redirect_times', 0) + 1
            print("AliexpressRedirectMiddleware redirects %d,%s" % (redirects, location))
            if redirects <= self.max_redirect_times:
                return response
            else:
                logger.debug("Discarding %(request)s: max redirections reached",
                             {'request': request}, extra={'spider': spider})
                raise IgnoreRequest("max redirections reached")
        else:
            redirected_url = urljoin(request.url, location)

        if response.status in (301, 307, 308) or request.method == 'HEAD':
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)

0 Source : redirect.py
with The Unlicense
from dspray95

    def _redirect(self, redirected, request, spider, reason):
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1

        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
                [request.url]
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                         {'reason': reason, 'redirected': redirected, 'request': request},
                         extra={'spider': spider})
            return redirected
        else:
            logger.debug("Discarding %(request)s: max redirections reached",
                         {'request': request}, extra={'spider': spider})
            raise IgnoreRequest("max redirections reached")

    def _redirect_request_using_get(self, request, redirect_url):

0 Source : middlewares.py
with MIT License
from ginopalazzo

    def process_response(self, request, response, spider):
        """
        Process the response, checking if the server has blocked an IP.
        :param request: the request that originated the response
        :param response: the response being processed
        :param spider: the spider for which this response is intended
        :return: pass the response to the next layer.
        """
        log.debug('   request: %s %s' % (request.url, request.meta))
        log.debug('   response: %s %s' % (response.url, response.status))
        log.debug('   num_pages: %s' % self._num_pages)
        self._num_pages = self._num_pages + 1
        # check if the server send a block response and delete that IP from the list.
        if response.status in self._http_status_codes:
            del self.proxies[-1]
            reason = '  %d in %s . IP %s deleted. %d proxies left' % (response.status, response.url, request.meta['proxy'], len(self.proxies))
            log.warning(reason)
            raise IgnoreRequest(reason)
        return response

    def process_exception(self, request, exception, spider):

0 Source : middlewares.py
with MIT License
from Open-Speech-EkStep

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # if request.url.rstrip() in self.bing_archive:
        #     raise IgnoreRequest()

        for word in self.word_to_ignore:
            if word.lower() in request.url.lower():
                raise IgnoreRequest()

        for ext in self.extensions_to_ignore:
            if request.url.lower().endswith(ext):
                raise IgnoreRequest()
        # with open(self.bing_archive_path,'a') as f:
        #     f.write(request.url+"\n")
        self.visited_urls.append(request.url)
        return None

    def process_response(self, request, response, spider):

0 Source : middlewares.py
with MIT License
from tonywu7

    def process_response(self, request, response: Response, spider):
        if response.status == 401:
            self.log.warning('Server returned HTTP 401 Unauthorized.')
            self.log.warning('This is because you are accessing an API that requires authorization, and')
            self.log.warning('you either did not provide, or provided a wrong access token.')
            self.log.warning(f'URL: {request.url}')
            raise IgnoreRequest()
        if response.status == 429 and urlsplit(request.url).netloc == 'cloud.feedly.com':
            retry_after = response.headers.get('Retry-After')
            if retry_after:
                retry_after = int(retry_after)
                self.log.warning('Server returned HTTP 429 Too Many Requests.')
                self.log.warning('Either your IP address or your developer account is being rate-limited.')
                self.log.warning(f'Retry-After = {retry_after}s')
                self.log.warning(f'Scrapy will now pause for {retry_after}s')
                spider.crawler.engine.pause()
                to_sleep = retry_after * 1.2
                try:
                    wait(to_sleep)
                except KeyboardInterrupt:
                    self.crawler.engine.unpause()
                    raise
                spider.crawler.engine.unpause()
                self.log.info('Resuming crawl.')
                return request.copy()
            else:
                self.log.critical('Server returned HTTP 429 Too Many Requests.')
                self.log.critical('Either your IP address or your developer account is being rate-limited.')
                self.log.critical('Crawler will now stop.')
                self.crawler.engine.close_spider(spider, 'rate_limited')
                raise IgnoreRequest()
        return response