scrapy.signals.spider_idle

Here are examples of the Python API scrapy.signals.spider_idle taken from open source projects.
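
The signal itself fires whenever a spider has no more requests left to download or schedule, and a connected handler may raise DontCloseSpider to keep the crawl alive. Before the project examples, here is a minimal sketch of wiring up a handler; the spider class and name are illustrative, not taken from any project below.

    from scrapy import Spider, signals

    class IdleAwareSpider(Spider):
        name = 'idle_aware'

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super(IdleAwareSpider, cls).from_crawler(crawler, *args, **kwargs)
            # Connect once the spider has its crawler object.
            crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
            return spider

        def spider_idle(self):
            # Returning normally lets the engine close the spider with
            # reason 'finished'; raising DontCloseSpider would keep it open.
            self.logger.info('%s went idle', self.name)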

8 Examples

Example 1

Project: scrapy-mosquitera Source File: mixin.py
Function: dm_setup
    def dm_setup(self):
        """ Set method for spider idle state.
        It's implemented this way to support one and many instances of the mixin.

        """
        dispatcher.connect(
            self.dequeue_next_page_requests,
            signal=signals.spider_idle
        )
        self._was_setup_called = True

Example 2

Project: scrapy-mosquitera Source File: mixin.py
    def dm_teardown(self):
        """ Disconnect the method from the signal.
        It's done to avoid conflicts when many instances of the mixin are being executed.

        """
        try:
            dispatcher.disconnect(
                self.dequeue_next_page_requests,
                signal=signals.spider_idle
            )
        except DispatcherKeyError:
            pass
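
Examples 1 and 2 use the old pydispatch-style dispatcher (in older Scrapy, from scrapy.xlib.pydispatch import dispatcher). In current Scrapy the same connect/disconnect pair is normally written against the crawler's SignalManager; a rough equivalent, assuming the mixin can reach a crawler attribute, looks like this.

    from scrapy import signals

    # Inside the mixin:
    def dm_setup(self):
        # SignalManager.connect replaces dispatcher.connect.
        self.crawler.signals.connect(
            self.dequeue_next_page_requests,
            signal=signals.spider_idle
        )
        self._was_setup_called = True

    def dm_teardown(self):
        # SignalManager.disconnect replaces dispatcher.disconnect; it raises
        # the same dispatcher error if the handler was never connected.
        self.crawler.signals.disconnect(
            self.dequeue_next_page_requests,
            signal=signals.spider_idle
        )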

Example 3

Project: scrapy Source File: engine.py
    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle,
                                          spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
               for _, x in res):
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')
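
As the docstring spells out, any handler connected to spider_idle can veto the shutdown by raising DontCloseSpider. A minimal extension demonstrating the veto (the class name is illustrative; it would be enabled through the EXTENSIONS setting):

    from scrapy import signals
    from scrapy.exceptions import DontCloseSpider

    class KeepAliveExtension(object):

        @classmethod
        def from_crawler(cls, crawler):
            ext = cls()
            crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
            return ext

        def spider_idle(self, spider):
            # send_catch_log wraps this exception in a Failure, which is
            # exactly what _spider_idle() above checks for before closing.
            raise DontCloseSpider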

Example 4

Project: scrapy-redis Source File: test_spiders.py
    @mock.patch('scrapy_redis.spiders.connection')
    def test_via_from_crawler(self, connection):
        server = connection.from_settings.return_value = mock.Mock()
        crawler = get_crawler()
        myspider = MySpider.from_crawler(crawler)
        assert myspider.server is server
        connection.from_settings.assert_called_with(crawler.settings)
        crawler.signals.connect.assert_called_with(myspider.spider_idle, signal=signals.spider_idle)
        # Second call does nothing.
        server = myspider.server
        crawler.signals.connect.reset_mock()
        myspider.setup_redis()
        assert myspider.server is server
        assert crawler.signals.connect.call_count == 0

Example 5

Project: scrapy-rabbitmq Source File: spiders.py
    def setup_rabbitmq(self):
        """ Setup RabbitMQ connection.

            Call this method after spider has set its crawler object.
        :return: None
        """

        if not self.rabbitmq_key:
            self.rabbitmq_key = '{}:start_urls'.format(self.name)

        self.server = connection.from_settings(self.crawler.settings)
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
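
Note that the method also hooks item_scraped. The likely intent is to schedule the next queue message as soon as an item is scraped instead of waiting for an idle cycle; a hedged sketch of that companion handler, where schedule_next_request is an assumed helper that pops one message from RabbitMQ and feeds it to the engine:

    def item_scraped(self, *args, **kwargs):
        # Don't wait for the spider to go idle: fetch the next message
        # from the queue as soon as an item has been scraped.
        # (schedule_next_request is an assumed helper, not shown above.)
        self.schedule_next_request()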

Example 6

Project: scrapy-redis Source File: spiders.py
Function: setup_redis
    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        """
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', DEFAULT_START_URLS_KEY,
            )

        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE', DEFAULT_START_URLS_BATCH_SIZE,
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s)", self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
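
The spider_idle method that this connects is not shown in the excerpt. In scrapy-redis it schedules the next batch from the redis key and then raises DontCloseSpider so the spider stays open, roughly:

    from scrapy.exceptions import DontCloseSpider

    def spider_idle(self):
        # Pull another batch of start URLs from the redis key and keep
        # the spider alive to wait for more.
        self.schedule_next_requests()
        raise DontCloseSpider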

Example 7

Project: scrapy-kafka Source File: spiders.py
    def setup_kafka(self, settings):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.

        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name

        hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
        _kafka = KafkaClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                       auto_commit=True, iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from kafka topic '%s'" % self.kafka_topic)

Example 8

Project: scrapy-cluster Source File: redis_spider.py
Function: _set_crawler
    def _set_crawler(self, crawler):
        super(RedisSpider, self)._set_crawler(crawler)
        self.crawler.signals.connect(self.spider_idle,
                                     signal=signals.spider_idle)