Here are examples of the Python API `scrapy.signals.spider_idle`, taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.
8 Examples
3
Example 1
def dm_setup(self):
    """Register ``dequeue_next_page_requests`` on the spider idle signal.

    Implemented via the global dispatcher so the hookup works whether one
    or many instances of the mixin are in play.
    """
    dispatcher.connect(self.dequeue_next_page_requests,
                       signal=signals.spider_idle)
    # Remember that setup ran, so later code can check it.
    self._was_setup_called = True
3
Example 2
Project: scrapy-mosquitera Source File: mixin.py
def dm_teardown(self):
    """Detach ``dequeue_next_page_requests`` from the spider idle signal.

    Disconnecting avoids conflicts when many instances of the mixin are
    being executed.
    """
    try:
        dispatcher.disconnect(self.dequeue_next_page_requests,
                              signal=signals.spider_idle)
    except DispatcherKeyError:
        # Handler was never connected (or already removed) -- nothing to undo.
        pass
3
Example 3
Project: scrapy Source File: engine.py
def _spider_idle(self, spider):
    """Handle *spider* going idle.

    Called when there are no remaining pages to download or schedule; it
    may fire multiple times.  If any ``spider_idle`` handler raises
    ``DontCloseSpider``, the spider is kept open and this method is
    guaranteed to run (at least) once more for this spider.
    """
    responses = self.signals.send_catch_log(
        signal=signals.spider_idle,
        spider=spider,
        dont_log=DontCloseSpider,
    )
    # Any handler vetoing the close does so by raising DontCloseSpider,
    # which send_catch_log reports back as a Failure.
    vetoed = any(
        isinstance(result, Failure) and isinstance(result.value, DontCloseSpider)
        for _, result in responses
    )
    if vetoed:
        return
    if self.spider_is_idle(spider):
        self.close_spider(spider, reason='finished')
2
Example 4
Project: scrapy-redis Source File: test_spiders.py
@mock.patch('scrapy_redis.spiders.connection')
def test_via_from_crawler(self, connection):
    """``from_crawler`` wires the redis server and idle signal exactly once."""
    mocked_server = mock.Mock()
    connection.from_settings.return_value = mocked_server
    server = mocked_server
    crawler = get_crawler()
    myspider = MySpider.from_crawler(crawler)
    assert myspider.server is server
    connection.from_settings.assert_called_with(crawler.settings)
    crawler.signals.connect.assert_called_with(
        myspider.spider_idle, signal=signals.spider_idle)
    # A second setup call must be a no-op: same server, no new signal hookup.
    server = myspider.server
    crawler.signals.connect.reset_mock()
    myspider.setup_redis()
    assert myspider.server is server
    assert crawler.signals.connect.call_count == 0
2
Example 5
Project: scrapy-rabbitmq Source File: spiders.py
def setup_rabbitmq(self):
    """Establish the RabbitMQ connection and hook up the spider signals.

    Call this method after the spider has set its crawler object.
    :return: None
    """
    if not self.rabbitmq_key:
        # Default queue key derived from the spider name.
        self.rabbitmq_key = '{}:start_urls'.format(self.name)
    self.server = connection.from_settings(self.crawler.settings)
    # spider_idle lets us feed new requests; item_scraped lets us ack work.
    for handler, sig in ((self.spider_idle, signals.spider_idle),
                         (self.item_scraped, signals.item_scraped)):
        self.crawler.signals.connect(handler, signal=sig)
0
Example 6
def setup_redis(self, crawler=None):
    """Wire up the redis connection and the spider idle signal.

    This should be called after the spider has set its crawler object.
    Safe to call repeatedly: once ``self.server`` is set, it is a no-op.
    """
    if self.server is not None:
        # Already configured.
        return

    if crawler is None:
        # We allow optional crawler argument to keep backwards
        # compatibility.
        # XXX: Raise a deprecation warning.
        crawler = getattr(self, 'crawler', None)
    if crawler is None:
        raise ValueError("crawler is required")

    settings = crawler.settings

    # Resolve the redis key template, then substitute the spider name.
    if self.redis_key is None:
        self.redis_key = settings.get(
            'REDIS_START_URLS_KEY', DEFAULT_START_URLS_KEY)
    self.redis_key = self.redis_key % {'name': self.name}
    if not self.redis_key.strip():
        raise ValueError("redis_key must not be empty")

    # Resolve the batch size and normalize it to an int.
    if self.redis_batch_size is None:
        self.redis_batch_size = settings.getint(
            'REDIS_START_URLS_BATCH_SIZE', DEFAULT_START_URLS_BATCH_SIZE)
    try:
        self.redis_batch_size = int(self.redis_batch_size)
    except (TypeError, ValueError):
        raise ValueError("redis_batch_size must be an integer")

    self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                     "(batch size: %(redis_batch_size)s)", self.__dict__)

    self.server = connection.from_settings(crawler.settings)
    # The idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from redis queue
    crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
0
Example 7
Project: scrapy-kafka Source File: spiders.py
def setup_kafka(self, settings):
    """Set up the Kafka consumer and the idle signal.

    This should be called after the spider has set its crawler object.

    :param settings: The current Scrapy settings being used
    :type settings: scrapy.settings.Settings
    """
    if not hasattr(self, 'topic') or not self.topic:
        # Default topic derived from the spider name.
        self.topic = '%s-starturls' % self.name

    hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
    consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
    _kafka = KafkaClient(hosts)
    # wait at most 1sec for more messages. Otherwise continue
    self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                   auto_commit=True, iter_timeout=1.0)

    # idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from kafka topic
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    # Fixed: the original read ``self.kafka_topic``, an attribute never set by
    # this class (the topic lives in ``self.topic``), raising AttributeError.
    self.log("Reading URLs from kafka topic '%s'" % self.topic)
0
Example 8
def _set_crawler(self, crawler):
    """Attach the crawler, then register the spider idle handler.

    The superclass hook stores ``crawler`` on the spider; afterwards we
    connect ``spider_idle`` so idle periods can trigger new work.
    """
    super(RedisSpider, self)._set_crawler(crawler)
    self.crawler.signals.connect(
        self.spider_idle, signal=signals.spider_idle)