scrapy.signals.item_scraped

Here are examples of the Python API scrapy.signals.item_scraped taken from open source projects.
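
The signal fires once for each item yielded by a spider after it has passed through the item pipelines; handlers receive the item, the response that produced it, and the spider. As a quick orientation before the project examples, here is a minimal, self-contained sketch of connecting to it (the spider, URL, and handler name are illustrative, not taken from the projects below):

import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        for text in response.css("span.text::text").getall():
            yield {"text": text}

def on_item_scraped(item, response, spider):
    # Called once per item that makes it through the item pipelines.
    spider.logger.info("scraped: %r", item)

process = CrawlerProcess()
crawler = process.create_crawler(QuotesSpider)
crawler.signals.connect(on_item_scraped, signal=signals.item_scraped)
process.crawl(crawler)
process.start()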

4 Examples

Example 1

Project: autologin
Source File: scrapyutils.py
    def __init__(self, crawl_d, crawler):
        self.crawl_d = crawl_d
        self.crawler = crawler

        # Collect scraped items and surface spider errors via signal handlers.
        crawler.signals.connect(self._on_item_scraped, signals.item_scraped)
        crawler.signals.connect(self._on_error, signals.spider_error)

        # Resolve when the crawl Deferred finishes or fails.
        crawl_d.addCallback(self._on_finished)
        crawl_d.addErrback(self._on_error)

        self.closed = False
        # Buffer of scraped items and a Deferred that consumers wait on for new items.
        self._items_available = Deferred()
        self._items = collections.deque()
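
The _on_item_scraped handler is not part of this excerpt. Given that items are buffered in self._items and consumers appear to wait on self._items_available, a plausible sketch of it (an assumption, not the actual autologin source) is:

    def _on_item_scraped(self, item):
        # Hypothetical: buffer the item and fire the Deferred so a waiting
        # consumer can drain self._items.
        self._items.append(item)
        if not self._items_available.called:
            self._items_available.callback(None)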

Example 2

Project: scrapydo
Source File: api.py
@crochet.run_in_reactor
def _run_spider_in_reactor(spider_cls, capture_items=True, return_crawler=False,
                           settings=None):
    """Runs given spider inside the twisted reactdor.

    Parameters
    ----------
    spider_cls : scrapy.Spider
        Spider to run.
    capture_items : bool (default: True)
        If enabled, the scraped items are captured and returned.
    return_crawler : bool (default: False)
        If enabled, the crawler instance is returned. If ``capture_items`` is
        enabled, the scraped items are collected in ``crawler.items``.
    settings : dict, optional
        Custom crawler settings.

    Returns
    -------
    out : crochet.EventualResult
        If ``capture_items`` is ``True``, returns the scraped items. If
        ``return_crawler`` is ``True``, returns the crawler instance.

    """
    settings = settings or {}
    crawler_settings = get_project_settings().copy()
    crawler_settings.setdict(settings)
    log_scrapy_info(crawler_settings)
    crawler = Crawler(spider_cls, crawler_settings)
    d = crawler.crawl()
    if capture_items:
        crawler.items = _OutputItems()
        crawler.signals.connect(crawler.items.append, signal=signals.item_scraped)
        d.addCallback(lambda _: crawler.items)
    if return_crawler:
        d.addCallback(lambda _: crawler)
    return d
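
Because the function is decorated with crochet.run_in_reactor, calling it returns a crochet.EventualResult rather than the items themselves; the caller blocks on .wait() to get the result. A usage sketch (MySpider and the timeout value are placeholders):

import crochet
crochet.setup()  # start the Twisted reactor in a background thread, once per process

result = _run_spider_in_reactor(MySpider, capture_items=True)
items = result.wait(timeout=60)  # blocks until the crawl finishes or the timeout expires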

Example 3

Project: scrapyrt
Source File: core.py
    def crawl(self, spidercls, *args, **kwargs):
        if isinstance(spidercls, six.string_types):
            spidercls = self.spiders.load(spidercls)
        # creating our own crawler that will allow us to disable start requests easily
        crawler = ScrapyrtCrawler(
            spidercls, self.settings, self.scrapyrt_manager.start_requests)
        self.scrapyrt_manager.crawler = crawler
        # Connecting signals to handlers that control crawl process
        crawler.signals.connect(self.scrapyrt_manager.get_item,
                                signals.item_scraped)
        crawler.signals.connect(self.scrapyrt_manager.collect_dropped,
                                signals.item_dropped)
        crawler.signals.connect(self.scrapyrt_manager.spider_idle,
                                signals.spider_idle)
        crawler.signals.connect(self.scrapyrt_manager.handle_spider_error,
                                signals.spider_error)
        crawler.signals.connect(self.scrapyrt_manager.handle_scheduling,
                                signals.request_scheduled)
        dfd = super(ScrapyrtCrawlerProcess, self).crawl(crawler, *args, **kwargs)
        _cleanup_handler = setup_spider_logging(crawler.spider, self.settings)

        def cleanup_logging(result):
            _cleanup_handler()
            return result

        return dfd.addBoth(cleanup_logging)
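
The scrapyrt_manager handler methods are not shown in this excerpt. Scrapy delivers item_scraped with (item, response, spider) and spider_error with (failure, response, spider), so compatible handlers would look roughly like this (a hedged sketch, with self.items and self.errors as assumed attributes, not the scrapyrt source):

    def get_item(self, item, response, spider):
        # item_scraped delivers each scraped item together with its response.
        self.items.append(item)

    def handle_spider_error(self, failure, response, spider):
        # spider_error delivers a twisted.python.failure.Failure raised in a callback.
        self.errors.append(failure)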

Example 4

Project: scrapy
Source File: test_engine.py
    def run(self):
        self.port = start_test_site()
        self.portno = self.port.getHost().port

        start_urls = [self.geturl("/"), self.geturl("/redirect"),
                      self.geturl("/redirect")]  # a duplicate

        for name, signal in vars(signals).items():
            if not name.startswith('_'):
                dispatcher.connect(self.record_signal, signal)

        self.crawler = get_crawler(self.spider_class)
        self.crawler.signals.connect(self.item_scraped, signals.item_scraped)
        self.crawler.signals.connect(self.request_scheduled, signals.request_scheduled)
        self.crawler.signals.connect(self.request_dropped, signals.request_dropped)
        self.crawler.signals.connect(self.response_downloaded, signals.response_downloaded)
        self.crawler.crawl(start_urls=start_urls)
        self.spider = self.crawler.spider

        self.deferred = defer.Deferred()
        dispatcher.connect(self.stop, signals.engine_stopped)
        return self.deferred
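
The recording helpers referenced above (record_signal, item_scraped, and the rest) are outside this excerpt. Handlers that accept the keyword arguments each signal sends would look roughly like this (a sketch under that assumption, with self.signals_caught and self.itemresp as assumed attributes, not the actual test code):

    def record_signal(self, *args, **kwargs):
        # Generic receiver: the dispatcher also passes 'signal' and 'sender' keywords.
        self.signals_caught[kwargs.get('signal')] = kwargs

    def item_scraped(self, item, spider, response):
        # Keep the (item, response) pair for later assertions.
        self.itemresp.append((item, response))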