scrapy.crawler.CrawlerProcess

Here are examples of the Python API scrapy.crawler.CrawlerProcess taken from open source projects. Each example shows how a project constructs a CrawlerProcess, schedules one or more spiders on it, and starts the crawl.

6 Examples
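
Before the project excerpts, a minimal, self-contained sketch of the typical workflow: build a CrawlerProcess from a settings dict, schedule a spider with crawl(), then call start() to run the reactor. The spider, site, and selectors below are illustrative only and not taken from any of the projects.

import scrapy
from scrapy.crawler import CrawlerProcess


class QuotesSpider(scrapy.Spider):
    # Hypothetical spider used only for illustration.
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        for quote in response.css("div.quote"):
            yield {"text": quote.css("span.text::text").get()}


process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(QuotesSpider)
process.start()  # blocks here until the crawl is finished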

Example 1

Project: StrepHit Source File: cli.py
Function: crawl
# imports implied by this excerpt
import os

import click
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


@click.command()
@click.argument('spider-name', nargs=-1, required=True)
@click.argument('results-dir', type=click.Path(resolve_path=True, file_okay=False))
def crawl(spider_name, results_dir):
    """ Run one or more spiders """
    settings = get_project_settings()
    # prevent scrapy from configuring its own logging, since we already have it
    settings.set('LOG_ENABLED', False)

    process = CrawlerProcess(settings)
    for s in spider_name:
        process.settings.set('FEED_URI',
                             'file://%s.jsonlines' % os.path.join(results_dir, s))
        process.settings.set('FEED_FORMAT', 'jsonlines')
        spider = process.spider_loader.load(s)
        process.crawl(spider)
    process.start()
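
FEED_URI and FEED_FORMAT, as used above, have been superseded in recent Scrapy releases (2.1+) by the FEEDS setting, which covers the same per-spider output pattern through the %(name)s placeholder. A sketch under that assumption; 'my_spider' is a hypothetical spider name registered in the project:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
# FEEDS maps an output URI to its options; %(name)s expands to the
# spider name, so each spider writes its own file.
settings.set("FEEDS", {"results/%(name)s.jsonlines": {"format": "jsonlines"}})

process = CrawlerProcess(settings)
for spider_name in ("my_spider",):  # hypothetical spider name(s)
    process.crawl(spider_name)      # a name string is resolved via the spider loader
process.start()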

Example 2

Project: scrapy Source File: cmdline.py
Function: execute
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
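
execute() is the function behind the scrapy command-line tool: it resolves the sub-command, merges that command's default_settings, attaches a CrawlerProcess via cmd.crawler_process, and runs the command. It can also be called from a wrapper script; a small sketch with a hypothetical spider name:

from scrapy.cmdline import execute

# Equivalent to running `scrapy crawl my_spider` inside the project
# directory; 'my_spider' is a hypothetical spider name. Note that
# execute() finishes with sys.exit(cmd.exitcode), so it does not return.
execute(["scrapy", "crawl", "my_spider"])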

Example 3

Project: scrapy Source File: test_crawler.py
    def test_crawler_process_accepts_dict(self):
        runner = CrawlerProcess({'foo': 'bar'})
        self.assertEqual(runner.settings['foo'], 'bar')
        self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')

Example 4

Project: scrapy Source File: test_crawler.py
    def test_crawler_process_accepts_None(self):
        runner = CrawlerProcess()
        self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')
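
The two tests above pin down the constructor's flexibility: CrawlerProcess accepts a plain dict, a scrapy.settings.Settings object, or no argument at all, in which case built-in defaults apply. A short sketch along the same lines, assuming the stock default of RETRY_ENABLED being enabled:

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

# A plain dict: unknown keys are stored as-is, known ones override defaults.
p1 = CrawlerProcess({"foo": "bar"})
assert p1.settings["foo"] == "bar"

# A Settings instance works the same way.
p2 = CrawlerProcess(Settings({"RETRY_ENABLED": False}))
assert p2.settings.getbool("RETRY_ENABLED") is False

# No argument at all: every option keeps its built-in default.
p3 = CrawlerProcess()
assert p3.settings.getbool("RETRY_ENABLED") is True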

Example 5

Project: docsearch-scraper Source File: index.py
Function: run_config
def run_config(config):
    config = ConfigLoader(config)
    CustomMiddleware.driver = config.driver
    DocuementationSpider.NB_INDEXED = 0

    if config.use_anchors:
        from . import scrapy_patch

    strategy = DefaultStrategy(config)

    algolia_helper = AlgoliaHelper(
        config.app_id,
        config.api_key,
        config.index_name,
        AlgoliaSettings.get(config, strategy.levels)
    )

    DOWNLOADER_MIDDLEWARES_PATH = 'scraper.src.custom_middleware.CustomMiddleware'
    DOWNLOADER_CLIENTCONTEXTFACTORY = 'scraper.src.scrapy_patch.CustomContextFactory'

    if __name__ == '__main__':
        DOWNLOADER_MIDDLEWARES_PATH = 'src.custom_middleware.CustomMiddleware'
        DOWNLOADER_CLIENTCONTEXTFACTORY = 'src.scrapy_patch.CustomContextFactory'

    process = CrawlerProcess({
        'LOG_ENABLED': '1',
        'LOG_LEVEL': 'ERROR',
        # 'LOG_LEVEL': 'DEBUG',
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36',
        'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900},
        # Need to be > 600 to be after the redirectMiddleware
        'DOWNLOADER_CLIENTCONTEXTFACTORY': DOWNLOADER_CLIENTCONTEXTFACTORY
    })

    process.crawl(
        DocuementationSpider,
        config=config,
        algolia_helper=algolia_helper,
        strategy=strategy
    )

    process.start()
    process.stop()

    # Kill browser if needed
    BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        algolia_helper.add_records(config.extra_records, "Extra records")

    if len(Camelizer.synonyms) > 0:
        algolia_helper.add_synonyms(Camelizer.synonyms)

    print("")

    if DocuementationSpider.NB_INDEXED > 0:
        algolia_helper.commit_tmp_index()
        print('Nb hits: ' + str(DocuementationSpider.NB_INDEXED))
        config.update_nb_hits(DocuementationSpider.NB_INDEXED)
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_name)
        algolia_helper.report_crawling_issue()

    print("")

Example 6

Project: django-dynamic-scraper Source File: scraper_test.py
    def setUp(self):
        if os.path.exists(self.IMG_DIR):
            shutil.rmtree(self.IMG_DIR)
        os.mkdir(self.IMG_DIR)

        settings.set('ITEM_PIPELINES', self.dds_settings['ITEM_PIPELINES'], priority='cmdline')
        settings.set('SPLASH_URL', self.dds_settings['SPLASH_URL'], priority='cmdline')
        settings.set('DUPEFILTER_CLASS', self.dds_settings['DUPEFILTER_CLASS'], priority='cmdline')
        settings.set('DOWNLOADER_MIDDLEWARES', self.dds_settings['DOWNLOADER_MIDDLEWARES'], priority='cmdline')
        settings.set('IMAGES_STORE', self.dds_settings['IMAGES_STORE'], priority='cmdline')
        if 'IMAGES_THUMBS' in self.dds_settings:
            settings.set('IMAGES_THUMBS', self.dds_settings['IMAGES_THUMBS'], priority='cmdline')
        if 'DSCRAPER_IMAGES_STORE_FORMAT' in self.dds_settings:
            settings.set('DSCRAPER_IMAGES_STORE_FORMAT', self.dds_settings['DSCRAPER_IMAGES_STORE_FORMAT'], priority='cmdline')

        settings.set('COOKIES_DEBUG', True)
        settings.set('LOG_LEVEL', 'DEBUG')
        settings.set('LOG_ENABLED', False)
        
        #self.crawler = Crawler(settings)
        #self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        #self.crawler.configure()
        
        self.process = CrawlerProcess(settings)
        
        self.sc = ScrapedObjClass(name='Event')
        self.sc.save()
        self.soa_base = ScrapedObjAttr(name='base', attr_type='B', obj_class=self.sc)
        self.soa_base.save()
        self.soa_title = ScrapedObjAttr(name='title', attr_type='S', obj_class=self.sc)
        self.soa_title.save()
        self.soa_url = ScrapedObjAttr(name='url', attr_type='U', obj_class=self.sc, id_field=True)
        self.soa_url.save()
        self.soa_url2 = ScrapedObjAttr(name='url2', attr_type='U', obj_class=self.sc)
        self.soa_url2.save()
        self.soa_desc = ScrapedObjAttr(name='description', attr_type='S', obj_class=self.sc)
        self.soa_desc.save()
        self.soa_desc2 = ScrapedObjAttr(name='description2', attr_type='S', obj_class=self.sc)
        self.soa_desc2.save()
        self.soa_es_1 = ScrapedObjAttr(name='extra_standard_1', attr_type='S', obj_class=self.sc, save_to_db=False)
        self.soa_es_1.save()

        self.scraper = Scraper(name='Event Scraper', scraped_obj_class=self.sc, status='A',)
        self.scraper.save()
        
        self.se_base = ScraperElem(scraped_obj_attr=self.soa_base, scraper=self.scraper, 
            x_path='//ul/li', request_page_type='MP')
        self.se_base.save()
        self.se_title = ScraperElem(scraped_obj_attr=self.soa_title, scraper=self.scraper, 
            x_path='a/text()', request_page_type='MP')
        self.se_title.save()
        self.se_url = ScraperElem(scraped_obj_attr=self.soa_url, scraper=self.scraper, 
            x_path='a/@href', request_page_type='MP')
        self.se_url.save()
        self.se_desc = ScraperElem(scraped_obj_attr=self.soa_desc, scraper=self.scraper, 
            x_path='//div/div[@class="description"]/text()', request_page_type='DP1', mandatory=False)
        self.se_desc.save()
        self.se_es_1 = ScraperElem(scraped_obj_attr=self.soa_es_1, scraper=self.scraper, 
            x_path='a/text()', request_page_type='MP')
        self.se_es_1.save()

        self.rpt_mp  = RequestPageType(page_type='MP', scraper=self.scraper)
        self.rpt_mp.save()
        self.rpt_dp1 = RequestPageType(page_type='DP1', scraper=self.scraper, scraped_obj_attr=self.soa_url)
        self.rpt_dp1.save()
        
        self.sched_rt = SchedulerRuntime()
        self.sched_rt.save()
        
        self.event_website = EventWebsite(pk=1, name='Event Website', scraper=self.scraper,
            url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'), scraper_runtime=self.sched_rt,)
        self.event_website.save()
        
        for name, signal in list(vars(signals).items()):
            if not name.startswith('_'):
                dispatcher.connect(self.record_signal, signal)
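
The commented-out Crawler lines and the dispatcher.connect() loop above hook directly into Scrapy's signal machinery. With CrawlerProcess, a comparable pattern is to create the crawler first and connect handlers to its signal manager before scheduling it; a sketch using a hypothetical spider and handler:

import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess


class EventSpider(scrapy.Spider):
    # Hypothetical spider used only to demonstrate signal wiring.
    name = "event"
    start_urls = ["https://example.com/"]

    def parse(self, response):
        yield {"url": response.url}


def on_spider_closed(spider, reason):
    print("%s closed: %s" % (spider.name, reason))


process = CrawlerProcess({"LOG_LEVEL": "ERROR"})
crawler = process.create_crawler(EventSpider)
crawler.signals.connect(on_spider_closed, signal=signals.spider_closed)
process.crawl(crawler)
process.start()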