Here are examples of the Python API scrapy.crawler.CrawlerProcess, taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.
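Before the project-specific examples below, here is a minimal sketch of the typical CrawlerProcess workflow: instantiate it with settings, schedule one or more spiders with crawl(), and call start() to block until crawling finishes. The QuotesSpider class and its start URL are illustrative placeholders, not taken from any of the projects listed here.

import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):
    # Hypothetical spider, used only to illustrate the CrawlerProcess API
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com']

    def parse(self, response):
        for text in response.css('div.quote span.text::text').getall():
            yield {'text': text}

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(QuotesSpider)   # schedule the spider
process.start()               # start the reactor; blocks until the crawl finishes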
6 Examples
Example 1
import os

import click
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


@click.command()
@click.argument('spider-name', nargs=-1, required=True)
@click.argument('results-dir', type=click.Path(resolve_path=True, file_okay=False))
def crawl(spider_name, results_dir):
    """Run one or more spiders."""
    settings = get_project_settings()
    # Prevent scrapy from configuring its own logging, since we already have it
    settings.set('LOG_ENABLED', False)

    process = CrawlerProcess(settings)
    for s in spider_name:
        # Write each spider's items to its own JSON Lines feed
        process.settings.set('FEED_URI',
                             'file://%s.jsonlines' % os.path.join(results_dir, s))
        process.settings.set('FEED_FORMAT', 'jsonlines')
        spider = process.spider_loader.load(s)
        process.crawl(spider)
    process.start()
Example 2
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------
    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Example 3
Project: scrapy Source File: test_crawler.py
def test_crawler_process_accepts_dict(self):
    runner = CrawlerProcess({'foo': 'bar'})
    self.assertEqual(runner.settings['foo'], 'bar')
    self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')
Example 4
Project: scrapy Source File: test_crawler.py
def test_crawler_process_accepts_None(self):
    runner = CrawlerProcess()
    self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')
Example 5
def run_config(config):
    config = ConfigLoader(config)
    CustomMiddleware.driver = config.driver
    DocuementationSpider.NB_INDEXED = 0

    if config.use_anchors:
        from . import scrapy_patch

    strategy = DefaultStrategy(config)

    algolia_helper = AlgoliaHelper(
        config.app_id,
        config.api_key,
        config.index_name,
        AlgoliaSettings.get(config, strategy.levels)
    )

    DOWNLOADER_MIDDLEWARES_PATH = 'scraper.src.custom_middleware.CustomMiddleware'
    DOWNLOADER_CLIENTCONTEXTFACTORY = 'scraper.src.scrapy_patch.CustomContextFactory'

    if __name__ == '__main__':
        DOWNLOADER_MIDDLEWARES_PATH = 'src.custom_middleware.CustomMiddleware'
        DOWNLOADER_CLIENTCONTEXTFACTORY = 'src.scrapy_patch.CustomContextFactory'

    process = CrawlerProcess({
        'LOG_ENABLED': '1',
        'LOG_LEVEL': 'ERROR',
        # 'LOG_LEVEL': 'DEBUG',
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36',
        'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900},
        # Need to be > 600 to be after the redirectMiddleware
        'DOWNLOADER_CLIENTCONTEXTFACTORY': DOWNLOADER_CLIENTCONTEXTFACTORY
    })

    process.crawl(
        DocuementationSpider,
        config=config,
        algolia_helper=algolia_helper,
        strategy=strategy
    )
    process.start()
    process.stop()

    # Kill browser if needed
    BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        algolia_helper.add_records(config.extra_records, "Extra records")

    if len(Camelizer.synonyms) > 0:
        algolia_helper.add_synonyms(Camelizer.synonyms)

    print("")

    if DocuementationSpider.NB_INDEXED > 0:
        algolia_helper.commit_tmp_index()
        print('Nb hits: ' + str(DocuementationSpider.NB_INDEXED))
        config.update_nb_hits(DocuementationSpider.NB_INDEXED)
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_name)
        algolia_helper.report_crawling_issue()

    print("")
Example 6
Project: django-dynamic-scraper Source File: scraper_test.py
def setUp(self):
    if os.path.exists(self.IMG_DIR):
        shutil.rmtree(self.IMG_DIR)
    os.mkdir(self.IMG_DIR)

    settings.set('ITEM_PIPELINES', self.dds_settings['ITEM_PIPELINES'], priority='cmdline')
    settings.set('SPLASH_URL', self.dds_settings['SPLASH_URL'], priority='cmdline')
    settings.set('DUPEFILTER_CLASS', self.dds_settings['DUPEFILTER_CLASS'], priority='cmdline')
    settings.set('DOWNLOADER_MIDDLEWARES', self.dds_settings['DOWNLOADER_MIDDLEWARES'], priority='cmdline')
    settings.set('IMAGES_STORE', self.dds_settings['IMAGES_STORE'], priority='cmdline')
    if 'IMAGES_THUMBS' in self.dds_settings:
        settings.set('IMAGES_THUMBS', self.dds_settings['IMAGES_THUMBS'], priority='cmdline')
    if 'DSCRAPER_IMAGES_STORE_FORMAT' in self.dds_settings:
        settings.set('DSCRAPER_IMAGES_STORE_FORMAT', self.dds_settings['DSCRAPER_IMAGES_STORE_FORMAT'], priority='cmdline')
    settings.set('COOKIES_DEBUG', True)
    settings.set('LOG_LEVEL', 'DEBUG')
    settings.set('LOG_ENABLED', False)

    #self.crawler = Crawler(settings)
    #self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #self.crawler.configure()
    self.process = CrawlerProcess(settings)

    self.sc = ScrapedObjClass(name='Event')
    self.sc.save()
    self.soa_base = ScrapedObjAttr(name='base', attr_type='B', obj_class=self.sc)
    self.soa_base.save()
    self.soa_title = ScrapedObjAttr(name='title', attr_type='S', obj_class=self.sc)
    self.soa_title.save()
    self.soa_url = ScrapedObjAttr(name='url', attr_type='U', obj_class=self.sc, id_field=True)
    self.soa_url.save()
    self.soa_url2 = ScrapedObjAttr(name='url2', attr_type='U', obj_class=self.sc)
    self.soa_url2.save()
    self.soa_desc = ScrapedObjAttr(name='description', attr_type='S', obj_class=self.sc)
    self.soa_desc.save()
    self.soa_desc2 = ScrapedObjAttr(name='description2', attr_type='S', obj_class=self.sc)
    self.soa_desc2.save()
    self.soa_es_1 = ScrapedObjAttr(name='extra_standard_1', attr_type='S', obj_class=self.sc, save_to_db=False)
    self.soa_es_1.save()

    self.scraper = Scraper(name='Event Scraper', scraped_obj_class=self.sc, status='A')
    self.scraper.save()
    self.se_base = ScraperElem(scraped_obj_attr=self.soa_base, scraper=self.scraper,
                               x_path='//ul/li', request_page_type='MP')
    self.se_base.save()
    self.se_title = ScraperElem(scraped_obj_attr=self.soa_title, scraper=self.scraper,
                                x_path='a/text()', request_page_type='MP')
    self.se_title.save()
    self.se_url = ScraperElem(scraped_obj_attr=self.soa_url, scraper=self.scraper,
                              x_path='a/@href', request_page_type='MP')
    self.se_url.save()
    self.se_desc = ScraperElem(scraped_obj_attr=self.soa_desc, scraper=self.scraper,
                               x_path='//div/div[@class="description"]/text()', request_page_type='DP1', mandatory=False)
    self.se_desc.save()
    self.se_es_1 = ScraperElem(scraped_obj_attr=self.soa_es_1, scraper=self.scraper,
                               x_path='a/text()', request_page_type='MP')
    self.se_es_1.save()

    self.rpt_mp = RequestPageType(page_type='MP', scraper=self.scraper)
    self.rpt_mp.save()
    self.rpt_dp1 = RequestPageType(page_type='DP1', scraper=self.scraper, scraped_obj_attr=self.soa_url)
    self.rpt_dp1.save()

    self.sched_rt = SchedulerRuntime()
    self.sched_rt.save()
    self.event_website = EventWebsite(pk=1, name='Event Website', scraper=self.scraper,
                                      url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'), scraper_runtime=self.sched_rt)
    self.event_website.save()

    for name, signal in list(vars(signals).items()):
        if not name.startswith('_'):
            dispatcher.connect(self.record_signal, signal)