Here are examples of the Python API scrapy.utils.project.get_project_settings taken from open source projects.
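All of the calls below assume they run inside a Scrapy project (a scrapy.cfg is found on the path), so get_project_settings() returns a Settings object populated from the project's settings module. As a quick orientation before the project-specific examples, here is a minimal sketch of the most common pattern; MySpider and myproject are placeholders, not part of any example below:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from myproject.spiders.example import MySpider  # hypothetical project spider

settings = get_project_settings()
settings.set('LOG_LEVEL', 'INFO')     # individual settings can still be adjusted
process = CrawlerProcess(settings)    # every crawl started below uses the project settings
process.crawl(MySpider)
process.start()                       # blocks until all crawls finish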
9 Examples
Example 1
Project: uefi-spider Source File: asus_spider.py
def _get_uas(self):
    ### Edit user agent
    settings = get_project_settings()
    return " ".join([
        settings.get("USER_AGENT"),
        ### The ASP.NET application is checking for async-compatible browsers.
        "Mozilla/5.0 (Windows NT 6.1; WOW64)"
        #"AppleWebKit/537.36 (KHTML, like Gecko)",
        #"Chrome/34.0.1847.116",
        #"Safari/537.36",
    ])
Example 2
Project: daywatch Source File: spiders.py
def run_spider_instance(spider_class, site_id, main_url):
    """Run a spider given its spider class. For example, importing the TestSpider
    and passing it to this function will run it."""
    spider = spider_class(site_id=site_id, main_url=main_url)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # Scrapy uses a deprecated Twisted interface. Until the fix makes it to a
    # new version (>0.24.4), we'll use this so deprecation warnings don't
    # clutter the output
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    crawler.crawl(spider)
    crawler.start()
    reactor.run()
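Crawler(settings) followed by crawler.configure() is the pre-1.0 API this project targets. On Scrapy 1.x and later, a rough equivalent, sketched under the assumption that the spider still takes site_id and main_url as constructor arguments, lets CrawlerProcess drive the reactor itself:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_spider_instance(spider_class, site_id, main_url):
    process = CrawlerProcess(get_project_settings())
    process.crawl(spider_class, site_id=site_id, main_url=main_url)
    process.start()  # starts and stops the Twisted reactor internally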
Example 3
Project: StrepHit Source File: cli.py
@click.command()
@click.argument('spider-name', nargs=-1, required=True)
@click.argument('results-dir', type=click.Path(resolve_path=True, file_okay=False))
def crawl(spider_name, results_dir):
    """ Run one or more spiders """
    settings = get_project_settings()
    # prevent scrapy from configuring its own logging, since we already have it
    settings.set('LOG_ENABLED', False)

    process = CrawlerProcess(settings)
    for s in spider_name:
        process.settings.set('FEED_URI',
                             'file://%s.jsonlines' % os.path.join(results_dir, s))
        process.settings.set('FEED_FORMAT', 'jsonlines')
        spider = process.spider_loader.load(s)
        process.crawl(spider)
    process.start()
Example 4
Project: scrapy Source File: cmdline.py
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Example 5
Project: django-dynamic-scraper Source File: django_base_spider.py
def _set_config(self, log_msg, **kwargs):
    from scrapy.utils.project import get_project_settings
    settings = get_project_settings()

    #run_type
    if 'run_type' in kwargs:
        self.conf['RUN_TYPE'] = kwargs['run_type']
        if len(log_msg) > 0:
            log_msg += ", "
        log_msg += "run_type " + self.conf['RUN_TYPE']
    #do_action
    if 'do_action' in kwargs:
        if kwargs['do_action'] == 'yes':
            self.conf['DO_ACTION'] = True
        else:
            self.conf['DO_ACTION'] = False
        if len(log_msg) > 0:
            log_msg += ", "
        log_msg += "do_action " + str(self.conf['DO_ACTION'])

    self.conf['SPLASH_ARGS'] = settings.get('DSCRAPER_SPLASH_ARGS', self.conf['SPLASH_ARGS'])
    if 'wait' not in self.conf['SPLASH_ARGS']:
        self.conf['SPLASH_ARGS']['wait'] = 0.5

    self.conf['IMAGES_STORE_FORMAT'] = settings.get('DSCRAPER_IMAGES_STORE_FORMAT', self.conf['IMAGES_STORE_FORMAT'])
    if self.conf["IMAGES_STORE_FORMAT"] == 'FLAT':
        msg = "Use simplified FLAT images store format (save the original or one thumbnail image)"
        logging.info(msg)
        if settings.get('IMAGES_THUMBS') and len(settings.get('IMAGES_THUMBS')) > 0:
            msg = "IMAGES_THUMBS setting found, saving images as thumbnail images "
            msg += "with size {size} (first entry)".format(
                size=next(iter(settings.get('IMAGES_THUMBS').keys())))
        else:
            msg = "IMAGES_THUMBS setting not found, saving images with original size"
        logging.info(msg)
    elif self.conf["IMAGES_STORE_FORMAT"] == 'ALL':
        msg = "Use ALL images store format (Scrapy behaviour, save both original and thumbnail images)"
        logging.info(msg)
    else:
        msg = "Use THUMBS images store format (save only the thumbnail images)"
        logging.info(msg)

    self.conf['CUSTOM_PROCESSORS'] = settings.get('DSCRAPER_CUSTOM_PROCESSORS', [])
    self.conf['LOG_ENABLED'] = settings.get('DSCRAPER_LOG_ENABLED', self.conf['LOG_ENABLED'])
    self.conf['LOG_LEVEL'] = settings.get('DSCRAPER_LOG_LEVEL', self.conf['LOG_LEVEL'])
    self.conf['LOG_LIMIT'] = settings.get('DSCRAPER_LOG_LIMIT', self.conf['LOG_LIMIT'])

    self.log("Runtime config: " + log_msg, logging.INFO)
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
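The DSCRAPER_* values read above are ordinary Scrapy settings, so get_project_settings() picks them up from the project's settings module. A hypothetical settings.py fragment, with setting names taken from the code above but purely illustrative values:

# settings.py -- illustrative values, not taken from django-dynamic-scraper
DSCRAPER_IMAGES_STORE_FORMAT = 'FLAT'
DSCRAPER_SPLASH_ARGS = {'wait': 0.5}
DSCRAPER_LOG_ENABLED = True
DSCRAPER_LOG_LEVEL = 'INFO'
DSCRAPER_LOG_LIMIT = 250
IMAGES_STORE = 'images'
IMAGES_THUMBS = {'small': (170, 180)}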
Example 6
Project: django-dynamic-scraper Source File: django_checker.py
def _del_ref_object(self):
    if self.action_successful:
        self.log("Item already deleted, skipping.", logging.INFO)
        return

    from scrapy.utils.project import get_project_settings
    settings = get_project_settings()

    try:
        img_elem = self.scraper.get_image_elem()
        if hasattr(self.ref_object, img_elem.scraped_obj_attr.name):
            img_name = getattr(self.ref_object, img_elem.scraped_obj_attr.name)

            thumb_paths = []
            if settings.get('IMAGES_THUMBS') and len(settings.get('IMAGES_THUMBS')) > 0:
                for key in settings.get('IMAGES_THUMBS').keys():
                    thumb_paths.append(('thumbnail, {k}'.format(k=key), os.path.join(settings.get('IMAGES_STORE'), 'thumbs', key, img_name),))

            del_paths = []
            if self.conf['IMAGES_STORE_FORMAT'] == 'FLAT':
                del_paths.append(('original, flat path', os.path.join(settings.get('IMAGES_STORE'), img_name),))
            if self.conf['IMAGES_STORE_FORMAT'] == 'ALL':
                del_paths.append(('original, full/ path', os.path.join(settings.get('IMAGES_STORE'), 'full', img_name),))
                del_paths += thumb_paths
            if self.conf['IMAGES_STORE_FORMAT'] == 'THUMBS':
                del_paths += thumb_paths

            for path in del_paths:
                if os.access(path[1], os.F_OK):
                    try:
                        os.unlink(path[1])
                        self.log("Associated image ({n}, {p}) deleted.".format(n=img_name, p=path[0]), logging.INFO)
                    except Exception:
                        self.log("Associated image ({n}, {p}) could not be deleted!".format(n=img_name, p=path[0]), logging.ERROR)
                else:
                    self.log("Associated image ({n}, {p}) could not be found!".format(n=img_name, p=path[0]), logging.WARNING)
    except ScraperElem.DoesNotExist:
        pass

    self.ref_object.delete()
    self.scraper.last_checker_delete = datetime.datetime.now()
    self.scraper.save()
    self.action_successful = True
    self.log("Item deleted.", logging.INFO)
Example 7
Project: scrapy-cluster Source File: tests_online.py
def setUp(self):
    self.settings = get_project_settings()
    self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")

    # set up redis
    self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                  port=self.settings['REDIS_PORT'])
    try:
        self.redis_conn.info()
    except ConnectionError:
        print "Could not connect to Redis"
        # plugin is essential to functionality
        sys.exit(1)

    # clear out older test keys if any
    keys = self.redis_conn.keys("test-spider:*")
    for key in keys:
        self.redis_conn.delete(key)

    # set up kafka to consume potential results
    self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
    self.kafka_conn.ensure_topic_exists("demo_test.crawled_firehose")
    self.consumer = SimpleConsumer(
        self.kafka_conn,
        "demo-id",
        "demo_test.crawled_firehose",
        buffer_size=1024*100,
        fetch_size_bytes=1024*100,
        max_buffer_size=None
    )
    # move cursor to end of kafka topic
    self.consumer.seek(0, 2)
Example 8
Project: legco-watch Source File: tasks.py
@shared_task
def do_scrape(spider_name):
    """
    Asynchronous task for individual scrapes that is executed by Celery workers.
    :param spider_name: str name of the spider that should be run
    :return: the full path of the jsonlines output file to which results are stored
    """
    # create and configure the spider
    crawl_settings = get_project_settings()
    # configure the output
    # Technically don't need this unless we actually do the scrape, but need to put
    # up here before the crawler is instantiated so the FEED_URI override is active
    output_name = generate_scrape_name(spider_name)
    output_path = os.path.join(crawl_settings.get('DATA_DIR_BASE'), 'scrapes', output_name)
    crawl_settings.overrides['FEED_URI'] = output_path
    crawler = Crawler(crawl_settings)
    crawler.configure()
    try:
        spider = crawler.spiders.create(spider_name)
    except KeyError:
        # No spider found.
        raise RuntimeError('Could not find spider with name {}'.format(spider_name))

    # Check to see if we're already running a scrape by looking for open ScrapeJobs
    is_scraping = is_spider_scraping(spider_name)
    if is_scraping is False:
        logger.info('Starting new scrape of {}'.format(spider_name))
        # Create the ScrapeJob record
        job_id = do_scrape.request.id
        if job_id is None:
            # Case if called directly without using Celery, put in a dummy job id
            timestamp = datetime.now().strftime('%y%m%d%H%M')
            job_id = 'MANUAL_RUN{}'.format(timestamp)
        job = ScrapeJob.objects.create(
            spider=spider_name,
            scheduled=datetime.now(),
            # see http://stackoverflow.com/questions/18872854/getting-task-id-inside-a-celery-task
            job_id=job_id,
            raw_response=output_path
        )
        # and set up the callback for updating it
        complete_cb = complete_job(job.id)

        # Connect the signals and logging, then start it up
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.signals.connect(complete_cb, signal=signals.spider_closed)
        log.start(loglevel=log.INFO, logstdout=True)
        crawler.crawl(spider)
        crawler.start()
        reactor.run()
    else:
        logger.info('Pending job found for spider {}'.format(spider_name))
        job = is_scraping
    return job.raw_response
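crawl_settings.overrides[...] is another pre-1.0 idiom; Settings.overrides was deprecated and later removed. On current Scrapy versions the same override would presumably be written with set() and an explicit priority, for example:

crawl_settings = get_project_settings()
crawl_settings.set('FEED_URI', output_path, priority='cmdline')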
Example 9
Project: legco-watch Source File: utils.py
def list_spiders():
    settings = get_project_settings()
    crawler = Crawler(settings)
    return crawler.spiders.list()
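Recent Scrapy releases no longer expose crawler.spiders; an equivalent listing can be built from the project settings alone via the spider loader, sketched here as an alternative:

from scrapy.spiderloader import SpiderLoader
from scrapy.utils.project import get_project_settings

def list_spiders():
    settings = get_project_settings()
    return SpiderLoader.from_settings(settings).list()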