Here are examples of the Python API scrapy.utils.test.get_crawler, taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.
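Judging from the call patterns in the examples below, get_crawler takes an optional spider class and an optional settings_dict and returns a Crawler wired up for testing. A minimal sketch (the LOG_LEVEL setting is just an illustrative choice):

    from scrapy import Spider
    from scrapy.utils.test import get_crawler

    # Build a throwaway Crawler for testing; both arguments are optional.
    crawler = get_crawler(Spider, settings_dict={'LOG_LEVEL': 'WARNING'})
    assert crawler.settings.get('LOG_LEVEL') == 'WARNING'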
52 Examples
Example 1
Project: scrapy Source File: test_spider.py
def test_deprecated_set_crawler_method(self):
    spider = self.spider_class('example.com')
    crawler = get_crawler()
    with warnings.catch_warnings(record=True) as w:
        spider.set_crawler(crawler)
        self.assertIn("set_crawler", str(w[0].message))
        self.assertTrue(hasattr(spider, 'crawler'))
        self.assertIs(spider.crawler, crawler)
        self.assertTrue(hasattr(spider, 'settings'))
        self.assertIs(spider.settings, crawler.settings)
Example 2
@defer.inlineCallbacks
def test_logging(self):
    crawler = get_crawler(_HttpErrorSpider)
    with LogCapture() as log:
        yield crawler.crawl(bypass_status_codes={402})
    self.assertEqual(crawler.spider.parsed, {'200', '402'})
    self.assertEqual(crawler.spider.skipped, {'402'})
    self.assertEqual(crawler.spider.failed, {'404', '500'})

    self.assertIn('Ignoring response <404', str(log))
    self.assertIn('Ignoring response <500', str(log))
    self.assertNotIn('Ignoring response <200', str(log))
    self.assertNotIn('Ignoring response <402', str(log))
Example 3
Project: scrapy Source File: test_downloader_handlers.py
def test_not_configured_handler(self):
    handlers = {'scheme': 'tests.test_downloader_handlers.OffDH'}
    crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
    dh = DownloadHandlers(crawler)
    self.assertIn('scheme', dh._schemes)
    for scheme in handlers:  # force load handlers
        dh._get_handler(scheme)
    self.assertNotIn('scheme', dh._handlers)
    self.assertIn('scheme', dh._notconfigured)
Example 4
Project: scrapy Source File: test_proxy_connect.py
@defer.inlineCallbacks
def test_https_connect_tunnel_error(self):
    crawler = get_crawler(SimpleSpider)
    with LogCapture() as l:
        yield crawler.crawl("https://localhost:99999/status?n=200")
    self._assert_got_tunnel_error(l)
Example 5
Project: scrapy Source File: test_downloadermiddleware_cookies.py
def test_setting_disabled_cookies_debug(self):
    crawler = get_crawler(settings_dict={'COOKIES_DEBUG': False})
    mw = CookiesMiddleware.from_crawler(crawler)
    with LogCapture('scrapy.downloadermiddlewares.cookies',
                    propagate=False,
                    level=logging.DEBUG) as l:
        req = Request('http://scrapytest.org/')
        res = Response('http://scrapytest.org/',
                       headers={'Set-Cookie': 'C1=value1; path=/'})
        mw.process_response(req, res, crawler.spider)
        req2 = Request('http://scrapytest.org/sub1/')
        mw.process_request(req2, crawler.spider)
        l.check()  # COOKIES_DEBUG is off, so no debug records were logged
Example 6
Project: scrapy Source File: test_downloadermiddleware_cookies.py
def test_setting_false_cookies_enabled(self):
    self.assertRaises(
        NotConfigured,
        CookiesMiddleware.from_crawler,
        get_crawler(settings_dict={'COOKIES_ENABLED': False})
    )
Example 7
Project: scrapy Source File: test_spider.py
def test_follow_links_attribute_population(self):
    crawler = get_crawler()
    spider = self.spider_class.from_crawler(crawler, 'example.com')
    self.assertTrue(hasattr(spider, '_follow_links'))
    self.assertTrue(spider._follow_links)

    settings_dict = {'CRAWLSPIDER_FOLLOW_LINKS': False}
    crawler = get_crawler(settings_dict=settings_dict)
    spider = self.spider_class.from_crawler(crawler, 'example.com')
    self.assertTrue(hasattr(spider, '_follow_links'))
    self.assertFalse(spider._follow_links)
Example 8
Project: scrapy Source File: test_spider.py
def test_follow_links_attribute_deprecated_population(self):
    spider = self.spider_class('example.com')
    self.assertFalse(hasattr(spider, '_follow_links'))

    spider.set_crawler(get_crawler())
    self.assertTrue(hasattr(spider, '_follow_links'))
    self.assertTrue(spider._follow_links)

    spider = self.spider_class('example.com')
    settings_dict = {'CRAWLSPIDER_FOLLOW_LINKS': False}
    spider.set_crawler(get_crawler(settings_dict=settings_dict))
    self.assertTrue(hasattr(spider, '_follow_links'))
    self.assertFalse(spider._follow_links)
Example 9
def _get_crawler(settings_dict):
    settings_dict = settings_dict.copy()
    settings_dict['DOWNLOAD_HANDLERS'] = {'s3': None}  # for faster test running
    crawler = get_crawler(settings_dict=settings_dict)
    if not hasattr(crawler, 'logformatter'):
        crawler.logformatter = None
    crawler.engine = ExecutionEngine(crawler, lambda _: None)
    # spider = crawler._create_spider("foo")
    return crawler
Example 10
Project: scrapy Source File: test_closespider.py
@defer.inlineCallbacks
def test_closespider_timeout(self):
    close_on = 0.1
    crawler = get_crawler(FollowAllSpider, {'CLOSESPIDER_TIMEOUT': close_on})
    yield crawler.crawl(total=1000000)
    reason = crawler.spider.meta['close_reason']
    self.assertEqual(reason, 'closespider_timeout')
    stats = crawler.stats
    start = stats.get_value('start_time')
    stop = stats.get_value('finish_time')
    diff = stop - start
    total_seconds = diff.seconds + diff.microseconds
    self.assertTrue(total_seconds >= close_on)
Example 11
Project: scrapy Source File: test_proxy_connect.py
@defer.inlineCallbacks
def test_https_connect_tunnel(self):
    crawler = get_crawler(SimpleSpider)
    with LogCapture() as l:
        yield crawler.crawl("https://localhost:8999/status?n=200")
    self._assert_got_response_code(200, l)
Example 12
Project: scrapy Source File: test_downloadermiddleware_defaultheaders.py
def get_defaults_spider_mw(self):
    crawler = get_crawler(Spider)
    spider = crawler._create_spider('foo')
    defaults = {
        to_bytes(k): [to_bytes(v)]
        for k, v in crawler.settings.get('DEFAULT_REQUEST_HEADERS').items()
    }
    return defaults, spider, DefaultHeadersMiddleware.from_crawler(crawler)
Example 13
Project: scrapy Source File: test_downloadermiddleware_stats.py
def setUp(self):
    self.crawler = get_crawler(Spider)
    self.spider = self.crawler._create_spider('scrapytest.org')
    self.mw = DownloaderStats(self.crawler.stats)
    self.crawler.stats.open_spider(self.spider)
    self.req = Request('http://scrapytest.org')
    self.res = Response('scrapytest.org', status=400)
Example 14
Project: scrapy Source File: test_proxy_connect.py
@defer.inlineCallbacks
def test_https_tunnel_without_leak_proxy_authorization_header(self):
    request = Request("https://localhost:8999/echo")
    crawler = get_crawler(SingleRequestSpider)
    with LogCapture() as l:
        yield crawler.crawl(seed=request)
    self._assert_got_response_code(200, l)
    echo = json.loads(crawler.spider.meta['responses'][0].body)
    self.assertTrue('Proxy-Authorization' not in echo['headers'])
Example 15
Project: scrapy Source File: test_proxy_connect.py
@defer.inlineCallbacks
def test_https_noconnect_auth_error(self):
    os.environ['https_proxy'] = 'http://wrong:wronger@localhost:8888?noconnect'
    crawler = get_crawler(SimpleSpider)
    with LogCapture() as l:
        yield crawler.crawl("https://localhost:8999/status?n=200")
    self._assert_got_response_code(407, l)
Example 16
Project: scrapy Source File: test_spider.py
def test_from_crawler_init_call(self):
    with mock.patch.object(self.spider_class, '__init__',
                           return_value=None) as mock_init:
        self.spider_class.from_crawler(get_crawler(), 'example.com',
                                       foo='bar')
        mock_init.assert_called_once_with('example.com', foo='bar')
Example 17
Project: scrapy Source File: test_closespider.py
@defer.inlineCallbacks
def test_closespider_errorcount(self):
    close_on = 5
    crawler = get_crawler(ErrorSpider, {'CLOSESPIDER_ERRORCOUNT': close_on})
    yield crawler.crawl(total=1000000)
    reason = crawler.spider.meta['close_reason']
    self.assertEqual(reason, 'closespider_errorcount')
    key = 'spider_exceptions/{name}'\
        .format(name=crawler.spider.exception_cls.__name__)
    errorcount = crawler.stats.get_value(key)
    self.assertTrue(errorcount >= close_on)
Example 18
def setUp(self):
    crawler = get_crawler(Spider)
    self.spider = crawler._create_spider('scrapytest.org')
    self.stats = StatsCollector(crawler)
    self.stats.open_spider(self.spider)
    self.mw = DepthMiddleware(1, self.stats, True)
Example 19
Project: scrapy Source File: test_spider.py
def test_closed_signal_call(self):
    class TestSpider(self.spider_class):
        closed_called = False

        def closed(self, reason):
            self.closed_called = True

    crawler = get_crawler()
    spider = TestSpider.from_crawler(crawler, 'example.com')
    crawler.signals.send_catch_log(signal=signals.spider_opened,
                                   spider=spider)
    crawler.signals.send_catch_log(signal=signals.spider_closed,
                                   spider=spider, reason=None)
    self.assertTrue(spider.closed_called)
Example 20
Project: scrapy Source File: test_downloader_handlers.py
def test_disabled_handler(self):
    handlers = {'scheme': None}
    crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
    dh = DownloadHandlers(crawler)
    self.assertNotIn('scheme', dh._schemes)
    for scheme in handlers:  # force load handlers
        dh._get_handler(scheme)
    self.assertNotIn('scheme', dh._handlers)
    self.assertIn('scheme', dh._notconfigured)
Example 21
Project: scrapy Source File: test_downloadermiddleware_cookies.py
def test_setting_true_cookies_enabled(self):
    self.assertIsInstance(
        CookiesMiddleware.from_crawler(
            get_crawler(settings_dict={'COOKIES_ENABLED': True})
        ),
        CookiesMiddleware
    )
Example 22
@defer.inlineCallbacks
def test_download(self):
    crawler = get_crawler(SingleRequestSpider)
    yield crawler.crawl(seed=Request(url='http://localhost:8998'))
    failure = crawler.spider.meta.get('failure')
    self.assertIsNone(failure)
    reason = crawler.spider.meta['close_reason']
    self.assertEqual(reason, 'finished')
Example 23
Project: scrapy Source File: test_spidermiddleware_httperror.py
@defer.inlineCallbacks
def test_middleware_works(self):
    crawler = get_crawler(_HttpErrorSpider)
    yield crawler.crawl()
    assert not crawler.spider.skipped, crawler.spider.skipped
    self.assertEqual(crawler.spider.parsed, {'200'})
    self.assertEqual(crawler.spider.failed, {'404', '402', '500'})
Example 24
Project: scrapy Source File: test_closespider.py
@defer.inlineCallbacks
def test_closespider_itemcount(self):
    close_on = 5
    crawler = get_crawler(ItemSpider, {'CLOSESPIDER_ITEMCOUNT': close_on})
    yield crawler.crawl()
    reason = crawler.spider.meta['close_reason']
    self.assertEqual(reason, 'closespider_itemcount')
    itemcount = crawler.stats.get_value('item_scraped_count')
    self.assertTrue(itemcount >= close_on)
Example 25
@defer.inlineCallbacks
def test_download_with_content_length(self):
    crawler = get_crawler(SingleRequestSpider)
    # http://localhost:8998/partial sets Content-Length to 1024;
    # use download_maxsize=1000 to avoid downloading it
    yield crawler.crawl(seed=Request(url='http://localhost:8998/partial',
                                     meta={'download_maxsize': 1000}))
    failure = crawler.spider.meta['failure']
    self.assertIsInstance(failure.value, defer.CancelledError)
Example 26
Project: scrapy Source File: test_feedexport.py
def get_test_spider(self, settings=None):
    class TestSpider(scrapy.Spider):
        name = 'test_spider'

    crawler = get_crawler(settings_dict=settings)
    spider = TestSpider.from_crawler(crawler)
    return spider
Example 27
Project: scrapy Source File: test_engine.py
@defer.inlineCallbacks
def test_close_spiders_downloader(self):
    e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
    yield e.open_spider(TestSpider(), [])
    self.assertEqual(len(e.open_spiders), 1)
    yield e.close()
    self.assertEqual(len(e.open_spiders), 0)
Example 28
def setUp(self):
    self.yesterday = email.utils.formatdate(time.time() - 86400)
    self.today = email.utils.formatdate()
    self.tomorrow = email.utils.formatdate(time.time() + 86400)
    self.crawler = get_crawler(Spider)
    self.spider = self.crawler._create_spider('example.com')
    self.tmpdir = tempfile.mkdtemp()
    self.request = Request('http://www.example.com',
                           headers={'User-Agent': 'test'})
    self.response = Response('http://www.example.com',
                             headers={'Content-Type': 'text/html'},
                             body=b'test body',
                             status=202)
    self.crawler.stats.open_spider(self.spider)
Example 29
Project: scrapy Source File: test_proxy_connect.py
@defer.inlineCallbacks
def test_https_noconnect(self):
    os.environ['https_proxy'] = 'http://scrapy:scrapy@localhost:8888?noconnect'
    crawler = get_crawler(SimpleSpider)
    with LogCapture() as l:
        yield crawler.crawl("https://localhost:8999/status?n=200")
    self._assert_got_response_code(200, l)
    os.environ['https_proxy'] = 'http://scrapy:scrapy@localhost:8888'
Example 30
Project: scrapy Source File: test_spider.py
def test_from_crawler_crawler_and_settings_population(self):
    crawler = get_crawler()
    spider = self.spider_class.from_crawler(crawler, 'example.com')
    self.assertTrue(hasattr(spider, 'crawler'))
    self.assertIs(spider.crawler, crawler)
    self.assertTrue(hasattr(spider, 'settings'))
    self.assertIs(spider.settings, crawler.settings)
Example 31
Project: scrapy Source File: test_proxy_connect.py
@defer.inlineCallbacks
def test_https_tunnel_auth_error(self):
    os.environ['https_proxy'] = 'http://wrong:wronger@localhost:8888'
    crawler = get_crawler(SimpleSpider)
    with LogCapture() as l:
        yield crawler.crawl("https://localhost:8999/status?n=200")
    # The proxy returns a 407 error code, but it does not reach the client;
    # the client just sees a TunnelError.
    self._assert_got_tunnel_error(l)
    os.environ['https_proxy'] = 'http://scrapy:scrapy@localhost:8888'
Example 32
Project: scrapy Source File: test_closespider.py
@defer.inlineCallbacks
def test_closespider_pagecount(self):
    close_on = 5
    crawler = get_crawler(FollowAllSpider, {'CLOSESPIDER_PAGECOUNT': close_on})
    yield crawler.crawl()
    reason = crawler.spider.meta['close_reason']
    self.assertEqual(reason, 'closespider_pagecount')
    pagecount = crawler.stats.get_value('response_received_count')
    self.assertTrue(pagecount >= close_on)
Example 33
Project: scrapy Source File: test_downloader_handlers.py
def test_enabled_handler(self):
    handlers = {'scheme': 'tests.test_downloader_handlers.DummyDH'}
    crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
    dh = DownloadHandlers(crawler)
    self.assertIn('scheme', dh._schemes)
    for scheme in handlers:  # force load handlers
        dh._get_handler(scheme)
    self.assertIn('scheme', dh._handlers)
    self.assertNotIn('scheme', dh._notconfigured)
Example 34
Project: scrapy Source File: test_utils_log.py
def setUp(self):
    self.logger = logging.getLogger('test')
    self.logger.setLevel(logging.NOTSET)
    self.logger.propagate = False
    self.crawler = get_crawler(settings_dict={'LOG_LEVEL': 'WARNING'})
    self.handler = LogCounterHandler(self.crawler)
    self.logger.addHandler(self.handler)
Example 35
def setUp(self):
    self.crawler = get_crawler(Spider, self.settings_dict)
    self.spider = self.crawler._create_spider('foo')
    self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
    # some mw depends on stats collector
    self.crawler.stats.open_spider(self.spider)
    return self.mwman.open_spider(self.spider)
Example 36
Project: scrapy Source File: test_engine.py
@defer.inlineCallbacks
def test_close_engine_spiders_downloader(self):
    e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
    yield e.open_spider(TestSpider(), [])
    e.start()
    self.assertTrue(e.running)
    yield e.close()
    self.assertFalse(e.running)
    self.assertEqual(len(e.open_spiders), 0)
Example 37
def setUp(self):
    crawler = get_crawler(Spider, {'AJAXCRAWL_ENABLED': True})
    self.spider = crawler._create_spider('foo')
    self.mw = AjaxCrawlMiddleware.from_crawler(crawler)
Example 38
def setUp(self):
    self.crawler = get_crawler(Spider)
    self.spider = self.crawler._create_spider('foo')
    self.mw = RedirectMiddleware.from_crawler(self.crawler)
Example 39
def setUp(self):
    crawler = get_crawler(Spider)
    self.spider = crawler._create_spider('foo')
    self.mw = MetaRefreshMiddleware.from_crawler(crawler)
Example 40
def setUp(self):
    crawler = get_crawler(Spider)
    self.spider = crawler._create_spider('foo')
    self.mw = RetryMiddleware.from_crawler(crawler)
    self.mw.max_retry_times = 2
Example 41
def setUp(self):
    self.url = 'http://localhost'
    self.kwargs = {'url': self.url}
    self.start_requests_mock = MagicMock()
    self.spidercls = MetaSpider
    self._start_requests = self.spidercls.start_requests
    self.spidercls.start_requests = self.start_requests_mock
    self.crawler = get_crawler(self.spidercls)

    class CustomCrawlManager(CrawlManager):
        def get_project_settings(self):
            crawl_settings = super(
                CustomCrawlManager, self).get_project_settings()
            crawl_settings.setdict(
                {'SPIDER_MODULES': 'tests.spiders'}, priority='cmdline')
            return crawl_settings

    self.crawl_manager = CustomCrawlManager(
        self.spidercls.name, self.kwargs.copy())
    self.crawl_manager.crawler = self.crawler
Example 42
Project: scrapy Source File: test_downloadermiddleware_cookies.py
def test_setting_default_cookies_enabled(self):
    self.assertIsInstance(
        CookiesMiddleware.from_crawler(get_crawler()),
        CookiesMiddleware
    )
Example 43
Project: scrapy Source File: test_downloadermiddleware_useragent.py
def get_spider_and_mw(self, default_useragent):
    crawler = get_crawler(Spider, {'USER_AGENT': default_useragent})
    spider = crawler._create_spider('foo')
    return spider, UserAgentMiddleware.from_crawler(crawler)
Example 44
Project: scrapy Source File: test_downloadermiddleware_cookies.py
def test_setting_enabled_cookies_debug(self):
    crawler = get_crawler(settings_dict={'COOKIES_DEBUG': True})
    mw = CookiesMiddleware.from_crawler(crawler)
    with LogCapture('scrapy.downloadermiddlewares.cookies',
                    propagate=False,
                    level=logging.DEBUG) as l:
        req = Request('http://scrapytest.org/')
        res = Response('http://scrapytest.org/',
                       headers={'Set-Cookie': 'C1=value1; path=/'})
        mw.process_response(req, res, crawler.spider)
        req2 = Request('http://scrapytest.org/sub1/')
        mw.process_request(req2, crawler.spider)
        l.check(
            ('scrapy.downloadermiddlewares.cookies',
             'DEBUG',
             'Received cookies from: <200 http://scrapytest.org/>\n'
             'Set-Cookie: C1=value1; path=/\n'),
            ('scrapy.downloadermiddlewares.cookies',
             'DEBUG',
             'Sending cookies to: <GET http://scrapytest.org/sub1/>\n'
             'Cookie: C1=value1\n'),
        )
Example 45
def setUp(self):
    crawler = get_crawler(Spider)
    self.spider = crawler._create_spider(**self._get_spiderargs())
    self.mw = OffsiteMiddleware.from_crawler(crawler)
    self.mw.spider_opened(self.spider)
Example 46
Project: scrapy Source File: test_downloader_handlers.py
@defer.inlineCallbacks
def test_download_gzip_response(self):
    if twisted_version > (12, 3, 0):
        crawler = get_crawler(SingleRequestSpider)
        body = b'1' * 100  # PayloadResource requires body length to be 100
        request = Request('http://localhost:8998/payload', method='POST',
                          body=body, meta={'download_maxsize': 50})
        yield crawler.crawl(seed=request)
        failure = crawler.spider.meta['failure']
        # download_maxsize < 100, hence the CancelledError
        self.assertIsInstance(failure.value, defer.CancelledError)

        if six.PY2:
            request.headers.setdefault(b'Accept-Encoding', b'gzip,deflate')
            request = request.replace(url='http://localhost:8998/xpayload')
            yield crawler.crawl(seed=request)
            # download_maxsize = 50 is enough for the gzipped response
            failure = crawler.spider.meta.get('failure')
            self.assertIsNone(failure)
            reason = crawler.spider.meta['close_reason']
            self.assertEqual(reason, 'finished')
        else:
            # See issue https://twistedmatrix.com/trac/ticket/8175
            raise unittest.SkipTest("xpayload only enabled for PY2")
    else:
        raise unittest.SkipTest("xpayload and payload endpoint "
                                "only enabled for twisted > 12.3.0")
Example 47
def get_spider(*args, **kwargs):
    crawler = get_crawler(spidercls=kwargs.pop('spidercls', None),
                          settings_dict=kwargs.pop('settings_dict', None))
    return crawler._create_spider(*args, **kwargs)
Example 48
def run(self):
    self.port = start_test_site()
    self.portno = self.port.getHost().port

    start_urls = [self.geturl("/"), self.geturl("/redirect"),
                  self.geturl("/redirect")]  # a duplicate

    for name, signal in vars(signals).items():
        if not name.startswith('_'):
            dispatcher.connect(self.record_signal, signal)

    self.crawler = get_crawler(self.spider_class)
    self.crawler.signals.connect(self.item_scraped, signals.item_scraped)
    self.crawler.signals.connect(self.request_scheduled, signals.request_scheduled)
    self.crawler.signals.connect(self.request_dropped, signals.request_dropped)
    self.crawler.signals.connect(self.response_downloaded, signals.response_downloaded)
    self.crawler.crawl(start_urls=start_urls)
    self.spider = self.crawler.spider

    self.deferred = defer.Deferred()
    dispatcher.connect(self.stop, signals.engine_stopped)
    return self.deferred
Example 49
Project: scrapy Source File: test_engine.py
@defer.inlineCallbacks
def test_close_downloader(self):
    e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
    yield e.close()
Example 50
Project: scrapy Source File: test_downloadermiddleware_downloadtimeout.py
def get_request_spider_mw(self, settings=None):
    crawler = get_crawler(Spider, settings)
    spider = crawler._create_spider('foo')
    request = Request('http://scrapytest.org/')
    return request, spider, DownloadTimeoutMiddleware.from_crawler(crawler)