Here are examples of the Python API scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware, taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.
8 Examples
3
Example 1
Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
def test_robotstxt(self):
    # Using the crawler from _get_successful_crawler(): '/allowed' must pass
    # through, while '/admin/main' and '/static/' must be ignored.
    mw = RobotsTxtMiddleware(self._get_successful_crawler())
    checks = [
        self.assertNotIgnored(Request('http://site.local/allowed'), mw),
        self.assertIgnored(Request('http://site.local/admin/main'), mw),
        self.assertIgnored(Request('http://site.local/static/'), mw),
    ]
    # fireOnOneErrback=True makes the combined deferred fail as soon as any
    # single check fails.
    return DeferredList(checks, fireOnOneErrback=True)
3
Example 2
Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
def test_robotstxt_meta(self):
    # Every request carries 'dont_obey_robotstxt': True in its meta, and none
    # of the three URLs is expected to be ignored.
    mw = RobotsTxtMiddleware(self._get_successful_crawler())
    bypass_meta = {'dont_obey_robotstxt': True}
    urls = (
        'http://site.local/allowed',
        'http://site.local/admin/main',
        'http://site.local/static/',
    )
    # The same meta dict object is handed to each Request, in URL order.
    checks = [
        self.assertNotIgnored(Request(url, meta=bypass_meta), mw)
        for url in urls
    ]
    return DeferredList(checks, fireOnOneErrback=True)
3
Example 3
Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
def test_robotstxt_garbage(self):
    # A garbage robots.txt response should be discarded, which is equivalent
    # to 'allow all': none of the URLs below may be ignored.
    mw = RobotsTxtMiddleware(self._get_garbage_crawler())
    urls = (
        'http://site.local',
        'http://site.local/allowed',
        'http://site.local/admin/main',
        'http://site.local/static/',
    )
    checks = [self.assertNotIgnored(Request(url), mw) for url in urls]
    return DeferredList(checks, fireOnOneErrback=True)
3
Example 4
Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
def test_robotstxt_empty_response(self):
    # An empty robots.txt response should be equivalent to 'allow all'.
    mw = RobotsTxtMiddleware(self._get_emptybody_crawler())
    pending = []
    for url in (
        'http://site.local/allowed',
        'http://site.local/admin/main',
        'http://site.local/static/',
    ):
        pending.append(self.assertNotIgnored(Request(url), mw))
    # Fail the combined deferred on the first failing check.
    return DeferredList(pending, fireOnOneErrback=True)
3
Example 5
Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
def test_robotstxt_error(self):
    # The mocked engine download fails with a DNS lookup error; once the
    # deferred returned by process_request fires, the middleware's _logerror
    # must have been called.
    self.crawler.settings.set('ROBOTSTXT_OBEY', True)
    dns_error = error.DNSLookupError('Robotstxt address not found')

    def return_failure(request, spider):
        pending = Deferred()
        # Hand the errback to the reactor instead of firing it inline, so
        # the failure arrives after this function has returned.
        reactor.callFromThread(pending.errback, failure.Failure(dns_error))
        return pending

    self.crawler.engine.download.side_effect = return_failure

    mw = RobotsTxtMiddleware(self.crawler)
    # Wrap the real _logerror: it still runs, but the call is recorded.
    mw._logerror = mock.MagicMock(side_effect=mw._logerror)

    def check_error_logged(_):
        return self.assertTrue(mw._logerror.called)

    result = mw.process_request(Request('http://site.local'), None)
    result.addCallback(check_error_logged)
    return result
3
Example 6
Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
def test_robotstxt_immediate_error(self):
    # The mocked engine download returns a deferred that has already failed
    # with a DNS lookup error; the request must still not be ignored.
    self.crawler.settings.set('ROBOTSTXT_OBEY', True)
    dns_error = error.DNSLookupError('Robotstxt address not found')

    def immediate_failure(request, spider):
        already_failed = Deferred()
        # Fire the errback synchronously, before handing the deferred back.
        already_failed.errback(failure.Failure(dns_error))
        return already_failed

    self.crawler.engine.download.side_effect = immediate_failure

    mw = RobotsTxtMiddleware(self.crawler)
    outcome = self.assertNotIgnored(Request('http://site.local'), mw)
    return outcome
3
Example 7
Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
def test_ignore_robotstxt_request(self):
    # The mocked engine download fails with IgnoreRequest. The request for
    # '/allowed' must not be ignored, and the middleware module's logger
    # must not have had error() called.
    self.crawler.settings.set('ROBOTSTXT_OBEY', True)

    def ignore_request(request, spider):
        deferred = Deferred()
        # Deliver the IgnoreRequest failure through the reactor rather than
        # firing it inline.
        reactor.callFromThread(deferred.errback, failure.Failure(IgnoreRequest()))
        return deferred

    self.crawler.engine.download.side_effect = ignore_request

    middleware = RobotsTxtMiddleware(self.crawler)

    # Patch the module logger's error() for this test only. Assigning a
    # MagicMock directly would leave the shared logger permanently replaced,
    # hiding error logs from (and coupling this test to) later tests.
    # NOTE(review): assumes self is a unittest-style TestCase that provides
    # addCleanup and runs cleanups after the returned deferred has fired.
    patcher = mock.patch.object(mw_module_logger, 'error')
    logged_error = patcher.start()
    self.addCleanup(patcher.stop)

    d = self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
    d.addCallback(lambda _: self.assertFalse(logged_error.called))
    return d
0
Example 8
Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
def test_robotstxt_ready_parser(self):
    # Check the same URL twice against one middleware instance; the second
    # check starts only after the first one has completed.
    # NOTE(review): presumably the second check exercises an already-parsed
    # robots.txt — confirm against the middleware implementation.
    mw = RobotsTxtMiddleware(self._get_successful_crawler())

    def check_again(_):
        return self.assertNotIgnored(Request('http://site.local/allowed'), mw)

    first_check = self.assertNotIgnored(Request('http://site.local/allowed'), mw)
    first_check.addCallback(check_again)
    return first_check