scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware

Here are examples of the Python API scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware, taken from open source projects. All of them come from Scrapy's own test suite for the middleware.
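
RobotsTxtMiddleware filters out requests that a site's robots.txt disallows. It ships in Scrapy's default downloader middlewares and only activates when the ROBOTSTXT_OBEY setting is enabled, so a typical project switches it on in settings.py:

    # settings.py
    # The middleware is already registered in DOWNLOADER_MIDDLEWARES_BASE;
    # this setting is what actually turns robots.txt enforcement on.
    ROBOTSTXT_OBEY = True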

8 Examples

Example 1

Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
    def test_robotstxt(self):
        # requests disallowed by robots.txt should be ignored
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        return DeferredList([
            self.assertNotIgnored(Request('http://site.local/allowed'), middleware),
            self.assertIgnored(Request('http://site.local/admin/main'), middleware),
            self.assertIgnored(Request('http://site.local/static/'), middleware)
        ], fireOnOneErrback=True)
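
The helpers used above are not reproduced by this listing. A minimal sketch of what _get_successful_crawler() and the two assertion helpers might look like, assuming the crawler's engine is mocked to serve a fixed robots.txt whose rules match the assertions (the exact body and the internals of the helpers are guesses):

    from unittest import mock

    from scrapy.exceptions import IgnoreRequest
    from scrapy.http import Response
    from scrapy.settings import Settings
    from twisted.internet import reactor
    from twisted.internet.defer import Deferred, maybeDeferred
    from twisted.trial.unittest import TestCase

    # /allowed passes; /admin/ and /static/ are disallowed
    ROBOTS_BODY = b"User-agent: *\nDisallow: /admin/\nDisallow: /static/\n"

    class RobotsTxtMiddlewareTest(TestCase):
        def _get_successful_crawler(self):
            crawler = mock.MagicMock()
            crawler.settings = Settings({'ROBOTSTXT_OBEY': True})

            def return_response(request, spider=None):
                deferred = Deferred()
                response = Response(request.url, body=ROBOTS_BODY)
                reactor.callFromThread(deferred.callback, response)
                return deferred

            crawler.engine.download.side_effect = return_response
            return crawler

        def assertNotIgnored(self, request, middleware):
            # process_request returns None when the request may proceed
            dfd = maybeDeferred(middleware.process_request, request, None)
            dfd.addCallback(self.assertIsNone)
            return dfd

        def assertIgnored(self, request, middleware):
            # process_request errbacks with IgnoreRequest when filtered
            return self.assertFailure(
                maybeDeferred(middleware.process_request, request, None),
                IgnoreRequest)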

Example 2

Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
    def test_robotstxt_meta(self):
        # the dont_obey_robotstxt meta key should bypass robots.txt filtering
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        meta = {'dont_obey_robotstxt': True}
        return DeferredList([
            self.assertNotIgnored(Request('http://site.local/allowed', meta=meta), middleware),
            self.assertNotIgnored(Request('http://site.local/admin/main', meta=meta), middleware),
            self.assertNotIgnored(Request('http://site.local/static/', meta=meta), middleware)
        ], fireOnOneErrback=True)
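
The dont_obey_robotstxt meta key tested here is the same flag user code can set to bypass the middleware for an individual request. A minimal spider sketch (the spider name and URL are illustrative):

    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = 'example'

        def start_requests(self):
            # skip the robots.txt check for this request only, even though
            # ROBOTSTXT_OBEY is enabled project-wide
            yield scrapy.Request('http://site.local/admin/main',
                                 meta={'dont_obey_robotstxt': True},
                                 callback=self.parse)

        def parse(self, response):
            pass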

Example 3

Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
    def test_robotstxt_garbage(self):
        # garbage response should be discarded, equal 'allow all'
        middleware = RobotsTxtMiddleware(self._get_garbage_crawler())
        deferred = DeferredList([
            self.assertNotIgnored(Request('http://site.local'), middleware),
            self.assertNotIgnored(Request('http://site.local/allowed'), middleware),
            self.assertNotIgnored(Request('http://site.local/admin/main'), middleware),
            self.assertNotIgnored(Request('http://site.local/static/'), middleware)
        ], fireOnOneErrback=True)
        return deferred
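
Continuing the sketch from Example 1, _get_garbage_crawler() presumably stubs the same download path but with a body that cannot be parsed as robots.txt; the exact bytes below are a guess:

    def _get_garbage_crawler(self):
        crawler = mock.MagicMock()
        crawler.settings = Settings({'ROBOTSTXT_OBEY': True})

        def return_response(request, spider=None):
            deferred = Deferred()
            # binary junk instead of robots.txt rules
            response = Response(request.url,
                                body=b'GIF89a\xd3\x00\x00\x00\x02\x02')
            reactor.callFromThread(deferred.callback, response)
            return deferred

        crawler.engine.download.side_effect = return_response
        return crawler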

Example 4

Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
    def test_robotstxt_empty_response(self):
        # empty response should equal 'allow all'
        middleware = RobotsTxtMiddleware(self._get_emptybody_crawler())
        return DeferredList([
            self.assertNotIgnored(Request('http://site.local/allowed'), middleware),
            self.assertNotIgnored(Request('http://site.local/admin/main'), middleware),
            self.assertNotIgnored(Request('http://site.local/static/'), middleware)
        ], fireOnOneErrback=True)

Example 5

Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
    def test_robotstxt_error(self):
        # a failed robots.txt fetch should be logged via _logerror
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)
        err = error.DNSLookupError('Robotstxt address not found')
        def return_failure(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.errback, failure.Failure(err))
            return deferred
        self.crawler.engine.download.side_effect = return_failure

        middleware = RobotsTxtMiddleware(self.crawler)
        middleware._logerror = mock.MagicMock(side_effect=middleware._logerror)
        deferred = middleware.process_request(Request('http://site.local'), None)
        deferred.addCallback(lambda _: self.assertTrue(middleware._logerror.called))
        return deferred
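
Examples 5 through 7 run against a self.crawler fixture that this listing omits. Presumably it is a mocked crawler with real, mutable settings, so each test can flip ROBOTSTXT_OBEY and replace engine.download; a hypothetical setUp:

    def setUp(self):
        self.crawler = mock.MagicMock()
        self.crawler.settings = Settings()
        self.crawler.engine.download = mock.MagicMock()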

Example 6

Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
    def test_robotstxt_immediate_error(self):
        # a robots.txt fetch that fails immediately should equal 'allow all'
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)
        err = error.DNSLookupError('Robotstxt address not found')
        def immediate_failure(request, spider):
            deferred = Deferred()
            deferred.errback(failure.Failure(err))
            return deferred
        self.crawler.engine.download.side_effect = immediate_failure

        middleware = RobotsTxtMiddleware(self.crawler)
        return self.assertNotIgnored(Request('http://site.local'), middleware)

Example 7

Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
    def test_ignore_robotstxt_request(self):
        # IgnoreRequest while fetching robots.txt should not be logged as an error
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)
        def ignore_request(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.errback, failure.Failure(IgnoreRequest()))
            return deferred
        self.crawler.engine.download.side_effect = ignore_request

        middleware = RobotsTxtMiddleware(self.crawler)
        mw_module_logger.error = mock.MagicMock()

        d = self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
        d.addCallback(lambda _: self.assertFalse(mw_module_logger.error.called))
        return d
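
mw_module_logger above is presumably the middleware module's own logger, imported under an alias so the test can patch it:

    from scrapy.downloadermiddlewares.robotstxt import logger as mw_module_logger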

Example 8

Project: scrapy Source File: test_downloadermiddleware_robotstxt.py
    def test_robotstxt_ready_parser(self):
        # a second request should be checked against the already-parsed robots.txt
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        d = self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
        d.addCallback(lambda _: self.assertNotIgnored(Request('http://site.local/allowed'), middleware))
        return d