scrapy.spiders.Spider

Here are examples of the Python API scrapy.spiders.Spider, taken from open source projects.

42 Examples
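
Spider is the base class that every Scrapy spider inherits from, and, as the examples below show, it can also be instantiated directly: the constructor requires a name, and any extra keyword arguments (such as download_maxsize or allowed_domains in the examples) are set as attributes on the instance. A minimal sketch of that pattern, using a hypothetical domain:

    from scrapy.spiders import Spider

    # `name` is required; Spider raises ValueError without one.
    # Extra keyword arguments become attributes on the instance.
    spider = Spider(name='example.com', allowed_domains=['example.com'])
    assert spider.name == 'example.com'
    assert spider.allowed_domains == ['example.com']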

Example 1

Project: frontera Source File: test_seed_loader.py
    def test_process_start_requests_ignore_comments(self):
        seeds_content = """
https://www.example.com
# https://www.dmoz.org
https://www.scrapy.org
# https://www.test.com
"""
        seed_loader = self.seed_loader_setup(seeds_content)
        requests = seed_loader.process_start_requests(None, Spider(name='spider'))
        self.assertEqual([r.url for r in requests], ['https://www.example.com', 'https://www.scrapy.org'])

Example 2

Project: frontera Source File: test_seed_loader.py
    def check_request_urls(self, urls, key_extension='.txt'):
        with open(self.seed_path_1, 'rU') as s1:
            with open(self.seed_path_2, 'rU') as s2:
                conn = MockConnection()
                bucket = conn.create_bucket('some-bucket')
                bucket.add_key('seeds-folder/seeds1%s' % key_extension, s1)
                bucket.add_key('seeds-folder/seeds2%s' % key_extension, s2)

                def mocked_connect_s3(*args, **kwargs):
                    return conn

                with mock.patch('frontera.contrib.scrapy.middlewares.seeds.s3.connect_s3',
                                side_effect=mocked_connect_s3):
                    requests = self.seed_loader.process_start_requests(None, Spider(name='spider'))
                    self.assertEqual(set([r.url for r in requests]), set(urls))

Example 3

Project: scrapy Source File: test_downloader_handlers.py
    def test_download(self):
        def _test(response):
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.status, 200)
            self.assertEquals(response.body, b'0123456789')

        request = Request(path_to_file_uri(self.tmpname + '^'))
        assert request.url.upper().endswith('%5E')
        return self.download_request(request, Spider('foo')).addCallback(_test)

Example 4

Project: scrapy Source File: test_downloader_handlers.py
Function: test_download
    def test_download(self):
        request = Request(self.getURL('file'))
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.body)
        d.addCallback(self.assertEquals, b"0123456789")
        return d

Example 5

Project: scrapy Source File: test_downloader_handlers.py
    def test_download_head(self):
        request = Request(self.getURL('file'), method='HEAD')
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.body)
        d.addCallback(self.assertEquals, b'')
        return d

Example 6

Project: scrapy Source File: test_downloader_handlers.py
    def test_redirect_status(self):
        request = Request(self.getURL('redirect'))
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.status)
        d.addCallback(self.assertEquals, 302)
        return d

Example 7

Project: scrapy Source File: test_downloader_handlers.py
    def test_redirect_status_head(self):
        request = Request(self.getURL('redirect'), method='HEAD')
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.status)
        d.addCallback(self.assertEquals, 302)
        return d

Example 8

Project: scrapy Source File: test_downloader_handlers.py
    @defer.inlineCallbacks
    def test_timeout_download_from_spider_nodata_rcvd(self):
        # client connects but no data is received
        spider = Spider('foo')
        meta = {'download_timeout': 0.2}
        request = Request(self.getURL('wait'), meta=meta)
        d = self.download_request(request, spider)
        yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)

Example 9

Project: scrapy Source File: test_downloader_handlers.py
    @defer.inlineCallbacks
    def test_timeout_download_from_spider_server_hangs(self):
        # client connects, server sends headers and some body bytes, then hangs
        spider = Spider('foo')
        meta = {'download_timeout': 0.2}
        request = Request(self.getURL('hang-after-headers'), meta=meta)
        d = self.download_request(request, spider)
        yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)

Example 10

Project: scrapy Source File: test_downloader_handlers.py
    def test_host_header_not_in_request_headers(self):
        def _test(response):
            self.assertEquals(
                response.body, to_bytes('%s:%d' % (self.host, self.portno)))
            self.assertEquals(request.headers, {})

        request = Request(self.getURL('host'))
        return self.download_request(request, Spider('foo')).addCallback(_test)

Example 11

Project: scrapy Source File: test_downloader_handlers.py
    def test_host_header_seted_in_request_headers(self):
        def _test(response):
            self.assertEquals(response.body, b'example.com')
            self.assertEquals(request.headers.get('Host'), b'example.com')

        request = Request(self.getURL('host'), headers={'Host': 'example.com'})
        return self.download_request(request, Spider('foo')).addCallback(_test)

Example 12

Project: scrapy Source File: test_downloader_handlers.py
    def test_content_length_zero_bodyless_post_request_headers(self):
        """Tests if "Content-Length: 0" is sent for bodyless POST requests.

        This is not strictly required by HTTP RFCs but can cause trouble
        for some web servers.
        See:
        https://github.com/scrapy/scrapy/issues/823
        https://issues.apache.org/jira/browse/TS-2902
        https://github.com/kennethreitz/requests/issues/405
        https://bugs.python.org/issue14721
        """
        def _test(response):
            self.assertEquals(response.body, b'0')

        request = Request(self.getURL('contentlength'), method='POST', headers={'Host': 'example.com'})
        return self.download_request(request, Spider('foo')).addCallback(_test)
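
As the docstring notes, the request above carries no body; for reference, a minimal sketch of such a bodyless POST (the URL is hypothetical), showing that Request defaults to an empty body:

    from scrapy.http import Request

    # A POST request with no body; Scrapy's downloader is expected to send
    # "Content-Length: 0" for it (see the issue links in the docstring above).
    req = Request('http://example.com/submit', method='POST')
    assert req.method == 'POST'
    assert req.body == b''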

Example 13

Project: scrapy Source File: test_downloader_handlers.py
Function: test_payload
    def test_payload(self):
        body = b'1'*100 # PayloadResource requires body length to be 100
        request = Request(self.getURL('payload'), method='POST', body=body)
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.body)
        d.addCallback(self.assertEquals, body)
        return d

Example 14

Project: scrapy Source File: test_downloader_handlers.py
    def test_download_without_maxsize_limit(self):
        request = Request(self.getURL('file'))
        d = self.download_request(request, Spider('foo'))
        d.addCallback(lambda r: r.body)
        d.addCallback(self.assertEquals, b"0123456789")
        return d

Example 15

Project: scrapy Source File: test_downloader_handlers.py
    def test_response_class_choosing_request(self):
        """Tests choosing of correct response type
         in case of Content-Type is empty but body contains text.
        """
        body = b'Some plain text\ndata with tabs\t and null bytes\0'

        def _test_type(response):
            self.assertEquals(type(response), TextResponse)

        request = Request(self.getURL('nocontenttype'), body=body)
        d = self.download_request(request, Spider('foo'))
        d.addCallback(_test_type)
        return d

Example 16

Project: scrapy Source File: test_downloader_handlers.py
    @defer.inlineCallbacks
    def test_download_with_maxsize(self):
        request = Request(self.getURL('file'))

        # 10 is the minimal size for this request, and the limit only counts
        # the response body (headers are not counted).
        d = self.download_request(request, Spider('foo', download_maxsize=10))
        d.addCallback(lambda r: r.body)
        d.addCallback(self.assertEquals, b"0123456789")
        yield d

        d = self.download_request(request, Spider('foo', download_maxsize=9))
        yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted)

Example 17

Project: scrapy Source File: test_downloader_handlers.py
    @defer.inlineCallbacks
    def test_download_with_maxsize_per_req(self):
        meta = {'download_maxsize': 2}
        request = Request(self.getURL('file'), meta=meta)
        d = self.download_request(request, Spider('foo'))
        yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted)

Example 18

Project: scrapy Source File: test_downloader_handlers.py
    def test_download_with_large_maxsize_per_spider(self):
        request = Request(self.getURL('file'))
        d = self.download_request(request, Spider('foo', download_maxsize=100))
        d.addCallback(lambda r: r.body)
        d.addCallback(self.assertEquals, b"0123456789")
        return d

Example 19

Project: scrapy Source File: test_downloader_handlers.py
    def test_download_with_proxy(self):
        def _test(response):
            self.assertEquals(response.status, 200)
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.body, b'http://example.com')

        http_proxy = self.getURL('')
        request = Request('http://example.com', meta={'proxy': http_proxy})
        return self.download_request(request, Spider('foo')).addCallback(_test)

Example 20

Project: scrapy Source File: test_downloader_handlers.py
    def test_download_with_proxy_https_noconnect(self):
        def _test(response):
            self.assertEquals(response.status, 200)
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.body, b'https://example.com')

        http_proxy = '%s?noconnect' % self.getURL('')
        request = Request('https://example.com', meta={'proxy': http_proxy})
        return self.download_request(request, Spider('foo')).addCallback(_test)

Example 21

Project: scrapy Source File: test_downloader_handlers.py
    def test_download_without_proxy(self):
        def _test(response):
            self.assertEquals(response.status, 200)
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.body, b'/path/to/resource')

        request = Request(self.getURL('path/to/resource'))
        return self.download_request(request, Spider('foo')).addCallback(_test)

Example 22

Project: scrapy Source File: test_downloader_handlers.py
    @defer.inlineCallbacks
    def test_download_with_proxy_https_timeout(self):
        """ Test TunnelingTCP4ClientEndpoint """
        http_proxy = self.getURL('')
        domain = 'https://no-such-domain.nosuch'
        request = Request(
            domain, meta={'proxy': http_proxy, 'download_timeout': 0.2})
        d = self.download_request(request, Spider('foo'))
        timeout = yield self.assertFailure(d, error.TimeoutError)
        self.assertIn(domain, timeout.osError)

Example 23

Project: scrapy Source File: test_downloader_handlers.py
    def setUp(self):
        skip_if_no_boto()
        self.s3reqh = S3DownloadHandler(Settings(),
                httpdownloadhandler=HttpDownloadHandlerMock,
                #anon=True, # is implicit
        )
        self.download_request = self.s3reqh.download_request
        self.spider = Spider('foo')

Example 24

Project: scrapy Source File: test_downloader_handlers.py
    def setUp(self):
        skip_if_no_boto()
        s3reqh = S3DownloadHandler(Settings(), self.AWS_ACCESS_KEY_ID,
                self.AWS_SECRET_ACCESS_KEY,
                httpdownloadhandler=HttpDownloadHandlerMock)
        self.download_request = s3reqh.download_request
        self.spider = Spider('foo')

Example 25

Project: scrapy Source File: test_spidermiddleware_urllength.py
    def test_process_spider_output(self):
        res = Response('http://scrapytest.org')

        short_url_req = Request('http://scrapytest.org/')
        long_url_req = Request('http://scrapytest.org/this_is_a_long_url')
        reqs = [short_url_req, long_url_req]

        mw = UrlLengthMiddleware(maxlength=25)
        spider = Spider('foo')
        out = list(mw.process_spider_output(res, reqs, spider))
        self.assertEquals(out, [short_url_req])

Example 26

Project: scrapy Source File: test_spiderstate.py
    def test_store_load(self):
        jobdir = self.mktemp()
        os.mkdir(jobdir)
        spider = Spider(name='default')
        dt = datetime.now()

        ss = SpiderState(jobdir)
        ss.spider_opened(spider)
        spider.state['one'] = 1
        spider.state['dt'] = dt
        ss.spider_closed(spider)

        spider2 = Spider(name='default')
        ss2 = SpiderState(jobdir)
        ss2.spider_opened(spider2)
        self.assertEqual(spider.state, {'one': 1, 'dt': dt})
        ss2.spider_closed(spider2)

Example 27

Project: scrapy Source File: test_spiderstate.py
    def test_state_attribute(self):
        # state attribute must be present if jobdir is not set, to provide a
        # consistent interface
        spider = Spider(name='default')
        ss = SpiderState()
        ss.spider_opened(spider)
        self.assertEqual(spider.state, {})
        ss.spider_closed(spider)

Example 28

Project: scrapy Source File: test_utils_url.py
    def test_url_is_from_spider(self):
        spider = Spider(name='example.com')
        self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))
        self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', spider))
        self.assertFalse(url_is_from_spider('http://www.example.org/some/page.html', spider))
        self.assertFalse(url_is_from_spider('http://www.example.net/some/page.html', spider))

Example 29

Project: scrapy Source File: test_utils_url.py
    def test_url_is_from_spider_with_allowed_domains(self):
        spider = Spider(name='example.com', allowed_domains=['example.org', 'example.net'])
        self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))
        self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', spider))
        self.assertTrue(url_is_from_spider('http://example.com/some/page.html', spider))
        self.assertTrue(url_is_from_spider('http://www.example.org/some/page.html', spider))
        self.assertTrue(url_is_from_spider('http://www.example.net/some/page.html', spider))
        self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', spider))

        spider = Spider(name='example.com', allowed_domains=set(('example.com', 'example.net')))
        self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))

        spider = Spider(name='example.com', allowed_domains=('example.com', 'example.net'))
        self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))

Example 30

Project: frontera Source File: test_seed_loader.py
    def test_process_start_requests(self):
        seed_loader = self.seed_loader_setup()
        requests = seed_loader.process_start_requests(None, Spider(name='spider'))
        self.assertEqual([r.url for r in requests], ['https://www.example.com', 'https://www.scrapy.org'])

Example 31

Project: scrapy Source File: test_downloadermiddleware_cookies.py
Function: setUp
    def setUp(self):
        self.spider = Spider('foo')
        self.mw = CookiesMiddleware()

Example 32

Project: scrapy Source File: test_downloadermiddleware_decompression.py
Function: setUp
    def setUp(self):
        self.mw = DecompressionMiddleware()
        self.spider = Spider('foo')

Example 33

Project: scrapy Source File: test_downloadermiddleware_httpcompression.py
Function: setUp
    def setUp(self):
        self.spider = Spider('foo')
        self.mw = HttpCompressionMiddleware()

Example 34

Project: scrapy Source File: test_downloader_handlers.py
Function: test_non_existent
    def test_non_existent(self):
        request = Request('file://%s' % self.mktemp())
        d = self.download_request(request, Spider('foo'))
        return self.assertFailure(d, IOError)

Example 35

Project: scrapy Source File: test_downloader_handlers.py
    @defer.inlineCallbacks
    def test_download_with_small_maxsize_per_spider(self):
        request = Request(self.getURL('file'))
        d = self.download_request(request, Spider('foo', download_maxsize=2))
        yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted)

Example 36

Project: scrapy Source File: test_logformatter.py
Function: setUp
    def setUp(self):
        self.formatter = LogFormatter()
        self.spider = Spider('default')

Example 37

Project: scrapy Source File: test_pipeline_media.py
Function: setUp
    def setUp(self):
        self.spider = Spider('media.com')
        self.pipe = self.pipeline_class(download_func=_mocked_download_func)
        self.pipe.open_spider(self.spider)
        self.info = self.pipe.spiderinfo

Example 38

Project: scrapy Source File: test_spidermiddleware_httperror.py
Function: setUp
    def setUp(self):
        self.spider = Spider('foo')
        self.mw = HttpErrorMiddleware(Settings({}))
        self.req = Request('http://scrapytest.org')
        self.res200, self.res404 = _responses(self.req, [200, 404])

Example 39

Project: scrapy Source File: test_spidermiddleware_httperror.py
Function: setUp
    def setUp(self):
        self.spider = Spider('foo')
        self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOWED_CODES': (402,)}))
        self.req = Request('http://scrapytest.org')
        self.res200, self.res404, self.res402 = _responses(self.req, [200, 404, 402])

Example 40

Project: scrapy Source File: test_spidermiddleware_httperror.py
Function: setUp
    def setUp(self):
        self.spider = Spider('foo')
        self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOW_ALL': True}))
        self.req = Request('http://scrapytest.org')
        self.res200, self.res404, self.res402 = _responses(self.req, [200, 404, 402])

Example 41

Project: scrapy Source File: test_spidermiddleware_offsite.py
Function: _get_spider
    def _get_spider(self):
        return Spider('foo')

Example 42

Project: scrapy Source File: test_spidermiddleware_referer.py
Function: setUp
    def setUp(self):
        self.spider = Spider('foo')
        self.mw = RefererMiddleware()