Here are examples of the Python API scrapy.spiders.Spider, taken from open-source projects. By voting up you can indicate which examples are most useful and appropriate.
42 Examples
3
Example 1
Project: frontera Source File: test_seed_loader.py
def test_process_start_requests_ignore_comments(self):
    """Seed lines starting with '#' must be ignored by the seed loader."""
    seeds_content = """
https://www.example.com
# https://www.dmoz.org
https://www.scrapy.org
# https://www.test.com
"""
    loader = self.seed_loader_setup(seeds_content)
    reqs = loader.process_start_requests(None, Spider(name='spider'))
    # Only the two non-comment URLs should survive, in file order.
    self.assertEqual([r.url for r in reqs], ['https://www.example.com', 'https://www.scrapy.org'])
3
Example 2
Project: frontera Source File: test_seed_loader.py
def check_request_urls(self, urls, key_extension='.txt'):
    """Upload both seed fixtures to a mocked S3 bucket and verify the seed
    loader yields requests for exactly *urls*.

    Fix: the original opened the files with mode 'rU'; the 'U' flag was
    deprecated in Python 3 and removed in Python 3.11, so plain 'r'
    (universal newlines by default) is used instead.
    """
    with open(self.seed_path_1, 'r') as s1, open(self.seed_path_2, 'r') as s2:
        conn = MockConnection()
        bucket = conn.create_bucket('some-bucket')
        bucket.add_key('seeds-folder/seeds1%s' % key_extension, s1)
        bucket.add_key('seeds-folder/seeds2%s' % key_extension, s2)

        def mocked_connect_s3(*args, **kwargs):
            # Always hand back the mock connection instead of hitting S3.
            return conn

        with mock.patch('frontera.contrib.scrapy.middlewares.seeds.s3.connect_s3',
                        side_effect=mocked_connect_s3):
            requests = self.seed_loader.process_start_requests(None, Spider(name='spider'))
            # Order is not guaranteed across keys, so compare as sets.
            self.assertEqual(set([r.url for r in requests]), set(urls))
3
Example 3
Project: scrapy Source File: test_downloader_handlers.py
def test_download(self):
    """A file:// URI containing a percent-encoded caret downloads correctly.

    Fix: assertEquals is a deprecated alias removed in Python 3.12;
    use assertEqual.
    """
    def _test(response):
        self.assertEqual(response.url, request.url)
        self.assertEqual(response.status, 200)
        self.assertEqual(response.body, b'0123456789')

    request = Request(path_to_file_uri(self.tmpname + '^'))
    # Sanity check: the caret must have been percent-encoded in the URI.
    assert request.url.upper().endswith('%5E')
    return self.download_request(request, Spider('foo')).addCallback(_test)
3
Example 4
def test_download(self):
    """Plain GET of the test file must return the expected body.

    Fix: assertEquals is a deprecated alias removed in Python 3.12;
    use assertEqual.
    """
    request = Request(self.getURL('file'))
    d = self.download_request(request, Spider('foo'))
    d.addCallback(lambda r: r.body)
    d.addCallback(self.assertEqual, b"0123456789")
    return d
3
Example 5
Project: scrapy Source File: test_downloader_handlers.py
def test_download_head(self):
    """A HEAD request must return an empty body.

    Fix: assertEquals is a deprecated alias removed in Python 3.12;
    use assertEqual.
    """
    request = Request(self.getURL('file'), method='HEAD')
    d = self.download_request(request, Spider('foo'))
    d.addCallback(lambda r: r.body)
    d.addCallback(self.assertEqual, b'')
    return d
3
Example 6
Project: scrapy Source File: test_downloader_handlers.py
def test_redirect_status(self):
    """The handler must surface the 302 status, not follow the redirect.

    Fix: assertEquals is a deprecated alias removed in Python 3.12;
    use assertEqual.
    """
    request = Request(self.getURL('redirect'))
    d = self.download_request(request, Spider('foo'))
    d.addCallback(lambda r: r.status)
    d.addCallback(self.assertEqual, 302)
    return d
3
Example 7
Project: scrapy Source File: test_downloader_handlers.py
def test_redirect_status_head(self):
    """HEAD on a redirecting resource must also report status 302.

    Fix: assertEquals is a deprecated alias removed in Python 3.12;
    use assertEqual.
    """
    request = Request(self.getURL('redirect'), method='HEAD')
    d = self.download_request(request, Spider('foo'))
    d.addCallback(lambda r: r.status)
    d.addCallback(self.assertEqual, 302)
    return d
3
Example 8
Project: scrapy Source File: test_downloader_handlers.py
@defer.inlineCallbacks
def test_timeout_download_from_spider_nodata_rcvd(self):
    """The client connects but no data arrives: the download must time out."""
    crawler = Spider('foo')
    req = Request(self.getURL('wait'), meta={'download_timeout': 0.2})
    dfd = self.download_request(req, crawler)
    yield self.assertFailure(dfd, defer.TimeoutError, error.TimeoutError)
3
Example 9
Project: scrapy Source File: test_downloader_handlers.py
@defer.inlineCallbacks
def test_timeout_download_from_spider_server_hangs(self):
    """Server sends headers plus some body bytes, then hangs: must time out."""
    crawler = Spider('foo')
    req = Request(self.getURL('hang-after-headers'),
                  meta={'download_timeout': 0.2})
    dfd = self.download_request(req, crawler)
    yield self.assertFailure(dfd, defer.TimeoutError, error.TimeoutError)
3
Example 10
Project: scrapy Source File: test_downloader_handlers.py
def test_host_header_not_in_request_headers(self):
    """When no Host header is set, the handler derives it from the URL and
    does not mutate the request's header dict.

    Fix: assertEquals is a deprecated alias removed in Python 3.12;
    use assertEqual.
    """
    def _test(response):
        self.assertEqual(
            response.body, to_bytes('%s:%d' % (self.host, self.portno)))
        # The request object itself must remain untouched.
        self.assertEqual(request.headers, {})

    request = Request(self.getURL('host'))
    return self.download_request(request, Spider('foo')).addCallback(_test)
3
Example 11
Project: scrapy Source File: test_downloader_handlers.py
def test_host_header_seted_in_request_headers(self):
    """An explicitly-set Host header must be sent and echoed by the server.

    Fixes: (1) removed the duplicated download/assert sequence that followed
    the return statement — it was unreachable dead code; (2) assertEquals is
    a deprecated alias removed in Python 3.12, use assertEqual.
    (Method name typo "seted" kept for interface compatibility.)
    """
    def _test(response):
        self.assertEqual(response.body, b'example.com')
        self.assertEqual(request.headers.get('Host'), b'example.com')

    request = Request(self.getURL('host'), headers={'Host': 'example.com'})
    return self.download_request(request, Spider('foo')).addCallback(_test)
3
Example 12
Project: scrapy Source File: test_downloader_handlers.py
def test_content_length_zero_bodyless_post_request_headers(self):
    """Tests if "Content-Length: 0" is sent for bodyless POST requests.

    This is not strictly required by HTTP RFCs but can cause trouble
    for some web servers.
    See:
    https://github.com/scrapy/scrapy/issues/823
    https://issues.apache.org/jira/browse/TS-2902
    https://github.com/kennethreitz/requests/issues/405
    https://bugs.python.org/issue14721

    Fix: assertEquals is a deprecated alias removed in Python 3.12;
    use assertEqual.
    """
    def _test(response):
        # The test resource echoes the received Content-Length value.
        self.assertEqual(response.body, b'0')

    request = Request(self.getURL('contentlength'), method='POST', headers={'Host': 'example.com'})
    return self.download_request(request, Spider('foo')).addCallback(_test)
3
Example 13
def test_payload(self):
    """A POST body must round-trip through the payload resource unchanged.

    Fix: assertEquals is a deprecated alias removed in Python 3.12;
    use assertEqual.
    """
    body = b'1' * 100  # PayloadResource requires body length to be 100
    request = Request(self.getURL('payload'), method='POST', body=body)
    d = self.download_request(request, Spider('foo'))
    d.addCallback(lambda r: r.body)
    d.addCallback(self.assertEqual, body)
    return d
3
Example 14
Project: scrapy Source File: test_downloader_handlers.py
def test_download_without_maxsize_limit(self):
    """With no maxsize limit configured the full body is downloaded.

    Fix: assertEquals is a deprecated alias removed in Python 3.12;
    use assertEqual.
    """
    request = Request(self.getURL('file'))
    d = self.download_request(request, Spider('foo'))
    d.addCallback(lambda r: r.body)
    d.addCallback(self.assertEqual, b"0123456789")
    return d
3
Example 15
Project: scrapy Source File: test_downloader_handlers.py
def test_response_class_choosing_request(self):
    """Tests choosing of correct response type
    in case of Content-Type is empty but body contains text.

    Fix: assertEquals is a deprecated alias removed in Python 3.12;
    use assertEqual.
    """
    body = b'Some plain text\ndata with tabs\t and null bytes\0'

    def _test_type(response):
        # Body sniffing should classify this as text despite the null byte.
        self.assertEqual(type(response), TextResponse)

    request = Request(self.getURL('nocontenttype'), body=body)
    d = self.download_request(request, Spider('foo'))
    d.addCallback(_test_type)
    return d
3
Example 16
Project: scrapy Source File: test_downloader_handlers.py
@defer.inlineCallbacks
def test_download_with_maxsize(self):
    """download_maxsize at exactly the body size succeeds; one byte less aborts.

    Fix: assertEquals is a deprecated alias removed in Python 3.12;
    use assertEqual.
    """
    request = Request(self.getURL('file'))

    # 10 is minimal size for this request and the limit is only counted on
    # response body. (regardless of headers)
    d = self.download_request(request, Spider('foo', download_maxsize=10))
    d.addCallback(lambda r: r.body)
    d.addCallback(self.assertEqual, b"0123456789")
    yield d

    d = self.download_request(request, Spider('foo', download_maxsize=9))
    yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted)
3
Example 17
Project: scrapy Source File: test_downloader_handlers.py
@defer.inlineCallbacks
def test_download_with_maxsize_per_req(self):
    """A per-request download_maxsize smaller than the body aborts the download."""
    req = Request(self.getURL('file'), meta={'download_maxsize': 2})
    dfd = self.download_request(req, Spider('foo'))
    yield self.assertFailure(dfd, defer.CancelledError, error.ConnectionAborted)
3
Example 18
Project: scrapy Source File: test_downloader_handlers.py
def test_download_with_large_maxsize_per_spider(self):
    """A spider-level maxsize larger than the body lets the download finish.

    Fix: assertEquals is a deprecated alias removed in Python 3.12;
    use assertEqual.
    """
    request = Request(self.getURL('file'))
    d = self.download_request(request, Spider('foo', download_maxsize=100))
    d.addCallback(lambda r: r.body)
    d.addCallback(self.assertEqual, b"0123456789")
    return d
3
Example 19
Project: scrapy Source File: test_downloader_handlers.py
def test_download_with_proxy(self):
    """An HTTP request with a proxy in meta goes through that proxy.

    Fix: assertEquals is a deprecated alias removed in Python 3.12;
    use assertEqual.
    """
    def _test(response):
        self.assertEqual(response.status, 200)
        self.assertEqual(response.url, request.url)
        # The test proxy echoes the absolute target URL as the body.
        self.assertEqual(response.body, b'http://example.com')

    http_proxy = self.getURL('')
    request = Request('http://example.com', meta={'proxy': http_proxy})
    return self.download_request(request, Spider('foo')).addCallback(_test)
3
Example 20
Project: scrapy Source File: test_downloader_handlers.py
def test_download_with_proxy_https_noconnect(self):
    """HTTPS via a proxy with '?noconnect' skips the CONNECT tunnel.

    Fix: assertEquals is a deprecated alias removed in Python 3.12;
    use assertEqual.
    """
    def _test(response):
        self.assertEqual(response.status, 200)
        self.assertEqual(response.url, request.url)
        # The test proxy echoes the absolute target URL as the body.
        self.assertEqual(response.body, b'https://example.com')

    http_proxy = '%s?noconnect' % self.getURL('')
    request = Request('https://example.com', meta={'proxy': http_proxy})
    return self.download_request(request, Spider('foo')).addCallback(_test)
3
Example 21
Project: scrapy Source File: test_downloader_handlers.py
def test_download_without_proxy(self):
    """Without a proxy, only the path (not an absolute URL) is requested.

    Fix: assertEquals is a deprecated alias removed in Python 3.12;
    use assertEqual.
    """
    def _test(response):
        self.assertEqual(response.status, 200)
        self.assertEqual(response.url, request.url)
        self.assertEqual(response.body, b'/path/to/resource')

    request = Request(self.getURL('path/to/resource'))
    return self.download_request(request, Spider('foo')).addCallback(_test)
3
Example 22
Project: scrapy Source File: test_downloader_handlers.py
@defer.inlineCallbacks
def test_download_with_proxy_https_timeout(self):
    """ Test TunnelingTCP4ClientEndpoint """
    proxy_url = self.getURL('')
    target = 'https://no-such-domain.nosuch'
    req = Request(target,
                  meta={'proxy': proxy_url, 'download_timeout': 0.2})
    dfd = self.download_request(req, Spider('foo'))
    failure = yield self.assertFailure(dfd, error.TimeoutError)
    # The timeout error should name the host we failed to tunnel to.
    self.assertIn(target, failure.osError)
3
Example 23
Project: scrapy Source File: test_downloader_handlers.py
def setUp(self):
    """Build an anonymous S3 download handler on top of a mock HTTP handler."""
    skip_if_no_boto()
    handler = S3DownloadHandler(
        Settings(),
        httpdownloadhandler=HttpDownloadHandlerMock,
        # anon=True is implicit when no credentials are given
    )
    self.s3reqh = handler
    self.download_request = handler.download_request
    self.spider = Spider('foo')
3
Example 24
Project: scrapy Source File: test_downloader_handlers.py
def setUp(self):
    """Build a credentialed S3 download handler on top of a mock HTTP handler."""
    skip_if_no_boto()
    handler = S3DownloadHandler(
        Settings(),
        self.AWS_ACCESS_KEY_ID,
        self.AWS_SECRET_ACCESS_KEY,
        httpdownloadhandler=HttpDownloadHandlerMock,
    )
    self.download_request = handler.download_request
    self.spider = Spider('foo')
3
Example 25
Project: scrapy Source File: test_spidermiddleware_urllength.py
def test_process_spider_output(self):
    """URLs longer than maxlength are dropped from the spider output.

    Fix: assertEquals is a deprecated alias removed in Python 3.12;
    use assertEqual.
    """
    res = Response('http://scrapytest.org')
    short_url_req = Request('http://scrapytest.org/')
    long_url_req = Request('http://scrapytest.org/this_is_a_long_url')
    reqs = [short_url_req, long_url_req]
    mw = UrlLengthMiddleware(maxlength=25)
    spider = Spider('foo')
    out = list(mw.process_spider_output(res, reqs, spider))
    # Only the short URL should pass the 25-character limit.
    self.assertEqual(out, [short_url_req])
3
Example 26
Project: scrapy Source File: test_spiderstate.py
def test_store_load(self):
jobdir = self.mktemp()
os.mkdir(jobdir)
spider = Spider(name='default')
dt = datetime.now()
ss = SpiderState(jobdir)
ss.spider_opened(spider)
spider.state['one'] = 1
spider.state['dt'] = dt
ss.spider_closed(spider)
spider2 = Spider(name='default')
ss2 = SpiderState(jobdir)
ss2.spider_opened(spider2)
self.assertEqual(spider.state, {'one': 1, 'dt': dt})
ss2.spider_closed(spider2)
3
Example 27
Project: scrapy Source File: test_spiderstate.py
def test_state_attribute(self):
    """Even without a jobdir, opened spiders get an empty ``state`` dict,
    keeping the interface consistent."""
    sp = Spider(name='default')
    state_ext = SpiderState()
    state_ext.spider_opened(sp)
    self.assertEqual(sp.state, {})
    state_ext.spider_closed(sp)
3
Example 28
Project: scrapy Source File: test_utils_url.py
def test_url_is_from_spider(self):
    """With no allowed_domains, only the spider's name-domain matches."""
    sp = Spider(name='example.com')
    for url in ('http://www.example.com/some/page.html',
                'http://sub.example.com/some/page.html'):
        self.assertTrue(url_is_from_spider(url, sp))
    for url in ('http://www.example.org/some/page.html',
                'http://www.example.net/some/page.html'):
        self.assertFalse(url_is_from_spider(url, sp))
3
Example 29
Project: scrapy Source File: test_utils_url.py
def test_url_is_from_spider_with_allowed_domains(self):
    """allowed_domains (as list, set, or tuple) extend the matching domains."""
    sp = Spider(name='example.com',
                allowed_domains=['example.org', 'example.net'])
    for url in ('http://www.example.com/some/page.html',
                'http://sub.example.com/some/page.html',
                'http://example.com/some/page.html',
                'http://www.example.org/some/page.html',
                'http://www.example.net/some/page.html'):
        self.assertTrue(url_is_from_spider(url, sp))
    self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', sp))
    # allowed_domains may also be given as a set or a tuple
    for domains in (set(('example.com', 'example.net')),
                    ('example.com', 'example.net')):
        sp = Spider(name='example.com', allowed_domains=domains)
        self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', sp))
0
Example 30
Project: frontera Source File: test_seed_loader.py
def test_process_start_requests(self):
    """The seed loader yields one request per seed URL, in file order."""
    loader = self.seed_loader_setup()
    reqs = loader.process_start_requests(None, Spider(name='spider'))
    self.assertEqual([r.url for r in reqs],
                     ['https://www.example.com', 'https://www.scrapy.org'])
0
Example 31
def setUp(self):
    """Create a fresh CookiesMiddleware and a dummy spider for each test."""
    self.mw = CookiesMiddleware()
    self.spider = Spider('foo')
0
Example 32
def setUp(self):
    """Create a fresh DecompressionMiddleware and a dummy spider for each test."""
    self.spider = Spider('foo')
    self.mw = DecompressionMiddleware()
0
Example 33
def setUp(self):
    """Create a fresh HttpCompressionMiddleware and a dummy spider for each test."""
    self.mw = HttpCompressionMiddleware()
    self.spider = Spider('foo')
0
Example 34
def test_non_existent(self):
    """Downloading a file:// URI for a missing path must fail with IOError."""
    missing = Request('file://%s' % self.mktemp())
    dfd = self.download_request(missing, Spider('foo'))
    return self.assertFailure(dfd, IOError)
0
Example 35
Project: scrapy Source File: test_downloader_handlers.py
@defer.inlineCallbacks
def test_download_with_small_maxsize_per_spider(self):
    """A spider-level maxsize smaller than the body aborts the download."""
    req = Request(self.getURL('file'))
    dfd = self.download_request(req, Spider('foo', download_maxsize=2))
    yield self.assertFailure(dfd, defer.CancelledError, error.ConnectionAborted)
0
Example 36
def setUp(self):
    """Create a fresh LogFormatter and a dummy spider for each test."""
    self.spider = Spider('default')
    self.formatter = LogFormatter()
0
Example 37
def setUp(self):
    """Open a media pipeline with a mocked download function for the spider."""
    self.spider = Spider('media.com')
    pipeline = self.pipeline_class(download_func=_mocked_download_func)
    pipeline.open_spider(self.spider)
    self.pipe = pipeline
    self.info = pipeline.spiderinfo
0
Example 38
def setUp(self):
    """HttpErrorMiddleware with default settings plus 200/404 fixture responses."""
    self.spider = Spider('foo')
    self.mw = HttpErrorMiddleware(Settings({}))
    self.req = Request('http://scrapytest.org')
    responses = _responses(self.req, [200, 404])
    self.res200, self.res404 = responses
0
Example 39
def setUp(self):
    """HttpErrorMiddleware allowing 402, plus 200/404/402 fixture responses."""
    self.spider = Spider('foo')
    self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOWED_CODES': (402,)}))
    self.req = Request('http://scrapytest.org')
    responses = _responses(self.req, [200, 404, 402])
    self.res200, self.res404, self.res402 = responses
0
Example 40
def setUp(self):
    """HttpErrorMiddleware allowing all codes, plus 200/404/402 fixture responses."""
    self.spider = Spider('foo')
    self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOW_ALL': True}))
    self.req = Request('http://scrapytest.org')
    responses = _responses(self.req, [200, 404, 402])
    self.res200, self.res404, self.res402 = responses
0
Example 41
def _get_spider(self):
    """Return a throwaway spider named 'foo' for use in tests."""
    spider = Spider('foo')
    return spider
0
Example 42
def setUp(self):
    """Create a fresh RefererMiddleware and a dummy spider for each test."""
    self.mw = RefererMiddleware()
    self.spider = Spider('foo')