Here are examples of the Python API scrapy.utils.httpobj.urlparse_cached, taken from open-source projects. By voting up, you can indicate which examples are most useful and appropriate.
12 Examples
3
Example 1
Project: scrapy Source File: webclient.py
def _set_connection_attributes(self, request):
    """Populate the connection attributes of this object from ``request``.

    ``scheme``, ``netloc``, ``host``, ``port`` and ``path`` come from the
    parsed request. When ``request.meta`` holds a truthy ``'proxy'`` entry,
    ``scheme``, ``host`` and ``port`` are replaced by the values parsed from
    that entry and ``path`` is set to ``self.url``.
    """
    url_parts = _parsed_url_args(urlparse_cached(request))
    self.scheme, self.netloc, self.host, self.port, self.path = url_parts

    proxy_url = request.meta.get('proxy')
    if not proxy_url:
        return

    # Only scheme, host and port are taken from the proxy value; the other
    # two fields returned by _parse() are discarded.
    self.scheme, _, self.host, self.port, _ = _parse(proxy_url)
    self.path = self.url
3
Example 2
def _get_slot_key(self, request, spider):
if 'download_slot' in request.meta:
return request.meta['download_slot']
key = urlparse_cached(request).hostname or ''
if self.ip_concurrency:
key = dnscache.get(key, key)
return key
3
Example 3
def process_request(self, request, spider):
    """Set a proxy on ``request`` when one is configured for its scheme.

    Nothing is done when ``request.meta`` already has a ``'proxy'`` key, or
    when the scheme is http/https and ``proxy_bypass`` reports that the
    hostname should bypass the proxy. Always returns ``None``.
    """
    # ignore if proxy is already set
    if 'proxy' in request.meta:
        return

    parsed = urlparse_cached(request)
    scheme = parsed.scheme

    # 'no_proxy' is only supported by http schemes
    bypassed = scheme in ('http', 'https') and proxy_bypass(parsed.hostname)
    if not bypassed and scheme in self.proxies:
        self._set_proxy(request, scheme)
3
Example 4
def should_cache_request(self, request):
    """Return whether ``request`` is eligible for caching.

    A request is rejected when its URL scheme is listed in
    ``self.ignore_schemes`` or when its parsed Cache-Control directives
    contain ``no-store``; any other request is eligible.
    """
    scheme = urlparse_cached(request).scheme
    if scheme in self.ignore_schemes:
        return False

    # obey user-agent directive "Cache-Control: no-store"
    directives = self._parse_cachecontrol(request)
    return b'no-store' not in directives
3
Example 5
Project: scrapy Source File: request.py
def request_httprepr(request):
    """Return the raw HTTP representation of the given request, as bytes.

    This is provided for reference only: it is not the actual stream of
    bytes that will be sent when performing the request (that is controlled
    by Twisted).
    """
    parsed = urlparse_cached(request)
    # Request target: path (defaulting to "/"), params and query only.
    target = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))

    chunks = [
        to_bytes(request.method) + b" " + to_bytes(target) + b" HTTP/1.1\r\n",
        b"Host: " + to_bytes(parsed.hostname or b'') + b"\r\n",
    ]
    if request.headers:
        chunks.append(request.headers.to_string() + b"\r\n")
    # Blank line separating the header section from the body.
    chunks.append(b"\r\n")
    chunks.append(request.body)
    return b"".join(chunks)
3
Example 6
Project: scrapy Source File: test_utils_httpobj.py
def test_urlparse_cached(self):
    """Check that urlparse_cached() matches urlparse() and caches per request."""
    url = "http://www.example.com/index.html"
    request1 = Request(url)
    request2 = Request(url)
    req1a = urlparse_cached(request1)
    req1b = urlparse_cached(request1)
    req2 = urlparse_cached(request2)
    urlp = urlparse(url)

    # Results compare equal across request objects and against plain urlparse().
    assert req1a == req2
    assert req1a == urlp
    # Same request object -> the very same result object is returned again.
    assert req1a is req1b
    # A different request object gets its own result object (this assertion
    # was previously duplicated verbatim).
    assert req1a is not req2
0
Example 7
Project: frontera Source File: overusedbuffer.py
def _get_key(self, request, type):
    """Return the key for ``request``.

    The key is the hostname of the parsed request (``''`` when there is
    none). When ``type`` equals ``'ip'`` the hostname is looked up in
    ``dnscache``, falling back to the hostname itself on a miss.
    """
    hostname = urlparse_cached(request).hostname or ''
    return dnscache.get(hostname, hostname) if type == 'ip' else hostname
0
Example 8
def download_request(self, request, spider):
    """Rewrite ``request`` to an ``s3.amazonaws.com`` URL and download it.

    The hostname of the parsed request is used as the bucket name and
    ``request.meta['is_secure']`` selects https over http. Depending on the
    handler state the request is sent as is (``self.anon``), signed through
    ``self._signer`` using a botocore ``AWSRequest``, or given the headers
    returned by ``self.conn.make_request``. The rewritten request is then
    handed to ``self._download_http``, whose result is returned.
    """
    parsed = urlparse_cached(request)
    scheme = 'https' if request.meta.get('is_secure') else 'http'
    bucket = parsed.hostname
    if parsed.query:
        path = parsed.path + '?' + parsed.query
    else:
        path = parsed.path
    # Virtual-hosted-style URL: the bucket is part of the host name.
    url = '%s://%s.s3.amazonaws.com%s' % (scheme, bucket, path)

    if self.anon:
        request = request.replace(url=url)
    elif self._signer is not None:
        import botocore.awsrequest

        # The request handed to the signer uses the path-style URL
        # (bucket in the path).
        signing_url = '%s://s3.amazonaws.com/%s%s' % (scheme, bucket, path)
        awsrequest = botocore.awsrequest.AWSRequest(
            method=request.method,
            url=signing_url,
            headers=request.headers.to_unicode_dict(),
            data=request.body)
        self._signer.add_auth(awsrequest)
        request = request.replace(
            url=url, headers=awsrequest.headers.items())
    else:
        signed_headers = self.conn.make_request(
            method=request.method,
            bucket=bucket,
            key=unquote(parsed.path),
            query_args=unquote(parsed.query),
            headers=request.headers,
            data=request.body)
        request = request.replace(url=url, headers=signed_headers)

    return self._download_http(request, spider)
0
Example 9
def robot_parser(self, request, spider):
    """Return the ``self._parsers`` entry for the netloc of ``request``.

    On the first request for a netloc a placeholder ``Deferred`` is stored
    in ``self._parsers`` and a download of ``<scheme>://<netloc>/robots.txt``
    is started through the crawler engine. While the stored entry is still
    a ``Deferred``, a new ``Deferred`` chained onto it is returned;
    otherwise the stored entry itself is returned.
    """
    parsed = urlparse_cached(request)
    netloc = parsed.netloc

    if netloc not in self._parsers:
        # Store the placeholder before starting the download.
        self._parsers[netloc] = Deferred()
        robotsurl = "%s://%s/robots.txt" % (parsed.scheme, parsed.netloc)
        robotsreq = Request(
            robotsurl,
            priority=self.DOWNLOAD_PRIORITY,
            meta={'dont_obey_robotstxt': True}
        )
        dfd = self.crawler.engine.download(robotsreq, spider)
        dfd.addCallback(self._parse_robots, netloc)
        dfd.addErrback(self._logerror, robotsreq, spider)
        dfd.addErrback(self._robots_error, netloc)

    entry = self._parsers[netloc]
    if not isinstance(entry, Deferred):
        return entry

    waiter = Deferred()

    def _forward(result):
        # Fire this caller's deferred and pass the result along unchanged so
        # later callbacks on the stored deferred still receive it.
        waiter.callback(result)
        return result

    entry.addCallback(_forward)
    return waiter
0
Example 10
def should_cache_request(self, request):
    """Return True unless the request's URL scheme is in ``self.ignore_schemes``."""
    scheme = urlparse_cached(request).scheme
    return scheme not in self.ignore_schemes
0
Example 11
def should_follow(self, request, spider):
    """Return whether ``self.host_regex`` matches the request's hostname."""
    pattern = self.host_regex
    # hostname can be None for wrong urls (like javascript links)
    hostname = urlparse_cached(request).hostname or ''
    found = pattern.search(hostname)
    return bool(found)
0
Example 12
def is_cacheable(self, request):
    """Return True unless the request's URL scheme is in ``self.ignore_schemes``."""
    scheme = urlparse_cached(request).scheme
    return scheme not in self.ignore_schemes