Here are examples of the Python API scrapy.utils.python.to_unicode, taken from open-source projects. By voting up, you can indicate which examples are most useful and appropriate.
32 Examples
3
Example 1
Project: scrapy Source File: http10.py
def _connect(self, factory):
    """Open the transport for *factory*: TLS for https, plain TCP otherwise."""
    host = to_unicode(factory.host)
    port = factory.port
    if factory.scheme != b'https':
        return reactor.connectTCP(host, port, factory)
    return reactor.connectSSL(host, port, factory, self.ClientContextFactory())
3
Example 2
Project: scrapy Source File: httpcache.py
def rfc1123_to_epoch(date_str):
    """Parse an RFC 1123 date string into a Unix timestamp; None on failure."""
    try:
        parsed = parsedate_tz(to_unicode(date_str, encoding='ascii'))
        return mktime_tz(parsed)
    except Exception:
        # Best-effort: any decode/parse problem yields None, never raises.
        return None
3
Example 3
Project: scrapy Source File: headers.py
def to_unicode_dict(self):
    """ Return headers as a CaselessDict with unicode keys
    and unicode values. Multiple values are joined with ','.
    """
    encoding = self.encoding
    pairs = (
        (to_unicode(name, encoding=encoding),
         to_unicode(b','.join(values), encoding=encoding))
        for name, values in self.items()
    )
    return CaselessDict(pairs)
3
Example 4
def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if not isinstance(url, ParseResult):
        url = urlparse(to_unicode(url, encoding))
    return url
3
Example 5
Project: scrapy Source File: mockserver.py
def render_GET(self, request):
    """Echo the request's headers and body back as a JSON document (bytes)."""
    headers = {}
    for name, values in request.requestHeaders.getAllRawHeaders():
        headers[to_unicode(name)] = [to_unicode(v) for v in values]
    payload = {
        'headers': headers,
        'body': to_unicode(request.content.read()),
    }
    return to_bytes(json.dumps(payload))
3
Example 6
Project: scrapy Source File: test_exporters.py
def assertCsvEqual(self, first, second, msg=None):
    """Assert two CSV documents are equal, ignoring field order within a line.

    Each line is tokenized on commas/whitespace and the tokens sorted, so
    column order does not matter; line order still does.
    """
    # PEP 8 (E731): use a def instead of binding a lambda to a name.
    def split_rows(csv_text):
        return [sorted(re.split(r'(,|\s+)', line))
                for line in csv_text.splitlines(True)]
    first = to_unicode(first)
    second = to_unicode(second)
    return self.assertEqual(split_rows(first), split_rows(second), msg)
3
Example 7
def test_nested_item(self):
    """Items nested inside dicts inside items are exported recursively."""
    innermost = TestItem(name=u'Joseph', age='22')
    middle = dict(name=u'Maria', age=innermost)
    outer = TestItem(name=u'Jesus', age=middle)
    self.ie.start_exporting()
    self.ie.export_item(outer)
    self.ie.finish_exporting()
    decoded = json.loads(to_unicode(self.output.getvalue()))
    self.assertEqual(decoded, self._expected_nested)
3
Example 8
def test_nonstring_types_item(self):
    """Non-string field types survive a JSON export round-trip."""
    item = self._get_nonstring_types_item()
    self.ie.start_exporting()
    self.ie.export_item(item)
    self.ie.finish_exporting()
    decoded = json.loads(to_unicode(self.output.getvalue()))
    # the exporter apparently serializes the 'time' field via str() —
    # the expectation is normalized the same way before comparing
    item['time'] = str(item['time'])
    self.assertEqual(decoded, item)
3
Example 9
Project: scrapy Source File: test_exporters.py
def assertTwoItemsExported(self, item):
    """Exporting the same item twice yields a two-element JSON list."""
    self.ie.start_exporting()
    for _ in range(2):
        self.ie.export_item(item)
    self.ie.finish_exporting()
    decoded = json.loads(to_unicode(self.output.getvalue()))
    self.assertEqual(decoded, [dict(item), dict(item)])
3
Example 10
def test_nested_item(self):
    """Nested TestItems export as nested JSON objects inside a list."""
    innermost = TestItem(name=u'Joseph\xa3', age='22')
    middle = TestItem(name=u'Maria', age=innermost)
    outer = TestItem(name=u'Jesus', age=middle)
    self.ie.start_exporting()
    self.ie.export_item(outer)
    self.ie.finish_exporting()
    decoded = json.loads(to_unicode(self.output.getvalue()))
    expected = {'name': u'Jesus',
                'age': {'name': 'Maria', 'age': dict(innermost)}}
    self.assertEqual(decoded, [expected])
3
Example 11
Project: scrapy Source File: test_exporters.py
def test_nested_dict_item(self):
    """Plain dicts and items may nest inside each other when exporting."""
    inner_dict = dict(name=u'Joseph\xa3', age='22')
    middle_item = TestItem(name=u'Maria', age=inner_dict)
    outer_dict = dict(name=u'Jesus', age=middle_item)
    self.ie.start_exporting()
    self.ie.export_item(outer_dict)
    self.ie.finish_exporting()
    decoded = json.loads(to_unicode(self.output.getvalue()))
    expected = {'name': u'Jesus', 'age': {'name': 'Maria', 'age': inner_dict}}
    self.assertEqual(decoded, [expected])
3
Example 12
def test_nonstring_types_item(self):
    """Non-string field types survive export; output is a one-item list."""
    item = self._get_nonstring_types_item()
    self.ie.start_exporting()
    self.ie.export_item(item)
    self.ie.finish_exporting()
    decoded = json.loads(to_unicode(self.output.getvalue()))
    # normalize the 'time' field the same way the exporter does (str())
    item['time'] = str(item['time'])
    self.assertEqual(decoded, [item])
3
Example 13
Project: scrapy Source File: test_webclient.py
def getPage(url, contextFactory=None, response_transform=None, *args, **kwargs):
    """Adapted version of twisted.web.client.getPage"""
    def _make_factory(url, *args, **kwargs):
        # Build a Scrapy HTTP client factory and wire up the response
        # callback; by default the deferred fires with the response body.
        timeout = kwargs.pop('timeout', 0)
        factory = client.ScrapyHTTPClientFactory(
            Request(to_unicode(url), *args, **kwargs), timeout=timeout)
        factory.deferred.addCallback(response_transform or (lambda r: r.body))
        return factory
    from twisted.web.client import _makeGetterFactory
    return _makeGetterFactory(to_bytes(url), _make_factory,
                              contextFactory=contextFactory,
                              *args, **kwargs).deferred
3
Example 14
Project: scrapy Source File: test_webclient.py
def testFactoryInfo(self):
    """Connect a factory and verify its recorded info in the callback."""
    url = self.getURL('file')
    _, _, host, port, _ = client._parse(url)
    factory = client.ScrapyHTTPClientFactory(Request(url))
    reactor.connectTCP(to_unicode(host), port, factory)
    d = factory.deferred
    return d.addCallback(self._cbFactoryInfo, factory)
3
Example 15
Project: scrapy-splash Source File: utils.py
def to_native_str(text, encoding=None, errors='strict'):
    """ Return str representation of `text`
    (bytes in Python 2.x and unicode in Python 3.x). """
    converter = to_bytes if six.PY2 else to_unicode
    return converter(text, encoding, errors)
3
Example 16
Project: scrapy-splash Source File: utils.py
def scrapy_headers_to_unicode_dict(headers):
    """
    Convert scrapy.http.Headers instance to a dictionary
    suitable for JSON encoding.
    """
    result = {}
    for name, values in headers.items():
        # multi-valued headers are collapsed into one comma-joined string
        result[to_unicode(name)] = to_unicode(b','.join(values))
    return result
0
Example 17
def _get_agent(self, request, timeout):
    """Select the twisted Agent used to perform *request*.

    Returns a tunneling agent for HTTPS through a proxy, a proxy agent
    for other proxied requests, or the default agent when no proxy is
    configured in request.meta.
    """
    bindaddress = request.meta.get('bindaddress') or self._bindAddress
    proxy = request.meta.get('proxy')
    if proxy:
        _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
        scheme = _parse(request.url)[0]
        proxyHost = to_unicode(proxyHost)
        # a b'noconnect' token in the proxy URL params disables the
        # CONNECT tunnel even for https
        omitConnectTunnel = b'noconnect' in proxyParams
        if scheme == b'https' and not omitConnectTunnel:
            # HTTPS via proxy needs a CONNECT tunnel; pass along any
            # Proxy-Authorization header for the tunnel handshake
            proxyConf = (proxyHost, proxyPort,
                request.headers.get(b'Proxy-Authorization', None))
            return self._TunnelingAgent(reactor, proxyConf,
                contextFactory=self._contextFactory, connectTimeout=timeout,
                bindAddress=bindaddress, pool=self._pool)
        else:
            # plain HTTP (or tunnel-less https): talk to the proxy directly
            endpoint = TCP4ClientEndpoint(reactor, proxyHost, proxyPort,
                timeout=timeout, bindAddress=bindaddress)
            return self._ProxyAgent(endpoint)
    # no proxy configured: connect straight to the target host
    return self._Agent(reactor, contextFactory=self._contextFactory,
        connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)
0
Example 18
def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
    """ Returns an iterator of dictionaries from the given csv object
    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8
    delimiter is the character used to separate fields on the given obj.
    headers is an iterable that when provided offers the keys
    for the returned dictionaries, if not the first row is used.
    quotechar is the character used to enclosure fields on the given obj.
    """
    encoding = obj.encoding if isinstance(obj, TextResponse) else encoding or 'utf-8'

    def _row_to_unicode(row_):
        return [to_unicode(field, encoding) for field in row_]

    # Python 3 csv reader input object needs to return strings
    if six.PY3:
        lines = StringIO(_body_or_str(obj, unicode=True))
    else:
        lines = BytesIO(_body_or_str(obj, unicode=False))
    kwargs = {}
    if delimiter:
        kwargs["delimiter"] = delimiter
    if quotechar:
        kwargs["quotechar"] = quotechar
    csv_r = csv.reader(lines, **kwargs)
    if not headers:
        try:
            headers = _row_to_unicode(next(csv_r))
        except StopIteration:
            # empty input: no header row, nothing to yield
            return
    # Iterate with a for-loop rather than `while True: next(csv_r)`:
    # under PEP 479 (Python 3.7+) a StopIteration escaping a generator
    # body is converted into a RuntimeError, so the original loop would
    # crash instead of finishing cleanly at end of input.
    for row in csv_r:
        row = _row_to_unicode(row)
        if len(row) != len(headers):
            # skip malformed rows but keep going
            logger.warning("ignoring row %(csvlnum)d (length: %(csvrow)d, "
                           "should be: %(csvheader)d)",
                           {'csvlnum': csv_r.line_num, 'csvrow': len(row),
                            'csvheader': len(headers)})
            continue
        yield dict(zip(headers, row))
0
Example 19
Project: scrapy Source File: misc.py
def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:
    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned
    """
    if isinstance(regex, six.string_types):
        regex = re.compile(regex, re.UNICODE)
    try:
        # AttributeError: search() found no match (returned None);
        # IndexError: the pattern has no group named 'extract'.
        # A bare `except:` here also swallowed KeyboardInterrupt/SystemExit.
        strings = [regex.search(text).group('extract')]  # named group
    except (AttributeError, IndexError):
        strings = regex.findall(text)  # full regex or numbered groups
    strings = flatten(strings)
    if isinstance(text, six.text_type):
        return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        # bytes input: decode each match before entity replacement
        return [replace_entities(to_unicode(s, encoding), keep=['lt', 'amp'])
                for s in strings]
0
Example 20
Project: scrapy Source File: reqser.py
def request_to_dict(request, spider=None):
    """Convert Request object to a dict.
    If a spider is given, it will try to find out the name of the spider method
    used in the callback and store that as the callback.
    """
    callback = request.callback
    errback = request.errback
    if callable(callback):
        callback = _find_method(spider, callback)
    if callable(errback):
        errback = _find_method(spider, errback)
    return {
        'url': to_unicode(request.url),  # urls should be safe (safe_string_url)
        'callback': callback,
        'errback': errback,
        'method': request.method,
        'headers': dict(request.headers),
        'body': request.body,
        'cookies': request.cookies,
        'meta': request.meta,
        '_encoding': request._encoding,
        'priority': request.priority,
        'dont_filter': request.dont_filter,
    }
0
Example 21
Project: scrapy Source File: test_crawl.py
@defer.inlineCallbacks
def test_referer_header(self):
    """Referer header is set by RefererMiddleware unless it is already set"""
    req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=1)
    req1 = req0.replace()
    req2 = req0.replace(headers={'Referer': None})
    req3 = req0.replace(headers={'Referer': 'http://example.com'})
    req0.meta['next'] = req1
    req1.meta['next'] = req2
    req2.meta['next'] = req3
    crawler = self.runner.create_crawler(SingleRequestSpider)
    yield crawler.crawl(seed=req0)
    # basic asserts in case of weird communication errors
    self.assertIn('responses', crawler.spider.meta)
    self.assertNotIn('failures', crawler.spider.meta)
    # start requests doesn't set Referer header
    # (fixed: this must read responses[0] — the response to req0; the
    # original read responses[2], duplicating the echo2 assertion and
    # never actually testing the start request)
    echo0 = json.loads(to_unicode(crawler.spider.meta['responses'][0].body))
    self.assertNotIn('Referer', echo0['headers'])
    # following request sets Referer to start request url
    echo1 = json.loads(to_unicode(crawler.spider.meta['responses'][1].body))
    self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
    # next request avoids Referer header
    echo2 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body))
    self.assertNotIn('Referer', echo2['headers'])
    # last request explicitly sets a Referer header
    echo3 = json.loads(to_unicode(crawler.spider.meta['responses'][3].body))
    self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])
0
Example 22
Project: scrapy Source File: test_exporters.py
def _assert_expected_item(self, exported_dict):
    """Decode every value of *exported_dict* in place, then compare to self.i."""
    for key in exported_dict:
        exported_dict[key] = to_unicode(exported_dict[key])
    self.assertEqual(self.i, exported_dict)
0
Example 23
def _check_output(self):
    """The exported CSV equals the expected header line plus one data row."""
    expected = u'age,name\r\n22,John\xa3\r\n'
    self.assertCsvEqual(to_unicode(self.output.getvalue()), expected)
0
Example 24
def _check_output(self):
    """The output decodes to a JSON object equal to the item as a dict."""
    raw = self.output.getvalue().strip()
    self.assertEqual(json.loads(to_unicode(raw)), dict(self.i))
0
Example 25
def _check_output(self):
    """The output decodes to a one-element JSON list holding the item dict."""
    raw = self.output.getvalue().strip()
    self.assertEqual(json.loads(to_unicode(raw)), [dict(self.i)])
0
Example 26
Project: scrapy Source File: test_utils_python.py
def test_converting_an_utf8_encoded_string_to_unicode(self):
    """UTF-8 bytes decode to the matching unicode string by default."""
    result = to_unicode(b'lel\xc3\xb1e')
    self.assertEqual(result, u'lel\xf1e')
0
Example 27
Project: scrapy Source File: test_utils_python.py
def test_converting_a_latin_1_encoded_string_to_unicode(self):
    """An explicit 'latin-1' encoding argument is honoured when decoding."""
    result = to_unicode(b'lel\xf1e', 'latin-1')
    self.assertEqual(result, u'lel\xf1e')
0
Example 28
Project: scrapy Source File: test_utils_python.py
def test_converting_a_unicode_to_unicode_should_return_the_same_object(self):
    """Passing unicode through to_unicode is a no-op."""
    text = u'\xf1e\xf1e\xf1e'
    self.assertEqual(to_unicode(text), text)
0
Example 29
Project: scrapy Source File: test_utils_python.py
def test_errors_argument(self):
    """errors='replace' substitutes U+FFFD for undecodable bytes."""
    decoded = to_unicode(b'a\xedb', 'utf-8', errors='replace')
    self.assertEqual(decoded, u'a\ufffdb')
0
Example 30
def render(self, request):
    """Echo the request body re-encoded in self.out_encoding, with a matching
    content-encoding header."""
    text = to_unicode(request.content.read())
    request.setHeader(b'content-encoding', self.out_encoding)
    return text.encode(self.out_encoding)
0
Example 31
def _check_Encoding(self, response, original_body):
    """Check the response's Content-Encoding header and that decoding the
    body with it reproduces the original body.
    """
    content_encoding = to_unicode(response.headers[b'Content-Encoding'])
    # assertEquals is a deprecated unittest alias; use assertEqual
    self.assertEqual(content_encoding, EncodingResource.out_encoding)
    self.assertEqual(
        response.body.decode(content_encoding), to_unicode(original_body))
0
Example 32
Project: scrapy-splash Source File: utils.py
def parse_x_splash_saved_arguments_header(value):
    """
    Parse X-Splash-Saved-Arguments header value.
    >>> value = u"name1=9a6747fc6259aa374ab4e1bb03074b6ec672cf99;name2=ba001160ef96fe2a3f938fea9e6762e204a562b3"
    >>> dct = parse_x_splash_saved_arguments_header(value)
    >>> sorted(list(dct.keys()))
    ['name1', 'name2']
    >>> dct['name1']
    '9a6747fc6259aa374ab4e1bb03074b6ec672cf99'
    >>> dct['name2']
    'ba001160ef96fe2a3f938fea9e6762e204a562b3'
    Binary header values are also supported:
    >>> dct2 = parse_x_splash_saved_arguments_header(value.encode('utf8'))
    >>> dct2 == dct
    True
    """
    text = to_unicode(value)
    # each ';'-separated entry is "name=hash"; split on the first '=' only
    pairs = (entry.split('=', 1) for entry in text.split(";"))
    return {name: saved_hash for name, saved_hash in pairs}