scrapy.utils.python.to_unicode

Here are examples of the Python API scrapy.utils.python.to_unicode, taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.

32 Examples 7

Example 1

Project: scrapy Source File: http10.py
    def _connect(self, factory):
        """Open the reactor connection for ``factory``.

        The factory host is passed through ``to_unicode`` first. An SSL
        connection is made when the factory scheme is ``b'https'``;
        otherwise a plain TCP connection is made.
        """
        host = to_unicode(factory.host)
        port = factory.port
        if factory.scheme != b'https':
            return reactor.connectTCP(host, port, factory)
        context_factory = self.ClientContextFactory()
        return reactor.connectSSL(host, port, factory, context_factory)

Example 2

Project: scrapy Source File: httpcache.py
def rfc1123_to_epoch(date_str):
    """Convert a date string to a timestamp via ``parsedate_tz``/``mktime_tz``.

    The input is decoded as ASCII first. Any failure along the way
    (decoding, parsing or conversion) results in ``None``.
    """
    try:
        parsed = parsedate_tz(to_unicode(date_str, encoding='ascii'))
        epoch = mktime_tz(parsed)
    except Exception:
        # Best effort: an unusable date is reported as None, not raised.
        epoch = None
    return epoch

Example 3

Project: scrapy Source File: headers.py
    def to_unicode_dict(self):
        """Build a CaselessDict whose keys and values are unicode.

        Each header's values are joined with ',' before being decoded
        with ``self.encoding``.
        """
        pairs = []
        for name, values in self.items():
            decoded_name = to_unicode(name, encoding=self.encoding)
            decoded_value = to_unicode(b','.join(values),
                                       encoding=self.encoding)
            pairs.append((decoded_name, decoded_value))
        return CaselessDict(pairs)

Example 4

Project: scrapy Source File: url.py
Function: parse_url
def parse_url(url, encoding=None):
    """Return the urlparsed form of ``url``.

    A value that is already a ``ParseResult`` is handed back untouched;
    anything else is converted with ``to_unicode`` (using ``encoding``)
    and then parsed.
    """
    already_parsed = isinstance(url, ParseResult)
    return url if already_parsed else urlparse(to_unicode(url, encoding))

Example 5

Project: scrapy Source File: mockserver.py
    def render_GET(self, request):
        """Echo the request's headers and body back as JSON-encoded bytes."""
        headers = {}
        for name, values in request.requestHeaders.getAllRawHeaders():
            headers[to_unicode(name)] = [to_unicode(value) for value in values]
        body = to_unicode(request.content.read())
        payload = {'headers': headers, 'body': body}
        return to_bytes(json.dumps(payload))

Example 6

Project: scrapy Source File: test_exporters.py
    def assertCsvEqual(self, first, second, msg=None):
        """Assert two CSV texts are equal, ignoring token order within a line.

        Each line is split on commas and whitespace runs (separators are
        kept as tokens) and the tokens are sorted before comparison.
        """
        def _sorted_tokens_per_line(text):
            return [sorted(re.split(r'(,|\s+)', line))
                    for line in text.splitlines(True)]

        left = to_unicode(first)
        right = to_unicode(second)
        return self.assertEqual(_sorted_tokens_per_line(left),
                                _sorted_tokens_per_line(right), msg)

Example 7

Project: scrapy Source File: test_exporters.py
Function: test_nested_item
    def test_nested_item(self):
        # Nest an item inside a dict inside another item, export the outer
        # item and compare the decoded JSON with the expected structure.
        innermost = TestItem(name=u'Joseph', age='22')
        middle = dict(name=u'Maria', age=innermost)
        outermost = TestItem(name=u'Jesus', age=middle)
        exporter = self.ie
        exporter.start_exporting()
        exporter.export_item(outermost)
        exporter.finish_exporting()
        raw_output = self.output.getvalue()
        self.assertEqual(json.loads(to_unicode(raw_output)),
                         self._expected_nested)

Example 8

Project: scrapy Source File: test_exporters.py
Function: test_nonstring_types_item
    def test_nonstring_types_item(self):
        # Export an item holding non-string values and check the decoded
        # JSON against the item once its 'time' field is stringified.
        item = self._get_nonstring_types_item()
        exporter = self.ie
        exporter.start_exporting()
        exporter.export_item(item)
        exporter.finish_exporting()
        raw_output = self.output.getvalue()
        exported = json.loads(to_unicode(raw_output))
        item['time'] = str(item['time'])
        self.assertEqual(exported, item)

Example 9

Project: scrapy Source File: test_exporters.py
    def assertTwoItemsExported(self, item):
        # Export the same item twice and expect a JSON list with two
        # identical dict entries.
        exporter = self.ie
        exporter.start_exporting()
        for _ in range(2):
            exporter.export_item(item)
        exporter.finish_exporting()
        raw_output = self.output.getvalue()
        exported = json.loads(to_unicode(raw_output))
        self.assertEqual(exported, [dict(item), dict(item)])

Example 10

Project: scrapy Source File: test_exporters.py
Function: test_nested_item
    def test_nested_item(self):
        # Three levels of nested items; the output is expected to be a
        # one-element list containing plain nested dicts.
        innermost = TestItem(name=u'Joseph\xa3', age='22')
        middle = TestItem(name=u'Maria', age=innermost)
        outermost = TestItem(name=u'Jesus', age=middle)
        exporter = self.ie
        exporter.start_exporting()
        exporter.export_item(outermost)
        exporter.finish_exporting()
        raw_output = self.output.getvalue()
        exported = json.loads(to_unicode(raw_output))
        expected_middle = {'name': 'Maria', 'age': dict(innermost)}
        expected = {'name': u'Jesus', 'age': expected_middle}
        self.assertEqual(exported, [expected])

Example 11

Project: scrapy Source File: test_exporters.py
    def test_nested_dict_item(self):
        # A dict wrapping an item wrapping a dict; the output is expected
        # to be a one-element list containing plain nested dicts.
        innermost = dict(name=u'Joseph\xa3', age='22')
        middle = TestItem(name=u'Maria', age=innermost)
        outermost = dict(name=u'Jesus', age=middle)
        exporter = self.ie
        exporter.start_exporting()
        exporter.export_item(outermost)
        exporter.finish_exporting()
        raw_output = self.output.getvalue()
        exported = json.loads(to_unicode(raw_output))
        expected_middle = {'name': 'Maria', 'age': innermost}
        expected = {'name': u'Jesus', 'age': expected_middle}
        self.assertEqual(exported, [expected])

Example 12

Project: scrapy Source File: test_exporters.py
Function: test_nonstring_types_item
    def test_nonstring_types_item(self):
        # Same as the single-object variant, except the exported JSON is
        # expected to be a list wrapping the item.
        item = self._get_nonstring_types_item()
        exporter = self.ie
        exporter.start_exporting()
        exporter.export_item(item)
        exporter.finish_exporting()
        raw_output = self.output.getvalue()
        exported = json.loads(to_unicode(raw_output))
        item['time'] = str(item['time'])
        self.assertEqual(exported, [item])

Example 13

Project: scrapy Source File: test_webclient.py
def getPage(url, contextFactory=None, response_transform=None, *args, **kwargs):
    """Adapted version of twisted.web.client.getPage"""
    from twisted.web.client import _makeGetterFactory

    def _clientfactory(url, *args, **kwargs):
        # Build the Scrapy client factory; 'timeout' is consumed here and
        # is not forwarded to Request.
        url = to_unicode(url)
        timeout = kwargs.pop('timeout', 0)
        request = Request(url, *args, **kwargs)
        factory = client.ScrapyHTTPClientFactory(request, timeout=timeout)
        # Without an explicit transform the deferred fires with the body.
        transform = response_transform or (lambda r: r.body)
        factory.deferred.addCallback(transform)
        return factory

    getter = _makeGetterFactory(
        to_bytes(url), _clientfactory,
        contextFactory=contextFactory, *args, **kwargs)
    return getter.deferred

Example 14

Project: scrapy Source File: test_webclient.py
    def testFactoryInfo(self):
        # Connect a ScrapyHTTPClientFactory to the test URL and hand the
        # factory to _cbFactoryInfo once its deferred fires.
        target = self.getURL('file')
        _, _, host, port, _ = client._parse(target)
        http_factory = client.ScrapyHTTPClientFactory(Request(target))
        reactor.connectTCP(to_unicode(host), port, http_factory)
        deferred = http_factory.deferred
        return deferred.addCallback(self._cbFactoryInfo, http_factory)

Example 15

Project: scrapy-splash Source File: utils.py
    def to_native_str(text, encoding=None, errors='strict'):
        """ Return the native ``str`` form of `text`: bytes on Python 2.x,
        unicode on Python 3.x. """
        convert = to_bytes if six.PY2 else to_unicode
        return convert(text, encoding, errors)

Example 16

Project: scrapy-splash Source File: utils.py
def scrapy_headers_to_unicode_dict(headers):
    """
    Turn a scrapy.http.Headers instance into a plain dictionary
    that can be JSON-encoded.

    The values of each header are joined with ',' and both names and
    joined values are converted with ``to_unicode``.
    """
    converted = {}
    for name, values in headers.items():
        converted[to_unicode(name)] = to_unicode(b','.join(values))
    return converted

Example 17

Project: scrapy Source File: http11.py
Function: get_agent
    def _get_agent(self, request, timeout):
        """Pick the agent used to download ``request``.

        Without a ``proxy`` entry in ``request.meta`` a regular agent is
        returned. With a proxy, https requests go through a tunneling
        agent unless the proxy parameters contain ``noconnect``; all
        other proxied requests use a proxy agent over a TCP4 endpoint.
        """
        bindaddress = request.meta.get('bindaddress') or self._bindAddress
        proxy = request.meta.get('proxy')
        if not proxy:
            return self._Agent(reactor, contextFactory=self._contextFactory,
                connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)

        _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
        scheme = _parse(request.url)[0]
        proxyHost = to_unicode(proxyHost)
        omitConnectTunnel = b'noconnect' in proxyParams
        use_tunnel = scheme == b'https' and not omitConnectTunnel
        if not use_tunnel:
            endpoint = TCP4ClientEndpoint(reactor, proxyHost, proxyPort,
                timeout=timeout, bindAddress=bindaddress)
            return self._ProxyAgent(endpoint)

        proxy_auth = request.headers.get(b'Proxy-Authorization', None)
        proxyConf = (proxyHost, proxyPort, proxy_auth)
        return self._TunnelingAgent(reactor, proxyConf,
            contextFactory=self._contextFactory, connectTimeout=timeout,
            bindAddress=bindaddress, pool=self._pool)

Example 18

Project: scrapy Source File: iterators.py
Function: csviter
def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
    """ Returns an iterator of dictionaries from the given csv object

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8

    delimiter is the character used to separate fields on the given obj.

    headers is an iterable that when provided offers the keys
    for the returned dictionaries, if not the first row is used.

    quotechar is the character used to enclosure fields on the given obj.

    Rows whose field count differs from the number of headers are skipped
    with a warning. Iteration ends cleanly when the input is exhausted,
    including when the input is empty and no headers were supplied.
    """

    # For a TextResponse its own encoding wins; otherwise use the given
    # encoding, falling back to utf-8.
    encoding = obj.encoding if isinstance(obj, TextResponse) else encoding or 'utf-8'

    def _decode_row(fields):
        return [to_unicode(field, encoding) for field in fields]

    # Python 3 csv reader input object needs to return strings
    if six.PY3:
        lines = StringIO(_body_or_str(obj, unicode=True))
    else:
        lines = BytesIO(_body_or_str(obj, unicode=False))

    kwargs = {}
    if delimiter:
        kwargs["delimiter"] = delimiter
    if quotechar:
        kwargs["quotechar"] = quotechar
    csv_r = csv.reader(lines, **kwargs)

    if not headers:
        try:
            headers = _decode_row(next(csv_r))
        except StopIteration:
            # Empty input: no header row and nothing to yield. Letting
            # StopIteration escape a generator becomes RuntimeError under
            # PEP 479, so finish explicitly instead.
            return

    # Iterate with ``for`` rather than ``while True`` + ``next()`` so the
    # end of input stops the loop without raising StopIteration here.
    for raw_row in csv_r:
        row = _decode_row(raw_row)
        if len(row) != len(headers):
            logger.warning("ignoring row %(csvlnum)d (length: %(csvrow)d, "
                           "should be: %(csvheader)d)",
                           {'csvlnum': csv_r.line_num, 'csvrow': len(row),
                            'csvheader': len(headers)})
            continue
        yield dict(zip(headers, row))

Example 19

Project: scrapy Source File: misc.py
def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned

    ``regex`` may be a pattern string (compiled with ``re.UNICODE``) or an
    already compiled pattern. Entities in the results are replaced, keeping
    ``lt`` and ``amp``. When ``text`` is not a text string, each result is
    first converted with ``to_unicode`` using ``encoding``.
    """

    if isinstance(regex, six.string_types):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group('extract')]   # named group
    except Exception:
        # No match (search() returned None) or no "extract" group: fall
        # back to findall. ``Exception`` instead of a bare ``except`` so
        # KeyboardInterrupt/SystemExit are not swallowed.
        strings = regex.findall(text)    # full regex or numbered groups
    strings = flatten(strings)

    if isinstance(text, six.text_type):
        return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [replace_entities(to_unicode(s, encoding), keep=['lt', 'amp'])
                for s in strings]

Example 20

Project: scrapy Source File: reqser.py
def request_to_dict(request, spider=None):
    """Serialize a Request object into a plain dict.

    Callable callbacks and errbacks are passed through ``_find_method``
    together with ``spider``, and whatever it returns is stored in their
    place; non-callable values are stored unchanged.
    """
    def _resolve(func):
        return _find_method(spider, func) if callable(func) else func

    callback = _resolve(request.callback)
    errback = _resolve(request.errback)
    return {
        'url': to_unicode(request.url),  # urls should be safe (safe_string_url)
        'callback': callback,
        'errback': errback,
        'method': request.method,
        'headers': dict(request.headers),
        'body': request.body,
        'cookies': request.cookies,
        'meta': request.meta,
        '_encoding': request._encoding,
        'priority': request.priority,
        'dont_filter': request.dont_filter,
    }

Example 21

Project: scrapy Source File: test_crawl.py
    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        # Chain four requests through meta['next']: req0 -> req1 -> req2 -> req3.
        req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=req0)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', crawler.spider.meta)
        self.assertNotIn('failures', crawler.spider.meta)
        responses = crawler.spider.meta['responses']
        # start requests doesn't set Referer header
        # (index 0: reading index 2 here would only repeat the echo2 check
        # and leave the start request's response unverified)
        echo0 = json.loads(to_unicode(responses[0].body))
        self.assertNotIn('Referer', echo0['headers'])
        # following request sets Referer to start request url
        echo1 = json.loads(to_unicode(responses[1].body))
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # next request avoids Referer header
        echo2 = json.loads(to_unicode(responses[2].body))
        self.assertNotIn('Referer', echo2['headers'])
        # last request explicitly sets a Referer header
        echo3 = json.loads(to_unicode(responses[3].body))
        self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])

Example 22

Project: scrapy Source File: test_exporters.py
    def _assert_expected_item(self, exported_dict):
        # Convert every value in place with to_unicode, then compare the
        # dict with the reference item self.i.
        for key in list(exported_dict):
            exported_dict[key] = to_unicode(exported_dict[key])
        self.assertEqual(self.i, exported_dict)

Example 23

Project: scrapy Source File: test_exporters.py
Function: check_output
    def _check_output(self):
        # The CSV output is expected to hold a header row and one data row.
        produced = to_unicode(self.output.getvalue())
        expected = u'age,name\r\n22,John\xa3\r\n'
        self.assertCsvEqual(produced, expected)

Example 24

Project: scrapy Source File: test_exporters.py
Function: check_output
    def _check_output(self):
        # The stripped output is expected to be one JSON object equal to
        # the reference item.
        raw_output = self.output.getvalue().strip()
        exported = json.loads(to_unicode(raw_output))
        expected = dict(self.i)
        self.assertEqual(exported, expected)

Example 25

Project: scrapy Source File: test_exporters.py
Function: check_output
    def _check_output(self):
        # The stripped output is expected to be a JSON list holding exactly
        # the reference item.
        raw_output = self.output.getvalue().strip()
        exported = json.loads(to_unicode(raw_output))
        expected = [dict(self.i)]
        self.assertEqual(exported, expected)

Example 26

Project: scrapy Source File: test_utils_python.py
    def test_converting_an_utf8_encoded_string_to_unicode(self):
        # No encoding argument is passed for the UTF-8 encoded input.
        decoded = to_unicode(b'lel\xc3\xb1e')
        self.assertEqual(decoded, u'lel\xf1e')

Example 27

Project: scrapy Source File: test_utils_python.py
    def test_converting_a_latin_1_encoded_string_to_unicode(self):
        # The encoding is passed positionally as the second argument.
        decoded = to_unicode(b'lel\xf1e', 'latin-1')
        self.assertEqual(decoded, u'lel\xf1e')

Example 28

Project: scrapy Source File: test_utils_python.py
    def test_converting_a_unicode_to_unicode_should_return_the_same_object(self):
        # NOTE(review): the name mentions "the same object", but only
        # equality (not identity) is asserted here.
        converted = to_unicode(u'\xf1e\xf1e\xf1e')
        self.assertEqual(converted, u'\xf1e\xf1e\xf1e')

Example 29

Project: scrapy Source File: test_utils_python.py
    def test_errors_argument(self):
        # With errors='replace' the undecodable byte is expected to become
        # U+FFFD.
        decoded = to_unicode(b'a\xedb', 'utf-8', errors='replace')
        self.assertEqual(decoded, u'a\ufffdb')

Example 30

Project: scrapy Source File: test_webclient.py
Function: render
    def render(self, request):
        # Echo the request body back re-encoded with self.out_encoding and
        # put that encoding name in the content-encoding header.
        text = to_unicode(request.content.read())
        target_encoding = self.out_encoding
        request.setHeader(b'content-encoding', target_encoding)
        return text.encode(target_encoding)

Example 31

Project: scrapy Source File: test_webclient.py
Function: check_encoding
    def _check_Encoding(self, response, original_body):
        """Check the Content-Encoding header and the re-encoded body.

        The header must equal ``EncodingResource.out_encoding``, and the
        response body decoded with that encoding must equal
        ``original_body`` converted with ``to_unicode``.
        """
        content_encoding = to_unicode(response.headers[b'Content-Encoding'])
        # assertEqual rather than the deprecated assertEquals alias, which
        # Python 3.12 removed from unittest.
        self.assertEqual(content_encoding, EncodingResource.out_encoding)
        self.assertEqual(
            response.body.decode(content_encoding), to_unicode(original_body))

Example 32

Project: scrapy-splash Source File: utils.py
def parse_x_splash_saved_arguments_header(value):
    """
    Parse X-Splash-Saved-Arguments header value.

    >>> value = u"name1=9a6747fc6259aa374ab4e1bb03074b6ec672cf99;name2=ba001160ef96fe2a3f938fea9e6762e204a562b3"
    >>> dct = parse_x_splash_saved_arguments_header(value)
    >>> sorted(list(dct.keys()))
    ['name1', 'name2']
    >>> dct['name1']
    '9a6747fc6259aa374ab4e1bb03074b6ec672cf99'
    >>> dct['name2']
    'ba001160ef96fe2a3f938fea9e6762e204a562b3'

    Binary header values are also supported:
    >>> dct2 = parse_x_splash_saved_arguments_header(value.encode('utf8'))
    >>> dct2 == dct
    True

    Empty segments (an empty header or a trailing ";") are skipped:
    >>> parse_x_splash_saved_arguments_header(u"name1=abc;") == {u'name1': u'abc'}
    True
    >>> parse_x_splash_saved_arguments_header(u"")
    {}
    """
    value = to_unicode(value)
    # Split each "name=hash" pair on the first '=' only. Empty segments are
    # dropped because they would produce a one-element sequence that
    # dict() rejects with ValueError.
    return dict(kv.split('=', 1) for kv in value.split(";") if kv)