aiohttp.ClientError

Here are examples of the Python API aiohttp.ClientError taken from open source projects.
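
Before the project code, here is a minimal, self-contained sketch (not one of the counted examples below) of the usual pattern: aiohttp.ClientError is the base class for aiohttp's client-side exceptions, so a single except clause around a request covers connection failures, bad responses, and related errors. The URL and function name are placeholders.

import asyncio
import aiohttp

async def fetch(url):
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url) as response:
                return await response.text()
        except aiohttp.ClientError as exc:
            # Connection problems, invalid responses, etc. all derive
            # from aiohttp.ClientError.
            print('request to %r failed: %r' % (url, exc))
            return None

# asyncio.run(fetch('https://example.com'))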

1 Example

Example 1

Project: sky Source File: crawling.py
    @asyncio.coroutine
    def fetch(self, prio, url, max_redirects_per_url):
        """Fetch one URL."""
        # Using max_workers since they are not being quit
        if self.num_saved_responses >= self.max_saved_responses:
            return
        if (self.cache is not None and slugify(url) in self.cache and
                (not self.cache.only_save_index_pages or self.should_save(url))):
            LOGGER.info('%r from cache', url)
            links = set()
            num_allowed_urls = 0
            response = yield from self.get_from_cache(slugify(url))
            current_url = response['url']
            if self.should_save(response['url']):
                _ = yield from self.save_response(response['content'], response['url'],
                                                  response['headers'], response['crawl_date'])

                # fck = yield from response.text(encoding="cp1252")
                self.num_saved_responses += 1
                LOGGER.info('results: %r, CONVERTED url %r, ',
                            self.num_saved_responses, current_url)

            # Replace href with (?:href|src) to follow image links.
            urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',
                                  response['content']))

            for url in urls:
                normalized = urllib.parse.urljoin(current_url, url)
                defragmented, _ = urllib.parse.urldefrag(normalized)
                if self.url_allowed(defragmented) and self.should_crawl(normalized):
                    if defragmented not in links and defragmented not in self.seen_urls:
                        num_allowed_urls += 1
                        links.add(defragmented)

            # visitable means: "urls that may be visited according to config"
            LOGGER.info('Queue: %r, FOUND ~%r visitable urls from %r, ',
                        self.q.qsize(), num_allowed_urls, current_url)

            stat = FetchStatistic(
                url=response['url'],
                next_url=None,
                status=response['status'],
                exception=None,
                size=len(response['content']),
                content_type=response['content_type'],
                encoding=response['encoding'],
                num_urls=len(links),
                num_new_urls=len(links - self.seen_urls))

            self.record_statistic(stat)
            for link in links.difference(self.seen_urls):
                good = sum([x in link for x in self.index_required_regexps])
                bad = 10 * any([x in link for x in self.index_filter_regexps])
                prio = bad - good  # lower is better
                self.q.put_nowait((prio, link, self.max_redirects_per_url))

            self.seen_urls.update(links)

            return
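        # Not cached: fetch over the network, retrying on aiohttp.ClientError
        # and asyncio.TimeoutError up to max_tries_per_url times.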
        tries = 0
        exception = None
        while tries < self.max_tries_per_url:
            try:
                LOGGER.debug('GET url: ' + url)
                response = yield from asyncio.wait_for(
                    self.session.get(url, allow_redirects=False), 20)
                if tries > 1:
                    LOGGER.info('try %r for %r SUCCESS', tries, url)
                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r RAISED %r', tries, url, client_error)
                exception = client_error
            except asyncio.TimeoutError as e:
                LOGGER.error('asyncio.TimeoutError for %r RAISED %r', url, e)
                exception = e
            except asyncio.CancelledError as e:
                LOGGER.error('asyncio.CancelledError for %r RAISED %r', url, e)
                return
            except Exception as e:
                LOGGER.error('General error for %r RAISED %r', url, e)
                exception = e
            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            if self.max_tries_per_url > 1:
                LOGGER.error('%r FAILED after %r tries', url, self.max_tries_per_url)

            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=None,
                                                 status=None,
                                                 exception=exception,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(FetchStatistic(url=url,
                                                     next_url=next_url,
                                                     status=response.status,
                                                     exception=None,
                                                     size=0,
                                                     content_type=None,
                                                     encoding=None,
                                                     num_urls=0,
                                                     num_new_urls=0))

                if next_url in self.seen_urls:
                    return
                if max_redirects_per_url > 0:
                    LOGGER.info('REDIRECT to %r from %r', next_url, url)
                    self.add_url(prio, next_url, max_redirects_per_url - 1)
                else:
                    LOGGER.error('REDIRECT limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = yield from self.handle_response(response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    good = sum([x in link for x in self.index_required_regexps])
                    bad = 10 * any([x in link for x in self.index_filter_regexps])
                    prio = bad - good  # lower is better
                    self.q.put_nowait((prio, link, self.max_redirects_per_url))

                self.seen_urls.update(links)
        finally:
            yield from response.release()
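
The project code above uses the pre-async/await coroutine style (@asyncio.coroutine with yield from), which has since been removed from asyncio. A rough sketch of the same retry-on-ClientError pattern in current syntax might look as follows; the function name, max_tries default, and 20-second timeout are illustrative, not taken from the project.

import asyncio
import aiohttp

async def fetch_with_retries(session, url, max_tries=4):
    """Return the response for url, retrying on client errors and timeouts."""
    exception = None
    for attempt in range(1, max_tries + 1):
        try:
            # Bound each attempt to 20 seconds, as the crawler above does.
            return await asyncio.wait_for(
                session.get(url, allow_redirects=False), 20)
        except aiohttp.ClientError as client_error:
            # Base class for aiohttp's client-side failures; retry these.
            exception = client_error
        except asyncio.TimeoutError as timeout_error:
            exception = timeout_error
    # Every attempt failed; surface the last exception to the caller.
    raise exception

The caller owns the aiohttp.ClientSession and passes it in, e.g. response = await fetch_with_retries(session, 'https://example.com').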