Here are examples of the Python API aiohttp.ClientError taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.
1 Example
Example 1
Project: sky Source File: crawling.py
@asyncio.coroutine
def fetch(self, prio, url, max_redirects_per_url):
    """Fetch one URL."""
    # Using max_workers since they are not being quit
    if self.num_saved_responses >= self.max_saved_responses:
        return
    if (self.cache is not None and slugify(url) in self.cache and
            (not self.cache.only_save_index_pages or self.should_save(url))):
        LOGGER.info('%r from cache', url)
        links = set()
        num_allowed_urls = 0
        response = yield from self.get_from_cache(slugify(url))
        current_url = response['url']
        if self.should_save(response['url']):
            _ = yield from self.save_response(response['content'], response['url'],
                                              response['headers'], response['crawl_date'])
            # fck = yield from response.text(encoding="cp1252")
            self.num_saved_responses += 1
            LOGGER.info('results: %r, CONVERTED url %r, ',
                        self.num_saved_responses, current_url)
        # Replace href with (?:href|src) to follow image links.
        urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',
                              response['content']))
        for url in urls:
            normalized = urllib.parse.urljoin(current_url, url)
            defragmented, _ = urllib.parse.urldefrag(normalized)
            if self.url_allowed(defragmented) and self.should_crawl(normalized):
                if defragmented not in links and defragmented not in self.seen_urls:
                    num_allowed_urls += 1
                    links.add(defragmented)
        # visitable means: "urls that may be visited according to config"
        LOGGER.info('Queue: %r, FOUND ~%r visitable urls from %r, ',
                    self.q.qsize(), num_allowed_urls, current_url)
        stat = FetchStatistic(
            url=response['url'],
            next_url=None,
            status=response['status'],
            exception=None,
            size=len(response['content']),
            content_type=response['content_type'],
            encoding=response['encoding'],
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls))
        self.record_statistic(stat)
        for link in links.difference(self.seen_urls):
            good = sum([x in link for x in self.index_required_regexps])
            bad = 10 * any([x in link for x in self.index_filter_regexps])
            prio = bad - good  # lower is better
            self.q.put_nowait((prio, link, self.max_redirects_per_url))
        self.seen_urls.update(links)
        return
    tries = 0
    exception = None
    while tries < self.max_tries_per_url:
        try:
            LOGGER.debug('GET url: ' + url)
            response = yield from asyncio.wait_for(
                self.session.get(url, allow_redirects=False), 20)
            if tries > 1:
                LOGGER.info('try %r for %r SUCCESS', tries, url)
            break
        except aiohttp.ClientError as client_error:
            LOGGER.info('try %r for %r RAISED %r', tries, url, client_error)
            exception = client_error
        except asyncio.TimeoutError as e:
            LOGGER.error('asyncio.TimeoutError for %r RAISED %r', url, e)
            exception = e
        except asyncio.CancelledError as e:
            LOGGER.error('asyncio.CancelledError for %r RAISED %r', url, e)
            return
        except Exception as e:
            LOGGER.error('General error for %r RAISED %r', url, e)
            exception = e
        tries += 1
    else:
        # We never broke out of the loop: all tries failed.
        if self.max_tries_per_url > 1:
            LOGGER.error('%r FAILED after %r tries', url, self.max_tries_per_url)
        self.record_statistic(FetchStatistic(url=url,
                                             next_url=None,
                                             status=None,
                                             exception=exception,
                                             size=0,
                                             content_type=None,
                                             encoding=None,
                                             num_urls=0,
                                             num_new_urls=0))
        return
    try:
        if is_redirect(response):
            location = response.headers['location']
            next_url = urllib.parse.urljoin(url, location)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=next_url,
                                                 status=response.status,
                                                 exception=None,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            if next_url in self.seen_urls:
                return
            if max_redirects_per_url > 0:
                LOGGER.info('REDIRECT to %r from %r', next_url, url)
                self.add_url(prio, next_url, max_redirects_per_url - 1)
            else:
                LOGGER.error('REDIRECT limit reached for %r from %r',
                             next_url, url)
        else:
            stat, links = yield from self.handle_response(response)
            self.record_statistic(stat)
            for link in links.difference(self.seen_urls):
                good = sum([x in link for x in self.index_required_regexps])
                bad = 10 * any([x in link for x in self.index_filter_regexps])
                prio = bad - good  # lower is better
                self.q.put_nowait((prio, link, self.max_redirects_per_url))
            self.seen_urls.update(links)
    finally:
        yield from response.release()
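
The example above uses the older @asyncio.coroutine / yield from style and catches aiohttp.ClientError as the base class for client-side request failures, retrying until self.max_tries_per_url is exhausted. Below is a minimal standalone sketch of the same retry pattern in modern async/await syntax, assuming aiohttp 3.x and Python 3.7+; fetch_with_retries, its parameters, and the example URL are illustrative assumptions and are not part of the sky project.

import asyncio
import aiohttp

async def fetch_with_retries(session, url, max_tries=3, timeout=20):
    """Illustrative sketch: retry a GET until it succeeds or the tries run out."""
    exception = None
    for _ in range(max_tries):
        try:
            # aiohttp.ClientError is the common base class for client-side
            # failures such as connection errors and invalid responses.
            return await asyncio.wait_for(
                session.get(url, allow_redirects=False), timeout)
        except aiohttp.ClientError as client_error:
            exception = client_error
        except asyncio.TimeoutError as timeout_error:
            exception = timeout_error
    # All tries failed: re-raise the last error we saw.
    raise exception

async def main():
    async with aiohttp.ClientSession() as session:
        response = await fetch_with_retries(session, 'https://example.com')
        body = await response.text()
        print(response.status, len(body))

asyncio.run(main())

The crawler's while/else form does the same job: the else branch runs only when the loop ends without a break, which is exactly the "all tries failed" case that gets recorded as a FetchStatistic with the last exception attached.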