Here are examples of the Python API scrapy.exceptions.IgnoreRequest, taken from open source projects.
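IgnoreRequest is raised from a downloader middleware's process_request or process_response hook; Scrapy then drops the request and calls the request's errback with the exception rather than logging an error. A minimal sketch, assuming a hypothetical blocklist attribute:

from scrapy.exceptions import IgnoreRequest

class BlocklistMiddleware:
    # Hypothetical example: drop any request whose URL contains a
    # blocklisted substring.
    blocklist = ('/ads/', '/tracking/')

    def process_request(self, request, spider):
        if any(fragment in request.url for fragment in self.blocklist):
            raise IgnoreRequest('URL matches blocklist: %s' % request.url)
        return None  # let the remaining middlewares process the request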
15 Examples
Example 1. Source: robotstxt.py, MIT License, from autofelix
def process_request_2(self, rp, request, spider):
    if rp is None:
        return
    if not rp.can_fetch(to_native_str(self._useragent), request.url):
        logger.debug("Forbidden by robots.txt: %(request)s",
                     {'request': request}, extra={'spider': spider})
        self.crawler.stats.inc_value('robotstxt/forbidden')
        raise IgnoreRequest("Forbidden by robots.txt")
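This mirrors Scrapy's built-in RobotsTxtMiddleware, which is switched on through the ROBOTSTXT_OBEY setting; a minimal configuration sketch:

# settings.py -- enable Scrapy's stock robots.txt enforcement; disallowed
# requests are dropped with IgnoreRequest and counted under the
# 'robotstxt/forbidden' stat, as in the example above.
ROBOTSTXT_OBEY = True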
Example 2. Source: controllers.py, MIT License, from crawlerflow
def process_request(self, request, spider):
    current_request_traversal_id = request.meta.get("current_request_traversal_id")
    current_traversal_max_count = request.meta.get("current_traversal_max_count")
    if current_request_traversal_id != "init":
        current_request_traversal_count = spider.crawler.stats.get_value(
            'crawlerflow-stats/traversals/{}/requests_count'.format(current_request_traversal_id),
            spider=spider)
        if current_request_traversal_count and current_request_traversal_count > current_traversal_max_count:
            # IgnoreRequest takes a positional message; it has no 'reason' keyword
            raise IgnoreRequest(
                "max traversals for traversal_id: {} achieved".format(current_request_traversal_id))
Example 3. Source: controllers.py, MIT License, from crawlerflow
def process_request(self, request, spider):
    spider_id = spider.spider_config.get("spider_id")
    current_spider_requests_count = spider.crawler.stats.get_value(
        'crawlerflow-stats/spiders/{}/requests_count'.format(spider_id),
        spider=spider)
    current_spider_max_requests_count = spider.spider_config.get("stop_criteria", {}).get("max_requests")
    if current_spider_max_requests_count and current_spider_requests_count:
        if current_spider_requests_count > current_spider_max_requests_count:
            # IgnoreRequest takes a positional message; it has no 'reason' keyword
            raise IgnoreRequest(
                "max requests {} for spider: {} achieved. "
                "Ignoring the rest of this spider's requests".format(current_spider_max_requests_count,
                                                                     spider_id))
Example 4. Source: controllers.py, MIT License, from crawlerflow
def process_request(self, request, spider):
    spider_id = spider.spider_config.get("spider_id")
    current_spider_responses_count = spider.crawler.stats.get_value(
        'crawlerflow-stats/spiders/{}/responses_count'.format(spider_id),
        spider=spider)
    current_spider_max_responses_count = spider.spider_config.get("stop_criteria", {}).get("max_responses")
    if current_spider_max_responses_count and current_spider_responses_count:
        if current_spider_responses_count > current_spider_max_responses_count:
            # IgnoreRequest takes a positional message; it has no 'reason' keyword
            raise IgnoreRequest(
                "max responses {} for spider: {} achieved. "
                "Ignoring the rest of this spider's requests".format(current_spider_max_responses_count,
                                                                     spider_id))
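These stop controllers only read the stats counters; something else has to increment them. A hedged sketch of the counting side, assuming the same stat key scheme (this companion middleware is hypothetical, not part of crawlerflow):

class SpiderRequestsCounterMiddleware(object):
    def process_request(self, request, spider):
        # Increment the per-spider counter that the stop controllers
        # above compare against their configured maximums.
        spider_id = spider.spider_config.get("spider_id")
        spider.crawler.stats.inc_value(
            'crawlerflow-stats/spiders/{}/requests_count'.format(spider_id),
            spider=spider)
        return None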
Example 5. Source: robotstxt.py, Apache License 2.0, from lynings
def process_request_2(self, rp, request, spider):
    if rp is not None and not rp.can_fetch(
            to_native_str(self._useragent), request.url):
        logger.debug("Forbidden by robots.txt: %(request)s",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest()
Example 6. Source: downloadermiddlewares.py, BSD 3-Clause "New" or "Revised" License, from open-contracting
def process_request(self, request, spider):
    if 'auth' in request.meta and request.meta['auth'] is False:
        return
    if spider.auth_failed:
        spider.crawler.engine.close_spider(spider, 'auth_failed')
        raise IgnoreRequest("Max attempts to get an access token reached. Stopping crawl...")
    request.headers['Authorization'] = spider.access_token
    if self._expires_soon(spider):
        # Save the last request so the crawl can continue after getting the token
        spider.last_requests.append(request)
        spider.logger.info('Saving request for after getting the token: %s', request.url)
        # The spider must implement the request_access_token method
        return spider.request_access_token()
Example 7. Source: anti_spider.py, MIT License, from zhanghe06
def process_request(self, request, spider):
    # Handle WeChat anti-crawling (anti-crawling mechanism 1, sogou)
    if spider.name in ['weixin'] and 'antispider' in request.url:
        # Get the referring URLs
        redirect_urls = request.meta['redirect_urls']
        # Clear the invalidated cookies
        cookies_id = request.meta['cookiejar']
        del_cookies(spider.name, cookies_id)
        # spider.log(message='AntiSpider cookies_id: %s; url: %s' % (cookies_id, redirect_urls[0]))
        raise IgnoreRequest(
            'Spider: %s, AntiSpider cookies_id: %s; url: %s' % (spider.name, cookies_id, redirect_urls[0]))
Example 8. Source: de_duplication_request.py, MIT License, from zhanghe06
def process_request(self, request, spider):
    if not request.url:
        return None
    channel_id = request.meta.get('channel_id', 0)
    # Handle detail pages (ignore list pages), in coordination with the pipeline
    if is_dup_detail(request.url, spider.name, channel_id):
        raise IgnoreRequest("Spider: %s, DeDuplicationRequest: %s" % (spider.name, request.url))
Example 9. Source: httpcache.py, MIT License, from autofelix
def process_request(self, request, spider):
    if request.meta.get('dont_cache', False):
        return
    # Skip uncacheable requests
    if not self.policy.should_cache_request(request):
        request.meta['_dont_cache'] = True  # flag as uncacheable
        return
    # Look for cached response and check if expired
    cachedresponse = self.storage.retrieve_response(spider, request)
    if cachedresponse is None:
        self.stats.inc_value('httpcache/miss', spider=spider)
        if self.ignore_missing:
            self.stats.inc_value('httpcache/ignore', spider=spider)
            raise IgnoreRequest("Ignored request not in cache: %s" % request)
        return  # first time request
    # Return cached response only if not expired
    cachedresponse.flags.append('cached')
    if self.policy.is_cached_response_fresh(cachedresponse, request):
        self.stats.inc_value('httpcache/hit', spider=spider)
        return cachedresponse
    # Keep a reference to cached response to avoid a second cache lookup on
    # process_response hook
    request.meta['cached_response'] = cachedresponse
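This is the request half of Scrapy's built-in HttpCacheMiddleware; the IgnoreRequest branch only fires when the cache is configured to ignore uncached requests. A minimal settings sketch:

# settings.py -- serve everything from the cache and drop requests that
# are not already cached (useful for strictly offline re-crawls).
HTTPCACHE_ENABLED = True
HTTPCACHE_IGNORE_MISSING = True
HTTPCACHE_EXPIRATION_SECS = 0  # 0 means cached responses never expire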
Example 10. Source: redirect.py, MIT License, from autofelix
def _redirect(self, redirected, request, spider, reason):
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    redirects = request.meta.get('redirect_times', 0) + 1
    if ttl and redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + \
            [reason]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                     {'reason': reason, 'redirected': redirected, 'request': request},
                     extra={'spider': spider})
        return redirected
    else:
        logger.debug("Discarding %(request)s: max redirections reached",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest("max redirections reached")
Example 11. Source: middlewares.py, Apache License 2.0, from cat9
# Requires: from urllib.parse import urljoin; from w3lib.url import safe_url_string
def process_response(self, request, response, spider):
    # Called with the response returned from the downloader.
    # Must either:
    # - return a Response object
    # - return a Request object
    # - or raise IgnoreRequest
    if (request.meta.get('dont_redirect', False) or
            response.status in getattr(spider, 'handle_httpstatus_list', []) or
            response.status in request.meta.get('handle_httpstatus_list', []) or
            request.meta.get('handle_httpstatus_all', False)):
        return response
    allowed_status = (301, 302, 303, 307, 308)
    if 'Location' not in response.headers or response.status not in allowed_status:
        return response
    location = safe_url_string(response.headers['location'])
    if response.status == 302 and (location.startswith('https://sec.aliexpress.com') or
                                   location.startswith('https://login.aliexpress.com')):
        redirects = request.meta.get('redirect_times', 0) + 1
        print("AliexpressRedirectMiddleware redirects %d,%s" % (redirects, location))
        if redirects <= self.max_redirect_times:
            return response
        else:
            logger.debug("Discarding %(request)s: max redirections reached",
                         {'request': request}, extra={'spider': spider})
            raise IgnoreRequest("max redirections reached")
    else:
        redirected_url = urljoin(request.url, location)
        if response.status in (301, 307, 308) or request.method == 'HEAD':
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)
        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)
Example 12. Source: redirect.py, The Unlicense, from dspray95
def _redirect(self, redirected, request, spider, reason):
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    redirects = request.meta.get('redirect_times', 0) + 1
    if ttl and redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                     {'reason': reason, 'redirected': redirected, 'request': request},
                     extra={'spider': spider})
        return redirected
    else:
        logger.debug("Discarding %(request)s: max redirections reached",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest("max redirections reached")
Example 13. Source: middlewares.py, MIT License, from ginopalazzo
def process_response(self, request, response, spider):
    """
    Process the response, checking if the server has blocked an IP.
    :param request: the request that originated the response
    :param response: the response being processed
    :param spider: the spider for which this response is intended
    :return: pass the response to the next layer.
    """
    log.debug(' request: %s %s' % (request.url, request.meta))
    log.debug(' response: %s %s' % (response.url, response.status))
    log.debug(' num_pages: %s' % self._num_pages)
    self._num_pages = self._num_pages + 1
    # Check whether the server sent a block response and delete that IP from the list.
    if response.status in self._http_status_codes:
        del self.proxies[-1]
        reason = '%d in %s. IP %s deleted. %d proxies left' % (
            response.status, response.url, request.meta['proxy'], len(self.proxies))
        log.warning(reason)
        raise IgnoreRequest(reason)
    return response
Example 14. Source: middlewares.py, MIT License, from Open-Speech-EkStep
def process_request(self, request, spider):
    # Called for each request that goes through the downloader middleware.
    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called
    # if request.url.rstrip() in self.bing_archive:
    #     raise IgnoreRequest()
    for word in self.word_to_ignore:
        if word.lower() in request.url.lower():
            raise IgnoreRequest()
    for ext in self.extensions_to_ignore:
        if request.url.lower().endswith(ext):
            raise IgnoreRequest()
    # with open(self.bing_archive_path, 'a') as f:
    #     f.write(request.url + "\n")
    self.visited_urls.append(request.url)
    return None
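word_to_ignore and extensions_to_ignore are instance attributes set elsewhere in the project; a hedged sketch of how such lists might be pulled from project settings (the setting names are assumptions, not from the original code):

from scrapy.exceptions import IgnoreRequest

class UrlFilterMiddleware:
    def __init__(self, words, extensions):
        self.word_to_ignore = words
        self.extensions_to_ignore = extensions
        self.visited_urls = []

    @classmethod
    def from_crawler(cls, crawler):
        # Hypothetical setting names; the original project may load
        # these lists differently.
        return cls(
            crawler.settings.getlist('URLFILTER_WORDS_TO_IGNORE'),
            crawler.settings.getlist('URLFILTER_EXTENSIONS_TO_IGNORE'),
        )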
Example 15. Source: middlewares.py, MIT License, from tonywu7
# Requires: from urllib.parse import urlsplit; from scrapy.http import Response
def process_response(self, request, response: Response, spider):
    if response.status == 401:
        self.log.warning('Server returned HTTP 401 Unauthorized.')
        self.log.warning('This is because you are accessing an API that requires authorization, and')
        self.log.warning('you either did not provide, or provided a wrong, access token.')
        self.log.warning(f'URL: {request.url}')
        raise IgnoreRequest()
    # Compare the host component; a SplitResult never equals a plain string
    if response.status == 429 and urlsplit(request.url).netloc == 'cloud.feedly.com':
        retry_after = response.headers.get('Retry-After')
        if retry_after:
            retry_after = int(retry_after)
            self.log.warning('Server returned HTTP 429 Too Many Requests.')
            self.log.warning('Either your IP address or your developer account is being rate-limited.')
            self.log.warning(f'Retry-After = {retry_after}s')
            self.log.warning(f'Scrapy will now pause for {retry_after}s')
            spider.crawler.engine.pause()
            to_sleep = retry_after * 1.2
            try:
                wait(to_sleep)
            except KeyboardInterrupt:
                self.crawler.engine.unpause()
                raise
            spider.crawler.engine.unpause()
            self.log.info('Resuming crawl.')
            return request.copy()
        else:
            self.log.critical('Server returned HTTP 429 Too Many Requests.')
            self.log.critical('Either your IP address or your developer account is being rate-limited.')
            self.log.critical('Crawler will now stop.')
            self.crawler.engine.close_spider(spider, 'rate_limited')
            raise IgnoreRequest()
    return response
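Whichever pattern a middleware uses, it only takes effect once it is registered; a minimal registration sketch (the module path and priority value are illustrative placeholders):

# settings.py -- register a custom downloader middleware so its
# process_request/process_response hooks can raise IgnoreRequest.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.MyIgnoreMiddleware': 543,
}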