Here are examples of the Python API scrapy.spiders.Rule taken from open source projects.
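Before the examples, here is a minimal sketch of how Rule is typically declared on a CrawlSpider. It is illustrative only: the spider name, domain, URL patterns, and callback are hypothetical, not taken from the projects below.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class ExampleSpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    rules = (
        # Follow category pages and keep extracting links from them.
        Rule(LinkExtractor(allow=r'/category/'), follow=True),
        # Send item pages to a callback; when a callback is given and
        # follow is not, links on those pages are not followed further.
        Rule(LinkExtractor(allow=r'/item/\d+'), callback='parse_item'),
    )

    def parse_item(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}

Each Rule pairs a link extractor with optional callback, follow, process_links, and process_request hooks; the examples below exercise most of these parameters.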
9 Examples
Example 1
def __init__(self, *args, **kwargs):
    super(FormWithCookieSpider, self).__init__(*args, **kwargs)
    self.start_urls = [kwargs.get('start_url')]
    self.cookiejar = cookielib.LWPCookieJar()
    self.cookiejar.load(kwargs.get('cookie_jar'))
    self.rules = (
        Rule(SgmlLinkExtractor(allow=('')), callback='parse_form', follow=True,
             process_request='add_cookie_for_request'),
    )
    super(FormWithCookieSpider, self)._compile_rules()
Example 2
def __init__(self, *args, **kwargs):
    super(UrlSpider, self).__init__(*args, **kwargs)
    self.start_urls = [kwargs.get('start_url')]
    follow = True if kwargs.get('follow') == 'true' else False
    self.rules = (
        Rule(SgmlLinkExtractor(allow=('')), callback='parse_url', follow=follow),
    )
    super(UrlSpider, self)._compile_rules()
    try:
        proxy = kwargs.get('proxy')
        service_args = [
            '--proxy=' + proxy,
            '--proxy-type=http',
        ]
    except TypeError:
        # kwargs.get('proxy') returns None when no proxy was passed,
        # so the string concatenation above raises TypeError.
        service_args = None
Example 3
def __init__(self, *args, **kwargs):
    super(UrlWithCookieSpider, self).__init__(*args, **kwargs)
    self.start_urls = [kwargs.get('start_url')]
    self.cookiejar = cookielib.LWPCookieJar()
    self.cookiejar.load(kwargs.get('cookie_jar'))
    self.rules = (
        Rule(SgmlLinkExtractor(allow=('')), callback='parse_url', follow=True,
             process_request='add_cookie_for_request'),
    )
    super(UrlWithCookieSpider, self)._compile_rules()
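Examples 1 and 3 pass process_request='add_cookie_for_request', but the excerpts do not include that method. A plausible sketch of such a hook, assuming it copies the loaded LWP cookie jar onto each extracted request (hypothetical, not from the quoted source):

def add_cookie_for_request(self, request):
    # Hypothetical helper: attach the cookies loaded in __init__ to every
    # request the rule generates. A process_request hook must return a
    # Request (or None to drop it).
    return request.replace(
        cookies={cookie.name: cookie.value for cookie in self.cookiejar})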
Example 4
Project: corpus-builder Source File: samakal.py
def request_index(self, response):
    categories = list(set(response.css('#topMenuItem a::attr("href")').re('/([^\/]+)/$')))
    if self.category is not None:
        if self.category in categories:
            categories = [self.category]
        else:
            raise ValueError('invalid category slug. available slugs: %s' % ", ".join(categories))
    date_processing = self.start_date
    while date_processing <= self.end_date:
        for category in categories:
            # redefining the rule according to the specific date url
            SamakalSpider.rules = (
                Rule(LinkExtractor(allow=('/' + date_processing.strftime('%Y/%m/%d') + '/\d+$',),
                                   restrict_xpaths=('//div[@class="main-body"]')),
                     callback="parse_content", follow=True),
            )
            super(SamakalSpider, self)._compile_rules()
            # http://bangla.samakal.net/-education/2016/06/01
            url = 'http://bangla.samakal.net/{0}/{1}'.format(
                category,
                date_processing.strftime('%Y/%m/%d')
            )
            yield self.make_requests_from_url(url)
        date_processing += datetime.timedelta(days=1)
Example 5
Project: scrapy Source File: test_spider.py
def test_process_links(self):
    response = HtmlResponse("http://example.org/somepage/index.html",
                            body=self.test_body)

    class _CrawlSpider(self.spider_class):
        name = "test"
        allowed_domains = ['example.org']
        rules = (
            Rule(LinkExtractor(), process_links="dummy_process_links"),
        )

        def dummy_process_links(self, links):
            return links

    spider = _CrawlSpider()
    output = list(spider._requests_to_follow(response))
    self.assertEqual(len(output), 3)
    self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
    self.assertEqual([r.url for r in output],
                     ['http://example.org/somepage/item/12.html',
                      'http://example.org/about.html',
                      'http://example.org/nofollow.html'])
Example 6
Project: scrapy Source File: test_spider.py
def test_process_links_filter(self):
    import re
    response = HtmlResponse("http://example.org/somepage/index.html",
                            body=self.test_body)

    class _CrawlSpider(self.spider_class):
        name = "test"
        allowed_domains = ['example.org']
        rules = (
            Rule(LinkExtractor(), process_links="filter_process_links"),
        )
        _test_regex = re.compile('nofollow')

        def filter_process_links(self, links):
            return [link for link in links
                    if not self._test_regex.search(link.url)]

    spider = _CrawlSpider()
    output = list(spider._requests_to_follow(response))
    self.assertEqual(len(output), 2)
    self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
    self.assertEqual([r.url for r in output],
                     ['http://example.org/somepage/item/12.html',
                      'http://example.org/about.html'])
Example 7
Project: scrapy Source File: test_spider.py
def test_process_links_generator(self):
    response = HtmlResponse("http://example.org/somepage/index.html",
                            body=self.test_body)

    class _CrawlSpider(self.spider_class):
        name = "test"
        allowed_domains = ['example.org']
        rules = (
            Rule(LinkExtractor(), process_links="dummy_process_links"),
        )

        def dummy_process_links(self, links):
            for link in links:
                yield link

    spider = _CrawlSpider()
    output = list(spider._requests_to_follow(response))
    self.assertEqual(len(output), 3)
    self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
    self.assertEqual([r.url for r in output],
                     ['http://example.org/somepage/item/12.html',
                      'http://example.org/about.html',
                      'http://example.org/nofollow.html'])
Example 8
Project: cmdbac Source File: form.py
def __init__(self, *args, **kwargs):
    super(FormSpider, self).__init__(*args, **kwargs)
    self.start_urls = [kwargs.get('start_url')]
    follow = True if kwargs.get('follow') == 'true' else False
    self.rules = (
        Rule(SgmlLinkExtractor(allow=('')), callback='parse_form', follow=follow),
    )
    super(FormSpider, self)._compile_rules()
    try:
        proxy = kwargs.get('proxy')
        service_args = [
            '--proxy=' + proxy,
            '--proxy-type=http',
        ]
    except TypeError:
        # kwargs.get('proxy') returns None when no proxy was passed.
        service_args = None
    self.browser = webdriver.PhantomJS(service_args=service_args)
Example 9
Project: docsearch-scraper Source File: documentation_spider.py
def __init__(self, config, algolia_helper, strategy, *args, **kwargs):
    # Scrapy config
    self.name = config.index_name
    self.allowed_domains = config.allowed_domains
    self.start_urls = [start_url['url'] for start_url in config.start_urls]
    self.stop_urls = config.stop_urls

    self.algolia_helper = algolia_helper
    self.strategy = strategy

    self.js_render = config.js_render
    self.js_wait = config.js_wait
    self.scrap_start_urls = config.scrap_start_urls
    self.remove_get_params = config.remove_get_params
    self.strict_redirect = config.strict_redirect

    super(DocumentationSpider, self).__init__(*args, **kwargs)

    link_extractor = LxmlLinkExtractor(
        allow=self.start_urls,
        deny=self.stop_urls,
        tags=('a', 'area', 'iframe'),
        attrs=('href', 'src'),
        canonicalize=(not config.js_render or not config.use_anchors)
    )

    DocumentationSpider.rules = [
        Rule(link_extractor, callback=self.add_records, follow=True),
    ]

    super(DocumentationSpider, self)._compile_rules()