Here are examples of the Python API scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor, taken from open-source projects. By voting an example up, you can indicate which examples are most useful and appropriate.
4 Examples
3
Example 1
Project: cmdbac Source File: form_with_cookie.py
def __init__(self, *args, **kwargs):
    """Set up the spider from its keyword arguments.

    Reads ``start_url`` and ``cookie_jar`` from ``kwargs``. The former
    becomes the single entry of ``start_urls``; the latter is passed to
    ``LWPCookieJar.load`` as the cookie file to read.
    """
    super(FormWithCookieSpider, self).__init__(*args, **kwargs)
    self.start_urls = [kwargs.get('start_url')]

    # Attach the jar to the instance first, then populate it from disk.
    jar = cookielib.LWPCookieJar()
    self.cookiejar = jar
    jar.load(kwargs.get('cookie_jar'))

    # NOTE(review): ``('')`` is a parenthesised empty string, not a tuple;
    # presumably the empty pattern lets every link through -- confirm
    # against the link extractor's documentation.
    form_rule = Rule(
        SgmlLinkExtractor(allow=('')),
        callback='parse_form',
        follow=True,
        process_request='add_cookie_for_request',
    )
    self.rules = (form_rule,)

    # The rules were assigned after the base initialiser ran, so they are
    # compiled explicitly here.
    super(FormWithCookieSpider, self)._compile_rules()
3
Example 2
Project: cmdbac Source File: url.py
def __init__(self, *args, **kwargs):
    """Set up the spider from its keyword arguments.

    Recognised keys in ``kwargs``:

    * ``start_url`` -- becomes the single entry of ``start_urls``.
    * ``follow`` -- links are followed only when this is exactly the
      string ``'true'``.
    * ``proxy`` -- when present, used to build the ``--proxy=...``
      command-line arguments; otherwise ``service_args`` is ``None``.
    """
    super(UrlSpider, self).__init__(*args, **kwargs)
    self.start_urls = [kwargs.get('start_url')]

    # The comparison already yields a bool; no conditional expression needed.
    follow = kwargs.get('follow') == 'true'

    # NOTE(review): ``('')`` is a parenthesised empty string, not a tuple;
    # presumably the empty pattern lets every link through -- confirm
    # against the link extractor's documentation.
    self.rules = (
        Rule(SgmlLinkExtractor(allow=('')), callback='parse_url', follow=follow),
    )
    # The rules were assigned after the base initialiser ran, so they are
    # compiled explicitly here.
    super(UrlSpider, self)._compile_rules()

    proxy = kwargs.get('proxy')
    try:
        service_args = [
            '--proxy=' + proxy,
            '--proxy-type=http',
        ]
    except TypeError:
        # ``proxy`` was missing (None) or not a string. Catch only the
        # concatenation failure rather than every exception, so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        service_args = None
    # NOTE(review): ``service_args`` is not used in the visible excerpt;
    # the snippet appears to end here -- confirm against the full source.
3
Example 3
Project: cmdbac Source File: url_with_cookie.py
def __init__(self, *args, **kwargs):
    """Set up the spider from its keyword arguments.

    Reads ``start_url`` and ``cookie_jar`` from ``kwargs``. The former
    becomes the single entry of ``start_urls``; the latter is passed to
    ``LWPCookieJar.load`` as the cookie file to read.
    """
    super(UrlWithCookieSpider, self).__init__(*args, **kwargs)
    self.start_urls = [kwargs.get('start_url')]

    # Attach the jar to the instance first, then populate it from disk.
    jar = cookielib.LWPCookieJar()
    self.cookiejar = jar
    jar.load(kwargs.get('cookie_jar'))

    # NOTE(review): ``('')`` is a parenthesised empty string, not a tuple;
    # presumably the empty pattern lets every link through -- confirm
    # against the link extractor's documentation.
    url_rule = Rule(
        SgmlLinkExtractor(allow=('')),
        callback='parse_url',
        follow=True,
        process_request='add_cookie_for_request',
    )
    self.rules = (url_rule,)

    # The rules were assigned after the base initialiser ran, so they are
    # compiled explicitly here.
    super(UrlWithCookieSpider, self)._compile_rules()
0
Example 4
Project: cmdbac Source File: form.py
def __init__(self, *args, **kwargs):
    """Set up the spider and its PhantomJS browser from keyword arguments.

    Recognised keys in ``kwargs``:

    * ``start_url`` -- becomes the single entry of ``start_urls``.
    * ``follow`` -- links are followed only when this is exactly the
      string ``'true'``.
    * ``proxy`` -- when present, passed to PhantomJS as ``--proxy=...``
      with ``--proxy-type=http``; otherwise the browser is started with
      ``service_args=None``.
    """
    super(FormSpider, self).__init__(*args, **kwargs)
    self.start_urls = [kwargs.get('start_url')]

    # The comparison already yields a bool; no conditional expression needed.
    follow = kwargs.get('follow') == 'true'

    # NOTE(review): ``('')`` is a parenthesised empty string, not a tuple;
    # presumably the empty pattern lets every link through -- confirm
    # against the link extractor's documentation.
    self.rules = (
        Rule(SgmlLinkExtractor(allow=('')), callback='parse_form', follow=follow),
    )
    # The rules were assigned after the base initialiser ran, so they are
    # compiled explicitly here.
    super(FormSpider, self)._compile_rules()

    proxy = kwargs.get('proxy')
    try:
        service_args = [
            '--proxy=' + proxy,
            '--proxy-type=http',
        ]
    except TypeError:
        # ``proxy`` was missing (None) or not a string. Catch only the
        # concatenation failure rather than every exception, so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        service_args = None

    self.browser = webdriver.PhantomJS(service_args=service_args)