scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor

Here are examples of the Python API scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor, taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.

4 Examples 7

Example 1

Project: cmdbac Source File: form_with_cookie.py
    def __init__(self, *args, **kwargs):
        """Configure the start URL, cookie jar and crawl rule of the spider.

        Keyword arguments read here:
            start_url: the single URL placed in ``start_urls``.
            cookie_jar: value handed to ``LWPCookieJar.load``.
        """
        super(FormWithCookieSpider, self).__init__(*args, **kwargs)

        start_url = kwargs.get('start_url')
        self.start_urls = [start_url]

        # The jar is stored on the instance first and populated afterwards.
        self.cookiejar = cookielib.LWPCookieJar()
        cookie_source = kwargs.get('cookie_jar')
        self.cookiejar.load(cookie_source)

        # Empty ``allow`` pattern -- presumably places no restriction on the
        # extracted links; confirm against the link extractor documentation.
        link_extractor = SgmlLinkExtractor(allow='')
        form_rule = Rule(
            link_extractor,
            callback='parse_form',
            follow=True,
            process_request='add_cookie_for_request'
        )
        self.rules = (form_rule,)

        # The rules are assigned after the base initializer has already run,
        # so they are compiled explicitly here.
        super(FormWithCookieSpider, self)._compile_rules()

Example 2

Project: cmdbac Source File: url.py
    def __init__(self, *args, **kwargs):
        """Configure the start URL, crawl rule and proxy arguments.

        Keyword arguments read here:
            start_url: the single URL placed in ``start_urls``.
            follow: links are followed only when this equals the string
                ``'true'``.
            proxy: proxy address appended to ``--proxy=``; when it cannot be
                concatenated to a string (e.g. it is missing and therefore
                ``None``), no service arguments are built.
        """
        super(UrlSpider, self).__init__(*args, **kwargs)

        self.start_urls = [kwargs.get('start_url')]

        # Only the literal string 'true' enables link following.
        follow = kwargs.get('follow') == 'true'
        self.rules = (
            Rule(SgmlLinkExtractor(allow=('')), callback='parse_url', follow=follow),
        )
        # The rules are assigned after the base initializer has already run,
        # so they are compiled explicitly here.
        super(UrlSpider, self)._compile_rules()

        proxy = kwargs.get('proxy')
        try:
            service_args = [
                '--proxy=' + proxy,
                '--proxy-type=http',
            ]
        except TypeError:
            # ``proxy`` is None or otherwise not concatenable to a string:
            # fall back to no service arguments instead of swallowing every
            # exception (including KeyboardInterrupt) with a bare ``except``.
            service_args = None
        # NOTE(review): ``service_args`` is not used within this excerpt;
        # presumably it is consumed further down in the full method -- confirm.

Example 3

Project: cmdbac Source File: url_with_cookie.py
    def __init__(self, *args, **kwargs):
        """Configure the start URL, cookie jar and crawl rule of the spider.

        Keyword arguments read here:
            start_url: the single URL placed in ``start_urls``.
            cookie_jar: value handed to ``LWPCookieJar.load``.
        """
        super(UrlWithCookieSpider, self).__init__(*args, **kwargs)

        start_url = kwargs.get('start_url')
        self.start_urls = [start_url]

        # The jar is stored on the instance first and populated afterwards.
        self.cookiejar = cookielib.LWPCookieJar()
        cookie_source = kwargs.get('cookie_jar')
        self.cookiejar.load(cookie_source)

        # Empty ``allow`` pattern -- presumably places no restriction on the
        # extracted links; confirm against the link extractor documentation.
        link_extractor = SgmlLinkExtractor(allow='')
        url_rule = Rule(
            link_extractor,
            callback='parse_url',
            follow=True,
            process_request='add_cookie_for_request'
        )
        self.rules = (url_rule,)

        # The rules are assigned after the base initializer has already run,
        # so they are compiled explicitly here.
        super(UrlWithCookieSpider, self)._compile_rules()

Example 4

Project: cmdbac Source File: form.py
    def __init__(self, *args, **kwargs):
        """Configure the start URL, crawl rule and PhantomJS browser.

        Keyword arguments read here:
            start_url: the single URL placed in ``start_urls``.
            follow: links are followed only when this equals the string
                ``'true'``.
            proxy: proxy address appended to ``--proxy=``; when it cannot be
                concatenated to a string (e.g. it is missing and therefore
                ``None``), the browser is started without service arguments.
        """
        super(FormSpider, self).__init__(*args, **kwargs)

        self.start_urls = [kwargs.get('start_url')]

        # Only the literal string 'true' enables link following.
        follow = kwargs.get('follow') == 'true'
        self.rules = (
            Rule(SgmlLinkExtractor(allow=('')), callback='parse_form', follow=follow),
        )
        # The rules are assigned after the base initializer has already run,
        # so they are compiled explicitly here.
        super(FormSpider, self)._compile_rules()

        proxy = kwargs.get('proxy')
        try:
            service_args = [
                '--proxy=' + proxy,
                '--proxy-type=http',
            ]
        except TypeError:
            # ``proxy`` is None or otherwise not concatenable to a string:
            # fall back to no service arguments instead of swallowing every
            # exception (including KeyboardInterrupt) with a bare ``except``.
            service_args = None
        self.browser = webdriver.PhantomJS(service_args=service_args)