scrapy.log.WARNING

Here are examples of the Python API scrapy.log.WARNING taken from open source projects.
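
scrapy.log.WARNING is one of the log-level constants defined by the legacy scrapy.log module. Both call styles used throughout the examples below accept it as the level: the module-level scrapy.log.msg() helper and the Spider.log() shortcut. Here is a minimal, self-contained sketch of the two, assuming a pre-1.0 Scrapy release where scrapy.log is available; the spider name and URL are placeholders:

from scrapy import log
from scrapy.spider import Spider  # moved to scrapy.spiders in later releases


class ExampleSpider(Spider):
    name = 'example'  # placeholder spider name
    start_urls = ['http://example.com/']  # placeholder URL

    def parse(self, response):
        # Module-level helper: write a message to the Scrapy log at WARNING level.
        log.msg('Something looks wrong on %s' % response.url, level=log.WARNING)
        # Spider shortcut: same effect, with the spider attached automatically.
        self.log('Something looks wrong on %s' % response.url, level=log.WARNING)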

9 Examples

Example 1

Project: django-scrape
Source File: processors.py
    def __call__(self, values):
        out_values = []
        for v in arg_to_iter(values):
            if isinstance(v, (str, unicode)):
                try:
                    out_values.append(dateutil.parser.parse(str(v), fuzzy=True))
                except (ValueError, OverflowError):
                    # dateutil raises ValueError/OverflowError on unparseable input
                    log.msg('Failed to convert datetime string: "%s"' % v, level=log.WARNING)
                    out_values.append(None)
            elif isinstance(v, datetime):
                out_values.append(v)
            else:
                out_values.append(datetime(v))
        return out_values

Example 2

Project: legco-watch
Source File: questions.py
    def make_question(self, language, response, row, this_date):
        """
        Given a question row, create a dict with the question fields
        """
        # Sometimes replies don't have links
        this_question = {
            'date': this_date,
            'source_url': response.url,
            'number_and_type': row[0].text_content(),
            'asker': row[1].text_content(),
            'subject': row[2].text_content(),
            'subject_link': row[2][0].get('href', None),
            'language': language
        }
        try:
            this_question['reply_link'] = row[3][0].get('href', None)
        except IndexError:
            self.log(u'No reply link on {} from {}'.format(response.url, this_date), level=log.WARNING)
        return this_question

Example 3

Project: frontera
Source File: recording.py
    def open(self, spider):
        super(RecorderScheduler, self).open(spider)

        self.stats_manager = StatsManager(spider.crawler.stats)

        settings = spider.crawler.settings
        self.recorder_enabled = settings.get('RECORDER_ENABLED', DEFAULT_RECORDER_ENABLED)

        if not self.recorder_enabled:
            log.msg('Recorder disabled!', log.WARNING)
            return

        log.msg('Starting recorder', log.INFO)

        recorder_storage = settings.get('RECORDER_STORAGE_ENGINE', None)
        if not recorder_storage:
            self.recorder_enabled = False
            log.msg('Missing Recorder storage! Recorder disabled...', log.WARNING)
            return

        self.graph = graphs.Manager(
            engine=recorder_storage,
            drop_all_tables=settings.getbool('RECORDER_STORAGE_DROP_ALL_TABLES',
                                             DEFAULT_RECORDER_STORAGE_DROP_ALL_TABLES),
            clear_content=settings.getbool('RECORDER_STORAGE_CLEAR_CONTENT',
                                           DEFAULT_RECORDER_STORAGE_CLEAR_CONTENT))

Example 4

Project: scrapy-mongodb
Source File: scrapy_mongodb.py
    def configure(self):
        """ Configure the MongoDB connection """
        # Handle deprecated configuration
        if not not_set(self.settings['MONGODB_HOST']):
            log.msg(
                u'DeprecationWarning: MONGODB_HOST is deprecated',
                level=log.WARNING)
            mongodb_host = self.settings['MONGODB_HOST']

            if not not_set(self.settings['MONGODB_PORT']):
                log.msg(
                    u'DeprecationWarning: MONGODB_PORT is deprecated',
                    level=log.WARNING)
                self.config['uri'] = 'mongodb://{0}:{1}'.format(
                    mongodb_host,
                    self.settings['MONGODB_PORT'])
            else:
                self.config['uri'] = 'mongodb://{0}:27017'.format(mongodb_host)

        if not not_set(self.settings['MONGODB_REPLICA_SET']):
            if not not_set(self.settings['MONGODB_REPLICA_SET_HOSTS']):
                log.msg(
                    (
                        u'DeprecationWarning: '
                        u'MONGODB_REPLICA_SET_HOSTS is deprecated'
                    ),
                    level=log.WARNING)
                self.config['uri'] = 'mongodb://{0}'.format(
                    self.settings['MONGODB_REPLICA_SET_HOSTS'])

        # Set all regular options
        options = [
            ('uri', 'MONGODB_URI'),
            ('fsync', 'MONGODB_FSYNC'),
            ('write_concern', 'MONGODB_REPLICA_SET_W'),
            ('database', 'MONGODB_DATABASE'),
            ('collection', 'MONGODB_COLLECTION'),
            ('replica_set', 'MONGODB_REPLICA_SET'),
            ('unique_key', 'MONGODB_UNIQUE_KEY'),
            ('buffer', 'MONGODB_BUFFER_DATA'),
            ('append_timestamp', 'MONGODB_ADD_TIMESTAMP'),
            ('stop_on_duplicate', 'MONGODB_STOP_ON_DUPLICATE')
        ]

        for key, setting in options:
            if not not_set(self.settings[setting]):
                self.config[key] = self.settings[setting]

        # Check for illegal configuration
        if self.config['buffer'] and self.config['unique_key']:
            log.msg(
                (
                    u'IllegalConfig: Setting both MONGODB_BUFFER_DATA '
                    u'and MONGODB_UNIQUE_KEY is not supported'
                ),
                level=log.ERROR)
            raise SyntaxError(
                (
                    u'IllegalConfig: Setting both MONGODB_BUFFER_DATA '
                    u'and MONGODB_UNIQUE_KEY is not supported'
                ))

Example 5

    def process_item(self, item, spider):
        if not type(item) == Alert:
            return item

        uri = item['uri']

        if not uri:
            raise DropItem("Not a valid alert URI: ", uri)

        if spider.custom_whitelist:
            for (pattern) in spider.custom_whitelist:
                if pattern[0] in uri:
                    raise DropItem("Whitelisted domain found in Alert: ", uri)

        if spider.alexa_whitelist:
            try:
                parsed_uri = urlparse(uri)
                parsed_domain = '{uri.netloc}'.format(uri=parsed_uri)
                domain = get_tld(uri)
                for alexa_domain in spider.alexa_whitelist:
                    if domain.endswith(alexa_domain):
                        raise DropItem("Alert domain found in Alexa Whitelist: ", domain)
            except (TldIOError,TldDomainNotFound,TldBadUrl) as e:
                log.msg("Error parsing TLD. Still allowing alert for " + uri, level=log.WARNING)
            except:
                raise

        return item

Example 6

Project: pystock-crawler
Source File: loaders.py
    def __call__(self, value, loader_context):
        if not hasattr(value, 'select'):
            return IntermediateValue('', 0.0, '0', None)

        doc_end_date_str = loader_context['end_date']
        doc_type = loader_context['doc_type']
        selector = loader_context['selector']

        context_id = value.xpath('@contextRef')[0].extract()
        try:
            context = selector.xpath('//*[@id="%s"]' % context_id)[0]
        except IndexError:
            try:
                url = loader_context['response'].url
            except KeyError:
                url = None
            log.msg(u'Cannot find context: %s in %s' % (context_id, url), log.WARNING)
            return None

        date = instant = start_date = end_date = None
        try:
            instant = context.xpath('.//*[local-name()="instant"]/text()')[0].extract().strip()
        except (IndexError, ValueError):
            try:
                end_date_str = context.xpath('.//*[local-name()="endDate"]/text()')[0].extract().strip()
                end_date = datetime.strptime(end_date_str, DATE_FORMAT)

                start_date_str = context.xpath('.//*[local-name()="startDate"]/text()')[0].extract().strip()
                start_date = datetime.strptime(start_date_str, DATE_FORMAT)

                if self.ignore_date_range or date_range_matches_doc_type(doc_type, start_date, end_date):
                    date = end_date
            except (IndexError, ValueError):
                pass
        else:
            try:
                instant = datetime.strptime(instant, DATE_FORMAT)
            except ValueError:
                pass
            else:
                date = instant

        if date:
            doc_end_date = datetime.strptime(doc_end_date_str, DATE_FORMAT)
            delta_days = (doc_end_date - date).days
            if abs(delta_days) < 30:
                try:
                    text = value.xpath('./text()')[0].extract()
                    val = self.data_type(text)
                except (IndexError, ValueError):
                    pass
                else:
                    local_name = value.xpath('local-name()')[0].extract()
                    return IntermediateValue(
                        local_name, val, text, context, value,
                        start_date=start_date, end_date=end_date, instant=instant)

        return None

Example 7

Project: sozlukcrawler
Source File: __init__.py
    def start_requests(self):
        self.log('Eliminating already seen web pages. If you think the crawler is not working, '
                 'please check the "seen" table in the database', level=log.WARNING)

        return [Request(i) for i in self.urls if not is_request_seen(Request(i))]

Example 8

Project: legco-watch
Source File: hansard.py
    def parse_hansard_post_1998(self, response):
        sel = Selector(response)    

        # Get the year that this index page is for
        # Meetings (Year 2013 - 2014)
        # This is mostly for debugging purposes so we can spit this out in the logs
        year_range = sel.xpath('//strong/em/text()').extract()
        if not year_range:
            self.log("%s: Could not find year range on hansard index page" % response.url, level=log.WARNING)
            return
        else:
            self.log("%s: Parsing Hansard Index: %s" % (response.url, year_range), level=log.INFO)

        # Find any dates at the top of this page. Other dates are identical
        # to this page, and indeed the current page will also be included in
        # the date list. Scrapy will prevent us recursing back into ourselves.
    
        year_urls = sel.xpath('//tr/td/a[contains(@href,"#toptbl")]/@href').extract()
        for year_url in year_urls:
            absolute_url = urlparse.urljoin(response.url, year_url.strip())
            req = Request(absolute_url, callback = self.parse_hansard_index_page)
            yield req
        
        # We are looking for table rows which link to Hansard entries for a
        # particular date. In newer versions these are 6-columned table rows
        # where column 6 is a link to a webcast (doesn't seem to exist)
        # Older revisions are 5 columned rows. These are all after the anchor
        # 'hansard'.

        print "Parsing Rows"
        # Find the hansard table
        table = sel.xpath("//div[@class='table_overflow']//a[@name='hansard']/following::table[1]")
        if not table:
            # http://www.legco.gov.hk/general/english/counmtg/yr08-12/mtg_0910.htm
            table = sel.xpath("//div[@id='_content_']//a[@name='hansard']/following::table[1]")

        rows = table.xpath(".//tr[count(td)>=5]")
        if not rows:
            self.log("%s: Could not find any Handard entries to crawl into" % response.url, level=log.WARNING)
            return
    
        self.log("%s: %i rows found" % (response.url, len(rows)), level=log.INFO)

        for row in rows:
            date_info = ' '.join(row.xpath('.//td[1]/node()/text()').extract())
            self.log("%s: Row: %s" % (response.url, date_info), level=log.INFO)

            # Recurse into the agenda, if it exists
            agenda_url = row.xpath('.//td[2]/a/@href').extract()
            if agenda_url:
                absolute_url = urlparse.urljoin(response.url, agenda_url[0].strip())
                req = Request(absolute_url, callback = self.parse_hansard_agenda)
                yield req
            else:
                self.log("%s: Could not find an agenda URL for %s" % (response.url, date_info), level=log.WARNING)
        
            # Download the minutes document if it exists. This is a PDF file
            minutes_url = row.xpath('.//td[3]/a/@href').extract()
            if minutes_url:
                absolute_url = urlparse.urljoin(response.url, minutes_url[0].strip())
                minutes = HansardMinutes()
                minutes['date'] = date_info
                minutes['file_urls'] = [absolute_url]
                yield minutes
            else:
                self.log("%s: Could not find an minutes URL for %s" % (response.url, date_info), level=log.WARNING)

            for (lang, index) in [('en',4),('cn',5)]:

                hansard_urls = row.xpath('.//td[%i]/a/@href' % index).extract()
                for url in hansard_urls:
                    # Is this a PDF entry, or do we need to recurse?
                    absolute_url = urlparse.urljoin(response.url, url.strip())
                    if absolute_url.endswith('pdf'):
                        hansard_record = HansardRecord()
                        hansard_record['date'] = date_info
                        hansard_record['language'] = lang
                        hansard_record["file_urls"] = [absolute_url]
                        yield hansard_record
                    else:
                        # Recurse into the HTML handler for the HTML Hansard Record Index
                        req = Request(absolute_url, callback = self.parse_hansard_html_record)
                        yield req

                if not hansard_urls:
                    self.log("%s: Could not find an hansard URL for %s, lang %s" % (response.url, date_info, lang), level=log.WARNING)

Example 9

Project: legco-watch
Source File: questions.py
    def parse(self, response):
        sel = Selector(response)
        body = sel.xpath('//div[@id="_content_"]')
        if len(body) != 1:
            self.log(u'Expected single body element, but found {} on {}'.format(len(body), response.url), level=log.WARNING)
            return
        body = body[0]
        if u'chinese' in response.url:
            language = 'C'
            matcher = self.HEADER_RE_C
        else:
            language = 'E'
            matcher = self.HEADER_RE_E
        # We'll need lxml to parse this
        parser = HTMLParser(encoding='utf-8')
        body_extract = body.extract().encode('utf-8')
        body_elements = lxml.html.fromstring(body_extract, parser=parser)
        # Iterate over the body elements, processing each h2-table pair for each meeting
        count_sessions = 0
        count_questions = 0
        for elem in body_elements:
            # Skip comments
            if elem.tag == lxml.etree.Comment:
                continue
            # Take the first 50 characters, so RE doesn't scan the whole body of text for large elements
            match = re.search(matcher, elem.text_content()[:50])
            if match is not None:
                this_date = match.groupdict()['date']
                self.log(u'Found table for date {}'.format(this_date))
                count_sessions += 1
                questions_table = elem.getnext()
                for row in questions_table.xpath('./tr'):
                    # We ignore the header row, which is indicated by ths
                    if row[0].tag == 'th':
                        continue
                    this_question = self.make_question(language, response, row, this_date)
                    count_questions += 1
                    yield Question(**this_question)

        self.log(u'Processed {} questions in {} sessions'.format(count_questions, count_sessions), level=log.INFO)

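For reference, the scrapy.log module (and the log.WARNING constant with it) was deprecated in Scrapy 1.0 in favour of the standard library logging module, so equivalent warnings in newer code are usually written against logging.WARNING. A rough modern counterpart of the calls above, assuming Scrapy 1.x or later (spider name and message are placeholders):

import logging

import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'  # placeholder spider name
    start_urls = ['http://example.com/']  # placeholder URL

    def parse(self, response):
        # Preferred: the per-spider logger that Scrapy exposes as self.logger.
        self.logger.warning('Something looks wrong on %s', response.url)
        # Equivalent with plain stdlib logging and an explicit level.
        logging.log(logging.WARNING, 'Something looks wrong on %s', response.url)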