scrapy.loader.ItemLoader

Here are examples of the Python API scrapy.loader.ItemLoader, taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.

20 Examples 7

Example 1

Project: scrapy Source File: test_loader.py
    def test_load_item_using_default_loader(self):
        """A loader built around an existing item fills and returns that item."""
        original = TestItem()
        original['summary'] = u'lala'

        loader = ItemLoader(item=original)
        loader.add_value('name', u'marta')
        loaded = loader.load_item()

        # The loader must hand back the very object it was given, with the
        # pre-set field untouched and the newly added value collected.
        assert loaded is original
        self.assertEqual(loaded['summary'], u'lala')
        self.assertEqual(loaded['name'], [u'marta'])

Example 2

Project: Grouch Source File: test_course_item.py
    def test_loading(self):
        """Values added repeatedly to one field come back as an ordered list."""
        course_loader = ItemLoader(item=items.Course())
        for course_code in ("CS1331", "CS2110"):
            course_loader.add_value('prerequisites', course_code)

        loaded = course_loader.load_item()
        self.assertEqual(loaded['prerequisites'], ['CS1331', 'CS2110'])

Example 3

Project: django-dynamic-scraper Source File: django_spider.py
    def _set_loader(self, response, from_page, xs, item):
        """Create ``self.loader`` for the given page type and content type.

        Records ``from_page`` on the spider, then picks ``JsonItemLoader``
        when the page's ``content_type`` is ``'J'`` and ``ItemLoader``
        otherwise. For ``'MP'`` pages the loader is built on the passed
        ``item`` and the ``xs`` selector; for any other page the item is
        taken from ``response.request.meta['item']`` and the loader is built
        on the response (or on its decoded JSON body).
        """
        self.from_page = from_page
        rpt = self.scraper.get_rpt(from_page)

        if self.from_page == 'MP':
            loader_cls = JsonItemLoader if rpt.content_type == 'J' else ItemLoader
            self.loader = loader_cls(item=item, selector=xs)
        else:
            # The item travels with the request for every non-'MP' page.
            item = response.request.meta['item']
            if rpt.content_type == 'J':
                decoded_body = json.loads(response.body_as_unicode())
                self.loader = JsonItemLoader(item=item, selector=decoded_body)
            else:
                self.loader = ItemLoader(item=item, response=response)

        # Same post-configuration whatever loader was chosen.
        self.loader.default_output_processor = TakeFirst()
        self.loader.log = self.log

Example 4

Project: django-dynamic-scraper Source File: django_spider.py
    def _set_dummy_loader(self, response, from_page, xs, item):
        """Create ``self.dummy_loader``, a loader working on a ``DummyItem``.

        Mirrors the page-type / content-type selection of the regular loader
        setup, but every loader is built around a fresh ``DummyItem()``
        instead of the real item.
        """
        self.from_page = from_page
        rpt = self.scraper.get_rpt(from_page)

        if self.from_page == 'MP':
            loader_cls = JsonItemLoader if rpt.content_type == 'J' else ItemLoader
            self.dummy_loader = loader_cls(item=DummyItem(), selector=xs)
        else:
            # The looked-up item is not used below; the lookup is kept so a
            # request without an 'item' meta entry still raises as before.
            item = response.request.meta['item']
            if rpt.content_type == 'J':
                decoded_body = json.loads(response.body_as_unicode())
                self.dummy_loader = JsonItemLoader(item=DummyItem(), selector=decoded_body)
            else:
                self.dummy_loader = ItemLoader(item=DummyItem(), response=response)

        # Same post-configuration whatever loader was chosen.
        self.dummy_loader.default_output_processor = TakeFirst()
        self.dummy_loader.log = self.log

Example 5

Project: rojak Source File: beritasatu.py
    def parse_news(self, response):
        """Build a ``News`` item from an article page response.

        Fills url, title, raw_content, published_at and author_name through
        an ``ItemLoader``. When the title, the content or the date cannot be
        found (or the date cannot be parsed), the partially filled item is
        returned right away; the original author notes that such items are
        dropped by the item pipeline (not visible here).

        Args:
            response: response object for the article page.

        Returns:
            The item built by ``loader.load_item()``.
        """
        self.logger.info('parse_news: {}'.format(response))

        # Init item loader
        # extract news title, published_at, author, content, url
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        title_selectors = response.css('div.content-detail > h4::text')
        if not title_selectors:
            # Incomplete item: no title
            return loader.load_item()
        title = title_selectors.extract()[0]
        loader.add_value('title', title.strip())

        # Extract raw html, not the text
        raw_content_selectors = response.css('div.content-body')
        if not raw_content_selectors:
            # Incomplete item: no content
            return loader.load_item()
        raw_content = raw_content_selectors.extract()
        raw_content = ' '.join([w.strip() for w in raw_content])
        raw_content = raw_content.strip()
        loader.add_value('raw_content', raw_content)

        # Example: Selasa, 11 Oktober 2016 | 10:48
        date_selectors = response.css('div.date::text')
        if not date_selectors:
            # Incomplete item: no date
            return loader.load_item()
        date_str = date_selectors.extract()[0]
        # Raw string: "\s" must reach the regex engine as a character class
        # rather than being treated as an (invalid) string escape.
        date_tokens = re.split(r'[\s,|-]', date_str)
        # Skip the first token (the day name in the example above), drop the
        # empty tokens and pass each remaining one through _()
        # (presumably translates month names to English -- TODO confirm).
        # Example: 11 October 2016 10:48
        date_str = ' '.join([_(s) for s in date_tokens[1:] if s])

        # Parse date information
        try:
            published_at_wib = datetime.strptime(date_str, '%d %B %Y %H:%M')
        except ValueError:
            # Incomplete item: unparsable date
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        author_selectors = response.css('div.content-detail > p::text')
        if not author_selectors:
            loader.add_value('author_name', '')
        else:
            author_name = author_selectors.extract()[0]
            # Keep only the part before the first '/'
            author_name = author_name.split('/')[0]
            loader.add_value('author_name', author_name)

        # Move scraped news to pipeline
        return loader.load_item()

Example 6

Project: rojak Source File: cnnindonesia.py
    def parse_news(self, response):
        """Build a ``News`` item from an article page response.

        Fills url, title, raw_content, published_at and author_name through
        an ``ItemLoader``. When the title, the content or the date is missing
        or malformed, the partially filled item is returned right away; the
        original author notes that such items are dropped by the item
        pipeline (not visible here).

        Args:
            response: response object for the article page.

        Returns:
            The item built by ``loader.load_item()``.
        """
        self.logger.info('parse_news: {}'.format(response))

        # Init item loader
        # extract news title, published_at, author, content, url
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        title_selectors = response.css('div.detail_text > h1::text')
        if not title_selectors:
            # Incomplete item: no title
            return loader.load_item()
        title = title_selectors.extract()[0]
        loader.add_value('title', title)

        # Extract raw html, not the text
        # Using Xpath instead of CSS selector to eliminate useless children
        xpath_query = """
            //div[@class="detail_text"]/node()
                [not(
                    descendant-or-self::comment()|
                    descendant-or-self::style|
                    descendant-or-self::script|
                    descendant-or-self::div|
                    descendant-or-self::span|
                    descendant-or-self::img|
                    descendant-or-self::table|
                    descendant-or-self::iframe
                )]
         """
        raw_content_selectors = response.xpath(xpath_query)
        if not raw_content_selectors:
            # Incomplete item: no content
            return loader.load_item()
        raw_content = raw_content_selectors.extract()
        raw_content = ' '.join([w.strip() for w in raw_content])
        raw_content = raw_content.strip()
        loader.add_value('raw_content', raw_content)

        # Example: Senin, 10/10/2016 05:12
        date_selectors = response.css('div.date::text')
        if not date_selectors:
            # Incomplete item: no date
            return loader.load_item()
        date_parts = date_selectors.extract()[0].split(',')
        if len(date_parts) < 2:
            # No comma in the date text: treat it like any other unparsable
            # date instead of failing with an IndexError.
            return loader.load_item()
        # Example: 10/10/2016 05:12
        date_str = date_parts[1].strip()
        # Parse date information
        try:
            published_at_wib = datetime.strptime(date_str, '%d/%m/%Y %H:%M')
        except ValueError:
            # Incomplete item: unparsable date
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        author_name_selectors = response.css('div.author > strong::text')
        if not author_name_selectors:
            loader.add_value('author_name', '')
        else:
            author_name = author_name_selectors.extract()[0]
            loader.add_value('author_name', author_name)

        # Move scraped news to pipeline
        return loader.load_item()

Example 7

Project: rojak Source File: detikcom.py
    def parse_news(self, response):
        """Turn an article response into a ``News`` item.

        Collects url, title, published_at, author_name and raw_content. A
        missing title, date or content (or an unparsable date) ends parsing
        early with whatever has been loaded so far. When the page lists
        further pages, the work is handed over to ``self.parse_indices``.
        """
        self.logger.info('parse_news: %s' % response)

        # Required: title, raw_content, published_at
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        headline_nodes = response.css('div.detail_area > h1.jdl::text')
        if not headline_nodes:
            return loader.load_item()
        loader.add_value('title', headline_nodes.extract()[0])

        # Date text example: Kamis 15 Sep 2016, 18:33 WIB
        date_nodes = response.css('div.detail_area > div.date::text')
        if not date_nodes:
            return loader.load_item()

        # Keep tokens 1..4 only, e.g. '15 Sep 2016, 18:33'
        date_tokens = date_nodes.extract()[0].split(' ')
        trimmed_date = ' '.join(date_tokens[1:5])
        try:
            local_time = datetime.strptime(trimmed_date, '%d %b %Y, %H:%M')
        except ValueError:
            return loader.load_item()
        loader.add_value('published_at', wib_to_utc(local_time))

        author_nodes = response.css('div.author > strong::text')
        author = author_nodes.extract()[0] if author_nodes else ''
        loader.add_value('author_name', author)

        # Multi-page articles: delegate, passing on the loader filled so far
        page_links = response.xpath("//div[@class='list_multi']/article/a/@href")
        if page_links:
            page_urls = ['http:' + href for href in page_links.extract()]
            return self.parse_indices(page_urls, loader)

        # Single page: take the article body nodes, leaving out comments,
        # styles, scripts and the other wrappers listed in the query.
        content_nodes = response.xpath("""
            //div[@class="text_detail detail_area"]/node()
                [not(
                    descendant-or-self::comment()|
                    descendant-or-self::style|
                    descendant-or-self::script|
                    descendant-or-self::div|
                    descendant-or-self::span|
                    descendant-or-self::img|
                    descendant-or-self::table|
                    descendant-or-self::iframe
                )]
        """)
        if not content_nodes:
            return loader.load_item()
        fragments = [fragment.strip() for fragment in content_nodes.extract()]
        loader.add_value('raw_content', ' '.join(fragments).strip())

        return loader.load_item()

Example 8

Project: rojak Source File: hallojakarta.py
    def parse_news(self, response):
        """Turn an article response into a ``News`` item.

        Collects url, title, published_at, an empty author_name and
        raw_content. A missing title, date or content (or an unparsable
        date) ends parsing early with whatever has been loaded so far.
        """
        self.logger.info('parse_news: %s' % response)

        # Required: title, raw_content, published_at
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        headline_nodes = response.css('h1 > span.h-title::text')
        if not headline_nodes:
            return loader.load_item()
        loader.add_value('title', headline_nodes.extract()[0].strip())

        date_nodes = response.css('article.single-content > div.meta > span.time > time::text')
        if not date_nodes:
            return loader.load_item()

        # NOTE(review): the original comment gave '15 November, 2016' as the
        # sample date text, which would not match the format parsed below --
        # confirm against the live page.
        try:
            date_tokens = date_nodes.extract()[0].split(' ')
            # Everything after the first space-separated token is parsed.
            local_time = datetime.strptime(' '.join(date_tokens[1:]), '%d/%m/%Y | %H:%M')
        except ValueError:
            return loader.load_item()
        loader.add_value('published_at', wib_to_utc(local_time))

        # This source gives no author name
        loader.add_value('author_name', '')

        # Article body nodes, leaving out comments, styles, scripts and the
        # other wrappers listed in the query.
        content_nodes = response.xpath("""
            //article/div[@class="the-content post-content clearfix"]/node()
                [not(
                    descendant-or-self::comment()|
                    descendant-or-self::style|
                    descendant-or-self::script|
                    descendant-or-self::div|
                    descendant-or-self::span|
                    descendant-or-self::img|
                    descendant-or-self::table|
                    descendant-or-self::iframe
                )]
        """)
        if not content_nodes:
            return loader.load_item()
        fragments = [fragment.strip() for fragment in content_nodes.extract()]
        loader.add_value('raw_content', ' '.join(fragments).strip())

        return loader.load_item()

Example 9

Project: rojak Source File: jawapos.py
    def parse_news(self, response):
        """Build a ``News`` item from an article page response.

        Fills url, title, published_at, author_name and raw_content through
        an ``ItemLoader``. When the title, the date or the content is missing
        (or the date cannot be parsed), the partially filled item is returned
        right away. When paging links are present, the work is handed over
        to ``self.parse_indices``.

        Args:
            response: response object for the article page.

        Returns:
            The item built by ``loader.load_item()``, or the result of
            ``self.parse_indices`` for multi-page articles.
        """
        self.logger.info('parse_news: %s' % response)

        # Initialize item loader
        # extract news title, published_at, author, content, url
        # Required: title, raw_content, published_at
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        title_selectors = response.css('h1.detailtitle::text')
        if not title_selectors:
            # Incomplete item: no title
            return loader.load_item()
        title = title_selectors.extract_first().strip()
        loader.add_value('title', title)

        # Parse date information
        date_time = response.css('body > div > div.container > div.page-header > div::text').extract_first()
        if date_time is None:
            # extract_first() yields None when nothing matches; return the
            # incomplete item instead of failing on None.strip().
            return loader.load_item()
        # Keep what follows the last comma
        date_time = date_time.strip().split(',')[-1].strip()
        # Each word goes through _() (presumably translates Indonesian month
        # names to English for strptime -- TODO confirm)
        date_time = ' '.join([_(w) for w in date_time.split(' ')])
        try:
            published_at_wib = datetime.strptime(date_time, '%d %B %Y %H:%M')
        except ValueError:
            # Incomplete item: unparsable date
            return loader.load_item()

        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # If multipage
        multipage_selectors = response.css('.newsPagingWrap > a')
        if multipage_selectors:
            return self.parse_indices(multipage_selectors, loader)

        # Else if not multipage

        author_name_selectors = response.css('.newsContent > p > strong::text')
        if not author_name_selectors:
            loader.add_value('author_name', '')
        else:
            # The last matching <strong> text is taken as the author
            author_name = author_name_selectors.extract()[-1].strip()
            loader.add_value('author_name', author_name)

        # Extract the news content
        raw_content_selectors = response.css('.newsContent > p')
        if not raw_content_selectors:
            # Incomplete item: no content
            return loader.load_item()

        raw_content = ' '.join(raw_content_selectors.extract())
        raw_content = raw_content.strip()
        loader.add_value('raw_content', raw_content)

        # Move scraped news to pipeline
        return loader.load_item()

Example 10

Project: rojak Source File: kompas.py
    def parse_news(self, response):
        """Build a ``News`` item from a JSON (NewsML-shaped) response body.

        Reads url, title, raw_content, author_name and the creation timestamp
        from nested keys of the decoded body. A missing key, an empty
        required value or an unparsable timestamp ends parsing early with
        whatever has been loaded so far.

        Args:
            response: response whose body is a JSON document.

        Returns:
            The item built by ``loader.load_item()``.
        """
        self.logger.info('parse_news: %s' % response)
        loader = ItemLoader(item=News(), response=response)
        json_response = json.loads(response.body)

        try:
            url = json_response['NewsML']['NewsItem']['NewsComponent']['NewsComponent']['NewsComponent']['NewsLines']['MoreLink']
        except KeyError:
            return loader.load_item()
        loader.add_value('url', url)

        try:
            title = json_response['NewsML']['NewsItem']['NewsComponent']['NewsComponent']['NewsComponent']['NewsLines']['HeadLine']
        except KeyError:
            return loader.load_item()
        if not title:
            return loader.load_item()
        loader.add_value('title', title)

        try:
            raw_content = json_response['NewsML']['NewsItem']['NewsComponent']['NewsComponent']['NewsComponent']['ContentItem']['DataContent']['nitf']['body']['body.content']['p']
        except KeyError:
            return loader.load_item()
        if not raw_content:
            return loader.load_item()
        loader.add_value('raw_content', raw_content)

        try:
            author_name = json_response['NewsML']['NewsItem']['NewsComponent']['NewsComponent']['Author']
        except KeyError:
            return loader.load_item()
        # A falsy author is stored as the empty string
        loader.add_value('author_name', author_name or '')

        try:
            date_time_str = json_response['NewsML']['NewsItem']['NewsManagement']['FirstCreated']
        except KeyError:
            return loader.load_item()
        if not date_time_str:
            return loader.load_item()

        # The value is expected to be '<date>T<time>'
        date_time_parts = date_time_str.split('T')
        if len(date_time_parts) < 2:
            # No 'T' separator: treat it as an unparsable timestamp instead
            # of failing with an IndexError.
            return loader.load_item()
        # Left-pad the time part with zeros up to six characters
        date_time_parts[1] = date_time_parts[1].rjust(6, '0')
        try:
            published_at_wib = datetime.strptime(' '.join(date_time_parts), '%Y%m%d %H%M%S')
        except ValueError:
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        return loader.load_item()

Example 11

Project: rojak Source File: liputan6.py
    def parse_news(self, response):
        """Turn an article response into a ``News`` item.

        Collects url, title, raw_content, published_at and author_name. A
        missing title, content or date (or an unparsable date) ends parsing
        early with whatever has been loaded so far.
        """
        self.logger.info('parse_news: %s' % response)

        # Required: title, raw_content, published_at
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        headline_nodes = response.css('h1.article-header__title::text')
        if not headline_nodes:
            return loader.load_item()
        loader.add_value('title', headline_nodes.extract()[0])

        # Article body nodes, leaving out comments, styles, scripts and the
        # other wrappers listed in the query.
        content_nodes = response.xpath("""
            //div[@class="article-raw-content"]/node()
                [not(
                    descendant-or-self::comment()|
                    descendant-or-self::style|
                    descendant-or-self::script|
                    descendant-or-self::div|
                    descendant-or-self::span|
                    descendant-or-self::img|
                    descendant-or-self::table|
                    descendant-or-self::iframe
                )]
        """)
        if not content_nodes:
            return loader.load_item()
        fragments = [fragment.strip() for fragment in content_nodes.extract()]
        loader.add_value('raw_content', ' '.join(fragments).strip())

        # Date text example: ' pada 18 Okt 2016, 08:33 WIB'
        date_nodes = response.css('span.article-header__datetime::text')
        if not date_nodes:
            return loader.load_item()

        # Drop the first and last words and pass the rest through _()
        # (presumably translates month names, giving e.g.
        # '18 Oct 2016, 08:33' -- TODO confirm).
        date_words = date_nodes.extract()[0].strip().split(' ')[1:-1]
        normalized_date = ' '.join([_(word) for word in date_words])
        try:
            local_time = datetime.strptime(normalized_date, '%d %b %Y, %H:%M')
        except ValueError:
            return loader.load_item()
        loader.add_value('published_at', wib_to_utc(local_time))

        author_nodes = response.css('a.article-header__author-link::text')
        author = author_nodes.extract()[0] if author_nodes else ''
        loader.add_value('author_name', author)

        return loader.load_item()

Example 12

Project: rojak Source File: merdekacom.py
    def parse_news(self, article, sub_article):
        """Turn an ``article`` / ``sub_article`` pair into a ``News`` item.

        Both arguments are read with string keys. The url and the content
        come from ``sub_article``; the title, reporter and publish date come
        from ``article``. An empty url, title or content, or a publish date
        that cannot be parsed, ends parsing early with whatever has been
        loaded so far.
        """
        self.logger.info('parse_news: %s' % article)

        loader = ItemLoader(item=News())

        # Example: https://m.merdeka.com/tag/p/pilgub-dki/politik/nachrowi-pastikan-agus-sylvi-tak-cuema-incar-suara-santri-ulama.html
        url_path = sub_article['news_url']
        if not url_path:
            return loader.load_item()
        loader.add_value('url', 'https://www.merdeka.com' + url_path)

        headline = article['news_title']
        if not headline:
            return loader.load_item()
        loader.add_value('title', headline)

        # A falsy reporter is stored as the empty string
        loader.add_value('author_name', article['news_reporter'] or '')

        description = sub_article['news_description']
        if not description:
            return loader.load_item()
        loader.add_value('raw_content', description)

        # Publish date example: 2016-10-12 15:16:04
        publish_text = article['news_date_publish']
        try:
            local_time = datetime.strptime(publish_text, '%Y-%m-%d %H:%M:%S')
        except Exception:
            return loader.load_item()
        loader.add_value('published_at', wib_to_utc(local_time))

        return loader.load_item()

Example 13

Project: rojak Source File: metrotvnews.py
    def parse_news(self, response):
        """Build a ``News`` item from an article page response.

        Fills url, title, raw_content, published_at and author_name through
        an ``ItemLoader``. Video pages (first breadcrumb link text equal to
        ``'VIDEO'``) and pages whose title, content or info line is missing
        or malformed yield the partially filled item right away; the original
        author notes that such items are dropped by the item pipeline (not
        visible here).

        Args:
            response: response object for the article page.

        Returns:
            The item built by ``loader.load_item()``.
        """
        self.logger.info('parse_news: {}'.format(response))
        # extract_first() gives None when there is no breadcrumb link, so a
        # page without breadcrumb is simply treated as "not a video" instead
        # of failing with an IndexError.
        breadcrumb = response.css('ul.breadcrumb > li > a::text').extract_first()
        is_video = breadcrumb == 'VIDEO'

        # Init item loader
        # extract news title, published_at, author, content, url
        # Required: title, raw_content, published_at
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        # Video pages get no further fields
        if is_video:
            return loader.load_item()

        title_selectors = response.css('div.part.lead.pr > h1::text')
        if not title_selectors:
            # Incomplete item: no title
            return loader.load_item()
        title = title_selectors.extract()[0]
        loader.add_value('title', title)

        xpath_query = """
            //div[@class="part article"]/node()
                [not(
                    descendant-or-self::comment()|
                    descendant-or-self::style|
                    descendant-or-self::script|
                    descendant-or-self::div|
                    descendant-or-self::span|
                    descendant-or-self::img|
                    descendant-or-self::table|
                    descendant-or-self::iframe
                )]
        """
        raw_content_selectors = response.xpath(xpath_query)
        if not raw_content_selectors:
            # Incomplete item: no content
            return loader.load_item()
        raw_content = raw_content_selectors.extract()
        raw_content = ' '.join([w.strip() for w in raw_content])
        raw_content = raw_content.strip()
        loader.add_value('raw_content', raw_content)

        # Example: Bambang - 10 Oktober 2016 21:10 wib
        info_selectors = response.css('div.part.lead.pr > span::text')
        if not info_selectors:
            # Incomplete item: no info line
            return loader.load_item()
        info = info_selectors.extract()[0]
        info_parts = info.split('-')
        if len(info_parts) < 2:
            # No '-' between author and date: nothing to parse a date from
            return loader.load_item()

        # Parse date information
        # Example: 10 Oktober 2016 21:10 wib
        date_str = info_parts[1].strip()
        if not date_str:
            # Incomplete item: empty date
            return loader.load_item()

        # Cut the last four characters (' wib' in the example) and pass each
        # word through _() (presumably translates month names -- TODO confirm)
        # Example: 10 October 2016 21:10
        date_str = ' '.join([_(w) for w in date_str[:-4].split(' ')])
        try:
            published_at_wib = datetime.strptime(date_str, '%d %B %Y %H:%M')
        except ValueError:
            # Incomplete item: unparsable date
            return loader.load_item()

        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # The author is the part before the first '-'; may be empty
        author_name = info_parts[0].strip()
        loader.add_value('author_name', author_name or '')

        # Move scraped news to pipeline
        return loader.load_item()

Example 14

Project: rojak Source File: okezone.py
    def parse_news(self, response):
        """Build a ``News`` item from a JSON response describing one article.

        The body is decoded as JSON and its first element is read with the
        keys ``url``, ``title``, ``content``, ``published`` and ``author``.
        The ``content`` HTML is wrapped in an ``HtmlResponse`` so it can be
        queried with XPath. An empty title, content or publish date, or a
        date that cannot be parsed, ends parsing early with whatever has been
        loaded so far.

        Args:
            response: response whose body is a JSON array.

        Returns:
            The item built by ``loader.load_item()``.
        """
        self.logger.info('parse_news: %s' % response)
        # Decode the body directly: wrapping a bytes body in str() on
        # Python 3 produces "b'...'", which is not valid JSON.
        parsed_news = json.loads(response.body)[0]

        # Initialize item loader
        # extract news title, published_at, author, content, url
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', parsed_news['url'])

        if not parsed_news['title']:
            # Incomplete item: no title
            return loader.load_item()
        loader.add_value('title', parsed_news['title'])

        # Convert HTML text to a scrapy response
        html_response = HtmlResponse(url=parsed_news['url'],
                body=parsed_news['content'].encode('utf-8', 'ignore'))
        xpath_query = '''
            //body/node()
                [not(descendant-or-self::comment()|
                    descendant-or-self::style|
                    descendant-or-self::script|
                    descendant-or-self::div|
                    descendant-or-self::span|
                    descendant-or-self::image|
                    descendant-or-self::img|
                    descendant-or-self::iframe
                )]
        '''
        raw_content_selectors = html_response.xpath(xpath_query)
        if not raw_content_selectors:
            # Incomplete item: no content
            return loader.load_item()
        raw_content = raw_content_selectors.extract()
        raw_content = ' '.join([w.strip() for w in raw_content])
        raw_content = raw_content.strip()
        loader.add_value('raw_content', raw_content)

        if not parsed_news['published']:
            # Incomplete item: no publish date
            return loader.load_item()

        # Parse date information
        published_parts = parsed_news['published'].split(',')
        if len(published_parts) < 2:
            # No comma in the date text: treat it as unparsable instead of
            # failing with an IndexError.
            return loader.load_item()
        # Take what follows the first comma, cut its last four characters and
        # pass each word through _() (presumably translates month names --
        # TODO confirm).
        # Example: 12 Oct 2016 - 05:25
        date_words = published_parts[1].strip()[:-4].split(' ')
        date_time_str = ' '.join([_(w) for w in date_words])
        try:
            published_at_wib = datetime.strptime(date_time_str,
                    '%d %b %Y - %H:%M')
        except ValueError:
            # Incomplete item: unparsable date
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # A falsy author is stored as the empty string
        loader.add_value('author_name', parsed_news['author'] or '')

        # Move scraped news to pipeline
        return loader.load_item()

Example 15

Project: rojak Source File: republikaonline.py
    def parse_news(self, response):
        """Build a ``News`` item from an article page response.

        Fills url, title, published_at, author_name and raw_content through
        an ``ItemLoader``. When the title, the date or the content is missing
        or malformed, the partially filled item is returned right away; the
        original author notes that such items are dropped by the item
        pipeline (not visible here).

        Args:
            response: response object for the article page.

        Returns:
            The item built by ``loader.load_item()``.
        """
        self.logger.info('parse_news: %s' % response)

        # Initialize item loader
        # extract news title, published_at, author, content, url
        # Required: title, raw_content, published_at
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        title_selectors = response.css('div.wrap-head > h2 > a')
        if not title_selectors:
            # Incomplete item: no title
            return loader.load_item()
        # Concatenate every text node under the first matching link
        title = ''.join(title_selectors[0].xpath('.//text()').extract())
        loader.add_value('title', title.strip())

        # Parse date information
        # NOTE(review): the original comment gave
        # 'Rabu, 02 November 2016, 10:29 WIB' as the sample text, which does
        # not match the '|'-separated format parsed below -- confirm against
        # the live page.
        date_selectors = response.css('div.wrap-head > span.date::text')

        if not date_selectors:
            # Incomplete item: no date
            return loader.load_item()

        date = date_selectors.extract()[0].strip().split(' ')
        if len(date) < 3:
            # Too few tokens to hold a month: treat the date as unparsable
            # instead of failing with an IndexError on date[2].
            return loader.load_item()
        try:
            # Sanitize month
            date[2] = sanitize(date[2])
            published_at_wib = datetime.strptime(' '.join(date[1:]), '%d %b %Y | %H:%M WIB')
        except ValueError:
            # Incomplete item: unparsable date
            return loader.load_item()

        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        author_name_selectors = response.css('div.red::text')
        if not author_name_selectors:
            loader.add_value('author_name', '')
        else:
            authors = [author.strip() for author in author_name_selectors.extract()]
            # Only consider Red: as author
            # Example: ['Rep: Dadang Kurnia', 'Red: Bilal Ramadhan']
            # The first four characters (the 'Red:' prefix in the example)
            # are cut from every entry containing 'Red:'.
            author_names = [name[4:].strip() for name in authors if 'Red:' in name]
            loader.add_value('author_name', ','.join(author_names))

        # Extract the content using XPath instead of CSS selector
        # We get the XPath from chrome developer tools (copy XPath)
        # or equivalent tools from other browser
        xpath_query = """
            //article/div[@class="content-detail"]/p/node()
        """
        raw_content_selectors = response.xpath(xpath_query)
        if not raw_content_selectors:
            # Incomplete item: no content
            return loader.load_item()
        raw_content = raw_content_selectors.extract()
        raw_content = ' '.join([w.strip() for w in raw_content])
        raw_content = raw_content.strip()
        loader.add_value('raw_content', raw_content)

        # Move scraped news to pipeline
        return loader.load_item()

Example 16

Project: rojak Source File: sindonews.py
    def parse_news(self, response):
        """Build a ``News`` item from an article page response.

        Fills url, title, author_name, raw_content and published_at through
        an ``ItemLoader``. When the title, the content or the date is missing
        or malformed, the partially filled item is returned right away; the
        original author notes that such items are dropped by the item
        pipeline (not visible here).

        Args:
            response: response object for the article page.

        Returns:
            The item built by ``loader.load_item()``.
        """
        self.logger.info('parse_news: %s' % response)

        # Initialize item loader
        # extract news title, published_at, author, content, url
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        title_selectors = response.css('h1[itemprop="headline"]::text')
        if not title_selectors:
            # Incomplete item: no title
            return loader.load_item()
        title = title_selectors.extract()[0]
        loader.add_value('title', title)

        author_name_selectors = response.css('a[rel="author"] > span::text')
        if not author_name_selectors:
            loader.add_value('author_name', '')
        else:
            author_name = author_name_selectors.extract()[0]
            loader.add_value('author_name', author_name)

        raw_content_selectors = response.css('.content')
        if not raw_content_selectors:
            # Incomplete item: no content
            return loader.load_item()
        raw_content = raw_content_selectors.extract()
        raw_content = ' '.join([w.strip() for w in raw_content])
        raw_content = raw_content.strip()
        loader.add_value('raw_content', raw_content)

        date_time_str_selectors = response.css('article > div.time::text')
        if not date_time_str_selectors:
            # Incomplete item: no date
            return loader.load_item()

        # Parse date information
        # Example: Selasa,  6 Oktober 2015 - 05:23 WIB
        date_time_parts = date_time_str_selectors.extract()[0].split(',')
        if len(date_time_parts) < 2:
            # No comma in the date text: treat it as unparsable instead of
            # failing with an IndexError.
            return loader.load_item()
        # Take what follows the first comma and cut its last four characters
        # (' WIB' in the example)
        date_time_str = date_time_parts[1].strip()[:-4]
        # Each word goes through _() (presumably translates month names to
        # English for strptime -- TODO confirm)
        date_time_str = ' '.join([_(w) for w in date_time_str.split(' ')])
        try:
            published_at_wib = datetime.strptime(date_time_str, '%d %B %Y - %H:%M')
        except ValueError:
            # Incomplete item: unparsable date
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # Move scraped news to pipeline
        return loader.load_item()

Example 17

Project: rojak Source File: tempoco.py
    def parse_news_metro(self, response):
        """Parse an article page into a ``News`` item.

        Steps, in order:

        1. Read the publication date; when the date element is absent the
           work is delegated to ``self.parse_news_pilkada(loader, response)``.
        2. Raise ``CloseSpider('finished')`` when the article is not newer
           than ``self.media['last_scraped_at']``.
        3. Collect the title and the concatenated ``<p>`` elements (those
           without an ``iframe`` child) as raw content.
        4. When a "next" link is found, return a ``Request`` whose callback
           forwards the loader and the content gathered so far to
           ``self.parse_next_page_metro``.
        5. Otherwise store the content, guess the author name and return
           the loaded item.

        A date that fails to parse, a missing title or missing content
        makes the method return the partially filled item immediately.

        :param response: response of the article page being parsed.
        :return: a loaded item, a follow-up ``Request``, or whatever
            ``parse_news_pilkada`` returns.
        :raises CloseSpider: when the article is not newer than the last
            scrape.
        """
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        date_selector = response.css('.artikel > div.block-tanggal::text')
        if not date_selector:
            return self.parse_news_pilkada(loader, response)
        try:
            # Keep the part after the first comma and cut the last four
            # characters, then pass every token through `_` before parsing.
            date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
            date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
            published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
        except Exception:
            # Any failure while reading the date yields the partial item.
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        if self.media['last_scraped_at'] >= published_at:
            self.logger.info('Media have no update')
            raise CloseSpider('finished')
        loader.add_value('published_at', published_at)

        title_selector = response.css('.artikel > h1::text')
        if not title_selector:
            return loader.load_item()
        loader.add_value('title', title_selector.extract()[0])

        # Select all p which don't have iframe inside it
        raw_content_selector = response.xpath('//div[@class="artikel"]//p[not(iframe)]')
        if not raw_content_selector:
            return loader.load_item()
        raw_content = ''.join(
            rsl.extract().strip() for rsl in raw_content_selector)

        # Go to next page while there is next page button
        # NOTE(review): the XPath below starts with `//`, so it is evaluated
        # against the whole document rather than only inside
        # `.pagination-nb`; `.//a[text()="next"]/@href` would scope it.
        # Confirm against the site's markup before changing.
        next_page_selector = response.css('.pagination-nb').xpath('//a[text()="next"]/@href')
        if next_page_selector:
            # loader and raw_content are bound as lambda defaults so the
            # callback keeps the values from this page.
            return Request(
                next_page_selector.extract()[0],
                callback=lambda x, loader=loader, raw_content=raw_content:
                    self.parse_next_page_metro(x, loader, raw_content))

        loader.add_value('raw_content', raw_content)

        # The author usually put inside <strong> tag, however, some news is not using <strong> tag.
        # NOTE: this block of code may need revision in the future
        # Scan paragraphs and their <strong> texts from last to first and
        # take the first non-empty text made only of upper-case letters,
        # whitespace, '.' or '|'.
        author_name = ''
        for author_name_selector in reversed(raw_content_selector):
            author_name_selector = author_name_selector.css('strong::text')
            for tmp in reversed(author_name_selector.extract()):
                tmp = tmp.strip()
                if tmp and all((x.isalpha() and x.isupper()) or x.isspace() or x == '.' or x == '|' for x in tmp):
                    author_name = tmp
                    break
            if author_name:
                break
        # Several authors separated by ' | ' are stored comma-separated.
        author_name = ','.join(author_name.split(' | '))
        loader.add_value('author_name', author_name)
        return loader.load_item()

Example 18

Project: rojak Source File: tirtoid.py
    def parse_news(self, response):
        """Build a ``News`` item from a single article page.

        Fills ``url``, ``title``, ``published_at``, ``author_name`` and
        ``raw_content`` on an ``ItemLoader``, in that order. When the title,
        the date or the content is missing (or the date cannot be parsed)
        the partially filled item is returned right away; per the original
        comments such items are dropped on the item pipeline.

        :param response: response of the article page being parsed.
        :return: the item produced by ``loader.load_item()``.
        """
        self.logger.info('parse_news: %s', response)

        # Initialize item loader
        # extract news title, published_at, author, content, url
        # Required: title, raw_content, published_at
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)
        title_selectors = response.css('header > h1::text')
        if not title_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        title = title_selectors.extract()[0]
        loader.add_value('title', title.strip())

        # Parse date information
        # Example: 15 November, 2016
        date_selectors = response.css('header > div.date::text')

        if not date_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()

        try:
            date = date_selectors.extract()[0].strip().split(' ')
            # Sanitize month; a date with fewer than two tokens raises
            # IndexError here and is treated like an unparseable date.
            date[1] = sanitize(date[1])
            published_at_wib = datetime.strptime(' '.join(date), '%d %b %Y')
        except (IndexError, ValueError):
            # Will be dropped on the item pipeline
            return loader.load_item()

        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # The author is optional: an empty string is stored when absent.
        author_name_selectors = response.css('div.reporter::text')
        if not author_name_selectors:
            loader.add_value('author_name', '')
        else:
            author_name = author_name_selectors.extract()[0]
            # Example: Reporter: Mutaya Saroh -> Mutaya Saroh
            loader.add_value('author_name', author_name.split(':')[-1].strip())

        # Extract the content using XPath instead of CSS selector
        # We get the XPath from chrome developer tools (copy XPath)
        # or equivalent tools from other browser
        # Nodes that are, or contain, comments, styles, scripts, divs,
        # spans, images, tables or iframes are excluded.
        xpath_query = """
            //article/div[@class="content-text-editor"]/node()
                [not(
                    descendant-or-self::comment()|
                    descendant-or-self::style|
                    descendant-or-self::script|
                    descendant-or-self::div|
                    descendant-or-self::span|
                    descendant-or-self::img|
                    descendant-or-self::table|
                    descendant-or-self::iframe
                )]
        """
        raw_content_selectors = response.xpath(xpath_query)
        if not raw_content_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        raw_content = ' '.join(
            w.strip() for w in raw_content_selectors.extract())
        raw_content = raw_content.strip()
        # Only the text before the first '<p>' marker is stored.
        loader.add_value('raw_content', raw_content.split('<p>')[0].strip())

        # Move scraped news to pipeline
        return loader.load_item()

Example 19

Project: rojak Source File: viva.py
    def parse_news(self, response):
        """Turn one article page into a ``News`` item.

        Fields are added in this order: ``url``, ``title``, ``raw_content``,
        ``published_at``, ``author_name``. If the title, the content or the
        date is missing, or the date does not match the expected format,
        the item loaded so far is returned immediately; per the original
        comments such items are dropped on the item pipeline.

        :param response: response of the article page being parsed.
        :return: the item produced by ``loader.load_item()``.
        """
        self.logger.info('parse_news: {}'.format(response))

        # Loader collecting title, published_at, author, content and url
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        headline_nodes = response.css('h1.title-big-detail::text')
        if not headline_nodes:
            # Incomplete item: will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('title', headline_nodes.extract()[0].strip())

        # Raw HTML of the article body (not plain text), skipping noise
        # nodes: HTML comments, scripts, css styles and similar
        content_xpath = '''
            //div[@class="detail-content"]/node()
                [not(
                    descendant-or-self::comment()|
                    descendant-or-self::style|
                    descendant-or-self::script|
                    descendant-or-self::div|
                    descendant-or-self::span|
                    descendant-or-self::img|
                    descendant-or-self::table|
                    descendant-or-self::iframe|
                    descendant-or-self::a[@class="share-btn-right shared"]
                )]
        '''
        content_nodes = response.xpath(content_xpath)
        if not content_nodes:
            # Incomplete item: will be dropped on the item pipeline
            return loader.load_item()
        fragments = content_nodes.extract()
        loader.add_value(
            'raw_content', ' '.join(piece.strip() for piece in fragments))

        date_nodes = response.css('span.meta-author > span:nth-child(3)::text')
        if not date_nodes:
            # Incomplete item: will be dropped on the item pipeline
            return loader.load_item()
        # Example: Sabtu, 1 Oktober 2016, 15:47 WIB
        raw_date = date_nodes.extract()[0].strip()
        # Remove commas, then drop the first and the last token
        # Example: 1 October 2016 15:47
        date_tokens = raw_date.replace(',', '').split(' ')[1:-1]
        normalized_date = ' '.join(_(token) for token in date_tokens)
        try:
            published_at_wib = datetime.strptime(
                normalized_date, '%d %B %Y %H:%M')
        except ValueError:
            # Incomplete item: will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('published_at', wib_to_utc(published_at_wib))

        # The author is optional: an empty string is stored when absent
        author_nodes = response.css('span.meta-author > span > b::text')
        author_name = author_nodes.extract()[0] if author_nodes else ''
        loader.add_value('author_name', author_name)

        # Hand the scraped news over to the pipeline
        return loader.load_item()

Example 20

Project: rojak Source File: wowkeren.py
    def parse_news(self, response):
        """Build a ``News`` item from a single article page.

        Fills ``url``, ``title``, ``published_at``, ``author_name`` (always
        an empty string here) and ``raw_content`` on an ``ItemLoader``, in
        that order. When the title, the date or the content is missing (or
        the date cannot be parsed) the partially filled item is returned
        right away; per the original comments such items are dropped on the
        item pipeline.

        :param response: response of the article page being parsed.
        :return: the item produced by ``loader.load_item()``.
        """
        self.logger.info('parse_news: %s', response)

        # Initialize item loader
        # extract news title, published_at, author, content, url
        # Required: title, raw_content, published_at
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        title_selectors = response.css('div.NewsTitle > h1::text')
        if not title_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        title = title_selectors.extract()[0]
        loader.add_value('title', title.strip())

        # Parse date information
        # Example: 27 Oct 2016, 18:33:36 WIB
        date_selectors = response.css('div.NewsDate::text')
        if not date_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()

        try:
            # The format below has no comma and strptime rejects leading or
            # trailing whitespace, so remove both before parsing; otherwise
            # the example above would always fail with ValueError.
            date_str = date_selectors.extract()[0].replace(',', '').strip()
            published_at_wib = datetime.strptime(date_str, '%d %b %Y %H:%M:%S WIB')
        except ValueError:
            # Will be dropped on the item pipeline
            return loader.load_item()

        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # no author
        loader.add_value('author_name', '')

        # Extract the content using XPath instead of CSS selector
        # We get the XPath from chrome developer tools (copy XPath)
        # or equivalent tools from other browser
        # Nodes that are, or contain, comments, styles, scripts, divs,
        # spans, images, tables or iframes are excluded.
        xpath_query = """
            //div[@class="pad10"]/p/node()
                [not(
                    descendant-or-self::comment()|
                    descendant-or-self::style|
                    descendant-or-self::script|
                    descendant-or-self::div|
                    descendant-or-self::span|
                    descendant-or-self::img|
                    descendant-or-self::table|
                    descendant-or-self::iframe
                )]
        """
        raw_content_selectors = response.xpath(xpath_query)
        if not raw_content_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        raw_content = ' '.join(
            w.strip() for w in raw_content_selectors.extract())
        loader.add_value('raw_content', raw_content.strip())

        # Move scraped news to pipeline
        return loader.load_item()