Here are examples of the Python API `scrapy.loader.ItemLoader`, taken from open-source projects. By voting up, you can indicate which examples are most useful and appropriate.
20 Examples
3
Example 1
Project: scrapy Source File: test_loader.py
def test_load_item_using_default_loader(self):
    """load_item() with the default ItemLoader mutates and returns the
    wrapped item: pre-set fields survive, added values become lists."""
    i = TestItem()
    i['summary'] = u'lala'
    il = ItemLoader(item=i)
    il.add_value('name', u'marta')
    item = il.load_item()
    # The loader returns the very same item instance it was given.
    assert item is i
    self.assertEqual(item['summary'], u'lala')
    # Values added through the loader are collected into a list.
    self.assertEqual(item['name'], [u'marta'])
3
Example 2
Project: Grouch Source File: test_course_item.py
def test_loading(self):
    """Repeated add_value() calls on one field accumulate into a list."""
    loader = ItemLoader(item=items.Course())
    loader.add_value('prerequisites', "CS1331")
    loader.add_value('prerequisites', "CS2110")
    item = loader.load_item()
    self.assertEqual(item['prerequisites'], ['CS1331', 'CS2110'])
3
Example 3
Project: django-dynamic-scraper Source File: django_spider.py
def _set_loader(self, response, from_page, xs, item):
    """Attach ``self.loader`` for the current scrape step.

    Picks JsonItemLoader vs. ItemLoader based on the request-page
    type's content type ('J' = JSON), and — on non-main pages — reuses
    the item carried in the request meta.

    Args:
        response: the scrapy Response being parsed.
        from_page: page-type code (e.g. 'MP' for main page).
        xs: selector used when loading from the main page.
        item: item to populate (ignored on non-main pages).
    """
    self.from_page = from_page
    rpt = self.scraper.get_rpt(from_page)
    # Idiomatic comparison: `x != y` instead of `not x == y`.
    if self.from_page != 'MP':
        # Detail pages continue filling the item passed along in meta.
        item = response.request.meta['item']
        if rpt.content_type == 'J':
            json_resp = json.loads(response.body_as_unicode())
            self.loader = JsonItemLoader(item=item, selector=json_resp)
        else:
            self.loader = ItemLoader(item=item, response=response)
    else:
        if rpt.content_type == 'J':
            self.loader = JsonItemLoader(item=item, selector=xs)
        else:
            self.loader = ItemLoader(item=item, selector=xs)
    self.loader.default_output_processor = TakeFirst()
    self.loader.log = self.log
3
Example 4
Project: django-dynamic-scraper Source File: django_spider.py
def _set_dummy_loader(self, response, from_page, xs, item):
    """Attach ``self.dummy_loader`` (always wrapping a fresh DummyItem).

    Mirrors ``_set_loader``: chooses JsonItemLoader vs. ItemLoader from
    the request-page type's content type ('J' = JSON).

    Args:
        response: the scrapy Response being parsed.
        from_page: page-type code (e.g. 'MP' for main page).
        xs: selector used when loading from the main page.
        item: unused for the dummy loader itself; kept for a signature
            parallel to ``_set_loader``.
    """
    self.from_page = from_page
    rpt = self.scraper.get_rpt(from_page)
    # Idiomatic comparison: `x != y` instead of `not x == y`.
    if self.from_page != 'MP':
        item = response.request.meta['item']
        if rpt.content_type == 'J':
            json_resp = json.loads(response.body_as_unicode())
            self.dummy_loader = JsonItemLoader(item=DummyItem(), selector=json_resp)
        else:
            self.dummy_loader = ItemLoader(item=DummyItem(), response=response)
    else:
        if rpt.content_type == 'J':
            self.dummy_loader = JsonItemLoader(item=DummyItem(), selector=xs)
        else:
            self.dummy_loader = ItemLoader(item=DummyItem(), selector=xs)
    self.dummy_loader.default_output_processor = TakeFirst()
    self.dummy_loader.log = self.log
0
Example 5
Project: rojak Source File: beritasatu.py
def parse_news(self, response):
    """Parse a Beritasatu article page into a News item.

    Fills url, title, raw_content, published_at and author_name.
    Any missing required field returns the partially filled item,
    which the item pipeline is expected to drop.
    """
    self.logger.info('parse_news: {}'.format(response))
    # Init item loader
    # extract news title, published_at, author, content, url
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    title_selectors = response.css('div.content-detail > h4::text')
    if not title_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    title = title_selectors.extract()[0]
    loader.add_value('title', title.strip())
    # Extract raw html, not the text
    raw_content_selectors = response.css('div.content-body')
    if not raw_content_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)
    # Example: Selasa, 11 Oktober 2016 | 10:48
    date_selectors = response.css('div.date::text')
    if not date_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    date_str = date_selectors.extract()[0]
    # Example: 11 October 2016 10:48
    # Raw string: '\s' in a plain literal is an invalid string escape
    # (DeprecationWarning, later SyntaxWarning/-Error in Python 3.x).
    date_str = re.split(r'[\s,|-]', date_str)
    # '_' presumably translates Indonesian month names to English so
    # strptime's %B matches — TODO confirm against the project helper.
    date_str = ' '.join([_(s) for s in date_str[1:] if s])
    # Parse date information
    try:
        published_at_wib = datetime.strptime(date_str, '%d %B %Y %H:%M')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    author_selectors = response.css('div.content-detail > p::text')
    if not author_selectors:
        loader.add_value('author_name', '')
    else:
        author_name = author_selectors.extract()[0]
        author_name = author_name.split('/')[0]
        loader.add_value('author_name', author_name)
    # Move scraped news to pipeline
    return loader.load_item()
0
Example 6
Project: rojak Source File: cnnindonesia.py
def parse_news(self, response):
    """Parse a CNN Indonesia article page into a News item.

    Fills url, title, raw_content, published_at and author_name; any
    missing required field returns the partially filled item so the
    item pipeline can drop it.
    """
    self.logger.info('parse_news: {}'.format(response))
    # Init item loader
    # extract news title, published_at, author, content, url
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    title_selectors = response.css('div.detail_text > h1::text')
    if not title_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    title = title_selectors.extract()[0]
    loader.add_value('title', title)
    # Extract raw html, not the text
    # Using Xpath instead of CSS selector to eliminate useless children
    xpath_query = """
        //div[@class="detail_text"]/node()
        [not(
            descendant-or-self::comment()|
            descendant-or-self::style|
            descendant-or-self::script|
            descendant-or-self::div|
            descendant-or-self::span|
            descendant-or-self::img|
            descendant-or-self::table|
            descendant-or-self::iframe
        )]
    """
    raw_content_selectors = response.xpath(xpath_query)
    if not raw_content_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)
    # Example: Senin, 10/10/2016 05:12
    date_selectors = response.css('div.date::text')
    if not date_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    date_str = date_selectors.extract()[0]
    # Example: 10/10/2016 05:12 (day name before the comma is dropped)
    date_str = date_str.split(',')[1].strip()
    # Parse date information
    try:
        published_at_wib = datetime.strptime(date_str, '%d/%m/%Y %H:%M')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    author_name_selectors = response.css('div.author > strong::text')
    if not author_name_selectors:
        loader.add_value('author_name', '')
    else:
        author_name = author_name_selectors.extract()[0]
        loader.add_value('author_name', author_name)
    # Move scraped news to pipeline
    return loader.load_item()
0
Example 7
Project: rojak Source File: detikcom.py
def parse_news(self, response):
    """Parse a Detikcom article page into a News item.

    Required fields: title, raw_content, published_at. A partially
    filled item is returned on any missing field so the item pipeline
    can drop it. Multi-page articles are delegated to parse_indices().
    """
    self.logger.info('parse_news: %s' % response)
    # Initialize item loader
    # extract news title, published_at, author, content, url
    # Required: title, raw_content, published_at
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    title_selectors = response.css('div.detail_area > h1.jdl::text')
    if not title_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    title = title_selectors.extract()[0]
    loader.add_value('title', title)
    # Parse date information
    # Example: Kamis 15 Sep 2016, 18:33 WIB
    date_selectors = response.css('div.detail_area > div.date::text')
    if not date_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    date_str = date_selectors.extract()[0]
    # Example: '15 Sep 2016, 18:33' (drop day name and trailing 'WIB')
    date_str = ' '.join(date_str.split(' ')[1:5])
    try:
        published_at_wib = datetime.strptime(date_str, '%d %b %Y, %H:%M')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    author_name_selectors = response.css('div.author > strong::text')
    if not author_name_selectors:
        loader.add_value('author_name', '')
    else:
        author_name = author_name_selectors.extract()[0]
        loader.add_value('author_name', author_name)
    # Check for multipage
    xpath_query = "//div[@class='list_multi']/article/a/@href"
    multipage_selectors = response.xpath(xpath_query)
    if multipage_selectors:
        # Protocol-relative hrefs; prefix the scheme before following.
        indices = ['http:' + x for x in multipage_selectors.extract()]
        return self.parse_indices(indices, loader)
    # Extract the content using XPath instead of CSS selector
    # We get the XPath from chrome developer tools (copy XPath)
    # or equivalent tools from other browser
    xpath_query = """
        //div[@class="text_detail detail_area"]/node()
        [not(
            descendant-or-self::comment()|
            descendant-or-self::style|
            descendant-or-self::script|
            descendant-or-self::div|
            descendant-or-self::span|
            descendant-or-self::img|
            descendant-or-self::table|
            descendant-or-self::iframe
        )]
    """
    raw_content_selectors = response.xpath(xpath_query)
    if not raw_content_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)
    # Move scraped news to pipeline
    return loader.load_item()
0
Example 8
Project: rojak Source File: hallojakarta.py
def parse_news(self, response):
    """Parse a Hallo Jakarta article page into a News item.

    Required fields: title, raw_content, published_at. A partially
    filled item is returned on any missing field so the item pipeline
    can drop it. This site exposes no author, so author_name is ''.
    """
    self.logger.info('parse_news: %s' % response)
    # Initialize item loader
    # extract news title, published_at, author, content, url
    # Required: title, raw_content, published_at
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    title_selectors = response.css('h1 > span.h-title::text')
    if not title_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    title = title_selectors.extract()[0]
    loader.add_value('title', title.strip())
    # Parse date information
    # Example: 15 November, 2016
    # NOTE(review): the example above does not match the strptime
    # format '%d/%m/%Y | %H:%M' below — presumably the <time> text is
    # actually like 'Selasa 15/11/2016 | 10:00'; confirm on a live page.
    date_selectors = response.css('article.single-content > div.meta > span.time > time::text')
    if not date_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    try:
        date = date_selectors.extract()[0].split(' ')
        published_at_wib = datetime.strptime(' '.join(date[1:]), '%d/%m/%Y | %H:%M')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    # no author name
    loader.add_value('author_name', '')
    # Extract the content using XPath instead of CSS selector
    # We get the XPath from chrome developer tools (copy XPath)
    # or equivalent tools from other browser
    xpath_query = """
        //article/div[@class="the-content post-content clearfix"]/node()
        [not(
            descendant-or-self::comment()|
            descendant-or-self::style|
            descendant-or-self::script|
            descendant-or-self::div|
            descendant-or-self::span|
            descendant-or-self::img|
            descendant-or-self::table|
            descendant-or-self::iframe
        )]
    """
    raw_content_selectors = response.xpath(xpath_query)
    if not raw_content_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)
    # Move scraped news to pipeline
    return loader.load_item()
0
Example 9
Project: rojak Source File: jawapos.py
def parse_news(self, response):
    """Parse a Jawa Pos article page into a News item.

    Required fields: title, published_at, raw_content. A partially
    filled item is returned on any missing field so the item pipeline
    can drop it. Multi-page articles are delegated to parse_indices().
    """
    self.logger.info('parse_news: %s' % response)
    # Initialize item loader
    # extract news title, published_at, author, content, url
    # Required: title, raw_content, published_at
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    title_selectors = response.css('h1.detailtitle::text')
    if not title_selectors:
        # If error, drop from the item pipeline
        return loader.load_item()
    title = title_selectors.extract_first().strip()
    loader.add_value('title', title)
    # Parse date information
    # Guard the selector: extract_first() returns None when nothing
    # matches, and calling .strip() on it unconditionally (as before)
    # raised AttributeError instead of dropping the item.
    date_time = response.css(
        'body > div > div.container > div.page-header > div::text'
    ).extract_first()
    if date_time is None:
        # If error, drop from the item pipeline
        return loader.load_item()
    date_time = date_time.strip()
    date_time = date_time.split(',')[-1].strip()
    date_time = ' '.join([_(w) for w in date_time.split(' ')])  # Oktober => October
    try:
        published_at_wib = datetime.strptime(date_time, '%d %B %Y %H:%M')
    except ValueError:
        # If error, drop from the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    # If multipage
    multipage_selectors = response.css('.newsPagingWrap > a')
    if multipage_selectors:
        return self.parse_indices(multipage_selectors, loader)
    # Else if not multipage
    author_name_selectors = response.css('.newsContent > p > strong::text')
    if not author_name_selectors:
        loader.add_value('author_name', '')
    else:
        author_name = author_name_selectors.extract()[-1].strip()
        loader.add_value('author_name', author_name)
    # Extract the news content
    raw_content_selectors = response.css('.newsContent > p')
    if not raw_content_selectors:
        # Drop from the item pipeline
        return loader.load_item()
    raw_content = ' '.join(raw_content_selectors.extract())
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)
    # Move scraped news to pipeline
    return loader.load_item()
0
Example 10
Project: rojak Source File: kompas.py
def parse_news(self, response):
    """Parse a Kompas NewsML JSON response into a News item.

    Each field is looked up in the deeply nested JSON payload; any
    missing key (or empty value for a required field) returns the
    partially filled item so the item pipeline can drop it.
    """
    self.logger.info('parse_news: %s' % response)
    loader = ItemLoader(item=News(), response=response)
    json_response = json.loads(response.body)
    try:
        # Hoist the shared path prefixes instead of repeating the full
        # chain for every field. Any missing intermediate key raises
        # KeyError here, exactly as the fully spelled-out chains did.
        news_item = json_response['NewsML']['NewsItem']
        news_lines = news_item['NewsComponent']['NewsComponent']['NewsComponent']['NewsLines']
        url = news_lines['MoreLink']
    except KeyError:
        return loader.load_item()
    loader.add_value('url', url)
    try:
        title = news_lines['HeadLine']
    except KeyError:
        return loader.load_item()
    if not title:
        return loader.load_item()
    loader.add_value('title', title)
    try:
        raw_content = news_item['NewsComponent']['NewsComponent']['NewsComponent']['ContentItem']['DataContent']['nitf']['body']['body.content']['p']
    except KeyError:
        return loader.load_item()
    if not raw_content:
        return loader.load_item()
    loader.add_value('raw_content', raw_content)
    try:
        # Note: the author lives one NewsComponent level higher than
        # the headline/content fields.
        author_name = news_item['NewsComponent']['NewsComponent']['Author']
    except KeyError:
        return loader.load_item()
    if not author_name:
        loader.add_value('author_name', '')
    else:
        loader.add_value('author_name', author_name)
    try:
        date_time_str = news_item['NewsManagement']['FirstCreated']
    except KeyError:
        return loader.load_item()
    if not date_time_str:
        return loader.load_item()
    # Example: '20161012T151604' -> ['20161012', '151604']
    date_time_str = date_time_str.split('T')
    # Left-pad the time part to six digits (HHMMSS); zfill replaces the
    # manual '0' * (6 - len(...)) arithmetic and the stray semicolon
    # on the strptime line is gone.
    date_time_str[1] = date_time_str[1].zfill(6)
    try:
        published_at_wib = datetime.strptime(' '.join(date_time_str), '%Y%m%d %H%M%S')
    except Exception:
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    return loader.load_item()
0
Example 11
Project: rojak Source File: liputan6.py
def parse_news(self, response):
    """Parse a Liputan6 article page into a News item.

    Required fields: title, raw_content, published_at. A partially
    filled item is returned on any missing field so the item pipeline
    can drop it.
    """
    self.logger.info('parse_news: %s' % response)
    # Initialize item loader
    # extract news title, published_at, author, content, url
    # Required: title, raw_content, published_at
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    title_selectors = response.css('h1.article-header__title::text')
    if not title_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    title = title_selectors.extract()[0]
    loader.add_value('title', title)
    # Extract the content using XPath instead of CSS selector
    xpath_query = """
        //div[@class="article-raw-content"]/node()
        [not(
            descendant-or-self::comment()|
            descendant-or-self::style|
            descendant-or-self::script|
            descendant-or-self::div|
            descendant-or-self::span|
            descendant-or-self::img|
            descendant-or-self::table|
            descendant-or-self::iframe
        )]
    """
    raw_content_selectors = response.xpath(xpath_query)
    if not raw_content_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)
    # Parse date information
    # Example: ' pada 18 Okt 2016, 08:33 WIB'
    date_selectors = response.css('span.article-header__datetime::text')
    if not date_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    date_str = date_selectors.extract()[0].strip()
    # Example: '18 Oct 2016, 08:33' — drop the leading 'pada' and the
    # trailing 'WIB'; '_' presumably maps Indonesian month
    # abbreviations to English for %b — TODO confirm.
    date_str = ' '.join([_(w) for w in date_str.split(' ')[1:-1]])
    try:
        published_at_wib = datetime.strptime(date_str, '%d %b %Y, %H:%M')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    author_name_selectors = response.css('a.article-header__author-link::text')
    if not author_name_selectors:
        loader.add_value('author_name', '')
    else:
        author_name = author_name_selectors.extract()[0]
        loader.add_value('author_name', author_name)
    # Move scraped news to pipeline
    return loader.load_item()
0
Example 12
Project: rojak Source File: merdekacom.py
def parse_news(self, article, sub_article):
    """Build a News item from pre-parsed Merdeka.com article dicts.

    Unlike the other spiders this takes two dicts (the article listing
    entry and its sub-article) instead of a Response, so the loader is
    created without one. Missing required keys return the partially
    filled item so the item pipeline can drop it.
    """
    self.logger.info('parse_news: %s' % article)
    # Initialize item loader
    # extract news title, published_at, author, content, url
    loader = ItemLoader(item=News())
    # Example: https://m.merdeka.com/tag/p/pilgub-dki/politik/nachrowi-pastikan-agus-sylvi-tak-cuema-incar-suara-santri-ulama.html
    if not sub_article['news_url']:
        return loader.load_item()
    url = 'https://www.merdeka.com' + sub_article['news_url']
    loader.add_value('url', url)
    if not article['news_title']:
        return loader.load_item()
    loader.add_value('title', article['news_title'])
    if not article['news_reporter']:
        loader.add_value('author_name', '')
    else:
        loader.add_value('author_name', article['news_reporter'])
    if not sub_article['news_description']:
        return loader.load_item()
    loader.add_value('raw_content', sub_article['news_description'])
    # Parse date information
    date_time_str = article['news_date_publish']
    try:
        # Example: 2016-10-12 15:16:04
        published_at_wib = datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')
    except Exception:
        # Broad on purpose: also covers a None/non-str publish date
        # (TypeError), not just a malformed string (ValueError).
        # The unused `as e` binding was dropped.
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    # Move scraped news to pipeline
    return loader.load_item()
0
Example 13
Project: rojak Source File: metrotvnews.py
def parse_news(self, response):
    """Parse a Metro TV News article page into a News item.

    Video pages are skipped (empty item returned for the pipeline to
    drop). Required fields: title, raw_content, published_at; author
    and date are both parsed from the single 'Author - date' span.
    """
    self.logger.info('parse_news: {}'.format(response))
    # First breadcrumb entry tells us whether this is a video page.
    is_video = response.css('ul.breadcrumb > li > a::text').extract()[0] == 'VIDEO'
    # Init item loader
    # extract news title, published_at, author, content, url
    # Required: title, raw_content, published_at
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    # Will be dropped if video page
    if is_video:
        return loader.load_item()
    title_selectors = response.css('div.part.lead.pr > h1::text')
    if not title_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    title = title_selectors.extract()[0]
    loader.add_value('title', title)
    xpath_query = """
        //div[@class="part article"]/node()
        [not(
            descendant-or-self::comment()|
            descendant-or-self::style|
            descendant-or-self::script|
            descendant-or-self::div|
            descendant-or-self::span|
            descendant-or-self::img|
            descendant-or-self::table|
            descendant-or-self::iframe
        )]
    """
    raw_content_selectors = response.xpath(xpath_query)
    if not raw_content_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)
    # Example: Bambang - 10 Oktober 2016 21:10 wib
    info_selectors = response.css('div.part.lead.pr > span::text')
    if not info_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    info = info_selectors.extract()[0]
    # Parse date information
    # Example: 10 Oktober 2016 21:10 wib
    date_str = info.split('-')[1].strip()
    if not date_str:
        # Will be dropped on the item pipeline
        return loader.load_item()
    # Example: 10 October 2016 21:10 — strip trailing ' wib' and
    # translate month names via the '_' helper.
    date_str = ' '.join([_(w) for w in date_str[:-4].split(' ')])
    try:
        published_at_wib = datetime.strptime(date_str, '%d %B %Y %H:%M')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    author_name = info.split('-')[0].strip()
    if not author_name:
        loader.add_value('author_name', '')
    else:
        loader.add_value('author_name', author_name)
    # Move scraped news to pipeline
    return loader.load_item()
0
Example 14
Project: rojak Source File: okezone.py
def parse_news(self, response):
    """Parse an Okezone JSON API response into a News item.

    The response body is a JSON array whose first element holds the
    article; its HTML 'content' is re-wrapped in an HtmlResponse so
    the usual XPath cleanup can run on it.
    """
    self.logger.info('parse_news: %s' % response)
    # NOTE(review): str(response.body) on Python 3 yields the bytes
    # repr ("b'...'"), which would break json.loads — this presumably
    # targets Python 2; confirm before porting.
    parsed_news = json.loads(str(response.body))[0]
    # Initialize item loader
    # extract news title, published_at, author, content, url
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', parsed_news['url'])
    if not parsed_news['title']:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('title', parsed_news['title'])
    # Convert HTML text to a scrapy response
    html_response = HtmlResponse(url=parsed_news['url'],
        body=parsed_news['content'].encode('utf-8', 'ignore'))
    xpath_query = '''
        //body/node()
        [not(descendant-or-self::comment()|
            descendant-or-self::style|
            descendant-or-self::script|
            descendant-or-self::div|
            descendant-or-self::span|
            descendant-or-self::image|
            descendant-or-self::img|
            descendant-or-self::iframe
        )]
    '''
    raw_content_selectors = html_response.xpath(xpath_query)
    if not raw_content_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)
    if not parsed_news['published']:
        # Will be dropped on the item pipeline
        return loader.load_item()
    # Parse date information
    # Example: 12 Oct 2016 - 05:25
    date_time_str = ' '.join([_(w) for w in parsed_news['published'].split(',')[1].strip()[:-4].split(' ')])
    try:
        published_at_wib = datetime.strptime(date_time_str,
            '%d %b %Y - %H:%M')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    if not parsed_news['author']:
        loader.add_value('author_name', '')
    else:
        loader.add_value('author_name', parsed_news['author'])
    # Move scraped news to pipeline
    return loader.load_item()
0
Example 15
Project: rojak Source File: republikaonline.py
def parse_news(self, response):
    """Parse a Republika Online article page into a News item.

    Required fields: title, raw_content, published_at. Only the
    'Red:' (editor) entries are treated as authors. A partially
    filled item is returned on any missing field so the item pipeline
    can drop it.
    """
    self.logger.info('parse_news: %s' % response)
    # Initialize item loader
    # extract news title, published_at, author, content, url
    # Required: title, raw_content, published_at
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    title_selectors = response.css('div.wrap-head > h2 > a')
    if not title_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    title = ''.join(title_selectors[0].xpath('.//text()').extract())
    loader.add_value('title', title.strip())
    # Parse date information
    # Example: Rabu, 02 November 2016, 10:29 WIB
    date_selectors = response.css('div.wrap-head > span.date::text')
    if not date_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    try:
        date = date_selectors.extract()[0].strip().split(' ')
        # Sanitize month
        date[2] = sanitize(date[2])
        published_at_wib = datetime.strptime(' '.join(date[1:]), '%d %b %Y | %H:%M WIB')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    author_name_selectors = response.css('div.red::text')
    if not author_name_selectors:
        loader.add_value('author_name', '')
    else:
        authors = [author.strip() for author in author_name_selectors.extract()]
        # Only consider Red: as author
        # Example: ['Rep: Dadang Kurnia', 'Red: Bilal Ramadhan']
        author_names = [name[4:].strip() for name in filter(lambda a: a.count('Red:') > 0, authors)]
        loader.add_value('author_name', ','.join(author_names))
    # Extract the content using XPath instead of CSS selector
    # We get the XPath from chrome developer tools (copy XPath)
    # or equivalent tools from other browser
    xpath_query = """
        //article/div[@class="content-detail"]/p/node()
    """
    raw_content_selectors = response.xpath(xpath_query)
    if not raw_content_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)
    # Move scraped news to pipeline
    return loader.load_item()
0
Example 16
Project: rojak Source File: sindonews.py
def parse_news(self, response):
    """Parse a Sindonews article page into a News item.

    Fills url, title, author_name, raw_content and published_at; a
    partially filled item is returned on any missing required field so
    the item pipeline can drop it.
    """
    self.logger.info('parse_news: %s' % response)
    # Initialize item loader
    # extract news title, published_at, author, content, url
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    title_selectors = response.css('h1[itemprop="headline"]::text')
    if not title_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    title = title_selectors.extract()[0]
    loader.add_value('title', title)
    author_name_selectors = response.css('a[rel="author"] > span::text')
    if not author_name_selectors:
        loader.add_value('author_name', '')
    else:
        author_name = author_name_selectors.extract()[0]
        loader.add_value('author_name', author_name)
    raw_content_selectors = response.css('.content')
    if not raw_content_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)
    date_time_str_selectors = response.css('article > div.time::text')
    if not date_time_str_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    # Parse date information
    # Example: Selasa, 6 Oktober 2015 - 05:23 WIB
    date_time_str = date_time_str_selectors.extract()[0]
    # Drop day name before comma and the trailing ' WIB' suffix.
    date_time_str = date_time_str.split(',')[1].strip()[:-4]
    date_time_str = ' '.join([_(w) for w in date_time_str.split(' ')])
    try:
        published_at_wib = datetime.strptime(date_time_str, '%d %B %Y - %H:%M')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    # Move scraped news to pipeline
    return loader.load_item()
0
Example 17
Project: rojak Source File: tempoco.py
def parse_news_metro(self, response):
    """Parse a Tempo.co 'metro'-layout article page into a News item.

    Falls back to the pilkada parser when the metro date block is
    absent, raises CloseSpider once an article at or before the
    media's last scrape time is seen, and follows 'next' pagination
    via parse_next_page_metro().
    """
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    date_selector = response.css('.artikel > div.block-tanggal::text')
    if not date_selector:
        # Not a metro-layout page; try the pilkada layout instead.
        return self.parse_news_pilkada(loader, response)
    try:
        # Drop the day name before the comma and the trailing ' WIB'.
        date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
        date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
        published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
    except Exception:
        # Unparseable date; return the partial item for the pipeline to drop.
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    if (self.media['last_scraped_at'] >= published_at):
        # Nothing newer than the previous scrape: stop the whole crawl.
        # (The old unused `is_no_update` flag was removed.)
        self.logger.info('Media have no update')
        raise CloseSpider('finished')
    loader.add_value('published_at', published_at)
    title_selector = response.css('.artikel > h1::text')
    if not title_selector:
        return loader.load_item()
    loader.add_value('title', title_selector.extract()[0])
    # Select all p which don't have iframe inside it
    raw_content_selector = response.xpath('//div[@class="artikel"]//p[not(iframe)]')
    if not raw_content_selector:
        return loader.load_item()
    # Join once instead of repeated string concatenation (O(n^2)).
    raw_content = ''.join(rsl.extract().strip() for rsl in raw_content_selector)
    # Go to next page while there is next page button
    next_page_selector = response.css('.pagination-nb').xpath('//a[text()="next"]/@href')
    if next_page_selector:
        return Request(next_page_selector.extract()[0],
                       callback=lambda x, loader=loader, raw_content=raw_content:
                           self.parse_next_page_metro(x, loader, raw_content))
    loader.add_value('raw_content', raw_content)
    # The author usually put inside <strong> tag, however, some news is
    # not using <strong> tag.
    # NOTE: this block of code may need revision in the future
    author_name = ''
    for author_name_selector in reversed(raw_content_selector):
        author_name_selector = author_name_selector.css('strong::text')
        for tmp in reversed(author_name_selector.extract()):
            tmp = tmp.strip()
            # Accept only ALL-CAPS signatures, optionally containing
            # spaces, '.' or '|' (e.g. 'BUDI | ANDI').
            if tmp and all((x.isalpha() and x.isupper()) or x.isspace() or x == '.' or x == '|' for x in tmp):
                author_name = tmp
                break
        if author_name:
            break
    author_name = ','.join(author_name.split(' | '))
    loader.add_value('author_name', author_name)
    return loader.load_item()
0
Example 18
Project: rojak Source File: tirtoid.py
def parse_news(self, response):
    """Parse a Tirto.id article page into a News item.

    Required fields: title, raw_content, published_at (date only, no
    time on this site). A partially filled item is returned on any
    missing field so the item pipeline can drop it.
    """
    self.logger.info('parse_news: %s' % response)
    # Initialize item loader
    # extract news title, published_at, author, content, url
    # Required: title, raw_content, published_at
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    title_selectors = response.css('header > h1::text')
    if not title_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    title = title_selectors.extract()[0]
    loader.add_value('title', title.strip())
    # Parse date information
    # Example: 15 November, 2016
    date_selectors = response.css('header > div.date::text')
    if not date_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    try:
        date = date_selectors.extract()[0].strip().split(' ')
        # Sanitize month
        date[1] = sanitize(date[1])
        published_at_wib = datetime.strptime(' '.join(date), '%d %b %Y')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    author_name_selectors = response.css('div.reporter::text')
    if not author_name_selectors:
        loader.add_value('author_name', '')
    else:
        author_name = author_name_selectors.extract()[0]
        # Example: Reporter: Mutaya Saroh -> Mutaya Saroh
        loader.add_value('author_name', author_name.split(':')[-1].strip())
    # Extract the content using XPath instead of CSS selector
    # We get the XPath from chrome developer tools (copy XPath)
    # or equivalent tools from other browser
    xpath_query = """
        //article/div[@class="content-text-editor"]/node()
        [not(
            descendant-or-self::comment()|
            descendant-or-self::style|
            descendant-or-self::script|
            descendant-or-self::div|
            descendant-or-self::span|
            descendant-or-self::img|
            descendant-or-self::table|
            descendant-or-self::iframe
        )]
    """
    raw_content_selectors = response.xpath(xpath_query)
    if not raw_content_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    # NOTE(review): keeping only the text before the first '<p>' looks
    # suspicious given the extracted nodes are HTML — presumably meant
    # to cut trailing boilerplate; confirm against a live page.
    loader.add_value('raw_content', raw_content.split('<p>')[0].strip())
    # Move scraped news to pipeline
    return loader.load_item()
0
Example 19
Project: rojak Source File: viva.py
def parse_news(self, response):
    """Parse a Viva article page into a News item.

    Fills url, title, raw_content, published_at and author_name; a
    partially filled item is returned on any missing required field so
    the item pipeline can drop it.
    """
    self.logger.info('parse_news: {}'.format(response))
    # Init item loader
    # extract news title, published_at, author, content, url
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    title_selectors = response.css('h1.title-big-detail::text')
    if not title_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    title = title_selectors.extract()[0].strip()
    loader.add_value('title', title)
    # Extract raw html, not the text
    # We filter-out the noise: HTML comments, scripts, css styles etc
    xpath_query ='''
        //div[@class="detail-content"]/node()
        [not(
            descendant-or-self::comment()|
            descendant-or-self::style|
            descendant-or-self::script|
            descendant-or-self::div|
            descendant-or-self::span|
            descendant-or-self::img|
            descendant-or-self::table|
            descendant-or-self::iframe|
            descendant-or-self::a[@class="share-btn-right shared"]
        )]
    '''
    raw_content_selectors = response.xpath(xpath_query)
    if not raw_content_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    loader.add_value('raw_content', raw_content)
    date_selectors = response.css('span.meta-author > span:nth-child(3)::text')
    if not date_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    # Example: Sabtu, 1 Oktober 2016, 15:47 WIB
    date_str = date_selectors.extract()[0].strip()
    # Example: 1 October 2016 15:47 — drop day name and 'WIB', then
    # translate month names via the '_' helper.
    date_str = date_str.replace(',', '').split(' ')[1:-1]
    date_str = ' '.join([_(s) for s in date_str])
    # Parse date information
    try:
        published_at_wib = datetime.strptime(date_str, '%d %B %Y %H:%M')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    author_selectors = response.css('span.meta-author > span > b::text')
    if not author_selectors:
        author_name = ''
        loader.add_value('author_name', author_name)
    else:
        author_name = author_selectors.extract()[0]
        loader.add_value('author_name', author_name)
    # Move scraped news to pipeline
    return loader.load_item()
0
Example 20
Project: rojak Source File: wowkeren.py
def parse_news(self, response):
    """Parse a WowKeren article page into a News item.

    Required fields: title, raw_content, published_at. The site
    exposes no author, so author_name is always ''. A partially
    filled item is returned on any missing field so the item pipeline
    can drop it.
    """
    self.logger.info('parse_news: %s' % response)
    # Initialize item loader
    # extract news title, published_at, author, content, url
    # Required: title, raw_content, published_at
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    title_selectors = response.css('div.NewsTitle > h1::text')
    if not title_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    title = title_selectors.extract()[0]
    loader.add_value('title', title.strip())
    # Parse date information
    # Example: 27 Oct 2016, 18:33:36 WIB
    date_selectors = response.css('div.NewsDate::text')
    if not date_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    try:
        date_str = date_selectors.extract()[0]
        published_at_wib = datetime.strptime(date_str, '%d %b %Y %H:%M:%S WIB')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    # no author
    loader.add_value('author_name', '')
    # Extract the content using XPath instead of CSS selector
    # We get the XPath from chrome developer tools (copy XPath)
    # or equivalent tools from other browser
    xpath_query = """
        //div[@class="pad10"]/p/node()
        [not(
            descendant-or-self::comment()|
            descendant-or-self::style|
            descendant-or-self::script|
            descendant-or-self::div|
            descendant-or-self::span|
            descendant-or-self::img|
            descendant-or-self::table|
            descendant-or-self::iframe
        )]
    """
    raw_content_selectors = response.xpath(xpath_query)
    if not raw_content_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)
    # Move scraped news to pipeline
    return loader.load_item()