Below are examples of the Python API scrapy.spider.Request, taken from open-source projects. By voting up, you can indicate which examples are most useful and appropriate.
2 Examples
0
Example 1
Project: legco-watch Source File: hansard.py
def parse_hansard_post_1998(self, response):
    """Parse a post-1998 Hansard index page.

    Yields:
        - ``Request`` objects recursing into sibling year-index pages,
          agenda pages and HTML Hansard record indexes;
        - ``HansardMinutes`` items for downloadable minutes PDFs;
        - ``HansardRecord`` items for downloadable Hansard PDFs.
    """
    sel = Selector(response)
    # Get the year that this index page is for, e.g. "Meetings (Year 2013 - 2014)".
    # This is mostly for debugging purposes so we can spit this out in the logs.
    year_range = sel.xpath('//strong/em/text()').extract()
    if not year_range:
        self.log("%s: Could not find year range on hansard index page" % response.url, level=log.WARNING)
        return
    else:
        self.log("%s: Parsing Hansard Index: %s" % (response.url, year_range), level=log.INFO)
    # Find any dates at the top of this page. Other dates are identical
    # to this page, and indeed the current page will also be included in
    # the date list. Scrapy will prevent us recursing back into ourselves.
    year_urls = sel.xpath('//tr/td/a[contains(@href,"#toptbl")]/@href').extract()
    for year_url in year_urls:
        absolute_url = urlparse.urljoin(response.url, year_url.strip())
        req = Request(absolute_url, callback=self.parse_hansard_index_page)
        yield req
    # We are looking for table rows which link to Hansard entries for a
    # particular date. In newer versions these are 6-columned table rows
    # where column 6 is a link to a webcast (doesn't seem to exist).
    # Older revisions are 5-columned rows. These are all after the anchor
    # 'hansard'.
    # NOTE: was a bare Python-2 ``print`` statement; routed through the
    # spider's logger for consistency with the rest of this method.
    self.log("%s: Parsing rows" % response.url, level=log.INFO)
    # Find the Hansard table.
    table = sel.xpath("//div[@class='table_overflow']//a[@name='hansard']/following::table[1]")
    if not table:
        # Older layout variant, e.g.
        # http://www.legco.gov.hk/general/english/counmtg/yr08-12/mtg_0910.htm
        table = sel.xpath("//div[@id='_content_']//a[@name='hansard']/following::table[1]")
    rows = table.xpath(".//tr[count(td)>=5]")
    if not rows:
        self.log("%s: Could not find any Hansard entries to crawl into" % response.url, level=log.WARNING)
        return
    self.log("%s: %i rows found" % (response.url, len(rows)), level=log.INFO)
    for row in rows:
        # Column 1 carries the meeting date; used for logging and item fields.
        date_info = ' '.join(row.xpath('.//td[1]/node()/text()').extract())
        self.log("%s: Row: %s" % (response.url, date_info), level=log.INFO)
        # Recurse into the agenda (column 2), if it exists.
        agenda_url = row.xpath('.//td[2]/a/@href').extract()
        if agenda_url:
            absolute_url = urlparse.urljoin(response.url, agenda_url[0].strip())
            req = Request(absolute_url, callback=self.parse_hansard_agenda)
            yield req
        else:
            self.log("%s: Could not find an agenda URL for %s" % (response.url, date_info), level=log.WARNING)
        # Download the minutes document (column 3) if it exists. This is a PDF file.
        minutes_url = row.xpath('.//td[3]/a/@href').extract()
        if minutes_url:
            absolute_url = urlparse.urljoin(response.url, minutes_url[0].strip())
            minutes = HansardMinutes()
            minutes['date'] = date_info
            minutes['file_urls'] = [absolute_url]
            yield minutes
        else:
            self.log("%s: Could not find a minutes URL for %s" % (response.url, date_info), level=log.WARNING)
        # Columns 4 and 5 hold the English and Chinese Hansard links respectively.
        for (lang, index) in [('en', 4), ('cn', 5)]:
            hansard_urls = row.xpath('.//td[%i]/a/@href' % index).extract()
            for url in hansard_urls:
                # Is this a PDF entry, or do we need to recurse?
                absolute_url = urlparse.urljoin(response.url, url.strip())
                if absolute_url.endswith('pdf'):
                    hansard_record = HansardRecord()
                    hansard_record['date'] = date_info
                    hansard_record['language'] = lang
                    hansard_record["file_urls"] = [absolute_url]
                    yield hansard_record
                else:
                    # Recurse into the HTML handler for the HTML Hansard record index.
                    req = Request(absolute_url, callback=self.parse_hansard_html_record)
                    yield req
            if not hansard_urls:
                self.log("%s: Could not find a hansard URL for %s, lang %s" % (response.url, date_info, lang), level=log.WARNING)
0
Example 2
def parse(self, response):
    """Top-level parser for the Hansard landing page.

    Yields ``Request`` objects into the per-period index pages for both the
    post-1998 Legislative Council and the former Legislative Council
    (before 7/1997), all handled by ``parse_hansard_index_page``.
    """
    sel = Selector(response)
    # Pages from 1998 onwards, new format.
    # These normally cover around a 2-6 year period.
    proceedings_menu = sel.xpath('//a[starts-with(text(),"Official Record of Proceedings")]/@href')
    # No truthiness guard needed: iterating an empty extract() is a no-op.
    for url in proceedings_menu.extract():
        absolute_url = urlparse.urljoin(response.url, url.strip())
        yield Request(absolute_url, callback=self.parse_hansard_index_page)
    # Former Legislative Council (before 7/1997).
    table = sel.xpath("//h3[contains(text(),'Former Legislative Council (before 7/1997)')]/following::table[1]")
    # Guard kept: table[0] below would raise IndexError on an empty match.
    if table:
        links = table[0].xpath(".//td/a[contains(text(),'Session')]/@href").extract()
        for url in links:
            absolute_url = urlparse.urljoin(response.url, url.strip())
            yield Request(absolute_url, callback=self.parse_hansard_index_page)