scrapy.spider.Request

Here are examples of the Python API scrapy.spider.Request taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.
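Before the project examples, here is a minimal sketch of the pattern they both follow: a spider callback resolves a relative link into an absolute URL and yields a Request pointing at another callback. This is an illustration only, not code from the projects below; the spider name and URLs are placeholders, and the imports assume an older Scrapy / Python 2 environment (in most releases Request lives in scrapy.http) to match the examples.

    import urlparse

    from scrapy.http import Request
    from scrapy.spider import BaseSpider


    class ExampleSpider(BaseSpider):
        # Hypothetical spider used only to illustrate the Request pattern.
        name = 'example'
        start_urls = ['http://www.example.com/index.html']

        def parse(self, response):
            # Resolve a relative link against the current page, then hand
            # the new page to another callback via a Request.
            absolute_url = urlparse.urljoin(response.url, 'detail.html')
            yield Request(absolute_url, callback=self.parse_detail)

        def parse_detail(self, response):
            self.log("Fetched %s" % response.url)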

2 Examples

Example 1

Project: legco-watch Source File: hansard.py
    def parse_hansard_post_1998(self, response):
        sel = Selector(response)    

        # Get the year that this index page is for
        # Meetings (Year 2013 - 2014)
        # This is mostly for debugging purposes so we can spit this out in the logs
        year_range = sel.xpath('//strong/em/text()').extract()
        if not year_range:
            self.log("%s: Could not find year range on hansard index page" % response.url, level=log.WARNING)
            return
        else:
            self.log("%s: Parsing Hansard Index: %s" % (response.url, year_range), level=log.INFO)

        # Find any dates at the top of this page. Other dates are identical
        # to this page, and indeed the current page will also be included in
        # the date list. Scrapy will prevent us recursing back into ourselves.
    
        year_urls = sel.xpath('//tr/td/a[contains(@href,"#toptbl")]/@href').extract()
        for year_url in year_urls:
            absolute_url = urlparse.urljoin(response.url, year_url.strip())
            req = Request(absolute_url, callback = self.parse_hansard_index_page)
            yield req
        
        # We are looking for table rows which link to Hansard entries for a
        # particular date. In newer versions these are 6-columned table rows
        # where column 6 is a link to a webcast (doesn't seem to exist)
        # Older revisions are 5 columned rows. These are all after the anchor
        # 'hansard'.

        print "Parsing Rows"
        # Find the hansard table
        table = sel.xpath("//div[@class='table_overflow']//a[@name='hansard']/following::table[1]")
        if not table:
            # http://www.legco.gov.hk/general/english/counmtg/yr08-12/mtg_0910.htm
            table = sel.xpath("//div[@id='_content_']//a[@name='hansard']/following::table[1]")

        rows = table.xpath(".//tr[count(td)>=5]")
        if not rows:
            self.log("%s: Could not find any Handard entries to crawl into" % response.url, level=log.WARNING)
            return
    
        self.log("%s: %i rows found" % (response.url, len(rows)), level=log.INFO)

        for row in rows:
            date_info = ' '.join(row.xpath('.//td[1]/node()/text()').extract())
            self.log("%s: Row: %s" % (response.url, date_info), level=log.INFO)

            # Recurse into the agenda, if it exists
            agenda_url = row.xpath('.//td[2]/a/@href').extract()
            if agenda_url:
                absolute_url = urlparse.urljoin(response.url, agenda_url[0].strip())
                req = Request(absolute_url, callback = self.parse_hansard_agenda)
                yield req
            else:
                self.log("%s: Could not find an agenda URL for %s" % (response.url, date_info), level=log.WARNING)
        
            # Download the minutes document if it exists. This is a PDF file
            minutes_url = row.xpath('.//td[3]/a/@href').extract()
            if minutes_url:
                absolute_url = urlparse.urljoin(response.url, minutes_url[0].strip())
                minutes = HansardMinutes()
                minutes['date'] = date_info
                minutes['file_urls'] = [absolute_url]
                yield minutes
            else:
                self.log("%s: Could not find an minutes URL for %s" % (response.url, date_info), level=log.WARNING)

            for (lang, index) in [('en',4),('cn',5)]:

                hansard_urls = row.xpath('.//td[%i]/a/@href' % index).extract()
                for url in hansard_urls:
                    # Is this a PDF entry, or do we need to recurse?
                    absolute_url = urlparse.urljoin(response.url, url.strip())
                    if absolute_url.endswith('pdf'):
                        hansard_record = HansardRecord()
                        hansard_record['date'] = date_info
                        hansard_record['language'] = lang
                        hansard_record["file_urls"] = [absolute_url]
                        yield hansard_record
                    else:
                        # Recurse into the HTML handler for the Hansard record index
                        req = Request(absolute_url, callback = self.parse_hansard_html_record)
                        yield req

                if not hansard_urls:
                    self.log("%s: Could not find an hansard URL for %s, lang %s" % (response.url, date_info, lang), level=log.WARNING)

Example 2

Project: legco-watch Source File: legcosite.py
Function: parse
    def parse(self, response):
        sel = Selector(response)    

        # Pages from 1998 onwards, new format
        # These normally cover around a 2-6 year period
        proceedings_menu = sel.xpath('//a[starts-with(text(),"Official Record of Proceedings")]/@href')
        if proceedings_menu:
            for url in proceedings_menu.extract():
                absolute_url = urlparse.urljoin(response.url, url.strip())
                req = Request(absolute_url, callback = self.parse_hansard_index_page)
                yield req
        
        # Former Legislative Council (before 7/1997)
        table = sel.xpath("//h3[contains(text(),'Former Legislative Council (before 7/1997)')]/following::table[1]")
        if table:
            links = table[0].xpath(".//td/a[contains(text(),'Session')]/@href").extract()
            if links:
                for url in links:
                    absolute_url = urlparse.urljoin(response.url, url.strip())
                    req = Request(absolute_url, callback = self.parse_hansard_index_page)
                    yield req
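
Both examples attach PDF links to a file_urls field instead of downloading them inside the callback, which is the convention Scrapy's files pipeline acts on. Assuming the project relies on that pipeline, the settings that would trigger the downloads could look like this sketch; the storage path is a placeholder:

    # settings.py (sketch)

    ITEM_PIPELINES = {
        # Older Scrapy releases expose this class as
        # scrapy.contrib.pipeline.files.FilesPipeline instead.
        'scrapy.pipelines.files.FilesPipeline': 1,
    }

    # Directory where the downloaded PDFs are written; placeholder path.
    FILES_STORE = '/data/legco-watch/files'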