scrapy.http.HtmlResponse

Here are examples of the Python API scrapy.http.HtmlResponse taken from open source projects.

58 Examples
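Before the project examples, here is a minimal, self-contained sketch of constructing an HtmlResponse by hand and querying it with the selector API. The URL and markup are illustrative:

    from scrapy.http import HtmlResponse

    # Build a response from raw bytes; no network round-trip is needed.
    response = HtmlResponse(
        url="http://example.com/page.html",   # illustrative URL
        body=b"<html><body><a href='/next'>Next</a></body></html>",
        encoding="utf-8",
    )
    href = response.css("a::attr(href)").get()   # -> '/next'
    print(response.urljoin(href))                # -> 'http://example.com/next'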

Example 51

Project: scrapy
Source File: test_spider.py
    def test_process_links_generator(self):

        response = HtmlResponse("http://example.org/somepage/index.html",
            body=self.test_body)

        class _CrawlSpider(self.spider_class):
            name="test"
            allowed_domains=['example.org']
            rules = (
                Rule(LinkExtractor(), process_links="dummy_process_links"),
            )

            def dummy_process_links(self, links):
                for link in links:
                    yield link

        spider = _CrawlSpider()
        output = list(spider._requests_to_follow(response))
        self.assertEqual(len(output), 3)
        self.assertTrue(all(isinstance(r, Request) for r in output))
        self.assertEqual([r.url for r in output],
                          ['http://example.org/somepage/item/12.html',
                           'http://example.org/about.html',
                           'http://example.org/nofollow.html'])
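Here _requests_to_follow is an internal CrawlSpider helper; in ordinary use the same process_links hook is wired up through a spider's rules. A minimal sketch, with illustrative names:

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    class ExampleSpider(CrawlSpider):
        name = "example"
        allowed_domains = ["example.org"]
        start_urls = ["http://example.org/"]
        rules = (
            # process_links receives the extracted Link objects and may
            # filter or rewrite them before requests are scheduled
            Rule(LinkExtractor(), process_links="filter_links", callback="parse_item"),
        )

        def filter_links(self, links):
            return [link for link in links if "nofollow" not in link.url]

        def parse_item(self, response):
            yield {"url": response.url}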

Example 52

Project: scrapy
Source File: test_utils_response.py
    def test_get_meta_refresh(self):
        r1 = HtmlResponse("http://www.example.com", body=b"""
        <html>
        <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
        <body>blahablsdfsal&amp;</body>
        </html>""")
        r2 = HtmlResponse("http://www.example.com", body=b"""
        <html>
        <head><title>Dummy</title><noScript>
        <meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
        </noSCRIPT>
        <body>blahablsdfsal&amp;</body>
        </html>""")
        r3 = HtmlResponse("http://www.example.com", body=b"""
    <noscript><meta http-equiv="REFRESH" content="0;url=http://www.example.com/newpage</noscript>
    <script type="text/javascript">
    if(!checkCookies()){
        document.write('<meta http-equiv="REFRESH" content="0;url=http://www.example.com/newpage">');
    }
    </script>
        """)
        self.assertEqual(get_meta_refresh(r1), (5.0, 'http://example.org/newpage'))
        self.assertEqual(get_meta_refresh(r2), (None, None))
        self.assertEqual(get_meta_refresh(r3), (None, None))
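get_meta_refresh returns an (interval, url) pair, or (None, None) when no usable refresh tag is found. A standalone sketch mirroring r1 above:

    from scrapy.http import HtmlResponse
    from scrapy.utils.response import get_meta_refresh

    body = b'<head><meta http-equiv="refresh" content="5;url=http://example.org/next"></head>'
    resp = HtmlResponse("http://www.example.com", body=body)
    print(get_meta_refresh(resp))  # -> (5.0, 'http://example.org/next')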

Example 53

Project: scrapy-splash
Source File: test_request.py
def test_form_request_from_response():
    # Copied from scrapy tests (test_from_response_submit_not_first_clickable)
    def _buildresponse(body, **kwargs):
        kwargs.setdefault('body', body)
        kwargs.setdefault('url', 'http://example.com')
        kwargs.setdefault('encoding', 'utf-8')
        return HtmlResponse(**kwargs)
    response = _buildresponse(
        """<form action="get.php" method="GET">
        <input type="submit" name="clickable1" value="clicked1">
        <input type="hidden" name="one" value="1">
        <input type="hidden" name="two" value="3">
        <input type="submit" name="clickable2" value="clicked2">
        </form>""")
    req = SplashFormRequest.from_response(
        response, formdata={'two': '2'}, clickdata={'name': 'clickable2'})
    assert req.method == 'GET'
    assert req.meta['splash']['args']['url'] == req.url
    # cgi.parse_qs is gone from the stdlib; urllib.parse.parse_qs is the replacement
    fs = urllib.parse.parse_qs(req.url.partition('?')[2], keep_blank_values=True)
    assert fs['clickable2'] == ['clicked2']
    assert 'clickable1' not in fs
    assert fs['one'] == ['1']
    assert fs['two'] == ['2']
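Note the design here: SplashFormRequest.from_response mirrors the stock FormRequest.from_response API, so all form handling (clickdata selection, formdata merging) behaves as in plain Scrapy; only the transport differs, with the final form URL copied into meta['splash']['args']['url'] so Splash renders exactly the request the form would submit.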

Example 54

    def test_parse_company_filing_page(self):
        '''
        Parse the page that lists all filings of a company.

        Example:
        http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001288776&type=10-&dateb=&owner=exclude&count=40

        '''
        spider = EdgarSpider()
        spider._follow_links = True  # HACK

        body = '''
            <html><body>
            <a href="http://example.com/">Useless Link</a>
            <a href="/Archives/edgar/data/abc-index.htm">Link</a>
            <a href="/Archives/edgar/data/123-index.htm">Link</a>
            <a href="/Archives/edgar/data/123.htm">Useless Link</a>
            <a href="/Archives/edgar/data/123/abc-index.htm">Link</a>
            <a href="/Archives/edgar/data/123/456/abc123-index.htm">Link</a>
            <a href="/Archives/edgar/123/abc-index.htm">Uselss Link</a>
            <a href="/Archives/edgar/data/123/456/789/HELLO-index.htm">Link</a>
            <a href="/Archives/hello-index.html">Useless Link</a>
            </body></html>
        '''

        response = HtmlResponse('http://sec.gov/mock', body=body, encoding='utf-8')
        requests = spider.parse(response)
        urls = [r.url for r in requests]

        self.assertEqual(urls, [
            'http://sec.gov/Archives/edgar/data/abc-index.htm',
            'http://sec.gov/Archives/edgar/data/123-index.htm',
            'http://sec.gov/Archives/edgar/data/123/abc-index.htm',
            'http://sec.gov/Archives/edgar/data/123/456/abc123-index.htm',
            'http://sec.gov/Archives/edgar/data/123/456/789/HELLO-index.htm'
        ])
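The expected URLs suggest the spider's link rule only follows /Archives/edgar/data/ paths ending in -index.htm. A hypothetical reconstruction of that filter, inferred from the assertions rather than taken from EdgarSpider's source:

    import re

    # Inferred filter: must live under /Archives/edgar/data/ and end in -index.htm
    pattern = re.compile(r'/Archives/edgar/data/\S*-index\.htm$')
    assert pattern.search('/Archives/edgar/data/123/456/abc123-index.htm')
    assert not pattern.search('/Archives/edgar/123/abc-index.htm')   # wrong prefix
    assert not pattern.search('/Archives/edgar/data/123.htm')        # no -index suffix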

Example 55

    def test_parse_quarter_or_annual_page(self):
        '''
        Parse the page that lists the filings for one quarter or one year of a company.

        Example:
        http://www.sec.gov/Archives/edgar/data/1288776/000128877613000055/0001288776-13-000055-index.htm

        '''
        spider = EdgarSpider()
        spider._follow_links = True  # HACK

        body = '''
            <html><body>
            <a href="http://example.com">Useless Link</a>
            <a href="/Archives/edgar/data/123/abc-20130630.xml">Link</a>
            <a href="/Archives/edgar/123/456/abc123-20130630.xml">Useless Link</a>
            <a href="/Archives/edgar/data/456/789/hello-20130630.xml">Link</a>
            <a href="/Archives/edgar/123/456/hello-20130630.xml">Useless Link</a>
            <a href="/Archives/data/123/456/hello-20130630.xml">Useless Link</a>
            <a href="/Archives/edgar/data/123/456/hello-201306300.xml">Useless Link</a>
            <a href="/Archives/edgar/data/123/456/xyz-20130630.html">Link</a>
            </body></html>
        '''

        response = HtmlResponse('http://sec.gov/mock', body=body, encoding='utf-8')
        requests = spider.parse(response)
        urls = [r.url for r in requests]

        self.assertEqual(urls, [
            'http://sec.gov/Archives/edgar/data/123/abc-20130630.xml',
            'http://sec.gov/Archives/edgar/data/456/789/hello-20130630.xml'
        ])
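Here the filter is stricter: a link must live under /Archives/edgar/data/ and end in an eight-digit date stamp plus .xml. That is why hello-201306300.xml (nine digits) is dropped, and why xyz-20130630.html, despite its "Link" label in the fixture, is also excluded.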

Example 56

Project: docsearch-scraper
Source File: html_helper.py
def get_links(url, body):
    start_url = url
    if '.html' in start_url:
        start_url = start_url.rsplit('/', 1)[0]

    response = HtmlResponse(
        url=start_url,
        body=body,
        encoding='utf8'
    )

    link_extractor = LxmlLinkExtractor(
        allow=[start_url],
        deny=[],
        tags='a',
        attrs='href',
        canonicalize=True
    )

    return link_extractor.extract_links(response)
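A hypothetical call to the helper above, with inline HTML standing in for a fetched page body:

    html = b'<a href="http://docs.example.com/guide.html">Guide</a>'
    for link in get_links('http://docs.example.com/index.html', html):
        print(link.url, link.text)   # Link objects from extract_links()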

Example 57

    def process_request(self, request, spider):

        if not spider.js_render:
            return None

        if spider.remove_get_params:
            o = urlparse(request.url)
            url_without_params = o.scheme + "://" + o.netloc + o.path
            request = request.replace(url=url_without_params)

        if request.url in self.seen:
            return None

        self.seen[request.url] = True

        print("Getting " + request.url + " from selenium")

        self.driver.get(unquote_plus(request.url))  # decode the URL, otherwise Firefox is not happy, e.g. /#%21/ => /#!/
        time.sleep(spider.js_wait)
        body = self.driver.page_source.encode('utf-8')
        url = self.driver.current_url

        return HtmlResponse(
            url=url,
            body=body,
            encoding='utf8'
        )
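Wrapping Selenium's page_source in an HtmlResponse lets the rest of the crawl treat JS-rendered pages like any other response. To activate a downloader middleware like this one, register it in the project settings; the module path below is illustrative:

    DOWNLOADER_MIDDLEWARES = {
        'myproject.middlewares.SeleniumMiddleware': 543,
    }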

Example 58

Project: ajax_crawler
Source File: common_spider.py
    def parse_multi_items(self, hxs, node, item, response, index, count):
        if node.restrict_xpaths:
            for child in node.children:
                if child.xpaths:
                    # strip the <<...>> markers; if a <<xpath&...>> pair is
                    # present, its first part is the restricting xpath
                    raw_xpath = '|'.join(node.restrict_xpaths)
                    restrict_xpath = raw_xpath.replace("<<", "").replace(">>", "")
                    m = re.search(r'<<(.+)&(.*)>>', raw_xpath)
                    if m:
                        restrict_xpath = m.group(1)
                    restrict_selectors = hxs.select(restrict_xpath)
                    # fetch multiple items from one page
                    if index is not None and len(restrict_selectors) > index and len(restrict_selectors) == count:
                        try:
                            # compatibility shim: on recent Scrapy the legacy
                            # XmlXPathSelector name is just Selector
                            XmlXPathSelector = Selector
                        except NameError:
                            pass
                        restrict_hxs = XmlXPathSelector(HtmlResponse(
                            response.url,
                            body=re.sub('[\n\r\t]+', '', restrict_selectors[index].extract()),
                            encoding='utf8'))
                        self.parse_item_xpaths(restrict_hxs, child.xpaths, item,
                                               response.url, child.name, True, False)
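
The core trick in this last example is re-wrapping one extracted fragment in a fresh HtmlResponse so subsequent XPaths run against that fragment alone. A minimal modern sketch of the same idea, with illustrative names and markup:

    from scrapy.http import HtmlResponse
    from scrapy.selector import Selector

    page = HtmlResponse(
        "http://example.com/list",
        body=b"<div class='item'><span>first</span></div>"
             b"<div class='item'><span>second</span></div>",
    )
    # Extract the second item's outer HTML and wrap it in its own response,
    # so //span matches only within that fragment.
    fragment = page.xpath("//div[@class='item']")[1].get()
    sub = Selector(HtmlResponse("http://example.com/list",
                                body=fragment.encode("utf-8")))
    print(sub.xpath("//span/text()").get())   # -> 'second'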