Here are examples of the Python API scrapy.linkextractors.regex.RegexLinkExtractor, taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.
3 Examples
3
Example 1
def test_extraction(self):
    """Check the links a default-configured RegexLinkExtractor returns.

    The extractor is built with no arguments and run against the
    fixture response stored on the test case (``self.response``); the
    result must equal the four expected ``Link`` objects, in order.
    """
    # Default arguments
    extractor = RegexLinkExtractor()
    found = extractor.extract_links(self.response)
    expected = [
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ]
    self.assertEqual(found, expected)
3
Example 2
Project: scrapy Source File: test_linkextractors_deprecated.py
def test_link_wrong_href(self):
    """Check that an anchor with a malformed href is left out of the result.

    The page holds three anchors; the second one's href contains a
    stray ``[``. The test expects only the first and third links to be
    returned.
    """
    html = """
<a href="http://example.org/item1.html">Item 1</a>
<a href="http://[example.org/item2.html">Item 2</a>
<a href="http://example.org/item3.html">Item 3</a>
"""
    page = HtmlResponse("http://example.org/index.html", body=html)
    extractor = RegexLinkExtractor()
    # Materialise the result as a list before comparing it.
    found = list(extractor.extract_links(page))
    expected = [
        Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
        Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
    ]
    self.assertEqual(found, expected)
3
Example 3
Project: scrapy Source File: test_linkextractors_deprecated.py
def test_html_base_href(self):
    """Check that a relative link is resolved against ``<base href>``.

    The response URL is ``http://a.com/`` but the document declares
    ``<base href="http://b.com/">``; the test expects the relative
    ``test.html`` anchor to come back as ``http://b.com/test.html``.
    """
    html = """
<html>
<head>
<base href="http://b.com/">
</head>
<body>
<a href="test.html"></a>
</body>
</html>
"""
    page = HtmlResponse("http://a.com/", body=html)
    extractor = RegexLinkExtractor()
    # Materialise the result as a list before comparing it.
    found = list(extractor.extract_links(page))
    expected = [
        Link(url='http://b.com/test.html', text=u'', nofollow=False),
    ]
    self.assertEqual(found, expected)