Here are examples of the Python API `scrapy.link.Link`, taken from open-source projects. By voting up, you can indicate which examples are most useful and appropriate.
45 Examples
3
Example 1
def handle_starttag(self, tag, attrs):
    """Track <base href> and open a new Link for every scanned tag/attr pair."""
    if tag == 'base':
        self.base_url = dict(attrs).get('href')
    if not self.scan_tag(tag):
        return
    for attr, value in attrs:
        if not self.scan_attr(attr):
            continue
        # One Link per matching attribute; anchor text is accumulated later
        # through self.current_link by the parser's data callbacks.
        new_link = Link(url=self.process_attr(value))
        self.links.append(new_link)
        self.current_link = new_link
3
Example 2
Project: scrapy Source File: regex.py
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    """Extract Link objects from raw response text with the regex-based parser."""
    def clean_text(text):
        # Remove tags and escape characters from the anchor text, then trim.
        return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()

    def clean_url(url):
        # A URL that cannot be cleaned/joined collapses to the empty string.
        try:
            return urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
        except ValueError:
            return ''

    if base_url is None:
        base_url = get_base_url(response_text, response_url, response_encoding)
    return [Link(clean_url(url).encode(response_encoding), clean_text(text))
            for url, _, text in linkre.findall(response_text)]
3
Example 3
Project: scrapy Source File: sgml.py
def unknown_starttag(self, tag, attrs):
    """SGML start-tag hook: track <base href> and collect links from scanned tags."""
    attrs_dict = dict(attrs)  # built once; the original rebuilt it per link
    if tag == 'base':
        self.base_url = attrs_dict.get('href')
    if not self.scan_tag(tag):
        return
    for attr, value in attrs:
        if not self.scan_attr(attr):
            continue
        url = self.process_value(value)
        if url is None:
            continue
        # rel="... nofollow ..." flags the link as not-to-be-followed.
        link = Link(url=url, nofollow=rel_has_nofollow(attrs_dict.get('rel')))
        self.links.append(link)
        self.current_link = link
3
Example 4
Project: scrapy Source File: test_link.py
def test_non_str_url_py2(self):
    """Non-native-str URLs: warn and encode on Python 2; bytes raise TypeError on Python 3."""
    if not six.PY2:
        with self.assertRaises(TypeError):
            Link(b"http://www.example.com/\xc2\xa3")
        return
    with warnings.catch_warnings(record=True) as caught:
        link = Link(u"http://www.example.com/\xa3")
    self.assertIsInstance(link.url, str)
    self.assertEqual(link.url, b'http://www.example.com/\xc2\xa3')
    assert len(caught) == 1, "warning not issued"
3
Example 5
Project: scrapy Source File: test_linkextractors.py
def test_extract_all_links(self):
    """With no filters configured, every link in the fixture page is returned."""
    extractor = self.extractor_cls()
    expected = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ]
    self.assertEqual(list(extractor.extract_links(self.response)), expected)
3
Example 6
Project: scrapy Source File: test_linkextractors.py
def test_extract_filter_allow(self):
    """allow= keeps only links whose URL matches the given pattern."""
    extractor = self.extractor_cls(allow=('sample', ))
    expected = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
    ]
    self.assertEqual(list(extractor.extract_links(self.response)), expected)
3
Example 7
Project: scrapy Source File: test_linkextractors.py
def test_extract_filter_allow_with_duplicates(self):
    """unique=False preserves repeated URLs (with their distinct anchor texts)."""
    extractor = self.extractor_cls(allow=('sample', ), unique=False)
    expected = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
    ]
    self.assertEqual(list(extractor.extract_links(self.response)), expected)
3
Example 8
Project: scrapy Source File: test_linkextractors.py
def test_extract_filter_allow_and_deny(self):
    """deny= is applied on top of allow=, dropping matching URLs."""
    extractor = self.extractor_cls(allow=('sample', ), deny=('3', ))
    expected = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
    ]
    self.assertEqual(list(extractor.extract_links(self.response)), expected)
3
Example 9
Project: scrapy Source File: test_linkextractors.py
def test_restrict_xpaths(self):
    """restrict_xpaths limits extraction to the selected subtree."""
    extractor = self.extractor_cls(restrict_xpaths=('//div[@id="subwrapper"]', ))
    expected = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
    ]
    self.assertEqual(list(extractor.extract_links(self.response)), expected)
3
Example 10
Project: scrapy Source File: test_linkextractors.py
def test_restrict_xpaths_encoding(self):
    """Test restrict_xpaths with encodings"""
    html = b"""<html><head><title>Page title<title>
<body><p><a href="item/12.html">Item 12</a></p>
<div class='links'>
<p><a href="/about.html">About us\xa3</a></p>
</div>
<div>
<p><a href="/nofollow.html">This shouldn't be followed</a></p>
</div>
</body></html>"""
    response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='windows-1252')
    extractor = self.extractor_cls(restrict_xpaths="//div[@class='links']")
    self.assertEqual(
        extractor.extract_links(response),
        [Link(url='http://example.org/about.html', text=u'About us\xa3')])
3
Example 11
Project: scrapy Source File: test_linkextractors.py
def test_restrict_xpaths_with_html_entities(self):
    """Entity-encoded characters in hrefs are decoded and percent-escaped."""
    html = b'<html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>'
    response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='iso8859-15')
    extracted = self.extractor_cls(restrict_xpaths='//p').extract_links(response)
    self.assertEqual(
        extracted,
        [Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC', text=u'text')])
3
Example 12
Project: scrapy Source File: test_linkextractors.py
def test_restrict_xpaths_concat_in_handle_data(self):
    """html entities cause SGMLParser to call handle_data hook twice"""
    body = b"""<html><body><div><a href="/foo">&gt;\xbe\xa9&lt;\xb6\xab</a></body></html>"""
    response = HtmlResponse("http://example.org", body=body, encoding='gb18030')
    extractor = self.extractor_cls(restrict_xpaths="//div")
    self.assertEqual(
        extractor.extract_links(response),
        [Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
              fragment='', nofollow=False)])
3
Example 13
Project: scrapy Source File: test_linkextractors.py
def test_restrict_css_and_restrict_xpaths_together(self):
    """restrict_css and restrict_xpaths combine: links from either region count."""
    extractor = self.extractor_cls(restrict_xpaths=('//div[@id="subwrapper"]', ),
                                   restrict_css=('#subwrapper + a', ))
    expected = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
    ]
    self.assertEqual(list(extractor.extract_links(self.response)), expected)
3
Example 14
Project: scrapy Source File: test_linkextractors.py
def test_area_tag_with_unicode_present(self):
    """Repeated extraction over a page with non-ASCII bytes stays stable."""
    body = b"""<html><body>\xbe\xa9<map><area href="http://example.org/foo" /></map></body></html>"""
    response = HtmlResponse("http://example.org", body=body, encoding='utf-8')
    extractor = self.extractor_cls()
    # Warm up three times on purpose: the final call must still return the
    # same result (guards against state leaking between extractions).
    for _ in range(3):
        extractor.extract_links(response)
    self.assertEqual(
        extractor.extract_links(response),
        [Link(url='http://example.org/foo', text=u'',
              fragment='', nofollow=False)])
3
Example 15
Project: scrapy Source File: test_linkextractors.py
def test_encoded_url(self):
    """Percent-encoded characters in the base URL survive relative-link joining."""
    body = b"""<html><body><div><a href="?page=2">BinB</a></body></html>"""
    response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
    extractor = self.extractor_cls()
    self.assertEqual(
        extractor.extract_links(response),
        [Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB', fragment='', nofollow=False)])
3
Example 16
Project: scrapy Source File: test_linkextractors.py
def test_encoded_url_in_restricted_xpath(self):
    """Same as test_encoded_url, but going through the restrict_xpaths code path."""
    body = b"""<html><body><div><a href="?page=2">BinB</a></body></html>"""
    response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
    extractor = self.extractor_cls(restrict_xpaths="//div")
    self.assertEqual(
        extractor.extract_links(response),
        [Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB', fragment='', nofollow=False)])
3
Example 17
Project: scrapy Source File: test_linkextractors.py
def test_ignored_extensions(self):
    """Default deny_extensions drops .jpg; overriding it flips which link survives."""
    # jpg is ignored by default
    html = b"""<a href="page.html">asd</a> and <a href="photo.jpg">"""
    response = HtmlResponse("http://example.org/", body=html)
    self.assertEqual(
        self.extractor_cls().extract_links(response),
        [Link(url='http://example.org/page.html', text=u'asd')])
    # override denied extensions
    self.assertEqual(
        self.extractor_cls(deny_extensions=['html']).extract_links(response),
        [Link(url='http://example.org/photo.jpg')])
3
Example 18
Project: scrapy Source File: test_linkextractors.py
def test_process_value(self):
    """process_value() lets the extractor pull real URLs out of javascript: hrefs."""
    html = b"""
<a href="javascript:goToPage('../other/page.html','photo','width=600,height=540,scrollbars'); return false">Link text</a>
<a href="/about.html">About us</a>
"""
    response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='windows-1252')

    def process_value(value):
        # Raw string: "\(" in a plain literal is an invalid escape sequence
        # (SyntaxWarning on modern Pythons, a SyntaxError since 3.12).
        m = re.search(r"javascript:goToPage\('(.*?)'", value)
        if m:
            return m.group(1)

    lx = self.extractor_cls(process_value=process_value)
    self.assertEqual(lx.extract_links(response),
                     [Link(url='http://example.org/other/page.html', text='Link text')])
3
Example 19
Project: scrapy Source File: test_linkextractors.py
def test_base_url_with_restrict_xpaths(self):
    """<base href> is honoured even when extraction is xpath-restricted."""
    html = b"""<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
<body><p><a href="item/12.html">Item 12</a></p>
</body></html>"""
    response = HtmlResponse("http://example.org/somepage/index.html", body=html)
    extractor = self.extractor_cls(restrict_xpaths="//p")
    self.assertEqual(
        extractor.extract_links(response),
        [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])
3
Example 20
Project: scrapy Source File: test_linkextractors.py
def test_link_wrong_href(self):
    """A malformed href (bad netloc) is skipped; valid neighbours still extract."""
    html = b"""
<a href="http://example.org/item1.html">Item 1</a>
<a href="http://[example.org/item2.html">Item 2</a>
<a href="http://example.org/item3.html">Item 3</a>
"""
    response = HtmlResponse("http://example.org/index.html", body=html)
    extractor = self.extractor_cls()
    expected = [
        Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
        Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
    ]
    self.assertEqual(list(extractor.extract_links(response)), expected)
3
Example 21
Project: scrapy Source File: test_linkextractors.py
def test_link_wrong_href(self):
    """A malformed href (bad netloc) is skipped; valid neighbours still extract.

    NOTE(review): this is byte-identical to the previous example (Example 20)
    on the source page — likely a scrape duplicate.
    """
    html = b"""
<a href="http://example.org/item1.html">Item 1</a>
<a href="http://[example.org/item2.html">Item 2</a>
<a href="http://example.org/item3.html">Item 3</a>
"""
    response = HtmlResponse("http://example.org/index.html", body=html)
    extractor = self.extractor_cls()
    expected = [
        Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
        Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
    ]
    self.assertEqual(list(extractor.extract_links(response)), expected)
3
Example 22
Project: scrapy Source File: test_linkextractors_deprecated.py
def test_basic(self):
    """BaseSgmlLinkExtractor defaults (tag=a, attr=href) resolve relative URLs."""
    html = """<html><head><title>Page title<title>
<body><p><a href="item/12.html">Item 12</a></p>
<p><a href="/about.html">About us</a></p>
<img src="/logo.png" alt="Company logo (not a link)" />
<p><a href="../othercat.html">Other category</a></p>
<p><a href="/">>></a></p>
<p><a href="/" /></p>
</body></html>"""
    response = HtmlResponse("http://example.org/somepage/index.html", body=html)
    extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
    expected = [
        Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
        Link(url='http://example.org/about.html', text='About us'),
        Link(url='http://example.org/othercat.html', text='Other category'),
        Link(url='http://example.org/', text='>>'),
        Link(url='http://example.org/', text=''),
    ]
    self.assertEqual(extractor.extract_links(response), expected)
3
Example 23
Project: scrapy Source File: test_linkextractors_deprecated.py
def test_link_text_wrong_encoding(self):
    """Bytes invalid for the declared encoding decode to the replacement char."""
    html = """<body><p><a href="item/12.html">Wrong: \xed</a></p></body></html>"""
    response = HtmlResponse("http://www.example.com", body=html, encoding='utf-8')
    extractor = BaseSgmlLinkExtractor()
    self.assertEqual(
        extractor.extract_links(response),
        [Link(url='http://www.example.com/item/12.html', text=u'Wrong: \ufffd')])
3
Example 24
Project: scrapy Source File: test_linkextractors_deprecated.py
def test_extraction(self):
    """HtmlParserLinkExtractor with default arguments against the fixture page."""
    # Default arguments
    extractor = HtmlParserLinkExtractor()
    expected = [
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ]
    self.assertEqual(extractor.extract_links(self.response), expected)
3
Example 25
Project: scrapy Source File: test_linkextractors_deprecated.py
def test_link_wrong_href(self):
    """HtmlParserLinkExtractor skips a malformed href but keeps valid siblings."""
    html = """
<a href="http://example.org/item1.html">Item 1</a>
<a href="http://[example.org/item2.html">Item 2</a>
<a href="http://example.org/item3.html">Item 3</a>
"""
    response = HtmlResponse("http://example.org/index.html", body=html)
    extractor = HtmlParserLinkExtractor()
    expected = [
        Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
        Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
    ]
    self.assertEqual(list(extractor.extract_links(response)), expected)
3
Example 26
Project: scrapy Source File: test_linkextractors_deprecated.py
def test_deny_extensions(self):
    """deny_extensions accepts a single string, not only an iterable."""
    html = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
    response = HtmlResponse("http://example.org/", body=html)
    extractor = SgmlLinkExtractor(deny_extensions="jpg")
    self.assertEqual(
        extractor.extract_links(response),
        [Link(url='http://example.org/page.html', text=u'asd')])
3
Example 27
Project: scrapy Source File: test_linkextractors_deprecated.py
def test_attrs_sgml(self):
    """attrs="href" ignores tags whose link-bearing attribute is named otherwise."""
    html = """<html><area href="sample1.html"></area>
<a ref="sample2.html">sample text 2</a></html>"""
    response = HtmlResponse("http://example.com/index.html", body=html)
    extractor = SgmlLinkExtractor(attrs="href")
    self.assertEqual(
        extractor.extract_links(response),
        [Link(url='http://example.com/sample1.html', text=u'')])
3
Example 28
Project: scrapy Source File: test_linkextractors_deprecated.py
def test_link_nofollow(self):
    """rel="nofollow" (alone or combined, e.g. "external nofollow") sets the flag."""
    html = """
<a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
<a href="about.html">About us</a>
<a href="http://google.com/something" rel="external nofollow">Something</a>
"""
    response = HtmlResponse("http://example.org/page.html", body=html)
    extractor = SgmlLinkExtractor()
    expected = [
        Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
        Link(url='http://example.org/about.html', text=u'About us', nofollow=False),
        Link(url='http://google.com/something', text=u'Something', nofollow=True),
    ]
    self.assertEqual(list(extractor.extract_links(response)), expected)
3
Example 29
Project: scrapy Source File: test_linkextractors_deprecated.py
def test_extraction(self):
    """RegexLinkExtractor with default arguments against the fixture page."""
    # Default arguments
    extractor = RegexLinkExtractor()
    expected = [
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ]
    self.assertEqual(extractor.extract_links(self.response), expected)
3
Example 30
Project: scrapy Source File: test_linkextractors_deprecated.py
def test_link_wrong_href(self):
    """RegexLinkExtractor skips a malformed href but keeps valid siblings."""
    html = """
<a href="http://example.org/item1.html">Item 1</a>
<a href="http://[example.org/item2.html">Item 2</a>
<a href="http://example.org/item3.html">Item 3</a>
"""
    response = HtmlResponse("http://example.org/index.html", body=html)
    extractor = RegexLinkExtractor()
    expected = [
        Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
        Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
    ]
    self.assertEqual(list(extractor.extract_links(response)), expected)
3
Example 31
Project: scrapy Source File: test_linkextractors_deprecated.py
def test_html_base_href(self):
    """<base href> overrides the response URL when resolving relative links."""
    html = """
<html>
<head>
<base href="http://b.com/">
</head>
<body>
<a href="test.html"></a>
</body>
</html>
"""
    response = HtmlResponse("http://a.com/", body=html)
    extractor = RegexLinkExtractor()
    self.assertEqual(
        list(extractor.extract_links(response)),
        [Link(url='http://b.com/test.html', text=u'', nofollow=False)])
0
Example 32
Project: scrapy Source File: lxmlhtml.py
def _extract_links(self, selector, response_url, response_encoding, base_url):
    """Walk the lxml tree under *selector* and build absolute Link objects."""
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector.root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            attr_val = urljoin(base_url, attr_val)
        except ValueError:
            # skipping bogus links
            continue
        url = self.process_attr(attr_val)
        if url is None:
            continue
        url = to_native_str(url, encoding=response_encoding)
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        links.append(Link(url, _collect_string_content(el) or u'',
                          nofollow=rel_has_nofollow(el.get('rel'))))
    return self._deduplicate_if_needed(links)
0
Example 33
Project: scrapy Source File: test_link.py
def test_eq_and_hash(self):
    """Links compare equal (and hash equal) iff url, text, fragment and nofollow all match."""
    plain = Link("http://www.example.com")
    plain_other = Link("http://www.example.com/other")
    plain_twin = Link("http://www.example.com")
    self._assert_same_links(plain, plain)
    self._assert_different_links(plain, plain_other)
    self._assert_same_links(plain, plain_twin)

    with_text = Link("http://www.example.com", text="test")
    with_other_text = Link("http://www.example.com", text="test2")
    with_text_twin = Link("http://www.example.com", text="test")
    self._assert_same_links(with_text, with_text)
    self._assert_different_links(with_text, with_other_text)
    self._assert_same_links(with_text, with_text_twin)

    full = Link("http://www.example.com", text="test", fragment='something', nofollow=False)
    full_twin = Link("http://www.example.com", text="test", fragment='something', nofollow=False)
    full_nofollow = Link("http://www.example.com", text="test", fragment='something', nofollow=True)
    full_other_fragment = Link("http://www.example.com", text="test", fragment='other', nofollow=False)
    self._assert_same_links(full, full_twin)
    self._assert_different_links(full, full_nofollow)
    self._assert_different_links(full, full_other_fragment)
0
Example 34
def test_repr(self):
    """repr() must round-trip through eval back to an equal Link."""
    original = Link("http://www.example.com", text="test", fragment='something', nofollow=True)
    round_tripped = eval(repr(original))
    self._assert_same_links(original, round_tripped)
0
Example 35
Project: scrapy Source File: test_linkextractors.py
def test_extract_filter_allowed_domains(self):
    """allow_domains keeps only links whose host matches the given domain."""
    extractor = self.extractor_cls(allow_domains=('google.com', ))
    self.assertEqual(
        list(extractor.extract_links(self.response)),
        [Link(url='http://www.google.com/something', text=u'')])
0
Example 36
Project: scrapy Source File: test_linkextractors.py
def test_extraction_using_single_values(self):
    '''Test the extractor's behaviour among different situations'''
    # Every filter argument may be a bare string instead of a sequence.
    sample_links = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
    ]
    google_link = [Link(url='http://www.google.com/something', text=u'')]

    extractor = self.extractor_cls(allow='sample')
    self.assertEqual(list(extractor.extract_links(self.response)), sample_links)

    extractor = self.extractor_cls(allow='sample', deny='3')
    self.assertEqual(list(extractor.extract_links(self.response)), sample_links[:2])

    extractor = self.extractor_cls(allow_domains='google.com')
    self.assertEqual(list(extractor.extract_links(self.response)), google_link)

    extractor = self.extractor_cls(deny_domains='example.com')
    self.assertEqual(list(extractor.extract_links(self.response)), google_link)
0
Example 37
Project: scrapy Source File: test_linkextractors.py
def test_nofollow(self):
    '''Test the extractor's behaviour for links with rel="nofollow"'''
    html = b"""<html><head><title>Page title<title>
<body>
<div class='links'>
<p><a href="/about.html">About us</a></p>
</div>
<div>
<p><a href="/follow.html">Follow this link</a></p>
</div>
<div>
<p><a href="/nofollow.html" rel="nofollow">Dont follow this one</a></p>
</div>
<div>
<p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
</div>
<div>
<p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
</div>
</body></html>"""
    response = HtmlResponse("http://example.org/somepage/index.html", body=html)
    extractor = self.extractor_cls()
    expected = [
        Link(url='http://example.org/about.html', text=u'About us'),
        Link(url='http://example.org/follow.html', text=u'Follow this link'),
        Link(url='http://example.org/nofollow.html', text=u'Dont follow this one', nofollow=True),
        Link(url='http://example.org/nofollow2.html', text=u'Choose to follow or not'),
        Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True),
    ]
    self.assertEqual(extractor.extract_links(response), expected)
0
Example 38
Project: scrapy Source File: test_linkextractors.py
def test_restrict_css(self):
    """restrict_css limits extraction to elements matching the CSS selector."""
    extractor = self.extractor_cls(restrict_css=('#subwrapper a',))
    self.assertEqual(
        extractor.extract_links(self.response),
        [Link(url='http://example.com/sample2.html', text=u'sample 2')])
0
Example 39
def test_attrs(self):
    """attrs= may be a single string, a tuple, or None (extracting nothing)."""
    href_links = [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ]
    extractor = self.extractor_cls(attrs="href")
    self.assertEqual(extractor.extract_links(self.response), href_links)

    # Adding src/img extends the result with the image URL, in document order.
    extractor = self.extractor_cls(attrs=("href","src"), tags=("a","area","img"), deny_extensions=())
    self.assertEqual(extractor.extract_links(self.response),
                     href_links[:2]
                     + [Link(url='http://example.com/sample2.jpg', text=u'')]
                     + href_links[2:])

    extractor = self.extractor_cls(attrs=None)
    self.assertEqual(extractor.extract_links(self.response), [])
0
Example 40
Project: scrapy Source File: test_linkextractors.py
def test_tags(self):
    """tags= controls which elements are scanned: None, default, single, or tuple."""
    html = b"""<html><area href="sample1.html"></area><a href="sample2.html">sample 2</a><img src="sample2.jpg"/></html>"""
    response = HtmlResponse("http://example.com/index.html", body=html)
    area_link = Link(url='http://example.com/sample1.html', text=u'')
    anchor_link = Link(url='http://example.com/sample2.html', text=u'sample 2')
    image_link = Link(url='http://example.com/sample2.jpg', text=u'')

    self.assertEqual(self.extractor_cls(tags=None).extract_links(response), [])
    self.assertEqual(self.extractor_cls().extract_links(response),
                     [area_link, anchor_link])
    self.assertEqual(self.extractor_cls(tags="area").extract_links(response),
                     [area_link])
    self.assertEqual(self.extractor_cls(tags="a").extract_links(response),
                     [anchor_link])
    self.assertEqual(
        self.extractor_cls(tags=("a","img"), attrs=("href", "src"),
                           deny_extensions=()).extract_links(response),
        [anchor_link, image_link])
0
Example 41
Project: scrapy Source File: test_linkextractors.py
def test_tags_attrs(self):
    """Custom tags/attrs (e.g. div[data-url]) work given as strings or tuples."""
    html = b"""
<html><body>
<div id="item1" data-url="get?id=1"><a href="#">Item 1</a></div>
<div id="item2" data-url="get?id=2"><a href="#">Item 2</a></div>
</body></html>
"""
    response = HtmlResponse("http://example.com/index.html", body=html)
    expected = [
        Link(url='http://example.com/get?id=1', text=u'Item 1', fragment='', nofollow=False),
        Link(url='http://example.com/get?id=2', text=u'Item 2', fragment='', nofollow=False),
    ]
    # Single-string and tuple forms must behave identically.
    self.assertEqual(
        self.extractor_cls(tags='div', attrs='data-url').extract_links(response),
        expected)
    self.assertEqual(
        self.extractor_cls(tags=('div',), attrs=('data-url',)).extract_links(response),
        expected)
0
Example 42
Project: scrapy Source File: test_linkextractors.py
def test_xhtml(self):
    """The same links come out whether the body is parsed as HTML or XML."""
    xhtml = b"""
<?xml version="1.0"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>XHTML docuement title</title>
</head>
<body>
<div class='links'>
<p><a href="/about.html">About us</a></p>
</div>
<div>
<p><a href="/follow.html">Follow this link</a></p>
</div>
<div>
<p><a href="/nofollow.html" rel="nofollow">Dont follow this one</a></p>
</div>
<div>
<p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
</div>
<div>
<p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
</div>
</body>
</html>
"""
    expected = [
        Link(url='http://example.com/about.html', text=u'About us', fragment='', nofollow=False),
        Link(url='http://example.com/follow.html', text=u'Follow this link', fragment='', nofollow=False),
        Link(url='http://example.com/nofollow.html', text=u'Dont follow this one', fragment='', nofollow=True),
        Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False),
        Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True),
    ]
    for response_cls in (HtmlResponse, XmlResponse):
        response = response_cls("http://example.com/index.xhtml", body=xhtml)
        self.assertEqual(self.extractor_cls().extract_links(response), expected)
0
Example 43
Project: scrapy Source File: test_linkextractors_deprecated.py
def test_base_url(self):
    """<base href> resolution: absolute URL, host-relative path, scheme-relative URL."""
    html = """<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
<body><p><a href="item/12.html">Item 12</a></p>
</body></html>"""
    response = HtmlResponse("http://example.org/somepage/index.html", body=html)
    extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
    self.assertEqual(
        extractor.extract_links(response),
        [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])
    # base url is an absolute path and relative to host
    html = """<html><head><title>Page title<title><base href="/" />
<body><p><a href="item/12.html">Item 12</a></p></body></html>"""
    response = HtmlResponse("https://example.org/somepage/index.html", body=html)
    self.assertEqual(
        extractor.extract_links(response),
        [Link(url='https://example.org/item/12.html', text='Item 12')])
    # base url has no scheme
    html = """<html><head><title>Page title<title><base href="//noschemedomain.com/path/to/" />
<body><p><a href="item/12.html">Item 12</a></p></body></html>"""
    response = HtmlResponse("https://example.org/somepage/index.html", body=html)
    self.assertEqual(
        extractor.extract_links(response),
        [Link(url='https://noschemedomain.com/path/to/item/12.html', text='Item 12')])
0
Example 44
Project: scrapy Source File: test_linkextractors_deprecated.py
def test_extraction_encoding(self):
    """URL percent-encoding follows the document encoding (header, sniffed, latin1)."""
    body = get_testdata('link_extractor', 'linkextractor_noenc.html')
    response_utf8 = HtmlResponse(url='http://example.com/utf8', body=body, headers={'Content-Type': ['text/html; charset=utf-8']})
    response_noenc = HtmlResponse(url='http://example.com/noenc', body=body)
    body = get_testdata('link_extractor', 'linkextractor_latin1.html')
    response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)
    extractor = BaseSgmlLinkExtractor()

    # Both the explicit-charset and sniffed-encoding responses give UTF-8 URLs.
    utf8_expected = [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')),
    ]
    self.assertEqual(extractor.extract_links(response_utf8), utf8_expected)
    self.assertEqual(extractor.extract_links(response_noenc), utf8_expected)

    # document encoding does not affect URL path component, only query part
    # >>> u'sample_ñ.html'.encode('utf8')
    # b'sample_\xc3\xb1.html'
    # >>> u"sample_á.html".encode('utf8')
    # b'sample_\xc3\xa1.html'
    # >>> u"sample_ö.html".encode('utf8')
    # b'sample_\xc3\xb6.html'
    # >>> u"£32".encode('latin1')
    # b'\xa332'
    # >>> u"µ".encode('latin1')
    # b'\xb5'
    self.assertEqual(extractor.extract_links(response_latin1), [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%C3%A1.html', text='sample \xe1 text'.decode('latin1')),
        Link(url='http://example.com/sample_%C3%B6.html?price=%A332&%B5=unit', text=''),
    ])
0
Example 45
Project: scrapy-cluster Source File: lxmlhtml.py
def _extract_links(self, selector, response_url, response_encoding, base_url):
    '''
    Pretty much the same function, just added 'ignore' to url.encode
    '''
    collected = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector._root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            attr_val = urljoin(base_url, attr_val)
        except ValueError:
            # skipping bogus links
            continue
        url = self.process_attr(attr_val)
        if url is None:
            continue
        if isinstance(url, unicode):
            # add 'ignore' to encoding errors
            url = url.encode(response_encoding, 'ignore')
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        # NOTE(review): unlike the upstream version, this only recognizes an
        # exact rel="nofollow", not multi-valued rels like "external nofollow".
        collected.append(Link(url, _collect_string_content(el) or u'',
                              nofollow=el.get('rel') == 'nofollow'))
    if self.unique:
        return unique_list(collected, key=lambda link: link.url)
    return collected