Here are examples of the Python API scrapy.utils.python.unique_list taken from open source projects.
5 Examples
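Before the examples, a quick sketch of what the helper does: it deduplicates a list while preserving order, with an optional key callable selecting the value used for the uniqueness check. Note that in the Scrapy codebase the underlying function is scrapy.utils.python.unique; the link extractors import it under the unique_list alias used throughout these examples. The sample values below are invented for illustration.

from scrapy.utils.python import unique as unique_list  # unique_list is the import alias

urls = ["http://a.example/", "http://b.example/", "http://a.example/"]
print(unique_list(urls))
# ['http://a.example/', 'http://b.example/']

# key= picks the value compared for uniqueness, as in the
# link-extractor examples below (key=lambda link: link.url):
pairs = [("a", 1), ("a", 2), ("b", 3)]
print(unique_list(pairs, key=lambda p: p[0]))
# [('a', 1), ('b', 3)]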
Example 1
Project: scrapy, Source File: lxmlhtml.py
def extract_links(self, response):
    base_url = get_base_url(response)
    if self.restrict_xpaths:
        # restrict extraction to the subtrees matched by restrict_xpaths
        docs = [subdoc
                for x in self.restrict_xpaths
                for subdoc in response.xpath(x)]
    else:
        docs = [response.selector]
    all_links = []
    for doc in docs:
        links = self._extract_links(doc, response.url, response.encoding, base_url)
        all_links.extend(self._process_links(links))
    # drop duplicate Link objects while preserving discovery order
    return unique_list(all_links)
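A hedged usage sketch for Example 1 (LxmlLinkExtractor and HtmlResponse are real Scrapy classes; the response body and URL are made up): duplicate anchors collapse to a single Link because extract_links returns unique_list(all_links).

from scrapy.http import HtmlResponse
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

body = b'<a href="/a">one</a> <a href="/a">again</a> <a href="/b">two</a>'
response = HtmlResponse("http://example.com/", body=body, encoding="utf-8")

extractor = LxmlLinkExtractor()
for link in extractor.extract_links(response):
    print(link.url)
# http://example.com/a   <- printed once despite two anchors
# http://example.com/b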
Example 2
def _process_links(self, links):
    """ Normalize and filter extracted links

    The subclass should override it if necessary
    """
    links = unique_list(links, key=lambda link: link.url) if self.unique else links
    return links
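The docstring invites subclasses to override _process_links. A minimal sketch of that hook (the subclass name and the lower-casing rule are hypothetical): normalize URLs first, then delegate so the unique_list pass treats case-variant duplicates as one link.

from scrapy.linkextractors import LinkExtractor

class NormalizingLinkExtractor(LinkExtractor):  # hypothetical subclass
    def _process_links(self, links):
        for link in links:
            link.url = link.url.lower()  # normalize before dedup
        return super()._process_links(links)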
Example 3
Project: scrapy, Source File: htmlparser.py
def _extract_links(self, response_text, response_url, response_encoding):
    self.reset()
    self.feed(response_text)
    self.close()

    # dedupe by URL if requested (note: this is Python 2 era code, hence
    # the unicode type and the explicit encode/decode calls)
    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

    ret = []
    base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
    for link in links:
        if isinstance(link.url, unicode):
            link.url = link.url.encode(response_encoding)
        try:
            link.url = urljoin(base_url, link.url)
        except ValueError:
            continue
        link.url = safe_url_string(link.url, response_encoding)
        link.text = link.text.decode(response_encoding)
        ret.append(link)

    return ret
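Examples 2 through 5 all pass key=lambda link: link.url, so two Link objects with the same URL but different anchor text collapse to the first one seen. A standalone illustration (the URLs are invented; scrapy.link.Link is the real class):

from scrapy.link import Link
from scrapy.utils.python import unique as unique_list

links = [Link("http://example.com/a", text="first"),
         Link("http://example.com/a", text="second"),
         Link("http://example.com/b", text="third")]

print([link.text for link in unique_list(links, key=lambda link: link.url)])
# ['first', 'third']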
Example 4
Project: scrapy, Source File: lxmlhtml.py
def _deduplicate_if_needed(self, links):
    if self.unique:
        return unique_list(links, key=lambda link: link.url)
    return links
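For reference, the order-preserving dedup that unique_list performs can be sketched in a few lines. This is the standard seen-set idiom, not Scrapy's actual source:

def unique_list_sketch(items, key=lambda x: x):
    # Keep the first item seen for each key, preserving input order.
    seen = set()
    result = []
    for item in items:
        k = key(item)
        if k in seen:
            continue
        seen.add(k)
        result.append(item)
    return result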
Example 5
def _extract_links(self, selector, response_url, response_encoding, base_url):
    '''
    Pretty much the same function, just added 'ignore' to url.encode
    '''
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector._root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            attr_val = urljoin(base_url, attr_val)
        except ValueError:
            continue  # skipping bogus links
        else:
            url = self.process_attr(attr_val)
            if url is None:
                continue
        if isinstance(url, unicode):
            # add 'ignore' to encoding errors
            url = url.encode(response_encoding, 'ignore')
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        link = Link(url, _collect_string_content(el) or u'',
                    nofollow=True if el.get('rel') == 'nofollow' else False)
        links.append(link)
    return unique_list(links, key=lambda link: link.url) \
        if self.unique else links
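The 'ignore' tweak that Example 5 adds matters when a URL contains characters that the response encoding cannot represent; without it, .encode() raises. An illustration (shown in Python 3 syntax for brevity, though the example itself is Python 2 code):

url = u"http://example.com/\u20acpage"   # contains a euro sign
print(url.encode("latin-1", "ignore"))   # b'http://example.com/page'
# url.encode("latin-1") alone would raise UnicodeEncodeError here.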