Here are examples of the Python API scrapy.utils.sitemap.sitemap_urls_from_robots, taken from open-source projects. By voting up, you can indicate which examples are most useful and appropriate.
2 Examples
4
Example 1
Project: scrapy Source File: test_utils_sitemap.py
def test_sitemap_urls_from_robots(self):
    """Check that only the ``Sitemap:`` entries of a robots.txt are extracted.

    The fixture mixes ``User-agent``, ``Disallow``, comment and ``Sitemap``
    lines; the extracted URLs must be the two sitemap URLs, in file order.
    """
    robots_txt = """User-agent: *
Disallow: /aff/
Disallow: /wl/
# Search and shopping refining
Disallow: /s*/*facet
Disallow: /s*/*tags
# Sitemap files
Sitemap: http://example.com/sitemap.xml
Sitemap: http://example.com/sitemap-product-index.xml
# Forums
Disallow: /forum/search/
Disallow: /forum/active/
"""
    extracted = list(sitemap_urls_from_robots(robots_txt))
    expected = [
        'http://example.com/sitemap.xml',
        'http://example.com/sitemap-product-index.xml',
    ]
    self.assertEqual(extracted, expected)
0
Example 2
Project: scrapy Source File: sitemap.py
def _parse_sitemap(self, response):
    """Turn a robots.txt or sitemap response into follow-up requests.

    * If the response URL ends with ``/robots.txt``, every sitemap URL
      extracted from the response text is requested, with this method as
      the callback.
    * Otherwise the sitemap body is obtained through
      ``self._get_sitemap_body``. When that returns ``None`` a warning is
      logged and nothing is yielded.
    * For a ``sitemapindex`` document, each location matched by any
      pattern in ``self._follow`` is requested, again with this method as
      the callback.
    * For a ``urlset`` document, each location is requested with the
      callback of the first ``(pattern, callback)`` pair in ``self._cbs``
      whose pattern matches; locations matching no pattern are skipped.

    :param response: the downloaded robots.txt or sitemap response.
    :return: a generator of ``Request`` objects.
    """
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.text):
            yield Request(url, callback=self._parse_sitemap)
        return

    body = self._get_sitemap_body(response)
    if body is None:
        logger.warning("Ignoring invalid sitemap: %(response)s",
                       {'response': response}, extra={'spider': self})
        return

    s = Sitemap(body)
    if s.type == 'sitemapindex':
        for loc in iterloc(s, self.sitemap_alternate_links):
            if any(x.search(loc) for x in self._follow):
                yield Request(loc, callback=self._parse_sitemap)
    elif s.type == 'urlset':
        # Pass the alternate-links flag here as well: previously it was
        # only forwarded for sitemap indexes, so the setting had no effect
        # on the <url> entries of a urlset.
        # NOTE(review): assumes iterloc's second argument behaves the same
        # as in the sitemapindex branch above -- confirm against iterloc.
        for loc in iterloc(s, self.sitemap_alternate_links):
            for r, c in self._cbs:
                if r.search(loc):
                    yield Request(loc, callback=c)
                    # Only the first matching rule handles a location.
                    break