Here are examples of the Python API scrapy.contrib.linkextractors.LinkExtractor.extract_links, taken from open-source projects. By voting up you can indicate which examples are most useful and appropriate.
4 Examples
3 votes
Example 1
Project: scrapy-stats Source File: quhua.py
def Layer01_Parse(self, response):
    """Parse the top-level (province) listing page.

    For each province link of the form ``tjyqhdmhcxhfdm/<year>/<NN>.html``
    this yields a ``Layer01_Item`` (year / name / 2-digit code) and a
    ``Request`` descending into :meth:`Layer02_Parse`.
    """
    for link in LinkExtractor(
        allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d\.html',)  # trailing comma: real tuple
    ).extract_links(response):
        url = link.url
        # Build a fresh item per link: the original mutated and re-yielded a
        # single shared item instance, which corrupts results once the item
        # pipeline processes items asynchronously.
        item = Layer01_Item()
        item['year'] = url[-12:-8]  # ".../<year>/NN.html" -> the 4-digit year
        item['name'] = link.text
        item['code'] = url[-7:-5]   # 2-digit province code from the filename
        yield item
        yield Request(url, callback=self.Layer02_Parse)
3 votes
Example 2
Project: scrapy-stats Source File: quhua.py
def Layer02_Parse(self, response):
    """Parse a province page: one ``Layer02_Item`` per city row, then
    recurse into each city page via :meth:`Layer03_Parse`.
    """
    table = response.xpath(
        '/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table'
    )[0].extract()
    # URL looks like ".../tjyqhdmhcxhfdm/<year>/NN.html"; slice off the
    # leading "dm/" of the match to keep the 4-digit year. Hoisted out of
    # the loop — it is loop-invariant.
    year = re.findall(r'dm/20\d\d', response.url)[0][3:]
    for code, name in re.findall(r'href="\d\d/(\d{4})\.html">([^\d]+?)</a>', table):
        # Fresh item per row: yielding one shared, mutated item instance is
        # unsafe once the pipeline runs asynchronously.
        item = Layer02_Item()
        item['year'] = year
        item['name'] = name
        item['code'] = code
        yield item
    for link in LinkExtractor(
        allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d/\d{4}\.html',)
    ).extract_links(response):
        # Only the URL matters here (the original also bound link.text, unused).
        yield Request(link.url, callback=self.Layer03_Parse)
3 votes
Example 3
Project: scrapy-stats Source File: quhua.py
def Layer03_Parse(self, response):
    """Parse a city page: one ``Layer03_Item`` per county row (6-digit
    codes), then recurse into each county page via :meth:`Layer04_Parse`.
    """
    table = response.xpath(
        '/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table'
    )[0].extract()
    # Loop-invariant: extract the 4-digit year from ".../dm/<year>" once.
    year = re.findall(r'dm/20\d\d', response.url)[0][3:]
    for code, name in re.findall(r'href="\d\d/(\d{6})\.html">([^\d]+?)</a>', table):
        # Fresh item per row — never mutate an already-yielded item.
        item = Layer03_Item()
        item['year'] = year
        item['name'] = name
        item['code'] = code
        yield item
    for link in LinkExtractor(
        allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d/\d\d/\d{6}\.html',)
    ).extract_links(response):
        # Only the URL is needed (the original also bound link.text, unused).
        yield Request(link.url, callback=self.Layer04_Parse)
3 votes
Example 4
Project: scrapy-stats Source File: quhua.py
def Layer04_Parse(self, response):
    """Parse a county page: one ``Layer04_Item`` per township row (9-digit
    codes), then recurse into each township page via ``Layer05_Parse``.
    """
    table = response.xpath(
        '/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table'
    )[0].extract()
    # Loop-invariant: extract the 4-digit year from ".../dm/<year>" once.
    year = re.findall(r'dm/20\d\d', response.url)[0][3:]
    # BUGFIX: the original pattern used an unescaped dot ("(\d{9}).html"),
    # which matches any character; escape it to match a literal ".html",
    # consistent with the sibling Layer0X_Parse methods.
    for code, name in re.findall(r'href="\d\d/(\d{9})\.html">([^\d]+?)</a>', table):
        # Fresh item per row — never mutate an already-yielded item.
        item = Layer04_Item()
        item['year'] = year
        item['name'] = name
        item['code'] = code
        yield item
    for link in LinkExtractor(
        allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d/\d\d/\d\d/\d{9}\.html',)
    ).extract_links(response):
        # Only the URL is needed (the original also bound link.text, unused).
        yield Request(link.url, callback=self.Layer05_Parse)