Here are examples of the Python API `scrapy.log.ERROR`, taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.
8 Examples
3
Example 1
def spider_opened(self, spider):
    """Open the MySQL connection and load both whitelists when the spider starts.

    Side effects:
        - sets ``self.conn`` (MySQL connection)
        - sets ``self.custom_whitelist`` (rows of the ``whitelist`` table)
        - sets ``self.alexa_whitelist`` (lines of the bundled Alexa top-1k CSV)
    """
    self.conn = MySQLdb.connect(host=settings.MYSQL_HOST, db=settings.MYSQL_DB, user=settings.MYSQL_USER, passwd=settings.MYSQL_PASSWORD, charset='utf8', use_unicode=True)
    # FIX: the connection is stored on self, but the cursor was taken from
    # spider.conn, which is never assigned here — use self.conn consistently.
    cursor = self.conn.cursor()
    sql_str = "SELECT pattern from whitelist"
    cursor.execute(sql_str)
    self.custom_whitelist = cursor.fetchall()
    try:
        # The Alexa whitelist ships inside the package as a data resource.
        alexa_whitelist_file = pkgutil.get_data("malspider", "resources/alexa-1k-whitelist.csv").decode('ascii')
        self.alexa_whitelist = alexa_whitelist_file.splitlines()
    except Exception:
        # FIX: narrowed the bare except so SystemExit/KeyboardInterrupt are not
        # swallowed; loading the Alexa list stays best-effort.
        log.msg("Error loading alexa whitelist...", level=log.ERROR)
3
Example 2
def process_item(self, item, spider):
    """Save *item* to the store keyed by the spider's name and pass it on.

    Save errors are logged at ERROR level; the item is always returned so
    the rest of the pipeline keeps running.
    """
    db = DB[spider.name]
    try:
        db.save(dict(item))
    except Exception as e:  # FIX: `except Exception, e` is Python-2-only syntax
        log.msg(str(e), level=log.ERROR)
    return item
3
Example 3
def parse(self, response):
    """Parse a JSON member response into a MemberItem.

    Renames the ``id`` key to ``_id`` and drops ``status``; on any failure
    the error is logged at ERROR level and nothing is yielded.
    """
    res = json.loads(response.body)
    if not res:
        return
    try:
        res['_id'] = res['id']
        del res['id']
        del res['status']
        item = MemberItem(res)
    except Exception as e:  # FIX: `except Exception, e` is Python-2-only syntax
        self.log(str(e), level=log.ERROR)
        return
    yield item
3
Example 4
def parse(self, response):
    """Parse a JSON list of replies into ReplyItems.

    Renames ``id`` to ``_id`` and derives ``topic_id`` from the request URL's
    query string. The first failing entry aborts the whole page: the URL is
    logged at ERROR, the exception at DEBUG, and parsing stops.
    """
    res = json.loads(response.body)
    if not res or not isinstance(res, list):
        return
    for r in res:
        try:
            r['_id'] = r['id']
            del r['id']
            # FIX: raw string for the regex pattern (avoids the invalid
            # escape '\=' in a plain string literal; behavior is unchanged).
            r['topic_id'] = int(re.search(r'\=(\d+)', response.url).group(1))
            item = ReplyItem(r)
        except Exception as e:  # FIX: `except Exception, e` is Python-2-only syntax
            self.log("%s"% response.url, level=log.ERROR)
            self.log(str(e), level=log.DEBUG)
            return
        yield item
3
Example 5
def parse(self, response):
    """Parse a JSON topic response (a one-element list) into a TopicItem.

    Takes the first element, renames ``id`` to ``_id``; on any failure the
    error is logged at ERROR level and nothing is yielded.
    """
    res = json.loads(response.body)
    if not res or not isinstance(res, list):
        return
    try:
        res = res[0]
        res['_id'] = res['id']
        del res['id']
        item = TopicItem(res)
    except Exception as e:  # FIX: `except Exception, e` is Python-2-only syntax
        self.log(str(e), level=log.ERROR)
        return
    yield item
0
Example 6
def spider_closed(self, spider):
    """Close the database connection when the spider finishes (best-effort)."""
    try:
        self.conn.close()
    except Exception:
        # FIX: `los.msg` was a typo for `log.msg` (it would have raised
        # NameError inside the handler); also narrowed the bare except.
        log.msg("Could not close database connection", level=log.ERROR)
0
Example 7
Project: MTQInfraScraper Source File: mtqinfra_spider.py
def parse_main_list(self, response):
    """Parse one page of the MTQ structures results table.

    For each data row, extracts the list-level fields into an MTQInfraItem,
    buffers it in self.items_buffer keyed by structure_id, and yields a
    Request for the structure's details page (parse_details). The pager row
    (a single-cell row) yields a Request for the next results page back into
    this callback until all records have been visited.

    NOTE(review): indentation reconstructed from the flattened source —
    confirm nesting against the original file.
    """
    try:
        # Parse the main table
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//table[@id="R267337656202362799"]//table[@summary="Report"]/tr')
        if not rows:
            # The table is located by a hard-coded APEX region id; if the site
            # regenerates ids this breaks, hence the hint in the message.
            self.log("Failed to extract results table from response for URL '{:s}'. Has 'id' changed?".format(response.request.url), level=log.ERROR)
            return
        for row in rows:
            cells = row.select('td')
            # Skip header
            if not cells:
                continue
            # Check if this is the last row. It contains only one cell and we must dig in to get page info
            if len(cells) == 1:
                total_num_records = int(hxs.select('//table[@id="R262940246607215751"]/tr[2]/td/table/tr[6]/td[2]/text()').extract()[0])
                # Pager text looks like "first - last"; split on '-' for bounds.
                first_record_on_page = int(cells[0].select('//span[@class="fielddata"]/text()').extract()[0].split('-')[0].strip())
                last_record_on_page = int(cells[0].select('//span[@class="fielddata"]/text()').extract()[0].split('-')[1].strip())
                self.log("Scraping details for records {:d} to {:d} of {:d} [{:.2f}% done].".format(first_record_on_page,
                    last_record_on_page, total_num_records, float(last_record_on_page)/float(total_num_records)*100), level=log.INFO)
                # DEBUG: Switch check if you only want to process a certain number of records (e.g. 45)
                #if last_record_on_page < 45:
                if last_record_on_page < total_num_records:
                    page_links = cells[0].select('//a[@class="fielddata"]/@href').extract()
                    if len(page_links) == 1:
                        # On first page
                        next_page_href = page_links[0]
                    else:
                        # Middle pages have prev+next links; next is the second.
                        next_page_href = page_links[1]
                    # Request to scrape next page
                    yield Request(url=response.request.url.split('?')[0]+'?'+next_page_href.split('?')[1], callback=self.parse_main_list)
                    continue
                else:
                    # Nothing more to do
                    break
            # Cell 1: Record # + Record HREF
            record_no = cells[0].select('a/text()').extract()[0].strip()
            record_relative_href = cells[0].select('a/@href').extract()[0]
            record_href = response.request.url.split('?')[0]+'?'+record_relative_href.split('?')[1]
            # structure_id is the trailing ":<digits>" of the record URL.
            structure_id = re.sub(ur"^.+:([0-9]+)$", ur'\1', record_href)
            # Cell 2: Name
            structure_name = "".join(cells[1].select('.//text()').extract()).strip()
            # Cell 3: Structure Type Image
            structure_type = cells[2].select('img/@alt').extract()[0]
            structure_type_img_relative_href = cells[2].select('img/@src').extract()[0]
            # Resolve relative image URL against the current page's directory.
            structure_type_img_href = re.sub(r'/[^/]*$', r'/', response.request.url) + structure_type_img_relative_href
            # Cell 4: Combined Territorial Direction + Municipality
            territorial_direction = "".join(cells[3].select('b//text()').extract()).strip()
            # NOTE: Municipality taken from details page as it was easier to parse.
            # Cell 5: Road
            road = "".join(cells[4].select('.//text()').extract()).strip()
            # Cell 6: Obstacle
            obstacle = "".join(cells[5].select('.//text()').extract()).strip()
            # Cell 7: GCI (General Condition Index)
            gci = cells[6].select('nobr/text()').extract()[0].strip()
            # Cell 8: AI (Accessibility Index)
            # Defaults to "no_restriction" as most records will have this code.
            ai_code = 'no_restriction'
            if cells[7].select('nobr/img/@alt'):
                ai_desc = cells[7].select('nobr/img/@alt').extract()[0]
                ai_img_relative_href = cells[7].select('nobr/img/@src').extract()[0]
                ai_img_href = re.sub(r'/[^/]*$', r'/', response.request.url) + ai_img_relative_href
            else:
                # If no image found for AI, then code = not available
                ai_code = 'na'
                if cells[7].select('nobr/text()'):
                    # Some text was available, use it
                    ai_desc = cells[7].select('nobr/text()').extract()[0]
                else:
                    ai_desc = "N/D"
                # Use our own Gray trafic light hosted on CloudApp
                ai_img_href = "http://cl.ly/2r2A060b1g0N0l3f1y3L/feugris.png"
            # Set ai_code according to description if applicable
            if re.search(ur'certaines', ai_desc, re.I):
                ai_code = 'restricted'
            elif re.search(ur'fermée', ai_desc, re.I):
                ai_code = 'closed'
            # Cell 9: Location HREF
            onclick = cells[8].select('a/@onclick').extract()[0]
            # The map URL is embedded in a javascript:pop_url('...') handler.
            location_href = re.sub(ur"^javascript:pop_url\('(.+)'\);$", ur'\1', onclick)
            # Cell 10: Planned Intervention
            planned_intervention = "".join(cells[9].select('.//text()').extract()).strip()
            # Cell 11: Report (yes/no image only) (SKIP)
            item = MTQInfraItem()
            item['record_no'] = record_no # Fiche/Nº
            item['record_href'] = record_href # Fiche/Nº
            item['structure_id'] = structure_id # (determined from record_href)
            item['structure_name'] = structure_name # Nom
            item['structure_type'] = structure_type # Type
            item['structure_type_img_href'] = structure_type_img_href # Type
            item['territorial_direction'] = territorial_direction # Direction territoriale
            item['road'] = road # Route
            item['obstacle'] = obstacle # Obstacle
            item['gci'] = gci # Indice de condition générale
            item['ai_desc'] = ai_desc # Indice d'accessibilité
            item['ai_img_href'] = ai_img_href # Indice d'accessibilité
            item['ai_code'] = ai_code # (determined from ai_desc)
            item['location_href'] = location_href # Diffusion des données spatiales
            item['planned_intervention'] = planned_intervention # Intervention planifiée
            # Buffer the partial item; parse_details completes and emits it.
            self.items_buffer[structure_id] = item
            # Request to scrape details
            yield Request(url=record_href, callback=self.parse_details)
    except Exception as e:
        # Something went wrong parsing this page. Log URL so we can determine which one.
        self.log("Parsing failed for URL '{:s}'".format(response.request.url), level=log.ERROR)
        raise # Re-raise exception
0
Example 8
def parse_details(self, response):
    """Parse a structure details page and complete its buffered item.

    Retrieves the partial MTQInfraItem stashed in self.items_buffer (keyed by
    the structure id taken from the URL), fills in the detail-only fields,
    removes the buffer entry, and returns the finished item. Parse failures
    are logged with the structure id and re-raised.

    NOTE(review): indentation reconstructed from the flattened source —
    confirm nesting against the original file.
    """
    # Parse the details of each structure
    try:
        # Extract structure ID from URL
        structure_id = response.request.url.split(':')[-1]
        hxs = HtmlXPathSelector(response)
        road_class = "".join(hxs.select('//table[@id="R260791846806817377"]/tr[2]/td/table[1]/tr[7]/td//text()').extract()).strip()
        municipality = "".join(hxs.select('//table[@id="R260791846806817377"]/tr[2]/td/table[1]/tr[9]/td//text()').extract()).strip()
        rcm = "".join(hxs.select('//table[@id="R260791846806817377"]/tr[2]/td/table[1]/tr[10]/td//text()').extract()).strip()
        # Coordinates use a decimal comma on the page; normalize to a dot.
        latitude_text = hxs.select('//table[@id="R260791846806817377"]/tr[2]/td/table[2]/tr[2]/td[1]/text()').extract()[0].strip()
        latitude = float(latitude_text.replace(",", "."))
        longitude_text = hxs.select('//table[@id="R260791846806817377"]/tr[2]/td/table[2]/tr[2]/td[2]/text()').extract()[0].strip()
        longitude = float(longitude_text.replace(",", "."))
        construction_year = hxs.select('//table[@id="R260791846806817377"]/tr[2]/td/table[4]/tr[2]/td/text()').extract()[0].strip()
        # Picture is not always available
        picture_node = hxs.select("//img[contains(@src,'%s')]/@src" % structure_id)
        if picture_node:
            picture_href = picture_node.extract()[0]
        else:
            picture_href = ""
        # Collapse runs of whitespace in the inspection-date cells.
        last_general_inspection_date = re.sub('\s+', ' ', "".join(hxs.select('//table[@id="R260791247138817374"]/tr[2]/td/table/tr[2]/td//text()').extract()).strip())
        next_general_inspection_date = re.sub('\s+', ' ', "".join(hxs.select('//table[@id="R260791247138817374"]/tr[2]/td/table/tr[3]/td//text()').extract()).strip())
        # The next fields can be missing if they do not apply
        average_daily_flow_of_vehicles_node = hxs.select('//table[@id="R260791442281817375"]/tr[2]/td/table[1]/tr[2]/td[1]/text()')
        if average_daily_flow_of_vehicles_node:
            # NOTE: Large number have spaces in them. Remove them.
            average_daily_flow_of_vehicles = average_daily_flow_of_vehicles_node.extract()[0].strip().replace(' ','')
        else:
            average_daily_flow_of_vehicles = ""
        percent_trucks_node = hxs.select('//table[@id="R260791442281817375"]/tr[2]/td/table[1]/tr[2]/td[2]/text()')
        if percent_trucks_node:
            percent_trucks = percent_trucks_node.extract()[0].strip().replace('%','')
        else:
            percent_trucks = ""
        num_lanes_node = hxs.select('//table[@id="R260791442281817375"]/tr[2]/td/table[2]/tr[2]/td/text()')
        if num_lanes_node:
            num_lanes = num_lanes_node.extract()[0].strip()
        else:
            num_lanes = ""
        inspection_report_node = hxs.select('//table[@id="R268966050187887822"]/tr[2]/td/table[1]/tr[2]/td/a/@href')
        if inspection_report_node:
            # Report links are site-relative; prefix the MTQ host.
            inspection_report_href = 'http://www.mtq.gouv.qc.ca' + inspection_report_node.extract()[0]
        else:
            inspection_report_href = ""
        limitation_text_node = hxs.select('//table[@id="R297755049131147236"]/tr[2]/td/table[1]/tr[2]/td/a/text()')
        if limitation_text_node:
            limitation = limitation_text_node.extract()[0].strip()
        else:
            limitation = ""
        limitation_node = hxs.select('//table[@id="R297755049131147236"]/tr[2]/td/table[1]/tr[2]/td/a/@href')
        if limitation_node:
            limitation_href = 'http://www.mtq.gouv.qc.ca' + limitation_node.extract()[0]
        else:
            limitation_href = ""
    except Exception as e:
        # Something went wrong parsing this details page. Log structure ID so we can determine which one.
        self.log("Details parsing failed for structure '{:s}'".format(structure_id), level=log.ERROR)
        raise
    # Merge the detail fields into the item buffered by parse_main_list.
    item = self.items_buffer[structure_id]
    item['road_class'] = road_class # Route: Classe route
    item['municipality'] = municipality # Municipalité
    item['rcm'] = rcm # MRC
    # @todo CEP
    # @todo Obstacle: Type de voie
    # @todo Obstacle: Classe route
    item['latitude'] = latitude # Latitude
    item['longitude'] = longitude # Longitude
    # @todo Longueur totale
    # @todo Longueur tablier
    # @todo Largeur hors tout
    # @todo Largeur carrossable
    # @todo Superficie tablier
    item['construction_year'] = construction_year # Année: Construction
    item['picture_href'] = picture_href
    item['last_general_inspection_date'] = last_general_inspection_date # Dernière inspection générale
    item['next_general_inspection_date'] = next_general_inspection_date # Prochaine inspection générale
    item['inspection_report_href'] = inspection_report_href # Rapport(s) d'inspection
    item['average_daily_flow_of_vehicles'] = average_daily_flow_of_vehicles # DJMA
    item['percent_trucks'] = percent_trucks # % camion
    item['num_lanes'] = num_lanes # Nombre de voies
    item['limitation'] = limitation # Limitation
    item['limitation_href'] = limitation_href # Limitation
    # Done with this structure; free the buffer entry and emit the item.
    del self.items_buffer[structure_id]
    return item