Here are examples of the Python API scrapy.Request taken from open-source projects. By voting up you can indicate which examples are most useful and appropriate.
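Before the examples themselves, here is a minimal, self-contained sketch of the constructor as these projects use it. The spider name and URL are placeholders, not taken from any project below; url, callback, meta, headers, and dont_filter are the keyword arguments that recur throughout the examples.

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'  # placeholder spider

    def start_requests(self):
        yield scrapy.Request(
            url='http://example.com',          # placeholder URL
            callback=self.parse,               # invoked with the downloaded response
            meta={'note': 'reaches the callback as response.meta'},
            dont_filter=True,                  # skip the duplicate-request filter
        )

    def parse(self, response):
        self.logger.info('got %s', response.url)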
837 Examples
5 votes
Source: test_response_required_logic.py (BSD 3-Clause "New" or "Revised" License, from scrapinghub)
def test_get_callback():
    spider = MySpider()
    req = scrapy.Request("http://example.com")
    assert get_callback(req, spider) == spider.parse

    req = scrapy.Request("http://example.com", spider.parse2)
    assert get_callback(req, spider) == spider.parse2

    def cb(response):
        pass

    req = scrapy.Request("http://example.com", cb)
    assert get_callback(req, spider) == cb

def test_is_provider_using_response():
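The helper get_callback here appears to be defined in the test module itself, not in Scrapy; the test exercises the fact that a Request built without an explicit callback is routed to the spider's parse method. A minimal sketch of that default, with placeholder names:

import scrapy

class DefaultCallbackSpider(scrapy.Spider):
    name = 'default-callback'  # placeholder spider
    start_urls = ['http://example.com']

    def parse(self, response):
        # Requests created without callback= land here by default
        self.logger.info('parsed %s', response.url)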
3 votes
Source: country.py (Apache License 2.0, from 1012598167)
def start_requests(self):
    path = 'https://en.wikipedia.org/wiki/List_of_company_name_etymologies'  # Wikipedia's list of company name etymologies; no regional or similar restrictions, very complete
    yield scrapy.Request(url=path, callback=self.parse)  # send the request with parse as the callback
    # meta={'country_name': query}

def parse(self, response):
3 votes
Source: label.py (Apache License 2.0, from 1012598167)
def start_requests(self):
    with open(r'wiki_country/querys/input.json', 'r') as f:
        querys = json.load(f)  # load the countries
    path = 'http://en.wikipedia.org'
    for query in querys['query']:
        url = path + query
        print(url)
        yield scrapy.Request(url=url, callback=self.parse)  # send the request with parse as the callback

def parse(self, response):
3 votes
Source: wiki.py (Apache License 2.0, from 1012598167)
def start_requests(self):
    with open(r'wiki_country/querys/input.json', 'r') as f:
        querys = json.load(f)  # load the countries
        # print("###" + querys)
    path = 'http://en.wikipedia.org'
    for query in querys['query']:
        url = path + '_'.join(query.split(' '))  # e.g. for "United States", replace spaces with _
        print(url)
        yield scrapy.Request(url=url, callback=self.parse)  # send the request with parse as the callback
        # meta={'country_name': query}

def parse(self, response):
3 votes
Source: country.py (Apache License 2.0, from 1012598167)
def start_requests(self):
    path = 'https://www.worldometers.info/geography/alphabetical-list-of-countries/'
    # A better source would be https://en.wikipedia.org/wiki/List_of_country-name_etymologies,
    # which could also be used, but it doesn't matter; the corresponding countries are the same
    yield scrapy.Request(url=path, callback=self.parse)  # send the request with parse as the callback
    # meta={'country_name': query}

def parse(self, response):
3 votes
Source: label.py (Apache License 2.0, from 1012598167)
def start_requests(self):
    with open(r'wiki_country/querys/input.json', 'r') as f:
        querys = json.load(f)  # load the countries
    path = 'http://en.wikipedia.org/wiki/'
    for query in querys['query']:
        url = path + '_'.join(query.split(' '))  # e.g. for "United States", replace spaces with _
        print(url)
        yield scrapy.Request(url=url, callback=self.parse)  # send the request with parse as the callback

def parse(self, response):
3 votes
Source: wiki.py (Apache License 2.0, from 1012598167)
def start_requests(self):
    with open(r'wiki_country/querys/input.json', 'r') as f:
        querys = json.load(f)  # load the countries
        # print("###" + querys)
    path = 'http://en.wikipedia.org/wiki/'
    for query in querys['query']:
        url = path + '_'.join(query.split(' '))  # e.g. for "United States", replace spaces with _
        print(url)
        yield scrapy.Request(url=url, callback=self.parse)  # send the request with parse as the callback
        # meta={'country_name': query}

def parse(self, response):
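The three wiki spiders above build page URLs by plain string concatenation, which works for the country names in input.json but would break on titles containing characters that need percent-encoding. A hedged sketch of the same loop using urllib.parse.quote, assuming the same input.json layout as in the examples:

import json
import urllib.parse

import scrapy

def start_requests(self):
    with open(r'wiki_country/querys/input.json', 'r') as f:
        querys = json.load(f)
    for query in querys['query']:
        # spaces become underscores (Wikipedia's title convention),
        # then quote() percent-encodes anything else that is unsafe
        title = urllib.parse.quote('_'.join(query.split(' ')))
        yield scrapy.Request(url='http://en.wikipedia.org/wiki/' + title,
                             callback=self.parse)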
3 votes
Source: demo.py (MIT License, from 18839782321)
def start_requests(self):
    for url in self.start_urls:
        yield scrapy.Request(
            url=url + '?Nao=0',
            callback=self.parse,
            dont_filter=True,
            meta={'url': url}
        )

def parse(self, response):
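The meta dict attached above travels with the request and is exposed again as response.meta, which is how this spider's callbacks recover the original listing url. A sketch of the receiving side, using the same 'url' key:

def parse(self, response):
    # the dict passed as meta= when the Request was built
    original_url = response.meta.get('url')
    self.logger.info('%s was reached from %s', response.url, original_url)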
3 votes
Source: demo.py (MIT License, from 18839782321)
def parse_product(self, response):
    json_data = json.loads(str(
        re.findall(re.compile(r'digitalData.content=(.*?)thdAnalyticsEvent=', re.S),
                   response.text)[0]).strip().strip(';'))
    for data in json_data['product']:
        avail = data['productInfo']['availabilityStatus']['shipping']['sth']
        productid = data['productInfo']['sku']
        if avail == 'available':
            detail_url = f'https://www.homedepot.com/p/svcs/frontEndModel/{productid}?_={str(int(time.time()) * 1000)}'
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                dont_filter=True,
                meta={'url': response.meta.get('url')}
            )

def parse_detail(self, response):
3 votes
Source: sxs_search_jobs.py (MIT License, from 18839782321)
def start_requests(self):
    for u in self.start_urls:
        url = f'https://www.shixiseng.com/intern/{u}?pcm=pc_Company'
        yield scrapy.Request(
            url=url,
            callback=self.parse,
            dont_filter=True,
            meta={'handle_httpstatus_list': [302]}
        )

def parse(self, response):
3 votes
Source: zhaopin_company_detail.py (MIT License, from 18839782321)
def start_requests(self):
    for url in self.start_urls:
        yield scrapy.Request(
            url=url,
            callback=self.parse,
            dont_filter=True,
            meta={'url': url}
        )

def get_token(self, arg1):
3 votes
Source: zhaopin_company_detail.py (MIT License, from 18839782321)
def parse(self, response):
    try:
        arg1 = re.findall(re.compile(r"var arg1='(.*?)';", re.S), response.text)[0]
    except Exception:
        try:
            arg1 = re.search("arg1='([^']+)'", response.text).group(1)
        except Exception:
            arg1 = ''
    if arg1 != '':
        token = self.get_token(arg1)
        yield scrapy.Request(
            url=response.url,
            headers={"Cookie": f'acw_sc__v2={token}; '},
            callback=self.parse_detail,
            dont_filter=True
        )

def parse_detail(self, response):
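This example passes the anti-bot token through a raw Cookie header. Scrapy also accepts a cookies= keyword, which routes the value through its cookie middleware instead; a sketch of the same retry using it (token as computed above):

yield scrapy.Request(
    url=response.url,
    cookies={'acw_sc__v2': token},  # merged by the cookies middleware
    callback=self.parse_detail,
    dont_filter=True
)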
3 votes
Source: nist.py (MIT License, from aiqm)
def start_requests(self):
    start_url = urltemplate.format(min_weight, max_weight)
    yield scrapy.Request(
        url=start_url,
        callback=lambda x: self.parse_range(x, min_weight, max_weight))

def parse_range(self, response, from_, to):
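The lambda above is a compact way to forward extra arguments, but lambdas cannot be serialized if requests are queued to disk (e.g. with JOBDIR). Since Scrapy 1.7 the same thing can be done with cb_kwargs; a sketch assuming the same urltemplate, min_weight and max_weight:

def start_requests(self):
    start_url = urltemplate.format(min_weight, max_weight)
    yield scrapy.Request(
        url=start_url,
        callback=self.parse_range,
        # delivered to parse_range as keyword arguments
        cb_kwargs={'from_': min_weight, 'to': max_weight})

def parse_range(self, response, from_, to):
    ...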
3 votes
Source: get_contracts.py (GNU General Public License v3.0, from ajcerejeira)
def start_requests(self):
    for i in range(1, self.ncontracts, self.step):
        headers = {'Range': '{}-{}'.format(i, i + self.step - 1)}
        yield scrapy.Request(url=self.base_url, headers=headers,
                             dont_filter=True)

def parse(self, response):
3 votes
Source: restaurantreviewscraper.py (MIT License, from akshitvjain)
def parse(self, response):
    # yield restaurant information
    for restaurant in response.css('a.property_title'):
        self.restaurants_scraped += 1
        if self.restaurants_scraped > MAX_RESTAURANTS:
            return
        res_url = ('https://www.tripadvisor.com%s' %
                   restaurant.xpath('@href').extract_first())
        yield scrapy.Request(res_url, callback=self.parse_restaurant)
    # move to the next page of restaurants
    next_page = ('https://www.tripadvisor.com%s'
                 % response.css('a.nav.next.rndBtn.ui_button.primary.taLnk')
                 .xpath('@href').extract_first())
    print('NEXT PAGE: ' + next_page)
    if next_page:
        yield scrapy.Request(next_page, callback=self.parse)

def parse_restaurant(self, response):
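Both requests above splice relative hrefs onto the domain by hand. Scrapy's response.follow (available since 1.4) resolves relative URLs itself and accepts an anchor selector directly; a sketch of the same crawl loop with it, keeping the example's selectors:

def parse(self, response):
    for restaurant in response.css('a.property_title'):
        # follow() reads @href and joins it against response.url
        yield response.follow(restaurant, callback=self.parse_restaurant)
    next_page = response.css(
        'a.nav.next.rndBtn.ui_button.primary.taLnk::attr(href)').get()
    if next_page:
        yield response.follow(next_page, callback=self.parse)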
3 votes
Source: download_contracts.py (GNU Lesser General Public License v3.0, from anapaulagomes)
def parse(self, response):
    # TODO: parse other metadata so we can yield together with PDF URL
    links = rows.plugins.html.extract_links(response.body)
    for link in links:
        if not link.lower().endswith(".pdf"):
            continue
        url = urljoin(self.url, link)
        yield scrapy.Request(url=url, callback=self.save_pdf, meta={"url": url})

def save_pdf(self, response):
3 votes
Source: ip66.py (Apache License 2.0, from aox-lei)
def parse(self, response):
    link = LinkExtractor(restrict_css='ul.textlarge22', allow='areaindex')
    links = link.extract_links(response)
    for _link in links:
        # yield scrapy.Request('http://www.66ip.cn/areaindex_1/1.html', callback=self.parse_list)
        yield scrapy.Request(_link.url, callback=self.parse_list)

def start_requests(self):
3 votes
Source: hl7_spider.py (Apache License 2.0, from arkhn)
def start_requests(self):
    if not os.path.exists(self.saving_path):
        os.mkdir(self.saving_path)
    urls = [
        urllib.parse.urljoin(self.root_url, 'resourcelist.html'),
    ]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)
    urls = [
        urllib.parse.urljoin(self.root_url, 'datatypes.html'),
    ]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse_datatypes)

def parse_datatypes(self, response):
3 votes
Source: qqmusic.py (Apache License 2.0, from arvinljw)
def parse(self, response):
    playList = self.getDict(response.text, 'getPlaylist')
    self.total_size = playList['data']['sum']
    items = playList['data']['list']
    print('item->', items)
    for item in items:
        for result in self.parse_item(item):
            yield result
    self.start_size = self.end_size + 1
    self.end_size += PER_PAGE_SIZE
    next_page_url = self.url % (getRnd(), self.start_size, self.end_size)
    if next_page_url and self.start_size <= self.total_size:
        yield Request(next_page_url, headers=self.headers, callback=self.parse)

def parse_item(self, each):
3 votes
Source: qqmusic.py (Apache License 2.0, from arvinljw)
def parse_item(self, each):
    item = QQMusicItem()
    item['songSheet'] = each['dissname']
    item['authorName'] = each['creator']['name']
    item['playTimes'] = each['listennum']
    item['createTime'] = each['createtime']
    dissid = each['dissid']
    detail_url = DETAIL_URL % dissid
    if detail_url:
        self.detail_header['referer'] = 'https://y.qq.com/n/yqq/playsquare/%s.html' % dissid
        yield Request(detail_url, headers=self.detail_header, meta={'data': item}, callback=self.parse_detail)
    else:
        return item

def parse_detail(self, response):
3 votes
Source: scraper.py (GNU General Public License v3.0, from AugustKarlstedt)
def after_login(self, response):
    if b'The password or email is invalid.' in response.body:
        self.logger.error('Login failed!')
        return
    category_links = response.css('a.category-list-item__link')
    for category_link in category_links:
        category_href = response.urljoin(category_link.xpath('@href').extract_first())
        yield scrapy.Request(category_href, self.parse_high_level_category)

def parse_high_level_category(self, response):
3 votes
Source: scraper.py (GNU General Public License v3.0, from AugustKarlstedt)
def parse_all_blinks_in_category(self, response):
    books = response.css('a.letter-book-list__item')
    for book in books:
        book_href = response.urljoin(book.xpath('@href').extract_first())
        yield scrapy.Request(book_href, self.parse_book)

def parse_book(self, response):
3 votes
Source: scraper.py (GNU General Public License v3.0, from AugustKarlstedt)
def parse_book(self, response):
    item = Book()
    item['title'] = response.css('h1.book__header__title').extract()
    item['subtitle'] = response.css('h2.book__header__subtitle').extract()
    item['author'] = response.css('div.book__header__author').extract()
    item['read_time'] = response.css('.book__header__info .book__header__info-item-body').extract()
    item['synopsis'] = response.css('.book__tab-content[ref="synopsis"]').extract()
    item['who_should_read'] = response.css('.book__tab-content[ref="who_should_read"]').extract()
    item['about_the_author'] = response.css('.book__tab-content[ref="about_the_author"]').extract()
    reader_link = response.css('.button-greenV2[data-read-now="Read now"]')
    reader_href = response.urljoin(reader_link.xpath('@href').extract_first())
    request = scrapy.Request(reader_href, self.parse_reader)
    request.meta['item'] = item
    yield request

def parse_reader(self, response):
3 votes
Source: qiushibaike_spider.py (MIT License, from autofelix)
def start_requests(self):
    urls = [
        'https://www.qiushibaike.com/text/page/1/',
    ]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)

def parse(self, response):
3 votes
Source: stackoverflow-python-spider.py (MIT License, from autofelix)
def start_requests(self):
    urls = []
    _url = 'https://stackoverflow.com/questions/tagged/python?tab=votes&page={}&pagesize=15'
    for page in range(1, 84322):
        urls.append(_url.format(page))
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)

def parse(self, response):
3 votes
Source: bench.py (MIT License, from autofelix)
def start_requests(self):
    qargs = {'total': self.total, 'show': self.show}
    url = '{}?{}'.format(self.baseurl, urlencode(qargs, doseq=1))
    return [scrapy.Request(url, dont_filter=True)]

def parse(self, response):
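Unlike most examples on this page, this start_requests returns a list instead of yielding. Scrapy only requires an iterable of requests, so the two forms are interchangeable; the generator version of the same method would be:

def start_requests(self):
    qargs = {'total': self.total, 'show': self.show}
    url = '{}?{}'.format(self.baseurl, urlencode(qargs, doseq=1))
    yield scrapy.Request(url, dont_filter=True)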
3 votes
Source: gov_spider.py (Apache License 2.0, from bernard0047)
def start_requests(self):
    for url in self.start_urls:
        yield scrapy.Request(url=url, callback=self.parse)

def parse(self, response):
3 votes
Source: fifa_spider.py (Mozilla Public License 2.0, from BradleyGrantham)
def start_requests(self):
    urls = [
        "https://www.fifaindex.com/players/fifa19/",
        "https://www.fifaindex.com/players/fifa18/",
        "https://www.fifaindex.com/players/fifa17/",
        "https://www.fifaindex.com/players/fifa16/",
        "https://www.fifaindex.com/players/fifa15/",
        "https://www.fifaindex.com/players/fifa14/",
        "https://www.fifaindex.com/players/fifa13/",
    ]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)

def parse(self, response):
3 votes
Source: fifa_spider.py (Mozilla Public License 2.0, from BradleyGrantham)
def start_requests(self):
    urls = [
        "https://www.fifaindex.com/teams/fifa19/",
        "https://www.fifaindex.com/teams/fifa18/",
        "https://www.fifaindex.com/teams/fifa17/",
        "https://www.fifaindex.com/teams/fifa16/",
        "https://www.fifaindex.com/teams/fifa15/",
        "https://www.fifaindex.com/teams/fifa14/",
        "https://www.fifaindex.com/teams/fifa13/",
        "https://www.fifaindex.com/teams/fifa12/"
    ]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)

def parse(self, response):
3 votes
Source: fifa_spider.py (Mozilla Public License 2.0, from BradleyGrantham)
def start_requests(self):
    more_urls = [
        "https://www.betstudy.com/soccer-stats/c/germany/bundesliga/d/fixtures/",
        "http://www.betstudy.com/soccer-stats/c/england/premier-league/d/fixtures/"
    ]
    urls = [
        "https://www.betstudy.com/soccer-stats/c/france/ligue-1/d/fixtures/"
    ]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse_fixtures)

@staticmethod
3 votes
Source: pipelines.py (Apache License 2.0, from cat9)
def get_media_requests(self, item, info):
    if item['img_urls']:
        images = item['img_urls'].split(' || ')
        for image in images:
            yield scrapy.Request(image, priority=15)

def item_completed(self, results, item, info):
3 votes
Source: aliexpress.py (Apache License 2.0, from cat9)
def start_requests(self):
    self._wait_for_login()
    self.debug = self.settings.getbool("IS_DEBUG", False)
    self.max_page_count = self.settings.getint("MAX_PAGE_COUNT", 10)
    self.begin_category = self.settings.get('BEGIN_CATEGORY')
    self.end_category = self.settings.get('END_CATEGORY')
    print("start_requests, is Debug:", self.debug)
    for url in self.start_urls:
        yield scrapy.Request(url, cookies=self.my_cookies)

def parse(self, response):
3 votes
Source: weibo_spider.py (GNU General Public License v3.0, from CharesFang)
def start_requests(self):
    uid_list = self.get_uid_list(self.uid)
    for uid in uid_list:
        u_url = self._u_generator.gen_url(uid)
        yield Request(
            url=u_url, dont_filter=True, meta={'uid': uid}, callback=self._parse_profile, errback=self.parse_err
        )
        t_url = self._t_generator.gen_url(uid=uid, page=None)
        yield Request(
            url=t_url, dont_filter=True, meta={'uid': uid, 'last_page': 0},
            callback=self._parse_tweet, errback=self.parse_err
        )

def parse(self, response, **kwargs):
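Several of these weibo spiders register parse_err as an errback, which Scrapy calls with a twisted Failure when a request errors out instead of producing a response. A minimal sketch of such a handler (the name matches the example; the body is an assumption):

def parse_err(self, failure):
    # the failed Request object is attached to the Failure
    request = failure.request
    self.logger.error('request for uid %s failed: %r',
                      request.meta.get('uid'), failure.value)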
3 votes
Source: tweet_info_spider.py (GNU General Public License v3.0, from CharesFang)
def start_requests(self):
    """
    Generate crawling Requests from the designated uid.
    :return: Target Request obj.
    """
    uid_list = self.get_uid_list(self.uid)
    for uid in uid_list:
        url = self._t_generator.gen_url(uid=uid, page=None)
        yield Request(url=url, dont_filter=True, callback=self._parse_tweet, errback=self.parse_err,
                      meta={'uid': uid, 'last_page': 0})

def _parse_tweet(self, response, **kwargs):
3 votes
Source: user_info_spider.py (GNU General Public License v3.0, from CharesFang)
def start_requests(self):
    """
    Generate Request objs by target uid and target url generator
    """
    uid_list = self.get_uid_list(self.uid)
    for uid in uid_list:
        url = self._u_generator(uid)
        yield Request(url=url, dont_filter=True, callback=self._parse_profile, errback=self.parse_err,
                      meta={'uid': uid})

def _parse_profile(self, response):
3 votes
Source: crawler.py (MIT License, from Charlie-belmer)
def parse(self, response):
    for item in response.css('.SearchResult'):
        # all URLs
        # response.css('.SearchResult').xpath('a/@href').extract()
        # first url
        # response.css('.SearchResult').xpath('a/@href').extract()
        ## start here - all return as single item lists
        plugin_details = {}  # collect the fields for this search result
        plugin_details['name'] = item.css('.SearchResult-name::text').extract()
        plugin_details['url'] = item.xpath('a/@href').extract()
        plugin_details['icon_url'] = item.css('.SearchResult-icon').xpath('@src').extract()
        plugin_details['users'] = item.css('.SearchResult-users-text::text').extract()
        request = scrapy.Request(plugin_details['url'],
                                 callback=self.parse_page2)
        request.meta['details'] = plugin_details
        yield request

def parse_plugin(self, response):
3 votes
Source: clarin.py (MIT License, from chequeado)
def start_requests(self):
    urls = ["https://www.clarin.com/rss/politica/", "https://www.clarin.com/rss/economia/"]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse_seccion)

def parse_seccion(self, response):
3 votes
Source: clarin.py (MIT License, from chequeado)
def parse_seccion(self, response):
    feed = feedparser.parse(response.url)
    for entry in feed['entries']:
        request = scrapy.Request(url=entry['link'], callback=self.parse_noticia)
        request.meta['item'] = entry
        yield request

def parse_noticia(self, response):
3 votes
Source: diariodelfindelmundo.py (MIT License, from chequeado)
def start_requests(self):
    urls = [
        'http://www.eldiariodelfindelmundo.com/provinciales/',
        'http://www.eldiariodelfindelmundo.com/municipales/'
    ]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse_seccion)

def parse_seccion(self, response):
3 votes
Source: diariodelfindelmundo.py (MIT License, from chequeado)
def parse_seccion(self, response):
    noticias = set(response.xpath('//div[@class="contenedor_general_resultados"]//h4[@class="titulo_listado_resultados "]/a/@href').extract())
    for noticia in noticias:
        nota = response.urljoin(noticia)
        yield scrapy.Request(url=nota, callback=self.parse_noticia)

def parse_noticia(self, response):
3 votes
Source: diarioprensa.py (MIT License, from chequeado)
def start_requests(self):
    urls = [
        'http://www.diarioprensa.com.ar/category/politica/',
        'http://www.diarioprensa.com.ar/category/economia/'
    ]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse_seccion)

def parse_seccion(self, response):
3 votes
Source: diarioprensa.py (MIT License, from chequeado)
def parse_seccion(self, response):
    noticias = set(response.xpath('//main[@id="main"]/article/header/h2/a/@href').extract())
    for noticia in noticias:
        nota = response.urljoin(noticia)
        yield scrapy.Request(url=nota, callback=self.parse_noticia)

def parse_noticia(self, response):
3 votes
Source: diariorionegro.py (MIT License, from chequeado)
def start_requests(self):
    urls = [
        'http://www.rionegro.com.ar/region',
        # 'http://www.rionegro.com.ar/argentina'
    ]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse_seccion)

def parse_seccion(self, response):
3 votes
Source: diariorionegro.py (MIT License, from chequeado)
def parse_seccion(self, response):
    links = response.xpath('//a[contains(@href, "/region")]/@href')
    for link in links:
        entry_url = response.urljoin(link.extract())
        request = scrapy.Request(url=entry_url, callback=self.parse_noticia)
        yield request

def parse_noticia(self, response):
3 votes
Source: elchubut.py (MIT License, from chequeado)
def start_requests(self):
    urls = [
        'http://www.elchubut.com.ar/rss'
    ]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse_seccion)

def parse_seccion(self, response):
3 votes
Source: elchubut.py (MIT License, from chequeado)
def parse_seccion(self, response):
    feed = feedparser.parse(response.url)
    for entry in feed['entries']:
        if entry['category'] == 'Regionales':
            request = scrapy.Request(url=entry['link'], callback=self.parse_noticia)
            request.meta['item'] = entry
            yield request

def parse_noticia(self, response):
3 votes
Source: eldia.py (MIT License, from chequeado)
def start_requests(self):
    urls = [
        'http://www.eldia.com/seccion/politica-y-economia'
    ]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse_seccion)

def parse_seccion(self, response):
3 votes
Source: eldia.py (MIT License, from chequeado)
def parse_seccion(self, response):
    noticias = set(response.xpath('//div[@id="main_seccion"]/article//a/@href').extract())
    for noticia in noticias:
        nota = response.urljoin(noticia)
        yield scrapy.Request(url=nota, callback=self.parse_noticia)

def parse_noticia(self, response):
3 votes
Source: elindependiente.py (MIT License, from chequeado)
def start_requests(self):
    urls = [
        'http://www.elindependiente.com.ar/seccion.php?seccion=1',
        'http://www.elindependiente.com.ar/seccion.php?seccion=5'
    ]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse_seccion)

def parse_seccion(self, response):
3 votes
Source: elindependiente.py (MIT License, from chequeado)
def parse_seccion(self, response):
    seccion = response.url.split('/')[-1]
    noticias = set(response.xpath('//a/@href').extract())
    noticias = [BASE_URL + p for p in noticias if 'pagina.php' in p]
    for noticia in noticias:
        nota = response.urljoin(noticia)
        yield scrapy.Request(url=nota, callback=self.parse_noticia)

def parse_noticia(self, response):