scrapy.Request

Here are examples of the Python API scrapy.Request taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.

837 Examples
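Before the per-project samples, here is a minimal sketch of the call pattern they all share: build a Request from a URL, point it at a callback, and optionally carry state in meta. The spider name and URL below are placeholders rather than code from any listed project; only keyword arguments that recur throughout the examples (callback, meta, dont_filter) are used.

import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        # One Request per seed URL; if callback is omitted, Scrapy calls self.parse.
        for url in ["https://example.com/page/1"]:
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        # Follow links, passing state to the next callback through meta.
        for href in response.css("a::attr(href)").extract():
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.parse_detail,
                meta={"referer": response.url},
            )

    def parse_detail(self, response):
        yield {"url": response.url, "referer": response.meta.get("referer")}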

5 Source : test_response_required_logic.py
with BSD 3-Clause "New" or "Revised" License
from scrapinghub

def test_get_callback():
    spider = MySpider()

    req = scrapy.Request("http://example.com")
    assert get_callback(req, spider) == spider.parse

    req = scrapy.Request("http://example.com", spider.parse2)
    assert get_callback(req, spider) == spider.parse2

    def cb(response):
        pass

    req = scrapy.Request("http://example.com", cb)
    assert get_callback(req, spider) == cb


def test_is_provider_using_response():

3 Source : country.py
with Apache License 2.0
from 1012598167

    def start_requests(self):
        path = 'https://en.wikipedia.org/wiki/List_of_company_name_etymologies'  # Wikipedia's list of company name etymologies, not limited by region; very comprehensive
        yield scrapy.Request(url=path, callback=self.parse)  # issue the request with parse as the callback
#meta={'country_name':query}


    def parse(self, response):

3 Source : label.py
with Apache License 2.0
from 1012598167

    def start_requests(self):
        with open(r'wiki_country/querys/input.json', 'r') as f:
            querys = json.load(f)  # load the countries
        path = 'http://en.wikipedia.org'
        for query in querys['query']:
            url = path + query
            print(url)
            yield scrapy.Request(url=url, callback=self.parse)  # issue the request with parse as the callback

    def parse(self, response):

3 Source : wiki.py
with Apache License 2.0
from 1012598167

    def start_requests(self):
        with open(r'wiki_country/querys/input.json', 'r') as f:
            querys = json.load(f)  # load the countries
        #print("###"+querys)
        path = 'http://en.wikipedia.org'
        for query in querys['query']:
            url = path + '_'.join(query.split(' '))  # e.g. "United States" becomes "United_States"
            print(url)
            yield scrapy.Request(url=url, callback=self.parse)  # issue the request with parse as the callback
#meta={'country_name':query}


    def parse(self, response):

3 Source : country.py
with Apache License 2.0
from 1012598167

    def start_requests(self):
        path = 'https://www.worldometers.info/geography/alphabetical-list-of-countries/'
        # a better source would be https://en.wikipedia.org/wiki/List_of_country-name_etymologies,
        # but it doesn't matter, since the corresponding countries are the same; it could also be used
        yield scrapy.Request(url=path, callback=self.parse)  # issue the request with parse as the callback
#meta={'country_name':query}


    def parse(self, response):

3 Source : label.py
with Apache License 2.0
from 1012598167

    def start_requests(self):
        with open(r'wiki_country/querys/input.json', 'r') as f:
            querys = json.load(f)  # load the countries
        path = 'http://en.wikipedia.org/wiki/'
        for query in querys['query']:
            url = path + '_'.join(query.split(' '))  # e.g. "United States" becomes "United_States"
            print(url)
            yield scrapy.Request(url=url, callback=self.parse)  # issue the request with parse as the callback

    def parse(self, response):

3 Source : wiki.py
with Apache License 2.0
from 1012598167

    def start_requests(self):
        with open(r'wiki_country/querys/input.json', 'r') as f:
            querys = json.load(f)  # load the countries
        #print("###"+querys)
        path = 'http://en.wikipedia.org/wiki/'
        for query in querys['query']:
            url = path + '_'.join(query.split(' '))  # e.g. "United States" becomes "United_States"
            print(url)
            yield scrapy.Request(url=url, callback=self.parse)  # issue the request with parse as the callback
#meta={'country_name':query}


    def parse(self, response):

3 Source : demo.py
with MIT License
from 18839782321

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url + '?Nao=0',
                callback=self.parse,
                dont_filter=True,
                meta={'url': url}
            )

    def parse(self, response):

3 Source : demo.py
with MIT License
from 18839782321

    def parse_product(self, response):
        json_data = json.loads(str(
            re.findall(re.compile(r'digitalData.content=(.*?)thdAnalyticsEvent=', re.S), response.text)[
                0]).strip().strip(';'))
        for data in json_data['product']:
            avail = data['productInfo']['availabilityStatus']['shipping']['sth']
            productid = data['productInfo']['sku']
            if avail == 'available':
                detail_url = f'https://www.homedepot.com/p/svcs/frontEndModel/{productid}?_={str(int(time.time()) * 1000)}'
                yield scrapy.Request(
                    url=detail_url,
                    callback=self.parse_detail,
                    dont_filter=True,
                    meta={'url': response.meta.get('url')}
                )

    def parse_detail(self, response):

3 Source : sxs_search_jobs.py
with MIT License
from 18839782321

    def start_requests(self):
        for u in self.start_urls:
            url = f'https://www.shixiseng.com/intern/{u}?pcm=pc_Company'
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                dont_filter=True,
                meta={'handle_httpstatus_list': [302]}
            )

    def parse(self, response):

3 Source : zhaopin_company_detail.py
with MIT License
from 18839782321

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                dont_filter=True,
                meta={'url': url}
            )

    def get_token(self, arg1):

3 Source : zhaopin_company_detail.py
with MIT License
from 18839782321

    def parse(self, response):
        try:
            arg1 = re.findall(re.compile(r"var arg1='(.*?)';", re.S), response.text)[0]
        except Exception:
            try:
                arg1 = re.search("arg1='([^']+)'", response.text).group(1)
            except Exception:
                arg1 = ''
        if arg1 != '':
            token = self.get_token(arg1)
            yield scrapy.Request(
                url=response.url,
                headers={"Cookie": f'acw_sc__v2={token}; '},
                callback=self.parse_detail,
                dont_filter=True
            )

    def parse_detail(self, response):

3 Source : nist.py
with MIT License
from aiqm

    def start_requests(self):
        start_url = urltemplate.format(min_weight, max_weight)
        yield scrapy.Request(
            url=start_url,
            callback=lambda x: self.parse_range(x, min_weight, max_weight))

    def parse_range(self, response, from_, to):

3 Source : get_contracts.py
with GNU General Public License v3.0
from ajcerejeira

    def start_requests(self):
        for i in range(1, self.ncontracts, self.step):
            headers = {'Range': '{}-{}'.format(i, i + self.step - 1)}
            yield scrapy.Request(url=self.base_url, headers=headers,
                                 dont_filter=True)

    def parse(self, response):

3 Source : restaurantreviewscraper.py
with MIT License
from akshitvjain

	def parse(self, response):
		# yield restaurant information
		for restaurant in response.css('a.property_title'):
			self.restaurants_scraped += 1
			if (self.restaurants_scraped > MAX_RESTAURANTS):
				return
			res_url = ('https://www.tripadvisor.com%s' % \
				restaurant.xpath('@href').extract_first())
			yield scrapy.Request(res_url, callback=self.parse_restaurant)

		# move to the next page of restaurants, only if a next-page link exists
		next_href = response.css('a.nav.next.rndBtn.ui_button.primary.taLnk') \
			.xpath('@href').extract_first()
		if next_href:
			next_page = 'https://www.tripadvisor.com%s' % next_href
			print('NEXT PAGE: ' + next_page)
			yield scrapy.Request(next_page, callback=self.parse)
		
	def parse_restaurant(self, response):

3 Source : download_contracts.py
with GNU Lesser General Public License v3.0
from anapaulagomes

    def parse(self, response):
        # TODO: parse other metadata so we can yield together with PDF URL
        links = rows.plugins.html.extract_links(response.body)
        for link in links:
            if not link.lower().endswith(".pdf"):
                continue

            url = urljoin(self.url, link)
            yield scrapy.Request(url=url, callback=self.save_pdf, meta={"url": url})

    def save_pdf(self, response):

3 Source : ip66.py
with Apache License 2.0
from aox-lei

    def parse(self, response):
        link = LinkExtractor(restrict_css='ul.textlarge22', allow='areaindex')
        links = link.extract_links(response)
        for _link in links:
            # yield scrapy.Request('http://www.66ip.cn/areaindex_1/1.html', callback=self.parse_list)
            yield scrapy.Request(_link.url, callback=self.parse_list)

    def start_requests(self):

3 Source : hl7_spider.py
with Apache License 2.0
from arkhn

    def start_requests(self):
        if not os.path.exists(self.saving_path):
            os.mkdir(self.saving_path)
        urls = [
            urllib.parse.urljoin(self.root_url, 'resourcelist.html'),
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

        urls = [
            urllib.parse.urljoin(self.root_url, 'datatypes.html'),
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_datatypes)

    def parse_datatypes(self, response):

3 Source : qqmusic.py
with Apache License 2.0
from arvinljw

    def parse(self, response):
        playList = self.getDict(response.text, 'getPlaylist')
        self.total_size = playList['data']['sum']
        items = playList['data']['list']
        print('item->', items)
        for item in items:
            for result in self.parse_item(item):
                yield result

        self.start_size = self.end_size + 1
        self.end_size += PER_PAGE_SIZE
        next_page_url = self.url % (getRnd(), self.start_size, self.end_size)
        if next_page_url and self.start_size <= self.total_size:
            yield Request(next_page_url, headers=self.headers, callback=self.parse)

    def parse_item(self, each):

3 Source : qqmusic.py
with Apache License 2.0
from arvinljw

    def parse_item(self, each):
        item = QQMusicItem()
        item['songSheet'] = each['dissname']
        item['authorName'] = each['creator']['name']
        item['playTimes'] = each['listennum']
        item['createTime'] = each['createtime']

        dissid = each['dissid']
        detail_url = DETAIL_URL % dissid
        if detail_url:
            self.detail_header['referer'] = 'https://y.qq.com/n/yqq/playsquare/%s.html' % dissid
            yield Request(detail_url, headers=self.detail_header, meta={'data': item}, callback=self.parse_detail)
        else:
            return item

    def parse_detail(self, response):

3 Source : scraper.py
with GNU General Public License v3.0
from AugustKarlstedt

    def after_login(self, response):
        if b'The password or email is invalid.' in response.body:
            self.logger.error('Login failed!')
            return

        category_links = response.css('a.category-list-item__link')
        for category_link in category_links:
            category_href = response.urljoin(category_link.xpath('@href').extract_first())
            yield scrapy.Request(category_href, self.parse_high_level_category)
        
    def parse_high_level_category(self, response):

3 Source : scraper.py
with GNU General Public License v3.0
from AugustKarlstedt

    def parse_all_blinks_in_category(self, response):
        books = response.css('a.letter-book-list__item')

        for book in books:
            book_href = response.urljoin(book.xpath('@href').extract_first())
            yield scrapy.Request(book_href, self.parse_book)

    def parse_book(self, response):

3 Source : scraper.py
with GNU General Public License v3.0
from AugustKarlstedt

    def parse_book(self, response):
        item = Book()
        item['title'] = response.css('h1.book__header__title').extract()
        item['subtitle'] = response.css('h2.book__header__subtitle').extract()
        item['author'] = response.css('div.book__header__author').extract()
        item['read_time'] = response.css('.book__header__info .book__header__info-item-body').extract()
        item['synopsis'] = response.css('.book__tab-content[ref="synopsis"]').extract()
        item['who_should_read'] = response.css('.book__tab-content[ref="who_should_read"]').extract()
        item['about_the_author'] = response.css('.book__tab-content[ref="about_the_author"]').extract()

        reader_link = response.css('.button-greenV2[data-read-now="Read now"]')
        reader_href = response.urljoin(reader_link.xpath('@href').extract_first())

        request = scrapy.Request(reader_href, self.parse_reader)
        request.meta['item'] = item

        yield request

    def parse_reader(self, response):

3 Source : qiushibaike_spider.py
with MIT License
from autofelix

    def start_requests(self):
        urls = [
            'https://www.qiushibaike.com/text/page/1/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):

3 Source : stackoverflow-python-spider.py
with MIT License
from autofelix

    def start_requests(self):
        urls = []
        _url = 'https://stackoverflow.com/questions/tagged/python?tab=votes&page={}&pagesize=15'

        for page in range(1, 84322):
            urls.append(_url.format(page))

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):

3 Source : bench.py
with MIT License
from autofelix

    def start_requests(self):
        qargs = {'total': self.total, 'show': self.show}
        url = '{}?{}'.format(self.baseurl, urlencode(qargs, doseq=1))
        return [scrapy.Request(url, dont_filter=True)]

    def parse(self, response):

3 Source : gov_spider.py
with Apache License 2.0
from bernard0047

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse)


    def parse(self, response):

3 Source : fifa_spider.py
with Mozilla Public License 2.0
from BradleyGrantham

    def start_requests(self):
        urls = [
            "https://www.fifaindex.com/players/fifa19/",
            "https://www.fifaindex.com/players/fifa18/",
            "https://www.fifaindex.com/players/fifa17/",
            "https://www.fifaindex.com/players/fifa16/",
            "https://www.fifaindex.com/players/fifa15/",
            "https://www.fifaindex.com/players/fifa14/",
            "https://www.fifaindex.com/players/fifa13/",
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):

3 Source : fifa_spider.py
with Mozilla Public License 2.0
from BradleyGrantham

    def start_requests(self):
        urls = [
            "https://www.fifaindex.com/teams/fifa19/",
            "https://www.fifaindex.com/teams/fifa18/",
            "https://www.fifaindex.com/teams/fifa17/",
            "https://www.fifaindex.com/teams/fifa16/",
            "https://www.fifaindex.com/teams/fifa15/",
            "https://www.fifaindex.com/teams/fifa14/",
            "https://www.fifaindex.com/teams/fifa13/",
            "https://www.fifaindex.com/teams/fifa12/"
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):

3 Source : fifa_spider.py
with Mozilla Public License 2.0
from BradleyGrantham

    def start_requests(self):
        more_urls = [
            "https://www.betstudy.com/soccer-stats/c/germany/bundesliga/d/fixtures/",
            "http://www.betstudy.com/soccer-stats/c/england/premier-league/d/fixtures/"
        ]
        urls = [
            "https://www.betstudy.com/soccer-stats/c/france/ligue-1/d/fixtures/"
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_fixtures)

    @staticmethod

3 Source : pipelines.py
with Apache License 2.0
from cat9

    def get_media_requests(self, item, info):
        if item['img_urls']:
            images = item['img_urls'].split(' || ')
            for image in images:
                yield scrapy.Request(image, priority=15)

    def item_completed(self, results, item, info):

3 Source : aliexpress.py
with Apache License 2.0
from cat9

    def start_requests(self):
        self._wait_for_login()
        self.debug = self.settings.getbool("IS_DEBUG", False)
        self.max_page_count = self.settings.getint("MAX_PAGE_COUNT", 10)
        self.begin_category = self.settings.get('BEGIN_CATEGORY')
        self.end_category = self.settings.get('END_CATEGORY')
        print("start_requests,is Debug:", self.debug)
        for url in self.start_urls:
            yield scrapy.Request(url, cookies=self.my_cookies)

    def parse(self, response):

3 Source : weibo_spider.py
with GNU General Public License v3.0
from CharesFang

    def start_requests(self):
        uid_list = self.get_uid_list(self.uid)
        for uid in uid_list:
            u_url = self._u_generator.gen_url(uid)
            yield Request(
                url=u_url, dont_filter=True, meta={'uid': uid}, callback=self._parse_profile, errback=self.parse_err
            )
            t_url = self._t_generator.gen_url(uid=uid, page=None)
            yield Request(
                url=t_url, dont_filter=True, meta={'uid': uid, 'last_page': 0},
                callback=self._parse_tweet, errback=self.parse_err
            )

    def parse(self, response, **kwargs):

3 Source : tweet_info_spider.py
with GNU General Public License v3.0
from CharesFang

    def start_requests(self):
        """
        generate crawling Request from designated uid.
        :return: Target Request obj.
        """
        uid_list = self.get_uid_list(self.uid)
        for uid in uid_list:
            url = self._t_generator.gen_url(uid=uid, page=None)
            yield Request(url=url, dont_filter=True, callback=self._parse_tweet, errback=self.parse_err,
                          meta={'uid': uid, 'last_page': 0})

    def _parse_tweet(self, response, **kwargs):

3 Source : user_info_spider.py
with GNU General Public License v3.0
from CharesFang

    def start_requests(self):
        """
            Generate Request objs by target uid and target url generator
        """
        uid_list = self.get_uid_list(self.uid)
        for uid in uid_list:
            url = self._u_generator(uid)
            yield Request(url=url, dont_filter=True, callback=self._parse_profile, errback=self.parse_err,
                          meta={'uid': uid})

    def _parse_profile(self, response):

3 Source : crawler.py
with MIT License
from Charlie-belmer

	def parse(self, response):
		for item in response.css('.SearchResult'):
			# collect details for each plugin; the css/xpath selectors return single-item lists
			plugin_details = {}
			plugin_details['name'] = item.css('.SearchResult-name::text').extract()
			plugin_details['url'] = item.xpath('a/@href').extract_first()
			plugin_details['icon_url'] = item.css('.SearchResult-icon').xpath('@src').extract()
			plugin_details['users'] = item.css('.SearchResult-users-text::text').extract()
			# pass the collected details to the next callback through request.meta
			request = scrapy.Request(plugin_details['url'],
			                         callback=self.parse_page2)
			request.meta['details'] = plugin_details
			yield request

	def parse_plugin(self, response):

3 Source : clarin.py
with MIT License
from chequeado

    def start_requests(self):
        urls = ["https://www.clarin.com/rss/politica/", "https://www.clarin.com/rss/economia/"]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_seccion)


    def parse_seccion(self, response):

3 Source : clarin.py
with MIT License
from chequeado

    def parse_seccion(self, response):
        feed = feedparser.parse(response.url)
        for entry in feed['entries']:
            request = scrapy.Request(url=entry['link'], callback=self.parse_noticia)
            request.meta['item'] = entry           
            yield request


    def parse_noticia(self, response):

3 Source : diariodelfindelmundo.py
with MIT License
from chequeado

    def start_requests(self):
        
        urls = [
            'http://www.eldiariodelfindelmundo.com/provinciales/',
            'http://www.eldiariodelfindelmundo.com/municipales/'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_seccion)


    def parse_seccion(self, response):

3 Source : diariodelfindelmundo.py
with MIT License
from chequeado

    def parse_seccion(self, response):
        noticias = set(response.xpath('//div[@class="contenedor_general_resultados"]//h4[@class="titulo_listado_resultados "]/a/@href').extract())
        for noticia in noticias:
            nota = response.urljoin(noticia)
            yield scrapy.Request(url=nota, callback=self.parse_noticia)


    def parse_noticia(self, response):

3 Source : diarioprensa.py
with MIT License
from chequeado

    def start_requests(self):
        
        urls = [
            'http://www.diarioprensa.com.ar/category/politica/',
            'http://www.diarioprensa.com.ar/category/economia/'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_seccion)


    def parse_seccion(self, response):

3 Source : diarioprensa.py
with MIT License
from chequeado

    def parse_seccion(self, response):
        noticias = set(response.xpath('//main[@id="main"]/article/header/h2/a/@href').extract())

        for noticia in noticias:
            nota = response.urljoin(noticia)
            yield scrapy.Request(url=nota, callback=self.parse_noticia)


    def parse_noticia(self, response):

3 Source : diariorionegro.py
with MIT License
from chequeado

    def start_requests(self):
        
        urls = [
            'http://www.rionegro.com.ar/region',
            #'http://www.rionegro.com.ar/argentina'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_seccion)


    def parse_seccion(self, response):

3 Source : diariorionegro.py
with MIT License
from chequeado

    def parse_seccion(self, response):
        links = response.xpath('//a[contains(@href, "/region")]/@href')
        for link in links:
            entry_url = response.urljoin(link.extract())
            request = scrapy.Request(url=entry_url, callback=self.parse_noticia)
            yield request


    def parse_noticia(self, response):

3 Source : elchubut.py
with MIT License
from chequeado

    def start_requests(self):
        
        urls = [
            'http://www.elchubut.com.ar/rss'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_seccion)


    def parse_seccion(self, response):

3 Source : elchubut.py
with MIT License
from chequeado

    def parse_seccion(self, response):
        feed = feedparser.parse(response.url)
        for entry in feed['entries']:
            if entry['category'] == 'Regionales':
                request = scrapy.Request(url=entry['link'], callback=self.parse_noticia)
                request.meta['item'] = entry           
                yield request


    def parse_noticia(self, response):

3 Source : eldia.py
with MIT License
from chequeado

    def start_requests(self):
        
        urls = [
            'http://www.eldia.com/seccion/politica-y-economia'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_seccion)


    def parse_seccion(self, response):

3 Source : eldia.py
with MIT License
from chequeado

    def parse_seccion(self, response):
        noticias = set(response.xpath('//div[@id="main_seccion"]/article//a/@href').extract())

        for noticia in noticias:
            nota = response.urljoin(noticia)
            yield scrapy.Request(url=nota, callback=self.parse_noticia)


    def parse_noticia(self, response):

3 Source : elindependiente.py
with MIT License
from chequeado

    def start_requests(self):
        
        urls = [
            'http://www.elindependiente.com.ar/seccion.php?seccion=1',
            'http://www.elindependiente.com.ar/seccion.php?seccion=5'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_seccion)


    def parse_seccion(self, response):

3 Source : elindependiente.py
with MIT License
from chequeado

    def parse_seccion(self, response):
        seccion = response.url.split('/')[-1]

        noticias = set(response.xpath('//a/@href').extract())
        noticias = [BASE_URL + p for p in noticias if 'pagina.php' in p]
        
        for noticia in noticias:
            nota = response.urljoin(noticia)
            yield scrapy.Request(url=nota, callback=self.parse_noticia)



    def parse_noticia(self, response):
