scrapy.Spider

Here are examples of the Python API scrapy.Spider, taken from open source projects. Each snippet shows how a real project uses the class: instantiating Spider directly in tests, subclassing it for custom crawls, or type-checking objects against it.

30 Examples
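
Before the project excerpts, here is a minimal, self-contained spider for orientation. It is a sketch, not taken from the projects below: the quotes.toscrape.com target and the CSS selectors are illustrative assumptions.

import scrapy


class QuotesSpider(scrapy.Spider):
    # Every spider needs a unique name; Scrapy uses it to identify the spider.
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        # parse() is the default callback for responses to start_urls.
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }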

Source: test_loaders.py
with MIT License
from ejulio

def get_crawler():
    def _crawler(settings={}):
        crawler = Crawler(Spider, settings=settings)
        crawler.spider = Spider("dummy")
        return crawler

    return _crawler



Source: test_pipelines_mongo.py
with GNU General Public License v3.0
from scrapedia

    def setUp(self) -> None:
        self.settings = Settings()
        self.settings.setmodule(module=default_settings)
        self.settings.setdict(self.mongo_settings)
        self.spider = Spider(name="TestMongoPipeline")
        self.pipe = MongoPipeline.from_settings(settings=self.settings)
        yield self.pipe.open_spider(spider=None)


Source: test_providers.py
with BSD 3-Clause "New" or "Revised" License
from scrapinghub

    def __call__(self, to_provide, response: scrapy.http.Response, spider: scrapy.Spider):
        assert isinstance(spider, scrapy.Spider)
        ret: List[Any] = []
        if Price in to_provide:
            ret.append(Price(response.css(".price::text").get()))
        if Html in to_provide:
            ret.append(Html("Price Html!"))
        return ret


Source: test_mixed_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    def test_regular_request(self):
        def _test(response):
            self.assertIsInstance(response, Response)
            self.assertEqual(response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"])
            self.assertEqual(response.url, request.url)
            self.assertEqual(response.status, 200)
            self.assertNotIn("playwright", response.flags)

        request = Request(self.server.urljoin("/index.html"))
        return self.handler.download_request(request, Spider("foo")).addCallback(_test)


Source: test_mixed_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    def test_playwright_request(self):
        def _test(response):
            self.assertIsInstance(response, Response)
            self.assertEqual(response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"])
            self.assertEqual(response.url, request.url)
            self.assertEqual(response.status, 200)
            self.assertIn("playwright", response.flags)

        request = Request(self.server.urljoin("/index.html"), meta={"playwright": True})
        return self.handler.download_request(request, Spider("foo")).addCallback(_test)

Source: test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_basic_response(self):
        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
            with StaticMockServer() as server:
                meta = {"playwright": True, "playwright_include_page": True}
                req = Request(server.urljoin("/index.html"), meta=meta)
                resp = await handler._download_request(req, Spider("foo"))

            assert isinstance(resp, HtmlResponse)
            assert resp.request is req
            assert resp.url == req.url
            assert resp.status == 200
            assert "playwright" in resp.flags
            assert resp.css("a::text").getall() == ["Lorem Ipsum", "Infinite Scroll"]
            assert isinstance(resp.meta["playwright_page"], PlaywrightPage)
            assert resp.meta["playwright_page"].url == resp.url

            await resp.meta["playwright_page"].close()


Source: test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_post_request(self):
        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
            with MockServer() as server:
                req = FormRequest(
                    server.urljoin("/delay/2"), meta={"playwright": True}, formdata={"foo": "bar"}
                )
                resp = await handler._download_request(req, Spider("foo"))

            assert resp.request is req
            assert resp.url == req.url
            assert resp.status == 200
            assert "playwright" in resp.flags
            assert "Request body: foo=bar" in resp.text


Source: test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_timeout(self):
        settings_dict = {
            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
            "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 1000,
        }
        async with make_handler(settings_dict) as handler:
            with MockServer() as server:
                req = Request(server.urljoin("/delay/2"), meta={"playwright": True})
                with pytest.raises(TimeoutError):
                    await handler._download_request(req, Spider("foo"))


Source: scrapy-tests-test_crawler.py
with Apache License 2.0
from SMAT-Lab

    def test_spider_custom_settings(self):
        class MySpider(scrapy.Spider):
            name = 'spider'
            custom_settings = {
                'AUTOTHROTTLE_ENABLED': True
            }

        crawler = Crawler(MySpider, {})
        enabled_exts = [e.__class__ for e in crawler.extensions.middlewares]
        self.assertIn(AutoThrottle, enabled_exts)



Source: scrapy-tests-test_crawler.py
with Apache License 2.0
from SMAT-Lab

    def test_no_root_handler_installed(self):
        handler = get_scrapy_root_handler()
        if handler is not None:
            logging.root.removeHandler(handler)

        class MySpider(scrapy.Spider):
            name = 'spider'

        Crawler(MySpider, {})
        assert get_scrapy_root_handler() is None


Source: tests.py
with MIT License
from weiyu666

    def setUp(self):
        self.spider = Spider('myspider')
        self.key = 'scrapy_redis:tests:%s:queue' % self.spider.name
        self.q = self.queue_cls(self.server, Spider('myspider'), self.key)


Source: tests.py
with MIT License
from weiyu666

    def setUp(self):
        self.persist = False
        self.key_prefix = 'scrapy_redis:tests:'
        self.queue_key = self.key_prefix + '%(spider)s:requests'
        self.dupefilter_key = self.key_prefix + '%(spider)s:dupefilter'
        self.idle_before_close = 0
        self.scheduler = Scheduler(self.server, self.persist, self.queue_key,
                                   SpiderQueue, self.dupefilter_key,
                                   self.idle_before_close)
        self.spider = Spider('myspider')


Source: rmq_callback.py
with MIT License
from groupbwt

def rmq_callback(callback_method):
    @functools.wraps(callback_method)
    def wrapper(self, *args, **kwargs):
        delivery_tag_meta_key = RMQConstants.DELIVERY_TAG_META_KEY.value
        callback_result = callback_method(self, *args, **kwargs)
        if isinstance(self, scrapy.Spider):
            if len(args) > 0:
                response = args[0]
                if isinstance(response, scrapy.http.Response):
                    delivery_tag = response.meta.get(delivery_tag_meta_key, None)
                    try:
                        iter(callback_result)
                        for callback_result_item in callback_result:
                            if isinstance(callback_result_item, scrapy.Item):
                                self.crawler.signals.send_catch_log(
                                    signal=item_scheduled,
                                    response=response,
                                    spider=self,
                                    delivery_tag=delivery_tag,
                                )
                            yield callback_result_item
                    except TypeError:
                        pass
                    self.crawler.signals.send_catch_log(
                        signal=callback_completed,
                        response=response,
                        spider=self,
                        delivery_tag=delivery_tag,
                    )
            else:
                try:
                    iter(callback_result)
                    for callback_result_item in callback_result:
                        if isinstance(callback_result_item, scrapy.Item):
                            self.crawler.signals.send_catch_log(signal=item_scheduled, spider=self)
                        yield callback_result_item
                except TypeError:
                    pass
                self.crawler.signals.send_catch_log(signal=callback_completed, spider=self)
        else:
            try:
                iter(callback_result)
                yield from callback_result
            except TypeError:
                pass

    wrapper.__decorator_name__ = inspect.currentframe().f_code.co_name
    return wrapper
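
For context, rmq_callback is applied to a spider callback so that the items it yields emit signals carrying the RabbitMQ delivery tag. Below is a minimal usage sketch; the import path and the spider are assumptions, not taken from the source project.

import scrapy

# Hypothetical import location for the decorator defined above.
from rmq.utils.decorators import rmq_callback


class ProductSpider(scrapy.Spider):
    name = "products"

    @rmq_callback
    def parse(self, response):
        # The wrapper reads the delivery tag from response.meta, fires
        # item_scheduled for each yielded scrapy.Item, and fires
        # callback_completed once the callback is exhausted.
        yield scrapy.Item()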

Source: rmq_errback.py
with MIT License
from groupbwt

def rmq_errback(errback_method):
    @functools.wraps(errback_method)
    def wrapper(self, *args, **kwargs):
        delivery_tag_meta_key = RMQConstants.DELIVERY_TAG_META_KEY.value
        errback_result = errback_method(self, *args, **kwargs)
        if isinstance(self, scrapy.Spider):
            if len(args) > 0:
                response_or_failure = args[0]
                if isinstance(response_or_failure, scrapy.http.Response):
                    delivery_tag = response_or_failure.meta.get(delivery_tag_meta_key, None)
                    try:
                        iter(errback_result)
                        for errback_result_item in errback_result:
                            if isinstance(errback_result_item, scrapy.Item):
                                self.crawler.signals.send_catch_log(
                                    signal=item_scheduled,
                                    response=response_or_failure,
                                    spider=self,
                                    delivery_tag=delivery_tag,
                                )
                            yield errback_result_item
                    except TypeError:
                        pass
                    self.crawler.signals.send_catch_log(
                        signal=errback_completed,
                        response=response_or_failure,
                        spider=self,
                        delivery_tag=delivery_tag,
                    )
                if isinstance(response_or_failure, Failure):
                    if hasattr(response_or_failure, "request"):
                        delivery_tag = response_or_failure.request.meta.get(
                            delivery_tag_meta_key, None
                        )
                        try:
                            iter(errback_result)
                            for errback_result_item in errback_result:
                                if isinstance(errback_result_item, scrapy.Item):
                                    self.crawler.signals.send_catch_log(
                                        signal=item_scheduled,
                                        response=response_or_failure,
                                        spider=self,
                                        delivery_tag=delivery_tag,
                                    )
                                yield errback_result_item
                        except TypeError:
                            pass
                        self.crawler.signals.send_catch_log(
                            signal=errback_completed,
                            failure=response_or_failure,
                            spider=self,
                            delivery_tag=delivery_tag,
                        )
            else:
                try:
                    iter(errback_result)
                    for errback_result_item in errback_result:
                        if (
                            isinstance(errback_result_item, scrapy.Item)
                            and delivery_tag_meta_key in errback_result_item.keys()
                        ):
                            self.crawler.signals.send_catch_log(
                                signal=item_scheduled,
                                response=None,
                                spider=self,
                                delivery_tag=errback_result_item[delivery_tag_meta_key],
                            )
                except TypeError:
                    pass
                self.crawler.signals.send_catch_log(signal=errback_completed)
        else:
            try:
                iter(errback_result)
                for errback_result_item in errback_result:
                    if (
                        isinstance(errback_result_item, scrapy.Item)
                        and delivery_tag_meta_key in errback_result_item.keys()
                    ):
                        self.crawler.signals.send_catch_log(
                            signal=item_scheduled,
                            response=None,
                            spider=self,
                            delivery_tag=errback_result_item[delivery_tag_meta_key],
                        )
            except TypeError:
                pass

    wrapper.__decorator_name__ = inspect.currentframe().f_code.co_name
    return wrapper
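
rmq_errback mirrors rmq_callback for error paths: when the first positional argument is a twisted Failure carrying a .request, the delivery tag is recovered from failure.request.meta before errback_completed is sent. As a hedged sketch, the hypothetical ProductSpider above could gain an errback like this:

    @rmq_errback
    def errback(self, failure):
        # Returning nothing is safe: the wrapper probes the result with iter(),
        # and the resulting TypeError merely skips the per-item signal loop.
        self.logger.warning("Request failed: %r", failure)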

Source: test_middleware.py
with BSD 3-Clause "New" or "Revised" License
from scrapinghub

def spider_for(injectable: Type):

    class InjectableSpider(scrapy.Spider):

        url = None
        custom_settings = {
            "SCRAPY_POET_PROVIDERS": {
                WithFuturesProvider: 1,
                WithDeferredProvider: 2,
                ExtraClassDataProvider: 3,
            }
        }

        def start_requests(self):
            yield Request(self.url, capture_exceptions(callback_for(injectable)))

    return InjectableSpider
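
A hedged usage sketch for the factory above; MyItemPage stands in for whatever injectable page-object class a test passes in (a hypothetical name, as is the URL):

from scrapy.crawler import CrawlerProcess

spider_cls = spider_for(MyItemPage)          # build a throwaway spider class
spider_cls.url = "http://example.com/item"   # the factory leaves url as None

process = CrawlerProcess()
process.crawl(spider_cls)
process.start()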



Source: test_browser_contexts.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_contexts_max_pages(self):
        settings = {
            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
            "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 2,
            "PLAYWRIGHT_CONTEXTS": {
                "a": {"java_script_enabled": True},
                "b": {"java_script_enabled": True},
            },
        }
        async with make_handler(settings) as handler:
            with StaticMockServer() as server:
                requests = [
                    handler._download_request(
                        Request(
                            server.urljoin(f"/index.html?a={i}"),
                            meta={"playwright": True, "playwright_context": "a"},
                        ),
                        Spider("foo"),
                    )
                    for i in range(20)
                ] + [
                    handler._download_request(
                        Request(
                            server.urljoin(f"/index.html?b={i}"),
                            meta={"playwright": True, "playwright_context": "b"},
                        ),
                        Spider("foo"),
                    )
                    for i in range(20)
                ]
                await asyncio.gather(*requests)

            assert handler.stats.get_value("playwright/page_count/max_concurrent") == 4


Source: test_browser_contexts.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_contexts_startup(self):
        settings = {
            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
            "PLAYWRIGHT_CONTEXTS": {
                "first": {
                    "storage_state": {
                        "cookies": [
                            {
                                "url": "https://example.org",
                                "name": "foo",
                                "value": "bar",
                            },
                        ],
                    },
                },
            },
        }
        async with make_handler(settings) as handler:
            assert len(handler.contexts) == 1
            assert len(handler.context_semaphores) == 1

            with StaticMockServer() as server:
                meta = {
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": "first",
                }
                req = Request(server.urljoin("/index.html"), meta=meta)
                resp = await handler._download_request(req, Spider("foo"))

            page = resp.meta["playwright_page"]
            storage_state = await page.context.storage_state()
            await page.context.close()
            await page.close()
            cookie = storage_state["cookies"][0]
            assert cookie["name"] == "foo"
            assert cookie["value"] == "bar"
            assert cookie["domain"] == "example.org"


Source: test_browser_contexts.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_contexts_dynamic(self):
        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
            assert len(handler.contexts) == 0
            assert len(handler.context_semaphores) == 0

            with StaticMockServer() as server:
                meta = {
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": "new",
                    "playwright_context_kwargs": {
                        "storage_state": {
                            "cookies": [
                                {
                                    "url": "https://example.org",
                                    "name": "asdf",
                                    "value": "qwerty",
                                },
                            ],
                        },
                    },
                }
                req = Request(server.urljoin("/index.html"), meta=meta)
                resp = await handler._download_request(req, Spider("foo"))

            assert len(handler.contexts) == 1
            assert len(handler.context_semaphores) == 1

            page = resp.meta["playwright_page"]
            storage_state = await page.context.storage_state()
            await page.close()
            cookie = storage_state["cookies"][0]
            assert cookie["name"] == "asdf"
            assert cookie["value"] == "qwerty"
            assert cookie["domain"] == "example.org"


Source: test_browser_contexts.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_deprecated_setting(self):
        settings = {
            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
            "PLAYWRIGHT_CONTEXT_ARGS": {
                "storage_state": {
                    "cookies": [
                        {
                            "url": "https://example.org",
                            "name": "asdf",
                            "value": "qwerty",
                        },
                    ],
                },
            },
        }
        with warnings.catch_warnings(record=True) as warning_list:
            async with make_handler(settings) as handler:
                assert warning_list[0].category is DeprecationWarning
                assert str(warning_list[0].message) == (
                    "The PLAYWRIGHT_CONTEXT_ARGS setting is deprecated, please use"
                    " PLAYWRIGHT_CONTEXTS instead. Keyword arguments defined in"
                    " PLAYWRIGHT_CONTEXT_ARGS will be used when creating the 'default' context"
                )
                assert len(handler.contexts) == 1
                assert len(handler.context_semaphores) == 1

                with StaticMockServer() as server:
                    meta = {
                        "playwright": True,
                        "playwright_include_page": True,
                    }
                    req = Request(server.urljoin("/index.html"), meta=meta)
                    resp = await handler._download_request(req, Spider("foo"))

                page = resp.meta["playwright_page"]
                storage_state = await page.context.storage_state()
                await page.close()
                cookie = storage_state["cookies"][0]
                assert cookie["name"] == "asdf"
                assert cookie["value"] == "qwerty"
                assert cookie["domain"] == "example.org"



Source: test_page_coroutines.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_page_non_page_coroutine(self, caplog):
        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
            with StaticMockServer() as server:
                req = Request(
                    url=server.urljoin("/index.html"),
                    meta={
                        "playwright": True,
                        "playwright_page_coroutines": [
                            "not-a-page-coroutine",
                            5,
                            None,
                        ],
                    },
                )
                resp = await handler._download_request(req, Spider("foo"))

            assert isinstance(resp, HtmlResponse)
            assert resp.request is req
            assert resp.url == server.urljoin("/index.html")
            assert resp.status == 200
            assert "playwright" in resp.flags

            for obj in req.meta["playwright_page_coroutines"]:
                assert (
                    "scrapy-playwright",
                    logging.WARNING,
                    f"Ignoring {repr(obj)}: expected PageCoroutine, got {repr(type(obj))}",
                ) in caplog.record_tuples


Source: test_page_coroutines.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_page_mixed_page_coroutines(self, caplog):
        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
            with StaticMockServer() as server:
                req = Request(
                    url=server.urljoin("/index.html"),
                    meta={
                        "playwright": True,
                        "playwright_page_coroutines": {
                            "does_not_exist": PageCoroutine("does_not_exist"),
                            "is_closed": PageCoroutine("is_closed"),  # not awaitable
                            "title": PageCoroutine("title"),  # awaitable
                        },
                    },
                )
                resp = await handler._download_request(req, Spider("foo"))

            assert isinstance(resp, HtmlResponse)
            assert resp.request is req
            assert resp.url == server.urljoin("/index.html")
            assert resp.status == 200
            assert "playwright" in resp.flags

            does_not_exist = req.meta["playwright_page_coroutines"]["does_not_exist"]
            assert (
                "scrapy-playwright",
                logging.WARNING,
                f"Ignoring {repr(does_not_exist)}: could not find coroutine",
            ) in caplog.record_tuples
            assert not req.meta["playwright_page_coroutines"]["is_closed"].result
            assert req.meta["playwright_page_coroutines"]["title"].result == "Awesome site"



Source: test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_page_coroutine_navigation(self):
        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
            with StaticMockServer() as server:
                req = Request(
                    url=server.urljoin("/index.html"),
                    meta={
                        "playwright": True,
                        "playwright_page_coroutines": [PageCoro("click", "a.lorem_ipsum")],
                    },
                )
                resp = await handler._download_request(req, Spider("foo"))

            assert isinstance(resp, HtmlResponse)
            assert resp.request is req
            assert resp.url == server.urljoin("/lorem_ipsum.html")
            assert resp.status == 200
            assert "playwright" in resp.flags
            assert resp.css("title::text").get() == "Lorem Ipsum"
            text = resp.css("p::text").get()
            assert text == "Lorem ipsum dolor sit amet, consectetur adipiscing elit."


Source: test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_page_coroutine_infinite_scroll(self):
        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
            with StaticMockServer() as server:
                req = Request(
                    url=server.urljoin("/scroll.html"),
                    headers={"User-Agent": "scrapy-playwright"},
                    meta={
                        "playwright": True,
                        "playwright_page_coroutines": [
                            PageCoro("wait_for_selector", selector="div.quote"),
                            PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
                            PageCoro("wait_for_selector", selector="div.quote:nth-child(11)"),
                            PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
                            PageCoro("wait_for_selector", selector="div.quote:nth-child(21)"),
                        ],
                    },
                )
                resp = await handler._download_request(req, Spider("foo"))

            assert isinstance(resp, HtmlResponse)
            assert resp.request is req
            assert resp.url == server.urljoin("/scroll.html")
            assert resp.status == 200
            assert "playwright" in resp.flags
            assert len(resp.css("div.quote")) == 30


Source: test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_context_kwargs(self):
        settings_dict = {
            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
            "PLAYWRIGHT_CONTEXTS": {
                "default": {"java_script_enabled": False},
            },
        }
        async with make_handler(settings_dict) as handler:
            with StaticMockServer() as server:
                req = Request(
                    url=server.urljoin("/scroll.html"),
                    meta={
                        "playwright": True,
                        "playwright_page_coroutines": [
                            PageCoro("wait_for_selector", selector="div.quote", timeout=1000),
                        ],
                    },
                )
                with pytest.raises(TimeoutError):
                    await handler._download_request(req, Spider("foo"))


Source: test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_page_coroutine_screenshot(self):
        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
            with NamedTemporaryFile(mode="w+b") as png_file:
                with StaticMockServer() as server:
                    req = Request(
                        url=server.urljoin("/index.html"),
                        meta={
                            "playwright": True,
                            "playwright_page_coroutines": {
                                "png": PageCoro("screenshot", path=png_file.name, type="png"),
                            },
                        },
                    )
                    await handler._download_request(req, Spider("foo"))

                png_file.file.seek(0)
                assert png_file.file.read() == req.meta["playwright_page_coroutines"]["png"].result
                assert get_mimetype(png_file) == "image/png"


Source: test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_page_coroutine_pdf(self):
        if self.browser_type != "chromium":
            pytest.skip("PDF generation is supported only in Chromium")

        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
            with NamedTemporaryFile(mode="w+b") as pdf_file:
                with StaticMockServer() as server:
                    req = Request(
                        url=server.urljoin("/index.html"),
                        meta={
                            "playwright": True,
                            "playwright_page_coroutines": {
                                "pdf": PageCoro("pdf", path=pdf_file.name),
                            },
                        },
                    )
                    await handler._download_request(req, Spider("foo"))

                pdf_file.file.seek(0)
                assert pdf_file.file.read() == req.meta["playwright_page_coroutines"]["pdf"].result
                assert get_mimetype(pdf_file) == "application/pdf"


Source: test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_user_agent(self):
        settings_dict = {
            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
            "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
            "USER_AGENT": None,
        }
        async with make_handler(settings_dict) as handler:
            with MockServer() as server:
                # if Scrapy's user agent is None, use the one from the Browser
                req = Request(
                    url=server.urljoin("/headers"),
                    meta={"playwright": True},
                )
                resp = await handler._download_request(req, Spider("foo"))
                headers = json.loads(resp.css("pre::text").get())
                headers = {key.lower(): value for key, value in headers.items()}
                assert headers["user-agent"] == self.browser_type

                # if Scrapy's user agent is set to some value, use it
                req = Request(
                    url=server.urljoin("/headers"),
                    meta={"playwright": True},
                    headers={"User-Agent": "foobar"},
                )
                resp = await handler._download_request(req, Spider("foo"))
                headers = json.loads(resp.css("pre::text").get())
                headers = {key.lower(): value for key, value in headers.items()}
                assert headers["user-agent"] == "foobar"


Source: test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_use_playwright_headers(self):
        """Ignore Scrapy headers"""
        settings_dict = {
            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
            "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
            "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": "scrapy_playwright.headers.use_playwright_headers",  # noqa: E501
        }
        async with make_handler(settings_dict) as handler:
            with MockServer() as server:
                req = Request(
                    url=server.urljoin("/headers"),
                    meta={"playwright": True},
                    headers={"User-Agent": "foobar", "Asdf": "qwerty"},
                )
                resp = await handler._download_request(req, Spider("foo"))
                headers = json.loads(resp.css("pre::text").get())
                headers = {key.lower(): value for key, value in headers.items()}
                assert headers["user-agent"] == self.browser_type
                assert "asdf" not in headers


Source: test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins

    async def test_use_custom_headers(self):
        """Custom header processing function"""

        async def important_headers(*args, **kwargs) -> dict:
            return {"foo": "bar"}

        settings_dict = {
            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
            "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
            "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers,
        }
        async with make_handler(settings_dict) as handler:
            with MockServer() as server:
                req = Request(
                    url=server.urljoin("/headers"),
                    meta={"playwright": True},
                    headers={"User-Agent": "foobar", "Asdf": "qwerty"},
                )
                resp = await handler._download_request(req, Spider("foo"))
                headers = json.loads(resp.css("pre::text").get())
                headers = {key.lower(): value for key, value in headers.items()}
                assert headers["foo"] == "bar"
                assert headers.get("user-agent") not in (self.browser_type, "foobar")
                assert "asdf" not in headers


Source: scrapy-tests-test_crawler.py
with Apache License 2.0
from SMAT-Lab

    def test_spider_custom_settings_log_level(self):
        log_file = self.mktemp()

        class MySpider(scrapy.Spider):
            name = 'spider'
            custom_settings = {
                'LOG_LEVEL': 'INFO',
                'LOG_FILE': log_file,
                # disable telnet if not available to avoid an extra warning
                'TELNETCONSOLE_ENABLED': telnet.TWISTED_CONCH_AVAILABLE,
            }

        configure_logging()
        self.assertEqual(get_scrapy_root_handler().level, logging.DEBUG)
        crawler = Crawler(MySpider, {})
        self.assertEqual(get_scrapy_root_handler().level, logging.INFO)
        info_count = crawler.stats.get_value('log_count/INFO')
        logging.debug('debug message')
        logging.info('info message')
        logging.warning('warning message')
        logging.error('error message')

        with open(log_file, 'rb') as fo:
            logged = fo.read().decode('utf8')

        self.assertNotIn('debug message', logged)
        self.assertIn('info message', logged)
        self.assertIn('warning message', logged)
        self.assertIn('error message', logged)
        self.assertEqual(crawler.stats.get_value('log_count/ERROR'), 1)
        self.assertEqual(crawler.stats.get_value('log_count/WARNING'), 1)
        self.assertEqual(
            crawler.stats.get_value('log_count/INFO') - info_count, 1)
        self.assertEqual(crawler.stats.get_value('log_count/DEBUG', 0), 0)

