Below are examples of the Python API `scrapy.Spider`, taken from open-source projects. By voting up an example, you can indicate which examples are most useful and appropriate.
30 Examples
3
Source : test_loaders.py
with MIT License
from ejulio
with MIT License
from ejulio
def get_crawler():
    """Return a factory that builds a Crawler with a dummy spider attached.

    Fix: the inner factory used a mutable default argument (``settings={}``),
    which is shared across calls and can leak mutations between tests. Use
    ``None`` as the sentinel and create a fresh dict per call instead.
    """
    def _crawler(settings=None):
        # Fresh settings mapping per invocation; behavior for explicit
        # callers is unchanged.
        crawler = Crawler(Spider, settings={} if settings is None else settings)
        # Some code paths expect crawler.spider to be a concrete instance.
        crawler.spider = Spider("dummy")
        return crawler
    return _crawler
def test_start_urls_loader_not_configured(get_crawler):
3
Source : test_pipelines_mongo.py
with GNU General Public License v3.0
from scrapedia
with GNU General Public License v3.0
from scrapedia
def setUp(self) -> None:
    """Build layered test settings and open a MongoPipeline against them."""
    self.settings = Settings()
    # Load the project-wide defaults module first...
    self.settings.setmodule(module=default_settings)
    # ...then overlay the Mongo-specific overrides for this test case.
    self.settings.setdict(self.mongo_settings)
    self.spider = Spider(name="TestMongoPipeline")
    self.pipe = MongoPipeline.from_settings(settings=self.settings)
    # NOTE(review): spider=None is passed even though self.spider was just
    # created above — presumably open_spider ignores its argument here;
    # confirm against MongoPipeline.open_spider.
    yield self.pipe.open_spider(spider=None)
@inlineCallbacks
3
Source : test_providers.py
with BSD 3-Clause "New" or "Revised" License
from scrapinghub
with BSD 3-Clause "New" or "Revised" License
from scrapinghub
def __call__(self, to_provide, response: scrapy.http.Response, spider: scrapy.Spider):
    """Provide Price and/or Html instances for the requested classes."""
    assert isinstance(spider, scrapy.Spider)
    provided: List[Any] = []
    if Price in to_provide:
        price_text = response.css(".price::text").get()
        provided.append(Price(price_text))
    if Html in to_provide:
        provided.append(Html("Price Html!"))
    return provided
def fingerprint(self, to_provide: Set[Callable], request: Request) -> str:
3
Source : test_mixed_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
def test_regular_request(self):
    """A request without playwright meta uses the regular download path."""
    def _check(response):
        # A plain download must not carry the "playwright" flag.
        self.assertIsInstance(response, Response)
        self.assertEqual(response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"])
        self.assertEqual(response.url, request.url)
        self.assertEqual(response.status, 200)
        self.assertNotIn("playwright", response.flags)
    request = Request(self.server.urljoin("/index.html"))
    deferred = self.handler.download_request(request, Spider("foo"))
    return deferred.addCallback(_check)
def test_playwright_request(self):
3
Source : test_mixed_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
def test_playwright_request(self):
    """A request with playwright meta is downloaded via playwright."""
    def _check(response):
        # The playwright-handled download must be flagged as such.
        self.assertIsInstance(response, Response)
        self.assertEqual(response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"])
        self.assertEqual(response.url, request.url)
        self.assertEqual(response.status, 200)
        self.assertIn("playwright", response.flags)
    request = Request(self.server.urljoin("/index.html"), meta={"playwright": True})
    deferred = self.handler.download_request(request, Spider("foo"))
    return deferred.addCallback(_check)
3
Source : test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_basic_response(self):
    """Basic playwright download yields an HtmlResponse with its page in meta."""
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            request = Request(
                server.urljoin("/index.html"),
                meta={"playwright": True, "playwright_include_page": True},
            )
            response = await handler._download_request(request, Spider("foo"))
            assert isinstance(response, HtmlResponse)
            assert response.request is request
            assert response.url == request.url
            assert response.status == 200
            assert "playwright" in response.flags
            assert response.css("a::text").getall() == ["Lorem Ipsum", "Infinite Scroll"]
            # The browser page is exposed through the response meta.
            page = response.meta["playwright_page"]
            assert isinstance(page, PlaywrightPage)
            assert page.url == response.url
            await page.close()
@pytest.mark.asyncio
3
Source : test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_post_request(self):
    """POST bodies built by FormRequest are forwarded through playwright."""
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with MockServer() as server:
            request = FormRequest(
                server.urljoin("/delay/2"),
                meta={"playwright": True},
                formdata={"foo": "bar"},
            )
            response = await handler._download_request(request, Spider("foo"))
            assert response.request is request
            assert response.url == request.url
            assert response.status == 200
            assert "playwright" in response.flags
            # The mock server echoes the request body back in the page text.
            assert "Request body: foo=bar" in response.text
@pytest.mark.asyncio
3
Source : test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_timeout(self):
    """Navigation exceeding PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT raises."""
    handler_settings = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        # 1 second timeout against a 2 second server delay.
        "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 1000,
    }
    async with make_handler(handler_settings) as handler:
        with MockServer() as server:
            request = Request(server.urljoin("/delay/2"), meta={"playwright": True})
            with pytest.raises(TimeoutError):
                await handler._download_request(request, Spider("foo"))
@pytest.mark.asyncio
3
Source : scrapy-tests-test_crawler.py
with Apache License 2.0
from SMAT-Lab
with Apache License 2.0
from SMAT-Lab
def test_spider_custom_settings(self):
    """custom_settings declared on the spider class must reach the crawler."""
    class MySpider(scrapy.Spider):
        name = 'spider'
        custom_settings = {'AUTOTHROTTLE_ENABLED': True}
    crawler = Crawler(MySpider, {})
    # AutoThrottle is only enabled via the setting above.
    extension_classes = [ext.__class__ for ext in crawler.extensions.middlewares]
    self.assertIn(AutoThrottle, extension_classes)
class CrawlerLoggingTestCase(unittest.TestCase):
3
Source : scrapy-tests-test_crawler.py
with Apache License 2.0
from SMAT-Lab
with Apache License 2.0
from SMAT-Lab
def test_no_root_handler_installed(self):
    """Creating a Crawler must not install a scrapy root log handler."""
    existing_handler = get_scrapy_root_handler()
    if existing_handler is not None:
        # Start from a clean root logger.
        logging.root.removeHandler(existing_handler)
    class MySpider(scrapy.Spider):
        name = 'spider'
    Crawler(MySpider, {})
    assert get_scrapy_root_handler() is None
def test_spider_custom_settings_log_level(self):
3
Source : tests.py
with MIT License
from weiyu666
with MIT License
from weiyu666
def setUp(self):
    """Create a spider, derive its redis key, and instantiate the queue."""
    self.spider = Spider('myspider')
    # Per-spider redis key, e.g. "scrapy_redis:tests:myspider:queue".
    self.key = 'scrapy_redis:tests:%s:queue' % self.spider.name
    # NOTE(review): a second, distinct Spider('myspider') instance is passed
    # to the queue rather than self.spider — presumably only the name
    # matters; confirm against queue_cls.
    self.q = self.queue_cls(self.server, Spider('myspider'), self.key)
def tearDown(self):
3
Source : tests.py
with MIT License
from weiyu666
with MIT License
from weiyu666
def setUp(self):
    """Build a scrapy-redis Scheduler with per-spider key templates."""
    # Do not keep requests between runs.
    self.persist = False
    self.key_prefix = 'scrapy_redis:tests:'
    # "%(spider)s" placeholders are filled in with the spider name later.
    self.queue_key = self.key_prefix + '%(spider)s:requests'
    self.dupefilter_key = self.key_prefix + '%(spider)s:dupefilter'
    # Close immediately when idle.
    self.idle_before_close = 0
    self.scheduler = Scheduler(self.server, self.persist, self.queue_key,
                               SpiderQueue, self.dupefilter_key,
                               self.idle_before_close)
    self.spider = Spider('myspider')
def tearDown(self):
0
Source : rmq_callback.py
with MIT License
from groupbwt
with MIT License
from groupbwt
def rmq_callback(callback_method):
    """Decorate a spider callback so RMQ lifecycle signals are emitted.

    Wraps the callback as a generator: each scrapy.Item yielded by the
    original callback triggers an ``item_scheduled`` signal (with the RMQ
    delivery tag when one is available on the response), and a
    ``callback_completed`` signal is sent once the callback's results are
    exhausted. Non-spider owners get a plain pass-through.
    """
    @functools.wraps(callback_method)
    def wrapper(self, *args, **kwargs):
        delivery_tag_meta_key = RMQConstants.DELIVERY_TAG_META_KEY.value
        callback_result = callback_method(self, *args, **kwargs)
        if isinstance(self, scrapy.Spider):
            if len(args) > 0:
                response = args[0]
                if isinstance(response, scrapy.http.Response):
                    # Normal callback invocation: the delivery tag travels in
                    # response.meta.
                    delivery_tag = response.meta.get(delivery_tag_meta_key, None)
                    try:
                        # iter() raises TypeError for non-iterable results
                        # (e.g. a callback that returns None).
                        iter(callback_result)
                        for callback_result_item in callback_result:
                            if isinstance(callback_result_item, scrapy.Item):
                                self.crawler.signals.send_catch_log(
                                    signal=item_scheduled,
                                    response=response,
                                    spider=self,
                                    delivery_tag=delivery_tag,
                                )
                            # All results (Items and non-Items alike, e.g.
                            # Requests) are re-yielded unchanged.
                            yield callback_result_item
                    except TypeError:
                        pass
                    # Signal completion for this response/delivery tag.
                    self.crawler.signals.send_catch_log(
                        signal=callback_completed,
                        response=response,
                        spider=self,
                        delivery_tag=delivery_tag,
                    )
                # NOTE(review): if args[0] is not a Response, nothing is
                # yielded or signalled — confirm this is intentional.
            else:
                # Callback invoked without positional args: emit the same
                # signals but without response/delivery-tag context.
                try:
                    iter(callback_result)
                    for callback_result_item in callback_result:
                        if isinstance(callback_result_item, scrapy.Item):
                            self.crawler.signals.send_catch_log(signal=item_scheduled, spider=self)
                        yield callback_result_item
                except TypeError:
                    pass
                self.crawler.signals.send_catch_log(signal=callback_completed, spider=self)
        else:
            # Owner is not a spider: pass results through without signalling.
            try:
                iter(callback_result)
                yield from callback_result
            except TypeError:
                pass
    # Record the decorator's own name on the wrapper for introspection.
    wrapper.__decorator_name__ = inspect.currentframe().f_code.co_name
    return wrapper
0
Source : rmq_errback.py
with MIT License
from groupbwt
with MIT License
from groupbwt
def rmq_errback(errback_method):
    """Decorate a spider errback so RMQ lifecycle signals are emitted.

    Mirrors ``rmq_callback`` but the first positional argument may be either
    a scrapy Response or a twisted Failure; the delivery tag is read from
    the response meta or from the failed request's meta accordingly, and
    ``errback_completed`` is sent once the errback's results are exhausted.
    """
    @functools.wraps(errback_method)
    def wrapper(self, *args, **kwargs):
        delivery_tag_meta_key = RMQConstants.DELIVERY_TAG_META_KEY.value
        errback_result = errback_method(self, *args, **kwargs)
        if isinstance(self, scrapy.Spider):
            if len(args) > 0:
                response_or_failure = args[0]
                if isinstance(response_or_failure, scrapy.http.Response):
                    # Response path: the delivery tag travels in meta.
                    delivery_tag = response_or_failure.meta.get(delivery_tag_meta_key, None)
                    try:
                        # iter() raises TypeError for non-iterable results.
                        iter(errback_result)
                        for errback_result_item in errback_result:
                            if isinstance(errback_result_item, scrapy.Item):
                                self.crawler.signals.send_catch_log(
                                    signal=item_scheduled,
                                    response=response_or_failure,
                                    spider=self,
                                    delivery_tag=delivery_tag,
                                )
                            yield errback_result_item
                    except TypeError:
                        pass
                    self.crawler.signals.send_catch_log(
                        signal=errback_completed,
                        response=response_or_failure,
                        spider=self,
                        delivery_tag=delivery_tag,
                    )
                if isinstance(response_or_failure, Failure):
                    # Failure path: the delivery tag lives on the failed
                    # request's meta, when a request is attached.
                    if hasattr(response_or_failure, "request"):
                        delivery_tag = response_or_failure.request.meta.get(
                            delivery_tag_meta_key, None
                        )
                    # NOTE(review): if the Failure carries no "request"
                    # attribute, delivery_tag is never assigned in this
                    # branch and the send_catch_log calls below would raise
                    # NameError — confirm Failures always have .request here.
                    try:
                        iter(errback_result)
                        for errback_result_item in errback_result:
                            if isinstance(errback_result_item, scrapy.Item):
                                self.crawler.signals.send_catch_log(
                                    signal=item_scheduled,
                                    response=response_or_failure,
                                    spider=self,
                                    delivery_tag=delivery_tag,
                                )
                            yield errback_result_item
                    except TypeError:
                        pass
                    self.crawler.signals.send_catch_log(
                        signal=errback_completed,
                        failure=response_or_failure,
                        spider=self,
                        delivery_tag=delivery_tag,
                    )
            else:
                # No positional args: look for the delivery tag inside each
                # resulting Item instead of a response/failure.
                try:
                    iter(errback_result)
                    for errback_result_item in errback_result:
                        if (
                            isinstance(errback_result_item, scrapy.Item)
                            and delivery_tag_meta_key in errback_result_item.keys()
                        ):
                            self.crawler.signals.send_catch_log(
                                signal=item_scheduled,
                                response=None,
                                spider=self,
                                delivery_tag=errback_result_item[delivery_tag_meta_key],
                            )
                    # NOTE(review): unlike the branches above, items are NOT
                    # re-yielded here — results are consumed silently.
                except TypeError:
                    pass
                # NOTE(review): no spider= kwarg here, unlike the other
                # errback_completed emissions — possibly an oversight.
                self.crawler.signals.send_catch_log(signal=errback_completed)
        else:
            # Owner is not a spider: scan results for tagged items but do
            # not yield anything back.
            try:
                iter(errback_result)
                for errback_result_item in errback_result:
                    if (
                        isinstance(errback_result_item, scrapy.Item)
                        and delivery_tag_meta_key in errback_result_item.keys()
                    ):
                        self.crawler.signals.send_catch_log(
                            signal=item_scheduled,
                            response=None,
                            spider=self,
                            delivery_tag=errback_result_item[delivery_tag_meta_key],
                        )
            except TypeError:
                pass
    # Record the decorator's own name on the wrapper for introspection.
    wrapper.__decorator_name__ = inspect.currentframe().f_code.co_name
    return wrapper
0
Source : test_middleware.py
with BSD 3-Clause "New" or "Revised" License
from scrapinghub
with BSD 3-Clause "New" or "Revised" License
from scrapinghub
def spider_for(injectable: Type):
    """Build a spider class whose callback receives *injectable* via scrapy-poet."""
    class InjectableSpider(scrapy.Spider):
        url = None
        custom_settings = {
            "SCRAPY_POET_PROVIDERS": {
                WithFuturesProvider: 1,
                WithDeferredProvider: 2,
                ExtraClassDataProvider: 3,
            }
        }
        def start_requests(self):
            # Wrap the generated callback so exceptions are captured
            # instead of aborting the crawl.
            wrapped_callback = capture_exceptions(callback_for(injectable))
            yield Request(self.url, wrapped_callback)
    return InjectableSpider
@attr.s(auto_attribs=True)
0
Source : test_browser_contexts.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_contexts_max_pages(self):
    """With 2 pages per context and 2 contexts, at most 4 pages run at once."""
    settings = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 2,
        "PLAYWRIGHT_CONTEXTS": {
            "a": {"java_script_enabled": True},
            "b": {"java_script_enabled": True},
        },
    }
    async with make_handler(settings) as handler:
        with StaticMockServer() as server:
            # 20 downloads per context, fired concurrently.
            coroutines = []
            for context_name in ("a", "b"):
                for i in range(20):
                    request = Request(
                        server.urljoin(f"/index.html?{context_name}={i}"),
                        meta={"playwright": True, "playwright_context": context_name},
                    )
                    coroutines.append(handler._download_request(request, Spider("foo")))
            await asyncio.gather(*coroutines)
            # 2 contexts x 2 pages each = 4 concurrent pages maximum.
            assert handler.stats.get_value("playwright/page_count/max_concurrent") == 4
@pytest.mark.asyncio
0
Source : test_browser_contexts.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_contexts_startup(self):
    """Contexts declared in settings are created at startup with their storage state."""
    settings = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {
            "first": {
                "storage_state": {
                    "cookies": [
                        {
                            "url": "https://example.org",
                            "name": "foo",
                            "value": "bar",
                        },
                    ],
                },
            },
        },
    }
    async with make_handler(settings) as handler:
        # The declared context exists before any request is made.
        assert len(handler.contexts) == 1
        assert len(handler.context_semaphores) == 1
        with StaticMockServer() as server:
            request = Request(
                server.urljoin("/index.html"),
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": "first",
                },
            )
            response = await handler._download_request(request, Spider("foo"))
            page = response.meta["playwright_page"]
            state = await page.context.storage_state()
            await page.context.close()
            await page.close()
            # The seeded cookie survives into the live context.
            cookie = state["cookies"][0]
            assert cookie["name"] == "foo"
            assert cookie["value"] == "bar"
            assert cookie["domain"] == "example.org"
@pytest.mark.asyncio
0
Source : test_browser_contexts.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_contexts_dynamic(self):
    """Contexts requested via meta are created lazily on first use."""
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        # No contexts exist before any request is made.
        assert len(handler.contexts) == 0
        assert len(handler.context_semaphores) == 0
        with StaticMockServer() as server:
            request = Request(
                server.urljoin("/index.html"),
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": "new",
                    "playwright_context_kwargs": {
                        "storage_state": {
                            "cookies": [
                                {
                                    "url": "https://example.org",
                                    "name": "asdf",
                                    "value": "qwerty",
                                },
                            ],
                        },
                    },
                },
            )
            response = await handler._download_request(request, Spider("foo"))
            # The download created exactly one new context.
            assert len(handler.contexts) == 1
            assert len(handler.context_semaphores) == 1
            page = response.meta["playwright_page"]
            state = await page.context.storage_state()
            await page.close()
            # The cookie passed through playwright_context_kwargs is present.
            cookie = state["cookies"][0]
            assert cookie["name"] == "asdf"
            assert cookie["value"] == "qwerty"
            assert cookie["domain"] == "example.org"
@pytest.mark.asyncio
0
Source : test_browser_contexts.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_deprecated_setting(self):
    """PLAYWRIGHT_CONTEXT_ARGS still works but emits a DeprecationWarning."""
    settings = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXT_ARGS": {
            "storage_state": {
                "cookies": [
                    {
                        "url": "https://example.org",
                        "name": "asdf",
                        "value": "qwerty",
                    },
                ],
            },
        },
    }
    with warnings.catch_warnings(record=True) as caught:
        async with make_handler(settings) as handler:
            # The deprecation is raised during handler startup.
            first_warning = caught[0]
            assert first_warning.category is DeprecationWarning
            assert str(first_warning.message) == (
                "The PLAYWRIGHT_CONTEXT_ARGS setting is deprecated, please use"
                " PLAYWRIGHT_CONTEXTS instead. Keyword arguments defined in"
                " PLAYWRIGHT_CONTEXT_ARGS will be used when creating the 'default' context"
            )
            assert len(handler.contexts) == 1
            assert len(handler.context_semaphores) == 1
            with StaticMockServer() as server:
                request = Request(
                    server.urljoin("/index.html"),
                    meta={
                        "playwright": True,
                        "playwright_include_page": True,
                    },
                )
                response = await handler._download_request(request, Spider("foo"))
                page = response.meta["playwright_page"]
                state = await page.context.storage_state()
                await page.close()
                # The legacy setting still seeds the default context.
                cookie = state["cookies"][0]
                assert cookie["name"] == "asdf"
                assert cookie["value"] == "qwerty"
                assert cookie["domain"] == "example.org"
class TestCaseMultipleContextsChromium(MixinTestCaseMultipleContexts):
0
Source : test_page_coroutines.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_page_non_page_coroutine(self, caplog):
    """Objects that are not PageCoroutine instances are ignored with a warning."""
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            request = Request(
                url=server.urljoin("/index.html"),
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": ["not-a-page-coroutine", 5, None],
                },
            )
            response = await handler._download_request(request, Spider("foo"))
            # The download itself still succeeds normally.
            assert isinstance(response, HtmlResponse)
            assert response.request is request
            assert response.url == server.urljoin("/index.html")
            assert response.status == 200
            assert "playwright" in response.flags
            # Every bogus entry produced one warning record.
            for obj in request.meta["playwright_page_coroutines"]:
                expected_record = (
                    "scrapy-playwright",
                    logging.WARNING,
                    f"Ignoring {repr(obj)}: expected PageCoroutine, got {repr(type(obj))}",
                )
                assert expected_record in caplog.record_tuples
@pytest.mark.asyncio
0
Source : test_page_coroutines.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_page_mixed_page_coroutines(self, caplog):
    """Unknown coroutine names warn; known ones record their results."""
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            page_coroutines = {
                "does_not_exist": PageCoroutine("does_not_exist"),
                "is_closed": PageCoroutine("is_closed"),  # not awaitable
                "title": PageCoroutine("title"),  # awaitable
            }
            request = Request(
                url=server.urljoin("/index.html"),
                meta={"playwright": True, "playwright_page_coroutines": page_coroutines},
            )
            response = await handler._download_request(request, Spider("foo"))
            assert isinstance(response, HtmlResponse)
            assert response.request is request
            assert response.url == server.urljoin("/index.html")
            assert response.status == 200
            assert "playwright" in response.flags
            # The unknown coroutine name was logged and skipped.
            missing = request.meta["playwright_page_coroutines"]["does_not_exist"]
            assert (
                "scrapy-playwright",
                logging.WARNING,
                f"Ignoring {repr(missing)}: could not find coroutine",
            ) in caplog.record_tuples
            # Known coroutines stored their results (sync and async alike).
            assert not request.meta["playwright_page_coroutines"]["is_closed"].result
            assert request.meta["playwright_page_coroutines"]["title"].result == "Awesome site"
class TestPageCoroutineChromium(MixinPageCoroutineTestCase):
0
Source : test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_page_coroutine_navigation(self):
    """A click coroutine can navigate the page before the response is built."""
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            request = Request(
                url=server.urljoin("/index.html"),
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": [PageCoro("click", "a.lorem_ipsum")],
                },
            )
            response = await handler._download_request(request, Spider("foo"))
            assert isinstance(response, HtmlResponse)
            assert response.request is request
            # The click navigated away from the originally requested page.
            assert response.url == server.urljoin("/lorem_ipsum.html")
            assert response.status == 200
            assert "playwright" in response.flags
            assert response.css("title::text").get() == "Lorem Ipsum"
            paragraph = response.css("p::text").get()
            assert paragraph == "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
@pytest.mark.asyncio
0
Source : test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_page_coroutine_infinite_scroll(self):
    """Scroll coroutines load all 30 quotes before the response is built."""
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            scroll_script = "window.scrollBy(0, document.body.scrollHeight)"
            request = Request(
                url=server.urljoin("/scroll.html"),
                headers={"User-Agent": "scrapy-playwright"},
                meta={
                    "playwright": True,
                    # Scroll twice, waiting for each batch of quotes to load.
                    "playwright_page_coroutines": [
                        PageCoro("wait_for_selector", selector="div.quote"),
                        PageCoro("evaluate", scroll_script),
                        PageCoro("wait_for_selector", selector="div.quote:nth-child(11)"),
                        PageCoro("evaluate", scroll_script),
                        PageCoro("wait_for_selector", selector="div.quote:nth-child(21)"),
                    ],
                },
            )
            response = await handler._download_request(request, Spider("foo"))
            assert isinstance(response, HtmlResponse)
            assert response.request is request
            assert response.url == server.urljoin("/scroll.html")
            assert response.status == 200
            assert "playwright" in response.flags
            # Three pages of 10 quotes each were loaded.
            assert len(response.css("div.quote")) == 30
@pytest.mark.asyncio
0
Source : test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_context_kwargs(self):
    """With javascript disabled in the context, dynamic content never appears."""
    handler_settings = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {
            "default": {"java_script_enabled": False},
        },
    }
    async with make_handler(handler_settings) as handler:
        with StaticMockServer() as server:
            request = Request(
                url=server.urljoin("/scroll.html"),
                meta={
                    "playwright": True,
                    # The quotes are injected by JS, so this selector
                    # cannot appear and the wait must time out.
                    "playwright_page_coroutines": [
                        PageCoro("wait_for_selector", selector="div.quote", timeout=1000),
                    ],
                },
            )
            with pytest.raises(TimeoutError):
                await handler._download_request(request, Spider("foo"))
@pytest.mark.asyncio
0
Source : test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_page_coroutine_screenshot(self):
    """The screenshot coroutine writes the PNG and stores its bytes as result."""
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with NamedTemporaryFile(mode="w+b") as png_file:
            with StaticMockServer() as server:
                request = Request(
                    url=server.urljoin("/index.html"),
                    meta={
                        "playwright": True,
                        "playwright_page_coroutines": {
                            "png": PageCoro("screenshot", path=png_file.name, type="png"),
                        },
                    },
                )
                await handler._download_request(request, Spider("foo"))
                # File contents and the coroutine result must match.
                png_file.file.seek(0)
                written_bytes = png_file.file.read()
                assert written_bytes == request.meta["playwright_page_coroutines"]["png"].result
                assert get_mimetype(png_file) == "image/png"
@pytest.mark.asyncio
0
Source : test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_page_coroutine_pdf(self):
    """The pdf coroutine writes the PDF and stores its bytes as result."""
    if self.browser_type != "chromium":
        pytest.skip("PDF generation is supported only in Chromium")
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with NamedTemporaryFile(mode="w+b") as pdf_file:
            with StaticMockServer() as server:
                request = Request(
                    url=server.urljoin("/index.html"),
                    meta={
                        "playwright": True,
                        "playwright_page_coroutines": {
                            "pdf": PageCoro("pdf", path=pdf_file.name),
                        },
                    },
                )
                await handler._download_request(request, Spider("foo"))
                # File contents and the coroutine result must match.
                pdf_file.file.seek(0)
                written_bytes = pdf_file.file.read()
                assert written_bytes == request.meta["playwright_page_coroutines"]["pdf"].result
                assert get_mimetype(pdf_file) == "application/pdf"
@pytest.mark.asyncio
0
Source : test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_user_agent(self):
    """Browser UA applies when Scrapy's USER_AGENT is None; request UA wins otherwise."""
    handler_settings = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
        "USER_AGENT": None,
    }
    async with make_handler(handler_settings) as handler:
        with MockServer() as server:
            # if Scrapy's user agent is None, use the one from the Browser
            request = Request(
                url=server.urljoin("/headers"),
                meta={"playwright": True},
            )
            response = await handler._download_request(request, Spider("foo"))
            echoed = json.loads(response.css("pre::text").get())
            lowered = {name.lower(): value for name, value in echoed.items()}
            assert lowered["user-agent"] == self.browser_type
            # if Scrapy's user agent is set to some value, use it
            request = Request(
                url=server.urljoin("/headers"),
                meta={"playwright": True},
                headers={"User-Agent": "foobar"},
            )
            response = await handler._download_request(request, Spider("foo"))
            echoed = json.loads(response.css("pre::text").get())
            lowered = {name.lower(): value for name, value in echoed.items()}
            assert lowered["user-agent"] == "foobar"
@pytest.mark.asyncio
0
Source : test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_use_playwright_headers(self):
    """Ignore Scrapy headers"""
    handler_settings = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
        "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": "scrapy_playwright.headers.use_playwright_headers",  # noqa: E501
    }
    async with make_handler(handler_settings) as handler:
        with MockServer() as server:
            request = Request(
                url=server.urljoin("/headers"),
                meta={"playwright": True},
                headers={"User-Agent": "foobar", "Asdf": "qwerty"},
            )
            response = await handler._download_request(request, Spider("foo"))
            echoed = json.loads(response.css("pre::text").get())
            lowered = {name.lower(): value for name, value in echoed.items()}
            # Scrapy-supplied headers are dropped in favor of playwright's own.
            assert lowered["user-agent"] == self.browser_type
            assert "asdf" not in lowered
@pytest.mark.asyncio
0
Source : test_playwright_requests.py
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
with BSD 3-Clause "New" or "Revised" License
from scrapy-plugins
async def test_use_custom_headers(self):
    """Custom header processing function"""
    async def important_headers(*args, **kwargs) -> dict:
        # Replace every request header with a fixed set.
        return {"foo": "bar"}
    handler_settings = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
        "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers,
    }
    async with make_handler(handler_settings) as handler:
        with MockServer() as server:
            request = Request(
                url=server.urljoin("/headers"),
                meta={"playwright": True},
                headers={"User-Agent": "foobar", "Asdf": "qwerty"},
            )
            response = await handler._download_request(request, Spider("foo"))
            echoed = json.loads(response.css("pre::text").get())
            lowered = {name.lower(): value for name, value in echoed.items()}
            # Only the headers from the custom function survive.
            assert lowered["foo"] == "bar"
            assert lowered.get("user-agent") not in (self.browser_type, "foobar")
            assert "asdf" not in lowered
@pytest.mark.asyncio
0
Source : scrapy-tests-test_crawler.py
with Apache License 2.0
from SMAT-Lab
with Apache License 2.0
from SMAT-Lab
def test_spider_custom_settings_log_level(self):
    """LOG_LEVEL/LOG_FILE from custom_settings reconfigure the root handler."""
    log_file = self.mktemp()
    class MySpider(scrapy.Spider):
        name = 'spider'
        custom_settings = {
            'LOG_LEVEL': 'INFO',
            'LOG_FILE': log_file,
            # disable telnet if not available to avoid an extra warning
            'TELNETCONSOLE_ENABLED': telnet.TWISTED_CONCH_AVAILABLE,
        }
    configure_logging()
    # Default level before the crawler applies the spider's settings.
    self.assertEqual(get_scrapy_root_handler().level, logging.DEBUG)
    crawler = Crawler(MySpider, {})
    # Creating the crawler raised the root handler level to INFO.
    self.assertEqual(get_scrapy_root_handler().level, logging.INFO)
    info_count = crawler.stats.get_value('log_count/INFO')
    logging.debug('debug message')
    logging.info('info message')
    logging.warning('warning message')
    logging.error('error message')
    with open(log_file, 'rb') as log_fh:
        contents = log_fh.read().decode('utf8')
    # DEBUG is filtered out; everything at INFO and above is written.
    self.assertNotIn('debug message', contents)
    self.assertIn('info message', contents)
    self.assertIn('warning message', contents)
    self.assertIn('error message', contents)
    # Stats count exactly one record per emitted level.
    self.assertEqual(crawler.stats.get_value('log_count/ERROR'), 1)
    self.assertEqual(crawler.stats.get_value('log_count/WARNING'), 1)
    self.assertEqual(
        crawler.stats.get_value('log_count/INFO') - info_count, 1)
    self.assertEqual(crawler.stats.get_value('log_count/DEBUG', 0), 0)
class SpiderLoaderWithWrongInterface: