acrawler.http.Request

Here are examples of the Python API acrawler.http.Request, taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.

15 Examples 7

3 Source : chain.py
with GNU General Public License v3.0
from wind2sing

    def to_vanilla(self, **kwargs):
        """Drain the pending URLs and yield one plain ``Request`` per URL.

        ``self._urls`` is emptied by repeated ``pop()`` calls, and the
        requests are then produced in the reverse of the pop order. Each
        request receives this chain's ``self.kws`` merged with ``kwargs``.

        Args:
            **kwargs: Extra keyword arguments forwarded to every ``Request``.

        Yields:
            A ``Request`` for each URL that was pending.
        """
        drained = []
        while self._urls:
            drained.append(self._urls.pop())
        # Undo the pop order before building the requests.
        drained.reverse()
        for target in drained:
            yield Request(target, **self.kws, **kwargs)

    def callback(self):

3 Source : crawler.py
with GNU General Public License v3.0
from wind2sing

    async def start_requests(self):
        """Produce the initial requests; override this in a custom spider.

        The default implementation yields one ``Request`` per entry of
        :attr:`start_urls`. Any Request yielded from :meth:`start_requests`
        has :meth:`parse` combined into its callbacks, and all callbacks are
        passed on to the Response.
        """
        for start_url in self.start_urls:
            request = Request(start_url)
            yield request

    async def parse(self, response: _Response):

3 Source : crawler.py
with GNU General Public License v3.0
from wind2sing

    def web_add_task_query(self, query: dict):
        """Turn a web-service query into new tasks. Should be overwritten.

        Used when the web service is enabled: tasks yielded here are the
        ones the Crawler finishes before sending its response.

        The ``"url"`` entry is removed from ``query``; the remaining entries
        are forwarded to ``Request`` as keyword arguments.

        Args:
            query: a multidict.

        Yields:
            The ``Request`` built from the query, followed by ``None``.

        Raises:
            Exception: if ``query`` has no non-empty ``"url"`` entry.
        """
        target = query.pop("url", "")
        if not target:
            raise Exception("Not valid url from web request!")
        yield Request(url=target, **query)
        yield None

    def web_action_after_query(self, items):

3 Source : handlers.py
with GNU General Public License v3.0
from wind2sing

    async def _next_requests_from_redis_start(self):
        """Feed the crawler with start URLs popped from Redis, forever.

        Does nothing when the crawler config has no ``REDIS_START_KEY``.
        Otherwise loops without end: each value returned by ``spop`` is
        decoded and added to the crawler as a ``Request`` whose callback is
        ``crawler.parse``. When nothing is returned, the loop waits half a
        second before polling again.
        """
        redis_key = self.crawler.config.get("REDIS_START_KEY")
        if not redis_key:
            return

        while True:
            raw_url = await self.redis.spop(redis_key)
            if not raw_url:
                # Nothing queued right now; back off before polling again.
                await asyncio.sleep(0.5)
                continue
            request = Request(raw_url.decode(), callback=self.crawler.parse)
            await self.crawler.add_task(request)
            # Hand control back to the event loop between additions.
            await asyncio.sleep(0)

    async def on_close(self):

3 Source : test_request.py
with GNU General Public License v3.0
from wind2sing

def test_fp():
    """A known URL must produce its known, fixed fingerprint."""
    expected = "7c6accfd1f05cb417373b9f00f3d9b1bd90bbb78"
    request = Request("https://httpbin.org/cookies/set?name=crawler&age=18")
    assert request.fingerprint == expected


def test_same_fp():

3 Source : test_request.py
with GNU General Public License v3.0
from wind2sing

def test_same_fp():
    """Equivalent URLs must share a fingerprint.

    Covers identical URLs, reordered query parameters, and a trailing
    fragment.
    """
    first = Request("https://www.google.com")
    second = Request("https://www.google.com")
    assert first.fingerprint == second.fingerprint

    base = Request("https://httpbin.org/cookies/set?name=crawler&age=18")
    reordered = Request("https://httpbin.org/cookies/set?age=18&name=crawler")
    with_fragment = Request(
        "https://httpbin.org/cookies/set?age=18&name=crawler#fragment"
    )
    assert base.fingerprint == reordered.fingerprint
    assert base.fingerprint == with_fragment.fingerprint


def test_diff_fp():

3 Source : test_request.py
with GNU General Public License v3.0
from wind2sing

def test_diff_fp():
    """Different hosts, or the same path with and without a query, must differ."""
    other_host = Request("https://www.google.com")
    with_query = Request("https://httpbin.org/cookies/set?name=crawler&age=18")
    without_query = Request("https://httpbin.org/cookies/set")
    assert other_host.fingerprint != with_query.fingerprint
    assert with_query.fingerprint != without_query.fingerprint


@pytest.mark.asyncio

3 Source : test_request.py
with GNU General Public License v3.0
from wind2sing

async def test_parse():
    """Send a request with a callback and check that parsing gives a truthy result.

    The callback itself asserts the response status is 200.
    """

    def cb(response):
        assert response.status == 200
        return response.status

    request = Request("https://httpbin.org/json", callback=cb)
    response = await request.send()
    parsed = await response.parse()
    assert parsed


@pytest.mark.asyncio

3 Source : test_scheduler.py
with GNU General Public License v3.0
from wind2sing

async def test_queue():
    """New requests are accepted; re-producing one already seen is rejected."""
    scheduler = Scheduler()
    baidu = Request("https://www.baidu.com")
    bing = Request("https://www.bing.com")
    assert await scheduler.produce(baidu)
    assert await scheduler.produce(bing)
    # Same request a second time: expected to be filtered out.
    assert not await scheduler.produce(baidu)


@pytest.mark.asyncio

3 Source : test_scheduler.py
with GNU General Public License v3.0
from wind2sing

async def test_dont_filter():
    """``dont_filter=True`` lets the same request be produced repeatedly."""
    scheduler = Scheduler()
    filtered = Request("https://www.baidu.com")
    unfiltered = Request("https://www.bing.com", dont_filter=True)
    # Default request: accepted once, then rejected as a duplicate.
    assert await scheduler.produce(filtered)
    assert not await scheduler.produce(filtered)
    # dont_filter request: accepted every time.
    assert await scheduler.produce(unfiltered)
    assert await scheduler.produce(unfiltered)


@pytest.mark.asyncio

3 Source : test_scheduler.py
with GNU General Public License v3.0
from wind2sing

async def test_consume():
    """The first consumed item is the very object that was produced first."""
    scheduler = Scheduler()
    first = Request("https://www.baidu.com")
    second = Request("https://www.bing.com")
    assert await scheduler.produce(first)
    assert await scheduler.produce(second)
    consumed = await scheduler.consume()
    assert consumed is first

0 Source : chain.py
with GNU General Public License v3.0
from wind2sing

    def follow(self, css: str, limit: int = 0, pass_meta=False, **kwargs):
        """Chain a request that follows links selected from this chain's response.

        Appends a callback to ``self.kws["callback"]``. For each URL matched
        by ``css`` in a response, the callback yields a ``Request`` built
        from the returned chain's keyword arguments plus ``kwargs``.

        Args:
            css: CSS selector; the strings returned by
                ``resp.sel.css(css).getall()`` are used as URLs.
            limit: Maximum number of requests yielded per response;
                ``0`` means no limit.
            pass_meta: When true, this chain's ``meta`` entry is removed from
                ``self.kws`` and handed to the returned chain.
            **kwargs: Extra keyword arguments for every ``Request``. A
                ``meta`` entry is handed to the returned chain instead.

        Returns:
            The new ``ChainRequest`` describing the followed requests.
        """
        req = ChainRequest()
        req.meta(kwargs.pop("meta", {}))
        if pass_meta:
            req.meta(self.kws.pop("meta", {}))

        def fn(resp: Response):
            count = 0
            for url in resp.sel.css(css).getall():
                # Read the chain's meta without removing it. Popping it here
                # consumed it on the first URL, so every later URL (and every
                # later response) was built with an empty meta.
                # Copy it so requests never share or mutate the same dict.
                m = dict(req.kws.get("meta") or {})
                if resp.meta:
                    m.update(resp.meta)
                # "meta" is passed explicitly, so keep it out of the splat.
                extra = {k: v for k, v in req.kws.items() if k != "meta"}
                yield Request(url, meta=m, **extra, **kwargs)
                count += 1
                if limit and count >= limit:
                    break

        self.kws["callback"].append(fn)
        return req

    def paginate(self, css: str, limit: int = 0, pass_meta=False, **kwargs):

0 Source : test_queue.py
with GNU General Public License v3.0
from wind2sing

async def pq_push_pop(q):
    """Exercise push/pop ordering on a priority queue, then a Request round trip.

    Four dummy tasks are pushed with priorities 3, 1, 3, 2. The first pop
    must carry task 3's value and the fourth pop task 1's value, leaving the
    queue empty. A real ``Request`` is then pushed, popped and fetched to
    check that it still carries its custom User-Agent header.

    Args:
        q: the queue under test; it is started, cleared and finally closed here.
    """
    await q.start()
    await q.clear()

    high_first = DummyTask("I am task 3.", priority=3)
    await q.push(high_first)
    lowest = DummyTask("I am task 1.", priority=1)
    await q.push(lowest)
    high_second = DummyTask("I am task 4.", priority=3)
    await q.push(high_second)
    middle = DummyTask("I am task 2.", priority=2)
    await q.push(middle)

    popped = await q.pop()
    assert popped.val == high_first.val
    # The two tasks in between are discarded without inspection.
    await q.pop()
    await q.pop()
    popped = await q.pop()
    assert popped.val == lowest.val
    remaining = await q.get_length()
    assert remaining == 0

    ua_headers = {"headers": {"User-Agent": "TestClient"}}
    request = Request(
        "http://httpbin.org/user-agent",
        request_config=ua_headers,
        priority=1,
    )
    await q.push(request)
    restored = await q.pop()
    response = await restored.fetch()
    assert "TestClient" in response.text
    await q.close()


@pytest.mark.asyncio

0 Source : test_request.py
with GNU General Public License v3.0
from wind2sing

async def test_send():
    """Fetch httpbin's ``/json`` sample document and compare the decoded body.

    This performs a real HTTP request to httpbin.org.
    """
    rq1 = Request("https://httpbin.org/json")
    resp1 = await rq1.send()
    # The sample payload embeds literal <em>...</em> tags. The expected
    # strings previously had stray spaces inside the tags ("<  em>", "< /em>"),
    # which can never equal the served document.
    assert resp1.json == {
        "slideshow": {
            "author": "Yours Truly",
            "date": "date of publication",
            "slides": [
                {"title": "Wake up to WonderWidgets!", "type": "all"},
                {
                    "items": [
                        "Why <em>WonderWidgets</em> are great",
                        "Who <em>buys</em> WonderWidgets",
                    ],
                    "title": "Overview",
                    "type": "all",
                },
            ],
            "title": "Sample Slide Show",
        }
    }


@pytest.mark.asyncio

0 Source : test_request.py
with GNU General Public License v3.0
from wind2sing

async def test_dumps():
    """A request that has already been sent must still pickle to non-empty bytes."""
    request = Request("https://httpbin.org/json")
    await request.send()
    payload = pickle.dumps(request)
    assert payload