feedsearch_crawler.crawler.request.Request

Here are examples of the Python API feedsearch_crawler.crawler.request.Request taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.

1 Example

Source: crawler.py
with MIT License
from DBeath

    async def follow(
        self,
        url: Union[str, URL],
        callback=None,
        response: Response = None,
        method: str = "GET",
        delay: Union[float, None] = None,
        priority: int = 0,
        allow_domain: bool = False,
        cb_kwargs: Dict = None,
        max_content_length: int = None,
        timeout: float = None,
        retries: int = None,
        **kwargs,
    ) -> Union[Request, None]:
        """
        Follow a URL by creating an HTTP Request.

        If the URL is not absolute then it is joined with the previous Response URL.
        The previous Response history is copied to the Request.

        Before a Request is followed, first check that the Request URL has not already been seen,
        that the max URL depth has not been reached, and that the URI scheme is allowed.

        These checks are performed before the Request is created so that we don't yield multiple Requests
        for the same URL to the queue for further processing. We want to stop duplicate and invalid
        Requests as early as possible.

        :param url: URL to follow.
        :param callback: Callback method to run if the Request is successful.
        :param response: Previous Response that contained the Request URL.
        :param method: HTTP method for the Request.
        :param delay: Optionally override the default delay for the Request.
        :param priority: Optionally override the default priority of the Request.
        :param allow_domain: Optionally override the allowed domains check.
        :param cb_kwargs: Optional dictionary of keyword arguments to be passed to the callback function.
        :param max_content_length: Optionally override the maximum allowed size in bytes of the Response body.
        :param timeout: Optionally override the Request timeout.
        :param retries: Optionally override the number of Request retries.
        :param kwargs: Optional Request keyword arguments. See Request for details.
        :return: The created Request, or None if the URL fails any of the checks above.
        """
        original_url = copy.copy(url)
        if isinstance(url, str):
            url = parse_href_to_url(url)

        if not url:
            logger.warning("Attempted to follow invalid URL: %s", original_url)
            return

        history = []
        if response:
            # Join the URL to the Response URL if it doesn't contain a domain.
            if not url.is_absolute() or not url.scheme:
                url = coerce_url(
                    response.origin.join(url), default_scheme=response.scheme
                )

            # Restrict the depth of the Request chain to the maximum depth.
            # This check happens before the URL duplicate check, so the URL is not marked as seen
            # and might still be reachable via another, shorter path.
            if self.max_depth and len(response.history) >= self.max_depth:
                logger.debug("Max Depth of '%d' reached: %s", self.max_depth, url)
                return

            # Copy the Response history so that it isn't a reference to a mutable object.
            history = copy.deepcopy(response.history)
        else:
            if not url.is_absolute():
                logger.debug("URL should have domain: %s", url)
                return

            if not url.scheme:
                url = coerce_url(url)

        # The URL scheme must be in the list of allowed schemes.
        if self.allowed_schemes and url.scheme not in self.allowed_schemes:
            logger.debug("URI Scheme '%s' not allowed: %s", url.scheme, url)
            return

        # The URL host must be in the list of allowed domains.
        if not allow_domain and not self.is_allowed_domain(url):
            logger.debug("Domain '%s' not allowed: %s", url.host, url)
            return

        # Check if URL is not already seen, and add it to the duplicate filter seen list.
        if await self._duplicate_filter.url_seen(url, method):
            return

        request = Request(
            url=url,
            request_session=self._session,
            history=history,
            callback=callback,
            xml_parser=self.parse_xml,
            max_content_length=max_content_length or self.max_content_length,
            timeout=timeout or self.request_timeout,
            method=method,
            delay=delay if isinstance(delay, float) else self.delay,
            retries=retries or self.max_retries,
            cb_kwargs=cb_kwargs,
            **kwargs,
        )

        # Override the Request priority only if a non-zero priority was provided.
        if priority:
            request.priority = priority

        return request

    @abstractmethod
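
For reference, below is a minimal sketch of constructing a Request directly, using only keyword arguments that the follow method above passes to it. The aiohttp session, the yarl URL, the callback, and the concrete values are illustrative assumptions; the sketch also assumes that arguments omitted here (such as xml_parser, delay, and cb_kwargs) have defaults, which the excerpt does not confirm. It only builds the Request object; queueing and fetching are handled by the crawler.

import asyncio

import aiohttp
from yarl import URL

from feedsearch_crawler.crawler.request import Request


async def print_response(response):
    # Hypothetical callback: the excerpt only shows that a callback is
    # forwarded to the Request, not what arguments it receives.
    print(response)


async def main():
    # Assumption: request_session is an aiohttp ClientSession, mirroring
    # the crawler's own self._session in the follow method above.
    async with aiohttp.ClientSession() as session:
        request = Request(
            url=URL("https://example.com/feed.xml"),
            request_session=session,
            history=[],                # follow copies this from the previous Response
            callback=print_response,   # run if the Request is successful
            method="GET",
            timeout=5.0,               # example values for the overrides shown in follow
            max_content_length=1024 * 1024,
            retries=3,
        )
        print(request)


asyncio.run(main())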