com.brucezee.jspider.Request

Here are examples of the Java API com.brucezee.jspider.Request, taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.

50 Examples 7

19 Source : WebDriverPool.java
with MIT License
from brucezee

/**
 * Releases a web driver after use: shuts it down when
 * {@code isWebDriverExpired} reports it as expired, otherwise hands it back
 * through {@code returnWebDriver}.
 *
 * @param webDriver    the driver being released
 * @param request      the request the driver was used for
 * @param expireMillis expiry threshold forwarded to {@code isWebDriverExpired}
 */
public void shutdownOrReturn(WebDriverEx webDriver, Request request, long expireMillis) {
    boolean expired = isWebDriverExpired(webDriver, expireMillis);
    if (!expired) {
        returnWebDriver(webDriver, request);
        return;
    }
    shutdownWebDriver(webDriver, request);
}

19 Source : WebDriverDownloader.java
with MIT License
from brucezee

/**
 * Builds the page returned when a download fails: the request URL with
 * status code 0 and {@code null} for the two remaining constructor arguments.
 *
 * @param request the request that could not be downloaded
 * @return a page carrying only the URL and a zero status code
 */
private Page processFailedPage(Request request) {
    String url = request.getUrl();
    return new Page(url, 0, null, null);
}

19 Source : NoRepeatScheduler.java
with MIT License
from brucezee

/**
 * Delegates the duplicate check to {@code repeatHandler}; without a handler
 * nothing is considered a duplicate.
 *
 * @param task    the task the request belongs to
 * @param request the request to check
 * @return the handler's verdict, or {@code false} when no handler is set
 */
@Override
public boolean isDuplicate(Task task, Request request) {
    return repeatHandler != null && repeatHandler.isDuplicate(task, request);
}

19 Source : NoRepeatScheduler.java
with MIT License
from brucezee

/**
 * Decides whether an empty poll should be handled, for example by adding new
 * requests obtained from {@code pagingRequestFactory}.
 *
 * @param task the crawler task
 * @return {@code true} if at least one request was pushed, {@code false} if
 *         nothing was done
 */
protected boolean handleEmptyPoll(Task task) {
    if (pagingRequestFactory == null) {
        return false;
    }
    // Acquire the lock BEFORE entering try: if lock() itself failed inside
    // the try block, the finally clause would call unlock() on a lock this
    // thread does not hold.
    lock.lock();
    try {
        List<Request> requests = pagingRequestFactory.getRequests(task);
        if (requests == null || requests.isEmpty()) {
            return false;
        }
        boolean success = false;
        for (Request request : requests) {
            // push(...) is on the left so every request is attempted even
            // after an earlier one has already succeeded.
            success = push(task, request) || success;
        }
        return success;
    } finally {
        lock.unlock();
    }
}

19 Source : NoRepeatScheduler.java
with MIT License
from brucezee

/**
 * Queues a request unless it is a duplicate.
 * <p>
 * When {@code shouldReserved} returns {@code true} the request is queued
 * without consulting or updating the duplicate check. Otherwise it is queued
 * and registered with the duplicate check only if {@code isDuplicate} returns
 * {@code false}.
 *
 * @param task    the task the request belongs to
 * @param request the request to queue
 * @return {@code true} if the request was queued, {@code false} if it was
 *         rejected as a duplicate
 */
@Override
public boolean push(Task task, Request request) {
    boolean forced = shouldReserved(task, request);
    if (!forced && isDuplicate(task, request)) {
        return false;
    }
    pushWhenNoRepeat(task, request);
    if (!forced) {
        addRepeatCheck(task, request);
    }
    return true;
}

19 Source : NoRepeatScheduler.java
with MIT License
from brucezee

/**
 * Fetches the next request for a task. If the first poll yields nothing,
 * {@code handleEmptyPoll} gets a chance to add requests, and when it reports
 * success the poll is retried once.
 *
 * @param task the task to poll for
 * @return the next request, or {@code null} when none is available
 */
@Override
public Request poll(Task task) {
    // First attempt.
    Request polled = doPoll(task);
    if (polled != null) {
        return polled;
    }
    // Nothing queued: refill if needed, then try once more.
    return handleEmptyPoll(task) ? doPoll(task) : null;
}

19 Source : NoRepeatScheduler.java
with MIT License
from brucezee

/**
 * Controls whether a request must be added unconditionally.
 * This implementation always answers {@code false}.
 *
 * @param task    the task
 * @param request the request
 * @return whether the request has to be force-added
 */
protected boolean shouldReserved(Task task, Request request) {
    return false;
}

19 Source : HttpClientFactory.java
with MIT License
from brucezee

/**
 * Creates the HTTP request by building the result of
 * {@code createRequestBuilder}.
 *
 * @param siteConfig site configuration forwarded to the builder
 * @param request    the request to convert
 * @param proxy      proxy host forwarded to the builder
 * @return the built {@code HttpUriRequest}
 */
public HttpUriRequest createHttpUriRequest(SiteConfig siteConfig, Request request, HttpHost proxy) {
    RequestBuilder builder = createRequestBuilder(siteConfig, request, proxy);
    return builder.build();
}

19 Source : HttpClientFactory.java
with MIT License
from brucezee

/**
 * Creates the request configuration by building the result of
 * {@code createRequestConfigBuilder}.
 *
 * @param siteConfig site configuration forwarded to the builder
 * @param request    the request forwarded to the builder
 * @param proxy      proxy host forwarded to the builder
 * @return the built {@code RequestConfig}
 */
public RequestConfig createRequestConfig(SiteConfig siteConfig, Request request, HttpHost proxy) {
    RequestConfig.Builder builder = createRequestConfigBuilder(siteConfig, request, proxy);
    return builder.build();
}

18 Source : WebDriverDownloader.java
with MIT License
from brucezee

/**
 * Wraps the driver's current state in a {@code Page} with status code 200:
 * headers derived from the driver's cookies plus the page source.
 *
 * @param request   the request that was loaded
 * @param webDriver the driver holding the loaded page
 * @return the resulting page
 */
private Page processPage(Request request, WebDriverEx webDriver) {
    Header[] cookieHeaders = getHeaderFromCookieSet(webDriver.manage().getCookies());
    String pageSource = webDriver.getPageSource();
    String url = request.getUrl();
    return new Page(url, 200, cookieHeaders, pageSource);
}

18 Source : DefaultWebDriverChooser.java
with MIT License
from brucezee

/**
 * Returns {@code driverType} for every request; the request itself is not
 * inspected.
 *
 * @param request ignored
 * @return the value of {@code driverType}
 */
@Override
public DriverType choose(Request request) {
    return this.driverType;
}

18 Source : RedisScheduler.java
with MIT License
from brucezee

/**
 * Serializes the request and RPUSHes it onto the task's queue key, using a
 * connection taken from {@code jedisPool} that is closed afterwards.
 *
 * @param task    the task whose queue receives the request
 * @param request the request to enqueue
 */
@Override
protected void pushWhenNoRepeat(Task task, Request request) {
    Jedis connection = jedisPool.getResource();
    try {
        String queueKey = RedisKeys.getQueueKey(task);
        String payload = serializer.serialize(request);
        connection.rpush(queueKey, payload);
    } finally {
        connection.close();
    }
}

18 Source : RequestJsonSerializer.java
with MIT License
from brucezee

/**
 * Serializes a request through {@code JSON.toJSONString}.
 *
 * @param object the request to serialize
 * @return the string produced by {@code JSON.toJSONString}
 */
@Override
public String serialize(Request object) {
    String json = JSON.toJSONString(object);
    return json;
}

18 Source : NoRepeatScheduler.java
with MIT License
from brucezee

/**
 * Forwards the reset to {@code repeatHandler}; does nothing when no handler
 * is set.
 *
 * @param task    the task the request belongs to
 * @param request the request whose duplicate record is reset
 */
@Override
public void resetRequestRepeatCheck(Task task, Request request) {
    if (repeatHandler == null) {
        return;
    }
    repeatHandler.resetRequestRepeatCheck(task, request);
}

18 Source : NoRepeatScheduler.java
with MIT License
from brucezee

/**
 * Forwards the registration to {@code repeatHandler}; does nothing when no
 * handler is set.
 *
 * @param task    the task the request belongs to
 * @param request the request to register with the duplicate check
 */
@Override
public void addRepeatCheck(Task task, Request request) {
    if (repeatHandler == null) {
        return;
    }
    repeatHandler.addRepeatCheck(task, request);
}

18 Source : UniversalSubPageProcessor.java
with MIT License
from brucezee

/**
 * Matches every request.
 *
 * @param request ignored
 * @return always {@code true}
 */
@Override
public boolean isMatch(Request request) {
    return true;
}

18 Source : DefaultHttpProxyPool.java
with MIT License
from brucezee

/**
 * Lets {@code proxyStrategy} pick a proxy from {@code httpProxies}; the
 * request is not consulted.
 *
 * @param request ignored
 * @return the proxy chosen by the strategy
 */
@Override
public HttpProxy getProxy(Request request) {
    HttpProxy selected = proxyStrategy.getProxy(httpProxies);
    return selected;
}

18 Source : DefaultHttpProxyPool.java
with MIT License
from brucezee

/**
 * No-op: this implementation does nothing when a proxy is returned.
 *
 * @param request    unused
 * @param statusCode unused
 */
@Override
public void returnProxy(Request request, int statusCode) {
}

17 Source : WebDriverPool.java
with MIT License
from brucezee

/**
 * Puts a web driver back into the queue that belongs to the driver type
 * chosen for the request. Does nothing when no queue exists for that type.
 * <p>
 * {@code queue.put} may block; if the thread is interrupted while waiting,
 * the driver is not queued and the interrupt status is restored so callers
 * further up the stack can still observe the interruption.
 *
 * @param webDriver the driver to return
 * @param request   the request used to choose the driver type
 */
public void returnWebDriver(WebDriverEx webDriver, Request request) {
    DriverType driverType = chooser.choose(request);
    BlockingQueue<WebDriverEx> queue = queueMap.get(driverType);
    if (queue == null) {
        return;
    }
    try {
        queue.put(webDriver);
    } catch (InterruptedException e) {
        // Catching InterruptedException clears the flag; set it again
        // instead of silently swallowing the interruption.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
}

17 Source : WebDriverPool.java
with MIT License
from brucezee

/**
 * Shuts a web driver down and, when the queue for its driver type is a
 * {@code LandlordBlockingQueue}, calls {@code resetOne()} on that queue.
 * Does nothing when no queue exists for the chosen driver type.
 *
 * @param webDriver the driver to shut down
 * @param request   the request used to choose the driver type
 */
public void shutdownWebDriver(WebDriverEx webDriver, Request request) {
    BlockingQueue<WebDriverEx> queue = queueMap.get(chooser.choose(request));
    if (queue == null) {
        return;
    }
    webDriver.shutdown();
    if (queue instanceof LandlordBlockingQueue) {
        LandlordBlockingQueue<?> landlordQueue = (LandlordBlockingQueue<?>) queue;
        landlordQueue.resetOne();
    }
}

17 Source : BloomFilterRepeatHandler.java
with MIT License
from brucezee

/**
 * Not supported by this handler.
 *
 * @param task    unused
 * @param request unused
 * @throws UnsupportedOperationException always
 */
@Override
public void resetRequestRepeatCheck(Task task, Request request) {
    throw new UnsupportedOperationException();
}

17 Source : UrlMatchSubPageProcessor.java
with MIT License
from brucezee

/**
 * Tests whether the whole request URL matches {@code pattern}.
 *
 * @param request the request whose URL is tested
 * @return {@code true} if the entire URL matches the pattern
 */
@Override
public boolean isMatch(Request request) {
    String url = request.getUrl();
    return pattern.matcher(url).matches();
}

17 Source : CompositePageProcessor.java
with MIT License
from brucezee

/**
 * Hands the page to the first sub processor whose {@code isMatch} accepts
 * the request.
 *
 * @param request the request that produced the page
 * @param page    the downloaded page
 * @return the result of the first matching sub processor
 * @throws IllegalArgumentException if no sub processor matches the request
 */
@Override
public Result process(Request request, Page page) {
    for (SubPageProcessor candidate : processors) {
        if (!candidate.isMatch(request)) {
            continue;
        }
        return candidate.process(request, page);
    }
    throw new IllegalArgumentException("No sub page processor can process request " + request);
}

17 Source : CompositePipeline.java
with MIT License
from brucezee

/**
 * Passes the result to every sub pipeline whose {@code isMatch} accepts the
 * request; non-matching pipelines are skipped.
 *
 * @param request the request that produced the result
 * @param result  the result to persist
 */
@Override
public void persist(Request request, Result result) {
    for (SubPipeline candidate : pipelines) {
        if (!candidate.isMatch(request)) {
            continue;
        }
        candidate.persist(request, result);
    }
}

17 Source : HttpClientFactory.java
with MIT License
from brucezee

/**
 * Creates a {@code RequestConfig.Builder} populated from the site
 * configuration (timeouts, redirect settings, cookie spec) and the given
 * proxy. The {@code request} argument is not read here.
 *
 * @param siteConfig source of the timeout, redirect and cookie settings
 * @param request    unused in this method
 * @param proxy      proxy host to set on the builder
 * @return the populated builder
 */
public RequestConfig.Builder createRequestConfigBuilder(SiteConfig siteConfig, Request request, HttpHost proxy) {
    return RequestConfig.custom()
            .setConnectTimeout(siteConfig.getConnectTimeout())
            .setSocketTimeout(siteConfig.getSocketTimeout())
            .setRedirectsEnabled(siteConfig.isRedirectsEnabled())
            .setConnectionRequestTimeout(siteConfig.getConnectionRequestTimeout())
            .setCircularRedirectsAllowed(siteConfig.isCircularRedirectsAllowed())
            .setMaxRedirects(siteConfig.getMaxRedirects())
            .setCookieSpec(siteConfig.getCookieSpec())
            .setProxy(proxy);
}

17 Source : DefaultHttpClientPool.java
with MIT License
from brucezee

/**
 * Computes the cache key for an HTTP client: the result of
 * {@code SpiderUrlUtils.getUrlHost} for the request URL. The site
 * configuration is not read here.
 *
 * @param siteConfig unused in this method
 * @param request    the request whose URL supplies the key
 * @return the cache key
 */
protected String getHttpClientCacheKey(SiteConfig siteConfig, Request request) {
    String url = request.getUrl();
    return SpiderUrlUtils.getUrlHost(url);
}

16 Source : WebDriverDownloader.java
with MIT License
from brucezee

/**
 * Downloads a page by driving a pooled web driver.
 * <p>
 * If no driver can be obtained, the failed-page placeholder is returned. A
 * failure while loading the URL or waiting for the response is logged and the
 * page is still built from whatever the driver currently holds. The driver is
 * always handed back to the pool (or shut down if expired) afterwards.
 *
 * @param siteConfig site configuration passed to the pool and the waiter
 * @param request    the request to download
 * @return the processed page, or the failed-page placeholder on error
 */
@Override
public Page download(SiteConfig siteConfig, Request request) {
    WebDriverEx webDriver = null;
    try {
        webDriver = webDriverPool.getWebDriver(siteConfig, driverConfig, request);
    } catch (Exception e) {
        if (e instanceof InterruptedException) {
            // The broad catch would otherwise swallow the interruption.
            Thread.currentThread().interrupt();
        }
        // The exception is passed as the trailing argument WITHOUT its own
        // placeholder so the logging backend records the full stack trace.
        logger.error("Failed to get web driver from pool, url : {}", request.getUrl(), e);
    }
    if (webDriver == null) {
        return processFailedPage(request);
    }
    try {
        webDriver.get(request.getUrl());
        requestWaiter.waitResponse(siteConfig, request, webDriver);
    } catch (Exception e) {
        logger.error("Failed to request by web driver, url : {}", request.getUrl(), e);
    }
    try {
        return processPage(request, webDriver);
    } catch (Exception e) {
        logger.error("Failed to process page by web driver, url : {}", request.getUrl(), e);
        return processFailedPage(request);
    } finally {
        webDriverPool.shutdownOrReturn(webDriver, request, driverConfig.getExpiresMillis());
    }
}

16 Source : DefaultRequestWaiter.java
with MIT License
from brucezee

/**
 * Waits for the page to load by calling the driver's
 * {@code waitWithTitleAndDelayed} with a {@code null} title, the site's
 * socket timeout and a value of 1000.
 * <p>
 * NOTE(review): the method name was restored from the text-mangled
 * {@code waitWithreplacedleAndDelayed}; confirm against {@code WebDriverEx}.
 *
 * @param siteConfig supplies the socket timeout
 * @param request    unused in this method
 * @param webDriver  the driver to wait on
 */
@Override
public void waitResponse(SiteConfig siteConfig, Request request, WebDriverEx webDriver) {
    webDriver.waitWithTitleAndDelayed(null, siteConfig.getSocketTimeout(), 1000);
}

16 Source : ShardedRedisScheduler.java
with MIT License
from brucezee

/**
 * Serializes the request and RPUSHes it onto the task's queue key, using a
 * sharded connection taken from {@code jedisPool} that is closed afterwards.
 *
 * @param task    the task whose queue receives the request
 * @param request the request to enqueue
 */
@Override
protected void pushWhenNoRepeat(Task task, Request request) {
    ShardedJedis connection = jedisPool.getResource();
    try {
        String queueKey = RedisKeys.getQueueKey(task);
        String payload = serializer.serialize(request);
        connection.rpush(queueKey, payload);
    } finally {
        connection.close();
    }
}

16 Source : RedisPriorityScheduler.java
with MIT License
from brucezee

/**
 * Stores the serialized request in Redis according to its priority:
 * positive priorities go into the "plus" sorted set, negative ones into the
 * "minus" sorted set (both scored by the priority), and priority 0 is
 * RPUSHed onto the no-priority list. The request key is then added to the
 * task's set. The connection is closed afterwards.
 *
 * @param task    the task owning the Redis keys
 * @param request the request to enqueue
 */
@Override
protected void pushWhenNoRepeat(Task task, Request request) {
    Jedis connection = jedisPool.getResource();
    try {
        String payload = serializer.serialize(request);
        int priority = request.getPriority();
        if (priority > 0) {
            connection.zadd(RedisKeys.getZsetPlusPriorityKey(task), priority, payload);
        } else if (priority < 0) {
            connection.zadd(RedisKeys.getZsetMinusPriorityKey(task), priority, payload);
        } else {
            connection.rpush(RedisKeys.getQueueNoPriorityKey(task), payload);
        }
        connection.sadd(RedisKeys.getSetKey(task), request.key());
    } finally {
        connection.close();
    }
}

16 Source : QueuePriorityScheduler.java
with MIT License
from brucezee

/**
 * Polls the three queues in a fixed order and returns the first request
 * found: {@code priorityQueuePlus}, then {@code noPriorityQueue}, then
 * {@code priorityQueueMinus}.
 *
 * @param task unused in this method
 * @return the next request, or {@code null} when all three queues are empty
 */
@Override
public Request doPoll(Task task) {
    Request next = priorityQueuePlus.poll();
    if (next == null) {
        next = noPriorityQueue.poll();
    }
    if (next == null) {
        next = priorityQueueMinus.poll();
    }
    return next;
}

16 Source : HashSetRepeatHandler.java
with MIT License
from brucezee

/**
 * Reports whether the request key is already present in {@code urls}.
 *
 * @param task    unused in this method
 * @param request the request whose key is looked up
 * @return {@code true} if the key is contained in {@code urls}
 */
@Override
public boolean isDuplicate(Task task, Request request) {
    boolean alreadySeen = urls.contains(request.key());
    return alreadySeen;
}

16 Source : BloomFilterRepeatHandler.java
with MIT License
from brucezee

/**
 * Records the request key in {@code bloomFilter}.
 *
 * @param task    unused in this method
 * @param request the request whose key is recorded
 */
@Override
public void addRepeatCheck(Task task, Request request) {
    bloomFilter.put(request.key());
}

16 Source : LogPipeline.java
with MIT License
from brucezee

/**
 * Logs the request URL and its result at debug level; nothing is stored.
 *
 * @param request the request whose URL is logged
 * @param result  the result to log
 */
@Override
public void persist(Request request, Result result) {
    String url = request.getUrl();
    logger.debug("Request: {} result: {}", url, result);
}

16 Source : ConsolePipeline.java
with MIT License
from brucezee

/**
 * Prints the request URL and its result to standard output; nothing is
 * stored.
 *
 * @param request the request whose URL is printed
 * @param result  the result to print
 */
@Override
public void persist(Request request, Result result) {
    String line = "Request: " + request.getUrl() + " result: " + result;
    System.out.println(line);
}

16 Source : MonitorSpiderListener.java
with MIT License
from brucezee

/**
 * Counts a failed request by incrementing {@code errorCount}.
 *
 * @param request unused in this method
 * @param page    unused in this method
 */
@Override
public void onError(Request request, Page page) {
    errorCount.incrementAndGet();
}

16 Source : HttpClientExecutor.java
with MIT License
from brucezee

/**
 * 请求执行器
 * Created by brucezee on 2017/1/6.
 */
public clreplaced HttpClientExecutor {

    private HttpClientPool httpClientPool;

    private HttpProxyPool httpProxyPool;

    private CookieStorePool cookieStorePool;

    private SiteConfig siteConfig;

    private Request request;

    public HttpClientExecutor(HttpClientPool httpClientPool, HttpProxyPool httpProxyPool, CookieStorePool cookieStorePool, SiteConfig siteConfig, Request request) {
        this.httpClientPool = httpClientPool;
        this.httpProxyPool = httpProxyPool;
        this.cookieStorePool = cookieStorePool;
        this.siteConfig = siteConfig;
        this.request = request;
    }

    public <T> Response<T> execute() {
        HttpProxy httpProxy = getHttpProxyFromPool();
        CookieStore cookieStore = getCookieStoreFromPool();
        CloseableHttpClient httpClient = httpClientPool.getHttpClient(siteConfig, request);
        HttpUriRequest httpRequest = httpClientPool.createHttpUriRequest(siteConfig, request, createHttpHost(httpProxy));
        CloseableHttpResponse httpResponse = null;
        IOException executeException = null;
        try {
            HttpContext httpContext = createHttpContext(httpProxy, cookieStore);
            httpResponse = httpClient.execute(httpRequest, httpContext);
        } catch (IOException e) {
            executeException = e;
        }
        Response<T> response = ResponseFactory.createResponse(request.getResponseType(), siteConfig.getCharset(request.getUrl()));
        response.handleHttpResponse(httpResponse, executeException);
        return response;
    }

    private HttpProxy getHttpProxyFromPool() {
        return httpProxyPool != null ? httpProxyPool.getProxy(request) : null;
    }

    private CookieStore getCookieStoreFromPool() {
        return cookieStorePool != null ? cookieStorePool.getCookieStore(request) : null;
    }

    private HttpHost createHttpHost(HttpProxy httpProxy) {
        return httpProxy != null ? new HttpHost(httpProxy.getHost(), httpProxy.getPort()) : null;
    }

    protected HttpContext createHttpContext(HttpProxy httpProxy, CookieStore cookieStore) {
        HttpContext httpContext = new HttpClientContext();
        if (cookieStore != null) {
            httpContext.setAttribute(HttpClientContext.COOKIE_STORE, cookieStore);
        }
        if (httpProxy != null && StringUtils.isNotBlank(httpProxy.getUsername())) {
            CredentialsProvider credentialsProvider = new BasicCredentialsProvider();
            credentialsProvider.setCredentials(new AuthScope(httpProxy.getHost(), httpProxy.getPort()), new UsernamePreplacedwordCredentials(httpProxy.getUsername(), httpProxy.getPreplacedword()));
            httpContext.setAttribute(HttpClientContext.CREDS_PROVIDER, credentialsProvider);
        }
        return httpContext;
    }
}

16 Source : DefaultHttpClientPool.java
with MIT License
from brucezee

/**
 * Delegates creation of the HTTP request to {@code factory}.
 *
 * @param siteConfig site configuration forwarded to the factory
 * @param request    the request to convert
 * @param proxy      proxy host forwarded to the factory
 * @return the request created by the factory
 */
@Override
public HttpUriRequest createHttpUriRequest(SiteConfig siteConfig, Request request, HttpHost proxy) {
    HttpUriRequest created = factory.createHttpUriRequest(siteConfig, request, proxy);
    return created;
}

16 Source : BdbPersistentScheduler.java
with MIT License
from brucezee

/**
 * Adds the request to {@code queue} and increments {@code count}. No
 * duplicate check is performed in this method.
 *
 * @param task    unused in this method
 * @param request the request to enqueue
 * @return always {@code true}
 */
@Override
public boolean push(Task task, Request request) {
    // Enqueue first, then account for the new element.
    queue.add(request);
    count.incrementAndGet();

    return true;
}

15 Source : QueueScheduler.java
with MIT License
from brucezee

/**
 * Adds the request to {@code queue} and increments {@code count}.
 *
 * @param task    unused in this method
 * @param request the request to enqueue
 */
@Override
public void pushWhenNoRepeat(Task task, Request request) {
    // Enqueue first, then account for the new element.
    queue.add(request);

    count.incrementAndGet();
}

15 Source : HashSetRepeatHandler.java
with MIT License
from brucezee

/**
 * Records the request key in {@code urls}.
 *
 * @param task    unused in this method
 * @param request the request whose key is recorded
 */
@Override
public void addRepeatCheck(Task task, Request request) {
    urls.add(request.key());
}

15 Source : HashSetRepeatHandler.java
with MIT License
from brucezee

/**
 * Removes the request key from {@code urls}.
 *
 * @param task    unused in this method
 * @param request the request whose key is removed
 */
@Override
public void resetRequestRepeatCheck(Task task, Request request) {
    urls.remove(request.key());
}

15 Source : BloomFilterRepeatHandler.java
with MIT License
from brucezee

/**
 * Asks {@code bloomFilter} whether it might contain the request key.
 *
 * @param task    unused in this method
 * @param request the request whose key is tested
 * @return the result of {@code bloomFilter.mightContain}
 */
@Override
public boolean isDuplicate(Task task, Request request) {
    boolean possiblySeen = bloomFilter.mightContain(request.key());
    return possiblySeen;
}

15 Source : MonitorSpiderListener.java
with MIT License
from brucezee

/**
 * Counts a successful request by incrementing {@code successCount}.
 *
 * @param request unused in this method
 * @param page    unused in this method
 * @param result  unused in this method
 */
@Override
public void onSuccess(Request request, Page page, Result result) {
    successCount.incrementAndGet();
}

14 Source : ShardedRedisPriorityScheduler.java
with MIT License
from brucezee

/**
 * Stores the serialized request in Redis according to its priority:
 * positive priorities go into the "plus" sorted set, negative ones into the
 * "minus" sorted set (both scored by the priority), and priority 0 is
 * RPUSHed onto the no-priority list. The request key is then added to the
 * task's set. The sharded connection is closed afterwards.
 *
 * @param task    the task owning the Redis keys
 * @param request the request to enqueue
 */
@Override
protected void pushWhenNoRepeat(Task task, Request request) {
    ShardedJedis connection = jedisPool.getResource();
    try {
        String payload = serializer.serialize(request);
        int priority = request.getPriority();
        if (priority > 0) {
            connection.zadd(RedisKeys.getZsetPlusPriorityKey(task), priority, payload);
        } else if (priority < 0) {
            connection.zadd(RedisKeys.getZsetMinusPriorityKey(task), priority, payload);
        } else {
            connection.rpush(RedisKeys.getQueueNoPriorityKey(task), payload);
        }
        connection.sadd(RedisKeys.getSetKey(task), request.key());
    } finally {
        connection.close();
    }
}

14 Source : QueuePriorityScheduler.java
with MIT License
from brucezee

/**
 * Adds the request to the queue that matches the sign of its priority
 * (positive: {@code priorityQueuePlus}, negative: {@code priorityQueueMinus},
 * zero: {@code noPriorityQueue}) and increments {@code count}.
 *
 * @param task    unused in this method
 * @param request the request to enqueue
 */
@Override
public void pushWhenNoRepeat(Task task, Request request) {
    int priority = request.getPriority();
    if (priority > 0) {
        priorityQueuePlus.add(request);
    } else if (priority < 0) {
        priorityQueueMinus.add(request);
    } else {
        noPriorityQueue.add(request);
    }
    count.incrementAndGet();
}

14 Source : DefaultHttpClientPool.java
with MIT License
from brucezee

/**
 * Returns the HTTP client cached under the key from
 * {@code getHttpClientCacheKey}, creating and caching one through
 * {@code factory} on first use. The cache is checked once without the
 * monitor and again while synchronized on {@code this} before a client is
 * created.
 *
 * @param siteConfig site configuration used for the key and for client creation
 * @param request    the request used to compute the cache key
 * @return the cached or newly created client
 */
@Override
public CloseableHttpClient getHttpClient(SiteConfig siteConfig, Request request) {
    String cacheKey = getHttpClientCacheKey(siteConfig, request);
    CloseableHttpClient client = httpClients.get(cacheKey);
    if (client != null) {
        return client;
    }
    synchronized (this) {
        // Re-check: another thread may have created the client meanwhile.
        client = httpClients.get(cacheKey);
        if (client == null) {
            client = factory.createHttpClient(siteConfig);
            httpClients.put(cacheKey, client);
        }
        return client;
    }
}

13 Source : WebDriverPool.java
with MIT License
from brucezee

/**
 * Obtains a web driver for the driver type chosen for the request.
 * <p>
 * The queue for that type is created under {@code lock} on first use. An
 * idle driver is returned immediately when one is queued. Otherwise, if the
 * queue reports {@code isNeedMore()}, a new driver is created and added to
 * the queue, and the method then waits up to the site's connection-request
 * timeout (in milliseconds) for a driver.
 *
 * @param siteConfig   site configuration used for driver creation and the wait timeout
 * @param driverConfig driver configuration used for driver creation
 * @param request      the request used to choose the driver type
 * @return a driver, or {@code null} if none became available within the timeout
 * @throws IOException          declared by the signature; its source is not
 *                              visible in this method (presumably driver creation)
 * @throws InterruptedException if interrupted while acquiring the lock or waiting
 */
public WebDriverEx getWebDriver(SiteConfig siteConfig, DriverConfig driverConfig, Request request) throws IOException, InterruptedException {
    DriverType driverType = chooser.choose(request);
    LandlordBlockingQueue<WebDriverEx> queue = queueMap.get(driverType);
    if (queue == null) {
        lock.lockInterruptibly();
        try {
            // Re-check under the lock before creating the queue.
            queue = queueMap.get(driverType);
            if (queue == null) {
                queue = new LandlordBlockingQueue<WebDriverEx>(capacity);
                queueMap.put(driverType, queue);
            }
        } finally {
            lock.unlock();
        }
    }

    WebDriverEx idle = queue.poll();
    if (idle != null) {
        return idle;
    }

    if (queue.isNeedMore()) {
        queue.add(factory.createWebDriver(siteConfig, driverConfig, driverType));
    }
    return queue.poll(siteConfig.getConnectionRequestTimeout(), TimeUnit.MILLISECONDS);
}

13 Source : HttpClientDownloader.java
with MIT License
from brucezee

/**
 * Downloads a page through an {@code HttpClientExecutor}. An exception or a
 * non-success response is logged as an error; in every case a {@code Page}
 * is built from the response's status code, headers and result.
 *
 * @param siteConfig site configuration passed to the executor
 * @param request    the request to download
 * @return the page built from the response
 */
@Override
public Page download(SiteConfig siteConfig, Request request) {
    Response response = new HttpClientExecutor(httpClientPool, httpProxyPool, cookieStorePool, siteConfig, request).execute();

    boolean failedWithException = response.isException();
    if (failedWithException) {
        logger.error("download exception, url : {} {}", request.getUrl(), response.getException().getMessage());
    }
    if (!failedWithException && !response.isSuccess()) {
        logger.error("download failed, url : {}", request.getUrl());
    }

    return new Page(request.getUrl(), response.getStatusCode(), response.getHeaders(), response.getResult());
}

5 Source : HttpClientFactory.java
with MIT License
from brucezee

/**
 * Creates a {@code RequestBuilder} for the request: method, request config,
 * charset, URI, entity, parameters and headers.
 * <p>
 * Headers are the site headers overlaid with the request headers (a request
 * header replaces a site header of the same name). The merge is done on a
 * copy, so the map returned by {@code siteConfig.getHeaders()} is never
 * modified and request-specific headers cannot leak into other requests.
 * {@code null} parameter and header values are sent as empty strings.
 * <p>
 * NOTE(review): {@code setEntity}/{@code entity()} were restored from the
 * text-mangled {@code setEnreplacedy}/{@code enreplacedy()}; confirm against
 * {@code Request}.
 *
 * @param siteConfig site configuration (charset, headers, request config)
 * @param request    the request to convert
 * @param proxy      proxy host forwarded to {@code createRequestConfig}
 * @return the populated builder
 */
public RequestBuilder createRequestBuilder(SiteConfig siteConfig, Request request, HttpHost proxy) {
    RequestConfig requestConfig = createRequestConfig(siteConfig, request, proxy);
    RequestBuilder requestBuilder = RequestBuilder.create(request.getMethod());
    requestBuilder.setConfig(requestConfig);
    requestBuilder.setCharset(getDefaultCharset(siteConfig.getCharset(request.getUrl())));
    requestBuilder.setUri(request.getUrl());
    requestBuilder.setEntity(request.entity());

    Map<String, String> parameters = request.getParameters();
    if (parameters != null) {
        for (Map.Entry<String, String> entry : parameters.entrySet()) {
            requestBuilder.addParameter(entry.getKey(), StringUtils.defaultString(entry.getValue()));
        }
    }

    Map<String, String> siteHeaders = siteConfig.getHeaders();
    Map<String, String> requestHeaders = request.getHeaders();
    Map<String, String> mergedHeaders;
    if (siteHeaders != null && requestHeaders != null) {
        // Merge into a fresh map: calling putAll on siteHeaders would write
        // this request's headers into the site configuration.
        mergedHeaders = new java.util.LinkedHashMap<String, String>(siteHeaders);
        mergedHeaders.putAll(requestHeaders);
    } else {
        mergedHeaders = siteHeaders != null ? siteHeaders : requestHeaders;
    }
    if (mergedHeaders != null) {
        for (Map.Entry<String, String> entry : mergedHeaders.entrySet()) {
            requestBuilder.addHeader(entry.getKey(), StringUtils.defaultString(entry.getValue()));
        }
    }
    return requestBuilder;
}