JAVA——基于HttpComponents(HttpClient)的简单网络爬虫DEMO
作者:互联网
基本概念
HttpComponents(HttpClient):
超文本传输协议(HTTP)可能是当今Internet上使用的最重要的协议。Web服务,支持网络的设备和网络计算的增长继续将HTTP协议的作用扩展到用户驱动的Web浏览器之外,同时增加了需要HTTP支持的应用程序的数量。
HttpComponents是为扩展而设计的,同时提供了对基本HTTP协议的强大支持,适用于构建HTTP感知的客户端和服务器应用程序,例如Web浏览器、Web Spider、HTTP代理、Web服务传输库,或利用、扩展HTTP协议以进行分布式通信的系统。
官网
Maven
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpcore -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.10</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.6</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-collections4 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-collections4</artifactId>
<version>4.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
源代码
HTTPClientPool
package club.zstuca.httpclient;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.ssl.TrustStrategy;
/**
 * Holder of a pooled HTTP/HTTPS connection manager whose HTTPS side ignores
 * certificates: every certificate chain is trusted and host names are not
 * verified.
 */
public class HTTPClientPool {
    private static final String HTTP = "http";
    private static final String HTTPS = "https";
    // Socket factory used for the "https" scheme.
    private static SSLConnectionSocketFactory sslConnectionSocketFactory = null;
    // Connection pool manager handed to every client built by getHttpClient().
    private static PoolingHttpClientConnectionManager poolingHttpClientConnectionManager = null;
    // Builder of the SSL context used for HTTPS connections.
    private static SSLContextBuilder sslContextBuilder = null;

    static {
        try {
            // Trust all sites: the strategy answers true for any chain.
            TrustStrategy trustEverything = (chain, authType) -> true;
            sslContextBuilder = new SSLContextBuilder().loadTrustMaterial(null, trustEverything);

            // Only TLSv1.2 is enabled; names such as "SSLv2Hello", "SSLv3", "TLSv1" are not listed.
            String[] enabledProtocols = {"TLSv1.2"};
            sslConnectionSocketFactory = new SSLConnectionSocketFactory(
                    sslContextBuilder.build(),
                    enabledProtocols,
                    null,
                    NoopHostnameVerifier.INSTANCE);

            RegistryBuilder<ConnectionSocketFactory> schemes =
                    RegistryBuilder.<ConnectionSocketFactory>create();
            schemes.register(HTTP, new PlainConnectionSocketFactory());
            schemes.register(HTTPS, sslConnectionSocketFactory);
            Registry<ConnectionSocketFactory> socketFactories = schemes.build();

            poolingHttpClientConnectionManager =
                    new PoolingHttpClientConnectionManager(socketFactories);
            poolingHttpClientConnectionManager.setMaxTotal(200);
        } catch (NoSuchAlgorithmException | KeyStoreException | KeyManagementException e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds a client on top of the shared connection manager.
     *
     * <p>Each client gets its own {@link BasicCookieStore} and a fixed
     * browser-like User-Agent. The connection manager is marked as shared.
     *
     * @return a newly built client
     * @throws Exception declared by the signature; callers must handle it
     */
    public static CloseableHttpClient getHttpClient() throws Exception {
        String browserUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36";
        return HttpClients.custom()
                .setSSLSocketFactory(sslConnectionSocketFactory)
                .setConnectionManager(poolingHttpClientConnectionManager)
                .setConnectionManagerShared(true)
                .setDefaultCookieStore(new BasicCookieStore())
                .setUserAgent(browserUserAgent)
                .build();
    }
}
Web Crawler
package club.zstuca.httpclient;
import org.apache.http.*;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.StringUtils;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Map;
/**
 * Utility for sending HTTP/HTTPS GET and POST requests and reading the
 * response body as a string.
 *
 * <p>Clients come from {@link HTTPClientPool}. The client, request and response
 * of each call live in local variables, so one call cannot see or close the
 * objects of another.
 *
 * <p>The public methods never throw: a blank URL, an unsupported request type,
 * a failed request or a non-200 response all yield an empty string. Failures
 * are reported through the class logger.
 */
public class HTTPClientUtil {
    // Logger
    private static final Logger logger = LoggerFactory.getLogger(HTTPClientUtil.class);

    // Default request configuration: 5000 ms connect, connection-request and
    // socket timeouts; redirects are not followed.
    private static final RequestConfig requestConfig = RequestConfig.custom()
            .setConnectTimeout(5000)
            .setConnectionRequestTimeout(5000)
            .setSocketTimeout(5000)
            .setRedirectsEnabled(false)
            .build();

    /**
     * Sends a GET or POST request and returns the response body.
     *
     * @param HttpRequestType request method, exactly {@code "GET"} or {@code "POST"}
     * @param url             request address; an empty or null value returns {@code ""}
     * @param header          request headers, may be null or empty
     * @param params          query parameters set on the URL, may be null or empty
     * @param httpEntity      request body, used only for POST; may be null
     * @return the body of a 200 response, otherwise an empty string
     */
    public static String doRequest(
            String HttpRequestType,
            String url,
            Map<String, String> header,
            Map<String, String> params,
            HttpEntity httpEntity) {
        String resultStr = "";
        if (StringUtils.isEmpty(url)) {
            return resultStr;
        }
        try {
            // A fresh request object per call; nothing is carried over between calls.
            HttpRequestBase request = newHttpRequest(HttpRequestType);
            // Set URL and query params
            setHttpURIParams(request, url, params);
            // Set POST body
            if (request instanceof HttpPost && httpEntity != null) {
                ((HttpPost) request).setEntity(httpEntity);
            }
            // Set HTTP header
            setHttpHeader(request, header);
            request.setConfig(requestConfig);
            // Send and read the response
            resultStr = sendHttpRequest(request);
        } catch (Exception e) {
            // Boundary catch: callers rely on "" instead of an exception.
            logger.error("{} request to {} failed", HttpRequestType, url, e);
        }
        return resultStr;
    }

    /**
     * Sends a POST request.
     *
     * @param url        request address
     * @param header     request headers, may be null or empty
     * @param httpEntity request body (form, JSON, XML ...), may be null
     * @return the body of a 200 response, otherwise an empty string
     */
    public static String doPostRequest(String url, Map<String, String> header, HttpEntity httpEntity) {
        return doRequest("POST", url, header, null, httpEntity);
    }

    /**
     * Sends a GET request.
     *
     * @param url    request address
     * @param header request headers, may be null or empty
     * @param params query parameters, may be null or empty
     * @return the body of a 200 response, otherwise an empty string
     */
    public static String doGetRequest(String url, Map<String, String> header, Map<String, String> params) {
        return doRequest("GET", url, header, params, null);
    }

    /**
     * Creates the request object for the given method name.
     *
     * @param HttpRequestType {@code "GET"} or {@code "POST"}
     * @return a new, empty request of the matching type
     * @throws IllegalArgumentException if the type is neither GET nor POST
     */
    private static HttpRequestBase newHttpRequest(String HttpRequestType) {
        if ("GET".equals(HttpRequestType)) {
            return new HttpGet();
        }
        if ("POST".equals(HttpRequestType)) {
            return new HttpPost();
        }
        throw new IllegalArgumentException("Unsupported HTTP request type: " + HttpRequestType);
    }

    /**
     * Copies every entry of {@code header} onto the request.
     *
     * @param request request to modify
     * @param header  header names and values, may be null or empty
     */
    private static void setHttpHeader(HttpRequestBase request, Map<String, String> header) {
        if (header == null || header.isEmpty()) {
            return;
        }
        for (Map.Entry<String, String> headerEntry : header.entrySet()) {
            request.setHeader(headerEntry.getKey(), headerEntry.getValue());
        }
    }

    /**
     * Sets the request URI to {@code url} with {@code params} applied as query parameters.
     *
     * @param request request to modify
     * @param url     base address
     * @param params  query parameters, may be null or empty
     * @throws URISyntaxException if the URL cannot be parsed
     */
    private static void setHttpURIParams(HttpRequestBase request, String url, Map<String, String> params)
            throws URISyntaxException {
        URIBuilder urlbuilder = new URIBuilder(url);
        if (params != null) {
            for (Map.Entry<String, String> param : params.entrySet()) {
                urlbuilder.setParameter(param.getKey(), param.getValue());
            }
        }
        request.setURI(urlbuilder.build());
    }

    /**
     * Executes the request and converts the response to a string.
     *
     * <p>try-with-resources closes the response first and the client second,
     * whether or not execution succeeds.
     *
     * @param request fully prepared request
     * @return the body of a 200 response, otherwise an empty string
     * @throws Exception if a client cannot be obtained or the exchange fails
     */
    private static String sendHttpRequest(HttpRequestBase request) throws Exception {
        try (CloseableHttpClient client = HTTPClientPool.getHttpClient();
             CloseableHttpResponse response = client.execute(request)) {
            return dealWithHttpResponse(response);
        }
    }

    /**
     * Reads the response body for status 200; logs status and headers otherwise.
     *
     * @param response response to read
     * @return the body decoded with UTF-8 as the default charset, or {@code ""}
     *         for a non-200 status or a response without entity
     * @throws IOException if reading the body fails
     */
    private static String dealWithHttpResponse(CloseableHttpResponse response) throws IOException {
        HttpEntity entity = response.getEntity();
        if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
            // EntityUtils.toString rejects a null entity, so guard it here.
            return entity == null ? "" : EntityUtils.toString(entity, "UTF-8");
        }
        StringBuilder headers = new StringBuilder();
        for (Header responseHeader : response.getAllHeaders()) {
            headers.append('\t').append(responseHeader);
        }
        logger.warn("Unexpected response {}; headers:{}", response.getStatusLine(), headers);
        // Drain the unread body so the pooled connection can be reused.
        EntityUtils.consumeQuietly(entity);
        return "";
    }
}
TEST
package clua.zstuca;
import club.zstuca.httpclient.HTTPClientUtil;
import java.util.HashMap;
/**
 * Demo entry point that issues two GET requests through {@link HTTPClientUtil}.
 * The returned bodies are not used.
 */
public class HTTPTEST {
    /**
     * Requests the Baidu home page, then the weather API with an {@code id} query parameter.
     *
     * @param args command-line arguments, not used
     */
    public static void main(String[] args) {
        HTTPClientUtil.doGetRequest("http://www.baidu.com", null, null);

        // A plain HashMap is filled explicitly; double-brace initialization
        // would create an anonymous HashMap subclass just to add one entry.
        HashMap<String, String> weatherParams = new HashMap<String, String>();
        weatherParams.put("id", "101060101");
        HTTPClientUtil.doGetRequest("http://api.help.bj.cn/apis/weather/", null, weatherParams);
    }
}
教学资源
https://www.bilibili.com/video/av68932809
参考文章
https://blog.csdn.net/qwe86314/article/details/91450098
STZG 发布了1362 篇原创文章 · 获赞 231 · 访问量 31万+ 关注标签:JAVA,String,DEMO,http,简单网络,static,import,apache,org 来源: https://blog.csdn.net/weixin_43272781/article/details/104071242