import com.google.common.collect.Lists;import com.meiyunji.spider.contant.HttpConnectContant;import com.meiyunji.spider.crawl.response.HeaderVo;import com.meiyunji.spider.crawl.response.ResponseObj;import org.apache.commons.lang3.StringUtils;import org.apache.http.HttpStatus;import org.slf4j.Logger;import org.slf4j.LoggerFactory;import javax.net.ssl.*;import java.io.*;import java.net.*;import java.security.KeyManagementException;import java.security.NoSuchAlgorithmException;import java.security.SecureRandom;import java.security.cert.CertificateException;import java.security.cert.X509Certificate;import java.util.List;import java.util.Map;import java.util.zip.GZIPInputStream;/** * @author ldj * @date 2018/6/25. ** 使用 httpUrlConnection请求信息 */public class HttpConnectUtil { private static final Logger logger = LoggerFactory.getLogger(HttpConnectUtil.class); /** * Get 方法 * * @param url 请求连接 * @param ip 本次请求使用的ip * @param headers 请求头, 为空会填写默认头部信息 * @param parameters 请求体, * @return ResponseObj 返回结果 **/ public static ResponseObj urlGet(String url, String ip, Map
headers, Map parameters) { return urlBaseGet(url, ip, null, headers, parameters); } /** * 代理请求 * * @param url 请求连接 * @param ip 代理ip * @param port 代理端口 * @param headers 请求头, 为空会填写默认头部信息 * @param parameters 请求体, * @return ResponseObj 返回结果 */ public static ResponseObj urlPorxyGet(String url, String ip, Integer port, Map headers, Map parameters) { return urlBaseGet(url, ip, port, headers, parameters); } /** * Get 请求 */ private static ResponseObj urlBaseGet(String url, String ip, Integer port, Map headers, Map parameters) { ResponseObj responseObj; try { url = url.trim(); if (parameters != null) { StringBuilder sbd = new StringBuilder(url); parameters.forEach((key, value) -> sbd.append(StringUtils.containsIgnoreCase(sbd, "?") ? "?" : "&").append(key).append("=").append(value)); url = sbd.toString(); } URL requestUrl = new URL(url); if (HttpConnectContant.CONNECT_TYPE_HTTPS.equalsIgnoreCase(requestUrl.getProtocol())) { responseObj = httpsConnectionGet(requestUrl, ip, port, headers, parameters); } else { responseObj = httpConnectionGet(requestUrl, ip, port, headers, parameters); } } catch (IOException e) { responseObj = new ResponseObj(); responseObj.setStatusCode(-1); responseObj.setReasonPhrase("Crawl Error : URL DealHtml error, the url is not support request."); } responseObj.setIp(ip); responseObj.setUrl(url); logger.info("url: {}", url); logger.info("statusCode: {}", responseObj.getStatusCode()); logger.info("message: {}", responseObj.getReasonPhrase()); return responseObj; } /** * Https Get 方法 * * @param requestUrl 请求连接 * @param ip 本次请求使用的ip * @param headers 请求头, 为空会填写默认头部信息 * @param parameters 请求体, * @return ResponseObj 返回结果 * String ua, * String html, * String statusCode, * String ip; * String url; * String userAgent; */ private static ResponseObj httpsConnectionGet(URL requestUrl, String ip, Integer port, Map headers, Map parameters) { InputStreamReader reader = null; GZIPInputStream gzipInputStream = null; InputStream inputStream = null; HttpsURLConnection httpsURLConnection = null; ResponseObj responseObj = new ResponseObj(); try { // 获取 URLConnection对象 httpsURLConnection = (HttpsURLConnection) getUrlConnection(requestUrl, ip, port); setDefaultProperties(httpsURLConnection); httpsURLConnection.setRequestMethod(HttpConnectContant.GET); if (headers == null || headers.size() == 0) { setDefaultHeader(httpsURLConnection); } else { setCustomizedHeader(httpsURLConnection, headers); } /// 设置绕过https验证 trustAllHosts(httpsURLConnection); httpsURLConnection.setHostnameVerifier(DO_NOT_VARIFY); // 打开链接 httpsURLConnection.connect(); // 获取结果 if (httpsURLConnection.getResponseCode() == HttpStatus.SC_OK || httpsURLConnection.getResponseCode() == HttpStatus.SC_CREATED) { inputStream = httpsURLConnection.getInputStream(); } else { inputStream = httpsURLConnection.getErrorStream(); } String contentEncoding = httpsURLConnection.getContentEncoding(); if (contentEncoding != null && "gzip".equals(contentEncoding.trim().toLowerCase())) { gzipInputStream = new GZIPInputStream(inputStream); reader = new InputStreamReader(gzipInputStream); } else { reader = new InputStreamReader(inputStream); } StringBuilder sbfs = new StringBuilder(); char[] c = new char[1024]; int len; while (0 < (len = reader.read(c))) { sbfs.append(c, 0, len); } List headerList = Lists.newArrayList(); Map > responseHeaderFields = httpsURLConnection.getHeaderFields(); if (responseHeaderFields != null && responseHeaderFields.size() > 0) { responseHeaderFields.forEach((key, value) -> headerList.add(new HeaderVo(key, JsonUtil.objectToJson(value)))); } responseObj.setStatusCode(httpsURLConnection.getResponseCode()); responseObj.setContent(sbfs.toString()); responseObj.setResponseHeader(headerList); responseObj.setUserAgent(httpsURLConnection.getRequestProperty("user-agent")); } catch (IOException e) { e.printStackTrace(); responseObj.setStatusCode(-1); responseObj.setReasonPhrase("Crawl Error : HttpsUrlConnection IOException ."); } finally { try { if (reader != null) { reader.close(); } if (gzipInputStream != null) { gzipInputStream.close(); } if (inputStream != null) { inputStream.close(); } if (httpsURLConnection != null) { httpsURLConnection.disconnect(); } } catch (IOException e) { e.printStackTrace(); } } return responseObj; } /** * Http Get 方法 * * @param requestUrl 请求连接 * @param ip 本次请求使用的ip * @param headers 请求头, 为空会填写默认头部信息 * @param parameters 请求体, * @return ResponseObj 返回结果 * String ua, * String html, * String statusCode, * String ip; * String url; * URL requestUrl; * String userAgent; */ private static ResponseObj httpConnectionGet(URL requestUrl, String ip, Integer port, Map headers, Map parameters) { InputStreamReader reader = null; GZIPInputStream gzipInputStream = null; InputStream inputStream = null; HttpURLConnection httpUrlConnection = null; ResponseObj responseObj = new ResponseObj(); try { httpUrlConnection = (HttpURLConnection) getUrlConnection(requestUrl, ip, port); setDefaultProperties(httpUrlConnection); httpUrlConnection.setRequestMethod(HttpConnectContant.GET); if (headers == null || headers.size() <= 0) { setDefaultHeader(httpUrlConnection); } else { setCustomizedHeader(httpUrlConnection, headers); } httpUrlConnection.connect(); if (httpUrlConnection.getResponseCode() == HttpStatus.SC_OK || httpUrlConnection.getResponseCode() == HttpStatus.SC_CREATED) { inputStream = httpUrlConnection.getInputStream(); } else { inputStream = httpUrlConnection.getErrorStream(); } String contentEncoding = httpUrlConnection.getContentEncoding(); if (contentEncoding != null && "gzip".equals(contentEncoding.trim().toLowerCase())) { gzipInputStream = new GZIPInputStream(inputStream); reader = new InputStreamReader(gzipInputStream); } else { reader = new InputStreamReader(inputStream); } StringBuilder sbfs = new StringBuilder(); char[] c = new char[1024]; int len; while (0 < (len = reader.read(c))) { sbfs.append(c, 0, len); } List headerList = Lists.newArrayList(); httpUrlConnection.getHeaderFields().forEach((key, value) -> headerList.add(new HeaderVo(key, JsonUtil.objectToJson(value)))); responseObj.setStatusCode(httpUrlConnection.getResponseCode()); responseObj.setContent(sbfs.toString()); responseObj.setResponseHeader(headerList); responseObj.setUrl(requestUrl.getProtocol() + "://" + requestUrl.getHost() + requestUrl.getPath()); if (headers != null) { responseObj.setUserAgent(headers.get("user_agent")); } } catch (IOException e) { responseObj.setStatusCode(-1); responseObj.setReasonPhrase("Crawl Error : HttpsUrlConnection IOException ."); } finally { try { if (reader != null) { reader.close(); } if (gzipInputStream != null) { gzipInputStream.close(); } if (inputStream != null) { inputStream.close(); } if (httpUrlConnection != null) { httpUrlConnection.disconnect(); } } catch (IOException e) { e.printStackTrace(); } } return responseObj; } /** * POST 请求方式 * 默认以JSON的形式传递数据 */ public static ResponseObj urlPost(String url, String ip, Map headers, Map bodyMap) { return urlAdvancedPost(url, ip, headers, bodyMap, HttpConnectContant.POST_BODY_TYPE_JSON); } /** * POST 走代理请求 */ public static ResponseObj urlProxyPost(String url, String ip, Integer port, Map headers, Map bodyMap) { return urlAdvancedPost(url, ip, headers, bodyMap, HttpConnectContant.POST_BODY_TYPE_JSON); } /** * POST请求方式 * * @param url 请求连接 * @param ip 请求ip * @param headers 请求头 * @param bodyMap 请求体 * @param submiType 提交数据方式 format / json */ public static ResponseObj urlAdvancedPost(String url, String ip, Map headers, Map bodyMap, String submiType) { return urlBasePost(url, ip, null, headers, bodyMap, submiType); } /** * POST 代理请求方式 * * @param url 请求连接 * @param ip 代理请求ip * @param port 代理请求端口 * @param headers 请求头 * @param bodyMap 请求体 * @param submitType 提交数据方式 format / json */ public static ResponseObj urlAdvancedProxyPost(String url, String ip, Integer port, Map headers, Map bodyMap, String submitType) { return urlBasePost(url, ip, port, headers, bodyMap, submitType); } private static ResponseObj urlBasePost(String url, String ip, Integer port, Map headers, Map bodyMap, String submitType) { ResponseObj responseObj; try { URL requestUrl = new URL(url); if (HttpConnectContant.CONNECT_TYPE_HTTPS.equalsIgnoreCase(requestUrl.getProtocol())) { responseObj = httpsConnectionPost(requestUrl, ip, port, headers, bodyMap, submitType); } else { responseObj = httpConnectionPost(requestUrl, ip, port, headers, bodyMap, submitType); } } catch (IOException e) { responseObj = new ResponseObj(); responseObj.setStatusCode(-1); responseObj.setReasonPhrase("Crawl Error : URL DealHtml error, the url is not support request."); } responseObj.setIp(ip); responseObj.setUrl(url); logger.info("url: {}", url); logger.info("statusCode: {}", responseObj.getStatusCode()); logger.info("message: {}", responseObj.getReasonPhrase()); return responseObj; } /** * https POST方法 */ private static ResponseObj httpsConnectionPost(URL requestUrl, String ip, Integer port, Map headers, Map bodyMap, String submitType) { Reader reader = null; GZIPInputStream gzipInputStream = null; InputStream inputStream = null; HttpsURLConnection httpsURLConnection = null; ResponseObj responseObj = new ResponseObj(); try { httpsURLConnection = (HttpsURLConnection) getUrlConnection(requestUrl, ip, port); httpsURLConnection.setRequestMethod(HttpConnectContant.POST); // 设置默认请求属性 setDefaultProperties(httpsURLConnection); httpsURLConnection.setUseCaches(false); // 绕过 https 验证 trustAllHosts(httpsURLConnection); httpsURLConnection.setHostnameVerifier(DO_NOT_VARIFY); // 拼接请求头 if (headers == null || headers.size() <= 0) { setDefaultHeader(httpsURLConnection); } else { setCustomizedHeader(httpsURLConnection, headers); } // 拼接请求体 String parameterStr = getPostBody(bodyMap, submitType); byte[] writeBytes = parameterStr.getBytes(); httpsURLConnection.setRequestProperty("Content-Length", String.valueOf(writeBytes.length)); // 发起请求 OutputStream outputStream = httpsURLConnection.getOutputStream(); outputStream.write(writeBytes); outputStream.flush(); outputStream.close(); if (httpsURLConnection.getResponseCode() == HttpStatus.SC_OK || httpsURLConnection.getResponseCode() == HttpStatus.SC_CREATED) { inputStream = httpsURLConnection.getInputStream(); } else { inputStream = httpsURLConnection.getErrorStream(); } String contentEncoding = httpsURLConnection.getContentEncoding(); if (contentEncoding != null && HttpConnectContant.RESPONSE_CODE_GZIP.equals(contentEncoding.trim().toLowerCase())) { gzipInputStream = new GZIPInputStream(inputStream); reader = new InputStreamReader(gzipInputStream); } else { reader = new InputStreamReader(inputStream); } StringBuilder sbd = new StringBuilder(); char[] c = new char[1024]; int len; while (0 < (len = reader.read(c))) { sbd.append(c, 0, len); } List headerList = Lists.newArrayList(); Map > responseHeaderFields = httpsURLConnection.getHeaderFields(); if (responseHeaderFields != null && responseHeaderFields.size() > 0) { responseHeaderFields.forEach((key, value) -> headerList.add(new HeaderVo(key, JsonUtil.objectToJson(value)))); } responseObj.setStatusCode(httpsURLConnection.getResponseCode()); responseObj.setContent(sbd.toString()); responseObj.setResponseHeader(headerList); responseObj.setUserAgent(httpsURLConnection.getRequestProperty("user-agent")); } catch (IOException e) { e.printStackTrace(); responseObj.setStatusCode(-1); responseObj.setReasonPhrase("Crawl Error : HttpsUrlConnection IOException ."); } finally { try { if (reader != null) { reader.close(); } if (gzipInputStream != null) { gzipInputStream.close(); } if (inputStream != null) { inputStream.close(); } if (httpsURLConnection != null) { httpsURLConnection.disconnect(); } } catch (IOException e) { e.printStackTrace(); } } return responseObj; } /** * https POST方法 */ private static ResponseObj httpConnectionPost(URL requestUrl, String ip, Integer port, Map headers, Map bodyMap, String submitType) { Reader reader = null; GZIPInputStream gzipInputStream = null; InputStream inputStream = null; HttpURLConnection httpURLConnection = null; ResponseObj responseObj = new ResponseObj(); try { httpURLConnection = (HttpURLConnection) getUrlConnection(requestUrl, ip, port); httpURLConnection.setRequestMethod(HttpConnectContant.POST); // 设置默认请求属性 setDefaultProperties(httpURLConnection); httpURLConnection.setUseCaches(false); // 拼接请求头 if (headers == null || headers.size() <= 0) { setDefaultHeader(httpURLConnection); } else { setCustomizedHeader(httpURLConnection, headers); } // 拼接请求体 String parameterStr = getPostBody(bodyMap, submitType); byte[] writeBytes = parameterStr.getBytes(); httpURLConnection.setRequestProperty("Content-Length", String.valueOf(writeBytes.length)); // 发起请求 OutputStream outputStream = httpURLConnection.getOutputStream(); outputStream.write(writeBytes); outputStream.flush(); outputStream.close(); if (httpURLConnection.getResponseCode() == HttpStatus.SC_OK || httpURLConnection.getResponseCode() == HttpStatus.SC_CREATED) { inputStream = httpURLConnection.getInputStream(); } else { inputStream = httpURLConnection.getErrorStream(); } String contentEncoding = httpURLConnection.getContentEncoding(); if (contentEncoding != null && HttpConnectContant.RESPONSE_CODE_GZIP.equals(contentEncoding.trim().toLowerCase())) { gzipInputStream = new GZIPInputStream(inputStream); reader = new InputStreamReader(gzipInputStream); } else { reader = new InputStreamReader(inputStream); } StringBuilder sbd = new StringBuilder(); char[] c = new char[1024]; int len; while (0 < (len = reader.read(c))) { sbd.append(c, 0, len); } List headerList = Lists.newArrayList(); Map > responseHeaderFields = httpURLConnection.getHeaderFields(); if (responseHeaderFields != null && responseHeaderFields.size() > 0) { responseHeaderFields.forEach((key, value) -> headerList.add(new HeaderVo(key, JsonUtil.objectToJson(value)))); } responseObj.setStatusCode(httpURLConnection.getResponseCode()); responseObj.setContent(sbd.toString()); responseObj.setResponseHeader(headerList); responseObj.setUserAgent(httpURLConnection.getRequestProperty("user-agent")); } catch (IOException e) { e.printStackTrace(); responseObj.setStatusCode(-1); responseObj.setReasonPhrase("Crawl Error : HttpsUrlConnection IOException ."); } finally { try { if (reader != null) { reader.close(); } if (gzipInputStream != null) { gzipInputStream.close(); } if (inputStream != null) { inputStream.close(); } if (httpURLConnection != null) { httpURLConnection.disconnect(); } } catch (IOException e) { e.printStackTrace(); } } return responseObj; } /** * 拼接 POST 请求 请求体 */ private static String getPostBody(Map bodyMap, String submitType) { String parameterStr; StringBuilder sbd = new StringBuilder(); if (bodyMap != null && bodyMap.size() > 0 && HttpConnectContant.POST_BODY_TYPE_FORMAT.equals(submitType)) { //todo 数组类型数据不合适,需要优化 bodyMap.forEach((key, value) -> sbd.append(key).append("=").append(value).append("&")); parameterStr = sbd.toString(); } else { parameterStr = JsonUtil.objectToJson(bodyMap); } if (StringUtils.isBlank(parameterStr)) { parameterStr = ""; } return parameterStr; }//region---------------------------------------设置头部信息----------------------------------------------------- /** * 先设置默认头部信息 * 然后用新的头部信息覆盖默认信息 */ private static void setCustomizedHeader(URLConnection urlConnection, Map headerMap) { setDefaultHeader(urlConnection); for (Map.Entry entry : headerMap.entrySet()) { urlConnection.setRequestProperty(entry.getKey(), entry.getValue()); } } /** * 设置默认头部信息 */ private static void setDefaultHeader(URLConnection urlConnection) { urlConnection.setRequestProperty("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"); urlConnection.setRequestProperty("accept-encoding", "gzip, deflate, br"); urlConnection.setRequestProperty("accept-language", "en-US,en;q=0.9,fr-FR;q=0.8,fr-CA;q=0.7,fr;q=0.6,de;q=0.5,zh-CN;q=0.4,zh;q=0.3,en-AU;q=0.2"); urlConnection.setRequestProperty("cache-control", "max-age=0"); urlConnection.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"); urlConnection.setRequestProperty("connection", "keep-alive"); } /** * 设置默认属性 */ private static void setDefaultProperties(URLConnection urlConnection) { urlConnection.setDoOutput(true); urlConnection.setDoInput(true); urlConnection.setConnectTimeout(HttpConnectContant.CONNECT_TIME_OUT_TIME); // 系统超时设置, 防止网络异常情况下, 可能会导致程序僵死而不继续往下执行 System.setProperty("sun.net.client.defaultConnectionTimeOut", String.valueOf(HttpConnectContant.CONNECT_TIME_OUT_TIME)); System.setProperty("sun.net.client.defaultReadTimeout", String.valueOf(HttpConnectContant.CONNECT_TIME_OUT_TIME)); } /** * 打开连接,获取连接对象 */ private static URLConnection getUrlConnection(URL requestUrl, String ip, Integer port) throws IOException { Proxy proxy = null; if (StringUtils.isNotBlank(ip) && port != null && port > 0) { proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ip, port)); } if (proxy == null) { return requestUrl.openConnection(); } return requestUrl.openConnection(proxy); }//endregion-------------------------------------------------------------------------------------------------------------//region ------------------------------------- 绕过 https 验证 ----------------------------------------------------------- /** * 覆盖java默认的证书验证 */ private static final TrustManager[] TRUST_ALL_CERTS = new TrustManager[]{ new X509TrustManager() { @Override public void checkClientTrusted(X509Certificate[] x509Certificates, String s) throws CertificateException { } @Override public void checkServerTrusted(X509Certificate[] x509Certificates, String s) throws CertificateException { } @Override public X509Certificate[] getAcceptedIssuers() { return new X509Certificate[]{}; } } }; /** * 设置不验证主机 */ private static final HostnameVerifier DO_NOT_VARIFY = new HostnameVerifier() { @Override public boolean verify(String s, SSLSession sslSession) { return true; } }; /** * 信任所有的链接 */ private static SSLSocketFactory trustAllHosts(HttpsURLConnection connection) { SSLSocketFactory sslSocketFactory = connection.getSSLSocketFactory(); try { SSLContext tls = SSLContext.getInstance("TLS"); tls.init(null, TRUST_ALL_CERTS, new SecureRandom()); SSLSocketFactory socketFactory = tls.getSocketFactory(); connection.setSSLSocketFactory(socketFactory); } catch (NoSuchAlgorithmException e) { e.printStackTrace(); } catch (KeyManagementException e) { e.printStackTrace(); } return sslSocketFactory; }// endregion -----------------------------------------------------------------------------------------------------------}