package spider;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.zip.GZIPInputStream;

import javax.net.ssl.SSLHandshakeException;

import org.apache.commons.lang.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion;
import org.apache.http.NoHttpResponseException;
import org.apache.http.ParseException;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.params.ConnManagerParams;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.message.BasicHeader;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;


/**
 * Manages a shared, pooled HTTP client and fetches page content through a proxy.
 *
 * <p>The connection pool, request parameters, default headers and retry policy
 * are configured once in the static initializer and are shared by every instance
 * of this class.
 *
 * @author lidongyang
 * @createtime Oct 18, 2012 1:55:18 PM
 *
 * @note basic test version
 */
public class HttpConnectionManager {

    /** Maximum number of connections kept in the pool. */
    public static final int MAX_TOTAL_CONNECTIONS = 100;

    /** Default maximum number of connections per route. */
    public static final int MAX_ROUTE_CONNECTIONS = 50;

    /** Connect timeout passed to {@code CoreConnectionPNames.CONNECTION_TIMEOUT}. */
    public static final int CONNECT_TIMEOUT = 50000;

    /** Socket timeout passed to {@code CoreConnectionPNames.SO_TIMEOUT}. */
    public static final int SOCKET_TIMEOUT = 50000;

    /**
     * Timeout for a request that is blocked waiting for a pooled connection.
     * Must stay a {@code long}: it is boxed and stored under
     * {@code ClientPNames.CONN_MANAGER_TIMEOUT}.
     */
    public static final long CONN_MANAGER_TIMEOUT = 60000;

    /** The only status code treated as a successful fetch. */
    private static final int HTTP_OK = 200;

    /** The retry handler gives up once the execution count reaches this value. */
    private static final int MAX_EXECUTION_COUNT = 5;

    /** How many times {@link #getHtml(String)} retries with the fallback proxy. */
    private static final int MAX_FALLBACK_ATTEMPTS = 4;

    /** Charset used when the response declares none (same default EntityUtils applies). */
    private static final String FALLBACK_CHARSET = "ISO-8859-1";

    /** Proxy tried first by {@link #getHtml(String)}. */
    private static final HttpHost PRIMARY_PROXY = new HttpHost("211.142.236.137", 8080);

    /** Proxy used for the retries in {@link #getHtml(String)}. */
    private static final HttpHost FALLBACK_PROXY = new HttpHost("211.142.236.137", 80);

    /**
     * Default target host. HttpHost expects a bare host name, so the scheme
     * prefix must not be part of it.
     */
    private static final HttpHost DEFAULT_TARGETHOST = new HttpHost("www.qq.com", 80);

    /** Connection-related parameters shared by all requests. */
    private static final HttpParams parentParams;

    /** Pooling connection manager backing {@link #httpClient}. */
    private static final PoolingClientConnectionManager cm;

    /** The shared HTTP client. */
    private static final DefaultHttpClient httpClient;

    /*
     * Initializes the connection pool, the request parameters, the default
     * headers and the retry handler.
     */
    static {
        SchemeRegistry schemeRegistry = new SchemeRegistry();
        schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
        schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));

        cm = new PoolingClientConnectionManager(schemeRegistry);
        cm.setMaxTotal(MAX_TOTAL_CONNECTIONS);
        cm.setDefaultMaxPerRoute(MAX_ROUTE_CONNECTIONS);
        // Cap for the direct (non-proxied) route to the default target host.
        cm.setMaxPerRoute(new HttpRoute(DEFAULT_TARGETHOST), 20);

        parentParams = buildClientParams();

        httpClient = new DefaultHttpClient(cm, parentParams);
        httpClient.setHttpRequestRetryHandler(buildRetryHandler());
    }

    /**
     * Builds the parameter set handed to the shared client.
     *
     * @return protocol, timeout, redirect, cookie and default-header parameters
     */
    private static HttpParams buildClientParams() {
        HttpParams params = new BasicHttpParams();
        params.setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1);
        params.setParameter(ClientPNames.DEFAULT_HOST, DEFAULT_TARGETHOST);
        params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);

        params.setParameter(ClientPNames.CONN_MANAGER_TIMEOUT, CONN_MANAGER_TIMEOUT);
        params.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, CONNECT_TIMEOUT);
        params.setParameter(CoreConnectionPNames.SO_TIMEOUT, SOCKET_TIMEOUT);

        params.setParameter(ClientPNames.ALLOW_CIRCULAR_REDIRECTS, true);
        params.setParameter(ClientPNames.HANDLE_REDIRECTS, true);

        params.setParameter(ClientPNames.DEFAULT_HEADERS, buildDefaultHeaders());
        return params;
    }

    /**
     * Builds the browser-like headers sent with every request.
     *
     * @return the default request headers
     */
    private static Collection<Header> buildDefaultHeaders() {
        Collection<Header> headers = new ArrayList<Header>();
        headers.add(new BasicHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)"));
        headers.add(new BasicHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"));
        headers.add(new BasicHeader("Accept-Language", "zh-cn,zh,en-US,en;q=0.5"));
        headers.add(new BasicHeader("Accept-Charset", "ISO-8859-1,utf-8,gbk,gb2312;q=0.7,*;q=0.7"));
        // Only gzip is advertised because gzip is the only content coding this class decodes.
        headers.add(new BasicHeader("Accept-Encoding", "gzip"));
        return headers;
    }

    /**
     * Builds the retry policy: give up after {@link #MAX_EXECUTION_COUNT}
     * executions, retry when the server dropped the connection, never retry an
     * SSL handshake failure, and otherwise retry only requests without a body.
     *
     * @return the retry handler installed on the shared client
     */
    private static HttpRequestRetryHandler buildRetryHandler() {
        return new HttpRequestRetryHandler() {
            @Override
            public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
                if (executionCount >= MAX_EXECUTION_COUNT) {
                    return false;
                }
                if (exception instanceof NoHttpResponseException) {
                    return true;
                }
                if (exception instanceof SSLHandshakeException) {
                    return false;
                }
                HttpRequest request = (HttpRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST);
                // Requests that carry no entity are treated as idempotent and may be repeated.
                return !(request instanceof HttpEntityEnclosingRequest);
            }
        };
    }

    /**
     * Fetches a page, first through the primary proxy and then, while the result
     * is empty, up to {@link #MAX_FALLBACK_ATTEMPTS} more times through the
     * fallback proxy.
     *
     * @param url URL of the target page
     * @return the page content, or an empty string when every attempt failed
     */
    public String getHtml(String url) {
        String html = getHtml(url, PRIMARY_PROXY);

        for (int attempt = 0; attempt < MAX_FALLBACK_ATTEMPTS && StringUtils.isEmpty(html); attempt++) {
            html = getHtml(url, FALLBACK_PROXY);
        }

        // Report failure only when the last attempt really produced nothing.
        if (StringUtils.isEmpty(html)) {
            System.out.println("抓取失败");
            html = "";
        }

        System.out.println(html.length());
        return html;
    }

    /**
     * Fetches the page at {@code url} through the given proxy.
     *
     * @param url       URL of the target page
     * @param proxyHost proxy to route the request through
     * @return the page content; an empty string when the status is not 200, the
     *         response has no body, or an I/O error occurred
     */
    public String getHtml(String url, HttpHost proxyHost) {
        try {
            String body = fetchBody(url, proxyHost);
            return body == null ? "" : body;
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Checks whether a proxy can be used to fetch a page.
     *
     * @param proxyHost address and port of the proxy under test
     * @param url       page used for the check
     * @return {@code true} when the page was fetched with status 200 and a
     *         non-empty body; {@code false} otherwise
     */
    public boolean isProxyUsable(HttpHost proxyHost, String url) {
        try {
            String body = fetchBody(url, proxyHost);
            if (body == null) {
                return false;
            }
            System.out.println(body.length());
            return !StringUtils.isEmpty(body);
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Executes a GET through the proxy and reads the body, always handing the
     * connection back to the pool afterwards.
     *
     * @param url       URL of the target page
     * @param proxyHost proxy to route the request through
     * @return the body, or {@code null} when the status is not 200 or the
     *         response carries no entity
     * @throws IOException if executing the request or reading the body fails
     */
    private String fetchBody(String url, HttpHost proxyHost) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);
        try {
            HttpResponse httpResponse = httpClient.execute(httpGet);

            StatusLine statusLine = httpResponse.getStatusLine();
            int statusCode = statusLine.getStatusCode();
            System.out.println(statusCode);
            if (statusCode != HTTP_OK) {
                return null;
            }

            HttpEntity httpEntity = httpResponse.getEntity();
            return httpEntity == null ? null : readHtmlContentFromEntity(httpEntity);
        } finally {
            // Without this the pooled connection is never returned on early exits.
            httpGet.releaseConnection();
        }
    }

    /**
     * Reads the page content from a response entity, decoding gzip when the
     * entity declares it.
     *
     * @param httpEntity the response entity
     * @return the page content, never {@code null}
     * @throws ParseException if the entity's headers cannot be parsed
     * @throws IOException    if reading the content fails
     */
    private String readHtmlContentFromEntity(HttpEntity httpEntity) throws ParseException, IOException {
        boolean gzipped = isGzip(httpEntity.getContentEncoding());
        String html;

        // EntityUtils cannot handle entities whose content length exceeds Integer.MAX_VALUE.
        if (httpEntity.getContentLength() < Integer.MAX_VALUE) {
            HttpEntity readable = gzipped ? new GzipDecompressingEntity(httpEntity) : httpEntity;
            html = EntityUtils.toString(readable);
        } else {
            InputStream in = httpEntity.getContent();
            if (in == null) {
                return "";
            }
            try {
                String charSet = charsetNameOf(httpEntity);
                html = gzipped ? unZip(in, charSet) : readInStreamToString(in, charSet);
            } finally {
                in.close();
            }
        }

        // EntityUtils.toString yields null when the entity has no content stream.
        return html == null ? "" : html;
    }

    /**
     * Tells whether a Content-Encoding header denotes gzip.
     *
     * @param contentEncoding the header, may be {@code null}
     * @return {@code true} for {@code gzip} or {@code x-gzip}, ignoring case
     */
    private static boolean isGzip(Header contentEncoding) {
        if (contentEncoding == null || contentEncoding.getValue() == null) {
            return false;
        }
        String value = contentEncoding.getValue().trim();
        return "gzip".equalsIgnoreCase(value) || "x-gzip".equalsIgnoreCase(value);
    }

    /**
     * Resolves the charset name declared by the entity's content type.
     *
     * @param httpEntity the response entity
     * @return the declared charset name, or {@link #FALLBACK_CHARSET} when the
     *         content type carries no charset
     */
    private static String charsetNameOf(HttpEntity httpEntity) {
        ContentType contentType = ContentType.getOrDefault(httpEntity);
        if (contentType.getCharset() == null) {
            return FALLBACK_CHARSET;
        }
        return contentType.getCharset().name();
    }

    /**
     * Decompresses a gzip stream returned by the server.
     *
     * @param in      the raw response stream
     * @param charSet name of the charset of the page content
     * @return the decompressed content as a string
     * @throws IOException if the stream is not valid gzip or cannot be read
     */
    private String unZip(InputStream in, String charSet) throws IOException {
        GZIPInputStream gis = new GZIPInputStream(in);
        try {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            byte[] buffer = new byte[1024];
            int len;
            while ((len = gis.read(buffer)) != -1) {
                baos.write(buffer, 0, len);
            }
            return baos.toString(charSet);
        } finally {
            gis.close();
        }
    }

    /**
     * Reads a stream line by line into a string; every line is terminated with
     * {@code "\n"} regardless of the original line separator.
     *
     * @param in      the stream to read
     * @param charSet name of the charset used to decode the stream
     * @return the decoded text
     * @throws IOException if the stream cannot be read
     */
    private String readInStreamToString(InputStream in, String charSet) throws IOException {
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(in, charSet));
        try {
            StringBuilder text = new StringBuilder();
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                text.append(line).append("\n");
            }
            return text.toString();
        } finally {
            bufferedReader.close();
        }
    }

    /**
     * Runnable used for manual testing: fetches {@link #url} via
     * {@link HttpConnectionManager#getHtml(String)} of the enclosing instance.
     *
     * @author lidongyang
     * @createtime Oct 18, 2012 2:35:09 PM
     */
    public class Test implements Runnable {
        /** URL fetched by {@link #run()}. */
        String url;
        /** Number assigned by the creator; not read by this class. */
        int threadNum;

        public Test() {

        }

        public Test(String url, int threadNum) {
            this.url = url;
            this.threadNum = threadNum;
        }

        @Override
        public void run() {
            getHtml(url);
        }
    }

    /**
     * Manual test entry point: fetches one page and prints the elapsed seconds.
     *
     * @param args unused
     * @throws InterruptedException declared for compatibility; not thrown by this body
     */
    public static void main(String[] args) throws InterruptedException {
        HttpConnectionManager httpConnectionManager = new HttpConnectionManager();
        long start = System.currentTimeMillis();
        httpConnectionManager.getHtml("http://www.qq.com");
        long end = System.currentTimeMillis();
        System.out.println((end - start) / 1000.0 + " 秒");
    }
}
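/*
 * NOTE: everything below is residue of the web page this file was copied from
 * (recent-download / recent-visitor lists and the page footer). It is not part
 * of the program and is kept here only as a comment so the file still compiles.
 *
 * 最近下载更多
 * zdm1231  LV2 2022年1月7日
 * guodaxia  LV14 2020年8月31日
 * a934883425  LV1 2020年6月24日
 * tfd280620913  LV1 2020年5月18日
 * BigBull  LV1 2020年4月28日
 * 958331502  LV1 2020年4月27日
 * xiaobai145  LV10 2020年2月15日
 * wei112233  LV15 2020年2月2日
 * sleepcat88  LV13 2020年1月20日
 * 多多像小羊  LV1 2019年11月22日
 * 最近浏览更多
 * taoshen95  LV14 2022年11月17日
 * 12cq345  LV6 2022年11月15日
 * sunlea  LV17 2022年9月8日
 * whfuai  LV14 2022年5月9日
 * zdm1231  LV2 2022年1月7日
 * DeepLoveDeep  LV2 2021年11月23日
 * NRLeaf 2021年7月13日
 * 暂无贡献等级
 * Sean_admin  LV7 2021年4月28日
 * wx929058456 2021年3月23日
 * 暂无贡献等级
 * wwkddjjj  LV8 2021年3月23日
 * 顶部 客服 微信二维码 底部
 * >扫描二维码关注最代码为好友扫描二维码关注最代码为好友
 */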