网络爬虫速成指南(一)网页下载

 
 
注:此处仅介绍一些类库及其常规用法;若要详细了解 HTTP 协议,推荐阅读《HTTP权威指南》。
 
 

.NET 方向:主要是用到 HttpWebRequest 下载 HTML 内容

JAVA方向:
主要是用到HttpClient下载内容
示例代码:
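相关类库(httpclient-4.1.2、httpcore-4.1.4)。注意:下面的示例代码使用了 CloseableHttpClient、PoolingHttpClientConnectionManager、RequestConfig 等 API,这些 API 自 httpclient 4.3 起才提供,因此实际需要 4.3 及以上版本。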
示例代码(Java):
package com.data.crawl.qa.baiduzhidao;import java.io.ByteArrayOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.UnsupportedEncodingException;import java.util.ArrayList;import java.util.List;import java.util.Map;import java.util.Set;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.apache.commons.logging.Log;import org.apache.commons.logging.LogFactory;import org.apache.http.Header;import org.apache.http.HttpEntity;import org.apache.http.HttpStatus;import org.apache.http.NameValuePair;import org.apache.http.client.ClientProtocolException;import org.apache.http.client.config.CookieSpecs;import org.apache.http.client.config.RequestConfig;import org.apache.http.client.entity.UrlEncodedFormEntity;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.client.methods.HttpPost;import org.apache.http.client.protocol.HttpClientContext;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.HttpClients;import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;import org.apache.http.message.BasicNameValuePair;import org.apache.http.config.Registry;import org.apache.http.config.RegistryBuilder;import org.apache.http.cookie.CookieSpecProvider;import org.apache.http.impl.cookie.BestMatchSpecFactory;import org.apache.http.impl.cookie.BrowserCompatSpecFactory;/** * HttpClient链接池 *  * @author wqj *  */public class HttpClientPool {        private static Log log = LogFactory.getLog(HttpClientPool.class);        /**     * 最大HttpClient链接数     */    private final int MAX_TOTAL_CONNECTIONS = 10;    /**     * HttpClient链接池     */    private PoolingHttpClientConnectionManager connectionManager;    /**     * cookie 上下文     */    protected HttpClientContext context = null;    /**     * default constructor     */    public HttpClientPool(){        connectionManager = new PoolingHttpClientConnectionManager();        /* 
链接池最大生成链接数200 */        connectionManager.setMaxTotal(MAX_TOTAL_CONNECTIONS);        /* 默认设置route最大链接数为20 */        connectionManager.setDefaultMaxPerRoute(10);     // 实例化cookie        context = HttpClientContext.create();        Registry<CookieSpecProvider> registry = RegistryBuilder.<CookieSpecProvider> create()                .register(CookieSpecs.BEST_MATCH, new BestMatchSpecFactory())                .register(CookieSpecs.BROWSER_COMPATIBILITY, new BrowserCompatSpecFactory()).build();        context.setCookieSpecRegistry(registry);    }    /**     * 从线程池实例化HttpClient     *      * @return     */    private CloseableHttpClient getHttpClient() {        int socketTimeOut = 120000;        int connectionTimeOut = 60000;        RequestConfig config = RequestConfig.custom().setSocketTimeout(socketTimeOut)                .setConnectTimeout(connectionTimeOut).setCookieSpec(CookieSpecs.BEST_MATCH).build();        return HttpClients.custom().setDefaultRequestConfig(config).setConnectionManager(connectionManager).build();    }    /**     * Post方式     */    public String Post(String uri, Map<String, String> params) {        CloseableHttpClient httpclient = getHttpClient();        HttpPost httpost = new HttpPost(uri);        List<NameValuePair> post_data = new ArrayList<NameValuePair>();        Set<String> keySet = params.keySet();        for (String key : keySet) {            post_data.add(new BasicNameValuePair(key, params.get(key)));        }        CloseableHttpResponse response = null;        try {            httpost.setEntity(new UrlEncodedFormEntity(post_data, "UTF-8"));            response = httpclient.execute(httpost, context);                        //默认编码            String charset = "utf-8";                        HttpEntity entity = response.getEntity();            String html = null;            if (entity != null) {                InputStream in = entity.getContent();                /* 侦测编码 */                ByteArrayOutputStream swapStream = new 
ByteArrayOutputStream();                byte[] buff = new byte[1024];                int rc = 0;                while ((rc = in.read(buff, 0, 1024)) > 0) {                    swapStream.write(buff, 0, rc);                }                byte[] data = swapStream.toByteArray();                String charset_1 = Icu4jDetector.getEncode(data);                charset = charset_1 == null ? charset : charset_1;                html = new String(data, charset);                System.out.println(html);                in.close();            }            return html;        } catch (UnsupportedEncodingException e) {            log.error(e.getMessage());        } catch (ClientProtocolException e) {            log.error(e.getMessage());        } catch (IOException e) {            log.error(e.getMessage());        }        return null;    }    /**     * 模拟登录时,访问首页时使用此方法,此方法不带cookie     *      * @param uri 统一资源定位符     * @return html文档     */    public String downHtml(String uri) {        CloseableHttpClient httpclient = getHttpClient();        HttpGet httpget = new HttpGet(uri);        CloseableHttpResponse response = null;        try {            response = httpclient.execute(httpget);            /* 判断访问的状态码 */            int statusCode = response.getStatusLine().getStatusCode();            if (statusCode != HttpStatus.SC_OK) {                log.info("request failed: " + response.getStatusLine());                return null;            }            /* 侦测编码 */            Pattern pattern = Pattern.compile("text/html;[\\s]*charset=(.*)");            Header[] arr = response.getHeaders("Content-Type");            String charset = "utf-8";            if (arr != null) {                String content = arr[0].getValue().toLowerCase();                Matcher m = pattern.matcher(content);                if (m.find()) {                    charset = m.group(1);                }            }            HttpEntity entity = response.getEntity();            String html = null;            if 
(entity != null) {                InputStream in = entity.getContent();                /* 侦测编码 */                ByteArrayOutputStream swapStream = new ByteArrayOutputStream();                byte[] buff = new byte[1024];                int rc = 0;                while ((rc = in.read(buff, 0, 1024)) > 0) {                    swapStream.write(buff, 0, rc);                }                byte[] data = swapStream.toByteArray();                String charset_1 = Icu4jDetector.getEncode(data);                charset = charset_1 == null ? charset : charset_1;                html = new String(data, charset);                in.close();            }            return html;        } catch (ClientProtocolException e) {           log.info(e.getMessage());        } catch (IOException e) {            log.info(e.getMessage());        }        return null;    }}
相关文章
相关标签/搜索