Web Crawler Notes
1、Development environment
1. JDK 1.8.0_91
2. Database: MongoDB 2.6 Standard
3. apache-activemq-5.13.2
4. commons-httpclient-3.0
5. Server: apache-tomcat-7.0.69
6. webmagic-core-0.5.2
2、Project requirements
a) Crawl business registration numbers from the national enterprise information publicity system, then use each registration number to fetch that enterprise's detail information.
b) Pre-generate candidate postal codes, then fetch the address information under each postal code from the China Post official website.
3、Fetching enterprise detail information
The preparation code is as follows:
CloseableHttpClient client = HttpClientUtil.getHttpClient();
HttpPost post = new HttpPost();
post.setURI(new URI("https://www.sgs.gov.cn/notice/search/ent_spot_check_list"));
// Form parameters for the search request
BasicNameValuePair simcard1 = new BasicNameValuePair("captcha", "");
BasicNameValuePair simcard2 = new BasicNameValuePair("condition.pageNo", pageNo);
BasicNameValuePair simcard3 = new BasicNameValuePair("condition.insType", "1");
BasicNameValuePair simcard4 = new BasicNameValuePair("session.token", "e07b51eb-8c11-4fea-9e3f-f9cb3c0b20e9");
BasicNameValuePair simcard5 = new BasicNameValuePair("condition.keyword", "");
List<BasicNameValuePair> formParams = new ArrayList<BasicNameValuePair>();
formParams.add(simcard1);
formParams.add(simcard2);
formParams.add(simcard3);
formParams.add(simcard4);
formParams.add(simcard5);
UrlEncodedFormEntity uefEntity = new UrlEncodedFormEntity(formParams, "UTF-8");
post.setEntity(uefEntity);
CloseableHttpResponse response = client.execute(post);

List<String> gongshangxinxi = new ArrayList<String>();
String s = "";
if (response.getStatusLine().getStatusCode() == 200) {
    try {
        HttpEntity resEntity = response.getEntity();
        // Parse the result page
        if (resEntity != null) {
            s = EntityUtils.toString(resEntity, "UTF-8");
            Document doc = Jsoup.parse(s);
            Elements content = doc.getElementsByClass("center");
            for (int i = 0; i < content.size(); i++) {
                String target = content.get(i).getElementsByClass("center").text();
                gongshangxinxi.add(target);
            }
        }
        // Build the crawl tasks
        List<DataUrlCfgInfo> taskUrlCfgInfos = new ArrayList<>();
        for (int i = 0; i < gongshangxinxi.size(); i++) {
            if (i % 2 == 0) {
                taskUrlCfgInfos.add(new DataUrlCfgInfo("全国公示网", "CMB00004",
                        "http://www.sgs.gov.cn/lz/etpsInfo.do?method=doSearch&searchType=2&keyWords=" + gongshangxinxi.get(i),
                        "", "html", "defaultResoureProcsor", "企业信息"));
            }
        }
        // Insert the tasks into the DB
        DataColctnTaskInfo dataColctnTaskInfo = new DataColctnTaskInfo("工商信息获取", "CMB00004",
                "在全国公示网抓取企业信息数据", taskUrlCfgInfos);
        getMongoTemplate().insert(dataColctnTaskInfo);
        logger.info("添加任务:" + dataColctnTaskInfo.getTaskCode() + "成功");
        EntityUtils.consume(resEntity);
    } finally {
        response.close();
    }
}
This code builds a POST request, adds a few request parameters, executes the request to get the page, parses it, and inserts the assembled task information into the database.
The HttpClientUtil code is as follows:
private static DefaultHttpClient httpClient;

public static synchronized DefaultHttpClient getHttpClient() {
    if (null == httpClient) {
        try {
            // Trust store that accepts every certificate
            KeyStore trustStore = KeyStore.getInstance(KeyStore.getDefaultType());
            trustStore.load(null, null);
            SSLSocketFactoryEx ss = new SSLSocketFactoryEx(trustStore);
            SSLSocketFactory sf = ss;
            // Skip host-name verification for every host
            sf.setHostnameVerifier(SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);

            HttpParams params = new BasicHttpParams();
            HttpProtocolParams.setContentCharset(params, HTTP.DEFAULT_CONTENT_CHARSET);
            HttpProtocolParams.setUseExpectContinue(params, true);
            // Connection-manager timeout
            ConnManagerParams.setTimeout(params, 500000);

            // Register http and https schemes
            SchemeRegistry schReg = new SchemeRegistry();
            schReg.register(new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));
            schReg.register(new Scheme("https", sf, 443));

            ClientConnectionManager conManager = new ThreadSafeClientConnManager(params, schReg);
            ((ThreadSafeClientConnManager) conManager).setMaxTotal(50);
            ((ThreadSafeClientConnManager) conManager).setDefaultMaxPerRoute(10);

            httpClient = new DefaultHttpClient(conManager, params);
            httpClient.getParams().setParameter("http.socket.timeout", Integer.valueOf(500000));
        } catch (Exception e) {
            e.printStackTrace();
            return new DefaultHttpClient();
        }
    }
    return httpClient;
}

static class SSLSocketFactoryEx extends SSLSocketFactory {

    SSLContext sslContext = SSLContext.getInstance("TLS");

    public SSLSocketFactoryEx(KeyStore truststore) throws NoSuchAlgorithmException,
            KeyManagementException, KeyStoreException, UnrecoverableKeyException {
        super(truststore);
        // Trust manager that accepts every client and server certificate
        TrustManager tm = new X509TrustManager() {
            @Override
            public java.security.cert.X509Certificate[] getAcceptedIssuers() {
                return null;
            }

            @Override
            public void checkClientTrusted(java.security.cert.X509Certificate[] chain, String authType)
                    throws java.security.cert.CertificateException {
            }

            @Override
            public void checkServerTrusted(java.security.cert.X509Certificate[] chain, String authType)
                    throws java.security.cert.CertificateException {
            }
        };
        sslContext.init(null, new TrustManager[] { tm }, null);
    }

    @Override
    public Socket createSocket(Socket socket, String host, int port, boolean autoClose)
            throws IOException, UnknownHostException {
        return sslContext.getSocketFactory().createSocket(socket, host, port, autoClose);
    }

    @Override
    public Socket createSocket() throws IOException {
        return sslContext.getSocketFactory().createSocket();
    }
}
The main job of the HttpClientUtil class is to accept the HTTPS certificate of any host (all-hosts SSL verification) and to apply some configuration to the HttpClient, such as timeouts and connection-pool sizes.
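The utility above relies on APIs that later HttpClient versions deprecate. As a point of comparison, here is a minimal sketch of the same trust-all setup using the HttpClient 4.4+ builder API; it assumes httpclient 4.4 or newer is on the classpath (which is not in the dependency list above), and the class name, timeouts, and pool sizes are only illustrative:

import javax.net.ssl.SSLContext;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.ssl.SSLContextBuilder;

public final class TrustAllHttpClientSketch {

    public static CloseableHttpClient create() throws Exception {
        // Trust every certificate chain -- a crawler-only shortcut, not for production use
        SSLContext sslContext = SSLContextBuilder.create()
                .loadTrustMaterial(null, (chain, authType) -> true)
                .build();
        SSLConnectionSocketFactory sslSocketFactory =
                new SSLConnectionSocketFactory(sslContext, NoopHostnameVerifier.INSTANCE);
        // Example timeouts; tune them to the target site
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(30_000)
                .setSocketTimeout(30_000)
                .build();
        return HttpClients.custom()
                .setSSLSocketFactory(sslSocketFactory)
                .setMaxConnTotal(50)          // same pool sizes as the utility above
                .setMaxConnPerRoute(10)
                .setDefaultRequestConfig(config)
                .build();
    }
}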
The follow-up code is as follows:
// Read the task information stored in the DB
List<DataUrlCfgInfo> dataUrlCfgInfos = dataColctnRequest.getDataUrlCfgInfos();

// For each task, wrap the single task into a request
FutureColctdDataInfoGettingRequest request = new FutureColctdDataInfoGettingRequest(dataUrlCfgInfo);
request.putExtra("serialNo", serialNo);
futureTasks.add(request.getFutureTask());
// Hand the request to the crawler
getSpider().addRequest(new Request[]{request});

// Get the crawl result (the business registration number)
ColctdDataInfo colctdDaraInfo = futureTask.get(5, TimeUnit.MINUTES);
SimpleHttpClientTool httClientTool = new SimpleHttpClientTool();
// Use the crawl result as the parameter of the next crawl
colctdDataInfos.add(httClientTool.getColctdDataInfo(colctdDaraInfo));
// Write the result to a file
DataColctnCreateFiles.createFile(httClientTool.getColctdDataInfo(colctdDaraInfo), null);

HttpClient client = new HttpClient();
client.getHostConfiguration().setHost("www.sgs.gov.cn/lz/etpsInfo.do", 80, "http");
// Submit the data with POST
HttpMethod method = getPostMethod(colctdDataInfo.getContents());
client.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
Cookie[] cookies = client.getState().getCookies();
StringBuffer tmpcookies = new StringBuffer();
for (Cookie c : cookies) {
    tmpcookies.append(c.toString() + ";");
}
// Set the Referer header; this is very important
method.setRequestHeader("Referer", "http://www.sgs.gov.cn/lz/etpsInfo.do?method=doSearch");
method.setRequestHeader("User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36");
int statusCode = client.executeMethod(method);

ColctdDataInfo colctdDataInfo1 = new ColctdDataInfo();
if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
    // The response is a redirect; read the target address from the headers
    Header locationHeader = method.getResponseHeader("location");
    String location = null;
    if (locationHeader != null) {
        location = locationHeader.getValue();
        System.out.println("SGSLogin:" + location);
    } else {
        System.err.println("Location field value is null.");
    }
    return null;
} else {
    try {
        String str = "";
        str = method.getResponseBodyAsString();
        List<String> list = new ArrayList<String>();
        list.add(str);
        colctdDataInfo1.setContents(list);
        colctdDataInfo1.setDataUrlCfgInfo(colctdDataInfo.getDataUrlCfgInfo());
        colctdDataInfo1.setVersionNo(colctdDataInfo.getVersionNo());
        colctdDataInfo1.setCreateDate(colctdDataInfo.getCreateDate());
        method.releaseConnection();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return colctdDataInfo1;
}
The rough flow of this code: read the crawl task information from the DB, loop over the tasks, use the crawler framework to fetch the result (the business registration number), pass that result as a parameter into another POST request that fetches the detail page, and finally save the page content to a file.
4、Fetching postal-code address information
The preparation code (assemble the postal codes and insert them into the DB) is as follows.
Taking Beijing as an example (codes start with 10, the city part goes up to 26, and the district part is enumerated from 00 to 99):
List<StringBuilder> shi = new ArrayList<StringBuilder>();
List<StringBuilder> location = new ArrayList<StringBuilder>();
// Looking at Beijing postal codes, the city part goes up to 26
for (int j = 0; j < 27; j++) {
    StringBuilder sb = new StringBuilder();
    if (j < 10) {
        sb.append("10" + "0" + j);
    } else {
        sb.append("10" + j);
    }
    shi.add(sb);
}
for (int i = 0; i < shi.size(); i++) {
    // Enumerate the district part from 00 to 99 for full coverage
    for (int j = 0; j < 100; j++) {
        StringBuilder sb = new StringBuilder();
        if (j < 10) {
            sb.append(shi.get(i) + "0" + j);
        } else {
            sb.append(shi.get(i).toString() + j + "");
        }
        location.add(sb);
    }
}
return location;
The assembled codes are then inserted into the DB as tasks, as follows:
List<StringBuilder> quanguo = getBeiJingPost();
// Build the crawl tasks
List<DataUrlCfgInfo> taskUrlCfgInfos = new ArrayList<DataUrlCfgInfo>();
for (int i = 0; i < quanguo.size(); i++) {
    taskUrlCfgInfos.add(new DataUrlCfgInfo("中国邮政官网", "CMB0000801", quanguo.get(i).toString(),
            "", "html", "defaultResoureProcsor", "邮编信息"));
}
// Insert the tasks into the DB
DataColctnTaskInfo dataColctnTaskInfo = new DataColctnTaskInfo("邮编地址信息", "CMB0000801",
        "在中国邮政官网抓取地址信息数据", taskUrlCfgInfos);
getMongoTemplate().insert(dataColctnTaskInfo);
logger.info("添加任务:" + dataColctnTaskInfo.getTaskCode() + "成功");
Follow-up code:
Because of the captcha, I used a simple pixel-value matching approach. In testing, with roughly 400 template images the pass rate was somewhat above thirty percent, which is acceptable.
The code for downloading the captcha images is as follows:
HttpClient httpClient = new HttpClient();
GetMethod getMethod = new GetMethod(
        "http://www.cpdc.com.cn/web/api.php?op=checkcode&code_len=4&font_size=14&width=100&height=26&font_color=&background=");
for (int i = 0; i < 100; i++) {
    try {
        // Execute the GET request
        int statusCode = httpClient.executeMethod(getMethod);
        if (statusCode != HttpStatus.SC_OK) {
            System.err.println("Method failed: " + getMethod.getStatusLine());
        }
        // Save the response body as an image file
        InputStream inputStream = getMethod.getResponseBodyAsStream();
        OutputStream outStream = new FileOutputStream(new File(DOWNLOAD_DIR, i + ".png"));
        IOUtils.copy(inputStream, outStream);
        outStream.close();
        System.out.println("OK!");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the connection
        getMethod.releaseConnection();
    }
}
After the images are downloaded, they are processed as follows:
File dir = new File(DOWNLOAD_DIR);
File[] files = dir.listFiles(new ImageFileFilter("png"));
int counter = 0;
for (File file : files) {
    BufferedImage image = ImageIO.read(file);
    // Remove interference pixels, then split the image into single characters
    removeInterference(image);
    List<BufferedImage> digitImageList = splitImage(image);
    for (int i = 0; i < digitImageList.size(); i++) {
        BufferedImage bi = digitImageList.get(i);
        ImageIO.write(bi, "PNG", new File(TRAIN_DIR, "temp_" + counter++ + ".png"));
    }
}
System.out.println("生成供比对的图片完毕,请到目录中手工识别并重命名图片,并删除其它无关图片!");
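The removeInterference and splitImage helpers referenced above are not shown in the original code. Below is a minimal sketch of what they could look like, assuming a fixed-size four-character captcha and simple grayscale thresholding; the class name, threshold, and slicing strategy are illustrative assumptions rather than the project's actual implementation.

import java.awt.image.BufferedImage;
import java.util.ArrayList;
import java.util.List;

// Hypothetical versions of the two helpers used above, for illustration only
public class CaptchaPreprocessSketch {

    // Remove interference by binarizing: dark pixels become black, everything else white
    public static void removeInterference(BufferedImage image) {
        int threshold = 128; // assumed threshold
        for (int y = 0; y < image.getHeight(); y++) {
            for (int x = 0; x < image.getWidth(); x++) {
                int rgb = image.getRGB(x, y);
                int gray = (((rgb >> 16) & 0xFF) + ((rgb >> 8) & 0xFF) + (rgb & 0xFF)) / 3;
                image.setRGB(x, y, gray < threshold ? 0xFF000000 : 0xFFFFFFFF);
            }
        }
    }

    // Split the 4-character captcha into four equal-width slices
    public static List<BufferedImage> splitImage(BufferedImage image) {
        List<BufferedImage> digits = new ArrayList<BufferedImage>();
        int sliceWidth = image.getWidth() / 4;
        for (int i = 0; i < 4; i++) {
            digits.add(image.getSubimage(i * sliceWidth, 0, sliceWidth, image.getHeight()));
        }
        return digits;
    }
}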
Follow-up code:
ColctdDataInfo colctdDataInfo1 = new ColctdDataInfo();
String imageUrl = "http://www.cpdc.com.cn/web/api.php?op=checkcode&code_len=4&font_size=14&width=100&height=26&font_color=&background=&"
        + new Date().getTime();
HttpClient httpClient = new HttpClient(new MultiThreadedHttpConnectionManager());
httpClient.getParams().setParameter("http.protocol.content-charset", "utf-8");
httpClient.getParams().setContentCharset("utf-8");
httpClient.getParams().setSoTimeout(20000);

// Download the captcha image and decode its value
ImageObject imageObject = getImage(imageUrl);
ImageProcess process = new ImageProcess();
String checkCode = process.getValidateCode(imageObject.getFile());

long time = new Date().getTime();
String makeUrl = makeUrl(dataUrlCfgInfo.getUrl(), checkCode, pageNo, time);
GetMethod getMethod = new GetMethod(makeUrl);
// Reuse the cookies from the captcha request
getMethod.setRequestHeader("Cookie", imageObject.getCookies().toString());
int statusCode = httpClient.executeMethod(getMethod);
JSONObject json = (JSONObject) JSON.parse(IOUtils.toString(getMethod.getResponseBodyAsStream()));

if (json.get("checkcode").equals(false)) {
    // Captcha was rejected; give up this attempt
    getMethod.releaseConnection();
} else {
    int pageCount = JSON.parseObject(json.getString("pageinfo")).getIntValue("TOTALPAGE");
    if (pageCount > 0) {
        for (int i = 1; i < pageCount + 1; i++) {
            String url = makeUrl(dataUrlCfgInfo.getUrl(), checkCode, i, time);
            GetMethod getMethod1 = new GetMethod(url);
            getMethod1.setRequestHeader("Cookie", imageObject.getCookies().toString());
            int statusCode1 = httpClient.executeMethod(getMethod1);
            JSONObject json1 = (JSONObject) JSON.parse(IOUtils.toString(getMethod1.getResponseBodyAsStream()));
            System.out.println("抓取邮编地址成功!" + json1);
            colctdDataInfo1 = makeColctdDataInfo(json1, dataUrlCfgInfo, serialNo);
            // Write the result to a file
            DataColctnCreateFiles.createFile(colctdDataInfo1, null);
        }
    } else if (pageCount == 0) {
        colctdDataInfo1 = makeColctdDataInfo(json, dataUrlCfgInfo, serialNo);
        // Write the result to a file
        DataColctnCreateFiles.createFile(colctdDataInfo1, null);
    }
    getMethod.releaseConnection();
}
return colctdDataInfo1;
The main flow of this code: fetch the captcha image, decode the captcha value, pass it as a request parameter together with the cookies saved from the captcha request, then issue a GET request to fetch the page data and save it to a file.
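The decoding step itself (ImageProcess.getValidateCode) is also not shown. Here is a minimal sketch of the pixel-comparison idea described earlier, assuming the preprocessing helpers sketched above and template images in TRAIN_DIR whose file names start with the character they represent; all names and conventions here are illustrative assumptions.

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.List;

// Hypothetical recognizer: compare each digit slice with the labelled template images
// and pick the label whose template shares the most pixels with the slice
public class CaptchaRecognizerSketch {

    public static String recognize(File captchaFile, File templateDir) throws Exception {
        BufferedImage image = ImageIO.read(captchaFile);
        CaptchaPreprocessSketch.removeInterference(image);                 // helpers sketched earlier
        List<BufferedImage> digits = CaptchaPreprocessSketch.splitImage(image);
        StringBuilder code = new StringBuilder();
        for (BufferedImage digit : digits) {
            String best = "?";
            int bestScore = -1;
            for (File tpl : templateDir.listFiles()) {
                if (!tpl.getName().endsWith(".png")) {
                    continue;
                }
                BufferedImage template = ImageIO.read(tpl);
                // Count positions where the slice and the template have the same pixel value
                int score = 0;
                int w = Math.min(digit.getWidth(), template.getWidth());
                int h = Math.min(digit.getHeight(), template.getHeight());
                for (int x = 0; x < w; x++) {
                    for (int y = 0; y < h; y++) {
                        if (digit.getRGB(x, y) == template.getRGB(x, y)) {
                            score++;
                        }
                    }
                }
                if (score > bestScore) {
                    bestScore = score;
                    // assume template files are named "<character>.png", e.g. "7.png"
                    best = tpl.getName().substring(0, 1);
                }
            }
            code.append(best);
        }
        return code.toString();
    }
}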
5、Development takeaways
For the crawling itself, the first step is to analyze the request: whether it is a GET or a POST and whether it goes over HTTP or HTTPS, and then complete the request by assembling the right parameters. One point worth mentioning is handling dynamically rendered pages. Because of a change in requirements, the dynamic-rendering code was removed from the project, but it is the part I learned the most from: it used Selenium with WebDriver, creating a ChromeDriver instance, assembling JavaScript, and executing the script so that the page renders itself. I also looked briefly at HtmlUnit, which executes pages without a real browser and is also convenient; I plan to study HtmlUnit further.
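That code is no longer in the project, but a minimal sketch of the Selenium idea looks roughly like this; it assumes the selenium-java dependency and a local chromedriver binary, and the URL, driver path, and script are placeholders rather than the project's real ones.

import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;

public class DynamicPageSketch {

    public static void main(String[] args) {
        // Assumes a chromedriver binary is available at this (placeholder) path
        System.setProperty("webdriver.chrome.driver", "/path/to/chromedriver");
        WebDriver driver = new ChromeDriver();
        try {
            driver.get("http://example.com/some-dynamic-page"); // placeholder URL
            // Execute assembled JavaScript against the loaded page, e.g. trigger a search
            ((JavascriptExecutor) driver).executeScript("document.querySelector('form').submit();");
            // After the script runs, read back the rendered HTML for parsing
            String html = driver.getPageSource();
            System.out.println(html.length());
        } finally {
            driver.quit();
        }
    }
}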