Web Crawler Notes
1、Development environment
1. JDK 1.8.0_91
2. Database: MongoDB 2.6 Standard
3. apache-activemq-5.13.2
4. commons-httpclient-3.0
5. Server: apache-tomcat-7.0.69
6. webmagic-core-0.5.2
2、Project requirements
a) Crawl business registration numbers from the national enterprise information publicity system, then use each registration number to fetch that enterprise's detail information.
b) Pre-generate candidate postal codes, then fetch the address information under each postal code from the China Post official website.
3、Fetching enterprise detail information
The preparation code is as follows:
CloseableHttpClient client = HttpClientUtil.getHttpClient();
HttpPost post = new HttpPost();
post.setURI(new URI("https://www.sgs.gov.cn/notice/search/ent_spot_check_list"));
// Form parameters for the search request
BasicNameValuePair simcard1 = new BasicNameValuePair("captcha", "");
BasicNameValuePair simcard2 = new BasicNameValuePair("condition.pageNo", pageNo);
BasicNameValuePair simcard3 = new BasicNameValuePair("condition.insType", "1");
BasicNameValuePair simcard4 = new BasicNameValuePair("session.token", "e07b51eb-8c11-4fea-9e3f-f9cb3c0b20e9");
BasicNameValuePair simcard5 = new BasicNameValuePair("condition.keyword", "");
List<BasicNameValuePair> formParams = new ArrayList<BasicNameValuePair>();
formParams.add(simcard1);
formParams.add(simcard2);
formParams.add(simcard3);
formParams.add(simcard4);
formParams.add(simcard5);
UrlEncodedFormEntity uefEntity = new UrlEncodedFormEntity(formParams, "UTF-8");
post.setEntity(uefEntity);
CloseableHttpResponse response = client.execute(post);

List<String> gongshangxinxi = new ArrayList<String>();
String s = "";
if (response.getStatusLine().getStatusCode() == 200) {
    try {
        HttpEntity resEntity = response.getEntity();
        // Parse the result page
        if (resEntity != null) {
            s = EntityUtils.toString(resEntity, "UTF-8");
            Document doc = Jsoup.parse(s);
            Elements content = doc.getElementsByClass("center");
            for (int i = 0; i < content.size(); i++) {
                String target = content.get(i).getElementsByClass("center").text();
                gongshangxinxi.add(target);
            }
        }
        // Build the crawl tasks
        List<DataUrlCfgInfo> taskUrlCfgInfos = new ArrayList<>();
        for (int i = 0; i < gongshangxinxi.size(); i++) {
            if (i % 2 == 0) {
                taskUrlCfgInfos.add(new DataUrlCfgInfo("全国公示网", "CMB00004",
                        "http://www.sgs.gov.cn/lz/etpsInfo.do?method=doSearch&searchType=2&keyWords=" + gongshangxinxi.get(i),
                        "", "html", "defaultResoureProcsor", "企业信息"));
            }
        }
        // Insert the tasks into the DB
        DataColctnTaskInfo dataColctnTaskInfo = new DataColctnTaskInfo("工商信息获取", "CMB00004",
                "在全国公示网抓取企业信息数据", taskUrlCfgInfos);
        getMongoTemplate().insert(dataColctnTaskInfo);
        logger.info("添加任务:" + dataColctnTaskInfo.getTaskCode() + "成功");
        EntityUtils.consume(resEntity);
    } finally {
        response.close();
    }
}
This code builds a POST request, adds a few request parameters, executes the request to get the page, parses it, and inserts the assembled task information into the database.
The HttpClientUtil code is as follows:
private static DefaultHttpClient httpClient;

public static synchronized DefaultHttpClient getHttpClient() {
    if (null == httpClient) {
        try {
            // Trust store that accepts every certificate
            KeyStore trustStore = KeyStore.getInstance(KeyStore.getDefaultType());
            trustStore.load(null, null);
            SSLSocketFactoryEx ss = new SSLSocketFactoryEx(trustStore);
            SSLSocketFactory sf = ss;
            // Skip host-name verification for every host
            sf.setHostnameVerifier(SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);

            HttpParams params = new BasicHttpParams();
            HttpProtocolParams.setContentCharset(params, HTTP.DEFAULT_CONTENT_CHARSET);
            HttpProtocolParams.setUseExpectContinue(params, true);
            // Connection-manager timeout
            ConnManagerParams.setTimeout(params, 500000);

            // Register http and https schemes
            SchemeRegistry schReg = new SchemeRegistry();
            schReg.register(new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));
            schReg.register(new Scheme("https", sf, 443));

            ClientConnectionManager conManager = new ThreadSafeClientConnManager(params, schReg);
            ((ThreadSafeClientConnManager) conManager).setMaxTotal(50);
            ((ThreadSafeClientConnManager) conManager).setDefaultMaxPerRoute(10);

            httpClient = new DefaultHttpClient(conManager, params);
            httpClient.getParams().setParameter("http.socket.timeout", Integer.valueOf(500000));
        } catch (Exception e) {
            e.printStackTrace();
            return new DefaultHttpClient();
        }
    }
    return httpClient;
}

static class SSLSocketFactoryEx extends SSLSocketFactory {

    SSLContext sslContext = SSLContext.getInstance("TLS");

    public SSLSocketFactoryEx(KeyStore truststore) throws NoSuchAlgorithmException,
            KeyManagementException, KeyStoreException, UnrecoverableKeyException {
        super(truststore);
        // Trust manager that accepts every client and server certificate
        TrustManager tm = new X509TrustManager() {
            @Override
            public java.security.cert.X509Certificate[] getAcceptedIssuers() {
                return null;
            }

            @Override
            public void checkClientTrusted(java.security.cert.X509Certificate[] chain, String authType)
                    throws java.security.cert.CertificateException {
            }

            @Override
            public void checkServerTrusted(java.security.cert.X509Certificate[] chain, String authType)
                    throws java.security.cert.CertificateException {
            }
        };
        sslContext.init(null, new TrustManager[] { tm }, null);
    }

    @Override
    public Socket createSocket(Socket socket, String host, int port, boolean autoClose)
            throws IOException, UnknownHostException {
        return sslContext.getSocketFactory().createSocket(socket, host, port, autoClose);
    }

    @Override
    public Socket createSocket() throws IOException {
        return sslContext.getSocketFactory().createSocket();
    }
}
The main job of the HttpClientUtil class is to accept the HTTPS certificate of any host (all-hosts SSL verification) and to apply some configuration to the HttpClient, such as timeouts and connection-pool sizes.
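The utility above relies on APIs that later HttpClient versions deprecate. As a point of comparison, here is a minimal sketch of the same trust-all setup using the HttpClient 4.4+ builder API; it assumes httpclient 4.4 or newer is on the classpath (which is not in the dependency list above), and the class name, timeouts, and pool sizes are only illustrative:

import javax.net.ssl.SSLContext;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.ssl.SSLContextBuilder;

public final class TrustAllHttpClientSketch {

    public static CloseableHttpClient create() throws Exception {
        // Trust every certificate chain -- a crawler-only shortcut, not for production use
        SSLContext sslContext = SSLContextBuilder.create()
                .loadTrustMaterial(null, (chain, authType) -> true)
                .build();
        SSLConnectionSocketFactory sslSocketFactory =
                new SSLConnectionSocketFactory(sslContext, NoopHostnameVerifier.INSTANCE);
        // Example timeouts; tune them to the target site
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(30_000)
                .setSocketTimeout(30_000)
                .build();
        return HttpClients.custom()
                .setSSLSocketFactory(sslSocketFactory)
                .setMaxConnTotal(50)          // same pool sizes as the utility above
                .setMaxConnPerRoute(10)
                .setDefaultRequestConfig(config)
                .build();
    }
}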
The follow-up code is as follows:
// Read the task information stored in the DB
List<DataUrlCfgInfo> dataUrlCfgInfos = dataColctnRequest.getDataUrlCfgInfos();

// For each task, wrap the single task into a request
FutureColctdDataInfoGettingRequest request = new FutureColctdDataInfoGettingRequest(dataUrlCfgInfo);
request.putExtra("serialNo", serialNo);
futureTasks.add(request.getFutureTask());
// Hand the request to the crawler
getSpider().addRequest(new Request[]{request});

// Get the crawl result (the business registration number)
ColctdDataInfo colctdDaraInfo = futureTask.get(5, TimeUnit.MINUTES);
SimpleHttpClientTool httClientTool = new SimpleHttpClientTool();
// Use the crawl result as the parameter of the next crawl
colctdDataInfos.add(httClientTool.getColctdDataInfo(colctdDaraInfo));
// Write the result to a file
DataColctnCreateFiles.createFile(httClientTool.getColctdDataInfo(colctdDaraInfo), null);

HttpClient client = new HttpClient();
client.getHostConfiguration().setHost("www.sgs.gov.cn/lz/etpsInfo.do", 80, "http");
// Submit the data with POST
HttpMethod method = getPostMethod(colctdDataInfo.getContents());
client.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
Cookie[] cookies = client.getState().getCookies();
StringBuffer tmpcookies = new StringBuffer();
for (Cookie c : cookies) {
    tmpcookies.append(c.toString() + ";");
}
// Set the Referer header; this is very important
method.setRequestHeader("Referer", "http://www.sgs.gov.cn/lz/etpsInfo.do?method=doSearch");
method.setRequestHeader("User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36");
int statusCode = client.executeMethod(method);

ColctdDataInfo colctdDataInfo1 = new ColctdDataInfo();
if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
    // The response is a redirect; read the target address from the headers
    Header locationHeader = method.getResponseHeader("location");
    String location = null;
    if (locationHeader != null) {
        location = locationHeader.getValue();
        System.out.println("SGSLogin:" + location);
    } else {
        System.err.println("Location field value is null.");
    }
    return null;
} else {
    try {
        String str = "";
        str = method.getResponseBodyAsString();
        List<String> list = new ArrayList<String>();
        list.add(str);
        colctdDataInfo1.setContents(list);
        colctdDataInfo1.setDataUrlCfgInfo(colctdDataInfo.getDataUrlCfgInfo());
        colctdDataInfo1.setVersionNo(colctdDataInfo.getVersionNo());
        colctdDataInfo1.setCreateDate(colctdDataInfo.getCreateDate());
        method.releaseConnection();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return colctdDataInfo1;
}
The rough flow of this code: read the crawl task information from the DB, loop over the tasks, use the crawler framework to fetch the result (the business registration number), pass that result as a parameter into another POST request that fetches the detail page, and finally save the page content to a file.
4、Fetching postal-code address information
The preparation code (assemble the postal codes and insert them into the DB) is as follows.
Taking Beijing as an example (codes start with 10, the city part goes up to 26, and the district part is enumerated from 00 to 99):
List<StringBuilder> shi = new ArrayList<StringBuilder>();
List<StringBuilder> location = new ArrayList<StringBuilder>();
// Looking at Beijing postal codes, the city part goes up to 26
for (int j = 0; j < 27; j++) {
    StringBuilder sb = new StringBuilder();
    if (j < 10) {
        sb.append("10" + "0" + j);
    } else {
        sb.append("10" + j);
    }
    shi.add(sb);
}
for (int i = 0; i < shi.size(); i++) {
    // Enumerate the district part from 00 to 99 for full coverage
    for (int j = 0; j < 100; j++) {
        StringBuilder sb = new StringBuilder();
        if (j < 10) {
            sb.append(shi.get(i) + "0" + j);
        } else {
            sb.append(shi.get(i).toString() + j + "");
        }
        location.add(sb);
    }
}
return location;
The assembled codes are then inserted into the DB as tasks, as follows:
List<StringBuilder> quanguo = getBeiJingPost();
// Build the crawl tasks
List<DataUrlCfgInfo> taskUrlCfgInfos = new ArrayList<DataUrlCfgInfo>();
for (int i = 0; i < quanguo.size(); i++) {
    taskUrlCfgInfos.add(new DataUrlCfgInfo("中国邮政官网", "CMB0000801", quanguo.get(i).toString(),
            "", "html", "defaultResoureProcsor", "邮编信息"));
}
// Insert the tasks into the DB
DataColctnTaskInfo dataColctnTaskInfo = new DataColctnTaskInfo("邮编地址信息", "CMB0000801",
        "在中国邮政官网抓取地址信息数据", taskUrlCfgInfos);
getMongoTemplate().insert(dataColctnTaskInfo);
logger.info("添加任务:" + dataColctnTaskInfo.getTaskCode() + "成功");
Follow-up code:
Because of the captcha, I used a simple pixel-value matching approach. In testing, with roughly 400 template images the pass rate was somewhat above thirty percent, which is acceptable.
The code for downloading the captcha images is as follows:
HttpClient httpClient = new HttpClient();
GetMethod getMethod = new GetMethod(
        "http://www.cpdc.com.cn/web/api.php?op=checkcode&code_len=4&font_size=14&width=100&height=26&font_color=&background=");
for (int i = 0; i < 100; i++) {
    try {
        // Execute the GET request
        int statusCode = httpClient.executeMethod(getMethod);
        if (statusCode != HttpStatus.SC_OK) {
            System.err.println("Method failed: " + getMethod.getStatusLine());
        }
        // Save the response body as an image file
        InputStream inputStream = getMethod.getResponseBodyAsStream();
        OutputStream outStream = new FileOutputStream(new File(DOWNLOAD_DIR, i + ".png"));
        IOUtils.copy(inputStream, outStream);
        outStream.close();
        System.out.println("OK!");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the connection
        getMethod.releaseConnection();
    }
}
After the images are downloaded, they are processed as follows:
File dir = new File(DOWNLOAD_DIR);
File[] files = dir.listFiles(new ImageFileFilter("png"));
int counter = 0;
for (File file : files) {
    BufferedImage image = ImageIO.read(file);
    // Remove interference pixels, then split the image into single characters
    removeInterference(image);
    List<BufferedImage> digitImageList = splitImage(image);
    for (int i = 0; i < digitImageList.size(); i++) {
        BufferedImage bi = digitImageList.get(i);
        ImageIO.write(bi, "PNG", new File(TRAIN_DIR, "temp_" + counter++ + ".png"));
    }
}
System.out.println("生成供比对的图片完毕,请到目录中手工识别并重命名图片,并删除其它无关图片!");
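The removeInterference and splitImage helpers referenced above are not shown in the original code. Below is a minimal sketch of what they could look like, assuming a fixed-size four-character captcha and simple grayscale thresholding; the class name, threshold, and slicing strategy are illustrative assumptions rather than the project's actual implementation.

import java.awt.image.BufferedImage;
import java.util.ArrayList;
import java.util.List;

// Hypothetical versions of the two helpers used above, for illustration only
public class CaptchaPreprocessSketch {

    // Remove interference by binarizing: dark pixels become black, everything else white
    public static void removeInterference(BufferedImage image) {
        int threshold = 128; // assumed threshold
        for (int y = 0; y < image.getHeight(); y++) {
            for (int x = 0; x < image.getWidth(); x++) {
                int rgb = image.getRGB(x, y);
                int gray = (((rgb >> 16) & 0xFF) + ((rgb >> 8) & 0xFF) + (rgb & 0xFF)) / 3;
                image.setRGB(x, y, gray < threshold ? 0xFF000000 : 0xFFFFFFFF);
            }
        }
    }

    // Split the 4-character captcha into four equal-width slices
    public static List<BufferedImage> splitImage(BufferedImage image) {
        List<BufferedImage> digits = new ArrayList<BufferedImage>();
        int sliceWidth = image.getWidth() / 4;
        for (int i = 0; i < 4; i++) {
            digits.add(image.getSubimage(i * sliceWidth, 0, sliceWidth, image.getHeight()));
        }
        return digits;
    }
}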
Follow-up code:
ColctdDataInfo colctdDataInfo1 = new ColctdDataInfo();
String imageUrl = "http://www.cpdc.com.cn/web/api.php?op=checkcode&code_len=4&font_size=14&width=100&height=26&font_color=&background=&"
        + new Date().getTime();
HttpClient httpClient = new HttpClient(new MultiThreadedHttpConnectionManager());
httpClient.getParams().setParameter("http.protocol.content-charset", "utf-8");
httpClient.getParams().setContentCharset("utf-8");
httpClient.getParams().setSoTimeout(20000);

// Download the captcha image and decode its value
ImageObject imageObject = getImage(imageUrl);
ImageProcess process = new ImageProcess();
String checkCode = process.getValidateCode(imageObject.getFile());

long time = new Date().getTime();
String makeUrl = makeUrl(dataUrlCfgInfo.getUrl(), checkCode, pageNo, time);
GetMethod getMethod = new GetMethod(makeUrl);
// Reuse the cookies from the captcha request
getMethod.setRequestHeader("Cookie", imageObject.getCookies().toString());
int statusCode = httpClient.executeMethod(getMethod);
JSONObject json = (JSONObject) JSON.parse(IOUtils.toString(getMethod.getResponseBodyAsStream()));

if (json.get("checkcode").equals(false)) {
    // Captcha was rejected; give up this attempt
    getMethod.releaseConnection();
} else {
    int pageCount = JSON.parseObject(json.getString("pageinfo")).getIntValue("TOTALPAGE");
    if (pageCount > 0) {
        for (int i = 1; i < pageCount + 1; i++) {
            String url = makeUrl(dataUrlCfgInfo.getUrl(), checkCode, i, time);
            GetMethod getMethod1 = new GetMethod(url);
            getMethod1.setRequestHeader("Cookie", imageObject.getCookies().toString());
            int statusCode1 = httpClient.executeMethod(getMethod1);
            JSONObject json1 = (JSONObject) JSON.parse(IOUtils.toString(getMethod1.getResponseBodyAsStream()));
            System.out.println("抓取邮编地址成功!" + json1);
            colctdDataInfo1 = makeColctdDataInfo(json1, dataUrlCfgInfo, serialNo);
            // Write the result to a file
            DataColctnCreateFiles.createFile(colctdDataInfo1, null);
        }
    } else if (pageCount == 0) {
        colctdDataInfo1 = makeColctdDataInfo(json, dataUrlCfgInfo, serialNo);
        // Write the result to a file
        DataColctnCreateFiles.createFile(colctdDataInfo1, null);
    }
    getMethod.releaseConnection();
}
return colctdDataInfo1;
The main flow of this code: fetch the captcha image, decode the captcha value, pass it as a request parameter together with the cookies saved from the captcha request, then issue a GET request to fetch the page data and save it to a file.
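The decoding step itself (ImageProcess.getValidateCode) is also not shown. Here is a minimal sketch of the pixel-comparison idea described earlier, assuming the preprocessing helpers sketched above and template images in TRAIN_DIR whose file names start with the character they represent; all names and conventions here are illustrative assumptions.

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.List;

// Hypothetical recognizer: compare each digit slice with the labelled template images
// and pick the label whose template shares the most pixels with the slice
public class CaptchaRecognizerSketch {

    public static String recognize(File captchaFile, File templateDir) throws Exception {
        BufferedImage image = ImageIO.read(captchaFile);
        CaptchaPreprocessSketch.removeInterference(image);                 // helpers sketched earlier
        List<BufferedImage> digits = CaptchaPreprocessSketch.splitImage(image);
        StringBuilder code = new StringBuilder();
        for (BufferedImage digit : digits) {
            String best = "?";
            int bestScore = -1;
            for (File tpl : templateDir.listFiles()) {
                if (!tpl.getName().endsWith(".png")) {
                    continue;
                }
                BufferedImage template = ImageIO.read(tpl);
                // Count positions where the slice and the template have the same pixel value
                int score = 0;
                int w = Math.min(digit.getWidth(), template.getWidth());
                int h = Math.min(digit.getHeight(), template.getHeight());
                for (int x = 0; x < w; x++) {
                    for (int y = 0; y < h; y++) {
                        if (digit.getRGB(x, y) == template.getRGB(x, y)) {
                            score++;
                        }
                    }
                }
                if (score > bestScore) {
                    bestScore = score;
                    // assume template files are named "<character>.png", e.g. "7.png"
                    best = tpl.getName().substring(0, 1);
                }
            }
            code.append(best);
        }
        return code.toString();
    }
}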
5、Development takeaways
For the crawling itself, the first step is to analyze the request: whether it is a GET or a POST and whether it goes over HTTP or HTTPS, and then complete the request by assembling the right parameters. One point worth mentioning is handling dynamically rendered pages. Because of a change in requirements, the dynamic-rendering code was removed from the project, but it is the part I learned the most from: it used Selenium with WebDriver, creating a ChromeDriver instance, assembling JavaScript, and executing the script so that the page renders itself. I also looked briefly at HtmlUnit, which executes pages without a real browser and is also convenient; I plan to study HtmlUnit further.
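That code is no longer in the project, but a minimal sketch of the Selenium idea looks roughly like this; it assumes the selenium-java dependency and a local chromedriver binary, and the URL, driver path, and script are placeholders rather than the project's real ones.

import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;

public class DynamicPageSketch {

    public static void main(String[] args) {
        // Assumes a chromedriver binary is available at this (placeholder) path
        System.setProperty("webdriver.chrome.driver", "/path/to/chromedriver");
        WebDriver driver = new ChromeDriver();
        try {
            driver.get("http://example.com/some-dynamic-page"); // placeholder URL
            // Execute assembled JavaScript against the loaded page, e.g. trigger a search
            ((JavascriptExecutor) driver).executeScript("document.querySelector('form').submit();");
            // After the script runs, read back the rendered HTML for parsing
            String html = driver.getPageSource();
            System.out.println(html.length());
        } finally {
            driver.quit();
        }
    }
}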