是的,Java也能够作网络爬虫,不只能够爬静态网页的图片,也能够爬动态网页的图片,好比采用Ajax技术进行异步加载的百度瀑布流。
之前有写过用Java进行百度图片的抓取,但只能抓取到第一二页,本博文则对此问题进行了深刻研究,提出了另一种思路解决问题。个人思路是这样的:之前人们总认为既然百度瀑布流是采用Javascript进行异步加载的,那么爬取图片至少要有一个模拟浏览器,好比Java领域中的无界面浏览器工具HtmlUnit,但后来我发现其实Jsoup也是能够的,只要用Jsoup去向百度服务器发送Ajax请求就好了,幸运的是我在观察百度图片的ajax请求时还真发现有两个类型的请求方式:avatarjson和acjson,实验告诉咱们第一种请求方式已经几乎能够知足咱们的全部需求。
本博文所实现的效果是:根据输入的多个关键字,能够按定制的页数把各自关键字的搜索结果下载到本地文件夹中。具体以下所示:
废话很少说,程序满上------->
package com.kendy.spider;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringEscapeUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Crawls Baidu image search ("waterfall"/Ajax) results.
 *
 * <p>Instead of driving a headless browser, this sends the same
 * {@code avatarjson} Ajax request the Baidu page itself issues, via Jsoup,
 * then regex-extracts the {@code objURL} image links from the JSON payload
 * and downloads each image into a per-keyword folder.
 */
public class JsoupBaidu2 {

    /**
     * Matches {@code objURL":"http://....jpg} in the unescaped JSON response.
     * Compiled once instead of per page (the original recompiled it in the loop).
     */
    private static final Pattern OBJ_URL_PATTERN =
            Pattern.compile("objURL\":\"http://.+?\\.jpg");

    public static void main(String[] args) throws Exception {
        String downloadPath = "C:\\Users\\Kendy\\Desktop\\中国明星图";
        List<String> list = nameList("凯莉·布鲁克 詹妮弗·洛佩兹 碧昂斯·诺里斯");
        getPictures(list, 1, downloadPath); // 1 page ≈ 30 images per keyword
    }

    /**
     * Downloads {@code max} pages (~30 images each) of Baidu image results for
     * every keyword into {@code downloadPath\keyword\}, then deletes any
     * zero-byte files left behind by failed downloads.
     *
     * @param keywordList  search keywords, one sub-folder is created per keyword
     * @param max          number of result pages to fetch per keyword
     * @param downloadPath root folder for the downloads (Windows-style path)
     */
    public static void getPictures(List<String> keywordList, int max, String downloadPath) throws Exception {
        for (String keyword : keywordList) {
            String tempPath = downloadPath.endsWith("\\") ? downloadPath : downloadPath + "\\";
            tempPath = tempPath + keyword + "\\";
            File keywordDir = new File(tempPath);
            if (!keywordDir.exists()) {
                keywordDir.mkdirs();
            }
            int picCount = 1;
            // BUG FIX: the original looped page<=max and fetched max+1 pages,
            // contradicting the "1 = one page" contract documented in main().
            for (int page = 0; page < max; page++) {
                sop("正在下载第" + page + "页面");
                try {
                    // avatarjson is the Ajax endpoint the Baidu waterfall page calls;
                    // pn = result offset (30 per page), gsm = offset in hex.
                    String url = "http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word="
                            + keyword + "&cg=star&pn=" + page * 30
                            + "&rn=30&itg=0&z=0&fr=&width=&height=&lm=-1&ic=0&s=0&st=-1&gsm="
                            + Integer.toHexString(page * 30);
                    sop(url);
                    // BUG FIX: the original chain contained a stray "get();" between
                    // userAgent(...) and timeout(...), which does not compile.
                    Document document = Jsoup.connect(url)
                            .data("query", "Java")
                            .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
                            .timeout(5000)
                            .get();
                    // Unescape HTML entities so the objURL links are plain text.
                    String xmlSource = StringEscapeUtils.unescapeHtml3(document.toString());
                    sop(xmlSource);
                    Matcher m = OBJ_URL_PATTERN.matcher(xmlSource);
                    while (m.find()) {
                        // Strip the leading 'objURL":"' (9 chars) to get the bare URL.
                        String finalURL = m.group().substring(9);
                        sop(keyword + picCount++ + ":" + finalURL);
                        download(finalURL, tempPath);
                        sop(" 下载成功");
                    }
                } catch (IOException e) {
                    // Best effort: a failed page must not abort the other keywords/pages.
                    e.printStackTrace();
                }
            }
        }
        sop("下载完毕");
        delMultyFile(downloadPath);
        sop("已经删除全部空图");
    }

    /**
     * Recursively deletes all zero-byte files under {@code path}
     * (empty files are the residue of downloads that failed midway).
     *
     * @throws RuntimeException if {@code path} does not exist
     */
    public static void delMultyFile(String path) {
        File file = new File(path);
        if (!file.exists())
            throw new RuntimeException("File \"" + path + "\" NotFound when excute the method of delMultyFile()....");
        File[] fileList = file.listFiles();
        if (fileList == null) { // listFiles() is null for non-directories / IO errors
            return;
        }
        for (File f : fileList) {
            if (f.isDirectory()) {
                delMultyFile(f.getAbsolutePath());
            } else if (f.length() == 0) {
                sop(f.delete() + "---" + f.getName());
            }
        }
    }

    /**
     * Splits a keyword string on the first separator found — ASCII comma,
     * Chinese enumeration comma ("、"), or a space — in that priority order.
     * A string with no separator yields a single-element list.
     */
    public static List<String> nameList(String nameList) {
        List<String> arr = new ArrayList<>();
        String[] list;
        if (nameList.contains(","))
            list = nameList.split(",");
        else if (nameList.contains("、"))
            list = nameList.split("、");
        else if (nameList.contains(" "))
            list = nameList.split(" ");
        else {
            arr.add(nameList);
            return arr;
        }
        for (String s : list) {
            arr.add(s);
        }
        return arr;
    }

    /** Shorthand for {@code System.out.println}. */
    public static void sop(Object obj) {
        System.out.println(obj);
    }

    /**
     * Downloads one image from {@code url} into directory {@code path}; the
     * file name is taken from the last path segment of the URL.
     *
     * <p>BUG FIX: the original put the download code in the {@code else} branch
     * of the directory-existence check, so when the folder had to be created
     * the image was silently skipped. Now the directory is ensured first and
     * the download always proceeds.
     */
    public static void download(String url, String path) {
        FileOutputStream fos = null;
        InputStream in = null;
        byte[] buffer = new byte[1024];
        try {
            String downloadName = url.substring(url.lastIndexOf("/") + 1);
            File dirFile = new File(path);
            if (!dirFile.exists() && path.length() > 0 && dirFile.mkdirs()) {
                sop("creat document file \"" + path.substring(0, path.length() - 1) + "\" success...\n");
            }
            if (url.startsWith("http")) {
                URL urlObj = new URL(url);
                URLConnection con = urlObj.openConnection();
                HttpURLConnection httpCon = (HttpURLConnection) con;
                in = httpCon.getInputStream();
                fos = new FileOutputStream(new File(path + downloadName));
                int num;
                // BUG FIX: write each chunk in one call instead of byte-by-byte.
                while ((num = in.read(buffer)) != -1) {
                    fos.write(buffer, 0, num);
                }
            }
        } catch (FileNotFoundException notFoundE) {
            sop("找不到该网络图片....");
        } catch (IOException ioE) {
            sop("产生IO异常.....");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // BUG FIX: the original never closed the InputStream (leak) and
            // called fos.close() unconditionally even when fos was null.
            closeQuietly(in);
            closeQuietly(fos);
        }
    }

    /** Closes a resource, swallowing any exception (cleanup must not mask errors). */
    private static void closeQuietly(java.io.Closeable c) {
        if (c != null) {
            try {
                c.close();
            } catch (Exception ignored) {
                // intentionally ignored: failure to close is not actionable here
            }
        }
    }
}