import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.net.URLConnection; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Map; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class TopitMe { private static final String URL_HOME = "http://www.topit.me"; private static final Logger LOGGER = LoggerFactory.getLogger(TopitMe.class); //解决重定向 private Document getHtml(String theme,String num) throws IOException{ Map<String, String> cookies = Jsoup.connect(URL_HOME) .execute().cookies(); cookies.put("item-tip", "true"); cookies.put("tip_global_1", "true"); cookies.put("is_click", "1"); Document doc = Jsoup.connect(URL_HOME+ "/" + theme + "/" + num).cookies(cookies).get(); return doc; } //专辑 public void getAlbum(String num) throws IOException{ int count = 0; int success = 0; int fail = 0; int ii = 1; String title = ""; //这个循环是为了分页,写死最多10页,懒得去单独取了。 for (int i = 1; i < 10; i++) { ii = i-1; Document doc = null; try { doc = getHtml("album" , num +"?p=" + i); } catch (Exception e1) { //e1.printStackTrace(); LOGGER.error("打开页面失败,编号={},页码={} ",num,i); break; } if(doc == null) break; //找不到标题就提早退出 Element element = doc.select("div.userinfo_blk h2").first(); if(element == null) { LOGGER.error("抓取标题失败,编号={},页码={} ",num,i); break; } //专辑名称 title = element.text(); //校验文件名是否合法 if(!filterName(title) || title.equals("")){ SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); title = dateFormat.format(new Date()); } Elements imgs = doc.select("div.catalog img[src]"); if(imgs.isEmpty()) { LOGGER.error("抓取图片标签失败,编号={},页码={} ",num,i); break; } for (int j = 0; j < imgs.size(); j++) { Element img = imgs.get(j); //img地址 ,地址放的属性不同,真是见鬼了 String href = img.attr("data-original"); if(href == null || href.equals("")){ href = img.attr("src"); } //属性不同.... String fileName = img.attr("alt"); if(fileName == null || fileName.equals("")){ fileName = img.attr("title"); } if(!filterName(fileName) || fileName.equals("")){ fileName = String.valueOf(new Date().getTime()); } count++; //取大图,m--j if(href.contains("/m/")){ String newHref = href.replace("/m/", "/l/"); try { download(title , i + "-" + (j+1) + "-" + fileName, newHref); //System.out.println(newHref); success ++; } catch (Exception e) { //System.err.println(newHref); fail++; LOGGER.error("下载失败,编号={},专辑={},页码={},地址={} ",num,title,i,newHref); } }else{ String newHref = href.substring(0, href.length()-5) + "l" + href.substring(href.length()-4); try { download(title , i + "-" + (j+1) + "-" + fileName, newHref); //System.out.println(newHref); success ++; } catch (Exception e) { //System.err.println(newHref); fail++; LOGGER.error("下载失败,编号={},专辑={},页码={},地址={} ",num,title,i,newHref); } } } } //System.out.println("[统计]编号: "+num+",抓取: "+count+",成功 : " + success + ",失败: " + fail); LOGGER.info("[统计]编号={},专辑={},抓取={},总计={},成功={},失败={}",num,title,ii,count,success,fail); } //校验文件名合法性 private boolean filterName(String name){ String[] filter = {"\\", "/" , ":" , "*" , "?" , "<" , ">" , "|" }; for (String string : filter) { if(name.contains(string)){ return false; } } return true; } //下载 private void download(String dir ,String fileName, String href) throws IOException { URL url = new URL(href); URLConnection conn = url.openConnection(); InputStream inStream = conn.getInputStream(); byte[] buffer = new byte[inStream.available()]; int length; File savefile = new File("F:/picture/" + dir); if (!savefile.exists()) { // 建立分离目录 savefile.mkdirs(); } FileOutputStream fos = new FileOutputStream("F:/picture/" + dir + "/" + fileName + ".jpg"); while ((length = inStream.read(buffer)) != -1) { fos.write(buffer, 0, length); } fos.close(); inStream.close(); } } //用法: public static void main(String[] args) throws Exception { //http://www.topit.me/album/12598 //专辑编号 TopitMe me = new TopitMe(); me.getAlbum("12598"); }