爬取topit.me专辑图片

时间 2019-11-20

标签 topit.me topit 专辑图片 繁體版

原文 原文链接

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class TopitMe {
	private static final String URL_HOME = "http://www.topit.me";
	private static final Logger LOGGER = LoggerFactory.getLogger(TopitMe.class);
	//解决重定向
	private Document getHtml(String theme,String num) throws IOException{
		Map<String, String> cookies = Jsoup.connect(URL_HOME)
				.execute().cookies();
		cookies.put("item-tip", "true");
		cookies.put("tip_global_1", "true");
		cookies.put("is_click", "1");
		Document doc = Jsoup.connect(URL_HOME+ "/" + theme + "/" + num).cookies(cookies).get();
		return doc;
	}
	//专辑
	public void getAlbum(String num) throws IOException{
		int count = 0;
		int success = 0;
		int fail = 0;
		int ii = 1;
		String title = "";
		//这个循环是为了分页,写死最多10页,懒得去单独取了。
		for (int i = 1; i < 10; i++) {
			ii = i-1;
			Document doc = null;
			try {
				doc = getHtml("album" , num +"?p=" + i);
			} catch (Exception e1) {
				//e1.printStackTrace();
				LOGGER.error("打开页面失败,编号={},页码={} ",num,i);
				break;
			}
			if(doc == null) break;
			//找不到标题就提早退出
			Element element = doc.select("div.userinfo_blk h2").first();
			if(element == null) {
				LOGGER.error("抓取标题失败,编号={},页码={} ",num,i);
				break;
			}
			//专辑名称 
			title = element.text();
			//校验文件名是否合法
			if(!filterName(title) || title.equals("")){
				SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
				title = dateFormat.format(new Date());
			}
			Elements imgs = doc.select("div.catalog img[src]");
			if(imgs.isEmpty()) {
				LOGGER.error("抓取图片标签失败,编号={},页码={} ",num,i);
				break;
			}
			
			for (int j = 0; j < imgs.size(); j++) {
				Element img = imgs.get(j);
				//img地址 ,地址放的属性不同，真是见鬼了
				String href = img.attr("data-original");
				if(href == null || href.equals("")){
					href = img.attr("src");
				}
				//属性不同....
				String fileName = img.attr("alt");
				if(fileName == null || fileName.equals("")){
					fileName = img.attr("title");
				}
				if(!filterName(fileName) || fileName.equals("")){
					fileName = String.valueOf(new Date().getTime());
				}
				count++;
				//取大图,m--j
				if(href.contains("/m/")){
					String newHref = href.replace("/m/", "/l/");
					try {
						download(title , i + "-" + (j+1) + "-" + fileName, newHref);
						//System.out.println(newHref);
						success ++;
					} catch (Exception e) {
						//System.err.println(newHref);
						fail++;
						LOGGER.error("下载失败,编号={},专辑={},页码={},地址={} ",num,title,i,newHref);
					}
				}else{
					String newHref = href.substring(0, href.length()-5) + "l" + href.substring(href.length()-4);
					try {
						download(title , i + "-" + (j+1) + "-" + fileName, newHref);
						//System.out.println(newHref);
						success ++;
					} catch (Exception e) {
						//System.err.println(newHref);
						fail++;
						LOGGER.error("下载失败,编号={},专辑={},页码={},地址={} ",num,title,i,newHref);
					}
				}
			}
		}
		//System.out.println("[统计]编号： "+num+",抓取: "+count+",成功 : " + success + ",失败： " + fail);
		LOGGER.info("[统计]编号={},专辑={},抓取={},总计={},成功={},失败={}",num,title,ii,count,success,fail);
	}
	//校验文件名合法性
	private boolean filterName(String name){
		String[] filter = {"\\", "/" , ":" , "*" , "?" , "<" , ">" , "|" };
		for (String string : filter) {
			if(name.contains(string)){
				return false;
			}
		}
		return true;
	}
	//下载
	private void download(String dir ,String fileName, String href) throws IOException {
		URL url = new URL(href);
		URLConnection conn = url.openConnection();
		InputStream inStream = conn.getInputStream();
		byte[] buffer = new byte[inStream.available()];
		int length;

		File savefile = new File("F:/picture/" + dir);
		if (!savefile.exists()) {
			// 建立分离目录
			savefile.mkdirs();
		}
		FileOutputStream fos = new FileOutputStream("F:/picture/" + dir + "/"
				+ fileName + ".jpg");
		while ((length = inStream.read(buffer)) != -1) {
			fos.write(buffer, 0, length);
		}
		fos.close();
		inStream.close();
	}
}

//用法：
public static void main(String[] args) throws Exception {
                //http://www.topit.me/album/12598
                //专辑编号
		TopitMe me = new TopitMe();
		me.getAlbum("12598");
}

1. 网络爬虫（专门抓取图片）
2. QT 读取mp3ID3V2 获取mp3专辑图片、专辑名称、标题、作者（一）
3. scrapy爬虫，爬取图片
4. python爬虫（爬取图片）
5. Python爬虫：爬取图片
6. 爬取图片（二）
7. 爬取图片（四）
8. python爬取图片
9. java爬取图片
10. 爬取图片（一）
更多相关文章...
• Markdown 图片 - Markdown 教程
• PHP getimagesizefromstring - 获取图片信息函数 - PHP参考手册
• NewSQL-TiDB相关
• 漫谈MySQL的锁机制