常规网页展现部分通常分为列表页和详情页。Tumblr站是请求后台api返回json,例如整站搜索接口分页请求,参数加密暂时无解;但是对应blog的列表,可采用不需加密的接口,因此仍然有办法对相应blog列表做爬虫处理。
Tumblr站后端json返回格式比较混乱,有的json内含有html字符串,格式化时要注意容错,坑很多。
还有一个简单的去重处理:每条列表信息均做入库处理,但下载时会用视频封面做一次md5比较去重,重复则不下载当前视频,并且将重复视频数据也进行入库。(下期会细说不同来源视频下载的区别,以及汇总处理。)
本人的springboot采用多线程定时器,分别定时跑爬虫数据和下载爬虫资源,之后会在这里慢慢列出解决方法。
SpiderTumblrService负责数据的入库处理;SslDownloader负责webmagic的https请求处理。
<dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-core</artifactId> <!--<version>0.7.2</version>--> <version>0.5.2</version> <exclusions> <exclusion> <groupId>org.slf4j</groupId> <artifactId>slf4j-log4j12</artifactId> </exclusion> </exclusions> </dependency>
package win.raychow.modules.spider.base.processor;

import com.alibaba.fastjson.JSON;
import org.json.XML;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import win.raychow.core.base.dao.CacheKey;
import win.raychow.core.base.service.HtmlTool;
import win.raychow.demo.spider.tool.SslDownloader;
import win.raychow.modules.spider.base.dao.SpiderTumblr;
import win.raychow.modules.spider.base.domain.TumblrRecModel;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * WebMagic {@link PageProcessor} that crawls per-blog Tumblr video lists via the
 * unauthenticated {@code <blog>.tumblr.com/api/read} XML endpoint, converts each
 * page to {@link SpiderTumblr} records, and hands them to {@link TumblrPipeLine}.
 *
 * Created by ray on 2017/11/19.
 */
@Service
public class TumblrProcessor implements PageProcessor {

    private final Logger logger = LoggerFactory.getLogger(this.getClass());

    @Autowired
    TumblrPipeLine pipeLine;

    /** Comma-separated (CacheKey.Split) blog-name prefixes, injected from configuration. */
    @Value("${spider.tumblr.prefixSexList}")
    private String prefixSexList;

    @Value("${spider.tumblr.prefixAnimalList}")
    private String prefixAnimalList;

    /** Shared URL fragment of the per-blog video-list API; each page holds 20 posts. */
    public final static String bashUrl = ".tumblr.com/api/read?type=video&num=20&start=";

    /** Marker appended to the first-page URL so process() knows when to enqueue the rest. */
    private static final String FIRST_PAGE_MARKER = "1&fffff=0";

    /**
     * Resolves the configured category for a blog prefix (or any URL containing it).
     *
     * @param url blog prefix or full request URL
     * @return the matching category constant, or {@code Category_Null} if no list matches
     */
    private String getCategory(String url) {
        for (String id : prefixSexList.split(CacheKey.Split)) {
            if (url.contains(id)) {
                return SpiderTumblr.Category_AV;
            }
        }
        for (String id : prefixAnimalList.split(CacheKey.Split)) {
            if (url.contains(id)) {
                return SpiderTumblr.Category_Animal;
            }
        }
        return SpiderTumblr.Category_Null;
    }

    @Override
    public Site getSite() {
        //HttpHost httpHost = new HttpHost("127.0.0.1", 1087);
        return Site.me()
                //.setHttpProxy(httpHost)
                .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")
                .setSleepTime(30 * 1000)
                .setTimeOut(20 * 1000)
                .setRetryTimes(3)
                .setCycleRetryTimes(3);
    }

    /**
     * Parses one API page: converts the XML body to JSON, enqueues the remaining
     * pages when this is page one, and maps each post to a SpiderTumblr entity.
     */
    @Override
    public void process(Page page) {
        String pageUrl = page.getUrl().toString();
        logger.info(pageUrl);
        if (!pageUrl.contains(bashUrl)) {
            return;
        }
        try {
            // The endpoint returns XML; normalize it to JSON before binding the model.
            String xml = page.getJson().toString();
            String json = XML.toJSONObject(xml).toString();
            TumblrRecModel rec = JSON.parseObject(json, TumblrRecModel.class);
            List<TumblrRecModel.Post> posts = rec.getTumblr().getPosts().getPost();
            TumblrRecModel.Tumblelog tumblelog = rec.getTumblr().getTumblelog();

            // On the first page, fan out requests for every remaining page of 20 posts.
            if (pageUrl.contains(FIRST_PAGE_MARKER)) {
                List<String> requestUrls = new ArrayList<>();
                long total = Long.parseLong(rec.getTumblr().getPosts().getTotal());
                long pageMax = total / 20 + 1;
                for (int j = 1; j < pageMax; j++) {
                    requestUrls.add(pageUrl.replace(FIRST_PAGE_MARKER, String.valueOf(20 * j)));
                }
                page.addTargetRequests(requestUrls);
            }

            if (posts.isEmpty()) {
                return;
            }

            List<SpiderTumblr> list = new ArrayList<>(posts.size());
            for (TumblrRecModel.Post post : posts) {
                try {
                    list.add(parsePost(post, pageUrl, tumblelog));
                } catch (Exception e) {
                    // One malformed post must not abort the whole page; keep the cause.
                    logger.error("xml to data error, post url: " + post.getUrl(), e);
                }
            }

            if (!list.isEmpty()) {
                page.putField("type", 0); // 0 = list payload, consumed by TumblrPipeLine
                page.putField("data", list);
            }
        } catch (Exception e) {
            logger.error("url:" + pageUrl, e);
        }
    }

    /**
     * Maps a single API post to a SpiderTumblr entity by scraping the embedded
     * video-player HTML for id, poster, hd-options and source attributes.
     *
     * @throws Exception if any expected attribute is missing from the player HTML
     */
    private SpiderTumblr parsePost(TumblrRecModel.Post post, String pageUrl,
                                   TumblrRecModel.Tumblelog tumblelog) throws Exception {
        // Normalize quotes so the attribute matcher works on a single quote style.
        String str = post.getVideoPlayer().get(0).replace("\"", "'");

        String poster = HtmlTool.match(str, "video", "poster").get(0);
        String optionsJson = HtmlTool.match(str, "video", "data-crt-options").get(0);
        TumblrRecModel.Options optionsRec = JSON.parseObject(optionsJson, TumblrRecModel.Options.class);
        String file = HtmlTool.match(str, "source", "src").get(0);

        // Container type, derived from the <source> mime type.
        String type = "";
        String lower = str.toLowerCase();
        if (lower.contains("video/mp4")) {
            type = "mp4";
        } else if (lower.contains("video/ogg")) {
            type = "ogg";
        } else if (lower.contains("video/webm")) {
            type = "webm";
        }

        // Category was appended to the request URL as &ggggg=<category> by run().
        String category = pageUrl.split("&ggggg=")[1].toLowerCase();

        // Prefer the HD rendition when the API exposes one.
        if (optionsRec.getHdUrl().length() > 10) {
            file = optionsRec.getHdUrl();
        }

        String videoCaption = HtmlTool.removeHtmlTag(post.getVideoCaption());
        // Post id is the last path segment of the post URL.
        String videoId = "tumblr_" + post.getUrl().substring(post.getUrl().lastIndexOf("/") + 1);

        SpiderTumblr tumblr = new SpiderTumblr();
        tumblr.setVideoId(videoId);
        tumblr.setPosterImage(poster);
        tumblr.setVideoImage(optionsRec.getFilmstrip().getUrl());
        tumblr.setVideoUrl(file);
        tumblr.setVideoType(type);
        tumblr.setTitle(videoCaption);
        tumblr.setBaseUrl(post.getUrl());
        tumblr.setCategory(category);
        tumblr.setBlogTitle(tumblelog.getTitle());
        return tumblr;
    }

    /**
     * Seeds the spider with the first-page URL of every configured blog and runs it.
     * Each seed URL carries the page-one marker plus its category as {@code &ggggg=}.
     */
    public void run() {
        // Use this Spring-managed bean as the processor: "new TumblrProcessor()"
        // would bypass @Value/@Autowired injection and run with null config fields.
        Spider spider = Spider.create(this)
                .setDownloader(new SslDownloader()) // https-capable downloader
                //.addPipeline(new ConsolePipeline())// debug: print to console
                .addPipeline(pipeLine);

        //animal
        for (String prefix : prefixAnimalList.split(CacheKey.Split)) {
            spider.addUrl("https://" + prefix + bashUrl + FIRST_PAGE_MARKER
                    + "&ggggg=" + this.getCategory(prefix));
        }
        //sex
        for (String prefix : prefixSexList.split(CacheKey.Split)) {
            spider.addUrl("https://" + prefix + bashUrl + FIRST_PAGE_MARKER
                    + "&ggggg=" + this.getCategory(prefix));
        }
        spider.run();
    }
}
package win.raychow.modules.spider.base.processor;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import win.raychow.modules.spider.base.dao.SpiderTumblr;
import win.raychow.modules.spider.base.dao.SpiderTumblrService;

import java.util.List;

/**
 * Pipeline that persists crawl results emitted by {@link TumblrProcessor}.
 * Expects a result field "type" (0 = list payload) and, for type 0, a field
 * "data" holding a {@code List<SpiderTumblr>} to upsert via the service.
 *
 * Created by ray on 2017/11/19.
 */
@Service
public class TumblrPipeLine implements Pipeline {

    @Autowired
    SpiderTumblrService service;

    @Override
    public void process(ResultItems resultItems, Task task) {
        if (resultItems.getAll().isEmpty()) {
            return;
        }
        // Guard against a missing "type" field: direct unboxing into int would NPE.
        Integer type = resultItems.get("type");
        if (type == null) {
            return;
        }
        if (type == 0) { // list payload from TumblrProcessor
            List<SpiderTumblr> list = resultItems.get("data");
            for (SpiderTumblr tumblr : list) {
                try {
                    // Blog name is the first host label, e.g. "foo" in foo.tumblr.com/post/...
                    String blogName = tumblr.getBaseUrl()
                            .replace("https://", "")
                            .replace("http://", "")
                            .split("\\.")[0];
                    tumblr.setBlogName(blogName);
                } catch (Exception ignored) {
                    // Best-effort: a malformed base URL must not block persistence.
                }
                service.updateBySpider(tumblr);
            }
        } else if (type == 1) {
            // Reserved for a future payload type; intentionally a no-op for now.
        }
    }
}
原文:http://raychow.linkfun.top/2017/12/15/archives/9_javaSpring/spriderTumblr/index/api