常规网页的展现部分通常分为列表页和详情页。bilibili 站是先请求后台 API 返回 JSON,而后使用 JS 生成网页,所以主要的解析方法是 JSON 解析处理。
B 站后端返回的 JSON 格式相对 N 站更结构化、更清晰,但要注意:字段能用字符串类型的尽量用字符串类型,兼容性更好。
本人的 Spring Boot 项目采用多线程定时器,分别定时跑爬虫数据和下载爬虫资源,之后会在这里慢慢列出解决方法。
B 站的评论和列表可以通过 API 请求得到,但详情页的真实地址都经过加密,不能直接获取,可以借助第三方服务 http://flvurl.cn/ 获取真实数据。另外,B 站的图片和视频均有简单的防盗链:图片请求需要将 Referer 置空或删除,这里还有 H5 的小技巧,之后慢慢说;视频请求需要带上 Referer: http://www.bilibili.com/
SpiderBiliService 负责数据的入库处理,SslDownloader 负责 webmagic 的 HTTPS 请求处理。
<!-- webmagic crawler core. Version 0.5.2 is the active one; the 0.7.2 line is left disabled. -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <!--<version>0.7.2</version>-->
    <version>0.5.2</version>
    <exclusions>
        <!-- Drop the slf4j-log4j12 binding that comes in through webmagic-core
             (presumably to avoid clashing with the application's own logging binding; confirm). -->
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>
package win.raychow.modules.spider.base.processor;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import win.raychow.demo.spider.tool.SslDownloader;
import win.raychow.modules.spider.base.dao.SpiderBili;
import win.raychow.modules.spider.base.domain.BilibiliReplyModel;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Created by ray on 2017/7/2.
 *
 * <p>webmagic page processor for bilibili. It handles two kinds of JSON API responses:
 * <ul>
 *   <li>archive list pages: every archive is parsed into a {@link SpiderBili} and a follow-up
 *       request for that archive's replies is queued;</li>
 *   <li>reply pages: the raw JSON text is handed to the pipeline together with the archive id.</li>
 * </ul>
 * Results are published through page fields: {@code type} (0 = list, 1 = reply), {@code data},
 * and, for reply pages, {@code aid}.
 */
@Service
public class BilibiliProcessor implements PageProcessor {

    // e.g. https://api.bilibili.com/archive_rank/getarchiverankbypartion?type=jsonp&tid=20&pn=1
    // The page number is appended to this prefix.
    private final String bashUrl = "https://api.bilibili.com/archive_rank/getarchiverankbypartion?type=jsonp&tid=20&pn=";

    // e.g. https://www.bilibili.com/video/av11778873/
    private final String detailUrl = "https://www.bilibili.com/video/av";

    // e.g. https://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn=1&type=1&oid=11253064
    // The archive id is appended to this prefix.
    private final String replyUrl = "https://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn=1&type=1&oid=";

    private final Logger logger = LoggerFactory.getLogger(this.getClass());

    /** Exclusive upper bound of the list page numbers seeded by {@link #run()}. */
    @Value("${spider.bilibili.maxSize}")
    int maxSize;

    @Autowired
    BilibiliPipeLine pipeLine;

    private final Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")
            .setSleepTime(10 * 1000)
            .setTimeOut(20 * 1000)
            .setRetryTimes(3)
            .setCycleRetryTimes(3);

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Dispatches a downloaded page to the matching handler based on its URL prefix.
     * Any failure while handling the page is logged with its stack trace and the page is
     * dropped; the exception never propagates to the crawler.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String pageUrl = page.getUrl().toString();
        logger.info(pageUrl);
        try {
            if (pageUrl.contains(bashUrl)) {
                processListPage(page, pageUrl);
            } else if (pageUrl.contains(replyUrl)) {
                processReplyPage(page, pageUrl);
            } else if (pageUrl.contains(detailUrl)) {
                // Detail pages are recognised but not parsed yet.
            }
        } catch (Exception e) {
            // Keep the cause: without it a parsing failure cannot be diagnosed from the log.
            logger.error("url:" + pageUrl, e);
        }
    }

    /** Parses an archive list response and queues one reply request per archive. */
    private void processListPage(Page page, String pageUrl) {
        JSONObject root = JSON.parseObject(page.getJson().toString());
        JSONObject data = root == null ? null : root.getJSONObject("data");
        JSONObject archives = data == null ? null : data.getJSONObject("archives");
        if (archives == null) {
            logger.warn("list response without data.archives, url:{}", pageUrl);
            return;
        }

        List<SpiderBili> list = new ArrayList<>();
        List<String> reqList = new ArrayList<>();
        for (Map.Entry<String, Object> entry : archives.entrySet()) {
            Object value = entry.getValue();
            if (value == null) {
                continue;
            }
            SpiderBili archive = JSON.parseObject(value.toString(), SpiderBili.class);
            if (archive == null) {
                continue;
            }
            list.add(archive);
            reqList.add(replyUrl + archive.getAid());
        }

        if (!list.isEmpty()) {
            page.putField("type", 0);
            page.putField("data", list);
            page.addTargetRequests(reqList);
        }
    }

    /** Validates a reply response and publishes its raw JSON text with the archive id. */
    private void processReplyPage(Page page, String pageUrl) {
        String text = page.getJson().toString();
        if (text.length() <= 10) {
            return;
        }
        // Parsed only as a sanity check: if the payload cannot be bound to the reply model
        // the exception reaches process(), is logged there, and nothing is published.
        JSON.parseObject(text, BilibiliReplyModel.class);

        // The archive id is whatever follows the reply URL prefix.
        String aid = pageUrl.replace(replyUrl, "");
        page.putField("type", 1);
        page.putField("data", text);
        page.putField("aid", aid);
    }

    /**
     * Builds a spider that uses this processor, the HTTPS-capable downloader and the injected
     * pipeline, seeds it with list pages {@code 1 .. maxSize - 1} (plus page 32, which is always
     * seeded) and runs it on the calling thread. Failures are logged, not rethrown.
     */
    public void run() {
        // Reuse this Spring-managed instance instead of constructing an un-injected copy.
        Spider spider = Spider.create(this)
                .setDownloader(new SslDownloader())
                .addPipeline(pipeLine);

        spider.addUrl("https://api.bilibili.com/archive_rank/getarchiverankbypartion?type=jsonp&tid=20&pn=32");
        for (int i = 1; i < maxSize; i++) {
            spider.addUrl(bashUrl + i);
        }

        try {
            spider.run();
        } catch (Exception e) {
            logger.error("bilibili spider run failed", e);
        }
    }
}
package win.raychow.modules.spider.base.processor;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import win.raychow.modules.spider.base.dao.SpiderBili;
import win.raychow.modules.spider.base.dao.SpiderBiliService;

import java.util.List;

/**
 * Created by ray on 2017/6/18.
 *
 * <p>webmagic pipeline that persists crawl results. It reads the {@code type} field of each
 * result: {@code 0} carries a list of {@link SpiderBili} archives under {@code data};
 * {@code 1} carries the reply JSON text under {@code data} and the archive id under {@code aid}.
 */
@Service
public class BilibiliPipeLine implements Pipeline {

    private static final Logger logger = LoggerFactory.getLogger(BilibiliPipeLine.class);

    private static final int TYPE_LIST = 0;
    private static final int TYPE_REPLY = 1;

    @Autowired
    SpiderBiliService service;

    /**
     * Stores one crawl result. Results with no fields, no {@code type}, or an unknown
     * {@code type} are ignored.
     *
     * @param resultItems fields published by the page processor
     * @param task        the crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        if (resultItems.getAll().isEmpty()) {
            return;
        }

        // Boxed on purpose: a missing "type" must not blow up with an unboxing NPE.
        Integer type = resultItems.get("type");
        if (type == null) {
            logger.warn("result without a type field, skipped");
            return;
        }

        if (type == TYPE_LIST) {
            saveArchives(resultItems);
        } else if (type == TYPE_REPLY) {
            saveReplies(resultItems);
        }
    }

    /** Upserts every archive found on a list page. */
    private void saveArchives(ResultItems resultItems) {
        List<SpiderBili> list = resultItems.get("data");
        if (list == null) {
            return;
        }
        for (SpiderBili obj : list) {
            service.updateBySpider(obj);
        }
    }

    /** Attaches the reply JSON to the already-stored archive with the same aid, if any. */
    private void saveReplies(ResultItems resultItems) {
        String aidText = resultItems.get("aid");
        int aid;
        try {
            // The aid is cut out of the request URL, so it is not guaranteed to be numeric.
            aid = Integer.parseInt(aidText);
        } catch (NumberFormatException e) {
            logger.warn("reply result with a non-numeric aid '{}', skipped", aidText);
            return;
        }

        // Round-trip through fastjson so the stored text is fastjson's own serialisation.
        String raw = resultItems.get("data");
        JSONObject jsonObject = JSON.parseObject(raw);
        String json = JSON.toJSONString(jsonObject);

        // Only update archives that were stored before; replies never create a record.
        List<SpiderBili> list = service.findByAid(aid);
        if (list != null && !list.isEmpty()) {
            SpiderBili bili = list.get(0);
            bili.setReplyJson(json);
            service.update(bili);
        }
    }
}
原文:http://raychow.linkfun.top/2017/12/08/archives/9_javaSpring/spriderBiliBili/index/springboot