[烈格黑街][福利]第一个java爬虫程序

哨得哨得


第一次在掘金发博客, 感觉爽爽的, 简书和掘金之间我还是选择掘金了, 因为掘金才是开发者的平台, 简书大部分还是作者吧!(个人观点, 贤者勿喷)

进入正题


简单介绍:

本次带来的是一个用java写的爬虫, 爬取吾爱破解网(大家都懂的, 不是什么不正经的网站哈, 不过也是福利)最新更新的资源, 毕竟此网站一直不定时更新牛×哄哄的资源, 这个爬虫就是专门爬取最新分享的资源的 (什么XX软件啊, 某马教程视频啊....)

意图 (缘由):

  1. 本人刚刚接触java(有半年了吧), 工作用到了html解析, 感觉离爬虫不远了, 就想涉足一下
  2. 本人资源收藏爱好者, 吾爱XX给了我海量资源, 但是由于大部分资源都是百度云链接, 而且深知百毒云有些敏感资源过期太快了, 所以想弄一个爬虫, 自动爬取, 自动保存(这一步下次更新完成吧)
  3. 毕竟谁也不会没事就运行一下这个java程序, 后期会放入服务器开通接口, 再用自己的微信小程序调用(PS:有木有懂前端(喜欢开发UI)的来指导指导我啊!), 这样只要在微信就可以直接看到最新的资源了, (不只是资源哦, 还有链接, 回复, 链接状态等等)

用到的知识点

  • java基础
  • jsoup 解析html第三方jar
  • okhttp 浏览器请求第三方jar
  • 正则表达式(正则表达式 + Excel + NotePad++ + 列编辑模式几乎解决全部字符串批处理问题, 下次演示)

代码(两个类):

  1. GetInfo.java
package test;

import com.mtl.pojo.Item;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.ibatis.io.Resources;
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import org.junit.Test;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;

/**
 * Crawls the "newest threads" guide pages of the 52pojie forum and stores every thread that was
 * not seen in the previous run.
 *
 * <p>Each listing page is fetched with OkHttp, parsed by {@link ParseHtml} and the resulting
 * {@link Item}s are inserted through the MyBatis statement
 * {@code com.mtl.mapper.ItemMapper.insertItems}. Paging stops early as soon as the parser reports
 * that it reached the title remembered in {@link #lastTopic}.
 *
 * <p>The marker is kept in a static field, so it only survives as long as this class stays loaded.
 */
public class GetInfo {

    /** Listing URL; the 1-based page number is appended to it. */
    private static final String NEW_THREAD_URL =
            "https://www.52pojie.cn/forum.php?mod=guide&view=newthread&page=";

    /** Upper bound on the number of listing pages fetched per run. */
    private static final int MAX_PAGES = 8;

    /** Title of the newest thread seen by the previous run; parsing stops when it is reached. */
    private static String lastTopic;

    /** Title of the newest thread seen by the current run; copied into {@link #lastTopic} at the end. */
    private static String thisTopic;

    /**
     * Fetches up to {@value #MAX_PAGES} listing pages, stores the new items of each page and
     * finally advances the {@link #lastTopic} marker.
     *
     * <p>Pages answered with a non-successful HTTP status are skipped. Any exception aborts the
     * run, is printed to stderr and leaves the marker unchanged.
     */
    @Test
    public void getInfo() {
        try {
            OkHttpClient client = new OkHttpClient();

            for (int page = 1; page <= MAX_PAGES; page++) {
                Request request = new Request.Builder()
                        .url(NEW_THREAD_URL + page)
                        .build();

                // Read the body and release the connection before the (slow) per-thread parsing
                // starts; an unclosed Response leaks its connection.
                String html;
                try (Response response = client.newCall(request).execute()) {
                    if (!response.isSuccessful()) {
                        continue;
                    }
                    html = response.body().string();
                }
                // TODO: inspect response.headers() here to pick up the cookies needed for login.

                ParseHtml parseHtml = new ParseHtml();
                List<Item> items = parseHtml.getCurrentPageItems(html, lastTopic);

                testInsert(items);

                // The list is empty when the very first row is already the remembered thread,
                // so it must be checked before the first element is read.
                if (page == 1 && !items.isEmpty()) {
                    thisTopic = items.get(0).getTitle();
                }
                if (parseHtml.isFind()) {
                    break;
                }
            }

            // Never overwrite a valid marker with "nothing seen yet".
            if (thisTopic != null) {
                lastTopic = thisTopic;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Inserts the given items in one batch using the configuration in {@code mybatis.xml}.
     *
     * <p>A {@code null} or empty list is ignored. The mapper SQL is not visible from this class,
     * so an empty batch is skipped rather than sent to the database.
     *
     * @param items items to persist; may be {@code null} or empty
     */
    public void testInsert(List<Item> items) {
        if (items == null || items.isEmpty()) {
            return;
        }
        try (InputStream config = Resources.getResourceAsStream("mybatis.xml")) {
            SqlSessionFactory factory = new SqlSessionFactoryBuilder().build(config);
            // try-with-resources closes the session even when insert/commit throws.
            try (SqlSession sqlSession = factory.openSession()) {
                int insert = sqlSession.insert("com.mtl.mapper.ItemMapper.insertItems", items);
                System.out.println("insert = " + insert);
                sqlSession.commit();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
复制代码
  2. ParseHtml.java 用来解析html字符串的工具类吧(不过并没有设置静态方法, 为了以后spring管理哈哈)
package test;

import com.mtl.pojo.Item;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Turns the HTML of a 52pojie "newest threads" listing page into {@link Item} objects and, for
 * threads without a read-permission marker, fetches the thread page to extract Baidu pan links
 * and their passwords.
 *
 * <p>Instances carry the {@link #isFind()} flag of the most recent
 * {@link #getCurrentPageItems(String, String)} call.
 */
public class ParseHtml {

    /** Prefix prepended to the relative links found in the listing. */
    private static final String SITE_ROOT = "https://www.52pojie.cn/";

    /** Group 1 is the share link, group 3 the four-character password that follows it. */
    private static final Pattern BAIDU_LINK = Pattern.compile("[^\"](https://pan.baidu.com/s/[\\w\\-0-9_]+[a-zA-Z_0-9])((?!https).)+密码: ?([a-zA-Z0-9]{4})[^a-zA-Z0-9]");

    /** Message shown by the forum when a thread needs a higher read permission. */
    private static final Pattern READ_PERMISSION = Pattern.compile("抱歉,本帖要求阅读权限高于 \\d+ 才能浏览");

    /** One shared client instead of a new one (with its own pools) per thread request. */
    private static final OkHttpClient HTTP_CLIENT = new OkHttpClient();

    /** Whether the last parse reached the stop title; this is the caller's end condition. */
    private boolean isFind = false;

    /**
     * Collects the items of one listing page, in page order, up to (excluding) the stop title.
     *
     * <p>Rows without a title link are skipped and a page without the {@code div#forumnew}
     * marker or its following table yields an empty list. Fields whose element is missing
     * are left unset.
     *
     * @param html      HTML text of the listing page
     * @param lastTitle title at which parsing stops; {@code null} means "never stop"
     * @return the items found before the stop title, possibly empty
     * @throws IOException if fetching a thread page through OkHttp fails
     */
    public List<Item> getCurrentPageItems(String html, String lastTitle) throws IOException {
        // Reset so that a reused instance does not report the result of an earlier page.
        isFind = false;
        List<Item> items = new ArrayList<>();

        Document document = Jsoup.parse(html);
        Element marker = document.body().selectFirst("div#forumnew");
        // The thread table is the element right after the marker div.
        Element table = marker == null ? null : marker.nextElementSibling();
        if (table == null) {
            return items;
        }

        for (Element row : table.select("tbody")) {
            Element titleLink = row.selectFirst("a.xst");
            if (titleLink == null) {
                continue;
            }
            String title = titleLink.html();
            if (title.equals(lastTitle)) {
                isFind = true;
                break;
            }

            Item item = new Item();
            item.setTitle(title);
            fillFromCells(item, row.select("td"));
            parseLink(item);
            items.add(item);
        }
        return items;
    }

    /** Copies the per-column values of one listing row into the item; the column index decides the field. */
    private void fillFromCells(Item item, Elements cells) {
        for (int i = 0; i < cells.size(); i++) {
            Element cell = cells.get(i);
            switch (i) {
                case 0: {
                    String href = attrOf(cell, "a", "href");
                    if (href != null) {
                        item.setUrl(SITE_ROOT + href);
                    }
                    String level = htmlOf(cell, "span");
                    if (level != null) {
                        item.setAuthorityLevel(level);
                    }
                    break;
                }
                case 1:
                    item.setPartition(htmlOf(cell, "a"));
                    break;
                case 2:
                    item.setAuther(htmlOf(cell, "a"));
                    item.setPublishTime(htmlOf(cell, "span"));
                    break;
                case 3:
                    item.setReplyNum(htmlOf(cell, "a"));
                    item.setViewNum(htmlOf(cell, "em"));
                    break;
                case 4: {
                    item.setLastReplyName(htmlOf(cell, "a"));
                    item.setLastReplyTime(htmlOf(cell.selectFirst("em"), "a"));
                    String replyHref = attrOf(cell, "a", "href");
                    if (replyHref != null) {
                        item.setLastReplyUrl(SITE_ROOT + replyHref);
                    }
                    break;
                }
                default:
                    break;
            }
        }
    }

    /** Returns the inner HTML of the first match below {@code parent}, or {@code null} if there is none. */
    private static String htmlOf(Element parent, String cssQuery) {
        Element found = parent == null ? null : parent.selectFirst(cssQuery);
        return found == null ? null : found.html();
    }

    /** Returns an attribute of the first match below {@code parent}, or {@code null} if nothing matches. */
    private static String attrOf(Element parent, String cssQuery, String attribute) {
        Element found = parent == null ? null : parent.selectFirst(cssQuery);
        return found == null ? null : found.attr(attribute);
    }

    /**
     * Fetches the thread page of an item and stores the Baidu pan links and passwords found in it.
     *
     * <p>Nothing is fetched when the item carries a read-permission marker or has no URL.
     * Each distinct link is recorded once, in order of first appearance.
     *
     * @param item item to enrich
     * @throws IOException if the request fails
     */
    private void parseLink(Item item) throws IOException {
        if (item.getAuthorityLevel() != null || item.getUrl() == null) {
            return;
        }
        Request request = new Request.Builder()
                .url(item.getUrl())
                .build();

        String page;
        // Closing the response releases the connection back to the pool.
        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                return;
            }
            page = response.body().string();
        }

        // Exact-match bookkeeping: a substring search in the joined text would wrongly drop a
        // link that happens to be a prefix of one already recorded.
        List<String> seen = new ArrayList<>();
        StringBuilder links = new StringBuilder();
        StringBuilder pwds = new StringBuilder();
        Matcher matcher = BAIDU_LINK.matcher(page);
        while (matcher.find()) {
            String link = matcher.group(1);
            if (!seen.contains(link)) {
                seen.add(link);
                links.append(link).append(";");
                pwds.append(matcher.group(3)).append(";");
            }
        }
        if (links.length() > 0) {
            item.setLinksAndPwdsStr(links + "#;#" + pwds);
        }
    }

    /**
     * Manual probe: prints the body of a thread that needs read permission, as preparation for a
     * later automatic login.
     *
     * @throws IOException if the request fails
     */
    @Test
    public void testLink() throws IOException {
        Request request = new Request.Builder()
                .url("https://www.52pojie.cn/thread-719615-1-1.html")
                .build();
        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                return;
            }
            String page = response.body().string();
            System.out.println(page);
            if (READ_PERMISSION.matcher(page).find()) {
                System.out.println("须要权限");
            } else {
                Matcher matcher = BAIDU_LINK.matcher(page);
                while (matcher.find()) {
                    System.out.println("match = " + matcher.group(1) + "--" + matcher.group(3));
                }
            }
        }
    }

    /**
     * @return {@code true} if the most recent parse stopped at the requested title
     */
    public boolean isFind() {
        return isFind;
    }

    /**
     * @param find new value of the "stop title reached" flag
     */
    public void setFind(boolean find) {
        isFind = find;
    }
}
复制代码
  3. Item.java 实体类
package com.mtl.pojo;

/**
 * One forum thread as shown in the listing, plus the download links extracted from it.
 *
 * <p>Links and passwords are kept twice: as arrays for programmatic use and as a single string
 * ({@code link1;link2;#;#pwd1;pwd2;}). Setting the string through
 * {@link #setLinksAndPwdsStr(String)} rebuilds both arrays.
 */
public class Item {

    /** Separates the link part from the password part in the serialized form. */
    private static final String PART_SEPARATOR = "#;#";

    /** Separates individual entries inside one part. */
    private static final String ENTRY_SEPARATOR = ";";

    private String title;            // thread title
    private String url;              // thread URL
    private String[] links;          // Baidu pan links; null until setLinksAndPwdsStr is called
    private String[] pwds;           // passwords for the links; null until setLinksAndPwdsStr is called
    private String linksAndPwdsStr;  // serialized links and passwords: "l1;l2;#;#p1;p2;"
    private String publishTime;      // publish time
    private String authorityLevel;   // read-permission marker
    private String partition;        // forum section of the thread
    private String auther;           // thread author
    private String replyNum;         // number of replies
    private String viewNum;          // number of views
    private String lastReplyName;    // account of the last reply
    private String lastReplyTime;    // time of the last reply
    private String lastReplyUrl;     // URL of the last reply
    private String firstPageReply;   // collected replies of the first page
    private boolean isNeedReply;     // whether a reply is required before the links become visible
    private int searchLinkTimes;     // number of link searches so far, for a later retry limit

    public String getLinksAndPwdsStr() {
        return linksAndPwdsStr;
    }

    /**
     * Stores the serialized form and rebuilds {@link #getLinks()} and {@link #getPwds()} from it.
     *
     * <p>{@code null} or an empty string yields two empty arrays. A string without the
     * {@code #;#} separator, or with nothing after it, yields an empty password array instead of
     * failing with an {@link ArrayIndexOutOfBoundsException}.
     *
     * @param linksAndPwdsStr serialized links and passwords, may be {@code null}
     */
    public void setLinksAndPwdsStr(String linksAndPwdsStr) {
        if (linksAndPwdsStr == null || linksAndPwdsStr.equals("")) {
            links = new String[]{};
            pwds = new String[]{};
        } else {
            // Limit -1 keeps trailing empty parts, so parts[0] always exists.
            String[] parts = linksAndPwdsStr.split(PART_SEPARATOR, -1);
            links = splitEntries(parts[0]);
            pwds = splitEntries(parts.length > 1 ? parts[1] : "");
        }
        this.linksAndPwdsStr = linksAndPwdsStr;
    }

    /** Splits one part into its entries; an empty part has no entries. */
    private static String[] splitEntries(String part) {
        return part.isEmpty() ? new String[]{} : part.split(ENTRY_SEPARATOR);
    }

    public String[] getLinks() {
        return links;
    }

    public String[] getPwds() {
        return pwds;
    }

    public String getFirstPageReply() {
        return firstPageReply;
    }

    public void setFirstPageReply(String firstPageReply) {
        this.firstPageReply = firstPageReply;
    }

    public boolean isNeedReply() {
        return isNeedReply;
    }

    public void setNeedReply(boolean needReply) {
        isNeedReply = needReply;
    }

    public int getSearchLinkTimes() {
        return searchLinkTimes;
    }

    public void setSearchLinkTimes(int searchLinkTimes) {
        this.searchLinkTimes = searchLinkTimes;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getPublishTime() {
        return publishTime;
    }

    public void setPublishTime(String publishTime) {
        this.publishTime = publishTime;
    }

    public String getAuthorityLevel() {
        return authorityLevel;
    }

    public void setAuthorityLevel(String authorityLevel) {
        this.authorityLevel = authorityLevel;
    }

    public String getPartition() {
        return partition;
    }

    public void setPartition(String partition) {
        this.partition = partition;
    }

    public String getAuther() {
        return auther;
    }

    public void setAuther(String auther) {
        this.auther = auther;
    }

    public String getReplyNum() {
        return replyNum;
    }

    public void setReplyNum(String replyNum) {
        this.replyNum = replyNum;
    }

    public String getViewNum() {
        return viewNum;
    }

    public void setViewNum(String viewNum) {
        this.viewNum = viewNum;
    }

    public String getLastReplyName() {
        return lastReplyName;
    }

    public void setLastReplyName(String lastReplyName) {
        this.lastReplyName = lastReplyName;
    }

    public String getLastReplyTime() {
        return lastReplyTime;
    }

    public void setLastReplyTime(String lastReplyTime) {
        this.lastReplyTime = lastReplyTime;
    }

    public String getLastReplyUrl() {
        return lastReplyUrl;
    }

    public void setLastReplyUrl(String lastReplyUrl) {
        this.lastReplyUrl = lastReplyUrl;
    }
}
复制代码

由于数据库存储数组很麻烦, 所以我想了一个折中的办法, 在实体类上动了手脚, 有兴趣的小伙伴可以看一下

后续打算

  1. 完成ssm项目,配置好服务, 测试接口
  2. 完成微信小程序UI, 使用服务器接口
  3. 上线微信小程序
  4. 完成自动登录, 防止阅读权限无法获取问题
  5. 自动判断百度盘是否失效, 自动去除该item
  6. 自动回复, 以获取需要回复才可以查看隐藏链接的帖子

先列这么多, 如果有兴趣的小伙伴, 希望给我多提提意见哈, 毕竟转行刚刚入门, 还需努力

相关文章
相关标签/搜索