JavaSE小实践1：Java爬取斗图网站的全部表情包

时间 2019-12-04

标签 javase 实践 java 网站全部表情栏目 Java 繁體版

原文原文链接

跟朋友聊天总会用到大量表情包，有些人甚至专门收集各种各样的表情包，看看谁能战胜谁。今天我就用 Java 爬取了一个斗图网站上的全部表情包，用来充实自己的表情包库。代码逻辑可能并不完美，哈哈，也花了我几个小时才完成呢。
下载完全部图片，总共有 225 MB。
思路：主要通过解析页面的源代码来获取图片的 URL 地址，然后通过图片地址把图片下载到本地，因此要学会使用浏览器分析页面结构。

所用jar包：jsoup-1.8.1.jar
网站首页：https://doutushe.com/portal/index/index/p/1
浏览器：Chrome

1，获取网页源代码

/**
     * Downloads the resource at the given URL and returns its text content.
     *
     * <p>Lines are concatenated without their line terminators, so the result is a
     * single-line string. I/O failures (including a malformed URL or an unsupported
     * encoding name) are reported with {@code printStackTrace()}; whatever was read
     * before the failure is returned. The result is never {@code null}.
     *
     * @author Augustu
     * @param url page address
     * @param encoding charset name used to decode the response, e.g. {@code "utf-8"}
     * @return page source with line breaks removed; empty if nothing could be read
     */
    public static String getUrlResource(String url,String encoding) {
        // StringBuilder avoids the quadratic copying of String += inside the read loop.
        StringBuilder htmlResource = new StringBuilder();
        try {
            // 1. Resolve the address and open a connection to it.
            URLConnection urlConnection = new URL(url).openConnection();
            // 2. try-with-resources closes the reader (and the stream it wraps) even
            //    when reading fails; previously the reader was never closed.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(urlConnection.getInputStream(), encoding))) {
                // 3. Read line by line; readLine() strips the line terminator.
                String line;
                while ((line = reader.readLine()) != null) {
                    htmlResource.append(line);
                }
            }
        } catch (IOException e) {
            // Best effort: report the problem and return what has been read so far.
            e.printStackTrace();
        }
        return htmlResource.toString();
    }

2，获取页面中所有图片组的URL地址

/**
     * Collects the detail-page URLs of every picture group linked from a listing page.
     *
     * @author Augustu
     * @param context HTML source of one listing page (the markup itself, not its URL)
     * @return absolute group URLs, each followed by a single space so the caller can
     *         split on {@code " "}; empty if the page contains no group links
     */
    public static String findPictureUrl(String context) {
        // href of the previously accepted link, used to drop immediate repeats
        String previousHref = "";
        StringBuilder pictureUrl = new StringBuilder();
        // 1. Parse the raw source so the elements can be queried through jsoup.
        Document document = Jsoup.parse(context);
        // 2. Each picture group links to its own page through an <a class="link-2">.
        Elements groupUrl = document.getElementsByClass("link-2");
        // 3. Walk the anchors and collect their href values.
        for (Element ele : groupUrl) {
            String href = ele.attr("href");
            // The same link shows up twice in a row in the markup. Compare by content:
            // the former '==' compared object identity, which does not reliably detect
            // equal strings, so the repeat was not dependably skipped.
            if (href.equals(previousHref)) {
                continue;
            }
            previousHref = href;
            // 4. Space-separated so the caller can split the result. A bare "|" did not
            //    work as a separator because String.split takes a regex, where "|"
            //    means alternation and has to be escaped.
            pictureUrl.append("https://doutushe.com").append(href).append(" ");
        }
        return pictureUrl.toString();
    }

3，下载单张图片

/**
     * Downloads a single picture and stores it as {@code fileName} inside
     * {@code filePath}.
     *
     * <p>The target directory, including any missing parent directories, is created
     * on demand. I/O failures are reported with {@code printStackTrace()} and the
     * method returns normally, so one failed download does not abort a batch.
     *
     * @param picturl  absolute address of the picture
     * @param filePath directory the picture is saved into
     * @param fileName name of the file to create inside {@code filePath}
     */
    public static void downPicture(String picturl,String filePath,String fileName) {
        File dir = new File(filePath);
        try {
            // 1. Fetch the resource. ignoreContentType(true) is required because jsoup
            //    only accepts text content types by default and this is an image.
            Connection.Response response = Jsoup.connect(picturl).ignoreContentType(true).execute();
            // 2. Take the raw response body.
            byte[] img = response.bodyAsBytes();
            // 3. Make sure the directory exists. mkdirs() also creates missing parents;
            //    mkdir() failed whenever the parent of the target did not exist yet.
            if (!dir.isDirectory() && !dir.mkdirs()) {
                throw new IOException("Could not create directory: " + dir);
            }
            // File(parent, child) joins with the platform separator instead of a
            // hard-coded backslash.
            File file = new File(dir, fileName);
            // 4. Write the bytes; try-with-resources flushes and closes both streams.
            try (BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file))) {
                bos.write(img);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

4，下载全部图片

/**
     * Downloads every picture of every group listed in {@code pictureUrl}.
     *
     * <p>Each group page is fetched and parsed. The text of the first child of its
     * first {@code <blockquote>} is used as the sub-directory name, and every element
     * with class {@code lazy} contributes one picture, taken from its
     * {@code data-original} attribute and named after its {@code title} attribute.
     * Groups whose page cannot be loaded or has no title block are skipped.
     *
     * @author Augustu
     * @param pictureUrl   space-separated URLs of the group pages
     * @param downLoadPath root directory; each group gets its own sub-directory in it
     */
    public static void downallPicture(String pictureUrl,String downLoadPath) {
        String[] pictureUrlArry = pictureUrl.split(" ");
        for (String groupUrl : pictureUrlArry) {
            // "".split(" ") yields [""], so an empty list would otherwise be treated
            // as one (invalid) URL.
            if (groupUrl.isEmpty()) {
                continue;
            }
            String pictureHtml = getUrlResource(groupUrl,"utf-8");
            Document document = Jsoup.parse(pictureHtml);
            // The group's category name sits in the first child of the first
            // <blockquote>. A failed download gives an empty document, where first()
            // returns null, so check before dereferencing.
            Element titleBlock = document.getElementsByTag("blockquote").first();
            if (titleBlock == null || titleBlock.children().isEmpty()) {
                System.err.println("Skipping group without a title block: " + groupUrl);
                continue;
            }
            String dir = titleBlock.child(0).text();
            // Every picture of the group is carried by an element with class "lazy".
            Elements elements = document.getElementsByClass("lazy");
            for (Element ele : elements) {
                String picturl = ele.attr("data-original");
                // Elements without a data-original have nothing to download, and an
                // empty URL would make Jsoup.connect throw an unchecked exception.
                if (picturl.isEmpty()) {
                    continue;
                }
                // The page also lists this site asset under the same class (the file
                // name suggests a QR-code image); it is not part of the group.
                if (picturl.equals("/themes/doutushe/Public/assets/images/doutushe-erweima.jpg")) {
                    continue;
                }
                // File name is the picture's title plus a fixed ".gif" extension.
                String pictureName = ele.attr("title")+".gif";
                downPicture(picturl,downLoadPath+"\\"+dir,pictureName);
            }
        }
    }

5，主函数运行代码

public static void main(String[] args) {
        // The listing was observed to span 28 pages, numbered from 1.
        int page = 1;
        while (page <= 28) {
            // Fetch the source of the current listing page.
            String listingUrl = "https://doutushe.com/portal/index/index/p/" + page;
            String listingHtml = getUrlResource(listingUrl, "utf-8");
            // Extract the group URLs from that page and download every group.
            String groupUrls = findPictureUrl(listingHtml);
            downallPicture(groupUrls, "E:\\image\\表情包");
            page++;
        }
    }