java爬虫-初识

时间 2021-01-15

标签 html java node spring mongodb apache 浏览器缓存网络 app 栏目 Java 繁體版

原文原文链接

想找一些图片做桌面背景，但是又不想一张张去下载，后来就想到了爬虫……

对于爬虫我也没具体用过，在网上一顿搜索后写了个小demo。

爬虫的具体思路就是：

1.调用url爬取网页信息

2.解析网页信息

3.保存数据

刚开始还用正则去匹配，获取img标签中的src地址，但是发现有很多不便（主要是我正则不太会），后来发现了jsoup这个神器。jsoup 是一款 Java 的 HTML 解析器，可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API，可通过DOM、CSS以及类似于jQuery的操作方法来取出和操作数据。

以下就以爬取图片为例：

import com.crawler.domain.PictureInfo; import org.bson.types.ObjectId; import org.springframework.data.mongodb.core.MongoTemplate; import org.springframework.data.mongodb.gridfs.GridFsTemplate; import org.springframework.stereotype.Service; import org.apache.commons.io.FileUtils; import org.apache.http.HttpEntity; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.springframework.util.DigestUtils; import org.springframework.util.StringUtils; import javax.annotation.Resource; import java.io.*; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /**
 * 爬虫实现
 *@program: crawler * @description * @author: wl * @create: 2021-01-12 17:56 **/ @Service public class CrawlerService { /** * @param url 要抓取的网页地址 * @param encoding 要抓取网页编码 * @return
     */
    public String getHtmlResourceByUrl(String url, String encoding) { URL urlObj = null; HttpURLConnection uc = null; InputStreamReader isr = null; BufferedReader reader = null; StringBuffer buffer = new StringBuffer(); // 创建网络链接
        try { urlObj = new URL(url); // 打开网络链接
            uc =(HttpURLConnection) urlObj.openConnection(); 　　　　　　　// 模拟浏览器请求
            uc.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)"); // 创建文件输入流
            isr = new InputStreamReader(uc.getInputStream(), encoding); // 创建缓存导入 将网页源代码下载下来
            reader = new BufferedReader(isr); // 临时
            String temp = null; while ((temp = reader.readLine()) != null) {// System.out.println(temp+"\n");
                buffer.append(temp + "\n"); } System.out.println("爬取结束:"+buffer.toString()); } catch (Exception e) { e.printStackTrace(); } finally { // 关流
            if (isr != null) { try { isr.close(); } catch (IOException e) { e.printStackTrace(); } } } return buffer.toString(); } /** * 下载图片 * * @param listImgSrc */
    public void Download(List<PictureInfo> listImgSrc) { int count = 0; try { for (int i = 0; i < listImgSrc.size(); i++) { try { PictureInfo pictureInfo = listImgSrc.get(i); String url=pictureInfo.getSrc(); String imageName = url.substring(url.lastIndexOf("/") + 1, url.length()); URL uri = new URL(url); // 打开链接
                    URLConnection con = uri.openConnection(); //设置请求超时为
                    con.setConnectTimeout(5 * 1000); con.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)"); // 输入流
                    InputStream is = con.getInputStream(); // 1K的数据缓冲
                    byte[] bs = new byte[1024]; // 读取到的数据长度
                    int len; // 输出的文件流
                    String src = url.substring(URL.length()); int index = src.lastIndexOf('/'); String fileName = src.substring(0, index + 1); File sf = new File(SAVE_PATH + fileName); if (!sf.exists()) { sf.mkdirs(); } OutputStream os = new FileOutputStream(sf.getPath() + "\\" + imageName); System.out.println(++count + ".开始下载:" + url); // 开始读取
                    while ((len = is.read(bs)) != -1) { os.write(bs, 0, len); } // 完毕，关闭全部连接
 os.close(); is.close(); System.out.println(imageName + ":--下载完成");  } catch (IOException e) { System.out.println("下载错误"+e); } } } catch (Exception e) { e.printStackTrace(); System.out.println("下载失败"+e); } } /** * 获得网页中图片的地址-推荐 * 使用jsoup * @param htmlStr html字符串 * @return List<String> */
    public List<PictureInfo> getImgStrJsoup(String htmlStr) { List<PictureInfo> pics = new ArrayList<PictureInfo>(); //获取网页的document树
        Document imgDoc = Jsoup.parse(htmlStr); //获取全部的img
        Elements alts = imgDoc.select("img[src]"); for (Element alt : alts) { PictureInfo p=new PictureInfo(); p.setSrc(alt.attr("src")); p.setAlt(alt.attr("alt")); p.setTitle(alt.attr("title")); pics.add(p); } return pics; }  }

主要方法就这些，只要爬取下来的网页信息包含img标签，就能扒下其对应的图片。

这只是一部分啊……

不过提醒各位一句，爬下来自己用用就好了（我做动态桌面壁纸），不要用于商业用途；如果要商业化，可以去爬那些无版权的网站。