What this achieves: a Pictures folder is created automatically under the project, and images are crawled from the site URL level by level. Under Pictures, each level of the site's URL hierarchy gets a folder named after that URL, which holds the images found at that level. At the same time, the file name, path, and URL are inserted into the database so they can be indexed later.
Step 1: create the persistence-layer class that stores the file name, path, and URL.
package org.amuxia.demo;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

public class JDBCHelper {
    private static final String driver = "com.mysql.jdbc.Driver";
    private static final String DBurl = "jdbc:mysql://127.0.0.1:3306/edupic";
    private static final String user = "root";
    private static final String password = "root";
    private PreparedStatement pstmt = null;
    private Connection spiderconn = null;

    /**
     * Insert one record (file name, local path, source URL) into the FilePath table.
     */
    public void insertFilePath(String fileName, String filepath, String url) {
        try {
            Class.forName(driver);
            spiderconn = DriverManager.getConnection(DBurl, user, password);
            String sql = "insert into FilePath (filename,filepath,url) values (?,?,?)";
            pstmt = spiderconn.prepareStatement(sql);
            pstmt.setString(1, fileName);
            pstmt.setString(2, filepath);
            pstmt.setString(3, url);
            pstmt.executeUpdate();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            try {
                // Guard against NullPointerException if the connection was never opened
                if (pstmt != null) {
                    pstmt.close();
                }
                if (spiderconn != null) {
                    spiderconn.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
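JDBCHelper assumes a MySQL database named edupic containing a FilePath table, but the post never shows the schema. The DDL below is only a minimal sketch reconstructed from the insert statement; the id column, the column types, and the lengths are assumptions.

create database if not exists edupic default character set utf8;
use edupic;
create table if not exists FilePath (
    id       int primary key auto_increment,  -- surrogate key (assumption; not referenced by the code)
    filename varchar(255),                    -- file/directory name derived from the URL
    filepath varchar(255),                    -- local path where the images are saved
    url      varchar(500)                     -- source URL
);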
Step 2: create the class that parses URLs and does the crawling.
package org.amuxia.demo;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GetWeb {
    private int webDepth = 5;                // crawl depth
    private int intThreadNum = 1;            // number of worker threads
    private String strHomePage = "";         // home page address
    private String myDomain;                 // domain name
    private String fPath = "CSDN";           // directory where crawled pages/images are stored
    private ArrayList<String> arrUrls = new ArrayList<String>();   // URLs not yet processed
    private ArrayList<String> arrUrl = new ArrayList<String>();    // all URLs, kept for building the index
    private Hashtable<String, Integer> allUrls = new Hashtable<String, Integer>();   // page number of every URL
    private Hashtable<String, Integer> deepUrls = new Hashtable<String, Integer>();  // depth of every URL
    private int intWebIndex = 0;             // file index of the page, starting from 0
    private long startTime;
    private int webSuccessed = 0;
    private int webFailed = 0;

    public static void main(String[] args) {
        GetWeb gw = new GetWeb("http://www.csdn.net/");
        gw.getWebByHomePage();
    }

    public GetWeb(String s) {
        this.strHomePage = s;
    }

    public GetWeb(String s, int i) {
        this.strHomePage = s;
        this.webDepth = i;
    }

    public synchronized void addWebSuccessed() {
        webSuccessed++;
    }

    public synchronized void addWebFailed() {
        webFailed++;
    }

    public synchronized String getAUrl() {
        String tmpAUrl = arrUrls.get(0);
        arrUrls.remove(0);
        return tmpAUrl;
    }

    public synchronized String getUrl() {
        String tmpUrl = arrUrl.get(0);
        arrUrl.remove(0);
        return tmpUrl;
    }

    public synchronized Integer getIntWebIndex() {
        intWebIndex++;
        return intWebIndex;
    }

    /**
     * Starting from the home page supplied by the user, crawl every linked page.
     */
    public void getWebByHomePage() {
        startTime = System.currentTimeMillis();
        this.myDomain = getDomain();
        if (myDomain == null) {
            System.out.println("Wrong input!");
            return;
        }
        System.out.println("Homepage = " + strHomePage);
        System.out.println("Domain = " + myDomain);
        arrUrls.add(strHomePage);
        arrUrl.add(strHomePage);
        allUrls.put(strHomePage, 0);
        deepUrls.put(strHomePage, 1);
        File fDir = new File(fPath);
        if (!fDir.exists()) {
            fDir.mkdir();
        }
        System.out.println("Starting...");
        String tmp = getAUrl();                         // take a new URL
        this.getWebByUrl(tmp, allUrls.get(tmp) + "");   // crawl the page behind that URL
        int i = 0;
        for (i = 0; i < intThreadNum; i++) {
            new Thread(new Processer(this)).start();
        }
        while (true) {
            if (arrUrls.isEmpty() && Thread.activeCount() == 1) {
                long finishTime = System.currentTimeMillis();
                long costTime = finishTime - startTime;
                System.out.println("\n\n\n\n\nDone");
                System.out.println("Start time = " + startTime + "  finish time = " + finishTime
                        + "  total crawl time = " + costTime + "ms");
                System.out.println("Total URLs crawled = " + (webSuccessed + webFailed)
                        + "  succeeded: " + webSuccessed + "  failed: " + webFailed);
                String strIndex = "";
                String tmpUrl = "";
                while (!arrUrl.isEmpty()) {
                    tmpUrl = getUrl();
                    strIndex += "Web depth:" + deepUrls.get(tmpUrl) + " Filepath: " + fPath
                            + "/web" + allUrls.get(tmpUrl) + ".htm" + " url:" + tmpUrl + "\n\n";
                }
                System.out.println(strIndex);
                try {
                    PrintWriter pwIndex = new PrintWriter(new FileOutputStream("fileindex.txt"));
                    pwIndex.println(strIndex);
                    pwIndex.close();
                } catch (Exception e) {
                    System.out.println("Failed to generate the index file!");
                }
                break;
            }
        }
    }

    /**
     * Crawl one of the URLs found during parsing.
     *
     * @param strUrl
     * @param fileIndex
     */
    public void getWebByUrl(String strUrl, String fileIndex) {
        try {
            System.out.println("Fetching site via URL: " + strUrl);
            URL url = new URL(strUrl);
            URLConnection conn = url.openConnection();
            conn.setDoOutput(true);
            InputStream is = null;
            is = url.openStream();

            // Build a directory name from the URL, replacing characters that are illegal in file names.
            // replace() (literal) is used instead of replaceAll() so that * ? | are not treated as regex metacharacters.
            String filename = strUrl.replaceAll("/", "_");
            filename = filename.replace(":", ".");
            filename = filename.replace("*", ".");
            filename = filename.replace("?", ".");
            filename = filename.replace("\"", ".");
            filename = filename.replace(">", ".");
            filename = filename.replace("<", ".");
            filename = filename.replace("|", ".");

            String filePath = fPath + "\\" + filename;
            File file = new File(filePath);
            if (!file.exists()) {
                file.mkdir();
            }

            // Record the file name, path and URL in the database
            JDBCHelper helper = new JDBCHelper();
            helper.insertFilePath(filename, filePath, strUrl);

            // Download the images referenced on this page
            GetPicture getp = new GetPicture();
            getp.get(strUrl, filePath);

            BufferedReader bReader = new BufferedReader(new InputStreamReader(is));
            StringBuffer sb = new StringBuffer();
            String rLine = null;
            String tmp_rLine = null;
            while ((rLine = bReader.readLine()) != null) {
                tmp_rLine = rLine;
                int str_len = tmp_rLine.length();
                if (str_len > 0) {
                    sb.append("\n" + tmp_rLine);
                    if (deepUrls.get(strUrl) < webDepth)
                        getUrlByString(tmp_rLine, strUrl);
                }
                tmp_rLine = null;
            }
            is.close();
            System.out.println("Fetched site successfully: " + strUrl);
            addWebSuccessed();
        } catch (Exception e) {
            System.out.println("Failed to fetch site, please check whether the URL exists: " + strUrl);
            addWebFailed();
        }
    }

    /**
     * Check whether the URL supplied by the user is a domain address.
     *
     * @return
     */
    public String getDomain() {
        String reg = "(?<=http\\://[a-zA-Z0-9]{0,100}[.]{0,1})[^.\\s]*?\\.(com|cn|net|org|biz|info|cc|tv|edu)";
        Pattern p = Pattern.compile(reg, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(strHomePage);
        boolean blnp = m.find();
        if (blnp == true) {
            return m.group(0);
        }
        return null;
    }

    /**
     * Parse a newly fetched page and extract the links it contains.
     *
     * @param inputArgs
     * @param strUrl
     */
    public void getUrlByString(String inputArgs, String strUrl) {
        String tmpStr = inputArgs;
        String regUrl = "(?<=(href=)[\"]?[\']?)[http://][^\\s\"\'\\?]*(" + myDomain + ")[^\\s\"\'>]*";
        Pattern p = Pattern.compile(regUrl, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(tmpStr);
        boolean blnp = m.find();
        while (blnp == true) {
            if (!allUrls.containsKey(m.group(0))) {
                System.out.println("Find a new url,depth:" + (deepUrls.get(strUrl) + 1) + " " + m.group(0));
                arrUrls.add(m.group(0));
                arrUrl.add(m.group(0));
                allUrls.put(m.group(0), getIntWebIndex());
                deepUrls.put(m.group(0), (deepUrls.get(strUrl) + 1));
            }
            tmpStr = tmpStr.substring(m.end(), tmpStr.length());
            m = p.matcher(tmpStr);
            blnp = m.find();
        }
    }

    /**
     * @author amuxia
     * An independent crawl thread.
     */
    class Processer implements Runnable {
        GetWeb gw;

        public Processer(GetWeb g) {
            this.gw = g;
        }

        public void run() {
            while (!arrUrls.isEmpty()) {
                String tmp = getAUrl();
                getWebByUrl(tmp, allUrls.get(tmp) + "");
            }
        }
    }
}
As shown in main() above, this is where you add the URL of the site to crawl.
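The two-argument constructor GetWeb(String s, int i) also lets you limit the crawl depth (the field defaults to 5). Below is a minimal sketch of launching the crawler from a separate class in the same package; the class name CrawlerLauncher, the URL, and the depth value are just illustrative.

package org.amuxia.demo;

public class CrawlerLauncher {
    public static void main(String[] args) {
        // Crawl http://www.csdn.net/ but only follow links up to depth 2 (example values)
        GetWeb gw = new GetWeb("http://www.csdn.net/", 2);
        gw.getWebByHomePage();
    }
}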
private String fPath = "CSDN"; 这里定义你爬取图片存放的位置,这里直接放在工程下的CSDN文件夹下,随意放,本身找获得就OK。sql
Step 3: grab and download the images.
package org.amuxia.demo;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GetPicture {

    public void getHtmlPicture(String httpUrl, String filePath) {
        URL url;
        BufferedInputStream in;
        FileOutputStream file;
        try {
            System.out.println("Downloading image");
            // Derive the image file name from the URL
            String fileName = httpUrl.substring(httpUrl.lastIndexOf("/")).replace("/", "");
            // Build the URL object
            url = new URL(httpUrl);
            // Open a buffered byte stream on the URL
            in = new BufferedInputStream(url.openStream());
            file = new FileOutputStream(new File(filePath + "\\" + fileName));
            int t;
            while ((t = in.read()) != -1) {
                file.write(t);
            }
            file.close();
            in.close();
            System.out.println("Image downloaded successfully");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public String getHtmlCode(String httpUrl) throws IOException {
        String content = "";
        URL url = new URL(httpUrl);
        BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()));
        String input;
        // Read the page line by line
        while ((input = reader.readLine()) != null) {
            // Append each line to content
            content += input;
        }
        // Close the reader
        reader.close();
        // Return the page source
        return content;
    }

    /**
     * Crawl the images referenced on a page.
     *
     * @param url
     * @throws IOException
     */
    public void get(String url, String filePath) throws IOException {
        // Two regular expressions for image references: relative paths and absolute http:// URLs
        String searchImgReg = "(?x)(src|SRC|background|BACKGROUND)=('|\")/?(([\\w-]+/)*([\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")";
        String searchImgReg2 = "(?x)(src|SRC|background|BACKGROUND)=('|\")(http://([\\w-]+\\.)+[\\w-]+(:[0-9]+)*(/[\\w-]+)*(/[\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")";
        String content = this.getHtmlCode(url);
        Pattern pattern = Pattern.compile(searchImgReg);
        Matcher matcher = pattern.matcher(content);
        while (matcher.find()) {
            System.out.println(matcher.group(3));
            this.getHtmlPicture(url + "/" + matcher.group(3), filePath);
        }
        pattern = Pattern.compile(searchImgReg2);
        matcher = pattern.matcher(content);
        while (matcher.find()) {
            System.out.println(matcher.group(3));
            this.getHtmlPicture(matcher.group(3), filePath);
        }
    }
}
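GetPicture can also be used on its own to pull the images from a single page, without the crawler or the database. A small sketch under these assumptions: the class name SinglePageDemo is made up, the page URL and output directory are placeholders, and the output directory must already exist because getHtmlPicture() only writes files into it.

package org.amuxia.demo;

import java.io.IOException;

public class SinglePageDemo {
    public static void main(String[] args) throws IOException {
        GetPicture gp = new GetPicture();
        // "D:\\pics" must already exist; get() downloads every image it finds on the page into it
        gp.get("http://www.csdn.net/", "D:\\pics");
    }
}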
And that's it!
As you can see, this basically does the job. No extra libraries are needed beyond a MySQL driver jar. Of course, if you don't need to insert records into the database, that has no effect on the image crawling at all; just drop the first class.
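If the project happens to use Maven, the driver class com.mysql.jdbc.Driver referenced in JDBCHelper comes from the old mysql-connector-java artifact. A sketch of the dependency follows; the exact version number is an assumption, any 5.x release provides that driver class.

<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.49</version>
</dependency>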
Also, some sites have anti-crawler measures in place, so crawling them may fail.
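One common reason for such failures is that the site rejects requests that don't look like they come from a browser. A possible mitigation, not part of the original code, is to set a User-Agent header on the connection before reading from it. A minimal sketch, where the class name FetchWithUserAgent and the header value are just examples:

package org.amuxia.demo;

import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;

public class FetchWithUserAgent {
    public static InputStream open(String strUrl) throws Exception {
        URL url = new URL(strUrl);
        URLConnection conn = url.openConnection();
        // Pretend to be an ordinary browser; many simple anti-crawler checks only look at this header
        conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)");
        return conn.getInputStream();
    }
}

Note that for this to take effect in getWebByUrl(), the page would also have to be read from conn.getInputStream() instead of url.openStream(), since the latter opens a fresh connection without the header.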
Note: it is best to talk to the site owner before crawling, and crawling non-public content is an infringement of their rights; this is for testing purposes only.