先上效果图:
准备工作:
/**
 * Opens an HTTP connection to the given address and returns the response body
 * as a single string (line breaks are stripped by readLine).
 *
 * @param address the URL to fetch
 * @return the response body, or an empty string if the request fails
 */
public static String Connect(String address) {
    // Initialized before the try so a failed connection yields "" at the
    // return instead of the NPE the original code produced.
    StringBuilder sb = new StringBuilder();
    HttpURLConnection conn = null;
    try {
        URL url = new URL(address);
        conn = (HttpURLConnection) url.openConnection();
        conn.setConnectTimeout(5000);
        conn.setReadTimeout(5000);
        conn.setDoInput(true);
        conn.connect();
        // try-with-resources closes both the reader and the underlying stream
        // on every path; UTF-8 is specified explicitly instead of relying on
        // the platform default charset.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Original called conn.disconnect() unconditionally, throwing an NPE
        // whenever the URL was malformed or openConnection() failed.
        if (conn != null) {
            conn.disconnect();
        }
    }
    return sb.toString();
}
/**
 * Appends the given text to the scrape output file on disk, creating the
 * parent directory if it does not exist yet.
 *
 * @param allText the text to append to the output file
 */
private static void writeToFile(String allText) {
    System.out.println("正在写入。。。");
    File targetFile = new File("/Users/shibo/tmp/pengfu.txt");
    File fileDir = targetFile.getParentFile();
    // Guard against a null parent (path with no directory component).
    if (fileDir != null && !fileDir.exists()) {
        fileDir.mkdirs();
    }
    // FileOutputStream creates the file itself, so the original
    // exists()/createNewFile() dance was redundant. try-with-resources
    // guarantees the stream is closed on every path.
    try (BufferedOutputStream bos =
            new BufferedOutputStream(new FileOutputStream(targetFile, true))) {
        // Encode explicitly as UTF-8: the original used getBytes() with the
        // platform default charset, which corrupts the Chinese text on
        // non-UTF-8 systems.
        bos.write(allText.getBytes(StandardCharsets.UTF_8));
    } catch (IOException e) {
        e.printStackTrace();
    }
    System.out.println("写入完毕。。。");
}
引入jsoup的jar包(用于解析DOM):
<dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.11.2</version> </dependency>
开始分析网站:
捧腹网段子
首先找到我们需要的内容(作者、标题和正文)
查看其元素,我这里查看的是标题标签:
知道其结构之后,就可以获取我们想要的内容了:
public static void main(String[] args) { StringBuilder allText = new StringBuilder(); for (int i = 1; i <= 50; i++) { System.out.println("正在爬取第" + i + "页内容。。。"); // 创建链接,获取网页内容 String html = ConnectionUtil.Connect("https://www.pengfu.com/xiaohua_" + i + ".html"); // 将内容转换成dom格式,方便操做 Document doc = Jsoup.parse(html); // 获取网页内全部标题节点 Elements titles = doc.select("h1.dp-b"); for (Element titleEle : titles) { Element parent = titleEle.parent(); // 标题内容 String title = titleEle.getElementsByTag("a").text(); // 标题对应的做者 String author = parent.select("p.user_name_list > a").text(); // 标题对应的正文 String content = parent.select("div.content-img").text(); // 将内容格式化 allText.append(title) .append("\r\n做者:").append(author) .append("\r\n").append(content) .append("\r\n").append("\r\n"); } allText.append("-------------第").append(i).append("页-------------").append("\r\n"); System.out.println("第" + i + "页内容爬取完毕。。。"); } //将内容写入磁盘 Test.writeToFile(allText.toString()); }
参考文章:Python 爬虫入门(一)——爬取糗百