import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URL; import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * 功能:爬取某姐的小视频 * @author cxd * */ public class WebSpiderDemo1 { public static void main(String[] args) throws Exception { String source = "http://www.budejie.com/video/"; String destDir = "D:/rob/"; Map<String, String> urlMap = getUrlInSource(source); for (Map.Entry<String, String> entry : urlMap.entrySet()) { String title = entry.getKey();// 视频名称 String url = entry.getValue();// 视频url File destFile = new File(destDir + title + ".mp4"); download(url, destFile); } } /** * 经过视频的URL下载该视频并存入本地 * * @param url 视频的URL * @param destFile 视频存入的位置 * @throws IOException */ public static void download(String url, File destFile) throws IOException { URL videoUrl = new URL(url); InputStream is = videoUrl.openStream(); FileOutputStream fos = new FileOutputStream(destFile); int len = 0; byte[] buffer = new byte[1024]; while ((-1) != (len = is.read(buffer))) { fos.write(buffer, 0, len); } fos.flush(); if (null != fos) { fos.close(); } if (null != is) { is.close(); } } /** * 获取视频的URL地址和视频名称存入hashMap * * @param source * @return * @throws IOException */ public static Map<String, String> getUrlInSource(String source) throws IOException { Map<String, String> hashMap = new HashMap<>(); for (int index = 1; index <= 1; index++) { // 页数最大为50,本身玩嘛,就只爬取了一页。 String pageUrl = source + index; URL url = new URL(pageUrl); InputStream is = url.openStream(); // 若遇到反爬机制则使用该方法将程序假装为浏览器进行访问 // HttpURLConnection conn = (HttpURLConnection) url.openConnection(); // conn.setRequestMethod("GET"); // conn.setRequestProperty("user-agent", // "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"); // BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8")); BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8")); String info = null; String title = null; // 此处不要用==null进行判断,由于网页中有不少行都是null,不然会报java.lang.NullPointerException。 for (int i = 0; i < 10000; i++) { info = br.readLine(); if (null != info) { String urlRegex = "href=\"(.*?\\.mp4)"; if (info.contains("data-text")) { title = info; } Pattern pattern = Pattern.compile(urlRegex); Matcher matcher = pattern.matcher(info); if (matcher.find()) { for (int j = 0; j <= matcher.groupCount(); j++) { String tmp = matcher.group(j); if (!tmp.startsWith("href=")) { String videoTitle = getTitle(title.trim()); hashMap.put(videoTitle, tmp); } } } } } } return hashMap; } /** * 清洗整理titile字符串, * * @param info * @return */ private static String getTitle(String info) { int len = info.length(); String title = info.substring(11, len - 2); return title; } }