网络爬虫1

时间 2019-12-17

原文链接

网络爬虫（web crawler，又称网页蜘蛛、网络机器人、网页追逐者），是一种按照一定的规则，自动地抓取万维网信息的程序。

最简单的网络爬虫：读取页面中全部的邮箱。

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal web crawler: downloads a single page and prints every e-mail
 * address found in it, one per line, to standard output.
 */
public class WebCrawler {

    /** Address of the page that is fetched and scanned. */
    private static final String PAGE_URL = "https://www.meizu.com/contact.html";

    /**
     * Loose e-mail pattern: word characters, an {@code @}, then a domain made
     * of word characters with at least one dot-separated suffix. Compiled once
     * because it is reused for every line of the page.
     */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Fetches {@link #PAGE_URL} and prints each e-mail address matched by
     * {@link #MAIL_PATTERN}.
     *
     * @param args ignored
     * @throws IOException if the connection cannot be opened or the page cannot be read
     */
    public static void main(String[] args) throws IOException {
        URLConnection conn = new URL(PAGE_URL).openConnection();

        // try-with-resources closes the reader (and the underlying stream)
        // even when reading fails part-way through.
        // NOTE(review): decoding assumes the page is UTF-8; the pattern only
        // matches ASCII-range text, so other ASCII-compatible encodings should
        // still yield the same addresses - confirm if the target page changes.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // A single line may contain several addresses, so keep
                // scanning until the matcher is exhausted.
                Matcher matcher = MAIL_PATTERN.matcher(line);
                while (matcher.find()) {
                    System.out.println(matcher.group());
                }
            }
        }
    }
}