jsoup 解析html 页面数据

时间 2019-11-10

标签 jsoup 解析 html 页面数据栏目 Java 繁體版

原文原文链接

我的 html 页面元素：

/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[1]/td[2]/font
/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td[2]/div/font/span
/html/body/table[2]/tbody/tr[3]/td/font/b
/html/body/table[2]/tbody/tr[5]/td/div/table/tbody/tr[1]/td[1]/div/b/font/span

如下是代码实现：

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Small demo that downloads an HTML page, extracts its text with jsoup and
 * writes that text to {@code test.txt} inside the directory named by
 * {@link #txtpathstr}.
 */
public class JsoupParseHtml {

	/** Charset assumed for an HTTP response body whose Content-Type declares none. */
	private static final String FALLBACK_CHARSET = StandardCharsets.UTF_8.name();

	/** Maximum number of fetch attempts made by {@link #jsoupconnect(String, int)}. */
	private static final int MAX_ATTEMPTS = 5;

	/** Directory the output file is written to (created on demand by {@link #writefile(String)}). */
	static String txtpathstr="d:\\one\\";

	/**
	 * Fetches a URL with Apache HttpClient and returns the response body as text.
	 *
	 * @param url the address to request with HTTP GET
	 * @return the response body, or {@code null} when the request throws, the
	 *         status code is not 200, or the response carries no entity
	 */
	public static String getHtmlByUrl(String url){
		String html = null;
		HttpClient httpClient = new DefaultHttpClient();
		HttpGet httpget = new HttpGet(url);
		try {
			HttpResponse response = httpClient.execute(httpget);
			int statusCode = response.getStatusLine().getStatusCode();
			// Only a 200 response is treated as success; anything else yields null.
			if (statusCode == HttpStatus.SC_OK) {
				HttpEntity entity = response.getEntity();
				if (entity != null) {
					// The charset declared by the server wins; UTF-8 is used only when none
					// is declared (the one-argument overload would fall back to ISO-8859-1
					// and garble non-Latin pages).
					html = EntityUtils.toString(entity, FALLBACK_CHARSET);
				}
			}
		} catch (Exception e) {
			System.out.println("访问【"+url+"】出现异常!");
			e.printStackTrace();
		} finally {
			// Releases the connections held by this one-shot client.
			httpClient.getConnectionManager().shutdown();
		}
		return html;
	}

	/**
	 * Entry point: extracts the text of a fixed local test page and writes it to disk.
	 *
	 * @param args unused
	 * @throws Exception if the page cannot be fetched or the file cannot be written
	 */
	public static void main(String[] args) throws Exception {
		String urlbase="http://localhost:8080/1.htm";
		String contents = gettxtlist(urlbase);
		writefile(contents);
	}

	/**
	 * Downloads the page at {@code txturl} and returns the text of its
	 * {@code html} element(s), each followed by {@code "\r\n"}. The accumulated
	 * text is also echoed to standard output after each element.
	 *
	 * @param txturl the page to fetch
	 * @return the extracted text
	 * @throws IOException if the page could not be fetched after all retries
	 * @throws Exception declared for backward compatibility with existing callers
	 */
	public static String gettxtlist(String txturl) throws Exception{
		Document doc = jsoupconnect(txturl, 360000);
		if (doc == null) {
			// jsoupconnect signals exhausted retries with null; fail with a clear
			// message instead of a NullPointerException on the select below.
			throw new IOException("Could not fetch " + txturl + " after " + MAX_ATTEMPTS + " attempts");
		}

		StringBuilder content = new StringBuilder();
		Elements els = doc.select("html");
		for (Element el : els) {
			// Skip any element that contains more than one <body>.
			if (el.select("body").size() > 1) {
				continue;
			}
			content.append(el.text()).append("\r\n");
			System.out.println();
			System.out.println(content);
		}
		return content.toString();
	}

	/**
	 * Fetches and parses a page with jsoup, retrying up to {@link #MAX_ATTEMPTS}
	 * times. Each failure's stack trace is printed and the next attempt starts
	 * immediately.
	 *
	 * @param url     the page to fetch
	 * @param timeout the value handed to jsoup's {@code timeout(int)}
	 * @return the parsed document, or {@code null} if every attempt failed
	 */
	public static Document jsoupconnect (String url,int timeout){
		Document doc = null;
		for (int attempt = 0; doc == null && attempt < MAX_ATTEMPTS; attempt++) {
			try {
				doc = Jsoup.connect(url)
						.userAgent("Mozilla/5.0 (Windows NT 6.1; rv:5.0)")
						.timeout(timeout)
						.get();
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
		return doc;
	}

	/**
	 * Writes {@code txtstr}, encoded as UTF-8, to {@code test.txt} inside
	 * {@link #txtpathstr}, creating the directory first if necessary. An existing
	 * file is overwritten.
	 *
	 * @param txtstr the text to write
	 * @throws IOException if the directory cannot be created or the file cannot be written
	 * @throws Exception declared for backward compatibility with existing callers
	 */
	public static void writefile(String txtstr)throws Exception{
		File txtpath = new File(txtpathstr);
		// Re-check isDirectory() so a directory that already exists is not reported as a failure.
		if (!txtpath.mkdirs() && !txtpath.isDirectory()) {
			throw new IOException("Could not create output directory " + txtpath);
		}

		File htxt = new File(txtpathstr+"test.txt");
		// try-with-resources flushes and closes the stream even when write() throws.
		try (BufferedOutputStream outBuff = new BufferedOutputStream(new FileOutputStream(htxt))) {
			outBuff.write(txtstr.getBytes(StandardCharsets.UTF_8));
		}
	}

}

存在问题：只能一次性读取出来，不能按照要求、按照 table 分开，下一版本会解决这个问题。