网页中的脚本通常位于 <script> 标签中,例如下面这段 JavaScript:
<script type="text/javascript">
// F.use is given three script paths and a callback that receives one argument
// per path (baidu, searchbox, suggestion).
F.use(["/static/common/ui/tangram/base/base.js","/static/widget/common/searchbox/searchbox.js","/static/common/ui/vs/suggestion/suggestion.js"], function(baidu,searchbox,suggestion){
    // NOTE(review): presumably runs the callback once the DOM is ready — confirm against the library.
    baidu.dom.ready(function(){
        searchbox();
        // Call suggestion() only when cookies are enabled and the cookie string
        // does not match /sug?=0/ (the "g" is optional, so "su=0" matches too).
        if (navigator.cookieEnabled && !/sug?=0/.test(document.cookie)){
            suggestion();
        }
    });
});
</script>
代码(Java,类名为 html):
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Downloads a web page with jsoup and prints the text of its elements,
 * leaving out {@code <script>} elements.
 */
public class html {

    /**
     * Fetches {@code http://news.baidu.com} and prints its non-script text.
     *
     * @param args command-line arguments (not used)
     * @throws IOException if fetching the page fails
     */
    public static void main(String[] args) throws IOException {
        // Fetch and parse the page content.
        Document doc = Jsoup.connect("http://news.baidu.com").get();
        // Print the non-script information contained in the page.
        getTag(doc);
    }

    /**
     * Collects the text of every non-script element in {@code doc} and prints
     * the resulting list to standard output.
     *
     * <p>Only the text owned directly by each element is collected. Using the
     * combined text of an element and its descendants while visiting every
     * element would repeat the same text once per ancestor.
     *
     * @param doc the parsed document to extract text from
     */
    public static void getTag(Document doc) {
        // Walk every element in the document.
        Elements el = doc.select("*");
        List<String> list = new ArrayList<String>();
        for (Element element : el) {
            // Skip script elements. An exact match is used so that other tags
            // whose names merely end in "script" (e.g. noscript) are kept.
            if ("script".equals(element.tagName())) {
                continue;
            }
            String ownText = element.ownText();
            if (!ownText.trim().isEmpty()) {
                list.add(ownText + '\n');
            }
        }
        System.out.println(list);
    }
    // NOTE(review): the closing brace of class html is not visible in this excerpt — confirm it follows.