1. Due to a work assignment, I, someone who had never touched a web crawler before, was temporarily asked to scrape some data. The process went reasonably well: with open-source examples found online plus help from generous group members, I ended up with a crawler whose results are correct. I am recording it here; if there are any hidden problems, criticism and corrections from fellow developers are welcome.
2. Code structure:
As shown in the figure above, the structure is very simple: the project references the jsoup jar, the Rule class contains the actual crawler code, and the Air class is the entity class for the page data to be fetched.
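Before going through the full listings, here is a minimal jsoup sketch for reference. It only assumes that the jsoup jar is on the classpath; the URL and the class name JsoupSmokeTest are placeholders of my own, not part of the project.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupSmokeTest {
    public static void main(String[] args) throws Exception {
        // placeholder URL, only meant to confirm jsoup works in this project
        Document doc = Jsoup.connect("http://example.com/").timeout(10000).get();
        System.out.println("Page title: " + doc.title());
        // each matched element's text can be read with text(), as the Rule class does later
        for (Element e : doc.getElementsByTag("p")) {
            System.out.println(e.text());
        }
    }
}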
3. Code:
Air.java:
package com.zyy.splider.rule;

public class Air {

    String indexNo; // serial number
    String city;    // city
    String date;    // date
    String aqi;     // AQI value
    String level;   // air quality level
    String prev;    // primary pollutant

    public Air(
            String indexNo,
            String city,
            String date,
            String aqi,
            String level,
            String prev) {
        this.indexNo = indexNo;
        this.city = city;
        this.date = date;
        this.aqi = aqi;
        this.level = level;
        this.prev = prev;
    }

    public String getIndexNo() {
        return indexNo;
    }
    public void setIndexNo(String indexNo) {
        this.indexNo = indexNo;
    }
    public String getCity() {
        return city;
    }
    public void setCity(String city) {
        this.city = city;
    }
    public String getDate() {
        return date;
    }
    public void setDate(String date) {
        this.date = date;
    }
    public String getAqi() {
        return aqi;
    }
    public void setAqi(String aqi) {
        this.aqi = aqi;
    }
    public String getLevel() {
        return level;
    }
    public void setLevel(String level) {
        this.level = level;
    }
    public String getPrev() {
        return prev;
    }
    public void setPrev(String prev) {
        this.prev = prev;
    }
}
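One optional improvement, not in the original class: adding a toString() inside Air would make the print loop in Rule shorter. This is only a sketch of what that method might look like.

    // optional addition to Air: a compact string form for logging
    @Override
    public String toString() {
        return indexNo + " | " + city + " | " + date + " | AQI=" + aqi
                + " | " + level + " | " + prev;
    }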
Rule.java:
package com.zyy.splider.rule;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/*
 * Rule class
 * Specifies the url, method, params, etc. and runs the crawl.
 */
public class Rule {

    public static void main(String[] args) throws Exception {
        crawler();
    }

    // Crawl the paginated report and collect one Air record per table row
    public static void crawler() throws Exception {
        List<Air> cityInfo = new ArrayList<Air>();
        int index = 1; // running cell counter; every 6 cells form one row
        int line = 1;  // row counter
        for (int i = 1; i < 14; i++) { // the page size is fixed by the site, so fetch all 13 pages
            System.out.println("==================================================" + i);
            String root_url = "http://datacenter.mep.gov.cn/report/air_daily/air_dairy.jsp?page=" + i;
            // very generous timeout because the site responds slowly
            Document document = Jsoup.connect(root_url).timeout(3000000).get();
            System.out.println("*****************************************");
            Elements links = document.getElementsByClass("report1_5");
            // Elements links = document.select("#report1 .report1_5");
            String indexNo = null; // serial number
            String city = null;    // city
            String date = null;    // date
            String aqi = null;     // AQI value
            String level = null;   // air quality level
            String prev = null;    // primary pollutant
            for (Element link : links) {
                System.out.println(link.text());
                if (index % 6 == 1) {        // serial number
                    indexNo = link.text();
                } else if (index % 6 == 2) { // city
                    city = link.text();
                } else if (index % 6 == 3) { // date
                    date = link.text();
                } else if (index % 6 == 4) { // AQI value
                    aqi = link.text();
                } else if (index % 6 == 5) { // air quality level
                    level = link.text();
                } else if (index % 6 == 0) { // primary pollutant, last cell of the row
                    prev = link.text();
                    // save the completed row into the cityInfo list
                    cityInfo.add(new Air(indexNo, city, date, aqi, level, prev));
                    line++; // one row finished
                }
                index++;
            }
        }
        // print everything collected in cityInfo
        for (int i = 0; i < cityInfo.size(); i++) {
            System.out.println("Record " + i + " --------------------------------->");
            System.out.println("No.: " + cityInfo.get(i).getIndexNo());
            System.out.println("City: " + cityInfo.get(i).getCity());
            System.out.println("Date: " + cityInfo.get(i).getDate());
            System.out.println("AQI: " + cityInfo.get(i).getAqi());
            System.out.println("Air quality level: " + cityInfo.get(i).getLevel());
            System.out.println("Primary pollutant: " + cityInfo.get(i).getPrev());
        }
    }
}
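Since the point was to collect the data, printing to the console is probably not the end goal. Below is a rough sketch of how the cityInfo list could be written to a CSV file instead; the class name, file name, and helper method are my own additions, not part of the original code, so treat it as one possible extension.

package com.zyy.splider.rule;

import java.io.PrintWriter;
import java.util.List;

public class CsvExport {
    // hypothetical helper: dump the collected rows into a CSV file
    public static void writeCsv(List<Air> cityInfo, String fileName) throws Exception {
        try (PrintWriter out = new PrintWriter(fileName, "UTF-8")) {
            out.println("indexNo,city,date,aqi,level,prev"); // header row
            for (Air a : cityInfo) {
                out.println(a.getIndexNo() + "," + a.getCity() + "," + a.getDate()
                        + "," + a.getAqi() + "," + a.getLevel() + "," + a.getPrev());
            }
        }
    }
}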
4. I'm not sure this even counts as a real crawler, but at least all the data I needed is there. The code is very simple, yet running and testing it was genuinely painful: the results are paginated and the page size is fixed by the site, so the only option is to fire 13 consecutive requests, one per page. Probably because it is a government website, it is painfully slow. I'm just recording this here briefly; hopefully this is both the first and the last time I ever scrape a website.
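Since the site is slow and the crawl fires 13 requests back to back, one small change that might help is pausing briefly between pages and retrying when a request fails. This is only a sketch of that idea; the class name, retry count, and delay are arbitrary choices of mine, not something from the original code.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class PoliteFetch {
    // hypothetical helper: fetch one page with a few retries and a pause before each attempt
    public static Document fetchPage(String url) throws Exception {
        int maxRetries = 3;         // arbitrary retry count
        for (int attempt = 1; attempt <= maxRetries; attempt++) {
            try {
                Thread.sleep(2000); // be a little gentler on the slow server
                return Jsoup.connect(url).timeout(60000).get();
            } catch (java.io.IOException e) {
                System.out.println("Attempt " + attempt + " failed for " + url);
                if (attempt == maxRetries) {
                    throw e;
                }
            }
        }
        return null; // unreachable at runtime, but required by the compiler
    }
}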