本人在使用 HttpClient 的过程中,忽然想到可以爬取一些数据,比如全国的中学名单。当然这并非空穴来风,之前也做过这方面的爬虫,不过是基于 Selenium 做的 UI 脚本,效率非常慢,而且很不稳定,所以这次采取了接口的形式,果然效率提升了几个档次。一共 6 万多条数据,用了 16 分钟左右,其中包括数据库的存储。现在分享代码供大家参考。关键信息已隐去,大家看一下思路即可。
package practise; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.http.client.methods.HttpGet; import net.sf.json.JSONObject; import source.ApiLibrary; import source.Concurrent; public class Crawler extends ApiLibrary { public static String host = ""; public static Map<String, Integer> countrys = new HashMap<>(); public static Map<String, Integer> citys = new HashMap<>(); public static Map<String, Integer> address = new HashMap<>(); public static Map<String, Integer> school = new HashMap<>(); public static List<String> total = new ArrayList<>(); public static void main(String[] args) { Crawler crawler = new Crawler(); crawler.getCountry1();// 省份 Set<String> countryId = countrys.keySet(); for (String name : countryId) { int id = countrys.get(name); crawler.getCountry2(id);// 市 Set<String> cityId = citys.keySet(); for (String city : cityId) { int cid = citys.get(city); crawler.getCountry3(cid);// 县 Set<String> adresss = address.keySet(); for (String adres : adresss) { int aid = address.get(adres); crawler.getCountry4(aid);// 名 Set<String> schol = school.keySet(); for (String sch : schol) { String line = name + PART + city + PART + adres + PART + sch; total.add(line); } } } } Concurrent.saveRequestTimes(total); testOver(); } /** * 查询省份 */ public void getCountry1() { String url = host + "/user/editinfo/getSchollCountryList"; HttpGet httpGet = getHttpGet(url); // httpGet.addHeader("Cookie", cookies); // httpGet.addHeader("User-Agent", userangent); JSONObject response = getHttpResponseEntityByJson(httpGet); String[] country = response.getString("content").split("</a>"); int size = country.length; for (int i = 0; i < size; i++) { String msg = country[i]; int code = getCode(msg); String name = getName(msg); countrys.put(name, code); } } /** * 查询市 * * @param id */ public void getCountry2(int id) { String url = host + 
"/user/editinfo/getSchollCityList?region_id=" + id; HttpGet httpGet = getHttpGet(url); JSONObject response = getHttpResponseEntityByJson(httpGet); String[] ssString = response.getString("content").split("</a>"); int size = ssString.length; citys.clear(); for (int i = 0; i < size; i++) { String msg = ssString[i]; int code = getCode(msg); String name = getName(msg); citys.put(name, code); } } /** * 查询县 * * @param id */ public void getCountry3(int id) { String url = host + "/user/editinfo/getSchollAddressList?region_id=" + id; HttpGet httpGet = getHttpGet(url); JSONObject response = getHttpResponseEntityByJson(httpGet); String[] ssString = response.getString("content").split("</a>"); int size = ssString.length; address.clear(); for (int i = 0; i < size; i++) { String msg = ssString[i]; int code = getCode(msg); String name = getName(msg); address.put(name, code); } } /** * 查询学校 * * @param id */ public void getCountry4(int id) { String url = host + "/user/editinfo/getSchoolNameList?region_id=" + id; HttpGet httpGet = getHttpGet(url); JSONObject response = getHttpResponseEntityByJson(httpGet); String[] ssString = response.getString("content").split("</a>"); int size = ssString.length; school.clear(); for (int i = 0; i < size; i++) { String msg = ssString[i]; int code = getCode(msg); String name = getName(msg); school.put(name, code); } } /** * 获取 code * * @param text * @return */ public int getCode(String text) { int code = 0; Pattern pattern = Pattern.compile("\"\\d+\""); Matcher matcher = pattern.matcher(text); if (matcher.find()) { code = changeStringToInt(matcher.group(0).replace("\"", "")); } return code; } /** * 获取名称 * * @param text * @return */ public String getName(String text) { String name = text.substring(text.lastIndexOf(">") + 1, text.length()); return name; } }
下面是爬取到的数据截图: