上个月写了一篇《我的大前端之旅》,里面介绍了一下我对大前端时代到来的一点观点。简单来说,我更喜欢把自己的未来规划成一专多能的工程师,毕竟技多不压身,在深入研究本职领域的前提下多涉猎一下其他的领域,对自己的成长总是有益处的。
先归纳一下本文的主要内容:
先说结论(房产类网站可通用):
简单抽取一下具体的爬取步骤,以自如(北京地区)为例:
通过主页的布局,可以看到房产类的网站基本上都是上方是地标(比如:东城-崇文门),下面是该地标附近的房产信息。所以通过分析这块的网页结构就可以抓到所有的地标信息。
以自如网站为例,比如我们想看安定门的租房信息,直接在首页的搜索框中输入“安定门”然后点击搜索按钮。
根据上一小节的方法论,开始动手写代码。这里以自如为例(自如的信息比链家难爬,但是原理都是通用的)。
打开自如首页,打开 Chrome 的开发者工具,开始分析网页元素。
let allParentLocation = $('ul.clearfix.filterList', 'dl.clearfix.zIndex6');
for (let i = 1; i < allParentLocation.children().length; i++) {
let parentLocation = allParentLocation.children().eq(i);
let parentLocationText = parentLocation.children().eq(0).text(); // 东城 西城...
let allChildren = $(parentLocation.children().eq(1)).find('a');
for (let j = 1; j <allChildren.length; j++) {
let childrenLocationText = allChildren.eq(j).text(); //子行政区
//TODO 上面的childrenLocationText变量就是地标信息
}
}
复制代码
如2.1.2所述,自如二级页面基本上由 baseUrl+地标+page 组成。所以我们可以完善一下3.1中的代码。下面我们封装一个函数,用来解析地标并且生成所有二级页地址的数组。注:这个函数返回的是一个 Promise,后面会用 async 函数来组织所有 Promise。
/**
* 获取行政区
* @param data
* @returns {Promise<any>}
*/
function parseLocationAndInitTargetPath(data) {
let targetPaths = [];
let promise = new Promise(function (resolve, reject) {
let $ = cheerio.load(data);
let allParentLocation = $('ul.clearfix.filterList', 'dl.clearfix.zIndex6');
for (let i = 1; i < allParentLocation.children().length; i++) {
let parentLocation = allParentLocation.children().eq(i);
let parentLocationText = parentLocation.children().eq(0).text(); // 东城 西城...
let allChildren = $(parentLocation.children().eq(1)).find('a');
for (let j = 1; j <allChildren.length; j++) {
let childrenLocationText = allChildren.eq(j).text(); //子行政区
let encodeChildrenLocationText = encodeURI(childrenLocationText);
for (let page = 1; page < 50; page++) { //只获取前50页的数据
targetPaths.push(`${basePath}qwd=${encodeChildrenLocationText}&p=${page}`);
}
}
}
resolve(targetPaths);
});
return promise;
}
复制代码
先观察一下二级页的布局,例如我们想把图片、标题、tags、价格这几个信息抓取下来。
/**
* 解析每一条的数据
*/
async function parseItemData(targetPaths) {
let promises = [];
for (let path of targetPaths) {
let data = await getHtmlSource(path);
let allText = '';
try{
allText = await ziRoomPriceUtil.getTextFromImage(data);
}catch(err){
console.log('抓取失败--->>> '+path);
continue;
}
let promise = new Promise((resolve, reject) => {
let $ = cheerio.load(data);
let result = $('#houseList');
let allResults = [];
for (let i = 0; i < result.children().length; i++) {
let item = result.children().eq(i);
let imgSrc = $('img', item).attr('src');
let title = $('a', $('.txt', item)).eq(0).text();
let detail = $('a', $('.txt', item)).eq(1).text();
let label = '';
$('span', $('.txt', item)).each(function (i, elem) {
label = label + ' ' + $(this).text();
});
let price = '';
if (allText.length !== 10) {
price = '未抓取到价格信息'+allText;
}else{
let priceContain = $('span', $('.priceDetail', item));
for(let i = 0;i<priceContain.length;i++){
if(i === 0 || i === priceContain.length-1){
price = price +' '+ priceContain.eq(i).text(); //首位: ¥ 末尾: 每个月/每季度
}else {
price = price + ziRoomPriceUtil.style2Price(priceContain.eq(i).attr('style'),allText);
}
}
}
allResults.push({'imgSrc':imgSrc,'title':title,'detail':detail,'label':label,'price':price});
}
resolve(allResults);
});
promises.push(promise);
}
return Promise.all(promises);
}
复制代码
注意:上面有几个点需要解释一下
//自如爬虫脚本 http://www.ziroom.com/
let schedule = require('node-schedule');
let superagent = require('superagent');
let cheerio = require('cheerio');
let charset = require('superagent-charset'); //解决乱码问题:
charset(superagent);
let ziRoomPriceUtil = require('../utils/ZiRoomPriceUtil');
var phantom = require("phantom");
var _ph, _page, _outObj;
let basePath = 'http://www.ziroom.com/z/nl/z3.html?';
/**
* 使用phantom获取网页源码
* @param path
* @param callback
*/
function getHtmlSource(path) {
let promise = new Promise(function (resolve, reject) {
phantom.create().then(function (ph) {
_ph = ph;
return _ph.createPage();
}).then(function (page) {
_page = page;
return _page.open(path);
}).then(function (status) {
return _page.property('content')
}).then(function (content) {
resolve(content);
_page.close();
_ph.exit();
}).catch(function (e) {
console.log(e);
});
});
return promise;
}
/**
* 获取行政区
* @param data
* @returns {Promise<any>}
*/
function parseLocationAndInitTargetPath(data) {
let targetPaths = [];
let promise = new Promise(function (resolve, reject) {
let $ = cheerio.load(data);
let allParentLocation = $('ul.clearfix.filterList', 'dl.clearfix.zIndex6');
for (let i = 1; i < allParentLocation.children().length; i++) {
let parentLocation = allParentLocation.children().eq(i);
let parentLocationText = parentLocation.children().eq(0).text(); // 东城 西城...
let allChildren = $(parentLocation.children().eq(1)).find('a');
for (let j = 1; j <allChildren.length; j++) {
let childrenLocationText = allChildren.eq(j).text(); //子行政区
let encodeChildrenLocationText = encodeURI(childrenLocationText);
for (let page = 1; page < 50; page++) { //只获取前三页的数据
targetPaths.push(`${basePath}qwd=${encodeChildrenLocationText}&p=${page}`);
}
}
}
resolve(targetPaths);
});
return promise;
}
/**
* 解析每一条的数据
*/
async function parseItemData(targetPaths) {
let promises = [];
for (let path of targetPaths) {
let data = await getHtmlSource(path);
let allText = '';
try{
allText = await ziRoomPriceUtil.getTextFromImage(data);
}catch(err){
console.log('抓取失败--->>> '+path);
continue;
}
let promise = new Promise((resolve, reject) => {
let $ = cheerio.load(data);
let result = $('#houseList');
let allResults = [];
for (let i = 0; i < result.children().length; i++) {
let item = result.children().eq(i);
let imgSrc = $('img', item).attr('src');
let title = $('a', $('.txt', item)).eq(0).text();
let detail = $('a', $('.txt', item)).eq(1).text();
let label = '';
$('span', $('.txt', item)).each(function (i, elem) {
label = label + ' ' + $(this).text();
});
let price = '';
if (allText.length !== 10) {
price = '未抓取到价格信息'+allText;
}else{
let priceContain = $('span', $('.priceDetail', item));
for(let i = 0;i<priceContain.length;i++){
if(i === 0 || i === priceContain.length-1){
price = price +' '+ priceContain.eq(i).text(); //首位: ¥ 末尾: 每个月/每季度
}else {
price = price + ziRoomPriceUtil.style2Price(priceContain.eq(i).attr('style'),allText);
}
}
}
allResults.push({'imgSrc':imgSrc,'title':title,'detail':detail,'label':label,'price':price});
}
resolve(allResults);
});
promises.push(promise);
}
return Promise.all(promises);
}
/**
* 初始化目标网页
*/
async function init() {
let basePathSource = await getHtmlSource(basePath);
let targetPaths = await parseLocationAndInitTargetPath(basePathSource);
let result = await parseItemData(targetPaths);
return result ;
}
/**
* 开始爬取
*/
function startSplider() {
console.log('自如爬虫已启动...');
let startTime = new Date();
init().then(function (data) {
let endTime = new Date();
console.log('自如爬虫执行完毕 共消耗时间'+(endTime - startTime)/1000+'秒');
}, function (error) {
console.log(error);
});
}
startSplider();
// module.exports = {
// startSplider,
// };
复制代码
let md5=require("md5")
let baiduAiUtil = require('./BaiduAiUtil');
function style2Price(style,allText) {
let position = style.match('[1-9]\\d*')/30;
return allText.substr(position,1);
}
function getTextFromImage(pageSrouce) {
let promise = new Promise(function (resolve, reject) {
try {
let matchStr = pageSrouce.match('static8.ziroom.com/phoenix/pc/images/price/[^\\s]+.png')[0];
let path = `http://${matchStr}`;
baiduAiUtil.identifyImageByUrl(path).then(function(result) {
resolve(result.words_result[0].words);
}).catch(function(err) {
// 若是发生网络错误
reject(err)
});
} catch (err) {
reject(err);
}
});
return promise;
}
module.exports = {
style2Price,
getTextFromImage
}
复制代码
let fs = require('fs');
let AipOcrClient = require("baidu-aip-sdk").ocr;
// 设置APPID/AK/SK
let APP_ID = "需替换你的 APPID";
let API_KEY = "需替换你的 AK";
let SECRET_KEY = "需替换你的 SK";
// 新建一个对象,建议只保存一个对象调用服务接口
let client = new AipOcrClient(APP_ID, API_KEY, SECRET_KEY);
/**
* 经过本地文件识别数据
* @param imagePath 本地file path
* @returns {Promise}
*/
function identifyImageByFile(imagePath){
let image = fs.readFileSync(imagePath).toString("base64");
return client.generalBasic(image);
}
/**
* 经过远程url识别数据
* @param url 远程url地址
* @returns {Promise}
*/
function identifyImageByUrl(url){
return client.generalBasicUrl(url);
}
module.exports = {
identifyImageByUrl,
identifyImageByFile
}
复制代码
注:这是我存到 MySQL 中的爬取结果,因为 Node 连接 MySQL 不是本文重点,所以没贴代码。你可以选择把 startSplider 函数获取到的结果存到文件里、MongoDB 或者其他地方。
这段时间写了很多各大网站的爬虫代码,发现很多工作量是重复的。比如:租房类的网站大部分都是 先爬地标再爬二级页 这种套路。本着 “以可配置为荣 以硬编码为耻” 的程序员价值观,后期会考虑把爬虫模块做成可配置的。这里跟大家分享一个开源库:牛咖。
contact way | value |
---|---|
weixinjie1993@gmail.com | |
W2006292 | |
github | github.com/weixinjie |
blog | juejin.im/user/57673c… |