node爬虫快速入门

时间 2019-11-08

标签 node 爬虫快速入门栏目网络爬虫繁體版

原文原文链接

node爬虫

初入前端，刚刚接触node，对于耳闻已久的node爬虫很是神往，因此有了这篇文章，项目代码在文章末尾

需求

抓取天涯论坛重庆地区板块的文章列表信息。

使用工具

node.js
superagent(客户端请求代理模块)
cheerio(为服务器特别定制的，快速、灵活、精简的jQuery核心实现)

安装并使用cheerio，superagent模块

安装

npm install superagent cheerio --save

在项目中引入cheerio，superagent

const superagent = require('superagent')
const cheerio = require('cheerio')

指定需要抓取的域名

// Origin (scheme + host) of the Tianya forum; scraped paths are appended to it.
const mainUrl = 'http://bbs.tianya.cn'
// Path of the first listing page of the Chongqing board.
const url = '/list-45-1.shtml'

请求数据

// Request the listing page and dump the raw response object.
superagent.get(mainUrl + url).end(function (err, res) {
    // A failed request cannot be parsed: report it and stop here rather than
    // dropping the error silently.
    if (err) {
        console.error(err)
        return
    }
    console.log(res)
})

分析页面结构

对页面内容进行分析，提取我们需要的内容

以下图片是页面信息

我们需要的列表在class为mt5的div下。
整个网页有多个mt5，继续向下找。
每一栏信息在('.mt5 table tbody tr')下。

调用cheerio选取('.mt5 table tbody tr')

// Parse the response HTML and select the rows of the listing table.
const $ = cheerio.load(res.text)
const rows = $('.mt5 table tbody tr')
rows.each((rowIndex, row) => {
    // `row` is one <tr> of the listing table
})

找到了信息，下面对找到的信息进行解析

解析数据

找到需要解析的数据，对数据进行解析，保存我们需要的数据

const $ = cheerio.load(res.text)
// Article records collected from the current listing page.
const data = []
$('.mt5 table tbody tr').each((rowIndex, row) => {
    const $row = $(row)
    const cells = $row.children()
    // Only rows whose first cell has the `td-title` class are treated as articles.
    if (!$(cells[0]).hasClass('td-title')) {
        return
    }
    const $titleCell = $row.find('.td-title')
    // Element(s) that follow the <span> inside the title cell; their text and
    // href are used as the article title and link.
    const $link = $titleCell.find('span').next()
    const record = {
        title: $link.text(),
        type: $titleCell.find('.face').attr('title'),
        url: mainUrl + $link.attr('href'),
        author: $(cells[1]).text(),
        point: $(cells[2]).text(),
        time: $(cells[3]).text()
    }
    // Keep the record only when a title was actually found.
    if (record.title !== "") {
        data.push(record)
    }
})

存储数据到本地

此时需要把data中保存的数据存到想要保存的文件中，需要用到node的fs模块

1.引入fs模块

const fs = require('fs')

2.存储数据到本地

在根目录下创建data文件夹

// Serialise the scraped records together with a status flag and write them to
// data/articleLists.json next to this script. fs.writeFile does not create
// missing directories, so the `data` folder has to exist beforehand.
const payload = JSON.stringify({ status: 0, data: data })
const target = __dirname + '/data/articleLists.json'
fs.writeFile(target, payload, (err) => {
    if (err) {
        console.log(err)
        return
    }
    console.log("写入文章列表完成")
})

现在爬虫会把爬到的数据存储到本地了
ok，到这里我们的爬虫已经完成了，接下来我们需要对它进行优化

让爬虫更聪明

现在我们的爬虫只能爬取当前页的信息，我们来改一下，让它也能翻页

分析翻页按钮，天涯论坛的列表页的下一页按钮中有一个a标签，里边的url加上之前我们记录的mainUrl就是下一页的地址。所以，在爬虫爬取完本页的数据后，让爬虫向下一页的链接发一个新的请求就可以继续爬取了。

// After the current page has been parsed, look for the "next page" link in the
// pager and crawl the page it points to.
const pagerLinks = $('.mt5').next().find('.short-pages-2 .links').children()
pagerLinks.each((linkIndex, link) => {
    const $link = $(link)
    if ($link.text() !== '下一页') {
        return
    }
    const nextUrl = $link.attr('href')
    // A link without an href has nowhere to go; skip the request in that case.
    if (nextUrl) {
        getData(nextUrl)    // getData is the request function shown earlier
    }
    // Returning false stops the iteration, so the next page is requested only
    // once even when the selector matches more than one pager.
    return false
})

现在，爬虫读取完当前页数据后就会继续爬取下一页的数据。

完整代码

最后我还增加了一个页码，每一页数据，单独进行记录。下面是完整的代码

const superagent = require('superagent')
const cheerio = require('cheerio')
const fs = require('fs')

// Origin (scheme + host) of the Tianya forum; scraped paths are appended to it.
const mainUrl = 'http://bbs.tianya.cn'
// Path of the first listing page of the Chongqing board.
const url = '/list-45-1.shtml'

// Page counter; used to number the JSON file written for each page.
let index = 1
/**
 * Fetch one listing page, extract its article rows, save them as a numbered
 * JSON file under `data/` and then follow the page's "next page" link.
 *
 * @param {string} url - Path of the listing page; it is appended to `mainUrl`.
 * @returns {void} Results are reported through side effects only (JSON files
 *   and console output).
 */
let getData = (url) => {
    // Request the page with superagent
    superagent.get(mainUrl + url).end(function (err, res) {
        // A failed request cannot be parsed: report it and stop this crawl
        // branch rather than dropping the error silently.
        if (err) {
            console.error(err)
            return
        }
        // Parse the returned HTML with cheerio
        const $ = cheerio.load(res.text)
        // Article records collected from this page
        const data = []
        $('.mt5 table tbody tr').each((rowIndex, row) => {
            const $row = $(row)
            const cells = $row.children()
            // Only rows whose first cell has the `td-title` class are treated as articles.
            if (!$(cells[0]).hasClass('td-title')) {
                return
            }
            const $titleCell = $row.find('.td-title')
            // Element(s) that follow the <span> inside the title cell; their
            // text and href are used as the article title and link.
            const $link = $titleCell.find('span').next()
            const record = {
                title: $link.text(),
                type: $titleCell.find('.face').attr('title'),
                url: mainUrl + $link.attr('href'),
                author: $(cells[1]).text(),
                point: $(cells[2]).text(),
                time: $(cells[3]).text()
            }
            // Keep the record only when a title was actually found.
            if (record.title !== "") {
                data.push(record)
            }
        })
        if (data.length > 0) {
            // Reserve the page number before starting the asynchronous write, so
            // the file name and the log line always agree and a slow or failed
            // write cannot make the next page reuse the same number.
            const pageNumber = index
            index++
            // Persist the records with fs (a database could be used instead)
            fs.writeFile(__dirname + '/data/articleLists' + pageNumber + '.json', JSON.stringify({
                status: 0,
                data: data
            }), function (writeErr) {
                if (writeErr) {
                    console.log(writeErr)
                } else {
                    console.log("写入文章列表完成, 当前页码：", pageNumber)
                }
            })
        }
        // Once this page is handled, find the "next page" link and keep crawling.
        const pagerLinks = $('.mt5').next().find('.short-pages-2 .links').children()
        pagerLinks.each((linkIndex, link) => {
            const $pagerLink = $(link)
            if ($pagerLink.text() !== '下一页') {
                return
            }
            const nextUrl = $pagerLink.attr("href")
            // A link without an href has nowhere to go; skip the request then.
            if (nextUrl) {
                getData(nextUrl)
            }
            // Returning false stops the iteration, so the next page is requested
            // only once even when the selector matches more than one pager.
            return false
        })
    })
}
// Kick off the crawl with the first listing page
getData(url)

好了，本次node爬虫快速入门文章到这里就结束了，但是这个爬虫还有很多地方需要完善，以后我会为大家带来更详细的爬虫教程。