Node JS爬虫：爬取瀑布流网页高清图

时间 2019-11-07

原文原文链接

静态为主的网页往往用get方法就能获取页面全部内容。动态网页即异步请求数据的网页则需要用浏览器加载完成后再进行抓取。本文介绍了如何连续爬取瀑布流网页。

在知乎提到python就必有一大帮人提起爬虫，咱Node JS爬虫也是非常简单的，和python相比仅仅是“异步”和“多线程”的性能对比而已。对python了解不多，故对此不做评价。

phantomjs是一个“无壳”（无界面）的浏览器（基于WebKit，用起来类似没有界面的chrome），具体安装方法查看phantomjs.org。phantomjs提供命令行工具运行，运行需使用命令phantomjs xxx.js。使用phantomjs-node这个库（npm包名为phantom）可以在Node.js中把玩phantomjs，这样就可以使用pm2进行进程守护和负载均衡了。

目标

爬取200张以上的1920*1080分辨率的动漫壁纸，网页是百度瀑布流图片

方式

瀑布流是根据页面滚动位置来判断是否继续往下加载，故要利用phantomjs滚动页面来获取更多图片链接。单个图片详细页面刚进入时是压缩过的图片，这是百度优化访问速度的措施，等待几秒图片src就会替换成大图的链接。所以，进入图片详细页时应延迟几秒再获取图片src，具体延迟几秒视你网速而定。

步骤

获取链接

首先利用phantom打开网页

// The trailing semicolon matters: without it the parenthesised IIFE below
// would be parsed as a call on the value returned by require('phantom').
const phantom = require('phantom');

/**
 * Launch PhantomJS, size the viewport to 1920x1080 and open the search page.
 *
 * `url` is not declared in this snippet; it must already be in scope when this
 * runs (the complete listing declares it as the Baidu image search address).
 */
(async function main() {
    const instance = await phantom.create();
    try {
        const page = await instance.createPage();
        // Size the viewport before loading so the first layout pass already uses 1920x1080.
        await page.property('viewportSize', {
            width: 1920,
            height: 1080
        });
        const status = await page.open(url);
        // page.open reports a failed load through its status string, so check it explicitly.
        if (status !== 'success') {
            throw new Error(`failed to open ${url}: ${status}`);
        }
        // The scrolling, link-extraction and download steps belong here, while `page` is in scope.
    } finally {
        // Always shut the PhantomJS child process down, even after an error.
        await instance.exit();
    }
}()).catch((err) => {
    // Report the failure instead of leaving an unhandled promise rejection.
    console.error(err);
    process.exitCode = 1;
});
复制代码

获取链接数量，不足200则滚动网页

/**
 * Pause for a number of seconds, e.g. to let the page finish loading before
 * the next scroll.
 *
 * @param {number} second - How long to wait, in seconds.
 * @returns {Promise<undefined>} Settles with no value once the time has elapsed.
 */
function delay(second) {
    const milliseconds = second * 1000;
    return new Promise(function (wake) {
        setTimeout(wake, milliseconds);
    });
}
复制代码

/**
 * Scroll the page down in 1000px steps until at least 200 `.imgbox` elements
 * are present in the document, or until the scroll budget is used up.
 *
 * Relies on `page`, `cheerio` and `delay` being in scope.
 *
 * @param {number} i - Index of the first scroll step; step n scrolls to top = 1000 * n.
 * @param {number} [maxScrolls=100] - Upper bound on scroll steps, so a result page
 *     holding fewer than 200 images cannot keep the crawler scrolling forever.
 * @returns {Promise<Function|undefined>} The cheerio handle loaded from the last
 *     snapshot of the page, or undefined when maxScrolls is not positive.
 */
async function pageScroll(i, maxScrolls = 100) {
    let loaded
    for (let step = i; step < i + maxScrolls; step++) {
        // Give the previous batch of images time to load before scrolling again.
        await delay(5)
        await page.property('scrollPosition', {
            left: 0,
            top: 1000 * step
        })
        const content = await page.property('content')
        loaded = cheerio.load(content)
        const count = loaded('.imgbox').length
        console.log(count)
        if (count >= 200) {
            break
        }
    }
    return loaded
}
// Keep the handle at this scope: the link-extraction step that follows reads `$`.
let $ = await pageScroll(0)

复制代码

提取图片链接

// Collect the detail-page address of every thumbnail box.
// The href values are prefixed with the site origin, so they are presumably
// site-relative paths — confirm against the live page.
let urlList = []
$('.imgbox').each(function() {
    const href = $(this).find('a').attr('href')
    // A box without a link would otherwise produce "https://image.baidu.comundefined".
    if (href) {
        urlList.push(`https://image.baidu.com${href}`)
    }
})
复制代码

保存图片

定义保存图片的函数

const request = require('request')
const fs = require('fs')

/**
 * Download one image into ./image/, naming the file after the current timestamp.
 *
 * @param {string} url - Address of the image; empty or non-string values are ignored.
 */
function save(url) {
    // A detail page may expose no image src; skip it instead of throwing on `undefined.split`.
    if (typeof url !== 'string' || url === '') {
        return
    }
    // Take the extension from the last path segment only, so a query string,
    // a fragment or a dot earlier in the path never leaks into the file name.
    const lastSegment = url.split(/[?#]/)[0].split('/').pop()
    const dotAt = lastSegment.lastIndexOf('.')
    const ext = (dotAt > 0 && lastSegment.slice(dotAt + 1)) || 'jpg'
    // createWriteStream does not create missing directories.
    if (!fs.existsSync('./image')) {
        fs.mkdirSync('./image')
    }
    const target = `./image/${new Date().getTime()}.${ext}`
    // Without listeners a network or disk failure surfaces as an unhandled
    // 'error' event and takes the whole process down.
    request(url)
        .on('error', (err) => console.error(`download failed: ${url}`, err))
        .pipe(fs.createWriteStream(target))
        .on('error', (err) => console.error(`write failed: ${target}`, err))
}
复制代码

遍历urlList，建议用递归遍历；用forEach这类循环遍历时，回调里的await不会阻塞循环，delay不起作用（for...of配合await也可以按顺序执行）

/**
 * Open the detail page of urlList[i], wait for the full-size image to replace
 * the compressed preview, download it, then continue with the next link.
 *
 * Relies on `page`, `urlList`, `cheerio`, `delay` and `save` being in scope.
 *
 * @param {number} i - Index into urlList of the detail page to process.
 * @returns {Promise<void>} Resolves after the last link has been processed.
 */
async function imgSave(i) {
    // Check the bound first so that index urlList.length (undefined) is never opened.
    if (i >= urlList.length) {
        return
    }
    // Named `status`, not `page`: redeclaring `page` here would shadow the outer
    // page object and make `page.open` throw a ReferenceError.
    const status = await page.open(urlList[i])
    if (status === 'success') {
        // The delay must be awaited, otherwise the src is read before it is swapped.
        await delay(1)
        const content = await page.property('content')
        const $detail = cheerio.load(content)
        const src = $detail('#currentImg').attr('src')
        // Some detail pages expose no image src; skip those.
        if (src) {
            save(src)
        }
    }
    await imgSave(i + 1)
}
await imgSave(0)
复制代码

最后爬取结果如图，都是高分辨率的，部分图片做了防爬处理

完整代码

const phantom = require('phantom')
const cheerio = require('cheerio')
const request = require('request')
const fs = require('fs')
/**
 * Pause for a number of seconds.
 *
 * @param {number} second - How long to wait, in seconds.
 * @returns {Promise<undefined>} Settles with no value once the time has elapsed.
 */
function delay(second) {
    const milliseconds = second * 1000;
    return new Promise(function (wake) {
        setTimeout(wake, milliseconds);
    });
}
// Baidu image search address opened by the crawler; the percent-encoded
// `word` and `oq` parameters decode to the query "动漫 壁纸" (anime wallpaper).
let url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%8A%A8%E6%BC%AB+%E5%A3%81%E7%BA%B8&oq=%E5%8A%A8%E6%BC%AB+%E5%A3%81%E7%BA%B8&rsp=-1'
/**
 * Download one image into ./image/, naming the file after the current timestamp.
 *
 * @param {string} url - Address of the image; empty or non-string values are ignored.
 */
function save(url) {
    // A detail page may expose no image src; skip it instead of throwing on `undefined.split`.
    if (typeof url !== 'string' || url === '') {
        return
    }
    // Take the extension from the last path segment only, so a query string,
    // a fragment or a dot earlier in the path never leaks into the file name.
    const lastSegment = url.split(/[?#]/)[0].split('/').pop()
    const dotAt = lastSegment.lastIndexOf('.')
    const ext = (dotAt > 0 && lastSegment.slice(dotAt + 1)) || 'jpg'
    // createWriteStream does not create missing directories.
    if (!fs.existsSync('./image')) {
        fs.mkdirSync('./image')
    }
    const target = `./image/${new Date().getTime()}.${ext}`
    // Without listeners a network or disk failure surfaces as an unhandled
    // 'error' event and takes the whole process down.
    request(url)
        .on('error', (err) => console.error(`download failed: ${url}`, err))
        .pipe(fs.createWriteStream(target))
        .on('error', (err) => console.error(`write failed: ${target}`, err))
}
/**
 * Crawl entry point: open the search page in PhantomJS, scroll until enough
 * thumbnails are loaded, collect their detail-page links, then visit each
 * detail page and download its image.
 */
(async function main() {
    const TARGET_COUNT = 200;
    const instance = await phantom.create();
    try {
        const page = await instance.createPage();
        // Size the viewport before loading so the first layout pass already uses 1920x1080.
        await page.property('viewportSize', {
            width: 1920,
            height: 1080
        });
        const status = await page.open(url);
        // page.open reports a failed load through its status string, so check it explicitly.
        if (status !== 'success') {
            throw new Error(`failed to open ${url}: ${status}`);
        }

        /**
         * Scroll down in 1000px steps until TARGET_COUNT `.imgbox` elements exist
         * or the scroll budget runs out; returns the cheerio handle of the last snapshot.
         */
        const pageScroll = async (i, maxScrolls = 100) => {
            let loaded;
            for (let step = i; step < i + maxScrolls; step++) {
                // Give the previous batch of images time to load before scrolling again.
                await delay(1);
                await page.property('scrollPosition', {
                    left: 0,
                    top: 1000 * step
                });
                const content = await page.property('content');
                loaded = cheerio.load(content);
                if (loaded('.imgbox').length >= TARGET_COUNT) {
                    break;
                }
            }
            return loaded;
        };
        const $ = await pageScroll(0);

        const urlList = [];
        $('.imgbox').each(function() {
            const href = $(this).find('a').attr('href');
            // A box without a link would otherwise produce "https://image.baidu.comundefined".
            if (href) {
                urlList.push(`https://image.baidu.com${href}`);
            }
        });

        /**
         * Open the detail page of urlList[i], wait for the image src to settle,
         * download it, then continue with the next link.
         */
        const imgSave = async (i) => {
            // Check the bound first so that index urlList.length (undefined) is never opened.
            if (i >= urlList.length) {
                return;
            }
            const detailStatus = await page.open(urlList[i]);
            if (detailStatus === 'success') {
                await delay(1);
                const content = await page.property('content');
                const $detail = cheerio.load(content);
                const src = $detail('#currentImg').attr('src');
                // Some detail pages expose no image src; skip those.
                if (src) {
                    save(src);
                }
            }
            await imgSave(i + 1);
        };
        await imgSave(0);
    } finally {
        // Always shut the PhantomJS child process down, even after an error.
        await instance.exit();
    }
}()).catch((err) => {
    // Report the failure instead of leaving an unhandled promise rejection.
    console.error(err);
    process.exitCode = 1;
});
复制代码

个人博客：www.bougieblog.cn，欢迎前来尬聊。