nodejs爬虫实践-入门版

需求场景:
当你想分析微信生态内什么产品最受欢迎的时候
当你想参考他们的点子与设计的时候...

你需要数据,你需要爬虫

工具库

分析目标页面 Dom 结构

获取 json 格式数据

// Request the listing page; `url` and `config` come from the surrounding script.
const res = await axios.get(url)
const html = res.data
// Load the HTML so elements can be queried with CSS selectors.
const $ = cheerio.load(html)
// Turn every element matched by config.dom into a plain record.
const items = $(config.dom)
  .toArray()
  .map(node => {
    const $anchor = $(node)
    // Slashes in the title are replaced with '-'.
    const title = $anchor.attr('title').replace(/\//g, '-')
    const href = $anchor.attr('href')
    const desc = $anchor.find('.desc').text()
    const count = $anchor.find('.opened i').text()
    return { title, href, desc, count }
  })
// Print how many records were found together with the records themselves.
console.log({
  total: items.length,
  items
})
复制代码

结果如下

{
  total: 70,
  items: [
    {
      title: '微报名',
      href: 'https://weixiao.qq.com/store/details/10007',
      desc: '在线收集报名信息,让校园活动更便捷',
      count: '43968'
    },
    {
      title: '微上墙',
      href: 'https://weixiao.qq.com/store/details/10008',
      desc: '线下活动中,经过大屏幕同步展现现场观众发送的微信消息',
      count: '34967'
    },
    ...
  ]
}
复制代码

项目所属图片资源获取

注意图片地址有多种形式

// /img/home/apps/preview/apply_01.jpg
// https://weixiao.qq.com/img/home/apps/preview/apply_01.jpg
// http://p.qpic.cn/weixiao/0/1481391605/640
复制代码

下载单张图片资源

// Download a single image and save it to disk.
//   imageSrc: image address; a value that does not start with http:// or https://
//             is prefixed with "https:" (suits protocol-relative "//host/path" addresses).
//   fileName: path of the file to write.
// The returned promise settles only after the file has been fully written and
// rejects if either the response stream or the file stream fails.
const downloadImage = async (imageSrc, fileName) => {
  const url = /^http(s?):\/\//.test(imageSrc) ? imageSrc : `https:${imageSrc}`
  const res = await axios({
    url,
    method: 'get',
    responseType: 'stream'
  })
  // pipe() returns immediately, so wait for the writer's 'finish' event;
  // otherwise callers would report success while data is still being written.
  await new Promise((resolve, reject) => {
    const writer = fs.createWriteStream(fileName)
    res.data.on('error', error => {
      writer.destroy()
      reject(error)
    })
    writer.on('error', error => {
      res.data.destroy()
      reject(error)
    })
    writer.on('finish', resolve)
    res.data.pipe(writer)
  })
}
复制代码

下载单个相册图片资源

// Download every preview image of one album (an app detail page).
//   item: record with `href` (detail page URL) and `title` (used as the sub-folder name).
// Images are fetched one after another and saved as 1.png, 2.png, ... inside
// `${baseDir}/${item.title}`; the extension is always "png", whatever the source format.
const downloadPhoto = async item => {
  const res = await axios.get(item.href)
  const $ = cheerio.load(res.data)
  // Collected image addresses
  const images = []
  $('.preview .preview-img').each((idx, element) => {
    let src = $(element).attr('src')
    // An element without a src attribute has nothing to download
    if (!src) {
      return
    }
    // Addresses without a scheme are resolved against the site origin
    if (!/^http(s?):\/\//.test(src)) {
      src = `https://weixiao.qq.com${src}`
    }
    images.push({ src, type: 'png' })
  })
  console.log({
    total: images.length,
    items: images
  })
  // Create the album's sub-folder
  const folderPath = path.resolve(__dirname, `${baseDir}/${item.title}`)
  mkdirSync(folderPath)
  for (let i = 0; i < images.length; i++) {
    // Named `image` so it does not shadow `item`; the log below needs the album title
    const image = images[i]
    await downloadImage(image.src, `${folderPath}/${i + 1}.${image.type}`)
    console.log(`已下载:${item.title}-${i + 1}`)
  }
}
复制代码

执行代码如下

逐个资源下载,可以优化为 Promise.all 统一下载

逐个下载

const axios = require('axios')
const cheerio = require('cheerio')
const fs = require('fs')
const path = require('path')

// Crawl target settings
const config = {
  url: 'https://weixiao.qq.com', // origin of the target site
  route: 'store/labels?tag=0&order=2', // path and query string of the listing page
  dom: '.app-list a.app-item' // CSS selector matching each app link element
}

// Root download folder, resolved relative to this script's directory
const baseDir = `./qq-app-download`
const savePath = path.resolve(__dirname, baseDir)

// Create the folder at dirPathStr unless it already exists, and log which case applied.
// Missing parent folders are created as well, so nested paths do not fail.
//   dirPathStr: path of the folder to ensure.
const mkdirSync = dirPathStr => {
  if (fs.existsSync(dirPathStr)) {
    console.log(`文件夹已存在:${dirPathStr}`)
    return
  }
  fs.mkdirSync(dirPathStr, { recursive: true })
  console.log(`文件夹已生成:${dirPathStr}`)
}

// Ensure the root download folder exists as soon as the script loads
mkdirSync(savePath)

// Download a single image and save it to disk.
//   imageSrc: image address; a value that does not start with http:// or https://
//             is prefixed with "https:" (suits protocol-relative "//host/path" addresses).
//   fileName: path of the file to write.
// The returned promise settles only after the file has been fully written and
// rejects if either the response stream or the file stream fails.
const downloadImage = async (imageSrc, fileName) => {
  const url = /^http(s?):\/\//.test(imageSrc) ? imageSrc : `https:${imageSrc}`
  const res = await axios({
    method: 'get',
    url,
    responseType: 'stream'
  })
  // pipe() returns immediately, so wait for the writer's 'finish' event;
  // otherwise the caller's "download finished" log would be printed too early.
  await new Promise((resolve, reject) => {
    const writer = fs.createWriteStream(fileName)
    res.data.on('error', error => {
      writer.destroy()
      reject(error)
    })
    writer.on('error', error => {
      res.data.destroy()
      reject(error)
    })
    writer.on('finish', resolve)
    res.data.pipe(writer)
  })
}

// Download all preview images of one album, one image at a time.
//   item: record with `href` (detail page URL) and `title` (sub-folder name).
// Files are written as 1.png, 2.png, ... under `${baseDir}/${item.title}`.
const downloadPhoto = async item => {
  const { href, title } = item
  const { data: pageHtml } = await axios.get(href)
  // Parse the detail page and gather the image addresses
  const $ = cheerio.load(pageHtml)
  const images = []
  $('.preview .preview-img').each((_, node) => {
    const rawSrc = $(node).attr('src')
    // Addresses without an http(s) scheme are appended to the site origin
    const src = /^http(s?):\/\//.test(rawSrc)
      ? rawSrc
      : `https://weixiao.qq.com${rawSrc}`
    images.push({ src, type: 'png' })
  })
  // Sub-folder that holds this album's images
  const folderPath = path.resolve(__dirname, `${baseDir}/${item.title}`)
  mkdirSync(folderPath)
  let seq = 0
  for (const image of images) {
    seq += 1
    await downloadImage(image.src, `${folderPath}/${seq}.${image.type}`)
    console.log(`[${title}] - ${seq} 下载完成`)
  }
}

// Download the albums of the listing page one after another.
//   items: album records as produced by main().
//   limit: maximum number of albums to download; defaults to 3, pass Infinity for all.
// Fewer than `limit` items is fine: only the albums that exist are downloaded.
const downloadImgList = async (items, limit = 3) => {
  // slice() clamps to the array length, so a short list never yields undefined entries
  const selected = items.slice(0, limit)
  for (const item of selected) {
    await downloadPhoto(item)
    console.log(`相册 [${item.title}] 下载完成`)
  }
}

// Entry point: fetch the listing page, build one record per app link,
// download the albums and print the elapsed time in seconds.
const main = async () => {
  const st = Date.now()
  const url = `${config.url}/${config.route}`
  const res = await axios.get(url)
  const html = res.data
  const items = []
  // Parse the HTML and read the fields of every element matched by config.dom
  const $ = cheerio.load(html)
  $(config.dom).each((idx, element) => {
    const $link = $(element)
    items.push({
      // Slashes are replaced because the title later becomes a folder name
      title: $link.attr('title').replace(/\//g, '-'),
      href: $link.attr('href'),
      desc: $link.find('.desc').text(),
      count: $link.find('.opened i').text()
    })
  })
  console.log({
    total: items.length,
    items
  })
  await downloadImgList(items)
  const et = Date.now()
  console.log(`总耗时=>${(et - st) / 1000}s`)
}

// main() returns a promise: report a failure and exit non-zero
// instead of leaving the rejection unhandled.
main().catch(error => {
  console.error(error)
  process.exitCode = 1
})
复制代码

统一下载

const axios = require('axios')
const cheerio = require('cheerio')
const fs = require('fs')
const path = require('path')

// Crawl target settings
const config = {
  url: 'https://weixiao.qq.com', // origin of the target site
  route: 'store/labels?tag=0&order=2', // path and query string of the listing page
  dom: '.app-list a.app-item' // CSS selector matching each app link element
}

// Root download folder, resolved relative to this script's directory
const baseDir = `./qq-app-download`
const savePath = path.resolve(__dirname, baseDir)

// Create the folder at dirPathStr unless it already exists, and log which case applied.
// Missing parent folders are created as well, so nested paths do not fail.
//   dirPathStr: path of the folder to ensure.
const mkdirSync = dirPathStr => {
  if (fs.existsSync(dirPathStr)) {
    console.log(`文件夹已 存在 :${dirPathStr}`)
    return
  }
  fs.mkdirSync(dirPathStr, { recursive: true })
  console.log(`文件夹已 生成 :${dirPathStr}`)
}

// Ensure the root download folder exists as soon as the script loads
mkdirSync(savePath)

// Wait until every promise in arr has fulfilled.
//   arr: array of promises (or plain values).
// Resolves with no value; rejects with the first rejection reason, as Promise.all does.
const taskPromiseAll = async arr => {
  // An async function already returns a promise, so no manual `new Promise` wrapper is needed
  await Promise.all(arr)
}

// Download a single image and save it to disk.
//   imageSrc: image address; a value that does not start with http:// or https://
//             is prefixed with "https:" (suits protocol-relative "//host/path" addresses).
//   fileName: path of the file to write.
// The returned promise settles only after the file has been fully written and
// rejects if either the response stream or the file stream fails, so a
// Promise.all over several downloads really waits for the files.
const downloadImage = async (imageSrc, fileName) => {
  const url = /^http(s?):\/\//.test(imageSrc) ? imageSrc : `https:${imageSrc}`
  const res = await axios({
    url,
    method: 'get',
    responseType: 'stream'
  })
  // pipe() returns immediately, so wait for the writer's 'finish' event
  await new Promise((resolve, reject) => {
    const writer = fs.createWriteStream(fileName)
    res.data.on('error', error => {
      writer.destroy()
      reject(error)
    })
    writer.on('error', error => {
      res.data.destroy()
      reject(error)
    })
    writer.on('finish', resolve)
    res.data.pipe(writer)
  })
}

// Download all preview images of one album, starting every download at once.
//   item: record with `href` (detail page URL) and `title` (sub-folder name).
// Files are written as 1.png, 2.png, ... under `${baseDir}/${item.title}`.
const downloadPhoto = async item => {
  const { data: pageHtml } = await axios.get(item.href)
  // Parse the detail page and gather the image addresses
  const $ = cheerio.load(pageHtml)
  const images = []
  $('.preview .preview-img').each((_, node) => {
    const rawSrc = $(node).attr('src')
    // Addresses without an http(s) scheme are appended to the site origin
    const src = /^http(s?):\/\//.test(rawSrc)
      ? rawSrc
      : `https://weixiao.qq.com${rawSrc}`
    images.push({ src, type: 'png' })
  })
  // Sub-folder that holds this album's images
  const folderPath = path.resolve(__dirname, `${baseDir}/${item.title}`)
  mkdirSync(folderPath)
  // Kick off every image download, then wait for the whole batch
  const pending = []
  images.forEach((image, position) => {
    pending.push(
      downloadImage(image.src, `${folderPath}/${position + 1}.${image.type}`)
    )
  })
  await taskPromiseAll(pending)
}

// Download every album of the listing page, starting all of them at once.
//   items: album records; each one is handed to downloadPhoto.
// Resolves with no value once taskPromiseAll reports the whole batch as done.
const downloadImgList = async items => {
  const tasks = []
  for (const album of items) {
    tasks.push(downloadPhoto(album))
  }
  await taskPromiseAll(tasks)
}

// Entry point: fetch the listing page, build one record per app link,
// download all albums in parallel and print the elapsed time in seconds.
const main = async () => {
  const st = Date.now()
  const url = `${config.url}/${config.route}`
  const res = await axios.get(url)
  const html = res.data
  const items = []
  // Parse the HTML and read the fields of every element matched by config.dom
  const $ = cheerio.load(html)
  $(config.dom).each((idx, element) => {
    const $link = $(element)
    items.push({
      // Slashes are replaced because the title later becomes a folder name
      title: $link.attr('title').replace(/\//g, '-'),
      href: $link.attr('href'),
      desc: $link.find('.desc').text(),
      count: $link.find('.opened i').text()
    })
  })
  console.log({
    total: items.length,
    items
  })
  await downloadImgList(items)
  console.log(`耗时=>${(Date.now() - st) / 1000}`)
}

// main() returns a promise: report a failure and exit non-zero
// instead of leaving the rejection unhandled.
main().catch(error => {
  console.error(error)
  process.exitCode = 1
})
复制代码

优化

资源统一下载

优化后运行对比如下(下载3个相册)

按文件下载 按相册下载 统一下载
11.5s 8.5s 3.8s
相关文章
相关标签/搜索