puppeteer官网
引用puppeteer官网解释: Most things that you can do manually in the browser can be done using Puppeteer!
page.setViewport()
设置获取屏幕大小,默认获取屏幕大小为800px * 600px
page.pdf(路径,大小)
保存为pdf格式图片
- 举例:
page.pdf({path: 'hn.pdf', format: 'A4'});
page.evaluate(fn)
执行chrome的api
举例:
await page.evaluate(() => { return { width: document.documentElement.clientWidth, height: document.documentElement.clientHeight, deviceScaleFactor: window.devicePixelRatio }; })
puppeteer.launch({headless: false});
打开浏览器,默认值是true
更多API
const puppeteer = require('puppeteer');
// Directory the screenshot is written to, taken from config/default.js.
const { screenshot } = require('./config/default.js');

/**
 * Opens the Baidu home page in a browser launched through Puppeteer and
 * saves a screenshot of it as `<screenshot>/<timestamp>.png`.
 */
(async () => {
  // Get a browser instance.
  const browser = await puppeteer.launch();
  try {
    // Get a tab (page) instance.
    const page = await browser.newPage();
    // Navigate to the Baidu home page.
    await page.goto('https://www.baidu.com');
    // Take the screenshot; the file name is the current timestamp.
    await page.screenshot({
      path: `${screenshot}/${Date.now()}.png`,
    });
  } finally {
    // Close the browser even when navigation or the screenshot fails, so no
    // browser process is left behind.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
node src/screenshot.js
.
|-mn
|-src
| |-config
| | |-default.js
| |-helper
| | |-srcToImg.js
| |-mn.js
|-package.json
const puppeteer = require('puppeteer');
const { mn } = require('./config/default');
const srcToImg = require('./helper/srcToImg');

/**
 * Searches image.baidu.com for '狗', collects the `src` of every
 * `img.main_img` element on the result page and saves each image into the
 * `mn` directory via `srcToImg`.
 */
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://image.baidu.com');
    console.log('go to https://image.baidu.com');

    await page.setViewport({
      width: 1920,
      height: 1080,
    });
    console.log('reset viewport');

    await page.focus('#kw');
    await page.keyboard.sendCharacter('狗');

    // Start waiting for the navigation BEFORE clicking: attaching a 'load'
    // listener after the click can miss the event if the page loads first.
    await Promise.all([
      page.waitForNavigation({ waitUntil: 'load' }),
      page.click('.s_search'),
    ]);
    console.log('go to search list');
    console.log('page loading done, start fetch ...');

    // Runs inside the page: gather the src attribute of every result image.
    const srcs = await page.evaluate(() => {
      const images = document.querySelectorAll('img.main_img');
      return Array.prototype.map.call(images, (img) => img.src);
    });
    console.log(`get ${srcs.length} image, start download`);

    // Download in parallel and wait for all of them. `forEach` with an async
    // callback would not wait and would leave rejections unhandled. A single
    // failed image is logged and does not abort the others.
    await Promise.all(
      srcs.map((src) =>
        srcToImg(src, mn).catch((err) => {
          console.error(`failed to save image ${String(src).slice(0, 80)}`, err);
        })
      )
    );
  } finally {
    // Always release the browser, including when a step above throws.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
const path = require('path'); module.exports = { screenshot: path.resolve(__dirname, '../../screenshot'), mn: path.resolve(__dirname, '../../mn') }
const http = require('http'); const https = require('https'); const fs = require('fs'); const path = require('path'); const { promisify } = require('util'); const writeFile = promisify(fs.writeFile); module.exports = async(src, dir) => { if(/\.(jpg|png|gif)$/.test(src)) { await urlToImg(src, dir); }else { await base64ToImg(src, dir); } } // 识别src为http或者https的图片 const urlToImg = promisify((url, dir, callback) => { const mod = /^https:/.test(url) ? https : http; const ext = path.extname(url); const file = path.join(dir, `${Date.now()}${ext}`); mod.get(url, res => { res.pipe(fs.createWriteStream(file)) .on('finish', () => { callback(); console.log(file); }) }) }) // 识别src为base64地址的图片 const base64ToImg = async (base64Str, dir) => { // data: image/jpeg;base64,/raegreagearg const matchs = base64Str.match(/^data:(.+?);base64,(.+)$/); try { const ext = matches[1].split('/')[1] .replace('jpeg', 'jpg'); const file = path.join(dir, `${Date.now()}.${ext}`); await writeFile(file, match[2], 'base64'); console.log(file); } catch (ex) { console.log('非法 base64 字符串'); } }
go to https://image.baidu.com reset viewport go to search list page loading done, start fetch ... get 46 image, start download 非法 base64 字符串 非法 base64 字符串 非法 base64 字符串 非法 base64 字符串 非法 base64 字符串 非法 base64 字符串 /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351397.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351396.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351398.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351400.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351405.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351386.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351399.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351405.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351405.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351402.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351412.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351413.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351403.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351398.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351399.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351403.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351406.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351401.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351408.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351404.jpg /Users/lius/Desktop/web 
spider/headless-crawler/headless_crawler/mn/1530800351414.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351400.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351402.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351413.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351408.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351414.jpg /Users/lius/Desktop/web spider/headless-crawler/headless_crawler/mn/1530800351413.jpg ......