GitHub Chrome DevTools Protocol:协议自己的仓库,有问题可以在这里提 issue
GitHub debugger-protocol-viewer:协议 API 文档的仓库
API 文档地址:API 展示的地方,这个经常用
Network:网络请求、Cookie、缓存、证书等相关内容
Page:页面的加载、资源内容、弹层、截图、打印等相关内容
DOM:文档 DOM 的获取、修改、删除、查询等相关内容
Runtime:JavaScript 代码的执行,这里面咱们可以搞事情~~
咱们这里不会直接使用 WebSocket 相关的内容来调用 Chrome 的调试命令,而是用 chrome-remote-interface 这个封装的库来做,它是基于 Promise 风格的
每个功能块成为一个单独的 domain,像 Network、Page、DOM 等都是不同的 domain
几乎每个个头大的 domain 都有 enable 方法,需要先调用这个方法启用之后再使用
各个 domain 的接口方法参数都是一个对象(或者说一个 Map),不用考虑参数的位置了
各个 domain 的接口返回值也是一个对象,取对应的 key 就行
参数值和返回值经常是 meta 信息,经常是各类对象的 id 信息,而不是具体的对象内容(这里可能需要切换一下风格)
首先做一个简单的封装,准备 API 的执行环境,具体可参考前一篇关于工具库的文章。
const chromeLauncher = require('chrome-launcher');
const chromeRemoteInterface = require('chrome-remote-interface');

/**
 * Launches Chrome and connects a chrome-remote-interface client to it.
 *
 * @param {Object} [config] - Options. `host`, `port`, `autoSelectChrome` and
 *     `headless` are read here; the whole object is also handed unchanged to
 *     chrome-remote-interface.
 * @param {string} [config.host='localhost'] - Host forwarded to the launcher.
 * @param {number} [config.port=9222] - Remote debugging port.
 * @param {boolean} [config.autoSelectChrome=true] - Forwarded to the launcher.
 * @param {boolean} [config.headless=true] - Adds `--headless` when true.
 * @returns {Promise<Array>} Resolves to `[chromeInstance, remoteInterface]`.
 *     Rejects with the original error if launching or connecting fails.
 */
const prepareAPI = (config = {}) => {
    const {host = 'localhost', port = 9222, autoSelectChrome = true, headless = true} = config;

    // Only add the headless switch when requested; never pass an empty flag.
    const additionalFlags = ['--disable-gpu'];
    if (headless) {
        additionalFlags.push('--headless');
    }

    return chromeLauncher.launch({
        host,
        port,
        autoSelectChrome,
        additionalFlags
    }).then(chromeInstance => chromeRemoteInterface(config).then(
        chromeAPI => [chromeInstance, chromeAPI],
        (err) => {
            // The browser is already running: stop it so it does not linger
            // (and keep the debugging port busy), then surface the original error.
            const rethrow = () => {
                throw err;
            };
            return Promise.resolve(chromeInstance.kill()).then(rethrow, rethrow);
        }
    ));
};
const wrapper = require('the-wrapper-module');

/**
 * Turns a `window.performance.timing` snapshot into per-phase durations.
 * Each value is the difference of two timestamps from the snapshot
 * (milliseconds, per the Navigation Timing specification).
 *
 * @param {Object} [performanceTiming] - Plain object with the timing fields;
 *     a falsy value is treated as an empty object (every duration is NaN).
 * @returns {Object} `{redirect, dns, tcp, request, response, domReady, load}`.
 */
const performanceParser = (performanceTiming) => {
    const timing = performanceTiming || {};
    return {
        // Redirect phase is end minus start.
        redirect: timing.redirectEnd - timing.redirectStart,
        dns: timing.domainLookupEnd - timing.domainLookupStart,
        tcp: timing.connectEnd - timing.connectStart,
        request: timing.responseStart - timing.requestStart,
        response: timing.responseEnd - timing.responseStart,
        domReady: timing.domContentLoadedEventStart - timing.navigationStart,
        load: timing.loadEventStart - timing.navigationStart
    };
};

/**
 * Prints the durations produced by `performanceParser` to the console.
 *
 * @param {Object} [performanceInfo] - Durations keyed by phase name.
 */
const showPerformanceInfo = (performanceInfo) => {
    const info = performanceInfo || {};
    console.log(`页面重定向耗时:${info.redirect}`);
    console.log(`DNS查找耗时:${info.dns}`);
    console.log(`TCP链接耗时:${info.tcp}`);
    console.log(`请求发送耗时:${info.request}`);
    console.log(`响应接收耗时:${info.response}`);
    console.log(`DOMReady耗时:${info.domReady}`);
    console.log(`页面加载耗时:${info.load}`);
};

wrapper.prepareAPI().then(([chromeInstance, remoteInterface]) => {
    const {Runtime, Page} = remoteInterface;

    const logError = (err) => {
        console.error(err);
    };

    // Release the protocol connection and the browser once we are done,
    // otherwise Chrome keeps running and holds the debugging port.
    const shutdown = () => remoteInterface.close()
        .catch(logError)
        .then(() => chromeInstance.kill());

    // Registered before Page.enable() so the first load event is not missed.
    Page.loadEventFired(() => {
        Runtime.evaluate({
            expression: 'window.performance.timing.toJSON()',
            // Without returnByValue only the remote object's meta information
            // comes back and a further getProperties call would be needed.
            returnByValue: true
        }).then(({result, exceptionDetails}) => {
            if (exceptionDetails) {
                throw new Error(`Runtime.evaluate failed: ${JSON.stringify(exceptionDetails)}`);
            }
            showPerformanceInfo(performanceParser(result.value));
        }).catch(logError).then(shutdown).catch(logError);
    });

    return Page.enable().then(() => Page.navigate({
        url: 'http://www.baidu.com'
    })).catch((err) => {
        logError(err);
        return shutdown();
    });
}).catch((err) => {
    console.error(err);
});
在百度中搜索“Web自动化 headless chrome”
// ,并爬取首屏结果连接 — tail of the article sentence above: "...and scrape the result links on the first screen".
const wrapper = require('the-wrapper-module');

/**
 * Clicks the element it is invoked on. Its source text is sent to the page
 * through Runtime.callFunctionOn, which supplies the target as `this`, so it
 * must stay a regular function: an arrow function would not receive `this`.
 */
const buttonClick = function () {
    this.click();
};

/**
 * Fills the search box. Its source text is evaluated inside the page, so it
 * must be self-contained and cannot reference anything from this file.
 */
const setInputValue = () => {
    const input = document.getElementById('kw');
    input.value = 'Web自动化 headless chrome';
};

/**
 * Collects `{title, link}` for every result block on the page. Evaluated
 * inside the page, so it must be self-contained.
 *
 * @returns {Array<{title: string, link: string}>} Blocks without an `h3` or
 *     without a link inside it are skipped instead of throwing.
 */
const parseSearchResult = () => {
    const resultList = [];
    const linkBlocks = document.querySelectorAll('div.result.c-container');
    for (const block of Array.from(linkBlocks)) {
        const heading = block.querySelector('h3');
        const anchor = heading && heading.querySelector('a');
        if (!anchor) {
            continue;
        }
        resultList.push({
            title: heading.textContent,
            link: anchor.getAttribute('href')
        });
    }
    return resultList;
};

wrapper.prepareAPI({
    // headless: false // uncomment to watch what happens in the browser
}).then(([chromeInstance, remoteInterface]) => {
    const {Runtime, DOM, Page, Network} = remoteInterface;

    const logError = (err) => {
        console.error(err);
    };

    // Release the protocol connection and the browser when the run is over.
    const shutdown = () => remoteInterface.close()
        .catch(logError)
        .then(() => chromeInstance.kill());

    // Logs the event name followed by the page's current location.
    const logLocation = (eventName) => {
        console.log(eventName);
        return Runtime.evaluate({
            expression: `window.location.href`,
            returnByValue: true
        }).then((result) => {
            console.log(result);
        }).catch(logError);
    };

    // Fills in the keyword, submits the form and prints the parsed results.
    const runSearch = () => DOM.getDocument()
        // The search form on the Baidu home page.
        .then(({root}) => DOM.querySelector({nodeId: root.nodeId, selector: '#form'}))
        .then(({nodeId}) => Promise.all([
            // Search box: set its value inside the page. The evaluate promise is
            // returned so the click below cannot happen before the value is set.
            // Equivalent expression:
            //   'document.getElementById("kw").value = "Web自动化 headless chrome"'
            // DOM.setNodeValue on this node has no effect here; the DOM-domain
            // alternative is DOM.setAttributeValue({nodeId, name: 'value', value}).
            DOM.querySelector({nodeId, selector: '#kw'}).then(inputNode => Runtime.evaluate({
                expression: `(${setInputValue})()`
            }).then(() => inputNode)),
            // Submit button.
            DOM.querySelector({nodeId, selector: '#su'})
        ]))
        .then(([inputNode, buttonNode]) => {
            // Print the value the page now holds for the search box.
            Runtime.evaluate({
                expression: 'document.getElementById("kw").value'
            }).then(({result}) => {
                console.log(result);
            }).catch(logError);
            return DOM.resolveNode({nodeId: buttonNode.nodeId});
        })
        .then(({object}) => Runtime.callFunctionOn({
            objectId: object.objectId,
            functionDeclaration: `${buttonClick}`
        }))
        // Give the result page three seconds to render before scraping it.
        .then(() => new Promise((resolve) => {
            setTimeout(resolve, 3e3);
        }))
        .then(() => Runtime.evaluate({
            expression: `(${parseSearchResult})()`,
            returnByValue: true
        }))
        .then(({result}) => {
            // Baidu result URLs are encrypted; one more request per link is
            // needed to obtain the real URL.
            console.log(result.value);
        });

    return Promise.all([
        Page.enable(),
        Network.enable(),
        DOM.enable(),
        Page.setAutoAttachToCreatedPages({autoAttach: true})
    ]).then(() => {
        // The result page fires its own load event; without this guard the
        // handler would start another search on every subsequent load.
        let searchStarted = false;

        Page.domContentEventFired(() => {
            logLocation('Page.domContentEventFired');
        });
        Page.frameNavigated(() => {
            logLocation('Page.frameNavigated');
        });
        Page.loadEventFired(() => {
            logLocation('Page.loadEventFired');
            if (searchStarted) {
                return;
            }
            searchStarted = true;
            runSearch().catch(logError).then(shutdown).catch(logError);
        });

        return Page.navigate({
            url: 'http://www.baidu.com'
        });
    }).catch((err) => {
        logError(err);
        return shutdown();
    });
}).catch((err) => {
    console.error(err);
});