nodejs实现网站数据的爬取

时间 2019-11-17

原文：原文链接

 // The site being crawled is served over HTTPS, so Node's built-in https
 // module is used to download it.
 const https = require('https');
 // cheerio loads downloaded page source and exposes a jQuery-like API for
 // querying its elements. It is not a built-in module and has to be installed
 // with a package manager.
 const cheerio = require('cheerio');

 // Page to crawl; the Lagou home page is used as the example.
 const url = "https://www.lagou.com/";

 // Download the page source and pass it to doSomeThing once it is complete.
 https.get(url, function (res) {
   // A redirect or error response does not carry the page we want to parse:
   // report it and drain the body so the connection can be released.
   if (res.statusCode < 200 || res.statusCode >= 300) {
     console.error('Request failed with status code ' + res.statusCode);
     res.resume();
     return;
   }

   // Let the stream decode UTF-8 so that a multi-byte character split across
   // two chunks is reassembled instead of being corrupted. Appending raw
   // Buffer chunks to a string decodes each chunk on its own (also as UTF-8).
   res.setEncoding('utf8');

   let html = '';
   // 'data' fires once per received chunk; a page usually arrives in several.
   res.on('data', function (chunk) {
     html += chunk;
   });
   // 'end' fires when the whole body has been received.
   res.on('end', function () {
     doSomeThing(html);
   });
 }).on('error', function (err) {
   // Without this listener a network-level failure would be thrown as an
   // unhandled 'error' event and terminate the process.
   console.error('Request failed: ' + err.message);
 });

 /**
  * Extracts the menu structure from the page source and prints it.
  *
  * For every element matching ".menu_box" an object is built whose `name` is
  * the trimmed text of the h2 element(s) inside it and whose `subName` is an
  * array with the text of every link inside it. The resulting array is
  * written with console.log; nothing is returned.
  *
  * @param {string} html - Page source to parse.
  */
 function doSomeThing(html) {
   // $ behaves like the jQuery object, scoped to the loaded document.
   const $ = cheerio.load(html);
   const result = [];

   $(".menu_box").each(function (boxIndex, box) {
     const entry = {
       name: $(box).find("h2").text().trim(),
       subName: [],
     };
     $(box).find("a").each(function (linkIndex, link) {
       entry.subName.push($(link).text());
     });
     result.push(entry);
   });

   // Print the collected result.
   console.log(result);
 }

 
  // The site being crawled is served over HTTPS, so Node's built-in https
  // module is used to download it.
  const https = require('https');

  // cheerio loads downloaded page source and exposes a jQuery-like API for
  // querying its elements. It is not a built-in module and has to be installed
  // with a package manager.
  const cheerio = require('cheerio');

  // Page to crawl; the Lagou home page is used as the example.
  const url = "https://www.lagou.com/";

  // Download the page source and pass it to doSomeThing once it is complete.
  https.get(url, function (res) {
    // A redirect or error response does not carry the page we want to parse:
    // report it and drain the body so the connection can be released.
    if (res.statusCode < 200 || res.statusCode >= 300) {
      console.error('Request failed with status code ' + res.statusCode);
      res.resume();
      return;
    }

    // Let the stream decode UTF-8 so that a multi-byte character split across
    // two chunks is reassembled instead of being corrupted. Appending raw
    // Buffer chunks to a string decodes each chunk on its own (also as UTF-8).
    res.setEncoding('utf8');

    let html = '';

    // 'data' fires once per received chunk; a page usually arrives in several.
    res.on('data', function (chunk) {
      html += chunk;
    });

    // 'end' fires when the whole body has been received.
    res.on('end', function () {
      doSomeThing(html);
    });
  }).on('error', function (err) {
    // Without this listener a network-level failure would be thrown as an
    // unhandled 'error' event and terminate the process.
    console.error('Request failed: ' + err.message);
  });
 
  /**
   * Extracts the menu structure from the page source and prints it.
   *
   * For every element matching ".menu_box" an object is built whose `name` is
   * the trimmed text of the h2 element(s) inside it and whose `subName` is an
   * array with the text of every link inside it. The resulting array is
   * written with console.log; nothing is returned.
   *
   * @param {string} html - Page source to parse.
   */
  function doSomeThing(html) {
    // $ behaves like the jQuery object, scoped to the loaded document.
    const $ = cheerio.load(html);
    const categories = [];

    // Gathers the text of every link found underneath the given element.
    const collectLinkTexts = (root) => {
      const texts = [];
      $(root).find("a").each((_position, anchor) => {
        texts.push($(anchor).text());
      });
      return texts;
    };

    $(".menu_box").each((_position, box) => {
      categories.push({
        name: $(box).find("h2").text().trim(),
        subName: collectLinkTexts(box),
      });
    });

    // Print the collected result.
    console.log(categories);
  }