作者：嵇智

ParserBlock

我们在上一篇 ParserCore 讲到了，首先调用 normalize 这个 rule 函数将 Linux("\n") 与 Windows("\r\n") 的换行符统一处理成 "\n"。接着就走到 ParserBlock.parse 的流程。这一步主要是产出 block 为 true 的 token。如下图所示：

module.exports = function block(state) {
  var token;

  if (state.inlineMode) {
    token          = new state.Token('inline', '', 0);
    token.content  = state.src;
    token.map      = [ 0, 1 ];
    token.children = [];
    state.tokens.push(token);
  } else {
    state.md.block.parse(state.src, state.md, state.env, state.tokens);
  }
};
复制代码

parse 函数传入了四个参数：

state.src 代表用户传入的字符串
state.md 是指当前 md 实例，主要是为了方便拿到 md 上的属性与方法
state.env 是在调用 md.parse 时注入的一些额外的数据，默认是 {}，通常不需要它，除非你要做一些定制化的开发
state.tokens 是 tokens 数组的引用。注意：不能在 rule 函数里面更改 tokens 引用，必须保证所有的 rule 函数都是在操作同一份 tokens。

我们再来聚焦 ParserBlock 内部的逻辑。位于 lib/parser_block.js。

// Block rule table consumed by the ParserBlock constructor.
//
// Each entry is [ name, rule function, optional list of chain names ].
// The constructor registers the entries in array order and passes a copy of
// the third element (or an empty array when it is absent) to `ruler.push`
// as the `alt` option. A rule listing 'paragraph' is therefore among the
// rules that paragraph.js fetches with `getRules('paragraph')` to decide
// whether a line terminates the current paragraph.
var _rules = [
  [ 'table',      require('./rules_block/table'),      [ 'paragraph', 'reference' ] ],
  [ 'code',       require('./rules_block/code') ],
  [ 'fence',      require('./rules_block/fence'),      [ 'paragraph', 'reference', 'blockquote', 'list' ] ],
  [ 'blockquote', require('./rules_block/blockquote'), [ 'paragraph', 'reference', 'blockquote', 'list' ] ],
  [ 'hr',         require('./rules_block/hr'),         [ 'paragraph', 'reference', 'blockquote', 'list' ] ],
  [ 'list',       require('./rules_block/list'),       [ 'paragraph', 'reference', 'blockquote' ] ],
  [ 'reference',  require('./rules_block/reference') ],
  [ 'heading',    require('./rules_block/heading'),    [ 'paragraph', 'reference', 'blockquote' ] ],
  [ 'lheading',   require('./rules_block/lheading') ],
  [ 'html_block', require('./rules_block/html_block'), [ 'paragraph', 'reference', 'blockquote' ] ],
  [ 'paragraph',  require('./rules_block/paragraph') ]
];

// Block parser constructor: creates `this.ruler` and registers every entry
// of `_rules` on it, in array order.
function ParserBlock() {
  var idx, entry, chains;

  this.ruler = new Ruler();

  for (idx = 0; idx < _rules.length; idx += 1) {
    entry = _rules[idx];
    // Hand the ruler its own copy of the chain list (empty when the entry
    // has none) so it never shares the array stored in `_rules`.
    chains = (entry[2] || []).slice();
    this.ruler.push(entry[0], entry[1], { alt: chains });
  }
}

// Runs the block rules over the lines [startLine, endLine) of `state`.
//
// state     - block state object; `state.line` is advanced as lines are
//             consumed and `state.tight` is updated after every rule run.
// startLine - index of the first line to process.
// endLine   - index one past the last line to process.
//
// Returns nothing; results are whatever the rules write into `state`.
ParserBlock.prototype.tokenize = function (state, startLine, endLine) {
  var ok, i,
      // Rules registered under the '' chain name.
      rules = this.ruler.getRules(''),
      len = rules.length,
      line = startLine,
      hasEmptyLines = false,
      maxNesting = state.md.options.maxNesting;

  while (line < endLine) {
    // Continue from the line returned by skipEmptyLines
    // (presumably the next non-empty line).
    state.line = line = state.skipEmptyLines(line);
    if (line >= endLine) { break; }

    // Stop when this line's indent (sCount) is below the current block indent.
    if (state.sCount[line] < state.blkIndent) { break; }

    // Nesting limit reached: mark the remaining lines as consumed and stop.
    if (state.level >= maxNesting) {
      state.line = endLine;
      break;
    }
    // Try the rules in order; the first one returning a truthy value wins.
    // The fourth argument is the rules' `silent` parameter.
    for (i = 0; i < len; i++) {
      ok = rules[i](state, line, endLine, false);
      if (ok) { break; }
    }
    // `tight` stays true only while no empty line has been seen in this call.
    state.tight = !hasEmptyLines;
    // The last line consumed by the rule was itself empty.
    if (state.isEmpty(state.line - 1)) {
      hasEmptyLines = true;
    }

    line = state.line;

    // Step over one empty line that follows the block and remember it.
    if (line < endLine && state.isEmpty(line)) {
      hasEmptyLines = true;
      line++;
      state.line = line;
    }
  }
};

// Entry point of the block parser.
//
// src       - source string; nothing happens when it is falsy (e.g. '').
// md, env   - passed through unchanged to the State constructor.
// outTokens - array handed to the State constructor as its token list.
//
// Builds a fresh State and tokenizes all of its lines. Returns nothing.
ParserBlock.prototype.parse = function (src, md, env, outTokens) {
  var blockState;

  if (src) {
    blockState = new this.State(src, md, env, outTokens);
    this.tokenize(blockState, blockState.line, blockState.lineMax);
  }
};


ParserBlock.prototype.State = require('./rules_block/state_block');
复制代码

从构造函数可以看出，ParserBlock 有 11 种 rule，分别为 table、code、fence、blockquote、hr、list、reference、heading、lheading、html_block、paragraph。经过由这些 rule 组成的 rules chain 之后，就能输出 type 为对应类型的 tokens，这也就是 ParserBlock 的作用所在。ruler 用来管理所有的 rule 以及 rule 所属的 chain。

// Excerpt from the ParserBlock constructor: register every entry of _rules
// on the ruler — name, rule function, and a copy of its optional chain list
// (an empty array when the entry has none) as the `alt` option.
for (var i = 0; i < _rules.length; i++) {
  this.ruler.push(_rules[i][0], _rules[i][1], { alt: (_rules[i][2] || []).slice() });
}
复制代码

_rules 是一个二维数组，它的元素也是一个数组，先暂称之为 ruleConfig。ruleConfig 的第一个元素是 rule 的 name，第二个是 rule 的 fn，第三个是 rule 的 alt，也就是所属的职责链。假如 alt 为 ['paragraph', 'reference']，那么如果调用 ruler.getRules('paragraph')，返回的数组中就会包含这个 fn，同时调用 ruler.getRules('reference') 返回的数组中也会包含它，因为 fn 的 alt 数组包含了这两种职责链。

再来看 parse 方法。

// Entry point of the block parser.
//
// src       - source string; nothing happens when it is falsy (e.g. '').
// md, env   - passed through unchanged to the State constructor.
// outTokens - array handed to the State constructor as its token list.
//
// Builds a fresh State and tokenizes all of its lines. Returns nothing.
ParserBlock.prototype.parse = function (src, md, env, outTokens) {
  var blockState;

  if (src) {
    blockState = new this.State(src, md, env, outTokens);
    this.tokenize(blockState, blockState.line, blockState.lineMax);
  }
};
ParserBlock.prototype.State = require('./rules_block/state_block');
复制代码

先了解 ParserBlock 的 State，记得之前 ParserCore 的 State 么？也就是在每一个 Parser 的过程中都有一个 State 实例，用来管理它们在 parse 时的一些状态。ParserBlock 的 State 位于 lib/rules_block/state_block.js。

// Per-parse state for the block parser.
//
// src    - source string to scan.
// md     - stored as `this.md`.
// env    - stored as `this.env`.
// tokens - stored as `this.tokens`.
//
// Besides storing its arguments, the constructor scans `src` once and
// records, for every line, where it starts and ends and how far it is
// indented (see the per-line arrays below).
function StateBlock(src, md, env, tokens) {
  var ch, s, start, pos, len, indent, offset, indent_found;
  this.src = src;
  this.md     = md;

  this.env = env;

  // Output token list.
  this.tokens = tokens

  // Per-line caches, filled by the scan loop below (one entry per line):
  this.bMarks = []   // offset in `src` where the line begins
  this.eMarks = []   // offset in `src` where the line ends (at the "\n", or src.length)
  this.tShift = []   // number of leading space/tab characters (a tab counts as 1)
  this.sCount = []   // leading indent in columns (a tab advances to the next multiple of 4)

  // Initialised to 0 for every line here; presumably adjusted later by
  // individual rules — not visible in this function.
  this.bsCount = []

  // Indent required for the current block; compared against sCount by tokenize and the rules.
  this.blkIndent  = 0

  this.line       = 0         // index of the line currently being processed
  this.lineMax    = 0         // number of lines in src (set at the end of the constructor)
  this.tight      = false     // updated by tokenize after every rule run
  this.ddIndent   = -1        // NOTE(review): not used in this function; meaning not shown here
  this.parentType = 'root'    // paragraph.js temporarily sets this to 'paragraph'

  this.level = 0              // compared against options.maxNesting by tokenize

  this.result = ''
  s = this.src
  indent_found = false

  // Single pass over the source, tracking the current line's start offset
  // (`start`), count of leading whitespace characters (`indent`) and
  // indent width in columns (`offset`).
  for (start = pos = indent = offset = 0, len = s.length; pos < len; pos++) {
    ch = s.charCodeAt(pos);

    if (!indent_found) {
      if (isSpace(ch)) {
        indent++;

        if (ch === 0x09) {
          // Tab: advance to the next multiple of 4 columns.
          offset += 4 - offset % 4;
        } else {
          offset++;
        }
        continue;
      } else {
        indent_found = true;
      }
    }

    // A line ends at "\n" or at the last character of the source.
    if (ch === 0x0A || pos === len - 1) {
      // No trailing newline: move past the last character so that eMarks
      // points at the end of the string.
      if (ch !== 0x0A) { pos++; }
      this.bMarks.push(start);
      this.eMarks.push(pos);
      this.tShift.push(indent);
      this.sCount.push(offset);
      this.bsCount.push(0);

      // Reset the per-line trackers for the next line.
      indent_found = false;
      indent = 0;
      offset = 0;
      start = pos + 1;
    }
  }

  // Extra trailing entry positioned at the end of the source, so the arrays
  // hold one more element than there are lines.
  this.bMarks.push(s.length);
  this.eMarks.push(s.length);
  this.tShift.push(0);
  this.sCount.push(0);
  this.bsCount.push(0);

  // The trailing entry is not counted as a line.
  this.lineMax = this.bMarks.length - 1;
}
复制代码

理解 State 上的属性的作用，是很关键的。因为这些属性都是接下来 tokenize 所依赖的信息。重点关注如下的属性：

tokens

tokenize 以后的 token 组成的数组
bMarks

存储每一行的起始位置，由于 parse 的过程是根据换行符逐行扫描
eMarks

存储每一行的终止位置
tShift

存储每一行行首空白字符（空格、制表符）的个数，也就是第一个非空白字符相对行首的偏移（制表符长度只算作 1）
sCount

存储每一行第一个非空白字符所在的缩进列数（制表符会把列数补齐到下一个 4 的倍数，即最多算作 4）
bsCount

通常为 0
blkIndent

通常为 0
line

当前所在行数。tokenize 的时候逐行扫描会用到
lineMax

src 被分割成了多少行

以上都是在 tokenize 过程中非常有用的属性。接下来看一下 tokenize 的过程，之后就生成了 block 为 true 的 token。

// Runs the block rules over the lines [startLine, endLine) of `state`.
//
// state     - block state object; `state.line` is advanced as lines are
//             consumed and `state.tight` is updated after every rule run.
// startLine - index of the first line to process.
// endLine   - index one past the last line to process.
//
// Returns nothing; results are whatever the rules write into `state`.
ParserBlock.prototype.tokenize = function (state, startLine, endLine) {
  var ok, i,
      // Rules registered under the '' chain name.
      rules = this.ruler.getRules(''),
      len = rules.length,
      line = startLine,
      hasEmptyLines = false,
      maxNesting = state.md.options.maxNesting;

  while (line < endLine) {
    // Continue from the line returned by skipEmptyLines
    // (presumably the next non-empty line).
    state.line = line = state.skipEmptyLines(line);
    if (line >= endLine) { break; }

    // Stop when this line's indent (sCount) is below the current block indent.
    if (state.sCount[line] < state.blkIndent) { break; }

    // Nesting limit reached: mark the remaining lines as consumed and stop.
    if (state.level >= maxNesting) {
      state.line = endLine;
      break;
    }

    // Try the rules in order; the first one returning a truthy value wins.
    // The fourth argument is the rules' `silent` parameter.
    for (i = 0; i < len; i++) {
      ok = rules[i](state, line, endLine, false);
      if (ok) { break; }
    }

    // `tight` stays true only while no empty line has been seen in this call.
    state.tight = !hasEmptyLines;
    // The last line consumed by the rule was itself empty.
    if (state.isEmpty(state.line - 1)) {
      hasEmptyLines = true;
    }

    line = state.line;

    // Step over one empty line that follows the block and remember it.
    if (line < endLine && state.isEmpty(line)) {
      hasEmptyLines = true;
      line++;
      state.line = line;
    }
  }
}
复制代码

函数的执行流程如下：

获取 ParserBlock 构造函数声明的所有 rule 函数，因为在 Ruler 类里面规定，内部的 rule 函数一定属于名字为空字符串的 rule chain。当然构造函数还有很多其他的 rule chain，比如 paragraph、reference、blockquote、list，暂时还未用到。同时，声明了很多初始变量。
然后走到一个 while 循环，因为 state_block 存放的信息都是以 src 字符串每一行作为维度区分的，比如每一行的起始位置，每一行的终止位置，每一行第一个字符的位置。这些信息都是特定 rule 所需要的。while 语句的前面部分就是跳过空行、是否达到最大嵌套等级的判断，重点关注这段代码。

// Excerpt from tokenize: try the rules in order and stop at the first one
// that returns a truthy value. The fourth argument is the rules' `silent`
// parameter.
for (i = 0; i < len; i++) {
  ok = rules[i](state, line, endLine, false);
  if (ok) { break; }
}
复制代码

这里的循环，也就是会对 src 的每一行都执行 rule chain，进而产出 token，如果其中一个 rule 返回 true，就跳出循环，从该 rule 更新后的 state.line 继续 tokenize 后面的行。那我们来看下这些 rules 的作用。它们都位于 lib/rules_block 文件夹下面。

table.js

// Block rule `table`: recognises a pipe table whose header is on
// `startLine` and whose delimiter row is on the following line.
//
// state     - block state object; tokens are appended through `state.push`.
// startLine - index of the candidate header line.
// endLine   - index one past the last line the rule may consume.
// silent    - when truthy, only validate: return true without pushing tokens.
//
// Returns true when a table was recognised, false otherwise. In non-silent
// mode `state.line` is set to the first line after the table.
module.exports = function table(state, startLine, endLine, silent) {
  var ch, lineText, pos, i, nextLine, columns, columnCount, token,
      aligns, t, tableLines, tbodyLines;

// A table needs at least two lines (header + delimiter row).
if (startLine + 2 > endLine) { return false; }

  nextLine = startLine + 1;

  if (state.sCount[nextLine] < state.blkIndent) { return false; }

  // Delimiter row indented by 4 or more columns relative to the block: not a table.
  if (state.sCount[nextLine] - state.blkIndent >= 4) { return false; }

  // The delimiter row must start with '|', '-' or ':' and may otherwise
  // contain only those characters and whitespace.
  pos = state.bMarks[nextLine] + state.tShift[nextLine];
  if (pos >= state.eMarks[nextLine]) { return false; }

  ch = state.src.charCodeAt(pos++);
  if (ch !== 0x7C/* | */ && ch !== 0x2D/* - */ && ch !== 0x3A/* : */) { return false; }

  while (pos < state.eMarks[nextLine]) {
    ch = state.src.charCodeAt(pos);

    if (ch !== 0x7C/* | */ && ch !== 0x2D/* - */ && ch !== 0x3A/* : */ && !isSpace(ch)) { return false; }

    pos++;
  }

  // getLine presumably returns the text of the given line — helper not shown here.
  lineText = getLine(state, startLine + 1);

  // Derive one alignment per column from the delimiter row.
  columns = lineText.split('|');
  aligns = [];
  for (i = 0; i < columns.length; i++) {
    t = columns[i].trim();
    if (!t) {
      // An empty cell is allowed only before the first or after the last
      // pipe, never between two columns.
      if (i === 0 || i === columns.length - 1) {
        continue;
      } else {
        return false;
      }
    }

    if (!/^:?-+:?$/.test(t)) { return false; }
    // ':--:' -> center, '--:' -> right, ':--' -> left, '--' -> no alignment.
    if (t.charCodeAt(t.length - 1) === 0x3A/* : */) {
      aligns.push(t.charCodeAt(0) === 0x3A/* : */ ? 'center' : 'right');
    } else if (t.charCodeAt(0) === 0x3A/* : */) {
      aligns.push('left');
    } else {
      aligns.push('');
    }
  }

  // Header row: must contain a pipe and be indented by fewer than 4 columns.
  lineText = getLine(state, startLine).trim();
  if (lineText.indexOf('|') === -1) { return false; }
  if (state.sCount[startLine] - state.blkIndent >= 4) { return false; }
  // Strip one leading/trailing pipe, then split into cells
  // (escapedSplit presumably honours escaped pipes — helper not shown here).
  columns = escapedSplit(lineText.replace(/^\||\|$/g, ''));

  // The header row fixes the number of columns; the delimiter row must not
  // define fewer columns than the header has.
  columnCount = columns.length;
  if (columnCount > aligns.length) { return false; }

  if (silent) { return true; }

  // The end line of the table map is filled in once the body has been scanned.
  token     = state.push('table_open', 'table', 1);
  token.map = tableLines = [ startLine, 0 ];

  token     = state.push('thead_open', 'thead', 1);
  token.map = [ startLine, startLine + 1 ];

  token     = state.push('tr_open', 'tr', 1);
  token.map = [ startLine, startLine + 1 ];

  // One th_open / inline / th_close triple per header cell.
  for (i = 0; i < columns.length; i++) {
    token          = state.push('th_open', 'th', 1);
    token.map      = [ startLine, startLine + 1 ];
    if (aligns[i]) {
      token.attrs  = [ [ 'style', 'text-align:' + aligns[i] ] ];
    }

    token          = state.push('inline', '', 0);
    token.content  = columns[i].trim();
    token.map      = [ startLine, startLine + 1 ];
    token.children = [];

    token          = state.push('th_close', 'th', -1);
  }

  token     = state.push('tr_close', 'tr', -1);
  token     = state.push('thead_close', 'thead', -1);

  // The end line of the tbody map is filled in together with the table's.
  token     = state.push('tbody_open', 'tbody', 1);
  token.map = tbodyLines = [ startLine + 2, 0 ];

  // Body rows: continue until a line is under-indented, has no pipe, or is
  // indented by 4 or more columns.
  for (nextLine = startLine + 2; nextLine < endLine; nextLine++) {
    if (state.sCount[nextLine] < state.blkIndent) { break; }

    lineText = getLine(state, nextLine).trim();
    if (lineText.indexOf('|') === -1) { break; }
    if (state.sCount[nextLine] - state.blkIndent >= 4) { break; }
    columns = escapedSplit(lineText.replace(/^\||\|$/g, ''));

    token = state.push('tr_open', 'tr', 1);
    // Always emit exactly columnCount cells: surplus cells in the row are
    // ignored and missing ones get empty content.
    for (i = 0; i < columnCount; i++) {
      token          = state.push('td_open', 'td', 1);
      if (aligns[i]) {
        token.attrs  = [ [ 'style', 'text-align:' + aligns[i] ] ];
      }

      token          = state.push('inline', '', 0);
      token.content  = columns[i] ? columns[i].trim() : '';
      token.children = [];

      token          = state.push('td_close', 'td', -1);
    }
    token = state.push('tr_close', 'tr', -1);
  }
  token = state.push('tbody_close', 'tbody', -1);
  token = state.push('table_close', 'table', -1);

  // Both maps end at the first line that is not part of the table.
  tableLines[1] = tbodyLines[1] = nextLine;
  state.line = nextLine;
  return true;
}
复制代码

table 这个 rule 就是用来生成 table 相关的 token（最终渲染成 table 的 HTML 字符串）的。内部的解析都是根据 markdown 写 table 的规范而来的，详细逻辑这里就不展开，如果感兴趣，可以写个 demo 自己打断点试试。

code.js

module.exports = function code(state, startLine, endLine/*, silent*/) {
  var nextLine, last, token;

  if (state.sCount[startLine] - state.blkIndent < 4) { return false; }

  last = nextLine = startLine + 1;

  while (nextLine < endLine) {
    if (state.isEmpty(nextLine)) {
      nextLine++;
      continue;
    }

    if (state.sCount[nextLine] - state.blkIndent >= 4) {
      nextLine++;
      last = nextLine;
      continue;
    }
    break;
  }

  state.line = last;

  token         = state.push('code_block', 'code', 0);
  token.content = state.getLines(startLine, last, 4 + state.blkIndent, true);
  token.map     = [ startLine, state.line ];

  return true;
};
复制代码

code rule 的作用也很简单，它认为只要你每行的起始位置多于 3 个空格，那就是一个 code_block。比如下面的。

    我现在就是一个 code_block
复制代码

fence.js

// Block rule `fence`: a fenced code block opened by a run of at least three
// '~' or '`' characters.
//
// state     - block state object; one `fence` token is pushed on success.
// startLine - index of the candidate opening line.
// endLine   - index one past the last line the rule may consume.
// silent    - when truthy, only validate the opening line and return true
//             without pushing a token.
//
// Returns true when an opening fence was recognised, false otherwise. In
// non-silent mode `state.line` is set past the block (and past the closing
// fence when one was found).
module.exports = function fence(state, startLine, endLine, silent) {
  var marker, len, params, nextLine, mem, token, markup,
      haveEndMarker = false,
      pos = state.bMarks[startLine] + state.tShift[startLine],
      max = state.eMarks[startLine];

  // Indented by 4 or more columns relative to the block: not a fence.
  if (state.sCount[startLine] - state.blkIndent >= 4) { return false; }

  // The line is too short to hold three marker characters.
  if (pos + 3 > max) { return false; }

  marker = state.src.charCodeAt(pos);

  if (marker !== 0x7E/* ~ */ && marker !== 0x60 /* ` */) {
    return false;
  }

  // Measure the run of marker characters
  // (skipChars presumably returns the position just past the run).
  mem = pos;
  pos = state.skipChars(pos, marker);

  len = pos - mem;

  if (len < 3) { return false; }

  // `markup` is the opening run itself; `params` is the rest of the line.
  markup = state.src.slice(mem, pos);
  params = state.src.slice(pos, max);

  // The rest of the opening line must not contain the marker character.
  if (params.indexOf(String.fromCharCode(marker)) >= 0) { return false; }

  // Since start is found, we can report success here in validation mode
  if (silent) { return true; }

  // search end of block
  nextLine = startLine;

  for (;;) {
    nextLine++;
    if (nextLine >= endLine) {
      // No closing fence before endLine: the block runs to endLine.
      break;
    }

    pos = mem = state.bMarks[nextLine] + state.tShift[nextLine];
    max = state.eMarks[nextLine];

    // A non-empty line indented less than the current block ends the fence
    // without a closing marker.
    if (pos < max && state.sCount[nextLine] < state.blkIndent) {
      break;
    }

    if (state.src.charCodeAt(pos) !== marker) { continue; }

    // A candidate closing fence indented by 4 or more columns does not count.
    if (state.sCount[nextLine] - state.blkIndent >= 4) {
      continue;
    }

    pos = state.skipChars(pos, marker);

    // The closing run must be at least as long as the opening one.
    if (pos - mem < len) { continue; }

    // Nothing may follow the closing run
    // (skipSpaces presumably advances past trailing spaces).
    pos = state.skipSpaces(pos);

    if (pos < max) { continue; }

    haveEndMarker = true;
    // found!
    break;
  }

  // `len` is reused from here on: it now holds the indent of the opening
  // line, passed to getLines below.
  len = state.sCount[startLine];

  // Skip the closing fence line as well when there is one.
  state.line = nextLine + (haveEndMarker ? 1 : 0);

  token         = state.push('fence', 'code', 0);
  token.info    = params;
  token.content = state.getLines(startLine + 1, nextLine, len, true);
  token.markup  = markup;
  token.map     = [ startLine, state.line ];

  return true;
};

复制代码

fence rule 类似于 code rule。它代表具有语言类型的 code_block。比如 javascript、shell、css、stylus 等等。举个栗子：

```shell
echo 'done'
```
复制代码

上面就会解析生成一个 type 为 fence，info 为 shell，markup 为 "```" 的 token。

blockquote.js

代码太长，就不贴代码了，blockquote 的作用就是生成 markup 为 > 的 token。下面就是一个 blockquote。

> i am a blockquote

hr.js

module.exports = function hr(state, startLine, endLine, silent) {
  var marker, cnt, ch, token,
      pos = state.bMarks[startLine] + state.tShift[startLine],
      max = state.eMarks[startLine];

  // if it's indented more than 3 spaces, it should be a code block
  if (state.sCount[startLine] - state.blkIndent >= 4) { return false; }

  marker = state.src.charCodeAt(pos++);

  // Check hr marker
  if (marker !== 0x2A/* * */ &&
      marker !== 0x2D/* - */ &&
      marker !== 0x5F/* _ */) {
    return false;
  }

  // markers can be mixed with spaces, but there should be at least 3 of them

  cnt = 1;
  while (pos < max) {
    ch = state.src.charCodeAt(pos++);
    if (ch !== marker && !isSpace(ch)) { return false; }
    if (ch === marker) { cnt++; }
  }

  if (cnt < 3) { return false; }

  if (silent) { return true; }

  state.line = startLine + 1;

  token        = state.push('hr', 'hr', 0);
  token.map    = [ startLine, state.line ];
  token.markup = Array(cnt + 1).join(String.fromCharCode(marker));

  return true;
};
复制代码

hr rule 也很简单，就是生成 type 为 hr 的 token。它的 markup 是 ***、---、___，也就是在 md 文件写这三种语法，都能解析出 <hr> 标签。

list.js

list 作用是为了解析有序列表以及无序列表的。详细的逻辑比较复杂，需要了解的可以自己通过 demo 断点调试。

reference.js

reference 作用是为了解析链接引用定义（link reference definition）。在 md 里的语法类似于 [reference]: http://www.baidu.com 这种，定义之后就可以在正文中通过 [文字][reference] 的写法引用这个超链接。

heading.js

module.exports = function heading(state, startLine, endLine, silent) {
  var ch, level, tmp, token,
      pos = state.bMarks[startLine] + state.tShift[startLine],
      max = state.eMarks[startLine];

  // if it's indented more than 3 spaces, it should be a code block
  if (state.sCount[startLine] - state.blkIndent >= 4) { return false; }

  ch  = state.src.charCodeAt(pos);

  if (ch !== 0x23/* # */ || pos >= max) { return false; }

  // count heading level
  level = 1;
  ch = state.src.charCodeAt(++pos);
  while (ch === 0x23/* # */ && pos < max && level <= 6) {
    level++;
    ch = state.src.charCodeAt(++pos);
  }

  if (level > 6 || (pos < max && !isSpace(ch))) { return false; }

  if (silent) { return true; }

  // Let's cut tails like ' ### ' from the end of string

  max = state.skipSpacesBack(max, pos);
  tmp = state.skipCharsBack(max, 0x23, pos); // #
  if (tmp > pos && isSpace(state.src.charCodeAt(tmp - 1))) {
    max = tmp;
  }

  state.line = startLine + 1;

  token        = state.push('heading_open', 'h' + String(level), 1);
  token.markup = '########'.slice(0, level);
  token.map    = [ startLine, state.line ];

  token          = state.push('inline', '', 0);
  token.content  = state.src.slice(pos, max).trim();
  token.map      = [ startLine, state.line ];
  token.children = [];

  token        = state.push('heading_close', 'h' + String(level), -1);
  token.markup = '########'.slice(0, level);

  return true;
};
复制代码

heading 作用是解析标题标签(h1 - h6)。它的语法主要是 #, ## 等等。

lheading.js

lheading 是解析自带分隔符的标题标签（即在文字下一行用 === 或 --- 标记的标题），比如下面

这是一个标题
========

// 上面会渲染成

<h1>这是一个标题</h1>
复制代码

html_block.js

html_block 是解析 HTML，如果你在 md 里面写 HTML 标签，那么最后还是会得到 HTML 字符串，比如你写如下字符串：

let src = "<p>234</p>"

// Parsing it yields the following token

let token = [
  {
    "type": "html_block",
    "tag": "",
    "attrs": null,
    "map": [
      0,
      1
    ],
    "nesting": 0,
    "level": 0,
    "children": null,
    "content": "<p>234</p>",
    "markup": "",
    "info": "",
    "meta": null,
    "block": true,
    "hidden": false
  }
]

最后输出的字符串也是 `<p>234</p>`
复制代码

paragraph.js

module.exports = function paragraph(state, startLine/*, endLine*/) {
  var content, terminate, i, l, token, oldParentType,
      nextLine = startLine + 1,
      terminatorRules = state.md.block.ruler.getRules('paragraph'),
      endLine = state.lineMax;

  oldParentType = state.parentType;
  state.parentType = 'paragraph';

  // jump line-by-line until empty one or EOF
  for (; nextLine < endLine && !state.isEmpty(nextLine); nextLine++) {
    // this would be a code block normally, but after paragraph
    // it's considered a lazy continuation regardless of what's there
    if (state.sCount[nextLine] - state.blkIndent > 3) { continue; }

    // quirk for blockquotes, this line should already be checked by that rule
    if (state.sCount[nextLine] < 0) { continue; }

    // Some tags can terminate paragraph without empty line.
    terminate = false;
    for (i = 0, l = terminatorRules.length; i < l; i++) {
      if (terminatorRules[i](state, nextLine, endLine, true)) {
        terminate = true;
        break;
      }
    }
    if (terminate) { break; }
  }

  content = state.getLines(startLine, nextLine, state.blkIndent, false).trim();

  state.line = nextLine;

  token          = state.push('paragraph_open', 'p', 1);
  token.map      = [ startLine, state.line ];

  token          = state.push('inline', '', 0);
  token.content  = content;
  token.map      = [ startLine, state.line ];
  token.children = [];

  token          = state.push('paragraph_close', 'p', -1);

  state.parentType = oldParentType;

  return true;
};
复制代码

paragraph 那就很简单也是常常用到的，就是生成 p 标签。

总结

综上，可以看出 ParserBlock 的流程还是非常的复杂与繁琐的。首先它拥有自己的 block_state，block_state 存储了 ParserBlock 在 tokenize 过程中需要的很多信息，它的处理是以 src 换行符为维度。接着在 tokenize 的过程中逐行对字符串运用不同的 rule 函数，生成对应类型的 token，这样就完成了 ParserBlock 的 parse 过程。

在 ParserBlock 处理之后，可能会生成一种 type 为 inline 的 token。这种 token 属于未完全解析的 token。举个栗子：

const src = '__ad__'

// After being processed by parse

const generatedTokens = [
  {
    "type": "paragraph_open",
    "tag": "p",
    ......
  },
  {
    "type": "inline",
    "tag": "",
    "attrs": null,
    "map": [
      0,
      1
    ],
    "nesting": 0,
    "level": 1,
    "children": [
      {
        "type": "text",
        "tag": "",
        ......
      },
      {
        "type": "strong_open",
        "tag": "strong",
        ......
      },
      {
        "type": "text",
        "tag": "",
        ......
      },
      {
        "type": "strong_close",
        "tag": "strong",
        ......
      },
      {
        "type": "text",
        "tag": "",
        ......
      }
    ],
    "content": "__ad__",
    "markup": "",
    "info": "",
    "meta": null,
    "block": true,
    "hidden": false
  },
  {
    "type": "paragraph_close",
    ......
  }
]

// The second token in the array has type inline; note its children property.
// Where do the tokens in children come from?
复制代码

本来由 ParserBlock 处理之后，children 为空，但这样的话，第二个 token 的 content 属性是 "__ad__"，说明加粗的语法还未解析，因此 ParserBlock 的处理还不够，我们还需要更细粒度的 token，那么这就是 ParserInline 的由来。它的作用就是编译 type 为 inline 的 token，并将更细粒度的 token 放在它的 children 属性上，这也就是 generatedTokens 第二项的 children 属性值的由来。

markdown源码分析4-ParserBlock

ParserBlock

总结