作者:嵇智
我们在上一篇 ParserCore 讲到了,首先调用 normalize 这个 rule 函数,将 Linux("\n")与 Windows("\r\n")的换行符统一处理成 "\n"。接着就走到 ParserBlock.parse 的流程。这一步主要是产出 block 为 true 的 token。如下所示:
module.exports = function block(state) { var token; if (state.inlineMode) { token = new state.Token('inline', '', 0); token.content = state.src; token.map = [ 0, 1 ]; token.children = []; state.tokens.push(token); } else { state.md.block.parse(state.src, state.md, state.env, state.tokens); } }; 复制代码
parse 函数传入了四个参数:
state.src:待解析的 markdown 源字符串
state.md:当前的 MarkdownIt 实例
state.env:额外的环境数据,默认是 {},一般不会需要它,除非你要做一些定制化的开发
state.tokens:用来存放产出 token 的数组
我们再来聚焦 ParserBlock 内部的逻辑。位于 lib/parser_block.js。
// Block rule table. Each entry is
//   [ rule name, rule function, optional list of alternate chain names ].
// The third element is passed to the ruler as `alt`; entries without it get
// an empty list.
var _rules = [
  [ 'table',      require('./rules_block/table'),      [ 'paragraph', 'reference' ] ],
  [ 'code',       require('./rules_block/code') ],
  [ 'fence',      require('./rules_block/fence'),      [ 'paragraph', 'reference', 'blockquote', 'list' ] ],
  [ 'blockquote', require('./rules_block/blockquote'), [ 'paragraph', 'reference', 'blockquote', 'list' ] ],
  [ 'hr',         require('./rules_block/hr'),         [ 'paragraph', 'reference', 'blockquote', 'list' ] ],
  [ 'list',       require('./rules_block/list'),       [ 'paragraph', 'reference', 'blockquote' ] ],
  [ 'reference',  require('./rules_block/reference') ],
  [ 'heading',    require('./rules_block/heading'),    [ 'paragraph', 'reference', 'blockquote' ] ],
  [ 'lheading',   require('./rules_block/lheading') ],
  [ 'html_block', require('./rules_block/html_block'), [ 'paragraph', 'reference', 'blockquote' ] ],
  [ 'paragraph',  require('./rules_block/paragraph') ]
];

/**
 * Block-level parser. Creates a ruler and registers every entry of `_rules`
 * with it, in table order.
 */
function ParserBlock() {
  this.ruler = new Ruler();

  for (var i = 0; i < _rules.length; i++) {
    // `.slice()` gives each rule its own copy of the alt list.
    this.ruler.push(_rules[i][0], _rules[i][1], { alt: (_rules[i][2] || []).slice() });
  }
}

/**
 * Runs the rules of the default chain (`getRules('')`) over the lines
 * `startLine` (inclusive) to `endLine` (exclusive).
 *
 * @param {object} state - block state; rules are expected to advance `state.line`.
 * @param {number} startLine - first line to process.
 * @param {number} endLine - line at which processing stops.
 */
ParserBlock.prototype.tokenize = function (state, startLine, endLine) {
  var ok, i,
      rules = this.ruler.getRules(''),
      len = rules.length,
      line = startLine,
      hasEmptyLines = false,
      maxNesting = state.md.options.maxNesting;

  while (line < endLine) {
    state.line = line = state.skipEmptyLines(line);
    if (line >= endLine) { break; }

    // Stop when the line is indented less than the current block indent.
    if (state.sCount[line] < state.blkIndent) { break; }

    // Nesting limit reached: mark everything up to endLine as consumed.
    if (state.level >= maxNesting) {
      state.line = endLine;
      break;
    }

    // Try the rules in order; the first one returning a truthy value wins.
    for (i = 0; i < len; i++) {
      ok = rules[i](state, line, endLine, false);
      if (ok) { break; }
    }

    // `tight` reflects empty lines seen before the block just processed.
    state.tight = !hasEmptyLines;

    // The line just before the new position being empty also counts.
    if (state.isEmpty(state.line - 1)) {
      hasEmptyLines = true;
    }

    line = state.line;

    // Step over a single empty line that follows the block.
    if (line < endLine && state.isEmpty(line)) {
      hasEmptyLines = true;
      line++;
      state.line = line;
    }
  }
};

/**
 * Entry point: builds a block state for `src` and tokenizes all of its lines.
 * Does nothing when `src` is falsy.
 *
 * @param {string} src - source text.
 * @param {object} md - parser instance, handed to the state.
 * @param {object} env - environment object, handed to the state.
 * @param {Array} outTokens - array handed to the state as its token list.
 */
ParserBlock.prototype.parse = function (src, md, env, outTokens) {
  var state;

  if (!src) { return; }

  state = new this.State(src, md, env, outTokens);

  this.tokenize(state, state.line, state.lineMax);
};

// State constructor used by `parse`.
ParserBlock.prototype.State = require('./rules_block/state_block');
从构造函数可以看出,ParserBlock 有 11 种 rule,分别为 table、code、fence、blockquote、hr、list、reference、heading、lheading、html_block、paragraph。经过由这些 rule 组成的 rules chain 之后,就能输出 type 为对应类型的 tokens,这也就是 ParserBlock 的作用所在。ruler 用来管理所有的 rule 以及 rule 所属的 chain。
// Register every rule with the ruler: its name, its function, and a copy of
// its optional `alt` chain list (an empty list when the entry has none).
for (var i = 0; i < _rules.length; i++) {
  this.ruler.push(_rules[i][0], _rules[i][1], { alt: (_rules[i][2] || []).slice() });
}
_rules 是一个二维数组,它的元素也是一个数组,先暂称之为 ruleConfig。ruleConfig 的第一个元素是 rule 的 name,第二个是 rule 的 fn,第三个是 rule 的 alt,也就是所属的职责链。假如 alt 为 ['paragraph', 'reference'],那么如果调用 ruler.getRules('paragraph') 就能返回 [fn],同时调用 ruler.getRules('reference') 也能返回 [fn],因为 fn 的 alt 数组包含了这两种职责链。
再来看 parse 方法。
/**
 * Entry point: builds a block state for `src` and tokenizes all of its lines.
 * Does nothing when `src` is falsy.
 *
 * @param {string} src - source text.
 * @param {object} md - parser instance, handed to the state.
 * @param {object} env - environment object, handed to the state.
 * @param {Array} outTokens - array handed to the state as its token list.
 */
ParserBlock.prototype.parse = function (src, md, env, outTokens) {
  var state;

  if (!src) { return; }

  state = new this.State(src, md, env, outTokens);

  this.tokenize(state, state.line, state.lineMax);
};

// State constructor used by `parse`.
ParserBlock.prototype.State = require('./rules_block/state_block');
先了解 ParserBlock 的 State,记得之前 ParserCore 的 State 么?也就是在每一个 Parser 的过程中都有一个 State 实例,用来管理它们在 parse 过程中的一些状态。ParserBlock 的 State 位于 lib/rules_block/state_block.js。
/**
 * Block parser state. Splits `src` into lines once, up front, and records
 * per-line offsets that the block rules read later.
 *
 * @param {string} src - source text (line breaks are "\n").
 * @param {object} md - parser instance, stored as `this.md`.
 * @param {object} env - environment object, stored as `this.env`.
 * @param {Array} tokens - token list, stored as `this.tokens`.
 */
function StateBlock(src, md, env, tokens) {
  var ch, s, start, pos, len, indent, offset, indent_found;

  this.src = src;
  this.md = md;
  this.env = env;
  this.tokens = tokens;

  this.bMarks = [];   // offset where each line begins
  this.eMarks = [];   // offset where each line ends
  this.tShift = [];   // leading whitespace characters per line (a tab counts as 1)
  this.sCount = [];   // indent width per line (a tab advances to the next multiple of 4)
  this.bsCount = [];  // filled with 0 for every line here

  this.blkIndent = 0;
  this.line = 0;
  this.lineMax = 0;
  this.tight = false;
  this.ddIndent = -1;
  this.parentType = 'root';
  this.level = 0;
  this.result = '';

  s = this.src;
  indent_found = false;

  for (start = pos = indent = offset = 0, len = s.length; pos < len; pos++) {
    ch = s.charCodeAt(pos);

    if (!indent_found) {
      if (isSpace(ch)) {
        indent++;

        if (ch === 0x09) {
          offset += 4 - offset % 4;
        } else {
          offset++;
        }
        continue;
      } else {
        indent_found = true;
      }
    }

    if (ch === 0x0A || pos === len - 1) {
      // The last line may lack a trailing "\n"; include its final character.
      if (ch !== 0x0A) { pos++; }
      this.bMarks.push(start);
      this.eMarks.push(pos);
      this.tShift.push(indent);
      this.sCount.push(offset);
      this.bsCount.push(0);

      indent_found = false;
      indent = 0;
      offset = 0;
      start = pos + 1;
    }
  }

  // Extra entry past the last real line, positioned at the end of the source.
  this.bMarks.push(s.length);
  this.eMarks.push(s.length);
  this.tShift.push(0);
  this.sCount.push(0);
  this.bsCount.push(0);

  // Number of real lines (the extra entry is not counted).
  this.lineMax = this.bMarks.length - 1;
}
理解 State 上的属性的作用,是很关键的。因为这些属性都是接下来 tokenize 所依赖的信息。重点关注如下的属性:
tokens
tokenize 以后的 token 组成的数组
bMarks
存储每一行的起始位置,由于 parse 的过程是根据换行符逐行扫描
eMarks
存储每一行的终止位置
tShift
存储每一行行首空白字符的个数,即第一个非空白字符相对行首的偏移(制表符只算作 1)
sCount
存储每一行的缩进宽度,即第一个非空白字符所在的列(制表符按 4 列对齐展开)
bsCount
通常为 0
blkIndent
通常为 0
line
当前所在行数。tokenize 的时候逐行扫描会用到
lineMax
src 被分割成了多少行
以上都是在 tokenize 过程中非常有用的属性。接下来看一下 tokenize 的过程,之后就生成了 block 为 true 的 token。
/**
 * Runs the rules of the default chain (`getRules('')`) over the lines
 * `startLine` (inclusive) to `endLine` (exclusive).
 *
 * @param {object} state - block state; rules are expected to advance `state.line`.
 * @param {number} startLine - first line to process.
 * @param {number} endLine - line at which processing stops.
 */
ParserBlock.prototype.tokenize = function (state, startLine, endLine) {
  var ok, i,
      rules = this.ruler.getRules(''),
      len = rules.length,
      line = startLine,
      hasEmptyLines = false,
      maxNesting = state.md.options.maxNesting;

  while (line < endLine) {
    state.line = line = state.skipEmptyLines(line);
    if (line >= endLine) { break; }

    // Stop when the line is indented less than the current block indent.
    if (state.sCount[line] < state.blkIndent) { break; }

    // Nesting limit reached: mark everything up to endLine as consumed.
    if (state.level >= maxNesting) {
      state.line = endLine;
      break;
    }

    // Try the rules in order; the first one returning a truthy value wins.
    for (i = 0; i < len; i++) {
      ok = rules[i](state, line, endLine, false);
      if (ok) { break; }
    }

    // `tight` reflects empty lines seen before the block just processed.
    state.tight = !hasEmptyLines;

    // The line just before the new position being empty also counts.
    if (state.isEmpty(state.line - 1)) {
      hasEmptyLines = true;
    }

    line = state.line;

    // Step over a single empty line that follows the block.
    if (line < endLine && state.isEmpty(line)) {
      hasEmptyLines = true;
      line++;
      state.line = line;
    }
  }
};
函数的执行流程如下:
获取 ParserBlock 构造函数声明的所有 rule 函数,因为在 Ruler 类里面规定,内部的 rule 函数一定属于名字为空字符串的 rule chain。当然构造函数还有很多其他的 rule chain,比如 paragraph、reference、blockquote、list,暂时还未用到。同时,声明了很多初始变量。
然后走到一个 while 循环,因为 state_block 存放的信息都是以 src 字符串每一行作为维度区分的,比如每一行的起始位置,每一行的终止位置,每一行第一个字符的位置。这些信息都是特定 rule 所需要的。while 语句的前面部分就是跳过空行、是否达到最大嵌套等级的判断,重点关注这行代码。
// Try the rules in order on the current line; the first one that returns a
// truthy value wins and the remaining rules are skipped.
for (i = 0; i < len; i++) {
  ok = rules[i](state, line, endLine, false);
  if (ok) { break; }
}
这里的循环,也就是会对 src 的每一行都执行 rule chain,进而产出 token,如果其中一个 rule 返回 true,就跳出循环,准备 tokenize 下一行。那我们来看下这些 rules 的作用。它们都位于 lib/rules_block 文件夹下面。
module.exports = function table(state, startLine, endLine, silent) { var ch, lineText, pos, i, nextLine, columns, columnCount, token, aligns, t, tableLines, tbodyLines; if (startLine + 2 > endLine) { return false; } nextLine = startLine + 1; if (state.sCount[nextLine] < state.blkIndent) { return false; } if (state.sCount[nextLine] - state.blkIndent >= 4) { return false; } pos = state.bMarks[nextLine] + state.tShift[nextLine]; if (pos >= state.eMarks[nextLine]) { return false; } ch = state.src.charCodeAt(pos++); if (ch !== 0x7C/* | */ && ch !== 0x2D/* - */ && ch !== 0x3A/* : */) { return false; } while (pos < state.eMarks[nextLine]) { ch = state.src.charCodeAt(pos); if (ch !== 0x7C/* | */ && ch !== 0x2D/* - */ && ch !== 0x3A/* : */ && !isSpace(ch)) { return false; } pos++; } lineText = getLine(state, startLine + 1); columns = lineText.split('|'); aligns = []; for (i = 0; i < columns.length; i++) { t = columns[i].trim(); if (!t) { if (i === 0 || i === columns.length - 1) { continue; } else { return false; } } if (!/^:?-+:?$/.test(t)) { return false; } if (t.charCodeAt(t.length - 1) === 0x3A/* : */) { aligns.push(t.charCodeAt(0) === 0x3A/* : */ ? 
'center' : 'right'); } else if (t.charCodeAt(0) === 0x3A/* : */) { aligns.push('left'); } else { aligns.push(''); } } lineText = getLine(state, startLine).trim(); if (lineText.indexOf('|') === -1) { return false; } if (state.sCount[startLine] - state.blkIndent >= 4) { return false; } columns = escapedSplit(lineText.replace(/^\||\|$/g, '')); columnCount = columns.length; if (columnCount > aligns.length) { return false; } if (silent) { return true; } token = state.push('table_open', 'table', 1); token.map = tableLines = [ startLine, 0 ]; token = state.push('thead_open', 'thead', 1); token.map = [ startLine, startLine + 1 ]; token = state.push('tr_open', 'tr', 1); token.map = [ startLine, startLine + 1 ]; for (i = 0; i < columns.length; i++) { token = state.push('th_open', 'th', 1); token.map = [ startLine, startLine + 1 ]; if (aligns[i]) { token.attrs = [ [ 'style', 'text-align:' + aligns[i] ] ]; } token = state.push('inline', '', 0); token.content = columns[i].trim(); token.map = [ startLine, startLine + 1 ]; token.children = []; token = state.push('th_close', 'th', -1); } token = state.push('tr_close', 'tr', -1); token = state.push('thead_close', 'thead', -1); token = state.push('tbody_open', 'tbody', 1); token.map = tbodyLines = [ startLine + 2, 0 ]; for (nextLine = startLine + 2; nextLine < endLine; nextLine++) { if (state.sCount[nextLine] < state.blkIndent) { break; } lineText = getLine(state, nextLine).trim(); if (lineText.indexOf('|') === -1) { break; } if (state.sCount[nextLine] - state.blkIndent >= 4) { break; } columns = escapedSplit(lineText.replace(/^\||\|$/g, '')); token = state.push('tr_open', 'tr', 1); for (i = 0; i < columnCount; i++) { token = state.push('td_open', 'td', 1); if (aligns[i]) { token.attrs = [ [ 'style', 'text-align:' + aligns[i] ] ]; } token = state.push('inline', '', 0); token.content = columns[i] ? 
columns[i].trim() : ''; token.children = []; token = state.push('td_close', 'td', -1); } token = state.push('tr_close', 'tr', -1); } token = state.push('tbody_close', 'tbody', -1); token = state.push('table_close', 'table', -1); tableLines[1] = tbodyLines[1] = nextLine; state.line = nextLine; return true; } 复制代码
table 这个 rule 就是用来生成 table 相关的 token(最终会渲染成 table 的 HTML 字符串)。内部的解析都是根据 markdown 写 table 的规范而来的,详细逻辑这里就不展开,如果感兴趣,可以写个 demo 自己打断点试试。
module.exports = function code(state, startLine, endLine/*, silent*/) { var nextLine, last, token; if (state.sCount[startLine] - state.blkIndent < 4) { return false; } last = nextLine = startLine + 1; while (nextLine < endLine) { if (state.isEmpty(nextLine)) { nextLine++; continue; } if (state.sCount[nextLine] - state.blkIndent >= 4) { nextLine++; last = nextLine; continue; } break; } state.line = last; token = state.push('code_block', 'code', 0); token.content = state.getLines(startLine, last, 4 + state.blkIndent, true); token.map = [ startLine, state.line ]; return true; }; 复制代码
code rule 的作用也很简单,它认为只要你每行的起始位置多于 3 个空格,那就是一个 code_block。比如下面的。
    我现在就是一个 code_block
module.exports = function fence(state, startLine, endLine, silent) { var marker, len, params, nextLine, mem, token, markup, haveEndMarker = false, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine]; if (state.sCount[startLine] - state.blkIndent >= 4) { return false; } if (pos + 3 > max) { return false; } marker = state.src.charCodeAt(pos); if (marker !== 0x7E/* ~ */ && marker !== 0x60 /* ` */) { return false; } mem = pos; pos = state.skipChars(pos, marker); len = pos - mem; if (len < 3) { return false; } markup = state.src.slice(mem, pos); params = state.src.slice(pos, max); if (params.indexOf(String.fromCharCode(marker)) >= 0) { return false; } // Since start is found, we can report success here in validation mode if (silent) { return true; } // search end of block nextLine = startLine; for (;;) { nextLine++; if (nextLine >= endLine) { break; } pos = mem = state.bMarks[nextLine] + state.tShift[nextLine]; max = state.eMarks[nextLine]; if (pos < max && state.sCount[nextLine] < state.blkIndent) { break; } if (state.src.charCodeAt(pos) !== marker) { continue; } if (state.sCount[nextLine] - state.blkIndent >= 4) { continue; } pos = state.skipChars(pos, marker); if (pos - mem < len) { continue; } pos = state.skipSpaces(pos); if (pos < max) { continue; } haveEndMarker = true; // found! break; } len = state.sCount[startLine]; state.line = nextLine + (haveEndMarker ? 1 : 0); token = state.push('fence', 'code', 0); token.info = params; token.content = state.getLines(startLine + 1, nextLine, len, true); token.markup = markup; token.map = [ startLine, state.line ]; return true; }; 复制代码
fence rule 类似于 code rule。它代表具有语言类型的 code_block。比如 javascript、shell、css、stylus 等等。举个栗子:
```shell
echo 'done'
```
上面就会解析生成一个 type 为 fence,info 为 shell,markup 为 "```" 的 token。
代码太长,就不贴代码了,blockquote 的作用就是生成 markup 为 > 的 token。下面就是一个 blockquote。
> i am a blockquote
module.exports = function hr(state, startLine, endLine, silent) { var marker, cnt, ch, token, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine]; // if it's indented more than 3 spaces, it should be a code block if (state.sCount[startLine] - state.blkIndent >= 4) { return false; } marker = state.src.charCodeAt(pos++); // Check hr marker if (marker !== 0x2A/* * */ && marker !== 0x2D/* - */ && marker !== 0x5F/* _ */) { return false; } // markers can be mixed with spaces, but there should be at least 3 of them cnt = 1; while (pos < max) { ch = state.src.charCodeAt(pos++); if (ch !== marker && !isSpace(ch)) { return false; } if (ch === marker) { cnt++; } } if (cnt < 3) { return false; } if (silent) { return true; } state.line = startLine + 1; token = state.push('hr', 'hr', 0); token.map = [ startLine, state.line ]; token.markup = Array(cnt + 1).join(String.fromCharCode(marker)); return true; }; 复制代码
hr rule 也很简单,就是生成 type 为 hr 的 token。它的 markup 是 ***、---、___,也就是在 md 文件写这三种语法,都能解析出 <hr> 标签。
list 作用是解析有序列表以及无序列表。详细的逻辑比较复杂,需要了解的可以自己通过 demo 断点调试。
reference 作用是解析链接引用定义(link reference definition)。它在 md 里的语法类似于 [reference]: http://www.baidu.com "title" 这种;而 [text](http://www.baidu.com) 这样的行内链接则是交给后面的 ParserInline 处理的。
module.exports = function heading(state, startLine, endLine, silent) { var ch, level, tmp, token, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine]; // if it's indented more than 3 spaces, it should be a code block if (state.sCount[startLine] - state.blkIndent >= 4) { return false; } ch = state.src.charCodeAt(pos); if (ch !== 0x23/* # */ || pos >= max) { return false; } // count heading level level = 1; ch = state.src.charCodeAt(++pos); while (ch === 0x23/* # */ && pos < max && level <= 6) { level++; ch = state.src.charCodeAt(++pos); } if (level > 6 || (pos < max && !isSpace(ch))) { return false; } if (silent) { return true; } // Let's cut tails like ' ### ' from the end of string max = state.skipSpacesBack(max, pos); tmp = state.skipCharsBack(max, 0x23, pos); // # if (tmp > pos && isSpace(state.src.charCodeAt(tmp - 1))) { max = tmp; } state.line = startLine + 1; token = state.push('heading_open', 'h' + String(level), 1); token.markup = '########'.slice(0, level); token.map = [ startLine, state.line ]; token = state.push('inline', '', 0); token.content = state.src.slice(pos, max).trim(); token.map = [ startLine, state.line ]; token.children = []; token = state.push('heading_close', 'h' + String(level), -1); token.markup = '########'.slice(0, level); return true; }; 复制代码
heading 作用是解析标题标签(h1 - h6)。它的语法主要是 #、## 等等。
lheading 是解析自带分隔符的标题,比如下面
这是一个标题
========
// 上面会渲染成
<h1>这是一个标题</h1>
复制代码
html_block 是解析 HTML,如果你在 md 里面写 HTML 标签,那么最后还是会得到 HTML 字符串,比如你写如下字符串:
// Input: a markdown source that is a single line of raw HTML.
let src = "<p>234</p>"

// Parsing it yields the following token list.
let token = [
  {
    "type": "html_block",
    "tag": "",
    "attrs": null,
    "map": [ 0, 1 ],
    "nesting": 0,
    "level": 0,
    "children": null,
    "content": "<p>234</p>",
    "markup": "",
    "info": "",
    "meta": null,
    "block": true,
    "hidden": false
  }
]

// The final rendered string is also `<p>234</p>`.
module.exports = function paragraph(state, startLine/*, endLine*/) { var content, terminate, i, l, token, oldParentType, nextLine = startLine + 1, terminatorRules = state.md.block.ruler.getRules('paragraph'), endLine = state.lineMax; oldParentType = state.parentType; state.parentType = 'paragraph'; // jump line-by-line until empty one or EOF for (; nextLine < endLine && !state.isEmpty(nextLine); nextLine++) { // this would be a code block normally, but after paragraph // it's considered a lazy continuation regardless of what's there if (state.sCount[nextLine] - state.blkIndent > 3) { continue; } // quirk for blockquotes, this line should already be checked by that rule if (state.sCount[nextLine] < 0) { continue; } // Some tags can terminate paragraph without empty line. terminate = false; for (i = 0, l = terminatorRules.length; i < l; i++) { if (terminatorRules[i](state, nextLine, endLine, true)) { terminate = true; break; } } if (terminate) { break; } } content = state.getLines(startLine, nextLine, state.blkIndent, false).trim(); state.line = nextLine; token = state.push('paragraph_open', 'p', 1); token.map = [ startLine, state.line ]; token = state.push('inline', '', 0); token.content = content; token.map = [ startLine, state.line ]; token.children = []; token = state.push('paragraph_close', 'p', -1); state.parentType = oldParentType; return true; }; 复制代码
paragraph 就很简单,也是经常用到的,就是生成 p 标签。
综上,可以看出 ParserBlock 的流程还是非常复杂与繁琐的。首先它拥有自己的 state_block,state_block 存储了 ParserBlock 在 tokenize 过程中需要的很多信息,它的处理是以 src 换行符为维度。接着在 tokenize 的过程中逐行对字符串运用不同的 rule 函数,生成对应类型的 token,这样就完成了 ParserBlock 的 parse 过程。
在 ParserBlock 处理之后,可能会生成一种 type 为 inline 的 token。这种 token 属于未完全解析的 token。举个栗子:
const src = '__ad__'

// Token list after the full parse
const generatedTokens = [
  {
    "type": "paragraph_open",
    "tag": "p"
    /* ... remaining fields omitted ... */
  },
  {
    "type": "inline",
    "tag": "",
    "attrs": null,
    "map": [ 0, 1 ],
    "nesting": 0,
    "level": 1,
    "children": [
      { "type": "text", "tag": "" /* ... */ },
      { "type": "strong_open", "tag": "strong" /* ... */ },
      { "type": "text", "tag": "" /* ... */ },
      { "type": "strong_close", "tag": "strong" /* ... */ },
      { "type": "text", "tag": "" /* ... */ }
    ],
    "content": "__ad__",
    "markup": "",
    "info": "",
    "meta": null,
    "block": true,
    "hidden": false
  },
  {
    "type": "paragraph_close"
    /* ... remaining fields omitted ... */
  }
]

// The second token in the array has type `inline`; note its `children` property.
// Where do the tokens in `children` come from?
本来由 ParserBlock 处理之后,children 为空,但这样的话,第二个 token 的 content 属性是 "__ad__",说明加粗的语法还未解析,因此 ParserBlock 的处理还不够,我们还需要更细粒度的 token,那么这就是 ParserInline 的由来。它的作用就是编译 type 为 inline 的 token,并将更细粒度的 token 放在它的 children 属性上,这也就是 generatedTokens 第二项的 children 属性值的由来。