作者:嵇智
我们在上一篇 ParserCore 讲到了,首先先调用 normalize 这个 rule 函数将 Linux("\n") 与 Windows("\r\n") 的换行符统一处理成 "\n"。接着就走到 ParserBlock.parse 的流程。这一步主要是产出 block 为 true 的 token。如下所示:
module.exports = function block(state) {
var token;
if (state.inlineMode) {
token = new state.Token('inline', '', 0);
token.content = state.src;
token.map = [ 0, 1 ];
token.children = [];
state.tokens.push(token);
} else {
state.md.block.parse(state.src, state.md, state.env, state.tokens);
}
};
parse 函数传入了四个参数:
state.src:用户传入的 markdown 字符串
state.md:当前的 md 实例
state.env:调用时注入的额外数据,默认是 {},一般不会需要它,除非你要做一些定制化的开发
state.tokens:存放输出 token 的数组
我们再来聚焦 ParserBlock 内部的逻辑。位于 lib/parser_block.js。
// Built-in block rules, in the order the ParserBlock constructor registers them.
// Each entry is [ name, rule function, optional alt list ]. The alt list names
// the extra rule chains the rule belongs to; the constructor passes a copy of it
// to `ruler.push` as `{ alt: [...] }`, and entries without a third element get
// an empty alt list there.
var _rules = [
[ 'table', require('./rules_block/table'), [ 'paragraph', 'reference' ] ],
[ 'code', require('./rules_block/code') ],
[ 'fence', require('./rules_block/fence'), [ 'paragraph', 'reference', 'blockquote', 'list' ] ],
[ 'blockquote', require('./rules_block/blockquote'), [ 'paragraph', 'reference', 'blockquote', 'list' ] ],
[ 'hr', require('./rules_block/hr'), [ 'paragraph', 'reference', 'blockquote', 'list' ] ],
[ 'list', require('./rules_block/list'), [ 'paragraph', 'reference', 'blockquote' ] ],
[ 'reference', require('./rules_block/reference') ],
[ 'heading', require('./rules_block/heading'), [ 'paragraph', 'reference', 'blockquote' ] ],
[ 'lheading', require('./rules_block/lheading') ],
[ 'html_block', require('./rules_block/html_block'), [ 'paragraph', 'reference', 'blockquote' ] ],
[ 'paragraph', require('./rules_block/paragraph') ]
];
/**
 * Block-level parser. Creates a fresh `Ruler` on `this.ruler` and registers
 * every entry of `_rules` on it, in order.
 *
 * Each rule is pushed with `{ alt: [...] }`, where the alt array is a copy of
 * the entry's third element (or an empty array when the entry has none), so
 * the ruler never shares an array with `_rules`.
 */
function ParserBlock() {
  var ruler = new Ruler();
  this.ruler = ruler;

  _rules.forEach(function (entry) {
    var name = entry[0];
    var handler = entry[1];
    var chains = entry[2] || [];

    ruler.push(name, handler, { alt: chains.slice() });
  });
}
/**
 * Generate block tokens for the line range [startLine, endLine).
 *
 * Starting at each non-empty line, the rules of the default ('') chain are
 * tried in order until one returns a truthy value. A matching rule is expected
 * to advance `state.line` past the lines it consumed.
 *
 * @param {Object} state - block state (reads/writes `line`, `tight`; reads
 *   `sCount`, `blkIndent`, `level`, `md.options.maxNesting`).
 * @param {number} startLine - first line to process.
 * @param {number} endLine - line index to stop before.
 * @throws {Error} when no rule matches the current line, or when a rule
 *   reports success without advancing `state.line`. Without these guards the
 *   loop below would never terminate in either situation.
 */
ParserBlock.prototype.tokenize = function (state, startLine, endLine) {
  var ok, i, prevLine,
      rules = this.ruler.getRules(''),
      len = rules.length,
      line = startLine,
      hasEmptyLines = false,
      maxNesting = state.md.options.maxNesting;

  while (line < endLine) {
    state.line = line = state.skipEmptyLines(line);
    if (line >= endLine) { break; }

    // A line indented less than the current block indent ends this range.
    if (state.sCount[line] < state.blkIndent) { break; }

    // Nesting limit reached: skip the remainder of the range.
    if (state.level >= maxNesting) {
      state.line = endLine;
      break;
    }

    prevLine = state.line;
    ok = false;

    for (i = 0; i < len; i++) {
      ok = rules[i](state, line, endLine, false);
      if (ok) {
        if (state.line <= prevLine) {
          throw new Error("block rule didn't increment state.line");
        }
        break;
      }
    }

    // Can only happen when the rule chain has no catch-all rule.
    if (!ok) { throw new Error('none of the block rules matched'); }

    // `tight` reflects empty lines seen before the block just parsed.
    state.tight = !hasEmptyLines;

    // The block just parsed may itself have ended on an empty line.
    if (state.isEmpty(state.line - 1)) {
      hasEmptyLines = true;
    }

    line = state.line;

    if (line < endLine && state.isEmpty(line)) {
      hasEmptyLines = true;
      line++;
      state.line = line;
    }
  }
};
/**
 * Entry point of the block parser: build a block state for `src` and tokenize
 * all of its lines. Does nothing when `src` is falsy (e.g. an empty string).
 *
 * @param {string} src - source text.
 * @param {Object} md - parser instance, forwarded to the state constructor.
 * @param {Object} env - environment object, forwarded to the state constructor.
 * @param {Array} outTokens - token list, forwarded to the state constructor.
 * @returns {undefined}
 */
ParserBlock.prototype.parse = function (src, md, env, outTokens) {
  if (src) {
    var blockState = new this.State(src, md, env, outTokens);

    this.tokenize(blockState, blockState.line, blockState.lineMax);
  }
};
ParserBlock.prototype.State = require('./rules_block/state_block');
从构造函数可以看出,ParserBlock 有 11 种 rule,分别为 table、code、fence、blockquote、hr、list、reference、heading、lheading、html_block、paragraph。经过由这些 rule 组成的 rules chain 之后,就能输出 type 为对应类型的 tokens,这也就是 ParserBlock 的作用所在。ruler 用来管理所有的 rule 以及 rule 所属的 chain。
// Excerpt from the ParserBlock constructor: register every entry of `_rules`.
for (var i = 0; i < _rules.length; i++) {
// Entry layout is [ name, fn, alt ]; a missing alt falls back to [] and the
// array is copied with slice() before being handed to the ruler.
this.ruler.push(_rules[i][0], _rules[i][1], { alt: (_rules[i][2] || []).slice() });
}
_rules 是一个二维数组,它的元素也是一个数组,先暂称之为 ruleConfig。ruleConfig 的第一个元素是 rule 的 name,第二个是 rule 的 fn,第三个是 rule 的 alt,也就是所属的职责链。假如 alt 为 ['paragraph', 'reference'],那么如果调用 ruler.getRules('paragraph') 就能返回 [fn],同时调用 ruler.getRules('reference') 也能返回 [fn],因为 fn 的 alt 数组包含了这两种职责链。
再来看 parse 方法。
/**
 * Entry point of the block parser: build a block state for `src` and tokenize
 * all of its lines. Does nothing when `src` is falsy (e.g. an empty string).
 *
 * @param {string} src - source text.
 * @param {Object} md - parser instance, forwarded to the state constructor.
 * @param {Object} env - environment object, forwarded to the state constructor.
 * @param {Array} outTokens - token list, forwarded to the state constructor.
 * @returns {undefined}
 */
ParserBlock.prototype.parse = function (src, md, env, outTokens) {
  if (src) {
    var blockState = new this.State(src, md, env, outTokens);

    this.tokenize(blockState, blockState.line, blockState.lineMax);
  }
};
ParserBlock.prototype.State = require('./rules_block/state_block');
先了解 ParserBlock 的 State,记得之前 ParserCore 的 State 么?也就是在每一个 Parser 的过程中都有一个 State 实例,用来管理它们在 parse 时的一些状态。ParserBlock 的 State 位于 lib/rules_block/state_block.js。
/**
 * Per-parse state for the block parser.
 *
 * Besides storing its arguments, the constructor scans `src` once and records,
 * for every line, where it starts and ends and how far it is indented. One
 * extra sentinel entry (positioned at `src.length`) is appended after the last
 * real line, and `lineMax` is the number of entries minus that sentinel.
 *
 * @param {string} src - source text.
 * @param {Object} md - parser instance (stored as-is).
 * @param {Object} env - environment object (stored as-is).
 * @param {Array} tokens - output token list (stored as-is).
 */
function StateBlock(src, md, env, tokens) {
  this.src = src;
  this.md = md;
  this.env = env;
  this.tokens = tokens;

  // Per-line tables, all indexed by line number.
  this.bMarks = [];   // offset of the first character of the line
  this.eMarks = [];   // offset of the line end
  this.tShift = [];   // leading whitespace, counted in characters
  this.sCount = [];   // leading whitespace, counted in columns (tab stops of 4)
  this.bsCount = [];  // filled with 0 here

  this.blkIndent = 0;
  this.line = 0;
  this.lineMax = 0;
  this.tight = false;
  this.ddIndent = -1;
  this.parentType = 'root';
  this.level = 0;
  this.result = '';

  var text = this.src;
  var total = text.length;
  var lineStart = 0;
  var indentChars = 0;
  var indentCols = 0;
  var scanningIndent = true;
  var cursor, code, isNewline;

  for (cursor = 0; cursor < total; cursor++) {
    code = text.charCodeAt(cursor);

    if (scanningIndent) {
      if (isSpace(code)) {
        indentChars += 1;
        // A tab jumps to the next multiple of 4; anything else is one column.
        indentCols += (code === 0x09) ? 4 - indentCols % 4 : 1;
        continue;
      }
      scanningIndent = false;
    }

    // A line is closed by "\n" or by reaching the final character.
    isNewline = code === 0x0A;
    if (!isNewline && cursor !== total - 1) { continue; }

    // Without a trailing newline the end mark sits just past the last char.
    if (!isNewline) { cursor += 1; }

    this.bMarks.push(lineStart);
    this.eMarks.push(cursor);
    this.tShift.push(indentChars);
    this.sCount.push(indentCols);
    this.bsCount.push(0);

    scanningIndent = true;
    indentChars = 0;
    indentCols = 0;
    lineStart = cursor + 1;
  }

  // Sentinel entry after the last line.
  this.bMarks.push(text.length);
  this.eMarks.push(text.length);
  this.tShift.push(0);
  this.sCount.push(0);
  this.bsCount.push(0);

  this.lineMax = this.bMarks.length - 1;
}
理解 State 上的属性的作用,是很关键的。因为这些属性都是接下来 tokenize 所依赖的信息。重点关注如下的属性:
tokens
tokenize 之后的 token 组成的数组
bMarks
存储每一行的起始位置,因为 parse 的过程是根据换行符逐行扫描
eMarks
存储每一行的终止位置
tShift
存储每一行第一个非空白字符相对行首的偏移(一个制表符只算作 1)
sCount
存储每一行的缩进宽度,即第一个非空白字符所在的列(制表符按 4 列的制表位展开)
bsCount
一般为 0
blkIndent
一般为 0
line
当前所在行数。tokenize 的时候逐行扫描会用到
lineMax
src 被分割成了多少行
以上都是在 tokenize 过程中非常有用的属性。接下来看一下 tokenize 的过程,之后就生成了 block 为 true 的 token。
/**
 * Generate block tokens for the line range [startLine, endLine).
 *
 * Starting at each non-empty line, the rules of the default ('') chain are
 * tried in order until one returns a truthy value. A matching rule is expected
 * to advance `state.line` past the lines it consumed.
 *
 * @param {Object} state - block state (reads/writes `line`, `tight`; reads
 *   `sCount`, `blkIndent`, `level`, `md.options.maxNesting`).
 * @param {number} startLine - first line to process.
 * @param {number} endLine - line index to stop before.
 * @throws {Error} when no rule matches the current line, or when a rule
 *   reports success without advancing `state.line`. Without these guards the
 *   loop below would never terminate in either situation.
 */
ParserBlock.prototype.tokenize = function (state, startLine, endLine) {
  var ok, i, prevLine,
      rules = this.ruler.getRules(''),
      len = rules.length,
      line = startLine,
      hasEmptyLines = false,
      maxNesting = state.md.options.maxNesting;

  while (line < endLine) {
    state.line = line = state.skipEmptyLines(line);
    if (line >= endLine) { break; }

    // A line indented less than the current block indent ends this range.
    if (state.sCount[line] < state.blkIndent) { break; }

    // Nesting limit reached: skip the remainder of the range.
    if (state.level >= maxNesting) {
      state.line = endLine;
      break;
    }

    prevLine = state.line;
    ok = false;

    for (i = 0; i < len; i++) {
      ok = rules[i](state, line, endLine, false);
      if (ok) {
        if (state.line <= prevLine) {
          throw new Error("block rule didn't increment state.line");
        }
        break;
      }
    }

    // Can only happen when the rule chain has no catch-all rule.
    if (!ok) { throw new Error('none of the block rules matched'); }

    // `tight` reflects empty lines seen before the block just parsed.
    state.tight = !hasEmptyLines;

    // The block just parsed may itself have ended on an empty line.
    if (state.isEmpty(state.line - 1)) {
      hasEmptyLines = true;
    }

    line = state.line;

    if (line < endLine && state.isEmpty(line)) {
      hasEmptyLines = true;
      line++;
      state.line = line;
    }
  }
};
函数的执行流程如下:
获取 ParserBlock 构造函数声明的所有 rule 函数,因为在 Ruler 类里面规定,内部的 rule 函数一定属于名字为空字符串的 rule chain。当然构造函数还有很多其他的 rule chain,比如 paragraph、reference、blockquote、list,暂时还未用到。同时,声明了很多初始变量。
然后走到一个 while 循环,因为 state_block 存放的信息都是以 src 字符串每一行作为维度区分的,比如每一行的起始位置,每一行的终止位置,每一行第一个字符的位置。这些信息都是特定 rule 所需要的。while 语句的前面部分就是跳过空行、是否达到最大嵌套等级的判断,重点关注这段代码。
// Excerpt from tokenize: try each rule of the chain on the current line, in
// order, with the last argument (silent) set to false.
for (i = 0; i < len; i++) {
ok = rules[i](state, line, endLine, false);
// The first rule that returns a truthy value wins; later rules are skipped.
if (ok) { break; }
}
这里的循环,也就是会从当前行开始依次执行 rule chain 中的 rule,进而产出 token,如果其中一个 rule 返回 true,就跳出循环,准备 tokenize 接下来的行。那我们来看下这些 rules 的作用。它们都位于 lib/rules_block 文件夹下面。
/**
 * Block rule for pipe tables: a header row, a delimiter row made of `|`, `-`,
 * `:` and spaces, then zero or more body rows.
 *
 * @param {Object} state - block state.
 * @param {number} startLine - line holding the candidate header row.
 * @param {number} endLine - line index to stop before.
 * @param {boolean} silent - when truthy, only validate and emit no tokens.
 * @returns {boolean} true when a table starts at `startLine`.
 */
module.exports = function table(state, startLine, endLine, silent) {
var ch, lineText, pos, i, nextLine, columns, columnCount, token,
aligns, t, tableLines, tbodyLines;
// A table needs at least two lines: header and delimiter row.
if (startLine + 2 > endLine) { return false; }
nextLine = startLine + 1;
// The delimiter row must not be indented less than the block indent,
// nor 4 or more columns beyond it.
if (state.sCount[nextLine] < state.blkIndent) { return false; }
if (state.sCount[nextLine] - state.blkIndent >= 4) { return false; }
// Quick scan of the delimiter row: its first character must be one of
// | - : and every following character one of | - : or whitespace.
pos = state.bMarks[nextLine] + state.tShift[nextLine];
if (pos >= state.eMarks[nextLine]) { return false; }
ch = state.src.charCodeAt(pos++);
if (ch !== 0x7C/* | */ && ch !== 0x2D/* - */ && ch !== 0x3A/* : */) { return false; }
while (pos < state.eMarks[nextLine]) {
ch = state.src.charCodeAt(pos);
if (ch !== 0x7C/* | */ && ch !== 0x2D/* - */ && ch !== 0x3A/* : */ && !isSpace(ch)) { return false; }
pos++;
}
// Derive the per-column alignment from the delimiter row.
// NOTE(review): getLine is defined outside this excerpt; presumably it returns
// the text of the given line - confirm.
lineText = getLine(state, startLine + 1);
columns = lineText.split('|');
aligns = [];
for (i = 0; i < columns.length; i++) {
t = columns[i].trim();
if (!t) {
// An empty cell is only tolerated at either end of the row
// (i.e. produced by a leading or trailing pipe).
if (i === 0 || i === columns.length - 1) {
continue;
} else {
return false;
}
}
if (!/^:?-+:?$/.test(t)) { return false; }
// Trailing colon: 'center' when there is a leading colon too, else 'right'.
// Leading colon only: 'left'. No colon: '' (no alignment).
if (t.charCodeAt(t.length - 1) === 0x3A/* : */) {
aligns.push(t.charCodeAt(0) === 0x3A/* : */ ? 'center' : 'right');
} else if (t.charCodeAt(0) === 0x3A/* : */) {
aligns.push('left');
} else {
aligns.push('');
}
}
// Validate the header row: it must contain a pipe, must not be indented 4 or
// more columns past the block indent, and must not have more cells than the
// delimiter row has columns.
lineText = getLine(state, startLine).trim();
if (lineText.indexOf('|') === -1) { return false; }
if (state.sCount[startLine] - state.blkIndent >= 4) { return false; }
// NOTE(review): escapedSplit is defined outside this excerpt; presumably it
// splits on pipes while honouring escapes - confirm.
columns = escapedSplit(lineText.replace(/^\||\|$/g, ''));
columnCount = columns.length;
if (columnCount > aligns.length) { return false; }
if (silent) { return true; }
// Emit the table and header tokens. The end line of the table map is filled
// in once the body has been scanned (see the bottom of the function).
token = state.push('table_open', 'table', 1);
token.map = tableLines = [ startLine, 0 ];
token = state.push('thead_open', 'thead', 1);
token.map = [ startLine, startLine + 1 ];
token = state.push('tr_open', 'tr', 1);
token.map = [ startLine, startLine + 1 ];
for (i = 0; i < columns.length; i++) {
token = state.push('th_open', 'th', 1);
token.map = [ startLine, startLine + 1 ];
if (aligns[i]) {
token.attrs = [ [ 'style', 'text-align:' + aligns[i] ] ];
}
// Cell text is stored on an `inline` token with an empty children list.
token = state.push('inline', '', 0);
token.content = columns[i].trim();
token.map = [ startLine, startLine + 1 ];
token.children = [];
token = state.push('th_close', 'th', -1);
}
token = state.push('tr_close', 'tr', -1);
token = state.push('thead_close', 'thead', -1);
token = state.push('tbody_open', 'tbody', 1);
token.map = tbodyLines = [ startLine + 2, 0 ];
// Body rows: continue until a line is indented less than the block indent,
// contains no pipe, or is indented 4 or more columns past the block indent.
for (nextLine = startLine + 2; nextLine < endLine; nextLine++) {
if (state.sCount[nextLine] < state.blkIndent) { break; }
lineText = getLine(state, nextLine).trim();
if (lineText.indexOf('|') === -1) { break; }
if (state.sCount[nextLine] - state.blkIndent >= 4) { break; }
columns = escapedSplit(lineText.replace(/^\||\|$/g, ''));
token = state.push('tr_open', 'tr', 1);
// Always emit exactly columnCount cells: missing cells become empty strings
// and surplus cells in the row are ignored.
for (i = 0; i < columnCount; i++) {
token = state.push('td_open', 'td', 1);
if (aligns[i]) {
token.attrs = [ [ 'style', 'text-align:' + aligns[i] ] ];
}
token = state.push('inline', '', 0);
token.content = columns[i] ? columns[i].trim() : '';
token.children = [];
token = state.push('td_close', 'td', -1);
}
token = state.push('tr_close', 'tr', -1);
}
token = state.push('tbody_close', 'tbody', -1);
token = state.push('table_close', 'table', -1);
// Patch the end line into the table/tbody maps and move the parser past the table.
tableLines[1] = tbodyLines[1] = nextLine;
state.line = nextLine;
return true;
}
table 这个 rule 就是用来生成 table 相关的 token(最终会渲染成 table 的 HTML 字符串)的。内部的解析都是根据 markdown 写 table 的规范而来的,详细逻辑这里就不展开,如果感兴趣,可以写个 demo 自己打断点试试。
module.exports = function code(state, startLine, endLine/*, silent*/) {
var nextLine, last, token;
if (state.sCount[startLine] - state.blkIndent < 4) { return false; }
last = nextLine = startLine + 1;
while (nextLine < endLine) {
if (state.isEmpty(nextLine)) {
nextLine++;
continue;
}
if (state.sCount[nextLine] - state.blkIndent >= 4) {
nextLine++;
last = nextLine;
continue;
}
break;
}
state.line = last;
token = state.push('code_block', 'code', 0);
token.content = state.getLines(startLine, last, 4 + state.blkIndent, true);
token.map = [ startLine, state.line ];
return true;
};
code rule 的作用也很简单,它认为只要一行的缩进多于 3 个空格(即至少 4 个空格),那就是一个 code_block。比如下面的:
    我现在就是一个 code_block
/**
 * Block rule for fenced code blocks opened by a run of at least three `~` or
 * backtick characters.
 *
 * @param {Object} state - block state.
 * @param {number} startLine - line holding the candidate opening fence.
 * @param {number} endLine - line index to stop before.
 * @param {boolean} silent - when truthy, only validate the opening fence.
 * @returns {boolean} true when a fence starts at `startLine`.
 */
module.exports = function fence(state, startLine, endLine, silent) {
var marker, len, params, nextLine, mem, token, markup,
haveEndMarker = false,
pos = state.bMarks[startLine] + state.tShift[startLine],
max = state.eMarks[startLine];
// Indented 4+ columns past the block indent: not a fence.
if (state.sCount[startLine] - state.blkIndent >= 4) { return false; }
// The line must be long enough to hold three marker characters.
if (pos + 3 > max) { return false; }
marker = state.src.charCodeAt(pos);
if (marker !== 0x7E/* ~ */ && marker !== 0x60 /* ` */) {
return false;
}
// Measure the opening marker run.
// NOTE(review): skipChars is defined on the state object outside this excerpt;
// presumably it returns the position after the run of `marker` - confirm.
mem = pos;
pos = state.skipChars(pos, marker);
len = pos - mem;
if (len < 3) { return false; }
// markup = the marker run itself; params = the rest of the opening line
// (stored as token.info below).
markup = state.src.slice(mem, pos);
params = state.src.slice(pos, max);
// The rest of the opening line must not contain the marker character.
if (params.indexOf(String.fromCharCode(marker)) >= 0) { return false; }
// Since start is found, we can report success here in validation mode
if (silent) { return true; }
// search end of block
nextLine = startLine;
for (;;) {
nextLine++;
if (nextLine >= endLine) {
// Ran out of lines without a closing fence; haveEndMarker stays false.
break;
}
pos = mem = state.bMarks[nextLine] + state.tShift[nextLine];
max = state.eMarks[nextLine];
// A non-empty line indented less than the block indent ends the block.
if (pos < max && state.sCount[nextLine] < state.blkIndent) {
break;
}
// A closing fence must start with the same marker character ...
if (state.src.charCodeAt(pos) !== marker) { continue; }
// ... must not be indented 4 or more columns past the block indent ...
if (state.sCount[nextLine] - state.blkIndent >= 4) {
continue;
}
pos = state.skipChars(pos, marker);
// ... must be at least as long as the opening run ...
if (pos - mem < len) { continue; }
// ... and may be followed by nothing (after skipSpaces) on that line.
pos = state.skipSpaces(pos);
if (pos < max) { continue; }
haveEndMarker = true;
// found!
break;
}
// `len` is reused from here on: it now holds the opening line's indent
// (sCount) and is passed to getLines for the content lines.
len = state.sCount[startLine];
// Step over the closing fence line only if one was actually found.
state.line = nextLine + (haveEndMarker ? 1 : 0);
token = state.push('fence', 'code', 0);
token.info = params;
token.content = state.getLines(startLine + 1, nextLine, len, true);
token.markup = markup;
token.map = [ startLine, state.line ];
return true;
};
fence rule 类似于 code rule。它代表具有语言类型的 code_block。比如 javascript、shell、css、stylus 等等。举个栗子:
```shell
echo 'done'
```
上面就是会解析生成一个 type 为 fence,info 为 shell,markup 为 "```" 的 token。
代码太长,就不贴代码了,blockquote 的作用就是生成 markup 为 > 的 token。下面就是一个 blockquote。
> i am a blockquote
module.exports = function hr(state, startLine, endLine, silent) {
var marker, cnt, ch, token,
pos = state.bMarks[startLine] + state.tShift[startLine],
max = state.eMarks[startLine];
// if it's indented more than 3 spaces, it should be a code block
if (state.sCount[startLine] - state.blkIndent >= 4) { return false; }
marker = state.src.charCodeAt(pos++);
// Check hr marker
if (marker !== 0x2A/* * */ &&
marker !== 0x2D/* - */ &&
marker !== 0x5F/* _ */) {
return false;
}
// markers can be mixed with spaces, but there should be at least 3 of them
cnt = 1;
while (pos < max) {
ch = state.src.charCodeAt(pos++);
if (ch !== marker && !isSpace(ch)) { return false; }
if (ch === marker) { cnt++; }
}
if (cnt < 3) { return false; }
if (silent) { return true; }
state.line = startLine + 1;
token = state.push('hr', 'hr', 0);
token.map = [ startLine, state.line ];
token.markup = Array(cnt + 1).join(String.fromCharCode(marker));
return true;
};
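hr rule 也很简单,就是生成 type 为 hr 的 token。它的 markup 是 ***、---、___,也就是在 md 文件写这三种语法,都能解析出 <hr> 标签。
list 的作用是解析有序列表以及无序列表。详细的逻辑比较复杂,需要了解的可以自己通过 demo 断点调试。
reference 的作用是解析链接引用定义(link reference definition),也就是 md 中类似于 [reference]: http://www.baidu.com "title" 这种写法;它本身不产出 token,而是把定义记录下来,供后面解析 [text][reference] 这样的引用式链接时使用。而 [text](http://www.baidu.com) 这种行内超链接则是由 ParserInline 负责解析的。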
module.exports = function heading(state, startLine, endLine, silent) {
var ch, level, tmp, token,
pos = state.bMarks[startLine] + state.tShift[startLine],
max = state.eMarks[startLine];
// if it's indented more than 3 spaces, it should be a code block
if (state.sCount[startLine] - state.blkIndent >= 4) { return false; }
ch = state.src.charCodeAt(pos);
if (ch !== 0x23/* # */ || pos >= max) { return false; }
// count heading level
level = 1;
ch = state.src.charCodeAt(++pos);
while (ch === 0x23/* # */ && pos < max && level <= 6) {
level++;
ch = state.src.charCodeAt(++pos);
}
if (level > 6 || (pos < max && !isSpace(ch))) { return false; }
if (silent) { return true; }
// Let's cut tails like ' ### ' from the end of string
max = state.skipSpacesBack(max, pos);
tmp = state.skipCharsBack(max, 0x23, pos); // #
if (tmp > pos && isSpace(state.src.charCodeAt(tmp - 1))) {
max = tmp;
}
state.line = startLine + 1;
token = state.push('heading_open', 'h' + String(level), 1);
token.markup = '########'.slice(0, level);
token.map = [ startLine, state.line ];
token = state.push('inline', '', 0);
token.content = state.src.slice(pos, max).trim();
token.map = [ startLine, state.line ];
token.children = [];
token = state.push('heading_close', 'h' + String(level), -1);
token.markup = '########'.slice(0, level);
return true;
};
heading 的作用是解析标题标签(h1 - h6)。它的语法主要是 #、## 等等。
lheading 是解析以下一行的 === 或 --- 作为下划线的标题(Setext 风格),比如下面
这是一个标题
========
// 上面会渲染成
<h1>这是一个标题</h1>
html_block 是解析 HTML,如果你在 md 里面写 HTML 标签,那么最后还是会得到 HTML 字符串,比如你写如下字符串:
let src = "<p>234</p>"
// 获得以下token
let token = [
{
"type": "html_block",
"tag": "",
"attrs": null,
"map": [
0,
1
],
"nesting": 0,
"level": 0,
"children": null,
"content": "<p>234</p>",
"markup": "",
"info": "",
"meta": null,
"block": true,
"hidden": false
}
]
最后输出的字符串也是 `<p>234</p>`
复制代码
module.exports = function paragraph(state, startLine/*, endLine*/) {
var content, terminate, i, l, token, oldParentType,
nextLine = startLine + 1,
terminatorRules = state.md.block.ruler.getRules('paragraph'),
endLine = state.lineMax;
oldParentType = state.parentType;
state.parentType = 'paragraph';
// jump line-by-line until empty one or EOF
for (; nextLine < endLine && !state.isEmpty(nextLine); nextLine++) {
// this would be a code block normally, but after paragraph
// it's considered a lazy continuation regardless of what's there
if (state.sCount[nextLine] - state.blkIndent > 3) { continue; }
// quirk for blockquotes, this line should already be checked by that rule
if (state.sCount[nextLine] < 0) { continue; }
// Some tags can terminate paragraph without empty line.
terminate = false;
for (i = 0, l = terminatorRules.length; i < l; i++) {
if (terminatorRules[i](state, nextLine, endLine, true)) {
terminate = true;
break;
}
}
if (terminate) { break; }
}
content = state.getLines(startLine, nextLine, state.blkIndent, false).trim();
state.line = nextLine;
token = state.push('paragraph_open', 'p', 1);
token.map = [ startLine, state.line ];
token = state.push('inline', '', 0);
token.content = content;
token.map = [ startLine, state.line ];
token.children = [];
token = state.push('paragraph_close', 'p', -1);
state.parentType = oldParentType;
return true;
};
paragraph 那就很简单,也是经常用到的,就是生成 p 标签。
综上,可以看出 ParserBlock 的流程还是非常的复杂与繁琐的。首先它拥有自己的 block_state,block_state 存储了 ParserBlock 在 tokenize 过程中需要的很多信息,它的处理是以 src 换行符为维度。接着在 tokenize 的过程中逐行对字符串运用不同的 rule 函数,生成对应类型的 token,这样就完成了 ParserBlock 的 parse 过程。
在 ParserBlock 处理之后,可能会生成一种 type 为 inline 的 token。这种 token 属于未完全解析的 token。举个栗子:
const src = '__ad__'
// 通过 parse 处理以后
const generatedTokens = [
{
"type": "paragraph_open",
"tag": "p",
......
},
{
"type": "inline",
"tag": "",
"attrs": null,
"map": [
0,
1
],
"nesting": 0,
"level": 1,
"children": [
{
"type": "text",
"tag": "",
......
},
{
"type": "strong_open",
"tag": "strong",
......
},
{
"type": "text",
"tag": "",
......
},
{
"type": "strong_close",
"tag": "strong",
......
},
{
"type": "text",
"tag": "",
......
}
],
"content": "__ad__",
"markup": "",
"info": "",
"meta": null,
"block": true,
"hidden": false
},
{
"type": "paragraph_close",
......
}
]
// 数组的第二个 token 的 type 为 inline,注意它有个 children 属性
// children 属性上的 token是怎么来的呢?
本来由 ParserBlock 处理之后,children 为空,但这样的话,第二个 token 的 content 属性是 "__ad__",说明加粗的语法还未解析,因此 ParserBlock 的处理还不够,我们还需要更细粒度的 token,那么这就是 ParserInline 的由来。它的作用就是编译 type 为 inline 的 token,并将更细粒度的 token 放在它的 children 属性上,这也就是 generatedTokens 第二项的 children 属性值的由来。