jquery源码学习（二）：Sizzle引擎-词法解析

时间 2019-11-20

标签 jquery 源码学习 sizzle 引擎词法解析栏目 JQuery 繁體版

原文原文链接

tokenize源码备注：

Sizzle.tokenize//2131行

rcomma = new RegExp( "^" + whitespace + "*," + whitespace + "*" ) //601行

whitespace = "[\\x20\\t\\r\\n\\f]"  //574行

\t:水平制表符 \f:换页符 \r:回车符 \n:换行符 \x20:空格符

rcombinators = new RegExp( "^" + whitespace + "*([>+~]|" + whitespace + ")" + whitespace + "*" ),

rcombinators = /^[\x20\t\r\n\f]*([>+~]|[\x20\t\r\n\f])[\x20\t\r\n\f]*/ 即：零个或多个WS + 捕获组（>、+、~ 或 WS）+ 零个或多个WS

rtrim = new RegExp( "^" + whitespace + "+|((?:^|[^\\\\])(?:\\\\.)*)" + whitespace + "+$", "g" ),

rtrim = /^[\x20\t\r\n\f]+|((?:^|[^\\])(?:\\.)*)[\x20\t\r\n\f]+$/g

注： (?:exp) 匹配exp，不捕获匹配的文本，也不给此分组分配组号

\x?? 表达的意思

这是利用2位十六进制数表示ASCII码表中的字符。

而\uxxxx是利用4位十六进制数表示Unicode字符。

选择器解析基本思想

解析结果：

我们可以看到：选择器被分解成了数组里的多个token对象，token对象的格式如下：

Token：{  
   value:'匹配到的字符串', 
   type:'对应的Token类型', 
   matches:'正则匹配到的一个结构'(捕获组)
}

如果选择器的格式是以逗号分割的多个选择器，则返回一个二维数组，这个数组的每一项都是一组token对象。

tokenize函数思想：

// Annotated sketch of Sizzle.tokenize: part real source, part pseudo-code.
// It walks the selector string from left to right, peeling one token off the
// front of soFar on every match, until nothing is left or nothing matches.
function tokenize( selector, parseOnly ){
    var matched, match, tokens, type,
        soFar, groups, preFilters,  // soFar: local variable holding the part of the selector that is still unparsed; it shrinks as parsing proceeds.
        cached = tokenCache[ selector + " " ]; // Look up an already-tokenized result for this selector in the cache.

    while(soFar){
        if(匹配到逗号)// (pseudo-code: "a comma was matched") a comma means a new selector group starts here
            {
                新建一个数组，用来放置一组新的token
                去掉soFar中匹配的部分
            }
        if((match = rcombinators.exec( soFar )))  // Match a combinator: + > ~ (or whitespace)
            {
                tokens.push({
                    value: matched,
                    // Cast descendant combinators to space
                    type: match[0].replace( rtrim, " " ) // Author's note: rtrim seems to do little here, since match[0] can only be one of +, >, ~ or whitespace.
                });
            }
        for(type in Expr.filter)// Expr.filter holds the types TAG, CLASS, ATTR, CHILD, PSEUDO; try each of them in turn
            {
                    if ( (match = matchExpr[ type ].exec( soFar )) && (!preFilters[ type ] ||
                    (match = preFilters[ type ]( match ))) ) {
                    matched = match.shift();
                    // Append the token to the current token sequence
                    tokens.push({
                        value: matched,
                        type: type,
                        matches: match
                    });
                    // Drop the part that was just consumed from the remaining string
                    soFar = soFar.slice( matched.length );
                }
            }
        if ( !matched ) { // Nothing matched in this pass, so the selector cannot be parsed any further
                break;
        }

        // Parse-only mode: return the length of what is left (> 0 means parsing failed).
        // Otherwise: a non-empty remainder raises a Sizzle error; an empty one means
        // success, and the tokenized result is cached for later calls.
        // NOTE(review): in this sketch the return sits inside the while loop;
        // presumably the real source returns after the loop ends - confirm against Sizzle.
        return parseOnly ?
            soFar.length :
            soFar ?
                Sizzle.error( selector ) :
                // Cache the tokens
                tokenCache( selector, groups ).slice( 0 );
    }

}

缓存tokenCache

tokenCache在tokenize函数的开头与结尾被调用到：

cached = tokenCache[ selector + " " ]; //从缓存中读取已经解析好的选择器。
tokenCache( selector, groups ).slice( 0 ); //将选择器名做为键，解析的结果做为值压入缓存

第540行有tokenCache的创建过程，可以发现，除了tokenCache，还有其他的缓存也是通过一个名为createCache的函数创建的：

classCache = createCache(),
tokenCache = createCache(),
compilerCache = createCache(),

在854行我们可以看到createCache：

/**
 * Build a bounded key/value cache.
 *
 * The returned function stores a value when called and exposes the stored
 * values as its own properties, so it is read with `cache[ key + " " ]`.
 * Once more than `Expr.cacheLength` keys have been recorded, the oldest
 * recorded key is removed again.
 *
 * @returns {function(string, *): *} Cache function; calling it returns the stored value.
 */
function createCache() {
    // Recorded property names, oldest first.
    var recorded = [];

    function cache( key, value ) {
        // Use (key + " ") to avoid collision with native prototype properties (see Issue #157)
        var total = recorded.push( key + " " );

        if ( total > Expr.cacheLength ) {
            // Only keep the most recent entries
            var oldest = recorded.shift();
            delete cache[ oldest ];
        }

        cache[ key + " " ] = value;
        return value;
    }

    return cache;
}

这是一个创建函数的函数，它创建的函数cache既能作为函数调用，又能作为对象来储存缓存，因为js中函数也是一个对象：一个函数被创建之初除了获得一个prototype属性指向一个空对象之外，它本身也可以作为对象而挂载属性。当cache被赋值给tokenCache时，形成了一个闭包，tokenCache可以通过闭包访问keys，并向其中push键名。key+' '用来避免属性名与原生属性名冲突；return (cache[ key + " " ] = value)最后会返回value，这相当于cache[ key + " " ] = value; return value;

createCache函数非常值得学习。

/**
 *
 *
 *matchExpr 过滤正则
    ATTR: /^\[[\x20\t\r\n\f]*((?:\\.|[\w-]|[^\x00-\xa0])+)[\x20\t\r\n\f]*(?:([*^$|!~]?=)[\x20\t\r\n\f]*(?:(['"])((?:\\.|[^\\])*?)\3|((?:\\.|[\w#-]|[^\x00-\xa0])+)|)|)[\x20\t\r\n\f]*\]/
    CHILD: /^:(only|first|last|nth|nth-last)-(child|of-type)(?:\([\x20\t\r\n\f]*(even|odd|(([+-]|)(\d*)n|)[\x20\t\r\n\f]*(?:([+-]|)[\x20\t\r\n\f]*(\d+)|))[\x20\t\r\n\f]*\)|)/i
    CLASS: /^\.((?:\\.|[\w-]|[^\x00-\xa0])+)/
    ID: /^#((?:\\.|[\w-]|[^\x00-\xa0])+)/
    PSEUDO: /^:((?:\\.|[\w-]|[^\x00-\xa0])+)(?:\(((['"])((?:\\.|[^\\])*?)\3|((?:\\.|[^\\()[\]]|\[[\x20\t\r\n\f]*((?:\\.|[\w-]|[^\x00-\xa0])+)[\x20\t\r\n\f]*(?:([*^$|!~]?=)[\x20\t\r\n\f]*(?:(['"])((?:\\.|[^\\])*?)\8|((?:\\.|[\w#-]|[^\x00-\xa0])+)|)|)[\x20\t\r\n\f]*\])*)|.*)\)|)/
    TAG: /^((?:\\.|[\w*-]|[^\x00-\xa0])+)/
    bool: /^(?:checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|ismap|loop|multiple|open|readonly|required|scoped)$/i
    needsContext: /^[\x20\t\r\n\f]*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\([\x20\t\r\n\f]*((?:-\d)?\d*)[\x20\t\r\n\f]*\)|)(?=[^-]|$)/i
 *
 */

最后，我们可以写一个简略版解析引擎：

github地址：https://github.com/kangkang1234/JqueryStudy/blob/master/tokenize/tokenize.js

// Regular expressions used by the simplified tokenizer.
// The token patterns match only at the start of the remaining selector and do
// not consume trailing whitespace: whitespace between two simple selectors is
// the descendant combinator and has to be left for `separator`.
var regexp = {
    // A comma, with optional surrounding whitespace, separates selector groups.
    comma:/^\s*,\s*/,
    // A combinator: +, > or ~ with optional surrounding whitespace, or plain
    // whitespace (descendant). Both alternatives are anchored, so a failed
    // attempt does not scan the rest of the string.
    separator:/^(?:\s*[+>~]\s*|\s+)/,
    // Every run of whitespace.
    rtrim:/\s+/g,
    // Tag name, letters only.
    tag:/^\s*([a-zA-Z]+)/,
    // Class selector; the capture includes the leading dot.
    class:/^\s*(\.[a-zA-Z]+)/,
    // :first-child, :last-child or :nth-child(N) with N a positive integer
    // (multi-digit values such as 10 are accepted).
    child:/^\s*:((?:first|last)-child|nth-child\(([1-9][0-9]*)\))/,
    // Placeholder: attribute selectors are not implemented.
    attr:/^asdadawd$/,
    // Placeholder: other pseudo selectors are not implemented.
    pseudo:/^asdasda$/,
    // Id selector; the capture includes the leading # and the whole name.
    id:/^\s*(#[a-zA-Z]+)/
};

// Token types tried, in this order, once neither a comma nor a combinator matched.
var filter = ['tag','class','child','attr','pseudo','id'];

/**
 * Create a bounded cache for tokenized selectors.
 *
 * The returned function stores an entry when called and exposes the stored
 * entries as its own properties under `selector + ' '` (the trailing space
 * keeps keys from colliding with built-in function properties such as
 * `name` or `length`). At most 100 entries are kept; when a new selector
 * would exceed that, the oldest stored selector is dropped.
 *
 * @returns {function(string, Array): Array} Cache function; calling it returns the stored tokens.
 */
function createCache() {
    // Stored keys in insertion order, oldest first.
    var keys = [];
    var maxEntries = 100;

    function cache(selector,tokens) {
        var key = selector+' ';

        // Record each key once, so that evicting it removes exactly one entry.
        if(!Object.prototype.hasOwnProperty.call(cache,key)){
            keys.push(key);
            if(keys.length>maxEntries){
                delete cache[keys.shift()];
            }
        }

        cache[key] = tokens;
        return tokens;
    }

    return cache;
}

// Cache instance for tokenized selectors.
var cache = createCache();

/**
 * Split a selector string into groups of tokens.
 *
 * Uses the surrounding `regexp` table, the `filter` type list and the `cache`
 * function. Each comma-separated selector becomes one group (an array of
 * tokens). A token is one of:
 *   - combinator: { value, tag, type }, where tag and type are '+', '>', '~'
 *     or ' ' (descendant) with surrounding whitespace removed;
 *   - simple selector: { value, tag, type, matchs }, where tag is the first
 *     capture, type is the name from `filter` that matched and matchs holds
 *     the capture groups.
 * `value` is always the exact text consumed from the selector.
 *
 * @param {string} selector - Selector text to tokenize.
 * @param {boolean} [parseOnly] - When truthy, only report how much could not be parsed.
 * @returns {Array|number|boolean} With parseOnly: the number of unparsed
 *   characters (0 means the selector is valid). Otherwise a copy of the array
 *   of token groups, or false when the selector could not be parsed completely.
 */
function tokenize(selector,parseOnly){
    var cached = cache[selector+' '];

    if(cached){
        // A cached selector parsed completely, so nothing is left over.
        // Hand out a copy so callers cannot alter the cached array.
        return parseOnly ? 0 : cached.slice(0);
    }

    var rest = selector;   // part of the selector that is still unparsed
    var group = [];        // tokens of the selector group being built
    var groups = [group];  // all groups, one per comma-separated selector
    var match,value,type,i;

    while(rest){
        // Part 1: a comma starts a new group.
        if((match = regexp.comma.exec(rest))&&match.index===0){
            group = [];
            groups.push(group);
            rest = rest.slice(match[0].length);
            continue;
        }

        // Part 2: combinators + ~ > and whitespace.
        if((match = regexp.separator.exec(rest))&&match.index===0){
            value = match[0];
            // Pure whitespace trims to '', which stands for the descendant combinator.
            type = value.trim()||' ';
            group.push({value:value,tag:type,type:type});
            rest = rest.slice(value.length);
            continue;
        }

        // Part 3: simple selectors, tried in the order given by `filter`.
        match = null;
        for(i=0;i<filter.length;i++){
            match = regexp[filter[i]].exec(rest);
            if(match&&match.index===0){
                break;
            }
            match = null;
        }

        if(!match){
            // Nothing matches at this position: stop and report what is left.
            break;
        }

        value = match.shift();
        group.push({value:value,tag:match[0],type:filter[i],matchs:match});
        rest = rest.slice(value.length);
    }

    if(parseOnly){
        return rest.length;
    }

    return rest.length ? false : cache(selector,groups).slice(0);
}