sqlite对于典型select处理流程

时间 2019-11-10

原文原文链接

1、select s from f where w
这是一个最为基础的sql语句，至关于C语言的printf("hello world\n");对于一个简单的应用来讲，知道这个套路解决通常问题是没有难度的，在一些复杂的场景，例如涉及到subselect、join、union等各类各样的延伸问题时，若是不太清楚程序自己的执行和解析流程，那么对于问题的演绎会有阻碍。另外因为思路比较简单，喜欢模拟程序的执行来推理出一些东西，因此想看看这个典型的语句在sqlite中是如何处理的。
触发这个问题的缘由在于处理一个稍微复杂点的问题：但愿首先经过两个不一样的table来组合成一个新的table，而后在这个新生成的table基础上来再次执行一些操做。直观的想就是在select的from中构造一张临时表，而后在from的条件中把这个表做为基础表从这张表中select，以后发现这种方法根本行不通，select from以后的表不能是任何一个经过alias表示的子select语句，由此触发了对于这个语句实现方法的一个简单学习。
问题的语法描述大体如此
mysql> select * from (select field, type from user) as newuser where newuser.field in (select field from newuser);
ERROR 1146 (42S02): Table 'mysql.newuser' doesn't exist
在where中再次从from中临时生成的newuser中select成员，此时表达式报错，说明select以后的table只能是物理存在的table，或者是create temporary建立的表格，而不能是 select as 生成的临时表格。
因为mysql的工程过于高端大气上档次，看起来比较复杂，因此依然是从最为简单的sqlite数据库相关实现来学习一下，sqlite3的代码总共大概10W行，内容并非不少，又考虑到其中注释可能至少有1/3左右，真正的逻辑代码可能更少；sqlite的实现使用虚拟机方法，代码看起来比较学术派，对于理解数据库的实现相对来讲比较容易。
2、语法解析
sqlite对于mysql的解析实现并无使用通用的bison/yacc来生成，而是在本身的工程中包含了一个语法生成器lemon，这个工具在浙江大学有一本书剖析了它的内部实现机制，和毛德操老师的《linux内核源代码情景分析》是一个系列的，我以前还买了一本，冲着对《情景分析》的品牌，不事后的比较少，用鲁迅先生的话说：“后来大半忘却了”。
这个lemon的词法表示和bison的表示方式也不一样，比较明显的特色是：
一、非终端符号小写表示，终端符号使用大写字符表示。例如在parse.y中
id(A) ::= ID(X).         {A = X;}
id(A) ::= INDEXED(X).    {A = X;}
这里的id就是非终端符号，而ID为终端符号。
二、符号引用。在bison中，动做中经过 $1之类的编号来引用语法中的标识符数值，lemon中则使用在标识符的后面一个括号内定义字符来表示对于这个标识符语法数值的引用。这两种实现的优缺点很差说，就像社会科学中大部分事件同样，有人叫好，有人拍砖。可是咱们如今最后不要也不用关心这些形而上的东西。
语法文件中比较相关的内容罗列以下
cmd ::= select(X). {
SelectDest dest = {SRT_Output, 0, 0, 0, 0};
sqlite3Select(pParse, X, &dest); 在一个select语句被完整解析以后，执行这个语句完成sql语义。
sqlite3SelectDelete(pParse->db, X);
}
……
select(A) ::= oneselect(X).                      {A = X;}一个完整的select，也就是咱们最为常见的select from where三元组
%ifndef SQLITE_OMIT_COMPOUND_SELECT
select(A) ::= select(X) multiselect_op(Y) oneselect(Z). {
if( Z ){
    Z->op = (u8)Y;
    Z->pPrior = X;
}else{
    sqlite3SelectDelete(pParse->db, X);
}
A = Z;
}
%type multiselect_op {int}
multiselect_op(A) ::= UNION(OP).             {A = @OP;}
multiselect_op(A) ::= UNION ALL.             {A = TK_ALL;}
multiselect_op(A) ::= EXCEPT|INTERSECT(OP). {A = @OP;}
%endif SQLITE_OMIT_COMPOUND_SELECT
oneselect(A) ::= SELECT distinct(D) selcollist(W) from(X) where_opt(Y)
                 groupby_opt(P) having_opt(Q) orderby_opt(Z) limit_opt(L). {
A = sqlite3SelectNew(pParse,W,X,Y,P,Q,Z,D,L.pLimit,L.pOffset);
}
……
// A complete FROM clause.
//
from(A) ::= .                {A = sqlite3DbMallocZero(pParse->db, sizeof(*A));}
from(A) ::= FROM seltablist(X). {
A = X;
sqlite3SrcListShiftJoinType(A);
}
……
// "seltablist" is a " Select Table List" - the content of the FROM clause注释明确说明stl是select table list的缩写。
// in a SELECT statement. "stl_prefix" is a prefix of this list.
//
stl_prefix(A) ::= seltablist(X) joinop(Y).    {
   A = X;
   if( ALWAYS(A && A->nSrc>0) ) A->a[A->nSrc-1].jointype = (u8)Y;
}
stl_prefix(A) ::= .                           {A = 0;}
seltablist(A) ::= stl_prefix(X) nm(Y) dbnm(D) as(Z) indexed_opt(I) on_opt(N) using_opt(U). {
A = sqlite3SrcListAppendFromTerm(pParse,X,&Y,&D,&Z,0,N,U);
sqlite3SrcListIndexedBy(pParse, A, &I);
}
%ifndef SQLITE_OMIT_SUBQUERY
seltablist(A) ::= stl_prefix(X) LP select(S) RP 其中LP和RP不是“老婆”和“人品”，而是“Left Parenthese”和“Right Parentheses”，这就是一对括号引导的自select表达式，也就是这个语法，容许在from中经过子表达式来临时生成一个新的内存table。
                    as(Z) on_opt(N) using_opt(U). {
    A = sqlite3SrcListAppendFromTerm(pParse,X,0,0,&Z,S,N,U);
}
seltablist(A) ::= stl_prefix(X) LP seltablist(F) RP
                    as(Z) on_opt(N) using_opt(U). {
    if( X==0 && Z.n==0 && N==0 && U==0 ){
      A = F;
    }else{
      Select *pSubquery;
      sqlite3SrcListShiftJoinType(F);
      pSubquery = sqlite3SelectNew(pParse,0,F,0,0,0,0,0,0,0);
      A = sqlite3SrcListAppendFromTerm(pParse,X,0,0,&Z,pSubquery,N,U);对于from以后的内容，经过该函数添加到FromTerm列表中，
    }
}
sqlite3SrcListAppendFromTerm函数将解析出来的table描述以及subquery追加到Src列表的最后，这个其实就是以后整个select中使用变量及标识符的数据库做用域，这里也是不一样的select语句的定界域，和C语言的变量查找同样，同一个变量要从一个context中逐层向外查找，这样才能找到正确的标识符定义。在这里，这个src也定义了不一样范围内select可使用的table的范围，嵌套或者被包含的subselect只能向本层或者外层链式查找符号范围(若是这个符号是一个table列的时候)。
3、select语句的处理
在以前已经看到，select处理由sqlite3Select函数完成，在该函数中，对于表达式、table、database的处理在函数
sqlite3SelectPrep(pParse, p, 0);
中完成。
该函数主要执行流程为
SQLITE_PRIVATE void sqlite3SelectPrep(
Parse *pParse,         /* The parser context */
Select *p,             /* The SELECT statement being coded. */
NameContext *pOuterNC /* Name context for container */
){
sqlite3 *db;
if( NEVER(p==0) ) return;
db = pParse->db;
if( p->selFlags & SF_HasTypeInfo ) return;
sqlite3SelectExpand(pParse, p);   这里对select以后的内容进行展开，包括了对于全部的select以及subselect中引用到的table的定位(table的定位经过sqlite3LocateTable函数完成)，该函数还完成对于select以后通配符'*'的展开，展开为一个table的全部列的名字。
if( pParse->nErr || db->mallocFailed ) return;
sqlite3ResolveSelectNames(pParse, p, pOuterNC); 该函数完成对于表达式中各个变量的解析，解析的时候就会涉及到对于变量所在table的做用域的查询。这里pOuterNC中的NC即为Name Context的缩写，这个链表并不在解析是连接，而是在最终执行以后动态建立并连接本身须要的NC列表。
if( pParse->nErr || db->mallocFailed ) return;
sqlite3SelectAddTypeInfo(pParse, p);
}
4、NC链表的创建
一、单步骤遍历
在对select结构的遍历(walker)过程当中，其中对于子select处理的回调函数为resolveSelectStep，在这个函数中建立了一个堆栈上的NameContext结构，在对这个结构初始化以后，将这个结果做为子select的OuterNC传递给subselect，从而在堆栈上创建一个链表结构。这种在堆栈上动态建立结构在以前也见过，就是Matt Piertrek大牛写的《A Crash Course on the Depths of Win32? Structured Exception Handling》，异常的处理也是在堆栈上创建；另一个类似的例子就是内核中经过DEFINE_WAIT创建的等待结构，一般也是在堆栈中临时建立，函数返回以后销毁，环保可递归。
static int resolveSelectStep(Walker *pWalker, Select *p){
NameContext *pOuterNC; /* Context that contains this SELECT */
NameContext sNC;        /* Name context of this SELECT */
……
    /* Resolve names in the result set. */
    pEList = p->pEList;
    assert( pEList!=0 );
    for(i=0; i<pEList->nExpr; i++){
      Expr *pX = pEList->a[i].pExpr;
      if( sqlite3ResolveExprNames(&sNC, pX) ){ 把本身做为新的链表上NC结构，传递给表达式解析。
        return WRC_Abort;
      }
    }
……
    /* Recursively resolve names in all subqueries
    */
    for(i=0; i<p->pSrc->nSrc; i++){
      struct SrcList_item *pItem = &p->pSrc->a[i];
      if( pItem->pSelect ){
        const char *zSavedContext = pParse->zAuthContext;
        if( pItem->zName ) pParse->zAuthContext = pItem->zName;
        sqlite3ResolveSelectNames(pParse, pItem->pSelect, pOuterNC); 这里对于subquery中的表达式，它只能使用外层的NameContext，而不能使用本层的NameContext。
        pParse->zAuthContext = zSavedContext;
        if( pParse->nErr || db->mallocFailed ) return WRC_Abort;
      }
    }
……
    /* Add the expression list to the name-context before parsing the
    ** other expressions in the SELECT statement. This is so that
    ** expressions in the WHERE clause (etc.) can refer to expressions by
    ** aliases in the result set.
    **
    ** Minor point: If this is the case, then the expression will be
    ** re-evaluated for each reference to it.
    */
    sNC.pEList = p->pEList;
    if( sqlite3ResolveExprNames(&sNC, p->pWhere) ||
       sqlite3ResolveExprNames(&sNC, p->pHaving) 对于where和having的处理也是按照从当前NameContex中查找。
    ){
      return WRC_Abort;
    }
二、sqlite3SelectPrep函数流程
SQLITE_PRIVATE void sqlite3SelectPrep(
Parse *pParse,         /* The parser context */
Select *p,             /* The SELECT statement being coded. */
NameContext *pOuterNC /* Name context for container */
){
sqlite3 *db;
if( NEVER(p==0) ) return;
db = pParse->db;
if( p->selFlags & SF_HasTypeInfo ) return;
sqlite3SelectExpand(pParse, p);
if( pParse->nErr || db->mallocFailed ) return;
sqlite3ResolveSelectNames(pParse, p, pOuterNC);
if( pParse->nErr || db->mallocFailed ) return;
sqlite3SelectAddTypeInfo(pParse, p);
}
二、1 对于from以后整个select可见table和column的处理
这个函数包含了对整个select语句树形结构的两次完成遍历，第一次对全部的select按照深度优先方法遍历全部的select语句，这一次只是处理select语句from中的有名表格和无名表格(此时并无处理select以后和where以后的表达式)，整个过程至关于初始化了整个select中共享可见的table及column结构。
二、2 对于select以后和where以后表达式中处理
可是这里有个问题，就是它只是处理了from以后可能存在的subquery虚拟表，而select以后和where以后的表达式一样能够存在subquery，这个表格在sqlite3SelectExpand函数的此次遍历中并无被处理到。
在sqlite3ResolveSelectNames对于语法树的遍历过程当中，此时在对select以后和where以后的表达式进行变量的定位，这个函数将会首先对剩余的、可能存在的subquery进行展开，这一点在该函数的注释中也有说明

/* Normally sqlite3SelectExpand() will be called first and will have             一般来讲sqlite3SelectExpand函数以后select已经展开
** already expanded this SELECT. However, if this is a subquery within 可是若是表达式中有subquery
** an expression, sqlite3ResolveExprNames() will be called without a 调用sqlite3ResolveExprNames函数以前并无
** prior call to sqlite3SelectExpand(). When that happens, let              调用sqlite3SelectExpand，当这种状况发生时
** sqlite3SelectPrep() do all of the processing for this SELECT. 将会由sqlite3SelectPrep来完成对SELECT的全部处理。
** sqlite3SelectPrep() will invoke both sqlite3SelectExpand() and函数sqlite3SelectPrep将会正确同时调用sqlite3SelectExpand
** this routine in the correct order. 和本函数。
*/
if( (p->selFlags & SF_Expanded)==0 ){
    sqlite3SelectPrep(pParse, p, pOuterNC);
    return (pParse->nErr || db->mallocFailed) ? WRC_Abort : WRC_Prune;
}
二、3 sqlite3WalkSelect函数流程
该函数遍历表达式中全部并列的select语句，对于每一个一select语句，首先执行一次xSelectCallback回调，在这个回调中用户能够提供方式来进行一些整个select的全局性的处理工做，例如以前看到的from以后table的定位；而后遍历select以后以及where以后以及各类group之类的全部表达式，最后再处理from以后的subquery。
咱们看下显式的例子
sqlite> select first in (select first from noexit) as dupfirst from (select first from neigtherexit) as fromfirst;
Error: no such table: noexit
sqlite>
能够看到 select以后表达式的错误提示要遭遇from以后错误错误表达式的提示。至于sqlite3WalkSelect为何要把from中的subquery单独处理，多是为了处理两种类型对from以后table的不一样可见做用域吧，比方说from以后并列的table内subquery不可见其它table，待肯定。
。
/*
** Call sqlite3WalkExpr() for every expression in Select statement p.
** Invoke sqlite3WalkSelect() for subqueries in the FROM clause and
** on the compound select chain, p->pPrior.
**
** Return WRC_Continue under normal conditions. Return WRC_Abort if
** there is an abort request.
**
** If the Walker does not have an xSelectCallback() then this routine
** is a no-op returning WRC_Continue.
*/
SQLITE_PRIVATE int sqlite3WalkSelect(Walker *pWalker, Select *p){
int rc;
if( p==0 || pWalker->xSelectCallback==0 ) return WRC_Continue;
rc = WRC_Continue;
while( p ){
    rc = pWalker->xSelectCallback(pWalker, p);
    if( rc ) break;
    if( sqlite3WalkSelectExpr(pWalker, p) ) return WRC_Abort;
    if( sqlite3WalkSelectFrom(pWalker, p) ) return WRC_Abort;
    p = p->pPrior;
}
return rc & WRC_Abort;
}
5、表达式中变量的定位
resolveExprStep
    /* A lone identifier is the name of a column.
    */
    case TK_ID: {
      return lookupName(pParse, 0, 0, pExpr->u.zToken, pNC, pExpr);
    }
在接下来的lookupName函数中
/* Start at the inner-most context and move outward until a match is found */
while( pNC && cnt==0 ){
    ExprList *pEList;
    SrcList *pSrcList = pNC->pSrcList;

    if( pSrcList ){
      for(i=0, pItem=pSrcList->a; i<pSrcList->nSrc; i++, pItem++){ 对于from中的每个table遍历。
        Table *pTab;
        int iDb;
        Column *pCol;

        pTab = pItem->pTab;
        assert( pTab!=0 && pTab->zName!=0 );
        iDb = sqlite3SchemaToIndex(db, pTab->pSchema);
        assert( pTab->nCol>0 );
        if( zTab ){
          if( pItem->zAlias ){ 存在alias名称。
            char *zTabName = pItem->zAlias;
            if( sqlite3StrICmp(zTabName, zTab)!=0 ) continue;
          }else{ 真是table名。
            char *zTabName = pTab->zName;
            if( NEVER(zTabName==0) || sqlite3StrICmp(zTabName, zTab)!=0 ){
              continue;
            }
            if( zDb!=0 && sqlite3StrICmp(db->aDb[iDb].zName, zDb)!=0 ){
              continue;
            }
          }
        }
        if( 0==(cntTab++) ){
          pExpr->iTable = pItem->iCursor;
          pExpr->pTab = pTab;
          pSchema = pTab->pSchema;
          pMatch = pItem;
        }
        for(j=0, pCol=pTab->aCol; j<pTab->nCol; j++, pCol++){ 遍历table的全部列，也就是全部field。
          if( sqlite3StrICmp(pCol->zName, zCol)==0 ){ 列名称想等，认为找到一个匹配。
            IdList *pUsing;
            cnt++;
            pExpr->iTable = pItem->iCursor;
            pExpr->pTab = pTab;
            pMatch = pItem;
            pSchema = pTab->pSchema;
            /* Substitute the rowid (column -1) for the INTEGER PRIMARY KEY */
            pExpr->iColumn = j==pTab->iPKey ? -1 : (i16)j;
            if( i<pSrcList->nSrc-1 ){
              if( pItem[1].jointype & JT_NATURAL ){
                /* If this match occurred in the left table of a natural join,
                ** then skip the right table to avoid a duplicate match */
                pItem++;
                i++;
              }else if( (pUsing = pItem[1].pUsing)!=0 ){
                /* If this match occurs on a column that is in the USING clause
                ** of a join, skip the search of the right table of the join
                ** to avoid a duplicate match there. */
                int k;
                for(k=0; k<pUsing->nId; k++){
                  if( sqlite3StrICmp(pUsing->a[k].zName, zCol)==0 ){
                    pItem++;
                    i++;
                    break;
                  }
                }
              }
            }
            break;
          }
        }
      }
    }
……
    /* Advance to the next name context. The loop will exit when either
    ** we have a match (cnt>0) or when we run out of name contexts.
    */
    if( cnt==0 ){
       pNC = pNC->pNext; 向更外层的NameContext查找变量的定义，这个地方使用循环便可，没有递归。
    }
}
6、一些验证
一、from以后table的解析早于select以后表达式的解析
这点其实不用验证，有脚趾头想一下就能够想到，先查找table是否存在，而后验证select的内容是否存在
sqlite> .schema
CREATE TABLE tsecer(first int, second int, third varchar(100));
sqlite> select first from nothistable;
Error: no such table: nothistable
sqlite> select nothiscom from tsecer;
Error: no such column: nothiscom
sqlite>
二、select和where可使用from中的table，而同级的from以后subquery不能
sqlite> select * from tsecer as harry where first in (select first from tsecer where first=harry.first); 在where中能够引用from中全部的table。
sqlite> select * from tsecer as harry, (select * from tsecer where first=harry.first); 同级的from中不能引用同级的table。
Error: no such column: harry.first
sqlite>
三、union两侧在同一个src列表中且可递归
// "seltablist" is a "Select Table List" - the content of the FROM clause
// in a SELECT statement. "stl_prefix" is a prefix of this list.
//
stl_prefix(A) ::= seltablist(X) joinop(Y).    {
   A = X;
   if( ALWAYS(A && A->nSrc>0) ) A->a[A->nSrc-1].jointype = (u8)Y;
}
stl_prefix(A) ::= .                           {A = 0;}
seltablist(A) ::= stl_prefix(X) nm(Y) dbnm(D) as(Z) indexed_opt(I) on_opt(N) using_opt(U). {
A = sqlite3SrcListAppendFromTerm(pParse,X,&Y,&D,&Z,0,N,U);
sqlite3SrcListIndexedBy(pParse, A, &I);
}
对于
select something otherthing
from table1 join table2 on table1.f1==table2.f2
这样的语句，
table1 join 被解析为
stl_prefix(A) ::= seltablist(X) joinop(Y).
部分，而able1 join table2 on table1.f1==table2.f2 则被总体解析为
seltablist(A) ::= stl_prefix(X) nm(Y) dbnm(D) as(Z) indexed_opt(I) on_opt(N) using_opt(U).
在后面一个seltablist对应的动做中，执行了
A = sqlite3SrcListAppendFromTerm(pParse,X,&Y,&D,&Z,0,N,U);
这也就是说，join串联起来的全部table的on中能够用全部的table名称：
sqlite> select * from tsecer as harry join tsecer as har join tsecer as ray on harry.first = ray.second;
sqlite>
这个表达式并不会出现解析错误，虽然自己没有任何意义。在表达式的最后引用了join的第一个和最后一个table的alias。
四、from后的table若是是字符串，必须为物理表或临时表，而不能是alias表
sqlite3LocateTable--->>sqlite3FindTable，这个函数不会查找任何alias表示的表格，其核心代码为
SQLITE_PRIVATE Table *sqlite3FindTable(sqlite3 *db, const char *zName, const char *zDatabase){
Table *p = 0;
int i;
int nName;
assert( zName!=0 );
nName = sqlite3Strlen30(zName);
for(i=OMIT_TEMPDB; i<db->nDb; i++){
    int j = (i<2) ? i^1 : i;   /* Search TEMP before MAIN */
    if( zDatabase!=0 && sqlite3StrICmp(zDatabase, db->aDb[j].zName) ) continue;
    p = sqlite3HashFind(& db->aDb[j].pSchema->tblHash, zName, nName);     if( p ) break; } return p; } 因此下面句子错误，这一点在mysql中一样成立 sqlite> select * from tsecer as harry where harry.first in (select first from harry); Error: no such table: harry sqlite>