【字符串算法】字典树详解

时间 2020-12-01

标签 php node ios c++ 数据结构搜索引擎 spa code 排序递归栏目 PHP 繁體版

原文原文链接

字典树

　　字典树，又称单词查找树，Trie树，是一种树形结构，是一种哈希树的变种。典型应用是用于统计，排序和保存大量的字符串（但不只限于字符串），因此常常被搜索引擎系统用于文本词频统计。它的优势是：利用字符串的公共前缀来节约存储空间，最大限度地减小无谓的字符串比较，查询效率比哈希表高。
　　字典树与字典很类似：当你要查一个单词是否在字典树中时，首先看单词的第一个字母是否在字典树的第一层，如果不在，说明字典树里没有该单词；如果在，就在该字母的孩子节点里找是否有单词的第二个字母，没有则说明没有该单词，有的话用同样的方法继续查找。字典树不仅可以用来储存字母，也可以储存数字等其它数据。

Trie的数据结构定义：

/* Number of child slots per node, i.e. the alphabet size. */
#define MAX 26

/*
 * One trie node.
 *   next: one child pointer per symbol; NULL when that child does not exist.
 *   v:    per-node payload whose meaning depends on the problem. The template
 *         routines in this file use it as a pass-through counter and store -1
 *         to mark the end of a word.
 *
 * The typedef carries an explicit name and the member uses `struct Trie`, so
 * the declaration is valid in both C and C++.
 */
typedef struct Trie {
    struct Trie* next[MAX];
    int v;  /* adapt as needed */
} Trie;

/*
 * Root of the trie. As a file-scope object it starts out as a null pointer;
 * it must be pointed at an allocated node before the trie is used.
 */
Trie* root;

　　next数组的大小表示每个结点最多有多少种不同的孩子（即字符集的大小）：如果只有小写字母，则26即可；若改为大小写字母，则是52；若再加上数字，则是62，这里根据题意来确定。v可以表示到该结点为止有多少个具有相同前缀的字符串，这里应当根据需要自由变化。

　　Trie的查找（最主要的操作）：

　　(1) 每次从根结点开始一次搜索；
　　(2) 取得要查找关键词的第一个字母，并根据该字母选择对应的子树并转到该子树继续进行检索；
　　(3) 在相应的子树上，取得要查找关键词的第二个字母，并进一步选择对应的子树进行检索；
　　(4) 迭代过程……
　　(5) 在某个结点处，关键词的所有字母已被取出，则读取附在该结点上的信息，即完成查找。

　　这里给出生成字典树和查找的模板：

生成字典树：

// Inserts `str` into the trie rooted at the global `root`.
//
// Each character is mapped to a child slot by its offset from '0', so only
// characters whose offset falls in [0, MAX) can be stored; change the base
// character together with MAX when the alphabet differs.
//
// Node bookkeeping: a newly created node starts with v == 1, an existing node
// on the path has v incremented, and the node where `str` ends is set to
// v == -1 as the end-of-word marker.
//
// Nothing is inserted when `root` or `str` is NULL or when `str` contains a
// character outside the supported range. If an allocation fails midway the
// function returns early and the nodes created so far stay in the trie.
void createTrie(char* str) {
    if (root == NULL || str == NULL)
        return;

    int len = strlen(str);

    // Validate the whole word first so a bad character can neither index
    // outside next[] nor leave a half-inserted word behind.
    for (int i = 0; i < len; ++i) {
        int id = str[i] - '0';
        if (id < 0 || id >= MAX)
            return;
    }

    Trie* p = root;
    for (int i = 0; i < len; ++i) {
        int id = str[i] - '0';
        Trie* child = p->next[id];
        if (child == NULL) {
            child = (Trie*)malloc(sizeof(Trie));
            if (child == NULL)
                return;
            child->v = 1;  // first word passing through this node
            for (int j = 0; j < MAX; ++j)
                child->next[j] = NULL;
            p->next[id] = child;
        } else if (child->v != -1) {
            // Do not bump a node that carries the -1 end marker: incrementing
            // it would turn the marker into 0 and the stored word would no
            // longer be recognised as a prefix by the lookup.
            child->v++;
        }
        p = child;
    }
    p->v = -1;  // mark the end of a stored word
}

查找：

int findTrie(char* str) {
    int len = strlen(str);
    Trie* p = root;
    for (int i = 0; i < len; ++i) {
        int id = str[i] - '0';
        p = p->next[id];
        if (p == NULL)  //若为空集，表示不存以此为前缀的串
            return 0;
        if (p->v == -1)  //字符集中已有串是此串的前缀
            return -1;
    }
    return -1;  //此串是字符集中某串的前缀
}

例题

hdu 1251 统计难题

　　题意：先给出一批单词，再给出若干个查询串，对每个查询串统计有多少个单词以它为前缀。
　　字典树的模板题，先建字典树，然后再查询每一个给定的前缀。

代码如下：

#include <string.h>
#include <iostream>
using namespace std;

// Number of child slots in every trie node.
const int sonsum = 26;
// Character that maps to child slot 0 (slot index is `c - base`).
const int base = 'a';
// Buffer that receives one dictionary line at a time.
char s1[12];
// Buffer that receives one query token at a time.
char ss[12];

// Trie node for the prefix-counting program.
struct Trie {
    int num;                   // how many inserted words pass through this node
    bool flag;                 // true when an inserted word ends at this node
    struct Trie* son[sonsum];  // one child per letter, NULL when absent

    // A node is created by the first word that reaches it, hence num == 1.
    // The children are cleared with explicit pointer assignments; the previous
    // memset(son, NULL, ...) passed a null-pointer constant as the int fill
    // byte, which compilers diagnose and which does not compile at all when
    // NULL is defined as nullptr.
    Trie() : num(1), flag(false) {
        for (int i = 0; i < sonsum; ++i)
            son[i] = NULL;
    }
};

// Returns a heap-allocated, default-constructed trie node. Nothing in this
// function releases the node again.
Trie* NewTrie() {
    return new Trie();
}

// Adds the NUL-terminated word `s` to the trie below `root`.
// Every node already on the word's path gets its `num` incremented; missing
// nodes are created via NewTrie(). The node reached after the last character
// is flagged as the end of a word. Characters are mapped to slots as
// `c - base` with no range check.
void Inset(Trie* root, char* s) {
    Trie* node = root;
    for (; *s != '\0'; ++s) {
        const int slot = *s - base;
        Trie* child = node->son[slot];
        if (child != NULL) {
            ++child->num;
        } else {
            child = NewTrie();
            node->son[slot] = child;
        }
        node = child;
    }
    node->flag = true;
}

// Walks the trie below `root` along the characters of `s`.
// Returns 0 as soon as a required child is missing; otherwise returns the
// `num` field of the node reached after the last character (for an empty
// string that is the root's own `num`).
int search(Trie* root, char* s) {
    Trie* node = root;
    for (const char* c = s; *c != '\0'; ++c) {
        node = node->son[*c - base];
        if (node == NULL)
            return 0;
    }
    return node->num;
}

// Reads dictionary words (one per line, terminated by an empty line) into a
// trie, then answers each following whitespace-separated query with the value
// returned by search().
//
// Both reads are bounded by the size of their buffers: gets() has no length
// limit and is no longer part of the standard library, and an unbounded
// `cin >> char[]` can overrun the array as well.
int main() {
    Trie* root = NewTrie();
    root->num = 0;  // no word passes "through" the root yet

    // getline() stores at most sizeof(s1) - 1 characters. A longer line sets
    // failbit, which ends this loop and the query loop below.
    while (cin.getline(s1, sizeof(s1)) && s1[0] != '\0') {
        Inset(root, s1);
    }

    for (;;) {
        // The field width is reset after every extraction, so set it each
        // time; it caps the token at sizeof(ss) - 1 characters.
        cin.width(sizeof(ss));
        if (!(cin >> ss))
            break;
        int ans = search(root, ss);
        cout << ans << endl;
    }

    return 0;
}

poj 2001 Shortest Prefixes

　　题意：找出能唯一标识一个字符串的最短前缀，如果找不出，就输出该字符串本身。
　　用字典树便可
　　
代码如下：

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
using namespace std;

// Storage for the input words: 1005 rows of 25 bytes each. main() fills one
// row per word and later passes the rows to search() in input order.
char list[1005][25];

// Trie node: a counter plus one child pointer per lowercase letter.
struct node {
    int count;         // starts at 0; incremented by insert() on each visit
    node* childs[26];  // NULL where no child exists

    node() : count(0) {
        for (int slot = 0; slot < 26; ++slot)
            childs[slot] = NULL;
    }
};

// Root of the trie, allocated once during static initialization.
node* root = new node();
// Cursor that insert() and search() move along the trie.
node* current;
// Node most recently allocated by insert().
node* newnode;

// Adds `str` to the trie and increments `count` on every node along its path,
// creating nodes that do not exist yet. Works through the globals `current`
// (left pointing at the last node of the path) and `newnode` (left pointing
// at the last node that had to be created). Characters are mapped to slots as
// `c - 'a'` with no range check.
void insert(char* str) {
    current = root;
    const size_t len = strlen(str);
    for (size_t pos = 0; pos < len; ++pos) {
        const int slot = str[pos] - 'a';
        if (current->childs[slot] == NULL) {
            newnode = new node;
            current->childs[slot] = newnode;
        }
        current = current->childs[slot];
        ++current->count;
    }
}

// Prints "<str> <prefix>" where <prefix> is the shortest prefix of `str` that
// ends on a node with count == 1. When no such node is found, the whole
// string is printed as its own prefix.
//
// The prefix is printed straight from `str` with a precision ("%.*s") instead
// of being copied into a fixed 25-byte buffer. That removes the overflow for
// long strings and the read of an uninitialised buffer for an empty string.
// A missing child (a string that was never inserted) is treated as "no unique
// prefix" instead of being dereferenced.
void search(char* str) {
    const int len = (int)strlen(str);
    current = root;
    for (int i = 0; i < len; i++) {
        current = current->childs[str[i] - 'a'];
        if (current == NULL)
            break;
        if (current->count == 1) {  // only one word passes through here
            printf("%s %.*s\n", str, i + 1, str);
            return;
        }
    }
    printf("%s %s\n", str, str);  // otherwise print the whole string
}

// Reads whitespace-separated words until input ends, inserts each into the
// trie, then prints the shortest identifying prefix of every word in input
// order.
int main() {
    // Number of rows available in `list`.
    const int capacity = (int)(sizeof(list) / sizeof(list[0]));
    int t = 0;

    // "%24s" leaves room for the terminator in each 25-byte row, and the
    // capacity check keeps `t` inside the table; the unbounded "%s" with no
    // row limit could write past either dimension.
    while (t < capacity && scanf("%24s", list[t]) == 1) {
        insert(list[t]);
        t++;
    }

    for (int i = 0; i < t; i++)
        search(list[i]);
    return 0;
}

hdu 4825 Xor Sum

　　题意：先给出一些数字，再询问Q个问题，每个问题给出一个数字，要求在之前给出的数字中找出与它异或结果最大的那个数。
　　构造字典树，高位在前，低位在后，然后从字典树的根向深处逐位查询，每一位优先走与查询数该位相反的分支。

代码如下：

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <functional>
#include <iostream>
#include <map>
#include <queue>
#include <set>
#include <string>
#include <vector>

using namespace std;
// NOTE(review): none of LL, PLI, MX or INF is referenced by the code visible
// in this program; they look like leftovers from a contest template.
typedef long long LL;
typedef pair<LL, int> PLI;

const int MX = 2e5 + 5;
const int INF = 0x3f3f3f3f;

// Binary trie node: Next[b] is the child reached by bit value b, NULL when
// that child does not exist.
struct Node {
    Node* Next[2];

    Node() {
        Next[0] = NULL;
        Next[1] = NULL;
    }
};

// Inserts the 32 bits of `S` into the binary trie below `root`, most
// significant bit first, creating missing nodes on the way.
//
// The bits are extracted from an unsigned copy with a right shift. The
// previous `S & (1 << i)` shifted a signed 1 into the sign bit for i == 31,
// which is not well defined in every language revision.
void trie_add(Node* root, int S) {
    const unsigned int bits = (unsigned int)S;
    Node* p = root;
    for (int i = 31; i >= 0; i--) {
        const int id = (int)((bits >> i) & 1u);
        if (p->Next[id] == NULL) {
            p->Next[id] = new Node();
        }
        p = p->Next[id];
    }
}

// Returns the value stored in the trie below `root` that is reached by
// preferring, at every bit from the most significant down, the branch
// opposite to the corresponding bit of `S`, and falling back to the same-bit
// branch when the opposite one is missing. The returned number is the stored
// value itself, not the XOR result.
//
// The result is assembled in an unsigned accumulator so that setting bit 31
// never shifts into the sign bit of a signed int. If neither branch exists
// (for example an empty trie) the walk stops and the bits collected so far
// are returned, which is 0 for an empty trie, instead of dereferencing NULL.
int trie_query(Node* root, int S) {
    const unsigned int bits = (unsigned int)S;
    unsigned int best = 0;
    Node* p = root;
    for (int i = 31; i >= 0; i--) {
        const int id = (int)((bits >> i) & 1u);
        int pick = id ^ 1;  // the opposite bit maximises the XOR at this position
        if (p->Next[pick] == NULL) {
            pick = id;
            if (p->Next[pick] == NULL)
                break;  // nothing stored below this node
        }
        best |= (unsigned int)pick << i;
        p = p->Next[pick];
    }
    return (int)best;
}

int main() {
    // freopen("input.txt", "r", stdin);
    int T, n, Q, t, ansk = 0;
    scanf("%d", &T);
    while (T--) {
        scanf("%d%d", &n, &Q);
        Node* root = new Node();

        for (int i = 1; i <= n; i++) {
            scanf("%d", &t);
            trie_add(root, t);
        }

        printf("Case #%d:\n", ++ansk);
        while (Q--) {
            scanf("%d", &t);
            printf("%d\n", trie_query(root, t));
        }
    }
    return 0;
}