字符串匹配,KMP了解一下

如何判断一个字符串是否是另外一个字符串的子串?我们的第一反应就是用indexOf、includes或者正则。这虽然没有什么不对,可是还是需要了解一下字符串匹配到底是怎么实现的。在计算机科学中,Knuth-Morris-Pratt字符串查找算法(简称KMP算法)可在一个主文本字符串S内查找一个词W的出现位置。此算法通过运用“这个词在不匹配时本身就包含足够的信息来确定下一个匹配将在哪里开始”这一发现,从而避免重新检查先前匹配过的字符。(尴尬了,大一时学的算法,不用就全忘了┭┮﹏┭┮)

设母串$S = S_0 \dots S_n$,子串$T = T_0 \dots T_m$。在传统的字符串匹配算法中,当母串S与子串T在i和j点失配时,即$S_i \neq T_j$,此时i回溯到$i - j + 1$处($i = i - j + 1$),j回溯到0处($j = 0$)继续匹配。由此可见,这种匹配算法效率不高,复杂度为$O(n \cdot m)$。

而在KMP算法中,当母串S与子串T在i和j点失配时,i不需要回溯,j只需要回溯到某一个特定位置即可,总体复杂度为$O(n + m)$。那j需要回溯到哪呢?我们定义一个next数组,令$next[j] = k$表示当母串S和子串T在i和j点失配时,即$S_i \neq T_j$,j需要回溯到k这个位置($k < j$)。接下来我们来讨论如何确定k点,并求出next数组。

若母串S和子串T在i和j点失配,即$S_i \neq T_j$,此时必有$S_{i - j} \dots S_{i - 1} = T_0 \dots T_{j - 1}$,因此对于任意的$0 \le k < j$,有$S_{i - k} \dots S_{i - 1} = T_{j - k} \dots T_{j - 1}$。若有$S_{i - k} \dots S_{i - 1} = T_0 \dots T_{k - 1}$,则有$T_0 \dots T_{k - 1} = T_{j - k} \dots T_{j - 1}$,即T的前j个字符中长度为k的前缀与后缀相等;取满足该条件的最大的k,令$next[j] = k$。

对于$next[j + 1]$,设$next[j] = k$。若$T_k = T_j$,则必有$T_0 \dots T_{k - 1}T_k = T_{j - k} \dots T_{j - 1}T_j$,因此$next[j + 1] = k + 1 = next[j] + 1$。若$T_k \neq T_j$,显然$T_0 \dots T_{k - 1}T_k \neq T_{j - k} \dots T_{j - 1}T_j$,此时令母串$S' = \dots T_{j - k} \dots T_{j - 1}T_j \dots$,子串$T' = T_0 \dots T_{k - 1}T_k \dots$,这就相当于母串$S'$和子串$T'$在j和k点失配。由next数组的定义,设$next[k] = k'$,即$T_0 \dots T_{k' - 1} = T_{k - k'} \dots T_{k - 1} = T_{j - k'} \dots T_{j - 1}$。此时若有$T_{k'} = T_j$,则$T_0 \dots T_{k' - 1}T_{k'} = T_{j - k'} \dots T_{j - 1}T_j$,因此$next[j + 1] = k' + 1 = next[k] + 1$;若$T_{k'} \neq T_j$,则需要重复上面的步骤,去寻找更小的$k'' = next[k']$使得$T_{k''} = T_j$,此时$next[j + 1] = k'' + 1 = next[k'] + 1$;若$T_{k''} \neq T_j$,则继续寻找$k'''$,直到到达$next[0] = -1$为止,此时$next[j + 1] = 0$。

C++代码实现

#include <cstdio>
#include <cstring>
#include <iostream>
using namespace std;

const int max_size = 100000;
int Next[max_size] = {};

// Builds the KMP failure ("next") table for the NUL-terminated pattern `t`.
//
// On return next[0] == -1 and, for 1 <= j <= strlen(t), next[j] is the length
// of the longest proper prefix of t[0..j-1] that is also a suffix of it, i.e.
// the pattern index to resume from after a mismatch at pattern index j.
//
// `next` must have room for strlen(t) + 1 ints; the extra last entry is what
// kmp_count() uses to continue searching after a full match.
void make_next(char* t, int* next) {
  // Measure the pattern once: strlen() in the loop condition would rescan it
  // on every iteration and compare a signed index against size_t.
  const int len = static_cast<int>(std::strlen(t));
  int j = 0;   // number of pattern characters whose table entry is known
  int k = -1;  // length of the border currently being extended (-1: none)
  next[0] = -1;
  while (j < len) {
    if (k == -1 || t[j] == t[k]) {
      // The border of length k extends by one character: next[j + 1] = k + 1.
      next[++j] = ++k;
    } else {
      // Fall back to the next shorter border and retry t[j] against it.
      k = next[k];
    }
  }
}

// Returns the index of the first occurrence of the NUL-terminated pattern `t`
// in the NUL-terminated text `s`, or -1 if `t` does not occur in `s`.
//
// An empty pattern matches at index 0. `next` is scratch space that is
// overwritten with the failure table of `t`; it must hold strlen(t) + 1 ints.
int kmp(char* s, char* t, int* next) {
	// Measure both strings once instead of calling strlen() on every
	// iteration, which made the search quadratic.
	const int text_len = static_cast<int>(std::strlen(s));
	const int pat_len = static_cast<int>(std::strlen(t));
	make_next(t, next);
	// Without this guard the loop reports 1 (or -1 for an empty text) for an
	// empty pattern.
	if (pat_len == 0) return 0;

	int i = 0, j = 0;
	while (i < text_len) {
		if (j == -1 || s[i] == t[j]) {
			++i;
			++j;
			// j can only reach pat_len right after an advance.
			if (j == pat_len) return i - j;
		} else {
			// Mismatch at text index i: keep i, fall back to next[j].
			j = next[j];
		}
	}
	return -1;
}

// Counts the occurrences of the NUL-terminated pattern `t` in the
// NUL-terminated text `s`. Overlapping occurrences are counted separately
// (e.g. "aa" occurs twice in "aaa").
//
// `next` is scratch space that is overwritten with the failure table of `t`;
// it must hold strlen(t) + 1 ints.
int kmp_count(char* s, char* t, int* next) {
	// Measure both strings once instead of calling strlen() on every
	// iteration, which made the scan quadratic.
	const int text_len = static_cast<int>(std::strlen(s));
	const int pat_len = static_cast<int>(std::strlen(t));
	make_next(t, next);

	int i = 0, j = 0, v = 0;
	while (i < text_len) {
		if (j == -1 || s[i] == t[j]) {
			++i;
			++j;
		} else {
			// Mismatch at text index i: keep i, fall back to next[j].
			j = next[j];
		}
		if (j == pat_len) {
			// Full match: record it and resume from the longest border of the
			// whole pattern so that overlapping matches are still found.
			++v;
			j = next[j];
		}
	}
	return v;
}

// Demo: prints the index of the first occurrence of "aba" in "abcabacabaa",
// then the number of occurrences, one value per line.
int main() {
	// Mutable arrays match the char* parameters without casting away the
	// constness of string literals.
	char s[] = "abcabacabaa";
	char t[] = "aba";

	// Next needs no clearing between calls: make_next() rewrites every entry
	// that the search subsequently reads.
	std::cout << kmp(s, t, Next) << '\n';
	std::cout << kmp_count(s, t, Next) << '\n';
}
复制代码

JavaScript实现

/**
 * Builds the KMP failure table for the pattern `t`.
 *
 * The result has `t.length + 1` entries: entry 0 is -1 and entry j (j >= 1)
 * is the length of the longest proper prefix of the first j characters of
 * `t` that is also a suffix of them.
 *
 * @param {string} t - The pattern.
 * @returns {number[]} The failure table.
 */
function makeNext(t) {
  const next = [-1];
  let border = -1; // length of the border being extended (-1: none)
  for (let pos = 0; pos < t.length; ) {
    if (border !== -1 && t[border] !== t[pos]) {
      // Cannot extend this border; retry with the next shorter one.
      border = next[border];
      continue;
    }
    pos += 1;
    border += 1;
    next.push(border);
  }
  return next;
}

/**
 * Finds the first occurrence of the pattern `t` in the text `s` using KMP.
 *
 * @param {string} s - The text to search in.
 * @param {string} t - The pattern to search for.
 * @returns {number} Index of the first match, or -1 if there is none. An
 *   empty pattern matches at index 0.
 */
function kmp(s, t) {
  // Without this guard the loop reports 1 (or -1 for an empty text) for an
  // empty pattern.
  if (t.length === 0) return 0;

  const next = makeNext(t);
  let i = 0, j = 0;
  while (i < s.length) {
    if (j === -1 || s[i] === t[j]) {
      i++;
      j++;
      // j can only reach t.length right after an advance.
      if (j === t.length) return i - j;
    } else {
      // Mismatch at text index i: keep i, fall back to next[j].
      j = next[j];
    }
  }
  return -1;
}

/**
 * Counts the occurrences of the pattern `t` in the text `s` using KMP.
 * Overlapping occurrences are counted separately.
 *
 * @param {string} s - The text to search in.
 * @param {string} t - The pattern to count.
 * @returns {number} The number of matches found.
 */
function kmpCount(s, t) {
  const next = makeNext(t);
  const patternLength = t.length;
  let textPos = 0;
  let patPos = 0;
  let matches = 0;

  while (textPos < s.length) {
    if (patPos !== -1 && s[textPos] !== t[patPos]) {
      // Mismatch: keep the text position, fall back in the pattern.
      patPos = next[patPos];
    } else {
      patPos += 1;
      textPos += 1;
    }

    if (patPos === patternLength) {
      // Full match: resume from the longest border of the whole pattern.
      matches += 1;
      patPos = next[patPos];
    }
  }
  return matches;
}

// Demo: log the index of the first "ba" in the sample text, then the number
// of times "ba" occurs in it.
const s = "ababababacadababa";
const t = "ba";
[kmp(s, t), kmpCount(s, t)].forEach((result) => console.log(result));
复制代码
相关文章
相关标签/搜索