Skip to content

Suffix Array

Definition

按字典序排序的后缀的位置。SA[i] = Rank(Suffix(i))

"banana"
sa[0] = 5   a
sa[1] = 3   ana
sa[2] = 1   anana
sa[3] = 0   banana
sa[4] = 4   na
sa[5] = 2   nana

\(O(n \log n)\) Method to find SA

  • j-suffix
1-suffix: b, a, n, a, n, a
2-suffix: ba, an, na, an, na, a
4-suffix: bana, anan, nana, ana, na, a

​ 排名:可并列的j-后缀字典序排序数组,记为\(pm^j\)

​ 名次:不可并列,位置靠左的名次在前。

sa[]就是名次为i的n-后缀的位置。(n为原字符串长度)

  • 倍增法

    const int maxn =  1000;
    int wa[maxn], wb[maxn], wv[maxn], ws[maxn];
    int sa[maxn];
    // n: len(s), m: 127(ASCII unique chars)
    void buildSA(const char* s, int* sa, int n, int m=127){
        int i, j, p, *pm=wa, *k2sa = wb, *t;
        // k1 radix sort
        for(i=0; i<m; i++) ws[i] = 0; 
        for(i=0; i<n; i++) ws[pm[i] = s[i]]++;
        for(i=1; i<m; i++) ws[i] += ws[i-1];
        for(i=n-1; i>=0; i--) sa[--ws[pm[i]]] = i;
        // loops j->2j
        for(j=p=1; p<n; j<<=1, m=p){
            // generate k2sa
            for(p=0, i=n-j; i<n; i++) k2sa[p++]=i; // null k2
            for(i=0; i<n; i++) if(sa[i] >= j) k2sa[p++] = sa[i] - j;
            // k2 radix sort
            for(i=0; i<m; i++) ws[i] = 0;
            for(i=0; i<n; i++) ws[wv[i] = pm[k2sa[i]]]++;
            for(i=1; i<m; i++) ws[i] += ws[i-1];
            for(i=n-1; i>=0; i--) sa[--ws[wv[i]]] = k2sa[i];
            // update pm
            for(t=pm, pm=k2sa, k2sa=t, pm[sa[0]]=0, p=i=1; i<n; i++){
                int a = sa[i-1], b=sa[i];
                if(k2sa[a] == k2sa[b] && k2sa[a+j] == k2sa[b+j])
                    pm[sa[i]] = p-1;
                else pm[sa[i]] = p++;
            }
        } // stop when p==n
    }
    

RMQ (Sparse Table)

Range Maximum/Minimum Query. RMQ(arr, i, j)

预处理\(O(nlogn)\),查询\(O(1)\),更易写,但局限性是arr不能修改。

dp[i][j] : \(max\ of \ arr[i] \sim arr[i+2^j]\)

dp[i][j] = max(dp[i][j-1], dp[i+(1<<(j-1))][j-1])

// maximum RMQ
const int maxn = 1005;
int N;
int arr[maxn];
int st[maxn][32]; // log_2 maxn < 32

void build(){
    for(int i=1; i<=N; i++) st[i][0] = arr[i];
    int k = log2(N*1.0);
    for(int j=1; j<=k; j++){
        for(int i=1; i<=N; i++){
            if(i+(1<<(j-1))<=N){
                st[i][j] = max(st[i][j-1], st[i+(1<<(j-1))][j-1]);       
            }
        }
    }
}

int query(int l, int r){
    int k = log2(r-l+1.0);
    return max(st[l][k], st[r+1-(1<<k)][k]);
}
Balanced Lineup
#include <iostream>    
#include <cstdio>
#include <iomanip>
#include <string>
#include <cstring>
#include <queue>   
#include <vector>  
#include <algorithm>   
#include <deque>   
#include <map>

using namespace std;

const int maxn = 50005;
int N;
int arr[maxn];
int st[maxn][32], st2[maxn][32];

void build() {
    for (int i = 1; i <= N; i++) {
        st[i][0] = arr[i];
        st2[i][0] = arr[i];
    }
    int k = log2(N*1.0);
    for (int j = 1; j <= k; j++) {
        for (int i = 1; i <= N; i++) {
            if (i + (1 << (j - 1)) <= N) {
                st[i][j] = max(st[i][j - 1], st[i + (1 << (j - 1))][j - 1]);
                st2[i][j] = min(st2[i][j - 1], st2[i + (1 << (j - 1))][j - 1]);
            }
        }
    }
}

int querymx(int l, int r) {
    int k = log2(r - l + 1.0);
    return max(st[l][k], st[r + 1 - (1 << k)][k]);
}

int querymn(int l, int r) {
    int k = log2(r - l + 1.0);
    return min(st2[l][k], st2[r + 1 - (1 << k)][k]);
}

int Q, a, b;

int main() {
    scanf("%d%d", &N, &Q);
    for (int i = 1; i <= N; i++) cin >> arr[i];
    build();
    for (int i = 0; i < Q; i++) {
        scanf("%d%d", &a, &b);
        printf("%d\n", querymx(a, b) - querymn(a, b));
    }
}

最长公共前缀数组

任给两个后缀,O(1)求其最长公共前缀(LCP)的长度。

Rank[i]: 位置i的后缀的名次
LCP(i, j): **名次**为i和j的后缀的LCP
LCPL(i, j): LCP(i, j) 的长度
height[i] = LCPL(i-1, i) : 名次为i和i-1的后缀的LCPL

// SA --> Rank
Rank[SA[i]] = i

// SA --> height
H[i] = LCPL(Rank[i]-1, Rank[i])
     = LCPL(Suffix(i), Suffix(SA[Rank[i]-1]))
     位置i的后缀X和名次在X前一位的后缀Y的LCPL。

height[i] = H[SA[i]]
height[Rank[i]] = H[i]


// height --> LCPL
LCPL(i, j) = min{height[i+1, ..., j]} // RMQ O(1)
  • LCP引理1
\[ \displaylines{ LCPL(i, j) = min\{LCPL(k, k+1)| k=i, ..., j-1\} } \]
  • LCP引理2
\[ \displaylines{ \forall i \le k \lt j \\ LCPL(k, j) \ge LCPL(i, j) } \]
  • LCP引理3

1544426987775

  • H定理

    i>0 && Rank[i]>0时:

\[ \displaylines{ H[i] \ge H[i-1] -1 } \]
int Rank[maxn], height[maxn];
void buildHeight(char* str, int n, int* sa){
    int i, j, k;
    for(i=0; i<n; i++) Rank[sa[i]]=i;
    for(i=k=0; i<n; height[Rank[i++]]=k)
        for(k?k--:0, j=sa[Rank[i]-1];
           str[i+k] == str[j+k];
           k++);
}

// length+1, "abcd\0"
buildSA("abcd", sa, 5, 130);
buildHeight("abcd", 5, sa);

Applications

POJ2774 Long Long Message

求两个字符串的最长公共子串。

  • 首先拼接两个字符串,用不曾出现过的字符隔开。
  • 则最长公共子串为某两个后缀的最长公共前缀,即LCPL(i, j)的最大值。
  • 由于引理1,为了找到这个最大值,只需要遍历所有LCPL(i-1 ,i) = height[i]找最大的height即可。
  • 注意两个后缀要在不同字符串中,因此额外判断sa[i]>len1, sa[i-1]<len1或相反。
#define _CRT_SECURE_NO_WARNINGS
#include <cstdio>
#include <cstring>
#include <string>
#include <iostream>
#include <algorithm>
#include <string>

using namespace std;

int N;

const int maxn = 200005;
int wa[maxn], wb[maxn], wv[maxn], Ws[maxn], height[maxn], Rank[maxn]; //辅助数组 
int sa[maxn]; //sa[i]是名次为i的后缀的位置,即后缀数组

void buildSA(int * s, int * sa, int n, int m) {
    int i, j, p, *pm = wa, *k2sa = wb, *t;
    for (i = 0; i < m; i++) Ws[i] = 0;
    for (i = 0; i < n; i++) Ws[pm[i] = s[i]]++; //(1)
    for (i = 1; i < m; i++) Ws[i] += Ws[i - 1];
    for (i = n - 1; i >= 0; i--) sa[--Ws[pm[i]]] = i;
    for (j = p = 1; p < n; j <<= 1, m = p) {  //烧脑循环
        for (p = 0, i = n - j; i < n; i++) k2sa[p++] = i;
        for (i = 0; i < n; i++) //按名次从小到大遍历n个j-后缀
            if (sa[i] >= j) k2sa[p++] = sa[i] - j;
        for (i = 0; i < m; i++) Ws[i] = 0;
        for (i = 0; i < n; i++)
            Ws[wv[i] = pm[k2sa[i]]]++;
        for (i = 1; i < m; i++) Ws[i] += Ws[i - 1];
        for (i = n - 1; i >= 0; i--)
            sa[--Ws[wv[i]]] = k2sa[i];//求位置为k2sa[i]的2j-后缀的名次
        for (t = pm, pm = k2sa, k2sa = t,
            pm[sa[0]] = 0, p = i = 1; i < n; i++) {//按名次遍历2j-后缀
            int a = sa[i - 1], b = sa[i];
            if (k2sa[a] == k2sa[b] && a + j < n && b + j < n &&
                k2sa[a + j] == k2sa[b + j])
                pm[sa[i]] = p - 1; //未发现新的2j-后缀
            else
                pm[sa[i]] = p++; //发现新的2j-后缀
        } //当p达到n时,说明已经有了n个不同的2j-后缀,并且都在sa里排好了序。
    } //烧脑循环结束
    return;
}

void BuildHeight(int * str, int n, int * sa, int * Rank) {
    int i, j, k;
    for (int i = 0; i < n; ++i) //i 是名次,n是字符串长度 
        Rank[sa[i]] = i;
    height[0] = 0;
    for (i = k = 0; i < n - 1; height[Rank[i++]] = k)//i是位置
        for (k ? k-- : 0, j = sa[Rank[i] - 1]; //Rank[0]>0才不越界 
            str[i + k] == str[j + k]; k++);
}

char a[maxn], b[maxn];
int s[maxn];

int main() {
    while (~scanf("%s%s", a, b)) {
        int la = strlen(a), lb = strlen(b), l = 0;
        for (int i = 0; i < la; i++) s[l++] = a[i] - 'a' + 1;
        s[l++] = 28;
        for (int i = 0; i < lb; i++) s[l++] = b[i] - 'a' + 1;
        s[l] = 0;
        buildSA(s, sa, l+1, 255);
        BuildHeight(s, l+1, sa, Rank);
        int ans = 0;
        for (int i = 1; i <= l; i++) {
            if (sa[i - 1]<la && sa[i]>la || sa[i - 1] > la && sa[i] < la)
                ans = max(ans, height[i]);
        }
        cout << ans << endl;
    }
}
最长公共子串(多序列)

ref

设所有字符串序列长度之和为\(L\),每个字符串平均长度为\(l\),则算法

  • 平均复杂度\(O(L \log L)\)
  • 最坏复杂度\(O(l L\log L)\),在所有字符串都只含有同一个字符时达到。
POJ3450 Corporate Identity
#define _CRT_SECURE_NO_WARNINGS
#include <cstdio>
#include <cstring>
#include <iostream>
#include <algorithm>
#include <string>

using namespace std;

const int maxc = 205;
const int maxs = 4005;
const int maxn = 4005 * 205;


int wa[maxn], wb[maxn], wc[maxn], wd[maxn];
int sa[maxn];

// n: strlen(s), m: 128(ASCII unique chars)
void buildSA(int* s, int n, int* sa, int m = 128) {
    int i, j, p, *pm = wa, *k2sa = wb, *t;
    // k1 radix sort
    for (i = 0; i < m; i++) wd[i] = 0;
    for (i = 0; i < n; i++) wd[pm[i] = s[i]]++;
    for (i = 1; i < m; i++) wd[i] += wd[i - 1];
    for (i = n - 1; i >= 0; i--) sa[--wd[pm[i]]] = i;
    // loops j->2j
    for (j = p = 1; p < n; j <<= 1, m = p) {
        // generate k2sa
        for (p = 0, i = n - j; i < n; i++) k2sa[p++] = i; // null k2
        for (i = 0; i < n; i++) if (sa[i] >= j) k2sa[p++] = sa[i] - j;
        // k2 radix sort
        for (i = 0; i < m; i++) wd[i] = 0;
        for (i = 0; i < n; i++) wd[wc[i] = pm[k2sa[i]]]++;
        for (i = 1; i < m; i++) wd[i] += wd[i - 1];
        for (i = n - 1; i >= 0; i--) sa[--wd[wc[i]]] = k2sa[i];
        // update pm
        for (t = pm, pm = k2sa, k2sa = t, pm[sa[0]] = 0, p = i = 1; i < n; i++) {
            int a = sa[i - 1], b = sa[i];
            if (k2sa[a] == k2sa[b] && k2sa[a + j] == k2sa[b + j])
                pm[sa[i]] = p - 1;
            else pm[sa[i]] = p++;
        }
    }
}

int Rank[maxn], height[maxn];
void buildHeight(int* str, int n) {
    int i, j, k;
    for (i = 0; i < n; i++) Rank[sa[i]] = i;
    for (i = k = 0; i < n; height[Rank[i++]] = k)
        for (k ? k-- : 0, j = sa[Rank[i] - 1];
            str[i + k] == str[j + k];
            k++);
}

char ans[maxc];
int vis[maxs];

char a[maxc];
int s[maxn], id[maxn];

int N;
int l = 0, mark = 30, la;

bool check(int m) {
    int cnt = 0;
    memset(vis, 0, sizeof(vis));
    // 遍历所有后缀
    for (int i = 1; i <= l; i++) {
        // 最长公共子串长度小于m,失败
        if (height[i] < m) {
            cnt = 0;
            memset(vis, 0, sizeof(vis));
            continue;
        }
        if (!vis[id[sa[i-1]]]) {
            vis[id[sa[i-1]]] = 1;
            cnt++;
        }
        if (!vis[id[sa[i]]]) {
            vis[id[sa[i]]] = 1;
            cnt++;
        }
        if (cnt == N) {
            for (int j = 0; j < m; j++) ans[j] = s[sa[i] + j] + 'a' - 1;
            ans[m] = '\0';
            return true;
        }
    }
    return false;
}

int main() {
    while (cin >> N && N) {
        l = 0;
        mark = 30;
        for (int i = 0; i < N; i++) {
            cin >> a;
            la = strlen(a);
            for (int j = 0; j < la; j++) {
                id[l] = i; // 标志当前下标是原来的哪一个字符串
                s[l++] = a[j] - 'a' + 1;
            }
            id[l] = mark + i;
            s[l++] = mark + i;
        }
        s[l] = 0;
        buildSA(s, l + 1, sa, mark + N);
        buildHeight(s, l + 1);
        int l = 1, r = la, flag = 0;
        while (l < r) {
            int m = (l + r) / 2;
            //cout << "bs " << l << "-" << m << " " << r << endl;
            if (check(m)) {
                flag = 1;
                l = m + 1;
            }
            else r = m;
        }
        if (flag) cout << ans << endl;
        else cout << "IDENTITY LOST" << endl;
    }
}
Musical Theme POJ

不重叠的最长重复子串。

有一个测试点WA,我也不知道为啥.jpg

#define _CRT_SECURE_NO_WARNINGS
#include <cstdio>
#include <cstring>
#include <iostream>
#include <algorithm>
#include <string>

using namespace std;

const int maxn = 20005;
int N;
int wa[maxn], wb[maxn], wc[maxn], wd[maxn];
int sa[maxn];

// n: strlen(s), m: 128(ASCII unique chars)
void buildSA(int* s, int n, int* sa, int m = 200) {
    int i, j, p, *pm = wa, *k2sa = wb, *t;
    // k1 radix sort
    for (i = 0; i < m; i++) wd[i] = 0;
    for (i = 0; i < n; i++) wd[pm[i] = s[i]]++;
    for (i = 1; i < m; i++) wd[i] += wd[i - 1];
    for (i = n - 1; i >= 0; i--) sa[--wd[pm[i]]] = i;
    // loops j->2j
    for (j = p = 1; p < n; j <<= 1, m = p) {
        // generate k2sa
        for (p = 0, i = n - j; i < n; i++) k2sa[p++] = i; // null k2
        for (i = 0; i < n; i++) if (sa[i] >= j) k2sa[p++] = sa[i] - j;
        // k2 radix sort
        for (i = 0; i < m; i++) wd[i] = 0;
        for (i = 0; i < n; i++) wd[wc[i] = pm[k2sa[i]]]++;
        for (i = 1; i < m; i++) wd[i] += wd[i - 1];
        for (i = n - 1; i >= 0; i--) sa[--wd[wc[i]]] = k2sa[i];
        // update pm
        for (t = pm, pm = k2sa, k2sa = t, pm[sa[0]] = 0, p = i = 1; i < n; i++) {
            int a = sa[i - 1], b = sa[i];
            if (k2sa[a] == k2sa[b] && k2sa[a + j] == k2sa[b + j])
                pm[sa[i]] = p - 1;
            else pm[sa[i]] = p++;
        }
    }
}

int Rank[maxn], height[maxn];
void buildHeight(int* str, int n) {
    int i, j, k;
    for (i = 0; i < n; i++) Rank[sa[i]] = i;
    for (i = k = 0; i < n; height[Rank[i++]] = k)
        for (k ? k-- : 0, j = sa[Rank[i] - 1];
            str[i + k] == str[j + k];
            k++);
}

int s[maxn];

bool check(int m) {
    int mn = sa[1];
    int mx = sa[1];
    for (int i = 2; i < N; i++) {
        if (height[i] >= m){
            mn = min(mn, sa[i]);
            mx = max(mx, sa[i]);
        }
        else {
            if (mx - mn >= m) return true;
            else mx = mn = sa[i];
        }
    }
    if (mx - mn >= m) return true;
    return false;
}

int main() {
    while (cin >> N && N) {
        for (int i = 0; i < N; i++) scanf("%d", s + i);
        for (int i = 0; i < N - 1; i++) s[i] = s[i + 1] - s[i] + 100;
        s[N - 1] = 0;
        buildSA(s, N, sa);
        buildHeight(s, N);
        int l = 4, r = N, ans = -1;
        while (l < r) {
            int mid = (l + r) / 2;
            if (check(mid)) {
                ans = mid;
                l = mid + 1;
            }
            else r = mid;
        }
        ans++;
        printf("%d\n", ans < 5 ? 0 : ans);
    }
}
Milk Pattern

可重叠的至少k次的最长重复子序列。

二分(转换为判定性问题)+后缀分组法。

#define _CRT_SECURE_NO_WARNINGS
#include <cstdio>
#include <cstring>
#include <string>
#include <iostream>
#include <algorithm>
#include <string>

using namespace std;

int N;

const int MAXN = 20005;
int wa[MAXN], wb[MAXN], wv[MAXN], Ws[MAXN], height[MAXN], Rank[MAXN]; //辅助数组 
int sa[MAXN]; //sa[i]是名次为i的后缀的位置,即后缀数组

void buildSA(int * s, int * sa, int n, int m) {
    int i, j, p, *pm = wa, *k2sa = wb, *t;
    for (i = 0; i < m; i++) Ws[i] = 0;
    for (i = 0; i < n; i++) Ws[pm[i] = s[i]]++; //(1)
    for (i = 1; i < m; i++) Ws[i] += Ws[i - 1];
    for (i = n - 1; i >= 0; i--) sa[--Ws[pm[i]]] = i;
    for (j = p = 1; p < n; j <<= 1, m = p) {  //烧脑循环
        for (p = 0, i = n - j; i < n; i++) k2sa[p++] = i;
        for (i = 0; i < n; i++) //按名次从小到大遍历n个j-后缀
            if (sa[i] >= j) k2sa[p++] = sa[i] - j;
        for (i = 0; i < m; i++) Ws[i] = 0;
        for (i = 0; i < n; i++)
            Ws[wv[i] = pm[k2sa[i]]]++;
        for (i = 1; i < m; i++) Ws[i] += Ws[i - 1];
        for (i = n - 1; i >= 0; i--)
            sa[--Ws[wv[i]]] = k2sa[i];//求位置为k2sa[i]的2j-后缀的名次
        for (t = pm, pm = k2sa, k2sa = t,
            pm[sa[0]] = 0, p = i = 1; i < n; i++) {//按名次遍历2j-后缀
            int a = sa[i - 1], b = sa[i];
            if (k2sa[a] == k2sa[b] && a + j < n && b + j < n &&
                k2sa[a + j] == k2sa[b + j])
                pm[sa[i]] = p - 1; //未发现新的2j-后缀
            else
                pm[sa[i]] = p++; //发现新的2j-后缀
        } //当p达到n时,说明已经有了n个不同的2j-后缀,并且都在sa里排好了序。
    } //烧脑循环结束
    return;
}

void BuildHeight(int * str, int n, int * sa, int * Rank) {
    int i, j, k;
    for (int i = 0; i < n; ++i) //i 是名次,n是字符串长度 
        Rank[sa[i]] = i;
    height[0] = 0;
    for (i = k = 0; i < n - 1; height[Rank[i++]] = k)//i是位置
        for (k ? k-- : 0, j = sa[Rank[i] - 1]; //Rank[0]>0才不越界 
            str[i + k] == str[j + k]; k++);
}

int s[MAXN];
int K;

bool check(int m) {
    int cnt = 1;
    for (int i = 0; i < N; i++) {
        if (height[i] >= m) {
            cnt++;
            if (cnt == K) return true;
        }
        else cnt = 1;
    }
    return false;
}

int main() {
    cin >> N >> K;
    for (int i = 0; i < N; i++) cin >> s[i];
    buildSA(s, sa, N, 255);
    BuildHeight(s, N, sa, Rank);
    int l = 1, r = N;
    while (l < r) {
        int m = (l + r) / 2;
        if (check(m)) l = m + 1;
        else r = m;
    }
    cout << l - 1 << endl;
}