Suffix Array
Definition
按字典序排序的后缀的位置。SA[i] = Rank(Suffix(i))
"banana"
sa[0] = 5 a
sa[1] = 3 ana
sa[2] = 1 anana
sa[3] = 0 banana
sa[4] = 4 na
sa[5] = 2 nana
\(O(n \log n)\) Method to find SA
- j-suffix
1-suffix: b, a, n, a, n, a
2-suffix: ba, an, na, an, na, a
4-suffix: bana, anan, nana, ana, na, a
排名:可并列的j-后缀字典序排序数组,记为\(pm^j\)
名次:不可并列,位置靠左的名次在前。
sa[]
就是名次为i的n-后缀的位置。(n为原字符串长度)
-
倍增法
const int maxn = 1000; int wa[maxn], wb[maxn], wv[maxn], ws[maxn]; int sa[maxn]; // n: len(s), m: 127(ASCII unique chars) void buildSA(const char* s, int* sa, int n, int m=127){ int i, j, p, *pm=wa, *k2sa = wb, *t; // k1 radix sort for(i=0; i<m; i++) ws[i] = 0; for(i=0; i<n; i++) ws[pm[i] = s[i]]++; for(i=1; i<m; i++) ws[i] += ws[i-1]; for(i=n-1; i>=0; i--) sa[--ws[pm[i]]] = i; // loops j->2j for(j=p=1; p<n; j<<=1, m=p){ // generate k2sa for(p=0, i=n-j; i<n; i++) k2sa[p++]=i; // null k2 for(i=0; i<n; i++) if(sa[i] >= j) k2sa[p++] = sa[i] - j; // k2 radix sort for(i=0; i<m; i++) ws[i] = 0; for(i=0; i<n; i++) ws[wv[i] = pm[k2sa[i]]]++; for(i=1; i<m; i++) ws[i] += ws[i-1]; for(i=n-1; i>=0; i--) sa[--ws[wv[i]]] = k2sa[i]; // update pm for(t=pm, pm=k2sa, k2sa=t, pm[sa[0]]=0, p=i=1; i<n; i++){ int a = sa[i-1], b=sa[i]; if(k2sa[a] == k2sa[b] && k2sa[a+j] == k2sa[b+j]) pm[sa[i]] = p-1; else pm[sa[i]] = p++; } } // stop when p==n }
RMQ (Sparse Table)
Range Maximum/Minimum Query. RMQ(arr, i, j)
预处理\(O(nlogn)\),查询\(O(1)\),更易写,但局限性是arr
不能修改。
dp[i][j]
: \(max\ of \ arr[i] \sim arr[i+2^j]\)
dp[i][j] = max(dp[i][j-1], dp[i+(1<<(j-1))][j-1])
// maximum RMQ
const int maxn = 1005;
int N;
int arr[maxn];
int st[maxn][32]; // log_2 maxn < 32
void build(){
for(int i=1; i<=N; i++) st[i][0] = arr[i];
int k = log2(N*1.0);
for(int j=1; j<=k; j++){
for(int i=1; i<=N; i++){
if(i+(1<<(j-1))<=N){
st[i][j] = max(st[i][j-1], st[i+(1<<(j-1))][j-1]);
}
}
}
}
int query(int l, int r){
int k = log2(r-l+1.0);
return max(st[l][k], st[r+1-(1<<k)][k]);
}
Balanced Lineup
#include <iostream>
#include <cstdio>
#include <iomanip>
#include <string>
#include <cstring>
#include <queue>
#include <vector>
#include <algorithm>
#include <deque>
#include <map>
using namespace std;
const int maxn = 50005;
int N;
int arr[maxn];
int st[maxn][32], st2[maxn][32];
void build() {
for (int i = 1; i <= N; i++) {
st[i][0] = arr[i];
st2[i][0] = arr[i];
}
int k = log2(N*1.0);
for (int j = 1; j <= k; j++) {
for (int i = 1; i <= N; i++) {
if (i + (1 << (j - 1)) <= N) {
st[i][j] = max(st[i][j - 1], st[i + (1 << (j - 1))][j - 1]);
st2[i][j] = min(st2[i][j - 1], st2[i + (1 << (j - 1))][j - 1]);
}
}
}
}
int querymx(int l, int r) {
int k = log2(r - l + 1.0);
return max(st[l][k], st[r + 1 - (1 << k)][k]);
}
int querymn(int l, int r) {
int k = log2(r - l + 1.0);
return min(st2[l][k], st2[r + 1 - (1 << k)][k]);
}
int Q, a, b;
int main() {
scanf("%d%d", &N, &Q);
for (int i = 1; i <= N; i++) cin >> arr[i];
build();
for (int i = 0; i < Q; i++) {
scanf("%d%d", &a, &b);
printf("%d\n", querymx(a, b) - querymn(a, b));
}
}
最长公共前缀数组
任给两个后缀,O(1)求其最长公共前缀(LCP)的长度。
Rank[i]: 位置i的后缀的名次
LCP(i, j): **名次**为i和j的后缀的LCP
LCPL(i, j): LCP(i, j) 的长度
height[i] = LCPL(i-1, i) : 名次为i和i-1的后缀的LCPL
// SA --> Rank
Rank[SA[i]] = i
// SA --> height
H[i] = LCPL(Rank[i]-1, Rank[i])
= LCPL(Suffix(i), Suffix(SA[Rank[i]-1]))
位置i的后缀X和名次在X前一位的后缀Y的LCPL。
height[i] = H[SA[i]]
height[Rank[i]] = H[i]
// height --> LCPL
LCPL(i, j) = min{height[i+1, ..., j]} // RMQ O(1)
- LCP引理1
\[
\displaylines{
LCPL(i, j) = min\{LCPL(k, k+1)| k=i, ..., j-1\}
}
\]
- LCP引理2
\[
\displaylines{
\forall i \le k \lt j \\
LCPL(k, j) \ge LCPL(i, j)
}
\]
- LCP引理3
-
H定理
i>0 && Rank[i]>0
时:
\[
\displaylines{
H[i] \ge H[i-1] -1
}
\]
int Rank[maxn], height[maxn];
void buildHeight(char* str, int n, int* sa){
int i, j, k;
for(i=0; i<n; i++) Rank[sa[i]]=i;
for(i=k=0; i<n; height[Rank[i++]]=k)
for(k?k--:0, j=sa[Rank[i]-1];
str[i+k] == str[j+k];
k++);
}
// length+1, "abcd\0"
buildSA("abcd", sa, 5, 130);
buildHeight("abcd", 5, sa);
Applications
POJ2774 Long Long Message
求两个字符串的最长公共子串。
- 首先拼接两个字符串,用不曾出现过的字符隔开。
- 则最长公共子串为某两个后缀的最长公共前缀,即
LCPL(i, j)
的最大值。 - 由于引理1,为了找到这个最大值,只需要遍历所有
LCPL(i-1 ,i) = height[i]
找最大的height即可。 - 注意两个后缀要在不同字符串中,因此额外判断
sa[i]>len1, sa[i-1]<len1
或相反。
#define _CRT_SECURE_NO_WARNINGS
#include <cstdio>
#include <cstring>
#include <string>
#include <iostream>
#include <algorithm>
#include <string>
using namespace std;
int N;
const int maxn = 200005;
int wa[maxn], wb[maxn], wv[maxn], Ws[maxn], height[maxn], Rank[maxn]; //辅助数组
int sa[maxn]; //sa[i]是名次为i的后缀的位置,即后缀数组
void buildSA(int * s, int * sa, int n, int m) {
int i, j, p, *pm = wa, *k2sa = wb, *t;
for (i = 0; i < m; i++) Ws[i] = 0;
for (i = 0; i < n; i++) Ws[pm[i] = s[i]]++; //(1)
for (i = 1; i < m; i++) Ws[i] += Ws[i - 1];
for (i = n - 1; i >= 0; i--) sa[--Ws[pm[i]]] = i;
for (j = p = 1; p < n; j <<= 1, m = p) { //烧脑循环
for (p = 0, i = n - j; i < n; i++) k2sa[p++] = i;
for (i = 0; i < n; i++) //按名次从小到大遍历n个j-后缀
if (sa[i] >= j) k2sa[p++] = sa[i] - j;
for (i = 0; i < m; i++) Ws[i] = 0;
for (i = 0; i < n; i++)
Ws[wv[i] = pm[k2sa[i]]]++;
for (i = 1; i < m; i++) Ws[i] += Ws[i - 1];
for (i = n - 1; i >= 0; i--)
sa[--Ws[wv[i]]] = k2sa[i];//求位置为k2sa[i]的2j-后缀的名次
for (t = pm, pm = k2sa, k2sa = t,
pm[sa[0]] = 0, p = i = 1; i < n; i++) {//按名次遍历2j-后缀
int a = sa[i - 1], b = sa[i];
if (k2sa[a] == k2sa[b] && a + j < n && b + j < n &&
k2sa[a + j] == k2sa[b + j])
pm[sa[i]] = p - 1; //未发现新的2j-后缀
else
pm[sa[i]] = p++; //发现新的2j-后缀
} //当p达到n时,说明已经有了n个不同的2j-后缀,并且都在sa里排好了序。
} //烧脑循环结束
return;
}
void BuildHeight(int * str, int n, int * sa, int * Rank) {
int i, j, k;
for (int i = 0; i < n; ++i) //i 是名次,n是字符串长度
Rank[sa[i]] = i;
height[0] = 0;
for (i = k = 0; i < n - 1; height[Rank[i++]] = k)//i是位置
for (k ? k-- : 0, j = sa[Rank[i] - 1]; //Rank[0]>0才不越界
str[i + k] == str[j + k]; k++);
}
char a[maxn], b[maxn];
int s[maxn];
int main() {
while (~scanf("%s%s", a, b)) {
int la = strlen(a), lb = strlen(b), l = 0;
for (int i = 0; i < la; i++) s[l++] = a[i] - 'a' + 1;
s[l++] = 28;
for (int i = 0; i < lb; i++) s[l++] = b[i] - 'a' + 1;
s[l] = 0;
buildSA(s, sa, l+1, 255);
BuildHeight(s, l+1, sa, Rank);
int ans = 0;
for (int i = 1; i <= l; i++) {
if (sa[i - 1]<la && sa[i]>la || sa[i - 1] > la && sa[i] < la)
ans = max(ans, height[i]);
}
cout << ans << endl;
}
}
最长公共子串(多序列)
设所有字符串序列长度之和为\(L\),每个字符串平均长度为\(l\),则算法
- 平均复杂度\(O(L \log L)\);
- 最坏复杂度\(O(l L\log L)\),在所有字符串都只含有同一个字符时达到。
POJ3450 Corporate Identity
#define _CRT_SECURE_NO_WARNINGS
#include <cstdio>
#include <cstring>
#include <iostream>
#include <algorithm>
#include <string>
using namespace std;
const int maxc = 205;
const int maxs = 4005;
const int maxn = 4005 * 205;
int wa[maxn], wb[maxn], wc[maxn], wd[maxn];
int sa[maxn];
// n: strlen(s), m: 128(ASCII unique chars)
void buildSA(int* s, int n, int* sa, int m = 128) {
int i, j, p, *pm = wa, *k2sa = wb, *t;
// k1 radix sort
for (i = 0; i < m; i++) wd[i] = 0;
for (i = 0; i < n; i++) wd[pm[i] = s[i]]++;
for (i = 1; i < m; i++) wd[i] += wd[i - 1];
for (i = n - 1; i >= 0; i--) sa[--wd[pm[i]]] = i;
// loops j->2j
for (j = p = 1; p < n; j <<= 1, m = p) {
// generate k2sa
for (p = 0, i = n - j; i < n; i++) k2sa[p++] = i; // null k2
for (i = 0; i < n; i++) if (sa[i] >= j) k2sa[p++] = sa[i] - j;
// k2 radix sort
for (i = 0; i < m; i++) wd[i] = 0;
for (i = 0; i < n; i++) wd[wc[i] = pm[k2sa[i]]]++;
for (i = 1; i < m; i++) wd[i] += wd[i - 1];
for (i = n - 1; i >= 0; i--) sa[--wd[wc[i]]] = k2sa[i];
// update pm
for (t = pm, pm = k2sa, k2sa = t, pm[sa[0]] = 0, p = i = 1; i < n; i++) {
int a = sa[i - 1], b = sa[i];
if (k2sa[a] == k2sa[b] && k2sa[a + j] == k2sa[b + j])
pm[sa[i]] = p - 1;
else pm[sa[i]] = p++;
}
}
}
int Rank[maxn], height[maxn];
void buildHeight(int* str, int n) {
int i, j, k;
for (i = 0; i < n; i++) Rank[sa[i]] = i;
for (i = k = 0; i < n; height[Rank[i++]] = k)
for (k ? k-- : 0, j = sa[Rank[i] - 1];
str[i + k] == str[j + k];
k++);
}
char ans[maxc];
int vis[maxs];
char a[maxc];
int s[maxn], id[maxn];
int N;
int l = 0, mark = 30, la;
bool check(int m) {
int cnt = 0;
memset(vis, 0, sizeof(vis));
// 遍历所有后缀
for (int i = 1; i <= l; i++) {
// 最长公共子串长度小于m,失败
if (height[i] < m) {
cnt = 0;
memset(vis, 0, sizeof(vis));
continue;
}
if (!vis[id[sa[i-1]]]) {
vis[id[sa[i-1]]] = 1;
cnt++;
}
if (!vis[id[sa[i]]]) {
vis[id[sa[i]]] = 1;
cnt++;
}
if (cnt == N) {
for (int j = 0; j < m; j++) ans[j] = s[sa[i] + j] + 'a' - 1;
ans[m] = '\0';
return true;
}
}
return false;
}
int main() {
while (cin >> N && N) {
l = 0;
mark = 30;
for (int i = 0; i < N; i++) {
cin >> a;
la = strlen(a);
for (int j = 0; j < la; j++) {
id[l] = i; // 标志当前下标是原来的哪一个字符串
s[l++] = a[j] - 'a' + 1;
}
id[l] = mark + i;
s[l++] = mark + i;
}
s[l] = 0;
buildSA(s, l + 1, sa, mark + N);
buildHeight(s, l + 1);
int l = 1, r = la, flag = 0;
while (l < r) {
int m = (l + r) / 2;
//cout << "bs " << l << "-" << m << " " << r << endl;
if (check(m)) {
flag = 1;
l = m + 1;
}
else r = m;
}
if (flag) cout << ans << endl;
else cout << "IDENTITY LOST" << endl;
}
}
Musical Theme POJ
不重叠的最长重复子串。
有一个测试点WA,我也不知道为啥.jpg
#define _CRT_SECURE_NO_WARNINGS
#include <cstdio>
#include <cstring>
#include <iostream>
#include <algorithm>
#include <string>
using namespace std;
const int maxn = 20005;
int N;
int wa[maxn], wb[maxn], wc[maxn], wd[maxn];
int sa[maxn];
// n: strlen(s), m: 128(ASCII unique chars)
void buildSA(int* s, int n, int* sa, int m = 200) {
int i, j, p, *pm = wa, *k2sa = wb, *t;
// k1 radix sort
for (i = 0; i < m; i++) wd[i] = 0;
for (i = 0; i < n; i++) wd[pm[i] = s[i]]++;
for (i = 1; i < m; i++) wd[i] += wd[i - 1];
for (i = n - 1; i >= 0; i--) sa[--wd[pm[i]]] = i;
// loops j->2j
for (j = p = 1; p < n; j <<= 1, m = p) {
// generate k2sa
for (p = 0, i = n - j; i < n; i++) k2sa[p++] = i; // null k2
for (i = 0; i < n; i++) if (sa[i] >= j) k2sa[p++] = sa[i] - j;
// k2 radix sort
for (i = 0; i < m; i++) wd[i] = 0;
for (i = 0; i < n; i++) wd[wc[i] = pm[k2sa[i]]]++;
for (i = 1; i < m; i++) wd[i] += wd[i - 1];
for (i = n - 1; i >= 0; i--) sa[--wd[wc[i]]] = k2sa[i];
// update pm
for (t = pm, pm = k2sa, k2sa = t, pm[sa[0]] = 0, p = i = 1; i < n; i++) {
int a = sa[i - 1], b = sa[i];
if (k2sa[a] == k2sa[b] && k2sa[a + j] == k2sa[b + j])
pm[sa[i]] = p - 1;
else pm[sa[i]] = p++;
}
}
}
int Rank[maxn], height[maxn];
void buildHeight(int* str, int n) {
int i, j, k;
for (i = 0; i < n; i++) Rank[sa[i]] = i;
for (i = k = 0; i < n; height[Rank[i++]] = k)
for (k ? k-- : 0, j = sa[Rank[i] - 1];
str[i + k] == str[j + k];
k++);
}
int s[maxn];
bool check(int m) {
int mn = sa[1];
int mx = sa[1];
for (int i = 2; i < N; i++) {
if (height[i] >= m){
mn = min(mn, sa[i]);
mx = max(mx, sa[i]);
}
else {
if (mx - mn >= m) return true;
else mx = mn = sa[i];
}
}
if (mx - mn >= m) return true;
return false;
}
int main() {
while (cin >> N && N) {
for (int i = 0; i < N; i++) scanf("%d", s + i);
for (int i = 0; i < N - 1; i++) s[i] = s[i + 1] - s[i] + 100;
s[N - 1] = 0;
buildSA(s, N, sa);
buildHeight(s, N);
int l = 4, r = N, ans = -1;
while (l < r) {
int mid = (l + r) / 2;
if (check(mid)) {
ans = mid;
l = mid + 1;
}
else r = mid;
}
ans++;
printf("%d\n", ans < 5 ? 0 : ans);
}
}
Milk Pattern
可重叠的至少k次的最长重复子序列。
二分(转换为判定性问题)+后缀分组法。
#define _CRT_SECURE_NO_WARNINGS
#include <cstdio>
#include <cstring>
#include <string>
#include <iostream>
#include <algorithm>
#include <string>
using namespace std;
int N;
const int MAXN = 20005;
int wa[MAXN], wb[MAXN], wv[MAXN], Ws[MAXN], height[MAXN], Rank[MAXN]; //辅助数组
int sa[MAXN]; //sa[i]是名次为i的后缀的位置,即后缀数组
void buildSA(int * s, int * sa, int n, int m) {
int i, j, p, *pm = wa, *k2sa = wb, *t;
for (i = 0; i < m; i++) Ws[i] = 0;
for (i = 0; i < n; i++) Ws[pm[i] = s[i]]++; //(1)
for (i = 1; i < m; i++) Ws[i] += Ws[i - 1];
for (i = n - 1; i >= 0; i--) sa[--Ws[pm[i]]] = i;
for (j = p = 1; p < n; j <<= 1, m = p) { //烧脑循环
for (p = 0, i = n - j; i < n; i++) k2sa[p++] = i;
for (i = 0; i < n; i++) //按名次从小到大遍历n个j-后缀
if (sa[i] >= j) k2sa[p++] = sa[i] - j;
for (i = 0; i < m; i++) Ws[i] = 0;
for (i = 0; i < n; i++)
Ws[wv[i] = pm[k2sa[i]]]++;
for (i = 1; i < m; i++) Ws[i] += Ws[i - 1];
for (i = n - 1; i >= 0; i--)
sa[--Ws[wv[i]]] = k2sa[i];//求位置为k2sa[i]的2j-后缀的名次
for (t = pm, pm = k2sa, k2sa = t,
pm[sa[0]] = 0, p = i = 1; i < n; i++) {//按名次遍历2j-后缀
int a = sa[i - 1], b = sa[i];
if (k2sa[a] == k2sa[b] && a + j < n && b + j < n &&
k2sa[a + j] == k2sa[b + j])
pm[sa[i]] = p - 1; //未发现新的2j-后缀
else
pm[sa[i]] = p++; //发现新的2j-后缀
} //当p达到n时,说明已经有了n个不同的2j-后缀,并且都在sa里排好了序。
} //烧脑循环结束
return;
}
void BuildHeight(int * str, int n, int * sa, int * Rank) {
int i, j, k;
for (int i = 0; i < n; ++i) //i 是名次,n是字符串长度
Rank[sa[i]] = i;
height[0] = 0;
for (i = k = 0; i < n - 1; height[Rank[i++]] = k)//i是位置
for (k ? k-- : 0, j = sa[Rank[i] - 1]; //Rank[0]>0才不越界
str[i + k] == str[j + k]; k++);
}
int s[MAXN];
int K;
bool check(int m) {
int cnt = 1;
for (int i = 0; i < N; i++) {
if (height[i] >= m) {
cnt++;
if (cnt == K) return true;
}
else cnt = 1;
}
return false;
}
int main() {
cin >> N >> K;
for (int i = 0; i < N; i++) cin >> s[i];
buildSA(s, sa, N, 255);
BuildHeight(s, N, sa, Rank);
int l = 1, r = N;
while (l < r) {
int m = (l + r) / 2;
if (check(m)) l = m + 1;
else r = m;
}
cout << l - 1 << endl;
}