Generalized Suffix Automaton (GSAM) is a powerful structure for handling all suffixes of a trie (or multiple strings) with minimal DFA size. For a trie ( T ), we define "prefixes" (root-to-node strings), "suffixes" (node-to-leaf strrings), and "substrings" (path strings) similarly to single strings. GSAM is the minimal DFA accepting all suffixes of ( T ).
The ( \text{endpos} ) equivalence classes (as in standard Suffix Automaton, SAM) form a tree structure, and the splitting of equivalence classes when addding a new leaf node follows similar rules to SAM (detailed in Liu Yanyu’s 2015 training team paper). The number of nodes in GSAM is bounded by ( O(n) ) (or ( O(\text{trie size}) )), matching the linearity of stendard SAM’s node count.
A key insight: GSAM’s transition edges have an ( O(n|\Sigma|) ) upper bound. This is tight—for example, a trie with a long chain of 'a' transitions followed by branches with distinct characters achieves this bound (though constants are not maximized).
Implementing GSAM is similar to SAM, with two common approaches: BFS (offline, based on trie BFS order) and DFS (online, for dynamic insertion). Below is an offline BFS-based implementation.
const int MAXN = 1e5 + 5; // Adjust as needed
const int CHARSET = 10; // Character set size (e.g., 10 for digits)
int trans[MAXN][CHARSET], length[MAXN], suffixLink[MAXN], size = 1;
int trieTrans[MAXN][CHARSET], triePos[MAXN], trieRoot = 1, trieSize = 1;
int queue[MAXN], tail = 0;
namespace BFS {
int extend(int c, int p) {
int curr = ++size;
length[curr] = length[p] + 1;
while (p && !trans[p][c]) {
trans[p][c] = curr;
p = suffixLink[p];
}
if (p) {
int q = trans[p][c];
if (length[q] == length[p] + 1) {
suffixLink[curr] = q;
} else {
int clone = ++size;
memcpy(trans[clone], trans[q], sizeof(trans[q]));
length[clone] = length[p] + 1;
suffixLink[clone] = suffixLink[q];
suffixLink[q] = clone;
suffixLink[curr] = clone;
while (p && trans[p][c] == q) {
trans[p][c] = clone;
p = suffixLink[p];
}
}
} else {
suffixLink[curr] = 1; // Root's suffix link points to itself
}
return curr;
}
void build() {
queue[tail = 1] = trieRoot; // Initialize queue with trie root
for (int i = 1; i <= tail; ++i) {
int u = queue[i];
for (int c = 0; c < CHARSET; ++c) {
if (trieTrans[u][c]) { // Trie has a child via character c
int trieChild = trieTrans[u][c];
triePos[trieChild] = extend(c, triePos[u]);
queue[++tail] = trieChild;
}
}
}
}
}
The BFS approach mirrors standard SAM but inherits from the trie’s parent. For online insertion (processing strings incrementally), a DFS-based approach works:
const int MAXN = 1e5 + 5;
const int CHARSET = 10;
int trans[MAXN][CHARSET], length[MAXN], suffixLink[MAXN], size = 1;
int trieTrans[MAXN][CHARSET], triePos[MAXN], trieRoot = 1;
namespace DFS {
int extend(int c, int p) {
// Check if the transition already exists (avoids redundant nodes)
if (trans[p][c]) {
int q = trans[p][c];
if (length[q] == length[p] + 1) {
return q; // Reuse existing node with correct length
}
// Split the equivalence class (clone node)
int clone = ++size;
memcpy(trans[clone], trans[q], sizeof(trans[q]));
length[clone] = length[p] + 1;
suffixLink[clone] = suffixLink[q];
suffixLink[q] = clone;
// Update transitions pointing to q
while (p && trans[p][c] == q) {
trans[p][c] = clone;
p = suffixLink[p];
}
return clone;
}
// Create new node if transition doesn't exist
int curr = ++size;
length[curr] = length[p] + 1;
while (p && !trans[p][c]) {
trans[p][c] = curr;
p = suffixLink[p];
}
if (p) {
int q = trans[p][c];
if (length[q] == length[p] + 1) {
suffixLink[curr] = q;
} else {
int clone = ++size;
memcpy(trans[clone], trans[q], sizeof(trans[q]));
length[clone] = length[p] + 1;
suffixLink[clone] = suffixLink[q];
suffixLink[q] = clone;
suffixLink[curr] = clone;
while (p && trans[p][c] == q) {
trans[p][c] = clone;
p = suffixLink[p];
}
}
} else {
suffixLink[curr] = 1;
}
return curr;
}
void build(int u = trieRoot) {
for (int c = 0; c < CHARSET; ++c) {
if (trieTrans[u][c]) {
int trieChild = trieTrans[u][c];
triePos[trieChild] = extend(c, triePos[u]);
build(trieChild); // Recursive DFS on trie
}
}
}
}
The DFS version requires a check: if the transition ( S+x ) (prefix ( S ) + character ( x )) already exists, we avoid creating a new node (which would violate DFA minimality). This is because ( S+x ) might have been inserted earlier (e.g., as a suffix of another string), so reusing the existing equivalence class (or splitting it) is necessary.
BFS avoids this check because trie BFS ensures longer strings are processed after shorter ones—so ( \dots+S+x ) cannot exist before ( S+x ). BFS complexity is linear in trie size (with a character set constant), while DFS (treating each string as a separate insertion) is linear in total string length (via potential analysis on suffix links).