📜  使用后缀数组对字符串的不同子字符串进行计数

📅  最后修改于: 2021-04-17 13:46:27             🧑  作者: Mango

给定一个长度为 n、仅由小写字母组成的字符串,我们需要计算该字符串中不同子字符串的总数。
例子:

Input  : str = “ababa”
Output : 10
Total number of distinct substring are 10, which are,
"", "a", "b", "ab", "ba", "aba", "bab", "abab", "baba"
and "ababa"

我们在下面的文章中讨论了基于Suffix Trie的解决方案:
使用后缀Trie计数字符串的不同子字符串

我们可以使用后缀数组和最长公共前缀概念解决此问题。后缀数组是给定字符串的所有后缀的排序数组。
对于字符串“ababa”,其后缀为:“ababa”、“baba”、“aba”、“ba”、“a”。将这些后缀按字典序排序后,得到的后缀数组为 [4, 2, 0, 3, 1]。
然后我们使用 Kasai 算法计算 LCP 数组。对于字符串“ababa”,LCP 数组为 [1, 3, 0, 2, 0]。

在构造完这两个数组之后,我们利用以下事实来计算不同子字符串的总数:字符串每个后缀的所有前缀合在一起,恰好覆盖了该字符串的所有子字符串。
我们将解释以上示例的过程,

String  = “ababa”
Suffixes in sorted order : “a”, “aba”, “ababa”,
                            “ba”, “baba”
Initializing distinct substring count by length
of first suffix, 
Count = length(“a”) = 1        
Substrings taken in consideration : “a”

Now we consider each consecutive pair of suffix, 
lcp("a", "aba") = "a".
All characters that are not part of the longest 
common prefix contribute to a distinct substring. 
In the above case, they are 'b' and 'a'. So they 
should be added to Count.
Count += length(“aba”) - lcp(“a”, “aba”) 
Count  = 3    
Substrings taken in consideration : “aba”, “ab”

Similarly for next pair also,
Count += length(“ababa”) - lcp(“aba”, “ababa”)
Count = 5
Substrings taken in consideration : “ababa”, “abab”

Count += length(“ba”) - lcp(“ababa”, “ba”)
Count = 7
Substrings taken in consideration : “ba”, “b”

Count += length(“baba”) - lcp(“ba”, “baba”)
Count = 9
Substrings taken in consideration : “baba”, “bab”

We finally add 1 for empty string.
count = 10

上面的想法在下面的代码中实现。

// C++ code to count total distinct substrings
// of a string
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
using namespace std;
  
// One record per suffix of the input text, used while the
// suffix array is being built.
struct suffix
{
    // Start position of this suffix in the original string
    int index;

    // Sort key: rank[0] is the rank of this suffix and rank[1]
    // is the "next" rank it is paired with
    int rank[2];
};
  
// Ordering predicate handed to sort(). Compares the
// (rank[0], rank[1]) pairs of two suffixes and yields 1 when
// the pair of 'a' is strictly smaller than the pair of 'b',
// and 0 otherwise.
int cmp(struct suffix a, struct suffix b)
{
    // Leading ranks differ: they alone decide the order
    if (a.rank[0] != b.rank[0])
        return a.rank[0] < b.rank[0];

    // Tie on the leading rank: the following rank decides
    return a.rank[1] < b.rank[1];
}
  
// This is the main function that takes a string
// 'txt' of size n as an argument, builds and return
// the suffix array for the given string
vector buildSuffixArray(string txt, int n)
{
    // A structure to store suffixes and their indexes
    struct suffix suffixes[n];
  
    // Store suffixes and their indexes in an array
    // of structures. The structure is needed to sort
    // the suffixes alphabatically and maintain their
    // old indexes while sorting
    for (int i = 0; i < n; i++)
    {
        suffixes[i].index = i;
        suffixes[i].rank[0] = txt[i] - 'a';
        suffixes[i].rank[1] = ((i+1) < n)?
                              (txt[i + 1] - 'a'): -1;
    }
  
    // Sort the suffixes using the comparison function
    // defined above.
    sort(suffixes, suffixes+n, cmp);
  
    // At his point, all suffixes are sorted according
    // to first 2 characters.  Let us sort suffixes
    // according to first 4 characters, then first
    // 8 and so on
    int ind[n];  // This array is needed to get the
                 // index in suffixes[] from original
                 // index. This mapping is needed to get
                 // next suffix.
    for (int k = 4; k < 2*n; k = k*2)
    {
        // Assigning rank and index values to first suffix
        int rank = 0;
        int prev_rank = suffixes[0].rank[0];
        suffixes[0].rank[0] = rank;
        ind[suffixes[0].index] = 0;
  
        // Assigning rank to suffixes
        for (int i = 1; i < n; i++)
        {
            // If first rank and next ranks are same as
            // that of previous suffix in array, assign
            // the same new rank to this suffix
            if (suffixes[i].rank[0] == prev_rank &&
               suffixes[i].rank[1] == suffixes[i-1].rank[1])
            {
                prev_rank = suffixes[i].rank[0];
                suffixes[i].rank[0] = rank;
            }
  
            else // Otherwise increment rank and assign
            {
                prev_rank = suffixes[i].rank[0];
                suffixes[i].rank[0] = ++rank;
            }
            ind[suffixes[i].index] = i;
        }
  
        // Assign next rank to every suffix
        for (int i = 0; i < n; i++)
        {
            int nextindex = suffixes[i].index + k/2;
            suffixes[i].rank[1] = (nextindex < n)?
                      suffixes[ind[nextindex]].rank[0]: -1;
        }
  
        // Sort the suffixes according to first k characters
        sort(suffixes, suffixes+n, cmp);
    }
  
    // Store indexes of all sorted suffixes in the suffix
    // array
    vectorsuffixArr;
    for (int i = 0; i < n; i++)
        suffixArr.push_back(suffixes[i].index);
  
    // Return the suffix array
    return  suffixArr;
}
  
/* Builds the LCP (longest common prefix) array with Kasai's algorithm.
 *
 *   txt       : the text whose suffixes were sorted
 *   suffixArr : suffix array of 'txt'; it must be a permutation of
 *               0..n-1 where n == suffixArr.size(), and 'txt' must
 *               hold at least n characters
 *
 * Returns a vector 'lcp' of size n in which lcp[i] is the length of
 * the longest common prefix of the suffixes at positions i and i+1
 * of the suffix array. The last entry has no successor and is 0.
 */
std::vector<int> kasai(const std::string &txt, const std::vector<int> &suffixArr)
{
    const int n = static_cast<int>(suffixArr.size());

    // To store LCP array
    std::vector<int> lcp(n, 0);

    // Inverse of the suffix array: if suffixArr[0] is 5 then
    // invSuff[5] is 0. It gives the sorted position of the suffix
    // starting at a given text index.
    std::vector<int> invSuff(n, 0);
    for (int i = 0; i < n; i++)
        invSuff[suffixArr[i]] = i;

    // Length of the match carried over from the previous suffix
    int k = 0;

    // Process the suffixes in text order (not sorted order), so the
    // match length found for suffix i can be reused for suffix i+1
    for (int i = 0; i < n; i++)
    {
        // The last suffix in sorted order has no successor to
        // compare with; its LCP stays 0 and the carry is reset
        if (invSuff[i] == n - 1)
        {
            k = 0;
            continue;
        }

        // j is the start of the suffix that follows suffix i in
        // sorted order
        const int j = suffixArr[invSuff[i] + 1];

        // Start matching at offset k: at least k characters are
        // already known to match
        while (i + k < n && j + k < n && txt[i + k] == txt[j + k])
            k++;

        lcp[invSuff[i]] = k;

        // Moving on to suffix i+1 drops the first character, so the
        // guaranteed match shrinks by one
        if (k > 0)
            k--;
    }

    // return the constructed lcp array
    return lcp;
}
  
//  method to return count of total distinct substring
int countDistinctSubstring(string txt)
{
    int n = txt.length();
    //  calculating suffix array and lcp array
    vector suffixArr = buildSuffixArray(txt, n);
    vector lcp = kasai(txt, suffixArr);
  
    // n - suffixArr[i] will be the length of suffix
    // at ith position in suffix array initializing
    // count with length of first suffix of sorted
    // suffixes
    int result = n - suffixArr[0];
  
    for (int i = 1; i < lcp.size(); i++)
  
        //  subtract lcp from the length of suffix
        result += (n - suffixArr[i]) - lcp[i - 1];
  
    result++;  // For empty string
    return result;
}
  
//  Entry point: counts the distinct substrings of a sample string
//  and writes the count to standard output
int main()
{
    const std::string sample("ababa");
    const int total = countDistinctSubstring(sample);
    std::cout << total;
    return 0;
}

输出:

10