后缀树小结+详细解释的代码

2012-09-27

后缀树总结+详细解释的代码几年前曾实现过一个菜鸟版的SuffixTree。最近要用到后缀树处理些问题，认真实现了

后缀树总结+详细解释的代码

几年前曾实现过一个菜鸟版的SuffixTree。最近要用到后缀树处理些问题，认真实现了一个，主要是基于UKKonen的On-Line算法。稍微总结下。

网上关于后缀树介绍的文章有几篇写的挺好的，我就不再费力去做重复工作了。这个只是我的个人总结帖，所以定位是给看了后缀树的简介，知道什么后缀树，然后看了UKKonen的加速文章，有点迷迷糊糊的同学的一个总结帖。

首先国内后缀树介绍有几个博客写的蛮详细的：

http://www.cnblogs.com/snowberg/archive/2011/10/21/2468588.html

http://blog.csdn.net/v_july_v/article/details/6897097

正宗的Paper应该是Ukkonen的下面这一篇paper。

（1） E. Ukkonen, On-Line Construction ofSuffix Trees, Algorithmica, 14 (1995), 249-260

但是我看了，对我来说真心有点难懂啊，然后Gusfield后来写了不知道书还是Paper的下面一个文章，讲的就通俗易懂多了，想学习后缀树OnLine算法的话，强烈推荐看下面的Guesfield的文章。

（2） Gusfield, Dan (1999) [1997].Algorithms on Strings, Trees and Sequences: Computer Science and ComputationalBiology. USA: Cambridge University Press.

上面两篇Paper都是英文，想看的同学Google下即可。

大部分的同学一看后缀树都明白是什么回事了，但是一看到UKKonen的算法，三个加速大段大段的描述后，就晕掉了。

我尝试忽略证明，简单并不严谨地总结下UKKonen算法中（没看过Guesfield或相关文章的同学，我只能表示对不住了），关键的三个加速：

（1） SuffixLink ：各种理论证明起来有点小复杂，但是道理用处说白了很简单。因为当我们将s[i+1] 加到子串s[j-1…i]后，下一步我们就想将s[i+1]加到s[j….i]后面。正常来说我们就从根节点遍历s[j….i]呗，但是这个花时间啊，所以我们为什么不从s[j-1…..i]直接就跳到s[j….i]呢，而不要每次都从根节点遍历下来。所以所谓的SuffixLink，对s[j-1....i]来说，就是s[j…..i]的地址。

（2）在Ukkonen算法中，叶节点总是叶节点（这个加速认真看下Gusfield的文章一看就懂，这里只是总结，就不深入去讲），所以每次遍历只需从最后一个叶节点开始。

（3）但发现s[j…i+1]已经在后缀树中，那么s[j+1….i+1]这些后缀肯定也在后缀树中了，所以就不需要再遍历。

Ukkoen的后缀树我觉得最难得还是代码实现。网上代码比较少，特来分享下。我这个肯定不是最快的，不过应该是后缀树注释最多之一的一份代码了，而且代码结构和Guesfield那文章的整体描述比较接近。然后为了方便入门，这个只实现了加速1，慢慢一个个的来。大家有兴趣的，稍微修改下加速2和加速3就来了。然后有错误，也麻烦大家指正，我做了不少测试了，结果都正确，但是暂时不敢100%包票。

头文件：

#pragma once#include "SuffixTree.h"#include <iostream>using namespace std;SuffixNode* pNodeNoSuffixLink=NULL;//=====================================Class Definitions==============================/*  Trace the substring(TreePath strPath) in one single edge going out of pNode.  Input:  int* edgeCharsFound : how many characters we find matched in the outgoing edge of pNode.*/SuffixNode* TraceSingleEdge(SuffixTree* pTree,SuffixNode* pNode,TreePath strPath,int* edgeCharsFound,int* edgePos,bool* searchDone,bool skipFlag){//Find outgoing edge of pNode with our first character.SuffixNode* nextNode = Find_Son(pTree,pNode,pTree->m_czTreeStr[strPath.m_iBegin]);*searchDone = true;if(nextNode == NULL){//There is no match in pNode's sons,so we can only return to pNode.*edgePos = GetNodeLabelLength(pTree,pNode); *edgeCharsFound = 0;return pNode;}int edgeLen = GetNodeLabelLength(pTree,nextNode);int strLen = strPath.m_iEnd - strPath.m_iBegin + 1;if(skipFlag == true)//Use the trick1 : skip{if(edgeLen < strLen){*searchDone = false;*edgeCharsFound = edgeLen;*edgePos = edgeLen - 1;}else if(edgeLen == strLen){*edgeCharsFound = edgeLen;*edgePos = edgeLen - 1;}else{*edgeCharsFound = strLen;*edgePos = strLen - 1;}return nextNode;}else//No skip,match each char one after another{*edgePos = 0;*edgeCharsFound = 0;//Find out the min lengthif(strLen < edgeLen)edgeLen = strLen;for(*edgeCharsFound=1,*edgePos=1;(*edgePos)<edgeLen ;(*edgePos)++,(*edgeCharsFound)++){if( pTree->m_czTreeStr[ nextNode->m_iEdgeStart + *edgePos ] != pTree->m_czTreeStr[strPath.m_iBegin + *edgePos ]){(*edgePos)--;return nextNode;}}}//When it comes here, (*edgePos) is one more;(*edgePos)--;if(*edgeCharsFound < strLen){*searchDone = false;}return nextNode;}/*  Trace the sub string(TreePath str) from the node(SuffixNode* pNode).  Input:  int* edgePos :For output , where the last char is found at that edge  int* charsFound : How many chars of str have been found.  bool skipFlag : Use skip trick or not.  */SuffixNode* TraceString(SuffixTree* pTree,SuffixNode* pNode,TreePath str,int* edgePos,int* charsFound,bool skipFlag){bool searchDone=false;*charsFound = 0;*edgePos=0 ;int edgeCharsFound=0;while(searchDone==false){edgeCharsFound = 0;*edgePos=0;pNode = TraceSingleEdge(pTree,pNode,str,&edgeCharsFound,edgePos,&searchDone,skipFlag);str.m_iBegin += edgeCharsFound;*charsFound += edgeCharsFound;}if(*charsFound == 0)return NULL;return pNode;}/*  Input:   (1) pNode : the node who is going to add a new son or whose edge is going to be split.   (2) edgeLabelBeg :  when newleafFlag==true,it's the edge begin label of the new leaf. when when newleafFlag==false, it's the edge begin label of the new new leaf( the leaf of s[i+1], not s[i]).   (3) like above : just the end   (4 )int edgePos : where split is done to pNode if newLeafFlag==false (the 0th position or 1th position or...)*/SuffixNode* ApplyExtensionRule2(SuffixNode* pNode,int edgeLabelBeg,int edgeLabelEnd,int edgePos,bool newLeafFlag){if(newLeafFlag==true){//Add an new leafSuffixNode* newLeaf = CreateTreeNode(pNode,edgeLabelBeg,edgeLabelEnd);return newLeaf;}else{//Add an new internal node and an new leaf//First create the new internal node.SuffixNode* nInternalNode = CreateTreeNode(pNode->m_pFarther,pNode->m_iEdgeStart,pNode->m_iEdgeStart + edgePos);//Remove pNode from its farther's sonsfor(vector<SuffixNode*>::iterator pNodeIter=pNode->m_pFarther->m_pSons.begin();pNodeIter!=pNode->m_pFarther->m_pSons.end();pNodeIter++){if( pNode == *pNodeIter ){pNode->m_pFarther->m_pSons.erase(pNodeIter);break;}}//Adjust pNode's information.pNode->m_iEdgeStart += (edgePos + 1);pNode->m_pFarther = nInternalNode;nInternalNode->m_pSons.push_back(pNode);//Create the new leaf for s[i+1]SuffixNode* nLeafNode = CreateTreeNode(nInternalNode,edgeLabelBeg,edgeLabelEnd);return nInternalNode;}}bool IsTheLastCharInEdge(SuffixTree* pTree, SuffixNode* pNode, int edge_pos){if( edge_pos == GetNodeLabelLength(pTree,pNode) - 1 )return true;return false;}int GetNodeLabelEnd(SuffixTree* pTree,SuffixNode* pNode){//if(pNode->m_pSons.size() == NULL)//{//return pTree->m_iE;//}return pNode->m_iEdgeEnd;}int GetNodeLabelLength(SuffixTree* pTree, SuffixNode* pNode){int length = GetNodeLabelEnd(pTree,pNode) - pNode->m_iEdgeStart + 1;return length;}SuffixNode* CreateTreeNode(SuffixNode* pFarther,int iedgeStart,int iedgeEnd){SuffixNode* pNode=new SuffixNode();pNode->m_iEdgeStart = iedgeStart;pNode->m_iEdgeEnd = iedgeEnd;pNode->m_pFarther = pFarther;pNode->m_pSuffixLink = NULL;if(pFarther!=NULL)pFarther->m_pSons.push_back(pNode);return pNode;}//Find the son node which starts with the chSuffixNode* Find_Son(SuffixTree* pTree,SuffixNode* pFarNode, char ch){for(vector<SuffixNode*>::iterator nodeIter=pFarNode->m_pSons.begin();nodeIter!=pFarNode->m_pSons.end();nodeIter++){if(pTree->m_czTreeStr[(*nodeIter) -> m_iEdgeStart] == ch ){return *nodeIter;}}return NULL;}/*   FollowSuffixLink :   Follows the suffix link of the source node according to Ukkonen's rules(jump from s[j-1...i] to s[j....i]).    Input : The tree, and node. The node is the last internal node we visited.   Output: The destination node that represents the longest suffix of node's            path. Example: if node represents the path "abcde" then it returns            the node that represents "bcde".*/void FollowSuffixLink(SuffixTree* pTree,TreePos * pPos,TreePath strji){if(strji.m_iEnd < strji.m_iBegin)//Empty string,then we return to root.{pPos->m_iEdgePos=0;pPos->m_pNode = pTree->m_pRoot;return;}/*gama : the string(r in Gusfield's paper) between node and its father.If the node doesn't have suffix link , we need to go up to its farther*/TreePath gama;if(pPos->m_pNode == pTree->m_pRoot){int charsFound=0;pPos->m_pNode = TraceString(pTree,pTree->m_pRoot,strji,&pPos->m_iEdgePos,&charsFound,false);if(pPos->m_pNode == NULL){pPos->m_iEdgePos = 0;pPos->m_pNode =pTree->m_pRoot;if(strji.m_iBegin != strji.m_iEnd){cout<<"There is s[j-1..i](not empty) doesn't exist!"<<endl;return;}}if(strji.m_iEnd != strji.m_iBegin && charsFound != strji.m_iEnd - strji.m_iBegin + 1){cout<<"s[j...i] doesn't exit from root:["<<strji.m_iBegin<<","<<strji.m_iEnd<<"]"<<endl;return;}return;}// No suffix link,walk up at most one step(if it is not the root).if( pPos->m_pNode->m_pSuffixLink == NULL ) {if(pPos->m_pNode->m_pFarther == pTree->m_pRoot){//its farther is the rootpPos->m_pNode = pTree->m_pRoot;int charsFound=0;pPos->m_pNode = TraceString(pTree,pTree->m_pRoot,strji,&pPos->m_iEdgePos,&charsFound,false);if(pPos->m_pNode == NULL){pPos->m_iEdgePos = 0;pPos->m_pNode =pTree->m_pRoot;if(strji.m_iBegin != strji.m_iEnd){cout<<"There is s[j-1..i](not empty) doesn't exist!"<<endl;return;}}if(strji.m_iEnd != strji.m_iBegin &&  charsFound != strji.m_iEnd - strji.m_iBegin + 1){cout<<"s[j...i] doesn't exit from root:["<<strji.m_iBegin<<","<<strji.m_iEnd<<"]"<<endl;return;}return;}else{// Find the gamma (the substring between pPos's parent's and pPos's)gama.m_iBegin = pPos->m_pNode->m_iEdgeStart;gama.m_iEnd = pPos->m_pNode->m_iEdgeStart + pPos->m_iEdgePos;// the end of s[j..i] //Follow farther's suffix linkpPos->m_pNode = pPos->m_pNode->m_pFarther->m_pSuffixLink;//Down-walk gamma (until we found s[i],the character we add last extension)int charsFound=0;pPos->m_pNode = TraceString(pTree,pPos->m_pNode,gama,&pPos->m_iEdgePos,&charsFound,true);////////////////////////////////////////////////}}else{//A suffix link exists - just follow it.pPos->m_pNode = pPos->m_pNode->m_pSuffixLink;pPos->m_iEdgePos = GetNodeLabelLength(pTree,pPos->m_pNode) - 1; //The last char of pPos's suffix link represents s[i] (the character we add last extension).}return;}/*For Debug:See if the sub string (from root to pPos) equals pTree->string[subPath.m_iBegin,subPath.m_iEnd]*/bool TestPosSubStringEqualPath(SuffixTree* pTree,TreePos *pPos, TreePath subPath){if(pTree->m_pRoot == pPos->m_pNode && subPath.m_iBegin == subPath.m_iEnd){return true;}int strRevIndex = subPath.m_iEnd;SuffixNode* tmpNode = pPos->m_pNode;int edgeRevIndex = tmpNode->m_iEdgeStart + pPos->m_iEdgePos;while(tmpNode != pTree->m_pRoot){while( edgeRevIndex >= tmpNode->m_iEdgeStart && strRevIndex >= subPath.m_iBegin){if( pTree->m_czTreeStr[edgeRevIndex] != pTree->m_czTreeStr[strRevIndex] ){return false;}edgeRevIndex--;strRevIndex--;}tmpNode = tmpNode->m_pFarther;edgeRevIndex = tmpNode->m_iEdgeEnd;}if(strRevIndex != subPath.m_iBegin-1)return false;return true;}/*Input:(1) SuffixTree* pTree : The suffix tree(2) TreePos* pPos : The last internal node we visited , then we are going to jump to its suffix link in this extension.(3) TreePath extendStrPath : The suffix (s[j...i+1]) we are goint to add to the tree.*/void SingleCharExtesion(SuffixTree* pTree,TreePos* pPos ,TreePath extendStrPath,int* firstExtend){TreePath sji;sji.m_iBegin = extendStrPath.m_iBegin;sji.m_iEnd = extendStrPath.m_iEnd - 1;if(*firstExtend != -1){//Ready to jump from suffix link at or above s[j-1...i] that either has a suffix link (to s[j-1...i]) or is the root.FollowSuffixLink(pTree,pPos,sji);}*firstExtend = 1;//////////////////////////////////////////For Debug//////////////////////////////////////////////////if(sji.m_iEnd >= sji.m_iBegin){if(TestPosSubStringEqualPath(pTree,pPos, sji) == false){cout<<"FollowSuffixLink doesn't go to the right s[j..i]:"<<extendStrPath.m_iBegin<<":"<<extendStrPath.m_iEnd-1<<endl;}}///////////////////////////////////////////////////////////////////////////////////////////////////////int chars_found=0;//Now we are going to found out which rule to use for extension,rule1?rule2?rule3?//First test rule3.{/*We only need to extend the last character(s[i+1]) since we use suffix link to jump from s[j-1..i] to s[j..i],and extendStrPath.m_iEnd is s[i+1].*/chars_found = 0;/*If the last character(s[i]) is the last of its edge,try to find s[i+1] in the next edge.*/if(IsTheLastCharInEdge(pTree,pPos->m_pNode,pPos->m_iEdgePos)){SuffixNode* pTmp = Find_Son(pTree,pPos->m_pNode,pTree->m_czTreeStr[extendStrPath.m_iEnd]);if(pTmp != 0){  //s[i+1] exits already.chars_found = 1;}}//Else see if can find extendStrPath.m_iEnd in the current edgeelse{if( pTree->m_czTreeStr[ pPos->m_pNode->m_iEdgeStart + pPos->m_iEdgePos + 1]    == pTree->m_czTreeStr[extendStrPath.m_iEnd])//Notice that " + 1 " means the next char of s[j...i] : yes, s[i+1]{//s[i+1] exits already.chars_found = 1;}}}//If s[i+1] was found - rule 3 appliesif(chars_found == 1){ /* If there is an internal node that has no suffix link yet (only one may          exist) - create a suffix link from it to the father - node of the          current position in the tree*/if(pNodeNoSuffixLink != NULL){if(pPos->m_pNode->m_pSons.size() != 0){pNodeNoSuffixLink->m_pSuffixLink = pPos->m_pNode;pNodeNoSuffixLink=NULL;}}//if(pPos->m_pNode->m_pSons.size()==0)//*ruleApplied = 1;//else//*ruleApplied = 3;return;}/*Since rule3 doesn't fit ( that s[j...i+1] is not in the tree),we are going to see rule2 and rule1.*/ /* If last char s[j...i] found is the last char of an edge - create an new leaf,apply rule2(add a new leaf) or rule1 */if(IsTheLastCharInEdge(pTree,pPos->m_pNode,pPos->m_iEdgePos) || pPos->m_pNode==pTree->m_pRoot){if(pPos->m_pNode->m_pSons.size() != 0){//Internal node or root,apply rule2 that add a new leafApplyExtensionRule2(pPos->m_pNode, extendStrPath.m_iEnd, extendStrPath.m_iEnd, 0, true);//Suffix Linkif(pNodeNoSuffixLink != NULL){pNodeNoSuffixLink->m_pSuffixLink = pPos->m_pNode;pNodeNoSuffixLink = NULL;}/**ruleApplied = 2;*/}//else it's a leaf, We do nothing.else{pPos->m_pNode->m_iEdgeEnd++;/**ruleApplied = 1;*/}}//Else apply rule2 that adds an new intern nodeelse{SuffixNode* nInternalNode = ApplyExtensionRule2(pPos->m_pNode,extendStrPath.m_iEnd,extendStrPath.m_iEnd,pPos->m_iEdgePos,false);if(pNodeNoSuffixLink != NULL){pNodeNoSuffixLink->m_pSuffixLink = nInternalNode;pNodeNoSuffixLink = NULL;}//See the new internal node's suffix link.if( GetNodeLabelLength(pTree,nInternalNode)==1 && nInternalNode->m_pFarther == pTree->m_pRoot){nInternalNode->m_pSuffixLink = pTree->m_pRoot;pNodeNoSuffixLink = NULL;}else{pNodeNoSuffixLink = nInternalNode;}//Adjust the node for the next extensionpPos->m_pNode = nInternalNode;//*ruleApplied = 2;}}/*  Add s[0....i+1],s[1...i+1].... to the suffix tree  Input:  SuffixNode* pNode: When we only use trick 1,pNode is the pointer to the longest leaf,s[0........i].*/void SinglePhaseExtend(SuffixTree* pTree,TreePos pPos,int phase){int iExtension=0;//pTree->m_iE = phase-1;int ruleApplied=-1;while(iExtension <= phase ){TreePath extendPath;extendPath.m_iBegin=iExtension;extendPath.m_iEnd=phase;SingleCharExtesion(pTree,&pPos,extendPath,&ruleApplied);iExtension++;}return;}SuffixNode* CreateFirstCharacter(SuffixTree* pTree){SuffixNode* firstLeaf = CreateTreeNode(pTree->m_pRoot,0,0);return firstLeaf;}SuffixTree* CreateSuffixTree(string tStr){SuffixTree* psTree=new SuffixTree();psTree->m_czTreeStr = tStr+"$";psTree->m_pRoot = CreateTreeNode(NULL,0,0);//Add the first char into it.SuffixNode* firstLeaf = CreateFirstCharacter(psTree);TreePos* firstLeafPos = new TreePos(0,firstLeaf);for(int phase = 1 ; phase<psTree->m_czTreeStr.length() ; phase++){firstLeafPos->m_iEdgePos = firstLeafPos->m_pNode->m_iEdgeEnd - firstLeafPos->m_pNode->m_iEdgeStart; //start from s[j..i]SinglePhaseExtend(psTree,*firstLeafPos,phase);}return psTree;}bool FindSubString(SuffixTree* pTree,string subStr){SuffixNode* node = Find_Son(pTree,pTree->m_pRoot,subStr[0]);if(node == NULL){return false;}int startIndex = node->m_iEdgeStart;int strIndex=0;int edgeIndex;while(node != NULL){edgeIndex = node->m_iEdgeStart;int edgeLabelEnd = node->m_iEdgeEnd;//GetNodeLabelEnd(pTree,node);while(strIndex < subStr.length() && edgeIndex <= edgeLabelEnd && pTree->m_czTreeStr[edgeIndex] == subStr[strIndex]){strIndex++;edgeIndex++;}if(strIndex == subStr.length()){//we found itreturn true;}else if(edgeIndex > node->m_iEdgeEnd){node = Find_Son(pTree,node,subStr[strIndex]);}else{return false;}}return false;}

热点排行

编程

后缀树小结+详细解释的代码