Ukkonen’s suffix tree construction: Difference between revisions

Content added Content deleted

Inline

Revision as of 16:00, 10 July 2021

Suffix Trees are very useful in numerous string processing and computational biology problems.

The task is to create a function which implements Ukkonen’s algorithm to create a useful Suffix Tree as described:

Part 1
Part 2
Part 3
Part 4
Part 5
Part 6

Using Arithmetic-geometric mean/Calculate Pi generate the first 1000, 10000, and 100000 decimal places of pi. Using your implementation with an alphabet of 0 through 9 (plus $ say to make the tree explicit) find the longest repeated string in each list. Time your results and demonstrate that your implementation is linear (i.e. that 10000 takes approx. 10 times as long as 1000). You may vary the size of the lists of decimal places of pi to give reasonable answers.

Go

This is a translation of the C code here which is an extended form of the code in Part 6 of the task description for finding the longest repeated substring of a given string. In the interests of brevity, the extensive comments in the C version have been largely omitted. The C code doesn't compile as it stands but I have added a fix in the Talk Page.

For convenience I have included the code from the Arithmetic-geometric_mean/Calculate_Pi#Go task in the same package.

It takes around 25 seconds on my machine (Celeron @1.6GHz) to calculate the first 100,000 (or so) decimal places of Pi. Having done that, the timings for extracting the longest repeated sequence of digits are quick and fairly linear as expected.

As the task doesn't say whether overlapping sequences are to be counted, I've assumed that they are as this is what the algorithm naturally produces. <lang go>package main

import (

   "fmt"
   "math/big"
   "time"

)

var maxChar = 128

type Node struct {

   children    []*Node
   suffixLink  *Node
   start       int
   end         *int
   suffixIndex int

}

var (

   text                 string
   root                 *Node
   lastNewNode          *Node
   activeNode           *Node
   activeEdge           = -1
   activeLength         = 0
   remainingSuffixCount = 0
   leafEnd              = -1
   rootEnd              *int
   splitEnd             *int
   size                 = -1

)

func newNode(start int, end *int) *Node {

   node := new(Node)
   node.children = make([]*Node, maxChar)
   node.suffixLink = root
   node.start = start
   node.end = end
   node.suffixIndex = -1
   return node

}

func edgeLength(n *Node) int {

   if n == root {
       return 0
   }
   return *(n.end) - n.start + 1

}

func walkDown(currNode *Node) bool {

   if activeLength >= edgeLength(currNode) {
       activeEdge += edgeLength(currNode)
       activeLength -= edgeLength(currNode)
       activeNode = currNode
       return true
   }
   return false

}

func extendSuffixTree(pos int) {

   leafEnd = pos
   remainingSuffixCount++
   lastNewNode = nil
   for remainingSuffixCount > 0 {
       if activeLength == 0 {
           activeEdge = pos
       }
       if activeNode.children[text[activeEdge]] == nil {
           activeNode.children[text[activeEdge]] = newNode(pos, &leafEnd)
           if lastNewNode != nil {
               lastNewNode.suffixLink = activeNode
               lastNewNode = nil
           }
       } else {
           next := activeNode.children[text[activeEdge]]
           if walkDown(next) {
               continue
           }
           if text[next.start+activeLength] == text[pos] {
               if lastNewNode != nil && activeNode != root {
                   lastNewNode.suffixLink = activeNode
                   lastNewNode = nil
               }
               activeLength++
               break
           }
           temp := next.start + activeLength - 1
           splitEnd = &temp
           split := newNode(next.start, splitEnd)
           activeNode.children[text[activeEdge]] = split
           split.children[text[pos]] = newNode(pos, &leafEnd)
           next.start += activeLength
           split.children[text[next.start]] = next
           if lastNewNode != nil {
               lastNewNode.suffixLink = split
           }
           lastNewNode = split
       }
       remainingSuffixCount--
       if activeNode == root && activeLength > 0 {
           activeLength--
           activeEdge = pos - remainingSuffixCount + 1
       } else if activeNode != root {
           activeNode = activeNode.suffixLink
       }
   }

}

func setSuffixIndexByDFS(n *Node, labelHeight int) {

   if n == nil {
       return
   }
   if n.start != -1 {
       // Uncomment line below to print suffix tree
       // fmt.Print(text[n.start: *(n.end) +1])
   }
   leaf := 1
   for i := 0; i < maxChar; i++ {
       if n.children[i] != nil {
           // Uncomment the 3 lines below to print suffix index
           //if leaf == 1 && n.start != -1 {
           //    fmt.Printf(" [%d]\n", n.suffixIndex)
           //}
           leaf = 0
           setSuffixIndexByDFS(n.children[i], labelHeight+edgeLength(n.children[i]))
       }
   }
   if leaf == 1 {
       n.suffixIndex = size - labelHeight
       // Uncomment line below to print suffix index
       //fmt.Printf(" [%d]\n", n.suffixIndex)
   }

}

func buildSuffixTree() {

   size = len(text)
   temp := -1
   rootEnd = &temp
   root = newNode(-1, rootEnd)
   activeNode = root
   for i := 0; i < size; i++ {
       extendSuffixTree(i)
   }
   labelHeight := 0
   setSuffixIndexByDFS(root, labelHeight)

}

func doTraversal(n *Node, labelHeight int, maxHeight, substringStartIndex *int) {

   if n == nil {
       return
   }
   if n.suffixIndex == -1 {
       for i := 0; i < maxChar; i++ {
           if n.children[i] != nil {
               doTraversal(n.children[i], labelHeight+edgeLength(n.children[i]),
                   maxHeight, substringStartIndex)
           }
       }
   } else if n.suffixIndex > -1 && (*maxHeight < labelHeight-edgeLength(n)) {
       *maxHeight = labelHeight - edgeLength(n)
       *substringStartIndex = n.suffixIndex
   }

}

func getLongestRepeatedSubstring(s string) {

   maxHeight := 0
   substringStartIndex := 0
   doTraversal(root, 0, &maxHeight, &substringStartIndex)
   // Uncomment line below to print maxHeight and substringStartIndex
   // fmt.Printf("maxHeight %d, substringStartIndex %d\n", maxHeight, substringStartIndex)
   if s == "" {
       fmt.Printf("  %s is: ", text)
   } else {
       fmt.Printf("  %s is: ", s)
   }
   k := 0
   for ; k < maxHeight; k++ {
       fmt.Printf("%c", text[k+substringStartIndex])
   }
   if k == 0 {
       fmt.Print("No repeated substring")
   }
   fmt.Println()

}

func calculatePi() *big.Float {

   one := big.NewFloat(1)
   two := big.NewFloat(2)
   four := big.NewFloat(4)
   prec := uint(325 * 1024) // enough to calculate Pi to 100,182 decimal digits

   a := big.NewFloat(1).SetPrec(prec)
   g := new(big.Float).SetPrec(prec)

   // temporary variables
   t := new(big.Float).SetPrec(prec)
   u := new(big.Float).SetPrec(prec)

   g.Quo(a, t.Sqrt(two))
   sum := new(big.Float)
   pow := big.NewFloat(2)

   for a.Cmp(g) != 0 {
       t.Add(a, g)
       t.Quo(t, two)
       g.Sqrt(u.Mul(a, g))
       a.Set(t)
       pow.Mul(pow, two)
       t.Sub(t.Mul(a, a), u.Mul(g, g))
       sum.Add(sum, t.Mul(t, pow))
   }

   t.Mul(a, a)
   t.Mul(t, four)
   pi := t.Quo(t, u.Sub(one, sum))
   return pi

}

func main() {

   tests := []string{
       "GEEKSFORGEEKS$",
       "AAAAAAAAAA$",
       "ABCDEFG$",
       "ABABABA$",
       "ATCGATCGA$",
       "banana$",
       "abcpqrabpqpq$",
       "pqrpqpqabab$",
   }
   fmt.Println("Longest Repeated Substring in:\n")
   for _, test := range tests {
       text = test
       buildSuffixTree()
       getLongestRepeatedSubstring("")
   }
   fmt.Println()

   pi := calculatePi()
   piStr := fmt.Sprintf("%v", pi)
   piStr = piStr[2:] // remove initial 3.
   numbers := []int{1e3, 1e4, 1e5}
   maxChar = 58
   for _, number := range numbers {
       start := time.Now()
       text = piStr[0:number] + "$"
       buildSuffixTree()
       getLongestRepeatedSubstring(fmt.Sprintf("first %d d.p. of Pi", number))
       elapsed := time.Now().Sub(start)
       fmt.Printf("  (this took %s)\n\n", elapsed)
   }

}</lang>

Output:

Sample run:

Longest Repeated Substring in:

  GEEKSFORGEEKS$ is: GEEKS
  AAAAAAAAAA$ is: AAAAAAAAA
  ABCDEFG$ is: No repeated substring
  ABABABA$ is: ABABA
  ATCGATCGA$ is: ATCGA
  banana$ is: ana
  abcpqrabpqpq$ is: ab
  pqrpqpqabab$ is: ab

  first 1000 d.p. of Pi is: 23846
  (this took 7.728858ms)

  first 10000 d.p. of Pi is: 7111369
  (this took 57.524478ms)

  first 100000 d.p. of Pi is: 041021944
  (this took 599.770281ms)

Julia

Translation of: Go

Uses array indices instead of the Go version's node pointers. <lang julia>const oo = typemax(Int)

"""The suffix-tree's node.""" mutable struct Node

   children::Dict{Char, Int}
   start::Int
   ending::Int
   suffixlink::Int
   suffixindex::Int

end

Node() = Node(Dict(), 0, oo, 0, -1) Node(start, ending) = Node(Dict(), start, ending, 0, -1)

""" Ukkonen Suffix-Tree """ mutable struct SuffixTree

   nodes::Vector{Node}
   text::Vector{Char}
   root::Int
   position::Int
   currentnode::Int
   needsuffixlink::Int
   remainder::Int
   activenode::Int
   activelength::Int
   activeedge::Int

end

edgelength(st, n::Node) = min(n.ending, st.position + 1) - n.start

function newnode(st, start, ending)

   st.currentnode += 1
   st.nodes[st.currentnode] = Node(start, ending)
   return st.currentnode

end

function SuffixTree(str::String)

   nodes = [Node() for _ in 1:length(str) * 2]
   st = SuffixTree(nodes, [c for c in str], 1, 0, 0, 0, 0, 1, 1, 1)
   st.root = newnode(st, 0, 0)
   st.activenode = st.root
   for i in 1:length(st.text)
       extendsuffixtree(st, i)
   end
   setsuffixindexbyDFS(st, st.nodes[st.root], 0)
   return st

end

function addsuffixlink(st, nodenum::Int)

   if st.needsuffixlink > 0
       st.nodes[st.needsuffixlink].suffixlink = nodenum
   end
   st.needsuffixlink = nodenum

end

activeedge(st) = st.text[st.activeedge]

function walkdown!(st, currnode::Int)

   len = edgelength(st, st.nodes[currnode])
   st.activelength < len && return false
   st.activeedge += len
   st.activelength -= len
   st.activenode = currnode
   return true

end

function extendsuffixtree(st, pos)

   st.position = pos
   st.needsuffixlink = 0
   st.remainder += 1
   while st.remainder > 0
       st.activelength == 0 && (st.activeedge = st.position)
       if !haskey(st.nodes[st.activenode].children, activeedge(st))
           nodenum = newnode(st, st.position, oo)
           st.nodes[st.activenode].children[activeedge(st)] = nodenum
           addsuffixlink(st, st.activenode)
       else
           next = st.nodes[st.activenode].children[activeedge(st)]
           walkdown!(st, next) && continue
           if st.text[st.nodes[next].start + st.activelength] == st.text[pos]
               addsuffixlink(st, st.activenode)
               st.activelength += 1
               break
           end
           splt = newnode(st, st.nodes[next].start, st.nodes[next].start + st.activelength)
           st.nodes[st.activenode].children[activeedge(st)] = splt
           nodenum = newnode(st, st.position, oo)
           st.nodes[splt].children[st.text[pos]] = nodenum
           st.nodes[next].start += st.activelength
           st.nodes[splt].children[st.text[st.nodes[next].start]] = next
           addsuffixlink(st, splt)
       end
       st.remainder -= 1
       if st.activenode == st.root && st.activelength > 0
           st.activelength -= 1
           st.activeedge = st.position - st.remainder + 1
       elseif st.activenode != st.root
           st.activenode = st.nodes[st.activenode].suffixlink
       end
   end

end

function setsuffixindexbyDFS(st, node, labelheight, verbose=false)

   verbose && node.start > 0 && print(st.text[node.start:min(node.ending, length(st.text))])
   isleaf = true
   for child in map(v -> st.nodes[v], collect(values(node.children)))
       verbose && isleaf && node.start > 0 && println(" [", node.suffixindex, "]")
       isleaf = false
       setsuffixindexbyDFS(st, child, labelheight + edgelength(st, child))
   end
   if isleaf
       idx = length(st.text) - labelheight
       node.suffixindex = idx
       verbose && println(" [$idx]")
   end

end

function dotraversal(st)

   maxheight, substringstartindices = 0, [0]
   function traversal(node::Node, labelheight)
       if node.suffixindex == -1
           for child in map(v -> st.nodes[v], collect(values(node.children)))
               traversal(child, labelheight + edgelength(st, child))
           end
       elseif maxheight < labelheight - edgelength(st, node)
           maxheight = labelheight - edgelength(st, node)
           substringstartindices = [node.suffixindex + 1]
       elseif maxheight == labelheight - edgelength(st, node)
           push!(substringstartindices, node.suffixindex + 1)
       end
   end
   traversal(st.nodes[st.root], 0)
   return maxheight, substringstartindices

end

function getlongestrepeatedsubstring(st::SuffixTree, label="", printresult=true)

   len, starts = dotraversal(st)
   substring = len == 0 ? "" :
       join(unique(map(x -> String(st.text[x:x+len-1]), starts)), " (or) ")
   if printresult
       print("  ", label == "" ? String(st.text) : label, ": ")
       println(len == 0 ? "No repeated substring." : substring)
   end
   return substring

end

function testsuffixtree()

   tests = [
       "CAAAABAAAABD\$",
       "GEEKSFORGEEKS\$",
       "AAAAAAAAAA\$",
       "ABCDEFG\$",
       "ABABABA\$",
       "ATCGATCGA\$",
       "banana\$",
       "abcpqrabpqpq\$",
       "pqrpqpqabab\$",
   ]
   println("Longest Repeated Substring in:\n")
   for test in tests
       st = SuffixTree(test)
       getlongestrepeatedsubstring(st)
   end
   println()
   sπ = ""
   setprecision(4000000) do
       sπ = string(BigFloat(π))[3:end]
   end
   for number in [1000, 10000, 100000, 1000000]
       text = sπ[1:number] * "\$"
       @time begin
           st = SuffixTree(text)
           getlongestrepeatedsubstring(st, "first $number d.p. of π")
       end
   end

end

testsuffixtree()

</lang>

Output:

Longest Repeated Substring in:

  CAAAABAAAABD: AAAAB
  GEEKSFORGEEKS: GEEKS
  AAAAAAAAAA: AAAAAAAAA
  ABCDEFG: No repeated substring.
  ABABABA: ABABA
  ATCGATCGA: ATCGA
  banana: ana
  abcpqrabpqpq: ab (or) pq
  pqrpqpqabab: ab (or) pq

  first 1000 d.p. of π: 60943 (or) 42019 (or) 82534 (or) 99999 (or) 93751 (or) 23846 (or) 33446
  0.003336 seconds (34.86 k allocations: 4.252 MiB)
  first 10000 d.p. of π: 7111369 (or) 8530614
  0.038749 seconds (351.60 k allocations: 42.460 MiB, 16.54% gc time)
  first 100000 d.p. of π: 134926158 (or) 041021944 (or) 201890888
  0.533892 seconds (3.52 M allocations: 425.035 MiB, 22.42% gc time)
  first 1000000 d.p. of π: 756130190263
  6.008879 seconds (35.25 M allocations: 4.152 GiB, 23.20% gc time)

Phix

Translation of: Go

-- demo/rosetta/Ukkonens_Suffix_Tree.exw
with javascript_semantics
integer maxChar = 'z'
 
sequence children = {},
         suffixLinks = {},
         starts = {},
         endIndices = {},
         suffixIndices = {},
         leaves = {}

function new_leaf(integer v=0)
    leaves = append(leaves,v)
    return length(leaves)
end function

string text
integer splitEndIdx,
        rootEndIdx,
        leafEndIdx = new_leaf(),
        root = NULL,
        lastNewNode,
        activeNode,
        activeEdge = -1,
        activeLength = 0,
        remainingSuffixCount = 0,
        size = -1
 
function newNode(integer start, finishIdx, bool bKids=false)
    children = append(children,iff(bKids?repeat(NULL,maxChar):0))
    suffixLinks = append(suffixLinks,root)
    starts = append(starts,start)
    endIndices = append(endIndices,finishIdx)
    suffixIndices = append(suffixIndices,-1)
    return length(children)
end function
 
function edgeLength(integer n)
    return iff(n==root?0:leaves[endIndices[n]] - starts[n] + 1)
end function
 
function walkDown(integer currNode)
    integer l = edgeLength(currNode)
    if activeLength >= l then
        activeEdge += l
        activeLength -= l
        activeNode = currNode
        return true
    end if
    return false
end function
 
procedure extendSuffixTree(integer pos)
    leaves[leafEndIdx] = pos
    remainingSuffixCount += 1
    lastNewNode = NULL
    while remainingSuffixCount > 0 do
        if activeLength == 0 then
            activeEdge = pos
        end if
        integer ta = text[activeEdge]
        bool ca0 = children[activeNode]=0
        integer next = iff(ca0?NULL:children[activeNode][ta])
        if next==null then
            if ca0 then
                children[activeNode] = repeat(NULL,maxChar)
            end if
            children[activeNode][ta] = newNode(pos, leafEndIdx)
            if lastNewNode!=NULL then
                suffixLinks[lastNewNode] = activeNode
                lastNewNode = NULL
            end if
        else
            if walkDown(next) then
                continue
            end if
            integer tp = text[pos]
            if text[starts[next]+activeLength] == tp then
                if lastNewNode!=NULL and activeNode!=root then
                    suffixLinks[lastNewNode] = activeNode
                    lastNewNode = NULL
                end if
                activeLength += 1
                exit
            end if
            integer temp = starts[next] + activeLength - 1
            splitEndIdx = new_leaf(temp)
            integer splitnode = newNode(starts[next], splitEndIdx, true)
            ta = text[activeEdge]
            children[activeNode][ta] = splitnode
            children[splitnode][tp] = newNode(pos, leafEndIdx)
            starts[next] += activeLength
            children[splitnode][text[starts[next]]] = next
            if lastNewNode!=NULL then
                suffixLinks[lastNewNode] = splitnode
            end if
            lastNewNode = splitnode
        end if
        remainingSuffixCount -= 1
        if activeNode==root and activeLength>0 then
            activeLength -= 1
            activeEdge = pos - remainingSuffixCount + 1
        elsif activeNode!=root then
            activeNode = suffixLinks[activeNode]
        end if
    end while
end procedure
 
procedure setSuffixIndexByDFS(integer n, labelHeight)
    if n!=NULL then
        if children[n]=0 then
            suffixIndices[n] = size - labelHeight
        else
            bool leaf = true
            for i=1 to maxChar do
                integer nci = children[n][i]
                if nci!=NULL then
                    leaf = false
                    setSuffixIndexByDFS(nci, labelHeight+edgeLength(nci))
                end if
            end for
            if leaf then ?9/0 end if -- (sanity check)
        end if
    end if
end procedure
 
procedure buildSuffixTree()
    size = length(text)
    rootEndIdx = new_leaf(-1)
    root = newNode(-1, rootEndIdx)
    activeNode = root
    for i=1 to size do
        extendSuffixTree(i)
    end for
    integer labelHeight = 0
    setSuffixIndexByDFS(root, labelHeight)
end procedure
 
procedure doTraversal(integer n, labelHeight, maxHeightIdx, substringStartIndex)
    if n!=NULL then
        integer nsi = suffixIndices[n], newHeight
        if nsi == -1 then
            for i=1 to maxChar do
                integer nci = children[n][i]
                if nci!=NULL then
                    newHeight = labelHeight+edgeLength(nci)
                    doTraversal(nci, newHeight, maxHeightIdx, substringStartIndex)
                end if
            end for
        elsif nsi > -1 then
            newHeight = labelHeight-edgeLength(n)
            if leaves[maxHeightIdx]<newHeight then
                leaves[maxHeightIdx] = newHeight
                leaves[substringStartIndex] = nsi
            end if
        end if
    end if
end procedure
 
function getLongestRepeatedSubstring()
    integer maxHeightIdx = new_leaf(),
            substringStartIndex = new_leaf()
    doTraversal(root, 0, maxHeightIdx, substringStartIndex)
    integer maxHeight = leaves[maxHeightIdx],
            start = leaves[substringStartIndex]
    string t = iff(maxHeight=0?"No repeated substring"
                              :text[start+1..start+maxHeight])
    return t
end function
 
constant tests = {"CAAAABAAAABD$",
                  "GEEKSFORGEEKS$",
                  "AAAAAAAAAA$",
                  "ABCDEFG$",
                  "ABABABA$",
                  "ATCGATCGA$",
                  "banana$",
                  "abcpqrabpqpq$",
                  "pqrpqpqabab$"}
printf(1,"Longest Repeated Substring in:\n")
for i=1 to length(tests) do
    text = tests[i]
    buildSuffixTree()
    printf(1,"  %s is: %s\n", {text,getLongestRepeatedSubstring()})
end for
printf(1,"\n")
 
include mpfr.e
string piStr
if platform()=JS then
    mpfr pi = mpfr_init(0,-1001) -- (set precision to 1,000 dp, plus the "3.")
    mpfr_const_pi(pi)
    piStr = mpfr_get_fixed(pi,1000) -- (all we can really manage under pwa/p2js)
else
    -- gmp crashes when I try 100,000 dp, so just get from file
    piStr = get_text(`E:\downloads\misc\arm\pi-10million.txt`)
end if
piStr = piStr[3..$] -- discard leading "3."
maxChar = '9'
for i=3 to iff(platform()=JS?3:6) do
    atom t0 = time()
    integer n = power(10,i)
    text = piStr[1..n] & "$"
    buildSuffixTree()
    string r = getLongestRepeatedSubstring(),
           e = elapsed(time()-t0)
    printf(1,"  first %,d d.p. of Pi is: %s (%s)\n", {n,r,e})
end for

Output:

Longest Repeated Substring in:
  CAAAABAAAABD$ is: AAAAB
  GEEKSFORGEEKS$ is: GEEKS
  AAAAAAAAAA$ is: AAAAAAAAA
  ABCDEFG$ is: No repeated substring
  ABABABA$ is: ABABA
  ATCGATCGA$ is: ATCGA
  banana$ is: ana
  abcpqrabpqpq$ is: ab
  pqrpqpqabab$ is: ab

  first 1,000 d.p. of Pi is: 23846 (0s)
  first 10,000 d.p. of Pi is: 7111369 (0.0s)
  first 100,000 d.p. of Pi is: 041021944 (0.3s)
  first 1,000,000 d.p. of Pi is: 756130190263 (3.2s)

Note that mpfr_const_pi() struggles to generate more than 1,000 digits of pi under pwa/p2js [and will continue to do so unless someone graciously donates a decent/fast Chudnovsky method in pure Phix or JavaScript...]

Wren

Translation of: Go

Library: Wren-big

Library: Wren-dynamic

Library: Wren-trait

As it would take a very long time to calculate the first 100,000 digits of Pi using the code from the Arithmetic-geometric_mean/Calculate_Pi#Wren task, I have instead saved the digits produced by the Go entry to a file (which only takes a few seconds) and then loaded that into the Wren script.

Having done that, the timings for extracting the longest repeated sequence of digits are reasonably quick and fairly linear as expected. <lang ecmascript>import "/big" for BigRat import "/dynamic" for Struct import "/trait" for ByRef import "io" for File

var maxChar = 128

var Node = Struct.create("Node", ["children", "suffixLink", "start", "pEnd", "suffixIndex"])

var text = "" var root = null var lastNewNode = null var activeNode = null var activeEdge = -1 var activeLength = 0 var remainingSuffixCount = 0 var pLeafEnd = ByRef.new(-1) var pRootEnd = null var pSplitEnd = null var size = -1

var newNode = Fn.new { |start, pEnd|

   var children = List.filled(maxChar, null)
   var suffixLink = root
   var suffixIndex = -1
   return Node.new(children, suffixLink, start, pEnd, suffixIndex)

}

var edgeLength = Fn.new { |n|

   if (n == root) return 0
   return n.pEnd.value - n.start + 1

}

var walkDown = Fn.new { |currNode|

   var el = edgeLength.call(currNode)
   if (activeLength >= el) {
       activeEdge = activeEdge + el
       activeLength = activeLength - el
       activeNode = currNode
       return true
   }
   return false

}

var extendSuffixTree = Fn.new { |pos|

   pLeafEnd.value = pos
   remainingSuffixCount = remainingSuffixCount + 1
   lastNewNode = null
   while (remainingSuffixCount > 0) {
       if (activeLength == 0) activeEdge = pos
       if (!activeNode.children[text[activeEdge].bytes[0]]) {
           activeNode.children[text[activeEdge].bytes[0]] = newNode.call(pos, pLeafEnd)
           if (lastNewNode) {
               lastNewNode.suffixLink = activeNode
               lastNewNode = null
           }
       } else {
           var next = activeNode.children[text[activeEdge].bytes[0]]
           if (walkDown.call(next)) continue
           if (text[next.start + activeLength] == text[pos]) {
               if (lastNewNode && activeNode != root) {
                   lastNewNode.suffixLink = activeNode
                   lastNewNode = null
               }
               activeLength = activeLength + 1
               break
           }
           var temp = next.start + activeLength - 1
           pSplitEnd = ByRef.new(temp)
           var split = newNode.call(next.start, pSplitEnd)
           activeNode.children[text[activeEdge].bytes[0]] = split
           split.children[text[pos].bytes[0]] = newNode.call(pos, pLeafEnd)
           next.start = next.start + activeLength
           split.children[text[next.start].bytes[0]] = next
           if (lastNewNode) lastNewNode.suffixLink = split
           lastNewNode = split
       }
       remainingSuffixCount = remainingSuffixCount - 1
       if (activeNode == root && activeLength > 0) {
           activeLength = activeLength - 1
           activeEdge = pos - remainingSuffixCount + 1
       } else if (activeNode != root) {
           activeNode = activeNode.suffixLink
       }
   }

}

var setSuffixIndexByDFS // recursive setSuffixIndexByDFS = Fn.new { |n, labelHeight|

   if (!n) return
   if (n.start != -1) {
       // Uncomment line below to print suffix tree
       // System.write(text[n.start..n.pEnd.value])
   }
   var leaf = 1
   for (i in 0...maxChar) {
       if (n.children[i]) {
           // Uncomment the 3 lines below to print suffix index
           // if (leaf == 1 && n.start != -1) {
           //    System.print(" [%(n.suffixIndex)]")
           // }
           leaf = 0
           setSuffixIndexByDFS.call(n.children[i], labelHeight + edgeLength.call(n.children[i]))
       }
   }
   if (leaf == 1) {
       n.suffixIndex = size - labelHeight
       // Uncomment line below to print suffix index
       // System.print(" [%(n.suffixIndex)]")
   }

}

var buildSuffixTree = Fn.new {

   size = text.count
   var temp = -1
   pRootEnd = ByRef.new(temp)
   root = newNode.call(-1, pRootEnd)
   activeNode = root
   for (i in 0...size) extendSuffixTree.call(i)
   var labelHeight = 0
   setSuffixIndexByDFS.call(root, labelHeight)

}

var doTraversal // recursive doTraversal = Fn.new { |n, labelHeight, pMaxHeight, pSubstringStartIndex|

   if (!n) return
   if (n.suffixIndex == -1) {
       for (i in 0...maxChar) {
           if (n.children[i]) {                        
               doTraversal.call(n.children[i], labelHeight + edgeLength.call(n.children[i]),
                   pMaxHeight, pSubstringStartIndex)
            }
       }
   } else if (n.suffixIndex > -1 && (pMaxHeight.value < labelHeight - edgeLength.call(n))) {
       pMaxHeight.value = labelHeight - edgeLength.call(n)
       pSubstringStartIndex.value = n.suffixIndex
   }

}

var getLongestRepeatedSubstring = Fn.new { |s|

   var maxHeight = 0
   var substringStartIndex = 0
   var pMaxHeight = ByRef.new(maxHeight)
   var pSubstringStartIndex = ByRef.new(substringStartIndex)
   doTraversal.call(root, 0, pMaxHeight, pSubstringStartIndex)
   maxHeight = pMaxHeight.value
   substringStartIndex = pSubstringStartIndex.value
   // Uncomment line below to print maxHeight and substringStartIndex
   // System.print("maxHeight %(maxHeight), substringStartIndex %(substringStartIndex)")
   if (s == "") {
       System.write("  %(text) is: ")
   } else {
       System.write("  %(s) is: ")
   }
   var k = 0
   while (k < maxHeight) {
       System.write(text[k + substringStartIndex])
       k = k + 1
   }
   if (k == 0) {
       System.write("No repeated substring")
   }
   System.print()

}

var tests = [

   "GEEKSFORGEEKS$",
   "AAAAAAAAAA$",
   "ABCDEFG$",
   "ABABABA$",
   "ATCGATCGA$",
   "banana$",
   "abcpqrabpqpq$",
   "pqrpqpqabab$",

] System.print("Longest Repeated Substring in:\n") for (test in tests) {

   text = test
   buildSuffixTree.call()
   getLongestRepeatedSubstring.call("")

} System.print()

// load pi to 100,182 digits var piStr = File.read("pi_100000.txt") piStr = piStr[2..-1] // remove initial 3. var numbers = [1e3, 1e4, 1e5] maxChar = 58 for (number in numbers) {

   var start = System.clock
   text = piStr[0...number] + "$"
   buildSuffixTree.call()
   getLongestRepeatedSubstring.call("first %(number) d.p. of Pi")
   var elapsed = (System.clock - start) * 1000
   System.print("  (this took %(elapsed) ms)\n")

}</lang>

Output:

Longest Repeated Substring in:

  GEEKSFORGEEKS$ is: GEEKS
  AAAAAAAAAA$ is: AAAAAAAAA
  ABCDEFG$ is: No repeated substring
  ABABABA$ is: ABABA
  ATCGATCGA$ is: ATCGA
  banana$ is: ana
  abcpqrabpqpq$ is: ab
  pqrpqpqabab$ is: ab

  first 1000 d.p. of Pi is: 23846
  (this took 9.987 ms)

  first 10000 d.p. of Pi is: 7111369
  (this took 89.12 ms)

  first 100000 d.p. of Pi is: 041021944
  (this took 1031.072 ms)

@@ Line 1: / Line 1: @@
 [[Category:String algorithms]]
-{{draft task}}
+{{task}}
 Suffix Trees are very useful in numerous string processing and computational biology problems.