Ukkonen’s suffix tree construction: Difference between revisions
m (→{{header|Phix}}: added syntax colouring the hard way, made p2js compatible, read pi from file) |
No edit summary |
||
Line 1: | Line 1: | ||
[[Category:String algorithms]] |
[[Category:String algorithms]] |
||
{{ |
{{task}} |
||
Suffix Trees are very useful in numerous string processing and computational biology problems. |
Suffix Trees are very useful in numerous string processing and computational biology problems. |
||
Revision as of 16:00, 10 July 2021
You are encouraged to solve this task according to the task description, using any language you may know.
Suffix Trees are very useful in numerous string processing and computational biology problems.
The task is to create a function which implements Ukkonen’s algorithm to create a useful Suffix Tree as described:
Part 1 Part 2 Part 3 Part 4 Part 5 Part 6
Using Arithmetic-geometric mean/Calculate Pi generate the first 1000, 10000, and 100000 decimal places of pi. Using your implementation with an alphabet of 0 through 9 (plus $ say to make the tree explicit) find the longest repeated string in each list. Time your results and demonstrate that your implementation is linear (i.e. that 10000 takes approx. 10 times as long as 1000). You may vary the size of the lists of decimal places of pi to give reasonable answers.
Go
This is a translation of the C code here which is an extended form of the code in Part 6 of the task description for finding the longest repeated substring of a given string. In the interests of brevity, the extensive comments in the C version have been largely omitted. The C code doesn't compile as it stands but I have added a fix in the Talk Page.
For convenience I have included the code from the Arithmetic-geometric_mean/Calculate_Pi#Go task in the same package.
It takes around 25 seconds on my machine (Celeron @1.6GHz) to calculate the first 100,000 (or so) decimal places of Pi. Having done that, the timings for extracting the longest repeated sequence of digits are quick and fairly linear as expected.
As the task doesn't say whether overlapping sequences are to be counted, I've assumed that they are as this is what the algorithm naturally produces. <lang go>package main
import (
"fmt" "math/big" "time"
)
var maxChar = 128
type Node struct {
children []*Node suffixLink *Node start int end *int suffixIndex int
}
var (
text string root *Node lastNewNode *Node activeNode *Node activeEdge = -1 activeLength = 0 remainingSuffixCount = 0 leafEnd = -1 rootEnd *int splitEnd *int size = -1
)
func newNode(start int, end *int) *Node {
node := new(Node) node.children = make([]*Node, maxChar) node.suffixLink = root node.start = start node.end = end node.suffixIndex = -1 return node
}
func edgeLength(n *Node) int {
if n == root { return 0 } return *(n.end) - n.start + 1
}
func walkDown(currNode *Node) bool {
if activeLength >= edgeLength(currNode) { activeEdge += edgeLength(currNode) activeLength -= edgeLength(currNode) activeNode = currNode return true } return false
}
func extendSuffixTree(pos int) {
leafEnd = pos remainingSuffixCount++ lastNewNode = nil for remainingSuffixCount > 0 { if activeLength == 0 { activeEdge = pos } if activeNode.children[text[activeEdge]] == nil { activeNode.children[text[activeEdge]] = newNode(pos, &leafEnd) if lastNewNode != nil { lastNewNode.suffixLink = activeNode lastNewNode = nil } } else { next := activeNode.children[text[activeEdge]] if walkDown(next) { continue } if text[next.start+activeLength] == text[pos] { if lastNewNode != nil && activeNode != root { lastNewNode.suffixLink = activeNode lastNewNode = nil } activeLength++ break } temp := next.start + activeLength - 1 splitEnd = &temp split := newNode(next.start, splitEnd) activeNode.children[text[activeEdge]] = split split.children[text[pos]] = newNode(pos, &leafEnd) next.start += activeLength split.children[text[next.start]] = next if lastNewNode != nil { lastNewNode.suffixLink = split } lastNewNode = split } remainingSuffixCount-- if activeNode == root && activeLength > 0 { activeLength-- activeEdge = pos - remainingSuffixCount + 1 } else if activeNode != root { activeNode = activeNode.suffixLink } }
}
func setSuffixIndexByDFS(n *Node, labelHeight int) {
if n == nil { return } if n.start != -1 { // Uncomment line below to print suffix tree // fmt.Print(text[n.start: *(n.end) +1]) } leaf := 1 for i := 0; i < maxChar; i++ { if n.children[i] != nil { // Uncomment the 3 lines below to print suffix index //if leaf == 1 && n.start != -1 { // fmt.Printf(" [%d]\n", n.suffixIndex) //} leaf = 0 setSuffixIndexByDFS(n.children[i], labelHeight+edgeLength(n.children[i])) } } if leaf == 1 { n.suffixIndex = size - labelHeight // Uncomment line below to print suffix index //fmt.Printf(" [%d]\n", n.suffixIndex) }
}
func buildSuffixTree() {
size = len(text) temp := -1 rootEnd = &temp root = newNode(-1, rootEnd) activeNode = root for i := 0; i < size; i++ { extendSuffixTree(i) } labelHeight := 0 setSuffixIndexByDFS(root, labelHeight)
}
func doTraversal(n *Node, labelHeight int, maxHeight, substringStartIndex *int) {
if n == nil { return } if n.suffixIndex == -1 { for i := 0; i < maxChar; i++ { if n.children[i] != nil { doTraversal(n.children[i], labelHeight+edgeLength(n.children[i]), maxHeight, substringStartIndex) } } } else if n.suffixIndex > -1 && (*maxHeight < labelHeight-edgeLength(n)) { *maxHeight = labelHeight - edgeLength(n) *substringStartIndex = n.suffixIndex }
}
func getLongestRepeatedSubstring(s string) {
maxHeight := 0 substringStartIndex := 0 doTraversal(root, 0, &maxHeight, &substringStartIndex) // Uncomment line below to print maxHeight and substringStartIndex // fmt.Printf("maxHeight %d, substringStartIndex %d\n", maxHeight, substringStartIndex) if s == "" { fmt.Printf(" %s is: ", text) } else { fmt.Printf(" %s is: ", s) } k := 0 for ; k < maxHeight; k++ { fmt.Printf("%c", text[k+substringStartIndex]) } if k == 0 { fmt.Print("No repeated substring") } fmt.Println()
}
func calculatePi() *big.Float {
one := big.NewFloat(1) two := big.NewFloat(2) four := big.NewFloat(4) prec := uint(325 * 1024) // enough to calculate Pi to 100,182 decimal digits
a := big.NewFloat(1).SetPrec(prec) g := new(big.Float).SetPrec(prec)
// temporary variables t := new(big.Float).SetPrec(prec) u := new(big.Float).SetPrec(prec)
g.Quo(a, t.Sqrt(two)) sum := new(big.Float) pow := big.NewFloat(2)
for a.Cmp(g) != 0 { t.Add(a, g) t.Quo(t, two) g.Sqrt(u.Mul(a, g)) a.Set(t) pow.Mul(pow, two) t.Sub(t.Mul(a, a), u.Mul(g, g)) sum.Add(sum, t.Mul(t, pow)) }
t.Mul(a, a) t.Mul(t, four) pi := t.Quo(t, u.Sub(one, sum)) return pi
}
func main() {
tests := []string{ "GEEKSFORGEEKS$", "AAAAAAAAAA$", "ABCDEFG$", "ABABABA$", "ATCGATCGA$", "banana$", "abcpqrabpqpq$", "pqrpqpqabab$", } fmt.Println("Longest Repeated Substring in:\n") for _, test := range tests { text = test buildSuffixTree() getLongestRepeatedSubstring("") } fmt.Println()
pi := calculatePi() piStr := fmt.Sprintf("%v", pi) piStr = piStr[2:] // remove initial 3. numbers := []int{1e3, 1e4, 1e5} maxChar = 58 for _, number := range numbers { start := time.Now() text = piStr[0:number] + "$" buildSuffixTree() getLongestRepeatedSubstring(fmt.Sprintf("first %d d.p. of Pi", number)) elapsed := time.Now().Sub(start) fmt.Printf(" (this took %s)\n\n", elapsed) }
}</lang>
- Output:
Sample run:
Longest Repeated Substring in: GEEKSFORGEEKS$ is: GEEKS AAAAAAAAAA$ is: AAAAAAAAA ABCDEFG$ is: No repeated substring ABABABA$ is: ABABA ATCGATCGA$ is: ATCGA banana$ is: ana abcpqrabpqpq$ is: ab pqrpqpqabab$ is: ab first 1000 d.p. of Pi is: 23846 (this took 7.728858ms) first 10000 d.p. of Pi is: 7111369 (this took 57.524478ms) first 100000 d.p. of Pi is: 041021944 (this took 599.770281ms)
Julia
Uses array indices instead of the Go version's node pointers. <lang julia>const oo = typemax(Int)
"""The suffix-tree's node.""" mutable struct Node
children::Dict{Char, Int} start::Int ending::Int suffixlink::Int suffixindex::Int
end
Node() = Node(Dict(), 0, oo, 0, -1) Node(start, ending) = Node(Dict(), start, ending, 0, -1)
""" Ukkonen Suffix-Tree """ mutable struct SuffixTree
nodes::Vector{Node} text::Vector{Char} root::Int position::Int currentnode::Int needsuffixlink::Int remainder::Int activenode::Int activelength::Int activeedge::Int
end
edgelength(st, n::Node) = min(n.ending, st.position + 1) - n.start
function newnode(st, start, ending)
st.currentnode += 1 st.nodes[st.currentnode] = Node(start, ending) return st.currentnode
end
function SuffixTree(str::String)
nodes = [Node() for _ in 1:length(str) * 2] st = SuffixTree(nodes, [c for c in str], 1, 0, 0, 0, 0, 1, 1, 1) st.root = newnode(st, 0, 0) st.activenode = st.root for i in 1:length(st.text) extendsuffixtree(st, i) end setsuffixindexbyDFS(st, st.nodes[st.root], 0) return st
end
function addsuffixlink(st, nodenum::Int)
if st.needsuffixlink > 0 st.nodes[st.needsuffixlink].suffixlink = nodenum end st.needsuffixlink = nodenum
end
activeedge(st) = st.text[st.activeedge]
function walkdown!(st, currnode::Int)
len = edgelength(st, st.nodes[currnode]) st.activelength < len && return false st.activeedge += len st.activelength -= len st.activenode = currnode return true
end
function extendsuffixtree(st, pos)
st.position = pos st.needsuffixlink = 0 st.remainder += 1 while st.remainder > 0 st.activelength == 0 && (st.activeedge = st.position) if !haskey(st.nodes[st.activenode].children, activeedge(st)) nodenum = newnode(st, st.position, oo) st.nodes[st.activenode].children[activeedge(st)] = nodenum addsuffixlink(st, st.activenode) else next = st.nodes[st.activenode].children[activeedge(st)] walkdown!(st, next) && continue if st.text[st.nodes[next].start + st.activelength] == st.text[pos] addsuffixlink(st, st.activenode) st.activelength += 1 break end splt = newnode(st, st.nodes[next].start, st.nodes[next].start + st.activelength) st.nodes[st.activenode].children[activeedge(st)] = splt nodenum = newnode(st, st.position, oo) st.nodes[splt].children[st.text[pos]] = nodenum st.nodes[next].start += st.activelength st.nodes[splt].children[st.text[st.nodes[next].start]] = next addsuffixlink(st, splt) end st.remainder -= 1 if st.activenode == st.root && st.activelength > 0 st.activelength -= 1 st.activeedge = st.position - st.remainder + 1 elseif st.activenode != st.root st.activenode = st.nodes[st.activenode].suffixlink end end
end
function setsuffixindexbyDFS(st, node, labelheight, verbose=false)
verbose && node.start > 0 && print(st.text[node.start:min(node.ending, length(st.text))]) isleaf = true for child in map(v -> st.nodes[v], collect(values(node.children))) verbose && isleaf && node.start > 0 && println(" [", node.suffixindex, "]") isleaf = false setsuffixindexbyDFS(st, child, labelheight + edgelength(st, child)) end if isleaf idx = length(st.text) - labelheight node.suffixindex = idx verbose && println(" [$idx]") end
end
function dotraversal(st)
maxheight, substringstartindices = 0, [0] function traversal(node::Node, labelheight) if node.suffixindex == -1 for child in map(v -> st.nodes[v], collect(values(node.children))) traversal(child, labelheight + edgelength(st, child)) end elseif maxheight < labelheight - edgelength(st, node) maxheight = labelheight - edgelength(st, node) substringstartindices = [node.suffixindex + 1] elseif maxheight == labelheight - edgelength(st, node) push!(substringstartindices, node.suffixindex + 1) end end traversal(st.nodes[st.root], 0) return maxheight, substringstartindices
end
function getlongestrepeatedsubstring(st::SuffixTree, label="", printresult=true)
len, starts = dotraversal(st) substring = len == 0 ? "" : join(unique(map(x -> String(st.text[x:x+len-1]), starts)), " (or) ") if printresult print(" ", label == "" ? String(st.text) : label, ": ") println(len == 0 ? "No repeated substring." : substring) end return substring
end
function testsuffixtree()
tests = [ "CAAAABAAAABD\$", "GEEKSFORGEEKS\$", "AAAAAAAAAA\$", "ABCDEFG\$", "ABABABA\$", "ATCGATCGA\$", "banana\$", "abcpqrabpqpq\$", "pqrpqpqabab\$", ] println("Longest Repeated Substring in:\n") for test in tests st = SuffixTree(test) getlongestrepeatedsubstring(st) end println() sπ = "" setprecision(4000000) do sπ = string(BigFloat(π))[3:end] end for number in [1000, 10000, 100000, 1000000] text = sπ[1:number] * "\$" @time begin st = SuffixTree(text) getlongestrepeatedsubstring(st, "first $number d.p. of π") end end
end
testsuffixtree()
</lang>
- Output:
Longest Repeated Substring in: CAAAABAAAABD: AAAAB GEEKSFORGEEKS: GEEKS AAAAAAAAAA: AAAAAAAAA ABCDEFG: No repeated substring. ABABABA: ABABA ATCGATCGA: ATCGA banana: ana abcpqrabpqpq: ab (or) pq pqrpqpqabab: ab (or) pq first 1000 d.p. of π: 60943 (or) 42019 (or) 82534 (or) 99999 (or) 93751 (or) 23846 (or) 33446 0.003336 seconds (34.86 k allocations: 4.252 MiB) first 10000 d.p. of π: 7111369 (or) 8530614 0.038749 seconds (351.60 k allocations: 42.460 MiB, 16.54% gc time) first 100000 d.p. of π: 134926158 (or) 041021944 (or) 201890888 0.533892 seconds (3.52 M allocations: 425.035 MiB, 22.42% gc time) first 1000000 d.p. of π: 756130190263 6.008879 seconds (35.25 M allocations: 4.152 GiB, 23.20% gc time)
Phix
-- demo/rosetta/Ukkonens_Suffix_Tree.exw with javascript_semantics integer maxChar = 'z' sequence children = {}, suffixLinks = {}, starts = {}, endIndices = {}, suffixIndices = {}, leaves = {} function new_leaf(integer v=0) leaves = append(leaves,v) return length(leaves) end function string text integer splitEndIdx, rootEndIdx, leafEndIdx = new_leaf(), root = NULL, lastNewNode, activeNode, activeEdge = -1, activeLength = 0, remainingSuffixCount = 0, size = -1 function newNode(integer start, finishIdx, bool bKids=false) children = append(children,iff(bKids?repeat(NULL,maxChar):0)) suffixLinks = append(suffixLinks,root) starts = append(starts,start) endIndices = append(endIndices,finishIdx) suffixIndices = append(suffixIndices,-1) return length(children) end function function edgeLength(integer n) return iff(n==root?0:leaves[endIndices[n]] - starts[n] + 1) end function function walkDown(integer currNode) integer l = edgeLength(currNode) if activeLength >= l then activeEdge += l activeLength -= l activeNode = currNode return true end if return false end function procedure extendSuffixTree(integer pos) leaves[leafEndIdx] = pos remainingSuffixCount += 1 lastNewNode = NULL while remainingSuffixCount > 0 do if activeLength == 0 then activeEdge = pos end if integer ta = text[activeEdge] bool ca0 = children[activeNode]=0 integer next = iff(ca0?NULL:children[activeNode][ta]) if next==null then if ca0 then children[activeNode] = repeat(NULL,maxChar) end if children[activeNode][ta] = newNode(pos, leafEndIdx) if lastNewNode!=NULL then suffixLinks[lastNewNode] = activeNode lastNewNode = NULL end if else if walkDown(next) then continue end if integer tp = text[pos] if text[starts[next]+activeLength] == tp then if lastNewNode!=NULL and activeNode!=root then suffixLinks[lastNewNode] = activeNode lastNewNode = NULL end if activeLength += 1 exit end if integer temp = starts[next] + activeLength - 1 splitEndIdx = new_leaf(temp) integer splitnode = newNode(starts[next], splitEndIdx, true) ta = text[activeEdge] children[activeNode][ta] = splitnode children[splitnode][tp] = newNode(pos, leafEndIdx) starts[next] += activeLength children[splitnode][text[starts[next]]] = next if lastNewNode!=NULL then suffixLinks[lastNewNode] = splitnode end if lastNewNode = splitnode end if remainingSuffixCount -= 1 if activeNode==root and activeLength>0 then activeLength -= 1 activeEdge = pos - remainingSuffixCount + 1 elsif activeNode!=root then activeNode = suffixLinks[activeNode] end if end while end procedure procedure setSuffixIndexByDFS(integer n, labelHeight) if n!=NULL then if children[n]=0 then suffixIndices[n] = size - labelHeight else bool leaf = true for i=1 to maxChar do integer nci = children[n][i] if nci!=NULL then leaf = false setSuffixIndexByDFS(nci, labelHeight+edgeLength(nci)) end if end for if leaf then ?9/0 end if -- (sanity check) end if end if end procedure procedure buildSuffixTree() size = length(text) rootEndIdx = new_leaf(-1) root = newNode(-1, rootEndIdx) activeNode = root for i=1 to size do extendSuffixTree(i) end for integer labelHeight = 0 setSuffixIndexByDFS(root, labelHeight) end procedure procedure doTraversal(integer n, labelHeight, maxHeightIdx, substringStartIndex) if n!=NULL then integer nsi = suffixIndices[n], newHeight if nsi == -1 then for i=1 to maxChar do integer nci = children[n][i] if nci!=NULL then newHeight = labelHeight+edgeLength(nci) doTraversal(nci, newHeight, maxHeightIdx, substringStartIndex) end if end for elsif nsi > -1 then newHeight = labelHeight-edgeLength(n) if leaves[maxHeightIdx]<newHeight then leaves[maxHeightIdx] = newHeight leaves[substringStartIndex] = nsi end if end if end if end procedure function getLongestRepeatedSubstring() integer maxHeightIdx = new_leaf(), substringStartIndex = new_leaf() doTraversal(root, 0, maxHeightIdx, substringStartIndex) integer maxHeight = leaves[maxHeightIdx], start = leaves[substringStartIndex] string t = iff(maxHeight=0?"No repeated substring" :text[start+1..start+maxHeight]) return t end function constant tests = {"CAAAABAAAABD$", "GEEKSFORGEEKS$", "AAAAAAAAAA$", "ABCDEFG$", "ABABABA$", "ATCGATCGA$", "banana$", "abcpqrabpqpq$", "pqrpqpqabab$"} printf(1,"Longest Repeated Substring in:\n") for i=1 to length(tests) do text = tests[i] buildSuffixTree() printf(1," %s is: %s\n", {text,getLongestRepeatedSubstring()}) end for printf(1,"\n") include mpfr.e string piStr if platform()=JS then mpfr pi = mpfr_init(0,-1001) -- (set precision to 1,000 dp, plus the "3.") mpfr_const_pi(pi) piStr = mpfr_get_fixed(pi,1000) -- (all we can really manage under pwa/p2js) else -- gmp crashes when I try 100,000 dp, so just get from file piStr = get_text(`E:\downloads\misc\arm\pi-10million.txt`) end if piStr = piStr[3..$] -- discard leading "3." maxChar = '9' for i=3 to iff(platform()=JS?3:6) do atom t0 = time() integer n = power(10,i) text = piStr[1..n] & "$" buildSuffixTree() string r = getLongestRepeatedSubstring(), e = elapsed(time()-t0) printf(1," first %,d d.p. of Pi is: %s (%s)\n", {n,r,e}) end for
- Output:
Longest Repeated Substring in: CAAAABAAAABD$ is: AAAAB GEEKSFORGEEKS$ is: GEEKS AAAAAAAAAA$ is: AAAAAAAAA ABCDEFG$ is: No repeated substring ABABABA$ is: ABABA ATCGATCGA$ is: ATCGA banana$ is: ana abcpqrabpqpq$ is: ab pqrpqpqabab$ is: ab first 1,000 d.p. of Pi is: 23846 (0s) first 10,000 d.p. of Pi is: 7111369 (0.0s) first 100,000 d.p. of Pi is: 041021944 (0.3s) first 1,000,000 d.p. of Pi is: 756130190263 (3.2s)
Note that mpfr_const_pi() struggles to generate more than 1,000 digits of pi under pwa/p2js [and will continue to do so unless someone graciously donates a decent/fast Chudnovsky method in pure Phix or JavaScript...]
Wren
As it would take a very long time to calculate the first 100,000 digits of Pi using the code from the Arithmetic-geometric_mean/Calculate_Pi#Wren task, I have instead saved the digits produced by the Go entry to a file (which only takes a few seconds) and then loaded that into the Wren script.
Having done that, the timings for extracting the longest repeated sequence of digits are reasonably quick and fairly linear as expected. <lang ecmascript>import "/big" for BigRat import "/dynamic" for Struct import "/trait" for ByRef import "io" for File
var maxChar = 128
var Node = Struct.create("Node", ["children", "suffixLink", "start", "pEnd", "suffixIndex"])
var text = "" var root = null var lastNewNode = null var activeNode = null var activeEdge = -1 var activeLength = 0 var remainingSuffixCount = 0 var pLeafEnd = ByRef.new(-1) var pRootEnd = null var pSplitEnd = null var size = -1
var newNode = Fn.new { |start, pEnd|
var children = List.filled(maxChar, null) var suffixLink = root var suffixIndex = -1 return Node.new(children, suffixLink, start, pEnd, suffixIndex)
}
var edgeLength = Fn.new { |n|
if (n == root) return 0 return n.pEnd.value - n.start + 1
}
var walkDown = Fn.new { |currNode|
var el = edgeLength.call(currNode) if (activeLength >= el) { activeEdge = activeEdge + el activeLength = activeLength - el activeNode = currNode return true } return false
}
var extendSuffixTree = Fn.new { |pos|
pLeafEnd.value = pos remainingSuffixCount = remainingSuffixCount + 1 lastNewNode = null while (remainingSuffixCount > 0) { if (activeLength == 0) activeEdge = pos if (!activeNode.children[text[activeEdge].bytes[0]]) { activeNode.children[text[activeEdge].bytes[0]] = newNode.call(pos, pLeafEnd) if (lastNewNode) { lastNewNode.suffixLink = activeNode lastNewNode = null } } else { var next = activeNode.children[text[activeEdge].bytes[0]] if (walkDown.call(next)) continue if (text[next.start + activeLength] == text[pos]) { if (lastNewNode && activeNode != root) { lastNewNode.suffixLink = activeNode lastNewNode = null } activeLength = activeLength + 1 break } var temp = next.start + activeLength - 1 pSplitEnd = ByRef.new(temp) var split = newNode.call(next.start, pSplitEnd) activeNode.children[text[activeEdge].bytes[0]] = split split.children[text[pos].bytes[0]] = newNode.call(pos, pLeafEnd) next.start = next.start + activeLength split.children[text[next.start].bytes[0]] = next if (lastNewNode) lastNewNode.suffixLink = split lastNewNode = split } remainingSuffixCount = remainingSuffixCount - 1 if (activeNode == root && activeLength > 0) { activeLength = activeLength - 1 activeEdge = pos - remainingSuffixCount + 1 } else if (activeNode != root) { activeNode = activeNode.suffixLink } }
}
var setSuffixIndexByDFS // recursive setSuffixIndexByDFS = Fn.new { |n, labelHeight|
if (!n) return if (n.start != -1) { // Uncomment line below to print suffix tree // System.write(text[n.start..n.pEnd.value]) } var leaf = 1 for (i in 0...maxChar) { if (n.children[i]) { // Uncomment the 3 lines below to print suffix index // if (leaf == 1 && n.start != -1) { // System.print(" [%(n.suffixIndex)]") // } leaf = 0 setSuffixIndexByDFS.call(n.children[i], labelHeight + edgeLength.call(n.children[i])) } } if (leaf == 1) { n.suffixIndex = size - labelHeight // Uncomment line below to print suffix index // System.print(" [%(n.suffixIndex)]") }
}
var buildSuffixTree = Fn.new {
size = text.count var temp = -1 pRootEnd = ByRef.new(temp) root = newNode.call(-1, pRootEnd) activeNode = root for (i in 0...size) extendSuffixTree.call(i) var labelHeight = 0 setSuffixIndexByDFS.call(root, labelHeight)
}
var doTraversal // recursive doTraversal = Fn.new { |n, labelHeight, pMaxHeight, pSubstringStartIndex|
if (!n) return if (n.suffixIndex == -1) { for (i in 0...maxChar) { if (n.children[i]) { doTraversal.call(n.children[i], labelHeight + edgeLength.call(n.children[i]), pMaxHeight, pSubstringStartIndex) } } } else if (n.suffixIndex > -1 && (pMaxHeight.value < labelHeight - edgeLength.call(n))) { pMaxHeight.value = labelHeight - edgeLength.call(n) pSubstringStartIndex.value = n.suffixIndex }
}
var getLongestRepeatedSubstring = Fn.new { |s|
var maxHeight = 0 var substringStartIndex = 0 var pMaxHeight = ByRef.new(maxHeight) var pSubstringStartIndex = ByRef.new(substringStartIndex) doTraversal.call(root, 0, pMaxHeight, pSubstringStartIndex) maxHeight = pMaxHeight.value substringStartIndex = pSubstringStartIndex.value // Uncomment line below to print maxHeight and substringStartIndex // System.print("maxHeight %(maxHeight), substringStartIndex %(substringStartIndex)") if (s == "") { System.write(" %(text) is: ") } else { System.write(" %(s) is: ") } var k = 0 while (k < maxHeight) { System.write(text[k + substringStartIndex]) k = k + 1 } if (k == 0) { System.write("No repeated substring") } System.print()
}
var tests = [
"GEEKSFORGEEKS$", "AAAAAAAAAA$", "ABCDEFG$", "ABABABA$", "ATCGATCGA$", "banana$", "abcpqrabpqpq$", "pqrpqpqabab$",
] System.print("Longest Repeated Substring in:\n") for (test in tests) {
text = test buildSuffixTree.call() getLongestRepeatedSubstring.call("")
} System.print()
// load pi to 100,182 digits var piStr = File.read("pi_100000.txt") piStr = piStr[2..-1] // remove initial 3. var numbers = [1e3, 1e4, 1e5] maxChar = 58 for (number in numbers) {
var start = System.clock text = piStr[0...number] + "$" buildSuffixTree.call() getLongestRepeatedSubstring.call("first %(number) d.p. of Pi") var elapsed = (System.clock - start) * 1000 System.print(" (this took %(elapsed) ms)\n")
}</lang>
- Output:
Longest Repeated Substring in: GEEKSFORGEEKS$ is: GEEKS AAAAAAAAAA$ is: AAAAAAAAA ABCDEFG$ is: No repeated substring ABABABA$ is: ABABA ATCGATCGA$ is: ATCGA banana$ is: ana abcpqrabpqpq$ is: ab pqrpqpqabab$ is: ab first 1000 d.p. of Pi is: 23846 (this took 9.987 ms) first 10000 d.p. of Pi is: 7111369 (this took 89.12 ms) first 100000 d.p. of Pi is: 041021944 (this took 1031.072 ms)