Bioinformatics/Global alignment: Difference between revisions

Line 178:
_________________
Total length 300
</pre>
 
=={{header|Phix}}==
<lang Phix>requires("0.8.4") -- (or the trivial 7/2/21 bugfix in punique.e)
 
procedure printcounts(sequence ss)
-- Given DNA sequence(s), report the sequence, length and base counts
for i=1 to length(ss) do
string dna = ss[i]
sequence acgt = repeat(0,6)
for j=1 to length(dna) do
acgt[find(dna[j],"ACGT")+1] += 1
end for
acgt[$] = sum(acgt)
string ncf = "Nucleotide counts for :"
printf(1,"%s%s\n",{ncf,join(split_by(dna,50),"\n"&repeat(' ',length(ncf)))})
printf(1,"\nBase counts: Other:%d, A:%d, C:%d, G:%d, T:%d, total:%d\n\n",acgt)
end for
end procedure
function deduplicate(sequence ss)
-- Remove duplicates and strings contained within a larger string from vector of strings
sequence filtered = {}
for i=1 to length(ss) do
string si = ss[i]
bool found = false
for j=1 to length(ss) do
if i!=j and match(si,ss[j]) then
found = true
exit
end if
end for
if not found then
filtered = append(filtered, si)
end if
end for
return filtered
end function
procedure shortest_common_superstring(sequence ss)
-- Returns shortest common superstring of a vector of strings
ss = deduplicate(unique(ss,"STABLE"))
sequence shortestsuper = {join(ss,"")}
integer shortest = length(shortestsuper[1])
for p=1 to factorial(length(ss)) do
sequence perm = permute(p,ss)
string sup = perm[1]
for i=2 to length(perm) do
string pi = perm[i]
for j=-min(length(pi),length(sup)) to 0 do
string overlap = sup[j..$]
if overlap = pi[1..length(overlap)] then
sup &= pi[length(overlap)+1..$]
pi = ""
exit
end if
end for
if length(pi) then ?9/0 end if -- (sanity chk)
end for
if length(sup) < shortest then
shortest = length(sup)
shortestsuper = {sup}
elsif length(sup) = shortest
and not find(sup,shortestsuper) then
shortestsuper = append(shortestsuper,sup)
end if
end for
printcounts(shortestsuper)
end procedure
constant tests = {
{"TA", "AAG", "TA", "GAA", "TA"},
{"CATTAGGG", "ATTAG", "GGG", "TA"},
{"AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA"},
{"ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT",
"GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT",
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT",
"GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC",
"CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT",
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC",
"GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT",
"TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
"TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA"}
}
papply(tests, shortest_common_superstring)</lang>
{{out}}
<pre>
Nucleotide counts for :GAAGTA
 
Base counts: Other:0, A:3, C:0, G:2, T:1, total:6
 
Nucleotide counts for :TAGAAG
 
Base counts: Other:0, A:3, C:0, G:2, T:1, total:6
 
Nucleotide counts for :TAAGAA
 
Base counts: Other:0, A:4, C:0, G:1, T:1, total:6
 
Nucleotide counts for :CATTAGGG
 
Base counts: Other:0, A:2, C:1, G:3, T:2, total:8
 
Nucleotide counts for :AAGAUGGAGCGCAUCGCAAUAAGGA
 
Base counts: Other:3, A:10, C:4, G:8, T:0, total:25
 
Nucleotide counts for :CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG
CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG
AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT
GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT
CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG
TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
 
Base counts: Other:0, A:74, C:57, G:75, T:94, total:300
</pre>
7,794

edits