Bioinformatics/Global alignment: Difference between revisions

A Python implementation
m (→‎{{header|Phix}}: fixed a couple of syntax glitches, added personal tag)
(A Python implementation)
Line 736:
TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
Base counts: Other:0, A:74, C:57, G:75, T:94, total:300
</pre>
 
=={{header|Python}}==
{{trans|Go}}
 
<lang python>import os
 
from collections import Counter
from functools import reduce
from itertools import permutations
 
BASES = ("A", "C", "G", "T")
 
 
def deduplicate(sequences):
"""Return the set of sequences with those that are a substring
of others removed too."""
sequences = set(sequences)
duplicates = set()
 
for s, t in permutations(sequences, 2):
if s != t and s in t:
duplicates.add(s)
 
return sequences - duplicates
 
 
def smash(s, t):
"""Return `s` concatenated with `t`. The longest suffix of `s`
that matches a prefix of `t` will be removed."""
for i in range(len(s)):
if t.startswith(s[i:]):
return s[:i] + t
return s + t
 
 
def shortest_superstring(sequences):
"""Return the shortest superstring covering all sequences. If
there are multiple shortest superstrings, an arbitrary
superstring is returned."""
sequences = deduplicate(sequences)
shortest = "".join(sequences)
 
for perm in permutations(sequences):
superstring = reduce(smash, perm)
if len(superstring) < len(shortest):
shortest = superstring
 
return shortest
 
 
def shortest_superstrings(sequences):
"""Return a list of all shortest superstrings that cover
`sequences`."""
sequences = deduplicate(sequences)
 
shortest = set(["".join(sequences)])
shortest_length = sum(len(s) for s in sequences)
 
for perm in permutations(sequences):
superstring = reduce(smash, perm)
superstring_length = len(superstring)
if superstring_length < shortest_length:
shortest.clear()
shortest.add(superstring)
shortest_length = superstring_length
elif superstring_length == shortest_length:
shortest.add(superstring)
 
return shortest
 
 
def print_report(sequence):
"""Writes a report to stdout for the given DNA sequence."""
buf = [f"Nucleotide counts for {sequence}:\n"]
 
counts = Counter(sequence)
for base in BASES:
buf.append(f"{base:>10}{counts.get(base, 0):>12}")
 
other = sum(v for k, v in counts.items() if k not in BASES)
buf.append(f"{'Other':>10}{other:>12}")
 
buf.append(" " * 5 + "_" * 17)
buf.append(f"{'Total length':>17}{sum(counts.values()):>5}")
 
print(os.linesep.join(buf), "\n")
 
 
if __name__ == "__main__":
test_cases = [
("TA", "AAG", "TA", "GAA", "TA"),
("CATTAGGG", "ATTAG", "GGG", "TA"),
("AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA"),
(
"ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT",
"GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT",
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT",
"GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC",
"CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT",
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC",
"GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT",
"TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
"TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA",
),
]
 
for case in test_cases:
for superstring in shortest_superstrings(case):
print_report(superstring)
 
# or ..
#
# for case in test_cases:
# print_report(shortest_superstring(case))
#
# .. if you don't want all possible shortest superstrings.
</lang>
 
{{out}}
<pre>
Nucleotide counts for GAAGTA:
 
A 3
C 0
G 2
T 1
Other 0
_________________
Total length 6
 
Nucleotide counts for TAAGAA:
 
A 4
C 0
G 1
T 1
Other 0
_________________
Total length 6
 
Nucleotide counts for TAGAAG:
 
A 3
C 0
G 2
T 1
Other 0
_________________
Total length 6
 
Nucleotide counts for CATTAGGG:
 
A 2
C 1
G 3
T 2
Other 0
_________________
Total length 8
 
Nucleotide counts for AAGAUGGAGCGCAUCGCAAUAAGGA:
 
A 10
C 4
G 8
T 0
Other 3
_________________
Total length 25
 
Nucleotide counts for CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA:
 
A 74
C 57
G 75
T 94
Other 0
_________________
Total length 300
</pre>
 
140

edits