Bioinformatics/Global alignment: Difference between revisions
Content added Content deleted
m (→{{header|Phix}}: fixed a couple of syntax glitches, added personal tag) |
(A Python implementation) |
||
Line 736: | Line 736: | ||
TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA |
TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA |
||
Base counts: Other:0, A:74, C:57, G:75, T:94, total:300 |
Base counts: Other:0, A:74, C:57, G:75, T:94, total:300 |
||
</pre> |
|||
=={{header|Python}}== |
|||
{{trans|Go}} |
|||
<lang python>import os |
|||
from collections import Counter |
|||
from functools import reduce |
|||
from itertools import permutations |
|||
BASES = ("A", "C", "G", "T") |
|||
def deduplicate(sequences): |
|||
"""Return the set of sequences with those that are a substring |
|||
of others removed too.""" |
|||
sequences = set(sequences) |
|||
duplicates = set() |
|||
for s, t in permutations(sequences, 2): |
|||
if s != t and s in t: |
|||
duplicates.add(s) |
|||
return sequences - duplicates |
|||
def smash(s, t): |
|||
"""Return `s` concatenated with `t`. The longest suffix of `s` |
|||
that matches a prefix of `t` will be removed.""" |
|||
for i in range(len(s)): |
|||
if t.startswith(s[i:]): |
|||
return s[:i] + t |
|||
return s + t |
|||
def shortest_superstring(sequences): |
|||
"""Return the shortest superstring covering all sequences. If |
|||
there are multiple shortest superstrings, an arbitrary |
|||
superstring is returned.""" |
|||
sequences = deduplicate(sequences) |
|||
shortest = "".join(sequences) |
|||
for perm in permutations(sequences): |
|||
superstring = reduce(smash, perm) |
|||
if len(superstring) < len(shortest): |
|||
shortest = superstring |
|||
return shortest |
|||
def shortest_superstrings(sequences): |
|||
"""Return a list of all shortest superstrings that cover |
|||
`sequences`.""" |
|||
sequences = deduplicate(sequences) |
|||
shortest = set(["".join(sequences)]) |
|||
shortest_length = sum(len(s) for s in sequences) |
|||
for perm in permutations(sequences): |
|||
superstring = reduce(smash, perm) |
|||
superstring_length = len(superstring) |
|||
if superstring_length < shortest_length: |
|||
shortest.clear() |
|||
shortest.add(superstring) |
|||
shortest_length = superstring_length |
|||
elif superstring_length == shortest_length: |
|||
shortest.add(superstring) |
|||
return shortest |
|||
def print_report(sequence): |
|||
"""Writes a report to stdout for the given DNA sequence.""" |
|||
buf = [f"Nucleotide counts for {sequence}:\n"] |
|||
counts = Counter(sequence) |
|||
for base in BASES: |
|||
buf.append(f"{base:>10}{counts.get(base, 0):>12}") |
|||
other = sum(v for k, v in counts.items() if k not in BASES) |
|||
buf.append(f"{'Other':>10}{other:>12}") |
|||
buf.append(" " * 5 + "_" * 17) |
|||
buf.append(f"{'Total length':>17}{sum(counts.values()):>5}") |
|||
print(os.linesep.join(buf), "\n") |
|||
if __name__ == "__main__": |
|||
test_cases = [ |
|||
("TA", "AAG", "TA", "GAA", "TA"), |
|||
("CATTAGGG", "ATTAG", "GGG", "TA"), |
|||
("AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA"), |
|||
( |
|||
"ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", |
|||
"GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", |
|||
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", |
|||
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", |
|||
"AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", |
|||
"GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", |
|||
"CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", |
|||
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", |
|||
"CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", |
|||
"GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", |
|||
"TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", |
|||
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", |
|||
"TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA", |
|||
), |
|||
] |
|||
for case in test_cases: |
|||
for superstring in shortest_superstrings(case): |
|||
print_report(superstring) |
|||
# or .. |
|||
# |
|||
# for case in test_cases: |
|||
# print_report(shortest_superstring(case)) |
|||
# |
|||
# .. if you don't want all possible shortest superstrings. |
|||
</lang> |
|||
{{out}} |
|||
<pre> |
|||
Nucleotide counts for GAAGTA: |
|||
A 3 |
|||
C 0 |
|||
G 2 |
|||
T 1 |
|||
Other 0 |
|||
_________________ |
|||
Total length 6 |
|||
Nucleotide counts for TAAGAA: |
|||
A 4 |
|||
C 0 |
|||
G 1 |
|||
T 1 |
|||
Other 0 |
|||
_________________ |
|||
Total length 6 |
|||
Nucleotide counts for TAGAAG: |
|||
A 3 |
|||
C 0 |
|||
G 2 |
|||
T 1 |
|||
Other 0 |
|||
_________________ |
|||
Total length 6 |
|||
Nucleotide counts for CATTAGGG: |
|||
A 2 |
|||
C 1 |
|||
G 3 |
|||
T 2 |
|||
Other 0 |
|||
_________________ |
|||
Total length 8 |
|||
Nucleotide counts for AAGAUGGAGCGCAUCGCAAUAAGGA: |
|||
A 10 |
|||
C 4 |
|||
G 8 |
|||
T 0 |
|||
Other 3 |
|||
_________________ |
|||
Total length 25 |
|||
Nucleotide counts for CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA: |
|||
A 74 |
|||
C 57 |
|||
G 75 |
|||
T 94 |
|||
Other 0 |
|||
_________________ |
|||
Total length 300 |
|||
</pre> |
</pre> |
||