Bioinformatics/Global alignment: Difference between revisions

Content added Content deleted

Inline

@@ Line 736: / Line 736: @@
                        TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
 Base counts: Other:0, A:74, C:57, G:75, T:94, total:300
+</pre>
+=={{header|Python}}==
+{{trans|Go}}
+<lang python>import os
+from collections import Counter
+from functools import reduce
+from itertools import permutations
+BASES = ("A", "C", "G", "T")
+def deduplicate(sequences):
+    """Return the set of sequences with those that are a substring
+    of others removed too."""
+    sequences = set(sequences)
+    duplicates = set()
+    for s, t in permutations(sequences, 2):
+        if s != t and s in t:
+            duplicates.add(s)
+    return sequences - duplicates
+def smash(s, t):
+    """Return `s` concatenated with `t`. The longest suffix of `s`
+    that matches a prefix of `t` will be removed."""
+    for i in range(len(s)):
+        if t.startswith(s[i:]):
+            return s[:i] + t
+    return s + t
+def shortest_superstring(sequences):
+    """Return the shortest superstring covering all sequences. If
+    there are multiple shortest superstrings, an arbitrary
+    superstring is returned."""
+    sequences = deduplicate(sequences)
+    shortest = "".join(sequences)
+    for perm in permutations(sequences):
+        superstring = reduce(smash, perm)
+        if len(superstring) < len(shortest):
+            shortest = superstring
+    return shortest
+def shortest_superstrings(sequences):
+    """Return a list of all shortest superstrings that cover
+    `sequences`."""
+    sequences = deduplicate(sequences)
+    shortest = set(["".join(sequences)])
+    shortest_length = sum(len(s) for s in sequences)
+    for perm in permutations(sequences):
+        superstring = reduce(smash, perm)
+        superstring_length = len(superstring)
+        if superstring_length < shortest_length:
+            shortest.clear()
+            shortest.add(superstring)
+            shortest_length = superstring_length
+        elif superstring_length == shortest_length:
+            shortest.add(superstring)
+    return shortest
+def print_report(sequence):
+    """Writes a report to stdout for the given DNA sequence."""
+    buf = [f"Nucleotide counts for {sequence}:\n"]
+    counts = Counter(sequence)
+    for base in BASES:
+        buf.append(f"{base:>10}{counts.get(base, 0):>12}")
+    other = sum(v for k, v in counts.items() if k not in BASES)
+    buf.append(f"{'Other':>10}{other:>12}")
+    buf.append(" " * 5 + "_" * 17)
+    buf.append(f"{'Total length':>17}{sum(counts.values()):>5}")
+    print(os.linesep.join(buf), "\n")
+if __name__ == "__main__":
+    test_cases = [
+        ("TA", "AAG", "TA", "GAA", "TA"),
+        ("CATTAGGG", "ATTAG", "GGG", "TA"),
+        ("AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA"),
+        (
+            "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT",
+            "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT",
+            "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
+            "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
+            "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT",
+            "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC",
+            "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT",
+            "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
+            "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC",
+            "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT",
+            "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
+            "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
+            "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA",
+        ),
+    ]
+    for case in test_cases:
+        for superstring in shortest_superstrings(case):
+            print_report(superstring)
+    # or ..
+    #
+    # for case in test_cases:
+    #     print_report(shortest_superstring(case))
+    #
+    # .. if you don't want all possible shortest superstrings.
+</lang>
+{{out}}
+<pre>
+Nucleotide counts for GAAGTA:
+         A           3
+         C           0
+         G           2
+         T           1
+     Other           0
+     _________________
+     Total length    6
+Nucleotide counts for TAAGAA:
+         A           4
+         C           0
+         G           1
+         T           1
+     Other           0
+     _________________
+     Total length    6
+Nucleotide counts for TAGAAG:
+         A           3
+         C           0
+         G           2
+         T           1
+     Other           0
+     _________________
+     Total length    6
+Nucleotide counts for CATTAGGG:
+         A           2
+         C           1
+         G           3
+         T           2
+     Other           0
+     _________________
+     Total length    8
+Nucleotide counts for AAGAUGGAGCGCAUCGCAAUAAGGA:
+         A          10
+         C           4
+         G           8
+         T           0
+     Other           3
+     _________________
+     Total length   25
+Nucleotide counts for CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA:
+         A          74
+         C          57
+         G          75
+         T          94
+     Other           0
+     _________________
+     Total length  300
 </pre>