Bioinformatics/Global alignment: Difference between revisions

← Older edit

Bioinformatics/Global alignment (view source)

Revision as of 09:33, 8 November 2023

68,065 bytes added , 6 months ago

m

→‎{{header|Wren}}: Minor tidy

PureFox

9,476

edits

Revision as of 07:57, 8 February 2021 (view source) Petelomax (talk \| contribs) m (→‎{{header\|Phix}}) ← Older edit		Latest revision as of 09:33, 8 November 2023 (view source) PureFox (talk \| contribs) m (→‎{{header\|Wren}}: Minor tidy)
(24 intermediate revisions by 13 users not shown)
Line 1: {{task}} ~~{{Draft task}}[[Category:Bioinfomatics]][[Category:Strings]]~~ [[Category:Bioinfomatics]] [[Category:Strings]] Global alignment is designed to search for highly similar regions in two or more DNA sequences, where the sequences appear in the same order and orientation, fitting the sequences in as pieces in a puzzle. Line 53 ⟶ 55: :* [[Bioinformatics/Sequence_mutation\|Bioinformatics sequence mutation]]. <br><br> =={{header\|11l}}== {{trans\|Nim}} <syntaxhighlight lang="11l">-V ACGT = [‘A’, ‘C’, ‘G’, ‘T’] F permutations(slist) V l = sorted(slist) V r = [l] L l.next_permutation() r [+]= copy(l) R r F printCounts(dnaSeq) DefaultDict[Char, Int] counts L(c) dnaSeq counts[c]++ print("\nNucleotide counts for #.:\n".format(dnaSeq)) L(base) :ACGT print(‘#10 #11’.format(base, counts[base])) V others = 0 L(base) counts.keys() I base !C :ACGT others += counts[base] print(‘ Other #11’.format(others)) print(‘ --------------------’) print(‘ Total length #7’.format(dnaSeq.len)) F headTailOverlap(s1, s2) V start = 0 L V? n = s1.find(s2[0], start) I n == N R 0 start = n I s2.starts_with(s1[start..]) R s1.len - start start++ F deduplicate(slist) [String] r V s = Set(slist) L(s1) s V i = L.index L(s2) s I L.index != i & s1 C s2 L.break L.was_no_break r.append(s1) R r F shortestCommonSuperstring(sl) V slist = deduplicate(sl) V result = slist.join(‘’) L(perm) permutations(slist) String sup = perm[0] L(i) 0 .< slist.len - 1 V overlapPos = headTailOverlap(perm[i], perm[i + 1]) sup ‘’= perm[i + 1][overlapPos..] I sup.len < result.len result = sup R result V TestSequences = [ [‘TA’, ‘AAG’, ‘TA’, ‘GAA’, ‘TA’], [‘CATTAGGG’, ‘ATTAG’, ‘GGG’, ‘TA’], [‘AAGAUGGA’, ‘GGAGCGCAUC’, ‘AUCGCAAUAAGGA’], [‘ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT’, ‘GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT’, ‘CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA’, ‘TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC’, ‘AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT’, ‘GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC’, ‘CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT’, ‘TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC’, ‘CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC’, ‘GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT’, ‘TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC’, ‘CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA’, ‘TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA’]] L(test) TestSequences V scs = shortestCommonSuperstring(test) printCounts(scs)</syntaxhighlight> {{out}} <pre> Nucleotide counts for GAAGTA: A 3 C 0 G 2 T 1 Other 0 -------------------- Total length 6 Nucleotide counts for CATTAGGG: A 2 C 1 G 3 T 2 Other 0 -------------------- Total length 8 Nucleotide counts for AAGAUGGAGCGCAUCGCAAUAAGGA: A 10 C 4 G 8 T 0 Other 3 -------------------- Total length 25 Nucleotide counts for CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA: A 74 C 57 G 75 T 94 Other 0 -------------------- Total length 300 </pre> =={{header\|C++}}== <syntaxhighlight lang="c++"> #include <algorithm> #include <cstdint> #include <iostream> #include <numeric> #include <unordered_map> #include <unordered_set> #include <string> #include <vector> // Print a report of the given string to the standard output device. void print_report(const std::string& text) { std::unordered_map<char, int32_t> bases; for ( const char& ch : text ) { bases[ch]++; } const int32_t total = std::accumulate(bases.begin(), bases.end(), 0, [&](int32_t previous_sum, std::pair<char, int32_t> entry) { return previous_sum + entry.second; }); std::cout << "Nucleotide counts for: " << ( ( text.length() > 50 ) ? "\n" : "" ); std::cout << text << std::endl; std::cout << "Bases: A " << bases['A'] << ", C: " << bases['C'] << ", G: " << bases['G'] << ", T: " << bases['T'] << ", total: " << total << "\n" << std::endl; } // Return all permutations of the given list of strings. std::vector<std::vector<std::string>> permutations(std::vector<std::string>& list) { int32_t indexes[list.size()] = {}; std::vector<std::vector<std::string>> result; result.push_back(list); int32_t i = 0; while ( (uint64_t) i < list.size() ) { if ( indexes[i] < i ) { const int j = ( i % 2 == 0 ) ? 0 : indexes[i]; std::swap(list[i], list[j]); result.push_back(list); indexes[i]++; i = 0; } else { indexes[i] = 0; i++; } } return result; } // Return 'before' concatenated with 'after', removing the longest suffix of 'before' that matches a prefix of 'after'. std::string concatenate(const std::string& before, const std::string& after) { for ( uint64_t i = 0; i < before.length(); ++i ) { if ( after.starts_with(before.substr(i, before.length())) ) { return before.substr(0, i) + after; } } return before + after; } // Remove duplicate strings and strings which are substrings of other strings in the given list. std::vector<std::string> deduplicate(const std::vector<std::string>& list) { std::vector<std::string> singletons(list); std::sort(singletons.begin(), singletons.end()); singletons.erase(std::unique(singletons.begin(), singletons.end()), singletons.end()); std::vector<std::string> result(singletons); std::unordered_set<std::string> marked_for_removal; for ( const std::string& test_word : result ) { for ( const std::string& word : singletons ) { if ( word != test_word && word.find(test_word) != std::string::npos ) { marked_for_removal.emplace(test_word); } } } result.erase(std::remove_if(result.begin(), result.end(), [&](std::string& word) { return marked_for_removal.count(word) != 0; } ), result.end()); return result; } // Return a set containing all of the shortest common superstrings of the given list of strings. std::unordered_set<std::string> shortest_common_superstrings(const std::vector<std::string>& list) { std::vector<std::string> deduplicated = deduplicate(list); std::unordered_set<std::string> shortest; shortest.emplace(std::reduce(list.begin(), list.end(), std::string(""))); uint64_t shortest_length; for ( const std::string& word : list ) { shortest_length += word.length(); } for ( std::vector<std::string> permutation : permutations(deduplicated) ) { std::string candidate; for ( const std::string& word : permutation ) { candidate = concatenate(candidate, word); } if ( candidate.length() < shortest_length ) { shortest.clear(); shortest.emplace(candidate); shortest_length = candidate.length(); } else if ( candidate.length() == shortest_length ) { shortest.emplace(candidate); } } return shortest; } int main() { const std::vector<std::vector<std::string>> test_sequences = { { "TA", "AAG", "TA", "GAA", "TA" }, { "CATTAGGG", "ATTAG", "GGG", "TA" }, { "AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA" }, { "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA" } }; for ( const std::vector<std::string>& test : test_sequences ) { for ( const std::string& superstring : shortest_common_superstrings(test) ) { print_report(superstring); } } } </syntaxhighlight> <pre> Nucleotide counts for: TAGAAG Bases: A 3, C: 0, G: 2, T: 1, total: 6 Nucleotide counts for: TAAGAA Bases: A 4, C: 0, G: 1, T: 1, total: 6 Nucleotide counts for: GAAGTA Bases: A 3, C: 0, G: 2, T: 1, total: 6 Nucleotide counts for: CATTAGGG Bases: A 2, C: 1, G: 3, T: 2, total: 8 Nucleotide counts for: AAGAUGGAGCGCAUCGCAAUAAGGA Bases: A 10, C: 4, G: 8, T: 0, total: 25 Nucleotide counts for: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA Bases: A 74, C: 57, G: 75, T: 94, total: 300 </pre> =={{header\|Go}}== {{trans\|Julia}} <syntaxhighlight lang="go">package main import ( "fmt" "strings" ) /* Gets n! for small n. / func factorial(n int) int { fact := 1 for i := 2; i <= n; i++ { fact = i } return fact } /* Gets all permutations of a list of strings. / func getPerms(input []string) [][]string { perms := [][]string{input} le := len(input) a := make([]string, le) copy(a, input) n := le - 1 fact := factorial(n + 1) for c := 1; c < fact; c++ { i := n - 1 j := n for i >= 0 && a[i] > a[i+1] { i-- } if i == -1 { i = n } for a[j] < a[i] { j-- } a[i], a[j] = a[j], a[i] j = n i++ if i == n+1 { i = 0 } for i < j { a[i], a[j] = a[j], a[i] i++ j-- } b := make([]string, le) copy(b, a) perms = append(perms, b) } return perms } / Returns all distinct elements from a list of strings. / func distinct(slist []string) []string { distinctSet := make(map[string]int, len(slist)) i := 0 for _, s := range slist { if _, ok := distinctSet[s]; !ok { distinctSet[s] = i i++ } } result := make([]string, len(distinctSet)) for s, i := range distinctSet { result[i] = s } return result } / Given a DNA sequence, report the sequence, length and base counts. / func printCounts(seq string) { bases := [][]rune{{'A', 0}, {'C', 0}, {'G', 0}, {'T', 0}} for _, c := range seq { for _, base := range bases { if c == base[0] { base[1]++ } } } sum := 0 fmt.Println("\nNucleotide counts for", seq, "\b:\n") for _, base := range bases { fmt.Printf("%10c%12d\n", base[0], base[1]) sum += int(base[1]) } le := len(seq) fmt.Printf("%10s%12d\n", "Other", le-sum) fmt.Printf(" ____________________\n%14s%8d\n", "Total length", le) } / Return the position in s1 of the start of overlap of tail of string s1 with head of string s2. / func headTailOverlap(s1, s2 string) int { for start := 0; ; start++ { ix := strings.IndexByte(s1[start:], s2[0]) if ix == -1 { return 0 } else { start += ix } if strings.HasPrefix(s2, s1[start:]) { return len(s1) - start } } } / Remove duplicates and strings contained within a larger string from a list of strings. / func deduplicate(slist []string) []string { var filtered []string arr := distinct(slist) for i, s1 := range arr { withinLarger := false for j, s2 := range arr { if j != i && strings.Contains(s2, s1) { withinLarger = true break } } if !withinLarger { filtered = append(filtered, s1) } } return filtered } / Returns shortest common superstring of a list of strings. / func shortestCommonSuperstring(slist []string) string { ss := deduplicate(slist) shortestSuper := strings.Join(ss, "") for _, perm := range getPerms(ss) { sup := perm[0] for i := 0; i < len(ss)-1; i++ { overlapPos := headTailOverlap(perm[i], perm[i+1]) sup += perm[i+1][overlapPos:] } if len(sup) < len(shortestSuper) { shortestSuper = sup } } return shortestSuper } func main() { testSequences := [][]string{ {"TA", "AAG", "TA", "GAA", "TA"}, {"CATTAGGG", "ATTAG", "GGG", "TA"}, {"AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA"}, { "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA", }, } for _, test := range testSequences { scs := shortestCommonSuperstring(test) printCounts(scs) } }</syntaxhighlight> {{out}} <pre> Nucleotide counts for TAAGAA: A 4 C 0 G 1 T 1 Other 0 ____________________ Total length 6 Nucleotide counts for CATTAGGG: A 2 C 1 G 3 T 2 Other 0 ____________________ Total length 8 Nucleotide counts for AAGAUGGAGCGCAUCGCAAUAAGGA: A 10 C 4 G 8 T 0 Other 3 ____________________ Total length 25 Nucleotide counts for CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA: A 74 C 57 G 75 T 94 Other 0 ____________________ Total length 300 </pre> =={{header\|Java}}== <syntaxhighlight lang="java"> import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; public final class BioinformaticsGlobalAlignment { public static void main(String[] aArgs) { List<List<String>> testSequences = Arrays.asList( Arrays.asList( "TA", "AAG", "TA", "GAA", "TA" ), Arrays.asList( "CATTAGGG", "ATTAG", "GGG", "TA" ), Arrays.asList( "AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA" ), Arrays.asList( "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA" ) ); for ( List<String> test : testSequences ) { for ( String superstring : shortestCommonSuperstrings(test) ) { printReport(superstring); } } } // Return a set containing all of the shortest common superstrings of the given list of strings. private static Set<String> shortestCommonSuperstrings(List<String> aList) { List<String> deduplicated = deduplicate(aList); Set<String> shortest = new HashSet<String>(); shortest.add(String.join("", deduplicated)); int shortestLength = aList.stream().mapToInt( s -> s.length() ).sum(); for ( List<String> permutation : permutations(deduplicated) ) { String candidate = permutation.stream().reduce("", (a, b) -> concatenate(a, b) ); if ( candidate.length() < shortestLength ) { shortest.clear(); shortest.add(candidate); shortestLength = candidate.length(); } else if ( candidate.length() == shortestLength ) { shortest.add(candidate); } } return shortest; } // Remove duplicate strings and strings which are substrings of other strings in the given list. private static List<String> deduplicate(List<String> aList) { List<String> unique = aList.stream().distinct().collect(Collectors.toList()); List<String> result = new ArrayList<String>(unique); List<String> markedForRemoval = new ArrayList<String>(); for ( String testWord : result ) { for ( String word : unique ) { if ( ! word.equals(testWord) && word.contains(testWord) ) { markedForRemoval.add(testWord); } } } result.removeAll(markedForRemoval); return result; } // Return aBefore concatenated with aAfter, removing the longest suffix of aBefore that matches a prefix of aAfter. private static String concatenate(String aBefore, String aAfter) { for ( int i = 0; i < aBefore.length(); i++ ) { if ( aAfter.startsWith(aBefore.substring(i, aBefore.length())) ) { return aBefore.substring(0, i) + aAfter; } } return aBefore + aAfter; } // Return all permutations of the given list of strings. private static List<List<String>> permutations(List<String> aList) { int[] indexes = new int[aList.size()]; List<List<String>> result = new ArrayList<List<String>>(); result.add( new ArrayList<String>(aList) ); int i = 0; while ( i < aList.size() ) { if ( indexes[i] < i ) { final int j = ( i % 2 == 0 ) ? 0 : indexes[i]; String temp = aList.get(j); aList.set(j, aList.get(i)); aList.set(i, temp); result.add( new ArrayList<String>(aList) ); indexes[i]++; i = 0; } else { indexes[i] = 0; i += 1; } } return result; } // Print a report of the given string to the standard output device. private static void printReport(String aText) { char[] nucleotides = new char[] {'A', 'C', 'G', 'T' }; Map<Character, Integer> bases = new HashMap<Character, Integer>(); for ( char base : nucleotides ) { bases.put(base, 0); } for ( char ch : aText.toCharArray() ) { bases.merge(ch, 1, Integer::sum); } final int total = bases.values().stream().reduce(0, Integer::sum); System.out.print("Nucleotide counts for: " + ( ( aText.length() > 50 ) ? System.lineSeparator() : "") ); System.out.println(aText); System.out.print(String.format("%s%d%s%d%s%d%s%d", "Bases: A: ", bases.get('A'), ", C: ", bases.get('C'), ", G: ", bases.get('G'), ", T: ", bases.get('T'))); System.out.println(", total: " + total + System.lineSeparator()); } } </syntaxhighlight> {{ out }} <pre> Nucleotide counts for: TAGAAG Bases: A: 3, C: 0, G: 2, T: 1, total: 6 Nucleotide counts for: GAAGTA Bases: A: 3, C: 0, G: 2, T: 1, total: 6 Nucleotide counts for: TAAGAA Bases: A: 4, C: 0, G: 1, T: 1, total: 6 Nucleotide counts for: CATTAGGG Bases: A: 2, C: 1, G: 3, T: 2, total: 8 Nucleotide counts for: AAGAUGGAGCGCAUCGCAAUAAGGA Bases: A: 10, C: 4, G: 8, T: 0, total: 25 Nucleotide counts for: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA Bases: A: 74, C: 57, G: 75, T: 94, total: 300 </pre> =={{header\|jq}}== {{works with\|jq}} '''Works with gojq, the Go implementation of jq''' <syntaxhighlight lang="jq"> ### Generic helper functions # bag-of-words def bow(stream): reduce stream as $word ({}; .[($word\|tostring)] += 1); def permutations: if length == 0 then [] else range(0;length) as $i \| [.[$i]] + (del(.[$i])\|permutations) end ;</syntaxhighlight><syntaxhighlight lang="jq"> # Give a synoptic view of the input string, # highlighting the occurrence of ACGTU letters def synopsis: ["A", "C", "G", "T", "U"] as $standard \| . as $seq \| bow(explode \| map([.]\|implode)[]) as $bases \| ("Nucleotide counts for \($seq):\n"), (($standard + ($bases\|keys - $standard))[] \| "\(.): \($bases[.]//0)"), "__", "Σ: \($seq\|length)" ; # If the strings, $s1 and $s2, overlap by at least $minimumoverlap characters, # return { i1: <index in $s1 where overlap starts>, overlap: <overlapping string>}, # otherwise, return null def overlap_info($s1; $s2; $minimumoverlap): first( range(0; $s1\|length + 1 - $minimumoverlap) as $i1 \| $s1[$i1:] as $overlap \| select($s2 \| startswith($overlap)) \| {$i1, $overlap} ) // null ; # Input: an array of strings # Remove duplicates and strings contained within a larger string def deduplicate: unique \| . as $arr \| reduce range(0;length) as $i ([]; $arr[$i] as $s1 \| if any( $arr[] \| select(. != $s1); index($s1)) then . else . + [$s1] end); # Given an array of deduplicated strings, attempt to find a superstring # composed of these strings in the same order; # return it if found, else null. def relevant($min): . as $in \| length as $length \| {s: .[0], i:0} \| until (.s == null or .i >= $length - 1; .i as $i # Since the strings have been deduplicated we can use $in[$i]: \| overlap_info($in[$i]; $in[$i+1]; $min) as $overlap \| if $overlap then .s += $in[$i+1][$overlap.overlap\|length:] else .s = null end \| .i += 1 ) \| .s ; # Input: an array of strings # Return shortest common superstring def shortest_common_superstring: deduplicate as $ss \| reduce ($ss \| permutations) as $perm ({shortestsuper: ($ss \| add) }; ($perm \| relevant(1)) as $candidate \| if $candidate and ($candidate\|length) < (.shortestsuper\|length) then .shortestsuper = $candidate else . end) \| .shortestsuper; </syntaxhighlight> '''The specific tasks''' <syntaxhighlight lang="jq"> def examples: [ ["TA", "AAG", "TA", "GAA", "TA"], ["CATTAGGG", "ATTAG", "GGG", "TA"], ["AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA"], ["ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA"] ]; def tasks: def t: shortest_common_superstring \| synopsis; examples \| . as $examples \| range(0;length) as $i \| "Task \($i+1):", ($examples[$i]\|t), ""; tasks</syntaxhighlight> {{out}} <pre> Task 1: Nucleotide counts for TAAGAA: A: 4 C: 0 G: 1 T: 1 U: 0 __ Σ: 6 Task 2: Nucleotide counts for CATTAGGG: A: 2 C: 1 G: 3 T: 2 U: 0 __ Σ: 8 Task 3: Nucleotide counts for AAGAUGGAGCGCAUCGCAAUAAGGA: A: 10 C: 4 G: 8 T: 0 U: 3 __ Σ: 25 Task 4: Nucleotide counts for CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA: A: 74 C: 57 G: 75 T: 94 U: 0 __ Σ: 300 </pre> =={{header\|Julia}}== <~~lang~~syntaxhighlight lang="julia">using Combinatorics """ Given a DNA sequence, report the sequence, length and base counts""" Line 137 ⟶ 954: printcounts(scs) end </~~lang~~syntaxhighlight>{{out}} <pre> Nucleotide counts for TAAGAA: Line 179 ⟶ 996: Total length 300 </pre> =={{header\|Nim}}== {{trans\|Wren}} <syntaxhighlight lang="nim">import algorithm, sequtils, strformat, strutils, tables const ACGT = ['A', 'C', 'G', 'T'] # Four DNA bases. ~~=={{header\|Phix}}==~~ ~~<lang Phix>procedure printcounts(sequence ss)~~ iterator permutations(slist: seq[string]): seq[string] = ~~-- Given DNA sequence(s), report the sequence, length and base counts~~ var slist ~~for i~~=1 ~~to length~~sorted(ssslist) do yield slist ~~string dna = ss[i]~~ while slist.nextPermutation(): ~~sequence acgt = repeat(0,6)~~ yield slist ~~for j=1 to length(dna) do~~ ~~acgt[find(dna[j],"ACGT")+1] += 1~~ ~~end for~~ proc printCounts(dnaSeq: string) = ~~acgt[$] = sum(acgt)~~ ## Given a DNA sequence, report ~~string~~the ~~ncf~~sequence, =length ~~"Nucleotide~~and base counts ~~for :"~~. let counts = dnaSeq.toCountTable() ~~printf(1,"%s%s\n",{ncf,join(split_by(dna,50),"\n"&repeat(' ',length(ncf)))})~~ echo &"\nNucleotide counts for {dnaSeq}:\n" ~~printf(1,"Base counts: Other:%d, A:%d, C:%d, G:%d, T:%d, total:%d\n\n",acgt)~~ for base ~~end~~in ~~for~~ACGT: echo &"{($base):>10} {counts[base]:11}" ~~end procedure~~ var others = 0 for base in counts.keys: ~~function deduplicate(sequence ss)~~ if base notin ACGT: inc others, counts[base] ~~-- Remove any strings contained within a larger string from a set of strings~~ echo &" Other {others:11}" ~~sequence filtered = {}~~ echo &" ————————————————————" ~~for i=1 to length(ss) do~~ echo &" Total length {dnaSeq.len: 7}" ~~string si = ss[i]~~ ~~bool found = false~~ ~~for j=1 to length(ss) do~~ func headTailOverlap(s1, s2: string): int = ~~if i!=j and match(si,ss[j]) then~~ ## Return the position in "s1" of the start of overlap ~~found = true~~ ## of tail of string "s1" with head of string ~~exit~~"s2". var start = 0 ~~end if~~ while true: ~~end for~~ start = s1.find(s2[0], start) ~~if not found then~~ if start < 0: return 0 ~~filtered = append(filtered, si)~~ if s2.startsWith(s1[start..^1]): return s1.len - start ~~end if~~ ~~end~~inc ~~for~~start ~~return filtered~~ ~~end function~~ proc deduplicate(slist: seq[string]): seq[string] = ## Remove duplicates and strings contained within a larger string from a list of strings. ~~procedure shortest_common_superstring(sequence ss)~~ let slist = sequtils.deduplicate(slist) ~~-- Returns shortest common superstring of a set of strings~~ for i, s1 in slist: ~~ss = deduplicate(unique(ss))~~ block check: ~~sequence shortestsuper = {join(ss,"")}~~ for j, s2 in slist: ~~integer shortest = length(shortestsuper[1])~~ if j != i and s1 in s2: ~~for p=1 to factorial(length(ss)) do~~ ~~sequence~~ ~~perm~~ =break ~~permute(p,ss)~~check # "s1" ~~string~~is ~~sup~~not =contained ~~perm[1]~~in another string. result.add s1 ~~for i=2 to length(perm) do~~ ~~string pi = perm[i]~~ ~~for j=-min(length(pi),length(sup)) to 0 do~~ func shortestCommonSuperstring(slist: seq[string]): string = ~~string overlap = sup[j..$]~~ ## Return shortest common superstring of a list of strings. ~~if overlap = pi[1..length(overlap)] then~~ ~~sup &= pi[length(overlap)+1..$]~~ let slist = slist.deduplicate() ~~pi = ""~~ result = slist.join() ~~exit~~ for perm in slist.permutations(): ~~end if~~ var sup = ~~end for~~perm[0] for i in 0..<slist.high: ~~if length(pi) then ?9/0 end if -- (sanity chk)~~ let overlapPos = headTailOverlap(perm[i], perm[i+1]) ~~end for~~ sup &= perm[i+1][overlapPos..^1] ~~if length(sup) < shortest then~~ if sup.len < result.len: ~~shortest~~result = ~~length(~~sup) ~~shortestsuper = {sup}~~ ~~elsif length(sup) = shortest~~ const TestSequences = [ ~~and not find(sup,shortestsuper) then~~ @["TA", "AAG", "TA", "GAA", "TA"], ~~shortestsuper = append(shortestsuper,sup)~~ @["CATTAGGG", "ATTAG", "GGG", "TA"], ~~end if~~ @["AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA"], ~~end for~~ @["ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", ~~printcounts(shortestsuper)~~ "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", ~~end procedure~~ "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", ~~constant tests = {~~ "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", ~~{"TA", "AAG", "TA", "GAA", "TA"},~~ "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", ~~{"CATTAGGG", "ATTAG", "GGG", "TA"},~~ "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", ~~{"AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA"},~~ "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", ~~{"ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT",~~ "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", ~~"GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT",~~ "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", ~~"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",~~ "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", ~~"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",~~ "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", ~~"AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT",~~ "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA"]] ~~"GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC",~~ ~~"CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT",~~ for test in TestSequences: ~~"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",~~ let scs = test.shortestCommonSuperstring ~~"CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC",~~ scs.printCounts()</syntaxhighlight> ~~"GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT",~~ ~~"TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",~~ {{out}} ~~"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",~~ <pre>Nucleotide counts for GAAGTA: ~~"TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA"}~~ A 3 C 0 G 2 T 1 Other 0 ———————————————————— Total length 6 Nucleotide counts for CATTAGGG: A 2 C 1 G 3 T 2 Other 0 ———————————————————— Total length 8 Nucleotide counts for AAGAUGGAGCGCAUCGCAAUAAGGA: A 10 C 4 G 8 T 0 Other 3 ———————————————————— Total length 25 Nucleotide counts for CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA: A 74 C 57 G 75 T 94 Other 0 ———————————————————— Total length 300</pre> =={{header\|Pascal}}== Used a matrix of head-tail overlapping and modified n-queens to generate the permutations.<BR> Here nearly no runtime.But see [[N-queens_problem]] that using permutation is not the way for > 17<BR> Of course this is more a traveling salesman problem. <syntaxhighlight lang="pascal"> program BaseInDNA; {$IFDEF FPC} {$mode Delphi} {$Optimization ON,All} {$ELSE} {$APPTYPE CONSOLE} {$ENDIF} uses sysutils,classes; type tmyString = AnsiString;//[255]; tpMyString = ^tmyString; tOvrLapMat = array of array of Int32; tNextDNA = array of Int32; tpNextDNA = pInt32; const convDgtBase :array['1'..'5'] of char = ('A','C','G','T','U'); Test1 : array[0..4] of tmyString = ('TA','AAG','TA','GAA','TA'); Test2 : array[0..3] of tmyString = ('CATTAGGG', 'ATTAG', 'GGG', 'TA'); Test3 : array[0..2] of tmyString = ('AAGAUGGA', 'GGAGCGCAUC', 'AUCGCAAUAAGGA'); Test4 : array[0..12] of tmyString = ('ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT', 'GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT', 'CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA', 'TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC', 'AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT', 'GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC', 'CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT', 'TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC', 'CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC', 'GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT', 'TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC', 'CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA', 'TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA'); var sl_DNA : TStringList; OverlapMat : tOvrLapMat; SolDNA : tNextDNA; pNextDNA : tpNextDNA; DNA_Count,MAX,LastMax : Int32; function ConvertACGT_1234(const s:AnsiString):AnsiString; const conv :array['A'..'U'] of char = ('1',#0,'2',#0,#0,#0,'3',#0,#0, #0,#0,#0,#0,#0,#0,#0,#0,#0, #0,'4','5'); var pC: pChar; i : NativeInt; begin i := Length(s); setlength(result,i); pC := @result[1]; dec(i); while i >= 0 do Begin pC[i] := conv[s[i+1]]; dec(i); end; end; function Convert1234_ACGTU(const s:AnsiString):AnsiString; var pC: pChar; i : NativeInt; begin i := Length(s); setlength(result,i); pC := @result[1]; dec(i); while i >= 0 do Begin pC[i] := convDgtBase[s[i+1]]; dec(i); end; end; procedure Check_Base_Count(const s: ANsiString); var bc : ANsiString; BaseCnt : array[0..4] of UInt32; pC: pChar; i : NativeInt; Begin writeln('Total length : ',Length(s)); bc := ConvertACGT_1234(s); FillChar(BaseCnt,SizeOf(BaseCnt),#0); pC := @bc[1]; for i := length(bc)-1 downto 0 do inc(BaseCnt[Ord(pC[i])-Ord('1')]); For i := 0 to 4 do write(convDgtBase[chr(i+49)],' : ',BaseCnt[i]:3,' '); writeln; end; procedure extract_double(var sl : TStringList); var i,j : NativeInt; begin sl.sort; for i := sl.count-2 downto 0 do if sl[i] = sl[i+1] then sl.delete(i+1); i := sl.count-1; repeat For j := i-1 Downto 0 do Begin if (Pos(sl[j],sl[i]) >0) then Begin sl.delete(j); i := sl.count; BREAK; end else if (Pos(sl[i],sl[j]) >0) then Begin sl.delete(i); i := sl.count; BREAK; end; end; dec(i); until i < 1; end; procedure InsertSL(var sl : TStringList;pS :tpMyString;cnt:NativeInt); Begin sl.clear; while cnt > 0 do Begin sl.Append(pS^); inc(pS); dec(cnt); end; extract_double(sl); sl.sort; end; function Check_Head_Tail(const s1,s2: AnsiString):NativeInt; var cH : AnsiChar; i,j,k : NativeInt; Begin result := 0; j := length(s1); cH := s2[1]; repeat if s1[j]= cH then Begin i:= 1; k := j; while (s1[k] = s2[i]) AND (k <= length(s1)) do begin inc(i); inc(k); end; if k > length(s1) then result := length(s1)-j+1; end; dec(j); until j <1; end; function CreateOvrLapMat(const sl_DNA:TStringList):tOvrLapMat; var col,row,DNAlen : NativeInt; begin DNAlen := sl_DNA.Count; setlength(result,DNAlen,DNAlen); dec(DNAlen); For row := DNAlen downto 0 do For col := DNAlen downto 0 do if row<>col then result[row,col] := Check_Head_Tail(sl_DNA[row],sl_DNA[col]); {//output of matrix For row := 0 to DNAlen do Begin For col := 0 to DNAlen do write(OverlapMat[row,col]:3); writeln; end; } ~~papply(tests, shortest_common_superstring)</lang>~~ end; procedure SetQueen(Row,sum,lastIdx:NativeInt); var i,NextIdx,dSum : nativeInt; begin IF row <= DNA_Count-1 then begin For i := row to DNA_Count-1 do begin NextIdx := pNextDNA[i];pNextDNA[i] := pNextDNA[Row];pNextDNA[Row] := NextIdx; dSum :=OverlapMat[lastidx,NextIdx]; sum += dSum; SetQueen(Row+1,sum,NextIdx); sum -= dSum; pNextDNA[Row] := pNextDNA[i];pNextDNA[i] := NextIdx; end; end else begin //solution found could be modified MAX<=sum for more solutions of same length If MAX<sum then Begin MAX := sum; // remember the way for i := DNA_Count-1 downto 0 do SolDNA[i+1] := pNextDNA[i]; end; end; end; procedure Find; var col,row,i : NativeInt; NextDNA : tNextDNA; Combined : AnsiString; Begin DNA_Count := sl_DNA.count; IF DNA_Count = 1 then Combined := sl_DNA[0] else Begin setlength(SolDNA,DNA_count); dec(DNA_Count); setlength(NextDNA,DNA_count); //Tail-Head-Matrix OverlapMat := CreateOvrLapMat(sl_DNA); MAX := 0; LastMax := 0; pNextDNA := @NextDNA[0]; //start with base_sequence[row] for row := 0 to DNA_count do begin i := 0; For col := 0 to DNA_count do if row<>col then begin pNextDNA[i] := col; inc(i); end; SetQueen(0,0,row); If LastMax< MAX then begin SolDNA[0]:= row; LastMax := MAX; end; end; Combined := ''; for col := 0 to DNA_Count-1 do Begin write(SolDNA[col]+1,'->'); row := length(sl_DNA[SolDNA[col]]); Combined += copy(sl_DNA[SolDNA[col]],1,row-OverlapMat[SolDNA[col],SolDNA[col+1]]); end; writeln(SolDNA[DNA_Count]+1); Combined += sl_DNA[SolDNA[DNA_Count]]; LastMax := 0; for col := 0 to DNA_Count do inc(LastMax,Length(sl_DNA[col])); IF LastMax-MAX <> length(combined) then writeln(LastMax,'-',Max,' = ',LastMax-MAX,' ?=? ',length(combined)); end; writeln(combined); Check_Base_Count(combined); writeln; end; BEGIN sl_DNA := TStringList.create; InsertSL(sl_DNA,@Test1[0],High(Test1)+1); find; InsertSL(sl_DNA,@Test2[0],High(Test2)+1); find; InsertSL(sl_DNA,@Test3[0],High(Test3)+1); find; InsertSL(sl_DNA,@Test4[0],High(Test4)+1); find; END. </syntaxhighlight> {{out}} <pre> 2->1->3 GAAGTA Total length : 6 A : 3 C : 0 G : 2 T : 1 U : 0 CATTAGGG Total length : 8 A : 2 C : 1 G : 3 T : 2 U : 0 1->3->2 AAGAUGGAGCGCAUCGCAAUAAGGA Total length : 25 A : 10 C : 4 G : 8 T : 0 U : 3 1->6->7->5->4->2->3 CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA Total length : 300 A : 74 C : 57 G : 75 T : 94 U : 0 </pre> =={{header\|Perl}}== <syntaxhighlight lang="perl">#!/usr/bin/perl use strict; # https://rosettacode.org/wiki/Bioinformatics/global_alignment use warnings; use List::Util qw( first uniq ); my @seq = ( [ qw( TA AAG TA GAA TA ) ], [ qw( CATTAGGG ATTAG GGG TA) ], [ qw( AAGAUGGA GGAGCGCAUC AUCGCAAUAAGGA ) ], [ qw( ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA ) ], ); sub removedups # remove dups and subseqs { local $_ = join ' ', sort { length $a <=> length $b } split ' ', shift; 1 while s/\b(\w+) (?=.\1)//; return $_; } for ( @seq ) { local $_ = removedups join ' ', @$_; my @queue = $_; my @best; while( @queue ) { local $_ = shift @queue; my @seq = split ' ', $_; my @over; for my $left ( @seq ) { for my $right ( @seq ) { $left eq $right and next; "$left $right" =~ /(.+) \1/ or next; my $len = length $1; $over[$len] .= "$left $right\n"; } } if( @over ) { for my $join ( split /\n/, $over[-1] ) { my ($left, $right) = split ' ', $join; my @newseq = grep $_ ne $left && $_ ne $right, @seq; # remove used push @queue, removedups "$left $right" =~ s/(.+) (?=\1)//r . join ' ', '', @newseq; } } else { tr/ //d; $best[length] .= "$_\n"; next; } } for ( uniq split /\n/, first {defined} @best ) { printf "\nlength %d - %s\n", length, $_; my %ch; $ch{$_}++ for /./g; use Data::Dump 'dd'; dd \%ch; } }</syntaxhighlight> {{out}} <pre> length 6 - TAGAAG { A => 3, G => 2, T => 1 } length 8 - CATTAGGG { A => 2, C => 1, G => 3, T => 2 } length 25 - AAGAUGGAGCGCAUCGCAAUAAGGA { A => 10, C => 4, G => 8, U => 3 } length 300 - CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA { A => 74, C => 57, G => 75, T => 94 } </pre> =={{header\|Phix}}== <!--<syntaxhighlight lang="phix">(phixonline)--> <span style="color: #008080;">procedure</span> <span style="color: #000000;">printcounts</span><span style="color: #0000FF;">(</span><span style="color: #004080;">sequence</span> <span style="color: #000000;">ss</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">-- Given DNA sequence(s), report the sequence, length and base counts</span> <span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">ss</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span> <span style="color: #004080;">string</span> <span style="color: #000000;">dna</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">ss</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">acgt</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #000000;">6</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">for</span> <span style="color: #000000;">j</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">dna</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span> <span style="color: #000000;">acgt</span><span style="color: #0000FF;">[</span><span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">dna</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">],</span><span style="color: #008000;">"ACGT"</span><span style="color: #0000FF;">)+</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span> <span style="color: #008080;">end</span> <span style="color: #008080;">for</span> <span style="color: #000000;">acgt</span><span style="color: #0000FF;">[$]</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sum</span><span style="color: #0000FF;">(</span><span style="color: #000000;">acgt</span><span style="color: #0000FF;">)</span> <span style="color: #004080;">string</span> <span style="color: #000000;">ncf</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"Nucleotide counts for :"</span> <span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%s%s\n"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">ncf</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">join</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">split_by</span><span style="color: #0000FF;">(</span><span style="color: #000000;">dna</span><span style="color: #0000FF;">,</span><span style="color: #000000;">50</span><span style="color: #0000FF;">),</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">&</span><span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">ncf</span><span style="color: #0000FF;">)))})</span> <span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"Base counts: Other:%d, A:%d, C:%d, G:%d, T:%d, total:%d\n\n"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">acgt</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">for</span> <span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span> <span style="color: #008080;">function</span> <span style="color: #000000;">deduplicate</span><span style="color: #0000FF;">(</span><span style="color: #004080;">sequence</span> <span style="color: #000000;">ss</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">-- Remove any strings contained within a larger string from a set of strings</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">filtered</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span> <span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">ss</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span> <span style="color: #004080;">string</span> <span style="color: #000000;">si</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">ss</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span> <span style="color: #004080;">bool</span> <span style="color: #000000;">found</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">false</span> <span style="color: #008080;">for</span> <span style="color: #000000;">j</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">ss</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span> <span style="color: #008080;">if</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">j</span> <span style="color: #008080;">and</span> <span style="color: #7060A8;">match</span><span style="color: #0000FF;">(</span><span style="color: #000000;">si</span><span style="color: #0000FF;">,</span><span style="color: #000000;">ss</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then</span> <span style="color: #000000;">found</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">true</span> <span style="color: #008080;">exit</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> <span style="color: #008080;">end</span> <span style="color: #008080;">for</span> <span style="color: #008080;">if</span> <span style="color: #008080;">not</span> <span style="color: #000000;">found</span> <span style="color: #008080;">then</span> <span style="color: #000000;">filtered</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">filtered</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">si</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> <span style="color: #008080;">end</span> <span style="color: #008080;">for</span> <span style="color: #008080;">return</span> <span style="color: #000000;">filtered</span> <span style="color: #008080;">end</span> <span style="color: #008080;">function</span> <span style="color: #008080;">procedure</span> <span style="color: #000000;">shortest_common_superstring</span><span style="color: #0000FF;">(</span><span style="color: #004080;">sequence</span> <span style="color: #000000;">ss</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">-- Returns shortest common superstring of a set of strings</span> <span style="color: #000000;">ss</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">deduplicate</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">unique</span><span style="color: #0000FF;">(</span><span style="color: #000000;">ss</span><span style="color: #0000FF;">))</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">shortestsuper</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{</span><span style="color: #7060A8;">join</span><span style="color: #0000FF;">(</span><span style="color: #000000;">ss</span><span style="color: #0000FF;">,</span><span style="color: #008000;">""</span><span style="color: #0000FF;">)}</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">shortest</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">shortestsuper</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">for</span> <span style="color: #000000;">p</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">factorial</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">ss</span><span style="color: #0000FF;">))</span> <span style="color: #008080;">do</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">perm</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">permute</span><span style="color: #0000FF;">(</span><span style="color: #000000;">p</span><span style="color: #0000FF;">,</span><span style="color: #000000;">ss</span><span style="color: #0000FF;">)</span> <span style="color: #004080;">string</span> <span style="color: #000000;">sup</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">perm</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span> <span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">2</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">perm</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span> <span style="color: #004080;">string</span> <span style="color: #000000;">pi</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">perm</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span> <span style="color: #008080;">for</span> <span style="color: #000000;">j</span><span style="color: #0000FF;">=-</span><span style="color: #7060A8;">min</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">pi</span><span style="color: #0000FF;">),</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">sup</span><span style="color: #0000FF;">))</span> <span style="color: #008080;">to</span> <span style="color: #000000;">0</span> <span style="color: #008080;">do</span> <span style="color: #004080;">string</span> <span style="color: #000000;">overlap</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">sup</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">..$]</span> <span style="color: #008080;">if</span> <span style="color: #000000;">overlap</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">pi</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">..</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">overlap</span><span style="color: #0000FF;">)]</span> <span style="color: #008080;">then</span> <span style="color: #000000;">sup</span> <span style="color: #0000FF;">&=</span> <span style="color: #000000;">pi</span><span style="color: #0000FF;">[</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">overlap</span><span style="color: #0000FF;">)+</span><span style="color: #000000;">1</span><span style="color: #0000FF;">..$]</span> <span style="color: #000000;">pi</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span> <span style="color: #008080;">exit</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> <span style="color: #008080;">end</span> <span style="color: #008080;">for</span> <span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">pi</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> <span style="color: #000080;font-style:italic;">-- (sanity chk)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">for</span> <span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">sup</span><span style="color: #0000FF;">)</span> <span style="color: #0000FF;"><</span> <span style="color: #000000;">shortest</span> <span style="color: #008080;">then</span> <span style="color: #000000;">shortest</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">sup</span><span style="color: #0000FF;">)</span> <span style="color: #000000;">shortestsuper</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">sup</span><span style="color: #0000FF;">}</span> <span style="color: #008080;">elsif</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">sup</span><span style="color: #0000FF;">)</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">shortest</span> <span style="color: #008080;">and</span> <span style="color: #008080;">not</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">sup</span><span style="color: #0000FF;">,</span><span style="color: #000000;">shortestsuper</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span> <span style="color: #000000;">shortestsuper</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">shortestsuper</span><span style="color: #0000FF;">,</span><span style="color: #000000;">sup</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> <span style="color: #008080;">end</span> <span style="color: #008080;">for</span> <span style="color: #000000;">printcounts</span><span style="color: #0000FF;">(</span><span style="color: #000000;">shortestsuper</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span> <span style="color: #008080;">constant</span> <span style="color: #000000;">tests</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">"TA"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"AAG"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"TA"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"GAA"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"TA"</span><span style="color: #0000FF;">},</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">"CATTAGGG"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"ATTAG"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"GGG"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"TA"</span><span style="color: #0000FF;">},</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">"AAGAUGGA"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"GGAGCGCAUC"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"AUCGCAAUAAGGA"</span><span style="color: #0000FF;">},</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">"ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA"</span><span style="color: #0000FF;">}</span> <span style="color: #0000FF;">}</span> <span style="color: #7060A8;">papply</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tests</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">shortest_common_superstring</span><span style="color: #0000FF;">)</span> <!--</syntaxhighlight>--> {{out}} (Shows three length-6 results for the first test) Line 291 ⟶ 1,650: Base counts: Other:0, A:74, C:57, G:75, T:94, total:300 </pre> =={{header\|Python}}== {{trans\|Go}} <syntaxhighlight lang="python">import os from collections import Counter from functools import reduce from itertools import permutations BASES = ("A", "C", "G", "T") def deduplicate(sequences): """Return the set of sequences with those that are a substring of others removed too.""" sequences = set(sequences) duplicates = set() for s, t in permutations(sequences, 2): if s != t and s in t: duplicates.add(s) return sequences - duplicates def smash(s, t): """Return `s` concatenated with `t`. The longest suffix of `s` that matches a prefix of `t` will be removed.""" for i in range(len(s)): if t.startswith(s[i:]): return s[:i] + t return s + t def shortest_superstring(sequences): """Return the shortest superstring covering all sequences. If there are multiple shortest superstrings, an arbitrary superstring is returned.""" sequences = deduplicate(sequences) shortest = "".join(sequences) for perm in permutations(sequences): superstring = reduce(smash, perm) if len(superstring) < len(shortest): shortest = superstring return shortest def shortest_superstrings(sequences): """Return a list of all shortest superstrings that cover `sequences`.""" sequences = deduplicate(sequences) shortest = set(["".join(sequences)]) shortest_length = sum(len(s) for s in sequences) for perm in permutations(sequences): superstring = reduce(smash, perm) superstring_length = len(superstring) if superstring_length < shortest_length: shortest.clear() shortest.add(superstring) shortest_length = superstring_length elif superstring_length == shortest_length: shortest.add(superstring) return shortest def print_report(sequence): """Writes a report to stdout for the given DNA sequence.""" buf = [f"Nucleotide counts for {sequence}:\n"] counts = Counter(sequence) for base in BASES: buf.append(f"{base:>10}{counts.get(base, 0):>12}") other = sum(v for k, v in counts.items() if k not in BASES) buf.append(f"{'Other':>10}{other:>12}") buf.append(" " * 5 + "_" * 17) buf.append(f"{'Total length':>17}{sum(counts.values()):>5}") print(os.linesep.join(buf), "\n") if __name__ == "__main__": test_cases = [ ("TA", "AAG", "TA", "GAA", "TA"), ("CATTAGGG", "ATTAG", "GGG", "TA"), ("AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA"), ( "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA", ), ] for case in test_cases: for superstring in shortest_superstrings(case): print_report(superstring) # or .. # # for case in test_cases: # print_report(shortest_superstring(case)) # # .. if you don't want all possible shortest superstrings. </syntaxhighlight> {{out}} <pre> Nucleotide counts for GAAGTA: A 3 C 0 G 2 T 1 Other 0 _________________ Total length 6 Nucleotide counts for TAAGAA: A 4 C 0 G 1 T 1 Other 0 _________________ Total length 6 Nucleotide counts for TAGAAG: A 3 C 0 G 2 T 1 Other 0 _________________ Total length 6 Nucleotide counts for CATTAGGG: A 2 C 1 G 3 T 2 Other 0 _________________ Total length 8 Nucleotide counts for AAGAUGGAGCGCAUCGCAAUAAGGA: A 10 C 4 G 8 T 0 Other 3 _________________ Total length 25 Nucleotide counts for CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA: A 74 C 57 G 75 T 94 Other 0 _________________ Total length 300 </pre> =={{header\|Raku}}== {{trans\|Go}} {{trans\|Julia}} <syntaxhighlight lang="raku" line># 20210209 Raku programming solution sub printCounts(\seq) { my $bases = seq.comb.Bag ; say "\nNucleotide counts for ", seq, " :"; say $bases.kv, " and total length = ", $bases.total } sub stringCentipede(\s1, \s2) { loop ( my $offset = 0, my \S1 = $ = '' ; ; $offset++ ) { S1 = s1.substr: $offset ; with S1.index(s2.substr(0,1)) -> $p { $offset += $p } else { return False } return s1.chars - $offset if s2.starts-with: s1.substr: $offset } } sub deduplicate { my @sorted = @_.unique.sort: .chars; # by length gather while ( my $target = shift @sorted ) { take $target unless @sorted.grep: { .contains: $target } } } sub shortestCommonSuperstring { my \ß = $ = [~] my @ss = deduplicate @_ ; # ShortestSuper for @ss.permutations -> @perm { my \sup = $ = @perm[0]; for @perm.rotor(2 => -1) { sup ~= @_[1].substr: stringCentipede \|@_ } ß = sup if sup.chars < ß.chars ; } ß } .&shortestCommonSuperstring.&printCounts for ( <TA AAG TA GAA TA>, <CATTAGGG ATTAG GGG TA>, <AAGAUGGA GGAGCGCAUC AUCGCAAUAAGGA> , <ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA >, ) </syntaxhighlight> {{out}} <pre> Nucleotide counts for TAAGAA : (T 1 A 4 G 1) and total length = 6 Nucleotide counts for CATTAGGG : (G 3 A 2 T 2 C 1) and total length = 8 Nucleotide counts for AAGAUGGAGCGCAUCGCAAUAAGGA : (A 10 U 3 C 4 G 8) and total length = 25 Nucleotide counts for CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA : (C 57 G 75 A 74 T 94) and total length = 300 </pre> =={{header\|Wren}}== {{trans\|Julia}} Line 298 ⟶ 1,915: {{libheader\|Wren-str}} {{libheader\|Wren-math}} <~~lang~~syntaxhighlight ~~ecmascript~~lang="wren">import "./fmt" for Fmt import "./seq" for Lst import "./str" for Str import "./math" for Int / Gets all permutations of a list of strings. */ Line 416 ⟶ 2,033: var scs = shortestCommonSuperstring.call(test) printCounts.call(scs) }</~~lang~~syntaxhighlight> {{out}}