Bioinformatics/Global alignment: Difference between revisions

m
(→‎{{header|Perl}}: added Pascal by prepending to perl)
m (→‎{{header|Wren}}: Minor tidy)
 
(10 intermediate revisions by 3 users not shown)
Line 1:
{{task}}
{{Draft task}}[[Category:Bioinfomatics]][[Category:Strings]]
 
[[Category:Bioinfomatics]]
[[Category:Strings]]
Global alignment is designed to search for highly similar regions in two or more DNA sequences, where the
sequences appear in the same order and orientation, fitting the sequences in as pieces in a puzzle.
Line 53 ⟶ 55:
:* [[Bioinformatics/Sequence_mutation|Bioinformatics sequence mutation]].
<br><br>
 
 
=={{header|11l}}==
{{trans|Nim}}
 
<langsyntaxhighlight lang="11l">-V ACGT = [‘A’, ‘C’, ‘G’, ‘T’]
 
F permutations(slist)
Line 137:
L(test) TestSequences
V scs = shortestCommonSuperstring(test)
printCounts(scs)</langsyntaxhighlight>
 
{{out}}
Line 181:
--------------------
Total length 300
</pre>
 
=={{header|C++}}==
<syntaxhighlight lang="c++">
 
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <unordered_map>
#include <unordered_set>
#include <string>
#include <vector>
 
// Print a report of the given string to the standard output device.
void print_report(const std::string& text) {
std::unordered_map<char, int32_t> bases;
for ( const char& ch : text ) {
bases[ch]++;
}
 
const int32_t total = std::accumulate(bases.begin(), bases.end(), 0,
[&](int32_t previous_sum, std::pair<char, int32_t> entry) {
return previous_sum + entry.second;
});
 
std::cout << "Nucleotide counts for: " << ( ( text.length() > 50 ) ? "\n" : "" );
std::cout << text << std::endl;
std::cout << "Bases: A " << bases['A'] << ", C: " << bases['C'] << ", G: " << bases['G'] << ", T: " << bases['T']
<< ", total: " << total << "\n" << std::endl;
}
 
// Return all permutations of the given list of strings.
std::vector<std::vector<std::string>> permutations(std::vector<std::string>& list) {
int32_t indexes[list.size()] = {};
std::vector<std::vector<std::string>> result;
result.push_back(list);
int32_t i = 0;
while ( (uint64_t) i < list.size() ) {
if ( indexes[i] < i ) {
const int j = ( i % 2 == 0 ) ? 0 : indexes[i];
std::swap(list[i], list[j]);
result.push_back(list);
indexes[i]++;
i = 0;
} else {
indexes[i] = 0;
i++;
}
}
return result;
}
 
// Return 'before' concatenated with 'after', removing the longest suffix of 'before' that matches a prefix of 'after'.
std::string concatenate(const std::string& before, const std::string& after) {
for ( uint64_t i = 0; i < before.length(); ++i ) {
if ( after.starts_with(before.substr(i, before.length())) ) {
return before.substr(0, i) + after;
}
}
return before + after;
}
 
// Remove duplicate strings and strings which are substrings of other strings in the given list.
std::vector<std::string> deduplicate(const std::vector<std::string>& list) {
std::vector<std::string> singletons(list);
std::sort(singletons.begin(), singletons.end());
singletons.erase(std::unique(singletons.begin(), singletons.end()), singletons.end());
 
std::vector<std::string> result(singletons);
std::unordered_set<std::string> marked_for_removal;
for ( const std::string& test_word : result ) {
for ( const std::string& word : singletons ) {
if ( word != test_word && word.find(test_word) != std::string::npos ) {
marked_for_removal.emplace(test_word);
}
}
}
 
result.erase(std::remove_if(result.begin(), result.end(),
[&](std::string& word) {
return marked_for_removal.count(word) != 0;
}
), result.end());
 
return result;
}
 
// Return a set containing all of the shortest common superstrings of the given list of strings.
std::unordered_set<std::string> shortest_common_superstrings(const std::vector<std::string>& list) {
std::vector<std::string> deduplicated = deduplicate(list);
 
std::unordered_set<std::string> shortest;
shortest.emplace(std::reduce(list.begin(), list.end(), std::string("")));
 
uint64_t shortest_length;
for ( const std::string& word : list ) {
shortest_length += word.length();
}
 
for ( std::vector<std::string> permutation : permutations(deduplicated) ) {
std::string candidate;
for ( const std::string& word : permutation ) {
candidate = concatenate(candidate, word);
}
 
if ( candidate.length() < shortest_length ) {
shortest.clear();
shortest.emplace(candidate);
shortest_length = candidate.length();
} else if ( candidate.length() == shortest_length ) {
shortest.emplace(candidate);
}
}
return shortest;
}
 
int main() {
const std::vector<std::vector<std::string>> test_sequences = {
{ "TA", "AAG", "TA", "GAA", "TA" },
{ "CATTAGGG", "ATTAG", "GGG", "TA" },
{ "AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA" },
{ "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT",
"GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT",
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT",
"GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC",
"CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT",
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC",
"GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT",
"TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
"TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA" } };
 
for ( const std::vector<std::string>& test : test_sequences ) {
for ( const std::string& superstring : shortest_common_superstrings(test) ) {
print_report(superstring);
}
}
}
</syntaxhighlight>
<pre>
Nucleotide counts for: TAGAAG
Bases: A 3, C: 0, G: 2, T: 1, total: 6
 
Nucleotide counts for: TAAGAA
Bases: A 4, C: 0, G: 1, T: 1, total: 6
 
Nucleotide counts for: GAAGTA
Bases: A 3, C: 0, G: 2, T: 1, total: 6
 
Nucleotide counts for: CATTAGGG
Bases: A 2, C: 1, G: 3, T: 2, total: 8
 
Nucleotide counts for: AAGAUGGAGCGCAUCGCAAUAAGGA
Bases: A 10, C: 4, G: 8, T: 0, total: 25
 
Nucleotide counts for:
CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
Bases: A 74, C: 57, G: 75, T: 94, total: 300
</pre>
 
=={{header|Go}}==
{{trans|Julia}}
<langsyntaxhighlight lang="go">package main
 
import (
Line 355 ⟶ 517:
printCounts(scs)
}
}</langsyntaxhighlight>
 
{{out}}
Line 398 ⟶ 560:
____________________
Total length 300
</pre>
 
=={{header|Java}}==
<syntaxhighlight lang="java">
 
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
 
public final class BioinformaticsGlobalAlignment {
 
public static void main(String[] aArgs) {
List<List<String>> testSequences = Arrays.asList(
Arrays.asList( "TA", "AAG", "TA", "GAA", "TA" ),
Arrays.asList( "CATTAGGG", "ATTAG", "GGG", "TA" ),
Arrays.asList( "AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA" ),
Arrays.asList( "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT",
"GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT",
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT",
"GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC",
"CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT",
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC",
"GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT",
"TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
"TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA" )
);
for ( List<String> test : testSequences ) {
for ( String superstring : shortestCommonSuperstrings(test) ) {
printReport(superstring);
}
}
}
// Return a set containing all of the shortest common superstrings of the given list of strings.
private static Set<String> shortestCommonSuperstrings(List<String> aList) {
List<String> deduplicated = deduplicate(aList);
Set<String> shortest = new HashSet<String>();
shortest.add(String.join("", deduplicated));
int shortestLength = aList.stream().mapToInt( s -> s.length() ).sum();
for ( List<String> permutation : permutations(deduplicated) ) {
String candidate = permutation.stream().reduce("", (a, b) -> concatenate(a, b) );
if ( candidate.length() < shortestLength ) {
shortest.clear();
shortest.add(candidate);
shortestLength = candidate.length();
} else if ( candidate.length() == shortestLength ) {
shortest.add(candidate);
}
}
return shortest;
}
 
// Remove duplicate strings and strings which are substrings of other strings in the given list.
private static List<String> deduplicate(List<String> aList) {
List<String> unique = aList.stream().distinct().collect(Collectors.toList());
List<String> result = new ArrayList<String>(unique);
List<String> markedForRemoval = new ArrayList<String>();
for ( String testWord : result ) {
for ( String word : unique ) {
if ( ! word.equals(testWord) && word.contains(testWord) ) {
markedForRemoval.add(testWord);
}
}
}
result.removeAll(markedForRemoval);
return result;
}
// Return aBefore concatenated with aAfter, removing the longest suffix of aBefore that matches a prefix of aAfter.
private static String concatenate(String aBefore, String aAfter) {
for ( int i = 0; i < aBefore.length(); i++ ) {
if ( aAfter.startsWith(aBefore.substring(i, aBefore.length())) ) {
return aBefore.substring(0, i) + aAfter;
}
}
return aBefore + aAfter;
}
// Return all permutations of the given list of strings.
private static List<List<String>> permutations(List<String> aList) {
int[] indexes = new int[aList.size()];
List<List<String>> result = new ArrayList<List<String>>();
result.add( new ArrayList<String>(aList) );
int i = 0;
while ( i < aList.size() ) {
if ( indexes[i] < i ) {
final int j = ( i % 2 == 0 ) ? 0 : indexes[i];
String temp = aList.get(j);
aList.set(j, aList.get(i));
aList.set(i, temp);
result.add( new ArrayList<String>(aList) );
indexes[i]++;
i = 0;
} else {
indexes[i] = 0;
i += 1;
}
}
return result;
}
// Print a report of the given string to the standard output device.
private static void printReport(String aText) {
char[] nucleotides = new char[] {'A', 'C', 'G', 'T' };
Map<Character, Integer> bases = new HashMap<Character, Integer>();
for ( char base : nucleotides ) {
bases.put(base, 0);
}
for ( char ch : aText.toCharArray() ) {
bases.merge(ch, 1, Integer::sum);
}
final int total = bases.values().stream().reduce(0, Integer::sum);
System.out.print("Nucleotide counts for: " + ( ( aText.length() > 50 ) ? System.lineSeparator() : "") );
System.out.println(aText);
System.out.print(String.format("%s%d%s%d%s%d%s%d",
"Bases: A: ", bases.get('A'), ", C: ", bases.get('C'), ", G: ", bases.get('G'), ", T: ", bases.get('T')));
System.out.println(", total: " + total + System.lineSeparator());
}
 
}
</syntaxhighlight>
{{ out }}
<pre>
Nucleotide counts for: TAGAAG
Bases: A: 3, C: 0, G: 2, T: 1, total: 6
 
Nucleotide counts for: GAAGTA
Bases: A: 3, C: 0, G: 2, T: 1, total: 6
 
Nucleotide counts for: TAAGAA
Bases: A: 4, C: 0, G: 1, T: 1, total: 6
 
Nucleotide counts for: CATTAGGG
Bases: A: 2, C: 1, G: 3, T: 2, total: 8
 
Nucleotide counts for: AAGAUGGAGCGCAUCGCAAUAAGGA
Bases: A: 10, C: 4, G: 8, T: 0, total: 25
 
Nucleotide counts for:
CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
Bases: A: 74, C: 57, G: 75, T: 94, total: 300
</pre>
 
Line 403 ⟶ 719:
{{works with|jq}}
'''Works with gojq, the Go implementation of jq'''
<syntaxhighlight lang="jq">
<lang jq>
### Generic helper functions
 
Line 415 ⟶ 731:
range(0;length) as $i
| [.[$i]] + (del(.[$i])|permutations)
end ;</langsyntaxhighlight><syntaxhighlight lang ="jq">
# Give a synoptic view of the input string,
# highlighting the occurrence of ACGTU letters
Line 475 ⟶ 791:
else . end)
| .shortestsuper;
</syntaxhighlight>
</lang>
'''The specific tasks'''
<syntaxhighlight lang="jq">
<lang jq>
def examples:
[
Line 509 ⟶ 825:
| "Task \($i+1):", ($examples[$i]|t), "";
 
tasks</langsyntaxhighlight>
{{out}}
<pre>
Line 556 ⟶ 872:
Σ: 300
</pre>
 
=={{header|Julia}}==
<langsyntaxhighlight lang="julia">using Combinatorics
 
""" Given a DNA sequence, report the sequence, length and base counts"""
Line 639 ⟶ 954:
printcounts(scs)
end
</langsyntaxhighlight>{{out}}
<pre>
Nucleotide counts for TAAGAA:
Line 681 ⟶ 996:
Total length 300
</pre>
 
=={{header|Nim}}==
{{trans|Wren}}
<langsyntaxhighlight Nimlang="nim">import algorithm, sequtils, strformat, strutils, tables
 
const ACGT = ['A', 'C', 'G', 'T'] # Four DNA bases.
Line 765 ⟶ 1,079:
for test in TestSequences:
let scs = test.shortestCommonSuperstring
scs.printCounts()</langsyntaxhighlight>
 
{{out}}
Line 807 ⟶ 1,121:
————————————————————
Total length 300</pre>
 
=={{header|Pascal}}==
Used a matrix of head-tail overlapping and modified n-queens to generate the permutations.<BR>
Here nearly no runtime.But see [[N-queens_problem]] that using permutation is not the way for > 17<BR>
Of course this is more a traveling salesman problem.
<langsyntaxhighlight lang="pascal">
program BaseInDNA;
{$IFDEF FPC}
Line 1,103 ⟶ 1,416:
find;
END.
</syntaxhighlight>
</lang>
{{out}}
<pre>
Line 1,125 ⟶ 1,438:
A : 74 C : 57 G : 75 T : 94 U : 0
</pre>
 
=={{header|Perl}}==
<langsyntaxhighlight lang="perl">#!/usr/bin/perl
 
use strict; # https://rosettacode.org/wiki/Bioinformatics/global_alignment
Line 1,210 ⟶ 1,522:
use Data::Dump 'dd'; dd \%ch;
}
}</langsyntaxhighlight>
{{out}}
<pre>
Line 1,225 ⟶ 1,537:
{ A => 74, C => 57, G => 75, T => 94 }
</pre>
 
=={{header|Phix}}==
<!--<langsyntaxhighlight Phixlang="phix">(phixonline)-->
<span style="color: #008080;">procedure</span> <span style="color: #000000;">printcounts</span><span style="color: #0000FF;">(</span><span style="color: #004080;">sequence</span> <span style="color: #000000;">ss</span><span style="color: #0000FF;">)</span>
<span style="color: #000080;font-style:italic;">-- Given DNA sequence(s), report the sequence, length and base counts</span>
Line 1,312 ⟶ 1,623:
<span style="color: #0000FF;">}</span>
<span style="color: #7060A8;">papply</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tests</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">shortest_common_superstring</span><span style="color: #0000FF;">)</span>
<!--</langsyntaxhighlight>-->
{{out}}
(Shows three length-6 results for the first test)
Line 1,339 ⟶ 1,650:
Base counts: Other:0, A:74, C:57, G:75, T:94, total:300
</pre>
 
=={{header|Python}}==
{{trans|Go}}
 
<langsyntaxhighlight lang="python">import os
 
from collections import Counter
Line 1,459 ⟶ 1,769:
#
# .. if you don't want all possible shortest superstrings.
</syntaxhighlight>
</lang>
 
{{out}}
Line 1,523 ⟶ 1,833:
Total length 300
</pre>
 
=={{header|Raku}}==
{{trans|Go}}
{{trans|Julia}}
<syntaxhighlight lang="raku" perl6line># 20210209 Raku programming solution
 
sub printCounts(\seq) {
Line 1,583 ⟶ 1,892:
>,
)
</syntaxhighlight>
</lang>
 
{{out}}
Line 1,600 ⟶ 1,909:
(C 57 G 75 A 74 T 94) and total length = 300
</pre>
 
=={{header|Wren}}==
{{trans|Julia}}
Line 1,607 ⟶ 1,915:
{{libheader|Wren-str}}
{{libheader|Wren-math}}
<langsyntaxhighlight ecmascriptlang="wren">import "./fmt" for Fmt
import "./seq" for Lst
import "./str" for Str
import "./math" for Int
 
/* Gets all permutations of a list of strings. */
Line 1,725 ⟶ 2,033:
var scs = shortestCommonSuperstring.call(test)
printCounts.call(scs)
}</langsyntaxhighlight>
 
{{out}}
9,476

edits