Bioinformatics/base count
Given this string representing ordered DNA bases:
CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT
- "Pretty print" the sequence followed by a summary of the counts of each of the bases, (A, C, G, and T) in the sequence as well as the total count of bases in the string.
Factor
<lang factor>USING: assocs formatting grouping io kernel literals math math.statistics prettyprint qw sequences sorting ;
CONSTANT: dna $[
qw{ CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT } concat
]
- .dna ( seq n -- )
"SEQUENCE:" print [ group ] [ ] bi [ * swap " %3d: %s\n" printf ] curry each-index ;
- show-counts ( seq -- )
"BASE COUNTS:" print histogram >alist [ first ] sort-with [ [ " %c: %3d\n" printf ] assoc-each ] [ "TOTAL: " write [ second ] [ + ] map-reduce . ] bi ;
dna [ 50 .dna nl ] [ show-counts ] bi</lang>
- Output:
SEQUENCE: 0: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG 50: CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG 100: AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT 150: GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT 200: CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG 250: TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA 300: TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT 350: CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG 400: TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC 450: GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT BASE COUNTS: A: 129 C: 97 G: 119 T: 155 TOTAL: 500
Go
<lang go>package main
import (
"fmt" "sort"
)
func main() {
dna := "" + "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG" + "CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG" + "AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT" + "GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT" + "CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG" + "TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA" + "TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT" + "CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG" + "TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC" + "GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT"
fmt.Println("SEQUENCE:") le := len(dna) for i := 0; i < le; i += 50 { k := i + 50 if k > le { k = le } fmt.Printf("%5d: %s\n", i, dna[i:k]) } baseMap := make(map[byte]int) // allows for 'any' base for i := 0; i < le; i++ { baseMap[dna[i]]++ } var bases []byte for k := range baseMap { bases = append(bases, k) } sort.Slice(bases, func(i, j int) bool { // get bases into alphabetic order return bases[i] < bases[j] })
fmt.Println("\nBASE COUNT:") for _, base := range bases { fmt.Printf(" %c: %3d\n", base, baseMap[base]) } fmt.Println(" ------") fmt.Println(" Σ:", le) fmt.Println(" ======")
}</lang>
- Output:
SEQUENCE: 0: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG 50: CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG 100: AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT 150: GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT 200: CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG 250: TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA 300: TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT 350: CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG 400: TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC 450: GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT BASE COUNT: A: 129 C: 97 G: 119 T: 155 ------ Σ: 500 ======
Pascal
<lang pascal>program DNA_Base_Count; {$IFDEF FPC}
{$MODE DELPHI}//String = AnsiString
{$ELSE}
{$APPTYPE CONSOLE}
{$ENDIF} const
dna = 'CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG' + 'CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG' + 'AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT' + 'GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT' + 'CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG' + 'TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA' + 'TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT' + 'CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG' + 'TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC' + 'GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT';
var
CntIdx : array of NativeUint; DNABases : String; SumBaseTotal : NativeInt;
procedure OutFormatBase(var DNA: String;colWidth:NativeInt); var
j: NativeInt;
Begin
j := 0; Writeln(' DNA base sequence'); While j<Length(DNA) do Begin writeln(j:5,copy(DNA,j+1,colWidth):colWidth+2); inc(j,colWidth); end; writeln;
end;
procedure Cnt(const DNA: String); var
i,p :NativeInt;
Begin
SetLength(CntIdx,Length(DNABases)); i := 1; while i <= Length(DNA) do Begin p := Pos(DNA[i],DNABases); //found new base so extend list if p = 0 then Begin DNABases := DNABases+DNA[i]; p := length(DNABases); Setlength(CntIdx,p+1); end; inc(CntIdx[p]); inc(i); end;
Writeln('Base Count'); SumBaseTotal := 0; For i := 1 to Length(DNABases) do Begin p := CntIdx[i]; inc(SumBaseTotal,p); writeln(DNABases[i]:4,p:10); end; Writeln('Total base count ',SumBaseTotal); writeln;
end;
var
TestDNA: String;
Begin
DNABases :='ACGT';// predefined TestDNA := DNA; OutFormatBase(TestDNA,50); Cnt(TestDNA);
end.</lang>
- Output:
DNA base sequence 0 CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG 50 CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG 100 AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT 150 GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT 200 CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG 250 TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA 300 TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT 350 CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG 400 TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC 450 GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT Base Count A 129 C 97 G 119 T 155 Total base count 500
Python
<lang python>from collections import Counter
def basecount(dna):
return sorted(Counter(dna).items())
def seq_split(dna, n=50):
return [dna[i: i+n] for i in range(0, len(dna), n)]
def seq_pp(dna, n=50):
for i, part in enumerate(seq_split(dna, n)): print(f"{i*n:>5}: {part}") print("\n BASECOUNT:") tot = 0 for base, count in basecount(dna): print(f" {base:>3}: {count}") tot += count base, count = 'TOT', tot print(f" {base:>3}= {count}")
if __name__ == '__main__':
print("SEQUENCE:") sequence = \
CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG\ CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG\ AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT\ GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT\ CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG\ TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA\ TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT\ CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG\ TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC\ GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT
seq_pp(sequence)
</lang>
- Output:
SEQUENCE: 0: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG 50: CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG 100: AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT 150: GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT 200: CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG 250: TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA 300: TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT 350: CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG 400: TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC 450: GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT BASECOUNT: A: 129 C: 97 G: 119 T: 155 TOT= 500
REXX
A little extra boilerplate was added to verify correct coding of the bases in a DNA string and the alignment of the (totals) numbers. <lang rexx>/*REXX program finds the number of each base in a DNA string (along with a total). */ parse arg dna . if dna== | dna=="," then dna= CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG ,
CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG , AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT , GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT , CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG , TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA , TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT , CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG , TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC , GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT
dna= space(dna, 0); upper dna /*elide blanks from DNA; uppercase it. */ say '────────length of the DNA string: ' length(dna) @.=0 /*initialize the count for all bases. */ w= 1 /*the maximum width of a base count. */ $= /*a placeholder for the names of bases.*/
do j=1 for length(dna) /*traipse through the DNA string. */ _= substr(dna, j, 1) /*obtain a base name from the DNA str. */ if pos(_, $)==0 then $=$ || _ /*if not found before, add it to list. */ @._= @._ + 1 /*bump the count of this base. */ w= max(w, length(@._) ) /*compute the maximum width number. */ end /*j*/
say
do k=0 for 255; z= d2c(k) /*traipse through all possibilities. */ if pos(z, $)==0 then iterate /*Was this base found? No, then skip. */ say ' base ' z " has a basecount of: " right(@.z, w) @.tot= @.tot + @.z /*add to a grand total to verify count.*/ end /*k*/
say /*stick a fork in it, we're all done. */ say '────────total for all basecounts:' right(@.tot, w+1)</lang>
- output when using the default input:
────────length of the DNA string: 500 base A has a basecount of: 129 base C has a basecount of: 97 base G has a basecount of: 119 base T has a basecount of: 155 ────────total for all basecounts: 500