Bioinformatics/base count

From Rosetta Code
Revision as of 12:43, 26 November 2019 by Thundergnat (talk | contribs) (→‎{{header|Perl 6}}: Add a Perl 6 example)
Bioinformatics/base count is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

Given this string representing ordered DNA bases:

CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG
CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG
AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT
GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT
CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG
TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT
CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG
TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC
GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT
  1. "Pretty print" the sequence followed by a summary of the counts of each of the bases, (A, C, G, and T) in the sequence as well as the total count of bases in the string.



Factor

<lang factor>USING: assocs formatting grouping io kernel literals math math.statistics prettyprint qw sequences sorting ;

CONSTANT: dna $[

   qw{
       CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG
       CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG
       AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT
       GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT
       CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG
       TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
       TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT
       CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG
       TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC
       GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT
   } concat

]

.dna ( seq n -- )
   "SEQUENCE:" print [ group ] [ ] bi
   [ * swap "  %3d: %s\n" printf ] curry each-index ;
show-counts ( seq -- )
   "BASE COUNTS:" print histogram >alist [ first ] sort-with
   [ [ "    %c: %3d\n" printf ] assoc-each ]
   [ "TOTAL: " write [ second ] [ + ] map-reduce . ] bi ;

dna [ 50 .dna nl ] [ show-counts ] bi</lang>

Output:
SEQUENCE:
    0: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG
   50: CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG
  100: AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT
  150: GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT
  200: CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG
  250: TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
  300: TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT
  350: CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG
  400: TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC
  450: GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT

BASE COUNTS:
    A: 129
    C:  97
    G: 119
    T: 155
TOTAL: 500

Go

<lang go>package main

import (

   "fmt"
   "sort"

)

func main() {

   dna := "" +
       "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG" +
       "CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG" +
       "AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT" +
       "GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT" +
       "CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG" +
       "TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA" +
       "TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT" +
       "CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG" +
       "TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC" +
       "GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT"
   fmt.Println("SEQUENCE:")
   le := len(dna)
   for i := 0; i < le; i += 50 {
       k := i + 50
       if k > le {
           k = le
       }
       fmt.Printf("%5d: %s\n", i, dna[i:k])
   }
   baseMap := make(map[byte]int) // allows for 'any' base
   for i := 0; i < le; i++ {
       baseMap[dna[i]]++
   }
   var bases []byte
   for k := range baseMap {
       bases = append(bases, k)
   }
   sort.Slice(bases, func(i, j int) bool { // get bases into alphabetic order
       return bases[i] < bases[j]
   })
   fmt.Println("\nBASE COUNT:")
   for _, base := range bases {
       fmt.Printf("    %c: %3d\n", base, baseMap[base])
   }
   fmt.Println("    ------")
   fmt.Println("    Σ:", le)
   fmt.Println("    ======")

}</lang>

Output:
SEQUENCE:
    0: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG
   50: CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG
  100: AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT
  150: GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT
  200: CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG
  250: TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
  300: TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT
  350: CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG
  400: TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC
  450: GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT

BASE COUNT:
    A: 129
    C:  97
    G: 119
    T: 155
    ------
    Σ: 500
    ======

Pascal

<lang pascal>program DNA_Base_Count; {$IFDEF FPC}

 {$MODE DELPHI}//String = AnsiString

{$ELSE}

 {$APPTYPE CONSOLE}

{$ENDIF} const

   dna =
       'CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG' +
       'CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG' +
       'AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT' +
       'GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT' +
       'CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG' +
       'TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA' +
       'TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT' +
       'CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG' +
       'TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC' +
       'GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT';

var

 CntIdx : array of NativeUint;
 DNABases : String;
 SumBaseTotal : NativeInt;

procedure OutFormatBase(var DNA: String;colWidth:NativeInt); var

 j: NativeInt;

Begin

 j := 0;
 Writeln(' DNA base sequence');
 While j<Length(DNA) do
 Begin
   writeln(j:5,copy(DNA,j+1,colWidth):colWidth+2);
   inc(j,colWidth);
 end;
 writeln;

end;

procedure Cnt(const DNA: String); var

 i,p :NativeInt;

Begin

 SetLength(CntIdx,Length(DNABases));
 i := 1;
 while i <= Length(DNA) do
 Begin
   p := Pos(DNA[i],DNABases);
   //found new base so extend list
   if p = 0 then
   Begin
     DNABases := DNABases+DNA[i];
     p := length(DNABases);
     Setlength(CntIdx,p+1);
   end;
   inc(CntIdx[p]);
   inc(i);
 end;
 Writeln('Base     Count');
 SumBaseTotal := 0;
 For i := 1 to Length(DNABases) do
 Begin
   p := CntIdx[i];
   inc(SumBaseTotal,p);
   writeln(DNABases[i]:4,p:10);
 end;
 Writeln('Total base count ',SumBaseTotal);
 writeln;

end;

var

 TestDNA: String;

Begin

 DNABases :='ACGT';// predefined
 TestDNA := DNA;
 OutFormatBase(TestDNA,50);
 Cnt(TestDNA);

end.</lang>

Output:
 DNA base sequence
    0  CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG
   50  CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG
  100  AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT
  150  GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT
  200  CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG
  250  TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
  300  TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT
  350  CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG
  400  TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC
  450  GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT

Base     Count
   A       129
   C        97
   G       119
   T       155
Total base count 500

Perl 6

Works with: Rakudo version 2019.07.1

It's the Letter frequency task all over again, just simpler and dressed up in different clothes.

The specs for what "pretty print" means are sadly lacking. Ah well, just makes it easily defensible if I do anything at all.

<lang perl6>my $dna = join , lines q:to/END/;

   CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG
   CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG
   AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT
   GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT
   CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG
   TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
   TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT
   CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG
   TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC
   GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT
   END


put pretty($dna, 80); put "\nTotal bases: ", +my $bases = $dna.comb.Bag; put $bases.sort(~*.key).join: "\n";

sub pretty ($string, $wrap = 50) {

   $string.comb($wrap).map( { sprintf "%8d: %s", $++ * $wrap, $_ } ).join: "\n"

}</lang>

Output:
       0: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCG
      80: AGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTC
     160: GTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGT
     240: TCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATATTTAATTTTTCTATATAGCG
     320: ATCTGTATTTAAGCAATTCATTTAGGTTATCGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG
     400: TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGACGACCGGGGACTTGCATGATGGGAGCAGCTT
     480: TGTTAAACTACGAACGTAAT

Total bases: 500
A	129
C	97
G	119
T	155

Python

<lang python>from collections import Counter

def basecount(dna):

   return sorted(Counter(dna).items())

def seq_split(dna, n=50):

   return [dna[i: i+n] for i in range(0, len(dna), n)]

def seq_pp(dna, n=50):

   for i, part in enumerate(seq_split(dna, n)):
       print(f"{i*n:>5}: {part}")
   print("\n  BASECOUNT:")
   tot = 0
   for base, count in basecount(dna):
       print(f"    {base:>3}: {count}")
       tot += count
   base, count = 'TOT', tot
   print(f"    {base:>3}= {count}")
   

if __name__ == '__main__':

   print("SEQUENCE:")
   sequence = \

CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG\ CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG\ AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT\ GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT\ CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG\ TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA\ TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT\ CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG\ TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC\ GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT

   seq_pp(sequence)

</lang>

Output:
SEQUENCE:
    0: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG
   50: CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG
  100: AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT
  150: GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT
  200: CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG
  250: TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
  300: TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT
  350: CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG
  400: TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC
  450: GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT

  BASECOUNT:
      A: 129
      C: 97
      G: 119
      T: 155
    TOT= 500

REXX

A little extra boilerplate was added to verify correct coding of the bases in a DNA string and the alignment of the (totals) numbers. <lang rexx>/*REXX program finds the number of each base in a DNA string (along with a total). */ parse arg dna . if dna== | dna=="," then dna= CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG ,

                                CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG ,
                                AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT ,
                                GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT ,
                                CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG ,
                                TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA ,
                                TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT ,
                                CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG ,
                                TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC ,
                                GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT

dna= space(dna, 0); upper dna /*elide blanks from DNA; uppercase it. */ say '────────length of the DNA string: ' length(dna) @.=0 /*initialize the count for all bases. */ w= 1 /*the maximum width of a base count. */ $= /*a placeholder for the names of bases.*/

      do j=1  for length(dna)                   /*traipse through the  DNA  string.    */
      _= substr(dna, j, 1)                      /*obtain a base name from the DNA str. */
      if pos(_, $)==0  then $=$ || _            /*if not found before, add it to list. */
      @._= @._ + 1                              /*bump the count of this base.         */
      w= max(w, length(@._) )                   /*compute the maximum width number.    */
      end   /*j*/

say

      do k=0  for 255;   z= d2c(k)              /*traipse through all possibilities.   */
      if pos(z, $)==0  then iterate             /*Was this base found?  No, then skip. */
      say '     base '   z    " has a basecount of: "   right(@.z, w)
      @.tot= @.tot + @.z                        /*add to a grand total to verify count.*/
      end   /*k*/

say /*stick a fork in it, we're all done. */ say '────────total for all basecounts:' right(@.tot, w+1)</lang>

output   when using the default input:
────────length of the DNA string:  500

     base  A  has a basecount of:  129
     base  C  has a basecount of:   97
     base  G  has a basecount of:  119
     base  T  has a basecount of:  155

────────total for all basecounts:  500