Arithmetic coding/As a generalized change of radix: Difference between revisions

From Rosetta Code
Content added Content deleted
m (→‎{{header|Sidef}}: fix precedence issue)
m (Minor simplifications)
Line 258: Line 258:
# frequencies of all previously occurring symbols
# frequencies of all previously occurring symbols
foreach my $c (@chars) {
foreach my $c (@chars) {
my $x = $cf{$c};
$L->bmuladd($base, $cf{$c}*$pf);
$L->bmuladd($base, $x * $pf);
$pf->bmul($freq{$c});
$pf->bmul($freq{$c});
}
}
Line 278: Line 277:
$enc *= $radix**$pow;
$enc *= $radix**$pow;


# Base
my $base = Math::BigInt->new(0);
my $base = Math::BigInt->new(0);
$base += $_ for values %{$freq};
$base += $_ for values %{$freq};
Line 312: Line 312:
my $cv = $cf{$c};
my $cv = $cf{$c};


my $rem = ($enc - $pow * $cv) / $fv;
my $rem = ($enc - $pow*$cv) / $fv;


$enc = $rem;
$enc = $rem;
Line 377: Line 377:
# frequencies of all previously occurring symbols
# frequencies of all previously occurring symbols
for @chars -> $c {
for @chars -> $c {
my $x = %cf{$c};
$L = $L*$base + %cf{$c}*$pf;
$L = $L * $base + $x * $pf;
$pf *= %freq{$c};
$pf *= %freq{$c};
}
}
Line 401: Line 400:
my $enc = $encoding * $radix**$pow;
my $enc = $encoding * $radix**$pow;


my $base = 0;
# Base
$base += $_ for %freq.values;
my $base = [+] %freq.values;


# Create the cumulative frequency table
# Create the cumulative frequency table
Line 498: Line 497:
# frequencies of all previously occurring symbols
# frequencies of all previously occurring symbols
for b in bytes:
for b in bytes:
x = cf[b]
lower = lower*base + cf[b]*pf
lower = lower * base + x*pf
pf *= freq[b]
pf *= freq[b]


Line 519: Line 517:
enc *= radix**pow;
enc *= radix**pow;


# Base
base = sum(freq.values())
base = sum(freq.values())


Line 607: Line 606:
# frequencies of all previously occurring symbols
# frequencies of all previously occurring symbols
bytes.each do |b|
bytes.each do |b|
x = cf[b]
lower = lower*base + cf[b]*pf
lower = lower * base + x*pf
pf *= freq[b]
pf *= freq[b]
end
end
Line 631: Line 629:
enc *= radix**pow;
enc *= radix**pow;


base = 0
# Base
base = freq.values.reduce(:+)
freq.each_value { |v| base += v }


# Create the cumulative frequency table
# Create the cumulative frequency table
Line 728: Line 726:
# frequencies of all previously occurring symbols
# frequencies of all previously occurring symbols
bytes.each { |b|
bytes.each { |b|
var x = cf{b}
L = (L*base + cf{b}*pf)
L *= base += x*pf
pf *= freq{b}
pf *= freq{b}
}
}
Line 747: Line 744:
enc *= radix**pow;
enc *= radix**pow;


var base = 0
# Base
freq.each_value { |v| base += v }
var base = freq.values.sum


# Create the cumulative frequency table
# Create the cumulative frequency table

Revision as of 12:44, 30 January 2016

Arithmetic coding/As a generalized change of radix is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

Arithmetic coding is a form of entropy encoding used in lossless data compression. Normally, a string of characters such as the words "hello there" is represented using a fixed number of bits per character, as in the ASCII code. When a string is converted to arithmetic encoding, frequently used characters will be stored with fewer bits and not-so-frequently occurring characters will be stored with more bits, resulting in fewer bits used in total. Arithmetic coding differs from other forms of entropy encoding, such as Huffman coding, in that rather than separating the input into component symbols and replacing each with a code, arithmetic coding encodes the entire message into a single number.

Task

Create a program which implements the arithmetic coding as a generalized change of radix.

Show the results, in base 10, for all the following strings:

  • "DABDDB"
  • "DABDDBBDDBA"
  • "ABRACADABRA"
  • "TOBEORNOTTOBEORTOBEORNOT"


Verify the implementation by decoding the results back into strings and checking for equality with the given strings.

Go

<lang go>package main

import (

   "fmt"
   "math/big"

)

func cumulative_freq(freq map[byte]int64) map[byte]int64 {

   total := int64(0)
   cf := make(map[byte]int64)
   for i := 0; i < 256; i++ {
       b := byte(i)
       if v, ok := freq[b]; ok {
           cf[b] = total
           total += v
       }
   }
   return cf

}

func arithmethic_coding(str string, radix int64) (*big.Int,

                               *big.Int, map[byte]int64) {
   // Convert the string into a slice of bytes
   chars := []byte(str)
   // The frequency characters
   freq := make(map[byte]int64)
   for _, c := range chars {
       freq[c] += 1
   }
   // The cumulative frequency
   cf := cumulative_freq(freq)
   // Base
   base := len(chars)
   // Lower bound
   L := big.NewInt(0)
   // Product of all frequencies
   pf := big.NewInt(1)
   // Each term is multiplied by the product of the
   // frequencies of all previously occurring symbols
   bigBase := big.NewInt(int64(base))
   for _, c := range chars {
       x := big.NewInt(cf[c])
       L.Mul(L, bigBase)
       L.Add(L, x.Mul(x, pf))
       pf.Mul(pf, big.NewInt(freq[c]))
   }
   // Upper bound
   U := big.NewInt(0)
   U.Set(L)
   U.Add(U, pf)
   bigOne := big.NewInt(1)
   bigZero := big.NewInt(0)
   bigRadix := big.NewInt(radix)
   tmp := big.NewInt(0).Set(pf)
   powr := big.NewInt(0)
   for {
       tmp.Div(tmp, bigRadix)
       if tmp.Cmp(bigZero) == 0 {
           break
       }
       powr.Add(powr, bigOne)
   }
   diff := big.NewInt(0)
   diff.Sub(U, bigOne)
   diff.Div(diff, big.NewInt(0).Exp(bigRadix, powr, nil))
   return diff, powr, freq

}

func arithmethic_decoding(num *big.Int, radix int64,

         pow *big.Int, freq map[byte]int64) string {
   powr := big.NewInt(radix)
   enc := big.NewInt(0).Set(num)
   enc.Mul(enc, powr.Exp(powr, pow, nil))
   base := int64(0)
   for _, v := range freq {
       base += v
   }
   // Create the cumulative frequency table
   cf := cumulative_freq(freq)
   // Create the dictionary
   dict := make(map[int64]byte)
   for k, v := range cf {
       dict[v] = k
   }
   // Fill the gaps in the dictionary
   lchar := -1
   for i := int64(0); i < base; i++ {
       if v, ok := dict[i]; ok {
           lchar = int(v)
       } else if lchar != -1 {
           dict[i] = byte(lchar)
       }
   }
   // Decode the input number
   decoded := make([]byte, base)
   bigBase := big.NewInt(base)
   for i := base - 1; i >= 0; i-- {
       pow := big.NewInt(0)
       pow.Exp(bigBase, big.NewInt(i), nil)
       div := big.NewInt(0)
       div.Div(enc, pow)
       c := dict[div.Int64()]
       fv := freq[c]
       cv := cf[c]
       prod := big.NewInt(0).Mul(pow, big.NewInt(cv))
       diff := big.NewInt(0).Sub(enc, prod)
       enc.Div(diff, big.NewInt(fv))
       decoded[base-i-1] = c
   }
   // Return the decoded output
   return string(decoded)

}

func main() {

   var radix = int64(10)
   strSlice := []string{
       `DABDDB`,
       `DABDDBBDDBA`,
       `ABRACADABRA`,
       `TOBEORNOTTOBEORTOBEORNOT`,
   }
   for _, str := range strSlice {
       enc, pow, freq := arithmethic_coding(str, radix)
       dec := arithmethic_decoding(enc, radix, pow, freq)
       fmt.Printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow)
       if str != dec {
           panic("\tHowever that is incorrect!")
       }
   }

}</lang>

Output:
DABDDB                   =>                 251 * 10^2
DABDDBBDDBA              =>              167351 * 10^6
ABRACADABRA              =>             7954170 * 10^4
TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15

J

Note that decoding assumes we know the length of the string (and also that strings are assumed to contain only upper case alphanumeric letters) - but that's not a part of this task.

Implementation:

<lang J>aek=:3 :0

 b=. x:#y
 f=. (#/.~{~~.i.]) y
 c=. _65+a.i.y
 L=. b #. c**/\1,}:f
 p=. */f
 e=. x:<.10^.p
 e,~<.(L+p)%10^e

)</lang>

Results are number pairs where the second number is the number of zeros to append to the first.

Example use:

<lang J> aek 'DABDDB' 251 2

  aek 'DABDDBBDDBA'

83677 6

  aek 'ABRACADABRA'

4860830 4

  aek 'TOBEORNOTTOBEORTOBEORNOT'

1224972022395235072 15</lang>

Draft note: these results are different from those of some other implementations currently visible here. I expect that that difference occurs from those other implementations using cumulative frequency instead of frequency at the wrong place in the calculation. (If I am incorrect about this, I'd appreciate a detailed exposition.)

Perl

<lang perl>use Math::BigInt (try => 'GMP');

sub cumulative_freq {

   my ($freq) = @_;
   my %cf;
   my $total = Math::BigInt->new(0);
   foreach my $c (map { chr } 0 .. 255) {
       if (exists $freq->{$c}) {
           $cf{$c} = $total;
           $total += $freq->{$c};
       }
   }
   return %cf;

}

sub arithmethic_coding {

   my ($str, $radix) = @_;
   my @chars = split(//, $str);
   # The frequency characters
   my %freq;
   $freq{$_}++ for @chars;
   # The cumulative frequency table
   my %cf = cumulative_freq(\%freq);
   # Base
   my $base = Math::BigInt->new(scalar @chars);
   # Lower bound
   my $L = Math::BigInt->new(0);
   # Product of all frequencies
   my $pf = Math::BigInt->new(1);
   # Each term is multiplied by the product of the
   # frequencies of all previously occurring symbols
   foreach my $c (@chars) {
       $L->bmuladd($base, $cf{$c}*$pf);
       $pf->bmul($freq{$c});
   }
   # Upper bound
   my $U = $L + $pf;
   my $pow = Math::BigInt->new($pf)->blog($radix);
   my $enc = ($U - 1)->bdiv(Math::BigInt->new($radix)->bpow($pow));
   return ($enc, $pow, \%freq);

}

sub arithmethic_decoding {

   my ($enc, $radix, $pow, $freq) = @_;
   # Multiply enc by radix^pow
   $enc *= $radix**$pow;
   # Base
   my $base = Math::BigInt->new(0);
   $base += $_ for values %{$freq};
   # Create the cumulative frequency table
   my %cf = cumulative_freq($freq);
   # Create the dictionary
   my %dict;
   while (my ($k, $v) = each %cf) {
       $dict{$v} = $k;
   }
   # Fill the gaps in the dictionary
   my $lchar;
   foreach my $i (0 .. $base - 1) {
       if (exists $dict{$i}) {
           $lchar = $dict{$i};
       }
       elsif (defined $lchar) {
           $dict{$i} = $lchar;
       }
   }
   # Decode the input number
   my $decoded = ;
   for (my $i = $base - 1 ; $i >= 0 ; $i--) {
       my $pow = $base**$i;
       my $div = ($enc / $pow);
       my $c  = $dict{$div};
       my $fv = $freq->{$c};
       my $cv = $cf{$c};
       my $rem = ($enc - $pow*$cv) / $fv;
       $enc = $rem;
       $decoded .= $c;
   }
   # Return the decoded output
   return $decoded;

}

my $radix = 10; # can be any integer greater or equal with 2

foreach my $str (qw(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT)) {

   my ($enc, $pow, $freq) = arithmethic_coding($str, $radix);
   my $dec = arithmethic_decoding($enc, $radix, $pow, $freq);
   printf("%-25s=> %19s * %d^%s\n", $str, $enc, $radix, $pow);
   if ($str ne $dec) {
       die "\tHowever that is incorrect!";
   }

}</lang>

Output:
DABDDB                   =>                 251 * 10^2
DABDDBBDDBA              =>              167351 * 10^6
ABRACADABRA              =>             7954170 * 10^4
TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15

Perl 6

<lang perl6>sub cumulative_freq(%freq) {

   my %cf;
   my $total = 0;
   for (0..255).map({.chr}) -> $c {
       if (%freq{$c}:exists) {
           %cf{$c} = $total;
           $total += %freq{$c};
       }
   }
   return %cf;

}

sub arithmethic_coding($str, $radix) {

   my @chars = $str.comb;
   # The frequency characters
   my %freq;
   %freq{$_}++ for @chars;
   # The cumulative frequency table
   my %cf = cumulative_freq(%freq);
   # Base
   my $base = @chars.elems;
   # Lower bound
   my $L = 0;
   # Product of all frequencies
   my $pf = 1;
   # Each term is multiplied by the product of the
   # frequencies of all previously occurring symbols
   for @chars -> $c {
       $L = $L*$base + %cf{$c}*$pf;
       $pf *= %freq{$c};
   }
   # Upper bound
   my $U = $L + $pf;
   my $pow = 0;
   loop {
       $pf div= $radix;
       last if $pf == 0;
       ++$pow;
   }
   my $enc = ($U - 1) div ($radix ** $pow);
   ($enc, $pow, %freq);

}

sub arithmethic_decoding($encoding, $radix, $pow, %freq) {

   # Multiply encoding by radix^pow
   my $enc = $encoding * $radix**$pow;
   # Base
   my $base = [+] %freq.values;
   # Create the cumulative frequency table
   my %cf = cumulative_freq(%freq);
   # Create the dictionary
   my %dict;
   for %cf.kv -> $k,$v {
       %dict{$v} = $k;
   }
   # Fill the gaps in the dictionary
   my $lchar;
   for ^$base -> $i {
       if (%dict{$i}:exists) {
           $lchar = %dict{$i};
       }
       elsif (defined $lchar) {
           %dict{$i} = $lchar;
       }
   }
   # Decode the input number
   my $decoded = ;
   for reverse(^$base) -> $i {
       my $pow = $base**$i;
       my $div = $enc div $pow;
       my $c  = %dict{$div};
       my $fv = %freq{$c};
       my $cv = %cf{$c};
       my $rem = ($enc - $pow*$cv) div $fv;
       $enc = $rem;
       $decoded ~= $c;
   }
   # Return the decoded output
   return $decoded;

}

my $radix = 10; # can be any integer greater or equal with 2

for <DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT> -> $str {

   my ($enc, $pow, %freq) = arithmethic_coding($str, $radix);
   my $dec = arithmethic_decoding($enc, $radix, $pow, %freq);
   printf("%-25s=> %19s * %d^%s\n", $str, $enc, $radix, $pow);
   if ($str ne $dec) {
       die "\tHowever that is incorrect!";
   }

}</lang>

Output:
DABDDB                   =>                 251 * 10^2
DABDDBBDDBA              =>              167351 * 10^6
ABRACADABRA              =>             7954170 * 10^4
TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15

Python

Works with: Python version 3.1+

<lang python>from collections import Counter

def cumulative_freq(freq):

   cf = {}
   total = 0
   for b in range(256):
       if b in freq:
           cf[b] = total
           total += freq[b]
   return cf

def arithmethic_coding(bytes, radix):

   # The frequency characters
   freq = Counter(bytes)
   # The cumulative frequency table
   cf = cumulative_freq(freq)
   # Base
   base = len(bytes)
   # Lower bound
   lower = 0
   # Product of all frequencies
   pf = 1
   # Each term is multiplied by the product of the
   # frequencies of all previously occurring symbols
   for b in bytes:
       lower = lower*base + cf[b]*pf
       pf *= freq[b]
   # Upper bound
   upper = lower+pf
   pow = 0
   while True:
       pf //= radix
       if pf==0: break
       pow += 1
   enc = (upper-1) // radix**pow
   return enc, pow, freq

def arithmethic_decoding(enc, radix, pow, freq):

   # Multiply enc by radix^pow
   enc *= radix**pow;
   # Base
   base = sum(freq.values())
   # Create the cumulative frequency table
   cf = cumulative_freq(freq)
   # Create the dictionary
   dict = {}
   for k,v in cf.items():
       dict[v] = k
   # Fill the gaps in the dictionary
   lchar = None
   for i in range(base):
       if i in dict:
           lchar = dict[i]
       elif lchar is not None:
           dict[i] = lchar
   # Decode the input number
   decoded = bytearray()
   for i in range(base-1, -1, -1):
       pow = base**i
       div = enc//pow
       c  = dict[div]
       fv = freq[c]
       cv = cf[c]
       rem = (enc - pow*cv) // fv
       enc = rem
       decoded.append(c)
   # Return the decoded output
   return bytes(decoded)

radix = 10 # can be any integer greater or equal with 2

for str in b'DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT'.split():

   enc, pow, freq = arithmethic_coding(str, radix)
   dec = arithmethic_decoding(enc, radix, pow, freq)
   print("%-25s=> %19s * %d^%s" % (str, enc, radix, pow))
   if str != dec:
   	raise Exception("\tHowever that is incorrect!")</lang>
Output:
DABDDB                   =>                 251 * 10^2
DABDDBBDDBA              =>              167351 * 10^6
ABRACADABRA              =>             7954170 * 10^4
TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15

Ruby

<lang ruby>def cumulative_freq(freq)

 cf = {}
 total = 0
 (0..255).each do |b|
   if freq.has_key?(b)
     cf[b] = total
     total += freq[b]
   end
 end
 return cf

end

def arithmethic_coding(bytes, radix)

 # The frequency characters
 freq = Hash.new(0)
 bytes.each { |b| freq[b] += 1 }
 # The cumulative frequency table
 cf = cumulative_freq(freq)
 # Base
 base = bytes.size
 # Lower bound
 lower = 0
 # Product of all frequencies
 pf = 1
 # Each term is multiplied by the product of the
 # frequencies of all previously occurring symbols
 bytes.each do |b|
   lower = lower*base + cf[b]*pf
   pf *= freq[b]
 end
 # Upper bound
 upper = lower+pf
 pow = 0
 loop do
   pf /= radix
   break if pf==0
   pow += 1
 end
 enc = ((upper-1) / radix**pow)
 [enc, pow, freq]

end

def arithmethic_decoding(enc, radix, pow, freq)

 # Multiply enc by radix^pow
 enc *= radix**pow;
 # Base
 base = freq.values.reduce(:+)
 # Create the cumulative frequency table
 cf = cumulative_freq(freq)
 # Create the dictionary
 dict = {}
 cf.each_pair do |k,v|
   dict[v] = k
 end
 # Fill the gaps in the dictionary
 lchar = nil
 (0...base).each do |i|
   if dict.has_key?(i)
     lchar = dict[i]
   elsif lchar != nil
     dict[i] = lchar
   end
 end
 # Decode the input number
 decoded = []
 (0...base).reverse_each do |i|
   pow = base**i
   div = enc/pow
   c  = dict[div]
   fv = freq[c]
   cv = cf[c]
   rem = ((enc - pow*cv) / fv)
   enc = rem
   decoded << c
 end
 # Return the decoded output
 return decoded

end

radix = 10 # can be any integer greater or equal with 2

%w(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT).each do |str|

 enc, pow, freq = arithmethic_coding(str.bytes, radix)
 dec = arithmethic_decoding(enc, radix, pow, freq).map{|b| b.chr }.join
 printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow)
 if str != dec
   raise "\tHowever that is incorrect!"
 end

end</lang>

Output:
DABDDB                   =>                 251 * 10^2
DABDDBBDDBA              =>              167351 * 10^6
ABRACADABRA              =>             7954170 * 10^4
TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15

Sidef

<lang ruby>func cumulative_freq(freq) {

   var cf = Hash()
   var total = 0
   256.range.each { |b|
       if (freq.contains(b)) {
           cf{b} = total
           total += freq{b}
       }
   }
   return cf

}

func arithmethic_coding(bytes, radix=10) {

   # The frequency characters
   var freq = Hash()
   bytes.each { |b| freq{b} := 0 ++ }
   # The cumulative frequency table
   var cf = cumulative_freq(freq)
   # Base
   var base = bytes.len
   # Lower bound
   var L = 0
   # Product of all frequencies
   var pf = 1
   # Each term is multiplied by the product of the
   # frequencies of all previously occurring symbols
   bytes.each { |b|
       L = (L*base + cf{b}*pf)
       pf *= freq{b}
   }
   # Upper bound
   var U = L+pf
   var pow = pf.log(radix).int
   var enc = ((U-1) // radix**pow)
   return (enc, pow, freq)

}

func arithmethic_decoding(enc, radix, pow, freq) {

   # Multiply enc by radix^pow
   enc *= radix**pow;
   # Base
   var base = freq.values.sum
   # Create the cumulative frequency table
   var cf = cumulative_freq(freq);
   # Create the dictionary
   var dict = Hash()
   cf.each_kv { |k,v|
       dict{v} = k
   }
   # Fill the gaps in the dictionary
   var lchar = 
   base.range.each { |i|
       if (dict.contains(i)) {
           lchar = dict{i}
       }
       elsif (!lchar.is_empty) {
           dict{i} = lchar
       }
   }
   # Decode the input number
   var decoded = []
   base.range.reverse.each { |i|
       var pow = base**i;
       var div = enc//pow
       var c  = dict{div}
       var fv = freq{c}
       var cv = cf{c}
       var rem = ((enc - pow*cv) // fv)
       enc = rem
       decoded << c
   }
   # Return the decoded output
   return decoded

}

var radix = 10; # can be any integer greater or equal with 2

%w(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT).each { |str|

   var (enc, pow, freq) = arithmethic_coding(str.bytes, radix)
   var dec = arithmethic_decoding(enc, radix, pow, freq).join_bytes('UTF-8')
   printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow);
   if (str != dec) {
       die "\tHowever that is incorrect!"
   }

}</lang>

Output:
DABDDB                   =>                 251 * 10^2
DABDDBBDDBA              =>              167351 * 10^6
ABRACADABRA              =>             7954170 * 10^4
TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15