Posit numbers/decoding

Posit is a quantization of the real projective line proposed by John Gustafson in 2015. It is claimed to be an improvement over IEEE 754.

The purpose of this task is to write a program capable of decoding a posit number. You will use the example provided by Gustafson in his paper : 0b0000110111011101, representing a 16-bit long real number with three bits for the exponent. Once decoded, you should obtain either the fraction 477/134217728 or the floating point value 3.55393E−6.

Jeff Johnson from Facebook research, described posit numbers as such:

A more efficient representation for tapered floating points is the recent posit format by Gustafson. It has no explicit size field; the exponent is encoded using a Golomb-Rice prefix-free code, with the exponent

e

encoded as a Golomb-Rice quotient and remainder

(q,r)

with

q

in unary and

r

in binary (in posit terminology,

q

is the regime). Remainder encoding size is defined by the exponent scale

s

, where

2^{s}

is the Golomb-Rice divisor. Any space not used by the exponent encoding is used by the significand, which unlike IEEE 754 always has a leading 1; gradual underflow (and overflow) is handled by tapering. A posit number system is characterized by

(N,s)

, where

N

is the word length in bits and

s

is the exponent scale. The minimum and maximum positive finite numbers in

(N,s)

are Failed to parse (syntax error): {\displaystyle f_\mathrm{min} = 2^{−(N−2)2^s}} and Failed to parse (syntax error): {\displaystyle f_\mathrm{max} = 2^{(N−2)2^s}} . The number line is represented much as the projective reals, with a single point at

\pm \infty

bounding Failed to parse (syntax error): {\displaystyle −f_\mathrm{max}} and

f_{\mathrm {max} }

.

\pm \infty

and 0 have special encodings; there is no NaN. The number system allows any choice of

N\geq 3

and Failed to parse (syntax error): {\displaystyle 0\le s\le N − 3} .

s

controls the dynamic range achievable; e.g., 8-bit (8, 5)-posit

f_{\mathrm {max} }=2^{192}

is larger than

f_{\mathrm {max} }

in float32. (8, 0) and (8, 1) are more reasonable values to choose for 8-bit floating point representations, with

f_{\mathrm {max} }

of 64 and 4096 accordingly. Precision is maximized in the range Failed to parse (syntax error): {\displaystyle \pm\left[2^{−(s+1)}, 2^{s+1}\right)} with Failed to parse (syntax error): {\displaystyle N − 3 − s} significand fraction bits, tapering to no fraction bits at

\pm f_{\mathrm {max} }

.

— Jeff Johnson, Rethinking floating point for deep learning, Facebook research.

Julia

struct PositType3{T<:Integer}
    numbits::UInt16
    es::UInt16
    bits::T
    PositType3(nb, ne, i) = new{typeof(i)}(UInt16(nb), UInt16(ne), i)
end

""" From posithub.org/docs/Posits4.pdf """
function Base.Rational(p::PositType3)
  s = signbit(p.bits)                      # s for S signbit, is 1 if negative
  pabs = p.bits << 1                       # shift off signbit (adds a 0 to F at LSB)
  pabs == 0 && return s ? 1 // 0 : 0 // 1  # if p is 0, return 0 if s = 0, error if s = 1
  expsign = signbit(pabs)                  # exponent sign from 2nd bit now in MSB location
  k = expsign == 1 ? leading_ones(pabs) : leading_zeros(pabs) # regime R bit count
  scaling = 2^p.es * (expsign == 0 ? -1 : 1)
  pabs <<= (k + 1)                         # shift off unwanted R bits
  pabs >>= (k + 2)                         # shift back without the extra LSB bit
  fsize = p.numbits - k - p.es - 2         # check how many F bits are actually explicit
  fsize <= 0 && return 0 // 1              # missing F is 0
  f = (pabs & (2^fsize - 1)) // 2^fsize     # Get F value. Can be missing -> 0
  e = pabs >> fsize                        # Get E value. 
  pw = (1 - 2s) * (scaling * k + e + s)
  return pw >= 0 ? ((1 - 3s) + f) * 2^pw // 1 : ((1 - 3s) + f) // 2^(-pw)
end

@show Rational(PositType3(16, 3, 0b0000110111011101)) == 477 // 134217728

Output:

 Rational(PositType3(16, 3, 0x0ddd)) == 477 // 134217728 = true

Phix

with javascript_semantics
function twos_compliment_2_on(string bits, integer nbits)
    for i=2 to nbits do
        bits[i] = iff(bits[i]='0'?'1':'0')
    end for
    for i=nbits to 2 by -1 do
        if bits[i]='0' then
            bits[i] = '1'
            exit
        end if
        bits[i] = '0'
    end for
    return bits
end function

function posit_decode(integer nbits, es, object bits)
    --
    -- nbits: number of bits (aka n)
    -- es: exponent scale
    -- bits: (binary) integer or string of nbits 0|1
    --
    if not string(bits) then
        string fmt = sprintf("%%0%db",nbits)
        bits = sprintf(fmt,bits)
    end if
    assert(length(bits)==nbits)
    integer s = bits[1]='1'
    if s then bits = twos_compliment_2_on(bits,nbits) end if
    integer r = find(xor_bits(bits[2],1),bits,3)-2,
            b2z = bits[2]='0', k = iff(b2z?-r:r-1)
    if r<0 and b2z then
        if s then
            return {bits,"NaR"} -- aka inf
        end if
        return {bits,"zero"}
    end if
    integer estart = r+3,
           efinish = min(r+2+es,nbits),
          exponent = to_integer(bits[estart..efinish],0,2),
          fraction = to_integer(bits[efinish+1..$],0,2)
    atom useed = power(2,power(2,es)),
            fs = power(2,nbits-efinish)
    atom res = iff(s?-1:+1)*power(useed,k)*power(2,exponent)*(1+fraction/fs)
    return {bits,res}
end function

constant tests = {{16,3,0b0000110111011101},
                  {16,3,0b1000000000000000},
                  {16,3,0b0000000000000000},
                  {16,1,0b0110110010101000},
                  {16,1,0b1001001101011000},
                  {16,2,0b0000000000000001},
--                {16,0,0b0111111111111111}, -- nope
                  {8,1,0b01000000},
                  {8,1,0b11000000},
                  {8,1,0b00110000},
                  {8,1,0b00100000},
                  {8,2,0b00000001},
--                {8,2,0b01111111},         -- nope
                  {32,2,0b00000000000000000000000000000001}}
--                {32,2,0b01111111111111111111111111111111}} -- nope
for t in tests do
    printf(1,"%s = %v\n",call_func(posit_decode,t))
end for

Output:

Be warned I could not get the largest positive values to match the Wikipedia page...

0000110111011101 = 3.553926944e-6
1000000000000000 = "NaR"
0000000000000000 = "zero"
0110110010101000 = 12.65625
1110110010101000 = -12.65625
0000000000000001 = 1.38777878e-17
01000000 = 1
11000000 = -1
00110000 = 0.5
00100000 = 0.25
00000001 = 5.960464478e-8
00000000000000000000000000000001 = 7.523163846e-37

raku

unit role Posit[UInt $N, UInt $es];

has UInt $.UInt;
method sign { self.UInt > 2**($N - 1) ?? -1 !! +1 }

method FatRat {
  return 0   if self.UInt == 0;
  my UInt $mask = 2**($N - 1);
  return Inf if self.UInt == $mask;
  my UInt $n = self.UInt;
  my $sign = $n +& $mask ?? -1 !! +1;
  my $r = $sign;
  $n = ((2**$n - 1) +^ $n) + 1 if self.sign < 0;
  my int $count = 0;
  $mask +>= 1;
  my Bool $first-bit = ?($n +& $mask);
  repeat { $count++; $mask +>= 1;
  } while ?($n +& $mask) == $first-bit && $mask;
  my $m = $count;
  my $k = $first-bit ?? $m - 1 !! -$m;
  $r *= 2**($k*2**$es);
  return $r unless $mask > 1;
  $mask +>= 1;
  $count = 0;
  my UInt $exponent = 0;
  while $mask && $count++ < $es {
    $exponent +<= 1;
    $exponent +|= 1 if $n +& $mask;
    $mask +>= 1;
  }
  $r *= 2**$exponent;
  my $fraction = 1.FatRat;
  while $mask {
    (state $power-of-two = 1) +<= 1;
    $fraction += 1/$power-of-two if $n +& $mask;
    $mask +>= 1;
  }
  $r *= $fraction;

  return $r;
}

CHECK {
  use Test;
  # example from L<http://www.johngustafson.net/pdfs/BeatingFloatingPoint.pdf>
  is Posit[16, 3]
    .new(UInt => 0b0000110111011101)
    .FatRat, 477.FatRat/134217728;
}

Output:

ok 1 -

Wren

Library: Wren-fmt

Library: Wren-big

import "./fmt" for Conv
import "./big" for BigRat

var posit16_decode = Fn.new { |ps, maxExpSize|
    var p = ps.map { |c| c == "0" ? 0 : 1 }.toList

    // Deal with exceptional values.
    if (p[1..-1].all { |i| i == 0 }) {
        return (p[0] == 0) ? 0 : Conv.infinity
    }

    // Convert bits after sign bit to two's complement if negative.
    if (p[0] == 1) {
        for (i in 1..15) p[i] = (p[i] == 0) ? 1 : 0
        for (i in 15..1) {
            if (p[i] == 1) {
                p[i] = 0
            } else {
                p[i] = 1
                break
            }
        }
    }
    var first = p[1]
    var rs = 15  // regime size
    for (i in 2..15) {
        if (p[i] != first) {
            rs = i - 1
            break
        }
    }
    var regime = p[1..rs]
    var es = (rs == 15) ? 0 : maxExpSize.min(14-rs)  // actual exponent size
    var exponent = [0]
    if (es > 0) exponent = p[rs + 2...rs + 2 + es]
    var fs = (es == 0) ? 0 : 14 - rs - es  // function size
    var s = (p[0] == 0) ? 1 : -1  // sign
    var k = regime.all { |i| i == 0 } ? -rs : rs - 1
    var u = 2.pow(2.pow(maxExpSize))
    var e = Conv.atoi(exponent.join(""), 2)
    var f = BigRat.one
    if (fs > 0) {
        var fraction = ps[-fs..-1]
        f = Conv.atoi(fraction.join(""), 2)
        f = BigRat.one + BigRat.new(f, 2.pow(fs))
    }
    return f * BigRat.new(u, 1).pow(k) * s * 2.pow(e)
}

var ps = "0000110111011101"
System.print(posit16_decode.call(ps, 3))

Output:

477/134217728