Entropy: Difference between revisions
No edit summary |
|||
Line 1,084: | Line 1,084: | ||
s.each_char.group_by(&:to_s).values.map { |x| x.length / s.length.to_f }.reduce(0) { |e, x| e - x*Math.log2(x) } |
s.each_char.group_by(&:to_s).values.map { |x| x.length / s.length.to_f }.reduce(0) { |e, x| e - x*Math.log2(x) } |
||
end</lang> |
end</lang> |
||
=={{header|Rust}}== |
|||
<lang Rust>// works for Rust 0.9 |
|||
fn entropy(s: &str) -> f32 { |
|||
let mut entropy: f32 = 0.0; |
|||
let mut histogram = [0, ..256]; |
|||
let len = s.len(); |
|||
for i in range(0, len) { histogram[s[i]] += 1; } |
|||
for i in range(0, 256) { |
|||
if histogram[i] > 0 { |
|||
let ratio = (histogram[i] as f32 / len as f32) as f32; |
|||
entropy -= (ratio * log2(ratio)) as f32; |
|||
} |
|||
} |
|||
entropy |
|||
}</lang> |
|||
=={{header|Scala}}== |
=={{header|Scala}}== |
||
<lang scala>import scala.math._ |
<lang scala>import scala.math._ |
Revision as of 01:14, 21 January 2014
You are encouraged to solve this task according to the task description, using any language you may know.
Calculate the information entropy (Shannon entropy) of a given input string.
Entropy is the expected value of the measure of information content in a system. In general, the Shannon entropy of a variable $X$ is defined as:

$H(X) = -\sum_{i} P(x_i) \log_b P(x_i)$

where the information content is $I(x_i) = -\log_b P(x_i)$. If the base of the logarithm is $b = 2$, the result is expressed in bits, a unit of information. Therefore, given a string $S$ of length $n$, where $P(s_i)$ is the relative frequency of each character, the entropy of the string in bits is:

$H(S) = -\sum_{i} P(s_i) \log_2 P(s_i)$
For this task, use "1223334444" as an example. The result should be around 1.84644 bits.
Related Task: Fibonacci_word
Ada
Uses Ada 2012. <lang Ada>with Ada.Text_IO, Ada.Float_Text_IO, Ada.Numerics.Elementary_Functions;
procedure Count_Entropy is
package TIO renames Ada.Text_IO;
Count: array(Character) of Natural := (others => 0); Sum: Natural := 0; Line: String := "1223334444";
begin
for I in Line'Range loop -- count the characters Count(Line(I)) := Count(Line(I))+1; Sum := Sum + 1; end loop;
declare -- compute the entropy and print it function P(C: Character) return Float is (Float(Count(C)) / Float(Sum)); use Ada.Numerics.Elementary_Functions, Ada.Float_Text_IO; Result: Float := 0.0; begin for Ch in Character loop Result := Result - (if P(Ch)=0.0 then 0.0 else P(Ch) * Log(P(Ch), Base => 2.0)); end loop; Put(Result, Fore => 1, Aft => 5, Exp => 0); end;
end Count_Entropy;</lang>
Aime
<lang aime>integer i, l; record r; real h, x; text s;
s = argv(1); l = length(s);
i = l; while (i) {
i -= 1; rn_a_integer(r, cut(s, i, 1), 1);
}
h = 0; if (r_first(r, s)) {
do { x = r_q_integer(r, s); x /= l; h -= x * log2(x); } while (r_greater(r, s, s));
}
o_real(6, h); o_newline();</lang> Examples:
$ aime -a tmp/entr 1223334444 1.846439 $ aime -a tmp/entr 'Rosetta Code is the best site in the world!' 3.646513 $ aime -a tmp/entr 1234567890abcdefghijklmnopqrstuvwxyz 5.169925
AutoHotkey
<lang AutoHotkey>MsgBox, % Entropy(1223334444)
Entropy(n) {
a := [], len := StrLen(n), m := n while StrLen(m) { s := SubStr(m, 1, 1) m := RegExReplace(m, s, "", c) a[s] := c } for key, val in a { m := Log(p := val / len) e -= p * m / Log(2) } return, e
}</lang> Output:
1.846440
AWK
<lang awk>#!/usr/bin/awk -f { for (i=1; i<= length($0); i++) { H[substr($0,i,1)]++; N++; } }
END { for (i in H) { p = H[i]/N; E -= p * log(p); } print E/log(2); }</lang> Usage: <lang bash> echo 1223334444 |./entropy.awk 1.84644 </lang>
Burlesque
<lang burlesque>blsq ) "1223334444"F:u[vv^^{1\/?/2\/LG}m[?*++ 1.8464393446710157</lang>
C
<lang c>#include <stdio.h>
- include <stdlib.h>
- include <stdbool.h>
- include <string.h>
- include <math.h>
- define MAXLEN 100 //maximum string length
int makehist(char *S,int *hist,int len){ int wherechar[256]; int i,histlen; histlen=0; for(i=0;i<256;i++)wherechar[i]=-1; for(i=0;i<len;i++){ if(wherechar[(int)S[i]]==-1){ wherechar[(int)S[i]]=histlen; histlen++; } hist[wherechar[(int)S[i]]]++; } return histlen; }
double entropy(int *hist,int histlen,int len){ int i; double H; H=0; for(i=0;i<histlen;i++){ H-=(double)hist[i]/len*log2((double)hist[i]/len); } return H; }
int main(void){ char S[MAXLEN]; int len,*hist,histlen; double H; scanf("%[^\n]",S); len=strlen(S); hist=(int*)calloc(len,sizeof(int)); histlen=makehist(S,hist,len); //hist now has no order (known to the program) but that doesn't matter H=entropy(hist,histlen,len); printf("%lf\n",H); return 0; }</lang> Examples: <lang>$ ./entropy 1223334444 1.846439 $ ./entropy Rosetta Code is the best site in the world! 3.646513</lang>
C++
<lang cpp>#include <string>
- include <map>
- include <iostream>
- include <algorithm>
- include <cmath>
// Base-2 logarithm computed with the change-of-base identity:
// log2(x) = ln(x) / ln(2).
double log2( double number ) {
   double const ln2 = log( 2 ) ;
   return log( number ) / ln2 ;
}
int main( int argc , char *argv[ ] ) {
std::string teststring( argv[ 1 ] ) ; std::map<char , int> frequencies ; for ( char c : teststring ) frequencies[ c ] ++ ; int numlen = teststring.length( ) ; double infocontent = 0 ; for ( std::pair<char , int> p : frequencies ) { double freq = static_cast<double>( p.second ) / numlen ; infocontent += freq * log2( freq ) ; } infocontent *= -1 ; std::cout << "The information content of " << teststring << " is " << infocontent << " !\n" ; return 0 ;
}</lang>
- Output:
The information content of 1223334444 is 1.84644 !
Clojure
<lang Clojure>(defn entropy [s]
(let [len (count s) freqs (frequencies s) log-of-2 (Math/log 2)] (->> (keys freqs) (map (fn [c] (let [rf (/ (get freqs c) len)] (* -1 rf (/ (Math/log rf) log-of-2))))) (reduce +))))</lang>
Output <lang Clojure>(H "1223334444") 1.8464393446710154</lang>
Common Lisp
<lang lisp>(defun entropy(input-string)
(let ((frequency-table (make-hash-table :test 'equal)) (entropy 0)) (map 'nil #'(lambda(c) (setf (gethash c frequency-table) (if (gethash c frequency-table) (+ (gethash c frequency-table) 1) 1))) (coerce input-string 'list)) (maphash #'(lambda(k v) (setf entropy (+ entropy (* -1 (/ v (length input-string)) (log (/ v (length input-string)) 2))))) frequency-table) entropy))
</lang>
D
<lang d>import std.stdio, std.algorithm, std.math;
double entropy(T)(T[] s) /*pure nothrow*/ if (__traits(compiles, sort(s))) {
return s .sort() .group .map!(g => g[1] / cast(double)s.length) .map!(p => -p * log2(p)) .reduce!q{a + b};
}
void main() {
"1223334444"d.dup.entropy.writeln;
}</lang>
- Output:
1.84644
Emacs Lisp
<lang lisp>(defun shannon-entropy (input)
(let ((freq-table (make-hash-table))
(entropy 0) (length (+ (length input) 0.0)))
(mapcar (lambda (x)
(puthash x (+ 1 (gethash x freq-table 0)) freq-table)) input)
(maphash (lambda (k v)
(set 'entropy (+ entropy (* (/ v length) (log (/ v length) 2))))) freq-table)
(- entropy)))</lang>
- Output:
After adding the above to the emacs runtime, you can run the function interactively in the scratch buffer as shown below (type ctrl-j at the end of the first line and the output will be placed by emacs on the second line).
<lang lisp>(shannon-entropy "1223334444") 1.8464393446710154</lang>
Erlang
<lang Erlang> -module( entropy ).
-export( [shannon/1, task/0] ).
shannon( String ) -> shannon_information_content( lists:foldl(fun count/2, dict:new(), String), erlang:length(String) ).
task() -> shannon( "1223334444" ).
count( Character, Dict ) -> dict:update_counter( Character, 1, Dict ).
shannon_information_content( Dict, String_length ) -> {_String_length, Acc} = dict:fold( fun shannon_information_content/3, {String_length, 0.0}, Dict ), Acc / math:log( 2 ).
shannon_information_content( _Character, How_many, {String_length, Acc} ) ->
Frequency = How_many / String_length,
{String_length, Acc - (Frequency * math:log(Frequency))}. </lang>
- Output:
24> entropy:task(). 1.8464393446710157
Euler Math Toolbox
<lang EulerMathToolbox>>function entropy (s) ... $ v=strtochar(s); $ m=getmultiplicities(unique(v),v); $ m=m/sum(m); $ return sum(-m*logbase(m,2)) $endfunction >entropy("1223334444")
1.84643934467</lang>
F#
<lang fsharp>open System
let ld x = Math.Log x / Math.Log 2.
let entropy (s : string) =
let n = float s.Length Seq.groupBy id s |> Seq.map (fun (_, vals) -> float (Seq.length vals) / n) |> Seq.fold (fun e p -> e - p * ld p) 0.
printfn "%f" (entropy "1223334444")</lang>
- Output:
1.846439
friendly interactive shell
Sort of hacky, but friendly interactive shell isn't really optimized for mathematical tasks (in fact, it doesn't even have associative arrays).
<lang fishshell>function entropy
for arg in $argv set name count_$arg if not count $$name > /dev/null set $name 0 set values $values $arg end set $name (math $$name + 1) end set entropy 0 for value in $values set name count_$value set entropy (echo " scale = 50 p = "$$name" / "(count $argv)" $entropy - p * l(p) " | bc -l) end echo "$entropy / l(2)" | bc -l
end entropy (echo 1223334444 | fold -w1)</lang>
- Output:
1.84643934467101549345
Forth
<lang forth>: flog2 ( f -- f ) fln 2e fln f/ ;
create freq 256 cells allot
- entropy ( str len -- f )
freq 256 cells erase tuck bounds do i c@ cells freq + 1 swap +! loop 0e 256 0 do i cells freq + @ ?dup if s>f dup s>f f/ fdup flog2 f* f- then loop drop ;
s" 1223334444" entropy f. \ 1.84643934467102 ok </lang>
Fortran
Please find the GNU/linux compilation instructions along with sample run among the comments at the start of the FORTRAN 2008 source. This program acquires input from the command line argument, thereby demonstrating the fairly new get_command_argument intrinsic subroutine. The expression of the algorithm is a rough translated of the j solution. Thank you. <lang FORTRAN> !-*- mode: compilation; default-directory: "/tmp/" -*- !Compilation started at Tue May 21 21:43:12 ! !a=./f && make $a && OMP_NUM_THREADS=2 $a 1223334444 !gfortran -std=f2008 -Wall -ffree-form -fall-intrinsics f.f08 -o f ! Shannon entropy of 1223334444 is 1.84643936 ! !Compilation finished at Tue May 21 21:43:12
program shannonEntropy
implicit none integer :: num, L, status character(len=2048) :: s num = 1 call get_command_argument(num, s, L, status) if ((0 /= status) .or. (L .eq. 0)) then write(0,*)'Expected a command line argument with some length.' else write(6,*)'Shannon entropy of '//(s(1:L))//' is ', se(s(1:L)) endif
contains
! algebra ! ! 2**x = y ! x*log(2) = log(y) ! x = log(y)/log(2)
! NB. The j solution ! entropy=: +/@:-@(* 2&^.)@(#/.~ % #) ! entropy '1223334444' !1.84644 real function se(s) implicit none character(len=*), intent(in) :: s integer, dimension(256) :: tallies real, dimension(256) :: norm tallies = 0 call TallyKey(s, tallies) ! J's #/. works with the set of items in the input. ! TallyKey is sufficiently close that, with the merge, gets the correct result. norm = tallies / real(len(s)) se = sum(-(norm*log(merge(1.0, norm, norm .eq. 0))/log(2.0))) end function se
subroutine TallyKey(s, counts) character(len=*), intent(in) :: s integer, dimension(256), intent(out) :: counts integer :: i, j counts = 0 do i=1,len(s) j = iachar(s(i:i)) counts(j) = counts(j) + 1 end do end subroutine TallyKey
end program shannonEntropy </lang>
Go
<lang go>package main
import (
"fmt" "math"
)
const s = "1223334444"
func main() {
m := map[rune]float64{} for _, r := range s { m[r]++ } hm := 0. for _, c := range m { hm += c * math.Log2(c) } const l = float64(len(s)) fmt.Println(math.Log2(l) - hm/l)
}</lang>
- Output:
1.8464393446710152
Groovy
<lang groovy>String.metaClass.getShannonEntrophy = {
-delegate.inject([:]) { map, v -> map[v] = (map[v] ?: 0) + 1; map }.values().inject(0.0) { sum, v -> def p = (BigDecimal)v / delegate.size() sum + p * Math.log(p) / Math.log(2) }
}</lang> Testing <lang groovy>[ '1223334444': '1.846439344671',
'1223334444555555555': '1.969811065121', '122333': '1.459147917061', '1227774444': '1.846439344671', aaBBcccDDDD: '1.936260027482', '1234567890abcdefghijklmnopqrstuvwxyz': '5.169925004424', 'Rosetta Code': '3.084962500407' ].each { s, expected ->
println "Checking $s has a shannon entrophy of $expected" assert sprintf('%.12f', s.shannonEntrophy) == expected
}</lang> Output:
Checking 1223334444 has a shannon entrophy of 1.846439344671 Checking 1223334444555555555 has a shannon entrophy of 1.969811065121 Checking 122333 has a shannon entrophy of 1.459147917061 Checking 1227774444 has a shannon entrophy of 1.846439344671 Checking aaBBcccDDDD has a shannon entrophy of 1.936260027482 Checking 1234567890abcdefghijklmnopqrstuvwxyz has a shannon entrophy of 5.169925004424 Checking Rosetta Code has a shannon entrophy of 3.084962500407
Haskell
<lang haskell>import Data.List
main = print $ entropy "1223334444"
-- Shannon entropy (bits): tally equal characters, convert counts to
-- relative frequencies, then accumulate -p * log2 p.
entropy s = negate . sum $ map (\p -> p * logBase 2 p) probs
  where counts = map (fromIntegral . length) . group . sort $ s
        total  = sum counts
        probs  = map (/ total) counts
</lang>
Icon and Unicon
Hmmm, the 2nd equation sums across the length of the string (for the example, that would be the sum of 10 terms). However, the answer cited is for summing across the different characters in the string (sum of 4 terms). The code shown here assumes the latter and works in Icon and Unicon. This assumption is consistent with the Wikipedia description.
<lang unicon>procedure main(a)
s := !a | "1223334444" write(H(s))
end
procedure H(s)
P := table(0.0) every P[!s] +:= 1.0/*s every (h := 0.0) -:= P[c := key(P)] * log(P[c],2) return h
end</lang>
Output:
->en 1.846439344671015 ->
J
Solution:<lang j> entropy=: +/@:-@(* 2&^.)@(#/.~ % #)</lang>
- Example:
<lang j> entropy '1223334444' 1.84644</lang>
Java
<lang java5>import java.lang.Math; import java.util.Map; import java.util.HashMap;
public class REntropy {
@SuppressWarnings("boxing") public static double getShannonEntropy(String s) { int n = 0; Map<Character, Integer> occ = new HashMap<>();
for (int c_ = 0; c_ < s.length(); ++c_) { char cx = s.charAt(c_); if (occ.containsKey(cx)) { occ.put(cx, occ.get(cx) + 1); } else { occ.put(cx, 1); } ++n; }
double e = 0.0; for (Map.Entry<Character, Integer> entry : occ.entrySet()) { char cx = entry.getKey(); double p = (double) entry.getValue() / n; e += p * log2(p); } return -e; }
private static double log2(double a) { return Math.log(a) / Math.log(2); } public static void main(String[] args) { String[] sstr = { "1223334444", "1223334444555555555", "122333", "1227774444", "aaBBcccDDDD", "1234567890abcdefghijklmnopqrstuvwxyz", "Rosetta Code", };
for (String ss : sstr) { double entropy = REntropy.getShannonEntropy(ss); System.out.printf("Shannon entropy of %40s: %.12f%n", "\"" + ss + "\"", entropy); } return; }
}</lang>
- Output:
Shannon entropy of "1223334444": 1.846439344671 Shannon entropy of "1223334444555555555": 1.969811065278 Shannon entropy of "122333": 1.459147917027 Shannon entropy of "1227774444": 1.846439344671 Shannon entropy of "aaBBcccDDDD": 1.936260027532 Shannon entropy of "1234567890abcdefghijklmnopqrstuvwxyz": 5.169925001442 Shannon entropy of "Rosetta Code": 3.084962500721
Lang5
<lang lang5>: -rot rot rot ; [] '__A set : dip swap __A swap 1 compress append '__A set execute __A -1 extract nip ; : nip swap drop ; : sum '+ reduce ;
- 2array 2 compress ; : comb "" split ; : lensize length nip ;
- <group> #( a -- 'a )
grade subscript dup 's dress distinct strip length 1 2array reshape swap 'A set : `filter(*) A in A swap select ; '`filter apply ;
- elements(*) lensize ;
- entropy #( s -- n )
length "<group> 'elements apply" dip / dup neg swap log * 2 log / sum ;
"1223334444" comb entropy . # 1.84643934467102</lang>
Mathematica
<lang Mathematica>shE[s_String] := -Plus @@ ((# Log[2., #]) & /@ ((Length /@ Gather[#])/
Length[#]) &[Characters[s]])</lang>
- Example:
<lang Mathematica> shE["1223334444"]
1.84644 shE["Rosetta Code"] 3.08496</lang>
MATLAB / Octave
This version allows for any input vectors, including strings, floats, negative integers, etc. <lang MATLAB>function E = entropy(d) if ischar(d), d=abs(d); end;
[Y,I,J] = unique(d);
H = sparse(J,1,1); p = full(H(H>0))/length(d); E = -sum(p.*log2(p)); end; </lang> Usage <lang MATLAB>> entropy('1223334444') ans = 1.8464</lang>
NetRexx
<lang NetRexx>/* NetRexx */ options replace format comments java crossref savelog symbols
runSample(Arg) return
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /* REXX ***************************************************************
* 28.02.2013 Walter Pachl **********************************************************************/
method getShannonEntropy(s = "1223334444") public static --trace var occ c chars n cn i e p pl
Numeric Digits 30 occ = 0 chars = n = 0 cn = 0 Loop i = 1 To s.length() c = s.substr(i, 1) If chars.pos(c) = 0 Then Do cn = cn + 1 chars = chars || c End occ[c] = occ[c] + 1 n = n + 1 End i p = Loop ci = 1 To cn c = chars.substr(ci, 1) p[c] = occ[c] / n End ci e = 0 Loop ci = 1 To cn c = chars.substr(ci, 1) pl = log2(p[c]) e = e + p[c] * pl End ci Return -e
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ method log2(a = double) public static binary returns double
return Math.log(a) / Math.log(2)
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ method runSample(Arg) public static
parse Arg sstr if sstr = then sstr = '1223334444' - '1223334444555555555' - '122333' - '1227774444' - 'aaBBcccDDDD' - '1234567890abcdefghijklmnopqrstuvwxyz' - 'Rosetta_Code' say 'Calculating Shannons entropy for the following list:' say '['(sstr.space(1, ',')).changestr(',', ', ')']' say entropies = 0 ssMax = 0 -- This crude sample substitutes a '_' character for a space in the input strings loop w_ = 1 to sstr.words() ss = sstr.word(w_) ssMax = ssMax.max(ss.length()) ss_ = ss.changestr('_', ' ') entropy = getShannonEntropy(ss_) entropies[ss] = entropy end w_ loop report = 1 to sstr.words() ss = sstr.word(report) ss_ = ss.changestr('_', ' ') Say 'Shannon entropy of' ('"'ss_'"').right(ssMax + 2)':' entropies[ss].format(null, 12) end report return
</lang>
- Output:
Calculating Shannon's entropy for the following list: [1223334444, 1223334444555555555, 122333, 1227774444, aaBBcccDDDD, 1234567890abcdefghijklmnopqrstuvwxyz, Rosetta_Code] Shannon entropy of "1223334444": 1.846439344671 Shannon entropy of "1223334444555555555": 1.969811065278 Shannon entropy of "122333": 1.459147917027 Shannon entropy of "1227774444": 1.846439344671 Shannon entropy of "aaBBcccDDDD": 1.936260027532 Shannon entropy of "1234567890abcdefghijklmnopqrstuvwxyz": 5.169925001442 Shannon entropy of "Rosetta Code": 3.084962500721
PARI/GP
<lang parigp>entropy(s)=s=Vec(s);my(v=vecsort(s,,8));-sum(i=1,#v,(x->x*log(x))(sum(j=1,#s,v[i]==s[j])/#s))/log(2)</lang>
>entropy("1223334444") %1 = 1.8464393446710154934341977463050452232
Perl
<lang Perl>sub entropy {
    # Tally each element of the argument list, then sum -p*log(p)
    # over the tallies and convert the result from nats to bits.
    my %count;
    $count{$_}++ for @_;
    my $entropy = 0;
    for my $n (values %count) {
        my $p = $n / @_;
        $entropy -= $p * log $p;
    }
    $entropy / log 2;
}
print entropy split //, "1223334444";</lang>
Perl 6
<lang Perl 6>sub entropy(@a) {
[+] map -> \p { p * -log p }, bag(@a).values »/» +@a;
}
say log(2) R/ entropy '1223334444'.comb;</lang>
- Output:
1.84643934467102
In case we would like to add this function to Perl 6's core, here is one way it could be done:
<lang Perl 6>use MONKEY_TYPING; augment class Bag {
method entropy {
[+] map -> \p { - p * log p }, self.values »/» +self;
}
}
say '1223334444'.comb.Bag.entropy / log 2;</lang>
Python
Python: Longer version
<lang python>from __future__ import division import math
def hist(source):
    """Tally the elements of *source*.

    Returns a tuple ``(l, hist)`` where ``l`` is the total number of
    elements seen and ``hist`` maps each distinct element to its count.
    (The wiki transcript had collapsed this body onto one line, making
    it syntactically invalid; formatting restored.)
    """
    hist = {}
    l = 0
    for e in source:
        l += 1
        if e not in hist:
            hist[e] = 0
        hist[e] += 1
    return (l, hist)
def entropy(hist, l):
    """Shannon entropy in bits of a histogram over *l* samples.

    *hist* maps symbols to counts; the counts are assumed to sum to *l*.
    (Formatting restored from the collapsed wiki transcript.)
    """
    elist = []
    for v in hist.values():
        c = v / l
        # Information content of this symbol, weighted by its frequency.
        elist.append(-c * math.log(c, 2))
    return sum(elist)
def printHist(h):
    """Print a table of symbol, count, relative frequency and
    information content for histogram *h* (symbol -> count).

    Fixes over the transcript version: the original used Python-2-only
    syntax (tuple-unpacking ``lambda (k,v)``, ``iteritems``, ``print``
    statements) and silently read the sample count from a global ``l``;
    the total is now derived from the histogram itself.
    """
    l = sum(h.values())
    # Sort by (count, symbol), matching the original (v, k) flip-sort.
    items = sorted(h.items(), key=lambda kv: (kv[1], kv[0]))
    print('Sym\thi\tfi\tInf')
    for (k, v) in items:
        print('%s\t%f\t%f\t%f' % (k, v, v / l, -math.log(v / l, 2)))
source = "1223334444" (l,h) = hist(source); print '.[Results].' print 'Length',l print 'Entropy:', entropy(h, l) printHist(h)</lang>
Output:
.[Results]. Length 10 Entropy: 1.84643934467 Sym hi fi Inf 1 1.000000 0.100000 3.321928 2 2.000000 0.200000 2.321928 3 3.000000 0.300000 1.736966 4 4.000000 0.400000 1.321928
Python: More succinct version
The Counter class (in the collections module) is only available in Python >= 2.7.
<lang python>>>> import math >>> from collections import Counter >>> >>> def entropy(s): ... p, lns = Counter(s), float(len(s)) ... return -sum( count/lns * math.log(count/lns, 2) for count in p.values()) ... >>> entropy("1223334444") 1.8464393446710154 >>> </lang>
R
<lang r>entropy = function(s)
 {# Split into single characters; the transcript's "strsplit(s, )[1]"
  # dropped the "" split pattern and used single-bracket indexing,
  # which does not yield the character vector table() needs.
  freq = prop.table(table(strsplit(s, "")[[1]]))
  -sum(freq * log(freq, base = 2))}
print(entropy("1223334444")) # 1.846439</lang>
Racket
<lang racket>#lang racket (require math) (provide entropy hash-entropy list-entropy digital-entropy)
(define (hash-entropy h)
(define (log2 x) (/ (log x) (log 2))) (define n (for/sum [(c (in-hash-values h))] c)) (- (for/sum ([c (in-hash-values h)] #:unless (zero? c)) (* (/ c n) (log2 (/ c n))))))
(define (list-entropy x) (hash-entropy (samples->hash x)))
(define entropy (compose list-entropy string->list)) (define digital-entropy (compose entropy number->string))
(module+ test
(require rackunit) (check-= (entropy "1223334444") 1.8464393446710154 1E-8) (check-= (digital-entropy 1223334444) (entropy "1223334444") 1E-8) (check-= (digital-entropy 1223334444) 1.8464393446710154 1E-8) (check-= (entropy "xggooopppp") 1.8464393446710154 1E-8))
(module+ main (entropy "1223334444"))</lang>
Output: 1.8464393446710154
REXX
version 1
<lang rexx>/* REXX ***************************************************************
- 28.02.2013 Walter Pachl
- 12.03.2013 Walter Pachl typo in log corrected. thanx for testing
- 22.05.2013 -"- extended the logic to accept other strings
- 25.05.2013 -"- 'my' log routine is apparently incorrect
- 25.05.2013 -"- problem identified & corrected
- /
Numeric Digits 30 Parse Arg s If s= Then
s="1223334444"
occ.=0 chars= n=0 cn=0 Do i=1 To length(s)
c=substr(s,i,1) If pos(c,chars)=0 Then Do cn=cn+1 chars=chars||c End occ.c=occ.c+1 n=n+1 End
do ci=1 To cn
c=substr(chars,ci,1) p.c=occ.c/n /* say c p.c */ End
e=0 Do ci=1 To cn
c=substr(chars,ci,1) e=e+p.c*log(p.c,30,2) End
Say 'Version 1:' s 'Entropy' format(-e,,12) Exit
log: Procedure /***********************************************************************
- Return log(x) -- with specified precision and a specified base
- Three different series are used for the ranges 0 to 0.5
- 0.5 to 1.5
- 1.5 to infinity
- 03.09.1992 Walter Pachl
- 25.05.2013 -"- 'my' log routine is apparently incorrect
- 25.05.2013 -"- problem identified & corrected
- /
Parse Arg x,prec,b If prec= Then prec=9 Numeric Digits (2*prec) Numeric Fuzz 3 Select When x<=0 Then r='*** invalid argument ***' When x<0.5 Then Do z=(x-1)/(x+1) o=z r=z k=1 Do i=3 By 2 ra=r k=k+1 o=o*z*z r=r+o/i If r=ra Then Leave End r=2*r End When x<1.5 Then Do z=(x-1) o=z r=z k=1 Do i=2 By 1 ra=r k=k+1 o=-o*z r=r+o/i If r=ra Then Leave End End Otherwise /* 1.5<=x */ Do z=(x+1)/(x-1) o=1/z r=o k=1 Do i=3 By 2 ra=r k=k+1 o=o/(z*z) r=r+o/i If r=ra Then Leave End r=2*r End End If b<> Then r=r/log(b,prec) Numeric Digits (prec) r=r+0 Return r </lang>
<lang rexx>/* REXX ***************************************************************
- Test program to compare Versions 1 and 2
- (the latter tweaked to be acceptable by my (oo)Rexx
- and to give the same output.)
- version 1 was extended to accept the strings of the incorrect flag
- 22.05.2013 Walter Pachl (I won't analyze the minor differences)
- 25.05.2013 I did now analyze and had to discover that
- 'my' log routine is apparently incorrect
- 25.05.2013 problem identified & corrected
- /
Call both '1223334444' Call both '1223334444555555555' Call both '122333' Call both '1227774444' Call both 'aaBBcccDDDD' Call both '1234567890abcdefghijklmnopqrstuvwxyz' Exit both:
Parse Arg s Call entropy s Call entropy2 s Say ' ' Return
</lang> Output:
Version 1: 1223334444 Entropy 1.846439344671 Version 2: 1223334444 Entropy 1.846439344671 Version 1: 1223334444555555555 Entropy 1.969811065278 Version 2: 1223334444555555555 Entropy 1.969811065278 Version 1: 122333 Entropy 1.459147917027 Version 2: 122333 Entropy 1.459147917027 Version 1: 1227774444 Entropy 1.846439344671 Version 2: 1227774444 Entropy 1.846439344671 Version 1: 1234567890abcdefghijklmnopqrstuvwxyz Entropy 5.169925001442 Version 2: 1234567890abcdefghijklmnopqrstuvwxyz Entropy 5.169925001442
version 2
REXX doesn't have a BIF for LOG or LN, so the subroutine (function) LOG2 is included herein.
The LOG2 subroutine in only included here for functionality, not to document how to calculate LOG2 using REXX.
<lang rexx>/*REXX program calculates the information entropy for a given char str.*/
numeric digits 30 /*use thirty digits for precision*/
parse arg $; if $== then $=1223334444 /*obtain optional input*/
n=0; @.=0; L=length($); $$=
do j=1 for L; _=substr($,j,1) /*process each character in $ str*/ if @._==0 then do; n=n+1 /*if unique, bump char counter. */ $$=$$ || _ /*add this character to the list.*/ end @._ = @._+1 /*keep track of this char count. */ end /*j*/
sum=0 /*calc info entropy for each char*/
do i=1 for n; _=substr($$,i,1) /*obtain a char from unique list.*/ sum=sum - @._/L * log2(@._/L) /*add (negatively) the entropies.*/ end /*i*/
say ' input string: ' $ say 'string length: ' L say ' unique chars: ' n ; say say 'the information entropy of the string ──► ' format(sum,,12) " bits." exit /*stick a fork in it, we're done.*/ /*──────────────────────────────────LOG2 subroutine───────────────────────────*/ log2: procedure; parse arg x 1 xx; ig= x>1.5; is=1-2*(ig\==1); ii=0 numeric digits digits()+5 /* [↓] precision of E must be > digits().*/ e=2.7182818284590452353602874713526624977572470936999595749669676277240766303535
do while ig & xx>1.5 | \ig&xx<.5; _=e; do j=-1; iz=xx* _**-is if j>=0 then if ig & iz<1 | \ig&iz>.5 then leave; _=_*_; izz=iz; end /*j*/ xx=izz; ii=ii+is*2**j; end /*while*/; x=x* e**-ii-1; z=0; _=-1; p=z do k=1; _=-_*x; z=z+_/k; if z=p then leave; p=z; end /*k*/ r=z+ii; if arg()==2 then return r; return r/log2(2,0)</lang>
output when using the default input of: 1223334444
input string: 1223334444 string length: 10 unique chars: 4 the information entropy of the string ──► 1.846439344671 bits.
output when using the input of: Rosetta Code
input string: Rosetta Code string length: 12 unique chars: 9 the information entropy of the string ──► 3.084962500721 bits.
Ruby
<lang ruby>def entropy(s)
counts = Hash.new(0) s.each_char { |c| counts[c] += 1 }
counts.values.reduce(0) do |entropy, count| freq = count / s.length.to_f entropy - freq * Math.log2(freq) end
end</lang> One-liner, same performance (or better): <lang ruby>def entropy2(s)
s.each_char.group_by(&:to_s).values.map { |x| x.length / s.length.to_f }.reduce(0) { |e, x| e - x*Math.log2(x) }
end</lang>
Rust
<lang Rust>// works for Rust 0.9 fn entropy(s: &str) -> f32 { let mut entropy: f32 = 0.0; let mut histogram = [0, ..256]; let len = s.len();
for i in range(0, len) { histogram[s[i]] += 1; } for i in range(0, 256) { if histogram[i] > 0 { let ratio = (histogram[i] as f32 / len as f32) as f32; entropy -= (ratio * log2(ratio)) as f32; } }
entropy }</lang>
Scala
<lang scala>import scala.math._
def entropy( v:String ) = { v
.groupBy (a => a) .values .map( i => i.length.toDouble / v.length ) .map( p => -p * log10(p) / log10(2)) .sum
}
// Confirm that "1223334444" has an entropy of about 1.84644 assert( math.round( entropy("1223334444") * 100000 ) * 0.00001 == 1.84644 )</lang>
Seed7
<lang seed7>$ include "seed7_05.s7i";
include "float.s7i"; include "math.s7i";
const func float: entropy (in string: stri) is func
result var float: entropy is 0.0; local var hash [char] integer: count is (hash [char] integer).value; var char: ch is ' '; var float: p is 0.0; begin for ch range stri do if ch in count then incr(count[ch]); else count @:= [ch] 1; end if; end for; for key ch range count do p := flt(count[ch]) / flt(length(stri)); entropy -:= p * log(p) / log(2.0); end for; end func ;
const proc: main is func
begin writeln(entropy("1223334444") digits 5); end func;</lang>
- Output:
1.84644
Tcl
<lang tcl>proc entropy {str} {
    # Tally each character of str, then accumulate -p * log2(p)
    # over the tallies (dict incr auto-creates the counts variable).
    set n [string length $str]
    foreach char [split $str ""] {dict incr counts $char}
    set h 0.0
    set log2 [expr log(2)]
    dict for {char count} $counts {
        set p [expr {$count / double($n)}]
        set h [expr {$h - $p * log($p)/$log2}]
    }
    return $h
}</lang> Demonstration: <lang tcl>puts [format "entropy = %.5f" [entropy "1223334444"]] puts [format "entropy = %.5f" [entropy "Rosetta Code"]]</lang>
- Output:
entropy = 1.84644 entropy = 3.08496
XPL0
<lang XPL0>code real RlOut=48, Ln=54; \intrinsic routines string 0; \use zero-terminated strings
func StrLen(A); \Return number of characters in an ASCIIZ string char A; int I; for I:= 0, -1>>1-1 do
if A(I) = 0 then return I;
func real Entropy(Str); \Return Shannon entropy of string char Str; int Len, I, Count(128); real Sum, Prob; [Len:= StrLen(Str); for I:= 0 to 127 do Count(I):= 0; for I:= 0 to Len-1 do \count number of each character in string
Count(Str(I)):= Count(Str(I)) + 1;
Sum:= 0.0; for I:= 0 to 127 do
if Count(I) # 0 then \(avoid Ln(0.0) error) [Prob:= float(Count(I)) / float(Len); \probability of char in string Sum:= Sum + Prob*Ln(Prob); ];
return -Sum/Ln(2.0); ];
RlOut(0, Entropy("1223334444"))</lang>
- Output:
1.84644