Huffman coding
You are encouraged to solve this task according to the task description, using any language you may know.
Huffman encoding is a way to assign binary codes to symbols that reduces the overall number of bits used to encode a typical string of those symbols.
For example, if you use letters as symbols and have details of the frequency of occurrence of those letters in typical strings, then you could just encode each letter with a fixed number of bits, such as in ASCII codes. You can do better than this by encoding more frequently occurring letters, such as e and a, with shorter bit strings, and less frequently occurring letters, such as q and x, with longer bit strings.
Any string of letters will be encoded as a string of bits that no longer has the same length per letter. To successfully decode such a string, the shorter codes assigned to letters such as 'e' cannot occur as a prefix of the longer codes such as that for 'x'.
- If you were to assign a code 01 for 'e' and code 011 for 'x', then if the bits to decode started as 011... you would not know whether you should decode an 'e' or an 'x'.
The Huffman coding scheme takes each symbol and its weight (or frequency of occurrence), and generates proper encodings for each symbol taking account of the weights of each symbol, so that higher weighted symbols have fewer bits in their encoding. (See the WP article for more information).
A Huffman encoding can be computed by first creating a tree of nodes:
- Create a leaf node for each symbol and add it to the priority queue.
- While there is more than one node in the queue:
- Remove the node of highest priority (lowest probability) twice to get two nodes.
- Create a new internal node with these two nodes as children and with probability equal to the sum of the two nodes' probabilities.
- Add the new node to the queue.
- The remaining node is the root node and the tree is complete.
Traverse the constructed binary tree from root to leaves assigning and accumulating a '0' for one branch and a '1' for the other at each node. The accumulated zeroes and ones at each leaf constitute a Huffman encoding for those symbols and weights:
Using the characters and their frequency from the string "this is an example for huffman encoding", create a program to generate a Huffman encoding for each character as a table.
C
This code lacks a lot of needed checks, especially for memory allocation.
<lang c>#include <stdio.h>
- include <stdlib.h>
- include <string.h>
- define BYTES 256
/* One Huffman code: a bit pattern and the number of bits it occupies. */
typedef struct huffcode {
    int nbits;  /* length of the code, in bits */
    int code;   /* the code's bits, stored in the low nbits bits */
} huffcode_t;
/*
 * Array-backed priority queue of indices into a frequency table.
 * The entries are ordered by the frequency each index refers to.
 */
typedef struct huffheap {
    int *h;     /* the stored indices */
    int n;      /* number of indices currently stored in h */
    int s;      /* capacity of h, counted in entries */
    int cs;     /* amount by which the capacity is grown when h is full */
    long *f;    /* frequency table the indices refer to (not freed with the heap) */
} heap_t;
/* heap handling funcs */

/*
 * Allocate an empty heap.
 *
 *   s  initial capacity in entries; also used as the growth step
 *   f  frequency table used to order the stored indices; the heap keeps
 *      the pointer but never frees it
 *
 * Returns the new heap, or NULL if s is not positive or memory could not
 * be allocated. Release the heap with _heap_destroy().
 */
static heap_t *_heap_create(int s, long *f)
{
    heap_t *h;

    if (s <= 0)
        return NULL;

    h = malloc(sizeof *h);
    if (h == NULL)
        return NULL;

    h->h = malloc(sizeof *h->h * (size_t)s);
    if (h->h == NULL) {
        /* do not leak the header when the slot array cannot be allocated */
        free(h);
        return NULL;
    }

    h->s = h->cs = s;
    h->n = 0;
    h->f = f;
    return h;
}
/*
 * Release a heap and its slot array. The frequency table referenced by
 * the heap is left alone.
 */
static void _heap_destroy(heap_t *heap)
{
    int *slots = heap->h;   /* read before the header itself is released */

    free(heap);
    free(slots);
}
- define swap_(I,J) do { int t_; t_ = a[(I)]; \
a[(I)] = a[(J)]; a[(J)] = t_; } while(0)
static void _heap_sort(heap_t *heap) {
int i=1, j=2; /* gnome sort */ int *a = heap->h;
while(i < heap->n) { /* smaller values are kept at the end */ if ( heap->f[a[i-1]] >= heap->f[a[i]] ) { i = j; j++; } else { swap_(i-1, i); i--; i = (i==0) ? 1 : i; } }
}
- undef swap_
/*
 * Insert index c into the heap and restore the ordering.
 *
 * The slot array is grown by heap->cs entries (at least one) when it is
 * full. If the array cannot be grown the program is aborted: the function
 * has no way to report failure, and dropping the entry would silently
 * produce a wrong result.
 */
static void _heap_add(heap_t *heap, int c)
{
    if (heap->n + 1 > heap->s) {
        int step = (heap->cs > 0) ? heap->cs : 1;
        /* realloc takes a size in bytes, so scale the entry count */
        int *grown = realloc(heap->h,
                             sizeof *grown * (size_t)(heap->s + step));

        if (grown == NULL)
            abort();
        heap->h = grown;
        heap->s += step;
    }

    heap->h[heap->n] = c;
    heap->n++;
    _heap_sort(heap);
}
/*
 * Take the last stored index off the heap and return it.
 * Returns -1 when the heap holds no entries.
 */
static int _heap_remove(heap_t *heap)
{
    if (heap->n <= 0)
        return -1;

    heap->n -= 1;
    return heap->h[heap->n];
}
/*
 * Huffman code generator.
 *
 *   freqs  table of BYTES frequencies, one per symbol value; entries that
 *          are zero or negative mark symbols that do not occur
 *
 * Returns a malloc'ed array of BYTES pointers. The entry for a symbol that
 * occurs points to its code; the entry for a symbol that does not occur is
 * NULL. When only one symbol occurs, its code has nbits == 0. Release the
 * result with free_huffman_codes().
 *
 * Returns NULL if freqs is NULL, if memory cannot be allocated, or if a
 * code would need more bits than huffcode_t.code can hold.
 */
huffcode_t **create_huffman_codes(long *freqs)
{
    /* widest code that fits below the sign bit of an int (assumes 8-bit bytes) */
    const int max_bits = (int)(sizeof(int) * 8) - 1;
    huffcode_t **codes;
    heap_t *heap;
    long efreqs[BYTES*2];   /* [0,BYTES): symbols, [BYTES,2*BYTES): merged nodes */
    int preds[BYTES*2];     /* parent of each node; the sign selects the branch */
    int i, extf = BYTES;
    int r1, r2;
    int bc, bn, ix;

    if (freqs == NULL)
        return NULL;

    memcpy(efreqs, freqs, sizeof(long)*BYTES);
    memset(&efreqs[BYTES], 0, sizeof(long)*BYTES);

    /* every node starts out as its own parent, i.e. as a root */
    for (i = 0; i < BYTES*2; i++)
        preds[i] = i;

    heap = _heap_create(BYTES*2, efreqs);
    if (heap == NULL)
        return NULL;

    for (i = 0; i < BYTES; i++) {
        if (efreqs[i] > 0)
            _heap_add(heap, i);
    }

    /* repeatedly merge the two least frequent nodes into a new node */
    while (heap->n > 1) {
        r1 = _heap_remove(heap);
        r2 = _heap_remove(heap);
        efreqs[extf] = efreqs[r1] + efreqs[r2];
        _heap_add(heap, extf);
        preds[r1] = extf;       /* positive parent: contributes a 1 bit */
        preds[r2] = -extf;      /* negative parent: contributes a 0 bit */
        extf++;
    }

    /* the heap is empty when no symbol occurs; then there is no root to mark */
    r1 = _heap_remove(heap);
    if (r1 >= 0)
        preds[r1] = r1;
    _heap_destroy(heap);

    codes = malloc(sizeof *codes * BYTES);
    if (codes == NULL)
        return NULL;
    for (i = 0; i < BYTES; i++)
        codes[i] = NULL;

    for (i = 0; i < BYTES; i++) {
        if (efreqs[i] <= 0)
            continue;

        /*
         * Walk from the leaf up to the root. The branch taken nearest the
         * leaf lands in bit 0, so the branch taken at the root is the most
         * significant of the nbits bits.
         */
        bc = 0;
        bn = 0;
        for (ix = i; abs(preds[ix]) != ix; ix = abs(preds[ix])) {
            if (bn >= max_bits)
                goto fail;      /* shifting any further would be undefined */
            if (preds[ix] >= 0)
                bc |= 1 << bn;
            bn++;
        }

        codes[i] = malloc(sizeof *codes[i]);
        if (codes[i] == NULL)
            goto fail;
        codes[i]->nbits = bn;
        codes[i]->code = bc;
    }
    return codes;

fail:
    for (i = 0; i < BYTES; i++)
        free(codes[i]);
    free(codes);
    return NULL;
}
/*
 * Release a code table returned by create_huffman_codes(): each of the
 * BYTES entries and then the table itself.
 *
 * Passing NULL is allowed and does nothing, so the result of a failed
 * create_huffman_codes() call can be handed over unchecked.
 */
void free_huffman_codes(huffcode_t **c)
{
    int i;

    if (c == NULL)
        return;

    /* free(NULL) is a no-op, so unused entries need no special case */
    for (i = 0; i < BYTES; i++)
        free(c[i]);
    free(c);
}
- define MAXBITSPERCODE 100
/*
 * Write the low n bits of c into s as a string of '0' and '1' characters,
 * most significant bit first, followed by a terminating '\0'.
 *
 *   c  value whose bits are printed
 *   n  number of bits to print; a negative n is treated as 0
 *   s  destination buffer with room for at least n + 1 characters
 *
 * Nothing is written when s is NULL.
 */
void inttobits(int c, int n, char *s)
{
    /* work on the unsigned bit pattern: c % 2 is -1 for negative odd c */
    unsigned int bits = (unsigned int)c;

    if (s == NULL)
        return;
    if (n < 0)
        n = 0;

    s[n] = '\0';
    while (n > 0) {
        n--;
        s[n] = (char)('0' + (bits & 1u));
        bits >>= 1;
    }
}
/* Sample text whose character frequencies are used to build the codes. */
const char *test = "this is an example for huffman encoding";

/*
 * Count the characters of the sample text, build their Huffman codes and
 * print one line per occurring character: the character, its code as a
 * number, and its code as a bit string.
 *
 * Returns 0 on success and EXIT_FAILURE if the code table could not be
 * built.
 */
int main(void)
{
    huffcode_t **r;
    int i;
    char strbit[MAXBITSPERCODE];
    const char *p;
    long freqs[BYTES];

    memset(freqs, 0, sizeof freqs);

    /* plain char may be signed; index with the unsigned value */
    for (p = test; *p != '\0'; p++)
        freqs[(unsigned char)*p]++;

    r = create_huffman_codes(freqs);
    if (r == NULL) {
        fputs("huffman: could not build the code table\n", stderr);
        return EXIT_FAILURE;
    }

    for (i = 0; i < BYTES; i++) {
        if (r[i] != NULL) {
            inttobits(r[i]->code, r[i]->nbits, strbit);
            printf("%c (%d) %s\n", i, r[i]->code, strbit);
        }
    }

    free_huffman_codes(r);

    return 0;
}</lang>
Java
This implementation creates an actual tree structure, and then traverses the tree to recover the code. <lang java>import java.util.*;
/**
 * Common base of the nodes of a Huffman tree. Trees are ordered by their
 * frequency, so that a priority queue hands out the least frequent first.
 */
abstract class HuffmanTree implements Comparable<HuffmanTree> {
    /** The frequency of this tree. */
    public int frequency;

    public HuffmanTree(int freq) {
        frequency = freq;
    }

    /**
     * Compares on the frequency.
     *
     * Uses explicit comparisons rather than a subtraction, which can
     * overflow and report the wrong order for operands of opposite sign.
     *
     * @return a negative value, zero, or a positive value when this tree's
     *         frequency is less than, equal to, or greater than the other's
     */
    public int compareTo(HuffmanTree tree) {
        if (frequency < tree.frequency)
            return -1;
        if (frequency > tree.frequency)
            return 1;
        return 0;
    }
}
/** Leaf of a Huffman tree: a single character together with its frequency. */
class HuffmanLeaf extends HuffmanTree {
    /** The character this leaf represents. */
    public char value;

    /**
     * @param freq frequency of the character
     * @param val  the character itself
     */
    public HuffmanLeaf(int freq, char val) {
        super(freq);
        this.value = val;
    }
}
/**
 * Internal node of a Huffman tree. Its frequency is the sum of the
 * frequencies of its two subtrees.
 */
class HuffmanNode extends HuffmanTree {
    /** The left subtree. */
    public HuffmanTree left;
    /** The right subtree. */
    public HuffmanTree right;

    /**
     * @param l subtree stored as {@code left}
     * @param r subtree stored as {@code right}
     */
    public HuffmanNode(HuffmanTree l, HuffmanTree r) {
        super(l.frequency + r.frequency);
        this.left = l;
        this.right = r;
    }
}
public class HuffmanCode {
// input is an array of frequencies, indexed by character code public static HuffmanTree buildTree(int[] charFreqs) { PriorityQueue<HuffmanTree> trees = new PriorityQueue<HuffmanTree>(); // initially, we have a forest of leaves // one for each non-empty character for (int i = 0; i < charFreqs.length; i++) if (charFreqs[i] > 0) trees.offer(new HuffmanLeaf(charFreqs[i], (char)i));
assert trees.size() > 0; // loop until there is only one tree left while (trees.size() > 1) { // two trees with least frequency HuffmanTree a = trees.poll(); HuffmanTree b = trees.poll();
// put into new node and re-insert into queue trees.offer(new HuffmanNode(a, b)); } return trees.poll(); }
public static void printCodes(HuffmanTree tree, Stack<Boolean> prefix) { assert tree != null; if (tree instanceof HuffmanLeaf) { HuffmanLeaf leaf = (HuffmanLeaf)tree;
// print out character and frequency System.out.print(leaf.value + "\t" + leaf.frequency + "\t");
// print out code for this leaf, which is just the prefix for (boolean bit : prefix) System.out.print(bit ? '1' : '0'); System.out.println();
} else if (tree instanceof HuffmanNode) { HuffmanNode node = (HuffmanNode)tree;
// traverse left prefix.push(false); printCodes(node.left, prefix); prefix.pop();
// traverse right prefix.push(true); printCodes(node.right, prefix); prefix.pop(); } }
public static void main(String[] args) { String test = "this is an example for huffman encoding";
// we will assume that all our characters will have // code less than 256, for simplicity int[] charFreqs = new int[256]; // read each character and record the frequencies for (char c : test.toCharArray()) charFreqs[c]++;
// build tree HuffmanTree tree = buildTree(charFreqs);
// print out results System.out.println("SYMBOL\tWEIGHT\tHUFFMAN CODE"); printCodes(tree, new Stack<Boolean>()); }
}</lang>
Example output:
SYMBOL WEIGHT HUFFMAN CODE d 1 00000 t 1 00001 h 2 0001 s 2 0010 c 1 00110 x 1 00111 m 2 0100 o 2 0101 n 4 011 u 1 10000 l 1 10001 a 3 1001 r 1 10100 g 1 101010 p 1 101011 e 3 1011 i 3 1100 f 3 1101 6 111
Python
A slight modification of the method outlined in the task description allows the code to be accumulated as the heap is manipulated.
The output is sorted first on length of the code, then on the symbols.
<lang python>from heapq import heappush, heappop, heapify
def encode(symbol2weights):
    '''Huffman encode the given dict mapping symbols to weights.

    Returns a list of [symbol, code] pairs, where code is a string of '0'
    and '1' characters, sorted first on the length of the code and then on
    the pair itself. An empty dict gives an empty list; a dict with a
    single symbol gives that symbol the empty code.
    '''
    if not symbol2weights:
        return []
    # Each heap entry is [total weight, [symbol, code], [symbol, code], ...];
    # every symbol starts with an empty code. items() exists on Python 2 and 3.
    heap = [[float(wt), [sym, '']] for sym, wt in symbol2weights.items()]
    heapify(heap)
    while len(heap) > 1:
        lo = heappop(heap)
        hi = heappop(heap)
        # Merging puts every symbol of the two groups one level deeper, so
        # each of their codes gains a leading bit.
        for pair in lo[1:]:
            pair[1] = '0' + pair[1]
        for pair in hi[1:]:
            pair[1] = '1' + pair[1]
        heappush(heap, [lo[0] + hi[0]] + lo[1:] + hi[1:])
    return sorted(heappop(heap)[1:], key=lambda x: (len(x[-1]), x))
astring = "this is an example for huffman encoding" symbol2weights = dict((ch, astring.count(ch)) for ch in set(astring)) huff = encode(symbol2weights) print "\nSYMBOL\tWEIGHT\tHUFFMAN CODE" for h in huff:
print "%s\t%s\t%s" % (h[0], symbol2weights[h[0]], h[1])</lang>
Example output:
SYMBOL WEIGHT HUFFMAN CODE 6 101 n 4 010 a 3 1001 e 3 1100 f 3 1101 h 2 0001 i 3 1110 m 2 0010 o 2 0011 s 2 0111 g 1 00000 l 1 00001 p 1 01100 r 1 01101 t 1 10000 u 1 10001 x 1 11110 c 1 111110 d 1 111111