LZW compression

From Rosetta Code

Jump to: navigation, search

Programming Task
This is a programming task. It lays out a problem which Rosetta Code users are encouraged to solve, using languages they know.

Code examples should be formatted along the lines of one of the existing prototypes.

The Lempel-Ziv-Welch (LZW) algorithm provides lossless data compression. You can read a complete description of it in the Wikipedia article on the subject. It was patented, but it fell in the public domain in 2004.

Contents

[edit] D

D 1, with Phobos, from the Python version (the final writefln works only on 7-bit ASCII strings):

 
import std.stdio: writefln;
 
/// Compress a string to a list of output symbols.
int[] compress(string uncompressed) {
    // build the dictionary
    int dict_size = 256;
    int[string] dictionary;
    for (int i; i < dict_size; i++)
        dictionary["" ~ cast(char)i] = i;
 
    string w;
    int[] result;
    foreach (c; uncompressed) {
        string wc = w ~ c;
        if (wc in dictionary)
            w = wc;
        else {
            result ~= dictionary[w];
            // add wc to the dictionary
            dictionary[wc] = dict_size;
            dict_size++;
            w = "" ~ c;
        }
    }
 
    // output the code for w
    if (w.length)
        foreach (c; w)
            result ~= c;
    return result;
}
 
 
/// Decompress a list of output ks to a string.
string decompress(int[] compressed) {
    // build the dictionary
    int dict_size = 256;
    string[int] dictionary;
    for (int i; i < dict_size; i++)
        dictionary[i] = "" ~ cast(char)i;
 
    string w = "" ~ cast(char)compressed[0];
    string result = w;
    foreach (k; compressed[1 .. $]) {
        string entry;
        if (k in dictionary)
            entry = dictionary[k];
        else if (k == dictionary.length)
            entry = w ~ w[0];
        else
            throw new Exception("Bad compressed k");
        result ~= entry;
 
        // add w+entry[0] to the dictionary
        dictionary[dict_size] = w ~ entry[0];
        dict_size++;
 
        w = entry;
    }
    return result;
}
 
void main() {
    auto compressed = compress("TOBEORNOTTOBEORTOBEORNOT");
    writefln(compressed);
 
    auto decompressed = decompress(compressed);
    writefln(decompressed);
}
 

The output:

 
[84,79,66,69,79,82,78,79,84,256,258,260,265,259,261,79,84]
TOBEORNOTTOBEORTOBEORNOT
 

[edit] Forth

Works with: GNU Forth version 0.6.2

256 value next-symbol

\ current string fragment

create w 256 allot     \ counted string

: w=c ( c -- )       w 1+ c!        1 w c! ;
: w+c ( c -- )  w count + c!  w c@ 1+ w c! ;
\ Compression

\ dictionary of strings to symbols
0 value dict

: init-dict  table to dict  256 to next-symbol  dict set-current ;

: free-dict                           forth-wordlist set-current ;

: in-dict? ( key len -- ? )		\ can assume len > 1
  dict search-wordlist dup if nip then ;

: lookup-dict ( key len -- symbol )
  dup 1 = if drop c@ exit then
  dict search-wordlist if >body @ else abort" bad-dict!" then ;

: put-dict ( data key len -- )
  nextname create , ;

\ output buffer of symbols
\  in real life, these symbols would be packed into octets
variable out-size
create out 256 cells allot

: output ( symbol -- )
  dup out out-size @ cells + !  1 out-size +!
  dup 256 < if emit space else . then ;

: compress ( addr len -- )
  init-dict  0 out-size !
  over c@ w=c  1 /string
  bounds do
    i c@ w+c
    w count in-dict? 0= if
      w count 1- lookup-dict output
      next-symbol dup w count put-dict
      1+ to next-symbol
      i c@ w=c
    then
  loop
  w count bounds do i c@ output loop
  free-dict ;
\ Decompression

\ array of symbols to strings (in real code this would need to be growable)
\  next-symbol is reused for the size of this table
create symtab 256 cells allot
0 value start

: init-symtab  256 to next-symbol  here to start ;

: free-symtab  start here - allot ;

: get-symbol ( symbol -- addr len )
  dup 256 < if pad c! pad 1 exit then
  256 - cells symtab + @ count ;

: add-symbol ( addr len -- )
  here symtab next-symbol 256 - cells + !
  s,
  next-symbol 1+ to next-symbol ;

create entry 256 allot

: decompress ( addr len -- )
  init-symtab
  over @ dup emit w=c
  cells bounds cell+ do
    i @ next-symbol < if
      i @ get-symbol entry place
    else i @ next-symbol = if
      w 1+ c@ w count + c!  w count 1+ entry place
    else
      abort" bad symbol!"
    then then
    entry count type	\ output
    entry 1+ c@ w+c
    w count add-symbol
    entry count w place
  1 cells +loop
  free-symtab ;
\ Testing

s" TOBEORNOTTOBEORTOBEORNOT" compress cr
\ T O B E O R N O T 256 258 260 265 259 261 O T

out out-size @ decompress cr
\ TOBEORNOTTOBEORTOBEORNOT

[edit] OCaml

#directory "+site-lib/extlib/"
#load "extLib.cma"
open ExtString
 
(** compress a string to a list of output symbols *)
let compress ~uncompressed =
  (* build the dictionary *)
  let dict_size = 256 in
  let dictionary = Hashtbl.create 397 in
  for i=0 to 255 do
    let str = String.make 1 (char_of_int i) in
    Hashtbl.add dictionary str i
  done;
 
  let f = (fun (w, dict_size, result) c ->
    let c = String.make 1 c in
    let wc = w ^ c in
    if Hashtbl.mem dictionary wc then
      (wc, dict_size, result)
    else
      begin
        (* add wc to the dictionary *)
        Hashtbl.add dictionary wc dict_size;
        let this = Hashtbl.find dictionary w in
        (c, dict_size + 1, this::result)
      end
  ) in
  let w, _, result =
    String.fold_left f ("", dict_size, []) uncompressed
  in
 
  (* output the code for w *)
  let result =
    if w = ""
    then result
    else (Hashtbl.find dictionary w) :: result
  in
 
  (List.rev result)
;;
 
exception ValueError of string
 
(** decompress a list of output symbols to a string *)
let decompress ~compressed =
  (* build the dictionary *)
  let dict_size = 256 in
  let dictionary = Hashtbl.create 397 in
  for i=0 to pred dict_size do
    let str = String.make 1 (char_of_int i) in
    Hashtbl.add dictionary i str
  done;
 
  let w, compressed =
    match compressed with
    | hd::tl -> (String.make 1 (char_of_int hd)), tl
    | [] -> failwith "empty input"
  in
 
  let result = w::[] in
 
  let result, _, _ =
    List.fold_left (fun (result, w, dict_size) k ->
      let entry =
        if Hashtbl.mem dictionary k then
          Hashtbl.find dictionary k
        else if k = Hashtbl.length dictionary then
          w ^ (String.make 1 w.[0])
        else
          raise(ValueError(Printf.sprintf "Bad compressed k: %d" k))
      in
      let result = entry :: result in
 
      (* add (w ^ entry.[0]) to the dictionary *)
      Hashtbl.add dictionary dict_size (w ^ (String.make 1 entry.[0]));
      (result, entry, dict_size + 1)
    ) (result, w, dict_size) compressed
  in
  (List.rev result)
;;

here is the interface:

val compress : uncompressed:string -> int list
val decompress : compressed:int list -> string list

How to use:
The compressed datas are a list of symbols (of type int) that will require more than 8 bits to be saved. So to know how many bits are required, you need to know how many bits are required for the greatest symbol in the list.

let greatest = List.fold_left (fun m x -> max m x) 0 ;;
 
(** number of bits needed to encode the integer m *)
let n_bits m =
  let m = float m in
  let rec aux n =
    let max = (2. ** n) -. 1. in
    if max >= m then int_of_float n
    else aux (n +. 1.0)
  in
  aux 1.0
;;
 
let write_compressed ~filename ~compressed =
  let nbits = n_bits(greatest compressed) in
  let oc = open_out filename in
  output_byte oc nbits;
  let ob = IO.output_bits(IO.output_channel oc) in
  List.iter (fun code ->
      IO.write_bits ob nbits code
    ) compressed;
  IO.flush_bits ob;
  close_out oc;
;;
 
let read_compressed ~filename =
  let ic = open_in filename in
  let nbits = input_byte ic in
  let ib = IO.input_bits(IO.input_channel ic) in
  let rec loop acc =
    try
      let code = IO.read_bits ib nbits in
      loop (code::acc)
    with _ -> List.rev acc
  in
  let compressed = loop [] in
  let result = decompress ~compressed in
  let buf = Buffer.create 2048 in
  List.iter (Buffer.add_string buf) result;
  (Buffer.contents buf)
;;


[edit] Python

In this version the dicts contain mixed typed data:

 
def compress(uncompressed):
    """Compress a string to a list of output symbols."""
 
    # Build the dictionary.
    dict_size = 256
    dictionary = dict((chr(i), chr(i)) for i in xrange(dict_size))
 
    w = ""
    result = []
    for c in uncompressed:
        wc = w + c
        if wc in dictionary:
            w = wc
        else:
            result.append(dictionary[w])
            # Add wc to the dictionary.
            dictionary[wc] = dict_size
            dict_size += 1
            w = c
 
    # Output the code for w.
    if w:
        result.extend(w)
    return result
 
 
def decompress(compressed):
    """Decompress a list of output ks to a string."""
 
    # Build the dictionary.
    dict_size = 256
    dictionary = dict((chr(i), chr(i)) for i in xrange(dict_size))
 
    w = result = compressed.pop(0)
    for k in compressed:
        if k in dictionary:
            entry = dictionary[k]
        elif k == len(dictionary):
            entry = w + w[0]
        else:
            raise ValueError('Bad compressed k: %s' % k)
        result += entry
 
        # Add w+entry[0] to the dictionary.
        dictionary[dict_size] = w + entry[0]
        dict_size += 1
 
        w = entry
    return result
 
 
# How to use:
compressed = compress('TOBEORNOTTOBEORTOBEORNOT')
print compressed
decompressed = decompress(compressed)
print decompressed
 

Output:

 
['T', 'O', 'B', 'E', 'O', 'R', 'N', 'O', 'T', 256, 258, 260, 265, 259, 261, 'O', 'T']
TOBEORNOTTOBEORTOBEORNOT
 
Personal tools