Knuth's algorithm S: Difference between revisions

Content added Content deleted

Inline

Revision as of 17:22, 8 November 2011

This is a method of randomly sampling n items from a set of M items, with equal probability; where M >= n and M, the number of items is unknown until the end. This means that the equal probability sampling should be maintained for all successive items > n as they become available (although the content of successive samples can change).

The algorithm

Select the first n items as the sample as they become available;
For the i-th item where i > n, have a random chance of n/i of keeping it. If failing this chance, the sample remains the same. If not, have it randomly (1/n) replace one of the previously selected n items of the sample.
Repeat #2 for any subsequent items.

The Task

Create a function s_of_n_creator that given $n$ the maximum sample size, returns a function s_of_n that takes one parameter, item.
Function s_of_n when called with successive items returns an equi-weighted random sample of up to n of its items so far, each time it is called, calculated using Knuths Algorithm S.
Test your functions by printing and showing the frequency of occurrences of the selected digits from 100,000 repetitions of:

Use the s_of_n_creator with n == 3 to generate an s_of_n.
call s_of_n with each of the digits 0 to 9 in order, keeping the returned three digits of its random sampling from its last call with argument item=9.

Note: A class taking n and generating a callable instance/function might also be used.

Reference

The Art of Computer Programming, Vol 2, 3.4.2 p.142

Cf.

Common Lisp

<lang lisp>(setf *random-state* (make-random-state t))

(defun make-selector (n)

 (let ((i 0) res)
   (lambda (&optional (x nil x-p))
     (if (and x-p (< (random (incf i)) n))

(if (< (length res) n) (push x res) (setf (elt res (random n)) x)))

     res)))

test

(loop repeat 100000

     with freq = (make-array 10 :initial-element 0)
     do (let ((f (make-selector 3)))

(mapc f '(0 1 2 3 4 5 6 7 8 9)) (mapc (lambda (i) (incf (aref freq i))) (funcall f)))

     finally (prin1 freq))</lang>output<lang>#(30026 30023 29754 30017 30267 29997 29932 29990 29965 30029)</lang>

D

Translation of: Python

<lang d>import std.stdio, std.random, std.range, std.algorithm;

auto s_of_n_creator(in int n) {

   int[] sample;
   int i;
   typeof(sample) s_of_n(in int item) {
       i++;
       if (i <= n) {
           // Keep first n items
           sample ~= item;
       } else if (uniform(0.0, 1.0) < (cast(double)n / i)) {
           // Keep item
           sample = remove(sample, uniform(0, n));
           sample ~= item;
       }
       return sample;
   }
   return &s_of_n;

}

void main() {

   int[10] bin;
   auto items = iota(bin.length);
   writeln("Single run samples for n = 3:");
   auto s_of_n1 = s_of_n_creator(3);
   foreach (item; items) {
       auto sample = s_of_n1(item);
       writefln("  Item: %d -> sample: %s", item, sample);
   }

   enum nruns = 1_000_000;
   foreach (trial; 0 .. nruns) {
       auto s_of_n2 = s_of_n_creator(3);
       int[] sample;
       foreach (item; items)
           sample = s_of_n2(item);
       foreach (s; sample)
           bin[s]++;
   }
   writefln("\nTest item frequencies for %d runs:", nruns);
   foreach (i, d; bin)
       writefln("  %d: %d", i, d);

}</lang> Example output:

Single run samples for n = 3:
  Item: 0 -> sample: [0]
  Item: 1 -> sample: [0, 1]
  Item: 2 -> sample: [0, 1, 2]
  Item: 3 -> sample: [0, 1, 2]
  Item: 4 -> sample: [0, 1, 4]
  Item: 5 -> sample: [0, 1, 4]
  Item: 6 -> sample: [1, 4, 6]
  Item: 7 -> sample: [1, 6, 7]
  Item: 8 -> sample: [6, 7, 8]
  Item: 9 -> sample: [7, 8, 9]

Test item frequencies for 1000000 runs:
  0: 300033
  1: 299914
  2: 299505
  3: 300033
  4: 299704
  5: 299893
  6: 299871
  7: 300297
  8: 300292
  9: 300458

Go

<lang go>package main

import (

   "fmt"
   "rand"
   "time"

)

func sOfNCreator(n int) func(byte) []byte {

   s := make([]byte, 0, 3)
   m := n
   return func(item byte) []byte {
       if len(s) < 3 {
           s = append(s, item)
       } else {
           m++
           if rand.Intn(m) < n {
               s[rand.Intn(n)] = item
           }
       }
       return s
   }

}

func main() {

   rand.Seed(time.Nanoseconds())
   var freq [10]int
   for r := 0; r < 1e5; r++ {
       sOfN := sOfNCreator(3)
       for d := byte('0'); d < '9'; d++ {
           sOfN(d)
       }
       for _, d := range sOfN('9') {
           freq[d-'0']++
       }
   }
   fmt.Println(freq)

}</lang> Output:

[30075 29955 30024 30095 30031 30018 29973 29642 30156 30031]

J

Note that this approach introduces heavy inefficiencies, to achieve information hiding.

<lang j>coclass'inefficient'

 create=:3 :0
   N=: y
   ITEMS=: 
   K=:0
 )

 s_of_n=:3 :0
   K=: K+1
   if. N>#ITEMS do.
     ITEMS=: ITEMS,y
   else.
     if. (N%K)>?0 do.
       ITEMS=: (((i.#ITEMS)-.?N){ITEMS),y
     else.
       ITEMS
     end.
   end.
 )

s_of_n_creator_base_=: 1 :0

 ctx=: conew&'inefficient' m
 s_of_n__ctx

)</lang>

Required example:

<lang j>run=:3 :0

 nl=. conl 1
 s3_of_n=. 3 s_of_n_creator
 r=. {: s3_of_n"0 i.10
 coerase (conl 1)-.nl
 r

)

  (~.,._1 + #/.~) (i.10),,D=:run"0 i.1e5

0 30099 1 29973 2 29795 3 29995 4 29996 5 30289 6 29903 7 29993 8 30215 9 29742</lang>

OCaml

<lang ocaml>let s_of_n_creator n =

 let i = ref 0
 and sample = ref [| |] in
 function item ->
   incr i;
   if !i <= n then sample := Array.append [| item |] !sample
   else if Random.int !i < n then !sample.(Random.int n) <- item;
   !sample

let test n items_set =

 let s_of_n = s_of_n_creator n in
 Array.fold_left (fun _ v -> s_of_n v) [| |] items_set

let () =

 Random.self_init();
 let n = 3 in
 let num_items = 10 in
 let items_set = Array.init num_items (fun i -> i) in
 let results = Array.create num_items 0 in
 for i = 1 to 100_000 do
   let res = test n items_set in
   Array.iter (fun j -> results.(j) <- succ results.(j)) res
 done;
 Array.iter (Printf.printf " %d") results;
 print_newline()</lang>

Output:

 30051 29899 30249 30058 30012 29836 29998 29882 30148 29867

PicoLisp

<lang PicoLisp>(de s_of_n_creator (@N)

  (curry (@N (I . 0) (Res)) (Item)
     (cond
        ((>= @N (inc 'I)) (push 'Res Item))
        ((>= @N (rand 1 I)) (set (nth Res (rand 1 @N)) Item)) )
     Res ) )

(let Freq (need 10 0)

  (do 100000
     (let S_of_n (s_of_n_creator 3)
        (for I (mapc S_of_n (0 1 2 3 4 5 6 7 8 9))
           (inc (nth Freq (inc I))) ) ) )
  Freq )</lang>

Output:

-> (30003 29941 29918 30255 29848 29875 30056 29839 30174 30091)

Python

<lang python>from random import random, randrange

def s_of_n_creator(n):

   sample, i = [], 0
   def s_of_n(item):
       nonlocal i

       i += 1
       if i <= n:
           # Keep first n items
           sample.append(item)
       elif random() < n / i:
           # Keep item
           del sample[randrange(n)]
           sample.append(item)
       return sample
   return s_of_n

if __name__ == '__main__':

   bin = [0]* 10
   items = range(10)
   print("Single run samples for n = 3:")
   s_of_n = s_of_n_creator(3)
   for item in items:
       sample = s_of_n(item)
       print("  Item: %i -> sample: %s" % (item, sample))
   #
   for trial in range(100000):
       s_of_n = s_of_n_creator(3)
       for item in items:
           sample = s_of_n(item)
       for s in sample:
           bin[s] += 1
   print("\nTest item frequencies for 100000 runs:\n ",
         '\n  '.join("%i:%i" % x for x in enumerate(bin)))</lang>

Sample output

Single run samples for n = 3:
  Item: 0 -> sample: [0]
  Item: 1 -> sample: [0, 1]
  Item: 2 -> sample: [0, 1, 2]
  Item: 3 -> sample: [0, 1, 3]
  Item: 4 -> sample: [0, 1, 3]
  Item: 5 -> sample: [0, 1, 3]
  Item: 6 -> sample: [0, 1, 3]
  Item: 7 -> sample: [0, 3, 7]
  Item: 8 -> sample: [0, 3, 7]
  Item: 9 -> sample: [0, 3, 7]

Test item frequencies for 100000 runs:
  0:29983
  1:30240
  2:29779
  3:29921
  4:30224
  5:29967
  6:30036
  7:30050
  8:29758
  9:30042

Python Class based version

Only a slight change creates the following class-based implementation: <lang python>class S_of_n_creator():

   def __init__(self, n):
       self.n = n
       self.i = 0
       self.sample = []
   
   def __call__(self, item):
       self.i += 1
       n, i, sample = self.n, self.i, self.sample
       if i <= n:
           # Keep first n items
           sample.append(item)
       elif random() < n / i:
           # Keep item
           del sample[randrange(n)]
           sample.append(item)
       return sample</lang>

The above can be instantiated as follows after which s_of_n can be called in the same way as it is in the first example where it is a function instead of an instance. <lang python>s_of_n = S_of_n_creator(3)</lang>

Tcl

<lang tcl>package require Tcl 8.6

oo::class create SofN {

   variable items size count
   constructor {n} {

set size $n

   }
   method item {item} {

if {[incr count] <= $size} { lappend items $item } elseif {rand()*$count < $size} { lset items [expr {int($size * rand())}] $item } return $items

}

Test code

for {set i 0} {$i < 100000} {incr i} {

   set sOf3 [SofN new 3]
   foreach digit {0 1 2 3 4 5 6 7 8 9} {

set digs [$sOf3 item $digit]

   }
   $sOf3 destroy
   foreach digit $digs {

incr freq($digit)

} parray freq</lang>

Sample output:

freq(0) = 29812
freq(1) = 30099
freq(2) = 29927
freq(3) = 30106
freq(4) = 30048
freq(5) = 29993
freq(6) = 29912
freq(7) = 30219
freq(8) = 30060
freq(9) = 29824

@@ Line 238: / Line 238: @@
   Array.iter (Printf.printf " %d") results;
   print_newline()</lang>
+Output:
+<pre> 30051 29899 30249 30058 30012 29836 29998 29882 30148 29867</pre>
 =={{header|PicoLisp}}==