Knuth's algorithm S: Difference between revisions

From Rosetta Code
Content added Content deleted
(→‎{{header|PARI/GP}}: 100,000 times)
Line 494: Line 494:


=={{header|PARI/GP}}==
=={{header|PARI/GP}}==
{{incomplete|PARI/GP|Does not return a function.}}
{{improve|PARI/GP|Does not return a function.}}
<lang parigp>KnuthS(v,n)={
<lang parigp>KnuthS(v,n)={
my(u=vector(n,i,i));
my(u=vector(n,i,i));
Line 502: Line 502:
vecextract(v,u)
vecextract(v,u)
};
};
test()={
KnuthS([0,1,2,3,4,5,6,7,8,9],3)</lang>
my(v=vector(10),t);
for(i=1,1e5,
t=KnuthS([0,1,2,3,4,5,6,7,8,9],3);
v[t[1]+1]++;v[t[2]+1]++;v[t[3]+1]++
);
for(i=0,9,print(i" "v[i+1]))
};</lang>


Output:
Output:
<pre>%1 = [8, 4, 2]</pre>
<pre>0 30048
1 29902
2 30168
3 29950
4 30003
5 30094
6 29905
7 30286
8 29839
9 29805</pre>


=={{header|Perl}}==
=={{header|Perl}}==

Revision as of 08:34, 23 November 2011

Task
Knuth's algorithm S
You are encouraged to solve this task according to the task description, using any language you may know.

This is a method of randomly sampling n items from a set of M items, with equal probability; where M >= n and M, the number of items is unknown until the end. This means that the equal probability sampling should be maintained for all successive items > n as they become available (although the content of successive samples can change).

The algorithm
  1. Select the first n items as the sample as they become available;
  2. For the i-th item where i > n, have a random chance of n/i of keeping it. If failing this chance, the sample remains the same. If not, have it randomly (1/n) replace one of the previously selected n items of the sample.
  3. Repeat #2 for any subsequent items.
The Task
  1. Create a function s_of_n_creator that given the maximum sample size, returns a function s_of_n that takes one parameter, item.
  2. Function s_of_n when called with successive items returns an equi-weighted random sample of up to n of its items so far, each time it is called, calculated using Knuths Algorithm S.
  3. Test your functions by printing and showing the frequency of occurrences of the selected digits from 100,000 repetitions of:
  1. Use the s_of_n_creator with n == 3 to generate an s_of_n.
  2. call s_of_n with each of the digits 0 to 9 in order, keeping the returned three digits of its random sampling from its last call with argument item=9.

Note: A class taking n and generating a callable instance/function might also be used.

Reference
  • The Art of Computer Programming, Vol 2, 3.4.2 p.142
Cf.

C

Instead of returning a closure we set the environment in a structure:

<lang c>#include <stdlib.h>

  1. include <stdio.h>
  2. include <string.h>
  3. include <time.h>

struct s_env {

   unsigned int n, i;
   size_t size;
   void *sample;

};

void s_of_n_init(struct s_env *s_env, size_t size, unsigned int n) {

   s_env->i = 0;
   s_env->n = n;
   s_env->size = size;
   s_env->sample = malloc(n * size);

}

void sample_set_i(struct s_env *s_env, unsigned int i, void *item) {

   memcpy(s_env->sample + i * s_env->size, item, s_env->size);

}

void *s_of_n(struct s_env *s_env, void *item) {

   s_env->i++;
   if (s_env->i <= s_env->n)
       sample_set_i(s_env, s_env->i - 1, item);
   else if ((rand() % s_env->i) < s_env->n)
       sample_set_i(s_env, rand() % s_env->n, item);
   return s_env->sample;

}

int *test(unsigned int n, int *items_set, unsigned int num_items) {

   int i;
   int *sample;
   struct s_env s_env;
   s_of_n_init(&s_env, sizeof(items_set[0]), n);
   for (i = 0; i < num_items; i++) {
       s_of_n(&s_env, (void *) &items_set[i]);
   }
   return (int *)s_env.sample;

}

int main() {

   unsigned int i, j;
   unsigned int n = 3;
   unsigned int num_items = 10;
   unsigned int *frequencies;
   int *items_set;
   srand(time(NULL));
   items_set = malloc(num_items * sizeof(int));
   frequencies = malloc(num_items * sizeof(int));
   for (i = 0; i < num_items; i++) {
       items_set[i] = i;
       frequencies[i] = 0;
   }
   for (i = 0; i < 100000; i++) {
       int *res = test(n, items_set, num_items);
       for (j = 0; j < n; j++) {
           frequencies[res[j]]++;
       }

free(res);

   }
   for (i = 0; i < num_items; i++) {
       printf(" %d", frequencies[i]);
   }
   puts("");
   return 0;

}</lang>

C++

Works with: C++11

<lang cpp>#include <iostream>

  1. include <functional>
  2. include <vector>
  3. include <cstdlib>
  4. include <ctime>

template <typename T> std::function<std::vector<T>(T)> s_of_n_creator(int n) {

 std::vector<T> sample;
 int i = 0;
 return [=](T item) mutable {
   i++;
   if (i <= n) {
     sample.push_back(item);
   } else if (std::rand() % i < n) {
     sample[std::rand() % n] = item;
   }
   return sample;
 };

}

int main() {

 std::srand(std::time(NULL));
 int bin[10] = {0};
 for (int trial = 0; trial < 100000; trial++) {
   auto s_of_n = s_of_n_creator<int>(3);
   std::vector<int> sample;
   for (int i = 0; i < 10; i++)
     sample = s_of_n(i);
   for (int s : sample)
     bin[s]++;
 }
 for (int x : bin)
   std::cout << x << std::endl;
 return 0;

}</lang>

Output:

30052
29740
30197
30223
29857
29688
30095
29803
30098
30247

Common Lisp

This example is in need of improvement:

Task function names ignored

<lang lisp>(setf *random-state* (make-random-state t))

(defun make-selector (n)

 (let ((i 0) res)
   (lambda (&optional (x nil x-p))
     (if (and x-p (< (random (incf i)) n))

(if (< (length res) n) (push x res) (setf (elt res (random n)) x)))

     res)))
test

(loop repeat 100000

     with freq = (make-array 10 :initial-element 0)
     do (let ((f (make-selector 3)))

(mapc f '(0 1 2 3 4 5 6 7 8 9)) (mapc (lambda (i) (incf (aref freq i))) (funcall f)))

     finally (prin1 freq))</lang>output<lang>#(30026 30023 29754 30017 30267 29997 29932 29990 29965 30029)</lang>

D

Translation of: Python

<lang d>import std.stdio, std.random, std.range, std.algorithm;

auto s_of_n_creator(in int n) {

   int[] sample;
   int i;
   auto s_of_n(in int item) {
       i++;
       if (i <= n) {
           sample ~= item;
       } else if (uniform(0.0, 1.0) < (cast(double)n / i)) {
           sample = remove(sample, uniform(0, n));
           sample ~= item;
       }
       return sample;
   }
   return &s_of_n;

}

void main() {

   int[10] bin;
   auto items = iota(bin.length);
   writeln("Single run samples for n = 3:");
   auto s_of_n1 = s_of_n_creator(3);
   foreach (item; items) {
       auto sample = s_of_n1(item);
       writefln("  Item: %d -> sample: %s", item, sample);
   }
   enum nruns = 100_000;
   foreach (trial; 0 .. nruns) {
       auto s_of_n2 = s_of_n_creator(3);
       int[] sample;
       foreach (item; items)
           sample = s_of_n2(item);
       foreach (s; sample)
           bin[s]++;
   }
   writefln("\nTest item frequencies for %d runs:", nruns);
   foreach (d; bin)
       write(d, " ");

}</lang> Example output:

Single run samples for n = 3:
  Item: 0 -> sample: [0]
  Item: 1 -> sample: [0, 1]
  Item: 2 -> sample: [0, 1, 2]
  Item: 3 -> sample: [1, 2, 3]
  Item: 4 -> sample: [1, 3, 4]
  Item: 5 -> sample: [1, 3, 4]
  Item: 6 -> sample: [1, 3, 4]
  Item: 7 -> sample: [1, 3, 7]
  Item: 8 -> sample: [1, 7, 8]
  Item: 9 -> sample: [1, 7, 8]

Test item frequencies for 100000 runs:
29926 30033 30145 30172 29882 30104 29773 30071 29858 30036

Go

<lang go>package main

import (

   "fmt"
   "rand"
   "time"

)

func sOfNCreator(n int) func(byte) []byte {

   s := make([]byte, 0, n)
   m := n
   return func(item byte) []byte {
       if len(s) < n {
           s = append(s, item)
       } else {
           m++
           if rand.Intn(m) < n {
               s[rand.Intn(n)] = item
           }
       }
       return s
   }

}

func main() {

   rand.Seed(time.Nanoseconds())
   var freq [10]int
   for r := 0; r < 1e5; r++ {
       sOfN := sOfNCreator(3)
       for d := byte('0'); d < '9'; d++ {
           sOfN(d)
       }
       for _, d := range sOfN('9') {
           freq[d-'0']++
       }
   }
   fmt.Println(freq)

}</lang> Output:

[30075 29955 30024 30095 30031 30018 29973 29642 30156 30031]

J

Note that this approach introduces heavy inefficiencies, to achieve information hiding.

<lang j>coclass'inefficient'

 create=:3 :0
   N=: y
   ITEMS=: 
   K=:0
 )
 s_of_n=:3 :0
   K=: K+1
   if. N>#ITEMS do.
     ITEMS=: ITEMS,y
   else.
     if. (N%K)>?0 do.
       ITEMS=: (((i.#ITEMS)-.?N){ITEMS),y
     else.
       ITEMS
     end.
   end.
 )


s_of_n_creator_base_=: 1 :0

 ctx=: conew&'inefficient' m
 s_of_n__ctx

)</lang>

Required example:

<lang j>run=:3 :0

 nl=. conl 1
 s3_of_n=. 3 s_of_n_creator
 r=. {: s3_of_n"0 i.10
 coerase (conl 1)-.nl
 r

)

  (~.,._1 + #/.~) (i.10),,D=:run"0 i.1e5

0 30099 1 29973 2 29795 3 29995 4 29996 5 30289 6 29903 7 29993 8 30215 9 29742</lang>

Java

A class-based solution: <lang java>import java.util.*;

class SOfN<T> {

   private static final Random rand = new Random();
   private List<T> sample;
   private int i = 0;
   private int n;
   public SOfN(int _n) {

n = _n; sample = new ArrayList<T>(n);

   }
   public List<T> process(T item) {

i++; if (i <= n) {

           sample.add(item);

} else if (rand.nextInt(i) < n) { sample.set(rand.nextInt(n), item); } return sample;

   }

}

public class AlgorithmS {

   public static void main(String[] args) {

int[] bin = new int[10]; for (int trial = 0; trial < 100000; trial++) { SOfN<Integer> s_of_n = new SOfN<Integer>(3); List<Integer> sample = null; for (int i = 0; i < 10; i++) sample = s_of_n.process(i); for (int s : sample) bin[s]++; } System.out.println(Arrays.toString(bin));

   }

}</lang>

Output:

[30115, 30141, 30050, 29887, 29765, 30132, 29767, 30114, 30079, 29950]

Alternative solution without using an explicitly named type; instead using an anonymous class implementing a generic "function" interface: <lang java>import java.util.*;

interface Function<S, T> {

   public T call(S x);

}

public class AlgorithmS {

   private static final Random rand = new Random();
   public static <T> Function<T, List<T>> s_of_n_creator(final int n) {

return new Function<T, List<T>>() { private List<T> sample = new ArrayList<T>(n); private int i = 0; public List<T> call(T item) { i++; if (i <= n) { sample.add(item); } else if (rand.nextInt(i) < n) { sample.set(rand.nextInt(n), item); } return sample; } };

   }
   public static void main(String[] args) {

int[] bin = new int[10]; for (int trial = 0; trial < 100000; trial++) { Function<Integer, List<Integer>> s_of_n = s_of_n_creator(3); List<Integer> sample = null; for (int i = 0; i < 10; i++) sample = s_of_n.call(i); for (int s : sample) bin[s]++; } System.out.println(Arrays.toString(bin));

   }

}</lang>

Objective-C

Works with: Mac OS X version 10.6+

Uses blocks <lang objc>#import <Foundation/Foundation.h>

typedef NSArray *(^SOfN)(id);

SOfN s_of_n_creator(int n) {

 NSMutableArray *sample = [NSMutableArray arrayWithCapacity:n];
 __block int i = 0;
 return [[^(id item) {
   i++;
   if (i <= n) {
     [sample addObject:item];
   } else if (rand() % i < n) {
     [sample replaceObjectAtIndex:rand() % n withObject:item];
   }
   return sample;
 } copy] autorelease];

}

int main(int argc, const char *argv[]) {

 NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
 NSCountedSet *bin = [[NSCountedSet alloc] init];
 for (int trial = 0; trial < 100000; trial++) {
   SOfN s_of_n = s_of_n_creator(3);
   NSArray *sample;
   for (int i = 0; i < 10; i++)
     sample = s_of_n([NSNumber numberWithInt:i]);
   [bin addObjectsFromArray:sample];
 }
 NSLog(@"%@", bin);
 [bin release];
 
 [pool release];
 return 0;

}</lang>

Log:

<NSCountedSet: 0x100114120> (0 [29934], 9 [30211], 5 [29926], 1 [30067], 6 [30001], 2 [29972], 7 [30126], 3 [29944], 8 [29910], 4 [29909])

OCaml

<lang ocaml>let s_of_n_creator n =

 let i = ref 0
 and sample = ref [| |] in
 fun item ->
   incr i;
   if !i <= n then sample := Array.append [| item |] !sample
   else if Random.int !i < n then !sample.(Random.int n) <- item;
   !sample

let test n items_set =

 let s_of_n = s_of_n_creator n in
 Array.fold_left (fun _ v -> s_of_n v) [| |] items_set

let () =

 Random.self_init();
 let n = 3 in
 let num_items = 10 in
 let items_set = Array.init num_items (fun i -> i) in
 let results = Array.create num_items 0 in
 for i = 1 to 100_000 do
   let res = test n items_set in
   Array.iter (fun j -> results.(j) <- succ results.(j)) res
 done;
 Array.iter (Printf.printf " %d") results;
 print_newline()</lang>

Output:

 30051 29899 30249 30058 30012 29836 29998 29882 30148 29867

PARI/GP

This example is in need of improvement:

Does not return a function.

<lang parigp>KnuthS(v,n)={

 my(u=vector(n,i,i));
 for(i=n+1,#v,
   if(random(i)<n,u[random(n)+1]=i)
 );
 vecextract(v,u)

}; test()={

 my(v=vector(10),t);
 for(i=1,1e5,
   t=KnuthS([0,1,2,3,4,5,6,7,8,9],3);
   v[t[1]+1]++;v[t[2]+1]++;v[t[3]+1]++
 );
 for(i=0,9,print(i" "v[i+1]))

};</lang>

Output:

0 30048
1 29902
2 30168
3 29950
4 30003
5 30094
6 29905
7 30286
8 29839
9 29805

Perl

<lang perl>use strict;

sub s_of_n_creator {

   my $n = shift;
   my @sample;
   my $i = 0;
   sub {
       my $item = shift;
       $i++;
       if ($i <= $n) {
           # Keep first n items
           push @sample, $item;
       } elsif (rand() < $n / $i) {
           # Keep item
           @sample[rand $n] = $item;
       }
       @sample
   }

}

my @items = (0..9); my @bin;

foreach my $trial (1 .. 100000) {

   my $s_of_n = s_of_n_creator(3);
   my @sample;
   foreach my $item (@items) {
       @sample = $s_of_n->($item);
   }
   foreach my $s (@sample) {
       $bin[$s]++;
   }

} print "@bin\n"; </lang>

Sample output
30003 29923 30192 30164 29994 29976 29935 29860 30040 29913

PHP

Works with: PHP version 5.3+

<lang php><?php function s_of_n_creator($n) {

   $sample = array();
   $i = 0;
   return function($item) use (&$sample, &$i, $n) {
       $i++;
       if ($i <= $n) {
           // Keep first n items
           $sample[] = $item;
       } else if (rand(0, $i-1) < $n) {
           // Keep item
           $sample[rand(0, $n-1)] = $item;
       }
       return $sample;
   };

}

$items = range(0, 9);

for ($trial = 0; $trial < 100000; $trial++) {

   $s_of_n = s_of_n_creator(3);
   foreach ($items as $item)
       $sample = $s_of_n($item);
   foreach ($sample as $s)
       $bin[$s]++;

} print_r($bin); ?></lang>

Sample output
Array
(
    [3] => 30158
    [8] => 29859
    [9] => 29984
    [6] => 29937
    [7] => 30361
    [4] => 29994
    [5] => 29849
    [0] => 29724
    [1] => 29997
    [2] => 30137
)

PicoLisp

<lang PicoLisp>(de s_of_n_creator (@N)

  (curry (@N (I . 0) (Res)) (Item)
     (cond
        ((>= @N (inc 'I)) (push 'Res Item))
        ((>= @N (rand 1 I)) (set (nth Res (rand 1 @N)) Item)) )
     Res ) )

(let Freq (need 10 0)

  (do 100000
     (let S_of_n (s_of_n_creator 3)
        (for I (mapc S_of_n (0 1 2 3 4 5 6 7 8 9))
           (inc (nth Freq (inc I))) ) ) )
  Freq )</lang>

Output:

-> (30003 29941 29918 30255 29848 29875 30056 29839 30174 30091)

Python

Works with: Python version 3.x

<lang python>from random import randrange

def s_of_n_creator(n):

   sample, i = [], 0
   def s_of_n(item):
       nonlocal i
       i += 1
       if i <= n:
           # Keep first n items
           sample.append(item)
       elif randrange(i) < n:
           # Keep item
           sample[randrange(n)] = item
       return sample
   return s_of_n

if __name__ == '__main__':

   bin = [0]* 10
   items = range(10)
   print("Single run samples for n = 3:")
   s_of_n = s_of_n_creator(3)
   for item in items:
       sample = s_of_n(item)
       print("  Item: %i -> sample: %s" % (item, sample))
   #
   for trial in range(100000):
       s_of_n = s_of_n_creator(3)
       for item in items:
           sample = s_of_n(item)
       for s in sample:
           bin[s] += 1
   print("\nTest item frequencies for 100000 runs:\n ",
         '\n  '.join("%i:%i" % x for x in enumerate(bin)))</lang>
Sample output
Single run samples for n = 3:
  Item: 0 -> sample: [0]
  Item: 1 -> sample: [0, 1]
  Item: 2 -> sample: [0, 1, 2]
  Item: 3 -> sample: [0, 1, 3]
  Item: 4 -> sample: [0, 1, 3]
  Item: 5 -> sample: [0, 1, 3]
  Item: 6 -> sample: [0, 1, 3]
  Item: 7 -> sample: [0, 3, 7]
  Item: 8 -> sample: [0, 3, 7]
  Item: 9 -> sample: [0, 3, 7]

Test item frequencies for 100000 runs:
  0:29983
  1:30240
  2:29779
  3:29921
  4:30224
  5:29967
  6:30036
  7:30050
  8:29758
  9:30042

Python Class based version

Only a slight change creates the following class-based implementation: <lang python>class S_of_n_creator():

   def __init__(self, n):
       self.n = n
       self.i = 0
       self.sample = []
   
   def __call__(self, item):
       self.i += 1
       n, i, sample = self.n, self.i, self.sample
       if i <= n:
           # Keep first n items
           sample.append(item)
       elif randrange(i) < n:
           # Keep item
           sample[randrange(n)] = item
       return sample</lang>

The above can be instantiated as follows after which s_of_n can be called in the same way as it is in the first example where it is a function instead of an instance. <lang python>s_of_n = S_of_n_creator(3)</lang>

Ruby

Using a closure <lang ruby>def s_of_n_creator(n)

 sample = []
 i = 0
 Proc.new do |item|
   i += 1
   if i <= n
     sample << item
   elsif rand(i) < n
     sample[rand(n)] = item
   end
   sample
 end

end

frequency = Array.new(10,0) 100_000.times do

 s_of_n = s_of_n_creator(3)
 sample = nil
 (0..9).each {|digit| sample = s_of_n[digit]}
 sample.each {|digit| frequency[digit] += 1}

end

(0..9).each {|digit| puts "#{digit}\t#{frequency[digit]}"}</lang>

Example

0       29850
1       30015
2       29970
3       29789
4       29841
5       30075
6       30281
7       30374
8       29953
9       29852

Tcl

<lang tcl>package require Tcl 8.6

oo::class create SofN {

   variable items size count
   constructor {n} {

set size $n

   }
   method item {item} {

if {[incr count] <= $size} { lappend items $item } elseif {rand()*$count < $size} { lset items [expr {int($size * rand())}] $item } return $items

   }

}

  1. Test code

for {set i 0} {$i < 100000} {incr i} {

   set sOf3 [SofN new 3]
   foreach digit {0 1 2 3 4 5 6 7 8 9} {

set digs [$sOf3 item $digit]

   }
   $sOf3 destroy
   foreach digit $digs {

incr freq($digit)

   }

} parray freq</lang>

Sample output:

freq(0) = 29812
freq(1) = 30099
freq(2) = 29927
freq(3) = 30106
freq(4) = 30048
freq(5) = 29993
freq(6) = 29912
freq(7) = 30219
freq(8) = 30060
freq(9) = 29824