Knuth's algorithm S: Difference between revisions
(Undo revision 126416 by CRGreathouse Not only does it not return a function, but it also shows no output on this page.) |
(→{{header|PARI/GP}}: Really? You want output for a self-reported incomplete solution?) |
||
Line 494: | Line 494: | ||
=={{header|PARI/GP}}== |
=={{header|PARI/GP}}== |
||
{{output?|PARI/GP}} |
|||
{{incomplete|PARI/GP|Does not return a function.}} |
{{incomplete|PARI/GP|Does not return a function.}} |
||
<lang parigp>KnuthS(v,n)={ |
<lang parigp>KnuthS(v,n)={ |
||
Line 504: | Line 503: | ||
}; |
}; |
||
KnuthS([0,1,2,3,4,5,6,7,8,9],3)</lang> |
KnuthS([0,1,2,3,4,5,6,7,8,9],3)</lang> |
||
Output: |
|||
<pre>%1 = [8, 4, 2]</pre> |
|||
=={{header|Perl}}== |
=={{header|Perl}}== |
Revision as of 15:58, 22 November 2011
You are encouraged to solve this task according to the task description, using any language you may know.
This is a method of randomly sampling n items from a set of M items, with equal probability; where M >= n and M, the number of items is unknown until the end. This means that the equal probability sampling should be maintained for all successive items > n as they become available (although the content of successive samples can change).
- The algorithm
- Select the first n items as the sample as they become available;
- For the i-th item where i > n, have a random chance of n/i of keeping it. If failing this chance, the sample remains the same. If not, have it randomly (1/n) replace one of the previously selected n items of the sample.
- Repeat #2 for any subsequent items.
- The Task
- Create a function
s_of_n_creator
that given the maximum sample size, returns a functions_of_n
that takes one parameter,item
. - Function
s_of_n
when called with successive items returns an equi-weighted random sample of up to n of its items so far, each time it is called, calculated using Knuths Algorithm S. - Test your functions by printing and showing the frequency of occurrences of the selected digits from 100,000 repetitions of:
- Use the s_of_n_creator with n == 3 to generate an s_of_n.
- call s_of_n with each of the digits 0 to 9 in order, keeping the returned three digits of its random sampling from its last call with argument item=9.
Note: A class taking n and generating a callable instance/function might also be used.
- Reference
- The Art of Computer Programming, Vol 2, 3.4.2 p.142
- Cf.
C
Instead of returning a closure we set the environment in a structure:
<lang c>#include <stdlib.h>
- include <stdio.h>
- include <string.h>
- include <time.h>
struct s_env {
unsigned int n, i; size_t size; void *sample;
};
void s_of_n_init(struct s_env *s_env, size_t size, unsigned int n) {
s_env->i = 0; s_env->n = n; s_env->size = size; s_env->sample = malloc(n * size);
}
void sample_set_i(struct s_env *s_env, unsigned int i, void *item) {
memcpy(s_env->sample + i * s_env->size, item, s_env->size);
}
void *s_of_n(struct s_env *s_env, void *item) {
s_env->i++; if (s_env->i <= s_env->n) sample_set_i(s_env, s_env->i - 1, item); else if ((rand() % s_env->i) < s_env->n) sample_set_i(s_env, rand() % s_env->n, item); return s_env->sample;
}
int *test(unsigned int n, int *items_set, unsigned int num_items) {
int i; int *sample; struct s_env s_env; s_of_n_init(&s_env, sizeof(items_set[0]), n); for (i = 0; i < num_items; i++) { s_of_n(&s_env, (void *) &items_set[i]); } return (int *)s_env.sample;
}
int main() {
unsigned int i, j; unsigned int n = 3; unsigned int num_items = 10; unsigned int *frequencies; int *items_set; srand(time(NULL)); items_set = malloc(num_items * sizeof(int)); frequencies = malloc(num_items * sizeof(int)); for (i = 0; i < num_items; i++) { items_set[i] = i; frequencies[i] = 0; } for (i = 0; i < 100000; i++) { int *res = test(n, items_set, num_items); for (j = 0; j < n; j++) { frequencies[res[j]]++; }
free(res);
} for (i = 0; i < num_items; i++) { printf(" %d", frequencies[i]); } puts(""); return 0;
}</lang>
C++
<lang cpp>#include <iostream>
- include <functional>
- include <vector>
- include <cstdlib>
- include <ctime>
template <typename T> std::function<std::vector<T>(T)> s_of_n_creator(int n) {
std::vector<T> sample; int i = 0; return [=](T item) mutable { i++; if (i <= n) { sample.push_back(item); } else if (std::rand() % i < n) { sample[std::rand() % n] = item; } return sample; };
}
int main() {
std::srand(std::time(NULL)); int bin[10] = {0}; for (int trial = 0; trial < 100000; trial++) { auto s_of_n = s_of_n_creator<int>(3); std::vector<int> sample; for (int i = 0; i < 10; i++) sample = s_of_n(i); for (int s : sample) bin[s]++; } for (int x : bin) std::cout << x << std::endl; return 0;
}</lang>
Output:
30052 29740 30197 30223 29857 29688 30095 29803 30098 30247
Common Lisp
<lang lisp>(setf *random-state* (make-random-state t))
(defun make-selector (n)
(let ((i 0) res) (lambda (&optional (x nil x-p)) (if (and x-p (< (random (incf i)) n))
(if (< (length res) n) (push x res) (setf (elt res (random n)) x)))
res)))
- test
(loop repeat 100000
with freq = (make-array 10 :initial-element 0) do (let ((f (make-selector 3)))
(mapc f '(0 1 2 3 4 5 6 7 8 9)) (mapc (lambda (i) (incf (aref freq i))) (funcall f)))
finally (prin1 freq))</lang>output<lang>#(30026 30023 29754 30017 30267 29997 29932 29990 29965 30029)</lang>
D
<lang d>import std.stdio, std.random, std.range, std.algorithm;
auto s_of_n_creator(in int n) {
int[] sample; int i; auto s_of_n(in int item) { i++; if (i <= n) { sample ~= item; } else if (uniform(0.0, 1.0) < (cast(double)n / i)) { sample = remove(sample, uniform(0, n)); sample ~= item; } return sample; } return &s_of_n;
}
void main() {
int[10] bin; auto items = iota(bin.length); writeln("Single run samples for n = 3:"); auto s_of_n1 = s_of_n_creator(3); foreach (item; items) { auto sample = s_of_n1(item); writefln(" Item: %d -> sample: %s", item, sample); }
enum nruns = 100_000; foreach (trial; 0 .. nruns) { auto s_of_n2 = s_of_n_creator(3); int[] sample; foreach (item; items) sample = s_of_n2(item); foreach (s; sample) bin[s]++; } writefln("\nTest item frequencies for %d runs:", nruns); foreach (d; bin) write(d, " ");
}</lang> Example output:
Single run samples for n = 3: Item: 0 -> sample: [0] Item: 1 -> sample: [0, 1] Item: 2 -> sample: [0, 1, 2] Item: 3 -> sample: [1, 2, 3] Item: 4 -> sample: [1, 3, 4] Item: 5 -> sample: [1, 3, 4] Item: 6 -> sample: [1, 3, 4] Item: 7 -> sample: [1, 3, 7] Item: 8 -> sample: [1, 7, 8] Item: 9 -> sample: [1, 7, 8] Test item frequencies for 100000 runs: 29926 30033 30145 30172 29882 30104 29773 30071 29858 30036
Go
<lang go>package main
import (
"fmt" "rand" "time"
)
func sOfNCreator(n int) func(byte) []byte {
s := make([]byte, 0, n) m := n return func(item byte) []byte { if len(s) < n { s = append(s, item) } else { m++ if rand.Intn(m) < n { s[rand.Intn(n)] = item } } return s }
}
func main() {
rand.Seed(time.Nanoseconds()) var freq [10]int for r := 0; r < 1e5; r++ { sOfN := sOfNCreator(3) for d := byte('0'); d < '9'; d++ { sOfN(d) } for _, d := range sOfN('9') { freq[d-'0']++ } } fmt.Println(freq)
}</lang> Output:
[30075 29955 30024 30095 30031 30018 29973 29642 30156 30031]
J
Note that this approach introduces heavy inefficiencies, to achieve information hiding.
<lang j>coclass'inefficient'
create=:3 :0 N=: y ITEMS=: K=:0 )
s_of_n=:3 :0 K=: K+1 if. N>#ITEMS do. ITEMS=: ITEMS,y else. if. (N%K)>?0 do. ITEMS=: (((i.#ITEMS)-.?N){ITEMS),y else. ITEMS end. end. )
s_of_n_creator_base_=: 1 :0
ctx=: conew&'inefficient' m s_of_n__ctx
)</lang>
Required example:
<lang j>run=:3 :0
nl=. conl 1 s3_of_n=. 3 s_of_n_creator r=. {: s3_of_n"0 i.10 coerase (conl 1)-.nl r
)
(~.,._1 + #/.~) (i.10),,D=:run"0 i.1e5
0 30099 1 29973 2 29795 3 29995 4 29996 5 30289 6 29903 7 29993 8 30215 9 29742</lang>
Java
A class-based solution: <lang java>import java.util.*;
class SOfN<T> {
private static final Random rand = new Random();
private List<T> sample; private int i = 0; private int n; public SOfN(int _n) {
n = _n; sample = new ArrayList<T>(n);
} public List<T> process(T item) {
i++; if (i <= n) {
sample.add(item);
} else if (rand.nextInt(i) < n) { sample.set(rand.nextInt(n), item); } return sample;
}
}
public class AlgorithmS {
public static void main(String[] args) {
int[] bin = new int[10]; for (int trial = 0; trial < 100000; trial++) { SOfN<Integer> s_of_n = new SOfN<Integer>(3); List<Integer> sample = null; for (int i = 0; i < 10; i++) sample = s_of_n.process(i); for (int s : sample) bin[s]++; } System.out.println(Arrays.toString(bin));
}
}</lang>
Output:
[30115, 30141, 30050, 29887, 29765, 30132, 29767, 30114, 30079, 29950]
Alternative solution without using an explicitly named type; instead using an anonymous class implementing a generic "function" interface: <lang java>import java.util.*;
interface Function<S, T> {
public T call(S x);
}
public class AlgorithmS {
private static final Random rand = new Random(); public static <T> Function<T, List<T>> s_of_n_creator(final int n) {
return new Function<T, List<T>>() { private List<T> sample = new ArrayList<T>(n); private int i = 0; public List<T> call(T item) { i++; if (i <= n) { sample.add(item); } else if (rand.nextInt(i) < n) { sample.set(rand.nextInt(n), item); } return sample; } };
}
public static void main(String[] args) {
int[] bin = new int[10]; for (int trial = 0; trial < 100000; trial++) { Function<Integer, List<Integer>> s_of_n = s_of_n_creator(3); List<Integer> sample = null; for (int i = 0; i < 10; i++) sample = s_of_n.call(i); for (int s : sample) bin[s]++; } System.out.println(Arrays.toString(bin));
}
}</lang>
Objective-C
Uses blocks <lang objc>#import <Foundation/Foundation.h>
typedef NSArray *(^SOfN)(id);
SOfN s_of_n_creator(int n) {
NSMutableArray *sample = [NSMutableArray arrayWithCapacity:n]; __block int i = 0; return [[^(id item) { i++; if (i <= n) { [sample addObject:item]; } else if (rand() % i < n) { [sample replaceObjectAtIndex:rand() % n withObject:item]; } return sample; } copy] autorelease];
}
int main(int argc, const char *argv[]) {
NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
NSCountedSet *bin = [[NSCountedSet alloc] init]; for (int trial = 0; trial < 100000; trial++) { SOfN s_of_n = s_of_n_creator(3); NSArray *sample; for (int i = 0; i < 10; i++) sample = s_of_n([NSNumber numberWithInt:i]); [bin addObjectsFromArray:sample]; } NSLog(@"%@", bin); [bin release]; [pool release]; return 0;
}</lang>
Log:
<NSCountedSet: 0x100114120> (0 [29934], 9 [30211], 5 [29926], 1 [30067], 6 [30001], 2 [29972], 7 [30126], 3 [29944], 8 [29910], 4 [29909])
OCaml
<lang ocaml>let s_of_n_creator n =
let i = ref 0 and sample = ref [| |] in function item -> incr i; if !i <= n then sample := Array.append [| item |] !sample else if Random.int !i < n then !sample.(Random.int n) <- item; !sample
let test n items_set =
let s_of_n = s_of_n_creator n in Array.fold_left (fun _ v -> s_of_n v) [| |] items_set
let () =
Random.self_init(); let n = 3 in let num_items = 10 in let items_set = Array.init num_items (fun i -> i) in let results = Array.create num_items 0 in for i = 1 to 100_000 do let res = test n items_set in Array.iter (fun j -> results.(j) <- succ results.(j)) res done; Array.iter (Printf.printf " %d") results; print_newline()</lang>
Output:
30051 29899 30249 30058 30012 29836 29998 29882 30148 29867
PARI/GP
<lang parigp>KnuthS(v,n)={
my(u=vector(n,i,i)); for(i=n+1,#v, if(random(i)<n,u[random(n)+1]=i) ); vecextract(v,u)
}; KnuthS([0,1,2,3,4,5,6,7,8,9],3)</lang>
Output:
%1 = [8, 4, 2]
Perl
<lang perl>use strict;
sub s_of_n_creator {
my $n = shift; my @sample; my $i = 0; sub { my $item = shift; $i++; if ($i <= $n) { # Keep first n items push @sample, $item; } elsif (rand() < $n / $i) { # Keep item @sample[rand $n] = $item; } @sample }
}
my @items = (0..9); my @bin;
foreach my $trial (1 .. 100000) {
my $s_of_n = s_of_n_creator(3); my @sample; foreach my $item (@items) { @sample = $s_of_n->($item); } foreach my $s (@sample) { $bin[$s]++; }
} print "@bin\n"; </lang>
- Sample output
30003 29923 30192 30164 29994 29976 29935 29860 30040 29913
PHP
<lang php><?php function s_of_n_creator($n) {
$sample = array(); $i = 0; return function($item) use (&$sample, &$i, $n) { $i++; if ($i <= $n) { // Keep first n items $sample[] = $item; } else if (rand(0, $i-1) < $n) { // Keep item $sample[rand(0, $n-1)] = $item; } return $sample; };
}
$items = range(0, 9);
for ($trial = 0; $trial < 100000; $trial++) {
$s_of_n = s_of_n_creator(3); foreach ($items as $item) $sample = $s_of_n($item); foreach ($sample as $s) $bin[$s]++;
} print_r($bin); ?></lang>
- Sample output
Array ( [3] => 30158 [8] => 29859 [9] => 29984 [6] => 29937 [7] => 30361 [4] => 29994 [5] => 29849 [0] => 29724 [1] => 29997 [2] => 30137 )
PicoLisp
<lang PicoLisp>(de s_of_n_creator (@N)
(curry (@N (I . 0) (Res)) (Item) (cond ((>= @N (inc 'I)) (push 'Res Item)) ((>= @N (rand 1 I)) (set (nth Res (rand 1 @N)) Item)) ) Res ) )
(let Freq (need 10 0)
(do 100000 (let S_of_n (s_of_n_creator 3) (for I (mapc S_of_n (0 1 2 3 4 5 6 7 8 9)) (inc (nth Freq (inc I))) ) ) ) Freq )</lang>
Output:
-> (30003 29941 29918 30255 29848 29875 30056 29839 30174 30091)
Python
<lang python>from random import randrange
def s_of_n_creator(n):
sample, i = [], 0 def s_of_n(item): nonlocal i
i += 1 if i <= n: # Keep first n items sample.append(item) elif randrange(i) < n: # Keep item sample[randrange(n)] = item return sample return s_of_n
if __name__ == '__main__':
bin = [0]* 10 items = range(10) print("Single run samples for n = 3:") s_of_n = s_of_n_creator(3) for item in items: sample = s_of_n(item) print(" Item: %i -> sample: %s" % (item, sample)) # for trial in range(100000): s_of_n = s_of_n_creator(3) for item in items: sample = s_of_n(item) for s in sample: bin[s] += 1 print("\nTest item frequencies for 100000 runs:\n ", '\n '.join("%i:%i" % x for x in enumerate(bin)))</lang>
- Sample output
Single run samples for n = 3: Item: 0 -> sample: [0] Item: 1 -> sample: [0, 1] Item: 2 -> sample: [0, 1, 2] Item: 3 -> sample: [0, 1, 3] Item: 4 -> sample: [0, 1, 3] Item: 5 -> sample: [0, 1, 3] Item: 6 -> sample: [0, 1, 3] Item: 7 -> sample: [0, 3, 7] Item: 8 -> sample: [0, 3, 7] Item: 9 -> sample: [0, 3, 7] Test item frequencies for 100000 runs: 0:29983 1:30240 2:29779 3:29921 4:30224 5:29967 6:30036 7:30050 8:29758 9:30042
Python Class based version
Only a slight change creates the following class-based implementation: <lang python>class S_of_n_creator():
def __init__(self, n): self.n = n self.i = 0 self.sample = [] def __call__(self, item): self.i += 1 n, i, sample = self.n, self.i, self.sample if i <= n: # Keep first n items sample.append(item) elif randrange(i) < n: # Keep item sample[randrange(n)] = item return sample</lang>
The above can be instantiated as follows after which s_of_n
can be called in the same way as it is in the first example where it is a function instead of an instance.
<lang python>s_of_n = S_of_n_creator(3)</lang>
Ruby
Using a closure <lang ruby>def s_of_n_creator(n)
sample = [] i = 0 Proc.new do |item| i += 1 if i <= n sample << item elsif rand(i) < n sample[rand(n)] = item end sample end
end
frequency = Array.new(10,0) 100_000.times do
s_of_n = s_of_n_creator(3) sample = nil (0..9).each {|digit| sample = s_of_n[digit]} sample.each {|digit| frequency[digit] += 1}
end
(0..9).each {|digit| puts "#{digit}\t#{frequency[digit]}"}</lang>
Example
0 29850 1 30015 2 29970 3 29789 4 29841 5 30075 6 30281 7 30374 8 29953 9 29852
Tcl
<lang tcl>package require Tcl 8.6
oo::class create SofN {
variable items size count constructor {n} {
set size $n
} method item {item} {
if {[incr count] <= $size} { lappend items $item } elseif {rand()*$count < $size} { lset items [expr {int($size * rand())}] $item } return $items
}
}
- Test code
for {set i 0} {$i < 100000} {incr i} {
set sOf3 [SofN new 3] foreach digit {0 1 2 3 4 5 6 7 8 9} {
set digs [$sOf3 item $digit]
} $sOf3 destroy foreach digit $digs {
incr freq($digit)
}
} parray freq</lang>
Sample output:
freq(0) = 29812 freq(1) = 30099 freq(2) = 29927 freq(3) = 30106 freq(4) = 30048 freq(5) = 29993 freq(6) = 29912 freq(7) = 30219 freq(8) = 30060 freq(9) = 29824