Compare sorting algorithms' performance: Difference between revisions

Content added Content deleted

Inline

Revision as of 16:02, 19 May 2009

Measure a relative performance of sorting algorithms implementations.

Plot execution time vs. input sequence length dependencies for various implementation of sorting algorithm and different input sequence types (example figures).

Consider three type of input sequences:

ones: sequence of all 1's. Example: {1, 1, 1, 1, 1}
range: ascending sequence, i.e. already sorted. Example: {1, 2, 3, 10, 15}
shuffled range: sequence with elements randomly distributed. Example: {5, 3, 9, 6, 8}

Consider at least two different sorting functions (different algorithms or/and different implementation of the same algorithm). For example, consider Bubble Sort, Insertion sort, Quicksort or/and implementations of Quicksort with different pivot selection mechanisms. Where possible, use existing implementations.

Preliminary subtask:

General steps:

Define sorting routines to be considered.
Define appropriate sequence generators and write timings.
Plot timings.
What conclusions about relative performance of the sorting routines could be made based on the plots?

C

(The reference example is Python)

Examples of sorting routines

We can use the codes in the category Sorting Algorithms; since these codes deal with integer arrays, we should change them a little. To accomplish this task I've also renamed them more consistently algorithm_sort; so we have e.g. bubble_sort, quick_sort and so on.

Sequence generators

csequence.h <lang c>#ifndef _CSEQUENCE_H

define _CSEQUENCE_H
include <stdlib.h>

void setfillconst(double c); void fillwithconst(double *v, int n); void fillwithrrange(double *v, int n); void shuffledrange(double *v, int n);

endif</lang>

csequence.c <lang c>#include "csequence.h"

static double fill_constant;

void setfillconst(double c) {

 fill_constant = c;

}

void fillwithconst(double *v, int n) {

 while( --n >= 0 ) v[n] = fill_constant;

}

void fillwithrrange(double *v, int n) {

 int on = n;
 while( --on >= 0 ) v[on] = n - on;

}

void shuffledrange(double *v, int n) {

 int on = n;
 fillwithrrange(v, n);
 while( --n >= 0 ) {
   int r = rand() % on;
   double t = v[n];
   v[n] = v[r];
   v[r] = t;
 }

}</lang>

Write timings

We shall use the code from Query Performance. Since the action is a generic function with a single argument, we need wrappers which encapsule each sorting algorithms we want to test.

writetimings.h <lang c>#ifndef _WRITETIMINGS_H

define _WRITETIMINGS_H
include "sorts.h"
include "csequence.h"
include "timeit.h"

/* we repeat the same MEANREPEAT times, and get the mean; this *should*

  give "better" results ... */

define MEANREPEAT 10.0
define BUFLEN 128
define MAKEACTION(ALGO) \

 int action_ ## ALGO (int size) {				\
   ALGO ## _sort(tobesorted, size);				\
   return 0; }

define MAKEPIECE(N) { #N , action_ ## N }

int action_bubble(int size); int action_shell(int size); int action_quick(int size); int action_insertion(int size); int action_merge(int size); int doublecompare( const void *a, const void *b ); int action_qsort(int size); int get_the_longest(int *a);

struct testpiece {

 char *name;
 int (*action)(int);

}; typedef struct testpiece testpiece_t;

struct seqdef {

 char *name;
 void (*seqcreator)(double *, int);

}; typedef struct seqdef seqdef_t;

endif</lang>

writetimings.c <lang c>#include <stdio.h>

include <stdlib.h>

include "writetimings.h"

double *tobesorted = NULL; const char *bname = "data_"; const char *filetempl = "%s%s_%s.dat"; int datlengths[] = {100, 200, 300, 500, 1000, 5000, 10000, 50000, 100000};

testpiece_t testpieces[] = { // MAKEPIECE(bubble),

 MAKEPIECE(shell),
 MAKEPIECE(merge),
 MAKEPIECE(insertion),
 MAKEPIECE(quick),
 MAKEPIECE(qsort),
 { NULL, NULL }

};

seqdef_t seqdefs[] = {

 { "c1", fillwithconst },
 { "rr", fillwithrrange },
 { "sr", shuffledrange },
 { NULL, NULL }

};

MAKEACTION(bubble) MAKEACTION(insertion) MAKEACTION(quick) MAKEACTION(shell)

int action_merge(int size) {

 double *res = merge_sort(tobesorted, size);
 free(res); /* unluckly this affects performance */
 return 0;

}

int doublecompare( const void *a, const void *b ) {

 if ( *(double *)a == *(double *)b ) return 0;
 if ( *(double *)a < *(double *)b ) return -1; else return 1;

} int action_qsort(int size) {

 qsort(tobesorted, size, sizeof(double), doublecompare);
 return 0;

}

int get_the_longest(int *a) {

 int r = *a;
 while( *a > 0 ) {
   if ( *a > r ) r = *a;
   a++;
 }
 return r;

}

int main() {

 int i, j, k, z, lenmax;
 char buf[BUFLEN];
 FILE *out;
 double thetime;

 lenmax = get_the_longest(datlengths);
 printf("Bigger data set has %d elements\n", lenmax);
 tobesorted = malloc(sizeof(double)*lenmax);
 if ( tobesorted == NULL ) return 1;

 setfillconst(1.0);

 for(i=0; testpieces[i].name != NULL; i++) {
   for(j=0; seqdefs[j].name != NULL; j++) {
     snprintf(buf, BUFLEN, filetempl, bname, testpieces[i].name,

seqdefs[j].name);

     out = fopen(buf, "w");
     if ( out == NULL ) goto severe;
     printf("Producing data for sort '%s', created data type '%s'\n",

testpieces[i].name, seqdefs[j].name);

     for(k=0; datlengths[k] > 0; k++) {

printf("\tNumber of elements: %d\n", datlengths[k]); thetime = 0.0; seqdefs[j].seqcreator(tobesorted, datlengths[k]); fprintf(out, "%d ", datlengths[k]); for(z=0; z < MEANREPEAT; z++) { thetime += time_it(testpieces[i].action, datlengths[k]); } thetime /= MEANREPEAT; fprintf(out, "%.8lf\n", thetime);

     }
     fclose(out);
   }
 }

severe:

 free(tobesorted);
 return 0;

}</lang> This code produce several files with the following naming convention:

data_algorithm_sequence.dat

Where algorithm is one of the following: insertion, merge, shell, quick, qsort (the quicksort in the libc library) (bubble sort became too slow for longest sequences). Sequence is c1 (constant value 1), rr (reverse range), sr (shufled range). These data can be easily plotted by Gnuplot, which can also do fitting. Instead we do our fitting using Polynomial Fitting. <lang c>#include <stdio.h>

include <stdlib.h>
include <math.h>

include "polifitgsl.h"

define MAXNUMOFDATA 100

double x[MAXNUMOFDATA], y[MAXNUMOFDATA]; double cf[2];

int main() {

 int i, nod;
 int r;

 for(i=0; i < MAXNUMOFDATA; i++)
 {
   r = scanf("%lf %lf\n", &x[i], &y[i]);
   if ( (r == EOF) || (r < 2) ) break;
   x[i] = log10(x[i]);
   y[i] = log10(y[i]);
 }
 nod = i;

 polynomialfit(nod, 2, x, y, cf);
 printf("C0 = %lf\nC1 = %lf\n", cf[0], cf[1]);

 return 0;

}</lang> Here we search for a fit with C₀+C₁x "in the log scale", since we supposed the data, once plotted on a logscale graph, can be fitted by a line. We can use e.g. a shell one-liner to produce the parameters for the line for each data file previously output. In particular I've used the following

for el in *.dat ; do fitdata <$el >${el%.dat}.fd ; done

Plot timings and Figures

Once we have all the ".dat" files and associated ".fd", we can use Gnuplot to draw our data and think about conclusions (we could also use the idea in Plot x, y arrays, but it needs too much enhancements to be usable for this analysis). Here an example of such a draw for a single file (using Gnuplot)

gnuplot> f(x) = C0 + C1*x
gnuplot> set logscale xy
gnuplot> load 'data_quick_sr_u.fd'
gnuplot> set xrange [100:100000]
gnuplot> set key left
gnuplot> plot 10**f(log10(x)), 'data_quick_sr_u.dat'

(The _u.dat are produced by a modified version of the code in order to write timings in microseconds instead of seconds) We can easily write another shell script/one-liner to produce a single file driver for Gnuplot in order to produce all the graph we can be interested in. These graphs show that the linear (in log scale) fit do not always fit the data... I haven't repeated the tests; the problems are when the sequence length becomes huge; for some algorithm that uses extra memory (like implementation of the merge sort), this could depend on the allocation of the needed memory. Another extraneous factor could be system load (the CLOCK_MONOTONIC used by the timing function is system wide rather than per process, so counting time spent in other processes too?). The "most stable" algorithms seem to be quick sort (but not qsort, which indeed is just the libc quick sort, here not plotted!) and shell sort (except for reversed range).

Conclusion: we should repeat the tests...

Python

Works with: Python version 2.5

Examples of sorting routines

<lang python> def builtinsort(x):

    x.sort()

def partition(seq, pivot):
   low, middle, up = [], [], []
   for x in seq:
       if x < pivot:
           low.append(x)
       elif x == pivot:
           middle.append(x)
       else:
           up.append(x)
   return low, middle, up
import random
def qsortranpart(seq):
   size = len(seq)
   if size < 2: return seq
   low, middle, up = partition(seq, seq[random.randrange(size)])
   return qsortranpart(low) + middle + qsortranpart(up)</lang>

Sequence generators

<lang python> def ones(n):

    return [1]*n

def reversedrange(n):
    x = range(n)
    x.reverse()
    return x

def shuffledrange(n):
    x = range(n)
    random.shuffle(x)
    return x</lang>

Write timings

<lang python> def write_timings(npoints=10, maxN=10**4, sort_functions=(builtinsort,insertion_sort, qsort),

                  sequence_creators = (ones, range, shuffledrange)):    
   Ns = range(2, maxN, maxN//npoints)
   for sort in sort_functions:
       for make_seq in sequence_creators:
           Ts = map(lambda n: usec(sort, (make_seq(n),)), Ns)
           writedat('%s-%s-%d-%d.xy' % (sort.__name__,  make_seq.__name__, len(Ns), max(Ns)), Ns, Ts)</lang>

Where writedat() is defined in the Write float arrays to a text file, usec() - Query Performance, insertion_sort() - Insertion sort, qsort - Quicksort subtasks, correspondingly.

Plot timings

Library: matplotlib

Library: numpy

<lang python> import operator

import numpy, pylab
def plotdd(dictplotdict):
   """See ``plot_timings()`` below."""
   symbols = ('o', '^', 'v', '<', '>', 's', '+', 'x', 'D', 'd',
              '1', '2', '3', '4', 'h', 'H', 'p', '|', '_')
   colors = map(None, 'bgrcmyk') # split string on distinct characters
   for npoints, plotdict in dictplotdict.iteritems():
       for ttle, lst in plotdict.iteritems():            
           pylab.hold(False)                                
           for i, (label, polynom, x, y) in enumerate(sorted(lst,key=operator.itemgetter(0))):
               pylab.plot(x, y, colors[i % len(colors)] + symbols[i % len(symbols)], label='%s %s' % (polynom, label))
               pylab.hold(True)
               y = numpy.polyval(polynom, x)
               pylab.plot(x, y, colors[i % len(colors)], label= '_nolegend_')                
           pylab.legend(loc='upper left')
           pylab.xlabel(polynom.variable)
           pylab.ylabel('log2( time in microseconds )')                
           pylab.title(ttle, verticalalignment='bottom')
           figname = '_%(npoints)03d%(ttle)s' % vars()
           pylab.savefig(figname+'.png')
           pylab.savefig(figname+'.pdf')
           print figname</lang>

See Plot x, y arrays and Polynomial Fitting subtasks for a basic usage of pylab.plot() and numpy.polyfit().

<lang python> import collections, itertools, glob, re

import numpy
def plot_timings():
   makedict = lambda: collections.defaultdict(lambda: collections.defaultdict(list))
   df = makedict()
   ds = makedict()
   # populate plot dictionaries
   for filename in glob.glob('*.xy'):
       m = re.match(r'([^-]+)-([^-]+)-(\d+)-(\d+)\.xy', filename)
       print filename
       assert m, filename
       funcname, seqname, npoints, maxN = m.groups()
       npoints, maxN = int(npoints), int(maxN)        
       a = numpy.fromiter(itertools.imap(float, open(filename).read().split()), dtype='f')
       Ns = a[::2]  # sequences lengths
       Ts = a[1::2] # corresponding times 
       assert len(Ns) == len(Ts) == npoints
       assert max(Ns) <= maxN
       #
       logsafe = numpy.logical_and(Ns>0, Ts>0)
       Ts = numpy.log2(Ts[logsafe])
       Ns = numpy.log2(Ns[logsafe])
       coeffs = numpy.polyfit(Ns, Ts, deg=1)
       poly = numpy.poly1d(coeffs, variable='log2(N)')
       #
       df[npoints][funcname].append((seqname, poly, Ns, Ts))
       ds[npoints][seqname].append((funcname, poly, Ns, Ts))
   # actual plotting
   plotdd(df)
   plotdd(ds) # see ``plotdd()`` above</lang>

Figures: log2( time in microseconds ) vs. log2( sequence length )

<lang python> sort_functions = [

    builtinsort,         # see implementation above
    insertion_sort,      # see Insertion sort
    insertion_sort_lowb, # insertion_sort, where sequential search is replaced
                         #     by lower_bound() function
    qsort,               # see Quicksort
    qsortranlc,          # qsort with randomly choosen pivot
                         #     and the filtering via list comprehension
    qsortranpart,        # qsortranlc with filtering via partition function
    qsortranpartis,      # qsortranpart, where for a small input sequence lengths
    ]                    #     insertion_sort is called
if __name__=="__main__":
   import sys
   sys.setrecursionlimit(10000)
   write_timings(npoints=100, maxN=1024, # 1 <= N <= 2**10 an input sequence length
                 sort_functions=sort_functions,
                 sequence_creators = (ones, range, shuffledrange))
   plot_timings()</lang>

Executing above script we get belowed figures.

ones

ones.png (143KiB)

builtinsort     - O(N)
insertion_sort  - O(N)
qsort           - O(N**2)
qsortranpart    - O(N)

range

range.png (145KiB)

builtinsort     - O(N)
insertion_sort  - O(N)
qsort           - O(N**2)
qsortranpart    - O(N*log(N))

shuffled range

shuffledrange.png (152KiB)

builtinsort     - O(N)  
insertion_sort  - O(N**4) ???
qsort           - O(N*log(N))
qsortranpart    - O(N) ???

Tcl

This example is incorrect. Please fix the code and remove this message.

Details: This solution does not address the plotting requirement of the task.

The lsort command is Tcl's built-in sorting engine. <lang tcl># list creation procedures proc ones n {

   lrepeat $n 1

} proc reversed n {

   while {[incr n -1] >= 0} {
       lappend result $n
   }
   return $result

} proc random n {

   for {set i 0} {$i < $n} {incr i} {
       lappend result [expr {int($n * rand())}]
   }
   return $result

}

set algorithms {lsort quicksort shellsort insertionsort bubblesort} set sizes {1 10 100 1000 10000 100000} set types {ones reversed random}

create some lists to be used by all sorting algorithms

array set lists {} foreach size $sizes {

   foreach type $types {
       set lists($type,$size) [$type $size]
   }

}

set runs 10

header

fconfigure stdout -buffering none puts -nonewline [format "%-16s" "list length:"] foreach size $sizes {

   puts -nonewline [format " %10d" $size]

} puts ""

perform the sort timings and output results

foreach type $types {

   puts "\nlist type: $type"
   foreach algo $algorithms {
       set errs [list]
       $algo {} ;# call it once to ensure it's compiled
       
       puts -nonewline [format "   %-13s" $algo]
       foreach size $sizes {
           # some implementations are just too slow
           if {$type ne "ones" && (
               ($algo eq "insertionsort" && $size > 10000) ||
               ($algo eq "bubblesort" && $size > 1000))
           } {
               set time "skipped"
           } else {
               # OK, do it
               if {[catch {time [list $algo $lists($type,$size)] $runs} result] != 0} {
                   set time -1
                   lappend errs $result
               } else {
                   set time [lindex [split $result] 0]
               }
           }
           puts -nonewline [format " %10s" $time]
       }
       puts ""
       if {[llength $errs] > 0} {
           puts [format "      %s" [join $errs "\n      "]]
       }
   }

} puts "\ntimes in microseconds, average of $runs runs"</lang> outputs

list length:              1         10        100       1000      10000     100000

list type: ones
   lsort                0.8        1.3        8.7       99.0     1007.4    12171.6
   quicksort            1.1        8.1       52.7      398.7     3758.3    36828.4
   shellsort            1.3       20.7      257.8     4209.7    58185.7   749289.4
   insertionsort        1.1        6.5       62.3      638.2     5617.0    53863.3
   bubblesort           2.7        7.8       38.6      324.2     3463.3    34478.5

list type: reversed
   lsort                1.9        1.5       12.3      120.8     1565.0    21087.8
   quicksort            2.5       41.1      534.2     6435.8    79947.2   981884.1
   shellsort            3.1       36.6      453.5     6694.2    93966.8  1234147.3
   insertionsort        2.0       24.9     1553.5   153679.5 15814539.4    skipped
   bubblesort           4.0      485.8    47525.8  4838366.7    skipped    skipped

list type: random
   lsort                1.8        1.8       18.1      243.5     3237.4    61951.7
   quicksort            1.3       30.7      462.8     5631.2    74675.1   945864.0
   shellsort            3.1       27.9      608.7     9412.5   141413.7  2334003.6
   insertionsort        2.5       13.4      848.3    78158.5  7836658.9    skipped
   bubblesort           4.0      160.5    24189.7  2458080.7    skipped    skipped

times in microseconds, average of 10 runs

@@ Line 222: / Line 222: @@
 This code produce several files with the following naming convention:
 * data_''algorithm''_''sequence''.dat
-Where ''algorithm'' is one of the following: insertion, merge, shell, quick, qsort (the quicksort in the libc library) (bubble sort became too slow for longest sequences). ''Sequence'' is ''c1'' (constant value 1), ''rr'' (reverse range), ''sr'' (shufled range). These data can be easily plotted by GNUplot, which can also do fitting. Instead we do our fitting using [[Polynomial Fitting]].
+Where ''algorithm'' is one of the following: insertion, merge, shell, quick, qsort (the quicksort in the libc library) (bubble sort became too slow for longest sequences). ''Sequence'' is ''c1'' (constant value 1), ''rr'' (reverse range), ''sr'' (shufled range). These data can be easily plotted by Gnuplot, which can also do fitting. Instead we do our fitting using [[Polynomial Fitting]].
 <lang c>#include <stdio.h>
 #include <stdlib.h>
@@ Line 255: / Line 255: @@
 Here we search for a fit with C<sub>0</sub>+C<sub>1</sub>x "in the log scale", since we supposed the data, once plotted on a logscale graph, can be fitted by a line. We can use e.g. a shell one-liner to produce the parameters for the line for each data file previously output. In particular I've used the following
 <pre>for el in *.dat ; do fitdata <$el >${el%.dat}.fd ; done</pre>
 ===Plot timings and Figures===
 Once we have all the ".dat" files and associated ".fd", we can use Gnuplot to draw our '''data''' and think about conclusions (we could also use the idea in [[Plot x, y arrays]], but it needs too much enhancements to be usable for this analysis). Here an example of such a draw for a single file (using Gnuplot)