Fivenum: Difference between revisions
mNo edit summary |
(+Stata) |
||
Line 367: | Line 367: | ||
[1] -1.9505959 -0.6767412 0.2332471 0.7460709 1.7313151 |
[1] -1.9505959 -0.6767412 0.2332471 0.7460709 1.7313151 |
||
</lang> |
</lang> |
||
=={{header|Stata}}== |
|||
First build a dataset to |
|||
<lang stata>clear |
|||
set seed 17760704 |
|||
qui set obs 10000 |
|||
gen x=rnormal()</lang> |
|||
The '''summarize''' command produces all the required statistics, and more: |
|||
<lang stata>qui sum x, detail |
|||
di r(min),r(p25),r(p50),r(p75),r(max)</lang> |
|||
'''Output''' |
|||
<pre>-3.6345866 -.66536 .0026834 .68398139 3.7997103</pre> |
|||
It's also possible to use the '''tabstat''' command |
|||
<lang stata>tabstat x, s(mi q ma)</lang> |
|||
'''Output''' |
|||
<pre> variable | min p25 p50 p75 max |
|||
-------------+-------------------------------------------------- |
|||
x | -3.634587 -.66536 .0026834 .6839814 3.79971 |
|||
----------------------------------------------------------------</pre> |
|||
=={{header|zkl}}== |
=={{header|zkl}}== |
Revision as of 19:44, 27 February 2018
Many big data or scientific programs use boxplots to show distributions of data. In addition, sometimes saving large arrays for boxplots can be impractical and use extreme amounts of RAM. It can be useful to save large arrays as arrays with 5 numbers to save memory. For example, the base statistics of the R programming language implement Tukey's five-number summary as the `fivenum` function (cf. Five-number Summary).
Task Description
Given an array of numbers, compute the five-number summary.
C
<lang c>#include <stdio.h>
#include <stdlib.h>
/*
 * Median of the slice x[start .. end_inclusive] (both bounds included).
 *
 * The slice is not sorted here; the result is a median only when the caller
 * passes data that is already in ascending order (fivenum sorts first).
 * An odd-sized slice yields its middle element, an even-sized slice the mean
 * of the two middle elements.
 *
 * An empty or inverted slice is treated as a fatal error: a message is
 * printed and the process exits with status 1.
 */
double median(double *x, int start, int end_inclusive) {
    const int count = end_inclusive - start + 1;
    int mid;

    if (count < 1) {
        printf("Array slice cannot be empty\n");
        exit(1);
    }

    /* Index of the middle element (odd count) or of the upper middle (even). */
    mid = start + count / 2;

    return (count % 2 != 0) ? x[mid] : (x[mid - 1] + x[mid]) / 2.0;
}
/*
 * qsort comparator for doubles, ascending order.
 * Returns 1 when *a > *b, -1 when *a < *b and 0 otherwise.
 */
int compare (const void *a, const void *b) {
    const double lhs = *(const double *)a;
    const double rhs = *(const double *)b;

    /* Each comparison evaluates to 0 or 1, so the difference is -1, 0 or 1. */
    return (lhs > rhs) - (lhs < rhs);
}
int fivenum(double *x, double *result, int x_len) {
int i, m, lower_end; for (i = 0; i < x_len; i++) { if (x[i] != x[i]) { printf("Unable to deal with arrays containing NaN\n\n"); return 1; } } qsort(x, x_len, sizeof(double), compare); result[0] = x[0]; result[2] = median(x, 0, x_len - 1); result[4] = x[x_len - 1]; m = x_len / 2; lower_end = (x_len % 2) ? m : m - 1; result[1] = median(x, 0, lower_end); result[3] = median(x, m, x_len - 1); return 0;
}
/*
 * Print the five values in result as "[a, b, c, d, e]" followed by a blank
 * line, each with `places` digits after the decimal point.
 *
 * Always returns 0.
 */
int show(double *result, int places) {
    int i;

    printf("[");
    for (i = 0; i < 5; i++) {
        /* The precision is passed through '*' instead of being formatted into
           a small fixed buffer, so any value of places is safe. */
        printf("%.*f", places, result[i]);
        if (i < 4) printf(", ");
    }
    printf("]\n\n");

    /* The function is declared int, so it must return a value. */
    return 0;
}
/*
 * Demo driver: computes and prints the five-number summary of three sample
 * data sets. A summary is printed only when fivenum reports success.
 */
int main() {
    double x1[11] = {15.0, 6.0, 42.0, 41.0, 7.0, 36.0, 49.0, 40.0, 39.0, 47.0, 43.0};
    double x2[6] = {36.0, 40.0, 7.0, 39.0, 41.0, 15.0};
    double x3[20] = {
         0.14082834,  0.09748790,  1.73131507,  0.87636009, -1.95059594,
         0.73438555, -0.03035726,  1.46675970, -0.74621349, -0.72588772,
         0.63905160,  0.61501527, -0.98983780, -1.00447874, -0.62759469,
         0.66206163,  1.04312009, -0.10305385,  0.75775634,  0.32566578
    };
    double result[5];

    if (fivenum(x1, result, 11) == 0) {
        show(result, 1);
    }
    if (fivenum(x2, result, 6) == 0) {
        show(result, 1);
    }
    if (fivenum(x3, result, 20) == 0) {
        show(result, 9);
    }

    return 0;
}</lang>
- Output:
[6.0, 25.5, 40.0, 42.5, 49.0] [7.0, 15.0, 37.5, 40.0, 41.0] [-1.950595940, -0.676741205, 0.233247060, 0.746070945, 1.731315070]
Java
<lang java>import java.util.Arrays;
public class Fivenum {

    /**
     * Median of the slice {@code x[start..endInclusive]} (both bounds included).
     * The slice must already be in ascending order for the result to be a median.
     *
     * @throws IllegalArgumentException if the slice is empty
     */
    static double median(double[] x, int start, int endInclusive) {
        int size = endInclusive - start + 1;
        if (size <= 0) throw new IllegalArgumentException("Array slice cannot be empty");
        int m = start + size / 2;
        return (size % 2 == 1) ? x[m] : (x[m - 1] + x[m]) / 2.0;
    }

    /**
     * Five-number summary of {@code x}: minimum, lower hinge, median,
     * upper hinge, maximum. Each hinge is the median of one half of the sorted
     * data; for an odd length the middle element belongs to both halves.
     *
     * The argument is not modified; a sorted copy is used internally.
     *
     * @param x the data, in any order
     * @return a new array of exactly five values
     * @throws IllegalArgumentException if {@code x} is empty or contains NaN
     */
    static double[] fivenum(double[] x) {
        // Validate before indexing so empty input fails with the same
        // exception type as the other input checks.
        if (x.length == 0) throw new IllegalArgumentException("Input array cannot be empty");
        for (double d : x) {  // primitive loop variable: no boxing per element
            if (Double.isNaN(d)) throw new IllegalArgumentException("Unable to deal with arrays containing NaN");
        }

        double[] sorted = x.clone();
        Arrays.sort(sorted);
        int last = sorted.length - 1;

        // m is the first index of the upper half; for odd lengths it is the
        // middle element, which is also the last index of the lower half.
        int m = sorted.length / 2;
        int lowerEnd = (sorted.length % 2 == 1) ? m : m - 1;

        double[] result = new double[5];
        result[0] = sorted[0];
        result[1] = median(sorted, 0, lowerEnd);
        result[2] = median(sorted, 0, last);
        result[3] = median(sorted, m, last);
        result[4] = sorted[last];
        return result;
    }

    /** Prints the five-number summary of three sample data sets. */
    public static void main(String[] args) {
        double[][] xl = {
            {15.0, 6.0, 42.0, 41.0, 7.0, 36.0, 49.0, 40.0, 39.0, 47.0, 43.0},
            {36.0, 40.0, 7.0, 39.0, 41.0, 15.0},
            {
                 0.14082834,  0.09748790,  1.73131507,  0.87636009, -1.95059594,
                 0.73438555, -0.03035726,  1.46675970, -0.74621349, -0.72588772,
                 0.63905160,  0.61501527, -0.98983780, -1.00447874, -0.62759469,
                 0.66206163,  1.04312009, -0.10305385,  0.75775634,  0.32566578
            }
        };
        for (double[] x : xl) System.out.printf("%s\n\n", Arrays.toString(fivenum(x)));
    }
}</lang>
- Output:
[6.0, 25.5, 40.0, 42.5, 49.0] [7.0, 15.0, 37.5, 40.0, 41.0] [-1.95059594, -0.676741205, 0.23324706, 0.746070945, 1.73131507]
Julia
<lang julia>function mediansorted(x::AbstractVector{T}, i::Integer, l::Integer)::T where T
    # Median of x[i:l] (both bounds included, 1-based). The slice is assumed to
    # be in ascending order already; nothing is sorted here.
    # Throws ArgumentError when the slice is empty.
    n = l - i + 1
    if !(n > zero(n))
        throw(ArgumentError("Array slice cannot be empty."))
    end
    # Middle element for odd n, upper of the two middle elements for even n.
    hi = i + n ÷ 2
    if isodd(n)
        return x[hi]
    end
    return (x[hi-1] + x[hi]) / 2
end
function fivenum(x::AbstractVector{T}) where T<:AbstractFloat
    # Five-number summary of x: minimum, lower hinge, median, upper hinge,
    # maximum, returned as a new Vector{T}. x itself is not modified.
    # Each hinge is the median of one half of the sorted data; for an odd
    # length the middle element belongs to both halves.
    # Throws ArgumentError when x is empty.
    isempty(x) && throw(ArgumentError("Input array cannot be empty."))

    xs = sort(x)
    n = length(xs)

    # Julia vectors are 1-based, so the upper half starts at n ÷ 2 + 1.
    # Using n ÷ 2 (the 0-based index) shifts both halves down by one element:
    # for 15, 6, 42, 41, 7, 36, 49, 40, 39, 47, 43 that gives hinges of 15.0
    # and 42.0 instead of 25.5 and 42.5, and it fails on a one-element vector.
    upperstart = n ÷ 2 + 1
    lowerend = isodd(n) ? upperstart : upperstart - 1

    # An array literal avoids both the uninitialised-vector constructor and
    # `endof`, whose spelling differs between Julia versions.
    return T[xs[1],
             mediansorted(xs, 1, lowerend),
             mediansorted(xs, 1, n),
             mediansorted(xs, upperstart, n),
             xs[end]]
end
# Print each sample vector followed by its five-number summary.
foreach((
    [15.0, 6.0, 42.0, 41.0, 7.0, 36.0, 49.0, 40.0, 39.0, 47.0, 43.0],
    [36.0, 40.0, 7.0, 39.0, 41.0, 15.0],
    [0.14082834, 0.09748790, 1.73131507, 0.87636009, -1.95059594,
     0.73438555, -0.03035726, 1.46675970, -0.74621349, -0.72588772,
     0.63905160, 0.61501527, -0.98983780, -1.00447874, -0.62759469,
     0.66206163, 1.04312009, -0.10305385, 0.75775634, 0.32566578],
)) do v
    println("# ", v, "\n -> ", fivenum(v))
end</lang>
- Output:
# [15.0, 6.0, 42.0, 41.0, 7.0, 36.0, 49.0, 40.0, 39.0, 47.0, 43.0] -> [6.0, 15.0, 40.0, 42.0, 49.0] # [36.0, 40.0, 7.0, 39.0, 41.0, 15.0] -> [7.0, 11.0, 37.5, 39.5, 41.0] # [0.140828, 0.0974879, 1.73132, 0.87636, -1.9506, 0.734386, -0.0303573, 1.46676, -0.746213, -0.725888, 0.639052, 0.615015, -0.989838, -1.00448, -0.627595,0.662062, 1.04312, -0.103054, 0.757756, 0.325666] -> [-1.9506, -0.725888, 0.233247, 0.734386, 1.73132]
Kotlin
The following uses Tukey's method for calculating the lower and upper quartiles (or 'hinges') which is what the R function, fivenum, appears to use.
As arrays containing NaNs and nulls cannot really be dealt with in a sensible fashion in Kotlin, they've been excluded altogether. <lang scala>// version 1.2.21
/**
 * Median of the slice `x[start..endInclusive]` (both bounds included).
 * The slice must already be in ascending order for the result to be a median.
 *
 * @throws IllegalArgumentException if the slice is empty
 */
fun median(x: DoubleArray, start: Int, endInclusive: Int): Double {
    val count = endInclusive - start + 1
    require(count > 0) { "Array slice cannot be empty" }

    // Middle element for an odd count, upper of the two middle ones otherwise.
    val mid = start + count / 2
    if (count % 2 == 1) return x[mid]
    return (x[mid - 1] + x[mid]) / 2.0
}
/**
 * Five-number summary of [x]: minimum, lower hinge, median, upper hinge,
 * maximum. Each hinge is the median of one half of the sorted data; for an
 * odd size the middle element belongs to both halves.
 *
 * The argument is not modified; a sorted copy is used internally.
 *
 * @return a new array of exactly five values
 * @throws IllegalArgumentException if [x] is empty or contains NaN
 */
fun fivenum(x: DoubleArray): DoubleArray {
    // Validate before indexing so empty input fails like the other checks.
    require(x.isNotEmpty()) { "Input array cannot be empty" }
    require(x.none { it.isNaN() }) { "Unable to deal with arrays containing NaN" }

    val sorted = x.sortedArray()

    // m is the first index of the upper half; for odd sizes it is the middle
    // element, which is also the last index of the lower half.
    val m = sorted.size / 2
    val lowerEnd = if (sorted.size % 2 == 1) m else m - 1

    return doubleArrayOf(
        sorted[0],
        median(sorted, 0, lowerEnd),
        median(sorted, 0, sorted.lastIndex),
        median(sorted, m, sorted.lastIndex),
        sorted[sorted.lastIndex]
    )
}
/** Prints the five-number summary of three sample data sets. */
fun main(args: Array<String>) {
    // Never reassigned, so a read-only binding is the idiomatic choice.
    val xl = listOf(
        doubleArrayOf(15.0, 6.0, 42.0, 41.0, 7.0, 36.0, 49.0, 40.0, 39.0, 47.0, 43.0),
        doubleArrayOf(36.0, 40.0, 7.0, 39.0, 41.0, 15.0),
        doubleArrayOf(
             0.14082834,  0.09748790,  1.73131507,  0.87636009, -1.95059594,
             0.73438555, -0.03035726,  1.46675970, -0.74621349, -0.72588772,
             0.63905160,  0.61501527, -0.98983780, -1.00447874, -0.62759469,
             0.66206163,  1.04312009, -0.10305385,  0.75775634,  0.32566578
        )
    )
    xl.forEach { println("${fivenum(it).asList()}\n") }
}</lang>
- Output:
[6.0, 25.5, 40.0, 42.5, 49.0] [7.0, 15.0, 37.5, 40.0, 41.0] [-1.95059594, -0.676741205, 0.23324706, 0.746070945, 1.73131507]
Perl
<lang Perl>
#!/usr/bin/env perl
use strict;
use warnings;
use Cwd 'getcwd';
use feature 'say';

# Directory the script was started from; the failure marker is written here.
my $TOP_DIRECTORY = getcwd();

# Treat every warning as fatal: record it in "<start dir>/<script name>.FAIL"
# together with the current working directory, then die with the same text.
local $SIG{__WARN__} = sub {#kill the program if there are any warnings
    my $message = shift;
    my $fail_filename = "$TOP_DIRECTORY/$0.FAIL";
    open my $fh, '>', $fail_filename or die "Can't write $fail_filename: $!";
    # The warning text is passed as data, not as the format string, so a '%'
    # inside a warning cannot be misread as a conversion specifier.
    printf $fh ("%s @ %s\n", $message, getcwd());
    close $fh;
    die "$message\n";
};#http://perlmaven.com/how-to-capture-and-save-warnings-in-perl
use POSIX qw(ceil floor);
# Five-number summary (minimum, lower hinge, median, upper hinge, maximum) of
# the numbers in the array referenced by the first argument. The positions are
# computed as in the R formula quoted below.
#
# Prints the element count to STDOUT as a side effect.
# Returns a list of five numbers.
# Dies if the array is empty or if a required element is undefined.
sub fivenum {
    my $array = shift;
    my @x = sort {$a <=> $b} @{ $array };
    my $n = scalar @x;
    printf("There are %u elements.\n", $n);
    if ($n == 0) {
        print "no values were entered into fivenum.\n";
        die;
    }

    # 1-based, possibly fractional positions of the five statistics.
    my $n4 = floor(($n+3)/2)/2;
    my @d = (1, $n4, ($n +1)/2, $n+1-$n4, $n);#d <- c(1, n4, (n + 1)/2, n + 1 - n4, n)

    my @sum_array;
    foreach my $position (@d) {
        # A fractional position averages its two neighbours; a whole one
        # averages the element with itself.
        my $below = floor($position);
        my $above = ceil($position);
        # The diagnostics name the element that was actually looked up.
        if (!defined $x[$below-1]) {
            say "\$x[$below-1] isn't defined.";
            die;
        }
        if (!defined $x[$above-1]) {
            say "\$x[$above-1] isn't defined.";
            die;
        }
        push @sum_array, (0.5 * ($x[$below-1] + $x[$above-1]));
    }
    return @sum_array;
}
# Sample data: print its five-number summary as one comma-separated line.
my @x = qw(
     0.14082834  0.09748790  1.73131507  0.87636009 -1.95059594
     0.73438555 -0.03035726  1.46675970 -0.74621349 -0.72588772
     0.63905160  0.61501527 -0.98983780 -1.00447874 -0.62759469
     0.66206163  1.04312009 -0.10305385  0.75775634  0.32566578
);
my @y = fivenum(\@x);
print join(',', @y), "\n"; </lang>
- Output:
-1.95059594,-0.676741205,0.23324706,0.746070945,1.73131507
Perl 6
<lang perl6>sub fourths ( Int $end ) {
    # 0-based, possibly fractional positions of the five statistics in a
    # sorted list whose last index is $end.
    # `div` and `/` share precedence and associate left: ($end div 2) / 2.
    my $quarter = $end div 2 / 2;

    return 0, $quarter, $end/2, $end - $quarter, $end;
}

sub fivenum ( @nums ) {
    # Five-number summary of @nums; dies when @nums is empty.
    my @sorted = @nums.sort(+*) or die 'Input must have at least one element';

    my @pos = fourths(@sorted.end);

    # Average the elements just below and just above each position; for a
    # whole-number position both are the same element.
    return ( @sorted[@pos».floor] Z+ @sorted[@pos».ceiling] ) »/» 2;
}
# Print the five-number summary of each sample list.
.&fivenum.say for
    [15, 6, 42, 41, 7, 36, 49, 40, 39, 47, 43],
    [36, 40, 7, 39, 41, 15],
    [
         0.14082834,  0.09748790,  1.73131507,  0.87636009, -1.95059594,
         0.73438555, -0.03035726,  1.46675970, -0.74621349, -0.72588772,
         0.63905160,  0.61501527, -0.98983780, -1.00447874, -0.62759469,
         0.66206163,  1.04312009, -0.10305385,  0.75775634,  0.32566578,
    ]; </lang>
- Output:
(6 25.5 40 42.5 49) (7 15 37.5 40 41) (-1.95059594 -0.676741205 0.23324706 0.746070945 1.73131507)
R
The commented lines are from R source code. This is extremely easy to execute in R.
Notice that this function, being part of the R source code, is covered by the GNU General Public License version 2.
<lang R>
#> fivenum
#function (x, na.rm = TRUE)
#{
#    xna <- is.na(x)
#    if (any(xna)) {
#        if (na.rm)
#            x <- x[!xna]
#        else return(rep.int(NA, 5))
#    }
#    x <- sort(x)
#    n <- length(x)
#    if (n == 0)
#        rep.int(NA, 5)
#    else {
#        n4 <- floor((n + 3)/2)/2
#        d <- c(1, n4, (n + 1)/2, n + 1 - n4, n)
#        0.5 * (x[floor(d)] + x[ceiling(d)])
#    }
#}
#<bytecode: 0x7fd0db42a7b8>
#<environment: namespace:stats>
x <- c(0.14082834, 0.09748790, 1.73131507, 0.87636009, -1.95059594, 0.73438555,-0.03035726, 1.46675970, -0.74621349, -0.72588772, 0.63905160, 0.61501527, -0.98983780, -1.00447874, -0.62759469, 0.66206163, 1.04312009, -0.10305385, 0.75775634, 0.32566578)
> fivenum(x)
[1] -1.9505959 -0.6767412 0.2332471 0.7460709 1.7313151
</lang>
Stata
First build a dataset to summarize:
<lang stata>clear
set seed 17760704
qui set obs 10000
gen x=rnormal()</lang>
The summarize command produces all the required statistics, and more:
<lang stata>* Run summarize quietly; the displayed values are read back from r().
qui sum x, detail
* Each command needs its own line: show minimum, quartiles and maximum.
di r(min),r(p25),r(p50),r(p75),r(max)</lang>
Output
-3.6345866 -.66536 .0026834 .68398139 3.7997103
It's also possible to use the tabstat command
<lang stata>* One-table alternative; the output lists min, p25, p50, p75 and max for x.
tabstat x, s(mi q ma)</lang>
Output
    variable |       min       p25       p50       p75       max
-------------+--------------------------------------------------
           x | -3.634587   -.66536  .0026834  .6839814   3.79971
----------------------------------------------------------------
zkl
Uses GNU GSL library.
<lang zkl>var [const] GSL=Import("zklGSL");   // libGSL (GNU Scientific Library)
fcn fiveNum(v){   // V is a GSL Vector, --> min, 1st qu, median, 3rd qu, max
   v.sort();   // sort the vector before taking quantiles
   return(v.min(),v.quantile(0.25),v.median(),v.quantile(0.75),v.max())
}</lang>
<lang zkl>fiveNum(GSL.VectorFromData(
   15.0, 6.0, 42.0, 41.0, 7.0, 36.0, 49.0, 40.0, 39.0, 47.0, 43.0)).println();
println(fiveNum(GSL.VectorFromData(36.0, 40.0, 7.0, 39.0, 41.0, 15.0)));

v:=GSL.VectorFromData(
   0.14082834, 0.09748790, 1.73131507, 0.87636009, -1.95059594, 0.73438555, -0.03035726, 1.46675970, -0.74621349, -0.72588772, 0.63905160, 0.61501527, -0.98983780, -1.00447874, -0.62759469, 0.66206163, 1.04312009, -0.10305385, 0.75775634, 0.32566578);
println(fiveNum(v));</lang>
- Output:
L(6,25.5,40,42.5,49) L(7,20.25,37.5,39.75,41) L(-1.9506,-0.652168,0.233247,0.740228,1.73132)