-
Notifications
You must be signed in to change notification settings - Fork 6
/
stats
executable file
·824 lines (753 loc) · 34.2 KB
/
stats
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
#!/usr/bin/env perl
# by Harry Mangalam, [email protected].
# after significant changes, update the tarballs that need it and cp to moo for distribution; update the scut github
# export filename="/home/hjm/bin/stats"; scp ${filename} moo:~/public_html; scp ${filename} moo:~/bin; scp ${filename} bridgit:~/bin; ssh bridgit 'scp ~/bin/stats hmangala@hpc12:~/bin'
# cd ~/gits/scut; cp ~/bin/stats .; git add stats; git commit -m 'commit message'; git push
# TODO add QQ plot to test for normality?
# [DONE] add required sample size estimate - see:
# https://sphweb.bumc.bu.edu/otlt/mph-modules/bs/bs704_power/bs704_power_print.html
use strict;
use Getopt::Long;
use vars qw( $wide $dist $Xsize $Ysize $ln $N $sum $Min $Max $XYDist @Data $pager
$XRange $XBinSize $XMul @SData $NWH $Median $even $Mean $SumDiffs2 $SumDiffs3
$SumDiffs4 $ValCnt $Val $MaxSoFarValCnt $ModeInd @Dist $jmin $jmax $J $YMax
$YBinSize $YMul @XYDist $ModeNum $Mode $S2 $S $Kurtosis $SEM $Skew $StdSkew
$gfmt $VERSION $DATE $HELPFILE $HELP $ConfIntLow $ConfIntHi $QUIET $stdout
%xfhash $xf $id $od $raw $QUANTILE $cutindex $cutvalue @QUANT $QUANTSTR
$ONLY $pSUM $pNUM $pMEAN $pMEDIAN $pMODE $pNMODES $pMIN $pMAX $pRANGE $pVAR
$pSD $pSEM $p95C $pSKEW $pSTDSKEW $pKURTOSIS $pQUANTILE $pQindex $PopKurtosis
$pPopKURTOSIS $pSAMPLE @pr %OUT $MINLIMIT $MAXLIMIT $CAPTURE %ConfMult $pCONF $CONFINT
$pConfindex $MOE $SampleSz $ALL $PVER
);
$VERSION = "2.1.4 April Fool"; # adds sample size estimation
$DATE = "Apr 1, 2023";
$ONLY = 0; # def if want specific outputs ONLY
if (!defined $ENV{'PAGER'}) {$pager = "less";};
$gfmt = 1;
$NWH = 0;
$stdout = 0;
$pQUANTILE = 0;
$pQindex = 100000; # outside possible range.
$MINLIMIT = -1e32;
$MAXLIMIT = 1e32;
$CAPTURE = 0;
$dist=0; # no plot is default
$ALL = 0; # by default print nothing.
%xfhash = (
'log10' => 1, 'ln' => 1, 'sqrt' => 1, 'x^2' => 1, 'x^3' => 1,
'1/x' => 1, 'sin' => 1, 'cos' => 1, 'tan' => 1, 'asin' => 1,
'acos' => 1, 'atan' => 1, 'round' => 1, 'trunc' => 1, 'frac' => 1,
'abs' => 1, 'exp' => 1, 'pass' => 1, '' => 1
);
%ConfMult = ( # see: https://www.mathsisfun.com/data/confidence-interval.html
'80' => 1.282, '85' => 1.440, '90' => 1.645, '95' => 1.960, '99' => 2.576,
'99.5' => 2.807, '99.9' => 3.291
);
&GetOptions(
"all!" => \$ALL, # print everything, or as much as possible
"wide!" => \$wide, # no args - just set to 1
"dist=i" => \$dist, # 1 for 1-liner, 2 for xy plot with the following vars, 3 for both
"version!"=> \$PVER, # print version and die
"x=i" => \$Xsize, # the # of characters in the X axis
"y=i" => \$Ysize, # the # of lines in the Y axis
"help!" => \$HELP, # ask for help
"h!" => \$HELP,
"quiet!" => \$QUIET, # shhhhh!
"nwh!" => \$NWH, # No Wide Headers (if repeating wide mode, don't want headers)
"stdout!" => \$stdout, # just print, don't do stats on the numbers
"xf=s" => \$xf, # do a transform of the #s before doing anything.
"gfmt!" => \$gfmt, # set to Perl's 'general' numeric notation; leave alone so interface doesn't change
"raw!" => \$raw, # set to see raw Perl notation (not gfmt, now the default)
"id=s" => \$id, # input delimiter
"od=s" => \$od, # output delimiter
"minlimit=i" => \$MINLIMIT, # filter values < this
"maxlimit=i" => \$MAXLIMIT, # filter values > this
"quantile=i" => \$QUANTILE, # number of quantiles to calculate
"conf=s" => \$CONFINT, # confidence interval, default 95%, as below
"sample=s" => \$MOE, # estimated Margin Of Error for expt.
"capture!" => \$CAPTURE, # capture filtered values in './stats.capture', overwritten each time
"sum!" => \$pSUM, # for printSUM, etc
"num!" => \$pNUM, #
"mean!" => \$pMEAN, #
"median!" => \$pMEDIAN, #
"mode!" => \$pMODE, #
"nmodes!" => \$pNMODES, #
"min!" => \$pMIN, #
"max!" => \$pMAX, #
"range!" => \$pRANGE, #
"var!" => \$pVAR, #
"sd!" => \$pSD, #
"sem!" => \$pSEM, #
# "conf!" => \$pCONF, #
"skew!" => \$pSKEW, #
"stdskew!" => \$pSTDSKEW, #
"kurt!" => \$pKURTOSIS, # https://brownmath.com/stat/shape.htm#KurtosisCompute
"popkurt" => \$pPopKURTOSIS, #
"qq!" => \$pQUANTILE, # quantile
);
if ($PVER) {print "Version $VERSION, last modified $DATE";}
if ($wide) {$ALL = 1;}
if (defined $CONFINT) {
$CONFINT = trim($CONFINT); $CONFINT =~ tr/%//d;
$pCONF = 1; # if define it, want to print it.
if (!defined $ConfMult{$CONFINT}){
die("The Confidence Interval you specified [$CONFINT] isn't supported.
Supported values are 80, 85, 90, 95, 99, 99.5, 99.9");
}
} else { $CONFINT = "95";}
if (defined $MOE) { # can be any number like string.
if ($MOE !~ /\d+(\.\d+)?/) {
print "The Margin Of Error value [$MOE] you gave for --sample doesn't look like a number to me. Try again?\n"; exit;
} else {$pSAMPLE = 1;}
}
if (defined $QUANTILE) {$pQUANTILE = 1;} # if define, want to print it.
# then run thru the array els and incr $ONLY for each key.
if ($pSUM){$pr[$ONLY++] = "Sum";}
if ($pNUM){$pr[$ONLY++] = "Number";}
if ($pMEAN){$pr[$ONLY++] = "Mean";}
if ($pMEDIAN){$pr[$ONLY++] = "Median";}
if ($pMODE){$pr[$ONLY++] = "Mode";}
if ($pNMODES){$pr[$ONLY++] = "NModes";}
if ($pMIN){$pr[$ONLY++] = "Min";}
if ($pMAX){$pr[$ONLY++] = "Max";}
if ($pRANGE){$pr[$ONLY++] = "Range";}
if ($pVAR){$pr[$ONLY++] = "Variance";}
if ($pSD){$pr[$ONLY++] = "Std_Dev";}
if ($pSEM){$pr[$ONLY++] = "SEM";}
if ($pCONF){$pConfindex = $ONLY; $pr[$ONLY++] = "Conf_Int";}
if ($pSKEW){$pr[$ONLY++] = "Skew";}
if ($pSTDSKEW){$pr[$ONLY++] = "Std_Skew";}
if ($pKURTOSIS){$pr[$ONLY++] = "Kurtosis";}
if ($pPopKURTOSIS){$pr[$ONLY++] = "PopKurt";}
if ($pSAMPLE){ $pr[$ONLY++] = "SampleSz";}
if ($pQUANTILE){$pQindex = $ONLY; $pr[$ONLY++] = "Quantiles";}
# $ONLY is now pointing past the end of the array
if ($CAPTURE) {open (CAP, "> ./stats.exceptions") or die "Can't open filter file [./stats.filtered]"; }
if ($raw) { $gfmt = 0; } # set to print raw, NOT in perl native format
# print indiv results by using $pr[x] as header and $OUTPUT{"$pr[x]"} as the value.
# perltidy cmd to format uniformly: perltidy -ce -i=2 -l=100 stats
# 06.02.2021 fixed bug that disabled correct (single) outputs if all input numbers were the same
# 11.16.2020 added variable quantiles, variable confidence intervals.
# 10.26.2020 added quantiles, variable printing.
# 11.28.2017 added xf=pass for simple text/data filtering.
# 11.10.2017 added transforms (--xf) & --stdout so stats can be used as an inline transform
# 06.15.2017 added 95% confidence intervals
# 03.12.2014 added '--quiet' to silence non-fatal warnings.
# 01.11.12 added 'general numeric format, replacing strict sci notation
# 05.01.08 added comma removal after embarrassing conversation with credit card company
# 7.14.06 add --sci to format for scientific notation output. ie:
# --sci
# 2.05.01 added Distribution calc/graph
# 2.01.01 format change to ease integration (Mode, NMode# split onto separate lines)
# Made Labels single words and unambiguous for easier grepping
# 11.10.00 added wide printing (--wide)
# 9.27.00 adding check for included non-numbers
# 4.21.00 adding check for FLAT mode, modecount =1
# 11.24.99 adding Mode, Mode count, Median to output.
$N = 0;
$sum = 0;
$Min = $Max = 0;
# handle input and offer help if none.
if (-t STDIN) {
if ($HELP) {usage()}
else {
print "\n[$0] will emit descriptive statistics based on
all number-like input fed it on STDIN. Use '-h' for more help.\n";
}
exit 0;
}
# define undefined vars
if (!$xfhash{$xf}){die "ERROR: I don't support that transform; try again or see the help (-h)\n"; }
if (!defined $Xsize) { $Xsize = 60; }
if (!defined $Ysize) { $Ysize = 25; }
if (!defined $id) {$id = "\\s+";}
if (!defined $od) {$od = "\t";}
if (!defined $QUANTILE) {$QUANTILE=5;}
#Zero the DIST array
for (my $x=0; $x<$Xsize; $x++) {
for (my $y=0; $y<$Ysize; $y++) {
$XYDist[$x][$y] = ' ';
}
}
# main loop to ingest data
while (<>) {
$_ = trim($_);
my $x = my @arr = split /$id/;
for (my $i = 0; $i < $x; $i++) {
# make sure all the things we're including are number-like
# remove commas to prevent rejection downstream
$arr[$i] =~ s/,//g;
# previous slightly shaky regex
# if (($arr[$i] =~ /\d+|\d*\.\d*|\d+\.\d*[eE]-?\d+/) &&
# ($arr[$i] !~ /[a-df-zA-DF-Z\[\]]+/) ) {
# stolen from https://docstore.mik.ua/orelly/perl4/cook/ch02_02.htm, 2.1.3
if ($arr[$i] =~/^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/) {
if ($arr[$i] > $MINLIMIT && $arr[$i] < $MAXLIMIT) {
#$Data[$N++] = $arr[$i]; # store them for calcing the SD, etc
if (defined $xf) { # want to exec a transform; already checked that its supported.
# some of these are direct maps to perl-supplied functions; others have to be munged.
my $v = $arr[$i];
if ($xf eq "pass") { $arr[$i] = $v} # simply a filtering and passthru.
elsif ($xf eq "log10") { if ($v > 0) { $arr[$i] = log($v) / log(10);} else { $arr[$i] = "NA";} }
elsif ($xf eq "ln") { if ($v > 0) { $arr[$i] = log($v);} else { $arr[$i] = "NA";} }
elsif ($xf eq "sqrt") { if ($v >= 0) { $arr[$i] = sqrt($v);} else { $arr[$i] = "NA";} }
elsif ($xf eq "x^2") { $arr[$i] *= ($v)}
elsif ($xf eq "x^3") { $arr[$i] = ($v)*($v)*($v)}
elsif ($xf eq "1/x") { $arr[$i] = 1 / $v }
elsif ($xf eq "sin") { $arr[$i] = sin($v) }
elsif ($xf eq "cos") { $arr[$i] = cos($v) }
elsif ($xf eq "tan") { $arr[$i] = sin($v) / cos($v) }
elsif ($xf eq "asin") { $arr[$i] = 1 / sin($v) }
elsif ($xf eq "acos") { $arr[$i] = 1 / cos($v) }
elsif ($xf eq "atan") { $arr[$i] = 1 / tan($v) }
elsif ($xf eq "round") { $arr[$i] = int ($v + 0.5) }
elsif ($xf eq "trunc") { $arr[$i] = int ($v) }
elsif ($xf eq "frac") { $arr[$i] = $v - int($v) }
elsif ($xf eq "abs") { $arr[$i] = abs($v) }
elsif ($xf eq "exp") { $arr[$i] = exp($v) }
}
if ($stdout) {
my $y = $x-1;
if ($gfmt) { printf "%g", $arr[$i]; if ($i<$y) {print "$od"} else {print "\n";} }
else { print "$arr[$i]"; if ($i<$y) {print "$od"} else {print "\n";} }
}
$sum += $arr[$i]; # sum the numbers as they come in
if ($N == 0) { $Min = $Max = $arr[$i]; }
if ($arr[$i] < $Min) { $Min = $arr[$i]; }
if ($arr[$i] > $Max) { $Max = $arr[$i]; }
$Data[$N++] = $arr[$i]; # store them for calcing the SD, etc
} elsif ($CAPTURE) {print CAP "$arr[$i]\n";}
}
}
}
if ($N == 0) {
print STDERR "(stats) ERROR on input, no usable data. Check what you're feeding me.\n";
exit;
}
if (! $stdout) {
# All the numbers sucked in; now calc the values wanted
# autoscale the X axis
$XRange = $Max - $Min;
if ($XRange != 0){
$XBinSize = $XRange/$Xsize;
$XMul = $Xsize/$XRange;
} else {
$XBinSize = -1;
$XMul = -1;
}
# if want to get mode, median, would help to sort $Data
@SData = sort {$a <=> $b} @Data;
if ($N % 2 < 0.001) {
#then $N is even and we can calc median via...
$Median = ($SData[($N-1)/2] + $SData[(($N-1)+2)/2]) / 2;
$even = 1;
} else {
# then $N is odd and we can calc median via...
$Median = ($SData[($N+1)/2]) ;
$even = 0;
}
$Mean = $sum / $N;
$SumDiffs2 = 0;
$SumDiffs3 = 0;
$SumDiffs4 = 0;
$MaxSoFarValCnt = 0;
$ModeInd = 0;
$ValCnt = 0;
$Val = $SData[0];
#init Distribution array
for (my $i=0; $i<$Xsize; $i++){ $Dist[$i] = 0; }
$jmin = $jmax = 0;
for (my $i=0; $i < $N; $i++){
$SumDiffs2 = $SumDiffs2 + (($Data[$i] - $Mean)**2);
$SumDiffs3 = $SumDiffs3 + (($Data[$i] - $Mean)**3);
$SumDiffs4 = $SumDiffs4 + (($Data[$i] - $Mean)**4);
# this next stanza calculates the Mode pointer
if ($Val == $SData[$i]) {
# if its another of the same #, incr the counters
$ValCnt++;
$Val = $SData[$i];
} else { # it's a new value, so check if the run of the last set of #s
# exceeds the longest so far
if ($ValCnt > $MaxSoFarValCnt) {
# and if so, replace the old values with the new 'winners'
$MaxSoFarValCnt = $ValCnt;
$ModeInd = $i-1;
}
$ValCnt = 0; # and reset the counters for the new
}
$Val = $SData[$i];
# calc the distribution
if ($XMul > 0) {
$J = int($Data[$i] * $XMul);
if ($J < $jmin) { $jmin = $J; }
if ($J > $jmax) { $jmax = $J; }
$Dist[$J]++; # range of Dist should be close to $Xsize
} #else {print "\nErr: All #s same, no range, no distribution\n";}
}
#Scale the Y axis; 1st find out the range for Y
$YMax = 0;
for (my $i=0; $i<$jmax; $i++) {
if (abs($Dist[$i]) > $YMax) { $YMax = abs($Dist[$i]);}
}
if ($YMax == 0) { $YMax = 1;}
$YBinSize = $YMax/($Ysize-1);
#print "\nYMax = $YMax\n";
$YMul = ($Ysize-1) / $YMax;
for (my $x=0; $x<$Xsize; $x++) {
my $y = int($Dist[$x] * $YMul);
$XYDist[$x][$y] = '*';
}
## Calc Quantiles
# not providing min/max values since they're already printed
# fill the array, create a string; no printing here.
if (defined $QUANTILE){
$QUANTSTR = "";
$cutindex = 0;
for (my $q=1; $q<$QUANTILE; $q++) {
my $inc = int(($N * ($q/$QUANTILE)) + 1);
$cutindex = $inc - 1; #
my $cutvalue = $SData[$cutindex];
$QUANT[$q][0] = $cutindex; $QUANT[$q][1] = $cutvalue;
$QUANTSTR .= "$q\t$cutindex\t$cutvalue\n";
}
}
if ($MaxSoFarValCnt > 1) {
$ModeNum = $MaxSoFarValCnt + 1;
$Mode = $SData[$ModeInd];
} else {
$ModeNum = "No # was represented more than once";
$Mode = "FLAT";
}
# set up the @OUT hash
$OUT{"Sum"} = $sum;
$OUT{"Number"} = $N;
$OUT{"Mean"} = $Mean;
$OUT{"Median"} = $Median;
$OUT{"Mode"} = $Mode;
$OUT{"NModes"} = $ModeNum;
$OUT{"Min"} = $Min;
$OUT{"Max"} = $Max;
$OUT{"Range"} = $XRange;
$OUT{"Variance"} = $S2 = $SumDiffs2 / ($N - 1);
$OUT{"Std_Dev"} = $S = sqrt($S2);
## Check this formula (this looks like it's normalized to 0; should be 3
## and also add the population-normalized Kurtosis Or not..? Kurtosis is
## a pretty useless measure.
$OUT{"SEM"} = $SEM = $S / sqrt($N);
if ($S > 0 && $N > 3) {
$OUT{"Skew"} = $Skew = ($N * $SumDiffs3) / (($N-1) * ($N-2) * ($S ** 3));
$OUT{"Std_Skew"} = $StdSkew = $Skew / sqrt(6/$N);
$ConfIntHi = $Mean + ($ConfMult{$CONFINT} * ($S/sqrt($N)));
$ConfIntLow = $Mean - ($ConfMult{$CONFINT} * ($S/sqrt($N)));
$OUT{"Conf_Int"} = sprintf("%g:%g",$ConfIntLow,$ConfIntHi);
$OUT{"Quantiles"} = $QUANTSTR;
if (defined $MOE) { # calc and round up by one.
$SampleSz = int (((($ConfMult{$CONFINT} * $S) / $MOE)**2) + 1.49999999);
} else { undef $SampleSz; } # $SampleSz = '';
$OUT{"SampleSz"} = $SampleSz;
}
# only try to calculate this if $XMul > 0 (otherwise div by zero)
if ($XMul > 0) {
$OUT{"Kurtosis"} = $Kurtosis = $SumDiffs4 / ($N * $S**4);
# was $OUT{"Kurtosis"} = $Kurtosis = ($SumDiffs4 / (($N-1)*($S**4))) - 3;
if ($N > 3) {
$OUT{"PopKurt"} = $PopKurtosis = ($N * ($N+1)) / (($N-1) * ($N-2) * ($N-3)) * $Kurtosis - ((3*($N-1)^2) / (($N-2) * ($N-3)));
} else {$OUT{"PopKurt"} = "UNDEF";}
}
if (($ALL || !$ONLY) && !$wide && $gfmt == 0) { # don't include in golang version
print "Sum $sum",
"\nNumber $N",
"\nMean $Mean",
"\nMedian $Median",
"\nMode $Mode ",
"\nNModes $ModeNum",
"\nMin $Min",
"\nMax $Max",
"\nRange $XRange",
"\nVariance $S2",
"\nStd_Dev $S",
"\nSEM $SEM",
"\n${CONFINT}% Conf $ConfIntLow to $ConfIntHi",
"\n [for a normal distribution (ND) - see Skew & Kurtosis]",
"\nQuantiles ($QUANTILE)\n\tIndex\tValue\n$QUANTSTR";
} elsif (($ALL || !$ONLY) && !$wide && $gfmt > 0) {
printf "Sum %g",$sum;
printf "\nNumber %g",$N;
printf "\nMean %g",$Mean;
printf "\nMedian %g",$Median;
printf "\nMode %g",$Mode;
printf "\nNModes %g",$ModeNum;
printf "\nMin %g",$Min;
printf "\nMax %g",$Max;
printf "\nRange %g",$XRange;
printf "\nVariance %g",$S2;
printf "\nStd_Dev %g",$S;
printf "\nSEM %g",$SEM;
printf "\n%d% Conf %g to %g", $CONFINT, $ConfIntLow, $ConfIntHi;
print "\n [for a normal distribution (ND) - see skew]";
print "\nQuantiles ($QUANTILE)\n\tIndex\tValue\n$QUANTSTR";
}
if (($ALL || !$ONLY) && $S > 0 && $N > 3 && !$wide) {
if (!$gfmt) {
print "Skew $Skew",
"\n [Skew=0 for a symmetric dist]",
"\nStd_Skew $StdSkew",
"\nKurtosis $Kurtosis",
"\n [Kurtosis=3 for a normal dist (ND)]",
"\nPopKurt $PopKurtosis",
"\n [Pop'n Kurtosis is normalized to sample size; PK=0 for a ND]";
if (defined $MOE ) { # && $ONLY == 1
print "\nSampleSz $SampleSz\n";
} else {print "\n";}
} else {
printf "Skew %g", $Skew;
print "\n [Skew=0 for a symmetric dist]";
printf "\nStd_Skew %g", $StdSkew;
printf "\nKurtosis %g", $Kurtosis;
print "\n [Kurtosis=3 for a ND]";
printf "\nPopKurt %g", $PopKurtosis;
print "\n [Pop'n Kurtosis is normalized to sample size; PK=0 for a ND]";
if (defined $MOE) {
printf "\nSampleSz %g\n", $SampleSz;
} else {print "\n" ;}
}
} elsif (!$ONLY && !$QUIET) {
print STDERR "(stats): #Std Dev = 0 or N <=3 or printing wide; Skipping all Skewness & Kurtosis cal'ns.\n";
}
if (($ALL || !$ONLY) && $wide) {
if (!$NWH){
print "# (Quantiles not printed in wide mode.)
#Sum\tN\tMean\tMedian\tMode\tNModes\tMin\tMax\tRange\tVariance\tStd_Dev\tSEM\t95%L\t95%H\tSkew\tStd_Skew\tKurtosis\tSampleSz\n";
}
if ($gfmt) {
printf "%g\t%g\t%g\t%g\t%g\t%g\t%g\t%g\t%g\t%g\t%g\t%g\t%g\t%g", $sum,$N,$Mean,$Median,$Mode,$ModeNum,$Min,$Max,$XRange,$S2,$S,$SEM,$ConfIntLow,$ConfIntHi;
} else {
print "$sum\t$N\t$Mean\t$Median\t$Mode\t$ModeNum\t$Min\t$Max\t$XRange\t$S2\t$S\t$SEM\t$ConfIntLow\t$ConfIntHi";
}
if ($S > 0 && $N > 3) {
if ($gfmt) {
printf "\t%g\t%g\t%g\t%g\t%g\n", $Skew,$StdSkew,$Kurtosis,$PopKurtosis,$SampleSz;
} else { print "\t$Skew\t$StdSkew\t$Kurtosis\t$PopKurtosis\t$SampleSz\n";}
} elsif (! $QUIET) {
print STDERR "(stats): NA\tNA\nStd Dev = 0 or N <=3; Skipping all Skewness & Kurtosis cal'ns.\n";
}
}
# print out the distribution
# this way prints it out in 1 line
if ($dist == 1 || $dist == 3) {
for (my $r=0; $r<($Xsize); $r++) {
if ($Dist[$r] < 10) { print $Dist[$r]; }
else { print "($Dist[$r])"; }
}
}
# if used only 1 option, print out the single number requested
#print __LINE__, " : ONLY=[$ONLY], MOE=[$MOE]\n";
if ($ONLY == 1) {
if ($pQUANTILE) {
printf "Quant\tIndex\tValue\n%s", $OUT{$pr[0]};
} elsif ($pCONF) {
printf "%s%s",$OUT{$pr[0]},$od;
} else {
if ( ($ALL == 0 && $MOE > 0) || !defined $MOE) {
if ($gfmt) { printf "%g\n",$OUT{$pr[0]}; }
else { printf "%d\n",$OUT{$pr[0]}; } # raw
}
}
} elsif ($ONLY > 1) {
print "#";
for (my $r=0;$r<=$#pr; $r++) {
printf "%12s%s", $pr[$r],$od;
} print "\n";
for (my $r=0;$r<=$#pr; $r++) {
if ($r == $pConfindex) { printf "%s%s",$OUT{$pr[$r]},$od; }
elsif ($r != $pQindex) {
if ($gfmt) {printf "%12g%s",$OUT{$pr[$r]},$od;}
else {printf "%12d%s",$OUT{$pr[$r]},$od;}
} elsif ($pQUANTILE) {
printf "(quantiles next lines)\nQuant\tIndex\tValue\n%s",$OUT{$pr[$r]},$od;
}
} print "\n";
}
# if used selective options, print out the selected bits
# omit from golang version
# this way prints a little xy graph, if wanted
my $spacer = "";
for (my $x=0; $x<($Xsize - 14); $x++) { $spacer = $spacer . " ";}
if ($XBinSize > 0) {
if ($dist == 2 || $dist == 3) {
print "\n\nDistribution\nX BinSize $XBinSize\nY BinSize $YBinSize\n\nYMax:$YMax\n |";
for (my $y=($Ysize-1); $y>=0; $y--) {
for (my $x=0; $x<$Xsize; $x++) { print "$XYDist[$x][$y]"; }
print "\n |";
}
for (my $x=0; $x<$Xsize; $x++) { print '-';}
print "\n X Min $spacer X Max\n";
printf "%7.2f %s %12.2f \n", $Min, $spacer, $Max;
if (!$ln) {
print "\nIf points are jammed at one end, use '--xf=ln' to spread them.\n";
}
}
} elsif ($dist != 0) {
print STDERR "\n(stats): Identical numbers in input; no range, no distribution\n";
}
}
sub trim($) {
my $string = shift;
$string =~ s/^\s+//;
$string =~ s/\s+$//;
return $string;
}
sub usage {
my $LESSHELP = <<HELP;
stats version: $VERSION, last mod: $DATE
Harry Mangalam <hjmangalam\@gmail.com>
Lives with 'scut' and 'cols' here: <https://github.com/hjmangalam/scut>
'stats' is a utility that reads STDIN for all #s, whether in one line
or in many (removes commas, checks for text contaminants; only have to
be separated by whitespace), calculates some basic stats, then spits
them to STDOUT. Starting from this version, the default is to format the
test in Perl's General Numeric Format (gfmt), altho you can ask for the raw
output with '--raw'.
New in 2.0.8, you can ask for specific output with the options listed below.
If you ask for a single output '--sum', it will be emitted as a single,
naked number. If you ask for multiple specific outputs, they will
have a #-prefixed line of headers above the numbers, roughly aligned and
separated by the output delimiter ('--od', by default a <tab>).
If you don't ask for specific stats, all will be emitted as shown in the
example stanza below.
Starting in V2.x, you can choose from a small set of transforms (see
below) to be applied to the data and then emitted with '--stdout'
(thus using stats as a transform filter) or have the stats applied
to that transformed data.
usage: stats < file.of.numbers
or
cmd1 | cmd2 |cmd3 | stats [options]
where Options are:
-all ................. print all the variables; either this or a
specific statistic is required for output
--help ....................................... dumps this help
--nwh ...... 'No Wide Headers' (no headers on wide output output
good for repeating output as for logging stuff)
--xf="fn" ..... to transform STDIN before doing the stats, where
"fn" is one of log10, ln, sqrt, x^2, x^3, 1/x,
sin, cos, tan, asin, acos, atan, round, abs, exp,
pass(thru), trunc (integer part), frac (decimal part).
--stdout ....... JUST print STDIN to STDOUT (don't do any stats)
(--xf also applies to STDOUT)
--id="token" .... Input delimiter; defaults to whitespace (\\s+)
--od="token" ........... Output delimiter; defaults to tab (\\t)
--minlimit=# .............. sets a MINIMUM limit to filter input
--maxlimit=# .............. sets a MAXIMUM limit to filter input
above 2 lines also set the filters
for the transforms (--xf, see above)
--capture ...... captures the exceptions from the above 2 limits
to file 'stats.exceptions' in the cwd
--conf=# ............... the confidence interval of the set; one
of 80, 85, 90, 95, 99, 99.5, or 99.9. If the confidence
interval is smaller, the range of values will of course be
be larger. Assumes normal distribution - see warnings.
--sample=# ....... returns the sample size to reach significance
set by '--conf' (above) when given a Margin Of Error (MOE)
requirement, given the input data set as a population estimate.
This means: if the input data is representative of the sample
population, at a the given confidence interval & the MOE you
give, it will return the minimum sample size required.
--gfmt ............. output in Perl's 'general numeric notation'
(now default)
--raw ............................... unformatted numeric output
--quiet ...................... decrease amount of error messages
--wide . writes the stats in 1-line, useful for some spreadsheet
apps. Omits Quantiles. Use cols/column/columns to view
aligned columns. See example below..
--dist=# .................. plots a distribution function where:
# = 1 = 1-liner distribution
2 = the std xy plot of the data
3 = both 1 liner & longer version
Strong advice: pipe '--stdout' into 'feedgnuplot'
for much better plotting)
--x=# ..................where # = an integer indicating the # of
characters in the X axis
--y=# ..................................... ditto for the y axis
## the following options specifically request the related values
--sum ........................... the total of all input values
--num ............................... the number of input values
--mean ............................... the average of all values
--median ...................the approx value above & below which
there are the same number of values.
--mode ......................... the most frequently seen value
--nmodes ................................... the number of modes
--min ............................. the minimum value in the set
--max ............................. the maximum value in the set
--range .............................................. max - min
--var .................................. the variance of the set
--sd .............................. the std deviation of the set
--sem ..................... the std error of the mean of the set
--skew ............ an estimate of the non-normalilty of the set
a normal distribution will have skew = 0
--stdskew ............... skew normalized to the size of the set
--kurt ........ kurtosis is a measure of the 'tailedness' of the
distribution. A normal distribution will have
kurtosis = 3.
--popkurt ............. kurtosis normalized to the sample size &
adjusted to be zero for a normal distribution
--quantile=# ........ sets the number of quantiles to calculate
--qq ..... prints the quantiles requested by the '--quantile=#'
option above. Outputs 3 values: the # of quantile,
the index of the sorted array holding the values,
and the value of the array at that index.
The quantile output can't easily be put in 1 line,
so it spills over into multiple lines.
--sample=# ..................... see above for what this returns
Performance isn't stellar - it's Perl. A rough estimate is about 128K
integers / sec with about 180bytes per integer RAM usage on a Thinkpad
T530 (Intel i5-3210M cpu @ 2.50GHz). Feel free to convert it C.
Example 1:
=======================================================================
Get only the sum and mean of the input data:
\$ cat 'file-of-numbers.txt' | stats --sum --mean
# Sum Mean
1.12697e+11 2.10727e+07
Example 2:
=======================================================================
Get only the sum of the input data:
\$ cat 'file-of-numbers.txt' | stats --sum
1.12697e+11
(note that ONLY the result is printed; no headers)
Example 3:
=======================================================================
Print output wide: (as it appears in 'less')
\$ cat 'file-of-numbers.txt' | stats --wide | cols | less
#Std Dev = 0 or N <=3 or printing wide; Skipping Skewness, Std Skewness cal'n.
1 0 1 2 3 4 5 6 7 8 ...
2 # (Quantiles not printed in wide mode.) - - ...
3 #Sum N Mean Median Mode NModes Min Max Range ...
4 134418 3200 42.0056 41.98 41.94 22 38.33 45.7 7.37 ...
Example 4:
=======================================================================
Calculate the file size distribution in the current directory:
\$ ls -l | awk '{print \$5}' | stats --all --dist=2 --x=20 --y=10
Number 401
Mean 1.10608e+07
Median 10642
Mode 4096
NModes 13
Min 0
Max 1.88495e+09
Range 1.88495e+09
Variance 1.15667e+16
Std_Dev 1.07549e+08
SEM 5.37073e+06
95% Conf 534133 to 2.15874e+07
[for a normal distribution (ND) - see skew]
Quantiles (5)
Index Value
1 80 226
2 160 3400
3 240 21132
4 320 438552
Skew 14.5696
[Skew=0 for a symmetric dist]
Std_Skew 119.109
Kurtosis 237.437
[Kurtosis=3 for a ND]
PopKurt 0.594993
[Pop'n Kurtosis is normalized to sample size; PK=0 for a ND]
** This assumes normal distribution , but since this distribution is
extremely skewed (see above Kurtosis value, and the plot below), the
confidence limits will be incorrect.
(For a web page that calculates more descriptive stats, including
estimation of normality, see: <http://www.xuru.org/st/DS.asp>
For specific plots or analyses, see: <http://www.wessa.net/desc.wasp>
Distribution
X BinSize 94247474.3
Y BinSize 43.8888888888889
YMax:395
|*
|
|
|
|
|
|
|
|
| *******************
|--------------------
X Min X Max
0.00 1884949486.00
If points are jammed at one end, use '--xf=ln' to spread them.
** This assumes normal distribution, but since this distribution is
extremely skewed, the confidence limits will be inaccurate.
(for a web page that calculates more descriptive stats, including
estimation of normality, see: http://www.xuru.org/st/DS.asp
for specific plots or analyses, see: http://www.wessa.net/desc.wasp
=======================================================================
Example 5:
=======================================================================
Calculate the file size distribution in the current directory with the
suggested ln transform.
NB: the stats are calculated with the transformed data.
\$ ls -l | awk '{print \$5}' | stats --xf='ln' --dist=2 --x=20 --y=10
Sum 3697.07
Number 401
Mean 9.21961
Median 9.27256
Mode 8.31777
NModes 13
Min 0
Max 21.3572
Range 21.3572
Variance 17.2889
Std_Dev 4.15799
SEM 0.20764
95% Conf 8.81264 to 9.62659
[for a normal distribution (ND) - see skew]
Quantiles (5)
Index Value
1 80 5.42053499927229
2 160 8.13153071060425
3 240 9.95854375828421
4 320 12.9912336698525
Skew 0.114128
[Skew=0 for a symmetric dist]
Std_Skew 0.933012
Kurtosis 2.61252
[Kurtosis=3 for a ND]
PopKurt -0.000939166
[Pop'n Kurtosis is normalized to sample size; PK=0 for a ND]
Distribution
X BinSize 1.06785834298047
Y BinSize 5.33333333333333
YMax:48
| *
| *
| *
| *
| ** *
| * *
| * * *
| **
|* *
| ****
|--------------------
X Min X Max
0.00 21.36
If points are jammed at one end, use '--xf=ln' to spread them.
= Hint: while the above plotting function is better than nothing,
consider using the excellent 'feedgnuplot' to plot columns of numbers.
ex: scut/cut [options] | feedgnuplot --lines --points
Feel free to add whatever additional calculations you want,
but if you do and you think they might be of general use,
let me know so I can add them to the original.
Help me make it better; send bug reports, suggestions back to the author:
<hjmangalam\@gmail.com>
HELP
$HELPFILE = ".statshelpfile" . $$; # write a hidden helpfile
open(HF, ">$HELPFILE") or die "Can't open helpfile [$HELPFILE] at __LINE__ \n";
print HF $LESSHELP;
close HF;
system("$pager $HELPFILE");
unlink $HELPFILE; # and get rid of it asap
exit 0;
}
exit 0;