-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmanuscript_appendix.tex
897 lines (737 loc) · 41.4 KB
/
manuscript_appendix.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
% Options for packages loaded elsewhere
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
%
\documentclass[
a4paper,
]{article}
\usepackage{amsmath,amssymb}
\usepackage{iftex}
\ifPDFTeX
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
\usepackage{unicode-math} % this also loads fontspec
\defaultfontfeatures{Scale=MatchLowercase}
\defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
\usepackage{lmodern}
\ifPDFTeX\else
% xetex/luatex font selection
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
\usepackage[]{microtype}
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
% Paragraph spacing setup.
% When \KOMAClassName is undefined: load parskip if parskip.sty can be found,
% otherwise set \parindent to 0pt and \parskip to a stretchable 6pt by hand.
% When \KOMAClassName is defined: pass parskip=half to \KOMAoptions instead.
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
\IfFileExists{parskip.sty}{%
\usepackage{parskip}
}{% else
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
\KOMAoptions{parskip=half}}
\makeatother
\usepackage{xcolor}
\usepackage[margin=1in]{geometry}
\usepackage{graphicx}
% \maxwidth expands to \linewidth when \Gin@nat@width exceeds \linewidth and to
% \Gin@nat@width otherwise; \maxheight does the same comparison against
% \textheight using \Gin@nat@height.
% \makeatletter is needed because the \Gin@... names contain @.
\makeatletter
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
% keepaspectratio is set alongside width and height so that both limits can be
% given at once.
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
% (\fps@figure contains @, hence the \makeatletter/\makeatother pair)
\makeatletter
\def\fps@figure{htbp}
\makeatother
\setlength{\emergencystretch}{3em} % prevent overfull lines
% \tightlist is defined only if it does not exist yet (\providecommand);
% it sets both \itemsep and \parskip to 0pt.
\providecommand{\tightlist}{%
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
\setcounter{secnumdepth}{-\maxdimen} % remove section numbering
% definitions for citeproc citations
% \citeproctext takes no arguments and is initially empty.
% \citeproc{<keys>}{<text>} opens a group, locally redefines \citeproctext to
% <text>, calls \cite{<keys>}, and closes the group so the redefinition does
% not leak.
\NewDocumentCommand\citeproctext{}{}
\NewDocumentCommand\citeproc{mm}{%
\begingroup\def\citeproctext{#2}\cite{#1}\endgroup}
\makeatletter
% allow citations to break across lines
\let\@cite@ofmt\@firstofone
% avoid brackets around text for \cite:
% \@biblabel is redefined to print nothing; \@cite prints its first argument
% and, only when \if@tempswa is true, a comma followed by its second argument.
\def\@biblabel#1{}
\def\@cite#1#2{{#1\if@tempswa , #2\fi}}
\makeatother
% Lengths used by the CSL bibliography macros:
% \cslhangindent (1.5em) is the hanging indent applied by CSLReferences,
% \csllabelwidth (3em) is a label width kept alongside it.
\newlength{\cslhangindent}
\setlength{\cslhangindent}{1.5em}
\newlength{\csllabelwidth}
\setlength{\csllabelwidth}{3em}
% CSLReferences{<hanging-indent>}{<entry-spacing>}: a list environment with an
% empty default label and zero \itemindent, \leftmargin and \parsep.
% An odd first argument switches on a hanging indent of \cslhangindent;
% the second argument sets \itemsep in multiples of \baselineskip.
\newenvironment{CSLReferences}[2] % #1 hanging-indent, #2 entry-spacing
{\begin{list}{}{%
\setlength{\itemindent}{0pt}
\setlength{\leftmargin}{0pt}
\setlength{\parsep}{0pt}
% turn on hanging indent if param 1 is 1
\ifodd #1
\setlength{\leftmargin}{\cslhangindent}
\setlength{\itemindent}{-1\cslhangindent}
\fi
% set entry spacing
\setlength{\itemsep}{#2\baselineskip}}}
{\end{list}}
\usepackage{calc}
% Box helpers for CSL bibliography entries.
% \CSLBlock: forces a line break, then sets its argument in a top-aligned
% \parbox of width \linewidth.
\newcommand{\CSLBlock}[1]{\hfill\break\parbox[t]{\linewidth}{\strut\ignorespaces#1\strut}}
% \CSLLeftMargin: top-aligned \parbox of width \csllabelwidth.
\newcommand{\CSLLeftMargin}[1]{\parbox[t]{\csllabelwidth}{\strut#1\strut}}
% \CSLRightInline: top-aligned \parbox taking the remaining width; the
% subtraction inside the width argument is an arithmetic length expression
% (the calc package is loaded just before these definitions).
\newcommand{\CSLRightInline}[1]{\parbox[t]{\linewidth - \csllabelwidth}{\strut#1\strut}}
% \CSLIndent: prefixes its argument with horizontal space of \cslhangindent.
\newcommand{\CSLIndent}[1]{\hspace{\cslhangindent}#1}
\usepackage{amsmath}
\usepackage{booktabs}
% Math operators used in the appendix.
% The starred form typesets sub/superscripts as limits below/above the
% operator in display style (as needed for \argmin_{\lambda, x_0, s}).
% A thin space (\,) separates "arg" from "max"/"min", the conventional
% typesetting of these operators; command names are unchanged.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
% Sign function, used in the Yeo-Johnson log-likelihood.
\DeclareMathOperator{\sgn}{sgn}
\ifLuaTeX
\usepackage{selnolig} % disable illegal ligatures
\fi
\usepackage{bookmark}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
\urlstyle{same}
\hypersetup{
pdftitle={Appendix},
pdfauthor={Alex Zwanenburg, Steffen Löck},
hidelinks,
pdfcreator={LaTeX via pandoc}}
\title{Appendix}
\author{Alex Zwanenburg, Steffen Löck}
\date{2024-10-17}
\begin{document}
\maketitle
\section{Appendix A: Log-likelihood functions for location and scale
invariant power
transformation}\label{appendix-a-log-likelihood-functions-for-location-and-scale-invariant-power-transformation}
Location and scale-invariant Box-Cox and Yeo-Johnson transformations are
parametrised using location \(x_0\) and scale \(s\) parameters, in
addition to transformation parameter \(\lambda\). This leads to the
following transformations. The location and scale-invariant Box-Cox
transformation is:
\begin{equation}
\phi_{\text{BC}}^{\lambda, x_0, s} (x_i) =
\begin{cases}
\left( \left(\frac{x_i - x_0}{s} \right)^\lambda - 1 \right) / \lambda & \text{if } \lambda \neq 0\\
\log\left[\frac{x_i - x_0}{s}\right] & \text{if } \lambda = 0
\end{cases}
\end{equation}
where \(x_i - x_0 > 0\). The location and scale-invariant Yeo-Johnson
transformation is:
\begin{equation}
\phi_{\text{YJ}}^{\lambda, x_0, s} (x_i) =
\begin{cases}
\left( \left( 1 + \frac{x_i - x_0}{s}\right)^\lambda - 1\right) / \lambda & \text{if } \lambda \neq 0 \text{ and } x_i - x_0 \geq 0\\
\log\left[1 + \frac{x_i - x_0}{s}\right] & \text{if } \lambda = 0 \text{ and } x_i - x_0 \geq 0\\
-\left( \left( 1 - \frac{x_i - x_0}{s}\right)^{2 - \lambda} - 1 \right) / \left(2 - \lambda \right) & \text{if } \lambda \neq 2 \text{ and } x_i - x_0 < 0\\
-\log\left[1 - \frac{x_i - x_0}{s}\right] & \text{if } \lambda = 2 \text{ and } x_i - x_0 < 0
\end{cases}
\end{equation}
The parameters of these power transformations can be optimised by
maximising the log-likelihood function, under the assumption that the
transformed feature \(\phi^{\lambda, x_0, s} (\mathbf{X})\) follows a
normal distribution. The log-likelihood functions for conventional
Box-Cox and Yeo-Johnson transformations are well-known. However, the
introduction of scaling parameter \(s\) prevents their direct use. Here,
we first derive the general form of the log-likelihood functions, and
then derive their power-transformation specific definitions.
Let \(f(x_1, \ldots, x_n)\) be the probability density function of
feature \(\mathbf{X} = \{ x_1, \ldots, x_n\}\), and
\(f^{\lambda, x_0, s} (\phi^{\lambda, x_0, s}(x_1), \ldots, \phi^{\lambda, x_0, s}(x_n))\)
be the probability density function of the transformed feature
\(\phi^{\lambda, x_0, s} (\mathbf{X})\), that is assumed to follow a
normal distribution.
The two probability density functions are related as follows:
\begin{equation}
f^{\lambda, x_0, s}(x_1, \ldots, x_n) = f^{\lambda, x_0, s} (\phi^{\lambda, x_0, s}(x_1), \ldots, \phi^{\lambda, x_0, s}(x_n)) \left|\mathbf{J}\right|
\end{equation}
where \(\left|\mathbf{J}\right|\) is the determinant of the Jacobian
\(\mathbf{J}\). The Jacobian takes the following form, with off-diagonal
elements \(0\):
\begin{equation}
\mathbf{J} =
\begin{bmatrix}
\frac{\partial}{\partial x_1} \phi^{\lambda, x_0, s}(x_1) & 0 & \dots & 0 \\
0 & \frac{\partial}{\partial x_2} \phi^{\lambda, x_0, s}(x_2) & \dots & 0 \\
\vdots & \vdots & \ddots & \vdots \\
0 & 0 & \dots & \frac{\partial}{\partial x_n} \phi^{\lambda, x_0, s}(x_n)
\end{bmatrix}
\end{equation}
Thus,
\(\left| \mathbf{J} \right| = \prod_{i=1}^n \frac{\partial}{\partial x_i} \phi^{\lambda, x_0, s}(x_i)\).
Since in our situation \(\{x_1, \ldots, x_n\}\) in
\(f^{\lambda, x_0, s}(x_1, \ldots, x_n)\) are considered fixed (i.e.,
known), \(f^{\lambda, x_0, s}(x_1, \ldots, x_n)\) may be considered a
likelihood function. The log-likelihood function
\(\mathcal{l}^{\lambda, x_0, s}\) is then:
\begin{equation}
\begin{split}
\mathcal{l}^{\lambda, x_0, s} & = \log f^{\lambda, x_0, s}(x_1, \ldots, x_n) \\
& = \log \left[ f^{\lambda, x_0, s} (\phi^{\lambda, x_0, s}(x_1), \ldots, \phi^{\lambda, x_0, s}(x_n)) \right] + \log \left|\mathbf{J}\right| \\
& = \log \left[ f^{\lambda, x_0, s} (\phi^{\lambda, x_0, s}(x_1), \ldots, \phi^{\lambda, x_0, s}(x_n)) \right] + \log \prod_{i=1}^n \frac{\partial}{\partial x_i} \phi^{\lambda, x_0, s}(x_i) \\
& = -\frac{n}{2} \log \left[2 \pi \sigma^2 \right] -\frac{1}{2 \sigma^2} \sum_{i=1}^n \left( \phi^{\lambda, x_0, s}(x_i) - \mu \right)^2 + \sum_{i=1}^n \log \left[ \frac{\partial}{\partial x_i} \phi^{\lambda, x_0, s}(x_i)\right]
\end{split}
\end{equation}
With \(\mu\) the average of \(\phi^{\lambda, x_0, s}(\mathbf{X})\) and
\(\sigma^2\) its variance. The first two terms derive directly from the
log-likelihood function of a normal distribution, and are not specific
to the type of power transformation used. However, the final term
differs between Box-Cox and Yeo-Johnson transformations.
\subsection{Location- and scale-invariant Box-Cox
transformation}\label{location--and-scale-invariant-box-cox-transformation}
For the location- and scale-invariant Box-Cox transformation the partial
derivative is:
\begin{equation}
\begin{split}
\frac{\partial}{\partial x_i} \phi_{\text{BC}}^{\lambda, x_0, s}(x_i) & = \frac{1}{s} \left(\frac{x_i - x_0}{s} \right)^{\lambda-1} \\
& = \frac{1} {s^\lambda} \left(x_i - x_0 \right)^{\lambda - 1}
\end{split}
\end{equation}
Thus the final term in \(\mathcal{l}_{\text{BC}}^{\lambda, x_0, s}\) is:
\begin{equation}
\begin{split}
\sum_{i=1}^n \log \frac{\partial}{\partial x_i} \phi_{\text{BC}}^{\lambda, x_0, s}(x_i) & = \sum_{i=1}^n \log \left[ s^{-\lambda} (x_i - x_0)^{\lambda - 1} \right] \\
& = \sum_{i=1}^n \left( \log \left[s^{-\lambda} \right] + \log \left[ (x_i - x_0)^{\lambda - 1} \right] \right)\\
& = -n \lambda \log s + \left( \lambda - 1 \right) \sum_{i=1}^n \log \left[ x_i - x_0 \right]
\end{split}
\end{equation}
This leads to the following log-likelihood:
\begin{equation}
\begin{split}
\mathcal{l}_{\text{BC}}^{\lambda, x_0, s} = & -\frac{n}{2} \log \left[2 \pi \sigma^2 \right] -\frac{1}{2 \sigma^2} \sum_{i=1}^n \left( \phi^{\lambda, x_0, s}(x_i) - \mu \right)^2 \\
& -n \lambda \log s + \left( \lambda - 1 \right) \sum_{i=1}^n \log \left[ x_i - x_0 \right]
\end{split}
\end{equation}
Similarly to Raymaekers and Rousseeuw (2024), sample weights \(w_i\) are
introduced to facilitate robust power transformations. The weighted
log-likelihood of the location- and scale-invariant Box-Cox
transformation is:
\begin{equation}
\begin{split}
\mathcal{l}_{\text{rBC}}^{\lambda, x_0, s} = & -\frac{1}{2} \left(\sum_{i=1}^n w_i \right) \log \left[ 2 \pi \sigma_w^2 \right] -\frac{1}{2 \sigma_w^2} \sum_{i=1}^n w_i \left( \phi^{\lambda, x_0, s}(x_i) - \mu_w \right)^2 \\
& - \lambda \left( \sum_{i=1}^n w_i \right) \log s + \left( \lambda - 1 \right) \sum_{i=1}^n w_i \log \left[ x_i - x_0 \right]
\end{split}
\end{equation}
where \(\mu_w\) and \(\sigma^2_w\) are the weighted mean and weighted
variance of the Box-Cox transformed feature
\(\phi_{\text{BC}}^{\lambda, x_0, s} (\mathbf{X})\), respectively:
\begin{equation}
\sigma_w^2 = \frac{\sum_{i=1}^n w_i \left(\phi_{\text{BC}}^{\lambda, x_0, s} (x_i) - \mu_w \right)^2}{\sum_{i=1}^n w_i} \quad \text{with } \mu_w = \frac{\sum_{i=1}^n w_i \phi_{\text{BC}}^{\lambda, x_0, s} (x_i)} {\sum_{i=1}^n w_i}
\end{equation}
\subsection{Location- and scale-invariant Yeo-Johnson
transformation}\label{location--and-scale-invariant-yeo-johnson-transformation}
For the location- and scale-invariant Yeo-Johnson transformation, the
partial derivative is:
\begin{equation}
\frac{\partial}{\partial x_i} \phi_{\text{YJ}}^{\lambda, x_0, s}(x_i) =
\begin{cases}
\frac{1}{s} \left(1 + \frac{x_i - x_0}{s}\right)^{\lambda - 1} & \text{if } x_i - x_0 \geq 0\\
\frac{1}{s} \left(1 - \frac{x_i - x_0}{s}\right)^{1 - \lambda} & \text{if } x_i - x_0 < 0
\end{cases}
\end{equation}
Thus the final term in \(\mathcal{l}_{\text{YJ}}^{\lambda, x_0, s}\) is:
\begin{equation}
\begin{split}
\sum_{i=1}^n \log \frac{\partial}{\partial x_i} \phi_{\text{YJ}}^{\lambda, x_0, s}(x_i) & = - n \log s + (\lambda - 1) \sum_{i=1}^n \sgn(x_i - x_0) \log \left[1 + \frac{|x_i - x_0|}{s} \right]
\end{split}
\end{equation}
This leads to the following log-likelihood:
\begin{equation}
\begin{split}
\mathcal{l}_{\text{YJ}}^{\lambda, x_0, s} = & -\frac{n}{2} \log\left[2 \pi \sigma^2\right] -\frac{1}{2 \sigma^2} \sum_{i=1}^n \left( \phi^{\lambda, x_0, s}(x_i) - \mu \right)^2 \\
& - n \log s + (\lambda - 1) \sum_{i=1}^n \sgn(x_i - x_0) \log \left[1 + \frac{|x_i - x_0|}{s} \right]
\end{split}
\end{equation}
The weighted log-likelihood for location- and scale-invariant
Yeo-Johnson transformation is:
\begin{equation}
\begin{split}
\mathcal{l}_{\text{rYJ}}^{\lambda, x_0, s} = & -\frac{1}{2} \left(\sum_{i=1}^n w_i \right) \log \left[ 2 \pi \sigma_w^2 \right] -\frac{1}{2 \sigma_w^2} \sum_{i=1}^n w_i \left( \phi^{\lambda, x_0, s}(x_i) - \mu_w \right)^2 \\
& - \left( \sum_{i=1}^n w_i \right) \log s + (\lambda - 1) \sum_{i=1}^n w_i \sgn(x_i - x_0) \log \left[1 + \frac{|x_i - x_0|}{s} \right]
\end{split}
\end{equation}
where \(\mu_w\) and \(\sigma^2_w\) are the weighted mean and weighted
variance of the Yeo-Johnson transformed feature
\(\phi_{\text{YJ}}^{\lambda, x_0, s} (\mathbf{X})\):
\begin{equation}
\sigma_w^2 = \frac{\sum_{i=1}^n w_i \left(\phi_{\text{YJ}}^{\lambda, x_0, s} (x_i) - \mu_w \right)^2}{\sum_{i=1}^n w_i} \quad \text{with } \mu_w = \frac{\sum_{i=1}^n w_i \phi_{\text{YJ}}^{\lambda, x_0, s} (x_i)} {\sum_{i=1}^n w_i}
\end{equation}
\section{Appendix B: Optimisation of transformation
parameters}\label{appendix-b-optimisation-of-transformation-parameters}
Maximum likelihood estimation (MLE) is commonly used to optimise
parameters for power transformation. Generally, optimisation requires
minimisation or maximisation of a criterion. In MLE, the maximised
criterion is the log-likelihood function of the normal distribution.
Here, we investigate power transformation using optimisation criteria
that are closely related to test statistics for normality tests.
Let \(\mathbf{X}\) be a feature with ordered feature values, and
\(\mathbf{Y}^\lambda =\phi^{\lambda} \left(\mathbf{X} \right)\) and
\(\mathbf{Y}^{\lambda, x_0, s} =\phi^{\lambda, x_0, s} \left(\mathbf{X} \right)\)
its transformed values using conventional and shift and scale invariant
power transformations, respectively. Since power transformations are
monotonic, \(\mathbf{Y}\) will likewise be ordered.
Below we will focus on criteria based on the empirical distribution function
and those based on skewness and kurtosis of the transformed feature.
Other potential criteria, such as the Shapiro-Wilk test statistic
(Shapiro and Wilk 1965) are not investigated here. In the case of the
Shapiro-Wilk test statistic this is because of lack of scalability to
features with many (\(> 5000\)) instances, and because adapting the test
statistic to include weights is not straightforward.
\subsection{Empirical distribution function-based
criteria}\label{empirical-density-function-based-criteria}
The first class of criteria is based on the empirical distribution
function (EDF). Transformation parameters are then fit through
minimisation of the distance between the empirical distribution function
\(F_{\epsilon}\) and the cumulative distribution function (CDF) of the normal
distribution \(F_{\mathcal{N}}\). Let
\(F_{\epsilon}\left(x_i \right) = \frac{i - 1/3}{n + 1/3}\) be the
empirical probability of instance \(i\). The normal distribution is
parametrised by location parameter \(\mu\) and scale parameter
\(\sigma\), both of which have to be estimated from the data. For
non-robust power transformations, \(\mu\) and \(\sigma\) are sample mean
and sample standard deviation, respectively. For robust power
transformations, we estimate \(\mu\) and \(\sigma\) as Huber M-estimates
of location and scale of the transformed feature
\(\phi^{\lambda, x_0, s} (\mathbf{X})\) (Huber 1981).
\subsubsection{Anderson-Darling
criterion}\label{anderson-darling-criterion}
The Anderson-Darling criterion is based on the empirical distribution
function of \(\mathbf{X}\). We define this criterion as follows:
\begin{equation}
U_{\text{AD}} \left(\mathbf{X}, \lambda, x_0, s \right) = \frac{1}{\sum_{i=1}^n w_i} \sum_{i=1}^n w_i \frac{\left( F_{\epsilon}\left(x_i \right) - F_{\mathcal{N}} \left(\phi^{\lambda, x_0, s} \left(x_i \right); \mu, \sigma \right) \right)^2} {F_{\mathcal{N}} \left(\phi^{\lambda, x_0, s} \left(x_i \right); \mu, \sigma \right) \left(1 - F_{\mathcal{N}} \left(\phi^{\lambda, x_0, s} \left(x_i \right); \mu, \sigma \right) \right) }
\end{equation}
Here \(w_i\) are weights, and \(\mu\) and \(\sigma\) are location and
scale parameters. For non-robust power transformations, all \(w_i = 1\).
Note that this criterion is not the same as the Anderson-Darling test
statistic (Anderson and Darling 1952), which involves solving (or
approximating) an integral function, contains an extra scalar
multiplication term, and does not include weights. The Anderson-Darling
criterion seeks to minimise the squared Euclidean distance between the
EDF and the normal CDF, with differences at the upper and lower end of
the normal CDF receiving more weight than those at the centre of the
CDF.
\subsubsection{Cramér-von Mises
criterion}\label{cramuxe9r-von-mises-criterion}
The Cramér-von Mises criterion is also based on the empirical
distribution function of \(\mathbf{X}\). We define the Cramér-von Mises
criterion as follows:
\begin{equation}
U_{\text{CvM}} \left(\mathbf{X}, \lambda, x_0, s \right) = \frac{1}{\sum_{i=1}^n w_i} \sum_{i=1}^n w_i \left( F_{\epsilon}\left(x_i \right) - F_{\mathcal{N}} \left(\phi^{\lambda, x_0, s} \left(x_i \right); \mu, \sigma \right) \right)^2
\end{equation}
Here \(w_i\) are weights, and \(\mu\) and \(\sigma\) are location and
scale parameters. For non-robust power transformations, all \(w_i = 1\).
The criterion is similar to the Cramér-von Mises test statistic (Mises
1928), aside from an additive scalar value and the introduction of
weights. This criterion, like the Anderson-Darling criterion, seeks to
minimise the squared Euclidean distance between the EDF and the normal
CDF. Unlike the Anderson-Darling criterion, this criterion weights all
instances equally.
For conventional power transformations with a fixed shift parameter, the
transformation \(\phi^{\lambda, x_0, s} (\mathbf{X})\) may be
substituted by \(\phi^{\lambda} (\mathbf{X})\) in the definition of the
Cramér-von Mises criterion.
\subsection{Skewness-kurtosis-based
criteria}\label{skewness-kurtosis-based-criteria}
The second class of criteria seeks to reduce skewness and (excess)
kurtosis of the transformed feature \(\mathbf{Y}\). We will first define
the location \(\mu\) and scale \(\sigma\) of the transformed feature, as
these are required for computing skewness and kurtosis. Here, \(\mu\) is
defined as:
\begin{equation}
\mu = \frac{\sum_{i=1}^n w_i \phi^{\lambda, x_0, s} \left(x_i \right)} {\sum_{i=1}^n w_i}
\end{equation}
The location, or mean, is weighted using weights \(w_i\). For non-robust
transformations, \(w_i = 1\). Then, \(\sigma^2\) is defined as:
\begin{equation}
\sigma^2 = \frac{\sum_{i=1}^n w_i \left(\phi^{\lambda, x_0, s} \left( x_i \right) - \mu \right)^2}{\sum_{i=1}^n w_i}
\end{equation}
Skewness is defined as:
\begin{equation}
s = \frac{\sum_{i=1}^n w_i \left(\phi^{\lambda, x_0, s} \left( x_i \right) - \mu \right)^3}{\sigma^3 \sum_{i=1}^n w_i}
\end{equation}
Kurtosis is defined as:
\begin{equation}
k = \frac{\sum_{i=1}^n w_i \left(\phi^{\lambda, x_0, s} \left( x_i \right) - \mu \right)^4}{\sigma^4 \sum_{i=1}^n w_i}
\end{equation}
\subsubsection{D'Agostino criterion}\label{dagostino-criterion}
The D'Agostino criterion defined here follows the D'Agostino \(K^2\)
test statistic (D'Agostino and Belanger 1990). This test statistic is
composed of two separate test statistics, one of which is related to
skewness, and the other to kurtosis. Both test statistics are computed
in several steps. Let us first define \(\nu=\sum_{i=1}^n w_i\). Thus for
non-robust power transformations, \(\nu = n\).
For the skewness test statistic we first compute (D'Agostino and
Belanger 1990):
\begin{equation}
\beta_1 = s \sqrt{ \frac{\left(\nu + 1\right) \left(\nu + 3\right)} {6 \left(\nu - 2\right)} }
\end{equation}
\begin{equation}
\beta_2 = 3 \frac{\left(\nu^2 + 27\nu - 70\right) \left(\nu + 1\right) \left(\nu + 3\right)} {\left(\nu - 2\right) \left(\nu + 5\right) \left(\nu + 7\right) \left(\nu + 9\right)}
\end{equation}
\begin{equation}
\alpha = \sqrt{\frac{2} {\sqrt{2 \beta_2 - 2} - 2}}
\end{equation}
\begin{equation}
\delta = \frac{1}{\sqrt{\log \left[\sqrt{-1 + \sqrt{2 \beta_2 - 2}} \right]}}
\end{equation}
The skewness test statistic is then:
\begin{equation}
Z_s = \delta \log\left[\frac{\beta_1}{\alpha} + \sqrt{\frac{\beta_1^2}{\alpha^2} + 1} \right]
\end{equation}
For the kurtosis test statistic we first compute (D'Agostino and
Belanger 1990; Anscombe and Glynn 1983):
\begin{equation}
\beta_1 = 3 \frac{\nu - 1}{\nu + 1}
\end{equation}
\begin{equation}
\beta_2 = 24 \nu \frac{\left(\nu - 2\right)\left(\nu - 3\right)}{\left(\nu + 1\right)^2 \left(\nu + 3\right) \left(\nu + 5\right)}
\end{equation}
\begin{equation}
\beta_3 = 6 \frac{\nu^2 - 5 \nu + 2}{\left(\nu + 7\right) \left(\nu + 9\right)} \sqrt{6 \frac{\left(\nu + 3\right) \left(\nu + 5\right)}{\nu \left(\nu - 2\right) \left(\nu - 3 \right)}}
\end{equation}
\begin{equation}
\alpha_1 = 6 + \frac{8}{\beta_3} \left[\frac{2}{\beta_3} + \sqrt{1 + \frac{4}{\beta_3^2}} \right]
\end{equation}
\begin{equation}
\alpha_2 = \frac{k - \beta_1}{\sqrt{\beta_2}}
\end{equation}
The kurtosis test statistic is then:
\begin{equation}
Z_k = \sqrt{\frac{9 \alpha_1}{2}} \left[ 1 - \frac{2}{9 \alpha_1} - \left(\frac{1 - 2 / \alpha_1}{1 + \alpha_2 \sqrt{2 / \left(\alpha_1 - 4 \right)}} \right)^{1 / 3} \right]
\end{equation}
Our criterion takes the same form as the D'Agostino \(K^2\) test
statistic, and is defined as:
\begin{equation}
U_{\text{DA}} \left(\mathbf{X}, \lambda, x_0, s \right) = Z_s^2 + Z_k^2
\end{equation}
The main difference between the test statistic as originally formulated,
and the criterion proposed here is the presence of weights for robust
power transformation.
\subsubsection{Jarque-Bera criterion}\label{jarque-bera-criterion}
The second criterion based on skewness and kurtosis is the Jarque-Bera
criterion. It is relatively simple to compute compared to the D'Agostino
criterion:
\begin{equation}
U_{\text{JB}} \left(\mathbf{X}, \lambda, x_0, s \right) = s^2 + \left(k - 3\right)^2 / 4
\end{equation}
The main difference between the above criterion and the Jarque-Bera test
statistic (Jarque and Bera 1980) is that a scalar multiplication is
absent.
\subsection{Optimisation using non-MLE
criteria}\label{optimisation-using-non-mle-criteria}
Each of the above criteria can be used for optimisation, i.e.:
\begin{equation}
\left\{ \hat{\lambda}, \hat{x}_0, \hat{s} \right\} = \argmin_{\lambda, x_0, s} U\left(\mathbf{X}, \lambda, x_0, s \right)
\end{equation}
For conventional power transformations with fixed location and scale
parameters, the transformation \(\phi^{\lambda, x_0, s} (\mathbf{X})\)
may be substituted by \(\phi^{\lambda} (\mathbf{X})\), or equivalently,
\(x_0\) and \(s\) may be fixed:
\begin{equation}
\left\{ \hat{\lambda}\right\} = \argmin_{\lambda} U\left(\mathbf{X}, \lambda; x_0, s \right)
\end{equation}
\section{Appendix C: Simulations with other optimisation
criteria}\label{appendix-c-simulations-with-other-optimisation-criteria}
Invariance of location- and scale-invariant power transformations was
assessed using the optimisation criteria in
\hyperref[appendix-b-optimisation-of-transformation-parameters]{Appendix
B}. This follows the simulation in the main manuscript, where MLE was
used for optimisation. In short, we first randomly drew \(10000\) values
from a normal distribution:
\(\mathbf{X}_{\text{normal}} = \left\{x_1, x_2, \ldots, x_{10000} \right\} \sim \mathcal{N}\left(0, 1\right)\),
or equivalently
\(\mathbf{X}_{\text{normal}} = \left\{x_1, x_2, \ldots, x_{10000} \right\} \sim \mathcal{AGN}\left(0, 1/\sqrt{2}, 0.5, 2\right)\).
The second distribution was a right-skewed normal distribution
\(\mathbf{X}_{\text{right}} = \left\{x_1, x_2, \ldots, x_{10000} \right\} \sim \mathcal{AGN}\left(0, 1/\sqrt{2}, 0.2, 2\right)\).
The third distribution was a left-skewed normal distribution
\(\mathbf{X}_{\text{left}} = \left\{x_1, x_2, \ldots, x_{10000} \right\} \sim \mathcal{AGN}\left(0, 1/\sqrt{2}, 0.8, 2\right)\).
We then computed the transformation parameter \(\lambda\) using the original
definitions (equations \ref{eqn:box-cox-original} and
\ref{eqn:yeo-johnson-original}) and the location- and scale-invariant
definitions (equations \ref{eqn:box-cox-invariant} and
\ref{eqn:yeo-johnson-invariant}) for each distribution using different
optimisation criteria. To assess location invariance, a positive value
\(d_{\text{shift}}\) was added to each distribution with
\(d_{\text{shift}} \in [1, 10^6]\). Similarly, to assess scale
invariance, each distribution was multiplied by a positive value
\(d_{\text{scale}}\), where \(d_{\text{scale}} \in [1, 10^6]\).
The results are shown in Figure
\ref{fig:shifted-distributions-appendix}.
\begin{figure}
{\centering \includegraphics{manuscript_appendix_files/figure-latex/shifted-distributions-appendix-1}
}
\caption{Invariant power transformation produces transformation parameters that are invariant to location and scale. Samples were drawn from normal, right-skewed and left-skewed distributions, respectively, which then underwent a shift $d_{\text{shift}}$ or multiplication by $d_{\text{scale}}$. Estimates of the transformation parameter $\lambda$ for the conventional power transformations show strong dependency on the overall location and scale of the distribution and the optimisation criterion, whereas estimates obtained for the location- and scale-invariant power transformations are constant. For location- and scale-invariant power transformations, the Anderson-Darling criterion leads to unstable estimates of $\lambda$ for skewed distributions, possibly due to large weights being assigned to samples at the upper and lower ends of the distribution.}\label{fig:shifted-distributions-appendix}
\end{figure}
\section{Appendix D: Experimental results using location- and
scale-invariant Box-Cox
transformation}\label{appendix-d-experimental-results-using-location--and-scale-invariant-box-cox-transformation}
The effect of using location- and scale-invariant transformations was
investigated using real-world datasets.
\subsection{Invariance}\label{invariance}
Results for Box-Cox transformations of features without outliers are
shown in Figure \ref{fig:experimental-results-invariance-appendix}.
\begin{figure}
{\centering \includegraphics{manuscript_appendix_files/figure-latex/experimental-results-invariance-appendix-1}
}
\caption{Quantile-quantile plots for several datasets: age of patients with lung cancer (top row); penguin body mass (middle row); and latitude coordinates of houses sold in Ames, Iowa (bottom row). Multiple quantile-quantile plots are shown: for the original feature (left column); the feature transformed using the conventional Box-Cox transformation and Raymaekers and Rousseeuw's robust adaptation (middle column); and the feature transformed using the non-robust and robust location- and scale-invariant Box-Cox transformations (right column).}\label{fig:experimental-results-invariance-appendix}
\end{figure}
\subsubsection{Age of patients with lung
cancer}\label{age-of-patients-with-lung-cancer}
Applying conventional and invariant Box-Cox transformations to age of
patients with lung cancer (Loprinzi et al. 1994) yielded the following
results: no transformation (sum of residuals with normal distribution
\(\sum r_i = 16.5\)); conventional transformation (\(\lambda = 1.9\),
\(\sum r_i = 11.5\), \(\mu_{BC} = 1.6 \cdot 10^3\),
\(\sigma_{BC} = 0.4 \cdot 10^3\)); Raymaekers and Rousseeuw's robust
adaptation (\(\lambda = 1.9\), \(\sum r_i = 11.5\),
\(\mu_{BC} = 1.6 \cdot 10^3\), \(\sigma_{BC} = 0.4 \cdot 10^3\));
location- and scale-invariant transformation (\(\lambda = 1.7\),
\(\sum r_i = 11.6\), \(\mu_{BC} = 1.9\), \(\sigma_{BC} = 0.8\)); and
robust location- and scale-invariant transformation (\(\lambda = 1.5\),
\(\sum r_i = 11.6\), \(\mu_{BC} = 3.6\), \(\sigma_{BC} = 1.2\)).
In contrast to location- and scale-invariant Yeo-Johnson transformations,
location- and scale-invariant Box-Cox transformations do not reduce
residuals compared to conventional variants.
\subsubsection{Penguin body mass}\label{penguin-body-mass}
Applying conventional and invariant Box-Cox transformations to the body
mass of penguins (Gorman, Williams, and Fraser 2014) yielded the
following results: no transformation (residual sum \(\sum r_i = 48.0\));
conventional transformation (\(\lambda = -0.5\), \(\sum r_i = 32.2\),
\(\mu_{BC} = 2.1\), \(\sigma_{BC} = 4 \cdot 10^{-3}\)); Raymaekers and
Rousseeuw's robust adaptation (\(\lambda = -0.5\), \(\sum r_i = 32.2\),
\(\mu_{BC} = 2.1\), \(\sigma_{BC} = 4 \cdot 10^{-3}\)); location- and
scale-invariant transformation (\(\lambda = 0.5\), \(\sum r_i = 27.3\),
\(\mu_{BC} = 0.3\), \(\sigma_{BC} = 0.6\)); and robust location- and
scale-invariant transformation (\(\lambda = 0.2\), \(\sum r_i = 25.2\),
\(\mu_{BC} = 0.4\), \(\sigma_{BC} = 0.6\)).
Just as for location- and scale-invariant Yeo-Johnson transformations,
location- and scale-invariant Box-Cox transformations produced a lower
overall residual sum compared to their conventional counterparts.
Similarly, conventional transformations led to low standard deviation
\(\sigma_{BC}\) of the body mass feature after transformation.
\subsubsection{Latitude in the Ames housing
dataset}\label{latitude-in-the-ames-housing-dataset}
Applying conventional and invariant Box-Cox transformations to the
latitude of houses in the Ames housing dataset (De Cock 2011) yielded
the following results: no transformation (residual sum
\(\sum r_i = 328\)); conventional transformation (\(\lambda = 62.1\),
\(\sum r_i = 319\), \(\mu_{BC} = 1.1 \cdot 10^{99}\),
\(\sigma_{BC} = 0.0 \cdot 10^{99}\)); Raymaekers and Rousseeuw's robust
adaptation (\(\lambda = 96.0\), \(\sum r_i = 319\),
\(\mu_{BC} = 6.2 \cdot 10^{153}\),
\(\sigma_{BC} = 0.3 \cdot 10^{153}\)); location- and scale-invariant
transformation (\(\lambda = 1.9\), \(\sum r_i = 312\),
\(\mu_{BC} = 2.3\), \(\sigma_{BC} = 0.9\)); and robust location- and
scale-invariant transformation (\(\lambda = 1.2\), \(\sum r_i = 316\),
\(\mu_{BC} = 5.5\), \(\sigma_{BC} = 1.4\)).
Similar to conventional Yeo-Johnson transformations (non-robust and
robust), conventional Box-Cox transformations had high values for the
\(\lambda\) parameter, which could lead to numerical issues. Location-
and scale-invariant Box-Cox transformations did not suffer from this
issue.
\subsection{Robustness against
outliers}\label{robustness-against-outliers}
Results for Box-Cox transformations of features with outliers are shown
in Figure \ref{fig:experimental-results-outlier-robustness-appendix}.
\begin{figure}
{\centering \includegraphics{manuscript_appendix_files/figure-latex/experimental-results-outlier-robustness-appendix-1}
}
\caption{Quantile-quantile plots for two datasets with outliers: vehicle fuel consumption (top row), where outliers are related to highly fuel-efficient vehicles; and maximum arterial wall thickness in patients with ischemic stroke (bottom row). Multiple quantile-quantile plots are shown: for the original feature (left column); the feature transformed using the conventional Box-Cox transformation and Raymaekers and Rousseeuw's robust adaptation (middle column); and the feature transformed using the non-robust and robust location- and scale-invariant Box-Cox transformations (right column). Samples with observed quantiles below $-3.0$ or above $3.0$ are indicated by crosses.}\label{fig:experimental-results-outlier-robustness-appendix}
\end{figure}
\subsubsection{Fuel efficiency in the Top Gear
dataset}\label{fuel-efficiency-in-the-top-gear-dataset}
The Top Gear dataset contains data on 297 vehicles, with outliers
related to highly fuel-efficient vehicles (Alfons 2021). Applying
conventional and invariant Box-Cox transformations to the fuel
consumption feature yielded the following results: no transformation
(residual sum \(\sum r_i = 54\), \(p=0.76\)); conventional
transformation (\(\lambda = -0.1\), \(\sum r_i = 55\),
\(\mu_{BC} = 3.0\), \(\sigma_{BC} = 0.3\), \(p=0.01\)); Raymaekers and
Rousseeuw's robust adaptation (\(\lambda = 0.8\), \(\sum r_i = 48\),
\(\mu_{BC} = 29\), \(\sigma_{BC} = 15\), \(p=0.55\)); location- and
scale-invariant transformation (\(\lambda = -0.7\), \(\sum r_i = 44\),
\(\mu_{BC} = 0.6\), \(\sigma_{BC} = 0.2\), \(p=0.02\)); and robust
location- and scale-invariant transformation (\(\lambda = 1.1\),
\(\sum r_i = 59\), \(\mu_{BC} = 2.4\), \(\sigma_{BC} = 1.8\),
\(p=0.83\)).
\subsubsection{Maximum arterial wall thickness in an ischemic stroke
dataset}\label{maximum-arterial-wall-thickness-in-an-ischemic-stroke-dataset}
The ischemic stroke dataset contains historic data from 126 patients
at risk of ischemic stroke (Kuhn and Johnson 2019). Applying
conventional and invariant Box-Cox transformations to the maximum
arterial wall thickness feature yielded the following results: no
transformation (residual sum \(\sum r_i = 110\), \(p=0.56\));
conventional transformation (\(\lambda = -0.5\), \(\sum r_i = 33\),
\(\mu_{BC} = 1.0\), \(\sigma_{BC} = 0.2\), \(p=0.01\)); Raymaekers and
Rousseeuw's robust adaptation (\(\lambda = 1.1\), \(\sum r_i = 127\),
\(\mu_{BC} = 5.5\), \(\sigma_{BC} = 12\), \(p=0.60\)); location- and
scale-invariant transformation (\(\lambda = -1.0\), \(\sum r_i = 28\),
\(\mu_{BC} = 0.7\), \(\sigma_{BC} = 0.1\), \(p=0.01\)); and robust
location- and scale-invariant transformation (\(\lambda = 0.5\),
\(\sum r_i = 56\), \(\mu_{BC} = 2.2\), \(\sigma_{BC} = 1.4\),
\(p=0.35\)).
\section{Appendix E: Empirical central normality
test}\label{appendix-e-empirical-central-normality-test}
The empirical central normality test was derived using data sampled from
asymmetric generalised normal distributions, including outliers, to
resemble more realistic datasets. Here we assess the type I error rate
for two less realistic sets of data:
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
Data sampled from asymmetric generalised normal distributions without
outliers.
\item
Data sampled from normal distributions without outliers, without any
power transformation applied.
\end{enumerate}
Other aspects of the experiment remained the same. Thus, we first drew
\(m_d=10000\) random distributions. For asymmetric generalised normal
distributions, each distribution was parametrised with a randomly chosen
skewness parameter \(\alpha \sim U\left(0.01, 0.99\right)\) and shape
parameter \(\beta \sim U\left(1.00, 5.00 \right)\). For fully normal
distributions, skewness parameter \(\alpha = 0.5\) and shape parameter
\(\beta = 2.0\) were fixed. Location and scale parameters were set as
\(\mu = 0\) and \(\sigma = 1\), respectively.
\(n = \lceil 10^\gamma \rceil\) values were then randomly drawn, with
\(\gamma \sim U\left(1.47, 3.00\right)\), which led to between \(30\)
and \(1000\) values being drawn to create \(\mathbf{X}_i\). Residuals
were then computed after performing robust location- and scale-invariant
transformations with the empirical tapered cosine weighting method for
the dataset with asymmetric generalised normal distributions, and
without any transformation for the dataset with only normal
distributions.
\begin{figure}
{\centering \includegraphics{manuscript_appendix_files/figure-latex/empirical-central-normality-test-appendix-1}
}
\caption{Type I error rate as a function of the test statistic $\tau_{\text{ecn}}$ for five datasets, with central portion $\kappa=0.80$. The type I error rate is computed from $m_d=10000$ randomly sampled features. These are sampled from asymmetric generalised normal distributions, with and without outliers (Box-Cox and Yeo-Johnson), or normal distributions without outliers (none). The test statistic is computed as the average residual of each feature after (Box-Cox and Yeo-Johnson) robust location- and scale-invariant power transformation, or before (none).}\label{fig:empirical-central-normality-test-appendix}
\end{figure}
\begin{table}
\begin{center}
\caption{Test statistic $\tau_{\text{ecn}}$ for empirical central normality at $\kappa = 0.80$ as a function of Type I error rate for several datasets.}
\label{tab:empirical-central-normality-appendix}
\begin{tabular}{l | c c c c c c c}
\toprule
type I error rate & 0.50 & 0.20 & 0.10 & 0.05 & 0.02 & 0.01 & 0.001 \\
\midrule
Box-Cox & 0.047 & 0.073 & 0.090 & 0.106 & 0.126 & 0.140 & 0.188 \\
Box-Cox (no outlier) & 0.043 & 0.065 & 0.079 & 0.092 & 0.106 & 0.116 & 0.155 \\
Yeo-Johnson & 0.041 & 0.062 & 0.075 & 0.088 & 0.103 & 0.115 & 0.154 \\
Yeo-Johnson (no outlier) & 0.041 & 0.061 & 0.074 & 0.085 & 0.099 & 0.109 & 0.139 \\
normal distr. & 0.039 & 0.066 & 0.083 & 0.097 & 0.117 & 0.132 & 0.174 \\
\bottomrule
\end{tabular}
\end{center}
\end{table}
The results are shown in Figure
\ref{fig:empirical-central-normality-test-appendix} and Table
\ref{tab:empirical-central-normality-appendix}. These indicate that the
test behaves similarly for the different datasets. For low type I error
rates, the test statistic proposed in the main manuscript is more
conservative than alternatives based on residuals after Box-Cox
transformations of asymmetric generalised normally distributed features
or on residuals from strictly normally distributed features.
\section{Appendix F: Normalisation before
transformation}\label{appendix-f-normalisation-before-transformation}
An alternative to location- and scale-invariant transformations is
normalising feature distributions prior to conventional transformations.
Table \ref{tab:normalisation-before-transformation-appendix} shows
residual errors, after transformation to normality, of the five features
from real-world datasets presented previously in Appendix D and the main
manuscript. In these examples location- and scale-invariant
transformations have similar or lower residual errors compared to errors
resulting from normalisation prior to transformation.
\begin{table}
\begin{center}
\caption{Residual errors for features from real-world datasets after Yeo-Johnson transformation to normality. conv.: conventional; norm.: normalisation; rob.: robust}
\label{tab:normalisation-before-transformation-appendix}
\begin{tabular}{l | c c c c c}
\toprule
feature & none & conventional & conv. (z-score norm.) & conv. (rob. scaling) & invariant \\
\midrule
age & 16.5 & 11.5 & 11.5 & 11.3 & 8.8 \\
penguin body mass & 48.0 & 32.2 & 33.3 & 32.2 & 26.8 \\
latitude & 328.1 & 319.0 & 326.2 & 324.5 & 326.4 \\
fuel efficiency & 54.5 & 55.3 & 49.0 & 53.3 & 44.0 \\
arterial wall thickness & 110.1 & 30.0 & 19.3 & 31.8 & 12.2 \\
\bottomrule
\end{tabular}
\end{center}
\end{table}
\section*{References}\label{references}
\addcontentsline{toc}{section}{References}
\phantomsection\label{refs}
\begin{CSLReferences}{1}{0}
\bibitem[\citeproctext]{ref-Alfons2021-kc}
Alfons, Andreas. 2021. {``{robustHD}: An {R} Package for Robust
Regression with High-Dimensional Data.''} \emph{J. Open Source Softw.} 6
(67): 3786. \url{https://doi.org/10.21105/joss.03786}.
\bibitem[\citeproctext]{ref-Anderson1952-gz}
Anderson, T W, and D A Darling. 1952. {``Asymptotic Theory of Certain
{`Goodness of Fit'} Criteria Based on Stochastic Processes.''}
\emph{Annals of Mathematical Statistics} 23 (2): 193--212.
\url{https://doi.org/10.1214/aoms/1177729437}.
\bibitem[\citeproctext]{ref-Anscombe1983-nz}
Anscombe, F J, and William J Glynn. 1983. {``Distribution of the
Kurtosis Statistic B2 for Normal Samples.''} \emph{Biometrika} 70 (1):
227--34. \url{https://doi.org/10.1093/biomet/70.1.227}.
\bibitem[\citeproctext]{ref-Cramer1928-rc}
Cramér, Harald. 1928. {``On the Composition of Elementary Errors.''}
\emph{Scand. Actuar. J.} 1928 (1): 13--74.
\url{https://doi.org/10.1080/03461238.1928.10416862}.
\bibitem[\citeproctext]{ref-DAgostino1990-kp}
D'Agostino, Ralph B, and Albert Belanger. 1990. {``A Suggestion for
Using Powerful and Informative Tests of Normality.''} \emph{Am. Stat.}
44 (4): 316--21. \url{https://doi.org/10.2307/2684359}.
\bibitem[\citeproctext]{ref-De-Cock2011-jf}
De Cock, Dean. 2011. {``Ames, Iowa: Alternative to the Boston Housing
Data as an End of Semester Regression Project.''} \emph{J. Stat. Educ.}
19 (3). \url{https://doi.org/10.1080/10691898.2011.11889627}.
\bibitem[\citeproctext]{ref-Gorman2014-eo}
Gorman, Kristen B, Tony D Williams, and William R Fraser. 2014.
{``Ecological Sexual Dimorphism and Environmental Variability Within a
Community of Antarctic Penguins (Genus Pygoscelis).''} \emph{PLoS One} 9
(3): e90081. \url{https://doi.org/10.1371/journal.pone.0090081}.
\bibitem[\citeproctext]{ref-Huber1981-su}
Huber, Peter J. 1981. \emph{Robust Statistics}. John Wiley \& Sons.
\url{https://doi.org/10.1002/0471725250}.
\bibitem[\citeproctext]{ref-Jarque1980-hw}
Jarque, Carlos M, and Anil K Bera. 1980. {``Efficient Tests for
Normality, Homoscedasticity and Serial Independence of Regression
Residuals.''} \emph{Econ. Lett.} 6 (3): 255--59.
\url{https://doi.org/10.1016/0165-1765(80)90024-5}.
\bibitem[\citeproctext]{ref-Kuhn2019-kt}
Kuhn, Max, and Kjell Johnson. 2019. \emph{Feature Engineering and
Selection: A Practical Approach for Predictive Models}. Chapman \&
Hall/CRC Data Science Series. Chapman \& Hall/CRC.
\url{https://doi.org/10.1201/9781315108230}.
\bibitem[\citeproctext]{ref-Loprinzi1994-cd}
Loprinzi, C L, J A Laurie, H S Wieand, J E Krook, P J Novotny, J W
Kugler, J Bartel, M Law, M Bateman, and N E Klatt. 1994. {``Prospective
Evaluation of Prognostic Variables from Patient-Completed
Questionnaires. North Central Cancer Treatment Group.''} \emph{J. Clin.
Oncol.} 12 (3): 601--7. \url{https://doi.org/10.1200/JCO.1994.12.3.601}.
\bibitem[\citeproctext]{ref-Von_Mises1928-ef}
Mises, Richard von. 1928. \emph{Wahrscheinlichkeit, Statistik Und
Wahrheit}. Schriften Zur Wissenschaftlichen Weltauffassung.
Springer-Verlag Berlin, Heidelberg.
\url{https://doi.org/10.1007/978-3-662-36230-3}.
\bibitem[\citeproctext]{ref-Raymaekers2024-zf}
Raymaekers, Jakob, and Peter J Rousseeuw. 2024. {``Transforming
Variables to Central Normality.''} \emph{Mach. Learn.} 113 (8):
4953--75. \url{https://doi.org/10.1007/s10994-021-05960-5}.
\bibitem[\citeproctext]{ref-Shapiro1965-zd}
Shapiro, S S, and M B Wilk. 1965. {``An Analysis of Variance Test for
Normality (Complete Samples).''} \emph{Biometrika} 52 (3/4): 591--611.
\url{https://doi.org/10.2307/2333709}.
\end{CSLReferences}
\end{document}