# Detection of data fabrication using statistical tools
```{r echo = FALSE}
# also requires dplyr
library(ggplot2)
library(plyr)
suppressPackageStartupMessages(library(viridis))
library(magrittr)
library(kableExtra)
library(knitr)
# devtools::install_github('chartgerink/ddfab')
library(ddfab)
options(digits = 3)
file <- './assets/data/study_01/raw_summary_results_fabrication_qualtrics.csv'
res_file <- './assets/data/study_01/qualtrics_processed.csv'
ml_dat_file <- './assets/data/study_01/anchoring_ml/chjh ml1_anchoring cleaned.sav'
summary_stat_file <- './assets/data/study_01/ml_summary_stats.csv'
ml3_dat_file <- './assets/data/study_02/study_02-ml3_stroop/StroopCleanSet.csv'
# Set the number of iterations to use in calculations
iter <- 100000
m <- 50
sd <- 10
n <- 100
set.seed(1234)
x1_1 <- replicate(n = 10, expr = rnorm(n, m, sd))
x1_2 <- replicate(n = 10, expr = rnorm(n, m, sd))
ex1_1em <- apply(x1_1, 2, mean)
ex1_1esd <- apply(x1_1, 2, sd)
ex1_1cm <- apply(x1_2, 2, mean)
ex1_1csd <- apply(x1_2, 2, sd)
ex1_1tp <- NULL
for (i in 1:dim(x1_1)[2]) ex1_1tp[i] <- t.test(x = x1_1[,i], y = x1_2[,i], var.equal=TRUE)$p.value
ex1_1tp <- unlist(ex1_1tp)
x2_1 <- replicate(n = 1000, expr = rnorm(n, m, sd))
x2_2 <- replicate(n = 1000, expr = rnorm(n, m, sd))
ex2_1em <- apply(x2_1, 2, mean)
ex2_1esd <- apply(x2_1, 2, sd)
ex2_1cm <- apply(x2_2, 2, mean)
ex2_1csd <- apply(x2_2, 2, sd)
ex2_1tp <- NULL
for (i in 1:dim(x2_1)[2]) ex2_1tp[i] <- t.test(x = x2_1[,i], y = x2_2[,i], var.equal=TRUE)$p.value
ex2_1tp <- unlist(ex2_1tp)
tmp <- data.frame(ex2_1em, ex2_1esd, ex2_1cm, ex2_1csd, ex2_1tp)
set2_df <- tail(tmp[order(tmp$ex2_1tp),], 10)
set2_df <- set2_df[sample(nrow(set2_df)),]
set.seed(1234)
x2_1em <- replicate(n = 10, expr = m + runif(1, 0, 20))
x2_2cm <- replicate(n = 10, expr = m + runif(1, 0, 20))
x2_1esd <- replicate(n = 10, expr = sd + runif(1, 0, 1.5))
x2_2csd <- replicate(n = 10, expr = sd + runif(1, 0, 1.5))
sdpooled <- ((n - 1) * x2_1esd^2 + (n - 1) * x2_2csd^2)/(n * 2 - 2)
tval <- (x2_1em - x2_2cm) / sdpooled
ex2_1tp <- pt(abs(tval), n * 2 - 2, lower.tail = FALSE) * 2
# Just saving for later use in theory section
# set.seed(1234)
# std_var(n = rep(100, length(x2_1esd) * 2), sds = c(x2_1esd, x2_2csd), iter = 10000)
df <- data.frame(study = sprintf("Study %s", 1:10),
sprintf("%s (%s)", round(ex1_1em, 3), round(ex1_1esd, 3)),
sprintf("%s (%s)", round(ex1_1cm, 3), round(ex1_1csd, 3)),
ex1_1tp,
sprintf("%s (%s)", round(set2_df$ex2_1em, 3), round(set2_df$ex2_1esd, 3)),
sprintf("%s (%s)", round(set2_df$ex2_1cm, 3), round(set2_df$ex2_1csd, 3)),
set2_df$ex2_1tp)
names(df) <- c("",
"M (SD)",
"M (SD)",
"P-value",
"M (SD)",
"M (SD)",
"P-value")
```
Any field of empirical inquiry is faced with cases of scientific misconduct at some point, either in the form of fabrication, falsification, or plagiarism (FFP).
Psychology faced Stapel; the medical sciences faced Poldermans and Macchiarini; the life sciences faced Voinnet; the physical sciences faced Schön --- these are just a few examples of research misconduct cases from the last decade.
Overall, an estimated 2% of all scholars admit to having falsified or fabricated research results at least once during their career [@doi:10.1371/journal.pone.0005738], which due to its self-report nature is likely to be an underestimate of the true rate of misconduct.
The detection rate of data fabrication is likely to be even lower; for example, among several hundreds of thousands of researchers working in the United States and the Netherlands, only around a dozen cases become public each year.
At best, this suggests a detection rate below 1% among those 2% who admit to fabricating or falsifying data --- the tip of a seemingly much larger iceberg.
The ability to detect fabricated data may help deter researchers from fabricating data in their work. Deterrence theory [e.g., @leviathan] states that improved detection and sanctioning of behaviors decreases the expected utility of those behaviors, ultimately leading fewer people to engage in them.
Detection techniques have developed differently for fabrication, falsification, and plagiarism.
Plagiarism scanners have been around the longest [e.g., @doi:10.1109/13.28038] and are widely implemented in practice, not only at journals but also in the evaluation of student theses (e.g., with commercial services such as Turnitin).
Various tools have been developed to detect image manipulation and some of these tools have been implemented at biomedical journals to screen for fabricated or falsified images.
For example, the Journal of Cell Biology and the EMBO journal scan each submitted image for potential image manipulation [@The_Journal_of_Cell_Biology2015-vh;@doi:10.1038/546575a], which supposedly increases the risk of detecting (blatant) image manipulation.
Recently developed algorithms even allow automated scanning of images for such manipulations [@doi:10.1007/s11948-016-9841-7].
The application of such tools can also help researchers systematically evaluate research articles in order to estimate the extent to which image manipulation occurs in the literature [4% of all papers are estimated to contain manipulated images; @doi:10.1128/mBio.00809-16] and to study factors that predict image manipulation [@doi:10.1007/s11948-018-0023-7].
Methods to detect fabrication of quantitative data are often based on a mix of psychology theory and statistics theory.
Because humans are notoriously bad at understanding and estimating randomness [@Haldane1948-nm;@doi:10.1126/science.185.4157.1124;@doi:10.1037/h0031322;@doi:10.1037/1082-989X.5.2.241;@doi:10.1037/h0032060], they might create fabricated data that fail to follow the fundamentally probabilistic nature of genuine data.
Data, and outcomes of analyses based on those data, that fail to align with the (at least partly) probabilistic processes assumed to underlie genuine data may indicate deviations from the reported data collection protocol, potentially even data fabrication or falsification.
Statistical methods have proven to be of importance in initiating data fabrication investigations or in assessing the scope of potential data fabrication.
For example, Kranke, Apfel, and Roewer were skeptical of Fujii's data [@doi:10.1213/00000539-200004000-00053] and used statistical methods to contextualize their skepticism.
At the time, a reviewer perceived them to be on a "crusade against Fujii and his colleagues" [@doi:10.1111/j.1365-2044.2012.07318.x] and further investigation remained absent.
Only when Carlisle extended the systematic investigation of misconduct to 168 of Fujii's papers [@doi:10.1111/j.1365-2044.2012.07128.x;@doi:10.1111/anae.13650;@doi:10.1111/anae.13126] did events culminate in an investigation and, ultimately, the retraction of 183 of Fujii's peer-reviewed papers [@oransky2015;@doi:10.1016/j.ijoa.2012.10.001].
In another example, the Stapel case, statistical evaluation of his oeuvre occurred after he had already confessed to fabricating data, which ultimately resulted in 58 retractions of papers (co-)authored by Stapel [@Levelt2012;@oransky2015].
In order to determine whether the application of statistical methods to detect data fabrication is responsible and valuable, we need to study their diagnostic value.
Specifically, many of the statistical methods developed to detect data fabrication are quantifications of case-specific suspicions raised by researchers, but these applications do not inform us about their diagnostic value (i.e., sensitivity and specificity) outside of those specific cases.
Side-by-side comparisons of different statistical methods to detect data fabrication have also been hampered by the case-specific origin of these methods.
Moreover, estimates of the efficacy of these methods based on known cases are likely to be biased, considering that an unknown number of undetected cases is not included.
Applying different statistical detection methods to both genuine and fabricated data yields information on the sensitivity and specificity of the detection tools. This is important because of the severe professional and personal consequences of accusations of potential research misconduct [as illustrated by the STAP case; @doi:10.1038/520600a].
These methods might have utility in misconduct investigations where the prior chance of misconduct is high, but their diagnostic value in large-scale applications to screen the literature is unclear.
In this article, we investigate the diagnostic performance of various statistical methods to detect data fabrication.
These statistical methods (detailed next) have not previously been validated systematically in research using both genuine and fabricated data.
We present two studies where we try to distinguish (arguably) genuine data from known fabricated data based on these statistical methods.
These studies investigate methods to detect data fabrication in summary statistics (Study 1) or in individual level (raw) data (Study 2) in psychology.
In Study 1, we invited researchers to fabricate summary statistics for a set of four anchoring studies, for which we also had genuine data from the Many Labs 1 initiative [[https://osf.io/pqf9r](https://osf.io/pqf9r); @doi:10.1027/1864-9335/a000178].
In Study 2, we invited researchers to fabricate individual level data for a classic Stroop experiment, for which we also had genuine data from the Many Labs 3 initiative [[https://osf.io/n8xa7/](https://osf.io/n8xa7/); @doi:10.1016/j.jesp.2015.10.012].
Before presenting these studies, we discuss the theoretical framework of the investigated statistical methods to detect data fabrication.
## Theoretical framework
Statistical methods to detect potential data fabrication can be based either on reported summary statistics that can often be retrieved from articles or on the raw (underlying) data if these are available. Below we detail $p$-value analysis, variance analysis, and effect size analysis as potential ways to detect data fabrication using summary statistics.
$P$-value analyses can be applied whenever a set of nonsignificant $p$-values are reported; variance analysis can be applied whenever a set of variances and accompanying sample sizes are reported for independent, randomly assigned groups; effect size analysis can be used whenever the effect size is reported or calculated [e.g., an APA reported $t$- or $F$-statistic; @doi:10.1525/collabra.71].
Among the methods that can be applied to uncover potential fabrication using raw data, we consider digit analyses (i.e., the Newcomb-Benford law and terminal digit analysis) and multivariate associations between variables. The Newcomb-Benford law can be applied on ratio- or count scale measures that have sufficient digits and that are not truncated [@doi:10.1016/j.spa.2005.05.003]; terminal digit analysis can also be applied whenever measures have sufficient digits [see also @doi:10.1080/08989629508573866].
Multivariate associations can be investigated whenever there are two or more numerical variables available and data on that same relation is available from (arguably) genuine data sources.
### Detecting data fabrication in summary statistics
#### $P$-value analysis
The distribution of a single or a set of independent $p$-values is uniform if the null hypothesis is true, while it is right-skewed if the alternative hypothesis is true [@Fisher1925-jl].
If the model assumptions of the underlying process hold, the probability density function of a single $p$-value is determined by the population effect size, the precision of the estimate, and the observed effect size; these properties carry over to a set of $p$-values if those $p$-values are independent.
When assumptions underlying the model used to compute a $p$-value are violated, $p$-value distributions can take on a variety of shapes.
For example, when optional stopping (i.e., adding batches of participants until you have a statistically significant result) occurs and the null hypothesis is true, $p$-values just below .05 become more frequent [@doi:10.1080/17470218.2014.982664;@doi:10.7717/peerj.1935].
However, when optional stopping occurs under the alternative hypothesis or when other researcher degrees of freedom are used in an effort to obtain significance [@doi:10.1177/0956797611417632;@doi:10.3389/fpsyg.2016.01832], a right-skewed distribution for significant $p$-values can and will likely still occur [@doi:10.1037/xge0000086;@doi:10.7717/peerj.1935].
A failure of independent $p$-values to be right-skewed or uniformly distributed (as would be theoretically expected) can indicate potential data fabrication.
For example, in the Fujii case, baseline measurements of supposedly randomly assigned groups later turned out to be fabricated.
When participants are randomly assigned to conditions, measures at baseline are expected to be statistically equivalent between the groups (i.e., equivalent distributions) and hence to produce uniformly distributed $p$-values.
However, in the Fujii case, Carlisle observed many large $p$-values, which ultimately led to the identification of potential data fabrication [@doi:10.1111/j.1365-2044.2012.07128.x].
The cause of such large $p$-values may be that the effect of randomness is underappreciated when fabricating statistically nonsignificant data due to (for example) widespread misunderstanding of what a $p$-value means [@doi:10.1007/s11336-015-9444-2;@doi:10.1053/j.seminhematol.2008.04.003], which results in groups of data that are too similar conditional on the null hypothesis of no differences between the groups.
As an illustration, we simulated normally distributed measurements for ten studies and their $t$-test comparisons in Table \@ref(tab:ddfab-tab1), under statistically equivalent populations (Set 1).
We also fabricated independent data for equivalent groups, where we fixed the mean and standard deviation for all studies and subsequently added (too) little uniform noise to these parameters (Set 2).
The expected value of a uniform $p$-value distribution is .5, but the fabricated data from our illustration have a mean $p$-value of `r round(mean(set2_df$ex2_1tp), 3)`.
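To see why an excess of large $p$-values is diagnostic, consider the following illustrative sketch (not part of the analyses reported in this chapter): simulating two-group $t$-tests under the null hypothesis yields $p$-values that are approximately uniform with a mean close to .5.

```r
# Sketch (assumption: two independent normal groups with equal population means),
# illustrating that p-values are uniformly distributed with mean .5 under the null.
set.seed(1)
p_null <- replicate(10000, t.test(rnorm(100, 50, 10),
                                  rnorm(100, 50, 10),
                                  var.equal = TRUE)$p.value)
mean(p_null)  # close to .5
hist(p_null)  # approximately flat
```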
```{r ddfab-tab1, echo = FALSE}
if (!knitr::is_html_output()) {
knitr::kable(df, format = 'latex', digits = 3, caption = "Examples of means and standard deviations for a continuous outcome in genuine and fabricated randomized clinical trials. Set 1 is randomly generated data under the null hypothesis of random assignment (assumed to be the genuine process), whereas Set 2 is generated under excessive consistency with equal groups. Each trial condition contains 100 participants. The $p$-values are the result of independent $t$-tests comparing the experimental and control conditions within each respective set of a study.", booktabs = TRUE) %>%
kable_styling(latex_options = c('striped', 'scale_down', 'hold_position'), position = 'center') %>%
add_header_above(c('', rep(c('Experimental', 'Control', ''), 2))) %>%
add_header_above(c('', 'Set 1' = 3, 'Set 2' = 3))
} else {
knitr::kable(df, digits = 3, caption = "Examples of means and standard deviations for a continuous outcome in genuine and fabricated randomized clinical trials. Set 1 is randomly generated data under the null hypothesis of random assignment (assumed to be the genuine process), whereas Set 2 is generated under excessive consistency with equal groups. Each trial condition contains 100 participants. The $p$-values are the result of independent $t$-tests comparing the experimental and control conditions within each respective set of a study.", booktabs = TRUE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE, position = 'center') %>%
add_header_above(c('', rep(c('Experimental', 'Control', ''), 2))) %>%
add_header_above(c('', 'Set 1' = 3, 'Set 2' = 3))
}
```
In order to test whether a distribution of independent $p$-values might be fabricated, we propose using the Fisher method [@Fisher1925-jl;@doi:10.1186/s41073-016-0012-9].
The Fisher method originally was intended as a meta-analytic tool, which tests whether there is sufficient evidence for an effect (i.e., right-skewed $p$-value distribution).
The original Fisher method is computed over the individual $p$-values ($p_i$) as
\begin{equation}
\chi^2_{2k}=-2\sum\limits^k_{i=1}\ln(p_i)
(\#eq:fisher)
\end{equation}
where the null hypothesis of a zero true effect size underlying all $k$ results is tested and is rejected for values of the test statistic larger than a certain value, typically the 95th percentile of $\chi^2_{2k}$, to conclude that the true effect size differs from zero for at least one of the $k$ results. The Fisher method can be adapted to test the same null hypothesis against the alternative that the results are closer to their expected values than expected under the null. The adapted test statistic of this so-called 'reversed Fisher method' is
\begin{equation}
\chi^2_{2k}=-2\sum\limits^k_{i=1}\ln(1-\frac{p_i-t}{1-t})
(\#eq:revfisher)
\end{equation}
where $t$ determines the range of $p$-values that are selected in the method. For instance, if $t=0$, all $p$-values are selected, whereas if $t=0.05$ only statistically nonsignificant results are selected in the method. Note that each result's contribution (between the brackets) is in the interval (0,1), as for the original Fisher method. The reversed Fisher method is similar (but not equivalent) to Carlisle's method testing for excessive homogeneity across baseline measurements in Randomized Controlled Trials [@doi:10.1111/anae.13938;@doi:10.1111/j.1365-2044.2012.07128.x;@doi:10.1111/anae.13126].
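As a minimal sketch of Equation \@ref(eq:revfisher), the reversed Fisher method could be implemented as follows; the function name `reversed_fisher` is ours for illustration and is not part of the `ddfab` package.

```r
# Sketch of the reversed Fisher method (Equation \ref{eq:revfisher}).
# `reversed_fisher` is an illustrative name, not a ddfab function.
reversed_fisher <- function(p, t = 0.05) {
  p <- p[p > t]                                   # select p-values above threshold t
  chi2 <- -2 * sum(log(1 - (p - t) / (1 - t)))
  data.frame(k = length(p),
             chi2 = chi2,
             p = pchisq(chi2, df = 2 * length(p), lower.tail = FALSE))
}
# Example: excessively large p-values yield a small test p-value.
reversed_fisher(c(.84, .92, .88, .95, .79))
```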
```{r, echo = FALSE}
threshold <- .05
genuine_p <- ex1_1tp[ex1_1tp > threshold]
fab_p <- set2_df$ex2_1tp[set2_df$ex2_1tp > threshold]
genuine_chi <- -2 * sum(log(1 - ((genuine_p - threshold) / (1 - threshold))))
fab_chi <- -2 * sum(log(1 - ((fab_p - threshold) / (1 - threshold))))
genuine_chi_p <- pchisq(q = genuine_chi, df = 2 * length(genuine_p), lower.tail = FALSE)
fab_chi_p <- pchisq(q = fab_chi, df = 2 * length(fab_p), lower.tail = FALSE)
x <- c(.21, -.08, -.37, -.08, .32)
pval <- pnorm(abs(x), 0, 1, lower.tail = FALSE) * 2
pval <- pval[pval > threshold]
chi <- -2 * sum(log(1 - ((pval - threshold) / (1 - threshold))))
p_chi <- pchisq(q = chi, df = 2 * length(pval), lower.tail = FALSE)
```
As an example, we apply the reversed Fisher method to both the genuine and fabricated results from Table \@ref(tab:ddfab-tab1).
Using the threshold $t=`r threshold`$ to select only the nonsignificant results from Table \@ref(tab:ddfab-tab1), we retain $k=`r length(genuine_p)`$ genuine $p$-values and $k=`r length(fab_p)`$ fabricated $p$-values. This results in $\chi^2_{2\times`r length(genuine_p)`}=`r round(genuine_chi, 3)`,p=`r round(genuine_chi_p, 3)`$ for the genuine data (Set 1), and $\chi^2_{2\times`r length(fab_p)`}=`r round(fab_chi, 3)`,p=`r round(fab_chi_p, 7)`$ for the fabricated data (Set 2).
Another example, from the Fujii case [@doi:10.1111/j.1365-2044.2012.07128.x], also illustrates that the reversed Fisher method may detect fabricated data; the $p$-values related to fentanyl dose [as presented in Table 3 of @doi:10.1111/j.1365-2044.2012.07128.x] for five independent comparisons also show excessively high $p$-values, $\chi^2_{2\times`r length(pval)`}=`r round(chi, 3)`, p=`r round(p_chi, 3)`$.
However, based on this anecdotal evidence little can be said about the sensitivity, specificity, and utility of the reversed Fisher method.
We note that incorrectly specified one-tailed tests can also result in excessive amounts of large $p$-values.
For correctly specified one-tailed tests, the $p$-value distribution is right-skewed if the alternative hypothesis is true.
When the alternative hypothesis is true, but the effect is in the opposite direction of the hypothesized effect (e.g., a negative effect when a one-tailed test for a positive effect is conducted), this results in a left-skewed $p$-value distribution.
As such, any potential data fabrication detected with this method would need to be inspected for misspecified one-tailed hypotheses to preclude false conclusions.
In the studies we present in this paper, misspecification of one-tailed hypothesis testing is not an issue because we prespecified the effect and its direction to the participants who were requested to fabricate data.
#### Variance analysis
In most empirical research papers, sample variance or standard deviation estimates are typically reported alongside means to indicate dispersion in the data. For example, if a sample has a reported age of $M(SD)=21.05(2.11)$ we know this sample is both younger and more homogeneous than another sample with reported $M(SD)=42.78(17.83)$ [see also @klaassen2014evidential for another approach].
Similar to the estimate of the mean in the data, there is sampling error in the estimated variance in the data (i.e., dispersion of the variance).
The sampling error of the estimated variance is inversely related to the sample size.
For example, under the assumption of normality the sampling error of a given standard deviation can be estimated as $\sigma/\sqrt{2n}$ [p. 351, @yule1922], where $n$ is the sample size of the group.
Additionally, if an observed random variable $x$ is normally distributed, the standardized variance of $x$ in sample $j$ is $\chi^2$-distributed [p. 445; @hogg-tanis]; that is
\begin{equation}
var(x)\sim\frac{\chi^2_{n_j-1}}{n_j-1}
(\#eq:varx)
\end{equation}
where $n_j$ is the sample size of the $j$th group. Assuming equal variances of the $k$ populations, this population variance is estimated by the Mean Squares within ($MS_w$) as
\begin{equation}
MS_w=\frac{\sum\limits^k_{j=1}(n_j-1)s^2_j}{\sum\limits^k_{j=1}(n_j-1)}
(\#eq:msw)
\end{equation}
where $s^2_j$ is the sample variance and $n_j$ the sample size in group $j$.
As such, under normality and equality of variances, the sampling distribution of standardized^[By dividing all variances by $MS_w$ their weighted average equals 1. This is what we call standardization for this scenario.] variances in group $j$ (i.e., $z^2_j$) is
\begin{equation}
z^2_j\sim\left(\frac{\chi^2_{n_j-1}}{n_j-1}\right)/MS_w
(\#eq:z2j)
\end{equation}
Using the theoretical sampling distribution of the standardized variances, we bootstrap the expected distribution of the dispersion of variances. In other words, we use the theoretical sampling distribution of the standard deviations to formulate a null model of the dispersion of variances that is in line with the probabilistic sampling processes for groups of equal population variances. First, we randomly draw standard deviations for all $j$ groups according to Equation \@ref(eq:varx). Second, we calculate $MS_w$ using those previously drawn values (Equation \@ref(eq:msw)). Third, we standardize the standard deviations using Equation \@ref(eq:z2j). Fourth, we compute the measure of dispersion across the $j$ groups as the standard deviation of the standardized variances [denoted $SD_z$, @doi:10.1177/0956797613480366] or as the range of the standardized variances (denoted $max_z-min_z$).
This process is repeated for $i$ iterations to generate a parametric bootstrap distribution of the dispersion of variances according to the null model of equal variances across populations.
The observed dispersion of the variances, when compared to its expected distribution, allows a test for potential data fabrication. To this end we compute the proportion of iterations that show equally- or more extreme consistency in the dispersion of the variances to compute a bootstrapped $p$-value (e.g., $P(X\leq SD_{obs})$), with $SD_{obs}$ the standard deviation of standardized variances and $X$ the random variable corresponding to the standard deviation of standardized variances under the null model.
In other words, we compute how many samples of $j$ groups under the null show the observed consistency of the dispersion in the variances (or more consistent), to test whether the data are plausible given a genuine probabilistic sampling process [@doi:10.1177/0956797613480366].
As with the Fisher method, such excessive consistency could result from the fabricator underappreciating the higher-level sampling fluctuations, thereby generating too little randomness (i.e., error) in the standard deviations across groups [@doi:10.1080/08989629508573866].
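The four bootstrap steps could be sketched in R as follows. This is a simplified illustration assuming normality and a single subgroup; the analyses in this chapter use `std_var()` from the `ddfab` package instead, and the function name `dispersion_p` is ours.

```r
# Illustrative parametric bootstrap for the dispersion of variances
# (the actual analyses use ddfab::std_var). `n` is a vector of group sizes.
dispersion_p <- function(sds, n, iter = 10000) {
  j <- length(sds)
  ms_w_obs <- sum((n - 1) * sds^2) / sum(n - 1)      # Equation (eq:msw)
  sd_obs <- sd(sds^2 / ms_w_obs)                     # observed SD of standardized variances
  sd_null <- replicate(iter, {
    s2 <- rchisq(j, df = n - 1) / (n - 1)            # Step 1: draw variances (Equation eq:varx)
    ms_w <- sum((n - 1) * s2) / sum(n - 1)           # Step 2: compute MS_w
    z2 <- s2 / ms_w                                  # Step 3: standardize (Equation eq:z2j)
    sd(z2)                                           # Step 4: dispersion measure SD_z
  })
  mean(sd_null <= sd_obs)                            # bootstrapped p-value P(X <= SD_obs)
}
# e.g., dispersion_p(sds = c(25.09, 24.58, 25.65), n = rep(15, 3))
```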
```{r, echo = FALSE}
sd1 <- c(ex1_1esd, ex1_1csd)
sd2 <- c(set2_df$ex2_1esd, set2_df$ex2_1csd)
iterate <- 100000
res1 <- std_var(n = rep(100, length(sd1)), sds = sd1, iter = iterate, subgroups = rep(1, length(sd1)))
res2 <- std_var(n = rep(100, length(sd2)), sds = sd2, iter = iterate, subgroups = rep(1, length(sd2)))
```
As an example, we apply the variance analysis to the illustration from Table \@ref(tab:ddfab-tab1) and the Smeesters case [@doi:10.1177/0956797613480366].
We apply the variance analysis across the standard deviations from each set in Table \@ref(tab:ddfab-tab1).
For the genuinely probabilistic data (Set 1), we find that the reported mean standard deviation is `r mean(sd1)` with a standard deviation equal to `r sd(sd1)`. For the fabricated data (Set 2), we find that the reported mean standard deviation is `r mean(sd2)` with a standard deviation equal to `r sd(sd2)`. Such means illustrate the differences, but are insufficient to test them.
Using the standard deviation of variances as the dispersion of variances measure, we can quantify how extreme this difference is using the previously outlined procedure. Results indicate that Set 1 has no excessive consistency in the dispersion of the standard deviations ($p=`r round(res1, 3)`$), whereas Set 2 does show excessive consistency in the dispersion of the standard deviations ($p=`r round(res2, 3)`$).
In words, out of 100,000 randomly selected samples under the null model of independent groups with equal variances on a normally distributed measure, $`r res1 * iterate`$ showed less dispersion in standard deviations for Set 1, whereas only $`r res2 * iterate`$ showed less dispersion in standard deviations for Set 2.
As a non-fictional example, three independent conditions from a study in the Smeesters case ($n_j=15$) were reported to have standard deviations 25.09, 24.58, and 25.65 [@doi:10.1177/0956797613480366].
Here too, we can use the outlined procedure to see whether these reported standard deviations are more consistent than expected given the theoretical sampling fluctuation of the second moment of the data.
The standard deviation of these standard deviations is $`r round(sd(c(25.09, 24.58, 25.65)), 2)`$.
Comparing this to 100,000 randomly selected replications under the theoretical null model, such consistency in standard deviations (or even more) would only be observed in `r round(std_var(n = rep(15, 3), sds = c(25.09, 24.58, 25.65), iter = iterate, subgroups = rep(1, 3)) * 100, 2)`% of those [@doi:10.1177/0956797613480366].
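For reference, the `ddfab::std_var()` call underlying this percentage (hidden in the analysis chunks of this chapter) takes the reported sample sizes and standard deviations directly:

```r
# The Smeesters example above corresponds to the following ddfab call,
# mirroring the arguments used in the hidden analysis chunks.
library(ddfab)
std_var(n = rep(15, 3),                    # three conditions of 15 participants each
        sds = c(25.09, 24.58, 25.65),      # reported standard deviations
        iter = 100000,                     # number of bootstrap iterations
        subgroups = rep(1, 3))             # all conditions in one subgroup
```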
#### Extreme effect sizes
There is sufficient evidence that data fabrication can result in (too) large effects.
For example, in the misconduct investigations in the Stapel case, large effect sizes were used as an indicator of data fabrication [@Levelt2012], with some papers showing incredibly large effect sizes that translate into explained variances of up to 95%, or effect sizes larger than the product of the reliabilities of the related measures.
Moreover, @doi:10.1186/1471-2288-3-18 asked faculty members from three universities to fabricate data sets and found that the fabricated data generally showed much larger effect sizes than the genuine data.
From our own anecdotal experience, we have found that exceedingly large effect sizes (e.g., $d>20$) raised initial suspicions of data fabrication.
In clinical trials, extreme effect sizes are also used to identify potentially fabricated data in multisite trials while the study is still being conducted [@doi:10.1016/0197-24569190037-M].
```{r, echo = FALSE}
tval <- 3.55
df <- 59
```
Effect sizes can be reported in research reports in various ways.
For example, effect sizes in psychology papers are often reported as a standardized mean difference (e.g., $d$) or as an explained variance (e.g., $R^2$).
A test statistic can be transformed into a measure of effect size.
A test result such as $t(`r df`)=`r tval`$ in a between-subjects design corresponds to $d=`r round(2*tval / sqrt(df), 3)`$ and $r=`r round(sqrt((tval^2 / df) / ((tval^2 / df) + 1)), 3)`$ [@doi:10.1525/collabra.71].
These effect sizes can readily be recomputed based on data extracted with `statcheck` across thousands of results [see Appendix B or @doi:10.3758/s13428-015-0664-2;@doi:10.3390/data1030014].
Observed effect sizes can subsequently be compared with the effect distribution of other studies investigating the same effect.
For example, if a study on the 'foot-in-the-door' technique [@doi:10.1146/annurev.psych.55.090902.142015] yields an effect size of $r=.8$, we can collect other studies that investigate the 'foot-in-the-door' effect and compare how extreme that $r=.8$ is in comparison to the other studies.
If the largest observed effect size in the distribution is $r=.2$ and a reasonable number of studies on the 'foot-in-the-door' effect have been conducted, an extremely large effect might be considered a flag for potential data fabrication.
This method specifically looks at situations where fabricators would want to fabricate the existence of an effect (not the absence of one).
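As an illustration of these conversions and of the percentile comparison described above, consider the following sketch; the reference effect sizes are made up for illustration and the helper `t_to_effects` is not part of our analysis code.

```r
# Sketch: convert a between-subjects t-test result to d and r, and locate an
# observed effect in a reference distribution of effects (illustrative values).
t_to_effects <- function(t, df) {
  data.frame(d = 2 * t / sqrt(df),
             r = sqrt((t^2 / df) / ((t^2 / df) + 1)))
}
t_to_effects(t = 3.55, df = 59)             # the example reported in the text

# Percentile of an observed r among (hypothetical) studies on the same effect
reference_r <- c(.05, .08, .11, .13, .20)   # illustrative reference studies
observed_r <- .8
mean(reference_r < observed_r)              # proportion of smaller reference effects
```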
### Detecting data fabrication in raw data
#### Digit analysis
The properties of leading (first) digits (e.g., the 1 in 123.45) or terminal (last) digits (e.g., the 5 in 123.45) may be examined in raw data. Here we focus on testing the distribution of leading digits based on the Newcomb-Benford Law (NBL) and testing the distribution of terminal digits based on the uniform distribution in order to detect potentially fabricated data.
For leading digits, the Newcomb-Benford Law or NBL [@doi:10.2307/2369148;@doi:10.2307/984802] states that these digits do not have an equal probability of occurring under certain conditions, but rather a monotonically decreasing probability. A leading digit is the left-most digit of a numeric value, where a digit is any of the nine natural numbers ($1,2,3,...,9$). The distribution of the leading digit is, according to the NBL:
\begin{equation}
P(d)=log_{10}\frac{1+d}{d}
(\#eq:nbl)
\end{equation}
where $d$ is the natural number of the leading digit and $P(d)$ is the probability of $d$ occurring. Table \@ref(tab:nbl) indicates the expected leading digit distribution based on the NBL. This expected distribution is typically compared to the observed distribution using a $\chi^2$-test ($df=9-1$). In order to make such a comparison feasible, it requires a minimum of 45 observations based on the rule of thumb outlined by @isbn:0471360937 ($n=I\times J\times 5$, with $I$ rows and $J$ columns). The NBL has been applied to detect financial fraud [e.g., @doi:10.2307/27643897], voting fraud [e.g., @durtschi2004effective], and also problems in scientific data [@doi:10.1007/s00101-017-0333-1;@doi:10.1515/9783110508420-010].
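A minimal sketch of such a $\chi^2$-test against the NBL is shown below; the analyses in this chapter use the `ddfab` package, so the helper `nbl_test` is only illustrative, and the lognormal example data are an assumption chosen to span many orders of magnitude.

```r
# Sketch: test observed leading digits against the Newcomb-Benford Law with a
# chi-squared test (illustrative helper; not part of the analysis code).
nbl_test <- function(x) {
  x <- abs(x[x != 0])
  lead <- floor(x / 10^floor(log10(x)))          # first nonzero digit
  observed <- table(factor(lead, levels = 1:9))
  expected <- log10((1 + 1:9) / 1:9)             # Equation (eq:nbl)
  chisq.test(observed, p = expected)
}
nbl_test(rlnorm(500, meanlog = 5, sdlog = 3))    # wide-ranging positive values
```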
```{r nbl, echo=FALSE}
tab <- data.frame(digit = 1:9, prop = log10((1+1:9)/1:9))
names(tab) <- c('Digit', 'Proportion')
if (!knitr::is_html_output()) {
knitr::kable(tab, format = 'latex', digits = 3, caption = 'The expected first digit distribution, based on the Newcomb-Benford Law.', booktabs = TRUE) %>%
kableExtra::kable_styling(latex_options = c('striped', 'hold_position'), position = 'center')
} else {
knitr::kable(tab, digits = 3, caption = 'The expected first digit distribution, based on the Newcomb-Benford Law.', booktabs = TRUE) %>%
kableExtra::kable_styling(position = 'center', full_width = FALSE,
                            bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
```
However, the NBL only applies under specific conditions that are rarely fulfilled in the social sciences. Hence, its applicability for detecting data fabrication in science can be questioned. First, the NBL only applies for true ratio scale measures [@doi:10.2307/2246134;@doi:10.1214/11-ps175]. Second, sufficient range on the measure is required for the NBL to apply [i.e., range from at least $1-1000000$ or $1-10^6$;@doi:10.1198/tast.2009.0005]. Third, these measures should not be subject to digit preferences, for example due to psychological preferences for rounded numbers. Fourth, any form of truncation undermines the NBL [@doi:10.1515/9781400866595-011]. Moreover, some research has even indicated that humans might be able to fabricate data that are in line with the NBL [@doi:10.1080/02664760601004940;@Burns2009], immediately undermining the applicability of the NBL in the context of detecting data fabrication.
For terminal digits, analysis is based on the principle that the rightmost digit is the most random digit of a number, hence, is expected to be uniformly distributed under specific conditions [@doi:10.1080/08989629508573866;@doi:10.1080/03610919608813325]. Terminal digit analysis is also conducted using a $\chi^2$-test ($df=10-1$) on the digit occurrence counts (including zero), where the observed frequencies are compared with the expected uniform frequencies. The rule of thumb outlined by @isbn:0471360937 indicates at least 50 observations are required to provide a meaningful test of the terminal digit distribution ($n=I\times J \times 5$, with $I$ rows and $J$ columns). Terminal digit analysis was developed during the Imanishi-Kari case by @doi:10.1080/03610919608813325 [for a history of this decade long case, see @isbn:9780393319705].
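Similarly, a terminal digit test can be sketched as a $\chi^2$-test against a uniform distribution over the digits 0--9. The analyses below use `digit_analysis()` from the `ddfab` package; the helper here is illustrative and assumes values are recorded to a fixed number of decimals.

```r
# Sketch: terminal digit analysis, testing the rightmost digit against a
# uniform distribution (illustrative helper; analyses use ddfab::digit_analysis).
terminal_digit_test <- function(x) {
  digits <- gsub("[^0-9]", "", as.character(x))       # strip sign and decimal point
  last <- as.integer(substr(digits, nchar(digits), nchar(digits)))
  observed <- table(factor(last, levels = 0:9))
  chisq.test(observed, p = rep(1 / 10, 10))
}
# Example with values recorded to two decimals:
terminal_digit_test(sprintf("%.2f", rnorm(500, 100, 15)))
```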
```{r, echo=FALSE}
if(!file.exists('./assets/data/study_02/p5')) {
p1 <- NULL
p2 <- NULL
p3 <- NULL
p4 <- NULL
p5 <- NULL
for(i in 1:1000) {
set.seed(1234 + i)
tmp1 <- 0
tmp2 <- 1
n <- 500
y <- data.frame(value = rnorm(n, tmp1, tmp2))
x <- abs(y$value)
splitted <- strsplit(as.character(x), "")
df1 <- data.frame(digit = as.numeric(unlist(lapply(splitted, function(x) return(x[1])))))
df1 <- data.frame(digit = df1[!df1$digit == 0,])
df2 <- data.frame(digit = as.numeric(unlist(lapply(splitted, function(x) return(x[3])))))
df3 <- data.frame(digit = as.numeric(unlist(lapply(splitted, function(x) return(x[4])))))
df4 <- data.frame(digit = as.numeric(unlist(lapply(splitted, function(x) return(x[5])))))
df5 <- data.frame(digit = as.numeric(unlist(lapply(splitted, function(x) return(x[6])))))
p1[i] <- digit_analysis(df1[,1], type = "terminal")$pval
p2[i] <- digit_analysis(df2[,1], type = "terminal")$pval
p3[i] <- digit_analysis(df3[,1], type = "terminal")$pval
p4[i] <- digit_analysis(df4[,1], type = "terminal")$pval
p5[i] <- digit_analysis(df5[,1], type = "terminal")$pval
}
save(p1, file = './assets/data/study_02/p1')
save(p2, file = './assets/data/study_02/p2')
save(p3, file = './assets/data/study_02/p3')
save(p4, file = './assets/data/study_02/p4')
save(p5, file = './assets/data/study_02/p5')
}
load('./assets/data/study_02/p1')
load('./assets/data/study_02/p2')
load('./assets/data/study_02/p3')
load('./assets/data/study_02/p4')
load('./assets/data/study_02/p5')
```
Figure \@ref(fig:digit-dist) depicts simulated digit counts for the first- through third digit of a random, standard normally distributed variable (i.e., $N\sim(0,1)$). The first- and second digit distributions are clearly non-uniform, whereas the third digit distribution seems only slightly non-uniform. As such, the rightmost digit can be expected to be uniformly distributed if sufficient precision is provided [@doi:10.1080/08989629508573866]. What counts as sufficient precision depends on the process generating the data. In our example with $N\sim(0,1)$, the distribution of the third and later digits seems well-approximated by the uniform distribution. <!-- we investigated by running a small simulation study, drawing 500 random values from a normal distribution ($N\sim(0,1)$) thousand times and conducting a terminal digit test for each of the first five digits. For the third-, fourth-, and fifth- digits, tests operated on nominal $\alpha$ levels (i.e., under $\alpha=.05$, false positives were $`r round(sum(p3 > .05) / 1000, 3)`$, $`r round(sum(p4 > .05) / 1000, 3)`$, $`r round(sum(p5 > .05) / 1000, 3)`$, respectively). Hence, sufficient precision for our purposes is determined as the terminal digit being conducted on at least the third leading digit (i.e., minimally 1.23 or 12.3 or 123). -->
```{r digit-dist, echo = FALSE, fig.cap="Frequency distributions of the first-, second-, and third digits. We sampled 100,000 values from a standard normal distribution to create these digit distributions.", fig.align = 'center', fig.asp=.625}
set.seed(1234)
tmp1 <- 0
tmp2 <- 1
y <- data.frame(value = rnorm(100000, tmp1, tmp2))
x <- abs(y$value)
splitted <- strsplit(as.character(x), "")
df1 <- data.frame(digit = as.numeric(unlist(lapply(splitted, function(x) return(x[1])))))
df1 <- data.frame(digit = df1[!df1$digit == 0,])
df2 <- data.frame(digit = as.numeric(unlist(lapply(splitted, function(x) return(x[3])))))
df3 <- data.frame(digit = as.numeric(unlist(lapply(splitted, function(x) return(x[4])))))
df4 <- data.frame(digit = as.numeric(unlist(lapply(splitted, function(x) return(x[5])))))
df5 <- data.frame(digit = as.numeric(unlist(lapply(splitted, function(x) return(x[6])))))
overall <- ggplot(y, aes(x = value)) +
geom_density(alpha = .3, fill = 'red') +
xlab(sprintf("Value from N~(%s,%s)", tmp1, tmp2)) +
ylab("Density") +
theme(axis.title=element_text(size=10)) +
scale_x_continuous(breaks = seq(0, .4, .1), labels = seq(0, .4, .1))
p1 <- ggplot(df1, aes(x = digit)) +
geom_bar(alpha = .3, fill = 'blue') +
xlab("First digit") +
ylab("Frequency") +
theme(axis.title=element_text(size=10)) +
scale_x_discrete(breaks = 0:10, limits = 0:9)
p2 <- ggplot(df2, aes(x = digit)) +
geom_bar(alpha = .3, fill = 'blue') +
xlab("Second digit") +
ylab("") +
theme(axis.title=element_text(size=10)) +
scale_x_discrete(breaks = 0:10, limits = 0:9)
p3 <- ggplot(df3, aes(x = digit)) +
geom_bar(alpha = .3, fill = 'blue') +
xlab("Third digit") +
ylab("") +
theme(axis.title=element_text(size=10)) +
scale_x_discrete(breaks = 0:10, limits = 0:9)
p4 <- ggplot(df4, aes(x = digit)) +
geom_bar(alpha = .3, fill = 'blue') +
xlab("Fourth digit") +
ylab("") +
theme(axis.title=element_text(size=10)) +
scale_x_discrete(breaks = 0:10, limits = 0:9)
# theme(aspect.ratio = .625)
p5 <- ggplot(df5, aes(x = digit)) +
geom_bar(alpha = .3, fill = 'blue') +
xlab("Fifth digit") +
ylab("") +
theme(axis.title=element_text(size=10)) +
scale_x_discrete(breaks = 0:10, limits = 0:9)
# theme(aspect.ratio = .625)
# Plot
empty <- ggplot() + theme(panel.background = element_rect(fill = 'white'))
gridExtra::grid.arrange(p1, p2, p3,
ncol = 3)
```
#### Multivariate associations
Variables or measurements included in one study can have multivariate associations that might be non-obvious to researchers. Hence, such relations between variables or measurements might be overlooked by people who fabricate data. Fabricators might also simply be practically unable to fabricate data that reflect these multivariate associations, even if they are aware of these associations. For example, in response time latencies, there typically is a positive relation between mean response time and the variance of the response time. Given that the genuine multivariate relations between different variables arise from stochastic processes and are not readily known in either their form or size, these might be difficult to take into account for someone who wants to fabricate data. As such, using multivariate associations to discern fabricated data from genuine data might prove worthwhile.
The multivariate associations between different variables can be estimated from control data that are (arguably) genuine. For example, if the multivariate association between means ($M$s) and standard deviations ($SD$s) is of interest, control data for that same measure can be collected from the literature. With these control data, a meta-analysis provides an overall estimate of the multivariate relation that can subsequently be used to verify the credibility of a set of statistics.
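A sketch of how such a reference estimate might be obtained with the `metafor` package, using Fisher's $z$ transformation of the correlations, is shown below; the control correlations and sample sizes are illustrative, not real data.

```r
# Sketch: meta-analytically combine correlations between Ms and SDs from
# (arguably) genuine control studies, using Fisher's z transformation.
# The correlations and sample sizes below are illustrative only.
library(metafor)
control_r <- c(.10, .15, .08, .19, .12)          # observed M-SD correlations
control_n <- c(80, 120, 95, 60, 150)             # number of M-SD pairs per study
fit <- rma(yi = atanh(control_r),                # Fisher z transformed correlations
           sei = 1 / sqrt(control_n - 3),        # standard error of Fisher z
           method = "REML")
tanh(coef(fit))                                  # pooled estimate back on the r scale
```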
```{r, echo=FALSE}
cortmp <- NULL
set.seed(1234)
for(i in 1:100) {
n <- 100 # length of vector
rho <- rnorm(1, .123, .1) # desired correlation = cos(angle)
theta <- acos(rho) # corresponding angle
x1 <- rnorm(n, 75, 10) # fixed given data
x2 <- rnorm(n, 25, 4) # new random data
X <- cbind(x1, x2) # matrix
Xctr <- scale(X, center=TRUE, scale=FALSE) # centered columns (mean 0)
Id <- diag(n) # identity matrix
Q <- qr.Q(qr(Xctr[ , 1, drop=FALSE])) # QR-decomposition, just matrix Q
P <- tcrossprod(Q) # = Q Q' # projection onto space defined by x1
x2o <- (Id-P) %*% Xctr[ , 2] # x2ctr made orthogonal to x1ctr
Xc2 <- cbind(Xctr[ , 1], x2o) # bind to matrix
Y <- Xc2 %*% diag(1/sqrt(colSums(Xc2^2))) # scale columns to length 1
x <- Y[ , 2] + (1 / tan(theta)) * Y[ , 1] # final new vector
x <- x+25
cortmp[i] <- cor(x1, x) #
}
cortmp <- data.frame(cortmp)
corfabno <- data.frame(cor = .2)
corfab <- data.frame(cor = .5)
# x <- metafor::rma(atanh(cortmp$cortmp), sei = 1 / sqrt(n - 3), method = 'REML')
# atanh(corfab$cor)
```
Specifically, the multivariate associations from the genuine data are subsequently used to estimate the extremity of an observed multivariate relation in investigated data. Consider the following fictitious example, regarding the multivariate association between $M$s and $SD$s for a response latency task mentioned earlier. Figure \@ref(fig:multivariate-association) depicts a (simulated) population distribution of the association (e.g., a correlation) between $M$s and $SD$s from the literature ($N\sim(.123, .1)$). Assume we have two papers, each coming from a pool of direct replications providing an equal number of $M$s and corresponding $SD$s. Associations between these statistics are $`r corfab$cor`$ for Paper 1 and $`r corfabno$cor`$ for Paper 2. From Figure \@ref(fig:multivariate-association) we see that the association in Paper 1 has a much higher percentile score in the distribution (i.e., $`r pnorm(corfab$cor, mean(cortmp$cortmp), sd(cortmp$cortmp), lower.tail = TRUE) * 100`$th percentile) than that of Paper 2 (i.e., $`r pnorm(corfabno$cor, mean(cortmp$cortmp), sd(cortmp$cortmp), lower.tail = TRUE) * 100`$th percentile).
```{r multivariate-association, echo=FALSE, out.width='100%', fig.cap="Distribution of 100 simulated observed associations between $M$s and $SD$s for a response latency task; simulated under $N(.123,.1)$. The red- and blue dots indicate observed multivariate associations from fictitious papers. Paper 1 may be considered relatively extreme and of interest for further inspection; Paper 2 may be considered relatively normal.", fig.align = 'center'}
ggplot(cortmp, aes(x = cortmp)) +
geom_density() +
xlim(-1, 1) +
xlab("Association between M and SD") +
ylab("Density") +
geom_point(data = corfabno, aes(x = cor, y = 0, col="Paper 2", size = 2)) +
geom_point(data = corfab, aes(x = cor, y = 0, col="Paper 1", size = 2)) +
scale_size('', guide = 'none') +
scale_fill_discrete('', guide = FALSE)
```
## Study 1 - detecting fabricated summary statistics
We tested the performance of statistical methods to detect data fabrication in summary statistics using both genuine and fabricated summary statistics of psychological data. We asked participants to fabricate data that were supposedly drawn from a study on the anchoring effect [@doi:10.1126/science.185.4157.1124;@doi:10.1037/e722982011-058].
The anchoring effect is a well-known psychological heuristic in which the information contained in the question serves as the starting point for the answer, which is then adjusted to yield a final estimate of a quantity.
For example:
> *Do you think the percentage of African countries in the UN is above or below [10\% or 65\%]? What do you think is the percentage of African countries in the UN?*
In their classic study, @doi:10.1126/science.185.4157.1124 varied the anchor in this question between 10% and 65% and found that they yielded mean responses of 25\% and 45\%, respectively [@doi:10.1126/science.185.4157.1124].
We chose the anchoring effect because it is well known and because a considerable amount of (arguably) genuine data sets on the anchoring heuristic are freely available [[https://osf.io/pqf9r](https://osf.io/pqf9r); @doi:10.1027/1864-9335/a000178].
This allowed us to compare data knowingly and openly fabricated by our participants (researchers in psychology) to actual data that can be assumed to be genuine because they were drawn from a large-scale international project involving many contributing labs (a so-called Many Labs study).
Our data fabrication study was approved by Tilburg University's Ethics Review Board (EC-2015.50; [https://osf.io/7tg8g/](https://osf.io/7tg8g/)).
### Methods
```{r prep study 1, echo=FALSE}
# Compute the summary statistics for Many Labs
suppressWarnings(suppressMessages(source('./assets/functions/compute_summary_anch_ml.R')))
# Process the collected, fabricated data
suppressMessages(source('./assets/functions/process_qualtrics_anch_01.R'))
# Concatenate, analyze, and write out all ML and qualtrics data
if(!file.exists('./assets/data/study_01/study1_res.csv'))
{
set.seed(123);suppressMessages(source('./assets/functions/concatenate_analyze_01.R'))
}
dat_summary <- read.csv('./assets/data/study_01/study1_res.csv', stringsAsFactors = FALSE)
# r2 => r
## First set a negative effect size to zero for study 4 of R_6opw4qQURhqxT3D
dat_summary[dat_summary$id == 'R_6opw4qQURhqxT3D' & dat_summary$study == 'Study 4' & dat_summary$test == 'Effect size (r2) interaction',]$result <- 0
dat_summary$result[grepl(dat_summary$test, pattern = 'Effect size (r2)*')] <- sqrt(dat_summary$result[grepl(dat_summary$test, pattern = 'Effect size (r2)*')])
dat_summary$test <- sub(pattern = '(r2)', replacement = 'r', dat_summary$test)
# add grouping
dat_summary$fabricated <- as.factor(ifelse(grepl(dat_summary$id, pattern = 'R_*'), 'Fabricated', 'Genuine'))
# ROC
AUC <- plyr::ddply(dat_summary, .(test, study), .fun = function(x) {
if (grepl(x$test[1], pattern = 'Effect size')) {
    tmp <- pROC::roc(response = x$fabricated, x$result, direction = ">", ci = TRUE)
  } else {
    tmp <- pROC::roc(response = x$fabricated, x$result, direction = "<", ci = TRUE)
}
return(data.frame(auc = tmp$auc, ci_lb = tmp$ci[1], ci_ub = tmp$ci[3]))
})
# add CIs?
AUC <- AUC[!grepl(AUC$test, pattern = 'P-value*'),]
dat_summary$rng_plot <- sprintf('%s, RNG: %s', dat_summary$fabricated, dat_summary$rng)
```
We collected genuine summary statistics from the Many Labs study and fabricated summary statistics from our participating fabricators for four anchoring studies: (i) distance from San Francisco to New York, (ii) human population of Chicago, (iii) height of Mount Everest, and (iv) the number of babies born per day in the United States [@doi:10.1037/e722982011-058].
Each of the four (genuine or fabricated) studies provided us with summary statistics in a 2 (low/high anchoring) $\times$ 2 (male/female) factorial design. Our analysis of the data fabrication detection methods used the summary statistics (i.e., means, standard deviations, and test results) of the four anchoring studies fabricated by each participant or the four anchoring studies that had actually been conducted by each participating lab in the Many Labs project [@doi:10.1027/1864-9335/a000178].
The test results available are the main effect of the anchoring condition, the main effect of gender, and the interaction effect between the anchoring and gender conditions.
For current purposes, a participant is defined as the researcher or lab from which the four anchoring studies' summary statistics originate.
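To make explicit how such test results follow from summary statistics, the sketch below computes the two main effects and the interaction for a 2 $\times$ 2 between-subjects design with equal cell sizes from cell means and standard deviations; the cell values are illustrative and the helper `anova_2x2_summary` is not part of our analysis code.

```r
# Sketch: F-tests for a 2x2 between-subjects design computed from cell means,
# SDs, and a common cell size. Cell values below are illustrative only.
anova_2x2_summary <- function(m, s, n) {
  # m, s: 2x2 matrices of cell means and SDs (rows = anchoring, cols = gender)
  grand <- mean(m)
  ms_w  <- sum((n - 1) * s^2) / (4 * (n - 1))              # pooled within-cell MS
  ss_a  <- 2 * n * sum((rowMeans(m) - grand)^2)            # anchoring main effect
  ss_b  <- 2 * n * sum((colMeans(m) - grand)^2)            # gender main effect
  ss_ab <- n * sum((m - outer(rowMeans(m), colMeans(m), "+") + grand)^2)
  f <- c(anchoring = ss_a, gender = ss_b, interaction = ss_ab) / ms_w
  data.frame(F = f, df1 = 1, df2 = 4 * (n - 1),
             p = pf(f, 1, 4 * (n - 1), lower.tail = FALSE))
}
anova_2x2_summary(m = matrix(c(2500, 4000, 2600, 3900), 2, 2),
                  s = matrix(c(900, 1100, 950, 1050), 2, 2),
                  n = 25)
```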
All materials, data, and analyses scripts are freely available on the OSF ([https://osf.io/b24pq](https://osf.io/b24pq)) and a preregistration is available at [https://osf.io/tshx8/](https://osf.io/tshx8/).
Throughout this report, we will indicate which facets were not preregistered or deviate from the preregistration (for example, by denoting "(not preregistered)" or "(deviation from preregistration)") and explain the reason for the deviation.
#### Data collection
We downloaded thirty-six genuine data sets from the publicly available Many Labs (ML) project [[https://osf.io/pqf9r](https://osf.io/pqf9r); @doi:10.1027/1864-9335/a000178].
The ML project replicated several effects across thirty-six locations, including the anchoring effect in the four studies mentioned previously.
Considering the size of the ML project, the transparency of research results, and the minimal individual gain for fabricating data, we felt confident in assuming these data are genuine.
For each of the thirty-six labs we computed three summary statistics (i.e., sample sizes, means, and standard deviations) for each of the four conditions in the four anchoring studies (i.e., $3\times4\times4$; data: [https://osf.io/5xgcp/](https://osf.io/5xgcp/)).
We computed these summary statistics from the raw ML data, which were cleaned using the original analysis scripts from the ML project.
The sampling frame for the participants asked to fabricate data consisted of 2,038 psychology researchers who published a peer-reviewed paper in 2015, as indexed in Web of Science (WoS) with the filter set to the U.S.
We sampled psychology researchers to improve familiarity with the anchoring effect [@doi:10.1126/science.185.4157.1124;@doi:10.1037/e722982011-058].
We filtered for U.S. researchers to ensure familiarity with the imperial measurement system, which is the scale of some of the anchoring studies, and in order to reduce heterogeneity across fabricators.^[We discovered that, contrary to our initial aim, we included several non-U.S. researchers. We filtered Web of Science on U.S. origin, but this only required that one of the authors on a paper was U.S.-based; corresponding authors might therefore still be non-U.S. Based on a search through the open-ended comments of the participants' responses, there was no mention of issues in fabricating the data related to the metric or imperial system.]
We searched WoS on October 13, 2015. In total, 2,038 unique corresponding emails were extracted from 2,014 papers (due to multiple corresponding authors).
From these 2,038 psychology researchers, we emailed a random sample of 1,000 researchers to participate in our study (April 25, 2016; [osf.io/s4w8r](https://osf.io/s4w8r)).
We used Qualtrics and removed identifying information not essential to the study (e.g., no IP-addresses saved).
We informed the participating researchers that the study would require them to fabricate data and explicitly mentioned that we would investigate these data with statistical methods to detect data fabrication.
We also clarified to the participants that they could stop at any time without providing a reason.
Participants who were willing to enter their email address received a $30 Amazon gift card as compensation for their participation.
They could win an additional $50 Amazon gift card if they were one of three top fabricators (participants were not informed about how we planned to detect data fabrication; the procedure for this is explained in the Data Analysis section).
The provided email addresses were unlinked from individual responses upon sending the bonus gift cards.
The full Qualtrics survey is available at [osf.io/rg3qc](https://osf.io/rg3qc).
Each participant was instructed to fabricate 32 summary statistics (4 studies $\times$ 2 anchoring conditions $\times$ 2 sexes $\times$ 2 statistics [mean and $SD$]) that corresponded to three hypotheses.
We instructed participants to fabricate results for the following hypotheses: there is (i) a positive main effect of the anchoring condition, (ii) no effect of sex, and (iii) no interaction effect between condition and sex.
We fixed the sample sizes in the fabricated anchoring studies to 25 per cell so that participants did not need to fabricate sample sizes.
These fabricated summary statistics and their accompanying test results for these three hypotheses serve as the data to examine the properties of statistical tools to detect data fabrication.
We provided participants with a template spreadsheet to fill out the fabricated data, in order to standardize the fabrication process without restraining the participant in how they chose to fabricate data.
Figure \@ref(fig:spreadsheet-study1) depicts an example of this spreadsheet (original: [https://osf.io/w6v4u](https://osf.io/w6v4u)).
We requested participants to fill out the yellow cells with fabricated data, which included means and standard deviations for the four conditions.
Using these values, the spreadsheet automatically computed the statistical tests and instantly showed the results in the "Current result" column.
If these results supported the (fabrication) hypotheses, a checkmark appeared as depicted in Figure \@ref(fig:spreadsheet-study1).
We required participants to copy-paste the yellow cells into Qualtrics.
This provided a standardized response format that could be automatically processed in the analyses.
Technically, participants could provide a response that did not correspond to the instructions but none of them did.
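To make this computation concrete, the sketch below shows how such test results can be derived from cell means and standard deviations alone; it is a minimal illustration with hypothetical values and a made-up function name, not the template's actual formulas, assuming a balanced 2 (anchoring) $\times$ 2 (sex) between-subjects ANOVA with $n=25$ per cell.

```{r summary-anova-sketch, eval=FALSE}
# Minimal sketch (hypothetical, not the template's actual formulas): a balanced
# 2x2 between-subjects ANOVA computed from cell means and SDs with n per cell.
anova_2x2_from_summary <- function(m, s, n = 25) {
  # m, s: 2x2 matrices of cell means and SDs; rows = anchoring (low/high), columns = sex
  grand <- mean(m)
  ss_anchor <- 2 * n * sum((rowMeans(m) - grand)^2)   # main effect of anchoring
  ss_sex    <- 2 * n * sum((colMeans(m) - grand)^2)   # main effect of sex
  ss_inter  <- n * sum((m - outer(rowMeans(m), colMeans(m), "+") + grand)^2)
  ms_within <- mean(s^2)                              # pooled error variance
  df_within <- 4 * (n - 1)
  f <- c(anchoring = ss_anchor, sex = ss_sex, interaction = ss_inter) / ms_within
  data.frame(F = f, p = pf(f, df1 = 1, df2 = df_within, lower.tail = FALSE))
}

# Hypothetical fabricated cells: strong anchoring effect, no sex or interaction effect
m <- matrix(c(400, 650, 405, 655), nrow = 2)  # rows: low/high anchor; columns: sex
s <- matrix(c(80, 90, 85, 95), nrow = 2)
anova_2x2_from_summary(m, s)
```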
```{r spreadsheet-study1, fig.cap="Example of a filled out template spreadsheet used in the fabrication process of Study 1. Respondents fabricated data in the yellow cells, which were used to automatically compute the results of the hypothesis tests, shown in the column \"Current result\". If the fabricated data confirm the hypotheses, a checkmark appeared in a green cell (one of four template spreadsheets available at https://osf.io/w6v4u).", out.width='100%', echo=FALSE, fig.align = 'center'}
knitr::include_graphics('./assets/figures/spreadsheet.png')
# One row per data set: id, bonus indicator, and self-reported RNG use
tmp <- unique(cbind(dat_summary$id, dat_summary$bonus, as.numeric(as.character(dat_summary$rng))))
# Fabricated (participant) data sets; Qualtrics response ids start with 'R_'
count1 <- sum(grepl(tmp[,1], pattern = 'R_'))
# Fabricated data sets from participants who did not opt in for the bonus
nobonus1 <- sum(grepl(tmp[,1], pattern = 'R_') & is.na(tmp[,2]))
# Fabricated data sets from participants who reported using a random number generator
rng1 <- sum(grepl(tmp[,1], pattern = 'R_') & tmp[,3] == 1)
```
Upon completion of the data fabrication, we debriefed respondents within Qualtrics (full survey: [osf.io/rg3qc/](https://osf.io/rg3qc/)).
Respondents self-rated their statistical knowledge (1 = extremely poor, 10 = excellent) and indicated which statistical analysis programs they used frequently (i.e., at least once per week), whether they had ever conducted an anchoring study themselves, whether they used a random number generator to fabricate data in this study, whether they fabricated raw data to get summary statistics, how many combinations of means and standard deviations they created for each study (on average), and a free-text description of their fabrication procedures per study.
Lastly, we reminded participants that data fabrication is widely condemned by professional organizations, institutions, and funding agencies alike.
This reminder was intended to minimize potential carry-over effects of the unethical behavior into actual research practice [@doi:10.1509/jmkr.45.6.633].
<!-- update this reference to a DOI if published in AMPSS in time -->
Using quota sampling, we collected as many responses as possible for the available 36 rewards, resulting in `r count1` fabricated data sets ([https://osf.io/e6zys](https://osf.io/e6zys); `r nobonus1` participants did not participate for a bonus).
#### Data analysis
We analyzed the genuine and fabricated data sets (36 and 39, respectively), with each data set consisting of summary statistics of four anchoring studies. The data set is the unit of analysis. Four types of analyses are conducted on each of the 75 data sets: (i) the reversed Fisher method, (ii) variance analyses, (iii) the Fisher method applied to the results of the former two, and (iv) analysis of the effect sizes of the statistically significant anchoring effect of the four anchoring studies. Per type of analysis, we examine whether we can distinguish the 36 genuine from the 39 fabricated data sets, mainly using Area Under the Receiver Operating Characteristic (AUROC) curves. Below we first describe each of the four types of analyses, followed by a description of the AUROC curve analysis.
We conducted two analyses to detect data fabrication using the reversed Fisher method. More specifically, we conducted one reversed Fisher method analysis for the four statistically nonsignificant results of the gender effect (one per anchoring study) and one for the four statistically nonsignificant interaction effects (one per anchoring study). This results in two reversed Fisher method results (based on $k$=4) per data set.
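As a rough illustration, the sketch below shows the logic of the reversed Fisher method; it is a minimal sketch assuming the method replaces $p$ with $1-p$ in Fisher's formula to flag an excess of large, nonsignificant $p$-values (as laid out in the Theoretical Framework), and the function name and example $p$-values are hypothetical.

```{r reversed-fisher-sketch, eval=FALSE}
# Minimal sketch: combine k nonsignificant p-values; an excess of p-values close
# to 1 yields a large chi-square value and hence a small combined p-value.
reversed_fisher <- function(p) {
  chi2 <- -2 * sum(log(1 - p))
  pchisq(chi2, df = 2 * length(p), lower.tail = FALSE)
}

# Hypothetical nonsignificant gender effects from the four anchoring studies
reversed_fisher(c(.81, .94, .76, .88))
```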
For the variance analyses, we substantially deviated from the preregistration ([https://osf.io/tshx8/](https://osf.io/tshx8/)) and added multiple analyses. We analyzed the 16 sample variances (four anchoring studies $\times$ four conditions per anchoring study) per lab or participant in fourteen different ways. Each of the fourteen variance analyses was conducted using two dispersion of variance measures. One measure inspects the standard deviation of the sample variances (i.e., $SD_z$); one measure inspects the range of the sample variances (i.e., $max_z-min_z$); we ran all 28 analyses with 100,000 iterations from which we computed the bootstrapped $p$-value (see also the Theoretical Framework). Of these 28 variance analyses (14 for each dispersion of variances measure), only one was preregistered. This was the variance analysis combining all 16 sample variances of the four anchoring studies. Upon analyzing the results of this preregistered variance analysis, however, we realized that the variance analyses assume that the included variances are from the same population distribution. Assuming homogeneous populations of variances is unrealistic for the four very different anchoring conditions or studies (i.e., they have outcome measures on very different scales, such as distances in miles and babies born). Hence, we included variance analyses based on subgroups, where we analyzed each anchoring study separately (four variance analyses) or analyzed each anchoring condition of each study separately (i.e., the low/high anchoring condition collapsed across gender; eight variance analyses). We also conducted one variance analysis that combined all variances across studies but takes into account the subgroups per anchoring condition per study.
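The sketch below illustrates the general logic of such a bootstrapped variance analysis for a single subgroup. It is a simplified illustration with hypothetical values, assuming normally distributed data and a common population variance estimated by the mean of the reported variances; it does not reproduce the exact standardization described in the Theoretical Framework.

```{r variance-analysis-sketch, eval=FALSE}
# Simplified sketch of a bootstrapped dispersion-of-variances test: fabricated
# data are expected to show *less* dispersion among sample variances than
# sampling under normality would produce.
var_dispersion_test <- function(s2, n = 25, iter = 100000, measure = c("sd", "range")) {
  measure <- match.arg(measure)
  disp <- function(x) if (measure == "sd") sd(x / mean(x)) else diff(range(x / mean(x)))
  obs <- disp(s2)
  sims <- replicate(iter, {
    # Sample variances under normality: (n - 1) * s^2 / sigma^2 ~ chi-square(n - 1)
    disp(mean(s2) * rchisq(length(s2), df = n - 1) / (n - 1))
  })
  mean(sims <= obs)  # bootstrapped p-value; small = suspiciously little dispersion
}

# Hypothetical sample variances of the four cells within one anchoring study
var_dispersion_test(c(20.1, 19.8, 20.3, 20.0), n = 25, iter = 10000)
```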
We also combined the reversed Fisher method results with the results from the variance analyses using the original Fisher method. More specifically, we combined the results from the two reversed Fisher method analyses (one analysis for the four gender effects and one analysis for the four interaction effects) with the preregistered variance analysis (the result of this analysis was used to determine the three most difficult to detect fabricated datasets and subsequently to reward the 'best fabricators'). We additionally applied the Fisher method to results of the reversed Fisher method (two results) with three different combinations of results of the variance analyses; based on variance analyses per anchoring study (four results), per anchoring study $\times$ condition combination (eight results), and across all studies and conditions but taking into account heterogeneous variances per anchoring condition for each study (one result). Hence, the additional Fisher method analyses were based on six, ten, and three results, respectively. Throughout these combinations, we only use the $SD_z$ dispersion of variance measure for parsimony. Note that the performance of the Fisher method combining results of various analyses (the reversed Fisher method and the variance analyses) as we do here is naturally dependent on the performance of the individual results included in the combination; if all included results perform well the Fisher method is bound to perform well and vice versa.
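For reference, a minimal sketch of Fisher's method as used for these combinations (the function name and input values are hypothetical):

```{r fisher-combination-sketch, eval=FALSE}
# Fisher's method: combine k independent p-values into one chi-square test with
# 2k degrees of freedom; a small combined p-value flags the data set.
fisher_combine <- function(p) {
  chi2 <- -2 * sum(log(p))
  pchisq(chi2, df = 2 * length(p), lower.tail = FALSE)
}

# E.g., two reversed Fisher method results plus one variance analysis result
# (hypothetical values)
fisher_combine(c(.40, .08, .03))
```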
Finally, we looked at statistically significant effect sizes. We expected fabricated statistically significant effects to be larger than genuine statistically significant effects. As such, we compared the 75 statistically significant anchoring effects for each of the four anchoring studies separately (not preregistered).
For each of the previously described statistical methods to detect data fabrication, we carried out AUROC curve analyses.
AUROC analyses summarize the sensitivity (i.e., True Positive Rate [TPR]) and specificity (i.e., True Negative Rate [TNR]) for various decision criteria (e.g., $\alpha=0, .01, .02, ..., .99, 1$).
For our purposes, AUROC values indicate the probability that a randomly drawn fabricated and genuine dataset can be correctly classified as fabricated or genuine based on the result of the analysis [@doi:10.1148/radiology.143.1.7063747].
In other words, if $AUROC=.5$, the probability of correctly classifying a randomly drawn dataset as fabricated (or genuine) equals 50% (assuming equal prevalences).
For this setting, we follow the guidelines of @doi:10.1093/jpepsy/jst062 and regard any AUROC value $<.7$ as poor for detecting data fabrication, $.7\leq$ AUROC $<.8$ as fair, $.8\leq$ AUROC $<.9$ as good, and AUROC $\geq.9$ as excellent.
We conducted all analyses using the `pROC` package [@doi:10.1186/1471-2105-12-77].
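As an illustration of how such an AUROC is obtained with `pROC`, consider the sketch below; the inputs are simulated and hypothetical, and the `direction` argument encodes the directional hypothesis, here that fabricated data sets yield smaller values of the analyzed statistic than genuine data sets.

```{r auroc-sketch, eval=FALSE}
library(pROC)

# Hypothetical inputs: a label per data set and one analysis result per data set
# (e.g., a variance-analysis p-value), with fabricated data sets skewed toward 0.
status <- factor(c(rep("Fabricated", 39), rep("Genuine", 36)))
result <- c(rbeta(39, 1, 5), runif(36))

# The first factor level ("Fabricated") is treated as controls; direction = "<"
# states that controls are expected to have smaller values than cases ("Genuine").
roc_out <- roc(response = status, predictor = result, direction = "<", ci = TRUE)
roc_out$auc  # AUROC; roc_out$ci gives the 95% CI
```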
### Results
Figure \@ref(fig:ddfab-density1) shows a group-level comparison of the genuine ($k=36$) and fabricated ($k=`r count1`$) datasets, which contain four $p$-values and relevant effect sizes ($r$) for each type of effect (gender, anchoring, interaction) per dataset (i.e., $75\times4$ data points for each plot).
These group-level comparisons provide a general overview of the differences between the genuine and fabricated data. Figure \@ref(fig:ddfab-density1) (right and left column) already indicates that there are few systematic differences between fabricated and genuine summary statistics from the anchoring studies when statistically nonsignificant effects are inspected (i.e., gender and interaction hypotheses).
However, there seem to be larger differences when we required participants to fabricate statistically significant summary statistics (i.e., anchoring hypothesis; middle column).
We discuss results bearing on the specific tests for data fabrication next.
```{r ddfab-density1, fig.cap="Density distributions of genuine and fabricated summary statistics across four anchoring studies, per effect (gender, anchoring, or interaction across columns) and type of result ($p$-value or effect size across rows).", echo=FALSE}
# Gender
plot_p_gender <- ggplot(dat_summary[dat_summary$test == 'P-value gender',],
aes(x = result, fill = fabricated)) +
geom_density(alpha = .3) +
xlab(latex2exp::TeX("$P$-value")) +
ylab("Density") +
ylim(0, 1.75) +
scale_fill_viridis(guide=FALSE, discrete = TRUE) +
ggtitle('Gender')
plot_es_gender <- ggplot(dat_summary[dat_summary$test == 'Effect size (r) gender',],
aes(x = result, fill = fabricated)) +
geom_density(alpha = .3) +
xlab(latex2exp::TeX("Effect size ($r$)")) +
ylab("Density") +
scale_fill_viridis(guide=FALSE, discrete = TRUE)
# Condition
dat_summary_tmp <- dat_summary
dat_summary_tmp$result <- log10(dat_summary_tmp$result)
plot_p_condition <- ggplot(dat_summary_tmp[dat_summary_tmp$test == 'P-value anchoring',],
aes(x = result, fill = fabricated)) +
geom_density(alpha = .3) +
xlab(latex2exp::TeX("log10 $P$-value")) +
ylab("Density") +
scale_fill_viridis(labels = c("Fabricated","Genuine"), discrete = TRUE) +
theme(legend.position="top", legend.text=element_text(size=6),legend.key.size =unit(.4, 'cm')) +
ggtitle('Anchoring') +
scale_y_continuous(breaks = c(0, 0.02, .04), labels = seq(0, .04, .02))
plot_p_condition$labels$fill <- ""
plot_es_condition <- ggplot(dat_summary[dat_summary$test == 'Effect size (r) anchoring',],
aes(x = result, fill = fabricated)) +
geom_density(alpha = .3) +
xlim(0, 1) +
xlab(latex2exp::TeX("Effect size ($r$)")) +
ylab("Density") +
scale_fill_viridis(guide=FALSE, discrete = TRUE)
# Interaction
plot_p_interaction <- ggplot(dat_summary[dat_summary$test == 'P-value interaction',],
aes(x = result, fill = fabricated)) +
geom_density(alpha = .3) +
xlab(latex2exp::TeX("$P$-value")) +
ylab("Density") +
scale_fill_viridis(guide=FALSE, discrete = TRUE) +
ylim(0, 1.5) +
ggtitle('Interaction')
plot_es_interaction <- ggplot(dat_summary[dat_summary$test == 'Effect size (r) interaction',],
aes(x = result, fill = fabricated)) +
geom_density(alpha = .3) +
xlab(latex2exp::TeX("Effect size ($r$)")) +
ylab("Density") +
scale_fill_viridis(guide=FALSE, discrete = TRUE)
# Plot
suppressWarnings(gridExtra::grid.arrange(plot_p_gender,
plot_p_condition,
plot_p_interaction,
plot_es_gender,
plot_es_condition,
plot_es_interaction,
ncol = 3))
```
```{r echo = FALSE}
sel1 <- AUC[AUC$test == 'Fisher method gender p-values', ]
sel2 <- AUC[AUC$test == 'Fisher method interaction p-values', ]
```
#### $P$-value analysis
```{r echo = FALSE}
selGen <- dat_summary$test == 'Fisher method gender p-values'
selInt <- dat_summary$test == 'Fisher method interaction p-values'
dfGen <- unique(dat_summary[selGen, c(1, 7)])
write.csv(dfGen[with(dfGen, order(-result)), ], './assets/data/study_01/p-value-gender.csv', row.names = FALSE)
dfInt <- unique(dat_summary[selInt, c(1, 7)])
write.csv(dfInt[with(dfInt, order(-result)), ], './assets/data/study_01/p-value-int.csv', row.names = FALSE)
dfGenFabDet <- sum(grepl(x = dfGen$id, 'R_') & dfGen$result < .01)
dfGenGenDet <- sum(!grepl(x = dfGen$id, 'R_') & dfGen$result < .01)
dfIntFabDet <- sum(grepl(x = dfInt$id, 'R_') & dfInt$result < .01)
dfIntGenDet <- sum(!grepl(x = dfInt$id, 'R_') & dfInt$result < .01)
```
When we applied the reversed Fisher method to the statistically nonsignificant effects, results indicated its performance is approximately equal to chance classification. We found $AUROC=`r round(sel1$auc[1], 3)`$, 95% CI [$`r round(sel1$ci_lb[1], 3)`$-$`r round(sel1$ci_ub[1], 3)`$] for statistically nonsignificant gender effects and $AUROC=`r round(sel2$auc[1], 3)`$, 95% CI [$`r round(sel2$ci_lb[1], 3)`$-$`r round(sel2$ci_ub[1], 3)`$] for statistically nonsignificant interaction effects. For the gender effects, we classified `r dfGenFabDet` of the `r sum(grepl(x = dfGen$id, 'R_'))` fabricated summary statistics as fabricated ($\alpha=.01$) and `r dfGenGenDet` of the `r sum(!grepl(x = dfGen$id, 'R_'))` genuine summary statistics as fabricated (results per respondent available at [osf.io/a6jb4](https://osf.io/a6jb4)). For the interaction effects, we classified `r dfIntFabDet` of the `r sum(grepl(x = dfInt$id, 'R_'))` fabricated summary statistics ($\alpha=.01$) and `r dfIntGenDet` of the `r sum(!grepl(x = dfInt$id, 'R_'))` genuine summary statistics as fabricated (results per respondent available at [osf.io/jz57p](https://osf.io/jz57p)). In other words, results from this sample indicated that detection of fabricated data using the distribution of statistically nonsignificant $p$-values to detect excessive amounts of high $p$-values does not seem promising.
#### Variance analysis
```{r echo = FALSE}
df <- data.frame(popvar = c('Heterogeneity', rep('Homogeneity', 5), rep('Heterogeneity', 8)),
study = c(rep('Overall', 2), paste('Study', 1:4),
'Study 1, low anchoring', 'Study 1, high anchoring',
'Study 2, low anchoring', 'Study 2, high anchoring',
'Study 3, low anchoring', 'Study 3, high anchoring',
'Study 4, low anchoring', 'Study 4, high anchoring'),
SD_z = sprintf('%s [%s-%s]',
round(AUC$auc[grepl(AUC$test, pattern = 'Variance analysis sd')], 3),
round(AUC$ci_lb[grepl(AUC$test, pattern = 'Variance analysis sd')], 3),
round(AUC$ci_ub[grepl(AUC$test, pattern = 'Variance analysis sd')], 3)),
maxz_minz = sprintf('%s [%s-%s]',
round(AUC$auc[grepl(AUC$test, pattern = 'Variance analysis maxmin')], 3),
round(AUC$ci_lb[grepl(AUC$test, pattern = 'Variance analysis maxmin')], 3),
round(AUC$ci_ub[grepl(AUC$test, pattern = 'Variance analysis maxmin')], 3))
)
# AUROC values of the per-study and per-condition variance analyses (range reported in the text)
tmp <- AUC$auc[grepl(AUC$study, pattern = 'Study') & grepl(AUC$test, pattern = 'Variance analysis')]
selSDHomoOverall <- dat_summary$test == 'Variance analysis sd [homogeneity]' & dat_summary$study == 'Overall'
dfSDHomoOverall <- unique(dat_summary[selSDHomoOverall, c(1, 7)])
write.csv(dfSDHomoOverall[with(dfSDHomoOverall, order(-result)), ], './assets/data/study_01/sd-homo-overall.csv', row.names = FALSE)
dfSDHomoOverallFabDet <- sum(grepl(x = dfSDHomoOverall$id, 'R_') & dfSDHomoOverall$result < .01)
dfSDHomoOverallGenDet <- sum(!grepl(x = dfSDHomoOverall$id, 'R_') & dfSDHomoOverall$result < .01)
selMAXMINHomoOverall <- dat_summary$test == 'Variance analysis maxmin [homogeneity]' & dat_summary$study == 'Overall'
dfMAXMINHomoOverall <- unique(dat_summary[selMAXMINHomoOverall, c(1, 7)])
write.csv(dfMAXMINHomoOverall[with(dfMAXMINHomoOverall, order(-result)), ], './assets/data/study_01/maxmin-homo-overall.csv', row.names = FALSE)
dfMAXMINHomoOverallFabDet <- sum(grepl(x = dfMAXMINHomoOverall$id, 'R_') & dfMAXMINHomoOverall$result < .01)
dfMAXMINHomoOverallGenDet <- sum(!grepl(x = dfMAXMINHomoOverall$id, 'R_') & dfMAXMINHomoOverall$result < .01)
```
We expected the dispersion of variances to be lower in fabricated data as opposed to genuine data. We computed the AUROC values for the variance analyses with the directional hypothesis that genuine data shows more variation than fabricated data, using either the dispersion of variance as captured by the standard deviation of the variances (i.e., $SD_z$) or the range of the variances (i.e., $max_z-min_z$). AUROC results of all `r dim(df)[1]` analyses (as described in the Data analysis section) are presented in Table \@ref(tab:auc-var1), one result for each dispersion of variance measure. Of these `r dim(df)[1]` results, we only preregistered the variance analysis inspecting the standardized variances across all studies under both the $SD_z$ and $max_z-min_z$ operationalizations, assuming unrealistically homogeneous population variances ([https://osf.io/tshx8/](https://osf.io/tshx8/); second row of Table \@ref(tab:auc-var1)). As we did not preregister the other variance analyses, these should be considered exploratory.
```{r auc-var1, echo = FALSE}
names(df) <- c('Population variance assumption',
'Study', '$SD_z$', '$max_{z}-min_{z}$')
if (!knitr::is_html_output()) {
knitr::kable(df, format = 'latex', digits = 3, caption = "Area Under the Receiver Operating Characteristic (AUROC) values of each variance analysis and operationalization, including its 95 percent Confidence Interval. 'Heterogeneity' assumes unequal population variances for the low- and high anchoring conditions, whereas 'homogeneity' assumes equal population variances across anchoring conditions in the same study. We preregistered only the analyses in the second row.", booktabs = TRUE, escape = FALSE) %>%
kableExtra::kable_styling(latex_options = c('striped', 'hold_position', 'scale_down'), position = 'center')
} else {
knitr::kable(df, digits = 3, caption = "Area Under the Receiver Operating Characteristic (AUROC) values of each variance analysis and operationalization, including its 95 percent Confidence Interval. 'Heterogeneity' assumes unequal population variances for the low- and high anchoring conditions, whereas 'homogeneity' assumes equal population variances across anchoring conditions in the same study. We preregistered only the analyses in the second row.", booktabs = TRUE, escape = FALSE) %>%
kableExtra::kable_styling(position = 'center', full_width = FALSE,
bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
sel <- AUC[grepl(AUC$test, pattern = 'Variance analysis'), ]
```
Under the (in hindsight unrealistic) assumption of homogeneous population variances, our preregistered variance analyses did not perform above chance level.
Using the standard deviation of the variances (i.e., $SD_z$) as dispersion of variance measure, the results are: $AUROC=`r round(sel$auc[16], 3)`$, 95% CI [$`r round(sel$ci_lb[16], 3)`$-$`r round(sel$ci_ub[16], 3)`$]. With this statistic, we classified `r dfSDHomoOverallFabDet` of the `r sum(grepl(x = dfSDHomoOverall$id, 'R_'))` fabricated summary statistics ($\alpha=.01$) and `r dfSDHomoOverallGenDet` of the `r sum(!grepl(x = dfSDHomoOverall$id, 'R_'))` genuine summary statistics as fabricated (results per respondent available at [osf.io/9cjdh](https://osf.io/9cjdh)).
Using the range of the variances (i.e., $max_z-min_z$) as dispersion of variance, the results are: $AUROC=`r round(sel$auc[2], 3)`$, 95% CI [$`r round(sel$ci_lb[2], 3)`$-$`r round(sel$ci_ub[2], 3)`$].
With this statistic, we detected `r dfMAXMINHomoOverallFabDet` of the `r sum(grepl(x = dfMAXMINHomoOverall$id, 'R_'))` fabricated summary statistics as fabricated ($\alpha=.01$) and `r dfMAXMINHomoOverallGenDet` of the `r sum(!grepl(x = dfMAXMINHomoOverall$id, 'R_'))` genuine summary statistics as fabricated (results per respondent available at [osf.io/2ts6b](https://osf.io/2ts6b)).
Comparing the results for $SD_z$ and $max_z-min_z$ indicates that the range of the variances seems more robust to violations of the assumption of homogeneous variances than the standard deviation of the variances. Overall, these results indicate that violating the homogeneity assumption may undermine the variance analyses when the included variances are in fact heterogeneous. Such assumptions should be made explicit and checked whenever possible, to prevent improper use of these methods.
```{r, echo = FALSE}
selSDHeteroOverall <- dat_summary$test == 'Variance analysis sd [heterogeneity]' & dat_summary$study == 'Overall'
dfSDHeteroOverall <- unique(dat_summary[selSDHeteroOverall, c(1, 7)])
write.csv(dfSDHeteroOverall[with(dfSDHeteroOverall, order(-result)), ], './assets/data/study_01/sd-hetero-overall.csv', row.names = FALSE)
dfSDHeteroOverallFabDet <- sum(grepl(x = dfSDHeteroOverall$id, 'R_') & dfSDHeteroOverall$result < .01)
dfSDHeteroOverallGenDet <- sum(!grepl(x = dfSDHeteroOverall$id, 'R_') & dfSDHeteroOverall$result < .01)
selmaxminHeteroOverall <- dat_summary$test == 'Variance analysis maxmin [heterogeneity]' & dat_summary$study == 'Overall'
dfmaxminHeteroOverall <- unique(dat_summary[selmaxminHeteroOverall, c(1, 7)])
write.csv(dfmaxminHeteroOverall[with(dfmaxminHeteroOverall, order(-result)), ], './assets/data/study_01/maxmin-hetero-overall.csv', row.names = FALSE)
dfmaxminHeteroOverallFabDet <- sum(grepl(x = dfmaxminHeteroOverall$id, 'R_') & dfmaxminHeteroOverall$result < .01)
dfmaxminHeteroOverallGenDet <- sum(!grepl(x = dfmaxminHeteroOverall$id, 'R_') & dfmaxminHeteroOverall$result < .01)
```
We conducted exploratory analyses that take into account the heterogeneity of variances across conditions and studies, which in some cases improved the performance of detecting data fabrication.
Analyses separated per study or anchoring condition show variable $AUROC$ results (ranging from `r round(min(tmp), 3)`-`r round(max(tmp), 3)`; rows 3-14 in Table \@ref(tab:auc-var1)). Using the standard deviation of variances (i.e., $SD_z$; row 1 in Table \@ref(tab:auc-var1)) in a heterogeneous manner across the conditions and studies, $AUROC=`r round(sel$auc[15], 3)`$, 95% CI [$`r round(sel$ci_lb[15], 3)`$-$`r round(sel$ci_ub[15], 3)`$].
With this statistic, we classified `r dfSDHeteroOverallFabDet` of the `r sum(grepl(x = dfSDHeteroOverall$id, 'R_'))` fabricated summary statistics as fabricated ($\alpha=.01$) and `r dfSDHeteroOverallGenDet` of the `r sum(!grepl(x = dfSDHeteroOverall$id, 'R_'))` genuine summary statistics (results per respondent available at [osf.io/srpg9](https://osf.io/srpg9)).
Using the range of variances (i.e., $max_z-min_z$) in a heterogeneous manner across the conditions and studies, $AUROC=`r round(sel$auc[1], 3)`$, 95% CI [$`r round(sel$ci_lb[1], 3)`$-$`r round(sel$ci_ub[1], 3)`$].
With this statistic, we classified `r dfmaxminHeteroOverallFabDet` of the `r sum(grepl(x = dfmaxminHeteroOverall$id, 'R_'))` fabricated summary statistics as fabricated ($\alpha=.01$) and `r dfmaxminHeteroOverallGenDet` of the `r sum(!grepl(x = dfmaxminHeteroOverall$id, 'R_'))` genuine summary statistics (results per respondent available at [osf.io/93rek](https://osf.io/93rek)).
#### Combining $p$-value and variance analyses
```{r, echo = FALSE}
selCombHomoComb <- dat_summary$test == 'Combination Fisher method (gender, interaction, variance sd [homogeneity, combined])' & dat_summary$study == 'Overall'
dfCombHomoComb <- unique(dat_summary[selCombHomoComb, c(1, 7)])
write.csv(dfCombHomoComb[with(dfCombHomoComb, order(-result)), ], './assets/data/study_01/combination-homo-combined.csv', row.names = FALSE)
dfCombHomoCombFabDet <- sum(grepl(x = dfCombHomoComb$id, 'R_') & dfCombHomoComb$result < .01)
dfCombHomoCombGenDet <- sum(!grepl(x = dfCombHomoComb$id, 'R_') & dfCombHomoComb$result < .01)
# https://osf.io/hq29t/
selCombHomoSplit <- dat_summary$test == 'Combination Fisher method (gender, interaction, variance sd [homogeneity, split])' & dat_summary$study == 'Overall'
dfCombHomoSplit <- unique(dat_summary[selCombHomoSplit, c(1, 7)])
write.csv(dfCombHomoSplit[with(dfCombHomoSplit, order(-result)), ], './assets/data/study_01/combination-homo-split.csv', row.names = FALSE)
dfCombHomoSplitFabDet <- sum(grepl(x = dfCombHomoSplit$id, 'R_') & dfCombHomoSplit$result < .01)
dfCombHomoSplitGenDet <- sum(!grepl(x = dfCombHomoSplit$id, 'R_') & dfCombHomoSplit$result < .01)
# https://osf.io/r8pf5/
selCombHeteroComb <- dat_summary$test == 'Combination Fisher method (gender, interaction, variance sd [heterogeneity, combined])' & dat_summary$study == 'Overall'
dfCombHeteroComb <- unique(dat_summary[selCombHeteroComb, c(1, 7)])
write.csv(dfCombHeteroComb[with(dfCombHeteroComb, order(-result)), ], './assets/data/study_01/combination-hetero-combined.csv', row.names = FALSE)
dfCombHeteroCombFabDet <- sum(grepl(x = dfCombHeteroComb$id, 'R_') & dfCombHeteroComb$result < .01)
dfCombHeteroCombGenDet <- sum(!grepl(x = dfCombHeteroComb$id, 'R_') & dfCombHeteroComb$result < .01)
# https://osf.io/zt3nk/
selCombHeteroSplit <- dat_summary$test == 'Combination Fisher method (gender, interaction, variance sd [heterogeneity, split])' & dat_summary$study == 'Overall'
dfCombHeteroSplit <- unique(dat_summary[selCombHeteroSplit, c(1, 7)])
write.csv(dfCombHeteroSplit[with(dfCombHeteroSplit, order(-result)), ], './assets/data/study_01/combination-hetero-split.csv', row.names = FALSE)
dfCombHeteroSplitFabDet <- sum(grepl(x = dfCombHeteroSplit$id, 'R_') & dfCombHeteroSplit$result < .01)
dfCombHeteroSplitGenDet <- sum(!grepl(x = dfCombHeteroSplit$id, 'R_') & dfCombHeteroSplit$result < .01)
# https://osf.io/sv35k/
```
Our preregistered analysis combined the homogeneous variance analysis across studies and conditions with the $p$-value analyses of the gender and interaction effects. This combined analysis yielded $AUROC=`r round(AUC$auc[3], 3)`$, 95% CI [`r round(AUC$ci_lb[3], 3)`-`r round(AUC$ci_ub[3], 3)`]. With this statistic, we classified `r dfCombHomoCombFabDet` of the `r sum(grepl(x = dfCombHomoComb$id, 'R_'))` fabricated summary statistics as fabricated ($\alpha=.01$) and `r dfCombHomoCombGenDet` of the `r sum(!grepl(x = dfCombHomoComb$id, 'R_'))` genuine summary statistics (results per respondent available at [osf.io/hq29t](https://osf.io/hq29t)). Given that the combination method cannot be expected to perform much better than its constituent results, it follows that this combination of $p$-value and variance analyses also performs poorly.
The poor performance is in part due to the unrealistic assumption of homogeneous variances in the variance analysis; we therefore explored the efficacy of other combinations that loosen this assumption.
First, we split the variance analyses per study and included the four resulting variance analysis results instead of the one overall result; $AUROC=`r round(AUC$auc[4], 3)`$, 95% CI [`r round(AUC$ci_lb[4], 3)`-`r round(AUC$ci_ub[4], 3)`]. With this statistic, we classified `r dfCombHomoSplitFabDet` of the `r sum(grepl(x = dfCombHomoSplit$id, 'R_'))` fabricated summary statistics as fabricated ($\alpha=.01$) and `r dfCombHomoSplitGenDet` of the `r sum(!grepl(x = dfCombHomoSplit$id, 'R_'))` genuine summary statistics (results per respondent available at [osf.io/r8pf5](https://osf.io/r8pf5)).
Second, we split the variance analyses further, across conditions within studies, which adds another four variance analyses (a total of eight); $AUROC=`r round(AUC$auc[2], 3)`$, 95% CI [`r round(AUC$ci_lb[2], 3)`-`r round(AUC$ci_ub[2], 3)`]. With this statistic, we classified `r dfCombHeteroSplitFabDet` of the `r sum(grepl(x = dfCombHeteroSplit$id, 'R_'))` fabricated summary statistics as fabricated ($\alpha=.01$) and `r dfCombHeteroSplitGenDet` of the `r sum(!grepl(x = dfCombHeteroSplit$id, 'R_'))` genuine summary statistics (results per respondent available at [osf.io/sv35k](https://osf.io/sv35k)).
Finally, we replaced the original homogeneous variance analysis (row 2 in Table \@ref(tab:auc-var1)) with the overall and heterogeneous variance analysis (row 1 in Table \@ref(tab:auc-var1)); $AUROC=`r round(AUC$auc[1], 3)`$, 95% CI [`r round(AUC$ci_lb[1], 3)`-`r round(AUC$ci_ub[1], 3)`]. With this statistic, we classified `r dfCombHeteroCombFabDet` of the `r sum(grepl(x = dfCombHeteroComb$id, 'R_'))` fabricated summary statistics as fabricated ($\alpha=.01$) and `r dfCombHeteroCombGenDet` of the `r sum(!grepl(x = dfCombHeteroComb$id, 'R_'))` genuine summary statistics (results per respondent available at [osf.io/zt3nk](https://osf.io/zt3nk)).
As the $AUROC$s of these combinations did not exceed those of the variance analyses alone, we conclude that combining the $p$-value and variance analyses did not improve upon the variance analyses by themselves.
```{r auc-std-var, echo = FALSE}
df <- data.frame(comb = c('Gender, interaction, variance $SD_z$ (heterogeneity, overall, k = 1)',
'Gender, interaction, variance $SD_z$ (heterogeneity, split, k = 8)',
'Gender, interaction, variance $SD_z$ (homogeneity, overall, k = 1)',
'Gender, interaction, variance $SD_z$ (homogeneity, split, k = 4)'),
AUROC = sprintf('%s [%s-%s]',
round(AUC$auc[1:4], 3),
round(AUC$ci_lb[1:4], 3),
round(AUC$ci_ub[1:4], 3))
)
names(df) <- c('', 'AUROC')
# knitr::kable(df, digits = 3, caption = "Area Under the Receiver Operating Characteristic (AUROC) values for the various combined p-value and variance analyses, with corresponding 95 percent confidence intervals. Heterogeneity assumes population variances differ for the low- and high anchoring conditions, whereas homogeneity assumes equal population variances across anchoring conditions. Overall indicates that the variance analysis was conducted across all studies simultaneously. Split indicates the variance analyses are separated per study or per anchoring condition per study, for homogeneous and heterogeneous approaches, respectively. K indicates the number of p-values included. Only the result from the third row was preregistered.", booktabs = TRUE, escape = FALSE) %>%
# kable_styling(latex_options = c('striped', 'hold_position'))
```
#### Extreme effect sizes
```{r select, echo = FALSE}
tmp1 <- AUC[AUC$test == 'Effect size (r) anchoring',]
medfab <- summary(dat_summary$result[dat_summary$test == 'Effect size (r) anchoring' & dat_summary$fabricated == 'Fabricated'])[3]
medgen <- summary(dat_summary$result[dat_summary$test == 'Effect size (r) anchoring' & dat_summary$fabricated == 'Genuine'])[3]
# Per study: largest genuine anchoring effect size (max) and the percentage of
# fabricated anchoring effect sizes that exceed it (perc)
max1 <- max(dat_summary$result[dat_summary$test == 'Effect size (r) anchoring' & dat_summary$study == 'Study 1' & !duplicated(dat_summary) & dat_summary$fabricated == 'Genuine'])
fab1 <- dat_summary$result[dat_summary$test == 'Effect size (r) anchoring' & dat_summary$study == 'Study 1' & !duplicated(dat_summary) & dat_summary$fabricated == 'Fabricated']
perc1 <- (sum(fab1 > max1) / length(fab1)) * 100
max2 <- max(dat_summary$result[dat_summary$test == 'Effect size (r) anchoring' & dat_summary$study == 'Study 2' & !duplicated(dat_summary) & dat_summary$fabricated == 'Genuine'])
fab2 <- dat_summary$result[dat_summary$test == 'Effect size (r) anchoring' & dat_summary$study == 'Study 2' & !duplicated(dat_summary) & dat_summary$fabricated == 'Fabricated']
perc2 <- (sum(fab2 > max2) / length(fab2)) * 100
max3 <- max(dat_summary$result[dat_summary$test == 'Effect size (r) anchoring' & dat_summary$study == 'Study 3' & !duplicated(dat_summary) & dat_summary$fabricated == 'Genuine'])
fab3 <- dat_summary$result[dat_summary$test == 'Effect size (r) anchoring' & dat_summary$study == 'Study 3' & !duplicated(dat_summary) & dat_summary$fabricated == 'Fabricated']
perc3 <- (sum(fab3 > max3) / length(fab3)) * 100
max4 <- max(dat_summary$result[dat_summary$test == 'Effect size (r) anchoring' & dat_summary$study == 'Study 4' & !duplicated(dat_summary) & dat_summary$fabricated == 'Genuine'])
fab4 <- dat_summary$result[dat_summary$test == 'Effect size (r) anchoring' & dat_summary$study == 'Study 4' & !duplicated(dat_summary) & dat_summary$fabricated == 'Fabricated']
perc4 <- (sum(fab4 > max4) / length(fab4)) * 100
```
Using the statistically significant effect sizes from the anchoring studies, we differentiated between the fabricated and genuine results fairly well. Figure \@ref(fig:ddfab-density1) (middle column, second row) indicates that the fabricated statistically significant effects were considerably different from the genuine ones. When we inspected the effect size distributions ($r$), we saw that the median fabricated effect size across the four studies was $`r round(medfab, 3)`$ whereas the median genuine effect size was considerably smaller ($`r round(medgen, 3)`$; median difference across the four anchoring effects $`r round(medfab - medgen, 3)`$). In contrast to the fabricated nonsignificant effects, which resembled the genuine data quite well, the statistically significant effects seem to have been harder to fabricate for the participants. More specifically, the $AUROC$s for the four studies each approximate .75 ($`r round(tmp1$auc[1], 3)`$, 95% CI [$`r round(tmp1$ci_lb[1], 3)`$-$`r round(tmp1$ci_ub[1], 3)`$]; $`r round(tmp1$auc[2], 3)`$, 95% CI [$`r round(tmp1$ci_lb[2], 3)`$-$`r round(tmp1$ci_ub[2], 3)`$]; $`r round(tmp1$auc[3], 3)`$, 95% CI [$`r round(tmp1$ci_lb[3], 3)`$-$`r round(tmp1$ci_ub[3], 3)`$]; $`r round(tmp1$auc[4], 3)`$, 95% CI [$`r round(tmp1$ci_lb[4], 3)`$-$`r round(tmp1$ci_ub[4], 3)`$]; respectively). Figure \@ref(fig:ddfab-es1) depicts the density distributions of the genuine and fabricated effect sizes per anchoring study, which shows the extent to which the density of the fabricated effect sizes exceeds the maximum of the genuine effect sizes. For instance, the percentage of fabricated statistically significant anchoring effect sizes that is larger than all 36 genuine statistically significant anchoring effect sizes is
`r round(perc1, 1)`\% in Study 1,
`r round(perc2, 1)`\% in Study 2,
`r round(perc3, 1)`\% in Study 3,
and `r round(perc4, 1)`\% in Study 4.
Based on these results, using extreme effect sizes to detect potential data fabrication seems to be a parsimonious and fairly effective method.
```{r ddfab-es1, fig.cap="Density distributions of genuine and fabricated anchoring effect sizes for each of the four anchoring studies.", out.width='100%', echo=FALSE, fig.align = 'center'}
p1 <- ggplot(dat_summary[dat_summary$test == 'Effect size (r) anchoring' &
dat_summary$study == 'Study 1',],
aes(x = result, fill = fabricated)) +
geom_density(alpha = .3) +
xlim(0, 1) +
xlab(latex2exp::TeX("Effect size ($r$)")) +
ylab("Density") +
ggtitle('Study 1') +
scale_fill_viridis(guide=FALSE, discrete = TRUE)
p2 <- ggplot(dat_summary[dat_summary$test == 'Effect size (r) anchoring' &
dat_summary$study == 'Study 2',],
aes(x = result, fill = fabricated)) +
geom_density(alpha = .3) +
xlim(0, 1) +
xlab(latex2exp::TeX("Effect size ($r$)")) +
ylab("Density") +
ggtitle('Study 2') +
scale_fill_viridis(guide=FALSE, discrete = TRUE)
p3 <- ggplot(dat_summary[dat_summary$test == 'Effect size (r) anchoring' &
dat_summary$study == 'Study 3',],
aes(x = result, fill = fabricated)) +
geom_density(alpha = .3) +
xlim(0, 1) +
xlab(latex2exp::TeX("Effect size ($r$)")) +
ylab("Density") +
ggtitle('Study 3') +
scale_fill_viridis(guide = FALSE, discrete = TRUE)+
theme(legend.position="top", legend.text=element_text(size=6))
p4 <- ggplot(dat_summary[dat_summary$test == 'Effect size (r) anchoring' &
dat_summary$study == 'Study 4',],
aes(x = result, fill = fabricated)) +
geom_density(alpha = .3) +
xlim(0, 1) +
xlab(latex2exp::TeX("Effect size ($r$)")) +
ylab("Density") +
ggtitle('Study 4') +
scale_fill_viridis('', labels = c("Fabricated","Genuine"), discrete = TRUE)+
theme(legend.position="bottom", legend.text=element_text(size=6),legend.direction="horizontal",legend.key.size =unit(.4, 'cm'))
suppressWarnings(gridExtra::grid.arrange(p1, p2, p3, p4,
ncol = 2))
```
#### Fabricating effects with Random Number Generators (RNGs)
```{r echo = FALSE}
# With rng split
# Select fabricated data sets by self-reported RNG use; genuine data sets are included in both selections
sel_rng <- (dat_summary$rng == 1 & dat_summary$fabricated == 'Fabricated') | dat_summary$fabricated == 'Genuine'
sel_norng <- (dat_summary$rng == 0 & dat_summary$fabricated == 'Fabricated') | dat_summary$fabricated == 'Genuine'
# AUROC per test and study for fabricated-with-RNG versus genuine data sets.
# Following the directional hypotheses, effect sizes use direction ">" (larger
# values expected for fabricated data); all other statistics use direction "<".
AUC_rng <- plyr::ddply(dat_summary[sel_rng,], .(test, study), .fun = function(x) {
  if (grepl(x$test[1], pattern = 'Effect size')) {
    tmp <- pROC::roc(response = x$fabricated, x$result, direction = ">", ci = TRUE)
  } else {
    tmp <- pROC::roc(response = x$fabricated, x$result, direction = "<", ci = TRUE)
  }
  return(data.frame(auc = tmp$auc, ci_lb = tmp$ci[1], ci_ub = tmp$ci[3]))
})
AUC_rng <- AUC_rng[!grepl(AUC_rng$test, pattern = 'P-value'),]
# Same AUROC computation for fabricated-without-RNG versus genuine data sets
AUC_norng <- plyr::ddply(dat_summary[sel_norng,], .(test, study), .fun = function(x) {
  if (grepl(x$test[1], pattern = 'Effect size')) {
    tmp <- pROC::roc(response = x$fabricated, x$result, direction = ">", ci = TRUE)
  } else {
    tmp <- pROC::roc(response = x$fabricated, x$result, direction = "<", ci = TRUE)
  }
  return(data.frame(auc = tmp$auc, ci_lb = tmp$ci[1], ci_ub = tmp$ci[3]))
})
AUC_norng <- AUC_norng[!grepl(AUC_norng$test, pattern = 'P-value'),]
```
Fabricated effects might seem more genuine when participants used Random Number Generators (RNGs). RNGs are typically used in computer-based simulation procedures to generate data that are supposed to arise from probabilistic processes. Given that our framework for detecting data fabrication rests on humans' lack of intuitive understanding of drawing values from probability distributions, participants who used an RNG might come closer to fabricating seemingly genuine data, making their fabricated data more difficult to detect. The analyses presented next were not preregistered.
We split our analyses between the `r rng1` participants who indicated that they used RNGs and the remaining `r count1 - rng1` participants who indicated that they did not. Figure \@ref(fig:rng-density1) shows the same density distributions as in Figure \@ref(fig:ddfab-density1), except that this time the density distributions of the fabricated data are split between these two groups.
```{r rng-density1, fig.cap="Density distributions of p-values and effect sizes for the gender effect, the anchoring effect, and the interaction effect across the four anchoring studies. This figure is similar to Figure \\@ref(fig:ddfab-density1), except that each panel now separates the density distributions for fabricated results using a random number generator (RNG), fabricated results without using a RNG, and genuine effects. Respondents self-selected to use (or not use) RNGs in their fabrication process.", echo=FALSE}
plot_p_gender <- ggplot(dat_summary[dat_summary$test == 'P-value gender',],
aes(x = result, fill = rng_plot)) +
geom_density(alpha = .3) +
xlab(latex2exp::TeX("$P$-value")) +
ylab("Density") +
ylim(0, 1.75) +
ggtitle('Gender') +
scale_fill_viridis(discrete = TRUE, guide = FALSE)
plot_es_gender <- ggplot(dat_summary[dat_summary$test == 'Effect size (r) gender',],
aes(x = result, fill = rng_plot)) +
geom_density(alpha = .3) +
xlab(latex2exp::TeX("Effect size ($r$)")) +
ylab("Density") +
scale_fill_viridis(discrete = TRUE, guide=FALSE)
# Condition
dat_summary_tmp <- dat_summary
dat_summary_tmp$result <- log10(dat_summary_tmp$result)
plot_p_condition <- ggplot(dat_summary_tmp[dat_summary_tmp$test == 'P-value anchoring',],
aes(x = result, fill = rng_plot)) +
geom_density(alpha = .3) +
xlab(latex2exp::TeX("log10 $P$-value")) +
ylab("Density") +