# Script for cleaning merged dataset.R
library(readr)
library(tidyverse)
library(knitr)
library(kableExtra)
library(misty)
library(openxlsx)
library(readxl)
# Merge and Clean Dataset -------------------------------------------------
df_full_psych <- read_csv("Full Psychotherapy Dataset.csv", col_types = cols(year = col_character()))
df_full_med <- read.csv("Full Medication Dataset.csv")
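# Optional hedged sanity check (illustrative only, read-only): bind_rows() below will error if a
# shared column has incompatible types across the two files, so comparing classes first can help.
# sapply(intersect(names(df_full_med), names(df_full_psych)),
#        function(col) identical(class(df_full_med[[col]]), class(df_full_psych[[col]])))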
# merge datasets
merged_dataset <- bind_rows(df_full_med, df_full_psych)
# fix column ordering
merged_dataset <- dplyr:: select(merged_dataset, -c(primary, hierarchy, time))
merged_dataset <- merged_dataset %>% relocate(descr_active, .after = control_type)
merged_dataset <- merged_dataset %>% relocate(descr_control, .after = descr_active)
merged_dataset <- merged_dataset %>% relocate(responders_active: cuij_responders_control, .after = observed_resp_rate_control)
# create a variable indicating instrument primacy according to our hierarchy
unique(merged_dataset$instrument) # check unique values to account for differences in formatting of instrument names
merged_dataset <- merged_dataset %>%
mutate(
instrument_value = case_when(
instrument %in% c("CDRS", "CDRS-R", "cdrs-r") ~ 1,
instrument %in% c("HAM-D", "ham-d") ~ 2,
instrument %in% c("MADRS") ~ 3,
instrument %in% c("BDI", "bdi", "bdi-ii") ~ 4,
instrument %in% c("cdi") ~ 5,
instrument %in% c("K-SADS-L", "K-SADS", "K-SADS (adapted)") ~ 6,
instrument %in% c("mfq", "smfq") ~ 7,
instrument %in% c("RADS", "rads") ~ 8,
instrument %in% c("bid") ~ 9,
instrument %in% c("CDS") ~ 10,
instrument %in% c("ces-d") ~ 11,
instrument %in% c("CAS") ~ 12,
instrument %in% c("cbcl-d") ~ 13,
TRUE ~ 99 # Default case, if none of the above conditions match
)
)
merged_dataset <- merged_dataset %>% relocate(instrument_value, .after = instrument)
# create an instrument name variable to ensure consistency
merged_dataset <- merged_dataset %>%
mutate(
instrument_name = case_when(
instrument %in% c("CDRS", "CDRS-R", "cdrs-r") ~ "cdrs",
instrument %in% c("HAM-D", "ham-d") ~ "hamd",
instrument %in% c("MADRS") ~ "madrs",
instrument %in% c("BDI", "bdi", "bdi-ii") ~ "bdi",
instrument %in% c("cdi") ~ "cdi",
instrument %in% c("K-SADS-L", "K-SADS", "K-SADS (adapted)") ~ "ksads",
instrument %in% c("mfq") ~ "mfq",
instrument %in% c("smfq") ~ "smfq",
instrument %in% c("RADS", "rads") ~ "rads",
instrument %in% c("bid") ~ "bid",
instrument %in% c("CDS") ~ "cds",
instrument %in% c("ces-d") ~ "ces_d",
instrument %in% c("CAS") ~ "cas",
instrument %in% c("cbcl-d") ~ "cbcl_d",
instrument %in% c("CGI", "CGI-I") ~ "cgi",
instrument %in% c("GAS") ~ "gas",
instrument %in% c("Acholi Psychosocial Assessment Instrument (APAI): a local instrument") ~ "apai",
instrument %in% c("phq-9") ~ "phq_9",
instrument %in% c("PFC–S") ~ "pfc",
instrument %in% c("PAPA") ~ "papa",
instrument %in% c("dass") ~ "dass",
instrument %in% c("disc-c MDD symptom count") ~ "disc_c_mdd",
# TRUE ~ 99 # Default case, if none of the above conditions match
)
)
merged_dataset <- merged_dataset %>% relocate(instrument_name, .after = instrument)
# Charlotte note to self: Now that I am more familiar with R, I can see that it would have been better to do the below with mutate().
# At some point, fix code below to make more tidy.
# create response rate variables
# first create our primary response rate variable, which is estimated number of responders / n at randomisation
resp_rate_active <- (merged_dataset$responders_active)/(merged_dataset$baseline_n_active)
resp_rate_control <- (merged_dataset$responders_control)/(merged_dataset$baseline_n_control)
# then calculate response rate using n at post-test (i.e. completers only)
resp_rate_active_completers <- (merged_dataset$responders_active)/(merged_dataset$post_n_active)
resp_rate_control_completers <- (merged_dataset$responders_control)/(merged_dataset$post_n_control)
merged_dataset <- cbind(merged_dataset, resp_rate_active, resp_rate_control, resp_rate_active_completers, resp_rate_control_completers)
merged_dataset <- merged_dataset %>% relocate(resp_rate_active: resp_rate_control_completers, .after = responders_control)
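# A hedged sketch of the tidier mutate() version mentioned in the note above; it would replace the
# four vectors plus cbind()/relocate() with a single pipe. Kept commented out so the code above
# remains the single source of these columns.
# merged_dataset <- merged_dataset %>%
#   mutate(resp_rate_active = responders_active / baseline_n_active,
#          resp_rate_control = responders_control / baseline_n_control,
#          resp_rate_active_completers = responders_active / post_n_active,
#          resp_rate_control_completers = responders_control / post_n_control) %>%
#   relocate(resp_rate_active:resp_rate_control_completers, .after = responders_control)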
# calculate response rate for studies that reported the number of responders rather than a response rate
calc_observed_resp_rate_active <- (merged_dataset$observed_responders_active)/(merged_dataset$observed_responders_active_n)
calc_observed_resp_rate_control <- (merged_dataset$observed_responders_control)/(merged_dataset$observed_responders_control_n)
# then coalesce with the reported response rates (use our calculated value where available, otherwise the reported value)
merged_dataset$observed_resp_rate_active <- coalesce(calc_observed_resp_rate_active, merged_dataset$observed_resp_rate_active)
merged_dataset$observed_resp_rate_control <- coalesce(calc_observed_resp_rate_control, merged_dataset$observed_resp_rate_control)
# check correlation between our calculations and observed response rate
cor(merged_dataset$resp_rate_active, merged_dataset$observed_resp_rate_active, method = "pearson", use = "pairwise.complete.obs")
cor(merged_dataset$resp_rate_control, merged_dataset$observed_resp_rate_control, method = "pearson", use = "pairwise.complete.obs")
# check correlation between our calculations and cuijpers' calculations
cor(merged_dataset$responders_active, merged_dataset$cuij_responders_active, method = "pearson", use = "pairwise.complete.obs")
cor(merged_dataset$responders_control, merged_dataset$cuij_responders_control, method = "pearson", use = "pairwise.complete.obs")
#rename dataframe
df_appl_v_orange <- merged_dataset
# The code below should actually be implemented once we turn the dataframe to long, which we do in the quarto. So I will not run this code here.
# # We have percent_women reported overall for psy trials but per arm for med trials
# #create overall percent women variable for all studies
# df_appl_v_orange <- df_appl_v_orange %>% mutate(overall_percent_women = (active_percent_women * baseline_n_active + control_percent_women * baseline_n_control)
# /(baseline_n_active + baseline_n_control))
# df_appl_v_orange <- df_appl_v_orange %>% mutate(percent_women = coalesce(percent_women, overall_percent_women))
#
# # we have separate age variables for each arm and for overall. Where age is reported per arm, I will calculate an overall.
# # I'm using an old variable name so I don't have to change the rest of the code
#
# df_appl_v_orange <- df_appl_v_orange %>% mutate(overall_mean_age = (age_m_active * baseline_n_active + age_m_control * baseline_n_control)
# /(baseline_n_active + baseline_n_control))
# df_appl_v_orange <- df_appl_v_orange %>% mutate(mean_age = coalesce(age_m_overall, overall_mean_age))
#
# # I'd like to create a pooled SD for age
# df_appl_v_orange <- df_appl_v_orange %>%
# mutate(pooled_sd_age = sqrt( ( (baseline_n_active - 1)*((age_sd_active)^2) + (baseline_n_control - 1)*((age_sd_control)^2)) /
# (baseline_n_active + baseline_n_control - 2)))
# df_appl_v_orange <- df_appl_v_orange %>% mutate(sd_age = coalesce(age_sd_overall , pooled_sd_age))
# calculate Cohen's d for each arm (within-arm standardised change: mean change divided by the average of the baseline and post SDs)
df_appl_v_orange <- df_appl_v_orange %>%
mutate(cohens_d_active = (post_mean_active - baseline_mean_active)/
((post_sd_active + baseline_sd_active)/2), cohens_d_control = (post_mean_control - baseline_mean_control)/
((post_sd_control + baseline_sd_control)/2))
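# A small hedged helper capturing the within-arm d formula used above (mean change divided by the
# average of the baseline and post SDs). calc_within_d is an illustrative name; the repeated
# mutate() calls later in the script could, in principle, reuse it.
calc_within_d <- function(post_mean, base_mean, post_sd, base_sd) {
  (post_mean - base_mean) / ((post_sd + base_sd) / 2)
}
# e.g. df_appl_v_orange %>%
#   mutate(cohens_d_active = calc_within_d(post_mean_active, baseline_mean_active,
#                                          post_sd_active, baseline_sd_active))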
# Create summary statistics for each instrument --------
#average statistics for each instrument, collapsed across studies for both med and psy categories
# this includes duplicates of studies
meds_d_means <- list()
psy_d_means <- list()
meds_baseline_means <- list()
psy_baseline_means <- list()
meds_post_means <- list()
psy_post_means <- list()
inst_names <- unique(df_appl_v_orange$instrument_name)[1:6]
for(i in 1: length(inst_names)){
#cohens d for meds
meds_d_means[[i]] <- df_appl_v_orange %>%
filter(psy_or_med == 0, (instrument_name == inst_names[i])) %>%
summarise(n = n(), avg_d_act = mean(cohens_d_active, na.rm = T), sd_d_act = sd(cohens_d_active, na.rm = T),
avg_d_ctrl = mean(cohens_d_control, na.rm = T), sd_d_ctrl = sd(cohens_d_control, na.rm = T))
# cohens d for psy
psy_d_means[[i]] <- df_appl_v_orange %>%
filter(psy_or_med == 1, (instrument_name == inst_names[i])) %>%
summarise(n = n(), avg_d_act = mean(cohens_d_active, na.rm = T), sd_d_act = sd(cohens_d_active, na.rm = T),
avg_d_ctrl = mean(cohens_d_control, na.rm = T), sd_d_ctrl = sd(cohens_d_control, na.rm = T))
# for medication trials, average baseline mean scores for each instrument
meds_baseline_means[[i]] <- df_appl_v_orange %>%
filter(psy_or_med == 0, (instrument_name == inst_names[i])) %>%
summarise(n = n(), total_n_control = sum(baseline_n_control, na.rm = T),
total_n_active = sum(baseline_n_active, na.rm = T),
# avg_age = mean(mean_age, na.rm = T),
avg_baseline_mean_act = mean(baseline_mean_active, na.rm = T),
sd_baseline_mean_act = sd(baseline_mean_active, na.rm = T),
avg_baseline_mean_ctrl = mean(baseline_mean_control, na.rm = T),
sd_baseline_mean_ctrl = sd(baseline_mean_control, na.rm = T))
# for psychotherapy trials, average baseline mean scores for each instrument
psy_baseline_means[[i]] <- df_appl_v_orange %>%
filter(psy_or_med == 1, (instrument_name == inst_names[i])) %>%
summarise(n = n(), total_n_control = sum(baseline_n_control, na.rm = T),
total_n_active = sum(baseline_n_active, na.rm = T),
# avg_age = mean(mean_age, na.rm = T),
avg_baseline_mean_act = mean(baseline_mean_active, na.rm = T),
sd_baseline_mean_act = sd(baseline_mean_active, na.rm = T),
avg_baseline_mean_ctrl = mean(baseline_mean_control, na.rm = T),
sd_baseline_mean_ctrl = sd(baseline_mean_control, na.rm = T))
# for medication trials, average post-test mean scores for each instrument
meds_post_means[[i]] <- df_appl_v_orange %>%
filter(psy_or_med == 0, (instrument_name == inst_names[i])) %>%
summarise(n = n(), avg_post_mean_act = mean(post_mean_active, na.rm = T), sd_post_mean_act = sd(post_mean_active, na.rm = T),
avg_post_mean_ctrl = mean(post_mean_control, na.rm = T), sd_post_mean_ctrl = sd(post_mean_control, na.rm = T))
# for psychotherapy trials, average post-test mean scores for each instrument
psy_post_means[[i]] <- df_appl_v_orange %>%
filter(psy_or_med == 1, (instrument_name == inst_names[i])) %>%
summarise(n = n(), avg_post_mean_act = mean(post_mean_active, na.rm = T), sd_post_mean_act = sd(post_mean_active, na.rm = T),
avg_post_mean_ctrl = mean(post_mean_control, na.rm = T), sd_post_mean_ctrl = sd(post_mean_control, na.rm = T))
}
names(meds_d_means) <- inst_names
names(psy_d_means) <- inst_names
names(meds_baseline_means) <- inst_names
names(psy_baseline_means) <- inst_names
names(meds_post_means) <- inst_names
names(psy_post_means) <- inst_names
meds_d_means <- do.call(rbind,meds_d_means )
psy_d_means <- do.call(rbind,psy_d_means)
meds_baseline_means <- do.call(rbind, meds_baseline_means)
psy_baseline_means <- do.call(rbind, psy_baseline_means)
meds_post_means <- do.call(rbind, meds_post_means)
psy_post_means <- do.call(rbind, psy_post_means)
psy_d_means <- psy_d_means %>% mutate(type = "psy", instr = rownames(psy_d_means))
meds_d_means <- meds_d_means %>% mutate(type = "med", instr = rownames(meds_d_means))
psy_baseline_means <- psy_baseline_means %>% mutate(type = "psy", instr = rownames(psy_baseline_means))
meds_baseline_means <- meds_baseline_means %>% mutate(type = "med", instr = rownames(meds_baseline_means))
psy_post_means <- psy_post_means %>% mutate(type = "psy", instr = rownames(psy_post_means))
meds_post_means <- meds_post_means %>% mutate(type = "med", instr = rownames(meds_post_means))
meds_psy_combo_d_means <- rbind(meds_d_means,psy_d_means )
meds_psy_combo_baseline_means <- rbind(meds_baseline_means, psy_baseline_means )
meds_psy_combo_post_means <- rbind(meds_post_means, psy_post_means )
df_stats_per_instrument <- full_join(meds_psy_combo_baseline_means, meds_psy_combo_post_means, by = c("type", "instr", "n"))
df_stats_per_instrument <- full_join(df_stats_per_instrument, meds_psy_combo_d_means, by = c("type", "instr", "n"))
df_stats_per_instrument <- df_stats_per_instrument %>% relocate(instr, .before = n)
df_stats_per_instrument <- df_stats_per_instrument %>% relocate(type, .after = instr)
df_stats_per_instrument <- df_stats_per_instrument %>% arrange(instr)
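# A hedged grouped-summarise alternative to the loop above (groups with no studies are simply
# absent rather than having n = 0). stats_per_instr_check is an illustrative name and this is a
# read-only cross-check, not part of the pipeline.
stats_per_instr_check <- df_appl_v_orange %>%
  filter(instrument_name %in% inst_names) %>%
  group_by(psy_or_med, instrument_name) %>%
  summarise(n = n(),
            total_n_control = sum(baseline_n_control, na.rm = TRUE),
            total_n_active = sum(baseline_n_active, na.rm = TRUE),
            avg_baseline_mean_act = mean(baseline_mean_active, na.rm = TRUE),
            avg_baseline_mean_ctrl = mean(baseline_mean_control, na.rm = TRUE),
            avg_d_act = mean(cohens_d_active, na.rm = TRUE),
            avg_d_ctrl = mean(cohens_d_control, na.rm = TRUE),
            .groups = "drop")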
# Look at demographics per study, for primary instrument ------------------
# create unique study ids to account for cases when there are multiple active or control arms per study name
df_appl_v_orange <- df_appl_v_orange %>%
mutate(study_ID = ifelse(psy_or_med == 0, paste(study, year, active_type, control_type, sep = "_"),
paste(study, descr_active, descr_control, sep = "_")))
# arranging by study ID and instrument value
df_appl_v_orange <- df_appl_v_orange %>% arrange(study_ID, instrument_value)
# retain only first row to look at demographics as this will keep one row per study_ID, taking the primary instrument
df_first_row <- df_appl_v_orange %>% distinct(study_ID, .keep_all = TRUE)
write.csv(df_first_row, "df_first_row.csv")
#check how many med comparisons we cannot calc cohens d for (for primary instrument)
filter_test <- df_first_row %>% filter(psy_or_med == 0)
sum(is.na(filter_test$cohens_d_active))
sum(is.na(filter_test$cohens_d_control))
# start looking at demographics, baseline and post means, and cohens d per arm
df_demographics <- df_first_row %>% dplyr:: select(study, year, psy_or_med, active_type, control_type, descr_active, descr_control,
instrument_name, baseline_mean_active, baseline_sd_active, baseline_n_active,
baseline_mean_control, baseline_sd_control, baseline_n_control,
post_mean_active, post_sd_active, post_n_active, post_mean_control, post_sd_control, post_n_control,
cohens_d_active, cohens_d_control, percent_women)
# combine arm description variables across psy and med
df_demographics$descr_active <- ifelse(is.na(df_demographics$descr_active), df_demographics$active_type, df_demographics$descr_active)
df_demographics$descr_control <- ifelse(is.na(df_demographics$descr_control), df_demographics$control_type, df_demographics$descr_control)
df_demographics <- df_demographics %>% dplyr::select(-c(active_type, control_type))
df_demographics <- df_demographics %>% arrange(psy_or_med)
# check how many studies we have baseline means for primary instrument
df_summary_missing_data <- df_demographics %>%
group_by(psy_or_med) %>%
summarise(n_reporting_base_means_act = sum(!is.na(baseline_mean_active)), n_reporting_base_means_contr = sum(!is.na(baseline_mean_control)),
n_reporting_post_means_act = sum(!is.na(post_mean_active)), n_reporting_post_means_contr = sum(!is.na(post_mean_control)))
# average cohens d, mean age and percent women, summarising across psy and med
df_means_demo <- df_demographics %>%
group_by(psy_or_med) %>%
summarise(mean_cohens_d_active = mean(cohens_d_active, na.rm = TRUE),
mean_cohens_d_control = mean(cohens_d_control, na.rm = TRUE),
# mean_age = mean(mean_age, na.rm = TRUE),
mean_percent_women = mean(percent_women, na.rm = TRUE),
missing_d = sum(is.na(cohens_d_active)))
# Argyris looking at stats per instr
# keep only non_zero values
test_2 <- df_stats_per_instrument %>%
filter((type == "med" & n != 0) | (type == "psy" & n != 0) )
# keep only duplicated rows
library(misty)
test_2 <- df.duplicated(test_2, instr)
get_cdrs_means <- test_2 %>%
filter(instr == "cdrs")
test_2 %>%
filter(instr == "hamd")
test_2 %>%
filter(instr == "bdi")
# helper: Welch-type t statistic computed from group means (mu), sample sizes (n) and SDs (s)
t.test.fromSummaryStats <- function(mu,n,s) {
-diff(mu) / sqrt( sum( s^2/n ) )
}
mu <- c(get_cdrs_means$avg_d_ctrl[1],get_cdrs_means$avg_d_ctrl[2])
n <- c(get_cdrs_means$total_n_control[1],get_cdrs_means$total_n_control[2])
s <- c(get_cdrs_means$sd_d_ctrl[1],get_cdrs_means$sd_d_ctrl[2])
t.test.fromSummaryStats(mu,n,s)
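# Hedged extension of the helper above: Welch-Satterthwaite degrees of freedom and a two-sided
# p-value from the same summary statistics. welch_df and welch_p are illustrative names, not part
# of the original analysis.
welch_df <- function(n, s) {
  sum(s^2 / n)^2 / sum((s^2 / n)^2 / (n - 1))
}
welch_p <- function(mu, n, s) {
  t_stat <- -diff(mu) / sqrt(sum(s^2 / n))
  2 * pt(abs(t_stat), df = welch_df(n, s), lower.tail = FALSE)
}
welch_p(mu, n, s)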
# df_demographics %>%
# group_by(psy_or_med ) %>%
# summarise (avg_age = mean(mean_age, na.rm = T), sd_age = sd(mean_age, na.rm = T))
#openxlsx:: write.xlsx(df_stats_per_instrument, file = "test_2.xlsx", colNames = T, borders = "columns", asTable = F)
# check the Cohen's ds for medication and psychotherapy
tapply(df_appl_v_orange$cohens_d_control,
df_appl_v_orange$psy_or_med, summary,na.rm = TRUE)
# you see that I used tapply rather than dplyr's group_by. Here I am also using aggregate, another
# R base function. Neat, right :)
aggregate(df_appl_v_orange$cohens_d_control, by = list(df_appl_v_orange$psy_or_med), summary)
# from the above it seemed like we had some studies where the controls have a positive d
# here is a quick histogram to see what I mean
tapply(df_appl_v_orange$cohens_d_control,df_appl_v_orange$psy_or_med, hist)
# I have therefore re-run things here excluding those with positive ds.
tapply(df_appl_v_orange[df_appl_v_orange$cohens_d_control<=0, ]$cohens_d_control,
df_appl_v_orange[df_appl_v_orange$cohens_d_control<=0, ]$psy_or_med, summary,na.rm = TRUE)
# Which are the offending studies? As can be seen, nearly all of them are in the
# psychotherapy part, with some duplicates or triplicates.
studies_with_positive_cohens_ds <- tapply(df_appl_v_orange[df_appl_v_orange$cohens_d_control>=0
, ]$study, df_appl_v_orange[df_appl_v_orange$cohens_d_control>=0
, ]$psy_or_med, na.omit)
names(studies_with_positive_cohens_ds) <- c("meds", "psy") # to make prettier
studies_with_positive_cohens_ds
# We have inspected those with positive Cohen's d. For most, there appears to be a genuine (small) deterioration in the control arm.
# For Geller, the positive Cohen's d is an artefact of using the GAS. This should not be a problem once we take the primary instrument for each study.
# The other large positive Cohen's d is for Reynolds 1986; this appears to be correct.
# Detected one error for the De Jong Heesen study - there is a mistake in Cuijpers' dataset. Fixed in original dataset.
# yet to do - inform Cuijpers
#check which studies have missing means or sds at baseline or post
# Specify the columns to check
columns_to_check <- c(
"baseline_mean_active", "baseline_sd_active",
"baseline_mean_control", "baseline_sd_control",
"post_mean_active", "post_sd_active",
"post_mean_control", "post_sd_control"
)
# Identify which columns have missing values for each study
columns_with_missing_values <- df_appl_v_orange %>%
filter(rowSums(is.na(.[columns_to_check])) > 0) %>%
dplyr::select(psy_or_med, study, year, where(function(x) any(is.na(x))))
# Print columns with missing values for each study (the tapply() call below throws an error - flagged to Charlotte)
# tapply(print(columns_with_missing_values[,1:2]
# ), columns_with_missing_values$psy_or_med)
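# A hedged working alternative to the commented-out tapply() call above: split the study/year
# columns by trial type and print each piece. Read-only; does not modify the dataframe.
split(columns_with_missing_values[, c("psy_or_med", "study", "year")],
      columns_with_missing_values$psy_or_med)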
# create excel sheet to edit
# we have created a notes column named how_handle_es_calc where we inspect each study and its missing data
# from there we work out which method to use to impute / derive an appropriate value to substitute missing values
openxlsx:: write.xlsx(columns_with_missing_values, file = "columns_with_missing_values.xlsx", colNames = T, borders = "columns", asTable = F)
# The updated version of this sheet with notes is called "columns_with_missing_values_updated.xlsx"
# Please note that some studies report SE rather than SD. Initially these were incorrectly entered into our
# original medication dataset; however, these errors have now been corrected in the original ("Working_Dataset_Medication").
# This applied to Berard 2006, Emslie 2006, Emslie 2007, Emslie 2009, Findling 2009, Keller 2001, Paxil 2009
# Now begin imputing missing values. We will start with the meds studies. The psy studies that are missing values
# were not in Cuijpers' MA table of included studies (I think because their primary outcome measures were not continuous).
# We will need to decide our approach here - if we retain these studies, we will have to redo extraction. Will come back to this.
# To check how many rows with missing SDs we have
# Let's do this for only the primary instrument for each study (using df_first_row), starting with med studies.
df_first_row_missing_sds <- df_first_row %>%
filter(psy_or_med == 0) %>%
summarise(
rows_with_na_sds = sum(rowSums(is.na(dplyr::select(., c(baseline_sd_active , post_sd_active, baseline_sd_control, post_sd_control)))) > 0)
)
df_first_row_missing_sds
# Now check how many rows with missing means
df_first_row_missing_means <- df_first_row %>%
filter(psy_or_med == 0) %>%
summarise(
rows_with_na_means = sum(rowSums(is.na(dplyr::select(., c(baseline_mean_active , post_mean_active, baseline_mean_control, post_mean_control)))) > 0)
)
df_first_row_missing_means
# Let's impute missing SDs first. If either the baseline or post-intervention SD is unavailable, it will be substituted by the other.
df_appl_v_orange <- df_appl_v_orange %>%
mutate(
baseline_sd_active = ifelse(psy_or_med == 0 & is.na(baseline_sd_active), post_sd_active, baseline_sd_active),
post_sd_active = ifelse(psy_or_med == 0 & is.na(post_sd_active), baseline_sd_active, post_sd_active),
baseline_sd_control = ifelse(psy_or_med == 0 & is.na(baseline_sd_control), post_sd_control, baseline_sd_control),
post_sd_control = ifelse(psy_or_med == 0 & is.na(post_sd_control), baseline_sd_control, post_sd_control)
)
# Now let's impute missing means. If either the baseline or post mean is missing, use the change score to calculate it.
df_appl_v_orange <- df_appl_v_orange %>%
mutate(
baseline_mean_active = ifelse(psy_or_med == 0 & is.na(baseline_mean_active), post_mean_active - active_mean_change, baseline_mean_active),
baseline_mean_control = ifelse(psy_or_med == 0 & is.na(baseline_mean_control), post_mean_control - control_mean_change, baseline_mean_control),
post_mean_active = ifelse(psy_or_med == 0 & is.na(post_mean_active), baseline_mean_active + active_mean_change, post_mean_active),
post_mean_control = ifelse(psy_or_med == 0 & is.na(post_mean_control), baseline_mean_control + control_mean_change, post_mean_control)
)
# Check how many studies with missing SDs after performing imputation
df_first_row <- df_appl_v_orange %>% distinct(study_ID, .keep_all = TRUE)
df_first_row_missing_sds <- df_first_row %>%
filter(psy_or_med == 0) %>%
summarise(
rows_with_na_sds = sum(rowSums(is.na(dplyr::select(., c(baseline_sd_active , post_sd_active, baseline_sd_control, post_sd_control)))) > 0)
)
df_first_row_missing_sds
# We have gone from 21 to 8 studies with missing SDs. Great!
# And check to see how many studies with missing means after performing imputation
df_first_row_missing_means <- df_first_row %>%
filter(psy_or_med == 0) %>%
summarise(
rows_with_na_means = sum(rowSums(is.na(dplyr::select(., c(baseline_mean_active , post_mean_active, baseline_mean_control, post_mean_control)))) > 0)
)
df_first_row_missing_means
# We have gone from 15 to 9 studies with missing means. Great!
# Let's recalculate Cohen's d now that we have more data
df_appl_v_orange <- df_appl_v_orange %>%
mutate(cohens_d_active = (post_mean_active - baseline_mean_active)/
((post_sd_active + baseline_sd_active)/2), cohens_d_control = (post_mean_control - baseline_mean_control)/
((post_sd_control + baseline_sd_control)/2))
df_first_row <- df_appl_v_orange %>% distinct(study_ID, .keep_all = TRUE)
filter_test <- df_first_row %>% filter(psy_or_med == 0)
sum(!is.na(filter_test$cohens_d_active))
sum(!is.na(filter_test$cohens_d_control))
# Before, we could only calculate Cohen's d for 12 med studies; now we can calculate it for 22
# how many unique psy comparisons do we have
unique_psy <- df_appl_v_orange %>%
filter(psy_or_med == 1) %>%
summarise(unique_psy = n_distinct(study_ID))
unique_psy
# and for med
unique_med <- df_appl_v_orange %>%
filter(psy_or_med == 0) %>%
summarise(unique_med = n_distinct(study_ID))
unique_med
# Check again to identify which med studies still have missing values in our columns of interest after performing the imputation above
new_columns_with_missing_values <- df_first_row %>%
filter(rowSums(is.na(.[columns_to_check])) > 0) %>%
filter(psy_or_med == 0) %>%
dplyr:: select(study, year, active_type, instrument_name, where(function(x) any(is.na(x))))
### tapply(print(new_columns_with_missing_values[,1:2]), new_columns_with_missing_values$psy_or_med) # this call also throws an error
openxlsx:: write.xlsx(new_columns_with_missing_values, file = "new_columns_with_missing_values.xlsx", colNames = T, borders = "columns", asTable = F)
# For studies missing all sds, we will sub in an average SD taken from similar studies in the dataset. We will calculate average SDs for each arm, pre and post.
# We will just do this for the CDRS. The studies missing SDs for both pre and post are Bristol-Myers Squibb 2002a and 2002b, Emslie 2002b, Organon 2002a and 2002b,
# Paxil (GlaxoSmithKline) 2009, Wagner 2006
# First calculate the average values
calc_mean_sds <- df_appl_v_orange %>%
filter(psy_or_med == 0, instrument_name == "cdrs") %>%
summarise(mean_baseline_sd_active = mean(baseline_sd_active, na.rm = TRUE),
mean_baseline_sd_control = mean(baseline_sd_control, na.rm = TRUE),
mean_post_sd_active = mean(post_sd_active, na.rm = TRUE),
mean_post_sd_control = mean(post_sd_control, na.rm = TRUE))
# I want these as numeric values
mean_baseline_sd_active <- calc_mean_sds$mean_baseline_sd_active
mean_baseline_sd_control <- calc_mean_sds$mean_baseline_sd_control
mean_post_sd_active <- calc_mean_sds$mean_post_sd_active
mean_post_sd_control <- calc_mean_sds$mean_post_sd_control
# Then use these for imputation. Only use these mean values for med studies using the CDRS.
df_appl_v_orange <- df_appl_v_orange %>%
mutate(
baseline_sd_active = ifelse(psy_or_med == 0 & is.na(baseline_sd_active) & instrument_name == "cdrs", mean_baseline_sd_active, baseline_sd_active),
post_sd_active = ifelse(psy_or_med == 0 & is.na(post_sd_active) & instrument_name == "cdrs", mean_post_sd_active, post_sd_active),
baseline_sd_control = ifelse(psy_or_med == 0 & is.na(baseline_sd_control) & instrument_name == "cdrs", mean_baseline_sd_control, baseline_sd_control),
post_sd_control = ifelse(psy_or_med == 0 & is.na(post_sd_control) & instrument_name == "cdrs", mean_post_sd_control, post_sd_control)
)
# Read in Cipriani dataset ------------------------------------------------
# We are now going to clean and merge in Cipriani's data. For context, we have tried contacting Cipriani and Peng Xie multiple times
# to request the full dataset for their MA but did not receive a reply. There is a more limited dataset provided online
# as a PDF. The dataset I will now read in, called "Full_Dataset_Cipriani_MA", is taken from the table in this PDF
# and converted to Excel. I have added column names - this is the only thing I have changed.
df_cipriani <- read_excel("Full_Dataset_Cipriani_MA.xlsx",
col_types = c("text", "text", "text",
"text", "numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric"))
# Rename columns that we already have in master dataset to make sure we can identify that they have come from Cipriani
colnames(df_cipriani)
columns_to_rename <- c("change", "change_sd", "change_n", "responders", "responders_n", "responders_n_missing")
df_cipriani <- df_cipriani %>%
rename_with(~paste0("cip_", .), columns_to_rename)
df_cipriani
# A couple of things are causing problems. Almeida-Montes 2005, Eli Lilly 1986, Bristol-Myers Squibb 2002,
# GlaxoSmithKline 2009 and von Knorring 2006 exist in Cipriani's dataset but not in ours.
# For Almeida-Montes and Eli Lilly we haven't been able to find the original papers, hence they are not in our dataset.
# For GlaxoSmithKline 2009 and von Knorring 2006, we fix the formatting differences below.
df_cipriani <- df_cipriani %>%
mutate(study = ifelse(study == "GlaxoSmithKline", "Paxil (GlaxoSmithKline)", study),
study = ifelse(study == "von Knorring", "Von Knorring", study))
# Cipriani has different rows for each condition whereas we have one row per actv vs ctrl comparison. We will need to
# subset his dataset to prepare to merge.
# Start with placebo
df_cipriani_ctrl <- df_cipriani %>%
filter(type == "Placebo")
df_cipriani_ctrl <- df_cipriani_ctrl %>%
rename_with(~paste0(.x, "_ctrl"), c("cip_change": "suicide_n")) %>%
rename(control_type = type)
# And then active
df_cipriani_actv <- df_cipriani %>%
filter(type != "Placebo")
df_cipriani_actv <- df_cipriani_actv %>%
rename_with(~paste0(.x, "_actv"), c("cip_change": "suicide_n")) %>%
rename(active_type = type)
# Now join them both.
df_cipriani_form <- full_join(df_cipriani_actv, df_cipriani_ctrl, by = c("study", "year"))
# There are 3 studies that do not include a placebo. We will exclude these.
df_cipriani_form <- df_cipriani_form %>%
filter(!is.na(control_type))
# Cipriani does not indicate which instrument his change scores apply to. However, we do use the same
# instrument hierarchy. For this reason I'm going to use df_first_row, our dataset which includes only the primary
# instrument for each comparison
df_first_row <- df_appl_v_orange %>% distinct(study_ID, .keep_all = TRUE)
df_first_row <- full_join(df_first_row, df_cipriani_form, by = c("study", "year", "active_type", "control_type"))
# Now we want to join this back into master dataset
df_first_row_prepare <- df_first_row %>%
dplyr::select(c(study_ID, instrument_name, study, year, active_type, control_type, cip_change_actv: suicide_n_actv, cip_change_ctrl: suicide_n_ctrl))
df_appl_v_orange <- full_join(df_appl_v_orange, df_first_row_prepare, by = c("study_ID", "instrument_name"))
# There are a few studies present in the second dataset but not in the first (as we were not able to locate these studies).
# Hence we only have Cipriani's data for these studies.
df_appl_v_orange <- df_appl_v_orange %>%
mutate(
study = coalesce(study.x, study.y),
year = coalesce(year.x, year.y),
active_type = coalesce(active_type.x, active_type.y),
control_type = coalesce(control_type.x, control_type.y)) %>%
dplyr::select(-study.x, -study.y, -year.x, -year.y, -active_type.x, -active_type.y, -control_type.x, -control_type.y) %>%
relocate(study, year, active_type, control_type, .after = "Column1")
# Checked dataframe, looks mostly good with a couple of problems. Some haven't joined correctly because there are errors in how the drug name is spelled.
# I have corrected spelling errors in our original med dataset.
# Wagner 2003 hasn't read in correctly because Cipriani has considered it two studies (Wagner 2003a and b). I've gone back to the paper and,
# although there were two studies, these were pooled a priori and only combined data is reported. We have complete data for this study, so for now
# adding Cipriani's data doesn't add anything. I'll just remove the duplicate rows for now.
df_appl_v_orange <- df_appl_v_orange %>%
filter(!(study == "Wagner" & (year == "2003a" | year == "2003b")))
# Now use Cipriani's change scores to fill in missing means
df_appl_v_orange <- df_appl_v_orange %>%
mutate(
baseline_mean_active = ifelse(psy_or_med == 0 & is.na(baseline_mean_active), post_mean_active - cip_change_actv, baseline_mean_active),
baseline_mean_control = ifelse(psy_or_med == 0 & is.na(baseline_mean_control), post_mean_control - cip_change_ctrl, baseline_mean_control),
post_mean_active = ifelse(psy_or_med == 0 & is.na(post_mean_active), baseline_mean_active + cip_change_actv, post_mean_active),
post_mean_control = ifelse(psy_or_med == 0 & is.na(post_mean_control), baseline_mean_control + cip_change_ctrl, post_mean_control)
)
# Again, let's recalculate Cohen's d now that we have more data
df_appl_v_orange <- df_appl_v_orange %>%
mutate(cohens_d_active = (post_mean_active - baseline_mean_active)/
((post_sd_active + baseline_sd_active)/2), cohens_d_control = (post_mean_control - baseline_mean_control)/
((post_sd_control + baseline_sd_control)/2))
df_first_row <- df_appl_v_orange %>% distinct(study_ID, .keep_all = TRUE)
filter_test <- df_first_row %>% filter(psy_or_med == 0)
sum(!is.na(filter_test$cohens_d_active))
sum(!is.na(filter_test$cohens_d_control))
unique(filter_test$study_ID)
study_ids_with_missing_ds <- df_first_row %>%
filter(is.na(cohens_d_active)) %>%
filter(psy_or_med == 0) %>%
dplyr::select(study_ID)
# We have Cohen's ds for 28/33 med studies. For Bristol-Myers Squibb 2002a, Emslie 2002b, Hughes 1990 and Paxil (GlaxoSmithKline) 2009
# we will not be able to calculate ds due to very poor reporting in the original studies. There is no possible way to impute the missing data.
# The other study is Emslie 2007 (combined), which is misleading as it is a combination of 2007a and 2007b; however, for two outcome measures
# data has only been provided for the studies combined. We have complete data on the CDRS for the individual studies. Having checked with Argyris, we will remove this study.
df_appl_v_orange <- df_appl_v_orange %>%
filter(study_ID != "Emslie_2007 (combined)_Venlafaxine_Placebo")
# Okay, I think that is done. We now have Cohens ds for 28/32 studies and that's the best we can do.
# Let's look at missing ds for psy studies -------------------------------
# Look at cohens d missing for psy studies
filter_test <- df_first_row %>% filter(psy_or_med == 1)
sum(!is.na(filter_test$cohens_d_active))
sum(!is.na(filter_test$cohens_d_control))
# We have Cohen's d for 58/66 studies. Let's identify the studies with missing data.
study_ids_with_missing_ds <- df_first_row %>%
filter(is.na(cohens_d_active)) %>%
filter(psy_or_med == 1) %>%
dplyr::select(study_ID)
# None of these 8 studies are included in Cuijpers' MA because their primary outcome measure is not continuous.
# For a few of them I may be able to extract some relevant data and impute other data using methods above. I can do this for Shomaker 2016 and Yu 2002.
# I have extracted additional data and added it to "FUll_Dataset_Cuijpers_MA".
# The 6 other studies do not report on a continuous outcome measure or do not provide sufficient data for us to impute missing means or SDs. See notes column
# of "columns_with_missing_values_updated" for specific descriptions of each study.
# Shomaker 2016 has a missing sd at post, so I will perform imputation as above
df_appl_v_orange <- df_appl_v_orange %>%
mutate(
baseline_sd_active = ifelse(psy_or_med == 1 & is.na(baseline_sd_active), post_sd_active, baseline_sd_active),
post_sd_active = ifelse(psy_or_med == 1 & is.na(post_sd_active), baseline_sd_active, post_sd_active),
baseline_sd_control = ifelse(psy_or_med == 1 & is.na(baseline_sd_control), post_sd_control, baseline_sd_control),
post_sd_control = ifelse(psy_or_med == 1 & is.na(post_sd_control), baseline_sd_control, post_sd_control)
)
# And check whether we have more studies with cohens ds now
df_appl_v_orange <- df_appl_v_orange %>%
mutate(cohens_d_active = (post_mean_active - baseline_mean_active)/
((post_sd_active + baseline_sd_active)/2), cohens_d_control = (post_mean_control - baseline_mean_control)/
((post_sd_control + baseline_sd_control)/2))
df_first_row <- df_appl_v_orange %>% distinct(study_ID, .keep_all = TRUE)
filter_test <- df_first_row %>% filter(psy_or_med == 1)
sum(!is.na(filter_test$cohens_d_active))
sum(!is.na(filter_test$cohens_d_control))
unique(filter_test$study_ID)
study_ids_with_missing_ds <- df_first_row %>%
filter(is.na(cohens_d_active)) %>%
filter(psy_or_med == 1) %>%
dplyr::select(study_ID)
# Great, now we have Cohens ds for Shomaker 2016 and Yu 2002. Hence we now have cohens d for 60 / 66 psy studies.
# Argyris and Charlotte discussed using change scores rather than pre and post means/SDs, as these are more easily accessible.
# For psy studies, change scores can be easily computed using baseline and post means. SDs of change scores can be computed according to the Cochrane resource
# titled "Chapter 6: Choosing effect measures and computing estimates of effect" https://training.cochrane.org/handbook/current/chapter-06#section-6-5
# First calculate change scores for psy studies. I will calculate this as post score - baseline for consistency with Cipriani
df_appl_v_orange <- df_appl_v_orange %>%
mutate(change_active = ifelse(psy_or_med ==1, post_mean_active - baseline_mean_active, NA)) %>%
mutate(change_control = ifelse(psy_or_med ==1, post_mean_control - baseline_mean_control, NA))
# Now calculate the correlation coefficients. For this we need a study with complete data (i.e. SDs for pre, post, and change).
# We don't have this for psychotherapy (Cuijpers only reports pre and post), so let's check if we have this for any med studies
studies_w_complete_sds <- df_appl_v_orange %>%
filter(psy_or_med == 0, complete.cases(baseline_sd_active, post_sd_active, active_sd_change))
# We only have this for one study. Hence we decided to stick with cohens d for now and perform the imputations above.
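# For reference, a hedged sketch of how the change-score route could work, following the Cochrane
# Handbook formulas referenced above:
#   corr      = (sd_pre^2 + sd_post^2 - sd_change^2) / (2 * sd_pre * sd_post)
#   sd_change = sqrt(sd_pre^2 + sd_post^2 - 2 * corr * sd_pre * sd_post)
# imputed_corr is an illustrative name, derived from the single study with complete active-arm SDs.
imputed_corr <- with(studies_w_complete_sds,
                     (baseline_sd_active^2 + post_sd_active^2 - active_sd_change^2) /
                       (2 * baseline_sd_active * post_sd_active))
# change_sd_active_est <- with(df_appl_v_orange,
#   sqrt(baseline_sd_active^2 + post_sd_active^2 -
#          2 * mean(imputed_corr, na.rm = TRUE) * baseline_sd_active * post_sd_active))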
# save dataframe
openxlsx:: write.xlsx(df_appl_v_orange, "df_appl_v_orange.xlsx") # you can use this now for the quarto document
############ IGNORE from HERE to line 976 ############
####### Argyris additions (aide-memoire)
# test <- df_appl_v_orange %>%
# group_by(study_ID) %>%
# filter(instrument_value == min(instrument_value)) %>%
# group_by(psy_or_med) %>%
# summarise(av_cohens_d_ctrl = mean(cohens_d_control, na.rm = T), sd_cohens_d_ctrl = sd(cohens_d_control, na.rm = T),
# av_cohens_d_actuve = mean(cohens_d_active, na.rm = T), sd_cohens_d_active = sd(cohens_d_active, na.rm = T))
#
# test
#
# # create new id
# test <- df_appl_v_orange %>%
# mutate(new_study_id = case_when(psy_or_med == 0 ~ paste(study,year, sep = ", "),
# .default = study ))
#
#
#
#
# test[test$psy_or_med==0, ]$new_study_id == test[test$psy_or_med==0, ]$study
# test[test$psy_or_med==1, ]$new_study_id == test[test$psy_or_med==1, ]$study
#
#
# # USE THIS CODE only for examining CONTROLS, for actives use distinct(active_type...)
#
# # discovered an error in the above for the percentage women of the Fristad study. I have checked in the
# # cuijpers dataset and the correct percentage is 43.1, but could not verify with the paper as it is not in
# # our folder and after a quick search I could not find it online either. Messaged Charlotte on Discord to
# # check again.
# df_appl_v_orange[df_appl_v_orange$study_ID=="Fristad, 2019_cbt + placebo_placebo",]$percent_women <-43.1
#
#
# df_appl_v_orange <-df_appl_v_orange %>%
# mutate(new_study_id = case_when(psy_or_med == 0 ~ paste(study,year, sep = ", "),
# .default = study ))
#
#
# # We also need to calculate SE for proportion women
# # for proportions, this is calculated as sqrt(p(1-p)/n), which I implement stepwise below
#
# product_perc_women <- (df_appl_v_orange$percent_women/100)*
# (1-(df_appl_v_orange$percent_women/100) )
#
# total_n <- df_appl_v_orange$baseline_n_active +
# df_appl_v_orange$baseline_n_control
#
# df_appl_v_orange$percent_women_std_error <- sqrt(product_perc_women/total_n)
#
#
# # We also need to calculate SE for baseline severity
#
# df_appl_v_orange$baseline_st_error_active <- df_appl_v_orange$baseline_sd_active/sqrt(df_appl_v_orange$baseline_n_active)
#
# df_appl_v_orange$baseline_st_error_control <-df_appl_v_orange$baseline_sd_control/sqrt(df_appl_v_orange$baseline_n_control)
#
#
#
#
# test_dist_contrl<- df_appl_v_orange %>%
# group_by(new_study_id) %>%
# filter(instrument_value == min(instrument_value)) %>%
# distinct(control_type,.keep_all = TRUE)
#
# test_dist_contrl[duplicated(test_dist_contrl$study_ID),]
#
# test_dist_contrl[test_dist_contrl$new_study_id=="Atkinson, 2014",]
#
# test_2 <- test[duplicated(test$study_ID),] #sorting out
# dim(test_2)
#
# dups <- test_2$study_ID
# test_3 <- test[test$study_ID %in% dups,]
#
# head(test_3, 26)
#
#
#
# test_dist_contrl%>%
# ggplot(aes(y = cohens_d_control, x = as.factor(psy_or_med), colour = as.factor(psy_or_med) ))+
# geom_point(aes(y = cohens_d_control, x = as.factor(psy_or_med), size = baseline_n_control),position = position_jitter(seed = .3, width = 0.08), alpha = 0.6) +
# geom_violin(alpha = 0.5)+
# stat_summary(
# mapping = aes(y = cohens_d_control, x = as.factor(psy_or_med)),
# fun.min = function(z) { quantile(z,0.25) },
# fun.max = function(z) { quantile(z,0.75) },
# fun = median)+
# ggtitle("Effect Sizes of the Control Arms of Anti-depressant\n and Psychotherapy Trials in Adolescent Depression", subtitle = "with medians and IQRs")+
# ylab("Cohen's d of the primary outcome")+
# scale_x_discrete(labels=c("0" = "Anti-depressant trial\nControls", "1" = "Psychotherapy trial\nControls"))+
# theme(axis.text.x= element_text(size= 12)) +
# theme(axis.text.y= element_text(size= 12)) +
# theme(axis.title.y= element_text(size= 12))+
# theme(axis.title.x= element_blank())+
# theme(legend.position = "none")+
# geom_hline(yintercept = 0, colour = "grey", linetype = "dashed")
# theme(plot.title = element_text(color="black", size=14, face="bold")) +
# theme_minimal()
#
#
# test_dist_contrl%>%
# filter(instrument_name == "cdrs") %>%
# ggplot(aes(y = baseline_mean_control, x = as.factor(psy_or_med), colour = as.factor(psy_or_med) ))+
# geom_point(aes(y = baseline_mean_control, x = as.factor(psy_or_med), size = baseline_n_control),position = position_jitter(seed = .3, width = 0.08), alpha = 0.6) +
# geom_violin(alpha = 0.5)+
# stat_summary(
# mapping = aes(y = baseline_mean_control, x = as.factor(psy_or_med)),
# fun.min = function(z) { quantile(z,0.25) },
# fun.max = function(z) { quantile(z,0.75) },
# fun = median)
#
#
# test_dist_contrl%>%
# filter(instrument_name == "hamd") %>%
# ggplot(aes(y = baseline_mean_control, x = as.factor(psy_or_med), colour = as.factor(psy_or_med) ))+
# geom_point(aes(y = baseline_mean_control, x = as.factor(psy_or_med), size = baseline_n_control),position = position_jitter(seed = .3, width = 0.08), alpha = 0.6) +
# geom_violin(alpha = 0.5)+
# stat_summary(
# mapping = aes(y = baseline_mean_control, x = as.factor(psy_or_med)),
# fun.min = function(z) { quantile(z,0.25) },
# fun.max = function(z) { quantile(z,0.75) },
# fun = median)
#
#
#
# test_dist_contrl%>%
# ggplot(aes(y = percent_women, x = as.factor(psy_or_med), colour = as.factor(psy_or_med) ))+
# geom_point(aes(y = percent_women, x = as.factor(psy_or_med), size = baseline_n_control),position = position_jitter(seed = .3, width = 0.08), alpha = 0.6) +
# geom_violin(alpha = 0.5)+
# stat_summary(
# mapping = aes(y = percent_women, x = as.factor(psy_or_med)),
# fun.min = function(z) { quantile(z,0.25) },
# fun.max = function(z) { quantile(z,0.75) },
# fun = median)
# t.test(test_dist_contrl$percent_women, test_dist_contrl$psy_or_med)
#
# test_dist_contrl%>%
# ggplot(aes(y = mean_age, x = as.factor(psy_or_med), colour = as.factor(psy_or_med) ))+
# geom_point(aes(y = mean_age, x = as.factor(psy_or_med), size = baseline_n_control),position = position_jitter(seed = .3, width = 0.08), alpha = 0.6) +
# geom_violin(alpha = 0.5)+
# stat_summary(
# mapping = aes(y = mean_age, x = as.factor(psy_or_med)),
# fun.min = function(z) { quantile(z,0.25) },
# fun.max = function(z) { quantile(z,0.75) },
# fun = median)
# t.test(test_dist_contrl$mean_age, test_dist_contrl$psy_or_med)
#
#
# test_dist_contrl%>%
# filter(instrument_name == "cdrs") %>%
# group_by(psy_or_med) %>%
# summarise(n= n(), avg_baseline = mean(baseline_mean_control, na.rm = T))
#
#
# test_dist_contrl%>%
# filter(instrument_name == "hamd") %>%
# group_by(psy_or_med) %>%
# summarise(n= n(), avg_baseline = mean(baseline_mean_control, na.rm = T))
#
# test_dist_contrl%>%
# filter(instrument_name == "bdi") %>%
# group_by(psy_or_med) %>%
# summarise(n= n(), avg_baseline = mean(baseline_mean_control, na.rm = T))
#
# test_dist_contrl%>%
# filter(instrument_name == "ces_d") %>%
# group_by(psy_or_med) %>%
# summarise(n= n(), avg_baseline = mean(baseline_mean_control, na.rm = T))
#
# unique(test_dist_contrl$instrument_name)
#
#
#
# summarise(n= n(), avg_gender = mean(baseline_mean_control, na.rm = T))
#
# ###
# library(tidyverse) # needed for 'glimpse'
# library(dmetar)
# library(meta)
#
#
#
# met_perc_women <- metagen(TE = percent_women,
# seTE = percent_women_std_error,
# studlab = study,
# data = df_appl_v_orange,
# sm = "SMD",
# fixed = FALSE,
# random = TRUE,
# method.tau = "REML",
# hakn = TRUE,
# title = "percentage women across studies")
# update.meta(met_perc_women,
# subgroup =psy_or_med,
# tau.common = FALSE)
#
#
#
# met_perc_women <- metagen(TE = percent_women,
# seTE = percent_women_std_error,
# studlab = study,
# data = df_appl_v_orange,
# sm = "SMD",
# fixed = FALSE,
# random = TRUE,
# method.tau = "REML",
# hakn = TRUE,
# title = "percentage women across studies")
# update.meta(met_perc_women,
# subgroup =psy_or_med,
# tau.common = FALSE)
#
#
# df_for_cdrs <- test_dist_contrl %>%
# filter(instrument_name == "cdrs")
#
# met_baseline_severity_cdrs <- metagen(TE = baseline_mean_control,
# seTE = baseline_st_error_control,
# studlab = new_study_id,
# data = df_for_cdrs,
# sm = "MD",
# fixed = FALSE,
# random = TRUE,
# method.tau = "REML",
# hakn = TRUE,
# title = "percentage women across studies")
# update.meta(met_baseline_severity_cdrs,
# subgroup =psy_or_med,
# tau.common = FALSE)
#
#
#
# df_for_hamd <- test_dist_contrl %>%
# filter(instrument_name == "hamd")
#
# met_baseline_severity_hamd <- metagen(TE = baseline_mean_control,
# seTE = baseline_st_error_control,
# studlab = new_study_id,
# data = df_for_hamd,
# sm = "MD",
# fixed = FALSE,
# random = TRUE,
# method.tau = "REML",
# hakn = TRUE,
# title = "percentage women across studies")
# update.meta(met_baseline_severity_hamd,
# subgroup =psy_or_med,
# tau.common = FALSE)
#
#
#
#
#
#
# pdf(file = "forestplot.pdf", width = 15, height = 20)
# forest.meta(met_cohens_control_cbt_fluox_esc,
# sortvar = TE,
# prediction = TRUE,
# print.tau2 = FALSE,
# order = order(df_cbt_fluox_esc$psy_or_med),
# subgroup = TRUE,
# subgroup.name = df_cbt_fluox_esc$psy_or_med,
# test.subgroup.random,
# test.effect.subgroup.random,
# sep.subgroup = df_cbt_fluox_esc$psy_or_med,
# leftlabs = c("Study", "g", "SE"))
# dev.off()
#
#
# pdf(file = "forestplot.pdf", width = 15, height = 20)
# par(mar = c(6, 6, 6, 6))
# forest.meta(met_cohens_control_cbt_fluox_esc, layout = "JAMA",main = "Bigger margin: 6, 6, 6, 6")
# dev.off()
#
#
#
# met_cohens_control_cbt_fluox_esc <- metagen(TE = baseline_mean_control,
# seTE = baseline_st_error_control,
# studlab = new_study_id,
# data = df_cbt_fluox_esc,
# sm = "SMD",
# fixed = FALSE,
# random = TRUE,
# method.tau = "REML",
# hakn = TRUE,
# title = "percentage women across studies")
#
#
# #### It is useful to have two metanalyses and combine them to one
# ### the idea is that each metanalysis has its own heterogeneity, tau, and that it is reasonable
# ### to combine.
#
#
#
####### Important code ########### latest as of 30th December 2023
# Start here ------------------------------------------------------
# Need to use SMDs, i.e. our Cohen's d, and then the standard error of the SMD; to achieve this
# I need reliabilities.
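# A hedged sketch for the step described above: for a within-arm standardised mean change, one
# common approximation (e.g. Borenstein et al., Introduction to Meta-Analysis, pre-post designs) is
#   Var(d) = (1/n + d^2/(2*n)) * 2*(1 - r)
# where r is the pre-post correlation (the "reliability" referred to above) and n the arm sample
# size. The r value below (0.5) is purely illustrative and se_smd_active is an illustrative name.
# r_assumed <- 0.5
# se_smd_active <- with(df_appl_v_orange,
#   sqrt((1 / baseline_n_active + cohens_d_active^2 / (2 * baseline_n_active)) * 2 * (1 - r_assumed)))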