Merge pull request #73 from poldrack/develop

Develop
poldrack · Jan 25, 2020 · ba45508 · ba45508
2 parents f1fe7e7 + 8dd6f56
commit ba45508
Show file tree

Hide file tree

Showing 4 changed files with 204 additions and 29 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -51,7 +51,7 @@ jobs:
           command: |
              cat /root/project/ImageAnalyses/*.Rmd | grep library >| /root/project/ImageAnalyses/R_libraries.R
              export NARPS_BASEDIR="/tmp/data"; Rscript -e 'library(knitr);library(rmarkdown);rmarkdown::render("/root/project/ImageAnalyses/DecisionAnalysis.Rmd","html_document", output_dir = "/tmp/data/figures")'
-          no_output_timeout: 10000
+          no_output_timeout: 90000
       - run:
           name: run PM analyses
           command: |
@@ -62,6 +62,6 @@ jobs:
       - run:
           name: create results tarball
           command: |
-             cd /tmp/data; tar zcvf results.tgz ./cached ./figures ./logs ./metadata ./image_diagnostics* ./output/unthresh_concat_zstat/hypo*_voxelmap.nii.gz ./output/overlap_binarized_thresh/hypo*.nii.gz ./output/consensus_analysis/hypo*_1-fdr.nii.gz ./output/ALE/hypo*_fdr_oneminusp.nii.gz ./output/cluster_maps ./output/correlation_unthresh ./PredictionMarkets
+             cd /tmp/data; tar zcvf results.tgz ./output/npboot_output.RData ./cached ./figures ./logs ./metadata ./image_diagnostics* ./output/unthresh_concat_zstat/hypo*_voxelmap.nii.gz ./output/overlap_binarized_thresh/hypo*.nii.gz ./output/consensus_analysis/hypo*_1-fdr.nii.gz ./output/ALE/hypo*_fdr_oneminusp.nii.gz ./output/cluster_maps ./output/correlation_unthresh ./PredictionMarkets
       - store_artifacts:
              path: /tmp/data/results.tgz
diff --git a/ImageAnalyses/DecisionAnalysis.Rmd b/ImageAnalyses/DecisionAnalysis.Rmd
@@ -38,7 +38,7 @@ library(xtable)
 library(psych)
 library(knitr)
 library(sessioninfo)
-
+library(boot)
 
 
 basedir = Sys.getenv('NARPS_BASEDIR')
@@ -218,39 +218,175 @@ r.squaredGLMM(m_hyp_full)
 
 ```
 
-Run bootstrap to get confidence intervals
+### Nonparametric bootstrap
+The bootMer package performs resampling for lmer models, but it uses a parametric bootstrap which we did not feel adequately addressed concerns about non-normality.  Thus, we implemented a nonparametric bootstrap, in which we resample over teams.
+
+First we generate wide data, for resampling.
 
 ```{r}
+decisions_wide <- hyp_df %>%
+  mutate(Hypothesis = str_c("Hyp", varnum, sep="")) %>%
+  dplyr::select(teamID, Decision, Hypothesis) %>%
+  spread(Hypothesis,Decision)
+
+team_info <- hyp_df %>% 
+  filter(varnum==1) %>%
+  dplyr::select(fwhm, used_fmriprep_data,package,
+                testing, movement_modeling,teamID)
+
+decisions_wide <- join(decisions_wide,
+                       team_info,
+                       by='teamID')
+```
 
-mySumm <- function(.) {
-    c(beta=fixef(.))
+Function to run analysis within bootstrap.  For the regressors that are parametric or binary, we can simply compute a bootstrap confidence interval on their parameter estimate.  For the two variables that are factorial, we can't do this because some bootstrap samples will not contain all of the same values for the factors, and thus the models are not equivalent.  Instead, for those variables we perform a model comparison and save the model comparison statistics so that we can compute a bootstrap estimate on those (ala https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5482523/).  
+
+```{r message=FALSE,warning=FALSE}
+bs_fun <- function(df, indices){
+  df <- df[indices,]
+  control = glmerControl(
+              optimizer ='optimx', 
+              optCtrl=list(method='nlminb'))
+  # we take in a wide data frame, but must
+  # convert it back to long form for lmer
+  df_long <- df %>% gather(key='Hypothesis',
+              value='Decision',
+              -movement_modeling,-teamID,-fwhm,
+              -used_fmriprep_data,-package,-testing)
+  
+  model_full <- glmer(Decision ~ fwhm + 
+                        used_fmriprep_data + 
+                        movement_modeling + 
+                        Hypothesis + 
+                        package + 
+                        testing  + (1|teamID),
+            data = df_long,family=binomial,
+            control = control)
+  
+  # get coefs for all but factorial effects
+  fixed_effect_coefs = fixef(model_full)[2:4]
+  
+   # do model comparison for Hypothesis effect
+  model_nohyp <- glmer(Decision ~ fwhm + 
+                             used_fmriprep_data + 
+                             movement_modeling + 
+                             package + 
+                             testing  + (1|teamID),
+            data = df_long,family=binomial,
+            control = control)
+
+  # perform model comparison and save statistics
+  anova_nohyp <- anova(model_nohyp,model_full)
+  fixed_effect_coefs['hyp_BICdiff'] = 
+    diff(anova_nohyp$BIC)
+  fixed_effect_coefs['hyp_AICdiff'] = 
+    diff(anova_nohyp$AIC)
+
+  # do model comparison for package effect
+  model_nopackage <- glmer(Decision ~ Hypothesis + 
+                             fwhm + 
+                             used_fmriprep_data + 
+                             movement_modeling + 
+                             testing  + (1|teamID),
+            data = df_long,family=binomial,
+            control = control)
+
+  # perform model comparison and save statistics
+  anova_nopackage <- anova(model_nopackage,model_full)
+  fixed_effect_coefs['package_BICdiff'] = 
+    diff(anova_nopackage$BIC)
+   fixed_effect_coefs['package_AICdiff'] = 
+    diff(anova_nopackage$AIC)
+ 
+  # do model comparison for testing effect
+   model_notesting <- glmer(Decision ~ Hypothesis + 
+                              fwhm + 
+                              used_fmriprep_data + 
+                              movement_modeling + 
+                              package +  (1|teamID),
+            data = df_long,family=binomial,
+            control = control)
+    
+  # perform model comparison and save statistics
+  anova_notesting <- anova(model_notesting,model_full)
+  fixed_effect_coefs['testing_BICdiff'] = 
+    diff(anova_notesting$BIC)
+  fixed_effect_coefs['testing_AICdiff'] = 
+    diff(anova_notesting$AIC)
+
+fixed_effect_coefs
 }
 
-boo01 <- bootMer(m_hyp_full, mySumm, nsim = 1000, seed=12345678) 
+```
+
+
+```{r}
+# perform bootstrap sampling with 1000 replications
+Out <- boot(data=decisions_wide, statistic=bs_fun, R=1000)
+
+# save bootstrap samples
+save(Out,file=paste(basedir,                     
+                     'output/npboot_output.RData',
+                     sep='/'))
 
 ```
 
+Get confidence intervals, first for betas for parametric regressors.
+
 ```{r}
-model.coefs <- coef(summary_full_model)
-percentile_bs_ci <- data.frame(lower=array(NA, 17),
-                               mean=array(NA, 17),
-                               upper=array(NA,17),
-                               row.names=row.names(model.coefs))
-boot_means = apply(boo01$t,2,mean)
-
-for (i in 1:17){
-  ci_raw <- boot::boot.ci(boot.out = boo01, type = c("perc"), index = i)
-  
-  percentile_bs_ci[i,] <- c(ci_raw$percent[4],boot_means[i],ci_raw$percent[5])
+# set up data frame to save results
+rownames = c('fwhm',
+            'used_fmriprep_data',
+            'movement_modeling', 
+            'Hypothesis (deltaBIC)',
+            'Hypothesis (deltaAIC)',
+            'package (deltaBIC)',
+            'package (deltaAIC)',
+            'testing (deltaBIC)',
+            'testing (deltaAIC)') 
+
+
+percentile_np_ci <- data.frame(lower=array(NA, 
+                                    length(rownames)),
+                               mean=array(NA, 
+                                    length(rownames)),
+                               upper=array(NA,
+                                    length(rownames)),
+                               exceedence=array(NA,
+                                    length(rownames)),
+                               row.names=rownames)
+
+# get means for parameters across amples
+npboot_means = apply(Out$t,2,mean)
+
+# loop over parametric regressors and get percentile CI
+for (i in 1:3){
+  ci_raw <- boot::boot.ci(boot.out = Out, type = c("perc"), index = i)
+  percentile_np_ci[i,] <- c(ci_raw$percent[4],npboot_means[i],ci_raw$percent[5],NA)
+}
+
+```
+
+Get CI for model comparisons for factorial regressors (package and testing).
+
+```{r}
+# get ci for BIC diff
+
+for (i in 4:9){
+  ci_raw <- boot::boot.ci(boot.out = Out, type = c("perc"), index = i)
+  # use lower p because we want to know how often full model
+  # is better than reduced model
+  percentile_np_ci[i,] <- c(ci_raw$percent[4],npboot_means[i],ci_raw$percent[5],mean(Out$t[,i]<0))
 }
 
-kable(percentile_bs_ci)
-write.table(percentile_bs_ci,
-            file=paste(basedir,"figures/full_model_bootstrap_CI.tsv",sep='/'),
+kable(percentile_np_ci)
+write.table(percentile_np_ci,
+            file=paste(basedir,"output/nonparametric_bootstrap_CI.tsv",sep='/'),
             sep='\t')
 
 ```
 
+
 Get odds ratios for factorial variables.
 
 ```{r}
@@ -406,12 +542,36 @@ summary_df['Chi-squared'] = c(anova_hyp$Chisq[2],
                               anova_package$Chisq[2],
                               anova_testing$Chisq[2],
                               anova_movement$Chisq[2])
-summary_df['P value'] = c(anova_hyp$"Pr(>Chisq)"[2],
+summary_df['P value (parametric)'] = c(anova_hyp$"Pr(>Chisq)"[2],
                               anova_smoothing$"Pr(>Chisq)"[2],
                               anova_fmriprep$"Pr(>Chisq)"[2],
-                              anova_package$"Pr(>Chisq)"[2],
+                             anova_package$"Pr(>Chisq)"[2],
                               anova_testing$"Pr(>Chisq)"[2],
                               anova_movement$"Pr(>Chisq)"[2])
+summary_df['Bootstrap CI'] = c(NA,
+                              sprintf('(%0.2f - %0.2f)',
+                                      percentile_np_ci['fwhm','lower'],
+                                      percentile_np_ci['fwhm','upper']),
+                              sprintf('(%0.2f - %0.2f)',
+                                      percentile_np_ci['used_fmriprep_data','lower'],
+                                      percentile_np_ci['used_fmriprep_data','upper']),
+                              NA,
+                              NA,
+                              sprintf('(%0.2f - %0.2f)',
+                                      percentile_np_ci['movement_modeling','lower'],
+                                      percentile_np_ci['movement_modeling','upper']))
+summary_df['Model selection probability (BIC)'] = c(percentile_np_ci['Hypothesis (deltaBIC)','exceedence'],
+                              NA,
+                              NA,
+                              percentile_np_ci['package (deltaBIC)','exceedence'],
+                              percentile_np_ci['testing (deltaBIC)','exceedence'],
+                              NA)
+summary_df['Model selection probability (AIC)'] = c(percentile_np_ci['Hypothesis (deltaAIC)','exceedence'],
+                              NA,
+                              NA,
+                              percentile_np_ci['package (deltaAIC)','exceedence'],
+                              percentile_np_ci['testing (deltaAIC)','exceedence'],
+                              NA)
 summary_df['Delta R^2'] = c(delta_r2_hyp[2,1],
                             delta_r2_smoothing[2,1],
                             delta_r2_fmriprep[2,1],
@@ -421,8 +581,6 @@ summary_df['Delta R^2'] = c(delta_r2_hyp[2,1],
 summary_df <- summary_df %>%
   mutate(`Chi-squared` = formatC(summary_df$'Chi-squared',
                                  digits=2,format='f'),
-    `P value` = formatC(summary_df$'P value',
-                        digits=3,format='f'),
     `Delta R^2` = formatC(summary_df$'Delta R^2',
                         digits=2,format='f')
     )

diff --git a/ImageAnalyses/MakeSupplementaryFigure1.py b/ImageAnalyses/MakeSupplementaryFigure1.py
@@ -114,17 +114,34 @@ def mk_supp_figure1(narps, metadata):
         narps.dirs.dirs['figures'],
         'SuppFigure1.png'))
 
+    # save data to file
+    metadata_merged.to_csv(
+        os.path.join(
+            narps.dirs.dirs['figures'],
+            'MethodsTableMetadataMerged.csv')
+    )
+    decision_wide.to_csv(
+        os.path.join(
+            narps.dirs.dirs['figures'],
+            'DecisionDataWide.csv')
+    )
+    confidence_wide.to_csv(
+        os.path.join(
+            narps.dirs.dirs['figures'],
+            'ConfidenceDataWide.csv')
+    )
+
 
 if __name__ == "__main__":
     # set an environment variable called NARPS_BASEDIR
     # with location of base directory
     if 'NARPS_BASEDIR' in os.environ:
         basedir = os.environ['NARPS_BASEDIR']
     else:
-        basedir = '/data'
+        basedir = '/tmp/data'
 
     # setup main class
-    narps = Narps(basedir)
+    narps = Narps(basedir, dataurl=os.environ['DATA_URL'])
     narps.load_data()
 
     logfile = os.path.join(

diff --git a/PredictionMarketAnalyses/PM_Analyses.Rmd b/PredictionMarketAnalyses/PM_Analyses.Rmd
@@ -89,8 +89,8 @@ supptable_df <- supptable_df %>%
          TeamsOutside = recode(supptable_df$t_exceed, `0` = '', `1` = '*')) %>%
   unite('CI', CI_lower:CI_upper, sep='-') %>%
   mutate(CI = sapply(CI,bracket)) %>%
-  unite('Non-teams FV', NonTeams:NonTeamsOutside, sep=' ') %>%
-  unite('Teams FV', Teams:TeamsOutside, sep=' ') %>%
+  unite('Non-teams market prediction', NonTeams:NonTeamsOutside, sep=' ') %>%
+  unite('Teams market prediction', Teams:TeamsOutside, sep=' ') %>%
   dplyr::select(-fv,-fv_95l, -fv_95u, -price.0, -price.1, -t_exceed, -nt_exceed, -hypothesis) 
 supptable_df <- supptable_df %>%
   rename(`Hyp #` = hid)