Merge pull request #16 from MAmbrose-IDM/dev

reorganize and standardize plotting and quantitative comparison content
InstituteforDiseaseModeling · Jun 7, 2022 · 2140441 · 2140441
2 parents 8efcc90 + eaccf86
commit 2140441
Show file tree

Hide file tree

Showing 13 changed files with 2,151 additions and 491 deletions.
diff --git a/create_plots/calculate_comparison_metrics.R → .../_20220603/calculate_comparison_metrics.R b/create_plots/calculate_comparison_metrics.R → .../_20220603/calculate_comparison_metrics.R
@@ -8,17 +8,8 @@ library(broom)
 library(tidyverse)
 
 
-# prepare dataframe with simulation and reference data formatted together
-prepare_inc_df = function(sim_df, ref_df){
-  # scale down incidence in simulation according to probability of detecting a case
-  sim_df$Incidence = sim_df$Incidence * sim_df$p_detect_case
+match_sim_ref_ages = function(ref_df, sim_df, bench_df=data.frame()){
 
-  # set up reference and simulation dataset columns
-  ref_df$Incidence = ref_df$INC / 1000
-  ref_df$mean_age = (ref_df$INC_LAR + ref_df$INC_UAR)/2
-  ref_df = data.frame('reference_value'=ref_df$Incidence, 'mean_age'=ref_df$mean_age, 'Site'=ref_df$Site) #, 'ref_pop_size'=ref_df$POP, 'ref_year'=ref_df$START_YEAR)
-  sim_df = data.frame('simulation_value'=sim_df$Incidence, 'mean_age'=sim_df$mean_age, 'Site'=sim_df$Site)
-
   sites = intersect(unique(sim_df$Site), unique(ref_df$Site))
   # check that ages match between reference and simulation. if there is a small difference (<1 year, update simulation)
   for(ss in sites){
@@ -27,6 +18,11 @@ prepare_inc_df = function(sim_df, ref_df){
     missing_ages_ref = ages_ref[which(!(ages_ref %in% ages_sim))]
     missing_ages_sim = ages_sim[which(!(ages_sim %in% ages_ref))]
 
+    if(nrow(bench_df)>0 && !all(ages_sim == sort(unique(bench_df$mean_age[bench_df$Site==ss])))){
+      warning(paste0("The age bins used in the benchmarking simulation are different from those used in the new simulation for site: ", ss, ". This benchmark simulation will be excluded."))
+      bench_df = bench_df[bench_df$Site != ss,]
+    }
+
     if(!all(ages_ref <= ages_sim+0.1) | !all(ages_ref >= ages_sim-0.1)){
       print(paste0('Imperfect age match between reference and simulations for site: ', ss))
       print('...  Mismatched reference / simulation ages are:')
@@ -38,12 +34,15 @@ prepare_inc_df = function(sim_df, ref_df){
         sim_replace_age = missing_ages_sim[which(abs(missing_ages_sim - mm)<1)]
         if(length(sim_replace_age)==1){
           sim_df$mean_age[sim_df$Site == ss & sim_df$mean_age == sim_replace_age] = mm
+          if(nrow(bench_df>0) && any(bench_df$Site ==ss)){
+            bench_df$mean_age[bench_df$Site == ss & bench_df$mean_age == sim_replace_age] = mm
+          }
         }
       }
 
       # update sim ages
       ages_sim = sort(unique(sim_df$mean_age[sim_df$Site == ss]))
-
+      
       if(!all(ages_ref <= ages_sim+0.1) | !all(ages_ref >= ages_sim-0.1)){
         print('...After adjustment, there remains an imperfect match between reference and simulation age bins.')
         print(paste0('      Reference has ', length(ages_ref), ' age groups and simulation has ', length(ages_sim), ' age groups.'))
@@ -52,16 +51,42 @@ prepare_inc_df = function(sim_df, ref_df){
       }
     }
   }
+  return(list(sim_df, bench_df))
+}
+
+
+# prepare dataframe with simulation and reference data formatted together
+prepare_inc_df = function(sim_df, ref_df, bench_df = data.frame()){
+  # scale down incidence in simulation according to probability of detecting a case
+  sim_df$Incidence = sim_df$Incidence * sim_df$p_detect_case
+
+  # set up reference and simulation dataset columns
+  ref_df$Incidence = ref_df$INC / 1000
+  ref_df$mean_age = (ref_df$INC_LAR + ref_df$INC_UAR)/2
+  ref_df = data.frame('reference_value'=ref_df$Incidence, 'mean_age'=ref_df$mean_age, 'Site'=ref_df$Site) #, 'ref_pop_size'=ref_df$POP, 'ref_year'=ref_df$START_YEAR)
+  sim_df = data.frame('simulation_value'=sim_df$Incidence, 'mean_age'=sim_df$mean_age, 'Site'=sim_df$Site)
+
+  # format benchmark simulations
+  if(nrow(bench_df)>0){
+    # scale down incidence in simulation according to probability of detecting a case
+    bench_df$Incidence = bench_df$Incidence * bench_df$p_detect_case
+    bench_df = data.frame('benchmark_value'=bench_df$Incidence, 'mean_age'=bench_df$mean_age, 'Site'=bench_df$Site)
+  }
+  # check that ages match between reference and simulation. if there is a small difference (<1 year, update simulation)
+  age_matched_dfs = match_sim_ref_ages(ref_df, sim_df, bench_df)
+  sim_df = age_matched_dfs[[1]]
+  bench_df = age_matched_dfs[[2]]
 
   combined_df = merge(ref_df, sim_df, all=TRUE)
+  combined_df = merge(combined_df, bench_df, all=TRUE)
   combined_df$metric = 'incidence'
   return(combined_df)
 }
 
 
 
 # prepare dataframe with simulation and reference data formatted together
-prepare_prev_df = function(sim_df, ref_df){
+prepare_prev_df = function(sim_df, ref_df, bench_df=data.frame()){
   # get simulation average across seeds
   sim_df = sim_df %>% group_by(Site, mean_age, month) %>%
     summarise(prevalence = mean(prevalence))
@@ -70,46 +95,27 @@ prepare_prev_df = function(sim_df, ref_df){
   sim_df$Site = tolower(sim_df$Site)
 
   ref_df = data.frame('reference_value'=ref_df$prevalence, 'mean_age'=ref_df$mean_age, 'Site'=ref_df$Site, 'month'=ref_df$month,
-                      'site_month'=paste0(ref_df$Site, '_month', ref_df$month))
+                      'site_month'=paste0(ref_df$Site, '_month', ref_df$month), 'total_sampled'=ref_df$total_sampled, 'num_pos'=ref_df$num_pos)
   sim_df = data.frame('simulation_value'=sim_df$prevalence, 'mean_age'=sim_df$mean_age, 'Site'=sim_df$Site, 'month'=sim_df$month,
                       'site_month'=paste0(sim_df$Site, '_month', sim_df$month))
 
-
-  sites = intersect(unique(sim_df$site_month), unique(ref_df$site_month))
-  # check that ages match between reference and simulation. if there is a small difference (<1 year, update simulation)
-  for(ss in sites){
-    ages_ref = sort(unique(ref_df$mean_age[ref_df$site_month == ss]))
-    ages_sim = sort(unique(sim_df$mean_age[sim_df$site_month == ss]))
-    missing_ages_ref = ages_ref[which(!(ages_ref %in% ages_sim))]
-    missing_ages_sim = ages_sim[which(!(ages_sim %in% ages_ref))]
-
-    if(!all(ages_ref <= ages_sim+0.1) | !all(ages_ref >= ages_sim-0.1)){
-      print(paste0('Imperfect age match between reference and simulations for site: ', ss))
-      print('...  Mismatched reference / simulation ages are:')
-      print(paste0('     ', missing_ages_ref, ' / ', missing_ages_sim, ','))
-      print('... For age thresholds that differ by less than a year, replacing simulation age with reference age.')
-
-      # check whether the missing ages are simply off by <1 year. If so, replace simulation age with nearby reference age
-      for(mm in missing_ages_ref){
-        sim_replace_age = missing_ages_sim[which(abs(missing_ages_sim - mm)<1)]
-        if(length(sim_replace_age)==1){
-          sim_df$mean_age[sim_df$site_month == ss & sim_df$mean_age == sim_replace_age] = mm
-        }
-      }
-
-      # update sim ages
-      ages_sim = sort(unique(sim_df$mean_age[sim_df$site_month == ss]))
-
-      if(!all(ages_ref <= ages_sim+0.1) | !all(ages_ref >= ages_sim-0.1)){
-        print('...After adjustment, there remains an imperfect match between reference and simulation age bins.')
-        print(paste0('      Reference has ', length(ages_ref), ' age groups and simulation has ', length(ages_sim), ' age groups.'))
-      } else{
-        print('... All age bins now match.')
-      }
-    }
+  # format benchmark simulations
+  if(nrow(bench_df)>0){
+    # get simulation average across seeds
+    bench_df = bench_df %>% group_by(Site, mean_age, month) %>%
+      summarise(prevalence = mean(prevalence))
+    bench_df$Site = tolower(bench_df$Site)
+    bench_df = data.frame('benchmark_value'=bench_df$prevalence, 'mean_age'=bench_df$mean_age, 'Site'=bench_df$Site, 'month'=bench_df$month,
+                        'site_month'=paste0(bench_df$Site, '_month', bench_df$month))
   }
+
+  # check that ages match between reference and simulation. if there is a small difference (<1 year, update simulation)
+  age_matched_dfs = match_sim_ref_ages(ref_df, sim_df, bench_df)
+  sim_df = age_matched_dfs[[1]]
+  bench_df = age_matched_dfs[[2]]
 
   combined_df = merge(ref_df, sim_df, all=TRUE)
+  combined_df = merge(combined_df, bench_df, all=TRUE)
   combined_df$metric = 'prevalence'
   return(combined_df)
 }
@@ -213,104 +219,23 @@ corr_ref_deriv_sim_points = function(combined_df){
   return(list(gg, lm_summary, combined_df))
 }
 
-
-
-
-
-######### likelihood calculations ##############
-# if the simulation mean is the true population value, what is the probability of observing the reference data (given the study sample size)?
-get_dens_likelihood = function(sim_df, ref_df){
-  sim_df = sim_df[sim_df$Pop >0,]
-  # get the simulation mean across runs
-  if (all(sim_df$Pop ==sim_df$Pop[1])){
-    sim_df_ave= sim_df %>% group_by(Site, agebin, month, densitybin) %>%
-      summarise(sim_asexual_par_dens_freq = mean(asexual_par_dens_freq),
-                sim_gametocyte_dens_freq= mean(gametocyte_dens_freq))
-  } else{
-    warning("Different population sizes found across years within an age group... need to set up weighted averaging of parasite densities.") 
-    # If this warning is triggered, use the population size to calculate the number of individuals in each density bin, then aggregate the sum, then divide by the total aggregated population
-  }
-
-  # remove survey dates from simulation that aren't in reference
-  sim_df_ave$site_month = paste0(sim_df_ave$Site, '_month', sim_df_ave$month)
-  ref_df$site_month = paste0(ref_df$Site, '_month', ref_df$month)
-  sites = intersect(unique(sim_df_ave$site_month), unique(ref_df$site_month))
-  sim_df_ave = sim_df_ave[sim_df_ave$site_month %in% sites, ]
-  # check that ages match between reference and simulation. if there is a small difference (<1 year, update simulation)
-  for(ss in sites){
-    ages_ref = sort(unique(ref_df$agebin[ref_df$site_month == ss]))
-    ages_sim = sort(unique(sim_df_ave$agebin[sim_df_ave$site_month == ss]))
-    missing_ages_ref = ages_ref[which(!(ages_ref %in% ages_sim))]
-    missing_ages_sim = ages_sim[which(!(ages_sim %in% ages_ref))]
-
-    if(!all(ages_ref <= ages_sim+0.1) | !all(ages_ref >= ages_sim-0.1)){
-      print(paste0('Imperfect age match between reference and simulations for site: ', ss))
-      print('...  Mismatched reference / simulation ages are:')
-      print(paste0('     ', missing_ages_ref, ' / ', missing_ages_sim, ','))
-      print('... For age thresholds that differ by less than a year, replacing simulation age with reference age.')
-
-      # check whether the missing ages are simply off by <1 year. If so, replace simulation age with nearby reference age
-      for(mm in missing_ages_ref){
-        sim_replace_age = missing_ages_sim[which(abs(missing_ages_sim - mm)<1)]
-        if(length(sim_replace_age)==1){
-          sim_df_ave$agebin[sim_df_ave$site_month == ss & sim_df_ave$agebin == sim_replace_age] = mm
-        }
-      }
-
-      # update sim ages
-      ages_sim = sort(unique(sim_df_ave$agebin[sim_df_ave$agebin == ss]))
-
-      if(!all(ages_ref <= ages_sim+0.1) | !all(ages_ref >= ages_sim-0.1)){
-        print('...After adjustment, there remains an imperfect match between reference and simulation age bins.')
-        print(paste0('      Reference has ', length(ages_ref), ' age groups and simulation has ', length(ages_sim), ' age groups.'))
-      } else{
-        print('... All age bins now match.')
-      }
-    }
-  }
-
-
-  # consider the simulation mean value to be 'truth,' merge it into the reference data frame
-  ref_df = ref_df[,c('Site', 'month', 'site_month', 'agebin', 'densitybin', 'count_asex', 'bin_total_asex', 'count_gamet', 'bin_total_gamet')]
-  sim_df_ave = sim_df_ave[,c('Site', 'month', 'site_month', 'agebin', 'densitybin', 'sim_asexual_par_dens_freq', 'sim_gametocyte_dens_freq')]
-  combined_df = merge(ref_df, sim_df_ave, all=TRUE)
-  combined_df$metric = 'par_dens'
-  # remove rows where there is no reference data (sometimes there are different density bins in different sites, so rows with NA are created for the 'missing' bins - but check that there aren't values in the simulation either)
-  ref_rows_na = which(is.na(combined_df$count_asex))
-  sim_rows_0 = which(combined_df$sim_asexual_par_dens_freq < 0.0001)
-  if(all(ref_rows_na %in% sim_rows_0)){
-    combined_df = combined_df[-ref_rows_na,]
-  } else{
-    warning('There may be a mismatch in the age bins from the reference data and simulation data for at least one site. No rows were removed.')
-  }
-
-  # multinomial draw: likelihood of getting the observed distribution of parasite densities assuming the simulations show the true population-level frequencies
-  # iterate through groups of month-age-site
-  loglik_df = data.frame('site_month'= c(), 'loglikelihood_asex' = c(), 'loglikelihood_gamet' = c())
-  site_months = unique(combined_df$site_month)
-  for (ss in site_months){
-    loglikelihood_asex = 0
-    loglikelihood_gamet = 0
-    cur_agebins = unique(combined_df$agebin[combined_df$site_month == ss])
-    for (aa in cur_agebins){
-      cur_df = combined_df[combined_df$site_month == ss & combined_df$agebin == aa,]
-      # check that the sum of counts matches the sum column
-      if(sum(cur_df$count_asex) == cur_df$bin_total_asex[1] & sum(cur_df$count_gamet) == cur_df$bin_total_gamet[1] & length(unique(cur_df$bin_total_asex))==1 & length(unique(cur_df$bin_total_gamet))==1){
-        loglikelihood_asex = loglikelihood_asex + log(dmultinom(x=cur_df$count_asex, prob=cur_df$sim_asexual_par_dens_freq))
-        loglikelihood_gamet = loglikelihood_gamet + log(dmultinom(x=cur_df$count_gamet, prob=cur_df$sim_gametocyte_dens_freq))
-      } else{
-        warning(paste0('The sum of individuals across bins in the reference dataset does not match the reported total number of individuals included. This site and age bin is being skipped: ', ss, ' - ', aa))
-      }
-    }
-    loglik_df = rbind(loglik_df, data.frame('site_month'= ss, 'loglikelihood_asex' = loglikelihood_asex, 'loglikelihood_gamet' = loglikelihood_gamet))
-  }
-  return(loglik_df)
+# create scatter plots with new versus benchmark simulation output and get comparison of likelihoods for each simulation set and each site.
+compare_benchmark = function(combined_df){
+  combined_df = combined_df[!is.na(combined_df$benchmark_value),]
+  metric = combined_df$metric[1]
+  if('site_month' %in% colnames(combined_df)) combined_df$Site = combined_df$site_month
+  min_value = min(c(combined_df$benchmark_value, combined_df$simulation_value), na.rm=TRUE)
+  max_value = max(c(combined_df$benchmark_value, combined_df$simulation_value), na.rm=TRUE)
+  gg = ggplot(combined_df, aes(x=benchmark_value, y=simulation_value, color=Site, fill=Site))+
+    geom_point(size=2) + 
+    ylab(paste0('new simulation ', metric)) + 
+    xlab(paste0('benchmark sim ', metric)) + 
+    ggtitle('Benchmark versus new sim values') +
+    coord_fixed(ratio=1, xlim=c(min_value, max_value), ylim=c(min_value, max_value))+
+    geom_abline(slope=1,intercept=0, color='grey', alpha=0.5) + 
+    theme_classic() +
+    theme(plot.title = element_text(size=12))
+  return(gg)
 }
 
 
-
-
-
-
-
-
diff --git a/...are_ref_sim_infection_duration_Navrongo.R → ...are_ref_sim_infection_duration_Navrongo.R b/...are_ref_sim_infection_duration_Navrongo.R → ...are_ref_sim_infection_duration_Navrongo.R