Skip to content

Commit

Permalink
Merge pull request #16 from MAmbrose-IDM/dev
Browse files Browse the repository at this point in the history
reorganize and standardize plotting and quantitative comparison content
  • Loading branch information
YeChen-IDM authored Jun 7, 2022
2 parents 8efcc90 + eaccf86 commit 2140441
Show file tree
Hide file tree
Showing 13 changed files with 2,151 additions and 491 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,8 @@ library(broom)
library(tidyverse)


# prepare dataframe with simulation and reference data formatted together
prepare_inc_df = function(sim_df, ref_df){
# scale down incidence in simulation according to probability of detecting a case
sim_df$Incidence = sim_df$Incidence * sim_df$p_detect_case
match_sim_ref_ages = function(ref_df, sim_df, bench_df=data.frame()){

# set up reference and simulation dataset columns
ref_df$Incidence = ref_df$INC / 1000
ref_df$mean_age = (ref_df$INC_LAR + ref_df$INC_UAR)/2
ref_df = data.frame('reference_value'=ref_df$Incidence, 'mean_age'=ref_df$mean_age, 'Site'=ref_df$Site) #, 'ref_pop_size'=ref_df$POP, 'ref_year'=ref_df$START_YEAR)
sim_df = data.frame('simulation_value'=sim_df$Incidence, 'mean_age'=sim_df$mean_age, 'Site'=sim_df$Site)

sites = intersect(unique(sim_df$Site), unique(ref_df$Site))
# check that ages match between reference and simulation. if there is a small difference (<1 year, update simulation)
for(ss in sites){
Expand All @@ -27,6 +18,11 @@ prepare_inc_df = function(sim_df, ref_df){
missing_ages_ref = ages_ref[which(!(ages_ref %in% ages_sim))]
missing_ages_sim = ages_sim[which(!(ages_sim %in% ages_ref))]

if(nrow(bench_df)>0 && !all(ages_sim == sort(unique(bench_df$mean_age[bench_df$Site==ss])))){
warning(paste0("The age bins used in the benchmarking simulation are different from those used in the new simulation for site: ", ss, ". This benchmark simulation will be excluded."))
bench_df = bench_df[bench_df$Site != ss,]
}

if(!all(ages_ref <= ages_sim+0.1) | !all(ages_ref >= ages_sim-0.1)){
print(paste0('Imperfect age match between reference and simulations for site: ', ss))
print('... Mismatched reference / simulation ages are:')
Expand All @@ -38,12 +34,15 @@ prepare_inc_df = function(sim_df, ref_df){
sim_replace_age = missing_ages_sim[which(abs(missing_ages_sim - mm)<1)]
if(length(sim_replace_age)==1){
sim_df$mean_age[sim_df$Site == ss & sim_df$mean_age == sim_replace_age] = mm
if(nrow(bench_df>0) && any(bench_df$Site ==ss)){
bench_df$mean_age[bench_df$Site == ss & bench_df$mean_age == sim_replace_age] = mm
}
}
}

# update sim ages
ages_sim = sort(unique(sim_df$mean_age[sim_df$Site == ss]))

if(!all(ages_ref <= ages_sim+0.1) | !all(ages_ref >= ages_sim-0.1)){
print('...After adjustment, there remains an imperfect match between reference and simulation age bins.')
print(paste0(' Reference has ', length(ages_ref), ' age groups and simulation has ', length(ages_sim), ' age groups.'))
Expand All @@ -52,16 +51,42 @@ prepare_inc_df = function(sim_df, ref_df){
}
}
}
return(list(sim_df, bench_df))
}


# prepare dataframe with simulation and reference data formatted together
prepare_inc_df = function(sim_df, ref_df, bench_df = data.frame()){
# scale down incidence in simulation according to probability of detecting a case
sim_df$Incidence = sim_df$Incidence * sim_df$p_detect_case

# set up reference and simulation dataset columns
ref_df$Incidence = ref_df$INC / 1000
ref_df$mean_age = (ref_df$INC_LAR + ref_df$INC_UAR)/2
ref_df = data.frame('reference_value'=ref_df$Incidence, 'mean_age'=ref_df$mean_age, 'Site'=ref_df$Site) #, 'ref_pop_size'=ref_df$POP, 'ref_year'=ref_df$START_YEAR)
sim_df = data.frame('simulation_value'=sim_df$Incidence, 'mean_age'=sim_df$mean_age, 'Site'=sim_df$Site)

# format benchmark simulations
if(nrow(bench_df)>0){
# scale down incidence in simulation according to probability of detecting a case
bench_df$Incidence = bench_df$Incidence * bench_df$p_detect_case
bench_df = data.frame('benchmark_value'=bench_df$Incidence, 'mean_age'=bench_df$mean_age, 'Site'=bench_df$Site)
}
# check that ages match between reference and simulation. if there is a small difference (<1 year, update simulation)
age_matched_dfs = match_sim_ref_ages(ref_df, sim_df, bench_df)
sim_df = age_matched_dfs[[1]]
bench_df = age_matched_dfs[[2]]

combined_df = merge(ref_df, sim_df, all=TRUE)
combined_df = merge(combined_df, bench_df, all=TRUE)
combined_df$metric = 'incidence'
return(combined_df)
}



# prepare dataframe with simulation and reference data formatted together
prepare_prev_df = function(sim_df, ref_df){
prepare_prev_df = function(sim_df, ref_df, bench_df=data.frame()){
# get simulation average across seeds
sim_df = sim_df %>% group_by(Site, mean_age, month) %>%
summarise(prevalence = mean(prevalence))
Expand All @@ -70,46 +95,27 @@ prepare_prev_df = function(sim_df, ref_df){
sim_df$Site = tolower(sim_df$Site)

ref_df = data.frame('reference_value'=ref_df$prevalence, 'mean_age'=ref_df$mean_age, 'Site'=ref_df$Site, 'month'=ref_df$month,
'site_month'=paste0(ref_df$Site, '_month', ref_df$month))
'site_month'=paste0(ref_df$Site, '_month', ref_df$month), 'total_sampled'=ref_df$total_sampled, 'num_pos'=ref_df$num_pos)
sim_df = data.frame('simulation_value'=sim_df$prevalence, 'mean_age'=sim_df$mean_age, 'Site'=sim_df$Site, 'month'=sim_df$month,
'site_month'=paste0(sim_df$Site, '_month', sim_df$month))


sites = intersect(unique(sim_df$site_month), unique(ref_df$site_month))
# check that ages match between reference and simulation. if there is a small difference (<1 year, update simulation)
for(ss in sites){
ages_ref = sort(unique(ref_df$mean_age[ref_df$site_month == ss]))
ages_sim = sort(unique(sim_df$mean_age[sim_df$site_month == ss]))
missing_ages_ref = ages_ref[which(!(ages_ref %in% ages_sim))]
missing_ages_sim = ages_sim[which(!(ages_sim %in% ages_ref))]

if(!all(ages_ref <= ages_sim+0.1) | !all(ages_ref >= ages_sim-0.1)){
print(paste0('Imperfect age match between reference and simulations for site: ', ss))
print('... Mismatched reference / simulation ages are:')
print(paste0(' ', missing_ages_ref, ' / ', missing_ages_sim, ','))
print('... For age thresholds that differ by less than a year, replacing simulation age with reference age.')

# check whether the missing ages are simply off by <1 year. If so, replace simulation age with nearby reference age
for(mm in missing_ages_ref){
sim_replace_age = missing_ages_sim[which(abs(missing_ages_sim - mm)<1)]
if(length(sim_replace_age)==1){
sim_df$mean_age[sim_df$site_month == ss & sim_df$mean_age == sim_replace_age] = mm
}
}

# update sim ages
ages_sim = sort(unique(sim_df$mean_age[sim_df$site_month == ss]))

if(!all(ages_ref <= ages_sim+0.1) | !all(ages_ref >= ages_sim-0.1)){
print('...After adjustment, there remains an imperfect match between reference and simulation age bins.')
print(paste0(' Reference has ', length(ages_ref), ' age groups and simulation has ', length(ages_sim), ' age groups.'))
} else{
print('... All age bins now match.')
}
}
# format benchmark simulations
if(nrow(bench_df)>0){
# get simulation average across seeds
bench_df = bench_df %>% group_by(Site, mean_age, month) %>%
summarise(prevalence = mean(prevalence))
bench_df$Site = tolower(bench_df$Site)
bench_df = data.frame('benchmark_value'=bench_df$prevalence, 'mean_age'=bench_df$mean_age, 'Site'=bench_df$Site, 'month'=bench_df$month,
'site_month'=paste0(bench_df$Site, '_month', bench_df$month))
}

# check that ages match between reference and simulation. if there is a small difference (<1 year, update simulation)
age_matched_dfs = match_sim_ref_ages(ref_df, sim_df, bench_df)
sim_df = age_matched_dfs[[1]]
bench_df = age_matched_dfs[[2]]

combined_df = merge(ref_df, sim_df, all=TRUE)
combined_df = merge(combined_df, bench_df, all=TRUE)
combined_df$metric = 'prevalence'
return(combined_df)
}
Expand Down Expand Up @@ -213,104 +219,23 @@ corr_ref_deriv_sim_points = function(combined_df){
return(list(gg, lm_summary, combined_df))
}





######### likelihood calculations ##############
# if the simulation mean is the true population value, what is the probability of observing the reference data (given the study sample size)?
get_dens_likelihood = function(sim_df, ref_df){
sim_df = sim_df[sim_df$Pop >0,]
# get the simulation mean across runs
if (all(sim_df$Pop ==sim_df$Pop[1])){
sim_df_ave= sim_df %>% group_by(Site, agebin, month, densitybin) %>%
summarise(sim_asexual_par_dens_freq = mean(asexual_par_dens_freq),
sim_gametocyte_dens_freq= mean(gametocyte_dens_freq))
} else{
warning("Different population sizes found across years within an age group... need to set up weighted averaging of parasite densities.")
# If this warning is triggered, use the population size to calculate the number of individuals in each density bin, then aggregate the sum, then divide by the total aggregated population
}

# remove survey dates from simulation that aren't in reference
sim_df_ave$site_month = paste0(sim_df_ave$Site, '_month', sim_df_ave$month)
ref_df$site_month = paste0(ref_df$Site, '_month', ref_df$month)
sites = intersect(unique(sim_df_ave$site_month), unique(ref_df$site_month))
sim_df_ave = sim_df_ave[sim_df_ave$site_month %in% sites, ]
# check that ages match between reference and simulation. if there is a small difference (<1 year, update simulation)
for(ss in sites){
ages_ref = sort(unique(ref_df$agebin[ref_df$site_month == ss]))
ages_sim = sort(unique(sim_df_ave$agebin[sim_df_ave$site_month == ss]))
missing_ages_ref = ages_ref[which(!(ages_ref %in% ages_sim))]
missing_ages_sim = ages_sim[which(!(ages_sim %in% ages_ref))]

if(!all(ages_ref <= ages_sim+0.1) | !all(ages_ref >= ages_sim-0.1)){
print(paste0('Imperfect age match between reference and simulations for site: ', ss))
print('... Mismatched reference / simulation ages are:')
print(paste0(' ', missing_ages_ref, ' / ', missing_ages_sim, ','))
print('... For age thresholds that differ by less than a year, replacing simulation age with reference age.')

# check whether the missing ages are simply off by <1 year. If so, replace simulation age with nearby reference age
for(mm in missing_ages_ref){
sim_replace_age = missing_ages_sim[which(abs(missing_ages_sim - mm)<1)]
if(length(sim_replace_age)==1){
sim_df_ave$agebin[sim_df_ave$site_month == ss & sim_df_ave$agebin == sim_replace_age] = mm
}
}

# update sim ages
ages_sim = sort(unique(sim_df_ave$agebin[sim_df_ave$agebin == ss]))

if(!all(ages_ref <= ages_sim+0.1) | !all(ages_ref >= ages_sim-0.1)){
print('...After adjustment, there remains an imperfect match between reference and simulation age bins.')
print(paste0(' Reference has ', length(ages_ref), ' age groups and simulation has ', length(ages_sim), ' age groups.'))
} else{
print('... All age bins now match.')
}
}
}


# consider the simulation mean value to be 'truth,' merge it into the reference data frame
ref_df = ref_df[,c('Site', 'month', 'site_month', 'agebin', 'densitybin', 'count_asex', 'bin_total_asex', 'count_gamet', 'bin_total_gamet')]
sim_df_ave = sim_df_ave[,c('Site', 'month', 'site_month', 'agebin', 'densitybin', 'sim_asexual_par_dens_freq', 'sim_gametocyte_dens_freq')]
combined_df = merge(ref_df, sim_df_ave, all=TRUE)
combined_df$metric = 'par_dens'
# remove rows where there is no reference data (sometimes there are different density bins in different sites, so rows with NA are created for the 'missing' bins - but check that there aren't values in the simulation either)
ref_rows_na = which(is.na(combined_df$count_asex))
sim_rows_0 = which(combined_df$sim_asexual_par_dens_freq < 0.0001)
if(all(ref_rows_na %in% sim_rows_0)){
combined_df = combined_df[-ref_rows_na,]
} else{
warning('There may be a mismatch in the age bins from the reference data and simulation data for at least one site. No rows were removed.')
}

# multinomial draw: likelihood of getting the observed distribution of parasite densities assuming the simulations show the true population-level frequencies
# iterate through groups of month-age-site
loglik_df = data.frame('site_month'= c(), 'loglikelihood_asex' = c(), 'loglikelihood_gamet' = c())
site_months = unique(combined_df$site_month)
for (ss in site_months){
loglikelihood_asex = 0
loglikelihood_gamet = 0
cur_agebins = unique(combined_df$agebin[combined_df$site_month == ss])
for (aa in cur_agebins){
cur_df = combined_df[combined_df$site_month == ss & combined_df$agebin == aa,]
# check that the sum of counts matches the sum column
if(sum(cur_df$count_asex) == cur_df$bin_total_asex[1] & sum(cur_df$count_gamet) == cur_df$bin_total_gamet[1] & length(unique(cur_df$bin_total_asex))==1 & length(unique(cur_df$bin_total_gamet))==1){
loglikelihood_asex = loglikelihood_asex + log(dmultinom(x=cur_df$count_asex, prob=cur_df$sim_asexual_par_dens_freq))
loglikelihood_gamet = loglikelihood_gamet + log(dmultinom(x=cur_df$count_gamet, prob=cur_df$sim_gametocyte_dens_freq))
} else{
warning(paste0('The sum of individuals across bins in the reference dataset does not match the reported total number of individuals included. This site and age bin is being skipped: ', ss, ' - ', aa))
}
}
loglik_df = rbind(loglik_df, data.frame('site_month'= ss, 'loglikelihood_asex' = loglikelihood_asex, 'loglikelihood_gamet' = loglikelihood_gamet))
}
return(loglik_df)
# create scatter plots with new versus benchmark simulation output and get comparison of likelihoods for each simulation set and each site.
compare_benchmark = function(combined_df){
combined_df = combined_df[!is.na(combined_df$benchmark_value),]
metric = combined_df$metric[1]
if('site_month' %in% colnames(combined_df)) combined_df$Site = combined_df$site_month
min_value = min(c(combined_df$benchmark_value, combined_df$simulation_value), na.rm=TRUE)
max_value = max(c(combined_df$benchmark_value, combined_df$simulation_value), na.rm=TRUE)
gg = ggplot(combined_df, aes(x=benchmark_value, y=simulation_value, color=Site, fill=Site))+
geom_point(size=2) +
ylab(paste0('new simulation ', metric)) +
xlab(paste0('benchmark sim ', metric)) +
ggtitle('Benchmark versus new sim values') +
coord_fixed(ratio=1, xlim=c(min_value, max_value), ylim=c(min_value, max_value))+
geom_abline(slope=1,intercept=0, color='grey', alpha=0.5) +
theme_classic() +
theme(plot.title = element_text(size=12))
return(gg)
}








Loading

0 comments on commit 2140441

Please sign in to comment.