Merge pull request #53 from YeChen-IDM/modular_structure

Modular structure
InstituteforDiseaseModeling · Jan 4, 2023 · 31262a7 · 31262a7
2 parents d363687 + 5b757d8
commit 31262a7
Show file tree

Hide file tree

Showing 13 changed files with 581 additions and 428 deletions.
diff --git a/README.md b/README.md
@@ -5,13 +5,12 @@
 **Table of Contents**
 
 - [User Installation](#user-installation)
-  - [Developer Installation](#developer-installation)
-  - [Regular Installation](#regular-installation)
+  - [Installation](#installation)
   - [Pre-requisites](#pre-requisites)
-- [Run Simulations](#run-simulations)
+- [Run Workflow](#run-workflow)
   - [Login to Comps](#login-to-comps)
-  - [Option 1_Run One Site with Python Scripts](#option-1_run-one-site-with-python-scripts)
-  - [Option 2_Run all Sites with Snakemake (Recommended)](#option-2_run-all-sites-with-snakemake-recommended)
+  - [Option 1: Run all Sites with Snakemake](#option-1-run-all-sites-with-snakemake)
+  - [Option 2: Run Sites in Certain Subset(s) with Snakemake](#option-2-run-sites-in-certain-subsets-with-snakemake)
     - [How to Change Default Setting](#how-to-change-default-setting)
     - [Snakemake Tips](#snakemake-tips)
 - [Check Plots and Final Report](#check-plots-and-final-report)
@@ -25,8 +24,8 @@ Note: we recommend upgrading pip to the latest version before installation:
 pip install --upgrade pip
 ```
 
-## Developer Installation
-If you want to install this project in a editable mode('develop' mode), run the following command: 
+## Installation
+Run the following command (it installs this project in editable/'develop' mode):
 ```bash
 pip install -e . -r requirements.txt
 ```
@@ -35,13 +34,6 @@ or you can run
 pip install -e . -i https://packages.idmod.org/api/pypi/pypi-production/simple
 ```
 
-## Regular Installation
-```bash
-pip install wheel
-python3 setup.py bdist_wheel
-pip install dist/{update-this-with-the-wheel-file-name}.whl --index-url=https://packages.idmod.org/api/pypi/pypi-production/simple
-```
-
 ## Pre-requisites
 - Python 3.9 x64
 
@@ -60,7 +52,7 @@ The conflict is caused by:
 Idmtools depends on an old pandas version; we have opened a ticket to update this dependency in idmtools and are waiting for a patch release: https://github.com/InstituteforDiseaseModeling/idmtools/issues/1774.
 For now, you can work around this error by removing 'datar' from our requirements.txt and installing it with "pip install datar" after installing our package.
 
-# Run Simulations
+# Run Workflow
 
 ## Login to Comps
 If you haven't logged in to Comps for a while, you can run the following script to log in and cache your credentials:
@@ -72,21 +64,22 @@ When you see "Password:" in the terminal, enter your comps password and hit ente
 ![alt text](./comps_login.PNG?raw=true)
 
 
-## Option 1_Run One Site with Python Scripts (Not recommended)
+## Option 1: Run all Sites with Snakemake
+Run the snakemake pipeline with all sites in simulation_coordinator.csv:
 ```bash
-cd simulations
-python3 run_sims.py -s {site_name} -n {nSims}
-python3 add_suite.py(optional step) 
-python3 run_analyzers.py -s {site_name}
-python3 download_wi.py -s {site_name}
+snakemake -j
 ```
 
-
-## Option 2_Run all Sites with Snakemake (Recommended)
-Run the whole pipeline with all sites in simulation_coordinator.csv:
+## Option 2: Run Sites in Certain Subset(s) with Snakemake
+Run the snakemake pipeline with sites in one or multiple subsets in simulation_coordinator.csv:
 ```bash
-snakemake -j
+snakemake --config -s="core_relationship" -j
 ```
+or 
+```bash
+snakemake --config -s="core_relationship, infection_duration" -j
+```
+
 
 ### How to Change Default Setting
 - Some details about our default setting:

diff --git a/create_plots/helpers_coordinate_each_relationship.py b/create_plots/helpers_coordinate_each_relationship.py
@@ -207,6 +207,9 @@ def generate_parasite_density_outputs(coord_csv, simulation_output_filepath, bas
     combined_df_asex = combined_dfs[0]
     combined_df_gamet = combined_dfs[1]
 
+    if combined_df_gamet.empty or combined_df_asex.empty:
+        return
+
     # todo: combine these 2 plotting block in to one function
     # asexual parasite density
     plot_output = plot_par_dens_ref_sim_comparison(combined_df=combined_df_asex)
@@ -287,6 +290,8 @@ def generate_infectiousness_outputs(coord_csv, simulation_output_filepath, base_
 
     combined_df = prepare_infect_df(coord_csv, simulation_output_filepath, base_reference_filepath,
                                     benchmark_simulation_filepath)
+    if combined_df.empty:
+        return
 
     plot_output = plot_infectiousness_ref_sim_comparison(combined_df)
     plot_list = plot_output[0]

diff --git a/create_plots/helpers_reformat_sim_ref_dfs.py b/create_plots/helpers_reformat_sim_ref_dfs.py
@@ -510,6 +510,8 @@ def prepare_dens_df(coord_csv, simulation_output_filepath, base_reference_filepa
                                                            relationship_name='age_parasite_density',
                                                            relationship_sim_filename='parasite_densities_by_age_month.csv')
 
+    if not available_sites:
+        return pd.DataFrame(), pd.DataFrame()
     # iterate through sites, grabbing relevant reference and simulation data to plot; combine data into a dataframe containing all sites
     sim_df = pd.DataFrame()
     bench_df = pd.DataFrame()
@@ -671,6 +673,8 @@ def prepare_infect_df(coord_csv, simulation_output_filepath, base_reference_file
                                                            relationship_name='infectiousness_to_mosquitos',
                                                            relationship_sim_filename='infectiousness_by_age_density_month.csv')
 
+    if not available_sites:
+        return pd.DataFrame()
     # iterate through sites, grabbing relevant reference and simulation data to plot; combine data into a dataframe containing all sites
     sim_df = pd.DataFrame()
     bench_df = pd.DataFrame()
@@ -785,42 +789,38 @@ def get_sim_survey(sim_dir, ref_df, seeds=None):
         sim_subset_full = pd.read_csv(file_path)
     else:
         # get first year of sampling in reference dataset. the simulation will be referenced from the first day of that year
-        # todo: need code review
-        # R code:
-        # first_ref_date = as.Date(paste0(year(min(ref_df$date, na.rm=TRUE)), '-01-01'))
-        first_ref_date = datetime.date(datetime.datetime.strptime(ref_df['date'].dropna().min(), "%Y-%m-%d").year, 1, 1)
+        first_ref_date = datetime.date(datetime.datetime.strptime(str(ref_df['date'].dropna().min()), "%Y-%m-%d %H:%M:%S").year, 1, 1)
         indIDs = ref_df['SID'].unique()
+        ref_df['date'] = ref_df['date'].apply(lambda x: x.date())
 
-        # todo:
-        # R code: sim_full = fread(paste0(sim_dir, '/patient_reports.csv'))
         patient_report_path = os.path.join(sim_dir, 'patient_reports.csv')
         sim_full = pd.read_csv(patient_report_path)
-        sim_full['date'] = first_ref_date + sim_full['simday']
-        sim_full['age'] = sim_full['age'] / 365
 
         if seeds is None:
             seeds = sim_full['Run_Number'].unique()
 
+        sim_subset_full = pd.DataFrame()
         for seed in sorted(seeds):
-            print('Currently on seed ' + seed)
+            print('Currently on seed ' + str(seed))
             sim = sim_full[sim_full['Run_Number'] == seed]  # subset to desired run
+            sim['date'] = [first_ref_date + datetime.timedelta(days=int(simday)) for simday in sim['simday']]
+            sim['age'] = sim['age'] / 365
             # track which individuals have already been included from the simulation (
             # to avoid double-sampling simulation individuals)
-            included_ids = list()
-            # todo: you are using sim_subset = data.table() while I am using Pandas dataframe
+            included_ids = set()
             sim_subset = pd.DataFrame()
             for ii in range(len(indIDs)):
                 if ii % 50 == 0:
-                    print('Currently on individual ' + ii + ' out of ', len(indIDs))
+                    print('Currently on individual ' + str(ii) + ' out of ', len(indIDs))
                 ref_df_cur = ref_df[ref_df['SID'] == indIDs[ii]]
                 ref_df_cur = ref_df_cur.sort_values(by='date')
                 # find a matching individual
                 age_cur = ref_df_cur['age'].iloc[0]
                 day_cur = ref_df_cur['date'].iloc[0]
 
                 # use age-specific matches
-                id_candidates = sim[(sim['date'] == day_cur) & (round(sim['age'] == round(age_cur)))]['id']
-                id_candidates = [idx not in included_ids for idx in id_candidates]
+                id_candidates = sim[(sim['date'] == day_cur) & (round(sim['age']) == round(age_cur))]['id'].tolist()
+                id_candidates = [idx for idx in id_candidates if idx not in included_ids]
                 # if no perfect age-match remain, expand year-range until finding a match
                 if len(id_candidates) == 0:
                     year_range = 0
@@ -831,27 +831,26 @@ def get_sim_survey(sim_dir, ref_df, seeds=None):
                         # id_candidates = sim$id[intersect(which(sim$date == day_cur), which(round(sim$age) % in % seq((round(age_cur)-year_range), (round(age_cur)+year_range))))]
                         id_candidates = sim[(sim['date'] == day_cur)
                                             & (sim['age'].round().isin(range((round(age_cur)-year_range),
-                                                                             round(age_cur)+year_range)))]['id']
-                        id_candidates = [idx not in included_ids for idx in id_candidates]
+                                                                              round(age_cur)+year_range)))]['id'].tolist()
+                        id_candidates = [idx for idx in id_candidates if idx not in included_ids]
 
                     if len(id_candidates) == 0:
                         print('Problem: no age-matched simulation individual found for reference id: ' + indIDs[ii])
                     else:
                         print('No exact age match remaining for reference id: ' + indIDs[ii]
                               + '. Used simulation individual within ', year_range, ' years.')
 
-                id_sim_cur = random.sample(id_candidates, 1)  # todo: should we remove this id after drawing?
-                included_ids.extend(id_sim_cur)
+                id_sim_cur = random.sample(id_candidates, 1)[0]  # todo: should we remove this id after drawing?
+                included_ids.add(id_sim_cur)
 
                 # keep the same simulation dates as the reference samples for this individual
                 sim_subset_cur = sim[(sim['id'] == id_sim_cur) & (sim['date'].isin(ref_df_cur['date']))]
                 sim_subset = pd.concat([sim_subset, sim_subset_cur])
 
             sim_subset['seed'] = seed
-            if seed == sorted(seeds)[0]:
+            if sim_subset_full.empty:
                 sim_subset_full = sim_subset
             else:
-                # todo: sim_subset_full maybe referenced before assignment
                 sim_subset_full = pd.concat([sim_subset_full, sim_subset])
 
         # rename simulation columns to match reference data

diff --git a/create_plots/run_generate_validation_comparisons_site.py b/create_plots/run_generate_validation_comparisons_site.py
@@ -13,11 +13,14 @@
     generate_age_infection_duration_outputs
 from datetime import datetime
 import shutil
+import argparse
+import os
 
 
-def run():
+def run(subset="All"):
     # read in data and create plots
     coord_csv = load_coordinator_df(set_index=False)
+    print(f"plotting with subset = {subset}.")
     if plot_output_filepath.is_dir():
         date, time = datetime.now().strftime("%d-%m-%Y %H-%M-%S").split(' ')
         plot_output_bak_filepath = plot_output_filepath.parent / (str(plot_output_filepath.name) + f'_{date}_{time}_backup')
@@ -32,46 +35,56 @@ def run():
     else:
         print(f"Folder {plot_output_filepath} was created")
 
-    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
-    #                         age - incidence                         #
-    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
-    generate_age_incidence_outputs(coord_csv, simulation_output_filepath, base_reference_filepath, plot_output_filepath,
-                                   benchmark_simulation_filepath=benchmark_simulation_filepath)
+    if subset.lower() == "all" or "core_relationship" in subset.lower():
 
-    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
-    #                         age - prevalence                        #
-    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
-    generate_age_prevalence_outputs(coord_csv, simulation_output_filepath, base_reference_filepath, plot_output_filepath,
-                                    benchmark_simulation_filepath=benchmark_simulation_filepath)
+        # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
+        #                         age - incidence                         #
+        # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
+        generate_age_incidence_outputs(coord_csv, simulation_output_filepath, base_reference_filepath, plot_output_filepath,
+                                       benchmark_simulation_filepath=benchmark_simulation_filepath)
 
-    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
-    #                      age - parasite density                     #
-    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
-    generate_parasite_density_outputs(coord_csv, simulation_output_filepath, base_reference_filepath, plot_output_filepath,
-                                      benchmark_simulation_filepath=benchmark_simulation_filepath)
+        # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
+        #                         age - prevalence                        #
+        # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
+        generate_age_prevalence_outputs(coord_csv, simulation_output_filepath, base_reference_filepath, plot_output_filepath,
+                                        benchmark_simulation_filepath=benchmark_simulation_filepath)
 
-    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
-    #                   infectiousness to vectors                        #
-    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
-    generate_infectiousness_outputs(coord_csv, simulation_output_filepath, base_reference_filepath, plot_output_filepath,
-                                    benchmark_simulation_filepath=benchmark_simulation_filepath)
+        # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
+        #                      age - parasite density                     #
+        # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
+        generate_parasite_density_outputs(coord_csv, simulation_output_filepath, base_reference_filepath, plot_output_filepath,
+                                           benchmark_simulation_filepath=benchmark_simulation_filepath)
 
-    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
-    #                    age - infection duration                     #
-    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
-    # set positive threshold density for sampled parasites in simulation output (to match PCR threshold in reference)
-    pos_thresh_dens = 0.5  # Note: from the reference dataset, the smallest positive density was 39. (39=min(ref_df$DENSITY[ref_df$DENSITY>0], na.rm=TRUE) - 1)
-    # specify binning for duration of infection
-    duration_bins = list(range(0, 400, 50))
-    duration_bins.append(500)
-    generate_age_infection_duration_outputs(coord_csv, simulation_output_filepath, base_reference_filepath,
-                                            plot_output_filepath, pos_thresh_dens, duration_bins,
-                                            benchmark_simulation_filepath=benchmark_simulation_filepath)
+        # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
+        #                   infectiousness to vectors                        #
+        # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
+        generate_infectiousness_outputs(coord_csv, simulation_output_filepath, base_reference_filepath, plot_output_filepath,
+                                        benchmark_simulation_filepath=benchmark_simulation_filepath)
+
+    if subset.lower() == "all" or "infection_duration" in subset.lower():
+        # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
+        #                    age - infection duration                     #
+        # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
+        # set positive threshold density for sampled parasites in simulation output (to match PCR threshold in reference)
+        pos_thresh_dens = 0.5  # Note: from the reference dataset, the smallest positive density was 39. (39=min(ref_df$DENSITY[ref_df$DENSITY>0], na.rm=TRUE) - 1)
+        # specify binning for duration of infection
+        duration_bins = list(range(0, 400, 50))
+        duration_bins.append(500)
+        generate_age_infection_duration_outputs(coord_csv, simulation_output_filepath, base_reference_filepath,
+                                                plot_output_filepath, pos_thresh_dens, duration_bins,
+                                                benchmark_simulation_filepath=benchmark_simulation_filepath)
     # generate dummy file for snakemake plot rule.
+    if not os.path.isdir(comps_id_folder):
+        os.mkdir(comps_id_folder)
     with open(comps_id_folder + 'plot_completed', 'w') as file:
         file.write('Plotting is completed.')
 
 
 if __name__ == '__main__':
-    run()
+    parser = argparse.ArgumentParser(description='Process site name')
+    parser.add_argument('--subset', '-s', type=str, help='subset name(s)',
+                        default="All")
+
+    args = parser.parse_args()
+    run(subset=args.subset)