Adding seed for reproducibility and sampling methods #344

Open

wants to merge 53 commits into base: 1.7.0

Changes from 50 commits

Commits (53)
e4d8871
sampling and seed
rwilfong Jul 31, 2024
22b0318
now it runs
stewarthe6 Jul 19, 2024
30ea360
kfold changes
stewarthe6 Jul 19, 2024
dc1f7c4
seed test
rwilfong Jul 31, 2024
7b13967
ruff linter suggestions
rwilfong Jul 31, 2024
6fb1c62
updated kfoldregression
rwilfong Aug 1, 2024
480e5f1
Merge remote-tracking branch 'upstream/1.7.0' into 1.7.0
stewarthe6 Sep 11, 2024
fc24463
added imblearn to pip requirements
stewarthe6 Sep 11, 2024
561c3bb
unpin imblearn
stewarthe6 Sep 11, 2024
49dc67b
Clean up unused random_state or seed parameters or assignments.
stewarthe6 Sep 11, 2024
b41b7d5
fixed merging error
stewarthe6 Sep 11, 2024
b65ba09
Fixed find and replace bug
stewarthe6 Sep 11, 2024
84babd2
make_dc_model does not need random_state or seed arguments
stewarthe6 Sep 11, 2024
ecf23bd
fhnew changes
rwilfong Sep 12, 2024
a821f6a
Changed constructor of ProductionSplitter to call Splitting's init fu…
stewarthe6 Sep 12, 2024
319b2f0
resolving errors
rwilfong Sep 12, 2024
31f3d5f
removed heads
rwilfong Sep 12, 2024
d074f65
removed unused library
rwilfong Sep 12, 2024
b0ecc05
Merge remote-tracking branch 'upstream/1.7.0' into 1.7.0
stewarthe6 Sep 12, 2024
2992bdf
Added more models for seeding test.
stewarthe6 Sep 12, 2024
ccebaed
Fixed seed for GCNModel. Should pass regularly now.
stewarthe6 Sep 12, 2024
dcc4809
Set seed to guarantee results in class_config_delaney_fit_nn_ecfp.json
stewarthe6 Sep 12, 2024
922bf0c
Moved 'test' from suffix to prefix
stewarthe6 Sep 18, 2024
82838d1
Renamed these test files to start with test_ so they're caught by the…
stewarthe6 Sep 19, 2024
4e471cb
Changed MultitaskScaffoldSplit and GeneticAlgorithm to use a Generate…
stewarthe6 Sep 19, 2024
baa5478
Added test for MTSS seed and fixed a few cases where the wrong random …
stewarthe6 Sep 19, 2024
4eb4ee4
renamed this file to match what's in test_seed_splitting.py
stewarthe6 Sep 19, 2024
4588a9d
renamed this to match the test
stewarthe6 Sep 19, 2024
ff58d02
Removed try except blocks in test code. We need to see these errors
stewarthe6 Sep 24, 2024
0028ed7
Added seed to this test so that it passes more consistently
stewarthe6 Sep 24, 2024
0c83b6b
combined_training_data now accounts for synthetic datasets
stewarthe6 Sep 24, 2024
ada3ea8
accept changes
rwilfong Sep 24, 2024
4dd5d99
integrate changes
rwilfong Sep 24, 2024
0a616b2
set uncertainty false for classification test since it is unsupported…
stewarthe6 Sep 24, 2024
16c2a4a
update branch: Merge branch '1.7.0' of https://github.com/rwilfong/AMPL…
rwilfong Sep 25, 2024
c3b1922
updated tests
rwilfong Sep 25, 2024
f2a30a9
resolve errors
rwilfong Sep 25, 2024
410f03d
Added seed to test_balancing_transformer for more consistent outputs
stewarthe6 Sep 25, 2024
f247893
added a test to make sure that multitask problems don't work with SMOTE
stewarthe6 Sep 25, 2024
2e03fef
Used parameter to determine if SMOTE or undersampling is being used
stewarthe6 Sep 25, 2024
b48ed02
Added a seed to this test for more consistent results
stewarthe6 Sep 25, 2024
567264a
Changed balancing transformer to just check to see if the weights cha…
stewarthe6 Sep 26, 2024
627cc20
Set the seed to make sure the number of positive and negative compoun…
stewarthe6 Sep 26, 2024
8decc0e
Removed unnecessary loop and printed out results from the perf_data test
stewarthe6 Sep 30, 2024
317cc29
accumulate_preds ignores the id parameter for SimpleRegressionPerfDat…
stewarthe6 Sep 30, 2024
5055889
the positive and negative counts are inconsistent, instead just check…
stewarthe6 Sep 30, 2024
6d0abbd
Merge branch 'ATOMScience-org:1.7.0' into 1.7.0
stewarthe6 Oct 2, 2024
16d50f8
Undo transformations before calculating mean and std of predictions
stewarthe6 Oct 28, 2024
3e58819
Merge branch '1.7.0' of github.com:rwilfong/AMPL into 1.7.0
stewarthe6 Oct 28, 2024
0280941
Removed pdb imports
stewarthe6 Oct 28, 2024
a4c2b83
Updated help for 'seed' input
stewarthe6 Nov 27, 2024
8e29047
Removed commented out seed
stewarthe6 Nov 27, 2024
268ba05
model_retrain has an option to either keep or discard the saved seed.…
stewarthe6 Nov 27, 2024
32 changes: 32 additions & 0 deletions atomsci/ddm/docs/PARAMETERS.md
@@ -276,6 +276,14 @@ The AMPL pipeline contains many parameters and options to fit models and make predictions
|*Description:*|True/False flag for setting verbosity|
|*Default:*|FALSE|
|*Type:*|Bool|

- **seed**

|||
|-|-|
|*Description:*|Seed used to initialize the random number generator so that results are reproducible. Default is None, in which case a random seed is generated.|
|*Default:*|None|
|*Type:*|int|

- **production**

@@ -529,6 +537,30 @@ the model will train for max_epochs regardless of validation error.|
|*Default:*|scaffold|
|*Type:*|str|

- **sampling_method**

|||
|-|-|
|*Description:*|The sampling method for addressing class imbalance in classification datasets. Options include 'undersampling' and 'SMOTE'.|
|*Default:*|None|
|*Type:*|str|

- **sampling_ratio**

|||
|-|-|
|*Description:*|The desired ratio of the minority class to the majority class after sampling. May be given as a string (e.g., 'minority', 'not minority') or as a float (e.g., 0.2, 1.0).|
|*Default:*|auto|
|*Type:*|str|

- **sampling_k_neighbors**

|||
|-|-|
|*Description:*|The number of nearest neighbors to consider when generating synthetic samples (e.g., 5, 7, 9). Used only with the SMOTE sampling method.|
|*Default:*|5|
|*Type:*|int|

- **mtss\_num\_super\_scaffolds**

|||
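The new options above can be combined in a single pipeline configuration. Below is a minimal illustrative sketch, not taken from this PR: the file name and the keys other than seed, sampling_method, sampling_ratio, and sampling_k_neighbors are assumptions about a typical AMPL JSON config, written here as the Python dict that would be serialized.

import json

# Hypothetical config fragment exercising the parameters documented above.
# Only seed and the sampling_* keys come from this PR; the rest are assumed.
config = {
    "prediction_type": "classification",
    "splitter": "scaffold",
    "seed": 42,                   # fixed seed -> reproducible split and training
    "sampling_method": "SMOTE",   # or "undersampling"
    "sampling_ratio": "auto",     # str ('minority', 'not minority') or float (e.g. 0.2)
    "sampling_k_neighbors": 5,    # only consulted when sampling_method == "SMOTE"
}

with open("example_seeded_smote_config.json", "w") as fp:
    json.dump(config, fp, indent=2)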
25 changes: 16 additions & 9 deletions atomsci/ddm/pipeline/GeneticAlgorithm.py
@@ -1,10 +1,10 @@
import numpy as np
import uuid
import scipy.spatial.distance as scipy_distance
import multiprocessing
import random
from tqdm import tqdm
import timeit
from typing import Any, Callable, List, Tuple
from typing import Any, Callable, List, Tuple, Optional

N_PROCS = multiprocessing.cpu_count()

@@ -22,7 +22,8 @@ def __init__(self,
init_pop: List[List[Any]],
fitness_func: Callable,
crossover_func: Callable,
mutate_func: Callable):
mutate_func: Callable,
seed: Optional[int]):
"""
Creates a GeneticAlgorithm object

@@ -40,8 +41,14 @@ mutate_func: Callable
mutate_func: Callable
A callable that takes a list of chromosomes and returns another list of mutated
chromosomes
seed: Optional[int]
Seed for random number generator
"""

if seed is None:
seed = uuid.uuid4().int % (2**32)
self.random_state = np.random.default_rng(seed)

self.pop = init_pop
self.pop_scores = None
self.num_pop = len(init_pop)
@@ -177,13 +184,13 @@ def step(self, print_timings: bool = False):

# select parents using rank selection
i = timeit.default_timer()
new_pop = self.crossover_func(parents, self.num_pop)
new_pop = self.crossover_func(parents, self.num_pop, random_state=self.random_state)
if print_timings:
print('\tcrossover %0.2f min'%((timeit.default_timer()-i)/60))

# mutate population
i = timeit.default_timer()
self.pop = self.mutate_func(new_pop)
self.pop = self.mutate_func(new_pop, random_state=self.random_state)
if print_timings:
print('\tmutate %0.2f min'%((timeit.default_timer()-i)/60))
print('total %0.2f min'%((timeit.default_timer()-start)/60))
@@ -199,23 +206,23 @@ def step(self, print_timings: bool = False):
def fitness_func(chromosome):
return 1 - scipy_distance.rogerstanimoto(chromosome, target_chromosome)

def crossover_func(parents, pop_size):
def crossover_func(parents, pop_size, random_state):
new_pop = []
for i in range(num_pop):
parent1 = parents[i%len(parents)]
parent2 = parents[(i+1)%len(parents)]

crossover_point = random.randint(0, len(parents[0])-1)
crossover_point = random_state.integers(0, len(parents[0])-1, 1)[0]
new_pop.append(parent1[:crossover_point]+parent2[crossover_point:])

return new_pop

def mutate_func(pop, mutate_chance=0.01):
def mutate_func(pop, random_state, mutate_chance=0.01):
new_pop = []
for chromosome in pop:
new_chromosome = list(chromosome)
for i, g in enumerate(new_chromosome):
if random.random() < mutate_chance:
if random_state.random() < mutate_chance:
if new_chromosome[i] == 0:
new_chromosome[i] = 1
else:
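The key change above is that GeneticAlgorithm now owns a single np.random.Generator, seeded either by the caller or by a UUID-derived fallback, and passes it to the crossover and mutation callables. A stand-alone sketch of that pattern follows; the helper names are invented for the example and are not part of the PR.

import uuid
import numpy as np

def make_rng(seed=None):
    # Mirrors the fallback in GeneticAlgorithm.__init__: derive a 32-bit seed
    # from a UUID when the caller does not supply one.
    if seed is None:
        seed = uuid.uuid4().int % (2**32)
    return np.random.default_rng(seed)

def crossover_point(rng, chromosome_len):
    # Same call pattern as the patched crossover_func above.
    return int(rng.integers(0, chromosome_len - 1, 1)[0])

# Two generators built from the same seed reproduce the same crossover points,
# which is what makes seeded runs repeatable.
rng_a, rng_b = make_rng(42), make_rng(42)
assert [crossover_point(rng_a, 100) for _ in range(5)] == [crossover_point(rng_b, 100) for _ in range(5)]

# Omitting the seed still works; the run is just no longer reproducible.
_ = crossover_point(make_rng(), 100)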
27 changes: 17 additions & 10 deletions atomsci/ddm/pipeline/MultitaskScaffoldSplit.py
@@ -1,6 +1,5 @@
import argparse
import logging
import random
import timeit
import tempfile
from typing import List, Optional, Set, Tuple
@@ -636,8 +635,8 @@ def split(self,
A tuple with 3 elements that are training, validation, and test compound
indices into dataset, respectively
"""
if seed is not None:
np.random.seed(seed)
self.seed = seed

self.dataset = dataset
self.diff_fitness_weight_tvt = diff_fitness_weight_tvt
self.diff_fitness_weight_tvv = diff_fitness_weight_tvv
@@ -674,7 +673,7 @@ def split(self,
population.append(split_chromosome)

gene_alg = ga.GeneticAlgorithm(population, self.grade, ga_crossover,
ga_mutate)
ga_mutate, self.seed)
#gene_alg.iterate(num_generations)
for i in range(self.num_generations):
gene_alg.step(print_timings=print_timings)
@@ -859,7 +858,8 @@ def train_valid_test_split(self,
return train_dataset, valid_dataset, test_dataset

def ga_crossover(parents: List[List[str]],
num_pop: int) -> List[List[str]]:
num_pop: int,
random_state: np.random.Generator) -> List[List[str]]:
"""Create the next generation from parents

A random index is chosen and genes up to that index from
@@ -872,6 +872,8 @@ def ga_crossover(parents: List[List[str]],
A list of chromosomes.
num_pop: int
The number of new chromosomes to make
random_state: np.random.Generator
Random number generator
Returns
-------
List[List[str]]
@@ -883,13 +885,14 @@ def ga_crossover(parents: List[List[str]],
parent1 = parents[i%len(parents)]
parent2 = parents[(i+1)%len(parents)]

crossover_point = random.randint(0, len(parents[0])-1)
crossover_point = random_state.integers(low=0, high=len(parents[0])-1, size=1)[0]
new_pop.append(parent1[:crossover_point]+parent2[crossover_point:])

return new_pop

def ga_mutate(new_pop: List[List[str]],
mutation_rate: float = .02) -> List[List[str]]:
random_state: np.random.Generator,
mutation_rate: float = .02,) -> List[List[str]]:
"""Mutate the population

Each chromosome is copied and mutated at mutation_rate.
@@ -900,6 +903,8 @@ def ga_mutate(new_pop: List[List[str]],
----------
new_pop: List[List[str]]
A list of chromosomes.
random_state: np.random.Generator
Random number generator
mutation_rate: float
How often a mutation occurs. 0.02 is a good rate for
my test sets.
@@ -912,8 +917,8 @@ def ga_mutate(new_pop: List[List[str]],
for solution in new_pop:
new_solution = list(solution)
for i, gene in enumerate(new_solution):
if random.random() < mutation_rate:
new_solution[i] = ['train', 'valid', 'test'][random.randint(0,2)]
if random_state.random() < mutation_rate:
new_solution[i] = ['train', 'valid', 'test'][random_state.integers(low=0, high=3, size=1)[0]]
mutated.append(new_solution)

return mutated
@@ -1039,6 +1044,7 @@ def parse_args():
parser.add_argument('id_col', type=str, help='the column containing ids')
parser.add_argument('response_cols', type=str, help='comma seperated string of response columns')
parser.add_argument('output', type=str, help='name of the split file')
parser.add_argument('seed', type=int, default=0, help='random seed used to make the split reproducible')

return parser.parse_args()

@@ -1054,5 +1060,6 @@ def parse_args():
mss = MultitaskScaffoldSplitter()
mss_split_df = split_with(total_df, mss,
smiles_col=args.smiles_col, id_col=args.id_col, response_cols=response_cols,
diff_fitness_weight=dfw, ratio_fitness_weight=rfw, num_generations=args.num_gens)
diff_fitness_weight=dfw, ratio_fitness_weight=rfw, num_generations=args.num_gens,
seed=args.seed)
mss_split_df.to_csv(args.output, index=False)
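ga_crossover and ga_mutate now draw from the np.random.Generator handed to them instead of the module-level random state, so a seeded MultitaskScaffoldSplitter run is repeatable end to end. A self-contained sketch of the mutation side follows; the names are local to this example, while the real functions operate on train/valid/test chromosomes exactly as shown in the diff above.

from typing import List
import numpy as np

def mutate(pop: List[List[str]], rng: np.random.Generator,
           mutation_rate: float = 0.02) -> List[List[str]]:
    # Same idea as ga_mutate: copy each chromosome and reassign genes at
    # mutation_rate, drawing every random number from the shared generator.
    subsets = ['train', 'valid', 'test']
    mutated = []
    for chromosome in pop:
        new_chromosome = list(chromosome)
        for i in range(len(new_chromosome)):
            if rng.random() < mutation_rate:
                new_chromosome[i] = subsets[int(rng.integers(0, len(subsets)))]
        mutated.append(new_chromosome)
    return mutated

pop = [['train'] * 20, ['valid'] * 20]
# Identical seeds give identical mutations; that determinism is the point of the change.
assert mutate(pop, np.random.default_rng(7)) == mutate(pop, np.random.default_rng(7))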
37 changes: 32 additions & 5 deletions atomsci/ddm/pipeline/model_datasets.py
@@ -433,7 +433,7 @@ def get_dataset_tasks(self, dset_df):
return self.tasks is not None

# ****************************************************************************************
def split_dataset(self):
def split_dataset(self, random_state=None, seed=None):
"""Splits the dataset into paired training/validation and test subsets, according to the split strategy
selected by the model params. For traditional train/valid/test splits, there is only one training/validation
pair. For k-fold cross-validation splits, there are k different train/valid pairs; the validation sets are
@@ -452,7 +452,7 @@ def split_dataset(self):

# Create object to delegate splitting to.
if self.splitting is None:
self.splitting = split.create_splitting(self.params)
self.splitting = split.create_splitting(self.params, random_state=random_state, seed=seed)
self.train_valid_dsets, self.test_dset, self.train_valid_attr, self.test_attr = \
self.splitting.split_dataset(self.dataset, self.attr, self.params.smiles_col)
if self.train_valid_dsets is None:
@@ -479,6 +479,12 @@ def _check_classes(self):
(Boolean): boolean specifying if all classes are specified in all splits
"""
ref_class_set = get_classes(self.train_valid_dsets[0][0].y)
num_classes = len(ref_class_set)
if num_classes != self.params.class_number:
logger = logging.getLogger('ATOM')
logger.warning(f"Expected class_number:{self.params.class_number} "
f"classes but got {num_classes} instead. Double check "
"response columns or class_number parameter.")
for train, valid in self.train_valid_dsets:
if not ref_class_set == get_classes(train.y):
return False
@@ -563,7 +569,7 @@ def create_dataset_split_table(self):
return split_df

# ****************************************************************************************
def load_presplit_dataset(self, directory=None):
def load_presplit_dataset(self, directory=None, random_state=None, seed=None):
"""Loads a table of compound IDs assigned to split subsets, and uses them to split
the currently loaded featurized dataset.

Expand All @@ -590,7 +596,7 @@ def load_presplit_dataset(self, directory=None):
"""

# Load the split table from the datastore or filesystem
self.splitting = split.create_splitting(self.params)
self.splitting = split.create_splitting(self.params, random_state=random_state, seed=seed)

try:
split_df, split_kv = self.load_dataset_split_table(directory)
@@ -655,11 +661,31 @@ def combined_training_data(self):
# All of the splits have the same combined train/valid data, regardless of whether we're using
# k-fold or train/valid/test splitting.
if self.combined_train_valid_data is None:
# normally combining one fold is sufficient, but if SMOTE or undersampling is being used
# just combining the first fold isn't enough
Review comment (Collaborator): For undersampling, it looks like it assumes that K-fold undersampling would sample the entire non-test dataset. What if this isn't the case? Is this assumption ensured elsewhere?

Reply (Collaborator): I don't think a compound can ever be wiped entirely out of existence due to undersampling. Undersampling is only applied to the training set of each fold, and since every compound has a 'turn' in the validation set, that compound must appear at least once. This isn't tested, but do we need to test it anywhere? I think it's ok if a compound is dropped entirely, since that's what happens when using undersampling without k-fold validation.

(train, valid) = self.train_valid_dsets[0]
combined_X = np.concatenate((train.X, valid.X), axis=0)
combined_y = np.concatenate((train.y, valid.y), axis=0)
combined_w = np.concatenate((train.w, valid.w), axis=0)
combined_ids = np.concatenate((train.ids, valid.ids))

if self.params.sampling_method=='SMOTE' or self.params.sampling_method=='undersampling':
# for each successive fold, merge in any new compounds
# this loop just won't run if there are no additional folds
for train, valid in self.train_valid_dsets[1:]:
fold_ids = np.concatenate((train.ids, valid.ids))
new_id_indexes = [i for i, cid in enumerate(fold_ids) if cid not in combined_ids]

fold_ids = fold_ids[new_id_indexes]
fold_X = np.concatenate((train.X, valid.X), axis=0)[new_id_indexes]
fold_y = np.concatenate((train.y, valid.y), axis=0)[new_id_indexes]
fold_w = np.concatenate((train.w, valid.w), axis=0)[new_id_indexes]

combined_X = np.concatenate((combined_X, fold_X), axis=0)
combined_y = np.concatenate((combined_y, fold_y), axis=0)
combined_w = np.concatenate((combined_w, fold_w), axis=0)
combined_ids = np.concatenate((combined_ids, fold_ids))

self.combined_train_valid_data = NumpyDataset(combined_X, combined_y, w=combined_w, ids=combined_ids)
return self.combined_train_valid_data

@@ -697,7 +723,8 @@ def get_subset_responses_and_weights(self, subset, transformers):
"""
if subset not in self.subset_response_dict:
if subset in ('train', 'valid', 'train_valid'):
dataset = self.combined_training_data()
for fold, (train, valid) in enumerate(self.train_valid_dsets):
Review comment (Collaborator): If you are just looking for new compounds in each fold, can you just concatenate all train_valid_dsets and then call set(ids) or drop_duplicates() or something? Might make the code more efficient than multiple for loops, but I'm not sure if it is actually easier based on the way the datasets are stored.

Reply (Collaborator): These datasets are NumpyDatasets and contain an X matrix, y matrix, w matrix, and ids. I'd have to put them into a data frame, call drop_duplicates, and then put it back into a NumpyDataset. However, I think I can get rid of that loop on 726. That's not necessary.

dataset = self.combined_training_data()
elif subset == 'test':
dataset = self.test_dset
else:
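When SMOTE or undersampling is enabled, the k folds no longer share an identical train+valid union, so combined_training_data walks the remaining folds and appends only compounds whose ids have not been seen yet; as the review thread above notes, a compound removed from one fold's training set still appears in the fold where it sits in the validation set. A stand-alone sketch of that merge, using plain numpy arrays rather than DeepChem NumpyDataset objects and omitting weights for brevity:

import numpy as np

def merge_folds(folds):
    # folds: list of (X, y, ids) tuples, one per train/valid pair.
    # Keep each compound id exactly once, mirroring the loop added to
    # combined_training_data above.
    X, y, ids = folds[0]
    for fold_X, fold_y, fold_ids in folds[1:]:
        seen = set(ids)
        new_idx = [i for i, cid in enumerate(fold_ids) if cid not in seen]
        X = np.concatenate((X, fold_X[new_idx]), axis=0)
        y = np.concatenate((y, fold_y[new_idx]), axis=0)
        ids = np.concatenate((ids, fold_ids[new_idx]))
    return X, y, ids

# Fold 2 re-samples compound "b" and introduces "d"; only "d" is appended.
fold1 = (np.ones((3, 2)), np.zeros(3), np.array(["a", "b", "c"]))
fold2 = (np.ones((2, 2)), np.zeros(2), np.array(["b", "d"]))
_, _, merged_ids = merge_folds([fold1, fold2])
assert list(merged_ids) == ["a", "b", "c", "d"]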