diff --git a/atomsci/ddm/pipeline/GeneticAlgorithm.py b/atomsci/ddm/pipeline/GeneticAlgorithm.py index 531b85ba..98611d77 100644 --- a/atomsci/ddm/pipeline/GeneticAlgorithm.py +++ b/atomsci/ddm/pipeline/GeneticAlgorithm.py @@ -49,12 +49,14 @@ def __init__(self, self.fitness_func = fitness_func self.crossover_func = crossover_func self.mutate_func = mutate_func - self.parallel_grade_population() + self.serial_grade_population() + #self.parallel_grade_population() - def parallel_grade_population(self): - """ Grade the population and save the scores - Updates the order of self.pop and self.pop_scores + def serial_grade_population(self): + """ Scores the chromosomes in the current population and sorts them in decreasing score order. + Saves the sorted scores in self.pop_scores. Not multithreaded; surprisingly, this runs faster + than the multithreaded function parallel_grade_scores. Parameters ---------- @@ -62,16 +64,37 @@ def parallel_grade_population(self): Returns ------- - Nothing + None. As a side effect, sorts the chromosomes in self.pop and updates the scores in self.pop_scores. + """ + fitnesses = [] + for chrom in self.pop: + fitnesses.append(self.fitness_func(chrom)) + pairs = sorted(zip(fitnesses, self.pop), reverse=True) + self.pop = [chrome for fitness, chrome in pairs] + self.pop_scores = [fitness for fitness, chrome in pairs] + + + def parallel_grade_population(self): + """ Scores the chromosomes in the current population and sorts them in decreasing score order. + Saves the sorted scores in self.pop_scores. + + Although this does the same thing in multiple threads as the single-threaded function + serial_grade_population, it seems to run much slower, at least for multitask scaffold splits + with 100 chromosomes. + + Parameters + ---------- + None + + Returns + ------- + None """ pool = multiprocessing.Pool(processes=N_PROCS) fitnesses = pool.map(self.fitness_func, self.pop) pool.close() pool.join() - pairs = list(zip(fitnesses, self.pop)) - - pairs.sort(key=lambda x: x[0], reverse=True) - + pairs = sorted(zip(fitnesses, self.pop), reverse=True) self.pop = [chrome for fitness, chrome in pairs] self.pop_scores = [fitness for fitness, chrome in pairs] @@ -90,10 +113,7 @@ def select_parents(self) -> List[List[Any]]: parents: List[List[Any]] A list of chromosomes that will be parents for the next generation """ - self.parallel_grade_population() - - parents = [chrome for chrome in self.pop[:self.num_parents]] - return parents + return self.pop[:self.num_parents] def iterate(self, num_generations: int): """ Iterates the genetic algorithm num_generations @@ -130,12 +150,13 @@ def step(self, print_timings: bool = False): start = timeit.default_timer() i = timeit.default_timer() + # select parents using rank selection parents = self.select_parents() if print_timings: print('\tfind parents %0.2f min'%((timeit.default_timer()-i)/60)) - # select parents using rank selection i = timeit.default_timer() + # Generate new population by crossing parent chromosomes new_pop = self.crossover_func(parents, self.num_pop) if print_timings: print('\tcrossover %0.2f min'%((timeit.default_timer()-i)/60)) @@ -143,6 +164,9 @@ def step(self, print_timings: bool = False): # mutate population i = timeit.default_timer() self.pop = self.mutate_func(new_pop) + # Compute scores for new chromosomes and sort population by score + self.serial_grade_population() + #self.parallel_grade_population() if print_timings: print('\tmutate %0.2f min'%((timeit.default_timer()-i)/60)) print('total %0.2f 
min'%((timeit.default_timer()-start)/60)) diff --git a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py index 67e2a54f..9a4fbb18 100644 --- a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py +++ b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py @@ -1,4 +1,5 @@ import pdb +import logging import argparse import random import timeit @@ -24,6 +25,9 @@ from atomsci.ddm.pipeline import dist_metrics from atomsci.ddm.pipeline import GeneticAlgorithm as ga +logging.basicConfig(format='%(asctime)-15s %(message)s') +logger = logging.getLogger('ATOM') + def _generate_scaffold_hists(scaffold_sets: List[np.ndarray], w: np.array) -> np.array: """Counts the number of labelled samples per task per scaffold @@ -96,8 +100,8 @@ def smush_small_scaffolds(scaffolds: List[Set[int]], else: new_scaffolds.append(scaffold) - print('new scaffold lengths') - print([len(s) for s in new_scaffolds]) + logger.debug('new scaffold lengths') + logger.debug([len(s) for s in new_scaffolds]) new_scaffolds = [np.array(list(s)) for s in new_scaffolds] return new_scaffolds @@ -133,7 +137,7 @@ def calc_ecfp(smiles: List[str], fprints = [y for x in ecfps for y in x] #Flatten results else: mols = [Chem.MolFromSmiles(s) for s in smiles] - fprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in mols] + fprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 3, 1024) for mol in mols] return fprints @@ -144,9 +148,9 @@ def dist_smiles_from_ecfp(ecfp1: List[rdkit.DataStructs.cDataStructs.ExplicitBit Parameters ---------- ecfp1: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect], - A list of ECFP finger prints + A list of ECFP fingerprints ecfp2: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect] - A list of ECFP finger prints + A list of ECFP fingerprints Returns ------- @@ -159,57 +163,6 @@ def dist_smiles_from_ecfp(ecfp1: List[rdkit.DataStructs.cDataStructs.ExplicitBit return cd.calc_summary(dist_metrics.tanimoto(ecfp1, ecfp2), calc_type='nearest', num_nearest=1, within_dset=False) -def _generate_scaffold_dist_matrix(scaffold_lists: List[np.ndarray], - ecfp_features: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect]) -> np.ndarray: - """Returns a nearest neighbors distance matrix between each scaffold. - - The distance between two scaffolds is defined as the - the distance between the two closest compounds between the two - scaffolds. - - TODO: Ask Stewart: Why did he change to using the median instead of the min? - - Parameters - ---------- - scaffold_lists: List[np.ndarray] - List of scaffolds. A scaffold is a set of indices into ecfp_features - ecfp_features: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect] - List of ecfp features, one for each compound in the dataset - - Returns - ------- - dist_mat: np.ndarray - Distance matrix, symmetric. 
- """ - print('start generating big dist mat') - start = timeit.default_timer() - # First compute the full matrix of distances between all pairs of ECFPs - dmat = dist_metrics.tanimoto(ecfp_features) - - mat_shape = (len(scaffold_lists), len(scaffold_lists)) - scaff_dist_mat = np.zeros(mat_shape) - for i, scaff1 in tqdm(enumerate(scaffold_lists)): - scaff1_rows = scaff1.reshape((-1,1)) - for j, scaff2 in enumerate(scaffold_lists[:i]): - if i == j: - continue - - dists = dmat[scaff1_rows, scaff2].flatten() - #min_dist = np.median(dists) - # ksm: Change back to min and see what happens - min_dist = np.min(dists) - - if min_dist==0: - print("two scaffolds match exactly?!?", i, j) - print(len(set(scaff2).intersection(set(scaff1)))) - - scaff_dist_mat[i,j] = min_dist - scaff_dist_mat[j,i] = min_dist - - print("finished scaff dist mat: %0.2f min"%((timeit.default_timer()-start)/60)) - return scaff_dist_mat - - class MultitaskScaffoldSplitter(Splitter): """MultitaskScaffoldSplitter Splitter class. @@ -261,6 +214,65 @@ def generate_scaffolds(self, return scaffold_sets + def _generate_scaffold_dist_matrix(self): + """Computes matrices used by fitness functions that score splits according + to the dissimilarity between training and test compound structures. One is + a symmetric matrix of nearest neighbor Tanimoto distances between pairs of + scaffolds (i.e., between the most similar compounds in the two scaffolds). + This matrix is used to compute the `scaffold_diff_fitness` function. + + The other is a nonsymmetric matrix of boolean vectors `has_nn[i,j]`, of length + equal to the number of compounds in scaffold `i`, indicating whether each + compound has a neighbor in scaffold `j` nearer than some threshold Tanimoto + distance `dist_thresh`. dist_thresh defaults to 0.3, but could be parameterized. + The has_nn matrix is used to compute the `far_frac_fitness` function, which is + a more robust measurement of the train/test set dissimilarity. 
+ """ + scaffold_lists = self.ss + ecfp_features = self.ecfp_features + + # First compute the full matrix of distances between all pairs of ECFPs + dmat = dist_metrics.tanimoto(ecfp_features) + + mat_shape = (len(scaffold_lists), len(scaffold_lists)) + scaff_dist_mat = np.zeros(mat_shape) + has_near_neighbor_mat = np.empty(mat_shape, dtype=object) + + for i, scaff1 in enumerate(scaffold_lists): + scaff1_rows = scaff1.reshape((-1,1)) + for j, scaff2 in enumerate(scaffold_lists[:i]): + if i == j: + continue + + scaff1_dists = dmat[scaff1_rows, scaff2] + min_dist = np.min(scaff1_dists.flatten()) + + if min_dist==0: + logger.info(f"Scaffolds {i} and {j} have at least one ECFP in common") + for k in scaff1: + for l in scaff2: + if dmat[k,l] == 0: + logger.debug(f"\tcompound {k} in scaffold {i}, compound {l} in scaffold {j}") + logger.debug(f"\tSMILES {k}: {self.smiles[k]}") + logger.debug(f"\tSMILES {l}: {self.smiles[l]}\n") + + + scaff_dist_mat[i,j] = min_dist + scaff_dist_mat[j,i] = min_dist + + # Identify the compounds in scaff1 with a neighbor in scaff2 closer than dist_thresh + has_near_neighbor_mat[i,j] = np.array(np.min(scaff1_dists, axis=1) < self.dist_thresh) + + # Identify the compounds in scaff2 with a neighbor in scaff1 closer than dist_thresh + scaff2_rows = scaff2.reshape((-1,1)) + scaff2_dists = dmat[scaff2_rows, scaff1] + has_near_neighbor_mat[j,i] = np.array(np.min(scaff2_dists, axis=1) < self.dist_thresh) + + + self.scaff_scaff_distmat = scaff_dist_mat + self.has_near_neighbor_mat = has_near_neighbor_mat + + def expand_scaffolds(self, scaffold_list: List[int]) -> List[int]: """Turns a list of scaffold indices into a list of compound indices @@ -350,51 +362,74 @@ def scaffold_diff_fitness(self, return min_dist - def ratio_fitness_old(self, split_chromosome: List[str]) -> float: - """Calculates a fitness score based on how accurately divided training/test/validation. + def far_frac_fitness(self, + split_chromosome: List[str], + train_part: str, + test_part: str) -> float: + """Grades a split according to the fraction of valid/test compounds with + nearest training set compounds further than some threshold. + + Grades the quality of the split based on which scaffolds were alloted to + which partitions. The score is measured as the fraction of compounds in + `test_part` with nearest neighbors in `train_part` at Tanimoto distance + `self.dist_thresh` or greater. + Parameters ---------- - List[str]: split_chromosome - A list of strings, index i contains a string, 'train', 'valid', 'test' - which determines the partition that scaffold belongs to + split_chromosome: List[str] + A split represented as a list of partition names. Index i in the + chromosome contains the partition for scaffold i. + train_part: str + The name of the partition to be treated as the training subset + test_part: str + The name of the partition to be treated as the test subset + Returns ------- - List[str] - A list of strings, index i contains a string, 'train', 'valid', 'test' - which determines the partition that scaffold belongs to + score: float + Floating point value beteween 0-1. 1 is the best score and 0 is the worst """ - start = timeit.default_timer() - total_counts = np.sum(self.dataset.w, axis=0) - subset_counts = [np.sum(self.dataset.w[subset], axis=0) for subset in \ - self.split_chromosome_to_compound_split(split_chromosome)] + # TODO: Eventually, replace strings in chromosome with integers indicating the partition + # for each scaffold. - # subset order goes train, valid, test. 
just consider train for the moment - subset_ratios = [subset_count/total_counts for subset_count in subset_counts] - #print("mean: %0.2f, median: %0.2f, std: %0.2f"%(np.mean(subset_ratios[0]), np.median(subset_ratios[0]), np.std(subset_ratios[0]))) - - # fitness of split is the size of the smallest training dataset - # ratio_fit = min(subset_ratios[0]) # this resulted in a split with 0 test. shoulda seen that coming - num_tasks = self.dataset.w.shape[1] - # imagine the perfect split is a point in 3*num_tasks space. - target_split = np.concatenate([[self.frac_train]*num_tasks, - [self.frac_valid]*num_tasks]) - # this is the current split also on 3*num_tasks space - current_split = np.concatenate(subset_ratios[:2]) - # if any partition is 0, then this split fails - if min(current_split) == 0: + train_scaffolds = [i for i, part in enumerate(split_chromosome) if part==train_part] + test_scaffolds = [i for i, part in enumerate(split_chromosome) if part==test_part] + + # if a partition is completely empty, return 0 + if len(train_scaffolds) == 0 or len(test_scaffolds) == 0: return 0 - # worst possible distance to normalize this between 0 and 1 - worst_distance = np.linalg.norm(np.ones(num_tasks*2)) - ratio_fit = 1 - np.linalg.norm(target_split-current_split)/worst_distance - #print("\tratio_fitness: %0.2f min"%((timeit.default_timer()-start)/60)) - return ratio_fit + # Compute the "far fraction": For each test scaffold S, OR together the boolean vectors + # from has_near_neighbor_mat for the training scaffolds indicating whether each compound in + # S has a neighbor in the train scaffold closer than Tanimoto distance dist_thresh. Sum + # the results over compounds and test scaffolds and divide by the test set size to get the + # "near fraction"; the far fraction is 1 minus the near fraction. + + near_count = 0 + total_count = 0 + for test_ind in test_scaffolds: + has_nn = None + for train_ind in train_scaffolds: + assert(not (train_ind == test_ind)) + if has_nn is None: + has_nn = self.has_near_neighbor_mat[test_ind, train_ind] + else: + has_nn |= self.has_near_neighbor_mat[test_ind, train_ind] + near_count += sum(has_nn) + total_count += len(has_nn) + + far_frac = 1 - near_count/total_count + #print(f"far_frac_fitness: near_count {near_count}, total_count {total_count}, far_frac {far_frac}") + return far_frac + def ratio_fitness(self, split_chromosome: List[str]) -> float: - """Calculates a fitness score based on how accurately divided training/test/validation. + """Calculates a fitness score based on how well the subset proportions for each task, taking + only labeled compounds into account, match the proportions requested by the user. - Treats the min,median,max of each of the three partitions as a 9D point and uses - euclidean distance to measure the distance to that point + The score is determined by combining the min, median and max over tasks of the subset fractions + into a 9-dimensional vector and computing its Euclidean distance from an ideal vector having + all the fractions as requested. Parameters ---------- @@ -407,15 +442,15 @@ def ratio_fitness(self, split_chromosome: List[str]) -> float: float A float between 0 and 1. 1 best 0 is worst """ - start = timeit.default_timer() + #start = timeit.default_timer() # total_counts is the number of labels per task total_counts = np.sum(self.dataset.w, axis=0) - # subset_counts is number of labels per task per subset + # subset_counts is number of labels per task per subset. 
subset_counts = [np.sum(self.dataset.w[subset], axis=0) for subset in \ self.split_chromosome_to_compound_split(split_chromosome)] - # subset order goes train, valid, test. just consider train for the moment + # subset order goes train, valid, test subset_ratios = [subset_count/total_counts for subset_count in subset_counts] # imagine the perfect split is a point in 9D space. For each subset we measure @@ -433,6 +468,7 @@ def ratio_fitness(self, split_chromosome: List[str]) -> float: return 0 # worst possible distance to normalize this between 0 and 1 worst_distance = np.linalg.norm(np.ones(len(target_split))) + worst_distance = np.sqrt(len(target_split)) ratio_fit = 1 - np.linalg.norm(target_split-current_split)/worst_distance #print("\tratio_fitness: %0.2f min"%((timeit.default_timer()-start)/60)) @@ -456,7 +492,7 @@ def response_distr_fitness(self, split_chromosome: List[str]) -> float: the train subset response value distributions, averaged over tasks. One means the distributions perfectly match. """ - start = timeit.default_timer() + #start = timeit.default_timer() dist_sum = 0.0 ntasks = self.dataset.y.shape[1] train_ind, valid_ind, test_ind = self.split_chromosome_to_compound_split(split_chromosome) @@ -495,18 +531,56 @@ def grade(self, split_chromosome: List[str]) -> float: float A float between 0 and 1. 1 best 0 is worst """ + + """ fitness = 0.0 # Only call the functions for each fitness term if their weight is nonzero if self.diff_fitness_weight_tvt != 0.0: - fitness += self.diff_fitness_weight_tvt*self.scaffold_diff_fitness(split_chromosome, 'train', 'test') + #score = self.scaffold_diff_fitness(split_chromosome, 'train', 'test') + score = self.far_frac_fitness(split_chromosome, 'train', 'test') + fitness += self.diff_fitness_weight_tvt*score if self.diff_fitness_weight_tvv != 0.0: - fitness += self.diff_fitness_weight_tvv*self.scaffold_diff_fitness(split_chromosome, 'train', 'valid') + #score = self.scaffold_diff_fitness(split_chromosome, 'train', 'valid') + score = self.far_frac_fitness(split_chromosome, 'train', 'valid') + fitness += self.diff_fitness_weight_tvv*score if self.ratio_fitness_weight != 0.0: - fitness += self.ratio_fitness_weight*self.ratio_fitness(split_chromosome) + score = self.ratio_fitness(split_chromosome) + fitness += self.ratio_fitness_weight*score if self.response_distr_fitness_weight != 0.0: - fitness += self.response_distr_fitness_weight*self.response_distr_fitness(split_chromosome) + score = self.response_distr_fitness(split_chromosome) + fitness += self.response_distr_fitness_weight*score + # Normalize the score to the range [0,1] + fitness /= (self.diff_fitness_weight_tvt + self.diff_fitness_weight_tvv + self.ratio_fitness_weight + + self.response_distr_fitness_weight) return fitness + """ + fitness_scores = self.get_fitness_scores(split_chromosome) + return fitness_scores['total_fitness'] + + def get_fitness_scores(self, split_chromosome): + fitness_scores = {} + total_fitness = 0.0 + # Only call the functions for each fitness term if their weight is nonzero + if self.diff_fitness_weight_tvt != 0.0: + #fitness_scores['test_scaf_dist'] = self.scaffold_diff_fitness(split_chromosome, 'train', 'test') + fitness_scores['test_scaf_dist'] = self.far_frac_fitness(split_chromosome, 'train', 'test') + total_fitness += self.diff_fitness_weight_tvt * fitness_scores['test_scaf_dist'] + if self.diff_fitness_weight_tvv != 0.0: + #fitness_scores['valid_scaf_dist'] = self.scaffold_diff_fitness(split_chromosome, 'train', 'valid') + fitness_scores['valid_scaf_dist'] 
= self.far_frac_fitness(split_chromosome, 'train', 'valid') + total_fitness += self.diff_fitness_weight_tvv * fitness_scores['valid_scaf_dist'] + if self.ratio_fitness_weight != 0.0: + fitness_scores['ratio'] = self.ratio_fitness(split_chromosome) + total_fitness += self.ratio_fitness_weight * fitness_scores['ratio'] + if self.response_distr_fitness_weight != 0.0: + fitness_scores['response_distr'] = self.response_distr_fitness(split_chromosome) + total_fitness += self.response_distr_fitness_weight * fitness_scores['response_distr'] + # Normalize the score to the range [0,1] + total_fitness /= (self.diff_fitness_weight_tvt + self.diff_fitness_weight_tvv + self.ratio_fitness_weight + + self.response_distr_fitness_weight) + fitness_scores['total_fitness'] = total_fitness + return fitness_scores def init_scaffolds(self, dataset: Dataset) -> None: @@ -528,10 +602,10 @@ def init_scaffolds(self, # list of lists. one list per scaffold big_ss = self.generate_scaffolds(dataset) - # using the same stragetgy as scaffold split, combine the scaffolds + # using the same strategy as scaffold split, combine the scaffolds # together until you have roughly 100 scaffold sets self.ss = smush_small_scaffolds(big_ss, num_super_scaffolds=self.num_super_scaffolds) - print(f'num_super_scaffolds: {len(self.ss)}, {self.num_super_scaffolds}') + logger.info(f"Requested {self.num_super_scaffolds} super scaffolds, produced {len(self.ss)} from {len(big_ss)} original scaffolds") # rows is the number of scaffolds # columns is number of tasks @@ -550,8 +624,10 @@ def split(self, num_super_scaffolds: int = 20, num_pop: int = 100, num_generations: int=30, + dist_thresh: float = 0.3, print_timings: bool = False, - log_every_n: int = 1000) -> Tuple: + early_stopping_generations = 25, + log_every_n: int = 10) -> Tuple: """Creates a split for the given datset This split splits the dataset into a list of super scaffolds then @@ -609,6 +685,7 @@ def split(self, self.num_super_scaffolds = num_super_scaffolds self.num_pop = num_pop self.num_generations = num_generations + self.dist_thresh = dist_thresh self.frac_train = frac_train self.frac_valid = frac_valid @@ -618,38 +695,47 @@ def split(self, self.init_scaffolds(self.dataset) # ecpf features + self.smiles = dataset.ids self.ecfp_features = calc_ecfp(dataset.ids) # calculate ScaffoldxScaffold distance matrix - start = timeit.default_timer() if (self.diff_fitness_weight_tvv > 0.0) or (self.diff_fitness_weight_tvt > 0.0): - self.scaff_scaff_distmat = _generate_scaffold_dist_matrix(self.ss, self.ecfp_features) - #print('scaffold dist mat %0.2f min'%((timeit.default_timer()-start)/60)) + self._generate_scaffold_dist_matrix() # initial population population = [] for i in range(self.num_pop): - start = timeit.default_timer() + #start = timeit.default_timer() split_chromosome = self._split(frac_train=frac_train, frac_valid=frac_valid, frac_test=frac_test) - #print("per_loop: %0.2f min"%((timeit.default_timer()-start)/60)) population.append(split_chromosome) + self.fitness_terms = {} gene_alg = ga.GeneticAlgorithm(population, self.grade, ga_crossover, ga_mutate) #gene_alg.iterate(num_generations) + best_ever = None + best_ever_fit = -np.inf + best_gen = 0 for i in range(self.num_generations): gene_alg.step(print_timings=print_timings) best_fitness = gene_alg.pop_scores[0] - print("step %d: best_fitness %0.2f"%(i, best_fitness)) - #print("%d: %0.2f"%(i, gene_alg.grade_population()[0][0])) - - best_fit = gene_alg.pop_scores[0] - best = gene_alg.pop[0] - - #print('best ever fitness 
%0.2f'%best_ever_fit) - result = self.split_chromosome_to_compound_split(best) + if best_fitness > best_ever_fit: + best_ever = gene_alg.pop[0] + best_ever_fit = best_fitness + best_gen = i + score_dict = self.get_fitness_scores(best_ever) + for term, score in score_dict.items(): + self.fitness_terms.setdefault(term, []).append(score) + if i % log_every_n == 0: + logger.info(f"generation {i}: Best fitness {best_fitness:.3f}, best ever {best_ever_fit:.3f} at generation {best_gen}") + if (best_fitness <= best_ever_fit) and (i - best_gen >= early_stopping_generations): + logger.info(f"No fitness improvement after {early_stopping_generations} generations") + break + + logger.info(f"Final best fitness score: {best_ever_fit:.3f} at generation {best_gen}") + result = self.split_chromosome_to_compound_split(best_ever) return result def _split(self, @@ -738,7 +824,8 @@ def train_valid_test_split(self, train_dir: Optional[str] = None, valid_dir: Optional[str] = None, test_dir: Optional[str] = None, - log_every_n: int = 1000) -> Tuple[Dataset, Dataset, Dataset]: + dist_thresh: float = 0.3, + log_every_n: int = 10) -> Tuple[Dataset, Dataset, Dataset]: """Creates a split for the given datset This split splits the dataset into a list of super scaffolds then @@ -805,6 +892,7 @@ def train_valid_test_split(self, diff_fitness_weight_tvv=diff_fitness_weight_tvv, ratio_fitness_weight=ratio_fitness_weight, response_distr_fitness_weight=response_distr_fitness_weight, num_super_scaffolds=num_super_scaffolds, num_pop=num_pop, num_generations=num_generations, + dist_thresh=0.3, print_timings=False) if train_dir is None: diff --git a/atomsci/ddm/pipeline/compare_models.py b/atomsci/ddm/pipeline/compare_models.py index 2b9a445d..ce734c1c 100644 --- a/atomsci/ddm/pipeline/compare_models.py +++ b/atomsci/ddm/pipeline/compare_models.py @@ -1,5 +1,5 @@ -"""Functions for comparing and visualizing model performance. Most of these functions rely on ATOM's model tracker and -datastore services, which are not part of the standard AMPL installation, but a few functions will work on collections of +"""Functions for comparing and visualizing model performance. Some of these functions rely on ATOM's model tracker and +datastore services, which are not part of the standard AMPL installation, but several functions will work on collections of models saved as local files. 
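As an illustrative usage sketch of the local-file path mentioned above (the results directory is a placeholder):

    from atomsci.ddm.pipeline import compare_models as cm

    # Collect training performance metrics for all models saved under a local results
    # directory; with this patch, an empty DataFrame is returned when no models are found.
    perf_df = cm.get_filesystem_perf_results('/path/to/result_dir', pred_type='regression')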
""" @@ -163,6 +163,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg model_type_list = [] max_epochs_list = [] learning_rate_list = [] + weight_decay_penalty_list = [] + weight_decay_penalty_type_list = [] dropouts_list = [] layer_sizes_list = [] featurizer_list = [] @@ -172,6 +174,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg rf_max_depth_list = [] xgb_learning_rate_list = [] xgb_gamma_list = [] + xgb_alpha_list = [] + xgb_lambda_list = [] xgb_max_depth_list = [] xgb_colsample_bytree_list = [] xgb_subsample_list = [] @@ -218,6 +222,9 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg max_epochs_list.append(nn_params['max_epochs']) best_epoch_list.append(nn_params['best_epoch']) learning_rate_list.append(nn_params['learning_rate']) + weight_decay_penalty_list.append(nn_params['weight_decay_penalty']) + weight_decay_penalty_type_list.append(nn_params['weight_decay_penalty_type']) + learning_rate_list.append(nn_params['learning_rate']) layer_sizes_list.append(','.join(['%d' % s for s in nn_params['layer_sizes']])) dropouts_list.append(','.join(['%.2f' % d for d in nn_params['dropouts']])) rf_estimators_list.append(nan) @@ -225,6 +232,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg rf_max_depth_list.append(nan) xgb_learning_rate_list.append(nan) xgb_gamma_list.append(nan) + xgb_alpha_list.append(nan) + xgb_lambda_list.append(nan) xgb_max_depth_list.append(nan) xgb_colsample_bytree_list.append(nan) xgb_subsample_list.append(nan) @@ -239,10 +248,14 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg max_epochs_list.append(nan) best_epoch_list.append(nan) learning_rate_list.append(nan) + weight_decay_penalty_list.append(nan) + weight_decay_penalty_type_list.append(nan) layer_sizes_list.append(nan) dropouts_list.append(nan) xgb_learning_rate_list.append(nan) xgb_gamma_list.append(nan) + xgb_alpha_list.append(nan) + xgb_lambda_list.append(nan) xgb_max_depth_list.append(nan) xgb_colsample_bytree_list.append(nan) xgb_subsample_list.append(nan) @@ -256,10 +269,14 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg max_epochs_list.append(nan) best_epoch_list.append(nan) learning_rate_list.append(nan) + weight_decay_penalty_list.append(nan) + weight_decay_penalty_type_list.append(nan) layer_sizes_list.append(nan) dropouts_list.append(nan) xgb_learning_rate_list.append(xgb_params["xgb_learning_rate"]) xgb_gamma_list.append(xgb_params["xgb_gamma"]) + xgb_alpha_list.append(xgb_params.get("xgb_alpha", 0.0)) + xgb_lambda_list.append(xgb_params.get("xgb_lambda", 1.0)) xgb_max_depth_list.append(xgb_params["xgb_max_depth"]) xgb_colsample_bytree_list.append(xgb_params["xgb_colsample_bytree"]) xgb_subsample_list.append(xgb_params["xgb_subsample"]) @@ -277,6 +294,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg max_epochs=max_epochs_list, best_epoch=best_epoch_list, learning_rate=learning_rate_list, + weight_decay_penalty_type=weight_decay_penalty_type_list, + weight_decay_penalty=weight_decay_penalty_list, layer_sizes=layer_sizes_list, dropouts=dropouts_list, rf_estimators=rf_estimators_list, @@ -284,6 +303,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg rf_max_depth=rf_max_depth_list, xgb_learning_rate = xgb_learning_rate_list, xgb_gamma = xgb_gamma_list, + xgb_alpha = xgb_alpha_list, + xgb_lambda = xgb_lambda_list, xgb_max_depth 
= xgb_max_depth_list, xgb_colsample_bytree = xgb_colsample_bytree_list, xgb_subsample = xgb_subsample_list, @@ -307,15 +328,15 @@ def extract_model_and_feature_parameters(metadata_dict, keep_required=True): Returns: dictionary containing featurizer and model parameters. Most contain the following - keys. ['max_epochs', 'best_epoch', 'learning_rate', 'layer_sizes', 'dropouts', - 'rf_estimators', 'rf_max_features', 'rf_max_depth', 'xgb_gamma', 'xgb_learning_rate', + keys. ['max_epochs', 'best_epoch', 'learning_rate', 'weight_decay_penalty_type', 'weight_decay_penalty', 'layer_sizes', 'dropouts', + 'rf_estimators', 'rf_max_features', 'rf_max_depth', 'xgb_gamma', 'xgb_alpha', 'xgb_lambda', 'xgb_learning_rate', 'xgb_max_depth', 'xgb_colsample_bytree', 'xgb_subsample', 'xgb_n_estimators', 'xgb_min_child_weight', 'featurizer_parameters_dict', 'model_parameters_dict'] """ model_params = metadata_dict['model_parameters'] model_type = model_params['model_type'] - required = ['max_epochs', 'best_epoch', 'learning_rate', 'layer_sizes', 'dropouts', - 'rf_estimators', 'rf_max_features', 'rf_max_depth', 'xgb_gamma', 'xgb_learning_rate', + required = ['max_epochs', 'best_epoch', 'learning_rate', 'layer_sizes', 'dropouts', 'weight_decay_penalty_type', 'weight_decay_penalty', + 'rf_estimators', 'rf_max_features', 'rf_max_depth', 'xgb_gamma', 'xgb_alpha', 'xgb_lambda', 'xgb_learning_rate', 'xgb_max_depth', 'xgb_colsample_bytree', 'xgb_subsample', 'xgb_n_estimators', 'xgb_min_child_weight' ] @@ -328,6 +349,9 @@ def extract_model_and_feature_parameters(metadata_dict, keep_required=True): model_info['max_epochs'] = nn_params['max_epochs'] model_info['best_epoch'] = nn_params['best_epoch'] model_info['learning_rate'] = nn_params['learning_rate'] + model_info['weight_decay_penalty_type'] = nn_params['weight_decay_penalty_type'] + model_info['weight_decay_penalty'] = nn_params['weight_decay_penalty'] + model_info['learning_rate'] = nn_params['learning_rate'] model_info['layer_sizes'] = ','.join(['%d' % s for s in nn_params['layer_sizes']]) model_info['dropouts'] = ','.join(['%.2f' % d for d in nn_params['dropouts']]) elif model_type == 'RF': @@ -338,6 +362,8 @@ def extract_model_and_feature_parameters(metadata_dict, keep_required=True): elif model_type == 'xgboost': xgb_params = metadata_dict['xgb_specific'] model_info['xgb_gamma'] = xgb_params['xgb_gamma'] + model_info['xgb_alpha'] = xgb_params.get('xgb_alpha', 0.0) + model_info['xgb_lambda'] = xgb_params.get('xgb_lambda', 1.0) model_info['xgb_learning_rate'] = xgb_params['xgb_learning_rate'] model_info['xgb_max_depth'] = xgb_params['xgb_max_depth'] model_info['xgb_colsample_bytree'] = xgb_params['xgb_colsample_bytree'] @@ -979,6 +1005,8 @@ def get_filesystem_perf_results(result_dir, pred_type='classification', expand=T print(f"Can't access model {dirpath}") print("Found data for %d models under %s" % (len(model_list), result_dir)) + if len(model_list) == 0: + return pd.DataFrame() # build dictonary of tarball names tar_dict = {os.path.basename(tf):tf for tf in tar_list} @@ -1511,6 +1539,8 @@ def get_summary_metadata_table(uuids, collections=None): 'Layer Sizes': nn_params['layer_sizes'], 'Optimizer': nn_params['optimizer_type'], 'Learning Rate': nn_params['learning_rate'], + 'Weight decay penalty type': nn_params['weight_decay_penalty_type'], + 'Weight decay penalty': nn_params['weight_decay_penalty'], 'Dropouts': nn_params['dropouts'], 'Best Epoch (Max)': '%i (%i)' % (nn_params['best_epoch'],nn_params['max_epochs']), 'Collection': collection_name, @@ -1551,6 
+1581,8 @@ def get_summary_metadata_table(uuids, collections=None): 'Model Type (Featurizer)': '%s (%s)' % (mdl_params['model_type'],featurizer), 'XGB learning rate': xgb_params['xgb_learning_rate'], 'Gamma': xgb_params['xgb_gamma'], + 'Alpha': xgb_params.get('xgb_alpha', 0.0), + 'Lambda': xgb_params.get('xgb_lambda', 1.0), 'XGB max depth': xgb_params['xgb_max_depth'], 'Column sample fraction': xgb_params['xgb_colsample_bytree'], 'Row subsample fraction': xgb_params['xgb_subsample'], @@ -2091,7 +2123,7 @@ def get_multitask_perf_from_tracker(collection_name, response_cols=None, expand_ 'weight_decay_penalty', 'weight_decay_penalty_type', 'weight_init_stddevs', 'splitter', 'split_uuid', 'split_test_frac', 'split_valid_frac', 'smiles_col', 'id_col', 'feature_transform_type', 'response_cols', 'response_transform_type', 'num_model_tasks', - 'rf_estimators', 'rf_max_depth', 'rf_max_features', 'xgb_gamma', 'xgb_learning_rate', + 'rf_estimators', 'rf_max_depth', 'rf_max_features', 'xgb_gamma', 'xgb_alpha', 'xgb_lambda', 'xgb_learning_rate', 'xgb_max_depth', 'xgb_colsample_bytree', 'xgb_subsample', 'xgb_n_estimators', 'xgb_min_child_weight', ] keepcols.extend(alldat.columns[alldat.columns.str.contains('best')]) diff --git a/atomsci/ddm/pipeline/feature_importance.py b/atomsci/ddm/pipeline/feature_importance.py index 38135f6c..25be1c84 100755 --- a/atomsci/ddm/pipeline/feature_importance.py +++ b/atomsci/ddm/pipeline/feature_importance.py @@ -525,3 +525,33 @@ def _calc_cluster_permutation_scores(estimator, X, y, col_indices, random_state, +# =================================================================================================== +def plot_nn_feature_weights_vs_epoch(pipeline, axes=None, plot_width=20, plot_height=15): + """Plot a line graph of summed weight absolute values from neural network feature nodes to first hidden layer vs + training epoch. This plot is intended to show which features have the most influence on the final + prediction output. It is also used to show the effect of increasing the weight_decay_penalty + parameter, which _should_ result in decreasing the weights on less important features. + + Args: + pipeline (ModelPipeline): The pipeline object from a neural network model that was trained in the + current Python session. + axes (matplotlib.Axes): An Axes object to draw the plot in. If none is provided, the function will + create one in a figure with the specified width and height. 
+ plot_width (float): Figure width + plot_height (float): Figure height + + Returns: + None + """ + if axes is None: + fig, axes = plt.subplots(figsize=(plot_width, plot_height)) + fig.suptitle("Sums of feature node : hidden layer absolute weights vs epoch") + + wt_df = pipeline.model_wrapper.feature_weights_df + max_epoch = len(wt_df) + best_epoch = pipeline.model_wrapper.best_epoch + for feat in pipeline.data.featurization.get_feature_columns(): + sns.lineplot(data=wt_df, x='epoch', y=feat, ax=axes) + axes.text(max_epoch + 4, wt_df[feat].values[max_epoch-1], feat) + axes.set_ylabel("Sum over 1st hidden layer nodes of feature weight absolute values") + axes.axvline(best_epoch, linestyle='--', color='blue') \ No newline at end of file diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py index b070e8c0..614d1c4c 100644 --- a/atomsci/ddm/pipeline/model_datasets.py +++ b/atomsci/ddm/pipeline/model_datasets.py @@ -91,6 +91,24 @@ def check_task_columns(params, dset_df): raise Exception(f"Requested prediction task columns {missing_tasks} are missing from training dataset") +# **************************************************************************************** +def get_class_freqs(params): + """Given an input dataset for a classification model, returns the value counts + for each class for each task. Uses params.dataset_key to locate the dataset and params.response_cols + to identify the columns containing the class labels. Returns a list of lists, one for each task, + containing the value counts for each class (not counting missing values). + """ + dset_df = pd.read_csv(params.dataset_key) + freq_list = [] + # =-=ksm Strange but true: All classification tasks have to have the same number of classes. This may + # be a "feature" of DeepChem, because of the format of the model prediction arrays. 
+ nclasses = params.class_number + for col in params.response_cols: + vc = dset_df[col].value_counts() + freqs = [vc.get(cl, 0) for cl in range(nclasses)] + freq_list.append(freqs) + return freq_list + # **************************************************************************************** def set_group_permissions(system, path, data_owner='public', data_owner_group='public'): """Set file group and permissions to standard values for a dataset containing proprietary @@ -412,7 +430,7 @@ def get_featurized_data(self, params=None): self.dataset = NumpyDataset(features, self.vals, ids=ids, w=w) # Checking for minimum number of rows if len(self.dataset) < params.min_compound_number: - self.log.warning("Dataset of length %i is shorter than the required length %i" % (len(self.dataset), params.min_compound_number)) + self.log.info("Dataset of length %i is shorter than the recommended length %i" % (len(self.dataset), params.min_compound_number)) # **************************************************************************************** def get_dataset_tasks(self, dset_df): diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py index 575c6b6d..e2a06daa 100644 --- a/atomsci/ddm/pipeline/model_pipeline.py +++ b/atomsci/ddm/pipeline/model_pipeline.py @@ -535,6 +535,7 @@ def split_dataset(self, featurization=None): featurization = feat.create_featurization(self.params) self.featurization = featurization self.load_featurize_data() + self.params.split_uuid = self.data.split_uuid return self.data.split_uuid diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py index 0b5f6e44..d75930eb 100644 --- a/atomsci/ddm/pipeline/model_wrapper.py +++ b/atomsci/ddm/pipeline/model_wrapper.py @@ -10,6 +10,7 @@ import deepchem as dc import numpy as np +import pandas as pd import tensorflow as tf if dc.__version__.startswith('2.1'): from deepchem.models.tensorgraph.fcnet import MultitaskRegressor, MultitaskClassifier @@ -48,6 +49,7 @@ from atomsci.ddm.utils import datastore_functions as dsf from atomsci.ddm.utils import llnl_utils +from atomsci.ddm.pipeline import model_datasets as md from atomsci.ddm.pipeline import transformations as trans from atomsci.ddm.pipeline import perf_data as perf import atomsci.ddm.pipeline.parameter_parser as pp @@ -988,6 +990,9 @@ def train_with_early_stopping(self, pipeline): valid_epoch_perfs (np.array): A standard validation set performance metric (r2_score or roc_auc), at the end of each epoch. 
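The per-epoch feature weight tracking added just below reduces the first hidden layer's weight matrix to one influence score per input feature. A minimal numpy sketch of that reduction, using a made-up 2-node-by-3-feature weight matrix:

    import numpy as np

    # Hypothetical first-hidden-layer weights: one row per hidden node, one column per input feature.
    layer1_weights = np.array([[ 0.2, -0.5, 0.1],
                               [-0.3,  0.4, 0.0]])

    # Per-feature influence score: sum of absolute weights over hidden-layer nodes,
    # i.e. the value appended to self.feature_weights for each feature at each epoch.
    feature_weights = np.abs(layer1_weights).sum(axis=0)   # -> array([0.5, 0.9, 0.1])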
""" self.data = pipeline.data + feature_names = self.data.featurization.get_feature_columns() + nfeatures = len(feature_names) + self.feature_weights = dict(zip(feature_names, [[] for f in feature_names])) em = perf.EpochManager(self, prediction_type=self.params.prediction_type, @@ -1010,11 +1015,22 @@ def train_with_early_stopping(self, pipeline): ei, pipeline.metric_type, train_perf, pipeline.metric_type, valid_perf, pipeline.metric_type, test_perf)) + layer1_weights = self.model.model.layers[0].weight + feature_weights = np.zeros(nfeatures, dtype=float) + for node_weights in layer1_weights: + node_feat_weights = torch.abs(node_weights).detach().numpy() + feature_weights += node_feat_weights + for fnum, fname in enumerate(feature_names): + self.feature_weights[fname].append(feature_weights[fnum]) + self.num_epochs_trained = ei + 1 # Compute performance metrics for each subset, and check if we've reached a new best validation set score if em.should_stop(): break + self.feature_weights_df = pd.DataFrame(self.feature_weights) + self.feature_weights_df['epoch'] = range(len(self.feature_weights_df)) + # Revert to last checkpoint self.restore() self.model_save() @@ -1837,9 +1853,14 @@ def make_dc_model(self, model_dir): max_depth=self.params.rf_max_depth, n_jobs=-1) else: + if self.params.weight_transform_type == 'balancing': + class_weights = 'balanced' + else: + class_weights = None rf_model = RandomForestClassifier(n_estimators=self.params.rf_estimators, max_features=self.params.rf_max_features, max_depth=self.params.rf_max_depth, + class_weight=class_weights, n_jobs=-1) return dc.models.sklearn_models.SklearnModel(rf_model, model_dir=model_dir) @@ -1995,8 +2016,8 @@ def make_dc_model(self, model_dir): subsample=self.params.xgb_subsample, colsample_bytree=self.params.xgb_colsample_bytree, colsample_bylevel=1, - reg_alpha=0, - reg_lambda=1, + reg_alpha=self.params.xgb_alpha, + reg_lambda=self.params.xgb_lambda, scale_pos_weight=1, base_score=0.5, random_state=0, @@ -2008,6 +2029,16 @@ def make_dc_model(self, model_dir): max_bin = 16, ) else: + if self.params.weight_transform_type == 'balancing': + # Compute a class weight for positive class samples to help deal with imblanced datasets + class_freqs = md.get_class_freqs(self.params) + if len(class_freqs) > 1: + raise ValueError("xgboost models don't currently support multitask data") + if len(class_freqs[0]) > 2: + raise ValueError("xgboost models don't currently support multiclass data") + pos_class_weight = class_freqs[0][0]/class_freqs[0][1] + else: + pos_class_weight = 1 xgb_model = xgb.XGBClassifier(max_depth=self.params.xgb_max_depth, learning_rate=self.params.xgb_learning_rate, n_estimators=self.params.xgb_n_estimators, @@ -2020,9 +2051,9 @@ def make_dc_model(self, model_dir): subsample=self.params.xgb_subsample, colsample_bytree=self.params.xgb_colsample_bytree, colsample_bylevel=1, - reg_alpha=0, - reg_lambda=1, - scale_pos_weight=1, + reg_alpha=self.params.xgb_alpha, + reg_lambda=self.params.xgb_lambda, + scale_pos_weight=pos_class_weight, base_score=0.5, random_state=0, importance_type='gain', @@ -2130,8 +2161,8 @@ def reload_model(self, reload_dir): subsample=self.params.xgb_subsample, colsample_bytree=self.params.xgb_colsample_bytree, colsample_bylevel=1, - reg_alpha=0, - reg_lambda=1, + reg_alpha=self.params.xgb_alpha, + reg_lambda=self.params.xgb_lambda, scale_pos_weight=1, base_score=0.5, random_state=0, @@ -2155,8 +2186,8 @@ def reload_model(self, reload_dir): subsample=self.params.xgb_subsample, 
colsample_bytree=self.params.xgb_colsample_bytree, colsample_bylevel=1, - reg_alpha=0, - reg_lambda=1, + reg_alpha=self.params.xgb_alpha, + reg_lambda=self.params.xgb_lambda, scale_pos_weight=1, base_score=0.5, random_state=0, @@ -2266,6 +2297,8 @@ def get_model_specific_metadata(self): "xgb_learning_rate" : self.params.xgb_learning_rate, "xgb_n_estimators" : self.params.xgb_n_estimators, "xgb_gamma" : self.params.xgb_gamma, + "xgb_alpha" : self.params.xgb_alpha, + "xgb_lambda" : self.params.xgb_lambda, "xgb_min_child_weight" : self.params.xgb_min_child_weight, "xgb_subsample" : self.params.xgb_subsample, "xgb_colsample_bytree" :self.params.xgb_colsample_bytree diff --git a/atomsci/ddm/pipeline/parameter_parser.py b/atomsci/ddm/pipeline/parameter_parser.py index 8d960b38..07a3cf5f 100644 --- a/atomsci/ddm/pipeline/parameter_parser.py +++ b/atomsci/ddm/pipeline/parameter_parser.py @@ -535,6 +535,8 @@ def get_list_args(self): 'umap_targ_wt', 'umap_min_dist', 'dropout_list','weight_decay_penalty', 'xgb_learning_rate', 'xgb_gamma', + 'xgb_alpha', + 'xgb_lambda', "xgb_min_child_weight", "xgb_subsample", "xgb_colsample_bytree", @@ -549,6 +551,8 @@ def get_list_args(self): not_a_list_outside_of_hyperparams = {'learning_rate','weight_decay_penalty', 'xgb_learning_rate', 'xgb_gamma', + 'xgb_alpha', + 'xgb_lambda', 'xgb_min_child_weight', 'xgb_subsample', 'xgb_colsample_bytree', @@ -1314,6 +1318,14 @@ def get_parser(): '--xgb_gamma', dest='xgb_gamma', default='0.0', help='Minimum loss reduction required to make a further partition on a leaf node of the tree. Can be input' ' as a comma separated list for hyperparameter search (e.g. \'0.0,0.1,0.2\')') + parser.add_argument( + '--xgb_alpha', dest='xgb_alpha', default='0.0', + help='L1 regularization term on weights. Increasing this value will make model more conservative. Can be input' + ' as a comma separated list for hyperparameter search (e.g. \'0.0,0.1,0.2\')') + parser.add_argument( + '--xgb_lambda', dest='xgb_lambda', default='1.0', + help='L2 regularization term on weights. Increasing this value will make model more conservative. Can be input' + ' as a comma separated list for hyperparameter search (e.g. \'0.0,0.1,0.2\')') parser.add_argument( '--xgb_learning_rate', dest='xgb_learning_rate', default='0.1', help='Boosting learning rate (xgb\'s \"eta\"). Can be input as a comma separated list for hyperparameter' @@ -1510,6 +1522,12 @@ def get_parser(): parser.add_argument( '--dp', dest='dp', required=False, default=None, help='dropouts shown in HyperOpt domain format, e.g. --dp=uniform|3|0,0.4') + parser.add_argument( + '--wdp', dest='wdp', required=False, default=None, + help='weight_decay_penalty shown in HyperOpt domain format, e.g. --wdp=loguniform|-6.908,-4.605') + parser.add_argument( + '--wdt', dest='wdt', required=False, default=None, + help='weight_decay_penalty_type shown in HyperOpt domain format, e.g. --wdt=choice|l1,l2') # RF model parser.add_argument( '--rfe', dest='rfe', required=False, default=None, @@ -1524,6 +1542,12 @@ def get_parser(): parser.add_argument( '--xgbg', dest='xgbg', required=False, default=None, help='xgb_gamma shown in HyperOpt domain format, e.g. --xgbg=uniform|0,0.4') + parser.add_argument( + '--xgba', dest='xgba', required=False, default=None, + help='xgb_alpha shown in HyperOpt domain format, e.g. --xgba=uniform|0,0.4') + parser.add_argument( + '--xgbb', dest='xgbb', required=False, default=None, + help='xgb_lambda shown in HyperOpt domain format, e.g. 
--xgbb=uniform|0,0.4') parser.add_argument( '--xgbl', dest='xgbl', required=False, default=None, help='xgb_learning_rate shown in HyperOpt domain format, e.g. --xgbl=loguniform|-6.9,-2.3') diff --git a/atomsci/ddm/pipeline/perf_plots.py b/atomsci/ddm/pipeline/perf_plots.py index e8bb2233..50931692 100644 --- a/atomsci/ddm/pipeline/perf_plots.py +++ b/atomsci/ddm/pipeline/perf_plots.py @@ -849,59 +849,6 @@ def plot_prec_recall_curve(MP, epoch_label='best', plot_size=7, pdf_dir=None): pdf.close() MP.log.info("Wrote plot to %s" % pdf_path) -#------------------------------------------------------------------------------------------------------------------------ -def old_plot_prec_recall_curve(MP, epoch_label='best', plot_size=7, pdf_dir=None): - """Plot precision-recall curves for a classification model. - - Args: - MP (`ModelPipeline`): Pipeline object for a model that was trained in the current Python session. - - epoch_label (str): Label for training epoch to draw predicted values from. Currently 'best' is the only allowed value. - - pdf_dir (str): If given, output the plots to a PDF file in the given directory. - - Returns: - None - - """ - params = MP.params - curve_data = _get_perf_curve_data(MP, epoch_label, 'precision-recall') - if len(curve_data) == 0: - return - if MP.run_mode == 'training': - # Draw overlapping PR curves for train, valid and test sets - subsets = ['train', 'valid', 'test'] - else: - subsets = ['full'] - if pdf_dir is not None: - pdf_path = os.path.join(pdf_dir, '%s_%s_model_%s_features_%s_split_PRC_curves.pdf' % ( - params.dataset_name, params.model_type, params.featurizer, params.splitter)) - pdf = PdfPages(pdf_path) - subset_colors = dict(train=train_col, valid=valid_col, test=test_col, full=full_col) - # For multitask, do a separate figure for each task - ntasks = curve_data[subsets[0]]['prob_active'].shape[1] - for i in range(ntasks): - fig, ax = plt.subplots(figsize=(plot_size,plot_size)) - title = '%s dataset\nPrecision-recall curve for %s %s classifier on %s features with %s split' % ( - params.dataset_name, params.response_cols[i], - params.model_type, params.featurizer, params.splitter) - for subset in subsets: - precision, recall, thresholds = metrics.precision_recall_curve(curve_data[subset]['true_classes'][:,i], - curve_data[subset]['prob_active'][:,i]) - - prc_auc = curve_data[subset]['prc_aucs'][i] - ax.step(recall, precision, color=subset_colors[subset], label="%s: AUC = %.3f" % (subset, prc_auc)) - ax.set_xlabel('Recall') - ax.set_ylabel('Precision') - ax.set_title(title, fontdict={'fontsize' : 12}) - legend = ax.legend(loc='lower right') - - if pdf_dir is not None: - pdf.savefig(fig) - if pdf_dir is not None: - pdf.close() - MP.log.info("Wrote plot to %s" % pdf_path) - #------------------------------------------------------------------------------------------------------------------------ def plot_umap_feature_projections(MP, ndim=2, num_neighbors=20, min_dist=0.1, fit_to_train=True, diff --git a/atomsci/ddm/utils/hyperparam_search_wrapper.py b/atomsci/ddm/utils/hyperparam_search_wrapper.py index 832de9ee..075a034e 100755 --- a/atomsci/ddm/utils/hyperparam_search_wrapper.py +++ b/atomsci/ddm/utils/hyperparam_search_wrapper.py @@ -1418,6 +1418,7 @@ def __init__(self, params): for i in range(1,num_layer): self.space[f"ratio{i}"] = build_hyperopt_search_domain(f"ratio{i}", method, par_list) + # dropouts if self.params.dp: domain_list = self.params.dp.split("|") method = domain_list[0] @@ -1425,6 +1426,21 @@ def __init__(self, params): par_list = 
[float(e) for e in domain_list[2].split(",")] for i in range(num_layer): self.space[f"dp{i}"] = build_hyperopt_search_domain(f"dp{i}", method, par_list) + + # weight decay penalty + if self.params.wdp: + domain_list = self.params.wdp.split("|") + method = domain_list[0] + par_list = [float(e) for e in domain_list[1].split(",")] + self.space["weight_decay_penalty"] = build_hyperopt_search_domain("weight_decay_penalty", method, par_list) + + # weight decay penalty type + if self.params.wdt: + domain_list = self.params.wdt.split("|") + method = domain_list[0] + par_list = domain_list[1].split(",") + self.space["weight_decay_penalty_type"] = build_hyperopt_search_domain("weight_decay_penalty_type", method, par_list) + elif self.params.model_type == "xgboost": #build searching domain for XGBoost parameters if self.params.xgbg: @@ -1433,6 +1449,18 @@ def __init__(self, params): par_list = [float(e) for e in domain_list[1].split(",")] self.space["xgbg"] = build_hyperopt_search_domain("xgbg", method, par_list) + if self.params.xgba: + domain_list = self.params.xgba.split("|") + method = domain_list[0] + par_list = [float(e) for e in domain_list[1].split(",")] + self.space["xgba"] = build_hyperopt_search_domain("xgba", method, par_list) + + if self.params.xgbb: + domain_list = self.params.xgbb.split("|") + method = domain_list[0] + par_list = [float(e) for e in domain_list[1].split(",")] + self.space["xgbb"] = build_hyperopt_search_domain("xgbb", method, par_list) + if self.params.xgbl: domain_list = self.params.xgbl.split("|") method = domain_list[0] @@ -1500,6 +1528,10 @@ def lossfn(p): self.params.learning_rate = p["learning_rate"] if self.params.dp: self.params.dropouts = ",".join([str(p[e]) for e in p if e[:2] == "dp"]) + if self.params.wdp: + self.params.weight_decay_penalty = p['weight_decay_penalty'] + if self.params.wdt: + self.params.weight_decay_penalty_type = p['weight_decay_penalty_type'] if self.params.ls: if not self.params.ls_ratio: self.params.layer_sizes = ",".join([str(p[e]) for e in p if e[:2] == "ls"]) @@ -1508,11 +1540,16 @@ def lossfn(p): for i in range(1,len([e for e in p if e[:5] == "ratio"])+1): list_layer_sizes.append(int(list_layer_sizes[-1] * p[f"ratio{i}"])) self.params.layer_sizes = ",".join([str(e) for e in list_layer_sizes]) - hp_params = f'{self.params.learning_rate}_{self.params.layer_sizes}_{self.params.dropouts}' - print(f"learning_rate: {self.params.learning_rate}, layer_sizes: {self.params.layer_sizes}, dropouts: {self.params.dropouts}") + hp_params = f'{self.params.learning_rate}_{self.params.layer_sizes}_{self.params.dropouts}_{self.params.weight_decay_penalty_type}_{self.params.weight_decay_penalty}' + print(f"learning_rate: {self.params.learning_rate}, layer_sizes: {self.params.layer_sizes}, dropouts: {self.params.dropouts}, " + f"weight_decay_penalty: {self.params.weight_decay_penalty_type}({self.params.weight_decay_penalty})") elif self.params.model_type == "xgboost": if self.params.xgbg: self.params.xgb_gamma = p["xgbg"] + if self.params.xgba: + self.params.xgb_alpha = p["xgba"] + if self.params.xgbb: + self.params.xgb_lambda = p["xgbb"] if self.params.xgbl: self.params.xgb_learning_rate = p["xgbl"] if self.params.xgbd: @@ -1525,8 +1562,10 @@ def lossfn(p): self.params.xgb_n_estimators = p["xgbn"] if self.params.xgbw: self.params.xgb_min_child_weight = p["xgbw"] - hp_params = 
f'{self.params.xgb_gamma}_{self.params.xgb_learning_rate}_{self.params.xgb_max_depth}_{self.params.xgb_colsample_bytree}_{self.params.xgb_subsample}_{self.params.xgb_n_estimators}_{self.params.xgb_min_child_weight}' + hp_params = f'{self.params.xgb_gamma}_{self.params.xgb_alpha}_{self.params.xgb_lambda}_{self.params.xgb_learning_rate}_{self.params.xgb_max_depth}_{self.params.xgb_colsample_bytree}_{self.params.xgb_subsample}_{self.params.xgb_n_estimators}_{self.params.xgb_min_child_weight}' print(f"xgb_gamma: {self.params.xgb_gamma}, " + f"xgb_alpha: {self.params.xgb_alpha}, " + f"xgb_lambda: {self.params.xgb_lambda}, " f"xgb_learning_rate: {self.params.xgb_learning_rate}, " f"xgb_max_depth: {self.params.xgb_max_depth}, " f"xgb_colsample_bytree: {self.params.xgb_colsample_bytree}, " @@ -1709,7 +1748,10 @@ def parse_params(param_list): 'slurm_time_limit'} | excluded_keys if params.search_type == 'hyperopt': # keep more parameters - keep_params = keep_params | {'lr', 'learning_rate','ls', 'layer_sizes','ls_ratio','dp', 'dropouts','rfe', 'rf_estimators','rfd', 'rf_max_depth','rff', 'rf_max_features','xgbg', 'xgb_gamma','xgbl', 'xgb_learning_rate', 'xgbd', 'xgb_max_depth', 'xgbc', 'xgb_colsample_bytree', 'xgbs', 'xgb_subsample', 'xgbn', 'xgb_n_estimators', 'xgbw', 'xgb_min_child_weight', 'hp_checkpoint_load', 'hp_checkpoint_save'} + keep_params = keep_params | {'lr', 'learning_rate','ls', 'layer_sizes','ls_ratio','dp', 'dropouts', 'wdp', 'weight_decay_penalty', 'wdt', 'weight_decay_penalty_type', + 'rfe', 'rf_estimators','rfd', 'rf_max_depth','rff', 'rf_max_features', + 'xgbg', 'xgb_gamma', 'xgba', 'xgb_alpha', 'xgbb', 'xgb_lambda', 'xgbl', 'xgb_learning_rate', 'xgbd', 'xgb_max_depth', 'xgbc', 'xgb_colsample_bytree', + 'xgbs', 'xgb_subsample', 'xgbn', 'xgb_n_estimators', 'xgbw', 'xgb_min_child_weight', 'hp_checkpoint_load', 'hp_checkpoint_save'} params.__dict__ = parse.prune_defaults(params, keep_params=keep_params) diff --git a/atomsci/ddm/utils/split_diagnostic_plots.py b/atomsci/ddm/utils/split_diagnostic_plots.py new file mode 100644 index 00000000..447c5166 --- /dev/null +++ b/atomsci/ddm/utils/split_diagnostic_plots.py @@ -0,0 +1,213 @@ +"""Functions to generate multi-plot displays to assess split quality: response value distributions, test-vs-train Tanimoto distance distributions, etc. +""" + +import os +import numpy as np +import pandas as pd +from argparse import Namespace +from atomsci.ddm.pipeline.model_pipeline import ModelPipeline +from atomsci.ddm.pipeline import parameter_parser as parse +from atomsci.ddm.pipeline import perf_plots as pp +from atomsci.ddm.utils import split_response_dist_plots as srdp +from atomsci.ddm.utils import compare_splits_plots as csp + +import matplotlib.pyplot as plt +import seaborn as sns +from scipy import stats + + +# --------------------------------------------------------------------------------------------------------------------------------- +def plot_split_diagnostics(params_or_pl, axes=None, num_rows=None, num_cols=1, min_tvt_dist=0.3, plot_size=7): + """Generate several plots showing various aspects of split quality. These include: The value distributions for each response + column in the different split subsets; the nearest-training-set-neighbor Tanimoto distance distributions for the validation + and test sets; the actual split subset proportions; and (for multitask scaffold splits) the progression of fitness term values + over generations of the genetic algorithm. 
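An illustrative usage sketch for the new diagnostics module (assuming pipeline is a ModelPipeline that was used to split a dataset earlier in the same Python session):

    from atomsci.ddm.utils import split_diagnostic_plots as sdp

    # Draw the full set of split diagnostic plots; the GA fitness-term history plot is only
    # produced for multitaskscaffold splits when a ModelPipeline (not a parameter dict) is passed.
    sdp.plot_split_diagnostics(pipeline, num_cols=2, min_tvt_dist=0.3)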
+ + Args: + params_or_pl (argparse.Namespace or dict or ModelPipeline): Structure containing dataset and split parameters, or + the ModelPipeline object used to perform a split. If a ModelPipeline is passed as this argument, the parameters + are extracted from it and an additional plot will be generated for multitaskscaffold splits showing the progression + of fitness term values over generations. The following parameters are required, if not set to default values: + + | - dataset_key + | - split_uuid + | - split_strategy + | - splitter + | - split_valid_frac + | - split_test_frac + | - num_folds + | - smiles_col + | - response_cols + + axes (matplotlib.Axes): Axes to draw plots in. If provided, must contain enough entries to display all the plots requested + (2 for each response column + 3 + 1 for a multitask scaffold split when a ModelPipeline is passed to params_or_pl). If not + provided, a figure and Axes of the required length will be created. + + num_rows (int): Number of rows for the Axes layout; ignored if an existing set of Axes are passed in the `axes` argument. + num_cols (int): Number of columns for the Axes layout; ignored if an existing set of Axes are passed in the `axes` argument. + plot_size (float): Height of plots; ignored if `axes` is provided + Returns: + None + """ + + # Save current matplotlib color cycle and switch to 'colorblind' palette + old_palette = sns.color_palette() + sns.set_palette('colorblind') + + if isinstance(params_or_pl, dict): + params = parse.wrapper(params_or_pl) + elif isinstance(params_or_pl, Namespace): + params = params_or_pl + elif isinstance(params_or_pl, ModelPipeline): + params = params_or_pl.params + splitter = params_or_pl.data.splitting.splitter + params.fitness_terms = splitter.fitness_terms + params.split_uuid = params_or_pl.data.split_uuid + + else: + raise ValueError("params_or_pl must be dict, Namespace or ModelPipeline") + + # Figure out how many plots we're going to generate and how to lay them out + nresp = len(params.response_cols) + nfitness = int((params.splitter == 'multitaskscaffold') and (isinstance(params_or_pl, ModelPipeline))) + nplots = 2*nresp + 3 + nfitness + if axes is not None: + axes = axes.flatten() + if len(axes) < nplots: + raise ValueError(f"axes argument needs {nplots} axis pairs for requested plots; only provides {len(axes)}") + else: + if num_rows is None: + num_rows = (nplots + 1) // num_cols + fig, axes = plt.subplots(num_rows, num_cols, figsize=(num_cols*plot_size, num_rows*plot_size), layout='constrained') + axes = axes.flatten() + plot_num = 0 + + # Draw split response distribution plots + srdp.plot_split_subset_response_distrs(params, plot_size=plot_size, axes=axes) + + # Draw NN Tanimoto distance distribution plots with titles showing fraction of compounds below min_tvt_dist + plot_num += nresp + dset_path = params.dataset_key + split_path = f"{os.path.splitext(dset_path)[0]}_{params.split_strategy}_{params.splitter}_{params.split_uuid}.csv" + dset_df = pd.read_csv(dset_path, dtype={params.id_col: str}) + split_df = pd.read_csv(split_path, dtype={'cmpd_id': str}) + split_stats = csp.SplitStats(dset_df, split_df, smiles_col=params.smiles_col, id_col=params.id_col, + response_cols=params.response_cols) + vvtr_dists = split_stats.dists_tvv + tvtr_dists = split_stats.dists_tvt + vfrac = sum(vvtr_dists <= min_tvt_dist)/len(vvtr_dists) + tfrac = sum(tvtr_dists <= min_tvt_dist)/len(tvtr_dists) + ax = axes[plot_num] + ax = split_stats.dist_hist_train_v_valid_plot(ax=ax) + ax.set_title(f"Valid vs train 
Tanimoto NN distance distribution\nFraction <= {min_tvt_dist} = {vfrac:.3f}") + + plot_num += 1 + ax = axes[plot_num] + ax = split_stats.dist_hist_train_v_test_plot(ax=ax) + ax.set_title(f"Test vs train Tanimoto NN distance distribution\nFraction <= {min_tvt_dist} = {tfrac:.3f}") + + # Draw plots of actual and requested split proportions and counts for each response column after excluding missing values + plot_num += 1 + plot_split_fractions(params, axes[plot_num:]) + plot_num += nresp + + # Plot the evolution over generations of each term in the fitness function optimized by the multitask scaffold splitter. + if nfitness > 0: + plot_fitness_terms(params, axes[plot_num:]) + +def plot_fitness_terms(params, axes): + """Plot the evolution over generations of each term in the fitness function optimized by the multitask scaffold splitter. + + Args: + params (argparse.Namespace or dict): Structure containing dataset and split parameters. The parameters must include + the key `fitness_terms`, which is created by the multitask splitter; usually this is only available when `params` + is derived from the ModelPipeline object used to split the dataset. + + axes (numpy.array of matplotlib.Axes): Axes to draw plots in. + """ + if isinstance(params, dict): + params = parse.wrapper(params) + if isinstance(axes, np.ndarray): + ax = axes[0] + else: + ax = axes + fitness_df = pd.DataFrame(params.fitness_terms) + fitness_df['generation'] = list(range(len(fitness_df))) + fit_long_df = fitness_df.melt(id_vars='generation', value_vars=list(params.fitness_terms.keys()), var_name='Fitness term', value_name='Score') + ax = sns.lineplot(data=fit_long_df, x='generation', y='Score', hue='Fitness term', ax=ax) + ax.set_title('Unweighted fitness scores vs generation') + + +def plot_split_fractions(params, axes): + """Draw plots of actual and requested split proportions and counts for each response column after excluding missing values + + Args: + params (argparse.Namespace or dict): Structure containing dataset and split parameters. + The following parameters are required, if not set to default values: + + | - dataset_key + | - split_uuid + | - split_strategy + | - splitter + | - split_valid_frac + | - split_test_frac + | - num_folds + | - smiles_col + | - response_cols + + axes (matplotlib.Axes): Axes to draw plots in. Must contain at least as many entries as response columns in the dataset. 
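+
+        Returns:
+            None
+
+        Example (illustrative; assumes `params` already holds the split parameters described above):
+
+            fig, axes = plt.subplots(1, len(params.response_cols), squeeze=False)
+            plot_split_fractions(params, axes)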
+ """ + split_dset_df, _ = srdp.get_split_labeled_dataset(params) + axes = axes.flatten() + req_color = pp.test_col + actual_color = pp.train_col + for icol, resp_col in enumerate(params.response_cols): + ss_df = split_dset_df[split_dset_df[resp_col].notna()] + ss_counts = ss_df.subset.value_counts() + total_count = sum(ss_counts) + ss_fracs = ss_counts / total_count + actual_df = pd.DataFrame(dict(counts=ss_counts, fracs=ss_fracs)).reindex(['train', 'valid', 'test']) + req_fracs = dict(train=1-params.split_valid_frac-params.split_test_frac, valid=params.split_valid_frac, test=params.split_test_frac) + req_df = pd.DataFrame(dict(fracs=req_fracs)).reindex(['train', 'valid', 'test']) + req_df['counts'] = np.round(total_count * req_df.fracs).astype(int) + + bar_width = 0.35 + + # Positions of the bars on the x-axis + r1 = np.arange(len(actual_df)) + r2 = r1 + bar_width + + # Plotting the proportions + ax1 = axes[icol] + bars_requested = ax1.bar(r1, req_df.fracs.values, color=req_color, width=bar_width, label='Requested', alpha=0.6) + bars_actual = ax1.bar(r2, actual_df.fracs.values, color=actual_color, width=bar_width, label='Actual', alpha=0.6) + max_count = max(max(actual_df.counts.values), max(req_df.counts.values)) + + # Left Y-axis (proportions) + ax1.set_ylabel('Proportions') + #ax1.tick_params(axis='y') + + # Create the second Y-axis (counts) + ax2 = ax1.twinx() + ax2.set_ylabel('Counts') + #ax2.tick_params(axis='y', labelcolor='k') + ax2.set_ylim(0, max_count*1.1) + + # Adding counts on top of the bars + for bar, count in zip(bars_requested, req_df.counts.values): + height = bar.get_height() + ax1.text(bar.get_x() + bar.get_width() / 2, height, f'{count}', ha='center', va='bottom', color=actual_color) + + for bar, count in zip(bars_actual, actual_df.counts.values): + height = bar.get_height() + ax1.text(bar.get_x() + bar.get_width() / 2, height, f'{count}', ha='center', va='bottom', color=req_color) + + # X-axis labels + plt.xticks([r + bar_width / 2 for r in range(len(actual_df))], actual_df.index.values) + + # Title and legend + if icol == 0: + plt.title(f"Requested and Actual Subset Proportions and Counts\n{resp_col}") + ax1.legend(loc='upper right') + else: + plt.title(f"\n{resp_col}") diff --git a/atomsci/ddm/utils/split_response_dist_plots.py b/atomsci/ddm/utils/split_response_dist_plots.py index d457b814..ef7448db 100644 --- a/atomsci/ddm/utils/split_response_dist_plots.py +++ b/atomsci/ddm/utils/split_response_dist_plots.py @@ -46,6 +46,9 @@ def plot_split_subset_response_distrs(params, axes=None, plot_size=7): else: subset_order = ['train', 'valid', 'test'] + wdist_df = compute_split_subset_wasserstein_distances(params) + + if axes is None: fig, axes = plt.subplots(1, len(params.response_cols), figsize=(plot_size*len(params.response_cols), plot_size)) if len(params.response_cols) == 1: @@ -54,10 +57,18 @@ def plot_split_subset_response_distrs(params, axes=None, plot_size=7): axes = axes.flatten() for colnum, col in enumerate(params.response_cols): ax = axes[colnum] + if params.split_strategy == 'train_valid_test': + tvv_wdist = wdist_df[(wdist_df.split_subset == 'valid') & (wdist_df.response_col == col)].distance.values[0] + tvt_wdist = wdist_df[(wdist_df.split_subset == 'test') & (wdist_df.response_col == col)].distance.values[0] if params.prediction_type == 'regression': ax = sns.kdeplot(data=dset_df, x=col, hue='split_subset', hue_order=subset_order, bw_adjust=0.7, fill=True, common_norm=False, ax=ax) ax.set_title(f"{col} distribution by subset under {split_label}") + if 
params.split_strategy == 'train_valid_test': + ax.set_title(f"{col} distribution by subset under {split_label}\n" \ + f"Wasserstein distances: valid = {tvv_wdist:.3f}, test = {tvt_wdist:.3f}") + else: + ax.set_title(f"{col} distribution by subset under {split_label}") else: pct_active = [] for ss in subset_order: @@ -66,7 +77,12 @@ def plot_split_subset_response_distrs(params, axes=None, plot_size=7): pct_active.append(100*nactive/sum(ss_df[col].notna())) active_df = pd.DataFrame(dict(subset=subset_order, percent_active=pct_active)) ax = sns.barplot(data=active_df, x='subset', y='percent_active', hue='subset', ax=ax) - ax.set_title(f"Percent of {col} = 1 by subset under {split_label}") + + if params.split_strategy == 'train_valid_test': + ax.set_title(f"Percent of {col} = 1 by subset under {split_label}" \ + f"Wasserstein distances: valid = {tvv_wdist:.3f}, test = {tvt_wdist:.3f}") + else: + ax.set_title(f"Percent of {col} = 1 by subset under {split_label}") ax.set_xlabel('') # Restore previous matplotlib color cycle @@ -153,7 +169,7 @@ def get_split_labeled_dataset(params): """ if isinstance(params, dict): params = parse.wrapper(params) - dset_df = pd.read_csv(params.dataset_key, dtype={'compound_id': str}) + dset_df = pd.read_csv(params.dataset_key, dtype={params.id_col: str}) if params.split_strategy == 'k_fold_cv': split_file = f"{os.path.splitext(params.dataset_key)[0]}_{params.num_folds}_fold_cv_{params.splitter}_{params.split_uuid}.csv" else: @@ -167,6 +183,9 @@ def get_split_labeled_dataset(params): split_label = f"{nfolds}-fold {params.splitter} cross-validation split" else: dset_df['split_subset'] = dset_df.subset.values - split_label = f"{params.splitter} split" + if params.splitter == 'multitaskscaffold': + split_label = 'MTSS split' + else: + split_label = f"{params.splitter} split" return dset_df, split_label
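
A minimal end-to-end sketch of how the new diagnostics can be driven from a ModelPipeline (the dataset path, ID, SMILES and response column names here are hypothetical, and the usual split-only workflow via ModelPipeline.split_dataset is assumed):

    from atomsci.ddm.pipeline.model_pipeline import ModelPipeline
    from atomsci.ddm.pipeline import parameter_parser as parse
    from atomsci.ddm.utils import split_diagnostic_plots as sdp

    # Hypothetical split-only parameters for a multitask scaffold split
    split_params = parse.wrapper(dict(
        dataset_key='my_dataset.csv',
        id_col='compound_id',
        smiles_col='base_rdkit_smiles',
        response_cols=['pIC50'],
        previously_split=False,
        split_only=True,
        splitter='multitaskscaffold',
        split_strategy='train_valid_test',
        split_valid_frac=0.15,
        split_test_frac=0.15))

    pipe = ModelPipeline(split_params)
    pipe.split_dataset()

    # Passing the pipeline (rather than a bare parameter dict) also enables the GA fitness term plot,
    # since plot_split_diagnostics pulls fitness_terms and split_uuid from the pipeline's splitter and dataset.
    sdp.plot_split_diagnostics(pipe, min_tvt_dist=0.3)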