diff --git a/atomsci/ddm/pipeline/GeneticAlgorithm.py b/atomsci/ddm/pipeline/GeneticAlgorithm.py index 531b85ba..98611d77 100644 --- a/atomsci/ddm/pipeline/GeneticAlgorithm.py +++ b/atomsci/ddm/pipeline/GeneticAlgorithm.py @@ -49,12 +49,14 @@ def __init__(self, self.fitness_func = fitness_func self.crossover_func = crossover_func self.mutate_func = mutate_func - self.parallel_grade_population() + self.serial_grade_population() + #self.parallel_grade_population() - def parallel_grade_population(self): - """ Grade the population and save the scores - Updates the order of self.pop and self.pop_scores + def serial_grade_population(self): + """ Scores the chromosomes in the current population and sorts them in decreasing score order. + Saves the sorted scores in self.pop_scores. Not multithreaded; surprisingly, this runs faster + than the multithreaded function parallel_grade_scores. Parameters ---------- @@ -62,16 +64,37 @@ def parallel_grade_population(self): Returns ------- - Nothing + None. As a side effect, sorts the chromosomes in self.pop and updates the scores in self.pop_scores. + """ + fitnesses = [] + for chrom in self.pop: + fitnesses.append(self.fitness_func(chrom)) + pairs = sorted(zip(fitnesses, self.pop), reverse=True) + self.pop = [chrome for fitness, chrome in pairs] + self.pop_scores = [fitness for fitness, chrome in pairs] + + + def parallel_grade_population(self): + """ Scores the chromosomes in the current population and sorts them in decreasing score order. + Saves the sorted scores in self.pop_scores. + + Although this does the same thing in multiple threads as the single-threaded function + serial_grade_population, it seems to run much slower, at least for multitask scaffold splits + with 100 chromosomes. + + Parameters + ---------- + None + + Returns + ------- + None """ pool = multiprocessing.Pool(processes=N_PROCS) fitnesses = pool.map(self.fitness_func, self.pop) pool.close() pool.join() - pairs = list(zip(fitnesses, self.pop)) - - pairs.sort(key=lambda x: x[0], reverse=True) - + pairs = sorted(zip(fitnesses, self.pop), reverse=True) self.pop = [chrome for fitness, chrome in pairs] self.pop_scores = [fitness for fitness, chrome in pairs] @@ -90,10 +113,7 @@ def select_parents(self) -> List[List[Any]]: parents: List[List[Any]] A list of chromosomes that will be parents for the next generation """ - self.parallel_grade_population() - - parents = [chrome for chrome in self.pop[:self.num_parents]] - return parents + return self.pop[:self.num_parents] def iterate(self, num_generations: int): """ Iterates the genetic algorithm num_generations @@ -130,12 +150,13 @@ def step(self, print_timings: bool = False): start = timeit.default_timer() i = timeit.default_timer() + # select parents using rank selection parents = self.select_parents() if print_timings: print('\tfind parents %0.2f min'%((timeit.default_timer()-i)/60)) - # select parents using rank selection i = timeit.default_timer() + # Generate new population by crossing parent chromosomes new_pop = self.crossover_func(parents, self.num_pop) if print_timings: print('\tcrossover %0.2f min'%((timeit.default_timer()-i)/60)) @@ -143,6 +164,9 @@ def step(self, print_timings: bool = False): # mutate population i = timeit.default_timer() self.pop = self.mutate_func(new_pop) + # Compute scores for new chromosomes and sort population by score + self.serial_grade_population() + #self.parallel_grade_population() if print_timings: print('\tmutate %0.2f min'%((timeit.default_timer()-i)/60)) print('total %0.2f 
min'%((timeit.default_timer()-start)/60)) diff --git a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py index 67e2a54f..9a4fbb18 100644 --- a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py +++ b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py @@ -1,4 +1,5 @@ import pdb +import logging import argparse import random import timeit @@ -24,6 +25,9 @@ from atomsci.ddm.pipeline import dist_metrics from atomsci.ddm.pipeline import GeneticAlgorithm as ga +logging.basicConfig(format='%(asctime)-15s %(message)s') +logger = logging.getLogger('ATOM') + def _generate_scaffold_hists(scaffold_sets: List[np.ndarray], w: np.array) -> np.array: """Counts the number of labelled samples per task per scaffold @@ -96,8 +100,8 @@ def smush_small_scaffolds(scaffolds: List[Set[int]], else: new_scaffolds.append(scaffold) - print('new scaffold lengths') - print([len(s) for s in new_scaffolds]) + logger.debug('new scaffold lengths') + logger.debug([len(s) for s in new_scaffolds]) new_scaffolds = [np.array(list(s)) for s in new_scaffolds] return new_scaffolds @@ -133,7 +137,7 @@ def calc_ecfp(smiles: List[str], fprints = [y for x in ecfps for y in x] #Flatten results else: mols = [Chem.MolFromSmiles(s) for s in smiles] - fprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in mols] + fprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 3, 1024) for mol in mols] return fprints @@ -144,9 +148,9 @@ def dist_smiles_from_ecfp(ecfp1: List[rdkit.DataStructs.cDataStructs.ExplicitBit Parameters ---------- ecfp1: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect], - A list of ECFP finger prints + A list of ECFP fingerprints ecfp2: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect] - A list of ECFP finger prints + A list of ECFP fingerprints Returns ------- @@ -159,57 +163,6 @@ def dist_smiles_from_ecfp(ecfp1: List[rdkit.DataStructs.cDataStructs.ExplicitBit return cd.calc_summary(dist_metrics.tanimoto(ecfp1, ecfp2), calc_type='nearest', num_nearest=1, within_dset=False) -def _generate_scaffold_dist_matrix(scaffold_lists: List[np.ndarray], - ecfp_features: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect]) -> np.ndarray: - """Returns a nearest neighbors distance matrix between each scaffold. - - The distance between two scaffolds is defined as the - the distance between the two closest compounds between the two - scaffolds. - - TODO: Ask Stewart: Why did he change to using the median instead of the min? - - Parameters - ---------- - scaffold_lists: List[np.ndarray] - List of scaffolds. A scaffold is a set of indices into ecfp_features - ecfp_features: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect] - List of ecfp features, one for each compound in the dataset - - Returns - ------- - dist_mat: np.ndarray - Distance matrix, symmetric. 
- """ - print('start generating big dist mat') - start = timeit.default_timer() - # First compute the full matrix of distances between all pairs of ECFPs - dmat = dist_metrics.tanimoto(ecfp_features) - - mat_shape = (len(scaffold_lists), len(scaffold_lists)) - scaff_dist_mat = np.zeros(mat_shape) - for i, scaff1 in tqdm(enumerate(scaffold_lists)): - scaff1_rows = scaff1.reshape((-1,1)) - for j, scaff2 in enumerate(scaffold_lists[:i]): - if i == j: - continue - - dists = dmat[scaff1_rows, scaff2].flatten() - #min_dist = np.median(dists) - # ksm: Change back to min and see what happens - min_dist = np.min(dists) - - if min_dist==0: - print("two scaffolds match exactly?!?", i, j) - print(len(set(scaff2).intersection(set(scaff1)))) - - scaff_dist_mat[i,j] = min_dist - scaff_dist_mat[j,i] = min_dist - - print("finished scaff dist mat: %0.2f min"%((timeit.default_timer()-start)/60)) - return scaff_dist_mat - - class MultitaskScaffoldSplitter(Splitter): """MultitaskScaffoldSplitter Splitter class. @@ -261,6 +214,65 @@ def generate_scaffolds(self, return scaffold_sets + def _generate_scaffold_dist_matrix(self): + """Computes matrices used by fitness functions that score splits according + to the dissimilarity between training and test compound structures. One is + a symmetric matrix of nearest neighbor Tanimoto distances between pairs of + scaffolds (i.e., between the most similar compounds in the two scaffolds). + This matrix is used to compute the `scaffold_diff_fitness` function. + + The other is a nonsymmetric matrix of boolean vectors `has_nn[i,j]`, of length + equal to the number of compounds in scaffold `i`, indicating whether each + compound has a neighbor in scaffold `j` nearer than some threshold Tanimoto + distance `dist_thresh`. dist_thresh defaults to 0.3, but could be parameterized. + The has_nn matrix is used to compute the `far_frac_fitness` function, which is + a more robust measurement of the train/test set dissimilarity. 
+ """ + scaffold_lists = self.ss + ecfp_features = self.ecfp_features + + # First compute the full matrix of distances between all pairs of ECFPs + dmat = dist_metrics.tanimoto(ecfp_features) + + mat_shape = (len(scaffold_lists), len(scaffold_lists)) + scaff_dist_mat = np.zeros(mat_shape) + has_near_neighbor_mat = np.empty(mat_shape, dtype=object) + + for i, scaff1 in enumerate(scaffold_lists): + scaff1_rows = scaff1.reshape((-1,1)) + for j, scaff2 in enumerate(scaffold_lists[:i]): + if i == j: + continue + + scaff1_dists = dmat[scaff1_rows, scaff2] + min_dist = np.min(scaff1_dists.flatten()) + + if min_dist==0: + logger.info(f"Scaffolds {i} and {j} have at least one ECFP in common") + for k in scaff1: + for l in scaff2: + if dmat[k,l] == 0: + logger.debug(f"\tcompound {k} in scaffold {i}, compound {l} in scaffold {j}") + logger.debug(f"\tSMILES {k}: {self.smiles[k]}") + logger.debug(f"\tSMILES {l}: {self.smiles[l]}\n") + + + scaff_dist_mat[i,j] = min_dist + scaff_dist_mat[j,i] = min_dist + + # Identify the compounds in scaff1 with a neighbor in scaff2 closer than dist_thresh + has_near_neighbor_mat[i,j] = np.array(np.min(scaff1_dists, axis=1) < self.dist_thresh) + + # Identify the compounds in scaff2 with a neighbor in scaff1 closer than dist_thresh + scaff2_rows = scaff2.reshape((-1,1)) + scaff2_dists = dmat[scaff2_rows, scaff1] + has_near_neighbor_mat[j,i] = np.array(np.min(scaff2_dists, axis=1) < self.dist_thresh) + + + self.scaff_scaff_distmat = scaff_dist_mat + self.has_near_neighbor_mat = has_near_neighbor_mat + + def expand_scaffolds(self, scaffold_list: List[int]) -> List[int]: """Turns a list of scaffold indices into a list of compound indices @@ -350,51 +362,74 @@ def scaffold_diff_fitness(self, return min_dist - def ratio_fitness_old(self, split_chromosome: List[str]) -> float: - """Calculates a fitness score based on how accurately divided training/test/validation. + def far_frac_fitness(self, + split_chromosome: List[str], + train_part: str, + test_part: str) -> float: + """Grades a split according to the fraction of valid/test compounds with + nearest training set compounds further than some threshold. + + Grades the quality of the split based on which scaffolds were alloted to + which partitions. The score is measured as the fraction of compounds in + `test_part` with nearest neighbors in `train_part` at Tanimoto distance + `self.dist_thresh` or greater. + Parameters ---------- - List[str]: split_chromosome - A list of strings, index i contains a string, 'train', 'valid', 'test' - which determines the partition that scaffold belongs to + split_chromosome: List[str] + A split represented as a list of partition names. Index i in the + chromosome contains the partition for scaffold i. + train_part: str + The name of the partition to be treated as the training subset + test_part: str + The name of the partition to be treated as the test subset + Returns ------- - List[str] - A list of strings, index i contains a string, 'train', 'valid', 'test' - which determines the partition that scaffold belongs to + score: float + Floating point value beteween 0-1. 1 is the best score and 0 is the worst """ - start = timeit.default_timer() - total_counts = np.sum(self.dataset.w, axis=0) - subset_counts = [np.sum(self.dataset.w[subset], axis=0) for subset in \ - self.split_chromosome_to_compound_split(split_chromosome)] + # TODO: Eventually, replace strings in chromosome with integers indicating the partition + # for each scaffold. - # subset order goes train, valid, test. 
just consider train for the moment - subset_ratios = [subset_count/total_counts for subset_count in subset_counts] - #print("mean: %0.2f, median: %0.2f, std: %0.2f"%(np.mean(subset_ratios[0]), np.median(subset_ratios[0]), np.std(subset_ratios[0]))) - - # fitness of split is the size of the smallest training dataset - # ratio_fit = min(subset_ratios[0]) # this resulted in a split with 0 test. shoulda seen that coming - num_tasks = self.dataset.w.shape[1] - # imagine the perfect split is a point in 3*num_tasks space. - target_split = np.concatenate([[self.frac_train]*num_tasks, - [self.frac_valid]*num_tasks]) - # this is the current split also on 3*num_tasks space - current_split = np.concatenate(subset_ratios[:2]) - # if any partition is 0, then this split fails - if min(current_split) == 0: + train_scaffolds = [i for i, part in enumerate(split_chromosome) if part==train_part] + test_scaffolds = [i for i, part in enumerate(split_chromosome) if part==test_part] + + # if a partition is completely empty, return 0 + if len(train_scaffolds) == 0 or len(test_scaffolds) == 0: return 0 - # worst possible distance to normalize this between 0 and 1 - worst_distance = np.linalg.norm(np.ones(num_tasks*2)) - ratio_fit = 1 - np.linalg.norm(target_split-current_split)/worst_distance - #print("\tratio_fitness: %0.2f min"%((timeit.default_timer()-start)/60)) - return ratio_fit + # Compute the "far fraction": For each test scaffold S, OR together the boolean vectors + # from has_near_neighbor_mat for the training scaffolds indicating whether each compound in + # S has a neighbor in the train scaffold closer than Tanimoto distance dist_thresh. Sum + # the results over compounds and test scaffolds and divide by the test set size to get the + # "near fraction"; the far fraction is 1 minus the near fraction. + + near_count = 0 + total_count = 0 + for test_ind in test_scaffolds: + has_nn = None + for train_ind in train_scaffolds: + assert(not (train_ind == test_ind)) + if has_nn is None: + has_nn = self.has_near_neighbor_mat[test_ind, train_ind] + else: + has_nn |= self.has_near_neighbor_mat[test_ind, train_ind] + near_count += sum(has_nn) + total_count += len(has_nn) + + far_frac = 1 - near_count/total_count + #print(f"far_frac_fitness: near_count {near_count}, total_count {total_count}, far_frac {far_frac}") + return far_frac + def ratio_fitness(self, split_chromosome: List[str]) -> float: - """Calculates a fitness score based on how accurately divided training/test/validation. + """Calculates a fitness score based on how well the subset proportions for each task, taking + only labeled compounds into account, match the proportions requested by the user. - Treats the min,median,max of each of the three partitions as a 9D point and uses - euclidean distance to measure the distance to that point + The score is determined by combining the min, median and max over tasks of the subset fractions + into a 9-dimensional vector and computing its Euclidean distance from an ideal vector having + all the fractions as requested. Parameters ---------- @@ -407,15 +442,15 @@ def ratio_fitness(self, split_chromosome: List[str]) -> float: float A float between 0 and 1. 1 best 0 is worst """ - start = timeit.default_timer() + #start = timeit.default_timer() # total_counts is the number of labels per task total_counts = np.sum(self.dataset.w, axis=0) - # subset_counts is number of labels per task per subset + # subset_counts is number of labels per task per subset. 
subset_counts = [np.sum(self.dataset.w[subset], axis=0) for subset in \ self.split_chromosome_to_compound_split(split_chromosome)] - # subset order goes train, valid, test. just consider train for the moment + # subset order goes train, valid, test subset_ratios = [subset_count/total_counts for subset_count in subset_counts] # imagine the perfect split is a point in 9D space. For each subset we measure @@ -433,6 +468,7 @@ def ratio_fitness(self, split_chromosome: List[str]) -> float: return 0 # worst possible distance to normalize this between 0 and 1 worst_distance = np.linalg.norm(np.ones(len(target_split))) + worst_distance = np.sqrt(len(target_split)) ratio_fit = 1 - np.linalg.norm(target_split-current_split)/worst_distance #print("\tratio_fitness: %0.2f min"%((timeit.default_timer()-start)/60)) @@ -456,7 +492,7 @@ def response_distr_fitness(self, split_chromosome: List[str]) -> float: the train subset response value distributions, averaged over tasks. One means the distributions perfectly match. """ - start = timeit.default_timer() + #start = timeit.default_timer() dist_sum = 0.0 ntasks = self.dataset.y.shape[1] train_ind, valid_ind, test_ind = self.split_chromosome_to_compound_split(split_chromosome) @@ -495,18 +531,56 @@ def grade(self, split_chromosome: List[str]) -> float: float A float between 0 and 1. 1 best 0 is worst """ + + """ fitness = 0.0 # Only call the functions for each fitness term if their weight is nonzero if self.diff_fitness_weight_tvt != 0.0: - fitness += self.diff_fitness_weight_tvt*self.scaffold_diff_fitness(split_chromosome, 'train', 'test') + #score = self.scaffold_diff_fitness(split_chromosome, 'train', 'test') + score = self.far_frac_fitness(split_chromosome, 'train', 'test') + fitness += self.diff_fitness_weight_tvt*score if self.diff_fitness_weight_tvv != 0.0: - fitness += self.diff_fitness_weight_tvv*self.scaffold_diff_fitness(split_chromosome, 'train', 'valid') + #score = self.scaffold_diff_fitness(split_chromosome, 'train', 'valid') + score = self.far_frac_fitness(split_chromosome, 'train', 'valid') + fitness += self.diff_fitness_weight_tvv*score if self.ratio_fitness_weight != 0.0: - fitness += self.ratio_fitness_weight*self.ratio_fitness(split_chromosome) + score = self.ratio_fitness(split_chromosome) + fitness += self.ratio_fitness_weight*score if self.response_distr_fitness_weight != 0.0: - fitness += self.response_distr_fitness_weight*self.response_distr_fitness(split_chromosome) + score = self.response_distr_fitness(split_chromosome) + fitness += self.response_distr_fitness_weight*score + # Normalize the score to the range [0,1] + fitness /= (self.diff_fitness_weight_tvt + self.diff_fitness_weight_tvv + self.ratio_fitness_weight + + self.response_distr_fitness_weight) return fitness + """ + fitness_scores = self.get_fitness_scores(split_chromosome) + return fitness_scores['total_fitness'] + + def get_fitness_scores(self, split_chromosome): + fitness_scores = {} + total_fitness = 0.0 + # Only call the functions for each fitness term if their weight is nonzero + if self.diff_fitness_weight_tvt != 0.0: + #fitness_scores['test_scaf_dist'] = self.scaffold_diff_fitness(split_chromosome, 'train', 'test') + fitness_scores['test_scaf_dist'] = self.far_frac_fitness(split_chromosome, 'train', 'test') + total_fitness += self.diff_fitness_weight_tvt * fitness_scores['test_scaf_dist'] + if self.diff_fitness_weight_tvv != 0.0: + #fitness_scores['valid_scaf_dist'] = self.scaffold_diff_fitness(split_chromosome, 'train', 'valid') + fitness_scores['valid_scaf_dist'] 
= self.far_frac_fitness(split_chromosome, 'train', 'valid') + total_fitness += self.diff_fitness_weight_tvv * fitness_scores['valid_scaf_dist'] + if self.ratio_fitness_weight != 0.0: + fitness_scores['ratio'] = self.ratio_fitness(split_chromosome) + total_fitness += self.ratio_fitness_weight * fitness_scores['ratio'] + if self.response_distr_fitness_weight != 0.0: + fitness_scores['response_distr'] = self.response_distr_fitness(split_chromosome) + total_fitness += self.response_distr_fitness_weight * fitness_scores['response_distr'] + # Normalize the score to the range [0,1] + total_fitness /= (self.diff_fitness_weight_tvt + self.diff_fitness_weight_tvv + self.ratio_fitness_weight + + self.response_distr_fitness_weight) + fitness_scores['total_fitness'] = total_fitness + return fitness_scores def init_scaffolds(self, dataset: Dataset) -> None: @@ -528,10 +602,10 @@ def init_scaffolds(self, # list of lists. one list per scaffold big_ss = self.generate_scaffolds(dataset) - # using the same stragetgy as scaffold split, combine the scaffolds + # using the same strategy as scaffold split, combine the scaffolds # together until you have roughly 100 scaffold sets self.ss = smush_small_scaffolds(big_ss, num_super_scaffolds=self.num_super_scaffolds) - print(f'num_super_scaffolds: {len(self.ss)}, {self.num_super_scaffolds}') + logger.info(f"Requested {self.num_super_scaffolds} super scaffolds, produced {len(self.ss)} from {len(big_ss)} original scaffolds") # rows is the number of scaffolds # columns is number of tasks @@ -550,8 +624,10 @@ def split(self, num_super_scaffolds: int = 20, num_pop: int = 100, num_generations: int=30, + dist_thresh: float = 0.3, print_timings: bool = False, - log_every_n: int = 1000) -> Tuple: + early_stopping_generations = 25, + log_every_n: int = 10) -> Tuple: """Creates a split for the given datset This split splits the dataset into a list of super scaffolds then @@ -609,6 +685,7 @@ def split(self, self.num_super_scaffolds = num_super_scaffolds self.num_pop = num_pop self.num_generations = num_generations + self.dist_thresh = dist_thresh self.frac_train = frac_train self.frac_valid = frac_valid @@ -618,38 +695,47 @@ def split(self, self.init_scaffolds(self.dataset) # ecpf features + self.smiles = dataset.ids self.ecfp_features = calc_ecfp(dataset.ids) # calculate ScaffoldxScaffold distance matrix - start = timeit.default_timer() if (self.diff_fitness_weight_tvv > 0.0) or (self.diff_fitness_weight_tvt > 0.0): - self.scaff_scaff_distmat = _generate_scaffold_dist_matrix(self.ss, self.ecfp_features) - #print('scaffold dist mat %0.2f min'%((timeit.default_timer()-start)/60)) + self._generate_scaffold_dist_matrix() # initial population population = [] for i in range(self.num_pop): - start = timeit.default_timer() + #start = timeit.default_timer() split_chromosome = self._split(frac_train=frac_train, frac_valid=frac_valid, frac_test=frac_test) - #print("per_loop: %0.2f min"%((timeit.default_timer()-start)/60)) population.append(split_chromosome) + self.fitness_terms = {} gene_alg = ga.GeneticAlgorithm(population, self.grade, ga_crossover, ga_mutate) #gene_alg.iterate(num_generations) + best_ever = None + best_ever_fit = -np.inf + best_gen = 0 for i in range(self.num_generations): gene_alg.step(print_timings=print_timings) best_fitness = gene_alg.pop_scores[0] - print("step %d: best_fitness %0.2f"%(i, best_fitness)) - #print("%d: %0.2f"%(i, gene_alg.grade_population()[0][0])) - - best_fit = gene_alg.pop_scores[0] - best = gene_alg.pop[0] - - #print('best ever fitness 
%0.2f'%best_ever_fit) - result = self.split_chromosome_to_compound_split(best) + if best_fitness > best_ever_fit: + best_ever = gene_alg.pop[0] + best_ever_fit = best_fitness + best_gen = i + score_dict = self.get_fitness_scores(best_ever) + for term, score in score_dict.items(): + self.fitness_terms.setdefault(term, []).append(score) + if i % log_every_n == 0: + logger.info(f"generation {i}: Best fitness {best_fitness:.3f}, best ever {best_ever_fit:.3f} at generation {best_gen}") + if (best_fitness <= best_ever_fit) and (i - best_gen >= early_stopping_generations): + logger.info(f"No fitness improvement after {early_stopping_generations} generations") + break + + logger.info(f"Final best fitness score: {best_ever_fit:.3f} at generation {best_gen}") + result = self.split_chromosome_to_compound_split(best_ever) return result def _split(self, @@ -738,7 +824,8 @@ def train_valid_test_split(self, train_dir: Optional[str] = None, valid_dir: Optional[str] = None, test_dir: Optional[str] = None, - log_every_n: int = 1000) -> Tuple[Dataset, Dataset, Dataset]: + dist_thresh: float = 0.3, + log_every_n: int = 10) -> Tuple[Dataset, Dataset, Dataset]: """Creates a split for the given datset This split splits the dataset into a list of super scaffolds then @@ -805,6 +892,7 @@ def train_valid_test_split(self, diff_fitness_weight_tvv=diff_fitness_weight_tvv, ratio_fitness_weight=ratio_fitness_weight, response_distr_fitness_weight=response_distr_fitness_weight, num_super_scaffolds=num_super_scaffolds, num_pop=num_pop, num_generations=num_generations, + dist_thresh=0.3, print_timings=False) if train_dir is None: diff --git a/atomsci/ddm/pipeline/compare_models.py b/atomsci/ddm/pipeline/compare_models.py index 2b9a445d..ce734c1c 100644 --- a/atomsci/ddm/pipeline/compare_models.py +++ b/atomsci/ddm/pipeline/compare_models.py @@ -1,5 +1,5 @@ -"""Functions for comparing and visualizing model performance. Most of these functions rely on ATOM's model tracker and -datastore services, which are not part of the standard AMPL installation, but a few functions will work on collections of +"""Functions for comparing and visualizing model performance. Some of these functions rely on ATOM's model tracker and +datastore services, which are not part of the standard AMPL installation, but several functions will work on collections of models saved as local files. 
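As an illustrative usage sketch of the local-file path mentioned above (the results directory is a placeholder):

    from atomsci.ddm.pipeline import compare_models as cm

    # Collect training performance metrics for all models saved under a local results
    # directory; with this patch, an empty DataFrame is returned when no models are found.
    perf_df = cm.get_filesystem_perf_results('/path/to/result_dir', pred_type='regression')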
""" @@ -163,6 +163,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg model_type_list = [] max_epochs_list = [] learning_rate_list = [] + weight_decay_penalty_list = [] + weight_decay_penalty_type_list = [] dropouts_list = [] layer_sizes_list = [] featurizer_list = [] @@ -172,6 +174,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg rf_max_depth_list = [] xgb_learning_rate_list = [] xgb_gamma_list = [] + xgb_alpha_list = [] + xgb_lambda_list = [] xgb_max_depth_list = [] xgb_colsample_bytree_list = [] xgb_subsample_list = [] @@ -218,6 +222,9 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg max_epochs_list.append(nn_params['max_epochs']) best_epoch_list.append(nn_params['best_epoch']) learning_rate_list.append(nn_params['learning_rate']) + weight_decay_penalty_list.append(nn_params['weight_decay_penalty']) + weight_decay_penalty_type_list.append(nn_params['weight_decay_penalty_type']) + learning_rate_list.append(nn_params['learning_rate']) layer_sizes_list.append(','.join(['%d' % s for s in nn_params['layer_sizes']])) dropouts_list.append(','.join(['%.2f' % d for d in nn_params['dropouts']])) rf_estimators_list.append(nan) @@ -225,6 +232,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg rf_max_depth_list.append(nan) xgb_learning_rate_list.append(nan) xgb_gamma_list.append(nan) + xgb_alpha_list.append(nan) + xgb_lambda_list.append(nan) xgb_max_depth_list.append(nan) xgb_colsample_bytree_list.append(nan) xgb_subsample_list.append(nan) @@ -239,10 +248,14 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg max_epochs_list.append(nan) best_epoch_list.append(nan) learning_rate_list.append(nan) + weight_decay_penalty_list.append(nan) + weight_decay_penalty_type_list.append(nan) layer_sizes_list.append(nan) dropouts_list.append(nan) xgb_learning_rate_list.append(nan) xgb_gamma_list.append(nan) + xgb_alpha_list.append(nan) + xgb_lambda_list.append(nan) xgb_max_depth_list.append(nan) xgb_colsample_bytree_list.append(nan) xgb_subsample_list.append(nan) @@ -256,10 +269,14 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg max_epochs_list.append(nan) best_epoch_list.append(nan) learning_rate_list.append(nan) + weight_decay_penalty_list.append(nan) + weight_decay_penalty_type_list.append(nan) layer_sizes_list.append(nan) dropouts_list.append(nan) xgb_learning_rate_list.append(xgb_params["xgb_learning_rate"]) xgb_gamma_list.append(xgb_params["xgb_gamma"]) + xgb_alpha_list.append(xgb_params.get("xgb_alpha", 0.0)) + xgb_lambda_list.append(xgb_params.get("xgb_lambda", 1.0)) xgb_max_depth_list.append(xgb_params["xgb_max_depth"]) xgb_colsample_bytree_list.append(xgb_params["xgb_colsample_bytree"]) xgb_subsample_list.append(xgb_params["xgb_subsample"]) @@ -277,6 +294,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg max_epochs=max_epochs_list, best_epoch=best_epoch_list, learning_rate=learning_rate_list, + weight_decay_penalty_type=weight_decay_penalty_type_list, + weight_decay_penalty=weight_decay_penalty_list, layer_sizes=layer_sizes_list, dropouts=dropouts_list, rf_estimators=rf_estimators_list, @@ -284,6 +303,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg rf_max_depth=rf_max_depth_list, xgb_learning_rate = xgb_learning_rate_list, xgb_gamma = xgb_gamma_list, + xgb_alpha = xgb_alpha_list, + xgb_lambda = xgb_lambda_list, xgb_max_depth 
= xgb_max_depth_list, xgb_colsample_bytree = xgb_colsample_bytree_list, xgb_subsample = xgb_subsample_list, @@ -307,15 +328,15 @@ def extract_model_and_feature_parameters(metadata_dict, keep_required=True): Returns: dictionary containing featurizer and model parameters. Most contain the following - keys. ['max_epochs', 'best_epoch', 'learning_rate', 'layer_sizes', 'dropouts', - 'rf_estimators', 'rf_max_features', 'rf_max_depth', 'xgb_gamma', 'xgb_learning_rate', + keys. ['max_epochs', 'best_epoch', 'learning_rate', 'weight_decay_penalty_type', 'weight_decay_penalty', 'layer_sizes', 'dropouts', + 'rf_estimators', 'rf_max_features', 'rf_max_depth', 'xgb_gamma', 'xgb_alpha', 'xgb_lambda', 'xgb_learning_rate', 'xgb_max_depth', 'xgb_colsample_bytree', 'xgb_subsample', 'xgb_n_estimators', 'xgb_min_child_weight', 'featurizer_parameters_dict', 'model_parameters_dict'] """ model_params = metadata_dict['model_parameters'] model_type = model_params['model_type'] - required = ['max_epochs', 'best_epoch', 'learning_rate', 'layer_sizes', 'dropouts', - 'rf_estimators', 'rf_max_features', 'rf_max_depth', 'xgb_gamma', 'xgb_learning_rate', + required = ['max_epochs', 'best_epoch', 'learning_rate', 'layer_sizes', 'dropouts', 'weight_decay_penalty_type', 'weight_decay_penalty', + 'rf_estimators', 'rf_max_features', 'rf_max_depth', 'xgb_gamma', 'xgb_alpha', 'xgb_lambda', 'xgb_learning_rate', 'xgb_max_depth', 'xgb_colsample_bytree', 'xgb_subsample', 'xgb_n_estimators', 'xgb_min_child_weight' ] @@ -328,6 +349,9 @@ def extract_model_and_feature_parameters(metadata_dict, keep_required=True): model_info['max_epochs'] = nn_params['max_epochs'] model_info['best_epoch'] = nn_params['best_epoch'] model_info['learning_rate'] = nn_params['learning_rate'] + model_info['weight_decay_penalty_type'] = nn_params['weight_decay_penalty_type'] + model_info['weight_decay_penalty'] = nn_params['weight_decay_penalty'] + model_info['learning_rate'] = nn_params['learning_rate'] model_info['layer_sizes'] = ','.join(['%d' % s for s in nn_params['layer_sizes']]) model_info['dropouts'] = ','.join(['%.2f' % d for d in nn_params['dropouts']]) elif model_type == 'RF': @@ -338,6 +362,8 @@ def extract_model_and_feature_parameters(metadata_dict, keep_required=True): elif model_type == 'xgboost': xgb_params = metadata_dict['xgb_specific'] model_info['xgb_gamma'] = xgb_params['xgb_gamma'] + model_info['xgb_alpha'] = xgb_params.get('xgb_alpha', 0.0) + model_info['xgb_lambda'] = xgb_params.get('xgb_lambda', 1.0) model_info['xgb_learning_rate'] = xgb_params['xgb_learning_rate'] model_info['xgb_max_depth'] = xgb_params['xgb_max_depth'] model_info['xgb_colsample_bytree'] = xgb_params['xgb_colsample_bytree'] @@ -979,6 +1005,8 @@ def get_filesystem_perf_results(result_dir, pred_type='classification', expand=T print(f"Can't access model {dirpath}") print("Found data for %d models under %s" % (len(model_list), result_dir)) + if len(model_list) == 0: + return pd.DataFrame() # build dictonary of tarball names tar_dict = {os.path.basename(tf):tf for tf in tar_list} @@ -1511,6 +1539,8 @@ def get_summary_metadata_table(uuids, collections=None): 'Layer Sizes': nn_params['layer_sizes'], 'Optimizer': nn_params['optimizer_type'], 'Learning Rate': nn_params['learning_rate'], + 'Weight decay penalty type': nn_params['weight_decay_penalty_type'], + 'Weight decay penalty': nn_params['weight_decay_penalty'], 'Dropouts': nn_params['dropouts'], 'Best Epoch (Max)': '%i (%i)' % (nn_params['best_epoch'],nn_params['max_epochs']), 'Collection': collection_name, @@ -1551,6 
+1581,8 @@ def get_summary_metadata_table(uuids, collections=None): 'Model Type (Featurizer)': '%s (%s)' % (mdl_params['model_type'],featurizer), 'XGB learning rate': xgb_params['xgb_learning_rate'], 'Gamma': xgb_params['xgb_gamma'], + 'Alpha': xgb_params.get('xgb_alpha', 0.0), + 'Lambda': xgb_params.get('xgb_lambda', 1.0), 'XGB max depth': xgb_params['xgb_max_depth'], 'Column sample fraction': xgb_params['xgb_colsample_bytree'], 'Row subsample fraction': xgb_params['xgb_subsample'], @@ -2091,7 +2123,7 @@ def get_multitask_perf_from_tracker(collection_name, response_cols=None, expand_ 'weight_decay_penalty', 'weight_decay_penalty_type', 'weight_init_stddevs', 'splitter', 'split_uuid', 'split_test_frac', 'split_valid_frac', 'smiles_col', 'id_col', 'feature_transform_type', 'response_cols', 'response_transform_type', 'num_model_tasks', - 'rf_estimators', 'rf_max_depth', 'rf_max_features', 'xgb_gamma', 'xgb_learning_rate', + 'rf_estimators', 'rf_max_depth', 'rf_max_features', 'xgb_gamma', 'xgb_alpha', 'xgb_lambda', 'xgb_learning_rate', 'xgb_max_depth', 'xgb_colsample_bytree', 'xgb_subsample', 'xgb_n_estimators', 'xgb_min_child_weight', ] keepcols.extend(alldat.columns[alldat.columns.str.contains('best')]) diff --git a/atomsci/ddm/pipeline/feature_importance.py b/atomsci/ddm/pipeline/feature_importance.py index 38135f6c..25be1c84 100755 --- a/atomsci/ddm/pipeline/feature_importance.py +++ b/atomsci/ddm/pipeline/feature_importance.py @@ -525,3 +525,33 @@ def _calc_cluster_permutation_scores(estimator, X, y, col_indices, random_state, +# =================================================================================================== +def plot_nn_feature_weights_vs_epoch(pipeline, axes=None, plot_width=20, plot_height=15): + """Plot a line graph of summed weight absolute values from neural network feature nodes to first hidden layer vs + training epoch. This plot is intended to show which features have the most influence on the final + prediction output. It is also used to show the effect of increasing the weight_decay_penalty + parameter, which _should_ result in decreasing the weights on less important features. + + Args: + pipeline (ModelPipeline): The pipeline object from a neural network model that was trained in the + current Python session. + axes (matplotlib.Axes): An Axes object to draw the plot in. If none is provided, the function will + create one in a figure with the specified width and height. 
+ plot_width (float): Figure width + plot_height (float): Figure height + + Returns: + None + """ + if axes is None: + fig, axes = plt.subplots(figsize=(plot_width, plot_height)) + fig.suptitle("Sums of feature node : hidden layer absolute weights vs epoch") + + wt_df = pipeline.model_wrapper.feature_weights_df + max_epoch = len(wt_df) + best_epoch = pipeline.model_wrapper.best_epoch + for feat in pipeline.data.featurization.get_feature_columns(): + sns.lineplot(data=wt_df, x='epoch', y=feat, ax=axes) + axes.text(max_epoch + 4, wt_df[feat].values[max_epoch-1], feat) + axes.set_ylabel("Sum over 1st hidden layer nodes of feature weight absolute values") + axes.axvline(best_epoch, linestyle='--', color='blue') \ No newline at end of file diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py index b070e8c0..614d1c4c 100644 --- a/atomsci/ddm/pipeline/model_datasets.py +++ b/atomsci/ddm/pipeline/model_datasets.py @@ -91,6 +91,24 @@ def check_task_columns(params, dset_df): raise Exception(f"Requested prediction task columns {missing_tasks} are missing from training dataset") +# **************************************************************************************** +def get_class_freqs(params): + """Given an input dataset for a classification model, returns the value counts + for each class for each task. Uses params.dataset_key to locate the dataset and params.response_cols + to identify the columns containing the class labels. Returns a list of lists, one for each task, + containing the value counts for each class (not counting missing values). + """ + dset_df = pd.read_csv(params.dataset_key) + freq_list = [] + # =-=ksm Strange but true: All classification tasks have to have the same number of classes. This may + # be a "feature" of DeepChem, because of the format of the model prediction arrays. 
+ nclasses = params.class_number + for col in params.response_cols: + vc = dset_df[col].value_counts() + freqs = [vc.get(cl, 0) for cl in range(nclasses)] + freq_list.append(freqs) + return freq_list + # **************************************************************************************** def set_group_permissions(system, path, data_owner='public', data_owner_group='public'): """Set file group and permissions to standard values for a dataset containing proprietary @@ -412,7 +430,7 @@ def get_featurized_data(self, params=None): self.dataset = NumpyDataset(features, self.vals, ids=ids, w=w) # Checking for minimum number of rows if len(self.dataset) < params.min_compound_number: - self.log.warning("Dataset of length %i is shorter than the required length %i" % (len(self.dataset), params.min_compound_number)) + self.log.info("Dataset of length %i is shorter than the recommended length %i" % (len(self.dataset), params.min_compound_number)) # **************************************************************************************** def get_dataset_tasks(self, dset_df): diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py index 575c6b6d..e2a06daa 100644 --- a/atomsci/ddm/pipeline/model_pipeline.py +++ b/atomsci/ddm/pipeline/model_pipeline.py @@ -535,6 +535,7 @@ def split_dataset(self, featurization=None): featurization = feat.create_featurization(self.params) self.featurization = featurization self.load_featurize_data() + self.params.split_uuid = self.data.split_uuid return self.data.split_uuid diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py index 0b5f6e44..d75930eb 100644 --- a/atomsci/ddm/pipeline/model_wrapper.py +++ b/atomsci/ddm/pipeline/model_wrapper.py @@ -10,6 +10,7 @@ import deepchem as dc import numpy as np +import pandas as pd import tensorflow as tf if dc.__version__.startswith('2.1'): from deepchem.models.tensorgraph.fcnet import MultitaskRegressor, MultitaskClassifier @@ -48,6 +49,7 @@ from atomsci.ddm.utils import datastore_functions as dsf from atomsci.ddm.utils import llnl_utils +from atomsci.ddm.pipeline import model_datasets as md from atomsci.ddm.pipeline import transformations as trans from atomsci.ddm.pipeline import perf_data as perf import atomsci.ddm.pipeline.parameter_parser as pp @@ -988,6 +990,9 @@ def train_with_early_stopping(self, pipeline): valid_epoch_perfs (np.array): A standard validation set performance metric (r2_score or roc_auc), at the end of each epoch. 
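The per-epoch feature weight tracking added just below reduces the first hidden layer's weight matrix to one influence score per input feature. A minimal numpy sketch of that reduction, using a made-up 2-node-by-3-feature weight matrix:

    import numpy as np

    # Hypothetical first-hidden-layer weights: one row per hidden node, one column per input feature.
    layer1_weights = np.array([[ 0.2, -0.5, 0.1],
                               [-0.3,  0.4, 0.0]])

    # Per-feature influence score: sum of absolute weights over hidden-layer nodes,
    # i.e. the value appended to self.feature_weights for each feature at each epoch.
    feature_weights = np.abs(layer1_weights).sum(axis=0)   # -> array([0.5, 0.9, 0.1])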
""" self.data = pipeline.data + feature_names = self.data.featurization.get_feature_columns() + nfeatures = len(feature_names) + self.feature_weights = dict(zip(feature_names, [[] for f in feature_names])) em = perf.EpochManager(self, prediction_type=self.params.prediction_type, @@ -1010,11 +1015,22 @@ def train_with_early_stopping(self, pipeline): ei, pipeline.metric_type, train_perf, pipeline.metric_type, valid_perf, pipeline.metric_type, test_perf)) + layer1_weights = self.model.model.layers[0].weight + feature_weights = np.zeros(nfeatures, dtype=float) + for node_weights in layer1_weights: + node_feat_weights = torch.abs(node_weights).detach().numpy() + feature_weights += node_feat_weights + for fnum, fname in enumerate(feature_names): + self.feature_weights[fname].append(feature_weights[fnum]) + self.num_epochs_trained = ei + 1 # Compute performance metrics for each subset, and check if we've reached a new best validation set score if em.should_stop(): break + self.feature_weights_df = pd.DataFrame(self.feature_weights) + self.feature_weights_df['epoch'] = range(len(self.feature_weights_df)) + # Revert to last checkpoint self.restore() self.model_save() @@ -1837,9 +1853,14 @@ def make_dc_model(self, model_dir): max_depth=self.params.rf_max_depth, n_jobs=-1) else: + if self.params.weight_transform_type == 'balancing': + class_weights = 'balanced' + else: + class_weights = None rf_model = RandomForestClassifier(n_estimators=self.params.rf_estimators, max_features=self.params.rf_max_features, max_depth=self.params.rf_max_depth, + class_weight=class_weights, n_jobs=-1) return dc.models.sklearn_models.SklearnModel(rf_model, model_dir=model_dir) @@ -1995,8 +2016,8 @@ def make_dc_model(self, model_dir): subsample=self.params.xgb_subsample, colsample_bytree=self.params.xgb_colsample_bytree, colsample_bylevel=1, - reg_alpha=0, - reg_lambda=1, + reg_alpha=self.params.xgb_alpha, + reg_lambda=self.params.xgb_lambda, scale_pos_weight=1, base_score=0.5, random_state=0, @@ -2008,6 +2029,16 @@ def make_dc_model(self, model_dir): max_bin = 16, ) else: + if self.params.weight_transform_type == 'balancing': + # Compute a class weight for positive class samples to help deal with imblanced datasets + class_freqs = md.get_class_freqs(self.params) + if len(class_freqs) > 1: + raise ValueError("xgboost models don't currently support multitask data") + if len(class_freqs[0]) > 2: + raise ValueError("xgboost models don't currently support multiclass data") + pos_class_weight = class_freqs[0][0]/class_freqs[0][1] + else: + pos_class_weight = 1 xgb_model = xgb.XGBClassifier(max_depth=self.params.xgb_max_depth, learning_rate=self.params.xgb_learning_rate, n_estimators=self.params.xgb_n_estimators, @@ -2020,9 +2051,9 @@ def make_dc_model(self, model_dir): subsample=self.params.xgb_subsample, colsample_bytree=self.params.xgb_colsample_bytree, colsample_bylevel=1, - reg_alpha=0, - reg_lambda=1, - scale_pos_weight=1, + reg_alpha=self.params.xgb_alpha, + reg_lambda=self.params.xgb_lambda, + scale_pos_weight=pos_class_weight, base_score=0.5, random_state=0, importance_type='gain', @@ -2130,8 +2161,8 @@ def reload_model(self, reload_dir): subsample=self.params.xgb_subsample, colsample_bytree=self.params.xgb_colsample_bytree, colsample_bylevel=1, - reg_alpha=0, - reg_lambda=1, + reg_alpha=self.params.xgb_alpha, + reg_lambda=self.params.xgb_lambda, scale_pos_weight=1, base_score=0.5, random_state=0, @@ -2155,8 +2186,8 @@ def reload_model(self, reload_dir): subsample=self.params.xgb_subsample, 
colsample_bytree=self.params.xgb_colsample_bytree, colsample_bylevel=1, - reg_alpha=0, - reg_lambda=1, + reg_alpha=self.params.xgb_alpha, + reg_lambda=self.params.xgb_lambda, scale_pos_weight=1, base_score=0.5, random_state=0, @@ -2266,6 +2297,8 @@ def get_model_specific_metadata(self): "xgb_learning_rate" : self.params.xgb_learning_rate, "xgb_n_estimators" : self.params.xgb_n_estimators, "xgb_gamma" : self.params.xgb_gamma, + "xgb_alpha" : self.params.xgb_alpha, + "xgb_lambda" : self.params.xgb_lambda, "xgb_min_child_weight" : self.params.xgb_min_child_weight, "xgb_subsample" : self.params.xgb_subsample, "xgb_colsample_bytree" :self.params.xgb_colsample_bytree diff --git a/atomsci/ddm/pipeline/parameter_parser.py b/atomsci/ddm/pipeline/parameter_parser.py index 8d960b38..07a3cf5f 100644 --- a/atomsci/ddm/pipeline/parameter_parser.py +++ b/atomsci/ddm/pipeline/parameter_parser.py @@ -535,6 +535,8 @@ def get_list_args(self): 'umap_targ_wt', 'umap_min_dist', 'dropout_list','weight_decay_penalty', 'xgb_learning_rate', 'xgb_gamma', + 'xgb_alpha', + 'xgb_lambda', "xgb_min_child_weight", "xgb_subsample", "xgb_colsample_bytree", @@ -549,6 +551,8 @@ def get_list_args(self): not_a_list_outside_of_hyperparams = {'learning_rate','weight_decay_penalty', 'xgb_learning_rate', 'xgb_gamma', + 'xgb_alpha', + 'xgb_lambda', 'xgb_min_child_weight', 'xgb_subsample', 'xgb_colsample_bytree', @@ -1314,6 +1318,14 @@ def get_parser(): '--xgb_gamma', dest='xgb_gamma', default='0.0', help='Minimum loss reduction required to make a further partition on a leaf node of the tree. Can be input' ' as a comma separated list for hyperparameter search (e.g. \'0.0,0.1,0.2\')') + parser.add_argument( + '--xgb_alpha', dest='xgb_alpha', default='0.0', + help='L1 regularization term on weights. Increasing this value will make model more conservative. Can be input' + ' as a comma separated list for hyperparameter search (e.g. \'0.0,0.1,0.2\')') + parser.add_argument( + '--xgb_lambda', dest='xgb_lambda', default='1.0', + help='L2 regularization term on weights. Increasing this value will make model more conservative. Can be input' + ' as a comma separated list for hyperparameter search (e.g. \'0.0,0.1,0.2\')') parser.add_argument( '--xgb_learning_rate', dest='xgb_learning_rate', default='0.1', help='Boosting learning rate (xgb\'s \"eta\"). Can be input as a comma separated list for hyperparameter' @@ -1510,6 +1522,12 @@ def get_parser(): parser.add_argument( '--dp', dest='dp', required=False, default=None, help='dropouts shown in HyperOpt domain format, e.g. --dp=uniform|3|0,0.4') + parser.add_argument( + '--wdp', dest='wdp', required=False, default=None, + help='weight_decay_penalty shown in HyperOpt domain format, e.g. --wdp=loguniform|-6.908,-4.605') + parser.add_argument( + '--wdt', dest='wdt', required=False, default=None, + help='weight_decay_penalty_type shown in HyperOpt domain format, e.g. --wdt=choice|l1,l2') # RF model parser.add_argument( '--rfe', dest='rfe', required=False, default=None, @@ -1524,6 +1542,12 @@ def get_parser(): parser.add_argument( '--xgbg', dest='xgbg', required=False, default=None, help='xgb_gamma shown in HyperOpt domain format, e.g. --xgbg=uniform|0,0.4') + parser.add_argument( + '--xgba', dest='xgba', required=False, default=None, + help='xgb_alpha shown in HyperOpt domain format, e.g. --xgba=uniform|0,0.4') + parser.add_argument( + '--xgbb', dest='xgbb', required=False, default=None, + help='xgb_lambda shown in HyperOpt domain format, e.g. 
--xgbb=uniform|0,0.4') parser.add_argument( '--xgbl', dest='xgbl', required=False, default=None, help='xgb_learning_rate shown in HyperOpt domain format, e.g. --xgbl=loguniform|-6.9,-2.3') diff --git a/atomsci/ddm/pipeline/perf_plots.py b/atomsci/ddm/pipeline/perf_plots.py index e8bb2233..50931692 100644 --- a/atomsci/ddm/pipeline/perf_plots.py +++ b/atomsci/ddm/pipeline/perf_plots.py @@ -849,59 +849,6 @@ def plot_prec_recall_curve(MP, epoch_label='best', plot_size=7, pdf_dir=None): pdf.close() MP.log.info("Wrote plot to %s" % pdf_path) -#------------------------------------------------------------------------------------------------------------------------ -def old_plot_prec_recall_curve(MP, epoch_label='best', plot_size=7, pdf_dir=None): - """Plot precision-recall curves for a classification model. - - Args: - MP (`ModelPipeline`): Pipeline object for a model that was trained in the current Python session. - - epoch_label (str): Label for training epoch to draw predicted values from. Currently 'best' is the only allowed value. - - pdf_dir (str): If given, output the plots to a PDF file in the given directory. - - Returns: - None - - """ - params = MP.params - curve_data = _get_perf_curve_data(MP, epoch_label, 'precision-recall') - if len(curve_data) == 0: - return - if MP.run_mode == 'training': - # Draw overlapping PR curves for train, valid and test sets - subsets = ['train', 'valid', 'test'] - else: - subsets = ['full'] - if pdf_dir is not None: - pdf_path = os.path.join(pdf_dir, '%s_%s_model_%s_features_%s_split_PRC_curves.pdf' % ( - params.dataset_name, params.model_type, params.featurizer, params.splitter)) - pdf = PdfPages(pdf_path) - subset_colors = dict(train=train_col, valid=valid_col, test=test_col, full=full_col) - # For multitask, do a separate figure for each task - ntasks = curve_data[subsets[0]]['prob_active'].shape[1] - for i in range(ntasks): - fig, ax = plt.subplots(figsize=(plot_size,plot_size)) - title = '%s dataset\nPrecision-recall curve for %s %s classifier on %s features with %s split' % ( - params.dataset_name, params.response_cols[i], - params.model_type, params.featurizer, params.splitter) - for subset in subsets: - precision, recall, thresholds = metrics.precision_recall_curve(curve_data[subset]['true_classes'][:,i], - curve_data[subset]['prob_active'][:,i]) - - prc_auc = curve_data[subset]['prc_aucs'][i] - ax.step(recall, precision, color=subset_colors[subset], label="%s: AUC = %.3f" % (subset, prc_auc)) - ax.set_xlabel('Recall') - ax.set_ylabel('Precision') - ax.set_title(title, fontdict={'fontsize' : 12}) - legend = ax.legend(loc='lower right') - - if pdf_dir is not None: - pdf.savefig(fig) - if pdf_dir is not None: - pdf.close() - MP.log.info("Wrote plot to %s" % pdf_path) - #------------------------------------------------------------------------------------------------------------------------ def plot_umap_feature_projections(MP, ndim=2, num_neighbors=20, min_dist=0.1, fit_to_train=True, diff --git a/atomsci/ddm/utils/hyperparam_search_wrapper.py b/atomsci/ddm/utils/hyperparam_search_wrapper.py index 832de9ee..075a034e 100755 --- a/atomsci/ddm/utils/hyperparam_search_wrapper.py +++ b/atomsci/ddm/utils/hyperparam_search_wrapper.py @@ -1418,6 +1418,7 @@ def __init__(self, params): for i in range(1,num_layer): self.space[f"ratio{i}"] = build_hyperopt_search_domain(f"ratio{i}", method, par_list) + # dropouts if self.params.dp: domain_list = self.params.dp.split("|") method = domain_list[0] @@ -1425,6 +1426,21 @@ def __init__(self, params): par_list = 
[float(e) for e in domain_list[2].split(",")] for i in range(num_layer): self.space[f"dp{i}"] = build_hyperopt_search_domain(f"dp{i}", method, par_list) + + # weight decay penalty + if self.params.wdp: + domain_list = self.params.wdp.split("|") + method = domain_list[0] + par_list = [float(e) for e in domain_list[1].split(",")] + self.space["weight_decay_penalty"] = build_hyperopt_search_domain("weight_decay_penalty", method, par_list) + + # weight decay penalty type + if self.params.wdt: + domain_list = self.params.wdt.split("|") + method = domain_list[0] + par_list = domain_list[1].split(",") + self.space["weight_decay_penalty_type"] = build_hyperopt_search_domain("weight_decay_penalty_type", method, par_list) + elif self.params.model_type == "xgboost": #build searching domain for XGBoost parameters if self.params.xgbg: @@ -1433,6 +1449,18 @@ def __init__(self, params): par_list = [float(e) for e in domain_list[1].split(",")] self.space["xgbg"] = build_hyperopt_search_domain("xgbg", method, par_list) + if self.params.xgba: + domain_list = self.params.xgba.split("|") + method = domain_list[0] + par_list = [float(e) for e in domain_list[1].split(",")] + self.space["xgba"] = build_hyperopt_search_domain("xgba", method, par_list) + + if self.params.xgbb: + domain_list = self.params.xgbb.split("|") + method = domain_list[0] + par_list = [float(e) for e in domain_list[1].split(",")] + self.space["xgbb"] = build_hyperopt_search_domain("xgbb", method, par_list) + if self.params.xgbl: domain_list = self.params.xgbl.split("|") method = domain_list[0] @@ -1500,6 +1528,10 @@ def lossfn(p): self.params.learning_rate = p["learning_rate"] if self.params.dp: self.params.dropouts = ",".join([str(p[e]) for e in p if e[:2] == "dp"]) + if self.params.wdp: + self.params.weight_decay_penalty = p['weight_decay_penalty'] + if self.params.wdt: + self.params.weight_decay_penalty_type = p['weight_decay_penalty_type'] if self.params.ls: if not self.params.ls_ratio: self.params.layer_sizes = ",".join([str(p[e]) for e in p if e[:2] == "ls"]) @@ -1508,11 +1540,16 @@ def lossfn(p): for i in range(1,len([e for e in p if e[:5] == "ratio"])+1): list_layer_sizes.append(int(list_layer_sizes[-1] * p[f"ratio{i}"])) self.params.layer_sizes = ",".join([str(e) for e in list_layer_sizes]) - hp_params = f'{self.params.learning_rate}_{self.params.layer_sizes}_{self.params.dropouts}' - print(f"learning_rate: {self.params.learning_rate}, layer_sizes: {self.params.layer_sizes}, dropouts: {self.params.dropouts}") + hp_params = f'{self.params.learning_rate}_{self.params.layer_sizes}_{self.params.dropouts}_{self.params.weight_decay_penalty_type}_{self.params.weight_decay_penalty}' + print(f"learning_rate: {self.params.learning_rate}, layer_sizes: {self.params.layer_sizes}, dropouts: {self.params.dropouts}, " + f"weight_decay_penalty: {self.params.weight_decay_penalty_type}({self.params.weight_decay_penalty})") elif self.params.model_type == "xgboost": if self.params.xgbg: self.params.xgb_gamma = p["xgbg"] + if self.params.xgba: + self.params.xgb_alpha = p["xgba"] + if self.params.xgbb: + self.params.xgb_lambda = p["xgbb"] if self.params.xgbl: self.params.xgb_learning_rate = p["xgbl"] if self.params.xgbd: @@ -1525,8 +1562,10 @@ def lossfn(p): self.params.xgb_n_estimators = p["xgbn"] if self.params.xgbw: self.params.xgb_min_child_weight = p["xgbw"] - hp_params = 
f'{self.params.xgb_gamma}_{self.params.xgb_learning_rate}_{self.params.xgb_max_depth}_{self.params.xgb_colsample_bytree}_{self.params.xgb_subsample}_{self.params.xgb_n_estimators}_{self.params.xgb_min_child_weight}' + hp_params = f'{self.params.xgb_gamma}_{self.params.xgb_alpha}_{self.params.xgb_lambda}_{self.params.xgb_learning_rate}_{self.params.xgb_max_depth}_{self.params.xgb_colsample_bytree}_{self.params.xgb_subsample}_{self.params.xgb_n_estimators}_{self.params.xgb_min_child_weight}' print(f"xgb_gamma: {self.params.xgb_gamma}, " + f"xgb_alpha: {self.params.xgb_alpha}, " + f"xgb_lambda: {self.params.xgb_lambda}, " f"xgb_learning_rate: {self.params.xgb_learning_rate}, " f"xgb_max_depth: {self.params.xgb_max_depth}, " f"xgb_colsample_bytree: {self.params.xgb_colsample_bytree}, " @@ -1709,7 +1748,10 @@ def parse_params(param_list): 'slurm_time_limit'} | excluded_keys if params.search_type == 'hyperopt': # keep more parameters - keep_params = keep_params | {'lr', 'learning_rate','ls', 'layer_sizes','ls_ratio','dp', 'dropouts','rfe', 'rf_estimators','rfd', 'rf_max_depth','rff', 'rf_max_features','xgbg', 'xgb_gamma','xgbl', 'xgb_learning_rate', 'xgbd', 'xgb_max_depth', 'xgbc', 'xgb_colsample_bytree', 'xgbs', 'xgb_subsample', 'xgbn', 'xgb_n_estimators', 'xgbw', 'xgb_min_child_weight', 'hp_checkpoint_load', 'hp_checkpoint_save'} + keep_params = keep_params | {'lr', 'learning_rate','ls', 'layer_sizes','ls_ratio','dp', 'dropouts', 'wdp', 'weight_decay_penalty', 'wdt', 'weight_decay_penalty_type', + 'rfe', 'rf_estimators','rfd', 'rf_max_depth','rff', 'rf_max_features', + 'xgbg', 'xgb_gamma', 'xgba', 'xgb_alpha', 'xgbb', 'xgb_lambda', 'xgbl', 'xgb_learning_rate', 'xgbd', 'xgb_max_depth', 'xgbc', 'xgb_colsample_bytree', + 'xgbs', 'xgb_subsample', 'xgbn', 'xgb_n_estimators', 'xgbw', 'xgb_min_child_weight', 'hp_checkpoint_load', 'hp_checkpoint_save'} params.__dict__ = parse.prune_defaults(params, keep_params=keep_params) diff --git a/atomsci/ddm/utils/split_diagnostic_plots.py b/atomsci/ddm/utils/split_diagnostic_plots.py new file mode 100644 index 00000000..447c5166 --- /dev/null +++ b/atomsci/ddm/utils/split_diagnostic_plots.py @@ -0,0 +1,213 @@ +"""Functions to generate multi-plot displays to assess split quality: response value distributions, test-vs-train Tanimoto distance distributions, etc. +""" + +import os +import numpy as np +import pandas as pd +from argparse import Namespace +from atomsci.ddm.pipeline.model_pipeline import ModelPipeline +from atomsci.ddm.pipeline import parameter_parser as parse +from atomsci.ddm.pipeline import perf_plots as pp +from atomsci.ddm.utils import split_response_dist_plots as srdp +from atomsci.ddm.utils import compare_splits_plots as csp + +import matplotlib.pyplot as plt +import seaborn as sns +from scipy import stats + + +# --------------------------------------------------------------------------------------------------------------------------------- +def plot_split_diagnostics(params_or_pl, axes=None, num_rows=None, num_cols=1, min_tvt_dist=0.3, plot_size=7): + """Generate several plots showing various aspects of split quality. These include: The value distributions for each response + column in the different split subsets; the nearest-training-set-neighbor Tanimoto distance distributions for the validation + and test sets; the actual split subset proportions; and (for multitask scaffold splits) the progression of fitness term values + over generations of the genetic algorithm. 
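An illustrative usage sketch for the new diagnostics module (assuming pipeline is a ModelPipeline that was used to split a dataset earlier in the same Python session):

    from atomsci.ddm.utils import split_diagnostic_plots as sdp

    # Draw the full set of split diagnostic plots; the GA fitness-term history plot is only
    # produced for multitaskscaffold splits when a ModelPipeline (not a parameter dict) is passed.
    sdp.plot_split_diagnostics(pipeline, num_cols=2, min_tvt_dist=0.3)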
+ + Args: + params_or_pl (argparse.Namespace or dict or ModelPipeline): Structure containing dataset and split parameters, or + the ModelPipeline object used to perform a split. If a ModelPipeline is passed as this argument, the parameters + are extracted from it and an additional plot will be generated for multitaskscaffold splits showing the progression + of fitness term values over generations. The following parameters are required, if not set to default values: + + | - dataset_key + | - split_uuid + | - split_strategy + | - splitter + | - split_valid_frac + | - split_test_frac + | - num_folds + | - smiles_col + | - response_cols + + axes (matplotlib.Axes): Axes to draw plots in. If provided, must contain enough entries to display all the plots requested + (2 for each response column + 3 + 1 for a multitask scaffold split when a ModelPipeline is passed to params_or_pl). If not + provided, a figure and Axes of the required length will be created. + + num_rows (int): Number of rows for the Axes layout; ignored if an existing set of Axes are passed in the `axes` argument. + num_cols (int): Number of columns for the Axes layout; ignored if an existing set of Axes are passed in the `axes` argument. + plot_size (float): Height of plots; ignored if `axes` is provided + Returns: + None + """ + + # Save current matplotlib color cycle and switch to 'colorblind' palette + old_palette = sns.color_palette() + sns.set_palette('colorblind') + + if isinstance(params_or_pl, dict): + params = parse.wrapper(params_or_pl) + elif isinstance(params_or_pl, Namespace): + params = params_or_pl + elif isinstance(params_or_pl, ModelPipeline): + params = params_or_pl.params + splitter = params_or_pl.data.splitting.splitter + params.fitness_terms = splitter.fitness_terms + params.split_uuid = params_or_pl.data.split_uuid + + else: + raise ValueError("params_or_pl must be dict, Namespace or ModelPipeline") + + # Figure out how many plots we're going to generate and how to lay them out + nresp = len(params.response_cols) + nfitness = int((params.splitter == 'multitaskscaffold') and (isinstance(params_or_pl, ModelPipeline))) + nplots = 2*nresp + 3 + nfitness + if axes is not None: + axes = axes.flatten() + if len(axes) < nplots: + raise ValueError(f"axes argument needs {nplots} axis pairs for requested plots; only provides {len(axes)}") + else: + if num_rows is None: + num_rows = (nplots + 1) // num_cols + fig, axes = plt.subplots(num_rows, num_cols, figsize=(num_cols*plot_size, num_rows*plot_size), layout='constrained') + axes = axes.flatten() + plot_num = 0 + + # Draw split response distribution plots + srdp.plot_split_subset_response_distrs(params, plot_size=plot_size, axes=axes) + + # Draw NN Tanimoto distance distribution plots with titles showing fraction of compounds below min_tvt_dist + plot_num += nresp + dset_path = params.dataset_key + split_path = f"{os.path.splitext(dset_path)[0]}_{params.split_strategy}_{params.splitter}_{params.split_uuid}.csv" + dset_df = pd.read_csv(dset_path, dtype={params.id_col: str}) + split_df = pd.read_csv(split_path, dtype={'cmpd_id': str}) + split_stats = csp.SplitStats(dset_df, split_df, smiles_col=params.smiles_col, id_col=params.id_col, + response_cols=params.response_cols) + vvtr_dists = split_stats.dists_tvv + tvtr_dists = split_stats.dists_tvt + vfrac = sum(vvtr_dists <= min_tvt_dist)/len(vvtr_dists) + tfrac = sum(tvtr_dists <= min_tvt_dist)/len(tvtr_dists) + ax = axes[plot_num] + ax = split_stats.dist_hist_train_v_valid_plot(ax=ax) + ax.set_title(f"Valid vs train 
Tanimoto NN distance distribution\nFraction <= {min_tvt_dist} = {vfrac:.3f}") + + plot_num += 1 + ax = axes[plot_num] + ax = split_stats.dist_hist_train_v_test_plot(ax=ax) + ax.set_title(f"Test vs train Tanimoto NN distance distribution\nFraction <= {min_tvt_dist} = {tfrac:.3f}") + + # Draw plots of actual and requested split proportions and counts for each response column after excluding missing values + plot_num += 1 + plot_split_fractions(params, axes[plot_num:]) + plot_num += nresp + + # Plot the evolution over generations of each term in the fitness function optimized by the multitask scaffold splitter. + if nfitness > 0: + plot_fitness_terms(params, axes[plot_num:]) + +def plot_fitness_terms(params, axes): + """Plot the evolution over generations of each term in the fitness function optimized by the multitask scaffold splitter. + + Args: + params (argparse.Namespace or dict): Structure containing dataset and split parameters. The parameters must include + the key `fitness_terms`, which is created by the multitask splitter; usually this is only available when `params` + is derived from the ModelPipeline object used to split the dataset. + + axes (numpy.array of matplotlib.Axes): Axes to draw plots in. + """ + if isinstance(params, dict): + params = parse.wrapper(params) + if isinstance(axes, np.ndarray): + ax = axes[0] + else: + ax = axes + fitness_df = pd.DataFrame(params.fitness_terms) + fitness_df['generation'] = list(range(len(fitness_df))) + fit_long_df = fitness_df.melt(id_vars='generation', value_vars=list(params.fitness_terms.keys()), var_name='Fitness term', value_name='Score') + ax = sns.lineplot(data=fit_long_df, x='generation', y='Score', hue='Fitness term', ax=ax) + ax.set_title('Unweighted fitness scores vs generation') + + +def plot_split_fractions(params, axes): + """Draw plots of actual and requested split proportions and counts for each response column after excluding missing values + + Args: + params (argparse.Namespace or dict): Structure containing dataset and split parameters. + The following parameters are required, if not set to default values: + + | - dataset_key + | - split_uuid + | - split_strategy + | - splitter + | - split_valid_frac + | - split_test_frac + | - num_folds + | - smiles_col + | - response_cols + + axes (matplotlib.Axes): Axes to draw plots in. Must contain at least as many entries as response columns in the dataset. 
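+
+        Returns:
+            None
+
+        Example (illustrative; assumes `params` already holds the split parameters described above):
+
+            fig, axes = plt.subplots(1, len(params.response_cols), squeeze=False)
+            plot_split_fractions(params, axes)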
+ """ + split_dset_df, _ = srdp.get_split_labeled_dataset(params) + axes = axes.flatten() + req_color = pp.test_col + actual_color = pp.train_col + for icol, resp_col in enumerate(params.response_cols): + ss_df = split_dset_df[split_dset_df[resp_col].notna()] + ss_counts = ss_df.subset.value_counts() + total_count = sum(ss_counts) + ss_fracs = ss_counts / total_count + actual_df = pd.DataFrame(dict(counts=ss_counts, fracs=ss_fracs)).reindex(['train', 'valid', 'test']) + req_fracs = dict(train=1-params.split_valid_frac-params.split_test_frac, valid=params.split_valid_frac, test=params.split_test_frac) + req_df = pd.DataFrame(dict(fracs=req_fracs)).reindex(['train', 'valid', 'test']) + req_df['counts'] = np.round(total_count * req_df.fracs).astype(int) + + bar_width = 0.35 + + # Positions of the bars on the x-axis + r1 = np.arange(len(actual_df)) + r2 = r1 + bar_width + + # Plotting the proportions + ax1 = axes[icol] + bars_requested = ax1.bar(r1, req_df.fracs.values, color=req_color, width=bar_width, label='Requested', alpha=0.6) + bars_actual = ax1.bar(r2, actual_df.fracs.values, color=actual_color, width=bar_width, label='Actual', alpha=0.6) + max_count = max(max(actual_df.counts.values), max(req_df.counts.values)) + + # Left Y-axis (proportions) + ax1.set_ylabel('Proportions') + #ax1.tick_params(axis='y') + + # Create the second Y-axis (counts) + ax2 = ax1.twinx() + ax2.set_ylabel('Counts') + #ax2.tick_params(axis='y', labelcolor='k') + ax2.set_ylim(0, max_count*1.1) + + # Adding counts on top of the bars + for bar, count in zip(bars_requested, req_df.counts.values): + height = bar.get_height() + ax1.text(bar.get_x() + bar.get_width() / 2, height, f'{count}', ha='center', va='bottom', color=actual_color) + + for bar, count in zip(bars_actual, actual_df.counts.values): + height = bar.get_height() + ax1.text(bar.get_x() + bar.get_width() / 2, height, f'{count}', ha='center', va='bottom', color=req_color) + + # X-axis labels + plt.xticks([r + bar_width / 2 for r in range(len(actual_df))], actual_df.index.values) + + # Title and legend + if icol == 0: + plt.title(f"Requested and Actual Subset Proportions and Counts\n{resp_col}") + ax1.legend(loc='upper right') + else: + plt.title(f"\n{resp_col}") diff --git a/atomsci/ddm/utils/split_response_dist_plots.py b/atomsci/ddm/utils/split_response_dist_plots.py index d457b814..ef7448db 100644 --- a/atomsci/ddm/utils/split_response_dist_plots.py +++ b/atomsci/ddm/utils/split_response_dist_plots.py @@ -46,6 +46,9 @@ def plot_split_subset_response_distrs(params, axes=None, plot_size=7): else: subset_order = ['train', 'valid', 'test'] + wdist_df = compute_split_subset_wasserstein_distances(params) + + if axes is None: fig, axes = plt.subplots(1, len(params.response_cols), figsize=(plot_size*len(params.response_cols), plot_size)) if len(params.response_cols) == 1: @@ -54,10 +57,18 @@ def plot_split_subset_response_distrs(params, axes=None, plot_size=7): axes = axes.flatten() for colnum, col in enumerate(params.response_cols): ax = axes[colnum] + if params.split_strategy == 'train_valid_test': + tvv_wdist = wdist_df[(wdist_df.split_subset == 'valid') & (wdist_df.response_col == col)].distance.values[0] + tvt_wdist = wdist_df[(wdist_df.split_subset == 'test') & (wdist_df.response_col == col)].distance.values[0] if params.prediction_type == 'regression': ax = sns.kdeplot(data=dset_df, x=col, hue='split_subset', hue_order=subset_order, bw_adjust=0.7, fill=True, common_norm=False, ax=ax) ax.set_title(f"{col} distribution by subset under {split_label}") + if 
params.split_strategy == 'train_valid_test': + ax.set_title(f"{col} distribution by subset under {split_label}\n" \ + f"Wasserstein distances: valid = {tvv_wdist:.3f}, test = {tvt_wdist:.3f}") + else: + ax.set_title(f"{col} distribution by subset under {split_label}") else: pct_active = [] for ss in subset_order: @@ -66,7 +77,12 @@ def plot_split_subset_response_distrs(params, axes=None, plot_size=7): pct_active.append(100*nactive/sum(ss_df[col].notna())) active_df = pd.DataFrame(dict(subset=subset_order, percent_active=pct_active)) ax = sns.barplot(data=active_df, x='subset', y='percent_active', hue='subset', ax=ax) - ax.set_title(f"Percent of {col} = 1 by subset under {split_label}") + + if params.split_strategy == 'train_valid_test': + ax.set_title(f"Percent of {col} = 1 by subset under {split_label}" \ + f"Wasserstein distances: valid = {tvv_wdist:.3f}, test = {tvt_wdist:.3f}") + else: + ax.set_title(f"Percent of {col} = 1 by subset under {split_label}") ax.set_xlabel('') # Restore previous matplotlib color cycle @@ -153,7 +169,7 @@ def get_split_labeled_dataset(params): """ if isinstance(params, dict): params = parse.wrapper(params) - dset_df = pd.read_csv(params.dataset_key, dtype={'compound_id': str}) + dset_df = pd.read_csv(params.dataset_key, dtype={params.id_col: str}) if params.split_strategy == 'k_fold_cv': split_file = f"{os.path.splitext(params.dataset_key)[0]}_{params.num_folds}_fold_cv_{params.splitter}_{params.split_uuid}.csv" else: @@ -167,6 +183,9 @@ def get_split_labeled_dataset(params): split_label = f"{nfolds}-fold {params.splitter} cross-validation split" else: dset_df['split_subset'] = dset_df.subset.values - split_label = f"{params.splitter} split" + if params.splitter == 'multitaskscaffold': + split_label = 'MTSS split' + else: + split_label = f"{params.splitter} split" return dset_df, split_label
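
A minimal end-to-end sketch of how the new diagnostics can be driven from a ModelPipeline (the dataset path, ID, SMILES and response column names here are hypothetical, and the usual split-only workflow via ModelPipeline.split_dataset is assumed):

    from atomsci.ddm.pipeline.model_pipeline import ModelPipeline
    from atomsci.ddm.pipeline import parameter_parser as parse
    from atomsci.ddm.utils import split_diagnostic_plots as sdp

    # Hypothetical split-only parameters for a multitask scaffold split
    split_params = parse.wrapper(dict(
        dataset_key='my_dataset.csv',
        id_col='compound_id',
        smiles_col='base_rdkit_smiles',
        response_cols=['pIC50'],
        previously_split=False,
        split_only=True,
        splitter='multitaskscaffold',
        split_strategy='train_valid_test',
        split_valid_frac=0.15,
        split_test_frac=0.15))

    pipe = ModelPipeline(split_params)
    pipe.split_dataset()

    # Passing the pipeline (rather than a bare parameter dict) also enables the GA fitness term plot,
    # since plot_split_diagnostics pulls fitness_terms and split_uuid from the pipeline's splitter and dataset.
    sdp.plot_split_diagnostics(pipe, min_tvt_dist=0.3)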