From b73b98a37802cd681e8765b10204c6062b0f76d7 Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Mon, 6 May 2024 23:45:17 -0700 Subject: [PATCH 01/28] (1) Added code to DCNNModelWrapper to track absolute sum of 1st hidden layer weights for each feature as it evolves over epochs, to see effect of setting weight decay parameters; (2) Added model parameters xgb_alpha and xgb_lambda and corresponding hyperopt parameters xgba and xgbb to control strength of L1 and L2 regularization penalties. --- atomsci/ddm/pipeline/model_wrapper.py | 31 ++++++++++++++++++------ atomsci/ddm/pipeline/parameter_parser.py | 18 ++++++++++++++ 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py index 365c85ee..db937a56 100644 --- a/atomsci/ddm/pipeline/model_wrapper.py +++ b/atomsci/ddm/pipeline/model_wrapper.py @@ -10,6 +10,7 @@ import deepchem as dc import numpy as np +import pandas as pd import tensorflow as tf if dc.__version__.startswith('2.1'): from deepchem.models.tensorgraph.fcnet import MultitaskRegressor, MultitaskClassifier @@ -986,6 +987,9 @@ def train_with_early_stopping(self, pipeline): valid_epoch_perfs (np.array): A standard validation set performance metric (r2_score or roc_auc), at the end of each epoch. """ self.data = pipeline.data + feature_names = self.data.featurization.get_feature_columns() + nfeatures = len(feature_names) + self.feature_weights = dict(zip(feature_names, [[] for f in feature_names])) em = perf.EpochManager(self, prediction_type=self.params.prediction_type, @@ -1008,11 +1012,22 @@ def train_with_early_stopping(self, pipeline): ei, pipeline.metric_type, train_perf, pipeline.metric_type, valid_perf, pipeline.metric_type, test_perf)) + layer1_weights = self.model.model.layers[0].weight + feature_weights = np.zeros(nfeatures, dtype=float) + for node_weights in layer1_weights: + node_feat_weights = torch.abs(node_weights).detach().numpy() + feature_weights += node_feat_weights + for fnum, fname in enumerate(feature_names): + self.feature_weights[fname].append(feature_weights[fnum]) + self.num_epochs_trained = ei + 1 # Compute performance metrics for each subset, and check if we've reached a new best validation set score if em.should_stop(): break + self.feature_weights_df = pd.DataFrame(self.feature_weights) + self.feature_weights_df['epoch'] = range(len(self.feature_weights_df)) + # Revert to last checkpoint self.restore() self.model_save() @@ -1975,8 +1990,8 @@ def make_dc_model(self, model_dir): subsample=self.params.xgb_subsample, colsample_bytree=self.params.xgb_colsample_bytree, colsample_bylevel=1, - reg_alpha=0, - reg_lambda=1, + reg_alpha=self.params.xgb_alpha, + reg_lambda=self.params.xgb_lambda, scale_pos_weight=1, base_score=0.5, random_state=0, @@ -2000,8 +2015,8 @@ def make_dc_model(self, model_dir): subsample=self.params.xgb_subsample, colsample_bytree=self.params.xgb_colsample_bytree, colsample_bylevel=1, - reg_alpha=0, - reg_lambda=1, + reg_alpha=self.params.xgb_alpha, + reg_lambda=self.params.xgb_lambda, scale_pos_weight=1, base_score=0.5, random_state=0, @@ -2110,8 +2125,8 @@ def reload_model(self, reload_dir): subsample=self.params.xgb_subsample, colsample_bytree=self.params.xgb_colsample_bytree, colsample_bylevel=1, - reg_alpha=0, - reg_lambda=1, + reg_alpha=self.params.xgb_alpha, + reg_lambda=self.params.xgb_lambda, scale_pos_weight=1, base_score=0.5, random_state=0, @@ -2135,8 +2150,8 @@ def reload_model(self, reload_dir): subsample=self.params.xgb_subsample, 
colsample_bytree=self.params.xgb_colsample_bytree, colsample_bylevel=1, - reg_alpha=0, - reg_lambda=1, + reg_alpha=self.params.xgb_alpha, + reg_lambda=self.params.xgb_lambda, scale_pos_weight=1, base_score=0.5, random_state=0, diff --git a/atomsci/ddm/pipeline/parameter_parser.py b/atomsci/ddm/pipeline/parameter_parser.py index 8d960b38..2c2a31c6 100644 --- a/atomsci/ddm/pipeline/parameter_parser.py +++ b/atomsci/ddm/pipeline/parameter_parser.py @@ -535,6 +535,8 @@ def get_list_args(self): 'umap_targ_wt', 'umap_min_dist', 'dropout_list','weight_decay_penalty', 'xgb_learning_rate', 'xgb_gamma', + 'xgb_alpha', + 'xgb_lambda', "xgb_min_child_weight", "xgb_subsample", "xgb_colsample_bytree", @@ -549,6 +551,8 @@ def get_list_args(self): not_a_list_outside_of_hyperparams = {'learning_rate','weight_decay_penalty', 'xgb_learning_rate', 'xgb_gamma', + 'xgb_alpha', + 'xgb_lambda', 'xgb_min_child_weight', 'xgb_subsample', 'xgb_colsample_bytree', @@ -1314,6 +1318,14 @@ def get_parser(): '--xgb_gamma', dest='xgb_gamma', default='0.0', help='Minimum loss reduction required to make a further partition on a leaf node of the tree. Can be input' ' as a comma separated list for hyperparameter search (e.g. \'0.0,0.1,0.2\')') + parser.add_argument( + '--xgb_alpha', dest='xgb_alpha', default='0.0', + help='L1 regularization term on weights. Increasing this value will make model more conservative. Can be input' + ' as a comma separated list for hyperparameter search (e.g. \'0.0,0.1,0.2\')') + parser.add_argument( + '--xgb_lambda', dest='xgb_lambda', default='1.0', + help='L2 regularization term on weights. Increasing this value will make model more conservative. Can be input' + ' as a comma separated list for hyperparameter search (e.g. \'0.0,0.1,0.2\')') parser.add_argument( '--xgb_learning_rate', dest='xgb_learning_rate', default='0.1', help='Boosting learning rate (xgb\'s \"eta\"). Can be input as a comma separated list for hyperparameter' @@ -1524,6 +1536,12 @@ def get_parser(): parser.add_argument( '--xgbg', dest='xgbg', required=False, default=None, help='xgb_gamma shown in HyperOpt domain format, e.g. --xgbg=uniform|0,0.4') + parser.add_argument( + '--xgba', dest='xgba', required=False, default=None, + help='xgb_alpha shown in HyperOpt domain format, e.g. --xgba=uniform|0,0.4') + parser.add_argument( + '--xgbb', dest='xgbb', required=False, default=None, + help='xgb_lambda shown in HyperOpt domain format, e.g. --xgbb=uniform|0,0.4') parser.add_argument( '--xgbl', dest='xgbl', required=False, default=None, help='xgb_learning_rate shown in HyperOpt domain format, e.g. --xgbl=loguniform|-6.9,-2.3') From 5d3c897e493cfb661ab2e34d6a00d68ee056dff8 Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Fri, 31 May 2024 16:49:08 -0700 Subject: [PATCH 02/28] Added two new parameters, wdp and wdt, to allow hyperopt domain searching on weight_decay_penalty and weight_decay_penalty_type parameters. Completed implementation of hyperopt domain specification for xgb_alpha and xgb_lambda. 
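For reference, the new options slot into an AMPL parameter set like any other model parameter. The fragment below is an illustrative sketch only: dataset-specific keys are placeholders, omitted keys keep their defaults, and the hyperopt domain strings simply follow the method|values format quoted in the --xgb_alpha/--xgb_lambda help text above and the --wdp/--wdt help text in the diff below.

    # Illustrative AMPL parameter fragments (placeholders, not part of this patch set).
    # Fixed-value run: apply L1/L2 penalties to an XGBoost model.
    xgb_params = {
        "model_type": "xgboost",
        "xgb_alpha": "0.1",       # L1 regularization term on weights
        "xgb_lambda": "2.0",      # L2 regularization term on weights
        # ... dataset_key, response_cols, featurizer, etc. as usual
    }

    # Hyperopt search over the same penalties:
    xgb_search_params = {
        "search_type": "hyperopt",
        "model_type": "xgboost",
        "xgba": "uniform|0,0.4",          # search domain for xgb_alpha
        "xgbb": "loguniform|-2.3,2.3",    # search domain for xgb_lambda
    }

    # For fully connected NN models, the analogous domain parameters are
    #   "wdp": "loguniform|-6.908,-4.605"   (weight_decay_penalty)
    #   "wdt": "choice|l1,l2"               (weight_decay_penalty_type)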
--- atomsci/ddm/pipeline/parameter_parser.py | 6 +++ .../ddm/utils/hyperparam_search_wrapper.py | 50 +++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/atomsci/ddm/pipeline/parameter_parser.py b/atomsci/ddm/pipeline/parameter_parser.py index 2c2a31c6..07a3cf5f 100644 --- a/atomsci/ddm/pipeline/parameter_parser.py +++ b/atomsci/ddm/pipeline/parameter_parser.py @@ -1522,6 +1522,12 @@ def get_parser(): parser.add_argument( '--dp', dest='dp', required=False, default=None, help='dropouts shown in HyperOpt domain format, e.g. --dp=uniform|3|0,0.4') + parser.add_argument( + '--wdp', dest='wdp', required=False, default=None, + help='weight_decay_penalty shown in HyperOpt domain format, e.g. --wdp=loguniform|-6.908,-4.605') + parser.add_argument( + '--wdt', dest='wdt', required=False, default=None, + help='weight_decay_penalty_type shown in HyperOpt domain format, e.g. --wdt=choice|l1,l2') # RF model parser.add_argument( '--rfe', dest='rfe', required=False, default=None, diff --git a/atomsci/ddm/utils/hyperparam_search_wrapper.py b/atomsci/ddm/utils/hyperparam_search_wrapper.py index 832de9ee..075a034e 100755 --- a/atomsci/ddm/utils/hyperparam_search_wrapper.py +++ b/atomsci/ddm/utils/hyperparam_search_wrapper.py @@ -1418,6 +1418,7 @@ def __init__(self, params): for i in range(1,num_layer): self.space[f"ratio{i}"] = build_hyperopt_search_domain(f"ratio{i}", method, par_list) + # dropouts if self.params.dp: domain_list = self.params.dp.split("|") method = domain_list[0] @@ -1425,6 +1426,21 @@ def __init__(self, params): par_list = [float(e) for e in domain_list[2].split(",")] for i in range(num_layer): self.space[f"dp{i}"] = build_hyperopt_search_domain(f"dp{i}", method, par_list) + + # weight decay penalty + if self.params.wdp: + domain_list = self.params.wdp.split("|") + method = domain_list[0] + par_list = [float(e) for e in domain_list[1].split(",")] + self.space["weight_decay_penalty"] = build_hyperopt_search_domain("weight_decay_penalty", method, par_list) + + # weight decay penalty type + if self.params.wdt: + domain_list = self.params.wdt.split("|") + method = domain_list[0] + par_list = domain_list[1].split(",") + self.space["weight_decay_penalty_type"] = build_hyperopt_search_domain("weight_decay_penalty_type", method, par_list) + elif self.params.model_type == "xgboost": #build searching domain for XGBoost parameters if self.params.xgbg: @@ -1433,6 +1449,18 @@ def __init__(self, params): par_list = [float(e) for e in domain_list[1].split(",")] self.space["xgbg"] = build_hyperopt_search_domain("xgbg", method, par_list) + if self.params.xgba: + domain_list = self.params.xgba.split("|") + method = domain_list[0] + par_list = [float(e) for e in domain_list[1].split(",")] + self.space["xgba"] = build_hyperopt_search_domain("xgba", method, par_list) + + if self.params.xgbb: + domain_list = self.params.xgbb.split("|") + method = domain_list[0] + par_list = [float(e) for e in domain_list[1].split(",")] + self.space["xgbb"] = build_hyperopt_search_domain("xgbb", method, par_list) + if self.params.xgbl: domain_list = self.params.xgbl.split("|") method = domain_list[0] @@ -1500,6 +1528,10 @@ def lossfn(p): self.params.learning_rate = p["learning_rate"] if self.params.dp: self.params.dropouts = ",".join([str(p[e]) for e in p if e[:2] == "dp"]) + if self.params.wdp: + self.params.weight_decay_penalty = p['weight_decay_penalty'] + if self.params.wdt: + self.params.weight_decay_penalty_type = p['weight_decay_penalty_type'] if self.params.ls: if not self.params.ls_ratio: 
self.params.layer_sizes = ",".join([str(p[e]) for e in p if e[:2] == "ls"]) @@ -1508,11 +1540,16 @@ def lossfn(p): for i in range(1,len([e for e in p if e[:5] == "ratio"])+1): list_layer_sizes.append(int(list_layer_sizes[-1] * p[f"ratio{i}"])) self.params.layer_sizes = ",".join([str(e) for e in list_layer_sizes]) - hp_params = f'{self.params.learning_rate}_{self.params.layer_sizes}_{self.params.dropouts}' - print(f"learning_rate: {self.params.learning_rate}, layer_sizes: {self.params.layer_sizes}, dropouts: {self.params.dropouts}") + hp_params = f'{self.params.learning_rate}_{self.params.layer_sizes}_{self.params.dropouts}_{self.params.weight_decay_penalty_type}_{self.params.weight_decay_penalty}' + print(f"learning_rate: {self.params.learning_rate}, layer_sizes: {self.params.layer_sizes}, dropouts: {self.params.dropouts}, " + f"weight_decay_penalty: {self.params.weight_decay_penalty_type}({self.params.weight_decay_penalty})") elif self.params.model_type == "xgboost": if self.params.xgbg: self.params.xgb_gamma = p["xgbg"] + if self.params.xgba: + self.params.xgb_alpha = p["xgba"] + if self.params.xgbb: + self.params.xgb_lambda = p["xgbb"] if self.params.xgbl: self.params.xgb_learning_rate = p["xgbl"] if self.params.xgbd: @@ -1525,8 +1562,10 @@ def lossfn(p): self.params.xgb_n_estimators = p["xgbn"] if self.params.xgbw: self.params.xgb_min_child_weight = p["xgbw"] - hp_params = f'{self.params.xgb_gamma}_{self.params.xgb_learning_rate}_{self.params.xgb_max_depth}_{self.params.xgb_colsample_bytree}_{self.params.xgb_subsample}_{self.params.xgb_n_estimators}_{self.params.xgb_min_child_weight}' + hp_params = f'{self.params.xgb_gamma}_{self.params.xgb_alpha}_{self.params.xgb_lambda}_{self.params.xgb_learning_rate}_{self.params.xgb_max_depth}_{self.params.xgb_colsample_bytree}_{self.params.xgb_subsample}_{self.params.xgb_n_estimators}_{self.params.xgb_min_child_weight}' print(f"xgb_gamma: {self.params.xgb_gamma}, " + f"xgb_alpha: {self.params.xgb_alpha}, " + f"xgb_lambda: {self.params.xgb_lambda}, " f"xgb_learning_rate: {self.params.xgb_learning_rate}, " f"xgb_max_depth: {self.params.xgb_max_depth}, " f"xgb_colsample_bytree: {self.params.xgb_colsample_bytree}, " @@ -1709,7 +1748,10 @@ def parse_params(param_list): 'slurm_time_limit'} | excluded_keys if params.search_type == 'hyperopt': # keep more parameters - keep_params = keep_params | {'lr', 'learning_rate','ls', 'layer_sizes','ls_ratio','dp', 'dropouts','rfe', 'rf_estimators','rfd', 'rf_max_depth','rff', 'rf_max_features','xgbg', 'xgb_gamma','xgbl', 'xgb_learning_rate', 'xgbd', 'xgb_max_depth', 'xgbc', 'xgb_colsample_bytree', 'xgbs', 'xgb_subsample', 'xgbn', 'xgb_n_estimators', 'xgbw', 'xgb_min_child_weight', 'hp_checkpoint_load', 'hp_checkpoint_save'} + keep_params = keep_params | {'lr', 'learning_rate','ls', 'layer_sizes','ls_ratio','dp', 'dropouts', 'wdp', 'weight_decay_penalty', 'wdt', 'weight_decay_penalty_type', + 'rfe', 'rf_estimators','rfd', 'rf_max_depth','rff', 'rf_max_features', + 'xgbg', 'xgb_gamma', 'xgba', 'xgb_alpha', 'xgbb', 'xgb_lambda', 'xgbl', 'xgb_learning_rate', 'xgbd', 'xgb_max_depth', 'xgbc', 'xgb_colsample_bytree', + 'xgbs', 'xgb_subsample', 'xgbn', 'xgb_n_estimators', 'xgbw', 'xgb_min_child_weight', 'hp_checkpoint_load', 'hp_checkpoint_save'} params.__dict__ = parse.prune_defaults(params, keep_params=keep_params) From 512d8eb13bbc518c596cb8110b802ff5072ccb57 Mon Sep 17 00:00:00 2001 From: "Kevin S. 
McLoughlin" Date: Fri, 31 May 2024 16:57:42 -0700 Subject: [PATCH 03/28] Added sparsity-related parameters weight_decay_penalty, weight_decay_penalty_type, xgb_alpha and xgb_lambda to set of model parameters displayed by the various compare_models functions. --- atomsci/ddm/pipeline/compare_models.py | 46 ++++++++++++++++++++++---- atomsci/ddm/pipeline/model_wrapper.py | 2 ++ 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/atomsci/ddm/pipeline/compare_models.py b/atomsci/ddm/pipeline/compare_models.py index bb66caca..0c023655 100644 --- a/atomsci/ddm/pipeline/compare_models.py +++ b/atomsci/ddm/pipeline/compare_models.py @@ -1,5 +1,5 @@ -"""Functions for comparing and visualizing model performance. Most of these functions rely on ATOM's model tracker and -datastore services, which are not part of the standard AMPL installation, but a few functions will work on collections of +"""Functions for comparing and visualizing model performance. Some of these functions rely on ATOM's model tracker and +datastore services, which are not part of the standard AMPL installation, but several functions will work on collections of models saved as local files. """ @@ -163,6 +163,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg model_type_list = [] max_epochs_list = [] learning_rate_list = [] + weight_decay_penalty_list = [] + weight_decay_penalty_type_list = [] dropouts_list = [] layer_sizes_list = [] featurizer_list = [] @@ -172,6 +174,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg rf_max_depth_list = [] xgb_learning_rate_list = [] xgb_gamma_list = [] + xgb_alpha_list = [] + xgb_lambda_list = [] xgb_max_depth_list = [] xgb_colsample_bytree_list = [] xgb_subsample_list = [] @@ -218,6 +222,9 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg max_epochs_list.append(nn_params['max_epochs']) best_epoch_list.append(nn_params['best_epoch']) learning_rate_list.append(nn_params['learning_rate']) + weight_decay_penalty_list.append(nn_params['weight_decay_penalty']) + weight_decay_penalty_type_list.append(nn_params['weight_decay_penalty_type']) + learning_rate_list.append(nn_params['learning_rate']) layer_sizes_list.append(','.join(['%d' % s for s in nn_params['layer_sizes']])) dropouts_list.append(','.join(['%.2f' % d for d in nn_params['dropouts']])) rf_estimators_list.append(nan) @@ -225,6 +232,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg rf_max_depth_list.append(nan) xgb_learning_rate_list.append(nan) xgb_gamma_list.append(nan) + xgb_alpha_list.append(nan) + xgb_lambda_list.append(nan) xgb_max_depth_list.append(nan) xgb_colsample_bytree_list.append(nan) xgb_subsample_list.append(nan) @@ -239,10 +248,14 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg max_epochs_list.append(nan) best_epoch_list.append(nan) learning_rate_list.append(nan) + weight_decay_penalty_list.append(nan) + weight_decay_penalty_type_list.append(nan) layer_sizes_list.append(nan) dropouts_list.append(nan) xgb_learning_rate_list.append(nan) xgb_gamma_list.append(nan) + xgb_alpha_list.append(nan) + xgb_lambda_list.append(nan) xgb_max_depth_list.append(nan) xgb_colsample_bytree_list.append(nan) xgb_subsample_list.append(nan) @@ -256,10 +269,14 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg max_epochs_list.append(nan) best_epoch_list.append(nan) learning_rate_list.append(nan) + weight_decay_penalty_list.append(nan) 
+ weight_decay_penalty_type_list.append(nan) layer_sizes_list.append(nan) dropouts_list.append(nan) xgb_learning_rate_list.append(xgb_params["xgb_learning_rate"]) xgb_gamma_list.append(xgb_params["xgb_gamma"]) + xgb_alpha_list.append(xgb_params.get("xgb_alpha", 0.0)) + xgb_lambda_list.append(xgb_params.get("xgb_lambda", 1.0)) xgb_max_depth_list.append(xgb_params["xgb_max_depth"]) xgb_colsample_bytree_list.append(xgb_params["xgb_colsample_bytree"]) xgb_subsample_list.append(xgb_params["xgb_subsample"]) @@ -277,6 +294,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg max_epochs=max_epochs_list, best_epoch=best_epoch_list, learning_rate=learning_rate_list, + weight_decay_penalty_type=weight_decay_penalty_type_list, + weight_decay_penalty=weight_decay_penalty_list, layer_sizes=layer_sizes_list, dropouts=dropouts_list, rf_estimators=rf_estimators_list, @@ -284,6 +303,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg rf_max_depth=rf_max_depth_list, xgb_learning_rate = xgb_learning_rate_list, xgb_gamma = xgb_gamma_list, + xgb_alpha = xgb_alpha_list, + xgb_lambda = xgb_lambda_list, xgb_max_depth = xgb_max_depth_list, xgb_colsample_bytree = xgb_colsample_bytree_list, xgb_subsample = xgb_subsample_list, @@ -307,15 +328,15 @@ def extract_model_and_feature_parameters(metadata_dict, keep_required=True): Returns: dictionary containing featurizer and model parameters. Most contain the following - keys. ['max_epochs', 'best_epoch', 'learning_rate', 'layer_sizes', 'dropouts', - 'rf_estimators', 'rf_max_features', 'rf_max_depth', 'xgb_gamma', 'xgb_learning_rate', + keys. ['max_epochs', 'best_epoch', 'learning_rate', 'weight_decay_penalty_type', 'weight_decay_penalty', 'layer_sizes', 'dropouts', + 'rf_estimators', 'rf_max_features', 'rf_max_depth', 'xgb_gamma', 'xgb_alpha', 'xgb_lambda', 'xgb_learning_rate', 'xgb_max_depth', 'xgb_colsample_bytree', 'xgb_subsample', 'xgb_n_estimators', 'xgb_min_child_weight', 'featurizer_parameters_dict', 'model_parameters_dict'] """ model_params = metadata_dict['model_parameters'] model_type = model_params['model_type'] - required = ['max_epochs', 'best_epoch', 'learning_rate', 'layer_sizes', 'dropouts', - 'rf_estimators', 'rf_max_features', 'rf_max_depth', 'xgb_gamma', 'xgb_learning_rate', + required = ['max_epochs', 'best_epoch', 'learning_rate', 'layer_sizes', 'dropouts', 'weight_decay_penalty_type', 'weight_decay_penalty', + 'rf_estimators', 'rf_max_features', 'rf_max_depth', 'xgb_gamma', 'xgb_alpha', 'xgb_lambda', 'xgb_learning_rate', 'xgb_max_depth', 'xgb_colsample_bytree', 'xgb_subsample', 'xgb_n_estimators', 'xgb_min_child_weight' ] @@ -328,6 +349,9 @@ def extract_model_and_feature_parameters(metadata_dict, keep_required=True): model_info['max_epochs'] = nn_params['max_epochs'] model_info['best_epoch'] = nn_params['best_epoch'] model_info['learning_rate'] = nn_params['learning_rate'] + model_info['weight_decay_penalty_type'] = nn_params['weight_decay_penalty_type'] + model_info['weight_decay_penalty'] = nn_params['weight_decay_penalty'] + model_info['learning_rate'] = nn_params['learning_rate'] model_info['layer_sizes'] = ','.join(['%d' % s for s in nn_params['layer_sizes']]) model_info['dropouts'] = ','.join(['%.2f' % d for d in nn_params['dropouts']]) elif model_type == 'RF': @@ -338,6 +362,8 @@ def extract_model_and_feature_parameters(metadata_dict, keep_required=True): elif model_type == 'xgboost': xgb_params = metadata_dict['xgb_specific'] model_info['xgb_gamma'] = xgb_params['xgb_gamma'] + 
model_info['xgb_alpha'] = xgb_params.get('xgb_alpha', 0.0) + model_info['xgb_lambda'] = xgb_params.get('xgb_lambda', 1.0) model_info['xgb_learning_rate'] = xgb_params['xgb_learning_rate'] model_info['xgb_max_depth'] = xgb_params['xgb_max_depth'] model_info['xgb_colsample_bytree'] = xgb_params['xgb_colsample_bytree'] @@ -979,6 +1005,8 @@ def get_filesystem_perf_results(result_dir, pred_type='classification', expand=T print(f"Can't access model {dirpath}") print("Found data for %d models under %s" % (len(model_list), result_dir)) + if len(model_list) == 0: + return pd.DataFrame() # build dictonary of tarball names tar_dict = {os.path.basename(tf):tf for tf in tar_list} @@ -1511,6 +1539,8 @@ def get_summary_metadata_table(uuids, collections=None): 'Layer Sizes': nn_params['layer_sizes'], 'Optimizer': nn_params['optimizer_type'], 'Learning Rate': nn_params['learning_rate'], + 'Weight decay penalty type': nn_params['weight_decay_penalty_type'], + 'Weight decay penalty': nn_params['weight_decay_penalty'], 'Dropouts': nn_params['dropouts'], 'Best Epoch (Max)': '%i (%i)' % (nn_params['best_epoch'],nn_params['max_epochs']), 'Collection': collection_name, @@ -1551,6 +1581,8 @@ def get_summary_metadata_table(uuids, collections=None): 'Model Type (Featurizer)': '%s (%s)' % (mdl_params['model_type'],featurizer), 'XGB learning rate': xgb_params['xgb_learning_rate'], 'Gamma': xgb_params['xgb_gamma'], + 'Alpha': xgb_params.get('xgb_alpha', 0.0), + 'Lambda': xgb_params.get('xgb_lambda', 1.0), 'XGB max depth': xgb_params['xgb_max_depth'], 'Column sample fraction': xgb_params['xgb_colsample_bytree'], 'Row subsample fraction': xgb_params['xgb_subsample'], @@ -2091,7 +2123,7 @@ def get_multitask_perf_from_tracker(collection_name, response_cols=None, expand_ 'weight_decay_penalty', 'weight_decay_penalty_type', 'weight_init_stddevs', 'splitter', 'split_uuid', 'split_test_frac', 'split_valid_frac', 'smiles_col', 'id_col', 'feature_transform_type', 'response_cols', 'response_transform_type', 'num_model_tasks', - 'rf_estimators', 'rf_max_depth', 'rf_max_features', 'xgb_gamma', 'xgb_learning_rate', + 'rf_estimators', 'rf_max_depth', 'rf_max_features', 'xgb_gamma', 'xgb_alpha', 'xgb_lambda', 'xgb_learning_rate', 'xgb_max_depth', 'xgb_colsample_bytree', 'xgb_subsample', 'xgb_n_estimators', 'xgb_min_child_weight', ] keepcols.extend(alldat.columns[alldat.columns.str.contains('best')]) diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py index db937a56..26eb19a3 100644 --- a/atomsci/ddm/pipeline/model_wrapper.py +++ b/atomsci/ddm/pipeline/model_wrapper.py @@ -2261,6 +2261,8 @@ def get_model_specific_metadata(self): "xgb_learning_rate" : self.params.xgb_learning_rate, "xgb_n_estimators" : self.params.xgb_n_estimators, "xgb_gamma" : self.params.xgb_gamma, + "xgb_alpha" : self.params.xgb_alpha, + "xgb_lambda" : self.params.xgb_lambda, "xgb_min_child_weight" : self.params.xgb_min_child_weight, "xgb_subsample" : self.params.xgb_subsample, "xgb_colsample_bytree" :self.params.xgb_colsample_bytree From b84098a18b1a3fdeaff183f8ceb8f509a600577f Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Wed, 12 Jun 2024 11:28:34 -0700 Subject: [PATCH 04/28] Fixed MTSS bug where population was not sorted by score after calling step(); changed order of operations so that grading happens at end of step() method rather than at beginning. Added serial_grade_population method for debugging. Simplified code to address some performance issues. 
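To make the fixed behavior concrete: after this change, GeneticAlgorithm keeps its population sorted by fitness at the end of every step(), so pop[0] and pop_scores[0] always refer to the current best chromosome. The toy sketch below assumes only the constructor signature the splitter itself uses (population, fitness_func, crossover_func, mutate_func) and default values for the remaining GA settings; the fitness, crossover and mutation functions are made up purely for illustration.

    import random
    from atomsci.ddm.pipeline import GeneticAlgorithm as ga

    def fitness(chrom):                  # higher is better
        return sum(chrom)

    def crossover(parents, num_pop):     # same call signature the splitter relies on
        return [random.choice(parents)[:] for _ in range(num_pop)]

    def mutate(pop):                     # flip one random bit per chromosome
        for chrom in pop:
            i = random.randrange(len(chrom))
            chrom[i] = 1 - chrom[i]
        return pop

    population = [[random.randint(0, 1) for _ in range(10)] for _ in range(20)]
    galg = ga.GeneticAlgorithm(population, fitness, crossover, mutate)
    for gen in range(5):
        galg.step()
        print(gen, galg.pop_scores[0])   # best score of the now-sorted population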
--- atomsci/ddm/pipeline/GeneticAlgorithm.py | 34 +++++++++++++++--------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/atomsci/ddm/pipeline/GeneticAlgorithm.py b/atomsci/ddm/pipeline/GeneticAlgorithm.py index 531b85ba..195ae83d 100644 --- a/atomsci/ddm/pipeline/GeneticAlgorithm.py +++ b/atomsci/ddm/pipeline/GeneticAlgorithm.py @@ -49,12 +49,24 @@ def __init__(self, self.fitness_func = fitness_func self.crossover_func = crossover_func self.mutate_func = mutate_func + #self.serial_grade_population() self.parallel_grade_population() - def parallel_grade_population(self): - """ Grade the population and save the scores - Updates the order of self.pop and self.pop_scores + def serial_grade_population(self): + """ Non-multithreaded version of parallel_grade_population, used for debugging fitness functions only. + """ + fitnesses = [] + for chrom in self.pop: + fitnesses.append(self.fitness_func(chrom)) + pairs = sorted(zip(fitnesses, self.pop), reverse=True) + self.pop = [chrome for fitness, chrome in pairs] + self.pop_scores = [fitness for fitness, chrome in pairs] + + + def parallel_grade_population(self): + """ Scores the chromosomes in the current population and sorts them in decreasing score order. + Saves the sorted scores in self.pop_scores. Parameters ---------- @@ -68,10 +80,7 @@ def parallel_grade_population(self): fitnesses = pool.map(self.fitness_func, self.pop) pool.close() pool.join() - pairs = list(zip(fitnesses, self.pop)) - - pairs.sort(key=lambda x: x[0], reverse=True) - + pairs = sorted(zip(fitnesses, self.pop), reverse=True) self.pop = [chrome for fitness, chrome in pairs] self.pop_scores = [fitness for fitness, chrome in pairs] @@ -90,10 +99,7 @@ def select_parents(self) -> List[List[Any]]: parents: List[List[Any]] A list of chromosomes that will be parents for the next generation """ - self.parallel_grade_population() - - parents = [chrome for chrome in self.pop[:self.num_parents]] - return parents + return self.pop[:self.num_parents] def iterate(self, num_generations: int): """ Iterates the genetic algorithm num_generations @@ -130,12 +136,13 @@ def step(self, print_timings: bool = False): start = timeit.default_timer() i = timeit.default_timer() + # select parents using rank selection parents = self.select_parents() if print_timings: print('\tfind parents %0.2f min'%((timeit.default_timer()-i)/60)) - # select parents using rank selection i = timeit.default_timer() + # Generate new population by crossing parent chromosomes new_pop = self.crossover_func(parents, self.num_pop) if print_timings: print('\tcrossover %0.2f min'%((timeit.default_timer()-i)/60)) @@ -143,6 +150,9 @@ def step(self, print_timings: bool = False): # mutate population i = timeit.default_timer() self.pop = self.mutate_func(new_pop) + # Compute scores for new chromosomes and sort population by score + #self.serial_grade_population() + self.parallel_grade_population() if print_timings: print('\tmutate %0.2f min'%((timeit.default_timer()-i)/60)) print('total %0.2f min'%((timeit.default_timer()-start)/60)) From e4a5f620b5e3084bf9470285529fe637aea29418 Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Wed, 12 Jun 2024 17:41:42 -0700 Subject: [PATCH 05/28] Changed to use single-threaded function to grade chromosomes, after seeing that it runs much faster than the multithreaded version. Added documentation. 
--- atomsci/ddm/pipeline/GeneticAlgorithm.py | 28 ++++++++++++++++++------ 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/atomsci/ddm/pipeline/GeneticAlgorithm.py b/atomsci/ddm/pipeline/GeneticAlgorithm.py index 195ae83d..98611d77 100644 --- a/atomsci/ddm/pipeline/GeneticAlgorithm.py +++ b/atomsci/ddm/pipeline/GeneticAlgorithm.py @@ -49,12 +49,22 @@ def __init__(self, self.fitness_func = fitness_func self.crossover_func = crossover_func self.mutate_func = mutate_func - #self.serial_grade_population() - self.parallel_grade_population() + self.serial_grade_population() + #self.parallel_grade_population() def serial_grade_population(self): - """ Non-multithreaded version of parallel_grade_population, used for debugging fitness functions only. + """ Scores the chromosomes in the current population and sorts them in decreasing score order. + Saves the sorted scores in self.pop_scores. Not multithreaded; surprisingly, this runs faster + than the multithreaded function parallel_grade_scores. + + Parameters + ---------- + None + + Returns + ------- + None. As a side effect, sorts the chromosomes in self.pop and updates the scores in self.pop_scores. """ fitnesses = [] for chrom in self.pop: @@ -66,7 +76,11 @@ def serial_grade_population(self): def parallel_grade_population(self): """ Scores the chromosomes in the current population and sorts them in decreasing score order. - Saves the sorted scores in self.pop_scores. + Saves the sorted scores in self.pop_scores. + + Although this does the same thing in multiple threads as the single-threaded function + serial_grade_population, it seems to run much slower, at least for multitask scaffold splits + with 100 chromosomes. Parameters ---------- @@ -74,7 +88,7 @@ def parallel_grade_population(self): Returns ------- - Nothing + None """ pool = multiprocessing.Pool(processes=N_PROCS) fitnesses = pool.map(self.fitness_func, self.pop) @@ -151,8 +165,8 @@ def step(self, print_timings: bool = False): i = timeit.default_timer() self.pop = self.mutate_func(new_pop) # Compute scores for new chromosomes and sort population by score - #self.serial_grade_population() - self.parallel_grade_population() + self.serial_grade_population() + #self.parallel_grade_population() if print_timings: print('\tmutate %0.2f min'%((timeit.default_timer()-i)/60)) print('total %0.2f min'%((timeit.default_timer()-start)/60)) From c524a7b0cd456134262a7ea8b030012856b73dfd Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Wed, 12 Jun 2024 17:45:20 -0700 Subject: [PATCH 06/28] Implemented a new fitness function to more robustly enforce dissimilarity between test and training set scaffold structures. Fixed a bug where the splitter always returned the split from the last generation rather than the best-ever split. Added code to track the individual fitness function terms over generations so that they can be displayed in diagnostic plots. 
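The far-fraction idea behind the new fitness term can be shown without any AMPL machinery. The standalone sketch below uses made-up boolean "has a close training neighbor" flags of the kind stored in has_near_neighbor_mat: for each test scaffold the flags are OR-ed across all training scaffolds, and the score is the fraction of test compounds left with no training neighbor within the Tanimoto threshold.

    import numpy as np

    # has_nn[t][s][k] is True if compound k of test scaffold t has a neighbor in
    # training scaffold s closer than dist_thresh (toy values for illustration).
    has_nn = {
        0: {0: np.array([True, False, False]), 1: np.array([False, False, True])},
        1: {0: np.array([False, False]),       1: np.array([False, True])},
    }

    near = total = 0
    for test_scaf, train_flags in has_nn.items():
        combined = np.logical_or.reduce(list(train_flags.values()))
        near += combined.sum()
        total += combined.size

    far_frac = 1 - near / total    # 1 - 3/5 = 0.4 for these toy values
    print(far_frac)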
--- .../ddm/pipeline/MultitaskScaffoldSplit.py | 269 +++++++++++------- 1 file changed, 167 insertions(+), 102 deletions(-) diff --git a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py index de1e7a48..f32f4428 100644 --- a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py +++ b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py @@ -159,55 +159,6 @@ def dist_smiles_from_ecfp(ecfp1: List[rdkit.DataStructs.cDataStructs.ExplicitBit return cd.calc_summary(dist_metrics.tanimoto(ecfp1, ecfp2), calc_type='nearest', num_nearest=1, within_dset=False) -def _generate_scaffold_dist_matrix(scaffold_lists: List[np.ndarray], - ecfp_features: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect]) -> np.ndarray: - """Returns a nearest neighbors distance matrix between each scaffold. - - The distance between two scaffolds is defined as the - the distance between the two closest compounds between the two - scaffolds. - - TODO: Ask Stewart: Why did he change to using the median instead of the min? - - Parameters - ---------- - scaffold_lists: List[np.ndarray] - List of scaffolds. A scaffold is a set of indices into ecfp_features - ecfp_features: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect] - List of ecfp features, one for each compound in the dataset - - Returns - ------- - dist_mat: np.ndarray - Distance matrix, symmetric. - """ - print('start generating big dist mat') - start = timeit.default_timer() - # First compute the full matrix of distances between all pairs of ECFPs - dmat = dist_metrics.tanimoto(ecfp_features) - - mat_shape = (len(scaffold_lists), len(scaffold_lists)) - scaff_dist_mat = np.zeros(mat_shape) - for i, scaff1 in tqdm(enumerate(scaffold_lists)): - scaff1_rows = scaff1.reshape((-1,1)) - for j, scaff2 in enumerate(scaffold_lists[:i]): - if i == j: - continue - - dists = dmat[scaff1_rows, scaff2].flatten() - min_dist = np.median(dists) - - if min_dist==0: - print("two scaffolds match exactly?!?", i, j) - print(len(set(scaff2).intersection(set(scaff1)))) - - scaff_dist_mat[i,j] = min_dist - scaff_dist_mat[j,i] = min_dist - - print("finished scaff dist mat: %0.2f min"%((timeit.default_timer()-start)/60)) - return scaff_dist_mat - - class MultitaskScaffoldSplitter(Splitter): """MultitaskScaffoldSplitter Splitter class. @@ -259,6 +210,66 @@ def generate_scaffolds(self, return scaffold_sets + def _generate_scaffold_dist_matrix(self): + """Computes matrices used by fitness functions that score splits according + to the dissimilarity between training and test compound structures. One is + a symmetric matrix of nearest neighbor Tanimoto distances between pairs of + scaffolds (i.e., between the most similar compounds in the two scaffolds). + This matrix is used to compute the `scaffold_diff_fitness` function. + + The other is a nonsymmetric matrix of boolean vectors `has_nn[i,j]`, of length + equal to the number of compounds in scaffold `i`, indicating whether each + compound has a neighbor in scaffold `j` nearer than some threshold Tanimoto + distance `dist_thresh`. dist_thresh defaults to 0.3, but could be parameterized. + The has_nn matrix is used to compute the `far_frac_fitness` function, which is + a more robust measurement of the train/test set dissimilarity. 
+ """ + scaffold_lists = self.ss + ecfp_features = self.ecfp_features + + # First compute the full matrix of distances between all pairs of ECFPs + dmat = dist_metrics.tanimoto(ecfp_features) + + mat_shape = (len(scaffold_lists), len(scaffold_lists)) + scaff_dist_mat = np.zeros(mat_shape) + has_near_neighbor_mat = np.empty(mat_shape, dtype=object) + + for i, scaff1 in enumerate(scaffold_lists): + scaff1_rows = scaff1.reshape((-1,1)) + for j, scaff2 in enumerate(scaffold_lists[:i]): + if i == j: + continue + + scaff1_dists = dmat[scaff1_rows, scaff2] + min_dist = np.min(scaff1_dists.flatten()) + + if min_dist==0: + #print("two scaffolds match exactly?!?", i, j) + print(f"Scaffolds {i} and {j} have at least one ECFP in common") + for k in scaff1: + for l in scaff2: + if dmat[k,l] == 0: + print(f"\tcompound {k} in scaffold {i}, compound {l} in scaffold {j}") + print(f"\tSMILES {k}: {self.smiles[k]}") + print(f"\tSMILES {l}: {self.smiles[l]}\n") + + + scaff_dist_mat[i,j] = min_dist + scaff_dist_mat[j,i] = min_dist + + # Identify the compounds in scaff1 with a neighbor in scaff2 closer than dist_thresh + has_near_neighbor_mat[i,j] = np.array(np.min(scaff1_dists, axis=1) < self.dist_thresh) + + # Identify the compounds in scaff2 with a neighbor in scaff1 closer than dist_thresh + scaff2_rows = scaff2.reshape((-1,1)) + scaff2_dists = dmat[scaff2_rows, scaff1] + has_near_neighbor_mat[j,i] = np.array(np.min(scaff2_dists, axis=1) < self.dist_thresh) + + + self.scaff_scaff_distmat = scaff_dist_mat + self.has_near_neighbor_mat = has_near_neighbor_mat + + def expand_scaffolds(self, scaffold_list: List[int]) -> List[int]: """Turns a list of scaffold indices into a list of compound indices @@ -348,51 +359,74 @@ def scaffold_diff_fitness(self, return min_dist - def ratio_fitness_old(self, split_chromosome: List[str]) -> float: - """Calculates a fitness score based on how accurately divided training/test/validation. + def far_frac_fitness(self, + split_chromosome: List[str], + train_part: str, + test_part: str) -> float: + """Grades a split according to the fraction of valid/test compounds with + nearest training set compounds further than some threshold. + + Grades the quality of the split based on which scaffolds were alloted to + which partitions. The score is measured as the fraction of compounds in + `test_part` with nearest neighbors in `train_part` at Tanimoto distance + `self.dist_thresh` or greater. + Parameters ---------- - List[str]: split_chromosome - A list of strings, index i contains a string, 'train', 'valid', 'test' - which determines the partition that scaffold belongs to + split_chromosome: List[str] + A split represented as a list of partition names. Index i in the + chromosome contains the partition for scaffold i. + train_part: str + The name of the partition to be treated as the training subset + test_part: str + The name of the partition to be treated as the test subset + Returns ------- - List[str] - A list of strings, index i contains a string, 'train', 'valid', 'test' - which determines the partition that scaffold belongs to + score: float + Floating point value beteween 0-1. 1 is the best score and 0 is the worst """ - start = timeit.default_timer() - total_counts = np.sum(self.dataset.w, axis=0) - subset_counts = [np.sum(self.dataset.w[subset], axis=0) for subset in \ - self.split_chromosome_to_compound_split(split_chromosome)] + # TODO: Eventually, replace strings in chromosome with integers indicating the partition + # for each scaffold. 
- # subset order goes train, valid, test. just consider train for the moment - subset_ratios = [subset_count/total_counts for subset_count in subset_counts] - #print("mean: %0.2f, median: %0.2f, std: %0.2f"%(np.mean(subset_ratios[0]), np.median(subset_ratios[0]), np.std(subset_ratios[0]))) - - # fitness of split is the size of the smallest training dataset - # ratio_fit = min(subset_ratios[0]) # this resulted in a split with 0 test. shoulda seen that coming - num_tasks = self.dataset.w.shape[1] - # imagine the perfect split is a point in 3*num_tasks space. - target_split = np.concatenate([[self.frac_train]*num_tasks, - [self.frac_valid]*num_tasks]) - # this is the current split also on 3*num_tasks space - current_split = np.concatenate(subset_ratios[:2]) - # if any partition is 0, then this split fails - if min(current_split) == 0: + train_scaffolds = [i for i, part in enumerate(split_chromosome) if part==train_part] + test_scaffolds = [i for i, part in enumerate(split_chromosome) if part==test_part] + + # if a partition is completely empty, return 0 + if len(train_scaffolds) == 0 or len(test_scaffolds) == 0: return 0 - # worst possible distance to normalize this between 0 and 1 - worst_distance = np.linalg.norm(np.ones(num_tasks*2)) - ratio_fit = 1 - np.linalg.norm(target_split-current_split)/worst_distance - #print("\tratio_fitness: %0.2f min"%((timeit.default_timer()-start)/60)) - return ratio_fit + # Compute the "far fraction": For each test scaffold S, OR together the boolean vectors + # from has_near_neighbor_mat for the training scaffolds indicating whether each compound in + # S has a neighbor in the train scaffold closer than Tanimoto distance dist_thresh. Sum + # the results over compounds and test scaffolds and divide by the test set size to get the + # "near fraction"; the far fraction is 1 minus the near fraction. + + near_count = 0 + total_count = 0 + for test_ind in test_scaffolds: + has_nn = None + for train_ind in train_scaffolds: + assert(not (train_ind == test_ind)) + if has_nn is None: + has_nn = self.has_near_neighbor_mat[test_ind, train_ind] + else: + has_nn |= self.has_near_neighbor_mat[test_ind, train_ind] + near_count += sum(has_nn) + total_count += len(has_nn) + + far_frac = 1 - near_count/total_count + #print(f"far_frac_fitness: near_count {near_count}, total_count {total_count}, far_frac {far_frac}") + return far_frac + def ratio_fitness(self, split_chromosome: List[str]) -> float: - """Calculates a fitness score based on how accurately divided training/test/validation. + """Calculates a fitness score based on how well the subset proportions for each task, taking + only labeled compounds into account, match the proportions requested by the user. - Treats the min,median,max of each of the three partitions as a 9D point and uses - euclidean distance to measure the distance to that point + The score is determined by combining the min, median and max over tasks of the subset fractions + into a 9-dimensional vector and computing its Euclidean distance from an ideal vector having + all the fractions as requested. Parameters ---------- @@ -405,15 +439,15 @@ def ratio_fitness(self, split_chromosome: List[str]) -> float: float A float between 0 and 1. 1 best 0 is worst """ - start = timeit.default_timer() + #start = timeit.default_timer() # total_counts is the number of labels per task total_counts = np.sum(self.dataset.w, axis=0) - # subset_counts is number of labels per task per subset + # subset_counts is number of labels per task per subset. 
subset_counts = [np.sum(self.dataset.w[subset], axis=0) for subset in \ self.split_chromosome_to_compound_split(split_chromosome)] - # subset order goes train, valid, test. just consider train for the moment + # subset order goes train, valid, test subset_ratios = [subset_count/total_counts for subset_count in subset_counts] # imagine the perfect split is a point in 9D space. For each subset we measure @@ -431,6 +465,7 @@ def ratio_fitness(self, split_chromosome: List[str]) -> float: return 0 # worst possible distance to normalize this between 0 and 1 worst_distance = np.linalg.norm(np.ones(len(target_split))) + worst_distance = np.sqrt(len(target_split)) ratio_fit = 1 - np.linalg.norm(target_split-current_split)/worst_distance #print("\tratio_fitness: %0.2f min"%((timeit.default_timer()-start)/60)) @@ -454,7 +489,7 @@ def response_distr_fitness(self, split_chromosome: List[str]) -> float: the train subset response value distributions, averaged over tasks. One means the distributions perfectly match. """ - start = timeit.default_timer() + #start = timeit.default_timer() dist_sum = 0.0 ntasks = self.dataset.y.shape[1] train_ind, valid_ind, test_ind = self.split_chromosome_to_compound_split(split_chromosome) @@ -496,16 +531,37 @@ def grade(self, split_chromosome: List[str]) -> float: fitness = 0.0 # Only call the functions for each fitness term if their weight is nonzero if self.diff_fitness_weight_tvt != 0.0: - fitness += self.diff_fitness_weight_tvt*self.scaffold_diff_fitness(split_chromosome, 'train', 'test') + #score = self.scaffold_diff_fitness(split_chromosome, 'train', 'test') + score = self.far_frac_fitness(split_chromosome, 'train', 'test') + fitness += self.diff_fitness_weight_tvt*score if self.diff_fitness_weight_tvv != 0.0: - fitness += self.diff_fitness_weight_tvv*self.scaffold_diff_fitness(split_chromosome, 'train', 'valid') + #score = self.scaffold_diff_fitness(split_chromosome, 'train', 'valid') + score = self.far_frac_fitness(split_chromosome, 'train', 'valid') + fitness += self.diff_fitness_weight_tvv*score if self.ratio_fitness_weight != 0.0: - fitness += self.ratio_fitness_weight*self.ratio_fitness(split_chromosome) + score = self.ratio_fitness(split_chromosome) + fitness += self.ratio_fitness_weight*score if self.response_distr_fitness_weight != 0.0: - fitness += self.response_distr_fitness_weight*self.response_distr_fitness(split_chromosome) + score = self.response_distr_fitness(split_chromosome) + fitness += self.response_distr_fitness_weight*score return fitness + def get_fitness_scores(self, split_chromosome): + fitness_scores = {} + # Only call the functions for each fitness term if their weight is nonzero + if self.diff_fitness_weight_tvt != 0.0: + #fitness_scores['test_scaf_dist'] = self.scaffold_diff_fitness(split_chromosome, 'train', 'test') + fitness_scores['test_scaf_dist'] = self.far_frac_fitness(split_chromosome, 'train', 'test') + if self.diff_fitness_weight_tvv != 0.0: + #fitness_scores['valid_scaf_dist'] = self.scaffold_diff_fitness(split_chromosome, 'train', 'valid') + fitness_scores['valid_scaf_dist'] = self.far_frac_fitness(split_chromosome, 'train', 'valid') + if self.ratio_fitness_weight != 0.0: + fitness_scores['ratio'] = self.ratio_fitness(split_chromosome) + if self.response_distr_fitness_weight != 0.0: + fitness_scores['response_distr'] = self.response_distr_fitness(split_chromosome) + return fitness_scores + def init_scaffolds(self, dataset: Dataset) -> None: """Creates super scaffolds used in splitting @@ -548,6 +604,7 @@ def split(self, 
num_super_scaffolds: int = 20, num_pop: int = 100, num_generations: int=30, + dist_thresh: float = 0.3, print_timings: bool = False, log_every_n: int = 1000) -> Tuple: """Creates a split for the given datset @@ -607,6 +664,7 @@ def split(self, self.num_super_scaffolds = num_super_scaffolds self.num_pop = num_pop self.num_generations = num_generations + self.dist_thresh = dist_thresh self.frac_train = frac_train self.frac_valid = frac_valid @@ -616,38 +674,43 @@ def split(self, self.init_scaffolds(self.dataset) # ecpf features + self.smiles = dataset.ids self.ecfp_features = calc_ecfp(dataset.ids) # calculate ScaffoldxScaffold distance matrix - start = timeit.default_timer() + #start = timeit.default_timer() if (self.diff_fitness_weight_tvv > 0.0) or (self.diff_fitness_weight_tvt > 0.0): - self.scaff_scaff_distmat = _generate_scaffold_dist_matrix(self.ss, self.ecfp_features) + self._generate_scaffold_dist_matrix() #print('scaffold dist mat %0.2f min'%((timeit.default_timer()-start)/60)) # initial population population = [] for i in range(self.num_pop): - start = timeit.default_timer() + #start = timeit.default_timer() split_chromosome = self._split(frac_train=frac_train, frac_valid=frac_valid, frac_test=frac_test) #print("per_loop: %0.2f min"%((timeit.default_timer()-start)/60)) population.append(split_chromosome) + self.fitness_terms = {} gene_alg = ga.GeneticAlgorithm(population, self.grade, ga_crossover, ga_mutate) #gene_alg.iterate(num_generations) + best_ever = None + best_ever_fit = -np.inf for i in range(self.num_generations): gene_alg.step(print_timings=print_timings) best_fitness = gene_alg.pop_scores[0] - print("step %d: best_fitness %0.2f"%(i, best_fitness)) - #print("%d: %0.2f"%(i, gene_alg.grade_population()[0][0])) - - best_fit = gene_alg.pop_scores[0] - best = gene_alg.pop[0] - - #print('best ever fitness %0.2f'%best_ever_fit) - result = self.split_chromosome_to_compound_split(best) + if best_fitness > best_ever_fit: + best_ever = gene_alg.pop[0] + best_ever_fit = best_fitness + score_dict = self.get_fitness_scores(best_ever) + for term, score in score_dict.items(): + self.fitness_terms.setdefault(term, []).append(score) + print(f"generation {i}: Best fitness {best_fitness:.2f}, best ever {best_ever_fit:.2f}") + + result = self.split_chromosome_to_compound_split(best_ever) return result def _split(self, @@ -736,6 +799,7 @@ def train_valid_test_split(self, train_dir: Optional[str] = None, valid_dir: Optional[str] = None, test_dir: Optional[str] = None, + dist_thresh: float = 0.3, log_every_n: int = 1000) -> Tuple[Dataset, Dataset, Dataset]: """Creates a split for the given datset @@ -803,6 +867,7 @@ def train_valid_test_split(self, diff_fitness_weight_tvv=diff_fitness_weight_tvv, ratio_fitness_weight=ratio_fitness_weight, response_distr_fitness_weight=response_distr_fitness_weight, num_super_scaffolds=num_super_scaffolds, num_pop=num_pop, num_generations=num_generations, + dist_thresh=0.3, print_timings=False) if train_dir is None: From be48c587bfe1c0b9f0b8095beba0f2a4bc029253 Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Mon, 17 Jun 2024 19:45:27 -0700 Subject: [PATCH 07/28] Normalize fitness scores to the range [0,1]. Add the total fitness score to the fitness_scores dictionary so that it can be plotted together with the component scores. 
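Written out, the normalized score returned by get_fitness_scores (and now reused by grade()) is simply the weighted mean of the component scores, each of which lies in [0,1], so the total also lies in [0,1] for any nonnegative weights with a nonzero sum:

    \mathrm{fitness}(c) \;=\; \frac{w_{\mathrm{tvt}}\,s_{\mathrm{tvt}}(c) + w_{\mathrm{tvv}}\,s_{\mathrm{tvv}}(c) + w_{\mathrm{ratio}}\,s_{\mathrm{ratio}}(c) + w_{\mathrm{resp}}\,s_{\mathrm{resp}}(c)}{w_{\mathrm{tvt}} + w_{\mathrm{tvv}} + w_{\mathrm{ratio}} + w_{\mathrm{resp}}}

Here the w's are diff_fitness_weight_tvt, diff_fitness_weight_tvv, ratio_fitness_weight and response_distr_fitness_weight, and the s terms are the corresponding component scores; components with zero weight are skipped in the code, which leaves the value unchanged since they would contribute zero to the numerator anyway.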
--- .../ddm/pipeline/MultitaskScaffoldSplit.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py index f32f4428..6c52128e 100644 --- a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py +++ b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py @@ -133,7 +133,7 @@ def calc_ecfp(smiles: List[str], fprints = [y for x in ecfps for y in x] #Flatten results else: mols = [Chem.MolFromSmiles(s) for s in smiles] - fprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in mols] + fprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 3, 1024) for mol in mols] return fprints @@ -528,6 +528,8 @@ def grade(self, split_chromosome: List[str]) -> float: float A float between 0 and 1. 1 best 0 is worst """ + + """ fitness = 0.0 # Only call the functions for each fitness term if their weight is nonzero if self.diff_fitness_weight_tvt != 0.0: @@ -545,21 +547,36 @@ def grade(self, split_chromosome: List[str]) -> float: score = self.response_distr_fitness(split_chromosome) fitness += self.response_distr_fitness_weight*score + # Normalize the score to the range [0,1] + fitness /= (self.diff_fitness_weight_tvt + self.diff_fitness_weight_tvv + self.ratio_fitness_weight + + self.response_distr_fitness_weight) return fitness + """ + fitness_scores = self.get_fitness_scores(split_chromosome) + return fitness_scores['total_fitness'] def get_fitness_scores(self, split_chromosome): fitness_scores = {} + total_fitness = 0.0 # Only call the functions for each fitness term if their weight is nonzero if self.diff_fitness_weight_tvt != 0.0: #fitness_scores['test_scaf_dist'] = self.scaffold_diff_fitness(split_chromosome, 'train', 'test') fitness_scores['test_scaf_dist'] = self.far_frac_fitness(split_chromosome, 'train', 'test') + total_fitness += self.diff_fitness_weight_tvt * fitness_scores['test_scaf_dist'] if self.diff_fitness_weight_tvv != 0.0: #fitness_scores['valid_scaf_dist'] = self.scaffold_diff_fitness(split_chromosome, 'train', 'valid') fitness_scores['valid_scaf_dist'] = self.far_frac_fitness(split_chromosome, 'train', 'valid') + total_fitness += self.diff_fitness_weight_tvv * fitness_scores['valid_scaf_dist'] if self.ratio_fitness_weight != 0.0: fitness_scores['ratio'] = self.ratio_fitness(split_chromosome) + total_fitness += self.ratio_fitness_weight * fitness_scores['ratio'] if self.response_distr_fitness_weight != 0.0: fitness_scores['response_distr'] = self.response_distr_fitness(split_chromosome) + total_fitness += self.response_distr_fitness_weight * fitness_scores['response_distr'] + # Normalize the score to the range [0,1] + total_fitness /= (self.diff_fitness_weight_tvt + self.diff_fitness_weight_tvv + self.ratio_fitness_weight + + self.response_distr_fitness_weight) + fitness_scores['total_fitness'] = total_fitness return fitness_scores def init_scaffolds(self, From da5a121c776da8b4d213d532ab5361973f1a841c Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Thu, 20 Jun 2024 17:46:00 -0700 Subject: [PATCH 08/28] Added early stopping to terminate GA if no fitness improvement after specified number of generations. Replaced print() calls with log messages so we can control verbosity of output. Changed split() to use log_every_n argument to control frequency of messages during GA operation. 
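Because the splitter's progress output now goes through a logger rather than print(), callers decide how chatty it is. A minimal sketch (the module registers its logger under the name 'ATOM', as the diff below shows):

    import logging

    logging.basicConfig(format='%(asctime)-15s %(message)s')
    # INFO shows the periodic best-fitness summaries and the early-stopping message;
    # DEBUG additionally shows scaffold sizes and duplicate-ECFP details.
    logging.getLogger('ATOM').setLevel(logging.INFO)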
--- .../ddm/pipeline/MultitaskScaffoldSplit.py | 42 +++++++++++-------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py index 6c52128e..9a4fbb18 100644 --- a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py +++ b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py @@ -1,4 +1,5 @@ import pdb +import logging import argparse import random import timeit @@ -24,6 +25,9 @@ from atomsci.ddm.pipeline import dist_metrics from atomsci.ddm.pipeline import GeneticAlgorithm as ga +logging.basicConfig(format='%(asctime)-15s %(message)s') +logger = logging.getLogger('ATOM') + def _generate_scaffold_hists(scaffold_sets: List[np.ndarray], w: np.array) -> np.array: """Counts the number of labelled samples per task per scaffold @@ -96,8 +100,8 @@ def smush_small_scaffolds(scaffolds: List[Set[int]], else: new_scaffolds.append(scaffold) - print('new scaffold lengths') - print([len(s) for s in new_scaffolds]) + logger.debug('new scaffold lengths') + logger.debug([len(s) for s in new_scaffolds]) new_scaffolds = [np.array(list(s)) for s in new_scaffolds] return new_scaffolds @@ -144,9 +148,9 @@ def dist_smiles_from_ecfp(ecfp1: List[rdkit.DataStructs.cDataStructs.ExplicitBit Parameters ---------- ecfp1: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect], - A list of ECFP finger prints + A list of ECFP fingerprints ecfp2: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect] - A list of ECFP finger prints + A list of ECFP fingerprints Returns ------- @@ -244,14 +248,13 @@ def _generate_scaffold_dist_matrix(self): min_dist = np.min(scaff1_dists.flatten()) if min_dist==0: - #print("two scaffolds match exactly?!?", i, j) - print(f"Scaffolds {i} and {j} have at least one ECFP in common") + logger.info(f"Scaffolds {i} and {j} have at least one ECFP in common") for k in scaff1: for l in scaff2: if dmat[k,l] == 0: - print(f"\tcompound {k} in scaffold {i}, compound {l} in scaffold {j}") - print(f"\tSMILES {k}: {self.smiles[k]}") - print(f"\tSMILES {l}: {self.smiles[l]}\n") + logger.debug(f"\tcompound {k} in scaffold {i}, compound {l} in scaffold {j}") + logger.debug(f"\tSMILES {k}: {self.smiles[k]}") + logger.debug(f"\tSMILES {l}: {self.smiles[l]}\n") scaff_dist_mat[i,j] = min_dist @@ -599,10 +602,10 @@ def init_scaffolds(self, # list of lists. 
one list per scaffold big_ss = self.generate_scaffolds(dataset) - # using the same stragetgy as scaffold split, combine the scaffolds + # using the same strategy as scaffold split, combine the scaffolds # together until you have roughly 100 scaffold sets self.ss = smush_small_scaffolds(big_ss, num_super_scaffolds=self.num_super_scaffolds) - print(f'num_super_scaffolds: {len(self.ss)}, {self.num_super_scaffolds}') + logger.info(f"Requested {self.num_super_scaffolds} super scaffolds, produced {len(self.ss)} from {len(big_ss)} original scaffolds") # rows is the number of scaffolds # columns is number of tasks @@ -623,7 +626,8 @@ def split(self, num_generations: int=30, dist_thresh: float = 0.3, print_timings: bool = False, - log_every_n: int = 1000) -> Tuple: + early_stopping_generations = 25, + log_every_n: int = 10) -> Tuple: """Creates a split for the given datset This split splits the dataset into a list of super scaffolds then @@ -695,10 +699,8 @@ def split(self, self.ecfp_features = calc_ecfp(dataset.ids) # calculate ScaffoldxScaffold distance matrix - #start = timeit.default_timer() if (self.diff_fitness_weight_tvv > 0.0) or (self.diff_fitness_weight_tvt > 0.0): self._generate_scaffold_dist_matrix() - #print('scaffold dist mat %0.2f min'%((timeit.default_timer()-start)/60)) # initial population population = [] @@ -706,7 +708,6 @@ def split(self, #start = timeit.default_timer() split_chromosome = self._split(frac_train=frac_train, frac_valid=frac_valid, frac_test=frac_test) - #print("per_loop: %0.2f min"%((timeit.default_timer()-start)/60)) population.append(split_chromosome) @@ -716,17 +717,24 @@ def split(self, #gene_alg.iterate(num_generations) best_ever = None best_ever_fit = -np.inf + best_gen = 0 for i in range(self.num_generations): gene_alg.step(print_timings=print_timings) best_fitness = gene_alg.pop_scores[0] if best_fitness > best_ever_fit: best_ever = gene_alg.pop[0] best_ever_fit = best_fitness + best_gen = i score_dict = self.get_fitness_scores(best_ever) for term, score in score_dict.items(): self.fitness_terms.setdefault(term, []).append(score) - print(f"generation {i}: Best fitness {best_fitness:.2f}, best ever {best_ever_fit:.2f}") + if i % log_every_n == 0: + logger.info(f"generation {i}: Best fitness {best_fitness:.3f}, best ever {best_ever_fit:.3f} at generation {best_gen}") + if (best_fitness <= best_ever_fit) and (i - best_gen >= early_stopping_generations): + logger.info(f"No fitness improvement after {early_stopping_generations} generations") + break + logger.info(f"Final best fitness score: {best_ever_fit:.3f} at generation {best_gen}") result = self.split_chromosome_to_compound_split(best_ever) return result @@ -817,7 +825,7 @@ def train_valid_test_split(self, valid_dir: Optional[str] = None, test_dir: Optional[str] = None, dist_thresh: float = 0.3, - log_every_n: int = 1000) -> Tuple[Dataset, Dataset, Dataset]: + log_every_n: int = 10) -> Tuple[Dataset, Dataset, Dataset]: """Creates a split for the given datset This split splits the dataset into a list of super scaffolds then From 035d2837798d61b83f0c14ba2ebba6447d566d59 Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Thu, 20 Jun 2024 17:50:49 -0700 Subject: [PATCH 09/28] Implemented enhancement request from AMPL issue #318: Random forest and XGBoost classification models now support class balancing weights when weight_transform_type parameter is set to 'balancing'. 
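A sketch of how the balancing option would be requested for a single-task binary classifier, assuming the usual ModelPipeline training entry point; every dataset-specific value below is a placeholder. With weight_transform_type set to 'balancing', the RF model is built with class_weight='balanced' and the XGBoost model gets scale_pos_weight equal to the negative/positive example ratio computed by get_class_freqs.

    from atomsci.ddm.pipeline import parameter_parser as parse
    from atomsci.ddm.pipeline import model_pipeline as mp

    params = parse.wrapper({
        "dataset_key": "my_binary_dataset.csv",   # placeholder
        "id_col": "compound_id",                  # placeholder
        "smiles_col": "smiles",                   # placeholder
        "response_cols": "active",                # placeholder 0/1 label column
        "prediction_type": "classification",
        "model_type": "xgboost",                  # or "RF"
        "featurizer": "ecfp",
        "weight_transform_type": "balancing",
        "result_dir": "./balanced_model",
    })
    mp.ModelPipeline(params).train_model()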
--- atomsci/ddm/pipeline/model_datasets.py | 20 +++++++++++++++++++- atomsci/ddm/pipeline/model_wrapper.py | 18 +++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py index c6b66326..a809ef3b 100644 --- a/atomsci/ddm/pipeline/model_datasets.py +++ b/atomsci/ddm/pipeline/model_datasets.py @@ -87,6 +87,24 @@ def check_task_columns(params, dset_df): raise Exception(f"Requested prediction task columns {missing_tasks} are missing from training dataset") +# **************************************************************************************** +def get_class_freqs(params): + """Given an input dataset for a classification model, returns the value counts + for each class for each task. Uses params.dataset_key to locate the dataset and params.response_cols + to identify the columns containing the class labels. Returns a list of lists, one for each task, + containing the value counts for each class (not counting missing values). + """ + dset_df = pd.read_csv(params.dataset_key) + freq_list = [] + # =-=ksm Strange but true: All classification tasks have to have the same number of classes. This may + # be a "feature" of DeepChem, because of the format of the model prediction arrays. + nclasses = params.class_number + for col in params.response_cols: + vc = dset_df[col].value_counts() + freqs = [vc.get(cl, 0) for cl in range(nclasses)] + freq_list.append(freqs) + return freq_list + # **************************************************************************************** def set_group_permissions(system, path, data_owner='public', data_owner_group='public'): """Set file group and permissions to standard values for a dataset containing proprietary @@ -408,7 +426,7 @@ def get_featurized_data(self, params=None): self.dataset = NumpyDataset(features, self.vals, ids=ids, w=w) # Checking for minimum number of rows if len(self.dataset) < params.min_compound_number: - self.log.warning("Dataset of length %i is shorter than the required length %i" % (len(self.dataset), params.min_compound_number)) + self.log.info("Dataset of length %i is shorter than the recommended length %i" % (len(self.dataset), params.min_compound_number)) # **************************************************************************************** def get_dataset_tasks(self, dset_df): diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py index 26eb19a3..7f5b59fe 100644 --- a/atomsci/ddm/pipeline/model_wrapper.py +++ b/atomsci/ddm/pipeline/model_wrapper.py @@ -49,6 +49,7 @@ from atomsci.ddm.utils import datastore_functions as dsf from atomsci.ddm.utils import llnl_utils +from atomsci.ddm.pipeline import model_datasets as md from atomsci.ddm.pipeline import transformations as trans from atomsci.ddm.pipeline import perf_data as perf import atomsci.ddm.pipeline.parameter_parser as pp @@ -1832,9 +1833,14 @@ def make_dc_model(self, model_dir): max_depth=self.params.rf_max_depth, n_jobs=-1) else: + if self.params.weight_transform_type == 'balancing': + class_weights = 'balanced' + else: + class_weights = None rf_model = RandomForestClassifier(n_estimators=self.params.rf_estimators, max_features=self.params.rf_max_features, max_depth=self.params.rf_max_depth, + class_weight=class_weights, n_jobs=-1) return dc.models.sklearn_models.SklearnModel(rf_model, model_dir=model_dir) @@ -2003,6 +2009,16 @@ def make_dc_model(self, model_dir): max_bin = 16, ) else: + if self.params.weight_transform_type == 'balancing': + 
# Compute a class weight for positive class samples to help deal with imbalanced datasets + class_freqs = md.get_class_freqs(self.params) + if len(class_freqs) > 1: + raise ValueError("xgboost models don't currently support multitask data") + if len(class_freqs[0]) > 2: + raise ValueError("xgboost models don't currently support multiclass data") + pos_class_weight = class_freqs[0][0]/class_freqs[0][1] + else: + pos_class_weight = 1 xgb_model = xgb.XGBClassifier(max_depth=self.params.xgb_max_depth, learning_rate=self.params.xgb_learning_rate, n_estimators=self.params.xgb_n_estimators, @@ -2017,7 +2033,7 @@ def make_dc_model(self, model_dir): colsample_bylevel=1, reg_alpha=self.params.xgb_alpha, reg_lambda=self.params.xgb_lambda, - scale_pos_weight=1, + scale_pos_weight=pos_class_weight, base_score=0.5, random_state=0, importance_type='gain', From b31769e1466e99f3dc584f50a1c111098ac08819 Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Fri, 28 Jun 2024 14:20:06 -0700 Subject: [PATCH 10/28] Save split_uuid in self.params at end of split_dataset(). --- atomsci/ddm/pipeline/model_pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py index bafaca4a..2486c313 100644 --- a/atomsci/ddm/pipeline/model_pipeline.py +++ b/atomsci/ddm/pipeline/model_pipeline.py @@ -535,6 +535,7 @@ def split_dataset(self, featurization=None): featurization = feat.create_featurization(self.params) self.featurization = featurization self.load_featurize_data() + self.params.split_uuid = self.data.split_uuid return self.data.split_uuid From 03b8642246e911ae88b0abe59bb97b345b92ec78 Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Fri, 28 Jun 2024 14:21:21 -0700 Subject: [PATCH 11/28] Added code to show valid & train Wasserstein distances in plot titles.
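Note: the hunk below calls compute_split_subset_wasserstein_distances(), whose definition is not shown in this patch. For reference, the following is only a minimal sketch of a helper with the same output columns (split_subset, response_col, distance), assuming scipy's wasserstein_distance and the get_split_labeled_dataset() helper from the same module; it is illustrative, not the actual implementation.

    from itertools import product
    import pandas as pd
    from scipy.stats import wasserstein_distance

    def compute_split_subset_wasserstein_distances(params):
        # Sketch: Wasserstein distance between the training set response distribution
        # and the valid/test distributions, for each response column.
        dset_df, _ = get_split_labeled_dataset(params)   # assumed helper from this module
        rows = []
        for subset, col in product(['valid', 'test'], params.response_cols):
            train_vals = dset_df.loc[dset_df.split_subset == 'train', col].dropna().values
            subset_vals = dset_df.loc[dset_df.split_subset == subset, col].dropna().values
            rows.append(dict(split_subset=subset, response_col=col,
                             distance=wasserstein_distance(train_vals, subset_vals)))
        return pd.DataFrame(rows)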
--- .../ddm/utils/split_response_dist_plots.py | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/atomsci/ddm/utils/split_response_dist_plots.py b/atomsci/ddm/utils/split_response_dist_plots.py index d457b814..ef7448db 100644 --- a/atomsci/ddm/utils/split_response_dist_plots.py +++ b/atomsci/ddm/utils/split_response_dist_plots.py @@ -46,6 +46,9 @@ def plot_split_subset_response_distrs(params, axes=None, plot_size=7): else: subset_order = ['train', 'valid', 'test'] + wdist_df = compute_split_subset_wasserstein_distances(params) + + if axes is None: fig, axes = plt.subplots(1, len(params.response_cols), figsize=(plot_size*len(params.response_cols), plot_size)) if len(params.response_cols) == 1: @@ -54,10 +57,18 @@ def plot_split_subset_response_distrs(params, axes=None, plot_size=7): axes = axes.flatten() for colnum, col in enumerate(params.response_cols): ax = axes[colnum] + if params.split_strategy == 'train_valid_test': + tvv_wdist = wdist_df[(wdist_df.split_subset == 'valid') & (wdist_df.response_col == col)].distance.values[0] + tvt_wdist = wdist_df[(wdist_df.split_subset == 'test') & (wdist_df.response_col == col)].distance.values[0] if params.prediction_type == 'regression': ax = sns.kdeplot(data=dset_df, x=col, hue='split_subset', hue_order=subset_order, bw_adjust=0.7, fill=True, common_norm=False, ax=ax) ax.set_title(f"{col} distribution by subset under {split_label}") + if params.split_strategy == 'train_valid_test': + ax.set_title(f"{col} distribution by subset under {split_label}\n" \ + f"Wasserstein distances: valid = {tvv_wdist:.3f}, test = {tvt_wdist:.3f}") + else: + ax.set_title(f"{col} distribution by subset under {split_label}") else: pct_active = [] for ss in subset_order: @@ -66,7 +77,12 @@ def plot_split_subset_response_distrs(params, axes=None, plot_size=7): pct_active.append(100*nactive/sum(ss_df[col].notna())) active_df = pd.DataFrame(dict(subset=subset_order, percent_active=pct_active)) ax = sns.barplot(data=active_df, x='subset', y='percent_active', hue='subset', ax=ax) - ax.set_title(f"Percent of {col} = 1 by subset under {split_label}") + + if params.split_strategy == 'train_valid_test': + ax.set_title(f"Percent of {col} = 1 by subset under {split_label}" \ + f"Wasserstein distances: valid = {tvv_wdist:.3f}, test = {tvt_wdist:.3f}") + else: + ax.set_title(f"Percent of {col} = 1 by subset under {split_label}") ax.set_xlabel('') # Restore previous matplotlib color cycle @@ -153,7 +169,7 @@ def get_split_labeled_dataset(params): """ if isinstance(params, dict): params = parse.wrapper(params) - dset_df = pd.read_csv(params.dataset_key, dtype={'compound_id': str}) + dset_df = pd.read_csv(params.dataset_key, dtype={params.id_col: str}) if params.split_strategy == 'k_fold_cv': split_file = f"{os.path.splitext(params.dataset_key)[0]}_{params.num_folds}_fold_cv_{params.splitter}_{params.split_uuid}.csv" else: @@ -167,6 +183,9 @@ def get_split_labeled_dataset(params): split_label = f"{nfolds}-fold {params.splitter} cross-validation split" else: dset_df['split_subset'] = dset_df.subset.values - split_label = f"{params.splitter} split" + if params.splitter == 'multitaskscaffold': + split_label = 'MTSS split' + else: + split_label = f"{params.splitter} split" return dset_df, split_label From 39dc2c19b1da77ecd483454c46d3c22a615162c5 Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Fri, 28 Jun 2024 14:23:53 -0700 Subject: [PATCH 12/28] Added functions to generate multi-plot displays to assess split quality. 
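For context, a hypothetical usage sketch for the new module; the dataset path, split_uuid, and column names below are placeholders rather than values taken from this patch.

    from atomsci.ddm.utils import split_diagnostic_plots as sdp

    split_params = dict(
        dataset_key='my_dataset.csv',          # placeholder path
        split_uuid='your-split-uuid',          # placeholder UUID from a previous split
        split_strategy='train_valid_test',
        splitter='multitaskscaffold',
        split_valid_frac=0.15,
        split_test_frac=0.15,
        num_folds=1,
        smiles_col='base_rdkit_smiles',        # placeholder column names
        id_col='compound_id',
        response_cols=['pIC50'],
        prediction_type='regression',
    )
    sdp.plot_split_diagnostics(split_params, num_cols=3, min_tvt_dist=0.3)

Passing the ModelPipeline that performed the split, rather than a parameter dict, additionally enables the fitness-term plot for multitaskscaffold splits.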
--- atomsci/ddm/utils/split_diagnostic_plots.py | 213 ++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 atomsci/ddm/utils/split_diagnostic_plots.py diff --git a/atomsci/ddm/utils/split_diagnostic_plots.py b/atomsci/ddm/utils/split_diagnostic_plots.py new file mode 100644 index 00000000..fc67aa05 --- /dev/null +++ b/atomsci/ddm/utils/split_diagnostic_plots.py @@ -0,0 +1,213 @@ +"""Functions to generate multi-plot displays to assess split quality: response value distributions, test-vs-train Tanimoto distance distributions, etc. +""" + +import os +import numpy as np +import pandas as pd +from argparse import Namespace +from atomsci.ddm.pipeline.model_pipeline import ModelPipeline +from atomsci.ddm.pipeline import parameter_parser as parse +from atomsci.ddm.pipeline import perf_plots as pp +from atomsci.ddm.utils import split_response_dist_plots as srdp +from atomsci.ddm.utils import compare_splits_plots as csp + +import matplotlib.pyplot as plt +import seaborn as sns +from scipy import stats + + +# --------------------------------------------------------------------------------------------------------------------------------- +def plot_split_diagnostics(params_or_pl, axes=None, num_rows=None, num_cols=1, min_tvt_dist=0.3, plot_size=7): + """Generate several plots showing various aspects of split quality. These include: The value distributions for each response + column in the different split subsets; the nearest-training-set-neighbor Tanimoto distance distributions for the validation + and test sets; the actual split subset proportions; and (for multitask scaffold splits) the progression of fitness term values + over generations of the genetic algorithm. + + Args: + params_or_pl (argparse.Namespace or dict or ModelPipeline): Structure containing dataset and split parameters, or + the ModelPipeline object used to perform a split. If a ModelPipeline is passed as this argument, the parameters + are extracted from it and an additional plot will be generated for multitaskscaffold splits showing the progression + of fitness term values over generations. The following parameters are required, if not set to default values: + + | - dataset_key + | - split_uuid + | - split_strategy + | - splitter + | - split_valid_frac + | - split_test_frac + | - num_folds + | - smiles_col + | - response_cols + + axes (matplotlib.Axes): Axes to draw plots in. If provided, must contain enough entries to display all the plots requested + (2 for each response column + 3 + 1 for a multitask scaffold split when a ModelPipeline is passed to params_or_pl). If not + provided, a figure and Axes of the required length will be created. + + num_rows (int): Number of rows for the Axes layout; ignored if an existing set of Axes are passed in the `axes` argument. + num_cols (int): Number of columns for the Axes layout; ignored if an existing set of Axes are passed in the `axes` argument. 
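+        min_tvt_dist (float): Tanimoto distance threshold used in the nearest-neighbor distance plots; the plot titles report the fraction of valid/test compounds whose nearest training set neighbor is within this distance. Defaults to 0.3.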
+ plot_size (float): Height of plots; ignored if `axes` is provided + Returns: + None + """ + + # Save current matplotlib color cycle and switch to 'colorblind' palette + old_palette = sns.color_palette() + sns.set_palette('colorblind') + + if isinstance(params_or_pl, dict): + params = parse.wrapper(params_or_pl) + elif isinstance(params_or_pl, Namespace): + params = params_or_pl + elif isinstance(params_or_pl, ModelPipeline): + params = params_or_pl.params + splitter = params_or_pl.data.splitting.splitter + params.fitness_terms = splitter.fitness_terms + params.split_uuid = params_or_pl.data.split_uuid + + else: + raise ValueError("params_or_pl must be dict, Namespace or ModelPipeline") + + # Figure out how many plots we're going to generate and how to lay them out + nresp = len(params.response_cols) + nfitness = int((params.splitter == 'multitaskscaffold') and (isinstance(params_or_pl, ModelPipeline))) + nplots = 2*nresp + 3 + nfitness + if axes is not None: + axes = axes.flatten() + if len(axes) < nplots: + raise ValueError(f"axes argument needs {nplots} axis pairs for requested plots; only provides {len(axes)}") + else: + if num_rows is None: + num_rows = (nplots + 1) // num_cols + fig, axes = plt.subplots(num_rows, num_cols, figsize=(num_cols*plot_size, num_rows*plot_size), layout='constrained') + axes = axes.flatten() + plot_num = 0 + + # Draw split response distribution plots + srdp.plot_split_subset_response_distrs(params, plot_size=plot_size, axes=axes) + + # Draw NN Tanimoto distance distribution plots with titles showing fraction of compounds below min_tvt_dist + plot_num += nresp + dset_path = params.dataset_key + split_path = f"{os.path.splitext(dset_path)[0]}_{params.split_strategy}_{params.splitter}_{params.split_uuid}.csv" + dset_df = pd.read_csv(dset_path, dtype={params.id_col: str}) + split_df = pd.read_csv(split_path, dtype={'cmpd_id': str}) + split_stats = csp.SplitStats(dset_df, split_df, smiles_col=params.smiles_col, id_col=params.id_col, + response_cols=params.response_cols) + vvtr_dists = split_stats.dists_tvv + tvtr_dists = split_stats.dists_tvt + vfrac = sum(vvtr_dists <= min_tvt_dist)/len(vvtr_dists) + tfrac = sum(tvtr_dists <= min_tvt_dist)/len(tvtr_dists) + ax = axes[plot_num] + ax = split_stats.dist_hist_train_v_valid_plot(ax=ax) + ax.set_title(f"Valid vs train Tanimoto NN distance distribution\nFraction <= {min_tvt_dist} = {vfrac:.3f}") + + plot_num += 1 + ax = axes[plot_num] + ax = split_stats.dist_hist_train_v_test_plot(ax=ax) + ax.set_title(f"Test vs train Tanimoto NN distance distribution\nFraction <= {min_tvt_dist} = {tfrac:.3f}") + + # Draw plots of actual and requested split proportions and counts for each response column after excluding missing values + plot_num += 1 + plot_split_fractions(params, axes[plot_num:]) + plot_num += nresp + + # Plot the evolution over generations of each term in the fitness function optimized by the multitask scaffold splitter. + if nfitness > 0: + plot_fitness_terms(params, axes[plot_num:]) + +def plot_fitness_terms(params, axes): + """Plot the evolution over generations of each term in the fitness function optimized by the multitask scaffold splitter. + + Args: + params (argparse.Namespace or dict): Structure containing dataset and split parameters. The parameters must include + the key `fitness_terms`, which is created by the multitask splitter; usually this is only available when `params` + is derived from the ModelPipeline object used to split the dataset. 
+ + axes (numpy.array of matplotlib.Axes): Axes to draw plots in. + """ + if isinstance(params, dict): + params = parse.wrapper(params) + if isinstance(axes, np.ndarray): + ax = axes[0] + else: + ax = axes + fitness_df = pd.DataFrame(params.fitness_terms) + fitness_df['generation'] = list(range(len(fitness_df))) + fit_long_df = fitness_df.melt(id_vars='generation', value_vars=list(params.fitness_terms.keys()), var_name='Fitness term', value_name='Score') + ax = sns.lineplot(data=fit_long_df, x='generation', y='Score', hue='Fitness term', ax=ax) + ax.set_title('Unweighted fitness scores vs generation') + + +def plot_split_fractions(params, axes): + """Draw plots of actual and requested split proportions and counts for each response column after excluding missing values + + Args: + params (argparse.Namespace or dict): Structure containing dataset and split parameters. + The following parameters are required, if not set to default values: + + | - dataset_key + | - split_uuid + | - split_strategy + | - splitter + | - split_valid_frac + | - split_test_frac + | - num_folds + | - smiles_col + | - response_cols + + axes (matplotlib.Axes): Axes to draw plots in. Must contain at least as many entries as response columns in the dataset. + """ + split_dset_df, _ = srdp.get_split_labeled_dataset(params) + axes = axes.flatten() + req_color = pp.test_col + actual_color = pp.train_col + for icol, resp_col in enumerate(params.response_cols): + ss_df = split_dset_df[split_dset_df[resp_col].notna()] + ss_counts = ss_df.subset.value_counts() + total_count = sum(ss_counts) + ss_fracs = ss_counts / total_count + actual_df = pd.DataFrame(dict(counts=ss_counts, fracs=ss_fracs)) + req_fracs = dict(train=1-params.split_valid_frac-params.split_test_frac, valid=params.split_valid_frac, test=params.split_test_frac) + req_df = pd.DataFrame(dict(fracs=req_fracs)) + req_df['counts'] = np.round(total_count * req_df.fracs).astype(int) + + bar_width = 0.35 + + # Positions of the bars on the x-axis + r1 = np.arange(len(actual_df)) + r2 = r1 + bar_width + + # Plotting the proportions + ax1 = axes[icol] + bars_requested = ax1.bar(r1, req_df.fracs.values, color=req_color, width=bar_width, label='Requested', alpha=0.6) + bars_actual = ax1.bar(r2, actual_df.fracs.values, color=actual_color, width=bar_width, label='Actual', alpha=0.6) + max_count = max(max(actual_df.counts.values), max(req_df.counts.values)) + + # Left Y-axis (proportions) + ax1.set_ylabel('Proportions') + #ax1.tick_params(axis='y') + + # Create the second Y-axis (counts) + ax2 = ax1.twinx() + ax2.set_ylabel('Counts') + #ax2.tick_params(axis='y', labelcolor='k') + ax2.set_ylim(0, max_count*1.1) + + # Adding counts on top of the bars + for bar, count in zip(bars_requested, req_df.counts.values): + height = bar.get_height() + ax1.text(bar.get_x() + bar.get_width() / 2, height, f'{count}', ha='center', va='bottom', color=actual_color) + + for bar, count in zip(bars_actual, actual_df.counts.values): + height = bar.get_height() + ax1.text(bar.get_x() + bar.get_width() / 2, height, f'{count}', ha='center', va='bottom', color=req_color) + + # X-axis labels + plt.xticks([r + bar_width / 2 for r in range(len(actual_df))], actual_df.index.values) + + # Title and legend + if icol == 0: + plt.title(f"Requested and Actual Subset Proportions and Counts\n{resp_col}") + ax1.legend(loc='upper right') + else: + plt.title(f"\n{resp_col}") From 8ea481a16d21f5fe3780e605cbe4778b1a198e58 Mon Sep 17 00:00:00 2001 From: "Kevin S. 
McLoughlin" Date: Mon, 1 Jul 2024 12:34:40 -0700 Subject: [PATCH 13/28] Fixed plot_split_fractions so that bars always appear in order train, valid, test. --- atomsci/ddm/utils/split_diagnostic_plots.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/atomsci/ddm/utils/split_diagnostic_plots.py b/atomsci/ddm/utils/split_diagnostic_plots.py index fc67aa05..447c5166 100644 --- a/atomsci/ddm/utils/split_diagnostic_plots.py +++ b/atomsci/ddm/utils/split_diagnostic_plots.py @@ -166,9 +166,9 @@ def plot_split_fractions(params, axes): ss_counts = ss_df.subset.value_counts() total_count = sum(ss_counts) ss_fracs = ss_counts / total_count - actual_df = pd.DataFrame(dict(counts=ss_counts, fracs=ss_fracs)) + actual_df = pd.DataFrame(dict(counts=ss_counts, fracs=ss_fracs)).reindex(['train', 'valid', 'test']) req_fracs = dict(train=1-params.split_valid_frac-params.split_test_frac, valid=params.split_valid_frac, test=params.split_test_frac) - req_df = pd.DataFrame(dict(fracs=req_fracs)) + req_df = pd.DataFrame(dict(fracs=req_fracs)).reindex(['train', 'valid', 'test']) req_df['counts'] = np.round(total_count * req_df.fracs).astype(int) bar_width = 0.35 From bc43bb50495845164c0535d209c77a259e272584 Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Thu, 11 Jul 2024 18:55:42 -0700 Subject: [PATCH 14/28] Added initial version of function to draw line plot of NN feature weight absolute sums vs epoch, to assess effect of weight decay penalty. --- atomsci/ddm/pipeline/feature_importance.py | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/atomsci/ddm/pipeline/feature_importance.py b/atomsci/ddm/pipeline/feature_importance.py index 38135f6c..25be1c84 100755 --- a/atomsci/ddm/pipeline/feature_importance.py +++ b/atomsci/ddm/pipeline/feature_importance.py @@ -525,3 +525,33 @@ def _calc_cluster_permutation_scores(estimator, X, y, col_indices, random_state, +# =================================================================================================== +def plot_nn_feature_weights_vs_epoch(pipeline, axes=None, plot_width=20, plot_height=15): + """Plot a line graph of summed weight absolute values from neural network feature nodes to first hidden layer vs + training epoch. This plot is intended to show which features have the most influence on the final + prediction output. It is also used to show the effect of increasing the weight_decay_penalty + parameter, which _should_ result in decreasing the weights on less important features. + + Args: + pipeline (ModelPipeline): The pipeline object from a neural network model that was trained in the + current Python session. + axes (matplotlib.Axes): An Axes object to draw the plot in. If none is provided, the function will + create one in a figure with the specified width and height. 
+ plot_width (float): Figure width + plot_height (float): Figure height + + Returns: + None + """ + if axes is None: + fig, axes = plt.subplots(figsize=(plot_width, plot_height)) + fig.suptitle("Sums of feature node : hidden layer absolute weights vs epoch") + + wt_df = pipeline.model_wrapper.feature_weights_df + max_epoch = len(wt_df) + best_epoch = pipeline.model_wrapper.best_epoch + for feat in pipeline.data.featurization.get_feature_columns(): + sns.lineplot(data=wt_df, x='epoch', y=feat, ax=axes) + axes.text(max_epoch + 4, wt_df[feat].values[max_epoch-1], feat) + axes.set_ylabel("Sum over 1st hidden layer nodes of feature weight absolute values") + axes.axvline(best_epoch, linestyle='--', color='blue') \ No newline at end of file From f6c7a44e5ea12103f1ffb629abaf03e2dc3ab4c2 Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Thu, 11 Jul 2024 18:57:16 -0700 Subject: [PATCH 15/28] Removed old PRC plot function. --- atomsci/ddm/pipeline/perf_plots.py | 53 ------------------------------ 1 file changed, 53 deletions(-) diff --git a/atomsci/ddm/pipeline/perf_plots.py b/atomsci/ddm/pipeline/perf_plots.py index c1b2eeec..36590d1b 100644 --- a/atomsci/ddm/pipeline/perf_plots.py +++ b/atomsci/ddm/pipeline/perf_plots.py @@ -845,59 +845,6 @@ def plot_prec_recall_curve(MP, epoch_label='best', plot_size=7, pdf_dir=None): pdf.close() MP.log.info("Wrote plot to %s" % pdf_path) -#------------------------------------------------------------------------------------------------------------------------ -def old_plot_prec_recall_curve(MP, epoch_label='best', plot_size=7, pdf_dir=None): - """Plot precision-recall curves for a classification model. - - Args: - MP (`ModelPipeline`): Pipeline object for a model that was trained in the current Python session. - - epoch_label (str): Label for training epoch to draw predicted values from. Currently 'best' is the only allowed value. - - pdf_dir (str): If given, output the plots to a PDF file in the given directory. 
- - Returns: - None - - """ - params = MP.params - curve_data = _get_perf_curve_data(MP, epoch_label, 'precision-recall') - if len(curve_data) == 0: - return - if MP.run_mode == 'training': - # Draw overlapping PR curves for train, valid and test sets - subsets = ['train', 'valid', 'test'] - else: - subsets = ['full'] - if pdf_dir is not None: - pdf_path = os.path.join(pdf_dir, '%s_%s_model_%s_features_%s_split_PRC_curves.pdf' % ( - params.dataset_name, params.model_type, params.featurizer, params.splitter)) - pdf = PdfPages(pdf_path) - subset_colors = dict(train=train_col, valid=valid_col, test=test_col, full=full_col) - # For multitask, do a separate figure for each task - ntasks = curve_data[subsets[0]]['prob_active'].shape[1] - for i in range(ntasks): - fig, ax = plt.subplots(figsize=(plot_size,plot_size)) - title = '%s dataset\nPrecision-recall curve for %s %s classifier on %s features with %s split' % ( - params.dataset_name, params.response_cols[i], - params.model_type, params.featurizer, params.splitter) - for subset in subsets: - precision, recall, thresholds = metrics.precision_recall_curve(curve_data[subset]['true_classes'][:,i], - curve_data[subset]['prob_active'][:,i]) - - prc_auc = curve_data[subset]['prc_aucs'][i] - ax.step(recall, precision, color=subset_colors[subset], label="%s: AUC = %.3f" % (subset, prc_auc)) - ax.set_xlabel('Recall') - ax.set_ylabel('Precision') - ax.set_title(title, fontdict={'fontsize' : 12}) - legend = ax.legend(loc='lower right') - - if pdf_dir is not None: - pdf.savefig(fig) - if pdf_dir is not None: - pdf.close() - MP.log.info("Wrote plot to %s" % pdf_path) - #------------------------------------------------------------------------------------------------------------------------ def plot_umap_feature_projections(MP, ndim=2, num_neighbors=20, min_dist=0.1, fit_to_train=True, From 48a53cb57e08c16cdec4f527a683567e302eab04 Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Mon, 6 May 2024 23:45:17 -0700 Subject: [PATCH 16/28] (1) Added code to DCNNModelWrapper to track absolute sum of 1st hidden layer weights for each feature as it evolves over epochs, to see effect of setting weight decay parameters; (2) Added model parameters xgb_alpha and xgb_lambda and corresponding hyperopt parameters xgba and xgbb to control strength of L1 and L2 regularization penalties. --- atomsci/ddm/pipeline/model_wrapper.py | 31 ++++++++++++++++++------ atomsci/ddm/pipeline/parameter_parser.py | 18 ++++++++++++++ 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py index 5bd23977..40956132 100644 --- a/atomsci/ddm/pipeline/model_wrapper.py +++ b/atomsci/ddm/pipeline/model_wrapper.py @@ -10,6 +10,7 @@ import deepchem as dc import numpy as np +import pandas as pd import tensorflow as tf if dc.__version__.startswith('2.1'): from deepchem.models.tensorgraph.fcnet import MultitaskRegressor, MultitaskClassifier @@ -986,6 +987,9 @@ def train_with_early_stopping(self, pipeline): valid_epoch_perfs (np.array): A standard validation set performance metric (r2_score or roc_auc), at the end of each epoch. 
""" self.data = pipeline.data + feature_names = self.data.featurization.get_feature_columns() + nfeatures = len(feature_names) + self.feature_weights = dict(zip(feature_names, [[] for f in feature_names])) em = perf.EpochManager(self, prediction_type=self.params.prediction_type, @@ -1008,11 +1012,22 @@ def train_with_early_stopping(self, pipeline): ei, pipeline.metric_type, train_perf, pipeline.metric_type, valid_perf, pipeline.metric_type, test_perf)) + layer1_weights = self.model.model.layers[0].weight + feature_weights = np.zeros(nfeatures, dtype=float) + for node_weights in layer1_weights: + node_feat_weights = torch.abs(node_weights).detach().numpy() + feature_weights += node_feat_weights + for fnum, fname in enumerate(feature_names): + self.feature_weights[fname].append(feature_weights[fnum]) + self.num_epochs_trained = ei + 1 # Compute performance metrics for each subset, and check if we've reached a new best validation set score if em.should_stop(): break + self.feature_weights_df = pd.DataFrame(self.feature_weights) + self.feature_weights_df['epoch'] = range(len(self.feature_weights_df)) + # Revert to last checkpoint self.restore() self.model_save() @@ -1993,8 +2008,8 @@ def make_dc_model(self, model_dir): subsample=self.params.xgb_subsample, colsample_bytree=self.params.xgb_colsample_bytree, colsample_bylevel=1, - reg_alpha=0, - reg_lambda=1, + reg_alpha=self.params.xgb_alpha, + reg_lambda=self.params.xgb_lambda, scale_pos_weight=1, base_score=0.5, random_state=0, @@ -2018,8 +2033,8 @@ def make_dc_model(self, model_dir): subsample=self.params.xgb_subsample, colsample_bytree=self.params.xgb_colsample_bytree, colsample_bylevel=1, - reg_alpha=0, - reg_lambda=1, + reg_alpha=self.params.xgb_alpha, + reg_lambda=self.params.xgb_lambda, scale_pos_weight=1, base_score=0.5, random_state=0, @@ -2128,8 +2143,8 @@ def reload_model(self, reload_dir): subsample=self.params.xgb_subsample, colsample_bytree=self.params.xgb_colsample_bytree, colsample_bylevel=1, - reg_alpha=0, - reg_lambda=1, + reg_alpha=self.params.xgb_alpha, + reg_lambda=self.params.xgb_lambda, scale_pos_weight=1, base_score=0.5, random_state=0, @@ -2153,8 +2168,8 @@ def reload_model(self, reload_dir): subsample=self.params.xgb_subsample, colsample_bytree=self.params.xgb_colsample_bytree, colsample_bylevel=1, - reg_alpha=0, - reg_lambda=1, + reg_alpha=self.params.xgb_alpha, + reg_lambda=self.params.xgb_lambda, scale_pos_weight=1, base_score=0.5, random_state=0, diff --git a/atomsci/ddm/pipeline/parameter_parser.py b/atomsci/ddm/pipeline/parameter_parser.py index 8d960b38..2c2a31c6 100644 --- a/atomsci/ddm/pipeline/parameter_parser.py +++ b/atomsci/ddm/pipeline/parameter_parser.py @@ -535,6 +535,8 @@ def get_list_args(self): 'umap_targ_wt', 'umap_min_dist', 'dropout_list','weight_decay_penalty', 'xgb_learning_rate', 'xgb_gamma', + 'xgb_alpha', + 'xgb_lambda', "xgb_min_child_weight", "xgb_subsample", "xgb_colsample_bytree", @@ -549,6 +551,8 @@ def get_list_args(self): not_a_list_outside_of_hyperparams = {'learning_rate','weight_decay_penalty', 'xgb_learning_rate', 'xgb_gamma', + 'xgb_alpha', + 'xgb_lambda', 'xgb_min_child_weight', 'xgb_subsample', 'xgb_colsample_bytree', @@ -1314,6 +1318,14 @@ def get_parser(): '--xgb_gamma', dest='xgb_gamma', default='0.0', help='Minimum loss reduction required to make a further partition on a leaf node of the tree. Can be input' ' as a comma separated list for hyperparameter search (e.g. 
\'0.0,0.1,0.2\')') + parser.add_argument( + '--xgb_alpha', dest='xgb_alpha', default='0.0', + help='L1 regularization term on weights. Increasing this value will make model more conservative. Can be input' + ' as a comma separated list for hyperparameter search (e.g. \'0.0,0.1,0.2\')') + parser.add_argument( + '--xgb_lambda', dest='xgb_lambda', default='1.0', + help='L2 regularization term on weights. Increasing this value will make model more conservative. Can be input' + ' as a comma separated list for hyperparameter search (e.g. \'0.0,0.1,0.2\')') parser.add_argument( '--xgb_learning_rate', dest='xgb_learning_rate', default='0.1', help='Boosting learning rate (xgb\'s \"eta\"). Can be input as a comma separated list for hyperparameter' @@ -1524,6 +1536,12 @@ def get_parser(): parser.add_argument( '--xgbg', dest='xgbg', required=False, default=None, help='xgb_gamma shown in HyperOpt domain format, e.g. --xgbg=uniform|0,0.4') + parser.add_argument( + '--xgba', dest='xgba', required=False, default=None, + help='xgb_alpha shown in HyperOpt domain format, e.g. --xgba=uniform|0,0.4') + parser.add_argument( + '--xgbb', dest='xgbb', required=False, default=None, + help='xgb_lambda shown in HyperOpt domain format, e.g. --xgbb=uniform|0,0.4') parser.add_argument( '--xgbl', dest='xgbl', required=False, default=None, help='xgb_learning_rate shown in HyperOpt domain format, e.g. --xgbl=loguniform|-6.9,-2.3') From 0f0eded0b567accc9a57e8433e3d0ec12ce608f9 Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Fri, 31 May 2024 16:49:08 -0700 Subject: [PATCH 17/28] Added two new parameters, wdp and wdt, to allow hyperopt domain searching on weight_decay_penalty and weight_decay_penalty_type parameters. Completed implementation of hyperopt domain specification for xgb_alpha and xgb_lambda. --- atomsci/ddm/pipeline/parameter_parser.py | 6 +++ .../ddm/utils/hyperparam_search_wrapper.py | 50 +++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/atomsci/ddm/pipeline/parameter_parser.py b/atomsci/ddm/pipeline/parameter_parser.py index 2c2a31c6..07a3cf5f 100644 --- a/atomsci/ddm/pipeline/parameter_parser.py +++ b/atomsci/ddm/pipeline/parameter_parser.py @@ -1522,6 +1522,12 @@ def get_parser(): parser.add_argument( '--dp', dest='dp', required=False, default=None, help='dropouts shown in HyperOpt domain format, e.g. --dp=uniform|3|0,0.4') + parser.add_argument( + '--wdp', dest='wdp', required=False, default=None, + help='weight_decay_penalty shown in HyperOpt domain format, e.g. --wdp=loguniform|-6.908,-4.605') + parser.add_argument( + '--wdt', dest='wdt', required=False, default=None, + help='weight_decay_penalty_type shown in HyperOpt domain format, e.g. 
--wdt=choice|l1,l2') # RF model parser.add_argument( '--rfe', dest='rfe', required=False, default=None, diff --git a/atomsci/ddm/utils/hyperparam_search_wrapper.py b/atomsci/ddm/utils/hyperparam_search_wrapper.py index 832de9ee..075a034e 100755 --- a/atomsci/ddm/utils/hyperparam_search_wrapper.py +++ b/atomsci/ddm/utils/hyperparam_search_wrapper.py @@ -1418,6 +1418,7 @@ def __init__(self, params): for i in range(1,num_layer): self.space[f"ratio{i}"] = build_hyperopt_search_domain(f"ratio{i}", method, par_list) + # dropouts if self.params.dp: domain_list = self.params.dp.split("|") method = domain_list[0] @@ -1425,6 +1426,21 @@ def __init__(self, params): par_list = [float(e) for e in domain_list[2].split(",")] for i in range(num_layer): self.space[f"dp{i}"] = build_hyperopt_search_domain(f"dp{i}", method, par_list) + + # weight decay penalty + if self.params.wdp: + domain_list = self.params.wdp.split("|") + method = domain_list[0] + par_list = [float(e) for e in domain_list[1].split(",")] + self.space["weight_decay_penalty"] = build_hyperopt_search_domain("weight_decay_penalty", method, par_list) + + # weight decay penalty type + if self.params.wdt: + domain_list = self.params.wdt.split("|") + method = domain_list[0] + par_list = domain_list[1].split(",") + self.space["weight_decay_penalty_type"] = build_hyperopt_search_domain("weight_decay_penalty_type", method, par_list) + elif self.params.model_type == "xgboost": #build searching domain for XGBoost parameters if self.params.xgbg: @@ -1433,6 +1449,18 @@ def __init__(self, params): par_list = [float(e) for e in domain_list[1].split(",")] self.space["xgbg"] = build_hyperopt_search_domain("xgbg", method, par_list) + if self.params.xgba: + domain_list = self.params.xgba.split("|") + method = domain_list[0] + par_list = [float(e) for e in domain_list[1].split(",")] + self.space["xgba"] = build_hyperopt_search_domain("xgba", method, par_list) + + if self.params.xgbb: + domain_list = self.params.xgbb.split("|") + method = domain_list[0] + par_list = [float(e) for e in domain_list[1].split(",")] + self.space["xgbb"] = build_hyperopt_search_domain("xgbb", method, par_list) + if self.params.xgbl: domain_list = self.params.xgbl.split("|") method = domain_list[0] @@ -1500,6 +1528,10 @@ def lossfn(p): self.params.learning_rate = p["learning_rate"] if self.params.dp: self.params.dropouts = ",".join([str(p[e]) for e in p if e[:2] == "dp"]) + if self.params.wdp: + self.params.weight_decay_penalty = p['weight_decay_penalty'] + if self.params.wdt: + self.params.weight_decay_penalty_type = p['weight_decay_penalty_type'] if self.params.ls: if not self.params.ls_ratio: self.params.layer_sizes = ",".join([str(p[e]) for e in p if e[:2] == "ls"]) @@ -1508,11 +1540,16 @@ def lossfn(p): for i in range(1,len([e for e in p if e[:5] == "ratio"])+1): list_layer_sizes.append(int(list_layer_sizes[-1] * p[f"ratio{i}"])) self.params.layer_sizes = ",".join([str(e) for e in list_layer_sizes]) - hp_params = f'{self.params.learning_rate}_{self.params.layer_sizes}_{self.params.dropouts}' - print(f"learning_rate: {self.params.learning_rate}, layer_sizes: {self.params.layer_sizes}, dropouts: {self.params.dropouts}") + hp_params = f'{self.params.learning_rate}_{self.params.layer_sizes}_{self.params.dropouts}_{self.params.weight_decay_penalty_type}_{self.params.weight_decay_penalty}' + print(f"learning_rate: {self.params.learning_rate}, layer_sizes: {self.params.layer_sizes}, dropouts: {self.params.dropouts}, " + f"weight_decay_penalty: 
{self.params.weight_decay_penalty_type}({self.params.weight_decay_penalty})") elif self.params.model_type == "xgboost": if self.params.xgbg: self.params.xgb_gamma = p["xgbg"] + if self.params.xgba: + self.params.xgb_alpha = p["xgba"] + if self.params.xgbb: + self.params.xgb_lambda = p["xgbb"] if self.params.xgbl: self.params.xgb_learning_rate = p["xgbl"] if self.params.xgbd: @@ -1525,8 +1562,10 @@ def lossfn(p): self.params.xgb_n_estimators = p["xgbn"] if self.params.xgbw: self.params.xgb_min_child_weight = p["xgbw"] - hp_params = f'{self.params.xgb_gamma}_{self.params.xgb_learning_rate}_{self.params.xgb_max_depth}_{self.params.xgb_colsample_bytree}_{self.params.xgb_subsample}_{self.params.xgb_n_estimators}_{self.params.xgb_min_child_weight}' + hp_params = f'{self.params.xgb_gamma}_{self.params.xgb_alpha}_{self.params.xgb_lambda}_{self.params.xgb_learning_rate}_{self.params.xgb_max_depth}_{self.params.xgb_colsample_bytree}_{self.params.xgb_subsample}_{self.params.xgb_n_estimators}_{self.params.xgb_min_child_weight}' print(f"xgb_gamma: {self.params.xgb_gamma}, " + f"xgb_alpha: {self.params.xgb_alpha}, " + f"xgb_lambda: {self.params.xgb_lambda}, " f"xgb_learning_rate: {self.params.xgb_learning_rate}, " f"xgb_max_depth: {self.params.xgb_max_depth}, " f"xgb_colsample_bytree: {self.params.xgb_colsample_bytree}, " @@ -1709,7 +1748,10 @@ def parse_params(param_list): 'slurm_time_limit'} | excluded_keys if params.search_type == 'hyperopt': # keep more parameters - keep_params = keep_params | {'lr', 'learning_rate','ls', 'layer_sizes','ls_ratio','dp', 'dropouts','rfe', 'rf_estimators','rfd', 'rf_max_depth','rff', 'rf_max_features','xgbg', 'xgb_gamma','xgbl', 'xgb_learning_rate', 'xgbd', 'xgb_max_depth', 'xgbc', 'xgb_colsample_bytree', 'xgbs', 'xgb_subsample', 'xgbn', 'xgb_n_estimators', 'xgbw', 'xgb_min_child_weight', 'hp_checkpoint_load', 'hp_checkpoint_save'} + keep_params = keep_params | {'lr', 'learning_rate','ls', 'layer_sizes','ls_ratio','dp', 'dropouts', 'wdp', 'weight_decay_penalty', 'wdt', 'weight_decay_penalty_type', + 'rfe', 'rf_estimators','rfd', 'rf_max_depth','rff', 'rf_max_features', + 'xgbg', 'xgb_gamma', 'xgba', 'xgb_alpha', 'xgbb', 'xgb_lambda', 'xgbl', 'xgb_learning_rate', 'xgbd', 'xgb_max_depth', 'xgbc', 'xgb_colsample_bytree', + 'xgbs', 'xgb_subsample', 'xgbn', 'xgb_n_estimators', 'xgbw', 'xgb_min_child_weight', 'hp_checkpoint_load', 'hp_checkpoint_save'} params.__dict__ = parse.prune_defaults(params, keep_params=keep_params) From 7b5e99d7520653f4e73c3cdcc783cc8d7c22d06a Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Fri, 31 May 2024 16:57:42 -0700 Subject: [PATCH 18/28] Added sparsity-related parameters weight_decay_penalty, weight_decay_penalty_type, xgb_alpha and xgb_lambda to set of model parameters displayed by the various compare_models functions. --- atomsci/ddm/pipeline/compare_models.py | 46 ++++++++++++++++++++++---- atomsci/ddm/pipeline/model_wrapper.py | 2 ++ 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/atomsci/ddm/pipeline/compare_models.py b/atomsci/ddm/pipeline/compare_models.py index 2b9a445d..ce734c1c 100644 --- a/atomsci/ddm/pipeline/compare_models.py +++ b/atomsci/ddm/pipeline/compare_models.py @@ -1,5 +1,5 @@ -"""Functions for comparing and visualizing model performance. Most of these functions rely on ATOM's model tracker and -datastore services, which are not part of the standard AMPL installation, but a few functions will work on collections of +"""Functions for comparing and visualizing model performance. 
Some of these functions rely on ATOM's model tracker and +datastore services, which are not part of the standard AMPL installation, but several functions will work on collections of models saved as local files. """ @@ -163,6 +163,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg model_type_list = [] max_epochs_list = [] learning_rate_list = [] + weight_decay_penalty_list = [] + weight_decay_penalty_type_list = [] dropouts_list = [] layer_sizes_list = [] featurizer_list = [] @@ -172,6 +174,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg rf_max_depth_list = [] xgb_learning_rate_list = [] xgb_gamma_list = [] + xgb_alpha_list = [] + xgb_lambda_list = [] xgb_max_depth_list = [] xgb_colsample_bytree_list = [] xgb_subsample_list = [] @@ -218,6 +222,9 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg max_epochs_list.append(nn_params['max_epochs']) best_epoch_list.append(nn_params['best_epoch']) learning_rate_list.append(nn_params['learning_rate']) + weight_decay_penalty_list.append(nn_params['weight_decay_penalty']) + weight_decay_penalty_type_list.append(nn_params['weight_decay_penalty_type']) + learning_rate_list.append(nn_params['learning_rate']) layer_sizes_list.append(','.join(['%d' % s for s in nn_params['layer_sizes']])) dropouts_list.append(','.join(['%.2f' % d for d in nn_params['dropouts']])) rf_estimators_list.append(nan) @@ -225,6 +232,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg rf_max_depth_list.append(nan) xgb_learning_rate_list.append(nan) xgb_gamma_list.append(nan) + xgb_alpha_list.append(nan) + xgb_lambda_list.append(nan) xgb_max_depth_list.append(nan) xgb_colsample_bytree_list.append(nan) xgb_subsample_list.append(nan) @@ -239,10 +248,14 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg max_epochs_list.append(nan) best_epoch_list.append(nan) learning_rate_list.append(nan) + weight_decay_penalty_list.append(nan) + weight_decay_penalty_type_list.append(nan) layer_sizes_list.append(nan) dropouts_list.append(nan) xgb_learning_rate_list.append(nan) xgb_gamma_list.append(nan) + xgb_alpha_list.append(nan) + xgb_lambda_list.append(nan) xgb_max_depth_list.append(nan) xgb_colsample_bytree_list.append(nan) xgb_subsample_list.append(nan) @@ -256,10 +269,14 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg max_epochs_list.append(nan) best_epoch_list.append(nan) learning_rate_list.append(nan) + weight_decay_penalty_list.append(nan) + weight_decay_penalty_type_list.append(nan) layer_sizes_list.append(nan) dropouts_list.append(nan) xgb_learning_rate_list.append(xgb_params["xgb_learning_rate"]) xgb_gamma_list.append(xgb_params["xgb_gamma"]) + xgb_alpha_list.append(xgb_params.get("xgb_alpha", 0.0)) + xgb_lambda_list.append(xgb_params.get("xgb_lambda", 1.0)) xgb_max_depth_list.append(xgb_params["xgb_max_depth"]) xgb_colsample_bytree_list.append(xgb_params["xgb_colsample_bytree"]) xgb_subsample_list.append(xgb_params["xgb_subsample"]) @@ -277,6 +294,8 @@ def get_training_perf_table(dataset_key, bucket, collection_name, pred_type='reg max_epochs=max_epochs_list, best_epoch=best_epoch_list, learning_rate=learning_rate_list, + weight_decay_penalty_type=weight_decay_penalty_type_list, + weight_decay_penalty=weight_decay_penalty_list, layer_sizes=layer_sizes_list, dropouts=dropouts_list, rf_estimators=rf_estimators_list, @@ -284,6 +303,8 @@ def get_training_perf_table(dataset_key, bucket, 
collection_name, pred_type='reg rf_max_depth=rf_max_depth_list, xgb_learning_rate = xgb_learning_rate_list, xgb_gamma = xgb_gamma_list, + xgb_alpha = xgb_alpha_list, + xgb_lambda = xgb_lambda_list, xgb_max_depth = xgb_max_depth_list, xgb_colsample_bytree = xgb_colsample_bytree_list, xgb_subsample = xgb_subsample_list, @@ -307,15 +328,15 @@ def extract_model_and_feature_parameters(metadata_dict, keep_required=True): Returns: dictionary containing featurizer and model parameters. Most contain the following - keys. ['max_epochs', 'best_epoch', 'learning_rate', 'layer_sizes', 'dropouts', - 'rf_estimators', 'rf_max_features', 'rf_max_depth', 'xgb_gamma', 'xgb_learning_rate', + keys. ['max_epochs', 'best_epoch', 'learning_rate', 'weight_decay_penalty_type', 'weight_decay_penalty', 'layer_sizes', 'dropouts', + 'rf_estimators', 'rf_max_features', 'rf_max_depth', 'xgb_gamma', 'xgb_alpha', 'xgb_lambda', 'xgb_learning_rate', 'xgb_max_depth', 'xgb_colsample_bytree', 'xgb_subsample', 'xgb_n_estimators', 'xgb_min_child_weight', 'featurizer_parameters_dict', 'model_parameters_dict'] """ model_params = metadata_dict['model_parameters'] model_type = model_params['model_type'] - required = ['max_epochs', 'best_epoch', 'learning_rate', 'layer_sizes', 'dropouts', - 'rf_estimators', 'rf_max_features', 'rf_max_depth', 'xgb_gamma', 'xgb_learning_rate', + required = ['max_epochs', 'best_epoch', 'learning_rate', 'layer_sizes', 'dropouts', 'weight_decay_penalty_type', 'weight_decay_penalty', + 'rf_estimators', 'rf_max_features', 'rf_max_depth', 'xgb_gamma', 'xgb_alpha', 'xgb_lambda', 'xgb_learning_rate', 'xgb_max_depth', 'xgb_colsample_bytree', 'xgb_subsample', 'xgb_n_estimators', 'xgb_min_child_weight' ] @@ -328,6 +349,9 @@ def extract_model_and_feature_parameters(metadata_dict, keep_required=True): model_info['max_epochs'] = nn_params['max_epochs'] model_info['best_epoch'] = nn_params['best_epoch'] model_info['learning_rate'] = nn_params['learning_rate'] + model_info['weight_decay_penalty_type'] = nn_params['weight_decay_penalty_type'] + model_info['weight_decay_penalty'] = nn_params['weight_decay_penalty'] + model_info['learning_rate'] = nn_params['learning_rate'] model_info['layer_sizes'] = ','.join(['%d' % s for s in nn_params['layer_sizes']]) model_info['dropouts'] = ','.join(['%.2f' % d for d in nn_params['dropouts']]) elif model_type == 'RF': @@ -338,6 +362,8 @@ def extract_model_and_feature_parameters(metadata_dict, keep_required=True): elif model_type == 'xgboost': xgb_params = metadata_dict['xgb_specific'] model_info['xgb_gamma'] = xgb_params['xgb_gamma'] + model_info['xgb_alpha'] = xgb_params.get('xgb_alpha', 0.0) + model_info['xgb_lambda'] = xgb_params.get('xgb_lambda', 1.0) model_info['xgb_learning_rate'] = xgb_params['xgb_learning_rate'] model_info['xgb_max_depth'] = xgb_params['xgb_max_depth'] model_info['xgb_colsample_bytree'] = xgb_params['xgb_colsample_bytree'] @@ -979,6 +1005,8 @@ def get_filesystem_perf_results(result_dir, pred_type='classification', expand=T print(f"Can't access model {dirpath}") print("Found data for %d models under %s" % (len(model_list), result_dir)) + if len(model_list) == 0: + return pd.DataFrame() # build dictonary of tarball names tar_dict = {os.path.basename(tf):tf for tf in tar_list} @@ -1511,6 +1539,8 @@ def get_summary_metadata_table(uuids, collections=None): 'Layer Sizes': nn_params['layer_sizes'], 'Optimizer': nn_params['optimizer_type'], 'Learning Rate': nn_params['learning_rate'], + 'Weight decay penalty type': nn_params['weight_decay_penalty_type'], + 'Weight 
decay penalty': nn_params['weight_decay_penalty'], 'Dropouts': nn_params['dropouts'], 'Best Epoch (Max)': '%i (%i)' % (nn_params['best_epoch'],nn_params['max_epochs']), 'Collection': collection_name, @@ -1551,6 +1581,8 @@ def get_summary_metadata_table(uuids, collections=None): 'Model Type (Featurizer)': '%s (%s)' % (mdl_params['model_type'],featurizer), 'XGB learning rate': xgb_params['xgb_learning_rate'], 'Gamma': xgb_params['xgb_gamma'], + 'Alpha': xgb_params.get('xgb_alpha', 0.0), + 'Lambda': xgb_params.get('xgb_lambda', 1.0), 'XGB max depth': xgb_params['xgb_max_depth'], 'Column sample fraction': xgb_params['xgb_colsample_bytree'], 'Row subsample fraction': xgb_params['xgb_subsample'], @@ -2091,7 +2123,7 @@ def get_multitask_perf_from_tracker(collection_name, response_cols=None, expand_ 'weight_decay_penalty', 'weight_decay_penalty_type', 'weight_init_stddevs', 'splitter', 'split_uuid', 'split_test_frac', 'split_valid_frac', 'smiles_col', 'id_col', 'feature_transform_type', 'response_cols', 'response_transform_type', 'num_model_tasks', - 'rf_estimators', 'rf_max_depth', 'rf_max_features', 'xgb_gamma', 'xgb_learning_rate', + 'rf_estimators', 'rf_max_depth', 'rf_max_features', 'xgb_gamma', 'xgb_alpha', 'xgb_lambda', 'xgb_learning_rate', 'xgb_max_depth', 'xgb_colsample_bytree', 'xgb_subsample', 'xgb_n_estimators', 'xgb_min_child_weight', ] keepcols.extend(alldat.columns[alldat.columns.str.contains('best')]) diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py index 40956132..62746724 100644 --- a/atomsci/ddm/pipeline/model_wrapper.py +++ b/atomsci/ddm/pipeline/model_wrapper.py @@ -2279,6 +2279,8 @@ def get_model_specific_metadata(self): "xgb_learning_rate" : self.params.xgb_learning_rate, "xgb_n_estimators" : self.params.xgb_n_estimators, "xgb_gamma" : self.params.xgb_gamma, + "xgb_alpha" : self.params.xgb_alpha, + "xgb_lambda" : self.params.xgb_lambda, "xgb_min_child_weight" : self.params.xgb_min_child_weight, "xgb_subsample" : self.params.xgb_subsample, "xgb_colsample_bytree" :self.params.xgb_colsample_bytree From 812e13f74c14acdcc017d9a5fd6ac80bcb5cb162 Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Wed, 12 Jun 2024 11:28:34 -0700 Subject: [PATCH 19/28] Fixed MTSS bug where population was not sorted by score after calling step(); changed order of operations so that grading happens at end of step() method rather than at beginning. Added serial_grade_population method for debugging. Simplified code to address some performance issues. --- atomsci/ddm/pipeline/GeneticAlgorithm.py | 34 +++++++++++++++--------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/atomsci/ddm/pipeline/GeneticAlgorithm.py b/atomsci/ddm/pipeline/GeneticAlgorithm.py index 531b85ba..195ae83d 100644 --- a/atomsci/ddm/pipeline/GeneticAlgorithm.py +++ b/atomsci/ddm/pipeline/GeneticAlgorithm.py @@ -49,12 +49,24 @@ def __init__(self, self.fitness_func = fitness_func self.crossover_func = crossover_func self.mutate_func = mutate_func + #self.serial_grade_population() self.parallel_grade_population() - def parallel_grade_population(self): - """ Grade the population and save the scores - Updates the order of self.pop and self.pop_scores + def serial_grade_population(self): + """ Non-multithreaded version of parallel_grade_population, used for debugging fitness functions only. 
+ """ + fitnesses = [] + for chrom in self.pop: + fitnesses.append(self.fitness_func(chrom)) + pairs = sorted(zip(fitnesses, self.pop), reverse=True) + self.pop = [chrome for fitness, chrome in pairs] + self.pop_scores = [fitness for fitness, chrome in pairs] + + + def parallel_grade_population(self): + """ Scores the chromosomes in the current population and sorts them in decreasing score order. + Saves the sorted scores in self.pop_scores. Parameters ---------- @@ -68,10 +80,7 @@ def parallel_grade_population(self): fitnesses = pool.map(self.fitness_func, self.pop) pool.close() pool.join() - pairs = list(zip(fitnesses, self.pop)) - - pairs.sort(key=lambda x: x[0], reverse=True) - + pairs = sorted(zip(fitnesses, self.pop), reverse=True) self.pop = [chrome for fitness, chrome in pairs] self.pop_scores = [fitness for fitness, chrome in pairs] @@ -90,10 +99,7 @@ def select_parents(self) -> List[List[Any]]: parents: List[List[Any]] A list of chromosomes that will be parents for the next generation """ - self.parallel_grade_population() - - parents = [chrome for chrome in self.pop[:self.num_parents]] - return parents + return self.pop[:self.num_parents] def iterate(self, num_generations: int): """ Iterates the genetic algorithm num_generations @@ -130,12 +136,13 @@ def step(self, print_timings: bool = False): start = timeit.default_timer() i = timeit.default_timer() + # select parents using rank selection parents = self.select_parents() if print_timings: print('\tfind parents %0.2f min'%((timeit.default_timer()-i)/60)) - # select parents using rank selection i = timeit.default_timer() + # Generate new population by crossing parent chromosomes new_pop = self.crossover_func(parents, self.num_pop) if print_timings: print('\tcrossover %0.2f min'%((timeit.default_timer()-i)/60)) @@ -143,6 +150,9 @@ def step(self, print_timings: bool = False): # mutate population i = timeit.default_timer() self.pop = self.mutate_func(new_pop) + # Compute scores for new chromosomes and sort population by score + #self.serial_grade_population() + self.parallel_grade_population() if print_timings: print('\tmutate %0.2f min'%((timeit.default_timer()-i)/60)) print('total %0.2f min'%((timeit.default_timer()-start)/60)) From 120c6a12e3619a95d2e474ba45eec4d5fb55f941 Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Wed, 12 Jun 2024 17:41:42 -0700 Subject: [PATCH 20/28] Changed to use single-threaded function to grade chromosomes, after seeing that it runs much faster than the multithreaded version. Added documentation. --- atomsci/ddm/pipeline/GeneticAlgorithm.py | 28 ++++++++++++++++++------ 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/atomsci/ddm/pipeline/GeneticAlgorithm.py b/atomsci/ddm/pipeline/GeneticAlgorithm.py index 195ae83d..98611d77 100644 --- a/atomsci/ddm/pipeline/GeneticAlgorithm.py +++ b/atomsci/ddm/pipeline/GeneticAlgorithm.py @@ -49,12 +49,22 @@ def __init__(self, self.fitness_func = fitness_func self.crossover_func = crossover_func self.mutate_func = mutate_func - #self.serial_grade_population() - self.parallel_grade_population() + self.serial_grade_population() + #self.parallel_grade_population() def serial_grade_population(self): - """ Non-multithreaded version of parallel_grade_population, used for debugging fitness functions only. + """ Scores the chromosomes in the current population and sorts them in decreasing score order. + Saves the sorted scores in self.pop_scores. Not multithreaded; surprisingly, this runs faster + than the multithreaded function parallel_grade_scores. 
+ + Parameters + ---------- + None + + Returns + ------- + None. As a side effect, sorts the chromosomes in self.pop and updates the scores in self.pop_scores. """ fitnesses = [] for chrom in self.pop: @@ -66,7 +76,11 @@ def serial_grade_population(self): def parallel_grade_population(self): """ Scores the chromosomes in the current population and sorts them in decreasing score order. - Saves the sorted scores in self.pop_scores. + Saves the sorted scores in self.pop_scores. + + Although this does the same thing in multiple threads as the single-threaded function + serial_grade_population, it seems to run much slower, at least for multitask scaffold splits + with 100 chromosomes. Parameters ---------- @@ -74,7 +88,7 @@ def parallel_grade_population(self): Returns ------- - Nothing + None """ pool = multiprocessing.Pool(processes=N_PROCS) fitnesses = pool.map(self.fitness_func, self.pop) @@ -151,8 +165,8 @@ def step(self, print_timings: bool = False): i = timeit.default_timer() self.pop = self.mutate_func(new_pop) # Compute scores for new chromosomes and sort population by score - #self.serial_grade_population() - self.parallel_grade_population() + self.serial_grade_population() + #self.parallel_grade_population() if print_timings: print('\tmutate %0.2f min'%((timeit.default_timer()-i)/60)) print('total %0.2f min'%((timeit.default_timer()-start)/60)) From 074117873b23f99ba72eccec05f0b4030fff205e Mon Sep 17 00:00:00 2001 From: Jessica Mauvais Date: Fri, 19 Jul 2024 12:11:42 -0700 Subject: [PATCH 21/28] use the copy from sparsity branch --- .../ddm/pipeline/MultitaskScaffoldSplit.py | 318 +++++++++++------- 1 file changed, 203 insertions(+), 115 deletions(-) diff --git a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py index 67e2a54f..9a4fbb18 100644 --- a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py +++ b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py @@ -1,4 +1,5 @@ import pdb +import logging import argparse import random import timeit @@ -24,6 +25,9 @@ from atomsci.ddm.pipeline import dist_metrics from atomsci.ddm.pipeline import GeneticAlgorithm as ga +logging.basicConfig(format='%(asctime)-15s %(message)s') +logger = logging.getLogger('ATOM') + def _generate_scaffold_hists(scaffold_sets: List[np.ndarray], w: np.array) -> np.array: """Counts the number of labelled samples per task per scaffold @@ -96,8 +100,8 @@ def smush_small_scaffolds(scaffolds: List[Set[int]], else: new_scaffolds.append(scaffold) - print('new scaffold lengths') - print([len(s) for s in new_scaffolds]) + logger.debug('new scaffold lengths') + logger.debug([len(s) for s in new_scaffolds]) new_scaffolds = [np.array(list(s)) for s in new_scaffolds] return new_scaffolds @@ -133,7 +137,7 @@ def calc_ecfp(smiles: List[str], fprints = [y for x in ecfps for y in x] #Flatten results else: mols = [Chem.MolFromSmiles(s) for s in smiles] - fprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in mols] + fprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 3, 1024) for mol in mols] return fprints @@ -144,9 +148,9 @@ def dist_smiles_from_ecfp(ecfp1: List[rdkit.DataStructs.cDataStructs.ExplicitBit Parameters ---------- ecfp1: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect], - A list of ECFP finger prints + A list of ECFP fingerprints ecfp2: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect] - A list of ECFP finger prints + A list of ECFP fingerprints Returns ------- @@ -159,57 +163,6 @@ def dist_smiles_from_ecfp(ecfp1: 
List[rdkit.DataStructs.cDataStructs.ExplicitBit return cd.calc_summary(dist_metrics.tanimoto(ecfp1, ecfp2), calc_type='nearest', num_nearest=1, within_dset=False) -def _generate_scaffold_dist_matrix(scaffold_lists: List[np.ndarray], - ecfp_features: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect]) -> np.ndarray: - """Returns a nearest neighbors distance matrix between each scaffold. - - The distance between two scaffolds is defined as the - the distance between the two closest compounds between the two - scaffolds. - - TODO: Ask Stewart: Why did he change to using the median instead of the min? - - Parameters - ---------- - scaffold_lists: List[np.ndarray] - List of scaffolds. A scaffold is a set of indices into ecfp_features - ecfp_features: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect] - List of ecfp features, one for each compound in the dataset - - Returns - ------- - dist_mat: np.ndarray - Distance matrix, symmetric. - """ - print('start generating big dist mat') - start = timeit.default_timer() - # First compute the full matrix of distances between all pairs of ECFPs - dmat = dist_metrics.tanimoto(ecfp_features) - - mat_shape = (len(scaffold_lists), len(scaffold_lists)) - scaff_dist_mat = np.zeros(mat_shape) - for i, scaff1 in tqdm(enumerate(scaffold_lists)): - scaff1_rows = scaff1.reshape((-1,1)) - for j, scaff2 in enumerate(scaffold_lists[:i]): - if i == j: - continue - - dists = dmat[scaff1_rows, scaff2].flatten() - #min_dist = np.median(dists) - # ksm: Change back to min and see what happens - min_dist = np.min(dists) - - if min_dist==0: - print("two scaffolds match exactly?!?", i, j) - print(len(set(scaff2).intersection(set(scaff1)))) - - scaff_dist_mat[i,j] = min_dist - scaff_dist_mat[j,i] = min_dist - - print("finished scaff dist mat: %0.2f min"%((timeit.default_timer()-start)/60)) - return scaff_dist_mat - - class MultitaskScaffoldSplitter(Splitter): """MultitaskScaffoldSplitter Splitter class. @@ -261,6 +214,65 @@ def generate_scaffolds(self, return scaffold_sets + def _generate_scaffold_dist_matrix(self): + """Computes matrices used by fitness functions that score splits according + to the dissimilarity between training and test compound structures. One is + a symmetric matrix of nearest neighbor Tanimoto distances between pairs of + scaffolds (i.e., between the most similar compounds in the two scaffolds). + This matrix is used to compute the `scaffold_diff_fitness` function. + + The other is a nonsymmetric matrix of boolean vectors `has_nn[i,j]`, of length + equal to the number of compounds in scaffold `i`, indicating whether each + compound has a neighbor in scaffold `j` nearer than some threshold Tanimoto + distance `dist_thresh`. dist_thresh defaults to 0.3, but could be parameterized. + The has_nn matrix is used to compute the `far_frac_fitness` function, which is + a more robust measurement of the train/test set dissimilarity. 
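+        Returns nothing; as a side effect, sets self.scaff_scaff_distmat and self.has_near_neighbor_mat.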
+ """ + scaffold_lists = self.ss + ecfp_features = self.ecfp_features + + # First compute the full matrix of distances between all pairs of ECFPs + dmat = dist_metrics.tanimoto(ecfp_features) + + mat_shape = (len(scaffold_lists), len(scaffold_lists)) + scaff_dist_mat = np.zeros(mat_shape) + has_near_neighbor_mat = np.empty(mat_shape, dtype=object) + + for i, scaff1 in enumerate(scaffold_lists): + scaff1_rows = scaff1.reshape((-1,1)) + for j, scaff2 in enumerate(scaffold_lists[:i]): + if i == j: + continue + + scaff1_dists = dmat[scaff1_rows, scaff2] + min_dist = np.min(scaff1_dists.flatten()) + + if min_dist==0: + logger.info(f"Scaffolds {i} and {j} have at least one ECFP in common") + for k in scaff1: + for l in scaff2: + if dmat[k,l] == 0: + logger.debug(f"\tcompound {k} in scaffold {i}, compound {l} in scaffold {j}") + logger.debug(f"\tSMILES {k}: {self.smiles[k]}") + logger.debug(f"\tSMILES {l}: {self.smiles[l]}\n") + + + scaff_dist_mat[i,j] = min_dist + scaff_dist_mat[j,i] = min_dist + + # Identify the compounds in scaff1 with a neighbor in scaff2 closer than dist_thresh + has_near_neighbor_mat[i,j] = np.array(np.min(scaff1_dists, axis=1) < self.dist_thresh) + + # Identify the compounds in scaff2 with a neighbor in scaff1 closer than dist_thresh + scaff2_rows = scaff2.reshape((-1,1)) + scaff2_dists = dmat[scaff2_rows, scaff1] + has_near_neighbor_mat[j,i] = np.array(np.min(scaff2_dists, axis=1) < self.dist_thresh) + + + self.scaff_scaff_distmat = scaff_dist_mat + self.has_near_neighbor_mat = has_near_neighbor_mat + + def expand_scaffolds(self, scaffold_list: List[int]) -> List[int]: """Turns a list of scaffold indices into a list of compound indices @@ -350,51 +362,74 @@ def scaffold_diff_fitness(self, return min_dist - def ratio_fitness_old(self, split_chromosome: List[str]) -> float: - """Calculates a fitness score based on how accurately divided training/test/validation. + def far_frac_fitness(self, + split_chromosome: List[str], + train_part: str, + test_part: str) -> float: + """Grades a split according to the fraction of valid/test compounds with + nearest training set compounds further than some threshold. + + Grades the quality of the split based on which scaffolds were alloted to + which partitions. The score is measured as the fraction of compounds in + `test_part` with nearest neighbors in `train_part` at Tanimoto distance + `self.dist_thresh` or greater. + Parameters ---------- - List[str]: split_chromosome - A list of strings, index i contains a string, 'train', 'valid', 'test' - which determines the partition that scaffold belongs to + split_chromosome: List[str] + A split represented as a list of partition names. Index i in the + chromosome contains the partition for scaffold i. + train_part: str + The name of the partition to be treated as the training subset + test_part: str + The name of the partition to be treated as the test subset + Returns ------- - List[str] - A list of strings, index i contains a string, 'train', 'valid', 'test' - which determines the partition that scaffold belongs to + score: float + Floating point value beteween 0-1. 1 is the best score and 0 is the worst """ - start = timeit.default_timer() - total_counts = np.sum(self.dataset.w, axis=0) - subset_counts = [np.sum(self.dataset.w[subset], axis=0) for subset in \ - self.split_chromosome_to_compound_split(split_chromosome)] + # TODO: Eventually, replace strings in chromosome with integers indicating the partition + # for each scaffold. - # subset order goes train, valid, test. 
just consider train for the moment - subset_ratios = [subset_count/total_counts for subset_count in subset_counts] - #print("mean: %0.2f, median: %0.2f, std: %0.2f"%(np.mean(subset_ratios[0]), np.median(subset_ratios[0]), np.std(subset_ratios[0]))) - - # fitness of split is the size of the smallest training dataset - # ratio_fit = min(subset_ratios[0]) # this resulted in a split with 0 test. shoulda seen that coming - num_tasks = self.dataset.w.shape[1] - # imagine the perfect split is a point in 3*num_tasks space. - target_split = np.concatenate([[self.frac_train]*num_tasks, - [self.frac_valid]*num_tasks]) - # this is the current split also on 3*num_tasks space - current_split = np.concatenate(subset_ratios[:2]) - # if any partition is 0, then this split fails - if min(current_split) == 0: + train_scaffolds = [i for i, part in enumerate(split_chromosome) if part==train_part] + test_scaffolds = [i for i, part in enumerate(split_chromosome) if part==test_part] + + # if a partition is completely empty, return 0 + if len(train_scaffolds) == 0 or len(test_scaffolds) == 0: return 0 - # worst possible distance to normalize this between 0 and 1 - worst_distance = np.linalg.norm(np.ones(num_tasks*2)) - ratio_fit = 1 - np.linalg.norm(target_split-current_split)/worst_distance - #print("\tratio_fitness: %0.2f min"%((timeit.default_timer()-start)/60)) - return ratio_fit + # Compute the "far fraction": For each test scaffold S, OR together the boolean vectors + # from has_near_neighbor_mat for the training scaffolds indicating whether each compound in + # S has a neighbor in the train scaffold closer than Tanimoto distance dist_thresh. Sum + # the results over compounds and test scaffolds and divide by the test set size to get the + # "near fraction"; the far fraction is 1 minus the near fraction. + + near_count = 0 + total_count = 0 + for test_ind in test_scaffolds: + has_nn = None + for train_ind in train_scaffolds: + assert(not (train_ind == test_ind)) + if has_nn is None: + has_nn = self.has_near_neighbor_mat[test_ind, train_ind] + else: + has_nn |= self.has_near_neighbor_mat[test_ind, train_ind] + near_count += sum(has_nn) + total_count += len(has_nn) + + far_frac = 1 - near_count/total_count + #print(f"far_frac_fitness: near_count {near_count}, total_count {total_count}, far_frac {far_frac}") + return far_frac + def ratio_fitness(self, split_chromosome: List[str]) -> float: - """Calculates a fitness score based on how accurately divided training/test/validation. + """Calculates a fitness score based on how well the subset proportions for each task, taking + only labeled compounds into account, match the proportions requested by the user. - Treats the min,median,max of each of the three partitions as a 9D point and uses - euclidean distance to measure the distance to that point + The score is determined by combining the min, median and max over tasks of the subset fractions + into a 9-dimensional vector and computing its Euclidean distance from an ideal vector having + all the fractions as requested. Parameters ---------- @@ -407,15 +442,15 @@ def ratio_fitness(self, split_chromosome: List[str]) -> float: float A float between 0 and 1. 1 best 0 is worst """ - start = timeit.default_timer() + #start = timeit.default_timer() # total_counts is the number of labels per task total_counts = np.sum(self.dataset.w, axis=0) - # subset_counts is number of labels per task per subset + # subset_counts is number of labels per task per subset. 
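+        # Illustrative example: for a 2-task dataset with 1000 and 400 labeled compounds per task and
+        # a 70/15/15 split request, an ideal split yields subset_counts of roughly
+        # [[700, 280], [150, 60], [150, 60]], so each entry of subset_ratios below is close to the
+        # requested fraction for every task.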
subset_counts = [np.sum(self.dataset.w[subset], axis=0) for subset in \ self.split_chromosome_to_compound_split(split_chromosome)] - # subset order goes train, valid, test. just consider train for the moment + # subset order goes train, valid, test subset_ratios = [subset_count/total_counts for subset_count in subset_counts] # imagine the perfect split is a point in 9D space. For each subset we measure @@ -433,6 +468,7 @@ def ratio_fitness(self, split_chromosome: List[str]) -> float: return 0 # worst possible distance to normalize this between 0 and 1 worst_distance = np.linalg.norm(np.ones(len(target_split))) + worst_distance = np.sqrt(len(target_split)) ratio_fit = 1 - np.linalg.norm(target_split-current_split)/worst_distance #print("\tratio_fitness: %0.2f min"%((timeit.default_timer()-start)/60)) @@ -456,7 +492,7 @@ def response_distr_fitness(self, split_chromosome: List[str]) -> float: the train subset response value distributions, averaged over tasks. One means the distributions perfectly match. """ - start = timeit.default_timer() + #start = timeit.default_timer() dist_sum = 0.0 ntasks = self.dataset.y.shape[1] train_ind, valid_ind, test_ind = self.split_chromosome_to_compound_split(split_chromosome) @@ -495,18 +531,56 @@ def grade(self, split_chromosome: List[str]) -> float: float A float between 0 and 1. 1 best 0 is worst """ + + """ fitness = 0.0 # Only call the functions for each fitness term if their weight is nonzero if self.diff_fitness_weight_tvt != 0.0: - fitness += self.diff_fitness_weight_tvt*self.scaffold_diff_fitness(split_chromosome, 'train', 'test') + #score = self.scaffold_diff_fitness(split_chromosome, 'train', 'test') + score = self.far_frac_fitness(split_chromosome, 'train', 'test') + fitness += self.diff_fitness_weight_tvt*score if self.diff_fitness_weight_tvv != 0.0: - fitness += self.diff_fitness_weight_tvv*self.scaffold_diff_fitness(split_chromosome, 'train', 'valid') + #score = self.scaffold_diff_fitness(split_chromosome, 'train', 'valid') + score = self.far_frac_fitness(split_chromosome, 'train', 'valid') + fitness += self.diff_fitness_weight_tvv*score if self.ratio_fitness_weight != 0.0: - fitness += self.ratio_fitness_weight*self.ratio_fitness(split_chromosome) + score = self.ratio_fitness(split_chromosome) + fitness += self.ratio_fitness_weight*score if self.response_distr_fitness_weight != 0.0: - fitness += self.response_distr_fitness_weight*self.response_distr_fitness(split_chromosome) + score = self.response_distr_fitness(split_chromosome) + fitness += self.response_distr_fitness_weight*score + # Normalize the score to the range [0,1] + fitness /= (self.diff_fitness_weight_tvt + self.diff_fitness_weight_tvv + self.ratio_fitness_weight + + self.response_distr_fitness_weight) return fitness + """ + fitness_scores = self.get_fitness_scores(split_chromosome) + return fitness_scores['total_fitness'] + + def get_fitness_scores(self, split_chromosome): + fitness_scores = {} + total_fitness = 0.0 + # Only call the functions for each fitness term if their weight is nonzero + if self.diff_fitness_weight_tvt != 0.0: + #fitness_scores['test_scaf_dist'] = self.scaffold_diff_fitness(split_chromosome, 'train', 'test') + fitness_scores['test_scaf_dist'] = self.far_frac_fitness(split_chromosome, 'train', 'test') + total_fitness += self.diff_fitness_weight_tvt * fitness_scores['test_scaf_dist'] + if self.diff_fitness_weight_tvv != 0.0: + #fitness_scores['valid_scaf_dist'] = self.scaffold_diff_fitness(split_chromosome, 'train', 'valid') + fitness_scores['valid_scaf_dist'] 
= self.far_frac_fitness(split_chromosome, 'train', 'valid') + total_fitness += self.diff_fitness_weight_tvv * fitness_scores['valid_scaf_dist'] + if self.ratio_fitness_weight != 0.0: + fitness_scores['ratio'] = self.ratio_fitness(split_chromosome) + total_fitness += self.ratio_fitness_weight * fitness_scores['ratio'] + if self.response_distr_fitness_weight != 0.0: + fitness_scores['response_distr'] = self.response_distr_fitness(split_chromosome) + total_fitness += self.response_distr_fitness_weight * fitness_scores['response_distr'] + # Normalize the score to the range [0,1] + total_fitness /= (self.diff_fitness_weight_tvt + self.diff_fitness_weight_tvv + self.ratio_fitness_weight + + self.response_distr_fitness_weight) + fitness_scores['total_fitness'] = total_fitness + return fitness_scores def init_scaffolds(self, dataset: Dataset) -> None: @@ -528,10 +602,10 @@ def init_scaffolds(self, # list of lists. one list per scaffold big_ss = self.generate_scaffolds(dataset) - # using the same stragetgy as scaffold split, combine the scaffolds + # using the same strategy as scaffold split, combine the scaffolds # together until you have roughly 100 scaffold sets self.ss = smush_small_scaffolds(big_ss, num_super_scaffolds=self.num_super_scaffolds) - print(f'num_super_scaffolds: {len(self.ss)}, {self.num_super_scaffolds}') + logger.info(f"Requested {self.num_super_scaffolds} super scaffolds, produced {len(self.ss)} from {len(big_ss)} original scaffolds") # rows is the number of scaffolds # columns is number of tasks @@ -550,8 +624,10 @@ def split(self, num_super_scaffolds: int = 20, num_pop: int = 100, num_generations: int=30, + dist_thresh: float = 0.3, print_timings: bool = False, - log_every_n: int = 1000) -> Tuple: + early_stopping_generations = 25, + log_every_n: int = 10) -> Tuple: """Creates a split for the given datset This split splits the dataset into a list of super scaffolds then @@ -609,6 +685,7 @@ def split(self, self.num_super_scaffolds = num_super_scaffolds self.num_pop = num_pop self.num_generations = num_generations + self.dist_thresh = dist_thresh self.frac_train = frac_train self.frac_valid = frac_valid @@ -618,38 +695,47 @@ def split(self, self.init_scaffolds(self.dataset) # ecpf features + self.smiles = dataset.ids self.ecfp_features = calc_ecfp(dataset.ids) # calculate ScaffoldxScaffold distance matrix - start = timeit.default_timer() if (self.diff_fitness_weight_tvv > 0.0) or (self.diff_fitness_weight_tvt > 0.0): - self.scaff_scaff_distmat = _generate_scaffold_dist_matrix(self.ss, self.ecfp_features) - #print('scaffold dist mat %0.2f min'%((timeit.default_timer()-start)/60)) + self._generate_scaffold_dist_matrix() # initial population population = [] for i in range(self.num_pop): - start = timeit.default_timer() + #start = timeit.default_timer() split_chromosome = self._split(frac_train=frac_train, frac_valid=frac_valid, frac_test=frac_test) - #print("per_loop: %0.2f min"%((timeit.default_timer()-start)/60)) population.append(split_chromosome) + self.fitness_terms = {} gene_alg = ga.GeneticAlgorithm(population, self.grade, ga_crossover, ga_mutate) #gene_alg.iterate(num_generations) + best_ever = None + best_ever_fit = -np.inf + best_gen = 0 for i in range(self.num_generations): gene_alg.step(print_timings=print_timings) best_fitness = gene_alg.pop_scores[0] - print("step %d: best_fitness %0.2f"%(i, best_fitness)) - #print("%d: %0.2f"%(i, gene_alg.grade_population()[0][0])) - - best_fit = gene_alg.pop_scores[0] - best = gene_alg.pop[0] - - #print('best ever fitness 
%0.2f'%best_ever_fit) - result = self.split_chromosome_to_compound_split(best) + if best_fitness > best_ever_fit: + best_ever = gene_alg.pop[0] + best_ever_fit = best_fitness + best_gen = i + score_dict = self.get_fitness_scores(best_ever) + for term, score in score_dict.items(): + self.fitness_terms.setdefault(term, []).append(score) + if i % log_every_n == 0: + logger.info(f"generation {i}: Best fitness {best_fitness:.3f}, best ever {best_ever_fit:.3f} at generation {best_gen}") + if (best_fitness <= best_ever_fit) and (i - best_gen >= early_stopping_generations): + logger.info(f"No fitness improvement after {early_stopping_generations} generations") + break + + logger.info(f"Final best fitness score: {best_ever_fit:.3f} at generation {best_gen}") + result = self.split_chromosome_to_compound_split(best_ever) return result def _split(self, @@ -738,7 +824,8 @@ def train_valid_test_split(self, train_dir: Optional[str] = None, valid_dir: Optional[str] = None, test_dir: Optional[str] = None, - log_every_n: int = 1000) -> Tuple[Dataset, Dataset, Dataset]: + dist_thresh: float = 0.3, + log_every_n: int = 10) -> Tuple[Dataset, Dataset, Dataset]: """Creates a split for the given datset This split splits the dataset into a list of super scaffolds then @@ -805,6 +892,7 @@ def train_valid_test_split(self, diff_fitness_weight_tvv=diff_fitness_weight_tvv, ratio_fitness_weight=ratio_fitness_weight, response_distr_fitness_weight=response_distr_fitness_weight, num_super_scaffolds=num_super_scaffolds, num_pop=num_pop, num_generations=num_generations, + dist_thresh=0.3, print_timings=False) if train_dir is None: From 8d1e759830a6765beb4c25a4f31dbc6b2812a99b Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Thu, 20 Jun 2024 17:50:49 -0700 Subject: [PATCH 22/28] Implemented enhancement request from AMPL issue #318: Random forest and XGBoost classification models now support class balancing weights when weight_transform_type parameter is set to 'balancing'. --- atomsci/ddm/pipeline/model_datasets.py | 20 +++++++++++++++++++- atomsci/ddm/pipeline/model_wrapper.py | 18 +++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py index c6b66326..a809ef3b 100644 --- a/atomsci/ddm/pipeline/model_datasets.py +++ b/atomsci/ddm/pipeline/model_datasets.py @@ -87,6 +87,24 @@ def check_task_columns(params, dset_df): raise Exception(f"Requested prediction task columns {missing_tasks} are missing from training dataset") +# **************************************************************************************** +def get_class_freqs(params): + """Given an input dataset for a classification model, returns the value counts + for each class for each task. Uses params.dataset_key to locate the dataset and params.response_cols + to identify the columns containing the class labels. Returns a list of lists, one for each task, + containing the value counts for each class (not counting missing values). + """ + dset_df = pd.read_csv(params.dataset_key) + freq_list = [] + # =-=ksm Strange but true: All classification tasks have to have the same number of classes. This may + # be a "feature" of DeepChem, because of the format of the model prediction arrays. 
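+    # For example (illustrative counts): with a single binary task having 800 inactive (class 0) and
+    # 200 active (class 1) compounds, this returns [[800, 200]]. The XGBoost classifier setup in
+    # model_wrapper.py divides class_freqs[0][0] by class_freqs[0][1] (4.0 here) to get
+    # scale_pos_weight when weight_transform_type is 'balancing'.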
+ nclasses = params.class_number + for col in params.response_cols: + vc = dset_df[col].value_counts() + freqs = [vc.get(cl, 0) for cl in range(nclasses)] + freq_list.append(freqs) + return freq_list + # **************************************************************************************** def set_group_permissions(system, path, data_owner='public', data_owner_group='public'): """Set file group and permissions to standard values for a dataset containing proprietary @@ -408,7 +426,7 @@ def get_featurized_data(self, params=None): self.dataset = NumpyDataset(features, self.vals, ids=ids, w=w) # Checking for minimum number of rows if len(self.dataset) < params.min_compound_number: - self.log.warning("Dataset of length %i is shorter than the required length %i" % (len(self.dataset), params.min_compound_number)) + self.log.info("Dataset of length %i is shorter than the recommended length %i" % (len(self.dataset), params.min_compound_number)) # **************************************************************************************** def get_dataset_tasks(self, dset_df): diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py index 62746724..3048b422 100644 --- a/atomsci/ddm/pipeline/model_wrapper.py +++ b/atomsci/ddm/pipeline/model_wrapper.py @@ -49,6 +49,7 @@ from atomsci.ddm.utils import datastore_functions as dsf from atomsci.ddm.utils import llnl_utils +from atomsci.ddm.pipeline import model_datasets as md from atomsci.ddm.pipeline import transformations as trans from atomsci.ddm.pipeline import perf_data as perf import atomsci.ddm.pipeline.parameter_parser as pp @@ -1850,9 +1851,14 @@ def make_dc_model(self, model_dir): max_depth=self.params.rf_max_depth, n_jobs=-1) else: + if self.params.weight_transform_type == 'balancing': + class_weights = 'balanced' + else: + class_weights = None rf_model = RandomForestClassifier(n_estimators=self.params.rf_estimators, max_features=self.params.rf_max_features, max_depth=self.params.rf_max_depth, + class_weight=class_weights, n_jobs=-1) return dc.models.sklearn_models.SklearnModel(rf_model, model_dir=model_dir) @@ -2021,6 +2027,16 @@ def make_dc_model(self, model_dir): max_bin = 16, ) else: + if self.params.weight_transform_type == 'balancing': + # Compute a class weight for positive class samples to help deal with imblanced datasets + class_freqs = md.get_class_freqs(self.params) + if len(class_freqs) > 1: + raise ValueError("xgboost models don't currently support multitask data") + if len(class_freqs[0]) > 2: + raise ValueError("xgboost models don't currently support multiclass data") + pos_class_weight = class_freqs[0][0]/class_freqs[0][1] + else: + pos_class_weight = 1 xgb_model = xgb.XGBClassifier(max_depth=self.params.xgb_max_depth, learning_rate=self.params.xgb_learning_rate, n_estimators=self.params.xgb_n_estimators, @@ -2035,7 +2051,7 @@ def make_dc_model(self, model_dir): colsample_bylevel=1, reg_alpha=self.params.xgb_alpha, reg_lambda=self.params.xgb_lambda, - scale_pos_weight=1, + scale_pos_weight=pos_class_weight, base_score=0.5, random_state=0, importance_type='gain', From 9297ffa0911c0ef380eb9ca8b369f3993dd76a09 Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Fri, 28 Jun 2024 14:20:06 -0700 Subject: [PATCH 23/28] Save split_uuid in self.params at end of split_dataset(). 
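
A minimal usage sketch (hypothetical dataset path and column names; other required AMPL parameters
omitted) of why this is convenient: the split diagnostic utilities added later in this patch series
look up the split file via params.split_uuid, so the same parameter/pipeline objects can be handed
to them directly after splitting, with no extra bookkeeping.

    from atomsci.ddm.pipeline import parameter_parser as parse
    from atomsci.ddm.pipeline.model_pipeline import ModelPipeline
    from atomsci.ddm.utils import split_diagnostic_plots as sdp

    params = parse.wrapper(dict(
        dataset_key='my_dataset.csv',               # hypothetical file
        response_cols=['pIC50'],                    # hypothetical column
        smiles_col='smiles', id_col='compound_id',  # hypothetical columns
        splitter='multitaskscaffold', split_strategy='train_valid_test',
        split_valid_frac=0.15, split_test_frac=0.15))
    pipe = ModelPipeline(params)
    pipe.split_dataset()                  # params.split_uuid is now set as a side effect
    sdp.plot_split_diagnostics(pipe)      # diagnostics locate the split file from params.split_uuid
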
--- atomsci/ddm/pipeline/model_pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py index 575c6b6d..e2a06daa 100644 --- a/atomsci/ddm/pipeline/model_pipeline.py +++ b/atomsci/ddm/pipeline/model_pipeline.py @@ -535,6 +535,7 @@ def split_dataset(self, featurization=None): featurization = feat.create_featurization(self.params) self.featurization = featurization self.load_featurize_data() + self.params.split_uuid = self.data.split_uuid return self.data.split_uuid From f6d220d51c5636e5936173ef703f6dadebe8cc0c Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Fri, 28 Jun 2024 14:21:21 -0700 Subject: [PATCH 24/28] Added code to show valid & train Wasserstein distances in plot titles. --- .../ddm/utils/split_response_dist_plots.py | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/atomsci/ddm/utils/split_response_dist_plots.py b/atomsci/ddm/utils/split_response_dist_plots.py index d457b814..ef7448db 100644 --- a/atomsci/ddm/utils/split_response_dist_plots.py +++ b/atomsci/ddm/utils/split_response_dist_plots.py @@ -46,6 +46,9 @@ def plot_split_subset_response_distrs(params, axes=None, plot_size=7): else: subset_order = ['train', 'valid', 'test'] + wdist_df = compute_split_subset_wasserstein_distances(params) + + if axes is None: fig, axes = plt.subplots(1, len(params.response_cols), figsize=(plot_size*len(params.response_cols), plot_size)) if len(params.response_cols) == 1: @@ -54,10 +57,18 @@ def plot_split_subset_response_distrs(params, axes=None, plot_size=7): axes = axes.flatten() for colnum, col in enumerate(params.response_cols): ax = axes[colnum] + if params.split_strategy == 'train_valid_test': + tvv_wdist = wdist_df[(wdist_df.split_subset == 'valid') & (wdist_df.response_col == col)].distance.values[0] + tvt_wdist = wdist_df[(wdist_df.split_subset == 'test') & (wdist_df.response_col == col)].distance.values[0] if params.prediction_type == 'regression': ax = sns.kdeplot(data=dset_df, x=col, hue='split_subset', hue_order=subset_order, bw_adjust=0.7, fill=True, common_norm=False, ax=ax) ax.set_title(f"{col} distribution by subset under {split_label}") + if params.split_strategy == 'train_valid_test': + ax.set_title(f"{col} distribution by subset under {split_label}\n" \ + f"Wasserstein distances: valid = {tvv_wdist:.3f}, test = {tvt_wdist:.3f}") + else: + ax.set_title(f"{col} distribution by subset under {split_label}") else: pct_active = [] for ss in subset_order: @@ -66,7 +77,12 @@ def plot_split_subset_response_distrs(params, axes=None, plot_size=7): pct_active.append(100*nactive/sum(ss_df[col].notna())) active_df = pd.DataFrame(dict(subset=subset_order, percent_active=pct_active)) ax = sns.barplot(data=active_df, x='subset', y='percent_active', hue='subset', ax=ax) - ax.set_title(f"Percent of {col} = 1 by subset under {split_label}") + + if params.split_strategy == 'train_valid_test': + ax.set_title(f"Percent of {col} = 1 by subset under {split_label}" \ + f"Wasserstein distances: valid = {tvv_wdist:.3f}, test = {tvt_wdist:.3f}") + else: + ax.set_title(f"Percent of {col} = 1 by subset under {split_label}") ax.set_xlabel('') # Restore previous matplotlib color cycle @@ -153,7 +169,7 @@ def get_split_labeled_dataset(params): """ if isinstance(params, dict): params = parse.wrapper(params) - dset_df = pd.read_csv(params.dataset_key, dtype={'compound_id': str}) + dset_df = pd.read_csv(params.dataset_key, dtype={params.id_col: str}) if params.split_strategy == 
'k_fold_cv': split_file = f"{os.path.splitext(params.dataset_key)[0]}_{params.num_folds}_fold_cv_{params.splitter}_{params.split_uuid}.csv" else: @@ -167,6 +183,9 @@ def get_split_labeled_dataset(params): split_label = f"{nfolds}-fold {params.splitter} cross-validation split" else: dset_df['split_subset'] = dset_df.subset.values - split_label = f"{params.splitter} split" + if params.splitter == 'multitaskscaffold': + split_label = 'MTSS split' + else: + split_label = f"{params.splitter} split" return dset_df, split_label From a43118812fd822068c57588befa4e4433fcc5a2c Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Fri, 28 Jun 2024 14:23:53 -0700 Subject: [PATCH 25/28] Added functions to generate multi-plot displays to assess split quality. --- atomsci/ddm/utils/split_diagnostic_plots.py | 213 ++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 atomsci/ddm/utils/split_diagnostic_plots.py diff --git a/atomsci/ddm/utils/split_diagnostic_plots.py b/atomsci/ddm/utils/split_diagnostic_plots.py new file mode 100644 index 00000000..fc67aa05 --- /dev/null +++ b/atomsci/ddm/utils/split_diagnostic_plots.py @@ -0,0 +1,213 @@ +"""Functions to generate multi-plot displays to assess split quality: response value distributions, test-vs-train Tanimoto distance distributions, etc. +""" + +import os +import numpy as np +import pandas as pd +from argparse import Namespace +from atomsci.ddm.pipeline.model_pipeline import ModelPipeline +from atomsci.ddm.pipeline import parameter_parser as parse +from atomsci.ddm.pipeline import perf_plots as pp +from atomsci.ddm.utils import split_response_dist_plots as srdp +from atomsci.ddm.utils import compare_splits_plots as csp + +import matplotlib.pyplot as plt +import seaborn as sns +from scipy import stats + + +# --------------------------------------------------------------------------------------------------------------------------------- +def plot_split_diagnostics(params_or_pl, axes=None, num_rows=None, num_cols=1, min_tvt_dist=0.3, plot_size=7): + """Generate several plots showing various aspects of split quality. These include: The value distributions for each response + column in the different split subsets; the nearest-training-set-neighbor Tanimoto distance distributions for the validation + and test sets; the actual split subset proportions; and (for multitask scaffold splits) the progression of fitness term values + over generations of the genetic algorithm. + + Args: + params_or_pl (argparse.Namespace or dict or ModelPipeline): Structure containing dataset and split parameters, or + the ModelPipeline object used to perform a split. If a ModelPipeline is passed as this argument, the parameters + are extracted from it and an additional plot will be generated for multitaskscaffold splits showing the progression + of fitness term values over generations. The following parameters are required, if not set to default values: + + | - dataset_key + | - split_uuid + | - split_strategy + | - splitter + | - split_valid_frac + | - split_test_frac + | - num_folds + | - smiles_col + | - response_cols + + axes (matplotlib.Axes): Axes to draw plots in. If provided, must contain enough entries to display all the plots requested + (2 for each response column + 3 + 1 for a multitask scaffold split when a ModelPipeline is passed to params_or_pl). If not + provided, a figure and Axes of the required length will be created. + + num_rows (int): Number of rows for the Axes layout; ignored if an existing set of Axes are passed in the `axes` argument. 
+ num_cols (int): Number of columns for the Axes layout; ignored if an existing set of Axes are passed in the `axes` argument. + plot_size (float): Height of plots; ignored if `axes` is provided + Returns: + None + """ + + # Save current matplotlib color cycle and switch to 'colorblind' palette + old_palette = sns.color_palette() + sns.set_palette('colorblind') + + if isinstance(params_or_pl, dict): + params = parse.wrapper(params_or_pl) + elif isinstance(params_or_pl, Namespace): + params = params_or_pl + elif isinstance(params_or_pl, ModelPipeline): + params = params_or_pl.params + splitter = params_or_pl.data.splitting.splitter + params.fitness_terms = splitter.fitness_terms + params.split_uuid = params_or_pl.data.split_uuid + + else: + raise ValueError("params_or_pl must be dict, Namespace or ModelPipeline") + + # Figure out how many plots we're going to generate and how to lay them out + nresp = len(params.response_cols) + nfitness = int((params.splitter == 'multitaskscaffold') and (isinstance(params_or_pl, ModelPipeline))) + nplots = 2*nresp + 3 + nfitness + if axes is not None: + axes = axes.flatten() + if len(axes) < nplots: + raise ValueError(f"axes argument needs {nplots} axis pairs for requested plots; only provides {len(axes)}") + else: + if num_rows is None: + num_rows = (nplots + 1) // num_cols + fig, axes = plt.subplots(num_rows, num_cols, figsize=(num_cols*plot_size, num_rows*plot_size), layout='constrained') + axes = axes.flatten() + plot_num = 0 + + # Draw split response distribution plots + srdp.plot_split_subset_response_distrs(params, plot_size=plot_size, axes=axes) + + # Draw NN Tanimoto distance distribution plots with titles showing fraction of compounds below min_tvt_dist + plot_num += nresp + dset_path = params.dataset_key + split_path = f"{os.path.splitext(dset_path)[0]}_{params.split_strategy}_{params.splitter}_{params.split_uuid}.csv" + dset_df = pd.read_csv(dset_path, dtype={params.id_col: str}) + split_df = pd.read_csv(split_path, dtype={'cmpd_id': str}) + split_stats = csp.SplitStats(dset_df, split_df, smiles_col=params.smiles_col, id_col=params.id_col, + response_cols=params.response_cols) + vvtr_dists = split_stats.dists_tvv + tvtr_dists = split_stats.dists_tvt + vfrac = sum(vvtr_dists <= min_tvt_dist)/len(vvtr_dists) + tfrac = sum(tvtr_dists <= min_tvt_dist)/len(tvtr_dists) + ax = axes[plot_num] + ax = split_stats.dist_hist_train_v_valid_plot(ax=ax) + ax.set_title(f"Valid vs train Tanimoto NN distance distribution\nFraction <= {min_tvt_dist} = {vfrac:.3f}") + + plot_num += 1 + ax = axes[plot_num] + ax = split_stats.dist_hist_train_v_test_plot(ax=ax) + ax.set_title(f"Test vs train Tanimoto NN distance distribution\nFraction <= {min_tvt_dist} = {tfrac:.3f}") + + # Draw plots of actual and requested split proportions and counts for each response column after excluding missing values + plot_num += 1 + plot_split_fractions(params, axes[plot_num:]) + plot_num += nresp + + # Plot the evolution over generations of each term in the fitness function optimized by the multitask scaffold splitter. + if nfitness > 0: + plot_fitness_terms(params, axes[plot_num:]) + +def plot_fitness_terms(params, axes): + """Plot the evolution over generations of each term in the fitness function optimized by the multitask scaffold splitter. + + Args: + params (argparse.Namespace or dict): Structure containing dataset and split parameters. 
The parameters must include + the key `fitness_terms`, which is created by the multitask splitter; usually this is only available when `params` + is derived from the ModelPipeline object used to split the dataset. + + axes (numpy.array of matplotlib.Axes): Axes to draw plots in. + """ + if isinstance(params, dict): + params = parse.wrapper(params) + if isinstance(axes, np.ndarray): + ax = axes[0] + else: + ax = axes + fitness_df = pd.DataFrame(params.fitness_terms) + fitness_df['generation'] = list(range(len(fitness_df))) + fit_long_df = fitness_df.melt(id_vars='generation', value_vars=list(params.fitness_terms.keys()), var_name='Fitness term', value_name='Score') + ax = sns.lineplot(data=fit_long_df, x='generation', y='Score', hue='Fitness term', ax=ax) + ax.set_title('Unweighted fitness scores vs generation') + + +def plot_split_fractions(params, axes): + """Draw plots of actual and requested split proportions and counts for each response column after excluding missing values + + Args: + params (argparse.Namespace or dict): Structure containing dataset and split parameters. + The following parameters are required, if not set to default values: + + | - dataset_key + | - split_uuid + | - split_strategy + | - splitter + | - split_valid_frac + | - split_test_frac + | - num_folds + | - smiles_col + | - response_cols + + axes (matplotlib.Axes): Axes to draw plots in. Must contain at least as many entries as response columns in the dataset. + """ + split_dset_df, _ = srdp.get_split_labeled_dataset(params) + axes = axes.flatten() + req_color = pp.test_col + actual_color = pp.train_col + for icol, resp_col in enumerate(params.response_cols): + ss_df = split_dset_df[split_dset_df[resp_col].notna()] + ss_counts = ss_df.subset.value_counts() + total_count = sum(ss_counts) + ss_fracs = ss_counts / total_count + actual_df = pd.DataFrame(dict(counts=ss_counts, fracs=ss_fracs)) + req_fracs = dict(train=1-params.split_valid_frac-params.split_test_frac, valid=params.split_valid_frac, test=params.split_test_frac) + req_df = pd.DataFrame(dict(fracs=req_fracs)) + req_df['counts'] = np.round(total_count * req_df.fracs).astype(int) + + bar_width = 0.35 + + # Positions of the bars on the x-axis + r1 = np.arange(len(actual_df)) + r2 = r1 + bar_width + + # Plotting the proportions + ax1 = axes[icol] + bars_requested = ax1.bar(r1, req_df.fracs.values, color=req_color, width=bar_width, label='Requested', alpha=0.6) + bars_actual = ax1.bar(r2, actual_df.fracs.values, color=actual_color, width=bar_width, label='Actual', alpha=0.6) + max_count = max(max(actual_df.counts.values), max(req_df.counts.values)) + + # Left Y-axis (proportions) + ax1.set_ylabel('Proportions') + #ax1.tick_params(axis='y') + + # Create the second Y-axis (counts) + ax2 = ax1.twinx() + ax2.set_ylabel('Counts') + #ax2.tick_params(axis='y', labelcolor='k') + ax2.set_ylim(0, max_count*1.1) + + # Adding counts on top of the bars + for bar, count in zip(bars_requested, req_df.counts.values): + height = bar.get_height() + ax1.text(bar.get_x() + bar.get_width() / 2, height, f'{count}', ha='center', va='bottom', color=actual_color) + + for bar, count in zip(bars_actual, actual_df.counts.values): + height = bar.get_height() + ax1.text(bar.get_x() + bar.get_width() / 2, height, f'{count}', ha='center', va='bottom', color=req_color) + + # X-axis labels + plt.xticks([r + bar_width / 2 for r in range(len(actual_df))], actual_df.index.values) + + # Title and legend + if icol == 0: + plt.title(f"Requested and Actual Subset Proportions and Counts\n{resp_col}") + 
ax1.legend(loc='upper right') + else: + plt.title(f"\n{resp_col}") From 2d91eb7ba22f6fff540800a7210710939e1a73fc Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Mon, 1 Jul 2024 12:34:40 -0700 Subject: [PATCH 26/28] Fixed plot_split_fractions so that bars always appear in order train, valid, test. --- atomsci/ddm/utils/split_diagnostic_plots.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/atomsci/ddm/utils/split_diagnostic_plots.py b/atomsci/ddm/utils/split_diagnostic_plots.py index fc67aa05..447c5166 100644 --- a/atomsci/ddm/utils/split_diagnostic_plots.py +++ b/atomsci/ddm/utils/split_diagnostic_plots.py @@ -166,9 +166,9 @@ def plot_split_fractions(params, axes): ss_counts = ss_df.subset.value_counts() total_count = sum(ss_counts) ss_fracs = ss_counts / total_count - actual_df = pd.DataFrame(dict(counts=ss_counts, fracs=ss_fracs)) + actual_df = pd.DataFrame(dict(counts=ss_counts, fracs=ss_fracs)).reindex(['train', 'valid', 'test']) req_fracs = dict(train=1-params.split_valid_frac-params.split_test_frac, valid=params.split_valid_frac, test=params.split_test_frac) - req_df = pd.DataFrame(dict(fracs=req_fracs)) + req_df = pd.DataFrame(dict(fracs=req_fracs)).reindex(['train', 'valid', 'test']) req_df['counts'] = np.round(total_count * req_df.fracs).astype(int) bar_width = 0.35 From 4f48c6d24097525d25156e137b3557f388c3bb6e Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Thu, 11 Jul 2024 18:55:42 -0700 Subject: [PATCH 27/28] Added initial version of function to draw line plot of NN feature weight absolute sums vs epoch, to assess effect of weight decay penalty. --- atomsci/ddm/pipeline/feature_importance.py | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/atomsci/ddm/pipeline/feature_importance.py b/atomsci/ddm/pipeline/feature_importance.py index 38135f6c..25be1c84 100755 --- a/atomsci/ddm/pipeline/feature_importance.py +++ b/atomsci/ddm/pipeline/feature_importance.py @@ -525,3 +525,33 @@ def _calc_cluster_permutation_scores(estimator, X, y, col_indices, random_state, +# =================================================================================================== +def plot_nn_feature_weights_vs_epoch(pipeline, axes=None, plot_width=20, plot_height=15): + """Plot a line graph of summed weight absolute values from neural network feature nodes to first hidden layer vs + training epoch. This plot is intended to show which features have the most influence on the final + prediction output. It is also used to show the effect of increasing the weight_decay_penalty + parameter, which _should_ result in decreasing the weights on less important features. + + Args: + pipeline (ModelPipeline): The pipeline object from a neural network model that was trained in the + current Python session. + axes (matplotlib.Axes): An Axes object to draw the plot in. If none is provided, the function will + create one in a figure with the specified width and height. 
+ plot_width (float): Figure width + plot_height (float): Figure height + + Returns: + None + """ + if axes is None: + fig, axes = plt.subplots(figsize=(plot_width, plot_height)) + fig.suptitle("Sums of feature node : hidden layer absolute weights vs epoch") + + wt_df = pipeline.model_wrapper.feature_weights_df + max_epoch = len(wt_df) + best_epoch = pipeline.model_wrapper.best_epoch + for feat in pipeline.data.featurization.get_feature_columns(): + sns.lineplot(data=wt_df, x='epoch', y=feat, ax=axes) + axes.text(max_epoch + 4, wt_df[feat].values[max_epoch-1], feat) + axes.set_ylabel("Sum over 1st hidden layer nodes of feature weight absolute values") + axes.axvline(best_epoch, linestyle='--', color='blue') \ No newline at end of file From c7f49381b6d7ed686e7e290f0bf19d0edac30562 Mon Sep 17 00:00:00 2001 From: "Kevin S. McLoughlin" Date: Thu, 11 Jul 2024 18:57:16 -0700 Subject: [PATCH 28/28] Removed old PRC plot function. --- atomsci/ddm/pipeline/perf_plots.py | 53 ------------------------------ 1 file changed, 53 deletions(-) diff --git a/atomsci/ddm/pipeline/perf_plots.py b/atomsci/ddm/pipeline/perf_plots.py index c1b2eeec..36590d1b 100644 --- a/atomsci/ddm/pipeline/perf_plots.py +++ b/atomsci/ddm/pipeline/perf_plots.py @@ -845,59 +845,6 @@ def plot_prec_recall_curve(MP, epoch_label='best', plot_size=7, pdf_dir=None): pdf.close() MP.log.info("Wrote plot to %s" % pdf_path) -#------------------------------------------------------------------------------------------------------------------------ -def old_plot_prec_recall_curve(MP, epoch_label='best', plot_size=7, pdf_dir=None): - """Plot precision-recall curves for a classification model. - - Args: - MP (`ModelPipeline`): Pipeline object for a model that was trained in the current Python session. - - epoch_label (str): Label for training epoch to draw predicted values from. Currently 'best' is the only allowed value. - - pdf_dir (str): If given, output the plots to a PDF file in the given directory. 
- - Returns: - None - - """ - params = MP.params - curve_data = _get_perf_curve_data(MP, epoch_label, 'precision-recall') - if len(curve_data) == 0: - return - if MP.run_mode == 'training': - # Draw overlapping PR curves for train, valid and test sets - subsets = ['train', 'valid', 'test'] - else: - subsets = ['full'] - if pdf_dir is not None: - pdf_path = os.path.join(pdf_dir, '%s_%s_model_%s_features_%s_split_PRC_curves.pdf' % ( - params.dataset_name, params.model_type, params.featurizer, params.splitter)) - pdf = PdfPages(pdf_path) - subset_colors = dict(train=train_col, valid=valid_col, test=test_col, full=full_col) - # For multitask, do a separate figure for each task - ntasks = curve_data[subsets[0]]['prob_active'].shape[1] - for i in range(ntasks): - fig, ax = plt.subplots(figsize=(plot_size,plot_size)) - title = '%s dataset\nPrecision-recall curve for %s %s classifier on %s features with %s split' % ( - params.dataset_name, params.response_cols[i], - params.model_type, params.featurizer, params.splitter) - for subset in subsets: - precision, recall, thresholds = metrics.precision_recall_curve(curve_data[subset]['true_classes'][:,i], - curve_data[subset]['prob_active'][:,i]) - - prc_auc = curve_data[subset]['prc_aucs'][i] - ax.step(recall, precision, color=subset_colors[subset], label="%s: AUC = %.3f" % (subset, prc_auc)) - ax.set_xlabel('Recall') - ax.set_ylabel('Precision') - ax.set_title(title, fontdict={'fontsize' : 12}) - legend = ax.legend(loc='lower right') - - if pdf_dir is not None: - pdf.savefig(fig) - if pdf_dir is not None: - pdf.close() - MP.log.info("Wrote plot to %s" % pdf_path) - #------------------------------------------------------------------------------------------------------------------------ def plot_umap_feature_projections(MP, ndim=2, num_neighbors=20, min_dist=0.1, fit_to_train=True,