diff --git a/melloddy_predictor/predictor_single.py b/melloddy_predictor/predictor_single.py index 89ee273..804947d 100644 --- a/melloddy_predictor/predictor_single.py +++ b/melloddy_predictor/predictor_single.py @@ -16,16 +16,21 @@ def csr_to_torch_coo(csr_mat: csr_matrix) -> torch.Tensor: """ Converts from scipy sparse csr matrix to a torch sparsae_coo_tensor to be submitted to the sparsechem network - Sparsewchem requires - + Sparsewchem requires + Args: csr_matx (scipy.sparse.csr_matrix) sprse csr matrix to convert - + Returns: - torch.sparse_coo_tensor + torch.sparse_coo_tensor """ coo_mat = csr_mat.tocoo() - return torch.sparse_coo_tensor(indices = np.array([coo_mat.row,coo_mat.col]), values = coo_mat.data, size = coo_mat.shape, dtype=torch.float) + return torch.sparse_coo_tensor( + indices=np.array([coo_mat.row, coo_mat.col]), + values=coo_mat.data, + size=coo_mat.shape, + dtype=torch.float, + ) class ScModelType(Enum): @@ -33,28 +38,36 @@ class ScModelType(Enum): regression = 1 hybrid = 2 - + class PredictorSingle: """ This class handles predictions for single instances. It bypasses a lot of mechansims for batched data loading """ - def __init__(self, model, conf, class_task_map=None, regr_task_map=None, dropout = False, device = "cpu"): + def __init__( + self, + model, + conf, + class_task_map=None, + regr_task_map=None, + dropout=False, + device="cpu", + ): """ Inititialze the predictor object - + Args: model: filename of the model pytorch model file conf: filename of the the corresponsing configuration file for the model class_task_map: a dictionary or pandas series having classification task labels as as key or index, resp, and continuous classification task IDs (column indexes of the prediction matrix) as values - regr_task_map: a dictionary or pandas series having regression task labels as as key or index, resp, and continuous regression task IDs (column indexes of the prediction matrix) as values + regr_task_map: a dictionary or pandas series having regression task labels as as key or index, resp, and continuous regression task IDs (column indexes of the prediction matrix) as values dropout(bool): whether to apply dropout or nor device: device to run on, per dafault cpu - + """ results_loaded = sc.load_results(conf, two_heads=True) - self.conf = results_loaded["conf"] + self.conf = results_loaded["conf"] self.device = device self.net = sc.SparseFFN(self.conf).to(self.device) self.inverse_normalization = False @@ -63,29 +76,31 @@ def __init__(self, model, conf, class_task_map=None, regr_task_map=None, dropout if self.conf.model_type == "federated": state_dict_new = OrderedDict() state_dict_new["net.0.net_freq.weight"] = state_dict["0.0.net_freq.weight"] - state_dict_new["net.0.net_freq.bias"] = state_dict["0.0.net_freq.bias"] - state_dict_new["net.2.net.2.weight"] = state_dict["1.net.2.weight"] - state_dict_new["net.2.net.2.bias"] = state_dict["1.net.2.bias"] + state_dict_new["net.0.net_freq.bias"] = state_dict["0.0.net_freq.bias"] + state_dict_new["net.2.net.2.weight"] = state_dict["1.net.2.weight"] + state_dict_new["net.2.net.2.bias"] = state_dict["1.net.2.bias"] state_dict = state_dict_new - #apply model weights + # apply model weights self.net.load_state_dict(state_dict) - #set model into evaluation mode + # set model into evaluation mode self.net.eval() - #apply dropout, if chosen + # apply dropout, if chosen self.dropout = dropout if self.dropout: - self.net.apply(sc.utils.enable_dropout) + self.net.apply(sc.utils.enable_dropout) # if inverse normalization is done load the stats - if 'stats' in results_loaded: + if "stats" in results_loaded: self.inverse_normalization = True stats = results_loaded["stats"] self.reg_mean = np.array(stats["mean"]) self.reg_var = np.array(stats["var"]) self.reg_stddev = np.sqrt(self.reg_var) - + if self.net.cat_id_size is not None: - raise NotImplementedError("Predictions for models with a catalog head are not yet implemented") + raise NotImplementedError( + "Predictions for models with a catalog head are not yet implemented" + ) if self.net.class_output_size > 0: if self.net.regr_output_size > 0: self.model_type = ScModelType.hybrid @@ -97,33 +112,33 @@ def __init__(self, model, conf, class_task_map=None, regr_task_map=None, dropout self.has_task_maps = False if (class_task_map is not None) or (regr_task_map is not None): self.set_tasks_maps(class_task_map, regr_task_map) - def set_tasks_maps(self, class_task_map=None, regr_task_map=None): """ Set the task maps stored in the object - + Args: class_task_map: a dictionary or pandas series having classification task labels as as key or index, resp, and continuous classification task IDs (column indexes of the prediction matrix) as values - regr_task_map: a dictionary or pandas series having regression task labels as as key or index, resp, and continuous regression task IDs (column indexes of the prediction matrix) as values - + regr_task_map: a dictionary or pandas series having regression task labels as as key or index, resp, and continuous regression task IDs (column indexes of the prediction matrix) as values + """ - class_task_map, regr_task_map, mapped_tasks_type = self.__validate_maps__(class_task_map, regr_task_map) + class_task_map, regr_task_map, mapped_tasks_type = self.__validate_maps__( + class_task_map, regr_task_map + ) self.class_task_map = class_task_map self.regr_task_map = regr_task_map self.mapped_tasks_type = mapped_tasks_type self.has_task_maps = True - def get_mapped_task_names(self): if not self.has_task_maps: return None elif self.mapped_tasks_type == ScModelType.classification: - return class_task_map.index.values + return self.class_task_map.index.values elif self.mapped_tasks_type == ScModelType.regression: - return regr_task_map.index.values + return self.regr_task_map.index.values elif self.mapped_tasks_type == ScModelType.hybrid: - return self.mapped_task_type_info.index.values + return np.concatenate([self.class_task_map.index.values,self.regr_task_map.index.values]) def get_model_type(self): return self.model_type @@ -151,10 +166,12 @@ def get_num_tasks_by_type(self, type): else: raise ValueError("Non permitted task type {}".format(type)) - def __validate_maps__(self,class_task_map, regr_task_map): - class_task_map = self.__validate_map__(class_task_map, ScModelType.classification) + def __validate_maps__(self, class_task_map, regr_task_map): + class_task_map = self.__validate_map__( + class_task_map, ScModelType.classification + ) regr_task_map = self.__validate_map__(regr_task_map, ScModelType.regression) - #test for non overlap of task labels between classification and regression + # test for non overlap of task labels between classification and regression if class_task_map is not None: if regr_task_map is not None: mapped_tasks_type = ScModelType.hybrid @@ -164,44 +181,68 @@ def __validate_maps__(self,class_task_map, regr_task_map): if regr_task_map is not None: mapped_tasks_type = ScModelType.regression else: - raise ValueError("Task maps for both classification and regression are None") + raise ValueError( + "Task maps for both classification and regression are None" + ) if mapped_tasks_type == ScModelType.hybrid: if class_task_map.index.intersection(regr_task_map.index).size > 0: - raise ValueError("classification and regression task map have task labels in common, this is not permitted") + raise ValueError( + "classification and regression task map have task labels in common, this is not permitted" + ) return class_task_map, regr_task_map, mapped_tasks_type - def __validate_map__(self, task_map, task_type): if task_map is not None: if self.get_num_tasks_by_type(task_type) == 0: - raise ValueError("A {0} task map has been provided for a model without {0} tasks".format(task_type.name)) + raise ValueError( + "A {0} task map has been provided for a model without {0} tasks".format( + task_type.name + ) + ) if type(task_map) == pd.Series: pass elif type(task_map) == dict: task_map = pd.Series(task_map) else: - raise TypeError("{0} task_map needs be either of type pandas.Series or be a dictionary, but is a {1}".\ - format(task_type.name, type(task_map))) + raise TypeError( + "{0} task_map needs be either of type pandas.Series or be a dictionary, but is a {1}".format( + task_type.name, type(task_map) + ) + ) if not task_map.dtype == int: - raise TypeError("The {0} task_map needs to have values of type int".format(task_type.name)) + raise TypeError( + "The {0} task_map needs to have values of type int".format( + task_type.name + ) + ) if task_map.max() >= self.get_num_tasks_by_type(task_type): - raise ValueError("The maximum value of {0} task_map exceeps the number of {0} outputs ({1})".\ - format(task_type.name,self.get_num_tasks_by_type(task_type))) + raise ValueError( + "The maximum value of {0} task_map exceeps the number of {0} outputs ({1})".format( + task_type.name, self.get_num_tasks_by_type(task_type) + ) + ) if not task_map.is_unique: - raise ValueError("the task indexes in {0} task map are not unique".format(task_type.name)) + raise ValueError( + "the task indexes in {0} task map are not unique".format( + task_type.name + ) + ) if not task_map.index.is_unique: - raise ValueError("the task labels in {0} task map are not unique".format(task_type.name)) + raise ValueError( + "the task labels in {0} task map are not unique".format( + task_type.name + ) + ) return task_map - def predict_from_csr(self, x_csr: csr_matrix) -> tuple: """ Feed the input csr matrix in on the go to the neural net for prediction, by passing the torch data loader This is meant to be used for small batches Returns a dense numpy array for classification and regression tasks - + Args: x_csr(scipy.sparse.csr_matrix) : a scipy sparse csr matrix with fingerprint features - + Returns: Tupe(np.array, np.array) with classifcation and regression predictions """ @@ -212,200 +253,238 @@ def predict_from_csr(self, x_csr: csr_matrix) -> tuple: def predict_from_tensor(self, X: torch.Tensor) -> tuple: """ Feed the input torch sparse coo_tensor in on the go to the neural net for prediction, by passing the torch data loader - This is meant to be used for small batches - + This is meant to be used for small batches + Args: X(torch.sparse_coo_tensor) : a torch sparse coo tensor matrix with fingerprint features - + Returns: Tupe(np.array, np.array) with classifcation and regression predictions """ - #don't compute gradients + # don't compute gradients with torch.no_grad(): if self.net.cat_id_size is None: y_class, y_regr = self.net(X.to(self.device)) else: y_class, y_regr, yc_cat = self.net(X.to(self.device)) y_class_array = torch.sigmoid(y_class).cpu().numpy() - y_regr_array = y_regr.cpu().numpy() + y_regr_array = y_regr.cpu().numpy() if self.inverse_normalization: - #y_regr_array = sc.inverse_normalization(csr_matrix(y_regr_array) , mean=self.reg_mean, \ + # y_regr_array = sc.inverse_normalization(csr_matrix(y_regr_array) , mean=self.reg_mean, \ # variance=self.reg_var, array=True) y_regr_array = y_regr_array * self.reg_stddev + self.reg_mean return y_class_array, y_regr_array @staticmethod def extract_tasks(y_array, task_map): - return pd.Series(y_array[0,task_map.values],index = task_map.index) - - def predict_decorated_series_from_tensor(self, X: torch.Tensor, class_task_map=None, regr_task_map=None, limit_to_type = None) -> pd.Series: + return pd.Series(y_array[0, task_map.values], index=task_map.index) + + def predict_decorated_series_from_tensor( + self, + X: torch.Tensor, + class_task_map=None, + regr_task_map=None, + limit_to_type=None, + ) -> pd.Series: """ - This runs the prediction on the input tensor expected to have single row and extracts the desired tasks based on the information in the task - maps that have been passed either on predictor intitialization, or with the call of this function (having precedence). - It extracts from the raw prediction the tasks of interstest as specified through the task map(s) and warps them into a - series having the task lables of the task maps index as series index. Predictions for tasks, which index is not listed in the tasks maps(s) + This runs the prediction on the input tensor expected to have single row and extracts the desired tasks based on the information in the task + maps that have been passed either on predictor intitialization, or with the call of this function (having precedence). + It extracts from the raw prediction the tasks of interstest as specified through the task map(s) and warps them into a + series having the task lables of the task maps index as series index. Predictions for tasks, which index is not listed in the tasks maps(s) are not included in the returned series - + Args: X(torch.sparse_coo_tensor) : a torch sparse coo tensor matrix with fingerprint features class_task_map: a dictionary or pandas series having classification task labels as as key or index, resp, and continuous classification task IDs (column indexes of the prediction matrix) as values - regr_task_map: a dictionary or pandas series having regression task labels as as key or index, resp, and continuous regression task IDs (column indexes of the prediction matrix) as values + regr_task_map: a dictionary or pandas series having regression task labels as as key or index, resp, and continuous regression task IDs (column indexes of the prediction matrix) as values Returns: - pd.Series of predictions with task labels as index + pd.Series of predictions with task labels as index """ - #This function expects receiving a tensor witrh a single row, as we also only return results for the first row + # This function expects receiving a tensor witrh a single row, as we also only return results for the first row if X.size(0) != 1: - raise ValueError("This function expects only single row tensor, but the tensor passed has size of {0}".format(X.size(0))) + raise ValueError( + "This function expects only single row tensor, but the tensor passed has size of {0}".format( + X.size(0) + ) + ) - #if task maps are passed as arguments we use them + # if task maps are passed as arguments we use them if (class_task_map is not None) or (regr_task_map is not None): - class_task_map, regr_task_map, mapped_tasks_type = self.__validate_maps__(class_task_map, regr_task_map) - #otherwise we fall back to the tasks maps passed upon intialization, if peresent + class_task_map, regr_task_map, mapped_tasks_type = self.__validate_maps__( + class_task_map, regr_task_map + ) + # otherwise we fall back to the tasks maps passed upon intialization, if peresent elif self.has_task_maps: class_task_map = self.class_task_map regr_task_map = self.regr_task_map mapped_tasks_type = self.mapped_tasks_type - #if those don't exist, the function cannpot proceed + # if those don't exist, the function cannpot proceed else: - raise ValueError("Task maps must be passed either at intialization time of the predictor or when calling the prediction functions") - + raise ValueError( + "Task maps must be passed either at intialization time of the predictor or when calling the prediction functions" + ) + y_class_array, y_regr_array = self.predict_from_tensor(X) - + if mapped_tasks_type == ScModelType.hybrid and limit_to_type is not None: - if mapped_type in [ScModelType.classification,ScModelType.regression]: + if limit_to_type in [ScModelType.classification, ScModelType.regression]: mapped_tasks_type = limit_to_type else: - raise ValueError("Not permitted type {0} has been provided for limit_to_type".format(limit_to_type)) + raise ValueError( + "Not permitted type {0} has been provided for limit_to_type".format( + limit_to_type + ) + ) if mapped_tasks_type == ScModelType.classification: - results = self.extract_tasks(y_class_array, class_task_map) + results = self.extract_tasks(y_class_array, class_task_map) elif mapped_tasks_type == ScModelType.regression: - results = self.extract_tasks(y_regr_array, regr_task_map) + results = self.extract_tasks(y_regr_array, regr_task_map) elif mapped_tasks_type == ScModelType.hybrid: - results = pd.concat([self.extract_tasks(y_class_array, class_task_map),self.extract_tasks(y_regr_array, regr_task_map)]) + results = pd.concat( + [ + self.extract_tasks(y_class_array, class_task_map), + self.extract_tasks(y_regr_array, regr_task_map), + ] + ) return results - - def predict_decorated_series_from_csr(self, x_csr: csr_matrix, class_task_map=None, regr_task_map=None, limit_to_type = None) -> pd.Series: + def predict_decorated_series_from_csr( + self, + x_csr: csr_matrix, + class_task_map=None, + regr_task_map=None, + limit_to_type=None, + ) -> pd.Series: """ - This runs the prediction on the input csr_matrix expected to have single row and extracts the desired tasks based on the information in the task - maps that have been passed either on predictor intitialization, or with the call of this function (having precedence). - It extracts from the raw prediction the tasks of interstest as specified through the task map(s) and warps them into a - series having the task lables of the task maps index as series index. Predictions for tasks, which index is not listed in the tasks maps(s) + This runs the prediction on the input csr_matrix expected to have single row and extracts the desired tasks based on the information in the task + maps that have been passed either on predictor intitialization, or with the call of this function (having precedence). + It extracts from the raw prediction the tasks of interstest as specified through the task map(s) and warps them into a + series having the task lables of the task maps index as series index. Predictions for tasks, which index is not listed in the tasks maps(s) are not included in the returned series - + Args: X(torch.sparse_coo_tensor) : a torch sparse coo tensor matrix with fingerprint features class_task_map: a dictionary or pandas series having classification task labels as as key or index, resp, and continuous classification task IDs (column indexes of the prediction matrix) as values - regr_task_map: a dictionary or pandas series having regression task labels as as key or index, resp, and continuous regression task IDs (column indexes of the prediction matrix) as values + regr_task_map: a dictionary or pandas series having regression task labels as as key or index, resp, and continuous regression task IDs (column indexes of the prediction matrix) as values Returns: - pd.Series of predictions with task labels as index + pd.Series of predictions with task labels as index """ X = csr_to_torch_coo(x_csr) - return self.predict_decorated_series_from_tensor(X, class_task_map, regr_task_map, limit_to_type = limit_to_type) - + return self.predict_decorated_series_from_tensor( + X, class_task_map, regr_task_map, limit_to_type=limit_to_type + ) def predict_last_hidden_from_tensor(self, X: torch.Tensor) -> np.ndarray: """ This function computes the last hidden layer of the model - + Args: X (torch.sparse_coo_tensor) : fingerprint features as tporch sparse_coo_tensor - + Returns: numpy.ndarray of hidden layer values """ - + with torch.no_grad(): return self.net(X.to(self.device), last_hidden=True).cpu().numpy() - def predict_last_hidden_from_csr(self, x_csr): """ This function computes the last hidden layer of the model - + Args: x_csr (scipy.sparse.csr_matrix) : fingerprint features as csr_matrix - + Returns: numpy.ndarray of hidden layer values """ X = csr_to_torch_coo(x_csr) - return self.predict_hidden_from_tensor(X) - + return self.predict_hidden_from_tensor(X) def predict_trunk_from_tensor(self, X: torch.Tensor) -> np.ndarray: """ This function computes the last hidden layer of the model - + Args: X (torch.sparse_coo_tensor) : fingerprint features as tporch sparse_coo_tensor - + Returns: numpy.ndarray of hidden layer values """ - + with torch.no_grad(): return self.net(X.to(self.device), trunk_embeddings=True).cpu().numpy() - def predict_trunk_from_csr(self, x_csr): """ This function computes the last hidden layer of the model - + Args: x_csr (scipy.sparse.csr_matrix) : fingerprint features as csr_matrix - + Returns: numpy.ndarray of hidden layer values """ X = csr_to_torch_coo(x_csr) return self.predict_hidden_from_tensor(X) - - -def t8df_to_task_map(t8_df: pd.DataFrame, task_type: str, name_column : str = "input_assay_id", concat_task_tye : bool = False, threshold_multi_ix = False, concat_threshold : bool = True) -> pd.Series: +def t8df_to_task_map( + t8_df: pd.DataFrame, + task_type: str, + name_column: str = "input_assay_id", + threshold_multi_ix=False, +) -> pd.Series: """ This function extracts from a t8 type dataframe (or a selected slice thereof) a task_map for the predictor object - + Args: - t8_df (pandas.DataFrame): dataframe to extarct thge task map from + t8_df (pandas.DataFrame): dataframe to extract the task map from task_type (str): either "classification" or "regression" name_column (str) : column in datafarem to use as task labels - concat_task_type (bool): Prepend task_label with the task_type, this can be usefull for hybdrid models where there can be indetically names tasks - concat_threshold (bool): If set to true for classification tasks the threshold value will be appended to the task name - + threshold_multi_ix (bool): Whether to create a multi index with class_labela nd threshold as index columns. Default False + + """ temp_df = t8_df.copy() - if not task_type in ["classification","regression"]: - raise ValueError("Task type must be either \"classification\" or \"regression\", passed type is {0}".format(task_type)) + if not task_type in ["classification", "regression"]: + raise ValueError( + 'Task type must be either "classification" or "regression", passed type is {0}'.format( + task_type + ) + ) task_id_column = "cont_{0}_task_id".format(task_type) if not task_id_column in temp_df: - raise ValueError("task index column \"{0}\" is not present in the task dataframe".format(task_id_column)) + raise ValueError( + 'task index column "{0}" is not present in the task dataframe'.format( + task_id_column + ) + ) if temp_df[task_id_column].isnull().any(): - raise ValueError("Null value task indices are present in data frame") + raise ValueError("Null value continuous task indices are present in data frame") + if temp_df[task_id_column].duplicated().any(): + raise ValueError("Duplicate continuous task indices are present in data frame") task_ids = temp_df[task_id_column].astype(int) if not name_column in temp_df: - raise ValueError("task name column \"{0}\" is not present in the task dataframe".format(name_column)) - temp_df["task_labels"] = temp_df[task_id_column].astype(str) - if concat_task_tye: - temp_df["task_labels"] = task_type + '_' + temp_df["task_labels"] - if task_type == "classification" and (concat_threshold or threshold_multi_ix): - if not "threshold" in temp_df: - raise ValueError("option \"concat_threshold\" was chosen, but \"threshold\" column not present in dataframe") - if concat_threshold: - temp_df["task_labels"] = temp_df["task_labels"] + "_" + temp_df["threshold"].astype(str) - elif threshold_multi_ix: - temp_df["threshold"] = temp_df["threshold"].astype(float) - if task_type == "classification" and threshold_multi_ix: - temp_df = temp_df.set_index(["task_labels","threshold"]) + raise ValueError( + 'task name column "{0}" is not present in the task dataframe'.format( + name_column + ) + ) + if task_type == "classification": + if threshold_multi_ix: + temp_df["task_labels"] = temp_df.apply(lambda x: "assay_{name}_class".format(name=x[name_column]),axis=1) + else: + temp_df["task_labels"] = temp_df.apply(lambda x: "assay_{name}_class_{threshold:0.2f}".format(name=x[name_column],threshold=x["threshold"]),axis=1) else: - temp_df = temp_df.set_index("task_labels") + temp_df["task_labels"] = temp_df.apply(lambda x: "assay_{name}_value".format(name=x[name_column]),axis=1) + temp_df = temp_df.set_index("task_labels") + if threshold_multi_ix: + temp_df = temp_df.set_index("threshold",append=True) if not temp_df.index.is_unique: - raise ValueError("task labels are not unique, try to use a diffeent name column, and/or make use of concat_threshold or option") - return tempd_df[task_id_column] - - - + raise ValueError( + "task labels are not unique, try to use a different name column, and/or make use of concat_threshold or option" + ) + return temp_df[task_id_column].astype(int) diff --git a/tests/begin_to_end_test/sc_output/cls_model-class.npy b/tests/begin_to_end_test/sc_output/cls_model-class.npy new file mode 100644 index 0000000..da9f0f4 Binary files /dev/null and b/tests/begin_to_end_test/sc_output/cls_model-class.npy differ diff --git a/tests/begin_to_end_test/sc_output/clsaux_model-class.npy b/tests/begin_to_end_test/sc_output/clsaux_model-class.npy new file mode 100644 index 0000000..29a34a4 Binary files /dev/null and b/tests/begin_to_end_test/sc_output/clsaux_model-class.npy differ diff --git a/tests/begin_to_end_test/sc_output/hyb_model-class.npy b/tests/begin_to_end_test/sc_output/hyb_model-class.npy new file mode 100644 index 0000000..0667583 Binary files /dev/null and b/tests/begin_to_end_test/sc_output/hyb_model-class.npy differ diff --git a/tests/begin_to_end_test/sc_output/hyb_model-regr.npy b/tests/begin_to_end_test/sc_output/hyb_model-regr.npy new file mode 100644 index 0000000..9210d52 Binary files /dev/null and b/tests/begin_to_end_test/sc_output/hyb_model-regr.npy differ diff --git a/tests/begin_to_end_test/sc_output/reg_model-regr.npy b/tests/begin_to_end_test/sc_output/reg_model-regr.npy new file mode 100644 index 0000000..b6fb269 Binary files /dev/null and b/tests/begin_to_end_test/sc_output/reg_model-regr.npy differ diff --git a/tests/begin_to_end_test/sc_output/trunk_cls.npy b/tests/begin_to_end_test/sc_output/trunk_cls.npy new file mode 100644 index 0000000..f30ed10 Binary files /dev/null and b/tests/begin_to_end_test/sc_output/trunk_cls.npy differ diff --git a/tests/begin_to_end_test/sc_output/trunk_clsaux.npy b/tests/begin_to_end_test/sc_output/trunk_clsaux.npy new file mode 100644 index 0000000..c2428c7 Binary files /dev/null and b/tests/begin_to_end_test/sc_output/trunk_clsaux.npy differ diff --git a/tests/begin_to_end_test/sc_output/trunk_hyb.npy b/tests/begin_to_end_test/sc_output/trunk_hyb.npy new file mode 100644 index 0000000..d38af77 Binary files /dev/null and b/tests/begin_to_end_test/sc_output/trunk_hyb.npy differ diff --git a/tests/begin_to_end_test/sc_output/trunk_reg.npy b/tests/begin_to_end_test/sc_output/trunk_reg.npy new file mode 100644 index 0000000..ccb899f Binary files /dev/null and b/tests/begin_to_end_test/sc_output/trunk_reg.npy differ diff --git a/tests/test_single_predictor.py b/tests/test_single_predictor.py index 52e451f..1671748 100644 --- a/tests/test_single_predictor.py +++ b/tests/test_single_predictor.py @@ -7,12 +7,12 @@ import pytest -from melloddy_tuner.utils.single_row_prep2pred import SingleRowPreparator -from melloddy_predictor.predictor_single import PredictorSingle +from melloddy_tuner.utils.single_row_prep2pred import SingleRowPreparator, KeyProviderFromJsonFile +from melloddy_predictor.predictor_single import PredictorSingle, ScModelType, t8df_to_task_map -from pandas._testing import assert_frame_equal +from pandas._testing import assert_frame_equal, assert_series_equal from scipy.sparse import save_npz, load_npz TEST_FILE_DIR = os.path.dirname(__file__) @@ -33,8 +33,12 @@ def ref_row_mapping_table(): return pd.read_csv(os.path.join(TEST_FILE_DIR,"begin_to_end_test/mt_output/mapping_table.csv")) @pytest.fixture -def srprep(): - return SingleRowPreparator(secret = ENCRYPTION_KEY, params = PREPARATION_PARAMETER) +def kprovider(): + return KeyProviderFromJsonFile(ENCRYPTION_KEY) + +@pytest.fixture +def srprep(kprovider): + return SingleRowPreparator(key_provider = kprovider, params = PREPARATION_PARAMETER) @pytest.fixture def ref_output_ydata(): @@ -45,20 +49,37 @@ def ref_output_ydata(): "hyb": np.load(os.path.join(TEST_FILE_DIR,"begin_to_end_test/sc_output/hyb_model-regr.npy"))} return {"class" : y_refs_class, "regr" : y_refs_regr} +@pytest.fixture +def ref_output_trunk(): + return {mtype : np.load(os.path.join(TEST_FILE_DIR,"begin_to_end_test/sc_output/trunk_{}.npy".format(mtype))) for mtype in ["cls","clsaux","reg","hyb"]} + + @pytest.fixture def class_task_map(): - return {'class_570':570,'class_581':581,'class_2276':2276} + return {"class_570":570,"class_581":581,"class_2276":2276} @pytest.fixture def regr_task_map(): return {"regr_633":633,"regr_740":740,"regr_2":2} +@pytest.fixture +def ref_name_arrays(class_task_map,regr_task_map): + return {"cls" : pd.Series(class_task_map).index.values,\ + "clsaux" : pd.Series(class_task_map).index.values,\ + "reg" : pd.Series(regr_task_map).index.values,\ + "hyb" : np.concatenate([pd.Series(class_task_map).index.values,pd.Series(regr_task_map).index.values]) + } + +@pytest.fixture +def ref_model_types(): + return {"cls": ScModelType.classification, "clsaux": ScModelType.classification, "reg": ScModelType.regression, "hyb": ScModelType.hybrid} + @pytest.fixture def test_preds(class_task_map, regr_task_map): - return {'cls' : PredictorSingle(model= os.path.join(MODELS_PATH,"example_cls_model/model.pth"), conf=os.path.join(MODELS_PATH,"example_cls_model/hyperparameters.json"), class_task_map = class_task_map),\ - 'clsaux' : PredictorSingle(model= os.path.join(MODELS_PATH,"example_clsaux_model/model.pth"), conf=os.path.join(MODELS_PATH,"example_clsaux_model/hyperparameters.json"), class_task_map = class_task_map),\ - 'reg' : PredictorSingle(model= os.path.join(MODELS_PATH,"example_reg_model/model.pth"), conf=os.path.join(MODELS_PATH,"example_reg_model/hyperparameters.json"), regr_task_map = regr_task_map),\ - 'hyb' : PredictorSingle(model= os.path.join(MODELS_PATH,"example_hyb_model/model.pth"), conf=os.path.join(MODELS_PATH,"example_hyb_model/hyperparameters.json"), class_task_map = class_task_map, regr_task_map = regr_task_map) + return {"cls" : PredictorSingle(model= os.path.join(MODELS_PATH,"example_cls_model/model.pth"), conf=os.path.join(MODELS_PATH,"example_cls_model/hyperparameters.json"), class_task_map = class_task_map),\ + "clsaux" : PredictorSingle(model= os.path.join(MODELS_PATH,"example_clsaux_model/model.pth"), conf=os.path.join(MODELS_PATH,"example_clsaux_model/hyperparameters.json"), class_task_map = class_task_map),\ + "reg" : PredictorSingle(model= os.path.join(MODELS_PATH,"example_reg_model/model.pth"), conf=os.path.join(MODELS_PATH,"example_reg_model/hyperparameters.json"), regr_task_map = regr_task_map),\ + "hyb" : PredictorSingle(model= os.path.join(MODELS_PATH,"example_hyb_model/model.pth"), conf=os.path.join(MODELS_PATH,"example_hyb_model/hyperparameters.json"), class_task_map = class_task_map, regr_task_map = regr_task_map) } @pytest.fixture @@ -73,6 +94,28 @@ def input_failing_smiles_df(): def ix_rename_map(ref_row_mapping_table): return ref_row_mapping_table.set_index("cont_descriptor_vector_id")["input_compound_id"] +@pytest.fixture +def get_benzene_x_csr(srprep): + return srprep.descriptor_calc.calculate_single_csr('c1ccccc1') + +@pytest.fixture +def get_benzene_y_ref(): + return {"cls":pd.Series({"class_570" : 0.516933, "class_581" : 0.433307, "class_2276" : 0.565609},dtype="float32"), + "clsaux": pd.Series({"class_570" : 0.412029, "class_581" : 0.489868, "class_2276" : 0.504993},dtype="float32"), + "reg": pd.Series({"regr_633" : 5.097863, "regr_740" : 5.743073, "regr_2" : 7.306094},dtype="float64"), + "hyb": pd.Series({"class_570" : 0.821179, "class_581" : 0.209964, "class_2276" : 0.560037, "regr_633" : 5.118069, "regr_740" : 5.721944, "regr_2" : 7.383655},dtype="float64")} + +@pytest.fixture +def cls_t8df_head(): + int_cols = ['cont_classification_task_id', 'classification_task_id', 'num_total_actives', 'num_fold_min_actives', 'num_total_inactives', 'num_fold_min_inactives', 'n_tasks', 'retained_tasks'] + T8c = pd.read_csv(os.path.join(MODELS_PATH,"example_cls_model/T8c.csv")) + T8c[int_cols] = T8c[int_cols].astype("Int64") + return T8c[T8c["cont_classification_task_id"] <10] + +@pytest.fixture +def test_pred_multi_ix(cls_t8df_head): + multi_ix_task_map = t8df_to_task_map(cls_t8df_head,task_type = "classification",threshold_multi_ix=True) + return PredictorSingle(model= os.path.join(MODELS_PATH,"example_cls_model/model.pth"), conf=os.path.join(MODELS_PATH,"example_cls_model/hyperparameters.json"), class_task_map = multi_ix_task_map) def test_dense_tasks_prediction(srprep, input_smiles_df, ref_output_xdata, ref_output_ydata, ix_rename_map, test_preds): #generate x-data @@ -127,7 +170,7 @@ def test_named_task_predictions(srprep, input_smiles_df, test_preds, class_task_ y_refs_select_class_df = pd.DataFrame(y_refs_selected_class_tasks, columns = list(class_task_map.keys())).rename(index=ix_rename_map) y_refs_select_regr_df = pd.DataFrame(y_refs_selected_regr_tasks, columns = list(regr_task_map.keys())).rename(index=ix_rename_map) ref_hyb_res_slice_df_reconstructed = pd.concat([y_refs_select_class_df, y_refs_select_regr_df],axis=1) - ref_hyb_res_slice_df_reconstructed.index.names = ['input_compound_id'] + ref_hyb_res_slice_df_reconstructed.index.names = ["input_compound_id"] assert_frame_equal(test_hyb_res_slice_df.sort_index().astype("float32"), ref_hyb_res_slice_df_reconstructed.sort_index().astype("float32")) def test_failing_predictions(srprep, input_failing_smiles_df, test_preds): @@ -137,3 +180,53 @@ def test_failing_predictions(srprep, input_failing_smiles_df, test_preds): x = srprep.process_smiles(smi) y = test_preds["hyb"].predict_decorated_series_from_tensor(x) y_res_slice[k] = y + +def test_get_mapped_task_names(test_preds, ref_name_arrays): + for mtype, my_pred in test_preds.items(): + assert (my_pred.get_mapped_task_names() == ref_name_arrays[mtype]).all() + +def test_get_model_type(test_preds, ref_model_types): + for mtype, my_pred in test_preds.items(): + assert my_pred.get_model_type() == ref_model_types[mtype] + +def test_limit_to_type(srprep, test_preds): + x = srprep.process_smiles('c1ccccc1') + #provoke failure with invalid type + with pytest.raises(ValueError): + y = test_preds["hyb"].predict_decorated_series_from_tensor(x,limit_to_type=5) + #now test a valid type + y = test_preds["hyb"].predict_decorated_series_from_tensor(x,limit_to_type=ScModelType.regression) + y_ref = pd.Series({"regr_633": 5.118069, "regr_740" : 5.721944, "regr_2" : 7.383655}) + assert_series_equal(y, y_ref) + +def test_csr_predictions(get_benzene_x_csr, get_benzene_y_ref, test_preds): + for mtype, my_pred in test_preds.items(): + y_test = my_pred.predict_decorated_series_from_csr(get_benzene_x_csr) + assert_series_equal(y_test, get_benzene_y_ref[mtype]) + +def test_trunk_output(test_preds, srprep, input_smiles_df, ref_output_trunk): + for mtype, my_pred in test_preds.items(): + assert np.allclose(np.concatenate([my_pred.predict_trunk_from_tensor(srprep.process_smiles(smi)) for k,smi in input_smiles_df.set_index("input_compound_id")["smiles"].items()]),ref_output_trunk[mtype]) + + +def test_task_map_generator(cls_t8df_head): + task_map_test1 = t8df_to_task_map(cls_t8df_head,task_type = "classification") + labels = {"assay_517_class_7.00": 0, "assay_924_class_6.50": 1, "assay_924_class_7.00": 2, "assay_924_class_7.50": 3, "assay_1160_class_6.50": 4,\ + "assay_1160_class_7.00": 5, "assay_1512_class_7.50": 6, "assay_1512_class_8.00": 7, "assay_1512_class_8.50": 8, "assay_1520_class_8.00": 9} + task_map_ref1 = pd.Series(labels ,name='cont_classification_task_id',dtype="int64").rename_axis("task_labels") + assert_series_equal(task_map_test1, task_map_ref1) + + task_map_test2 = t8df_to_task_map(cls_t8df_head,task_type = "classification",threshold_multi_ix=True) + labels2 = {"assay_517_class":{7.0 : 0},"assay_924_class":{6.5 : 1, 7.0 : 2, 7.5 : 3}, "assay_1160_class" : {6.5 : 4, 7.0 : 5},\ + "assay_1512_class" : {7.5 : 6, 8.0 : 7, 8.5 : 8}, "assay_1520_class" :{ 8.0 : 9}} + task_map_ref2 = pd.concat({key:pd.Series(val,name='cont_classification_task_id',dtype="int64") for key, val in labels2.items()}).rename_axis(["task_labels","threshold"]) + assert_series_equal(task_map_test2, task_map_ref2) + +def test_multi_ix_predictions(srprep,test_pred_multi_ix): + x = srprep.process_smiles("c1ccccc1") + y_multi_ix_test = test_pred_multi_ix.predict_decorated_series_from_tensor(x) + values_multi_ix = {"assay_517_class":{7.0 : 0.531071},"assay_924_class":{6.5 : 0.583757, 7.0 : 0.542668, 7.5 : 0.474523}, "assay_1160_class" : {6.5 : 0.530777, 7.0 : 0.428757},\ + "assay_1512_class" : {7.5 : 0.472368, 8.0 : 0.367206, 8.5 : 0.306637}, "assay_1520_class" :{ 8.0 : 0.499579}} + y_multi_ix_ref = pd.concat({key:pd.Series(val,dtype="float32") for key, val in values_multi_ix.items()}).rename_axis(["task_labels","threshold"]) + assert_series_equal(y_multi_ix_test,y_multi_ix_ref) + \ No newline at end of file