diff --git a/docs/conf.py b/docs/conf.py index 2f923ed..bb6636c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -9,7 +9,7 @@ import os import sys -sys.path.insert(0, os.path.abspath("../")) +sys.path.insert(0, os.path.abspath("../spare_scores/")) project = "spare-scores" copyright = "2024, Gyujoon Hwang, George Aidinis" diff --git a/spare_scores/classes.py b/spare_scores/classes.py index 03511e0..0918af1 100644 --- a/spare_scores/classes.py +++ b/spare_scores/classes.py @@ -11,21 +11,6 @@ class SpareModel: """ A class for managing different spare models. - - Static attributes: - :param model_type: Type of model to be used. - :type model_type: str - :predictors: List of predictors used for modeling. - :type predictors: list - :param target: Target variable for modeling. - :type target: str - :param key_var: key variable for modeling - :type key_var: str - :param verbose: Verbosity level. - :type verbose: int - :param parameters: Additional parameters for the model. - :type parameters: dict - Additionally, the class can be initialized with any number of keyword arguments. These will be added as attributes to the class. @@ -39,6 +24,20 @@ class SpareModel: set_parameters(**parameters): Updates the model's parameters with the provided values. This also changes the model's attributes, while retaining the original ones. + + :param model_type: Type of model to be used. + :type model_type: str + :param predictors: List of predictors used for modeling. + :type predictors: list + :param target: Target variable for modeling. + :type target: str + :param key_var: key variable for modeling + :type key_var: str + :param verbose: Verbosity level. + :type verbose: int + :param parameters: Additional parameters for the model. + :type parameters: dict + """ def __init__( @@ -141,20 +140,20 @@ def apply_model(self, df: pd.DataFrame) -> Any: @dataclass class MetaData: """ - Stores training information on its paired SPARE model - Attributes: - :param mdl_type: Type of model to be used. 
- :type mdl_type: str - :param mdl_task: Task of the model to be used. - :type mdl_task: str - :param kernel: Kernel used for SVM. - :type kernel: str - :param predictors: List of predictors used for modeling. - :type predictors: list - :param to_predict: Target variable for modeling. - :type to_predict: str - :param key_var: Key variable for modeling. - :type key_var: str + Stores training information on its paired SPARE model + + :param mdl_type: Type of model to be used. + :type mdl_type: str + :param mdl_task: Task of the model to be used. + :type mdl_task: str + :param kernel: Kernel used for SVM. + :type kernel: str + :param predictors: List of predictors used for modeling. + :type predictors: list + :param to_predict: Target variable for modeling. + :type to_predict: str + :param key_var: Key variable for modeling. + :type key_var: str """ mdl_type: str diff --git a/spare_scores/data_prep.py b/spare_scores/data_prep.py index c987ed4..64fe56f 100644 --- a/spare_scores/data_prep.py +++ b/spare_scores/data_prep.py @@ -16,20 +16,21 @@ def check_train( verbose: int = 1, # this needs to be removed(non used) pos_group: str = "", ) -> Union[str, Tuple[pd.DataFrame, list, str]]: - """Checks training dataframe for errors. - - Args: - :param df: a pandas dataframe containing training data. - :type df: pandas.DataFrame - :param predictors: a list of predictors for SPARE model training. - :type predictors: list - :param to_predict: variable to predict. - :type to_predict: str - :param pos_group: group to assign a positive SPARE score (only for classification). - :type pos_group: str - - :return: a tuple containing 1) the filtered dataframe, 2) filtered predictors, 3)SPARE model type. - :rtype: [pandas.DataFrame, list, str] + """ + Checks training dataframe for errors. + + :param df: a pandas dataframe containing training data. + :type df: pandas.DataFrame + :param predictors: a list of predictors for SPARE model training. 
+ :type predictors: list + :param to_predict: variable to predict. + :type to_predict: str + :param pos_group: group to assign a positive SPARE score (only for classification). + :type pos_group: str + + :return: a tuple containing 1) the filtered dataframe, 2) filtered predictors, 3)SPARE model type. + :rtype: [pandas.DataFrame, list, str] + """ # GAI 26/04/2023: Removed check for existence of these columns # if not {'ID','Age','Sex'}.issubset(set(df.columns)): @@ -101,11 +102,11 @@ def check_test( """ Checks testing dataframe for errors. - Args: - :param df: a pandas dataframe containing testing data. - :type df: pandas.DataFrame - :param meta_data: a dictionary containing training information on its paired SPARE model. - :type meta_data: dict + :param df: a pandas dataframe containing testing data. + :type df: pandas.DataFrame + :param meta_data: a dictionary containing training information on its paired SPARE model. + :type meta_data: dict + """ # if not {'ID','Age','Sex'}.issubset(set(df.columns)): # return logging.error('Please check required columns: ID, Age, Sex.') @@ -143,17 +144,17 @@ def smart_unique( For SPARE regression, preserve data points with extreme values. For SPARE classification, preserve data points that help age match. - Args: - :param df1: the passed dataframe - :type df1: pandas.DataFrame - :param df2: optional, if df1 and df2 are two groups to classify. - :type df2: pandas.DataFrame - :param to_predict: variable to predict. Binary for classification and continuous for regression. - Must be one of the columnes in df. Ignored if df2 is given. - :type to_predict: str - - :return: a trimmed pandas dataframe or a tuple of two dataframes with only one time point per ID. - :rtype: pandas.DataFrame + :param df1: the passed dataframe + :type df1: pandas.DataFrame + :param df2: optional, if df1 and df2 are two groups to classify. + :type df2: pandas.DataFrame + :param to_predict: variable to predict. 
Binary for classification and continuous for regression. + Must be one of the columns in df. Ignored if df2 is given. + :type to_predict: str + + :return: a trimmed pandas dataframe or a tuple of two dataframes with only one time point per ID. + :rtype: pandas.DataFrame + """ assert isinstance(df2, pd.DataFrame) or ( df2 is None @@ -227,33 +228,35 @@ def age_sex_match( """ Match two groups for age and sex. - Args: - :param df1: the passed dataframe - :type df1: pandas.DataFrame - :param df2: optional, if df1 and df2 are two groups to classify. - :type df2: pandas.DataFrame - :param to_match: a binary variable of two groups. - Must be one of the columns in df. - Ignored if df2 is given.If to_match - is 'Sex', then only perform age matching. - :type to_match: str - :param p_threshold: minimum p-value for matching. Default value = 0.15 - :type p_threshold: float - :param verbose: whether to output messages.(Will be deprecated later) - :type verbose: int - :param age_out_percentage: percentage of the larger group to - randomly select a participant to - take out from during the age matching. - For example, if age_out_percentage = 20 and the - larger group is significantly older, then exclude - one random participant from the fifth - quintile based on age. Default value = 20 - :type age_out_percentage: float - - :return: a trimmed pandas dataframe or a tuple of two dataframes - with age/sex matched groups. - :rtype: pandas.DataFrame + + :param df1: the passed dataframe + :type df1: pandas.DataFrame + :param df2: optional, if df1 and df2 are two groups to classify. + :type df2: pandas.DataFrame + :param to_match: a binary variable of two groups. + Must be one of the columns in df. + Ignored if df2 is given. If to_match + is 'Sex', then only perform age matching. + :type to_match: str + :param p_threshold: minimum p-value for matching. 
Default value = 0.15 + :type p_threshold: float + :param verbose: whether to output messages.(Will be deprecated later) + :type verbose: int + :param age_out_percentage: percentage of the larger group to + randomly select a participant to + take out from during the age matching. + For example, if age_out_percentage = 20 and the + larger group is significantly older, then exclude + one random participant from the fifth + quintile based on age. Default value = 20 + :type age_out_percentage: float + + :return: a trimmed pandas dataframe or a tuple of two dataframes + with age/sex matched groups. + :rtype: pandas.DataFrame + """ + assert isinstance(df2, pd.DataFrame) or ( df2 is None ), 'Either provide a 2nd pandas dataframe for the 2nd argument or specify the two groups with "to_match"' @@ -356,13 +359,13 @@ def logging_basic_config( """ Basic logging configuration for error exceptions - Args: - :param verbose: input verbose. Default value = 1 - :type verbose: int - :param content_only: If set to True it will output only the needed content. Default value = False - :type content_only: bool - :param filename: input filename. Default value = '' - :type filename: str + :param verbose: input verbose. Default value = 1 + :type verbose: int + :param content_only: If set to True it will output only the needed content. Default value = False + :type content_only: bool + :param filename: input filename. Default value = '' + :type filename: str + """ logging_level = { 0: logging.WARNING, diff --git a/spare_scores/mlp.py b/spare_scores/mlp.py index b07b1bc..675c637 100644 --- a/spare_scores/mlp.py +++ b/spare_scores/mlp.py @@ -17,17 +17,16 @@ class MLPModel: """ A class for managing MLP models. - - Static attributes: - :param predictors: List of predictors used for modeling. - :type predictors: list - :param to_predict: Target variable for modeling. - :type to_predict: str - :param key_var: Key variable for modeling. 
- :type key_var: str - Additionally, the class can be initialized with any number of keyword arguments. These will be added as attributes to the class. + + :param predictors: List of predictors used for modeling. + :type predictors: list + :param to_predict: Target variable for modeling. + :type to_predict: str + :param key_var: Key variable for modeling. + :type key_var: str + """ def __init__( @@ -158,14 +157,14 @@ def fit(self, df: pd.DataFrame, verbose: int = 1) -> dict: """ Trains the model using the provided dataframe and default parameters. - Args: - :param df: the provided dataframe. - :type df: pandas.DataFrame - :param verbose: the verbosity level - :type verbose: int + :param df: the provided dataframe. + :type df: pandas.DataFrame + :param verbose: the verbosity level + :type verbose: int + + :return: A dictionary with the results from training. + :rtype: dict - :return: A dictionary with the results from training. - :rtype: dict """ logger = logging_basic_config(verbose, content_only=True) @@ -208,12 +207,11 @@ def predict(self, df: pd.DataFrame) -> np.ndarray: """ Predicts the result of the provided dataframe using the trained model. - Args: - :param df: the provided dataframe. - :type df: pandas.DataFrame + :param df: the provided dataframe. + :type df: pandas.DataFrame - :return: The predictions from the trained model regarding the provided dataframe. - :rtype: np.ndarray + :return: The predictions from the trained model regarding the provided dataframe. 
+ :rtype: np.ndarray """ diff --git a/spare_scores/mlp_torch.py b/spare_scores/mlp_torch.py index 0defaee..79a1189 100644 --- a/spare_scores/mlp_torch.py +++ b/spare_scores/mlp_torch.py @@ -38,11 +38,12 @@ class MLPDataset(Dataset): """ A class for managing datasets that will be used for MLP training - Static attributes: - :param X: the first dimension of the provided data(input) - :type X: list - :param y: the second dimension of the provided data(output) - :type y: list + + :param X: the first dimension of the provided data(input) + :type X: list + :param y: the second dimension of the provided data(output) + :type y: list + """ def __init__(self, X: list, y: list): @@ -68,19 +69,18 @@ class SimpleMLP(nn.Module): """ A class to create a simple MLP model. - Static attributes: - :param num_features: total number of features. Default value = 147. - :type num_features: int - :param hidden_size: number of features that will be passed to normalization layers of the model. Default value = 256. - :type hidden_size: int - :param classification: If set to True, then the model will perform classification, otherwise, regression. Default value = True. - :type classification: bool - :param dropout: the dropout value. - :type dropout: float - :param use_bn: if set to True, then the model will use the normalization layers, otherwise, the model will use the linear layers. - :type use_bn: bool - :param bn: if set to 'bn' the model will use BatchNorm1d() for the hidden layers, otherwise, it will use InstanceNorm1d(). - :type bn: str + :param num_features: total number of features. Default value = 147. + :type num_features: int + :param hidden_size: number of features that will be passed to normalization layers of the model. Default value = 256. + :type hidden_size: int + :param classification: If set to True, then the model will perform classification, otherwise, regression. Default value = True. + :type classification: bool + :param dropout: the dropout value. 
+ :type dropout: float + :param use_bn: if set to True, then the model will use the normalization layers, otherwise, the model will use the linear layers. + :type use_bn: bool + :param bn: if set to 'bn' the model will use BatchNorm1d() for the hidden layers, otherwise, it will use InstanceNorm1d(). + :type bn: str """ @@ -150,13 +150,12 @@ class MLPTorchModel: """ A class for managing MLP models. - Static attributes: - :param predictors: List of predictors used for modeling. - :type predictors: list - :param to_predict: Target variable for modeling. - :type to_predict: str - :param key_var: Key variable for modeling. - :type key_var: str + :param predictors: List of predictors used for modeling. + :type predictors: list + :param to_predict: Target variable for modeling. + :type to_predict: str + :param key_var: Key variable for modeling. + :type key_var: str Additionally, the class can be initialized with any number of keyword arguments. These will be added as attributes to the class. @@ -249,14 +248,16 @@ def find_best_threshold(self, y_hat: list, y: list) -> Any: def get_all_stats(self, y_hat: list, y: list, classification: bool = True) -> dict: """ - Args: - :param y: ground truth y (1: AD, 0: CN) -> numpy - :type y: list - :param y_hat:predicted y -> numpy, notice y_hat is predicted value [0.2, 0.8, 0.1 ...] - :type y_hat: list - - :return: A dictionary with the Accuracy, F1 score, Sensitivity, Specificity, Balanced Accuracy, Precision, Recall - :rtype: dict + Returns all stats from training in a dictionary + + :param y: ground truth y (1: AD, 0: CN) -> numpy + :type y: list + :param y_hat: predicted y -> numpy, notice y_hat is predicted value [0.2, 0.8, 0.1 ...] 
+ :type y_hat: list + + :return: A dictionary with the Accuracy, F1 score, Sensitivity, Specificity, Balanced Accuracy, Precision, Recall + :rtype: dict + """ y = np.array(y) y_hat = np.array(y_hat) diff --git a/spare_scores/spare.py b/spare_scores/spare.py index 90bf90c..7caddc8 100644 --- a/spare_scores/spare.py +++ b/spare_scores/spare.py @@ -30,44 +30,44 @@ def spare_train( """ Trains a SPARE model, either classification or regression - Args: - :param df: either a pandas dataframe or a path to a saved csv - containing training data. - :type df: pandas.DataFrame - :param to_predict: variable to predict. Binary for classification and - continuous for regression. Must be one of the columnes in - df. - :type to_predict: str - :param pos_group: group to assign a positive SPARE score (only for - classification). - :type pos_group: str - :param key_var: The key variable to be used for training. If not - given, the first column of the dataset is considered the - primary key of the dataset. - :type key_var: str - :param data_vars: a list of predictors for the training. All must be present - in columns of df. - :type data_vars: list - :param ignore_vars:The list of predictors to be ignored for training. Can be - a listkey_var, or empty. - :type ignore_vars: list - :param kernel: 'linear' or 'rbf' (only linear is supported currently in - regression). - :type kernel: str - :param output: path to save the trained model. '.pkl.gz' file extension - optional. If None is given, no model will be saved. - :type output: str - :param verbose: Verbosity. Int, higher is more verbose. [0,1,2] - :type verbose: int - :param logs: Where to save log file. If not given, logs will only be printed out. - :type logs: str - - :return: A dictionary with three keys, 'status_code', 'status' and 'data'. - 'status' is either'OK' or the error message. 'data' is a dictionary - containing the trained model and metadata if successful, or - None / error object if unsuccessful. 
'status_code' is either 0, 1 or 2. - 0 is success, 1 is warning, 2 is error. - :rtype: dict + :param df: either a pandas dataframe or a path to a saved csv + containing training data. + :type df: pandas.DataFrame + :param to_predict: variable to predict. Binary for classification and + continuous for regression. Must be one of the columns in + df. + :type to_predict: str + :param pos_group: group to assign a positive SPARE score (only for + classification). + :type pos_group: str + :param key_var: The key variable to be used for training. If not + given, the first column of the dataset is considered the + primary key of the dataset. + :type key_var: str + :param data_vars: a list of predictors for the training. All must be present + in columns of df. + :type data_vars: list + :param ignore_vars: The list of predictors to be ignored for training. Can be + a listkey_var, or empty. + :type ignore_vars: list + :param kernel: 'linear' or 'rbf' (only linear is supported currently in + regression). + :type kernel: str + :param output: path to save the trained model. '.pkl.gz' file extension + optional. If None is given, no model will be saved. + :type output: str + :param verbose: Verbosity. Int, higher is more verbose. [0,1,2] + :type verbose: int + :param logs: Where to save log file. If not given, logs will only be printed out. + :type logs: str + + :return: A dictionary with three keys, 'status_code', 'status' and 'data'. + 'status' is either 'OK' or the error message. 'data' is a dictionary + containing the trained model and metadata if successful, or + None / error object if unsuccessful. 'status_code' is either 0, 1 or 2. + 0 is success, 1 is warning, 2 is error. + :rtype: dict + """ res = {"status_code": int, "status": Any, "data": Any} @@ -221,37 +221,37 @@ def spare_test( """ Applies a trained SPARE model on a test dataset - Args: - :param df: either a pandas dataframe or a path to a saved csv - containing the test sample. 
- :type df: pandas.DataFrame - :param mdl_path: either a path to a saved SPARE model ('.pkl.gz' file - extension expected) or a tuple of SPARE model and - meta_data. - :type mdl_path: str - :param key_var: The of key variable to be used for training. If not - given, and the saved model does not contain it,the first - column of the dataset is considered the primary key of the - dataset. - :type key_var: str - :param output: path to save the calculated scores. '.csv' file extension - optional. If None is given, no data will be saved. - :type output: str - :param spare_var: The name of the variable to be predicted. If not given, - the name 'SPARE_score' will be used. - :type spare_var: str - :param verbose: Verbosity. Int, higher is more verbose. [0,1,2] - :type verbose: int - :param logs: Where to save log file. If not given, logs will only be - printed out. - :type logs: str - - :return: A dictionary with three keys, 'status_code', 'status' and 'data'. - 'status' is either 'OK' or the error message. 'data' is the pandas - dataframe containing predicted SPARE scores, or None / error object - if unsuccessful. 'status_code' is either 0, 1 or 2. - 0 is success, 1 is warning, 2 is error. - :rtype: dict + :param df: either a pandas dataframe or a path to a saved csv + containing the test sample. + :type df: pandas.DataFrame + :param mdl_path: either a path to a saved SPARE model ('.pkl.gz' file + extension expected) or a tuple of SPARE model and + meta_data. + :type mdl_path: str + :param key_var: The key variable to be used for training. If not + given, and the saved model does not contain it, the first + column of the dataset is considered the primary key of the + dataset. + :type key_var: str + :param output: path to save the calculated scores. '.csv' file extension + optional. If None is given, no data will be saved. + :type output: str + :param spare_var: The name of the variable to be predicted. If not given, + the name 'SPARE_score' will be used. 
+ :type spare_var: str + :param verbose: Verbosity. Int, higher is more verbose. [0,1,2] + :type verbose: int + :param logs: Where to save log file. If not given, logs will only be + printed out. + :type logs: str + + :return: A dictionary with three keys, 'status_code', 'status' and 'data'. + 'status' is either 'OK' or the error message. 'data' is the pandas + dataframe containing predicted SPARE scores, or None / error object + if unsuccessful. 'status_code' is either 0, 1 or 2. + 0 is success, 1 is warning, 2 is error. + :rtype: dict + """ res = {"status_code": int, "status": Any, "data": Any} diff --git a/spare_scores/svm.py b/spare_scores/svm.py index 7420d9b..c33e375 100644 --- a/spare_scores/svm.py +++ b/spare_scores/svm.py @@ -15,15 +15,6 @@ class SVMModel: """ A class for managing SVM models. - - Static attributes: - :param predictors: List of predictors used for modeling. - :type predictors: list - :param to_predict: Target variable for modeling. - :type to_predict: str - :param key_var: Key variable for modeling. - :type key_var: str - Additionally, the class can be initialized with any number of keyword arguments. These will be added as attributes to the class. @@ -38,6 +29,14 @@ class SVMModel: set_parameters(**parameters): Updates the model's parameters with the provided values. This also changes the model's attributes, while retaining the original ones. + + :param predictors: List of predictors used for modeling. + :type predictors: list + :param to_predict: Target variable for modeling. + :type to_predict: str + :param key_var: Key variable for modeling. 
+ :type key_var: str + """ def __init__( diff --git a/spare_scores/util.py b/spare_scores/util.py index e020fd8..9e47915 100644 --- a/spare_scores/util.py +++ b/spare_scores/util.py @@ -150,6 +150,7 @@ def load_model(mdl_path: str) -> Any: :param mdl_path: the path to the weights of the model :type mdl_path: str + """ with gzip.open(mdl_path, "rb") as f: @@ -159,13 +160,13 @@ def load_model(mdl_path: str) -> Any: def load_examples(file_name: str = "") -> Any: """Loads example data and models in the package. - Args: - :param file_name: either name of the example data saved as .csv or - name of the SPARE model saved as .pkl.gz. - :type file_name: str + :param file_name: either name of the example data saved as .csv or + name of the SPARE model saved as .pkl.gz. + :type file_name: str + + :return: the resulted dataframe + :rtype: None or pandas.DataFrame - :return: the resulted dataframe - :rtype: None or pandas.DataFrame """ pkg_path = pkg_resources.resource_filename("spare_scores", "") list_data = os.listdir(f"{pkg_path}/data/") @@ -188,12 +189,12 @@ def convert_to_number_if_possible(string: str) -> Union[float, str]: """ Converts the the input string to a float if possible - Args: - :param string: the input string - :type string: str + :param string: the input string + :type string: str + + :return: float if the string is numeric, the same string if it's not + :rtype: float or str - :return: float if the string is numeric, the same string if it's not - :rtype: float or str """ if string.isnumeric(): return float(string)