diff --git a/.github/workflows/macos_test_cases.yml b/.github/workflows/macos_test_cases.yml
new file mode 100644
index 0000000..b1bbf28
--- /dev/null
+++ b/.github/workflows/macos_test_cases.yml
@@ -0,0 +1,33 @@
+name: spare_scores test cases on macos
+
+# workflow_dispatch has been added for testing purposes
+on: [push, pull_request, workflow_dispatch]
+
+jobs:
+  build:
+    runs-on: ["macos-latest"]
+
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.8'
+      - name: Set up miniconda
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          auto-update-conda: true
+          python-version: 3.8
+          miniconda-version: "latest"
+      - name: Create conda env
+        run: conda create -n spare python=3.8
+      - name: Install pip
+        run: conda run -n spare conda install pip
+      - name: Install spare scores
+        run: conda run -n spare pip install spare_scores
+      - name: Install dependencies
+        run: pip install setuptools && pip install .
+      - name: Run unit tests
+        run: |
+          cd tests/unit && python -m unittest discover -s . -p "*.py"
diff --git a/.github/workflows/ubuntu_test_cases.yml b/.github/workflows/ubuntu_test_cases.yml
new file mode 100644
index 0000000..ed74b01
--- /dev/null
+++ b/.github/workflows/ubuntu_test_cases.yml
@@ -0,0 +1,32 @@
+name: spare_scores test cases on ubuntu
+
+# workflow_dispatch has been added for testing purposes
+on: [push, pull_request, workflow_dispatch]
+
+jobs:
+  build:
+    runs-on: ["ubuntu-latest"]
+
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.8'
+      - name: Set up miniconda
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          auto-update-conda: true
+          python-version: 3.8
+          miniconda-version: "latest"
+      - name: Create conda env
+        run: conda create -n spare python=3.8
+      - name: Install pip
+        run: conda run -n spare conda install pip
+      - name: Install spare scores
+        run: conda run -n spare pip install spare_scores
+      - name: Install dependencies
+        run: pip install setuptools && pip install .
+      - name: Run unit tests
+        run: |
+          cd tests/unit && python -m unittest discover -s . -p "*.py"
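Both workflows end by running unittest discovery from `tests/unit`. For local debugging, the same discovery can be driven programmatically; a minimal sketch, assuming it is run from the repository root (`unittest.TestLoader.discover` is the stdlib equivalent of `python -m unittest discover`):

```python
# Reproduce the CI test step locally; run from the repository root.
import unittest

loader = unittest.TestLoader()
# Mirrors `python -m unittest discover -s . -p "*.py"` executed in tests/unit.
suite = loader.discover(start_dir="tests/unit", pattern="*.py")
result = unittest.TextTestRunner(verbosity=2).run(suite)
print("tests passed" if result.wasSuccessful() else "tests failed")
```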
-p "*.py" + diff --git a/dev-dependencies.txt b/dev-dependencies.txt index eedaac1..3292bc9 100644 --- a/dev-dependencies.txt +++ b/dev-dependencies.txt @@ -18,29 +18,30 @@ jsonschema==4.17.3 kiwisolver==1.4.4 matplotlib==3.7.1 msgpack==1.0.5 -numpy==1.24.4 +numpy==1.23.5 packaging==23.1 pandas==2.0.3 Pillow==9.5.0 pkgutil_resolve_name==1.3.10 -pluggy==1.2.0 +pluggy==1.5.0 protobuf==4.23.3 pyparsing==3.1.0 pyrsistent==0.19.3 -pytest==7.4.0 +pytest==8.2.2 python-dateutil==2.8.2 pytz==2023.3 PyYAML==6.0 ray==2.5.1 requests==2.31.0 -scikit-learn==1.2.2 -scipy==1.10.1 +scikit-learn==0.24.2 +scipy==1.8.0 six==1.16.0 -e git+https://github.com/georgeaidinis/spare_score@3055a393e7aad704dd00dd378e45d695d99deebd#egg=spare_scores threadpoolctl==3.1.0 tomli==2.0.1 -torch==1.11.0 +torch==2.3.1 typing_extensions==4.7.0 tzdata==2023.3 urllib3==2.0.3 zipp==3.15.0 +setuptools==70.3.0 diff --git a/setup.py b/setup.py index 5773b36..ef10c74 100644 --- a/setup.py +++ b/setup.py @@ -18,8 +18,9 @@ include_package_data=True, install_requires=['numpy', 'pandas', + 'setuptools', 'scikit-learn', - 'torch<2.1', + 'torch<2.3.1', 'matplotlib', 'optuna'], entry_points={ @@ -27,4 +28,4 @@ "spare_scores = spare_scores.cli:main", "SPARE = spare_scores.cli:main"] }, - ) \ No newline at end of file + ) diff --git a/spare_scores/cli.py b/spare_scores/cli.py index bd739f7..f9c415f 100644 --- a/spare_scores/cli.py +++ b/spare_scores/cli.py @@ -307,4 +307,4 @@ def main(): arguments.logs) return - return \ No newline at end of file + return diff --git a/spare_scores/data_prep.py b/spare_scores/data_prep.py index 9f0cd59..1203bd7 100644 --- a/spare_scores/data_prep.py +++ b/spare_scores/data_prep.py @@ -11,21 +11,21 @@ def check_train(df: pd.DataFrame, - predictors: list, - to_predict: str, - key_var: str, - pos_group: str = '', - verbose: int = 1) -> Tuple[pd.DataFrame, list, str]: + predictors: list, + to_predict: str, + verbose: int = 1, # this needs to be removed(non used). If i remove + # it, then there are bugs to the test cases(check_train() unexpected argument verbose) + pos_group: str = '') -> Tuple[pd.DataFrame, list, str]: """Checks training dataframe for errors. Args: - df: a pandas dataframe containing training data. - predictors: a list of predictors for SPARE model training. - to_predict: variable to predict. - pos_group: group to assign a positive SPARE score (only for classification). + df(pandas.DataFrame): a pandas dataframe containing training data. + predictors(list): a list of predictors for SPARE model training. + to_predict(str): variable to predict. + pos_group(str): group to assign a positive SPARE score (only for classification). Returns: - a tuple containing 1) filtered dataframe, 2) filtered predictors, 3) SPARE model type. + Tuple[pandas.DataFrame, list, str]: a tuple containing 1)filtered dataframe, 2)filtered predictors, 3)SPARE model type. """ # GAI 26/04/2023: Removed check for existence of these columns # if not {'ID','Age','Sex'}.issubset(set(df.columns)): @@ -77,13 +77,12 @@ def check_train(df: pd.DataFrame, return df, predictors, mdl_task def check_test(df: pd.DataFrame, - meta_data: dict, - verbose: int = 1): + meta_data: dict): """Checks testing dataframe for errors. Args: - df: a pandas dataframe containing testing data. - meta_data: a dictionary containing training information on its paired SPARE model. + df(pandas.DataFrame): a pandas dataframe containing testing data. + meta_data(dict): a dictionary containing training information on its paired SPARE model. 
""" ############# Removing the hardcoded check for the below cols ############# # if not {'ID','Age','Sex'}.issubset(set(df.columns)): @@ -106,31 +105,27 @@ def check_test(df: pd.DataFrame, if np.sum(np.sum(pd.isna(df[meta_data['predictors']]))) > 0: logging.warn('Some participants have invalid (missing or NaN values) predictor variables.') - ############# Removing the hardcoded ID checks ############# - if 'ID' not in df.columns: - # logging.info('"ID" column not found in the input dataframe. Treating all participants as independent from training.') - pass - else: + if 'ID' in df.columns: if np.any(df['ID'].isin(meta_data['cv_results']['ID'])): logging.info('Some participants seem to have been in the model training.') + return 'OK', None def smart_unique(df1: pd.DataFrame, - df2: pd.DataFrame=None, - to_predict: str=None, - verbose: int=1) -> Union[pd.DataFrame, tuple]: + df2: pd.DataFrame=None, + to_predict: str=None) -> Union[pd.DataFrame, tuple]: """Select unique data points in a way that optimizes SPARE training. For SPARE regression, preserve data points with extreme values. For SPARE classification, preserve data points that help age match. Args: - df1: a pandas dataframe. - df2: a pandas dataframe (optional) if df1 and df2 are two groups to classify. - to_predict: variable to predict. Binary for classification and continuous for regression. + df1(pandas.DataFrame) + df2(pandas.DataFrame): optional, if df1 and df2 are two groups to classify. + to_predict(str): variable to predict. Binary for classification and continuous for regression. Must be one of the columnes in df. Ignored if df2 is given. Returns: - a trimmed pandas dataframe or a tuple of two dataframes with only one time point per ID. + pandas.DataFrame: a trimmed pandas dataframe or a tuple of two dataframes with only one time point per ID. """ assert (isinstance(df2, pd.DataFrame) or (df2 is None)), ( 'Either provide a 2nd pandas dataframe for the 2nd argument or specify it with "to_predict"') @@ -191,20 +186,20 @@ def age_sex_match(df1: pd.DataFrame, """Match two groups for age and sex. Args: - df1: a pandas dataframe. - df2: a pandas dataframe (optional) if df1 and df2 are two groups to classify. - to_match: a binary variable of two groups. Must be one of the columns in df. + df1(pandas.DataFrame) + df2(pandas.DataFrame): optional, if df1 and df2 are two groups to classify. + to_match(str): a binary variable of two groups. Must be one of the columns in df. Ignored if df2 is given. If to_match is 'Sex', then only perform age matching. - p_threshold: minimum p-value for matching. - verbose: whether to output messages. - age_out_percentage: percentage of the larger group to randomly select a participant to + p_threshold(float): minimum p-value for matching. Default value = 0.15 + ----------- verbose: whether to output messages.(Will be deprecated later) + age_out_percentage(float): percentage of the larger group to randomly select a participant to take out from during the age matching. For example, if age_out_percentage = 20 and the larger group is significantly older, then exclude one random participant from the fifth - quintile based on age. + quintile based on age. Default value = 20 Returns: - a trimmed pandas dataframe or a tuple of two dataframes with age/sex matched groups. + pandas.DataFrame: a trimmed pandas dataframe or a tuple of two dataframes with age/sex matched groups. 
""" assert (isinstance(df2, pd.DataFrame) or (df2 is None)), ( 'Either provide a 2nd pandas dataframe for the 2nd argument or specify the two groups with "to_match"') @@ -286,7 +281,15 @@ def age_sex_match(df1: pd.DataFrame, else: return (df1, df2) -def logging_basic_config(verbose=1, content_only=False, filename=''): +def logging_basic_config(verbose :int = 1, content_only = False, filename :str = ''): + """ + Basic logging configuration for error exceptions + + Args: + verbose(int): input verbose. Default value = 1 + content_only(bool): If set to True it will output only the needed content. Default value = False + filename(str): input filename. Default value = '' + """ logging_level = {0:logging.WARNING, 1:logging.INFO, 2:logging.DEBUG, 3:logging.ERROR, 4:logging.CRITICAL} fmt = ' %(message)s' if content_only else '%(levelname)s (%(funcName)s): %(message)s' if filename != '' and filename is not None: @@ -313,4 +316,4 @@ def convert_cat_variables(df, predictors, meta_data): elif len(df[var].unique()) > 2: raise ValueError('Categorical variables with more than 2 ' + 'categories are currently not supported.') - return df, meta_data \ No newline at end of file + return df, meta_data diff --git a/spare_scores/mlp.py b/spare_scores/mlp.py index e105253..28de306 100644 --- a/spare_scores/mlp.py +++ b/spare_scores/mlp.py @@ -28,18 +28,23 @@ class MLPModel: arguments. These will be added as attributes to the class. Methods: - train_model(df, **kwargs): - Trains the model using the provided dataframe. + fit(df, verbose): + Trains the model using the provided dataframe and default parameters. + Args: + df(pandas.DataFrame): the provided dataframe. + verbose(int) + Returns: + dict: A dictionary with the results from training. - apply_model(df): - Applies the trained model on the provided dataframe and returns - the predictions. - - set_parameters(**parameters): - Updates the model's parameters with the provided values. This also - changes the model's attributes, while retaining the original ones. + predict(df): + Predicts the result of the provided dataframe using the trained model. + Args: + df(pandas.DataFrame): the provided dataframe. + Returns: + list: The predictions from the trained model regarding the provided dataframe. 
+ """ - def __init__(self, predictors, to_predict, key_var, verbose=1,**kwargs): + def __init__(self, predictors, to_predict, key_var, verbose=1, **kwargs): logger = logging_basic_config(verbose, content_only=True) self.predictors = predictors @@ -130,7 +135,7 @@ def _fit(self, df): self.get_stats(y, self.y_hat) @ignore_warnings(category= (ConvergenceWarning,UserWarning)) - def fit(self, df, verbose=1, **kwargs): + def fit(self, df, verbose=1) -> dict: logger = logging_basic_config(verbose, content_only=True) @@ -168,7 +173,7 @@ def fit(self, df, verbose=1, **kwargs): return result - def predict(self, df, verbose=1): + def predict(self, df): X = df[self.predictors] X_transformed = self.scaler.transform(X) diff --git a/spare_scores/mlp_torch.py b/spare_scores/mlp_torch.py index 5084831..b69152d 100644 --- a/spare_scores/mlp_torch.py +++ b/spare_scores/mlp_torch.py @@ -18,9 +18,21 @@ import optuna -device = "cuda" if torch.cuda.is_available() else "cpu" +device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu") class MLPDataset(Dataset): + """ + A class for managing datasets that will be used for MLP training + Static attributes: + X(list): the first dimension of the provided data(input) + y(list): the second dimension of the provided data(output) + + Methods: + __getitem__(idx)(getter): + returns the index of both X and y at index: idx(X[idx], y[idx]) + __len__: + returns the length of the provided dataset + """ def __init__(self, X, y): self.X = np.array(X, dtype=np.float32) self.y = np.array(y, dtype=np.float32) @@ -28,11 +40,23 @@ def __init__(self, X, y): def __len__(self): return len(self.y) - def __getitem__(self, idx): + def __getitem__(self, idx: int): return self.X[idx], self.y[idx] class SimpleMLP(nn.Module): - def __init__(self, num_features = 147, hidden_size = 256, classification = True, dropout = 0.2, use_bn = False, bn = 'bn'): + """ + A class to create a simple MLP model. + + Static attributes: + num_features(int): total number of features. Default value = 147. + hidden_size(int): number of features that will be passed to normalization layers of the model. Default value = 256. + classification(bool): If set to True, then the model will perform classification, otherwise, regression. Default value = True. + dropout(float): the dropout value. + use_bn(bool): if set to True, then the model will use the normalization layers, otherwise, the model will use the linear layers. + bn(str): if set to 'bn' the model will use BatchNorm1d() for the hidden layers, otherwise, it will use InstanceNorm1d(). + + """ + def __init__(self, num_features: int = 147, hidden_size: int = 256, classification: bool = True, dropout: float = 0.2, use_bn: bool = False, bn: str = 'bn'): super(SimpleMLP, self).__init__() self.num_features = num_features @@ -79,29 +103,17 @@ def forward(self, x): class MLPTorchModel: """ - A class for managing MLP models. - - Static attributes: - predictors (list): List of predictors used for modeling. - to_predict (str): Target variable for modeling. - key_var (str): Key variable for modeling. + A class for managing MLP models. - Additionally, the class can be initialized with any number of keyword - arguments. These will be added as attributes to the class. + Static attributes: + predictors (list): List of predictors used for modeling. + to_predict (str): Target variable for modeling. + key_var (str): Key variable for modeling. - Methods: - train_model(df, **kwargs): - Trains the model using the provided dataframe. 
diff --git a/spare_scores/spare_scores.py b/spare_scores/spare_scores.py
index 9564df2..38e7b2a 100644
--- a/spare_scores/spare_scores.py
+++ b/spare_scores/spare_scores.py
@@ -61,7 +61,6 @@ def spare_train(
             None / error object if unsuccessful. 'status_code' is either 0, 1 or 2.
             0 is success, 1 is warning, 2 is error.
     """
-
     res = {'status_code': None, 'status': None, 'data': None}
 
     logger = logging_basic_config(verbose=verbose, filename=logs)
@@ -102,9 +101,8 @@ def spare_train(
         df, predictors, mdl_task = check_train(df,
                                                predictors,
                                                to_predict,
-                                               key_var,
-                                               pos_group,
-                                               verbose=verbose)
+                                               verbose,
+                                               pos_group)
     except Exception as e:
         err = "Dataset check failed before training was initiated."
         logger.error(err)
@@ -380,4 +378,4 @@ def spare_test(df: Union[pd.DataFrame, str],
     res['status'] = 'OK'
     res['data'] = out_df
     res['status_code'] = 0
-    return res
\ No newline at end of file
+    return res
diff --git a/spare_scores/util.py b/spare_scores/util.py
index 8203398..e05f55a 100644
--- a/spare_scores/util.py
+++ b/spare_scores/util.py
@@ -109,6 +109,15 @@ def load_examples(file_name: str=''):
     [logging.info(f'  - {a}') for a in list_mdl]
 
 def convert_to_number_if_possible(string):
+    """
+    Converts the input string to a float if possible.
+
+    Args:
+        string(str): the input string.
+    Returns:
+        float: the converted number, if conversion succeeds.
+        str: the original string, if conversion fails.
+    """
     try:
         number = float(string)  # Attempt to convert the string to a float
         return number
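A quick usage sketch of the helper documented above (behavior as described in its docstring: a float on success, the original string otherwise):

```python
from spare_scores.util import convert_to_number_if_possible

print(convert_to_number_if_possible("3.14"))  # 3.14 (float)
print(convert_to_number_if_possible("abc"))   # 'abc' (string, unchanged)
```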
diff --git a/tests/unit/test_data_prep.py b/tests/unit/test_data_prep.py
index 3c9688e..1102b6c 100644
--- a/tests/unit/test_data_prep.py
+++ b/tests/unit/test_data_prep.py
@@ -1,89 +1,93 @@
 import pandas as pd
+import unittest
+import sys
 
-from spare_scores.data_prep import check_test, check_train
+sys.path.append('../../spare_scores/')  # import from the source tree instead of the build while these updates are pending
+from data_prep import check_test, check_train  # once the updates are released, revert to `from spare_scores.data_prep import ...`
 
+class CheckDataPrep(unittest.TestCase):
 
-def test_check_train(df_fixture):
-    # Test case 1: Valid input dataframe and predictors
-    predictors = ['ROI1', 'ROI2', 'ROI3']
-    to_predict = 'Sex'
-    pos_group = 'M'
-    key_var = 'ID'
-    filtered_df, filtered_predictors, mdl_type = check_train(df_fixture,
-                                                             predictors,
-                                                             to_predict,
-                                                             key_var,
-                                                             pos_group=pos_group)
-    assert filtered_df.equals(df_fixture)  # Check if filtered dataframe is the same as the input dataframe
-    assert filtered_predictors == predictors  # Check if filtered predictors are the same as the input predictors
-    assert mdl_type == 'Classification'  # Check if the SPARE model type is correct
+    def test_check_train(self):
+        # Test case 1: Valid input dataframe and predictors
+        self.df_fixture = pd.read_csv("../fixtures/sample_data.csv")
+        predictors = ['ROI1', 'ROI2', 'ROI3']
+        to_predict = 'Sex'
+        pos_group = 'M'
+        key_var = 'ID'
+        filtered_df, filtered_predictors, mdl_type = check_train(self.df_fixture,
+                                                                 predictors,
+                                                                 to_predict,
+                                                                 pos_group=pos_group)
+        self.assertTrue(filtered_df.equals(self.df_fixture))  # Check if filtered dataframe is the same as the input dataframe
+        self.assertTrue(filtered_predictors == predictors)  # Check if filtered predictors are the same as the input predictors
+        self.assertTrue(mdl_type == 'Classification')  # Check if the SPARE model type is correct
 
-    # Test case 2: Missing required columns
-    df_missing_columns = pd.DataFrame({'ID': [1, 2, 3],
-                                       'Var1': [1, 2, 3],
-                                       'Var2': [4, 5, 6]})
-    predictors = ['Var1', 'Var2']
-    to_predict = 'ToPredict'
-    pos_group = '1'
-    res = check_train(df_missing_columns, predictors, to_predict, pos_group)
-    assert res == 'Variable to predict is not in the input dataframe.'
+        # Test case 2: Missing required columns
+        df_missing_columns = pd.DataFrame({'ID': [1, 2, 3],
+                                           'Var1': [1, 2, 3],
+                                           'Var2': [4, 5, 6]})
+        predictors = ['Var1', 'Var2']
+        to_predict = 'ToPredict'
+        pos_group = '1'
+        res = check_train(df_missing_columns, predictors, to_predict, pos_group=pos_group)
+        self.assertTrue(res == 'Variable to predict is not in the input dataframe.')
 
-    # Test case 3: Predictor not in input dataframe
-    df = pd.DataFrame({'ID': [1, 2, 3],
-                       'Age': [30, 40, 50],
-                       'Sex': ['M', 'F', 'M'],
-                       'Var1': [1, 2, 3]})
-    predictors = ['Var1', 'Var2']  # Var2 is not in the input dataframe
-    to_predict = 'ToPredict'
-    pos_group = '1'
-    res = check_train(df, predictors, to_predict, pos_group)
-    assert res == 'Not all predictors exist in the input dataframe.'
+        # Test case 3: Predictor not in input dataframe
+        df = pd.DataFrame({'ID': [1, 2, 3],
+                           'Age': [30, 40, 50],
+                           'Sex': ['M', 'F', 'M'],
+                           'Var1': [1, 2, 3]})
+        predictors = ['Var1', 'Var2']  # Var2 is not in the input dataframe
+        to_predict = 'ToPredict'
+        pos_group = '1'
+        res = check_train(df, predictors, to_predict, pos_group=pos_group)
+        self.assertTrue(res == 'Not all predictors exist in the input dataframe.')
 
-def test_check_test():
-    # Test case 1: Valid input dataframe and meta_data
-    df = pd.DataFrame({'ID': [1, 2, 3],
-                       'Age': [30, 40, 50],
-                       'Sex': ['M', 'F', 'M'],
-                       'Var1': [1, 2, 3],
-                       'Var2': [4, 5, 6]})
-    meta_data = {'predictors': ['Var1', 'Var2'],
-                 'cv_results': pd.DataFrame({'ID': [1, 2, 3, 4, 5],
-                                             'Age': [30, 40, 50, 60, 70]})}
+    def test_check_test(self):
+        # Test case 1: Valid input dataframe and meta_data
+        df = pd.DataFrame({'ID': [1, 2, 3],
+                           'Age': [30, 40, 50],
+                           'Sex': ['M', 'F', 'M'],
+                           'Var1': [1, 2, 3],
+                           'Var2': [4, 5, 6]})
+        meta_data = {'predictors': ['Var1', 'Var2'],
+                     'cv_results': pd.DataFrame({'ID': [1, 2, 3, 4, 5],
+                                                 'Age': [30, 40, 50, 60, 70]})}
 
-    res = check_test(df, meta_data)
-    assert res[1] is None  # Check if filtered dataframe is the same as the input dataframe
+        res = check_test(df, meta_data)
+        self.assertIsNone(res[1])  # The check passes and returns no error data
 
-    # Test case 2: Missing predictors in the input dataframe
-    df_missing_predictors = pd.DataFrame({'ID': [1, 2, 3],
-                                          'Age': [30, 40, 50],
-                                          'Sex': ['M', 'F', 'M'],
-                                          'Var1': [1, 2, 3]})
-    meta_data = {'predictors': ['Var1', 'Var2', 'Var3'],
-                 'cv_results': pd.DataFrame({'ID': [1, 2, 3, 4, 5],
-                                             'Age': [30, 40, 50, 60, 70]})}
-    res = check_test(df_missing_predictors, meta_data)
-    assert res[0] == "Not all predictors exist in the input dataframe: ['Var2', 'Var3']"
+        # Test case 2: Missing predictors in the input dataframe
+        df_missing_predictors = pd.DataFrame({'ID': [1, 2, 3],
+                                              'Age': [30, 40, 50],
+                                              'Sex': ['M', 'F', 'M'],
+                                              'Var1': [1, 2, 3]})
+        meta_data = {'predictors': ['Var1', 'Var2', 'Var3'],
+                     'cv_results': pd.DataFrame({'ID': [1, 2, 3, 4, 5],
+                                                 'Age': [30, 40, 50, 60, 70]})}
+        res = check_test(df_missing_predictors, meta_data)
+        self.assertTrue(res[0] == "Not all predictors exist in the input dataframe: ['Var2', 'Var3']")
 
-    # Test case 3: Passing check.
-    df_age_outside_range = pd.DataFrame({'ID': [1, 2, 3],
-                                         'Age': [20, 45, 55],
-                                         'Sex': ['M', 'F', 'M'],
-                                         'Var1': [1, 2, 3],
-                                         'Var2': [4, 5, 6]})
-    meta_data = {'predictors': ['Var1', 'Var2'],
-                 'cv_results': pd.DataFrame({'ID': [1, 2, 3, 4, 5],
-                                             'Age': [30, 40, 50, 60, 70]})}
-    res = check_test(df_age_outside_range, meta_data)
-    assert res[1] == None
+        # Test case 3: Passing check.
+        df_age_outside_range = pd.DataFrame({'ID': [1, 2, 3],
+                                             'Age': [20, 45, 55],
+                                             'Sex': ['M', 'F', 'M'],
+                                             'Var1': [1, 2, 3],
+                                             'Var2': [4, 5, 6]})
+        meta_data = {'predictors': ['Var1', 'Var2'],
+                     'cv_results': pd.DataFrame({'ID': [1, 2, 3, 4, 5],
+                                                 'Age': [30, 40, 50, 60, 70]})}
+        res = check_test(df_age_outside_range, meta_data)
+        self.assertIsNone(res[1])
 
-def test_smart_unique():
-    pass
+    def test_smart_unique(self):
+        pass
 
-def test_age_sex_match():
-    pass
+    def test_age_sex_match(self):
+        pass
 
-def test_logging_basic_config():
-    pass
+    def test_logging_basic_config(self):
+        pass
 
-def test_convert_cat_variables():
-    pass
\ No newline at end of file
+    def test_convert_cat_variables(self):
+        pass
diff --git a/tests/unit/test_spare_scores.py b/tests/unit/test_spare_scores.py
index 3dc09a8..3992d0e 100644
--- a/tests/unit/test_spare_scores.py
+++ b/tests/unit/test_spare_scores.py
@@ -1,64 +1,67 @@
-import re
 from pathlib import Path
-
+import unittest
 import numpy as np
 import pandas as pd
-import pytest
 
 from spare_scores.spare_scores import spare_test, spare_train
+from spare_scores.util import load_df, load_model
+
+class CheckSpareScores(unittest.TestCase):
+
+    def test_spare_test(self):
+        self.df_fixture = load_df("../fixtures/sample_data.csv")
+        self.model_fixture = load_model("../fixtures/sample_model.pkl.gz")
+
+        # Test case 1: Test with df
+        result = spare_test(self.df_fixture, self.model_fixture)
+        status_code, status, result = result['status_code'], result['status'], result['data']
+        self.assertTrue(status == 'OK')
+        self.assertTrue(isinstance(result, pd.DataFrame))
+        self.assertTrue(result.shape[0] == self.df_fixture.shape[0])
+        self.assertTrue('SPARE_score' in result.columns)  # Column name
+
+        # Test case 2: Test with csv file:
+        filepath = Path(__file__).resolve().parent.parent / 'fixtures' / 'sample_data.csv'
+        filepath = str(filepath)
+        result = spare_test(filepath, self.model_fixture)
+        status, result = result['status'], result['data']
+        self.assertTrue(status == 'OK')
+        self.assertTrue(isinstance(result, pd.DataFrame))
+        self.assertTrue(result.shape[0] == self.df_fixture.shape[0])
+        self.assertTrue('SPARE_score' in result.columns)  # Column name
+
+        # Test case 3: Column required by the model is missing
+        self.df_fixture.drop(columns='ROI1', inplace=True)
+        result = spare_test(self.df_fixture, self.model_fixture)
+        # {'status' : "Not all predictors exist in the input dataframe: ['ROI1']",
+        #  'data' : ['ROI1']}
+        status_code, status, result = result['status_code'], result['status'], result['data']
+        self.assertTrue(status == 'Not all predictors exist in the input dataframe: [\'ROI1\']')
+        self.assertTrue(result == ['ROI1'])
+
+    def test_spare_train(self):
+        self.df_fixture = load_df("../fixtures/sample_data.csv")
+        self.model_fixture = load_model("../fixtures/sample_model.pkl.gz")
+
+        # Test case 1: Test with df
+        result = spare_train(self.df_fixture,
+                             'Age',
+                             data_vars = ['ROI1', 'ROI2', 'ROI3', 'ROI4', 'ROI5',
+                                          'ROI6', 'ROI7', 'ROI8', 'ROI9', 'ROI10'],
+                             )
+
+        status, result = result['status'], result['data']
+
+        metadata = result[1]  # result is (model, metadata); metadata is unexpectedly None here
+        self.assertTrue(status == 'OK')
+        self.assertTrue(metadata['mdl_type'] == self.model_fixture[1]['mdl_type'])
+        self.assertTrue(metadata['kernel'] == self.model_fixture[1]['kernel'])
+        self.assertTrue(set(metadata['predictors']) == set(self.model_fixture[1]['predictors']))
+        self.assertTrue(metadata['to_predict'] == self.model_fixture[1]['to_predict'])
+        self.assertTrue(metadata['categorical_var_map'] == self.model_fixture[1]['categorical_var_map'])
 
-def test_spare_test(df_fixture, model_fixture):
-
-    # Test case 1: No arguments given:
-    with pytest.raises(TypeError):
-        spare_test()
-
-    # Test case 2: Test with df
-    result = spare_test(df_fixture, model_fixture)
-    status_code, status, result = result['status_code'], result['status'], result['data']
-    assert status == 'OK'
-    assert isinstance(result, pd.DataFrame)
-    assert result.shape[0] == df_fixture.shape[0]
-    assert 'SPARE_score' in result.columns  # Column name
-
-    # Test case 3: Test with csv file:
-    filepath = Path(__file__).resolve().parent.parent / 'fixtures' / 'sample_data.csv'
-    filepath = str(filepath)
-    result = spare_test(filepath, model_fixture)
-    status_code, status, result = result['status_code'], result['status'], result['data']
-    assert status == 'OK'
-    assert isinstance(result, pd.DataFrame)
-    assert result.shape[0] == df_fixture.shape[0]
-    assert 'SPARE_score' in result.columns  # Column name
-
-    # Test case 4: Column required by the model is missing
-    df_fixture.drop(columns='ROI1', inplace=True)
-    result = spare_test(df_fixture, model_fixture)
-    # {'status' : "Not all predictors exist in the input dataframe: ['ROI1']",
-    #  'data' : ['ROI1']}
-    status_code, status, result = result['status_code'], result['status'], result['data']
-    assert status == 'Not all predictors exist in the input dataframe: [\'ROI1\']'
-    assert result == ['ROI1']
-
-
-def test_spare_train(df_fixture, model_fixture):
-
-    # Test case 1: No arguments given:
-    with pytest.raises(TypeError):
-        spare_train()
-
-    # Test case 2: Test with df
-    result = spare_train(df_fixture,
-                         'Age',
-                         data_vars = ['ROI1', 'ROI2', 'ROI3', 'ROI4', 'ROI5',
-                                      'ROI6', 'ROI7', 'ROI8', 'ROI9', 'ROI10'],
-                         )
-    status_code, status, result = result['status_code'], result['status'], result['data']
-    model, metadata = result[0], result[1]
-    assert status == 'OK'
-    assert metadata['mdl_type'] == model_fixture[1]['mdl_type']
-    assert metadata['kernel'] == model_fixture[1]['kernel']
-    assert set(metadata['predictors']) == set(model_fixture[1]['predictors'])
-    assert metadata['to_predict'] == model_fixture[1]['to_predict']
-    assert metadata['categorical_var_map'] == model_fixture[1]['categorical_var_map']
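Both spare_train and spare_test return the same result envelope, as exercised by the tests above; a minimal sketch of unpacking it (field names and the 0/1/2 status codes come from the spare_train docstring, and the removed pytest test unpacked `data` as a (model, metadata) pair):

```python
from spare_scores.spare_scores import spare_train
from spare_scores.util import load_df

df = load_df("../fixtures/sample_data.csv")  # same fixture the tests use
result = spare_train(df, 'Age',
                     data_vars=['ROI1', 'ROI2', 'ROI3', 'ROI4', 'ROI5',
                                'ROI6', 'ROI7', 'ROI8', 'ROI9', 'ROI10'])
if result['status_code'] == 0:          # 0 success, 1 warning, 2 error
    model, metadata = result['data']    # training returns (model, metadata)
    print(metadata['mdl_type'], metadata['predictors'])
else:
    print('training failed:', result['status'])
```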
diff --git a/tests/unit/test_util.py b/tests/unit/test_util.py
index ab4c8cc..02a8666 100644
--- a/tests/unit/test_util.py
+++ b/tests/unit/test_util.py
@@ -1,113 +1,110 @@
 import re
 from pathlib import Path
-
+import unittest
 import numpy as np
 import pandas as pd
-import pytest
 
 from spare_scores.util import (add_file_extension, check_file_exists,
                                expspace, is_unique_identifier, load_df,
                                load_examples, load_model, save_file)
 
-
-def test_load_model(model_fixture):
-
-    # Test case 1: No arguments given:
"load_model() missing 1 required positional " + \ - "argument: 'mdl_path'" - with pytest.raises(TypeError, match=re.escape(no_args)): - load_model() - - # Test case 2: Load a model - filepath = Path(__file__).resolve().parent.parent / 'fixtures' / 'sample_model.pkl.gz' - filepath = str(filepath) - result = load_model(filepath) - assert result[1]['mdl_type'] == model_fixture[1]['mdl_type'] - assert result[1]['kernel'] == model_fixture[1]['kernel'] - assert result[1]['predictors'] == model_fixture[1]['predictors'] - assert result[1]['to_predict'] == model_fixture[1]['to_predict'] - assert result[1]['categorical_var_map'] == model_fixture[1]['categorical_var_map'] - -def test_expspace(): - # Test case 1: span = [0, 2] - span = [0, 2] - expected_result = np.array([1., 2.71828183, 7.3890561]) - assert np.allclose(expspace(span), expected_result) - - # Test case 2: span = [1, 5] - span = [1, 5] - expected_result = np.array([ 2.71828183, 7.3890561, 20.08553692, 54.59815003, 148.4131591]) - assert np.allclose(expspace(span), expected_result) - - # Test case 3: span = [-2, 1] - span = [-2, 1] - expected_result = np.array([0.13533528, 0.36787944, 1., 2.71828183]) - assert np.allclose(expspace(span), expected_result) - -def test_check_file_exists(): - pass - -def test_save_file(): - pass - -def test_is_unique_identifier(): - pass - -def test_load_model(): - pass - -def test_load_examples(): - pass - -def test_load_df(): - # Test case 1: Input is a string (CSV file path) - filepath = Path(__file__).resolve().parent.parent / 'fixtures' / 'sample_data.csv' - filepath = str(filepath) - expected_df = pd.read_csv(filepath, low_memory=False) - assert load_df(filepath).equals(expected_df) - - # Test case 2: Input is already a DataFrame - input_df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - expected_df = input_df.copy() - assert load_df(input_df).equals(expected_df) - - # Test case 3: Empty DataFrame - input_df = pd.DataFrame() - expected_df = input_df.copy() - assert load_df(input_df).equals(expected_df) - - # Test case 4: Large DataFrame - input_df = pd.DataFrame({"A": range(100000), "B": range(100000)}) - expected_df = input_df.copy() - assert load_df(input_df).equals(expected_df) - -def test_add_file_extension(): - # Test case 1: File extension already present - filename = "myfile.txt" - extension = ".txt" - assert add_file_extension(filename, extension) == "myfile.txt" - - # Test case 2: File extension not present - filename = "myfile" - extension = ".txt" - assert add_file_extension(filename, extension) == "myfile.txt" - - # Test case 3: Different extension - filename = "document" - extension = ".docx" - assert add_file_extension(filename, extension) == "document.docx" - - # Test case 4: Empty filename - filename = "" - extension = ".txt" - assert add_file_extension(filename, extension) == ".txt" - - # Test case 5: Empty extension - filename = "myfile" - extension = "" - assert add_file_extension(filename, extension) == "myfile" - - # Test case 6: Multiple extension dots in filename - filename = "file.tar.gz" - extension = ".gz" - assert add_file_extension(filename, extension) == "file.tar.gz" \ No newline at end of file +class CheckSpareScoresUtil(unittest.TestCase): + def test_load_model(self): + self.model_fixture = "../fixture/sample_model.pkl.gz" + # Test case 1: No arguments given: + no_args = "load_model() missing 1 required positional " + \ + "argument: 'mdl_path'" + + # Test case 2: Load a model + filepath = Path(__file__).resolve().parent.parent / 'fixtures' / 'sample_model.pkl.gz' + 
+        filepath = str(filepath)
+        result = load_model(filepath)
+        self.assertTrue(result[1]['mdl_type'] == self.model_fixture[1]['mdl_type'])
+        self.assertTrue(result[1]['kernel'] == self.model_fixture[1]['kernel'])
+        self.assertTrue(result[1]['predictors'] == self.model_fixture[1]['predictors'])
+        self.assertTrue(result[1]['to_predict'] == self.model_fixture[1]['to_predict'])
+        self.assertTrue(result[1]['categorical_var_map'] == self.model_fixture[1]['categorical_var_map'])
+
+    def test_expspace(self):
+        # Test case 1: span = [0, 2]
+        span = [0, 2]
+        expected_result = np.array([1., 2.71828183, 7.3890561])
+        self.assertTrue(np.allclose(expspace(span), expected_result))
+
+        # Test case 2: span = [1, 5]
+        span = [1, 5]
+        expected_result = np.array([ 2.71828183, 7.3890561, 20.08553692, 54.59815003, 148.4131591])
+        self.assertTrue(np.allclose(expspace(span), expected_result))
+
+        # Test case 3: span = [-2, 1]
+        span = [-2, 1]
+        expected_result = np.array([0.13533528, 0.36787944, 1., 2.71828183])
+        self.assertTrue(np.allclose(expspace(span), expected_result))
+
+    def test_check_file_exists(self):
+        pass
+
+    def test_save_file(self):
+        pass
+
+    def test_is_unique_identifier(self):
+        pass
+
+    def test_load_examples(self):
+        pass
+
+    def test_load_df(self):
+        # Test case 1: Input is a string (CSV file path)
+        filepath = Path(__file__).resolve().parent.parent / 'fixtures' / 'sample_data.csv'
+        filepath = str(filepath)
+        expected_df = pd.read_csv(filepath, low_memory=False)
+        self.assertTrue(load_df(filepath).equals(expected_df))
+
+        # Test case 2: Input is already a DataFrame
+        input_df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        expected_df = input_df.copy()
+        self.assertTrue(load_df(input_df).equals(expected_df))
+
+        # Test case 3: Empty DataFrame
+        input_df = pd.DataFrame()
+        expected_df = input_df.copy()
+        self.assertTrue(load_df(input_df).equals(expected_df))
+
+        # Test case 4: Large DataFrame
+        input_df = pd.DataFrame({"A": range(100000), "B": range(100000)})
+        expected_df = input_df.copy()
+        self.assertTrue(load_df(input_df).equals(expected_df))
+
+    def test_add_file_extension(self):
+        # Test case 1: File extension already present
+        filename = "myfile.txt"
+        extension = ".txt"
+        self.assertTrue(add_file_extension(filename, extension) == "myfile.txt")
+
+        # Test case 2: File extension not present
+        filename = "myfile"
+        extension = ".txt"
+        self.assertTrue(add_file_extension(filename, extension) == "myfile.txt")
+
+        # Test case 3: Different extension
+        filename = "document"
+        extension = ".docx"
+        self.assertTrue(add_file_extension(filename, extension) == "document.docx")
+
+        # Test case 4: Empty filename
+        filename = ""
+        extension = ".txt"
+        self.assertTrue(add_file_extension(filename, extension) == ".txt")
+
+        # Test case 5: Empty extension
+        filename = "myfile"
+        extension = ""
+        self.assertTrue(add_file_extension(filename, extension) == "myfile")
+
+        # Test case 6: Multiple extension dots in filename
+        filename = "file.tar.gz"
+        extension = ".gz"
+        self.assertTrue(add_file_extension(filename, extension) == "file.tar.gz")
\ No newline at end of file
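Since the rewritten tests rely on external discovery, a main guard also makes each file directly runnable; a minimal sketch that could be appended to any of the test modules (hypothetical addition, not part of the diff):

```python
# Allows e.g. `python tests/unit/test_util.py`
# in addition to `python -m unittest discover`.
if __name__ == '__main__':
    import unittest
    unittest.main()
```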