From a302d910a9a021508a243e70d8f8896db1dbe1a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ot=C3=A1vio=20Vasques?= Date: Wed, 8 Nov 2023 18:41:55 -0300 Subject: [PATCH] Remove the scikit learn restriction and bump minimal python version to 3.8 (#233) * Remove the scikit learn restriction * Set minimal version to 3.7 * Fix linter and swap the type check in the metalearners * Replace boston dataset by california * rollback the type check change, linter will break * Remove list accessor of the california dataset * Reformat imports * Change acessors * Fix feature name * Remove trailing space * Put the correct test value * Change test value * Change test value * Fix test pd extractors * Fix transformation * Fix type annotations * Lint fix * Lint fix * Put the correct version * Add changelog * Bump lightgbm * Add upper limits to deps * Bump major * Increase major constraint of pandas * Remove upper limitation on xgboost * Remove silent unused keyword * Update a few types * Lint fix * Add typing extensions for python 3.7 support * trick to avoid type checking for lists * Fix classification tests * Try to replace ndarrays by numpy typing NDArrays * Change back ndarray * Reduce type list * Add one more type * Add other types * Remove all other types * Try to use numpy typing * Drop python 3.7 support * Swap utils by testing in pandas assertion functions * In order to support pandas 2 it is required to bump xgboost up to version 2 * Fix xgboost dmatrix tests * Fix rank categorical * Solve pd extractors test * Fix hash eval test * Fix lookup in ensemble learner * Add type annotation to the new functions * Create conditional assertions based on python version * Remove necessity for typing extension and fix hash values * Lint fix * Fix mypi for multiclass classification for lgbm classifier * Bump catboost and joblib * Bump pytest * Bump coverage packages * Bump xdist * Bump mypy * Bump hypothesis * Rollback coverage bumps * Update changelog * Update changelog * Change hash test to match exactly 8 minor version --- .github/workflows/push.yaml | 2 +- CHANGELOG.md | 6 + requirements.txt | 12 +- requirements_catboost.txt | 2 +- requirements_lgbm.txt | 2 +- requirements_test.txt | 8 +- requirements_tools.txt | 4 +- requirements_xgboost.txt | 2 +- setup.py | 4 +- src/fklearn/resources/VERSION | 2 +- src/fklearn/training/classification.py | 194 +++++++++++------- src/fklearn/training/ensemble.py | 10 +- src/fklearn/training/regression.py | 187 +++++++++-------- src/fklearn/training/transformation.py | 23 ++- src/fklearn/tuning/selectors.py | 28 ++- src/fklearn/types/types.py | 4 +- .../cate_learning/test_meta_learners.py | 31 ++- tests/metrics/test_pd_extractors.py | 24 +-- tests/training/test_calibration.py | 2 +- tests/training/test_classification.py | 2 +- tests/training/test_pipeline.py | 10 +- tests/validation/test_evaluators.py | 17 +- 22 files changed, 348 insertions(+), 228 deletions(-) diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml index be99952f..acfaf886 100644 --- a/.github/workflows/push.yaml +++ b/.github/workflows/push.yaml @@ -50,7 +50,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: ["3.6", "3.7", "3.8", "3.9"] + python-version: ["3.8", "3.9"] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 58d75737..a467db27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## [3.0.0] - 2023-11-08 +- **Enhancement** + - Remove support for python 3.6 and 3.7. 
+ - Bumps in joblib, numpy, pandas, scikit-learn, statsmodels, toolz, catboost, lightgbm, shap, xgboost + and test auxiliary packages. + ## [2.3.1] - 2023-04-11 - **Bugfix** - Remove incorrect `lightgbm` import from common paths diff --git a/requirements.txt b/requirements.txt index b33f771c..29ad3628 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -joblib>=0.13.2,<2 -numpy>=1.16.4,<2 -pandas>=0.24.1,<2 -scikit-learn>=0.21.2,<0.25.0 -statsmodels>=0.9.0,<1 -toolz>=0.9.0,<1 +joblib>=1.3.2,<2 +numpy>=1.24.4,<2 +pandas>=2,<3 +scikit-learn>=1,<2 +statsmodels>=0.14.0,<1 +toolz>=0.12.0,<1 diff --git a/requirements_catboost.txt b/requirements_catboost.txt index 79257b11..48319690 100644 --- a/requirements_catboost.txt +++ b/requirements_catboost.txt @@ -1 +1 @@ -catboost>=0.14.2,<2 +catboost>=1.2.2,<2 diff --git a/requirements_lgbm.txt b/requirements_lgbm.txt index 89ea48ba..eb2520df 100644 --- a/requirements_lgbm.txt +++ b/requirements_lgbm.txt @@ -1 +1 @@ -lightgbm>=2.2.2,<4 +lightgbm>=4,<5 diff --git a/requirements_test.txt b/requirements_test.txt index 5a351e2e..83968359 100644 --- a/requirements_test.txt +++ b/requirements_test.txt @@ -1,7 +1,7 @@ -pytest>=4.2.1,<7 +pytest>=7.4.3,<8 pytest-cov>=2.6.1,<3 -pytest-xdist>=1.26.1,<3 -mypy>=0.670,<1 +pytest-xdist>=3.3.1,<4 +mypy>=1.6.1,<2 coverage<5 codecov>=2.0,<3 -hypothesis>=5.5.4,<7 +hypothesis>=6.88.3,<7 diff --git a/requirements_tools.txt b/requirements_tools.txt index 8792381a..cc23f836 100644 --- a/requirements_tools.txt +++ b/requirements_tools.txt @@ -1,2 +1,2 @@ -shap>=0.31.0,<=0.40 -swifter>=0.284,<2 +shap>=0.43,<1 +swifter>=0.24,<2 diff --git a/requirements_xgboost.txt b/requirements_xgboost.txt index 0254ec0c..f72dd67f 100644 --- a/requirements_xgboost.txt +++ b/requirements_xgboost.txt @@ -1 +1 @@ -xgboost>=0.81,<1.5 +xgboost>=2,<3 diff --git a/setup.py b/setup.py index 065ec867..79f44d95 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def requirements_from_pip(filename='requirements.txt'): long_description=long_description, long_description_content_type="text/markdown", url='https://github.com/nubank/{:s}'.format(REPO_NAME), - python_requires='>=3.6.2,<3.10', + python_requires='>=3.8,<3.10', author="Nubank", package_dir={'': 'src'}, packages=find_packages('src'), @@ -52,8 +52,6 @@ def requirements_from_pip(filename='requirements.txt'): include_package_data=True, zip_safe=False, classifiers=[ - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9' ]) diff --git a/src/fklearn/resources/VERSION b/src/fklearn/resources/VERSION index 2bf1c1cc..4a36342f 100644 --- a/src/fklearn/resources/VERSION +++ b/src/fklearn/resources/VERSION @@ -1 +1 @@ -2.3.1 +3.0.0 diff --git a/src/fklearn/training/classification.py b/src/fklearn/training/classification.py index a27eaa28..75feabad 100644 --- a/src/fklearn/training/classification.py +++ b/src/fklearn/training/classification.py @@ -1,6 +1,7 @@ -from typing import List, Any, Optional, Callable, Tuple, Union, TYPE_CHECKING +from typing import List, Any, Optional, Callable, Tuple, Union, TYPE_CHECKING, Literal import numpy as np +import numpy.typing as npt import pandas as pd from pathlib import Path from toolz import curry, merge, assoc @@ -8,7 +9,7 @@ from sklearn.linear_model import LogisticRegression from sklearn import __version__ as sk_version -from fklearn.types import LearnerReturnType, LogType +from fklearn.types import LearnerReturnType, LearnerLogType, 
LogType from fklearn.common_docstrings import learner_return_docstring, learner_pred_fn_docstring from fklearn.training.utils import log_learner_time, expand_features_encoded @@ -83,16 +84,19 @@ def p(new_df: pd.DataFrame) -> pd.DataFrame: p.__doc__ = learner_pred_fn_docstring("logistic_classification_learner") - log = {'logistic_classification_learner': { - 'features': features, - 'target': target, - 'parameters': merged_params, - 'prediction_column': prediction_column, - 'package': "sklearn", - 'package_version': sk_version, - 'feature_importance': dict(zip(features, clf.coef_.flatten())), - 'training_samples': len(df)}, - 'object': clf} + log = { + 'logistic_classification_learner': { + 'features': features, + 'target': target, + 'parameters': merged_params, + 'prediction_column': prediction_column, + 'package': "sklearn", + 'package_version': sk_version, + 'feature_importance': dict(zip(features, clf.coef_.flatten())), + 'training_samples': len(df) + }, + 'object': clf + } return p, p(df), log @@ -174,13 +178,21 @@ def xgb_classification_learner(df: pd.DataFrame, features = features if not encode_extra_cols else expand_features_encoded(df, features) - dtrain = xgb.DMatrix(df[features].values, label=df[target].values, feature_names=map(str, features), weight=weights) + dtrain = xgb.DMatrix( + df[features].values, + label=df[target].values, + feature_names=list(map(str, features)), + weight=weights + ) bst = xgb.train(params, dtrain, num_estimators) def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame: - dtest = xgb.DMatrix(new_df[features].values, feature_names=map(str, features)) + dtest = xgb.DMatrix( + new_df[features].values, + feature_names=list(map(str, features)) + ) pred = bst.predict(dtest) if params["objective"] == "multi:softprob": @@ -218,16 +230,19 @@ def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame: p.__doc__ = learner_pred_fn_docstring("xgb_classification_learner", shap=True) - log = {'xgb_classification_learner': { - 'features': features, - 'target': target, - 'prediction_column': prediction_column, - 'package': "xgboost", - 'package_version': xgb.__version__, - 'parameters': assoc(params, "num_estimators", num_estimators), - 'feature_importance': bst.get_score(), - 'training_samples': len(df)}, - 'object': bst} + log = { + 'xgb_classification_learner': { + 'features': features, + 'target': target, + 'prediction_column': prediction_column, + 'package': "xgboost", + 'package_version': xgb.__version__, + 'parameters': assoc(params, "num_estimators", num_estimators), + 'feature_importance': bst.get_score(), + 'training_samples': len(df) + }, + 'object': bst + } return p, p(df), log @@ -393,16 +408,19 @@ def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame: p.__doc__ = learner_pred_fn_docstring("catboost_classification_learner", shap=True) - log = {'catboost_classification_learner': { - 'features': features, - 'target': target, - 'prediction_column': prediction_column, - 'package': "catboost", - 'package_version': catboost.__version__, - 'parameters': assoc(params, "num_estimators", num_estimators), - 'feature_importance': cbr.feature_importances_, - 'training_samples': len(df)}, - 'object': cbr} + log = { + 'catboost_classification_learner': { + 'features': features, + 'target': target, + 'prediction_column': prediction_column, + 'package': "catboost", + 'package_version': catboost.__version__, + 'parameters': assoc(params, "num_estimators", num_estimators), + 'feature_importance': cbr.feature_importances_, + 
'training_samples': len(df) + }, + 'object': cbr + } return p, p(df), log @@ -501,29 +519,34 @@ def p(new_df: pd.DataFrame) -> pd.DataFrame: @curry @log_learner_time(learner_name='lgbm_classification_learner') -def lgbm_classification_learner(df: pd.DataFrame, - features: List[str], - target: str, - learning_rate: float = 0.1, - num_estimators: int = 100, - extra_params: Optional[LogType] = None, - prediction_column: str = "prediction", - weight_column: Optional[str] = None, - encode_extra_cols: bool = True, - valid_sets: Optional[List[pd.DataFrame]] = None, - valid_names: Optional[List[str]] = None, - feval: Optional[Union[ - Callable[[np.ndarray, pd.DataFrame], Tuple[str, float, bool]], - List[Callable[[np.ndarray, pd.DataFrame], Tuple[str, float, bool]]]] - ] = None, - init_model: Optional[Union[str, Path, 'Booster']] = None, - feature_name: Union[List[str], str] = 'auto', - categorical_feature: Union[List[str], List[int], str] = 'auto', - keep_training_booster: bool = False, - callbacks: Optional[List[Callable]] = None, - dataset_init_score: Optional[Union[ - List, List[List], np.ndarray, pd.Series, pd.DataFrame] - ] = None) -> LearnerReturnType: +def lgbm_classification_learner( + df: pd.DataFrame, + features: List[str], + target: str, + learning_rate: float = 0.1, + num_estimators: int = 100, + extra_params: Optional[LogType] = None, + prediction_column: str = "prediction", + weight_column: Optional[str] = None, + encode_extra_cols: bool = True, + valid_sets: Optional[List[pd.DataFrame]] = None, + valid_names: Optional[List[str]] = None, + feval: Optional[Union[ + Union[Callable[[npt.NDArray, Any], Tuple[str, float, bool]], + Callable[[npt.NDArray, Any], List[Tuple[str, float, bool]]]], + List[Union[Callable[[npt.NDArray, Any], + Tuple[str, float, bool]], + Callable[[npt.NDArray, Any], + List[Tuple[str, float, bool]]]]], + None + ]] = None, + init_model: Optional[Union[str, Path, 'Booster']] = None, + feature_name: Union[List[str], Literal['auto']] = 'auto', + categorical_feature: Union[List[str], List[int], Literal['auto']] = 'auto', + keep_training_booster: bool = False, + callbacks: Optional[List[Callable]] = None, + dataset_init_score: Optional[Union[List, List[List], npt.NDArray, pd.Series, pd.DataFrame]] = None +) -> LearnerReturnType: """ Fits an LGBM classifier to the dataset. 
@@ -632,20 +655,37 @@ def lgbm_classification_learner(df: pd.DataFrame, features = features if not encode_extra_cols else expand_features_encoded(df, features) - dtrain = lgbm.Dataset(df[features].values, label=df[target], feature_name=list(map(str, features)), weight=weights, - silent=True, init_score=dataset_init_score) - - bst = lgbm.train(params=params, train_set=dtrain, num_boost_round=num_estimators, valid_sets=valid_sets, - valid_names=valid_names, feval=feval, init_model=init_model, feature_name=feature_name, - categorical_feature=categorical_feature, keep_training_booster=keep_training_booster, - callbacks=callbacks) + dtrain = lgbm.Dataset( + df[features].values, + label=df[target], + feature_name=list(map(str, features)), + weight=weights, + init_score=dataset_init_score + ) + + bst = lgbm.train( + params=params, + train_set=dtrain, + num_boost_round=num_estimators, + valid_sets=valid_sets, + valid_names=valid_names, + feval=feval, + init_model=init_model, + feature_name=feature_name, + categorical_feature=categorical_feature, + keep_training_booster=keep_training_booster, + callbacks=callbacks + ) def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame: + predictions = bst.predict(new_df[features].values) + if isinstance(predictions, list): + predictions = np.array(predictions) if is_multiclass_classification: col_dict = {prediction_column + "_" + str(key): value - for (key, value) in enumerate(bst.predict(new_df[features].values).T)} + for (key, value) in enumerate(predictions.T)} else: - col_dict = {prediction_column: bst.predict(new_df[features].values)} + col_dict = {prediction_column: predictions} if apply_shap: import shap @@ -675,16 +715,18 @@ def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame: p.__doc__ = learner_pred_fn_docstring("lgbm_classification_learner", shap=True) - log = {'lgbm_classification_learner': { - 'features': features, - 'target': target, - 'prediction_column': prediction_column, - 'package': "lightgbm", - 'package_version': lgbm.__version__, - 'parameters': assoc(params, "num_estimators", num_estimators), - 'feature_importance': dict(zip(features, bst.feature_importance().tolist())), - 'training_samples': len(df)}, - 'object': bst} + log: LearnerLogType = { + 'lgbm_classification_learner': { + 'features': features, + 'target': target, + 'prediction_column': prediction_column, + 'package': "lightgbm", + 'package_version': lgbm.__version__, + 'parameters': assoc(params, "num_estimators", num_estimators), + 'feature_importance': dict(zip(features, bst.feature_importance().tolist())), + 'training_samples': len(df)}, + 'object': bst + } return p, p(df), log diff --git a/src/fklearn/training/ensemble.py b/src/fklearn/training/ensemble.py index 1c265505..fec94b84 100644 --- a/src/fklearn/training/ensemble.py +++ b/src/fklearn/training/ensemble.py @@ -1,5 +1,7 @@ from typing import Any, Dict, List, TypeVar +import numpy as np +import numpy.typing as npt import pandas as pd from toolz import curry, assoc, compose @@ -136,10 +138,14 @@ def xgb_octopus_classification_learner(train_set: pd.DataFrame, def p(df: pd.DataFrame) -> pd.DataFrame: pred_fn = compose(*pred_fns.values()) + def lookup(df: pd.DataFrame) -> npt.NDArray: + idx, cols = pd.factorize(df.pred_bin.values.squeeze()) + output = df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx] + return output + return (pred_fn(df) .assign(pred_bin=prediction_column + "_bin_" + df[train_split_col].astype(str)) - .assign(prediction=lambda d: d.lookup(d.index.values, -
d.pred_bin.values.squeeze())) + .assign(prediction=lookup) .rename(index=str, columns={"prediction": prediction_column}) .drop("pred_bin", axis=1)) diff --git a/src/fklearn/training/regression.py b/src/fklearn/training/regression.py index 9131abfb..2820a83f 100644 --- a/src/fklearn/training/regression.py +++ b/src/fklearn/training/regression.py @@ -70,16 +70,19 @@ def p(new_df: pd.DataFrame) -> pd.DataFrame: p.__doc__ = learner_pred_fn_docstring("linear_regression_learner") - log = {'linear_regression_learner': { - 'features': features, - 'target': target, - 'parameters': params, - 'prediction_column': prediction_column, - 'package': "sklearn", - 'package_version': sk_version, - 'feature_importance': dict(zip(features, regr.coef_.flatten())), - 'training_samples': len(df)}, - 'object': regr} + log = { + 'linear_regression_learner': { + 'features': features, + 'target': target, + 'parameters': params, + 'prediction_column': prediction_column, + 'package': "sklearn", + 'package_version': sk_version, + 'feature_importance': dict(zip(features, regr.coef_.flatten())), + 'training_samples': len(df) + }, + 'object': regr + } return p, p(df), log @@ -159,12 +162,20 @@ def xgb_regression_learner(df: pd.DataFrame, features = features if not encode_extra_cols else expand_features_encoded(df, features) - dtrain = xgb.DMatrix(df[features].values, label=df[target].values, weight=weights, feature_names=map(str, features)) + dtrain = xgb.DMatrix( + df[features].values, + label=df[target].values, + weight=weights, + feature_names=list(map(str, features)) + ) bst = xgb.train(params, dtrain, num_estimators) def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame: - dtest = xgb.DMatrix(new_df[features].values, feature_names=map(str, features)) + dtest = xgb.DMatrix( + new_df[features].values, + feature_names=list(map(str, features)) + ) col_dict = {prediction_column: bst.predict(dtest)} if apply_shap: @@ -182,16 +193,19 @@ def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame: p.__doc__ = learner_pred_fn_docstring("xgb_regression_learner", shap=True) - log = {'xgb_regression_learner': { - 'features': features, - 'target': target, - 'prediction_column': prediction_column, - 'package': "xgboost", - 'package_version': xgb.__version__, - 'parameters': assoc(params, "num_estimators", num_estimators), - 'feature_importance': bst.get_score(), - 'training_samples': len(df)}, - 'object': bst} + log = { + 'xgb_regression_learner': { + 'features': features, + 'target': target, + 'prediction_column': prediction_column, + 'package': "xgboost", + 'package_version': xgb.__version__, + 'parameters': assoc(params, "num_estimators", num_estimators), + 'feature_importance': bst.get_score(), + 'training_samples': len(df) + }, + 'object': bst + } return p, p(df), log @@ -287,16 +301,19 @@ def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame: p.__doc__ = learner_pred_fn_docstring("CatBoostRegressor", shap=False) - log = {'catboost_regression_learner': { - 'features': features, - 'target': target, - 'prediction_column': prediction_column, - 'package': "catboost", - 'package_version': catboost.__version__, - 'parameters': assoc(params, "num_estimators", num_estimators), - 'feature_importance': cbr.feature_importances_, - 'training_samples': len(df)}, - 'object': cbr} + log = { + 'catboost_regression_learner': { + 'features': features, + 'target': target, + 'prediction_column': prediction_column, + 'package': "catboost", + 'package_version': catboost.__version__, + 'parameters': 
assoc(params, "num_estimators", num_estimators), + 'feature_importance': cbr.feature_importances_, + 'training_samples': len(df) + }, + 'object': cbr + } return p, p(df), log @@ -387,16 +404,18 @@ def p(new_df: pd.DataFrame) -> pd.DataFrame: p.__doc__ = learner_pred_fn_docstring("gp_regression_learner") - log = {'gp_regression_learner': { - 'features': features, - 'target': target, - 'parameters': merge(params, {'extra_variance': extra_variance, - 'return_std': return_std}), - 'prediction_column': prediction_column, - 'package': "sklearn", - 'package_version': sk_version, - 'training_samples': len(df)}, - 'object': gp} + log = { + 'gp_regression_learner': { + 'features': features, + 'target': target, + 'parameters': merge(params, {'extra_variance': extra_variance, + 'return_std': return_std}), + 'prediction_column': prediction_column, + 'package': "sklearn", + 'package_version': sk_version, + 'training_samples': len(df)}, + 'object': gp + } return p, p(df), log @@ -406,15 +425,17 @@ def p(new_df: pd.DataFrame) -> pd.DataFrame: @curry @log_learner_time(learner_name='lgbm_regression_learner') -def lgbm_regression_learner(df: pd.DataFrame, - features: List[str], - target: str, - learning_rate: float = 0.1, - num_estimators: int = 100, - extra_params: Dict[str, Any] = None, - prediction_column: str = "prediction", - weight_column: str = None, - encode_extra_cols: bool = True) -> LearnerReturnType: +def lgbm_regression_learner( + df: pd.DataFrame, + features: List[str], + target: str, + learning_rate: float = 0.1, + num_estimators: int = 100, + extra_params: Dict[str, Any] = None, + prediction_column: str = "prediction", + weight_column: str = None, + encode_extra_cols: bool = True +) -> LearnerReturnType: """ Fits an LGBM regressor to the dataset. @@ -478,8 +499,7 @@ def lgbm_regression_learner(df: pd.DataFrame, features = features if not encode_extra_cols else expand_features_encoded(df, features) - dtrain = lgbm.Dataset(df[features].values, label=df[target], feature_name=list(map(str, features)), weight=weights, - silent=True) + dtrain = lgbm.Dataset(df[features].values, label=df[target], feature_name=list(map(str, features)), weight=weights) bst = lgbm.train(params, dtrain, num_estimators) @@ -501,16 +521,18 @@ def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame: p.__doc__ = learner_pred_fn_docstring("lgbm_regression_learner", shap=True) - log = {'lgbm_regression_learner': { - 'features': features, - 'target': target, - 'prediction_column': prediction_column, - 'package': "lightgbm", - 'package_version': lgbm.__version__, - 'parameters': assoc(params, "num_estimators", num_estimators), - 'feature_importance': dict(zip(features, bst.feature_importance().tolist())), - 'training_samples': len(df)}, - 'object': bst} + log = { + 'lgbm_regression_learner': { + 'features': features, + 'target': target, + 'prediction_column': prediction_column, + 'package': "lightgbm", + 'package_version': lgbm.__version__, + 'parameters': assoc(params, "num_estimators", num_estimators), + 'feature_importance': dict(zip(features, bst.feature_importance().tolist())), + 'training_samples': len(df)}, + 'object': bst + } return p, p(df), log @@ -520,13 +542,15 @@ def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame: @curry @log_learner_time(learner_name='custom_supervised_model_learner') -def custom_supervised_model_learner(df: pd.DataFrame, - features: List[str], - target: str, - model: Any, - supervised_type: str, - log: Dict[str, Dict], - prediction_column: str = "prediction") -> 
LearnerReturnType: +def custom_supervised_model_learner( + df: pd.DataFrame, + features: List[str], + target: str, + model: Any, + supervised_type: str, + log: Dict[str, Dict], + prediction_column: str = "prediction" +) -> LearnerReturnType: """ Fits a custom model to the dataset. Return the predict function, the predictions for the input dataset and a log describing the model. @@ -658,16 +682,19 @@ def p(new_df: pd.DataFrame) -> pd.DataFrame: p.__doc__ = learner_pred_fn_docstring("elasticnet_regression_learner") - log = {'elasticnet_regression_learner': { - 'features': features, - 'target': target, - 'parameters': params, - 'prediction_column': prediction_column, - 'package': "sklearn", - 'package_version': sk_version, - 'feature_importance': dict(zip(features, regr.coef_.flatten())), - 'training_samples': len(df)}, - 'object': regr} + log = { + 'elasticnet_regression_learner': { + 'features': features, + 'target': target, + 'parameters': params, + 'prediction_column': prediction_column, + 'package': "sklearn", + 'package_version': sk_version, + 'feature_importance': dict(zip(features, regr.coef_.flatten())), + 'training_samples': len(df) + }, + 'object': regr + } return p, p(df), log diff --git a/src/fklearn/training/transformation.py b/src/fklearn/training/transformation.py index 7243c57a..1a117fae 100644 --- a/src/fklearn/training/transformation.py +++ b/src/fklearn/training/transformation.py @@ -500,12 +500,14 @@ def rank_categorical(df: pd.DataFrame, Whether to store the feature value -> integer dictionary in the log """ - col_categ_getter = lambda col: (df[col] - .value_counts() - .reset_index() - .sort_values([col, "index"], ascending=[False, True]) - .set_index("index")[col] - .rank(method="first", ascending=False).to_dict()) + def col_categ_getter(col: str) -> Dict: + return (df[col] + .value_counts() + .reset_index() + .sort_values([col, "count"], ascending=[True, False]) + .set_index(col)["count"] + .rank(method="first", ascending=False) + .to_dict()) vec = {column: col_categ_getter(column) for column in columns_to_rank} @@ -1027,11 +1029,11 @@ def missing_warner(df: pd.DataFrame, cols_list: List[str], cols_without_missing = df_selected.loc[:, df_selected.isna().sum(axis=0) == 0].columns.tolist() def p(dataset: pd.DataFrame) -> pd.DataFrame: - def detailed_assignment(df: pd.DataFrame, cols_to_check: List[str]) -> np.ndarray: + def detailed_assignment(df: pd.DataFrame, cols_to_check: List[str]) -> list: cols_with_missing = np.array([np.where(df[col].isna(), col, "") for col in cols_to_check]).T - missing_by_row_list = np.array([list(filter(None, x)) for x in cols_with_missing]).reshape(-1, 1) - if missing_by_row_list.size == 0: - return np.empty((df.shape[0], 0)).tolist() + missing_by_row_list: list = [list(filter(None, x)) for x in cols_with_missing] + if len(missing_by_row_list) == 0: + return np.empty((df.shape[0], 0)).tolist() else: return missing_by_row_list diff --git a/src/fklearn/tuning/selectors.py b/src/fklearn/tuning/selectors.py index e5ac1f6c..f6c710b9 100644 --- a/src/fklearn/tuning/selectors.py +++ b/src/fklearn/tuning/selectors.py @@ -3,12 +3,30 @@ from toolz.curried import pipe, first, mapcat import pandas as pd -from fklearn.tuning.samplers import remove_features_subsets, remove_by_feature_importance, remove_by_feature_shuffling -from fklearn.tuning.stoppers import stop_by_num_features, stop_by_num_features_parallel, stop_by_iter_num, \ stop_by_no_improvement, stop_by_no_improvement_parallel, aggregate_stop_funcs +from
fklearn.tuning.samplers import ( + remove_features_subsets, + remove_by_feature_importance, + remove_by_feature_shuffling +) +from fklearn.tuning.stoppers import ( + stop_by_num_features, + stop_by_num_features_parallel, + stop_by_iter_num, + stop_by_no_improvement, + stop_by_no_improvement_parallel, + aggregate_stop_funcs +) from fklearn.validation.validator import parallel_validator -from fklearn.types import EvalFnType, ExtractorFnType, LearnerReturnType, ListLogListType, LogListType, SplitterFnType,\ - ValidatorReturnType, LogType +from fklearn.types import ( + EvalFnType, + ExtractorFnType, + LearnerReturnType, + ListLogListType, + LogListType, + SplitterFnType, + ValidatorReturnType, + LogType +) SaveIntermediaryFnType = Callable[[List[ValidatorReturnType]], None] TuningLearnerFnType = Callable[[pd.DataFrame, List[str]], LearnerReturnType] diff --git a/src/fklearn/types/types.py b/src/fklearn/types/types.py index 31e9e3d7..d2f62fa9 100644 --- a/src/fklearn/types/types.py +++ b/src/fklearn/types/types.py @@ -12,8 +12,8 @@ ListLogListType = List[LogListType] # Learner types -PredictFnType = Callable[[pd.DataFrame], pd.DataFrame] -LearnerLogType = Dict[str, LogType] +PredictFnType = Callable[..., pd.DataFrame] +LearnerLogType = Dict[str, Any] LearnerReturnType = Tuple[PredictFnType, pd.DataFrame, LearnerLogType] UncurriedLearnerFnType = Callable[..., LearnerReturnType] diff --git a/tests/causal/cate_learning/test_meta_learners.py b/tests/causal/cate_learning/test_meta_learners.py index dd7d297e..24b6a73d 100644 --- a/tests/causal/cate_learning/test_meta_learners.py +++ b/tests/causal/cate_learning/test_meta_learners.py @@ -8,14 +8,25 @@ from pandas.testing import assert_frame_equal from fklearn.causal.cate_learning.meta_learners import ( - TREATMENT_FEATURE, _append_treatment_feature, _create_treatment_flag, - _filter_by_treatment, _fit_by_treatment, _get_learners, _get_model_fcn, - _get_unique_treatments, _predict_by_treatment_flag, - _simulate_t_learner_treatment_effect, _simulate_treatment_effect, - causal_s_classification_learner, causal_t_classification_learner) -from fklearn.exceptions.exceptions import (MissingControlError, - MissingTreatmentError, - MultipleTreatmentsError) + TREATMENT_FEATURE, + _append_treatment_feature, + _create_treatment_flag, + _filter_by_treatment, + _fit_by_treatment, + _get_learners, + _get_model_fcn, + _get_unique_treatments, + _predict_by_treatment_flag, + _simulate_t_learner_treatment_effect, + _simulate_treatment_effect, + causal_s_classification_learner, + causal_t_classification_learner +) +from fklearn.exceptions.exceptions import ( + MissingControlError, + MissingTreatmentError, + MultipleTreatmentsError +) from fklearn.training.classification import logistic_classification_learner from fklearn.types import LearnerFnType @@ -235,8 +246,8 @@ def test__fit_by_treatment(base_input_df): assert len(learners) == len(treatments) assert len(logs) == len(treatments) - assert type(logs) == dict - assert [type(learner) == LearnerFnType for learner in learners] + assert type(logs) is dict + assert [type(learner) is LearnerFnType for learner in learners] def ones_or_zeros_model(df): diff --git a/tests/metrics/test_pd_extractors.py b/tests/metrics/test_pd_extractors.py index 21a24aeb..93547d4a 100644 --- a/tests/metrics/test_pd_extractors.py +++ b/tests/metrics/test_pd_extractors.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd import pytest -from sklearn.datasets import load_boston +from sklearn.datasets import fetch_california_housing from 
fklearn.data.datasets import make_tutorial_data from fklearn.metrics.pd_extractors import (combined_evaluator_extractor, @@ -119,7 +119,7 @@ def test__split_evaluator_extractor__when_split_value_is_missing(): results = feature3_date_evaluator(data) - date_values = [ + date_values = pd.to_datetime([ np.datetime64("2015-01-06T00:00:00.000000000"), np.datetime64("2015-01-14T00:00:00.000000000"), np.datetime64("2015-01-22T00:00:00.000000000"), @@ -127,7 +127,7 @@ def test__split_evaluator_extractor__when_split_value_is_missing(): np.datetime64("2015-03-08T00:00:00.000000000"), np.datetime64("2015-03-09T00:00:00.000000000"), np.datetime64("2015-04-04T00:00:00.000000000"), - ] + ]) base_evaluator = evaluator_extractor(evaluator_name="mse_evaluator__target") feature3_extractor = split_evaluator_extractor( @@ -142,15 +142,15 @@ def test__split_evaluator_extractor__when_split_value_is_missing(): def test_extract(): - boston = load_boston() - df = pd.DataFrame(boston['data'], columns=boston['feature_names']) - df['target'] = boston['target'] + california = fetch_california_housing() + df = pd.DataFrame(california['data'], columns=california['feature_names']) + df['target'] = california['target'] df['time'] = pd.date_range(start='2015-01-01', periods=len(df)) np.random.seed(42) df['space'] = np.random.randint(0, 100, size=len(df)) # Define train function - train_fn = linear_regression_learner(features=boston['feature_names'].tolist(), target="target") + train_fn = linear_regression_learner(features=california['feature_names'], target="target") # Define evaluator function base_evaluator = combined_evaluators(evaluators=[ @@ -158,7 +158,7 @@ def test_extract(): spearman_evaluator(target_column='target', prediction_column='prediction') ]) - splitter = split_evaluator(eval_fn=base_evaluator, split_col='RAD', split_values=[4.0, 5.0, 24.0]) + splitter = split_evaluator(eval_fn=base_evaluator, split_col='MedInc', split_values=[0.5, 10.0, 20.0]) temporal_week_splitter = temporal_split_evaluator(eval_fn=base_evaluator, time_col='time', time_format='%Y-%W') temporal_year_splitter = temporal_split_evaluator(eval_fn=base_evaluator, time_col='time', time_format='%Y') @@ -216,11 +216,11 @@ def test_extract(): assert extract(tlc_results, base_extractors).shape == (12, 9) assert extract(tlc_results, splitter_extractor).shape == (36, 10) - assert extract(sc_results, base_extractors).shape == (5, 9) - assert extract(sc_results, splitter_extractor).shape == (15, 10) + assert extract(sc_results, base_extractors).shape == (667, 9) + assert extract(sc_results, splitter_extractor).shape == (2001, 10) - assert extract(fw_sc_results, base_extractors).shape == (3, 9) - assert extract(fw_sc_results, splitter_extractor).shape == (9, 10) + assert extract(fw_sc_results, base_extractors).shape == (674, 9) + assert extract(fw_sc_results, splitter_extractor).shape == (2022, 10) n_time_week_folds = len(df['time'].dt.strftime('%Y-%W').unique()) n_time_year_folds = len(df['time'].dt.strftime('%Y').unique()) diff --git a/tests/training/test_calibration.py b/tests/training/test_calibration.py index e5b28107..500eb974 100644 --- a/tests/training/test_calibration.py +++ b/tests/training/test_calibration.py @@ -56,4 +56,4 @@ def test_find_thresholds_with_same_risk(): df_with_ecdf["fair"] = pred_df assert fair_thresholds == log["find_thresholds_with_same_risk"]["fair_thresholds"] - pd.util.testing.assert_frame_equal(df_expected, df_with_ecdf) + pd.testing.assert_frame_equal(df_expected, df_with_ecdf) diff --git 
a/tests/training/test_classification.py b/tests/training/test_classification.py index e28393ea..2d657040 100644 --- a/tests/training/test_classification.py +++ b/tests/training/test_classification.py @@ -507,7 +507,7 @@ def test_lgbm_classification_learner_params(): } ) - lgbm_dataset = lightgbm.Dataset(df[features].values, label=df[target], silent=True) + lgbm_dataset = lightgbm.Dataset(df[features].values, label=df[target]) mock_lgbm = MagicMock() mock_lgbm.predict.return_value = df_result["prediction"] diff --git a/tests/training/test_pipeline.py b/tests/training/test_pipeline.py index 4bf462ad..22c3f9fb 100644 --- a/tests/training/test_pipeline.py +++ b/tests/training/test_pipeline.py @@ -49,7 +49,7 @@ def test_build_pipeline(has_repeated_learners): pred_test_without_shap = predict_fn(df_test) assert set(pred_test_without_shap.columns) == set(pred_train.columns) - pd.util.testing.assert_frame_equal(pred_test_with_shap[pred_test_without_shap.columns], pred_test_without_shap) + pd.testing.assert_frame_equal(pred_test_with_shap[pred_test_without_shap.columns], pred_test_without_shap) @pytest.mark.parametrize("has_repeated_learners", [False, True]) @@ -73,7 +73,7 @@ def p(dataset, mult=2): side_effect_learner, kwargs_learner, has_repeated_learners=has_repeated_learners) side_effect_pipeline(test_df) - pd.util.testing.assert_frame_equal(test_df, orig_df) + pd.testing.assert_frame_equal(test_df, orig_df) @pytest.mark.parametrize("has_repeated_learners", [False, True]) @@ -97,9 +97,9 @@ def dummy_learner(df): side_effect_pipeline = build_pipeline(*variation, has_repeated_learners=has_repeated_learners) predict_fn, result_df, log = side_effect_pipeline(test_df) - pd.util.testing.assert_frame_equal(test_df, orig_df) - pd.util.testing.assert_frame_equal(result_df, expected_df) - pd.util.testing.assert_frame_equal(predict_fn(test_df, mult=mult_constant), expected_df) + pd.testing.assert_frame_equal(test_df, orig_df) + pd.testing.assert_frame_equal(result_df, expected_df) + pd.testing.assert_frame_equal(predict_fn(test_df, mult=mult_constant), expected_df) @pytest.mark.parametrize("has_repeated_learners", [False, True]) diff --git a/tests/validation/test_evaluators.py b/tests/validation/test_evaluators.py index a658fcaf..c4d14c16 100644 --- a/tests/validation/test_evaluators.py +++ b/tests/validation/test_evaluators.py @@ -1,3 +1,4 @@ +import sys import string import numpy as np @@ -468,10 +469,18 @@ def test_hash_evaluator(): assert eval_fn(df1)["eval_name"] != eval_fn(df3)["eval_name"] # if we consider all the features in the dataframe, it should return different hashes for different dataframes assert eval_fn_all(df1)["eval_name"] != eval_fn_all(df2)["eval_name"] - # Assert that the hashes stay the same everytime this is run - assert eval_fn_all(df1)["eval_name"] == -6356943988420224450 - assert eval_fn_all(df2)["eval_name"] == -4865376220991082723 - assert eval_fn_all(df3)["eval_name"] == 141388279445698461 + + # Assert that the hashes stay the same every time this is run. + # The hash function was updated in Python 3.9, requiring different checks for each version.
+ python_version = sys.version_info + if python_version.minor == 8: + assert eval_fn_all(df1)["eval_name"] == -6356943988420224450 + assert eval_fn_all(df2)["eval_name"] == -4865376220991082723 + assert eval_fn_all(df3)["eval_name"] == 141388279445698461 + else: + assert eval_fn_all(df1)["eval_name"] == 12089800085289327166 + assert eval_fn_all(df2)["eval_name"] == 13581367852718468893 + assert eval_fn_all(df3)["eval_name"] == 141388279445698461 def test_exponential_coefficient_evaluator():
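A few of the migration patterns in this patch are worth illustrating on their own. First, the recurring change from feature_names=map(str, features) to feature_names=list(map(str, features)): recent xgboost releases validate feature_names when building a DMatrix, including checking its length against the data, and a bare map object does not pass that validation. A minimal sketch, assuming synthetic data and hypothetical feature names:

    import numpy as np
    import xgboost as xgb

    features = ["f0", "f1", "f2"]  # hypothetical feature names
    X = np.random.rand(10, 3)
    y = np.random.randint(0, 2, size=10)

    # Materialize the map() into a list so xgboost can validate its length
    # against the number of columns in X.
    dtrain = xgb.DMatrix(X, label=y, feature_names=list(map(str, features)))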
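The lgbm_classification_learner signature also narrows feature_name and categorical_feature from plain str to Literal['auto'], so a type checker accepts only the sentinel string 'auto' or an explicit list. The same pattern in isolation, on a hypothetical function:

    from typing import List, Literal, Union

    def configure(feature_name: Union[List[str], Literal["auto"]] = "auto") -> None:
        # mypy accepts configure(["f0"]) and configure("auto"), but rejects
        # any other bare string such as configure("atuo").
        ...

typing.Literal exists from Python 3.8 onward, which is why dropping 3.7 support also removes the need for the typing_extensions fallback mentioned in the commit history.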
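The new prediction path in lgbm_classification_learner materializes bst.predict(...) once and normalizes a possible list result to an ndarray before transposing for the multiclass case. The coercion has to go through np.array (or np.asarray): np.ndarray is the low-level constructor that treats its first argument as a shape, not as data. A sketch of the normalization, with a hypothetical helper name:

    import numpy as np

    def as_prediction_array(predictions):
        # lightgbm's Booster.predict is annotated as returning either an
        # ndarray or a list; normalize so that .T works for multiclass output.
        if isinstance(predictions, list):
            predictions = np.array(predictions)
        return predictions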
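pandas 2 removed DataFrame.lookup, which ensemble.py used to pick, for each row, the value of the prediction column named by that row's pred_bin label. The replacement factorizes the per-row labels and fancy-indexes the underlying array. The same idiom on a self-contained frame with hypothetical column names:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "prediction_bin_a": [0.1, 0.2, 0.3],
        "prediction_bin_b": [0.9, 0.8, 0.7],
        "pred_bin": ["prediction_bin_a", "prediction_bin_b", "prediction_bin_a"],
    })

    # Equivalent of the removed df.lookup(df.index, df["pred_bin"]):
    # factorize encodes each row's target label as an integer position and
    # reindex(cols, axis=1) orders the columns to match those positions.
    idx, cols = pd.factorize(df["pred_bin"].to_numpy())
    picked = df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx]
    print(picked)  # [0.1 0.8 0.3]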
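The rank_categorical rewrite tracks a pandas 2 rename: Series.value_counts().reset_index() now labels its columns with the original column name (holding the values) and 'count', where pandas 1 used 'index' for the values and the column name for the counts. What the new col_categ_getter computes, shown on a toy column:

    import pandas as pd

    df = pd.DataFrame({"feat": ["b", "a", "b", "c", "b", "a"]})

    counts = df["feat"].value_counts().reset_index()
    print(counts.columns.tolist())  # ['feat', 'count'] under pandas 2

    # Most frequent value gets rank 1; ties are broken by value order.
    ranks = (counts
             .sort_values(["feat", "count"], ascending=[True, False])
             .set_index("feat")["count"]
             .rank(method="first", ascending=False)
             .to_dict())
    print(ranks)  # {'a': 2.0, 'b': 1.0, 'c': 3.0}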
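Finally, the extractor tests migrate from load_boston, which scikit-learn deprecated in 1.0 and removed in 1.2, to fetch_california_housing; note that feature_names is already a plain list there, hence the dropped .tolist(). The dataset is also much larger (20640 rows against Boston's 506), which is behind the updated shape assertions. The new fixture, in isolation (downloads the data on first use):

    import pandas as pd
    from sklearn.datasets import fetch_california_housing

    california = fetch_california_housing()
    df = pd.DataFrame(california["data"], columns=california["feature_names"])
    df["target"] = california["target"]
    print(df.shape)  # (20640, 9): eight features plus the target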