```python
from that_ml_library.data_preprocess import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
```
chart_plotting
Variance Inflation Factor and Correlation
get_vif
get_vif (df:pandas.core.frame.DataFrame, plot_corr=False, figsize=(10, 10))
Perform the variance inflation factor calculation, and optionally plot the correlation matrix. Note that your dataframe should contain only numerical features when computing VIF.
| | Type | Default | Details |
|---|---|---|---|
| df | pd.DataFrame | | dataframe to plot |
| plot_corr | bool | False | whether to plot the correlation matrix |
| figsize | tuple | (10, 10) | Matplotlib figsize |
1. Why you should use VIF: to detect multicollinearity (involving more than 2 columns)
   - The variance inflation factor (VIF) is the ratio of the variance of $\hat{\beta}_j$ when fitting the full model (with the other features) to the variance of $\hat{\beta}_j$ when that feature is fit on its own
   - Min(VIF) = 1 (no collinearity)
   - VIF > 5 or > 10 indicates high collinearity
2. How to calculate VIF: set the feature suspected of collinearity (e.g. X1) as the label, and try to predict X1 using a regression model and the other features (see the sketch after this list)
3. What to do with high collinearity:
   - Drop one of the collinear features
   - Combine them to create a new feature
   - Perform an analysis designed for highly correlated variables, such as principal components analysis or partial least squares regression
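If you want to see the calculation without the library, here is a minimal sketch using statsmodels' `variance_inflation_factor` (assuming statsmodels is installed; whether `get_vif` wraps it internally is an assumption, though the `const` row in the output below suggests an added intercept):

```python
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

def vif_table(df: pd.DataFrame) -> pd.Series:
    # Add an intercept column ('const'): statsmodels computes VIF per design-matrix column
    X = sm.add_constant(df)
    # VIF_j = 1 / (1 - R_j^2), where R_j^2 comes from regressing column j on all other columns
    vifs = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return pd.Series(vifs, index=X.columns)
```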
```python
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/titanic.csv')
df_num = process_missing_values(df[['Survived','Pclass','Age','SibSp','Parch']],
                                missing_cols='Age', strategies='median')
get_vif(df_num, True, (5,5))
```
```
const       28.227667
Survived     1.061895
Pclass       1.173788
Age          1.361584
SibSp        1.351837
Parch        1.199945
dtype: float64
```
get_correlation_by_threshold
get_correlation_by_threshold (df_corr, min_thres=0.98)
| | Type | Default | Details |
|---|---|---|---|
| df_corr | | | Correlation DataFrame |
| min_thres | float | 0.98 | minimum correlation to keep |
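For intuition, here is a rough re-implementation consistent with the example outputs below; the library's actual logic may differ (in particular, the absolute-value comparison is inferred from the examples):

```python
import pandas as pd

def correlation_pairs_above(df_corr: pd.DataFrame, min_thres: float = 0.98) -> dict:
    # Walk the lower triangle and keep pairs whose |correlation| >= min_thres,
    # returned as {column: {other_column: correlation}}
    out = {}
    cols = df_corr.columns
    for i, c1 in enumerate(cols):
        for c2 in cols[:i]:
            val = df_corr.loc[c1, c2]
            if abs(val) >= min_thres:
                out.setdefault(c1, {})[c2] = val
    return out
```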
```python
get_correlation_by_threshold(df_num.corr(), min_thres=0)
```
```
{'Pclass': {'Survived': -0.11633986928104582},
 'Age': {'Survived': -0.11211373025858094, 'Pclass': -0.3451575619176082},
 'SibSp': {'Survived': -0.06694288369258686,
  'Pclass': 0.08741953046914279,
  'Age': -0.3664840343129444},
 'Parch': {'Survived': 0.03943462980865732,
  'Pclass': 0.016490845192711254,
  'Age': -0.19765444198507792,
  'SibSp': 0.39904002232194297}}
```
```python
get_correlation_by_threshold(df_num.corr(), min_thres=0.3)
```
```
{'Age': {'Pclass': -0.3451575619176082},
 'SibSp': {'Age': -0.3664840343129444},
 'Parch': {'SibSp': 0.39904002232194297}}
```
plot_cat_correlation
plot_cat_correlation (df_cat, figsize=(10, 10))
| | Type | Default | Details |
|---|---|---|---|
| df_cat | | | DataFrame with categorical features that have been processed |
| figsize | tuple | (10, 10) | Matplotlib figsize |
Let’s process some of the categorical features
```python
from sklearn.preprocessing import OrdinalEncoder
for c in ['Sex','Embarked']:
    oe = OrdinalEncoder()
    df[c] = oe.fit_transform(df[c].values.reshape(-1,1))
df_cat = df[['Survived','Pclass','Sex','Embarked']]
df_cat.head()
```
| | Survived | Pclass | Sex | Embarked |
|---|---|---|---|---|
| 0 | 0 | 3 | 1.0 | 2.0 |
| 1 | 1 | 1 | 0.0 | 0.0 |
| 2 | 1 | 3 | 0.0 | 2.0 |
| 3 | 1 | 1 | 0.0 | 2.0 |
| 4 | 0 | 3 | 1.0 | 2.0 |
Cramér's V measures the association between two nominal variables. It lies between 0 and 1 (inclusive): 0 indicates that the two variables are not linked by any relation, while 1 indicates a strong association between them.
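For reference, an uncorrected Cramér's V can be computed from a contingency table with scipy (a sketch assuming scipy is available; `get_cat_correlation` may apply a bias correction, so its values can differ slightly):

```python
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

def cramers_v(x: pd.Series, y: pd.Series) -> float:
    table = pd.crosstab(x, y)              # contingency table of the two nominal variables
    chi2 = chi2_contingency(table)[0]      # chi-square statistic
    n = table.to_numpy().sum()             # total number of observations
    r, c = table.shape
    # V = sqrt(chi2 / (n * (min(r, c) - 1))), bounded in [0, 1]
    return float(np.sqrt(chi2 / (n * (min(r, c) - 1))))

# e.g. cramers_v(df_cat['Sex'], df_cat['Survived'])
```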
```python
plot_cat_correlation(df_cat, (5,5))
```
```python
cat_corr = get_cat_correlation(df_cat)
get_correlation_by_threshold(cat_corr, min_thres=0.2)
```
```
{'Sex': {'Survived': 0.5650175790296367},
 'Embarked': {'Pclass': 0.23572003899034383}}
```
Evaluation plots for regression problems
plot_residuals
plot_residuals (model, X_trn, y_trn, X_test=None, y_test=None, qqplot=True)
| | Type | Default | Details |
|---|---|---|---|
| model | | | Regression model |
| X_trn | | | Training dataframe |
| y_trn | | | Training label |
| X_test | NoneType | None | Testing dataframe |
| y_test | NoneType | None | Testing label |
| qqplot | bool | True | Whether to plot the QQ plot |
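To make the chart concrete, here is a sketch of the two panels a residual analysis typically contains, built directly with matplotlib and scipy (for intuition only, not the library's implementation):

```python
import matplotlib.pyplot as plt
from scipy import stats

def residual_sketch(model, X, y):
    pred = model.predict(X)
    resid = y - pred                                  # residuals = actual - predicted
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.scatter(pred, resid, alpha=0.6)               # ideally a flat band around zero
    ax1.axhline(0, color='red', linestyle='--')
    ax1.set_xlabel('Predicted'); ax1.set_ylabel('Residual')
    stats.probplot(resid, dist='norm', plot=ax2)      # QQ plot: residual normality check
    plt.show()
```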
```python
df_reg = pd.read_csv('http://www.statsci.org/data/general/uscrime.txt', sep='\t')

from sklearn.linear_model import LinearRegression
reg_model = LinearRegression()
reg_model.fit(df_reg.drop('Crime',axis=1), df_reg.Crime.values)
```
```
LinearRegression()
```
```python
plot_residuals(reg_model, df_reg.drop('Crime',axis=1), df_reg.Crime.values,
               X_test=None, y_test=None, qqplot=True)
```
```python
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_reg.drop('Crime',axis=1), df_reg.Crime.values,
                                                    test_size=0.2, random_state=42)

plot_residuals(reg_model, X_train, y_train, X_test, y_test, qqplot=True)
```
plot_prediction_distribution
plot_prediction_distribution (y_true, y_pred, figsize=(15, 5))
| | Type | Default | Details |
|---|---|---|---|
| y_true | | | True label numpy array |
| y_pred | | | Prediction numpy array |
| figsize | tuple | (15, 5) | Matplotlib figsize |
```python
reg_model = LinearRegression()
reg_model.fit(df_reg.drop('Crime',axis=1), df_reg.Crime.values)
```

```
LinearRegression()
```

```python
y_pred = reg_model.predict(df_reg.drop('Crime',axis=1))
y_true = df_reg.Crime.values
plot_prediction_distribution(y_true, y_pred)
```
```
MSE: 28828.633430503334
RMSE: 169.789968580312
MAE: 129.91521266409967
```
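These printed metrics can be reproduced with sklearn.metrics as a sanity check (whether plot_prediction_distribution computes them this way internally is an assumption):

```python
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)                          # RMSE is the square root of MSE
mae = mean_absolute_error(y_true, y_pred)
print(f'MSE: {mse}\nRMSE: {rmse}\nMAE: {mae}')
```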
Model evaluation curves
plot_learning_curve
plot_learning_curve (estimator, title, X, y, axes=None, ylim=None, cv=None, n_jobs=-1, scoring=None, train_sizes=[0.05, 0.24, 0.43, 0.62, 0.81, 1.0], save_fig=False, figsize=(20, 5), seed=42)
| | Type | Default | Details |
|---|---|---|---|
| estimator | | | sklearn's classifier |
| title | | | Title of the chart |
| X | | | Training features |
| y | | | Training label |
| axes | NoneType | None | matplotlib's axes |
| ylim | NoneType | None | y axis range limit |
| cv | NoneType | None | sklearn's cross-validation splitting strategy |
| n_jobs | int | -1 | Number of jobs to run in parallel |
| scoring | NoneType | None | metric |
| train_sizes | list | [0.05, 0.24, 0.43, 0.62, 0.81, 1.0] | List of training size portions |
| save_fig | bool | False | Store the chart as a png in the images directory |
| figsize | tuple | (20, 5) | Matplotlib figsize |
| seed | int | 42 | Random seed |
```python
dt = DecisionTreeClassifier(criterion='entropy', random_state=42, min_samples_leaf=1)
plot_learning_curve(dt, 'Learning Curve - Decision Tree - Titanic',
                    df_num.drop('Survived',axis=1), df_num['Survived'],
                    cv=5, scoring='f1_macro', train_sizes=np.linspace(0.1,1,20))
```
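The parameters above map closely onto sklearn's learning_curve utility; whether plot_learning_curve wraps it is an assumption, but this sketch shows the underlying computation with the same data:

```python
import numpy as np
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(
    dt, df_num.drop('Survived', axis=1), df_num['Survived'],
    cv=5, scoring='f1_macro', n_jobs=-1,
    train_sizes=np.linspace(0.1, 1, 20))
# Mean train vs. cross-validation score per training size: the two curves on the chart
print(train_scores.mean(axis=1), test_scores.mean(axis=1))
```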
plot_validation_curve
plot_validation_curve (estimator, title, X, y, ylim=None, cv=None, param_name=None, param_range=None, is_log=False, n_jobs=-1, scoring=None, save_fig=False, figsize=(8, 4), fill_between=True, enumerate_x=False)
| | Type | Default | Details |
|---|---|---|---|
| estimator | | | sklearn's classifier |
| title | | | Title of the chart |
| X | | | Training features |
| y | | | Training label |
| ylim | NoneType | None | y axis range limit |
| cv | NoneType | None | sklearn's cross-validation splitting strategy |
| param_name | NoneType | None | Name of the model's hyperparameter |
| param_range | NoneType | None | List containing the range of values for param_name |
| is_log | bool | False | Log-scale the values in param_range for plotting |
| n_jobs | int | -1 | Number of jobs to run in parallel |
| scoring | NoneType | None | metric |
| save_fig | bool | False | Store the chart as a png in the images directory |
| figsize | tuple | (8, 4) | Matplotlib figsize |
| fill_between | bool | True | Add upper and lower one-std bands for the train and test curves |
| enumerate_x | bool | False | Convert a categorical hyperparameter to numerical values for the x axis |
```python
dt = DecisionTreeClassifier(criterion='entropy', random_state=42)
plot_validation_curve(dt, 'Val Curve - Decision Tree - Titanic',
                      df_num.drop('Survived',axis=1), df_num['Survived'],
                      cv=5, param_range=np.arange(1,20,1), param_name='max_depth', scoring='f1_macro')
```
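Similarly, the computation behind a validation curve can be sketched with sklearn's validation_curve (again, an assumption about the internals, shown for intuition):

```python
import numpy as np
from sklearn.model_selection import validation_curve

train_scores, test_scores = validation_curve(
    dt, df_num.drop('Survived', axis=1), df_num['Survived'],
    param_name='max_depth', param_range=np.arange(1, 20, 1),
    cv=5, scoring='f1_macro', n_jobs=-1)
# One mean train and one mean CV score per max_depth value
print(train_scores.mean(axis=1), test_scores.mean(axis=1))
```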
Tree visualization
plot_tree_dtreeviz
plot_tree_dtreeviz (estimator, X, y, target_name:str, class_names:list=None, tree_index=0, depth_range_to_display=None, fancy=False, scale=1.0)
Plot a decision tree using dtreeviz. Note that you need to install graphviz before using this function
| | Type | Default | Details |
|---|---|---|---|
| estimator | | | sklearn's classifier |
| X | | | Training features |
| y | | | Training label |
| target_name | str | | The (string) name of the target variable; e.g., for Titanic, it's "Survived" |
| class_names | list | None | List of names associated with the labels (same order); e.g. ['no','yes'] |
| tree_index | int | 0 | Index (from 0) of the tree if the model is an ensemble of trees like a random forest |
| depth_range_to_display | NoneType | None | Range of depth levels to be displayed. The range values are inclusive |
| fancy | bool | False | Draw the fancy tree chart (as opposed to the simplified one) |
| scale | float | 1.0 | Scale of the chart. Higher means bigger |
```python
dt = DecisionTreeClassifier(criterion='entropy', random_state=42, class_weight=None, max_depth=3)
dt.fit(df_num.drop('Survived',axis=1), df_num['Survived'])
```

```
DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=42)
```
After you have installed graphviz (https://github.com/parrt/dtreeviz#installation), run the following code to use dtreeviz:
```python
plot_tree_dtreeviz(dt, df_num.drop('Survived',axis=1), df_num['Survived'],
                   target_name='Survived',
                   class_names=['no','yes'],
                   fancy=True, scale=1)
```
```python
plot_tree_dtreeviz(dt, df_num.drop('Survived',axis=1), df_num['Survived'],
                   target_name='Survived',
                   class_names=['no','yes'],
                   depth_range_to_display=[2,3],
                   fancy=True, scale=1.2)
```
plot_classification_tree_sklearn
plot_classification_tree_sklearn (estimator, feature_names, class_names:list, rotate=True, fname='tmp')
Plot a decision tree classifier using sklearn. Note that this outputs a png file named fname instead of showing the plot in the notebook.
| | Type | Default | Details |
|---|---|---|---|
| estimator | | | sklearn's classifier |
| feature_names | | | List of names of independent variables (features) |
| class_names | list | | List of names associated with the labels (same order); e.g. ['no','yes'] |
| rotate | bool | True | Rotate the tree graph |
| fname | str | tmp | Name of the png file to save (no extension) |
```python
# feature names (not including label)
feature_names = df_num.drop('Survived',axis=1).columns.values
print(feature_names)
```

```
['Pclass' 'Age' 'SibSp' 'Parch']
```
After you have installed graphviz (https://github.com/parrt/dtreeviz#installation), run the following code to plot the tree with sklearn:
```python
dt = DecisionTreeClassifier(criterion='entropy', random_state=42, class_weight=None, max_depth=3)
dt.fit(df_num.drop('Survived',axis=1), df_num['Survived'])
plot_tree_sklearn(dt, feature_names=df_num.drop('Survived',axis=1).columns.values,
                  class_names=['no','yes'],
                  rotate=True, fname='tree_depth_3_titanic')
```
To show the image in the notebook, create a markdown cell and type `![](images/tree_depth_3_titanic.png)`
Decision Tree’s feature importances
plot_feature_importances
plot_feature_importances (importances, feature_names, figsize=(20, 10), top_n=None)
Plot and return a dataframe of feature importances, using sklearn’s feature_importances_ value
| | Type | Default | Details |
|---|---|---|---|
| importances | | | feature importances from sklearn's feature_importances_ attribute |
| feature_names | | | List of names of independent variables (features) |
| figsize | tuple | (20, 10) | Matplotlib figsize |
| top_n | NoneType | None | Show the top n features |
```python
feature_names = df_num.drop('Survived',axis=1).columns.values
dt = DecisionTreeClassifier(criterion='entropy', random_state=42, class_weight=None, max_depth=5)
dt.fit(df_num.drop('Survived',axis=1), df_num['Survived'])

plot_feature_importances(dt.feature_importances_, feature_names, top_n=3)
```
| Feature | Importance |
|---|---|
| Pclass | 0.087868 |
| SibSp | 0.186577 |
| Age | 0.686647 |
```python
feature_names = df_num.drop('Survived',axis=1).columns.values
dt = DecisionTreeClassifier(criterion='entropy', random_state=42, class_weight=None, max_depth=5)
dt.fit(df_num.drop('Survived',axis=1), df_num['Survived'])

plot_permutation_importances(dt,
                             df_num.drop('Survived',axis=1),
                             df_num['Survived'],
                             scoring=['f1_macro','accuracy'],
                             top_n=3)
```
```
f1_macro
accuracy

[         Importance       STD
 Feature
 Pclass     0.049733  0.024178
 SibSp      0.063374  0.018909
 Age        0.126187  0.019904,
          Importance       STD
 Feature
 Pclass     0.046154  0.014841
 SibSp      0.070513  0.013446
 Age        0.133333  0.021983]
```
```python
feature_names = df_num.drop('Survived',axis=1).columns.values
dt = DecisionTreeClassifier(criterion='entropy', random_state=42, class_weight=None, max_depth=5)
dt.fit(df_num.drop('Survived',axis=1), df_num['Survived'])

plot_permutation_importances(dt,
                             df_num.drop('Survived',axis=1),
                             df_num['Survived'],
                             scoring='f1_macro')
```
```
f1_macro

[         Importance       STD
 Feature
 Parch      0.002853  0.005706
 Pclass     0.049733  0.024178
 SibSp      0.063374  0.018909
 Age        0.126187  0.019904]
```
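The output format is consistent with sklearn's permutation_importance; here is a sketch of the raw computation (the n_repeats and seed used internally by plot_permutation_importances are assumptions):

```python
from sklearn.inspection import permutation_importance

result = permutation_importance(dt, df_num.drop('Survived', axis=1), df_num['Survived'],
                                scoring='f1_macro', n_repeats=5, random_state=42)
# Mean importance and standard deviation per feature, over the permutation repeats
for name, mean, std in zip(feature_names, result.importances_mean, result.importances_std):
    print(f'{name}: {mean:.4f} +/- {std:.4f}')
```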
Hyperparameters visualization
params_2D_heatmap
params_2D_heatmap (search_cv:dict, param1:str, param2:str, scoring:str='f1_macro', log_param1=False, log_param2=False, figsize=(20, 10), min_hm=None, max_hm=None, higher_is_better=True)
Plot a 2D heatmap of the metric value for each pair of hyperparameters
| | Type | Default | Details |
|---|---|---|---|
| search_cv | dict | | A dict with keys as column headers and values as columns. Typically the cv_results_ attribute of GridSearchCV or RandomizedSearchCV |
| param1 | str | | Name of the first hyperparameter |
| param2 | str | | Name of the second hyperparameter |
| scoring | str | f1_macro | Metric name |
| log_param1 | bool | False | Log-scale the first hyperparameter axis |
| log_param2 | bool | False | Log-scale the second hyperparameter axis |
| figsize | tuple | (20, 10) | Matplotlib figsize |
| min_hm | NoneType | None | Minimum value of the metric to show |
| max_hm | NoneType | None | Maximum value of the metric to show |
| higher_is_better | bool | True | Whether a higher metric is better |
```python
dt = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': np.arange(2,20),
    'min_samples_leaf': np.arange(1,80),
}
# Note: in order to use params_2D_heatmap, you should set scoring to a list, and set refit to False
clf = RandomizedSearchCV(dt, param_grid, n_iter=100,
                         scoring=['f1_macro'], n_jobs=-1,
                         cv=5, verbose=1, random_state=42, refit=False)
clf.fit(df_num.drop('Survived',axis=1), df_num['Survived'])
```
```
Fitting 5 folds for each of 100 candidates, totalling 500 fits

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]),
                                        'n_estimators': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19])},
                   random_state=42, refit=False, scoring=['f1_macro'],
                   verbose=1)
```
```python
params_2D_heatmap(clf.cv_results_, 'n_estimators', 'min_samples_leaf',
                  scoring='f1_macro',
                  figsize=(20,10))
```
```python
params_2D_heatmap(clf.cv_results_, 'n_estimators', 'min_samples_leaf',
                  scoring='f1_macro',
                  figsize=(20,10), min_hm=0.45)
```
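To see where the heatmap cells come from: cv_results_ is a table-like dict with one 'param_<name>' column per hyperparameter and a 'mean_test_<scoring>' column per metric, so the 2D chart is essentially a pivot. A sketch of that reshaping (the function's internals may differ):

```python
import pandas as pd

res = pd.DataFrame(clf.cv_results_)
# Rows/columns are hyperparameter values; each cell is the mean CV f1_macro for that pair
pivot = res.pivot_table(index='param_min_samples_leaf',
                        columns='param_n_estimators',
                        values='mean_test_f1_macro')
print(pivot.round(3))
```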
```python
dt = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': np.arange(2,20),
    'min_samples_leaf': np.arange(1,80),
    'max_features': [0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
}
# Note: in order to use params_2D_heatmap, you should set scoring to a list, and set refit to False
clf = RandomizedSearchCV(dt, param_grid, n_iter=100,
                         scoring=['f1_macro'], n_jobs=-1,
                         cv=5, verbose=1, random_state=42, refit=False)
clf.fit(df_num.drop('Survived',axis=1), df_num['Survived'])
```
```
Fitting 5 folds for each of 100 candidates, totalling 500 fits

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_features': [0.3, 0.4, 0.5, 0.6,
                                                         0.7, 0.8, 0.9, 1],
                                        'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]),
                                        'n_estimators': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19])},
                   random_state=42, refit=False, scoring=['f1_macro'],
                   verbose=1)
```
```python
params_3D_heatmap(clf.cv_results_, 'n_estimators',
                  'min_samples_leaf',
                  'max_features',
                  scoring='f1_macro')
```
Partial Dependence Plot
pdp_numerical_only
pdp_numerical_only (model, X:pandas.core.frame.DataFrame, num_features:list, class_names:list, y_colors=None, ncols=2, nrows=2, figsize=(20, 16))
Plot the partial dependence for numerical features
| | Type | Default | Details |
|---|---|---|---|
| model | | | sklearn tree model that has been trained |
| X | pd.DataFrame | | dataframe to perform pdp on |
| num_features | list | | A list of numerical features |
| class_names | list | | List of names associated with the labels (same order); e.g. ['no','yes'] |
| y_colors | NoneType | None | List of colors associated with class_names |
| ncols | int | 2 | |
| nrows | int | 2 | |
| figsize | tuple | (20, 16) | |
```python
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/titanic.csv')
df = df[['Survived','Pclass','Sex','Age','SibSp','Parch','Embarked']].copy()
df = preprocessing_general(df,
                           missing_cols=['Age','Embarked'],
                           missing_vals=np.NaN,
                           strategies=['median','most_frequent'],
                           cat_cols='Embarked',
                           bi_cols='Sex')
df.head()
```
| | Survived | Pclass | Age | SibSp | Parch | Embarked_C | Embarked_Q | Embarked_S | Sex_male |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 22.0 | 1 | 0 | False | False | True | True |
| 1 | 1 | 1 | 38.0 | 1 | 0 | True | False | False | False |
| 2 | 1 | 3 | 26.0 | 0 | 0 | False | False | True | False |
| 3 | 1 | 1 | 35.0 | 1 | 0 | False | False | True | False |
| 4 | 0 | 3 | 35.0 | 0 | 0 | False | False | True | True |
To better showcase the interpretation of the Partial Dependence Plot, we will reuse the Titanic dataset, but now the dependent variable (the one we need to predict) will be Pclass (3 classes to predict).
```python
params = {'n_estimators': 12, 'min_samples_leaf': 10, 'max_features': 0.8, 'class_weight': 'balanced'}
X_trn = df.drop('Pclass',axis=1)
y_trn = LabelEncoder().fit_transform(df['Pclass'])
dt = RandomForestClassifier(**params)
dt.fit(X_trn, y_trn)
```
```
RandomForestClassifier(class_weight='balanced', max_features=0.8,
                       min_samples_leaf=10, n_estimators=12)
```
```python
pdp_numerical_only(dt, X_trn, num_features=['Age','SibSp'],
                   class_names=['pclass_1','pclass_2','pclass_3'],
                   nrows=2, ncols=1, figsize=(6,8))
```
pdp_categorical_only
pdp_categorical_only (model, X:pandas.core.frame.DataFrame, cat_feature:list, class_names:list, y_colors=None, ymax=0.5, figsize=(20, 8))
Plot the partial dependence for a categorical feature
| | Type | Default | Details |
|---|---|---|---|
| model | | | sklearn tree model that has been trained |
| X | pd.DataFrame | | dataframe to perform pdp on |
| cat_feature | list | | A single categorical feature |
| class_names | list | | List of names associated with the labels (same order); e.g. ['no','yes'] |
| y_colors | NoneType | None | List of colors associated with class_names |
| ymax | float | 0.5 | |
| figsize | tuple | (20, 8) | |
```python
pdp_categorical_only(dt, X_trn, 'Survived',
                     class_names=['pclass_1','pclass_2','pclass_3'])
```
plot_ice_pair
plot_ice_pair (model, X:pandas.core.frame.DataFrame, pair_features:list, class_idx, figsize=(10, 4))
Plot an ICE plot for a pair of numerical features
| | Type | Default | Details |
|---|---|---|---|
| model | | | sklearn tree model that has been trained |
| X | pd.DataFrame | | dataframe to perform ICE on |
| pair_features | list | | a list of exactly 2 features |
| class_idx | | | index of the class to plot |
| figsize | tuple | (10, 4) | |
```python
# For pclass_1
plot_ice_pair(dt, X_trn, pair_features=['Age','SibSp'], class_idx=0, figsize=(8,3))

# For pclass_2
plot_ice_pair(dt, X_trn, pair_features=['Age','SibSp'], class_idx=1, figsize=(8,3))

# For pclass_3
plot_ice_pair(dt, X_trn, pair_features=['Age','SibSp'], class_idx=2, figsize=(8,3))
```
Other functions
plot_confusion_matrix
plot_confusion_matrix (y_true:list|numpy.ndarray, y_pred:list|numpy.ndarray, labels=None)
Simple function to plot the confusion matrix
| | Type | Default | Details |
|---|---|---|---|
| y_true | list \| np.ndarray | | A list/numpy array of true labels |
| y_pred | list \| np.ndarray | | A list/numpy array of predictions |
| labels | NoneType | None | Display names matching the labels (same order) |
```python
y_true = np.array([1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0])
y_pred = np.array([1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0])
plot_confusion_matrix(y_true, y_pred, labels=['Not Survived','Survived'])
```
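The raw counts behind the chart can be verified against sklearn directly (whether plot_confusion_matrix wraps this call internally is an assumption):

```python
from sklearn.metrics import confusion_matrix

# Rows are true classes (0, 1), columns are predicted classes (0, 1)
print(confusion_matrix(y_true, y_pred))
```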