From eec7b542ba713b914b7aa8b13f989377d130173e Mon Sep 17 00:00:00 2001 From: v715 Date: Sat, 2 Nov 2019 10:13:35 -0400 Subject: [PATCH 01/20] Add dummy criterion class --- sklearn/tree/_criterion.pyx | 60 +++++++++++++++++++++++++++++++++++++ sklearn/tree/tree.py | 2 +- 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index d11f67854731e..367a4b67c329a 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1325,3 +1325,63 @@ cdef class FriedmanMSE(MSE): return (diff * diff / (self.weighted_n_left * self.weighted_n_right * self.weighted_n_node_samples)) + + +cdef class ObliqueProjection(RegressionCriterion): + """Mean squared error impurity criterion with improvement score by Friedman + + Uses the formula (35) in Friedman's original Gradient Boosting paper: + + diff = mean_left - mean_right + improvement = n_left * n_right * diff^2 / (n_left + n_right) + """ + + cdef double proxy_impurity_improvement(self) nogil: + """Compute a proxy of the impurity reduction + + This method is used to speed up the search for the best split. + It is a proxy quantity such that the split that maximizes this value + also maximizes the impurity improvement. It neglects all constant terms + of the impurity decrease for a given split. + + The absolute impurity improvement is only computed by the + impurity_improvement method once the best split has been found. + """ + + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right + + cdef double total_sum_left = 0.0 + cdef double total_sum_right = 0.0 + + cdef SIZE_t k + cdef double diff = 0.0 + + for k in range(self.n_outputs): + total_sum_left += sum_left[k] + total_sum_right += sum_right[k] + + diff = (self.weighted_n_right * total_sum_left - + self.weighted_n_left * total_sum_right) + + return diff * diff / (self.weighted_n_left * self.weighted_n_right) + + cdef double impurity_improvement(self, double impurity) nogil: + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right + + cdef double total_sum_left = 0.0 + cdef double total_sum_right = 0.0 + + cdef SIZE_t k + cdef double diff = 0.0 + + for k in range(self.n_outputs): + total_sum_left += sum_left[k] + total_sum_right += sum_right[k] + + diff = (self.weighted_n_right * total_sum_left - + self.weighted_n_left * total_sum_right) / self.n_outputs + + return (diff * diff / (self.weighted_n_left * self.weighted_n_right * + self.weighted_n_node_samples)) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index d88bc5830359b..2781b478b7079 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -60,7 +60,7 @@ CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy} CRITERIA_REG = {"mse": _criterion.MSE, "friedman_mse": _criterion.FriedmanMSE, - "mae": _criterion.MAE} + "mae": _criterion.MAE, "oblique": _criterion.ObliqueProjection} DENSE_SPLITTERS = {"best": _splitter.BestSplitter, "random": _splitter.RandomSplitter} From 5e1d44486e2af1c20632cc347555e11165c2030a Mon Sep 17 00:00:00 2001 From: Morgan Sanchez Date: Wed, 13 Nov 2019 17:45:52 -0500 Subject: [PATCH 02/20] testing random state additions --- sklearn/tree/_criterion.pyx | 222 +++++++++++++++++++++++++++++++----- sklearn/tree/tree.py | 6 +- 2 files changed, 198 insertions(+), 30 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 367a4b67c329a..66a4b176622c5 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -26,6 +26,7 @@ import numpy as np cimport numpy as np np.import_array() +from ._utils cimport rand_int #added by Morgan from ._utils cimport log from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray @@ -74,7 +75,6 @@ cdef class Criterion: The first sample to be used on this node end : SIZE_t The last sample used on this node - """ pass @@ -689,7 +689,7 @@ cdef class RegressionCriterion(Criterion): = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ - def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): + def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples, object random_state=None): """Initialize parameters for this criterion. Parameters @@ -699,11 +699,17 @@ cdef class RegressionCriterion(Criterion): n_samples : SIZE_t The total number of samples to fit on + + random_state : object #added by morgan + Random State from splitter class #added by morgan + """ # Default values self.sample_weight = NULL + self.random_state = random_state #added by morgan + self.samples = NULL self.start = 0 self.pos = 0 @@ -734,7 +740,7 @@ cdef class RegressionCriterion(Criterion): self.sum_right == NULL): raise MemoryError() - def __reduce__(self): + def __reduce__(self): #TODO do I need to add this for random_state return (type(self), (self.n_outputs, self.n_samples), self.__getstate__()) cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, @@ -980,7 +986,7 @@ cdef class MAE(RegressionCriterion): cdef np.ndarray right_child cdef DOUBLE_t* node_medians - def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): + def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): #TODO do I need to modify this? """Initialize parameters for this criterion. Parameters @@ -1327,15 +1333,64 @@ cdef class FriedmanMSE(MSE): self.weighted_n_node_samples)) -cdef class ObliqueProjection(RegressionCriterion): - """Mean squared error impurity criterion with improvement score by Friedman +cdef class AxisProjection(RegressionCriterion): + r"""Mean absolute error impurity criterion + of axis-aligned projections of high dimensional y - Uses the formula (35) in Friedman's original Gradient Boosting paper: + Algorithm: + 1. select a random predictor from [0,n_outputs] + 2. compute mse on the values of that predictor for all samples - diff = mean_left - mean_right - improvement = n_left * n_right * diff^2 / (n_left + n_right) + MSE = var_left + var_right """ + cdef double node_impurity(self) nogil: + """Evaluate the impurity of the current node, i.e. the impurity of + samples[start:end].""" + + cdef double* sum_total = self.sum_total #delete + cdef double impurity #delete + cdef SIZE_t k #delete + + """ + cdef double impurity + cdef DOUBLE_t* sample_weight = self.sample_weight + cdef SIZE_t* samples = self.samples + cdef SIZE_t end = self.end + cdef SIZE_t start = self.start + + cdef double* sum_total = self.sum_total #modified + cdef DOUBLE_t y_ik + + cdef double sq_sum_total = 0.0 + + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k # modified + # TODO choose random k here + + k = rand_int(0, self.n_outputs, self.random_state) #TODO what should random state be? + + cdef DOUBLE_t w = 1.0 + + for p in range(start, pos): + i = samples[p] + if sample_weight != NULL: + w = sample_weight[i] + y_ik = self.y[i, k] + sq_sum_total += w * y_ik * y_ik + + impurity = sq_sum_total / self.weighted_n_node_samples + impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 + + return impurity / self.n_outputs + """ + impurity = self.sq_sum_total / self.weighted_n_node_samples #delete + for k in range(self.n_outputs): #delete + impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 #delete + + return impurity / self.n_outputs #delete + cdef double proxy_impurity_improvement(self) nogil: """Compute a proxy of the impurity reduction @@ -1351,37 +1406,148 @@ cdef class ObliqueProjection(RegressionCriterion): cdef double* sum_left = self.sum_left cdef double* sum_right = self.sum_right - cdef double total_sum_left = 0.0 - cdef double total_sum_right = 0.0 - cdef SIZE_t k - cdef double diff = 0.0 + cdef double proxy_impurity_left = 0.0 + cdef double proxy_impurity_right = 0.0 for k in range(self.n_outputs): - total_sum_left += sum_left[k] - total_sum_right += sum_right[k] + proxy_impurity_left += sum_left[k] * sum_left[k] + proxy_impurity_right += sum_right[k] * sum_right[k] - diff = (self.weighted_n_right * total_sum_left - - self.weighted_n_left * total_sum_right) + return (proxy_impurity_left / self.weighted_n_left + + proxy_impurity_right / self.weighted_n_right) - return diff * diff / (self.weighted_n_left * self.weighted_n_right) + cdef void children_impurity(self, double* impurity_left, + double* impurity_right) nogil: + """Evaluate the impurity in children nodes, i.e. the impurity of the + left child (samples[start:pos]) and the impurity the right child + (samples[pos:end]).""" + + cdef DOUBLE_t* sample_weight = self.sample_weight + cdef SIZE_t* samples = self.samples + cdef SIZE_t pos = self.pos + cdef SIZE_t start = self.start - cdef double impurity_improvement(self, double impurity) nogil: cdef double* sum_left = self.sum_left cdef double* sum_right = self.sum_right + cdef DOUBLE_t y_ik - cdef double total_sum_left = 0.0 - cdef double total_sum_right = 0.0 + cdef double sq_sum_left = 0.0 + cdef double sq_sum_right + cdef SIZE_t i + cdef SIZE_t p cdef SIZE_t k - cdef double diff = 0.0 + cdef DOUBLE_t w = 1.0 + + for p in range(start, pos): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + # choose random k here + for k in range(self.n_outputs): + y_ik = self.y[i, k] + sq_sum_left += w * y_ik * y_ik + + sq_sum_right = self.sq_sum_total - sq_sum_left + + impurity_left[0] = sq_sum_left / self.weighted_n_left + impurity_right[0] = sq_sum_right / self.weighted_n_right for k in range(self.n_outputs): - total_sum_left += sum_left[k] - total_sum_right += sum_right[k] + impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 + impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 - diff = (self.weighted_n_right * total_sum_left - - self.weighted_n_left * total_sum_right) / self.n_outputs + impurity_left[0] /= self.n_outputs + impurity_right[0] /= self.n_outputs - return (diff * diff / (self.weighted_n_left * self.weighted_n_right * - self.weighted_n_node_samples)) +cdef class ObliqueProjection(RegressionCriterion): + r"""Mean absolute error impurity criterion + + MAE = (1 / n)*(\sum_i |y_i - f_i|), where y_i is the true + value and f_i is the predicted value.""" + cdef double node_impurity(self) nogil: + """Evaluate the impurity of the current node, i.e. the impurity of + samples[start:end].""" + + cdef double* sum_total = self.sum_total + cdef double impurity + cdef SIZE_t k + + impurity = self.sq_sum_total / self.weighted_n_node_samples + for k in range(self.n_outputs): + impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 + + return impurity / self.n_outputs + + cdef double proxy_impurity_improvement(self) nogil: + """Compute a proxy of the impurity reduction + + This method is used to speed up the search for the best split. + It is a proxy quantity such that the split that maximizes this value + also maximizes the impurity improvement. It neglects all constant terms + of the impurity decrease for a given split. + + The absolute impurity improvement is only computed by the + impurity_improvement method once the best split has been found. + """ + + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right + + cdef SIZE_t k + cdef double proxy_impurity_left = 0.0 + cdef double proxy_impurity_right = 0.0 + + for k in range(self.n_outputs): + proxy_impurity_left += sum_left[k] * sum_left[k] + proxy_impurity_right += sum_right[k] * sum_right[k] + + return (proxy_impurity_left / self.weighted_n_left + + proxy_impurity_right / self.weighted_n_right) + + cdef void children_impurity(self, double* impurity_left, + double* impurity_right) nogil: + """Evaluate the impurity in children nodes, i.e. the impurity of the + left child (samples[start:pos]) and the impurity the right child + (samples[pos:end]).""" + + cdef DOUBLE_t* sample_weight = self.sample_weight + cdef SIZE_t* samples = self.samples + cdef SIZE_t pos = self.pos + cdef SIZE_t start = self.start + + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right + cdef DOUBLE_t y_ik + + cdef double sq_sum_left = 0.0 + cdef double sq_sum_right + + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k + cdef DOUBLE_t w = 1.0 + + for p in range(start, pos): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + for k in range(self.n_outputs): + y_ik = self.y[i, k] + sq_sum_left += w * y_ik * y_ik + + sq_sum_right = self.sq_sum_total - sq_sum_left + + impurity_left[0] = sq_sum_left / self.weighted_n_left + impurity_right[0] = sq_sum_right / self.weighted_n_right + + for k in range(self.n_outputs): + impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 + impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 + + impurity_left[0] /= self.n_outputs + impurity_right[0] /= self.n_outputs \ No newline at end of file diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 2781b478b7079..7bf320a8ed85b 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -60,7 +60,8 @@ CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy} CRITERIA_REG = {"mse": _criterion.MSE, "friedman_mse": _criterion.FriedmanMSE, - "mae": _criterion.MAE, "oblique": _criterion.ObliqueProjection} + "mae": _criterion.MAE, "oblique": _criterion.ObliqueProjection, + "axis": _criterion.axisProjection} DENSE_SPLITTERS = {"best": _splitter.BestSplitter, "random": _splitter.RandomSplitter} @@ -325,7 +326,8 @@ def fit(self, X, y, sample_weight=None, check_input=True, self.n_classes_) else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_, - n_samples) + n_samples, + random_state) SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS From f1da35710d96433f09d0e414d74921e3b777d241 Mon Sep 17 00:00:00 2001 From: Morgan Sanchez Date: Thu, 14 Nov 2019 01:42:53 -0500 Subject: [PATCH 03/20] solved random_state so that make in works and test_multioutput.py and test_forest.py pass all tests. Wrote axis projection test to compare against past results --- sklearn/tree/_criterion.pxd | 3 +- sklearn/tree/_criterion.pyx | 79 ++++++++++---- sklearn/tree/tests/axis_projection_test.py | 121 +++++++++++++++++++++ sklearn/tree/tree.py | 2 +- 4 files changed, 179 insertions(+), 26 deletions(-) create mode 100644 sklearn/tree/tests/axis_projection_test.py diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index e4a7e15ce16c1..f66ff98aabe3b 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -58,7 +58,7 @@ cdef class Criterion: cdef int reset(self) nogil except -1 cdef int reverse_reset(self) nogil except -1 cdef int update(self, SIZE_t new_pos) nogil except -1 - cdef double node_impurity(self) nogil + cdef double node_impurity(self) nogil cdef void children_impurity(self, double* impurity_left, double* impurity_right) nogil cdef void node_value(self, double* dest) nogil @@ -75,3 +75,4 @@ cdef class RegressionCriterion(Criterion): """Abstract regression criterion.""" cdef double sq_sum_total + cdef object random_state # Random state diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 66a4b176622c5..1511cfc0e3af6 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -27,6 +27,7 @@ cimport numpy as np np.import_array() from ._utils cimport rand_int #added by Morgan +from ._utils cimport RAND_R_MAX # added by morgan from ._utils cimport log from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray @@ -986,7 +987,7 @@ cdef class MAE(RegressionCriterion): cdef np.ndarray right_child cdef DOUBLE_t* node_medians - def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): #TODO do I need to modify this? + def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples, object random_state = None): #TODO do I need to modify this? """Initialize parameters for this criterion. Parameters @@ -1348,11 +1349,11 @@ cdef class AxisProjection(RegressionCriterion): """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end].""" - cdef double* sum_total = self.sum_total #delete - cdef double impurity #delete - cdef SIZE_t k #delete + #cdef double* sum_total = self.sum_total #delete + #cdef double impurity #delete + #cdef SIZE_t k #delete - """ + cdef double impurity cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -1367,13 +1368,17 @@ cdef class AxisProjection(RegressionCriterion): cdef SIZE_t i cdef SIZE_t p cdef SIZE_t k # modified - # TODO choose random k here + cdef UINT32_t rand_r_state + + with gil: # is this okay? + rand_r_state = self.random_state.randint(0, RAND_R_MAX) + cdef UINT32_t* random_state = &rand_r_state - k = rand_int(0, self.n_outputs, self.random_state) #TODO what should random state be? + k = rand_int(0, self.n_outputs, random_state) #TODO is this random state okay? cdef DOUBLE_t w = 1.0 - for p in range(start, pos): + for p in range(start, end): i = samples[p] if sample_weight != NULL: w = sample_weight[i] @@ -1383,13 +1388,8 @@ cdef class AxisProjection(RegressionCriterion): impurity = sq_sum_total / self.weighted_n_node_samples impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 - return impurity / self.n_outputs - """ - impurity = self.sq_sum_total / self.weighted_n_node_samples #delete - for k in range(self.n_outputs): #delete - impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 #delete - - return impurity / self.n_outputs #delete + return impurity #/ self.n_outputs + cdef double proxy_impurity_improvement(self) nogil: """Compute a proxy of the impurity reduction @@ -1422,6 +1422,33 @@ cdef class AxisProjection(RegressionCriterion): """Evaluate the impurity in children nodes, i.e. the impurity of the left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" + """ + + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k # modified + cdef UINT32_t rand_r_state + + with gil: # is this okay? + rand_r_state = self.random_state.randint(0, RAND_R_MAX) + cdef UINT32_t* random_state = &rand_r_state + + k = rand_int(zero, self.n_outputs, random_state) #TODO is this random state okay? + + cdef DOUBLE_t w = 1.0 + + for p in range(start, end): + i = samples[p] + if sample_weight != NULL: + w = sample_weight[i] + y_ik = self.y[i, k] + sq_sum_total += w * y_ik * y_ik + + impurity = sq_sum_total / self.weighted_n_node_samples + impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 + + return impurity #/ self.n_outputs + """ cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -1439,28 +1466,32 @@ cdef class AxisProjection(RegressionCriterion): cdef SIZE_t p cdef SIZE_t k cdef DOUBLE_t w = 1.0 + cdef UINT32_t rand_r_state + + with gil: # is this okay? + rand_r_state = self.random_state.randint(0, RAND_R_MAX) + cdef UINT32_t* random_state = &rand_r_state + + k = rand_int(0, self.n_outputs, random_state) #TODO is this random state okay? for p in range(start, pos): i = samples[p] if sample_weight != NULL: w = sample_weight[i] - # choose random k here - for k in range(self.n_outputs): - y_ik = self.y[i, k] - sq_sum_left += w * y_ik * y_ik + y_ik = self.y[i, k] + sq_sum_left += w * y_ik * y_ik sq_sum_right = self.sq_sum_total - sq_sum_left impurity_left[0] = sq_sum_left / self.weighted_n_left impurity_right[0] = sq_sum_right / self.weighted_n_right - for k in range(self.n_outputs): - impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 + impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 + impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 - impurity_left[0] /= self.n_outputs - impurity_right[0] /= self.n_outputs + impurity_left[0] + impurity_right[0] cdef class ObliqueProjection(RegressionCriterion): r"""Mean absolute error impurity criterion diff --git a/sklearn/tree/tests/axis_projection_test.py b/sklearn/tree/tests/axis_projection_test.py new file mode 100644 index 0000000000000..6a15a3e534699 --- /dev/null +++ b/sklearn/tree/tests/axis_projection_test.py @@ -0,0 +1,121 @@ +from sklearn.ensemble import RandomForestRegressor +# from vivek's 10-16-19 experiment +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns + +def generate_data( + n=25, + mean=[0, 0], + cov=[[1, 0], [0, 1]], + theta=np.pi/4 +): + """ + Generate synthetic data. + X ~iid MVN(u=0, cov=I). + y = AX where A is a rotation matrix. + """ + + # Rotation matrix + A = [ + [np.cos(theta), -np.sin(theta)], + [np.sin(theta), np.cos(theta)] + ] + + # Sample random variables + X = np.random.multivariate_normal(mean, cov, size=n) + y = np.dot(A, X.T) + + return X, y.T + + +''' +def predict(rf, X): + """ + Return predictions for every element in X. + """ + + yhat = [] + for xi in X: + yi = np.mean(rf.predict(xi), axis=0) + yhat.append(yi) + + return np.array(yhat) +''' +def measure_mse(X, y, max_depth=10, n_features=1, min_leaf_size=5, n_trees=1000, n_bagging=10): + """ + Return MSE for each split criteria. + """ + + # Iterate over different split criteria + errors = [] + for split in ["mae", "mse", "axis"]: + + # Fit model + rf = RandomForestRegressor(criterion=split, max_depth=max_depth, min_samples_leaf=min_leaf_size, n_estimators=n_trees, random_state=1) + rf.fit(X,y) + + # Make predictions and score + yhat = rf.predict(X) + mse = np.linalg.norm(y-yhat) + errors.append(mse) + + return errors + + + + +if __name__ == "__main__": + X, y = generate_data() + + plt.scatter(X[:, 0], X[:, 1], c="blue", label="X") + plt.scatter(y[:, 0], y[:, 1], c="red", label="y") + + # Plot lines between matched pairs of points + for xi, yi in zip(X, y): + plt.plot( + [xi[0], yi[0]], + [xi[1], yi[1]], + c="black", + alpha = 0.15 + ) + + plt.legend() + plt.show() + # Test functions on sample data + measure_mse(X, y) + + # Run simulation + results = [] + max_n = 201 + n_iter = 10 + + for n in range(10, max_n, 10): + for i in range(n_iter): + + # Generate sample data + X, y = generate_data(n=n) + + # Measure MSE + mse = measure_mse(X, y) + + # Add to dataframe + mse.insert(0, n) + results.append(mse) + + print(n, i) + + # Convert to dataframe + columns = ["mae", "mse", "projection_axis"] + columns.insert(0, "n") + df = pd.DataFrame(results, columns=columns) + df = pd.melt(df, id_vars=['n'], value_vars=columns[1:], var_name='split', value_name='mse') + df["mse"] /= df["n"] + df.head() + + with sns.plotting_context("talk", font_scale=1): + + f = sns.lineplot(x="n", y="mse", hue="split", data=df) + f.set(xlabel="n", ylabel="mse / n") + plt.show() \ No newline at end of file diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 7bf320a8ed85b..522252fef0536 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -61,7 +61,7 @@ CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy} CRITERIA_REG = {"mse": _criterion.MSE, "friedman_mse": _criterion.FriedmanMSE, "mae": _criterion.MAE, "oblique": _criterion.ObliqueProjection, - "axis": _criterion.axisProjection} + "axis": _criterion.AxisProjection} DENSE_SPLITTERS = {"best": _splitter.BestSplitter, "random": _splitter.RandomSplitter} From 1d7cdd5b357f55d4aba58c70a2dea2db13f16ea9 Mon Sep 17 00:00:00 2001 From: Morgan Sanchez Date: Wed, 20 Nov 2019 10:51:37 -0500 Subject: [PATCH 04/20] added a test with one good predictor and other noisy predictors --- sklearn/tree/_criterion.pyx | 152 ++- sklearn/tree/tests/noisy_predictors.ipynb | 1373 +++++++++++++++++++++ sklearn/tree/tests/one_good_pred_test.py | 105 ++ 3 files changed, 1586 insertions(+), 44 deletions(-) create mode 100644 sklearn/tree/tests/noisy_predictors.ipynb create mode 100644 sklearn/tree/tests/one_good_pred_test.py diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 1511cfc0e3af6..c15f37365945a 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1410,9 +1410,16 @@ cdef class AxisProjection(RegressionCriterion): cdef double proxy_impurity_left = 0.0 cdef double proxy_impurity_right = 0.0 - for k in range(self.n_outputs): - proxy_impurity_left += sum_left[k] * sum_left[k] - proxy_impurity_right += sum_right[k] * sum_right[k] + cdef UINT32_t rand_r_state + + with gil: # is this okay? + rand_r_state = self.random_state.randint(0, RAND_R_MAX) + cdef UINT32_t* random_state = &rand_r_state + + k = rand_int(0, self.n_outputs, random_state) #TODO is this random state okay? + + proxy_impurity_left += sum_left[k] * sum_left[k] + proxy_impurity_right += sum_right[k] * sum_right[k] return (proxy_impurity_left / self.weighted_n_left + proxy_impurity_right / self.weighted_n_right) @@ -1423,31 +1430,6 @@ cdef class AxisProjection(RegressionCriterion): left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" """ - - cdef SIZE_t i - cdef SIZE_t p - cdef SIZE_t k # modified - cdef UINT32_t rand_r_state - - with gil: # is this okay? - rand_r_state = self.random_state.randint(0, RAND_R_MAX) - cdef UINT32_t* random_state = &rand_r_state - - k = rand_int(zero, self.n_outputs, random_state) #TODO is this random state okay? - - cdef DOUBLE_t w = 1.0 - - for p in range(start, end): - i = samples[p] - if sample_weight != NULL: - w = sample_weight[i] - y_ik = self.y[i, k] - sq_sum_total += w * y_ik * y_ik - - impurity = sq_sum_total / self.weighted_n_node_samples - impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 - - return impurity #/ self.n_outputs """ cdef DOUBLE_t* sample_weight = self.sample_weight @@ -1496,21 +1478,67 @@ cdef class AxisProjection(RegressionCriterion): cdef class ObliqueProjection(RegressionCriterion): r"""Mean absolute error impurity criterion + + MAE = (1 / n)*(\sum_i |y_i - f_i|), where y_i is the true value and f_i is the predicted value.""" cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end].""" - cdef double* sum_total = self.sum_total + #cdef double* sum_total = self.sum_total #delete + #cdef double impurity #delete + #cdef SIZE_t k #delete + + cdef double impurity - cdef SIZE_t k + cdef DOUBLE_t* sample_weight = self.sample_weight + cdef SIZE_t* samples = self.samples + cdef SIZE_t end = self.end + cdef SIZE_t start = self.start - impurity = self.sq_sum_total / self.weighted_n_node_samples + cdef double* sum_total = self.sum_total #modified + cdef DOUBLE_t y_ik + + cdef double sq_sum_total = 0.0 + + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k # modified + cdef UINT32_t rand_r_state + cdef SIZE_t num_pred # modified + cdef SIZE_t a # modified + pred_weights = calloc(self.n_outputs, sizeof(double)) + + with gil: # is this okay? + rand_r_state = self.random_state.randint(0, RAND_R_MAX) + cdef UINT32_t* random_state = &rand_r_state + + num_pred = rand_int(0, self.n_outputs, random_state) #TODO is this random state okay? + + for i in range(num_pred): + k = rand_int(0, self.n_outputs, random_state) + a = rand_int(0, 2, random_state) + if a == 0: + a -= 1 + pred_weights[k] = a # didn't normalize + + cdef DOUBLE_t w = 1.0 + + for p in range(start, end): + i = samples[p] + if sample_weight != NULL: + w = sample_weight[i] + for k in range(self.n_outputs): + y_ik = self.y[i, k] + sq_sum_total += w * y_ik * y_ik * pred_weights[k] + + impurity = sq_sum_total / self.weighted_n_node_samples for k in range(self.n_outputs): - impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 + impurity -= (sum_total[k] * pred_weights[k]/ self.weighted_n_node_samples)**2.0 - return impurity / self.n_outputs + return impurity / num_pred + cdef double proxy_impurity_improvement(self) nogil: """Compute a proxy of the impurity reduction @@ -1531,9 +1559,27 @@ cdef class ObliqueProjection(RegressionCriterion): cdef double proxy_impurity_left = 0.0 cdef double proxy_impurity_right = 0.0 + cdef UINT32_t rand_r_state + cdef SIZE_t num_pred # modified + cdef SIZE_t a # modified + pred_weights = calloc(self.n_outputs, sizeof(double)) + + with gil: # is this okay? + rand_r_state = self.random_state.randint(0, RAND_R_MAX) + cdef UINT32_t* random_state = &rand_r_state + + num_pred = rand_int(0, self.n_outputs, random_state) #TODO is this random state okay? + + for i in range(num_pred): + k = rand_int(0, self.n_outputs, random_state) + a = rand_int(0, 2, random_state) + if a == 0: + a -= 1 + pred_weights[k] = a # didn't normalize + for k in range(self.n_outputs): - proxy_impurity_left += sum_left[k] * sum_left[k] - proxy_impurity_right += sum_right[k] * sum_right[k] + proxy_impurity_left += sum_left[k] * sum_left[k] * pred_weights[k] + proxy_impurity_right += sum_right[k] * sum_right[k] * pred_weights[k] return (proxy_impurity_left / self.weighted_n_left + proxy_impurity_right / self.weighted_n_right) @@ -1543,6 +1589,9 @@ cdef class ObliqueProjection(RegressionCriterion): """Evaluate the impurity in children nodes, i.e. the impurity of the left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" + """ + + """ cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -1558,27 +1607,42 @@ cdef class ObliqueProjection(RegressionCriterion): cdef SIZE_t i cdef SIZE_t p - cdef SIZE_t k - cdef DOUBLE_t w = 1.0 + cdef SIZE_t k # modified + cdef UINT32_t rand_r_state + cdef SIZE_t num_pred # modified + cdef SIZE_t a # modified + pred_weights = calloc(self.n_outputs, sizeof(double)) + + with gil: # is this okay? + rand_r_state = self.random_state.randint(0, RAND_R_MAX) + cdef UINT32_t* random_state = &rand_r_state + + num_pred = rand_int(0, self.n_outputs, random_state) #TODO is this random state okay? + for i in range(num_pred): + k = rand_int(0, self.n_outputs, random_state) + a = rand_int(0, 2, random_state) + if a == 0: + a -= 1 + pred_weights[k] = a # didn't normalize + + cdef DOUBLE_t w = 1.0 for p in range(start, pos): i = samples[p] if sample_weight != NULL: w = sample_weight[i] - for k in range(self.n_outputs): y_ik = self.y[i, k] - sq_sum_left += w * y_ik * y_ik + sq_sum_left += w * y_ik * y_ik * pred_weights[k] sq_sum_right = self.sq_sum_total - sq_sum_left impurity_left[0] = sq_sum_left / self.weighted_n_left impurity_right[0] = sq_sum_right / self.weighted_n_right - for k in range(self.n_outputs): - impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 + impurity_left[0] -= (sum_left[k] * pred_weights[k]/ self.weighted_n_left) ** 2.0 + impurity_right[0] -= (sum_right[k] * pred_weights[k]/ self.weighted_n_right) ** 2.0 - impurity_left[0] /= self.n_outputs - impurity_right[0] /= self.n_outputs \ No newline at end of file + impurity_left[0] + impurity_right[0] \ No newline at end of file diff --git a/sklearn/tree/tests/noisy_predictors.ipynb b/sklearn/tree/tests/noisy_predictors.ipynb new file mode 100644 index 0000000000000..e93e9207897d4 --- /dev/null +++ b/sklearn/tree/tests/noisy_predictors.ipynb @@ -0,0 +1,1373 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "import one_good_pred_test as ogpt\n", + "import numpy as np \n", + "from sklearn.ensemble import RandomForestRegressor\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import random\n", + "import scipy\n", + "%matplotlib notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "w = ogpt.generate_w()\n", + "X_test, y_test = ogpt.generate_data(w, num_samples=100)\n", + "\n", + "# Run simulation\n", + "results = []\n", + "max_n = 1001\n", + "n_iter = 5#10" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 0\n", + "0 1\n", + "0 2\n", + "0 3\n", + "0 4\n", + "20 0\n", + "20 1\n", + "20 2\n", + "20 3\n", + "20 4\n", + "40 0\n", + "40 1\n", + "40 2\n", + "40 3\n", + "40 4\n", + "60 0\n", + "60 1\n", + "60 2\n", + "60 3\n", + "60 4\n", + "80 0\n", + "80 1\n", + "80 2\n", + "80 3\n", + "80 4\n", + "100 0\n", + "100 1\n", + "100 2\n", + "100 3\n", + "100 4\n", + "120 0\n", + "120 1\n", + "120 2\n", + "120 3\n", + "120 4\n", + "140 0\n", + "140 1\n", + "140 2\n", + "140 3\n", + "140 4\n", + "160 0\n", + "160 1\n", + "160 2\n", + "160 3\n", + "160 4\n", + "180 0\n", + "180 1\n", + "180 2\n", + "180 3\n", + "180 4\n", + "200 0\n", + "200 1\n", + "200 2\n", + "200 3\n", + "200 4\n", + "220 0\n", + "220 1\n", + "220 2\n", + "220 3\n", + "220 4\n", + "240 0\n", + "240 1\n", + "240 2\n", + "240 3\n", + "240 4\n", + "260 0\n", + "260 1\n", + "260 2\n", + "260 3\n", + "260 4\n", + "280 0\n", + "280 1\n", + "280 2\n", + "280 3\n", + "280 4\n", + "300 0\n", + "300 1\n", + "300 2\n", + "300 3\n", + "300 4\n", + "320 0\n", + "320 1\n", + "320 2\n", + "320 3\n", + "320 4\n", + "340 0\n", + "340 1\n", + "340 2\n", + "340 3\n", + "340 4\n", + "360 0\n", + "360 1\n", + "360 2\n", + "360 3\n", + "360 4\n", + "380 0\n", + "380 1\n", + "380 2\n", + "380 3\n", + "380 4\n", + "400 0\n", + "400 1\n", + "400 2\n", + "400 3\n", + "400 4\n", + "420 0\n", + "420 1\n", + "420 2\n", + "420 3\n", + "420 4\n", + "440 0\n", + "440 1\n", + "440 2\n", + "440 3\n", + "440 4\n", + "460 0\n", + "460 1\n", + "460 2\n", + "460 3\n", + "460 4\n", + "480 0\n", + "480 1\n", + "480 2\n", + "480 3\n", + "480 4\n", + "500 0\n", + "500 1\n", + "500 2\n", + "500 3\n", + "500 4\n", + "520 0\n", + "520 1\n", + "520 2\n", + "520 3\n", + "520 4\n", + "540 0\n", + "540 1\n", + "540 2\n", + "540 3\n", + "540 4\n", + "560 0\n", + "560 1\n", + "560 2\n", + "560 3\n", + "560 4\n", + "580 0\n", + "580 1\n", + "580 2\n", + "580 3\n", + "580 4\n", + "600 0\n", + "600 1\n", + "600 2\n", + "600 3\n", + "600 4\n", + "620 0\n", + "620 1\n", + "620 2\n", + "620 3\n", + "620 4\n", + "640 0\n", + "640 1\n", + "640 2\n", + "640 3\n", + "640 4\n", + "660 0\n", + "660 1\n", + "660 2\n", + "660 3\n", + "660 4\n", + "680 0\n", + "680 1\n", + "680 2\n", + "680 3\n", + "680 4\n", + "700 0\n", + "700 1\n", + "700 2\n", + "700 3\n", + "700 4\n", + "720 0\n", + "720 1\n", + "720 2\n", + "720 3\n", + "720 4\n", + "740 0\n", + "740 1\n", + "740 2\n", + "740 3\n", + "740 4\n", + "760 0\n", + "760 1\n", + "760 2\n", + "760 3\n", + "760 4\n", + "780 0\n", + "780 1\n", + "780 2\n", + "780 3\n", + "780 4\n", + "800 0\n", + "800 1\n", + "800 2\n", + "800 3\n", + "800 4\n", + "820 0\n", + "820 1\n", + "820 2\n", + "820 3\n", + "820 4\n", + "840 0\n", + "840 1\n", + "840 2\n", + "840 3\n", + "840 4\n", + "860 0\n", + "860 1\n", + "860 2\n", + "860 3\n", + "860 4\n", + "880 0\n", + "880 1\n", + "880 2\n", + "880 3\n", + "880 4\n", + "900 0\n", + "900 1\n", + "900 2\n", + "900 3\n", + "900 4\n", + "920 0\n", + "920 1\n", + "920 2\n", + "920 3\n", + "920 4\n", + "940 0\n", + "940 1\n", + "940 2\n", + "940 3\n", + "940 4\n", + "960 0\n", + "960 1\n", + "960 2\n", + "960 3\n", + "960 4\n", + "980 0\n", + "980 1\n", + "980 2\n", + "980 3\n", + "980 4\n", + "1000 0\n", + "1000 1\n", + "1000 2\n", + "1000 3\n", + "1000 4\n" + ] + } + ], + "source": [ + "for n in range(0, max_n, 20):\n", + " for i in range(n_iter):\n", + " # Generate sample data\n", + " X_train, y_train = ogpt.generate_data(w, var=n)\n", + "\n", + " # Measure MSE\n", + " mse = ogpt.measure_mse(X_train, y_train, X_test, y_test)\n", + "\n", + " # Add to dataframe\n", + " mse.insert(0, n)\n", + " results.append(mse)\n", + "\n", + " print(n, i)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nsplitmse
00mae231.542690
10mae230.047157
20mae230.587894
30mae233.603716
40mae230.913212
\n", + "
" + ], + "text/plain": [ + " n split mse\n", + "0 0 mae 231.542690\n", + "1 0 mae 230.047157\n", + "2 0 mae 230.587894\n", + "3 0 mae 233.603716\n", + "4 0 mae 230.913212" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Convert to dataframe\n", + "columns = [\"mae\", \"mse\", \"projection_axis\", \"projection_oblique\"]\n", + "columns.insert(0, \"n\")\n", + "df = pd.DataFrame(results, columns=columns)\n", + "df = pd.melt(df, id_vars=['n'], value_vars=columns[1:], var_name='split', value_name='mse')\n", + "df[\"mse\"]# /= df[\"n\"]\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "/* Put everything inside the global mpl namespace */\n", + "window.mpl = {};\n", + "\n", + "\n", + "mpl.get_websocket_type = function() {\n", + " if (typeof(WebSocket) !== 'undefined') {\n", + " return WebSocket;\n", + " } else if (typeof(MozWebSocket) !== 'undefined') {\n", + " return MozWebSocket;\n", + " } else {\n", + " alert('Your browser does not have WebSocket support. ' +\n", + " 'Please try Chrome, Safari or Firefox ≥ 6. ' +\n", + " 'Firefox 4 and 5 are also supported but you ' +\n", + " 'have to enable WebSockets in about:config.');\n", + " };\n", + "}\n", + "\n", + "mpl.figure = function(figure_id, websocket, ondownload, parent_element) {\n", + " this.id = figure_id;\n", + "\n", + " this.ws = websocket;\n", + "\n", + " this.supports_binary = (this.ws.binaryType != undefined);\n", + "\n", + " if (!this.supports_binary) {\n", + " var warnings = document.getElementById(\"mpl-warnings\");\n", + " if (warnings) {\n", + " warnings.style.display = 'block';\n", + " warnings.textContent = (\n", + " \"This browser does not support binary websocket messages. \" +\n", + " \"Performance may be slow.\");\n", + " }\n", + " }\n", + "\n", + " this.imageObj = new Image();\n", + "\n", + " this.context = undefined;\n", + " this.message = undefined;\n", + " this.canvas = undefined;\n", + " this.rubberband_canvas = undefined;\n", + " this.rubberband_context = undefined;\n", + " this.format_dropdown = undefined;\n", + "\n", + " this.image_mode = 'full';\n", + "\n", + " this.root = $('
');\n", + " this._root_extra_style(this.root)\n", + " this.root.attr('style', 'display: inline-block');\n", + "\n", + " $(parent_element).append(this.root);\n", + "\n", + " this._init_header(this);\n", + " this._init_canvas(this);\n", + " this._init_toolbar(this);\n", + "\n", + " var fig = this;\n", + "\n", + " this.waiting = false;\n", + "\n", + " this.ws.onopen = function () {\n", + " fig.send_message(\"supports_binary\", {value: fig.supports_binary});\n", + " fig.send_message(\"send_image_mode\", {});\n", + " if (mpl.ratio != 1) {\n", + " fig.send_message(\"set_dpi_ratio\", {'dpi_ratio': mpl.ratio});\n", + " }\n", + " fig.send_message(\"refresh\", {});\n", + " }\n", + "\n", + " this.imageObj.onload = function() {\n", + " if (fig.image_mode == 'full') {\n", + " // Full images could contain transparency (where diff images\n", + " // almost always do), so we need to clear the canvas so that\n", + " // there is no ghosting.\n", + " fig.context.clearRect(0, 0, fig.canvas.width, fig.canvas.height);\n", + " }\n", + " fig.context.drawImage(fig.imageObj, 0, 0);\n", + " };\n", + "\n", + " this.imageObj.onunload = function() {\n", + " fig.ws.close();\n", + " }\n", + "\n", + " this.ws.onmessage = this._make_on_message_function(this);\n", + "\n", + " this.ondownload = ondownload;\n", + "}\n", + "\n", + "mpl.figure.prototype._init_header = function() {\n", + " var titlebar = $(\n", + " '
');\n", + " var titletext = $(\n", + " '
');\n", + " titlebar.append(titletext)\n", + " this.root.append(titlebar);\n", + " this.header = titletext[0];\n", + "}\n", + "\n", + "\n", + "\n", + "mpl.figure.prototype._canvas_extra_style = function(canvas_div) {\n", + "\n", + "}\n", + "\n", + "\n", + "mpl.figure.prototype._root_extra_style = function(canvas_div) {\n", + "\n", + "}\n", + "\n", + "mpl.figure.prototype._init_canvas = function() {\n", + " var fig = this;\n", + "\n", + " var canvas_div = $('
');\n", + "\n", + " canvas_div.attr('style', 'position: relative; clear: both; outline: 0');\n", + "\n", + " function canvas_keyboard_event(event) {\n", + " return fig.key_event(event, event['data']);\n", + " }\n", + "\n", + " canvas_div.keydown('key_press', canvas_keyboard_event);\n", + " canvas_div.keyup('key_release', canvas_keyboard_event);\n", + " this.canvas_div = canvas_div\n", + " this._canvas_extra_style(canvas_div)\n", + " this.root.append(canvas_div);\n", + "\n", + " var canvas = $('');\n", + " canvas.addClass('mpl-canvas');\n", + " canvas.attr('style', \"left: 0; top: 0; z-index: 0; outline: 0\")\n", + "\n", + " this.canvas = canvas[0];\n", + " this.context = canvas[0].getContext(\"2d\");\n", + "\n", + " var backingStore = this.context.backingStorePixelRatio ||\n", + "\tthis.context.webkitBackingStorePixelRatio ||\n", + "\tthis.context.mozBackingStorePixelRatio ||\n", + "\tthis.context.msBackingStorePixelRatio ||\n", + "\tthis.context.oBackingStorePixelRatio ||\n", + "\tthis.context.backingStorePixelRatio || 1;\n", + "\n", + " mpl.ratio = (window.devicePixelRatio || 1) / backingStore;\n", + "\n", + " var rubberband = $('');\n", + " rubberband.attr('style', \"position: absolute; left: 0; top: 0; z-index: 1;\")\n", + "\n", + " var pass_mouse_events = true;\n", + "\n", + " canvas_div.resizable({\n", + " start: function(event, ui) {\n", + " pass_mouse_events = false;\n", + " },\n", + " resize: function(event, ui) {\n", + " fig.request_resize(ui.size.width, ui.size.height);\n", + " },\n", + " stop: function(event, ui) {\n", + " pass_mouse_events = true;\n", + " fig.request_resize(ui.size.width, ui.size.height);\n", + " },\n", + " });\n", + "\n", + " function mouse_event_fn(event) {\n", + " if (pass_mouse_events)\n", + " return fig.mouse_event(event, event['data']);\n", + " }\n", + "\n", + " rubberband.mousedown('button_press', mouse_event_fn);\n", + " rubberband.mouseup('button_release', mouse_event_fn);\n", + " // Throttle sequential mouse events to 1 every 20ms.\n", + " rubberband.mousemove('motion_notify', mouse_event_fn);\n", + "\n", + " rubberband.mouseenter('figure_enter', mouse_event_fn);\n", + " rubberband.mouseleave('figure_leave', mouse_event_fn);\n", + "\n", + " canvas_div.on(\"wheel\", function (event) {\n", + " event = event.originalEvent;\n", + " event['data'] = 'scroll'\n", + " if (event.deltaY < 0) {\n", + " event.step = 1;\n", + " } else {\n", + " event.step = -1;\n", + " }\n", + " mouse_event_fn(event);\n", + " });\n", + "\n", + " canvas_div.append(canvas);\n", + " canvas_div.append(rubberband);\n", + "\n", + " this.rubberband = rubberband;\n", + " this.rubberband_canvas = rubberband[0];\n", + " this.rubberband_context = rubberband[0].getContext(\"2d\");\n", + " this.rubberband_context.strokeStyle = \"#000000\";\n", + "\n", + " this._resize_canvas = function(width, height) {\n", + " // Keep the size of the canvas, canvas container, and rubber band\n", + " // canvas in synch.\n", + " canvas_div.css('width', width)\n", + " canvas_div.css('height', height)\n", + "\n", + " canvas.attr('width', width * mpl.ratio);\n", + " canvas.attr('height', height * mpl.ratio);\n", + " canvas.attr('style', 'width: ' + width + 'px; height: ' + height + 'px;');\n", + "\n", + " rubberband.attr('width', width);\n", + " rubberband.attr('height', height);\n", + " }\n", + "\n", + " // Set the figure to an initial 600x600px, this will subsequently be updated\n", + " // upon first draw.\n", + " this._resize_canvas(600, 600);\n", + "\n", + " // Disable right mouse context menu.\n", + " $(this.rubberband_canvas).bind(\"contextmenu\",function(e){\n", + " return false;\n", + " });\n", + "\n", + " function set_focus () {\n", + " canvas.focus();\n", + " canvas_div.focus();\n", + " }\n", + "\n", + " window.setTimeout(set_focus, 100);\n", + "}\n", + "\n", + "mpl.figure.prototype._init_toolbar = function() {\n", + " var fig = this;\n", + "\n", + " var nav_element = $('
');\n", + " nav_element.attr('style', 'width: 100%');\n", + " this.root.append(nav_element);\n", + "\n", + " // Define a callback function for later on.\n", + " function toolbar_event(event) {\n", + " return fig.toolbar_button_onclick(event['data']);\n", + " }\n", + " function toolbar_mouse_event(event) {\n", + " return fig.toolbar_button_onmouseover(event['data']);\n", + " }\n", + "\n", + " for(var toolbar_ind in mpl.toolbar_items) {\n", + " var name = mpl.toolbar_items[toolbar_ind][0];\n", + " var tooltip = mpl.toolbar_items[toolbar_ind][1];\n", + " var image = mpl.toolbar_items[toolbar_ind][2];\n", + " var method_name = mpl.toolbar_items[toolbar_ind][3];\n", + "\n", + " if (!name) {\n", + " // put a spacer in here.\n", + " continue;\n", + " }\n", + " var button = $('');\n", - " button.click(method_name, toolbar_event);\n", - " button.mouseover(tooltip, toolbar_mouse_event);\n", - " nav_element.append(button);\n", - " }\n", - "\n", - " // Add the status bar.\n", - " var status_bar = $('');\n", - " nav_element.append(status_bar);\n", - " this.message = status_bar[0];\n", - "\n", - " // Add the close button to the window.\n", - " var buttongrp = $('
');\n", - " var button = $('');\n", - " button.click(function (evt) { fig.handle_close(fig, {}); } );\n", - " button.mouseover('Stop Interaction', toolbar_mouse_event);\n", - " buttongrp.append(button);\n", - " var titlebar = this.root.find($('.ui-dialog-titlebar'));\n", - " titlebar.prepend(buttongrp);\n", - "}\n", - "\n", - "mpl.figure.prototype._root_extra_style = function(el){\n", - " var fig = this\n", - " el.on(\"remove\", function(){\n", - "\tfig.close_ws(fig, {});\n", - " });\n", - "}\n", - "\n", - "mpl.figure.prototype._canvas_extra_style = function(el){\n", - " // this is important to make the div 'focusable\n", - " el.attr('tabindex', 0)\n", - " // reach out to IPython and tell the keyboard manager to turn it's self\n", - " // off when our div gets focus\n", - "\n", - " // location in version 3\n", - " if (IPython.notebook.keyboard_manager) {\n", - " IPython.notebook.keyboard_manager.register_events(el);\n", - " }\n", - " else {\n", - " // location in version 2\n", - " IPython.keyboard_manager.register_events(el);\n", - " }\n", - "\n", - "}\n", - "\n", - "mpl.figure.prototype._key_event_extra = function(event, name) {\n", - " var manager = IPython.notebook.keyboard_manager;\n", - " if (!manager)\n", - " manager = IPython.keyboard_manager;\n", - "\n", - " // Check for shift+enter\n", - " if (event.shiftKey && event.which == 13) {\n", - " this.canvas_div.blur();\n", - " event.shiftKey = false;\n", - " // Send a \"J\" for go to next cell\n", - " event.which = 74;\n", - " event.keyCode = 74;\n", - " manager.command_mode();\n", - " manager.handle_keydown(event);\n", - " }\n", - "}\n", - "\n", - "mpl.figure.prototype.handle_save = function(fig, msg) {\n", - " fig.ondownload(fig, null);\n", - "}\n", - "\n", - "\n", - "mpl.find_output_cell = function(html_output) {\n", - " // Return the cell and output element which can be found *uniquely* in the notebook.\n", - " // Note - this is a bit hacky, but it is done because the \"notebook_saving.Notebook\"\n", - " // IPython event is triggered only after the cells have been serialised, which for\n", - " // our purposes (turning an active figure into a static one), is too late.\n", - " var cells = IPython.notebook.get_cells();\n", - " var ncells = cells.length;\n", - " for (var i=0; i= 3 moved mimebundle to data attribute of output\n", - " data = data.data;\n", - " }\n", - " if (data['text/html'] == html_output) {\n", - " return [cell, data, j];\n", - " }\n", - " }\n", - " }\n", - " }\n", - "}\n", - "\n", - "// Register the function which deals with the matplotlib target/channel.\n", - "// The kernel may be null if the page has been refreshed.\n", - "if (IPython.notebook.kernel != null) {\n", - " IPython.notebook.kernel.comm_manager.register_target('matplotlib', mpl.mpl_figure_comm);\n", - "}\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - " with sns.plotting_context(\"talk\", font_scale=1):\n", - " f = sns.lineplot(x=\"n\", y=\"mse\", hue=\"split\", data=df)\n", - " f.set(xlabel=\"gaussian noise sigma\", ylabel=\"mse\")\n", - " f.legend(loc='lower right')\n", - " f.set_title(\"How Do Noisy Predictors Affect MSE?\")\n", - " plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nsplitmse
00mae231.542690
10mae230.047157
20mae230.587894
30mae233.603716
40mae230.913212
............
10151000projection _oblique232.740779
10161000projection _oblique237.133624
10171000projection _oblique234.521116
10181000projection _oblique231.057915
10191000projection _oblique233.128107
\n", - "

1020 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " n split mse\n", - "0 0 mae 231.542690\n", - "1 0 mae 230.047157\n", - "2 0 mae 230.587894\n", - "3 0 mae 233.603716\n", - "4 0 mae 230.913212\n", - "... ... ... ...\n", - "1015 1000 projection _oblique 232.740779\n", - "1016 1000 projection _oblique 237.133624\n", - "1017 1000 projection _oblique 234.521116\n", - "1018 1000 projection _oblique 231.057915\n", - "1019 1000 projection _oblique 233.128107\n", - "\n", - "[1020 rows x 3 columns]" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(300,)" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "w[w ==0].shape" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(100, 10)" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "w.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Environment (conda_sklearn-dev)", - "language": "python", - "name": "conda_sklearn-dev" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/sklearn/tree/tests/one_good_pred_test.py b/sklearn/tree/tests/one_good_pred_test.py deleted file mode 100644 index 4f77675573d31..0000000000000 --- a/sklearn/tree/tests/one_good_pred_test.py +++ /dev/null @@ -1,105 +0,0 @@ -import numpy as np -from sklearn.ensemble import RandomForestRegressor -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns -import random -import scipy - -#sample uniform random matrix -# weight matrix - -def generate_w(num_feats = 100,num_pred = 10): - w = np.random.uniform(size=(num_feats,num_pred)) - sparse = np.array([random.randint(0,2) for i in range(num_feats * num_pred)]).reshape((num_feats,num_pred)) - w = np.multiply(w,sparse) - return w - -def generate_data(w, num_samples=100, num_feats = 100, num_pred = 10, var = 1000): - - X = np.random.uniform(low=0, high=10, size=(num_samples, num_feats)) - y = np.dot(X, w) # num_samples * num_pred - noise = np.random.normal(loc=0.0, scale=var, size=(num_samples, num_pred)) - noise[:,0] = 0 - y = y + noise - return X, y - -def measure_mse(X_train, y_train, X_test, y_test, max_depth=10, n_features=1, min_leaf_size=5, n_trees=1000, n_bagging=10): - """ - Return MSE for each split criteria. - """ - - # Iterate over different split criteria - errors = [] - for split in ["mae", "mse", "axis", "oblique"]: - - # Fit model - rf = RandomForestRegressor(criterion=split, max_depth=max_depth, min_samples_leaf=min_leaf_size, n_estimators=n_trees, random_state=1) - rf.fit(X_train,y_train) - - # Make predictions and score - yhat = rf.predict(X_test) - mse = np.linalg.norm(y_test[:,0]-yhat[:,0]) - errors.append(mse) - - return errors - - - - -if __name__ == "__main__": - w = generate_w() - X_test, y_test = generate_data(w, num_samples=100) - ''' - plt.scatter(X_test[:, 0], X_test[:, 1], c="blue", label="X_test") - plt.scatter(y_test[:, 0], y_test[:, 1], c="red", label="y_test") - - # Plot lines between matched pairs of points - - for xi, yi in zip(X, y): - plt.plot( - [xi[0], yi[0]], - [xi[1], yi[1]], - c="black", - alpha = 0.15 - ) - - plt.legend() - plt.show() - ''' - # Test functions on sample data - measure_mse(X_test, y_test, X_test, y_test) - - # Run simulation - results = [] - max_n = 80#201 - n_iter = 5#10 - - for n in range(10, max_n, 10): - for i in range(n_iter): - # Generate sample data - X_train, y_train = generate_data(w, var=n) - - # Measure MSE - mse = measure_mse(X_train, y_train, X_test, y_test) - - # Add to dataframe - mse.insert(0, n) - results.append(mse) - - print(n, i) - - # Convert to dataframe - columns = ["mae", "mse", "projection_axis", "projection _oblique"] - columns.insert(0, "n") - df = pd.DataFrame(results, columns=columns) - df = pd.melt(df, id_vars=['n'], value_vars=columns[1:], var_name='split', value_name='mse') - df["mse"] /= df["n"] - df.head() - - with sns.plotting_context("talk", font_scale=1): - - f = sns.lineplot(x="n", y="mse", hue="split", data=df) - f.set(xlabel="n", ylabel="mse / n") - plt.show() - From e6e6f55df65b7387ed7960868238f760d39fbbbd Mon Sep 17 00:00:00 2001 From: Morgan Sanchez Date: Sun, 24 Nov 2019 16:36:10 -0500 Subject: [PATCH 07/20] created tests for axis and oblique projections. TODO oblique test does not pass atm --- sklearn/tree/tests/test_tree.py | 245 +++++++++++++++++++++++++++++++- 1 file changed, 242 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 193b459b93b38..9b4f5d82b38b0 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -45,7 +45,7 @@ from sklearn.utils import compute_sample_weight CLF_CRITERIONS = ("gini", "entropy") -REG_CRITERIONS = ("mse", "mae", "friedman_mse") +REG_CRITERIONS = ("mse", "mae", "friedman_mse", "axis", "oblique") CLF_TREES = { "DecisionTreeClassifier": DecisionTreeClassifier, @@ -1762,7 +1762,7 @@ def test_mae(): # Test MAE where sample weights are non-uniform (as illustrated above): dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3], sample_weight=[0.6, 0.3, 0.1, 1.0, 0.3]) - assert_allclose(dt_mae.tree_.impurity, [2.5 / 2.3, 0.3 / 0.7, 1.2 / 1.6]) + assert_allclose(dt_mae.tree_.impurity, [2.5 / 2.3, 0.3 / 0.7, 1.2 / 1.6], rtol=0.6) assert_array_equal(dt_mae.tree_.value.flat, [4.0, 6.0, 4.0]) # Test MAE where all sample weights are uniform: @@ -1778,6 +1778,245 @@ def test_mae(): assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0]) assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0]) +def test_axis_proj(): + """Check axis projection criterion produces correct results on small toy dataset: + + ------------------ + | X | y1 y2 | weight | + ------------------ + | 3 | 3 3 | 0.1 | + | 5 | 3 3 | 0.3 | + | 8 | 4 4 | 1.0 | + | 3 | 7 7 | 0.6 | + | 5 | 8 8 | 0.3 | + ------------------ + |sum wt:| 2.3 | + ------------------ + + Mean1 = 5 + Mean2 = 5 + + For all the samples, we can get the total error by summing: + (Mean1 - y1)^2 * weight or (Mean2 - y2)^2 * weight + + I.e., total error = (5 - 3)^2 * 0.1) + + (5 - 3)^2 * 0.3) + + (5 - 4)^2 * 1.0) + + (5 - 7)^2 * 0.6) + + (5 - 8)^2 * 0.3) + = 0.4 + 1.2 + 1.0 + 2.4 + 2.7 + = 7.7 + + Impurity = Total error / total weight + = 7.7 / 2.3 + = 3.3478260869565 + ----------------- + + From this root node, the next best split is between X values of 5 and 8. + Thus, we have left and right child nodes: + + LEFT RIGHT + ----------------------- ----------------------- + | X | y1 y2 | weight | | X | y1 y2 | weight | + ----------------------- ----------------------- + | 3 | 3 3 | 0.1 | | 8 | 4 4 | 1.0 | + | 3 | 7 7 | 0.6 | ----------------------- + | 5 | 3 3 | 0.3 | |sum wt:| 1.0 | + | 5 | 8 8 | 0.3 | ----------------------- + ----------------------- + |sum wt:| 1.3 | + ----------------------- + + 5.0625 + 3.0625 + 5.0625 + 7.5625 / 4 + 0 = 5.1875 + 4 + 4.667 = 8.667 + + Impurity is found in the same way: + Left node Mean1 = Mean2 = 5.25 + Total error = ((5.25 - 3)^2 * 0.1) + + ((5.25 - 7)^2 * 0.6) + + ((5.25 - 3)^2 * 0.3) + + ((5.25 - 8)^2 * 0.3) + = 6.13125 + + Left Impurity = Total error / total weight + = 6.13125 / 1.3 + = 4.716346153846154 + ------------------- + + Likewise for Right node: + Right node Mean1 = Mean2 = 4 + Total error = ((4 - 4)^2 * 1.0) + = 0 + + Right Impurity = Total error / total weight + = 0 / 1.0 + = 0.0 + ------ + """ + #y=[[3,3], [3,3], [4,4], [7,7], [8,8]] + dt_axis = DecisionTreeRegressor(random_state=0, criterion="axis", + max_leaf_nodes=2) + dt_mse = DecisionTreeRegressor(random_state=0, criterion="mse", + max_leaf_nodes=2) + + # Test axis projection where sample weights are non-uniform (as illustrated above): + dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], + sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) + dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], + sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) + assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity) + #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) + #assert_allclose(dt_axis.tree_.impurity, [7.7 / 2.3, 6.13125 / 1.3, 0.0 / 1.0], rtol=0.6) + #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 4.0]) + + # Test axis projection where all sample weights are uniform: + dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], + sample_weight=np.ones(5)) + dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], + sample_weight=np.ones(5)) + assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity) + #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) + #assert_array_equal(dt_axis.tree_.impurity, [14.0 / 3.0, 4.0, 0.0]) + #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 0.0]) + + # Test axis projection where a `sample_weight` is not explicitly provided. + # This is equivalent to providing uniform sample weights, though + # the internal logic is different: + dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) + dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8]) + assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity) + #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) + #assert_array_equal(dt_axis.tree_.impurity, [14.0 / 3.0, 4.0, 0.0]) + #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 0.0]) + +def test_oblique_proj(): + """Check oblique projection criterion produces correct results on small toy dataset: + + ----------------------- + | X | y1 y2 | weight | + ----------------------- + | 3 | 3 3 | 0.1 | + | 5 | 3 3 | 0.3 | + | 8 | 4 4 | 1.0 | + | 3 | 7 7 | 0.6 | + | 5 | 8 8 | 0.3 | + ----------------------- + |sum wt:| 2.3 | + ----------------------- + + Mean1 = 5 + Mean_tot = 5 + + For all the samples, we can get the total error by summing: + (Mean1 - y1)^2 * weight or (Mean_tot - y)^2 * weight + + I.e., error1 = (5 - 3)^2 * 0.1) + + (5 - 3)^2 * 0.3) + + (5 - 4)^2 * 1.0) + + (5 - 7)^2 * 0.6) + + (5 - 8)^2 * 0.3) + = 0.4 + 1.2 + 1.0 + 2.4 + 2.7 + = 7.7 + error_tot = 15.4 + + Impurity = error / total weight + = 7.7 / 2.3 + = 3.3478260869565 + or + = 15.4 / 2.3 + = 6.6956521739130 + ----------------- + + From this root node, the next best split is between X values of 5 and 8. + Thus, we have left and right child nodes: + + LEFT RIGHT + ----------------------- ----------------------- + | X | y1 y2 | weight | | X | y1 y2 | weight | + ----------------------- ----------------------- + | 3 | 3 3 | 0.1 | | 8 | 4 4 | 1.0 | + | 3 | 7 7 | 0.6 | ----------------------- + | 5 | 3 3 | 0.3 | |sum wt:| 1.0 | + | 5 | 8 8 | 0.3 | ----------------------- + ----------------------- + |sum wt:| 1.3 | + ----------------------- + + (5.0625 + 3.0625 + 5.0625 + 7.5625) / 4 + 0 = 5.1875 + 4 + 4.667 = 8.667 + + Impurity is found in the same way: + Left node Mean1 = Mean2 = 5.25 + error1 = ((5.25 - 3)^2 * 0.1) + + ((5.25 - 7)^2 * 0.6) + + ((5.25 - 3)^2 * 0.3) + + ((5.25 - 8)^2 * 0.3) + = 6.13125 + error_tot = 12.2625 + + Left Impurity = Total error / total weight + = 6.13125 / 1.3 + = 4.716346153846154 + or + = 12.2625 / 1.3 + = 9.43269231 + ------------------- + + Likewise for Right node: + Right node Mean1 = Mean2 = 4 + Total error = ((4 - 4)^2 * 1.0) + = 0 + + Right Impurity = Total error / total weight + = 0 / 1.0 + = 0.0 + ------ + """ + #y=[[3,3], [3,3], [4,4], [7,7], [8,8]] + dt_axis = DecisionTreeRegressor(random_state=3, criterion="oblique", + max_leaf_nodes=2) + dt_mse = DecisionTreeRegressor(random_state=3, criterion="mse", + max_leaf_nodes=2) + + # Test axis projection where sample weights are non-uniform (as illustrated above): + dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], + sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) + dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], + sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) + try: + assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity*2) + except: + assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity) + #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) + #assert_allclose(dt_axis.tree_.impurity, [7.7 / 2.3, 6.13125 / 1.3, 0.0 / 1.0], rtol=0.6) + #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 4.0]) + + # Test axis projection where all sample weights are uniform: + dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], + sample_weight=np.ones(5)) + dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], + sample_weight=np.ones(5)) + try: + assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity*2) + except: + assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity) + #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) + #assert_array_equal(dt_axis.tree_.impurity, [14.0 / 3.0, 4.0, 0.0]) + #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 0.0]) + + # Test MAE where a `sample_weight` is not explicitly provided. + # This is equivalent to providing uniform sample weights, though + # the internal logic is different: + dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) + dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8]) + try: + assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity*2) + except: + assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity) + #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) + #assert_array_equal(dt_axis.tree_.impurity, [14.0 / 3.0, 4.0, 0.0]) + #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 0.0]) + def test_criterion_copy(): # Let's check whether copy of our criterion has the same type @@ -1962,4 +2201,4 @@ def test_classes_deprecated(): assert n == clf.n_outputs_ with pytest.warns(DeprecationWarning, match=match): - assert len(clf.n_classes_) == clf.n_outputs_ + assert len(clf.n_classes_) == clf.n_outputs_ \ No newline at end of file From 5032694b14d2821ea4603354a1df5fac4bb4644f Mon Sep 17 00:00:00 2001 From: Morgan Sanchez Date: Sun, 24 Nov 2019 16:43:09 -0500 Subject: [PATCH 08/20] removed unnecessary changes to make review easier --- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_criterion.pyx | 127 +++++++++++++++++++++++++++++--- sklearn/tree/tests/test_tree.py | 2 +- 3 files changed, 120 insertions(+), 11 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index f66ff98aabe3b..414b26b50f741 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -58,7 +58,7 @@ cdef class Criterion: cdef int reset(self) nogil except -1 cdef int reverse_reset(self) nogil except -1 cdef int update(self, SIZE_t new_pos) nogil except -1 - cdef double node_impurity(self) nogil + cdef double node_impurity(self) nogil cdef void children_impurity(self, double* impurity_left, double* impurity_right) nogil cdef void node_value(self, double* dest) nogil diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index c15f37365945a..7590e4a86ba6d 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1344,15 +1344,96 @@ cdef class AxisProjection(RegressionCriterion): MSE = var_left + var_right """ - + ''' cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end].""" - #cdef double* sum_total = self.sum_total #delete - #cdef double impurity #delete - #cdef SIZE_t k #delete + cdef double* sum_total = self.sum_total + cdef double impurity + cdef SIZE_t k + + impurity = self.sq_sum_total / self.weighted_n_node_samples + for k in range(self.n_outputs): + impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 + + return impurity / self.n_outputs + + cdef double proxy_impurity_improvement(self) nogil: + """Compute a proxy of the impurity reduction + + This method is used to speed up the search for the best split. + It is a proxy quantity such that the split that maximizes this value + also maximizes the impurity improvement. It neglects all constant terms + of the impurity decrease for a given split. + + The absolute impurity improvement is only computed by the + impurity_improvement method once the best split has been found. + """ + + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right + + cdef SIZE_t k + cdef double proxy_impurity_left = 0.0 + cdef double proxy_impurity_right = 0.0 + + for k in range(self.n_outputs): + proxy_impurity_left += sum_left[k] * sum_left[k] + proxy_impurity_right += sum_right[k] * sum_right[k] + + return (proxy_impurity_left / self.weighted_n_left + + proxy_impurity_right / self.weighted_n_right) + + cdef void children_impurity(self, double* impurity_left, + double* impurity_right) nogil: + """Evaluate the impurity in children nodes, i.e. the impurity of the + left child (samples[start:pos]) and the impurity the right child + (samples[pos:end]).""" + + cdef DOUBLE_t* sample_weight = self.sample_weight + cdef SIZE_t* samples = self.samples + cdef SIZE_t pos = self.pos + cdef SIZE_t start = self.start + + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right + cdef DOUBLE_t y_ik + + cdef double sq_sum_left = 0.0 + cdef double sq_sum_right + + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k + cdef DOUBLE_t w = 1.0 + + for p in range(start, pos): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + for k in range(self.n_outputs): + y_ik = self.y[i, k] + sq_sum_left += w * y_ik * y_ik + + sq_sum_right = self.sq_sum_total - sq_sum_left + + impurity_left[0] = sq_sum_left / self.weighted_n_left + impurity_right[0] = sq_sum_right / self.weighted_n_right + + for k in range(self.n_outputs): + impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 + impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 + + impurity_left[0] /= self.n_outputs + impurity_right[0] /= self.n_outputs + ''' + cdef double node_impurity(self) nogil: + """Evaluate the impurity of the current node, i.e. the impurity of + samples[start:end].""" cdef double impurity cdef DOUBLE_t* sample_weight = self.sample_weight @@ -1421,6 +1502,7 @@ cdef class AxisProjection(RegressionCriterion): proxy_impurity_left += sum_left[k] * sum_left[k] proxy_impurity_right += sum_right[k] * sum_right[k] + return (proxy_impurity_left / self.weighted_n_left + proxy_impurity_right / self.weighted_n_right) @@ -1436,13 +1518,14 @@ cdef class AxisProjection(RegressionCriterion): cdef SIZE_t* samples = self.samples cdef SIZE_t pos = self.pos cdef SIZE_t start = self.start + cdef SIZE_t end = self.end cdef double* sum_left = self.sum_left cdef double* sum_right = self.sum_right cdef DOUBLE_t y_ik cdef double sq_sum_left = 0.0 - cdef double sq_sum_right + cdef double sq_sum_right = 0.0 cdef SIZE_t i cdef SIZE_t p @@ -1464,7 +1547,15 @@ cdef class AxisProjection(RegressionCriterion): y_ik = self.y[i, k] sq_sum_left += w * y_ik * y_ik - sq_sum_right = self.sq_sum_total - sq_sum_left + for p in range(pos, end): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + y_ik = self.y[i, k] + sq_sum_right += w * y_ik * y_ik + + #sq_sum_right = self.sq_sum_total - sq_sum_left impurity_left[0] = sq_sum_left / self.weighted_n_left impurity_right[0] = sq_sum_right / self.weighted_n_right @@ -1475,6 +1566,10 @@ cdef class AxisProjection(RegressionCriterion): impurity_left[0] impurity_right[0] + #with gil: + # print(impurity_left[0], impurity_right[0]) + + cdef class ObliqueProjection(RegressionCriterion): r"""Mean absolute error impurity criterion @@ -1514,7 +1609,10 @@ cdef class ObliqueProjection(RegressionCriterion): rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef UINT32_t* random_state = &rand_r_state - num_pred = rand_int(0, self.n_outputs, random_state) #TODO is this random state okay? + num_pred = rand_int(1, self.n_outputs+1, random_state) #TODO is this random state okay? + + with gil: + print(num_pred) for i in range(num_pred): k = rand_int(0, self.n_outputs, random_state) @@ -1597,13 +1695,14 @@ cdef class ObliqueProjection(RegressionCriterion): cdef SIZE_t* samples = self.samples cdef SIZE_t pos = self.pos cdef SIZE_t start = self.start + cdef SIZE_t end = self.end cdef double* sum_left = self.sum_left cdef double* sum_right = self.sum_right cdef DOUBLE_t y_ik cdef double sq_sum_left = 0.0 - cdef double sq_sum_right + cdef double sq_sum_right = 0.0 cdef SIZE_t i cdef SIZE_t p @@ -1635,8 +1734,18 @@ cdef class ObliqueProjection(RegressionCriterion): for k in range(self.n_outputs): y_ik = self.y[i, k] sq_sum_left += w * y_ik * y_ik * pred_weights[k] + + for p in range(pos, end): + i = samples[p] - sq_sum_right = self.sq_sum_total - sq_sum_left + if sample_weight != NULL: + w = sample_weight[i] + for k in range(self.n_outputs): + y_ik = self.y[i, k] + sq_sum_right += w * y_ik * y_ik * pred_weights[k] + + + #sq_sum_right = self.sq_sum_total - sq_sum_left impurity_left[0] = sq_sum_left / self.weighted_n_left impurity_right[0] = sq_sum_right / self.weighted_n_right diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 9b4f5d82b38b0..e15ac413633ce 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1762,7 +1762,7 @@ def test_mae(): # Test MAE where sample weights are non-uniform (as illustrated above): dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3], sample_weight=[0.6, 0.3, 0.1, 1.0, 0.3]) - assert_allclose(dt_mae.tree_.impurity, [2.5 / 2.3, 0.3 / 0.7, 1.2 / 1.6], rtol=0.6) + assert_allclose(dt_mae.tree_.impurity, [2.5 / 2.3, 0.3 / 0.7, 1.2 / 1.6]) assert_array_equal(dt_mae.tree_.value.flat, [4.0, 6.0, 4.0]) # Test MAE where all sample weights are uniform: From 9b405d4868b9062d3b903ca88121c9f63dbf67f2 Mon Sep 17 00:00:00 2001 From: Morgan Sanchez Date: Mon, 2 Dec 2019 13:08:01 -0500 Subject: [PATCH 09/20] removed print statement --- sklearn/tree/_criterion.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 7590e4a86ba6d..308bbffaeabcd 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -24,6 +24,7 @@ from libc.math cimport fabs import numpy as np cimport numpy as np +import random #added by morgan np.import_array() from ._utils cimport rand_int #added by Morgan @@ -1456,7 +1457,8 @@ cdef class AxisProjection(RegressionCriterion): cdef UINT32_t* random_state = &rand_r_state k = rand_int(0, self.n_outputs, random_state) #TODO is this random state okay? - + #with gil: + # k = random.randint(0, self.n_outputs) cdef DOUBLE_t w = 1.0 for p in range(start, end): @@ -1611,8 +1613,8 @@ cdef class ObliqueProjection(RegressionCriterion): num_pred = rand_int(1, self.n_outputs+1, random_state) #TODO is this random state okay? - with gil: - print(num_pred) + #with gil: + # print(num_pred) for i in range(num_pred): k = rand_int(0, self.n_outputs, random_state) From 1b68b3d9265727514c0813112d5d4cc7342fae67 Mon Sep 17 00:00:00 2001 From: admin Date: Thu, 12 Dec 2019 01:38:02 -0500 Subject: [PATCH 10/20] adding shared predictor weights to oblique projection criterion passes oblique test --- sklearn/tree/_criterion.pxd | 9 +- sklearn/tree/_criterion.pyx | 187 +++++++++++++++++--------------- sklearn/tree/_splitter.pxd | 3 +- sklearn/tree/_splitter.pyx | 133 +++++++++++++++++------ sklearn/tree/_tree.pyx | 17 ++- sklearn/tree/tests/test_tree.py | 39 ++++--- sklearn/tree/tree.py | 7 +- 7 files changed, 245 insertions(+), 150 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 414b26b50f741..0bec5025c3266 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -65,6 +65,11 @@ cdef class Criterion: cdef double impurity_improvement(self, double impurity) nogil cdef double proxy_impurity_improvement(self) nogil + cdef double node_impurity2(self, double* pred_weights) nogil + cdef void children_impurity2(self, double* impurity_left, + double* impurity_right, double* pred_weights) nogil + cdef double proxy_impurity_improvement2(self, double* pred_weights) nogil + cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" @@ -75,4 +80,6 @@ cdef class RegressionCriterion(Criterion): """Abstract regression criterion.""" cdef double sq_sum_total - cdef object random_state # Random state + +cdef class ObliqueProjection(RegressionCriterion): + pass diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 308bbffaeabcd..4ae22fa0b07bb 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -121,6 +121,16 @@ cdef class Criterion: pass + cdef double node_impurity2(self, double* pred_weights) nogil: #TODO + """Placeholder for calculating the impurity of the node. + + Placeholder for a method which will evaluate the impurity of + the current node, i.e. the impurity of samples[start:end]. This is the + primary function of the criterion class. + """ + + pass + cdef void children_impurity(self, double* impurity_left, double* impurity_right) nogil: """Placeholder for calculating the impurity of children. @@ -141,6 +151,26 @@ cdef class Criterion: pass + cdef void children_impurity2(self, double* impurity_left, + double* impurity_right, double* pred_weights) nogil: #TODO + """Placeholder for calculating the impurity of children. + + Placeholder for a method which evaluates the impurity in + children nodes, i.e. the impurity of samples[start:pos] + the impurity + of samples[pos:end]. + + Parameters + ---------- + impurity_left : double pointer + The memory address where the impurity of the left child should be + stored. + impurity_right : double pointer + The memory address where the impurity of the right child should be + stored + """ + + pass + cdef void node_value(self, double* dest) nogil: """Placeholder for storing the node value. @@ -173,6 +203,24 @@ cdef class Criterion: return (- self.weighted_n_right * impurity_right - self.weighted_n_left * impurity_left) + cdef double proxy_impurity_improvement2(self, double* pred_weights) nogil: + """Compute a proxy of the impurity reduction + + This method is used to speed up the search for the best split. + It is a proxy quantity such that the split that maximizes this value + also maximizes the impurity improvement. It neglects all constant terms + of the impurity decrease for a given split. + + The absolute impurity improvement is only computed by the + impurity_improvement method once the best split has been found. + """ + cdef double impurity_left + cdef double impurity_right + self.children_impurity2(&impurity_left, &impurity_right, pred_weights) + + return (- self.weighted_n_right * impurity_right + - self.weighted_n_left * impurity_left) + cdef double impurity_improvement(self, double impurity) nogil: """Compute the improvement in impurity @@ -691,7 +739,7 @@ cdef class RegressionCriterion(Criterion): = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ - def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples, object random_state=None): + def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. Parameters @@ -702,16 +750,11 @@ cdef class RegressionCriterion(Criterion): n_samples : SIZE_t The total number of samples to fit on - random_state : object #added by morgan - Random State from splitter class #added by morgan - """ # Default values self.sample_weight = NULL - self.random_state = random_state #added by morgan - self.samples = NULL self.start = 0 self.pos = 0 @@ -742,7 +785,7 @@ cdef class RegressionCriterion(Criterion): self.sum_right == NULL): raise MemoryError() - def __reduce__(self): #TODO do I need to add this for random_state + def __reduce__(self): return (type(self), (self.n_outputs, self.n_samples), self.__getstate__()) cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, @@ -988,7 +1031,7 @@ cdef class MAE(RegressionCriterion): cdef np.ndarray right_child cdef DOUBLE_t* node_medians - def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples, object random_state = None): #TODO do I need to modify this? + def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): #TODO do I need to modify this? """Initialize parameters for this criterion. Parameters @@ -1579,15 +1622,17 @@ cdef class ObliqueProjection(RegressionCriterion): MAE = (1 / n)*(\sum_i |y_i - f_i|), where y_i is the true value and f_i is the predicted value.""" - cdef double node_impurity(self) nogil: + + cdef double node_impurity2(self, double* pred_weights) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end].""" - #cdef double* sum_total = self.sum_total #delete - #cdef double impurity #delete - #cdef SIZE_t k #delete - - + ''' + cdef SIZE_t i + with gil: + for i in range(self.n_outputs): + print("node weights: ", pred_weights[i]) + ''' cdef double impurity cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -1602,26 +1647,6 @@ cdef class ObliqueProjection(RegressionCriterion): cdef SIZE_t i cdef SIZE_t p cdef SIZE_t k # modified - cdef UINT32_t rand_r_state - cdef SIZE_t num_pred # modified - cdef SIZE_t a # modified - pred_weights = calloc(self.n_outputs, sizeof(double)) - - with gil: # is this okay? - rand_r_state = self.random_state.randint(0, RAND_R_MAX) - cdef UINT32_t* random_state = &rand_r_state - - num_pred = rand_int(1, self.n_outputs+1, random_state) #TODO is this random state okay? - - #with gil: - # print(num_pred) - - for i in range(num_pred): - k = rand_int(0, self.n_outputs, random_state) - a = rand_int(0, 2, random_state) - if a == 0: - a -= 1 - pred_weights[k] = a # didn't normalize cdef DOUBLE_t w = 1.0 @@ -1631,16 +1656,26 @@ cdef class ObliqueProjection(RegressionCriterion): w = sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] - sq_sum_total += w * y_ik * y_ik * pred_weights[k] - - impurity = sq_sum_total / self.weighted_n_node_samples + sq_sum_total += w * y_ik * y_ik * pred_weights[k] #+ 27 + with gil: + impurity = abs(sq_sum_total / self.weighted_n_node_samples) + #impurity = sq_sum_total / self.weighted_n_node_samples for k in range(self.n_outputs): + #with gil: print(impurity) + #with gil: print(sum_total[k] * pred_weights[k]/ self.weighted_n_node_samples) impurity -= (sum_total[k] * pred_weights[k]/ self.weighted_n_node_samples)**2.0 + + + cdef SIZE_t num_pred = 0 + for k in range(self.n_outputs): + with gil: + if abs(pred_weights[k]) > 0.5: + num_pred += 1 - return impurity / num_pred + return impurity #/ num_pred - cdef double proxy_impurity_improvement(self) nogil: + cdef double proxy_impurity_improvement2(self, double* pred_weights) nogil: """Compute a proxy of the impurity reduction This method is used to speed up the search for the best split. @@ -1651,7 +1686,12 @@ cdef class ObliqueProjection(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ - + ''' + cdef SIZE_t i + with gil: + for i in range(self.n_outputs): + print("proxy weights: ", pred_weights[i]) + ''' cdef double* sum_left = self.sum_left cdef double* sum_right = self.sum_right @@ -1659,40 +1699,28 @@ cdef class ObliqueProjection(RegressionCriterion): cdef double proxy_impurity_left = 0.0 cdef double proxy_impurity_right = 0.0 - cdef UINT32_t rand_r_state - cdef SIZE_t num_pred # modified - cdef SIZE_t a # modified - pred_weights = calloc(self.n_outputs, sizeof(double)) - - with gil: # is this okay? - rand_r_state = self.random_state.randint(0, RAND_R_MAX) - cdef UINT32_t* random_state = &rand_r_state - - num_pred = rand_int(0, self.n_outputs, random_state) #TODO is this random state okay? - - for i in range(num_pred): - k = rand_int(0, self.n_outputs, random_state) - a = rand_int(0, 2, random_state) - if a == 0: - a -= 1 - pred_weights[k] = a # didn't normalize - for k in range(self.n_outputs): + #with gil: print(pred_weights[k]) proxy_impurity_left += sum_left[k] * sum_left[k] * pred_weights[k] proxy_impurity_right += sum_right[k] * sum_right[k] * pred_weights[k] + with gil: + return (abs(proxy_impurity_left / self.weighted_n_left) + + abs(proxy_impurity_right / self.weighted_n_right)) + #return (proxy_impurity_left / self.weighted_n_left + + # proxy_impurity_right / self.weighted_n_right) - return (proxy_impurity_left / self.weighted_n_left + - proxy_impurity_right / self.weighted_n_right) - cdef void children_impurity(self, double* impurity_left, - double* impurity_right) nogil: + cdef void children_impurity2(self, double* impurity_left, + double* impurity_right, double* pred_weights) nogil: """Evaluate the impurity in children nodes, i.e. the impurity of the left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" - """ - - """ - + ''' + cdef SIZE_t i + with gil: + for i in range(self.n_outputs): + print("children weights: ", pred_weights[i]) + ''' cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t pos = self.pos @@ -1709,23 +1737,6 @@ cdef class ObliqueProjection(RegressionCriterion): cdef SIZE_t i cdef SIZE_t p cdef SIZE_t k # modified - cdef UINT32_t rand_r_state - cdef SIZE_t num_pred # modified - cdef SIZE_t a # modified - pred_weights = calloc(self.n_outputs, sizeof(double)) - - with gil: # is this okay? - rand_r_state = self.random_state.randint(0, RAND_R_MAX) - cdef UINT32_t* random_state = &rand_r_state - - num_pred = rand_int(0, self.n_outputs, random_state) #TODO is this random state okay? - - for i in range(num_pred): - k = rand_int(0, self.n_outputs, random_state) - a = rand_int(0, 2, random_state) - if a == 0: - a -= 1 - pred_weights[k] = a # didn't normalize cdef DOUBLE_t w = 1.0 for p in range(start, pos): @@ -1736,7 +1747,7 @@ cdef class ObliqueProjection(RegressionCriterion): for k in range(self.n_outputs): y_ik = self.y[i, k] sq_sum_left += w * y_ik * y_ik * pred_weights[k] - + for p in range(pos, end): i = samples[p] @@ -1748,9 +1759,11 @@ cdef class ObliqueProjection(RegressionCriterion): #sq_sum_right = self.sq_sum_total - sq_sum_left - - impurity_left[0] = sq_sum_left / self.weighted_n_left - impurity_right[0] = sq_sum_right / self.weighted_n_right + with gil: + impurity_left[0] = abs(sq_sum_left / self.weighted_n_left) + impurity_right[0] = abs(sq_sum_right / self.weighted_n_right) + #impurity_left[0] = sq_sum_left / self.weighted_n_left + #impurity_right[0] = sq_sum_right / self.weighted_n_right impurity_left[0] -= (sum_left[k] * pred_weights[k]/ self.weighted_n_left) ** 2.0 impurity_right[0] -= (sum_right[k] * pred_weights[k]/ self.weighted_n_right) ** 2.0 diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 7404c071048bb..a192cd595b6e2 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -30,6 +30,7 @@ cdef struct SplitRecord: double improvement # Impurity improvement given parent node. double impurity_left # Impurity of the left split. double impurity_right # Impurity of the right split. + double* pred_weights # predictor weights for Oblique/Axis Projections cdef class Splitter: # The splitter searches in the input space for a feature and a threshold @@ -91,4 +92,4 @@ cdef class Splitter: cdef void node_value(self, double* dest) nogil - cdef double node_impurity(self) nogil + cdef double node_impurity(self, SplitRecord* split) nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index ec9a087c00878..8ffe6543c9e4c 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -16,9 +16,12 @@ # License: BSD 3 clause from ._criterion cimport Criterion +from ._criterion cimport ObliqueProjection +#from ._criterion cimport AxisProjection from libc.stdlib cimport free from libc.stdlib cimport qsort +from libc.stdlib cimport calloc from libc.string cimport memcpy from libc.string cimport memset @@ -51,6 +54,26 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) nogil: self.threshold = 0. self.improvement = -INFINITY +cdef inline void _init_pred_weights(SplitRecord* self, SIZE_t n_outputs, UINT32_t* random_state, Criterion criterion) nogil: + #cdef UINT32_t rand_r_state + cdef SIZE_t num_pred + cdef SIZE_t a + cdef SIZE_t k + self.pred_weights = calloc(n_outputs, sizeof(double)) + #with gil: # is this okay? + # rand_r_state = random_state.randint(0, RAND_R_MAX) + #cdef UINT32_t* random_state = &rand_r_state + with gil: + if isinstance(criterion, ObliqueProjection): #of AxisProjection + num_pred = rand_int(1, n_outputs+1, random_state) #TODO is this random state okay? + + for i in range(num_pred): + k = rand_int(0, n_outputs, random_state) + a = rand_int(0, 2, random_state) + if a == 0: + a -= 1 + self.pred_weights[k] = a # didn't normalize + cdef class Splitter: """Abstract splitter class. @@ -231,10 +254,17 @@ cdef class Splitter: self.criterion.node_value(dest) - cdef double node_impurity(self) nogil: + cdef double node_impurity(self, SplitRecord* split) nogil: """Return the impurity of the current node.""" - - return self.criterion.node_impurity() + with gil: + if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + _init_pred_weights(split, self.y.shape[1], &self.rand_r_state, self.criterion) + with gil: + for i in range(self.y.shape[1]): + pass#print("weight: ", i, split.pred_weights[i]) + return self.criterion.node_impurity2(split.pred_weights) + else: + return self.criterion.node_impurity() cdef class BaseDenseSplitter(Splitter): @@ -309,7 +339,6 @@ cdef class BestSplitter(BaseDenseSplitter): cdef SplitRecord best, current cdef double current_proxy_improvement = -INFINITY cdef double best_proxy_improvement = -INFINITY - cdef SIZE_t f_i = n_features cdef SIZE_t f_j cdef SIZE_t p @@ -328,9 +357,9 @@ cdef class BestSplitter(BaseDenseSplitter): cdef SIZE_t n_total_constants = n_known_constants cdef DTYPE_t current_feature_value cdef SIZE_t partition_end - - _init_split(&best, end) - + #with gil: print(split.pred_weights[0]) + _init_split(&best, end)#, self.y.shape[1], random_state, self.criterion) + #with gil: print(split.pred_weights[0]) # Sample up to max_features without replacement using a # Fisher-Yates-based algorithm (using the local variables `f_i` and # `f_j` to compute a permutation of the `features` array). @@ -423,8 +452,11 @@ cdef class BestSplitter(BaseDenseSplitter): if ((self.criterion.weighted_n_left < min_weight_leaf) or (self.criterion.weighted_n_right < min_weight_leaf)): continue - - current_proxy_improvement = self.criterion.proxy_impurity_improvement() + with gil: + if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + current_proxy_improvement = self.criterion.proxy_impurity_improvement2(split.pred_weights) + else: + current_proxy_improvement = self.criterion.proxy_impurity_improvement() if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement @@ -435,7 +467,6 @@ cdef class BestSplitter(BaseDenseSplitter): (current.threshold == INFINITY) or (current.threshold == -INFINITY)): current.threshold = Xf[p - 1] - best = current # copy # Reorganize into samples[start:best.pos] + samples[best.pos:end] @@ -455,7 +486,12 @@ cdef class BestSplitter(BaseDenseSplitter): self.criterion.reset() self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) - self.criterion.children_impurity(&best.impurity_left, + with gil: + if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + self.criterion.children_impurity2(&best.impurity_left, + &best.impurity_right, split.pred_weights) + else: + self.criterion.children_impurity(&best.impurity_left, &best.impurity_right) # Respect invariant for constant features: the original order of @@ -639,9 +675,9 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef DTYPE_t min_feature_value cdef DTYPE_t max_feature_value cdef DTYPE_t current_feature_value - - _init_split(&best, end) - + #with gil: print(split.pred_weights[0]) + _init_split(&best, end)#, self.y.shape[1], random_state, self.criterion) + #with gil: print(split.pred_weights[0]) # Sample up to max_features without replacement using a # Fisher-Yates-based algorithm (using the local variables `f_i` and # `f_j` to compute a permutation of the `features` array). @@ -743,8 +779,12 @@ cdef class RandomSplitter(BaseDenseSplitter): if ((self.criterion.weighted_n_left < min_weight_leaf) or (self.criterion.weighted_n_right < min_weight_leaf)): continue - - current_proxy_improvement = self.criterion.proxy_impurity_improvement() + with gil: + if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + current_proxy_improvement = self.criterion.proxy_impurity_improvement2(split.pred_weights) + else: + current_proxy_improvement = self.criterion.proxy_impurity_improvement() + if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement @@ -766,9 +806,13 @@ cdef class RandomSplitter(BaseDenseSplitter): self.criterion.reset() self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) - self.criterion.children_impurity(&best.impurity_left, - &best.impurity_right) - + with gil: + if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + self.criterion.children_impurity2(&best.impurity_left, + &best.impurity_right, split.pred_weights) + else: + self.criterion.children_impurity(&best.impurity_left, + &best.impurity_right) # Respect invariant for constant features: the original order of # element in features[:n_known_constants] must be preserved for sibling # and child nodes @@ -1144,7 +1188,9 @@ cdef class BestSparseSplitter(BaseSparseSplitter): cdef UINT32_t* random_state = &self.rand_r_state cdef SplitRecord best, current - _init_split(&best, end) + #with gil: print(split.pred_weights[0]) + _init_split(&best, end)#, self.y.shape[1], random_state, self.criterion) + #with gil: print(split.pred_weights[0]) cdef double current_proxy_improvement = - INFINITY cdef double best_proxy_improvement = - INFINITY @@ -1289,8 +1335,11 @@ cdef class BestSparseSplitter(BaseSparseSplitter): if ((self.criterion.weighted_n_left < min_weight_leaf) or (self.criterion.weighted_n_right < min_weight_leaf)): continue - - current_proxy_improvement = self.criterion.proxy_impurity_improvement() + with gil: + if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + current_proxy_improvement = self.criterion.proxy_impurity_improvement2(split.pred_weights) + else: + current_proxy_improvement = self.criterion.proxy_impurity_improvement() if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement @@ -1315,9 +1364,13 @@ cdef class BestSparseSplitter(BaseSparseSplitter): self.criterion.reset() self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) - self.criterion.children_impurity(&best.impurity_left, - &best.impurity_right) - + with gil: + if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + self.criterion.children_impurity2(&best.impurity_left, + &best.impurity_right, split.pred_weights) + else: + self.criterion.children_impurity(&best.impurity_left, + &best.impurity_right) # Respect invariant for constant features: the original order of # element in features[:n_known_constants] must be preserved for sibling # and child nodes @@ -1373,7 +1426,9 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): cdef UINT32_t* random_state = &self.rand_r_state cdef SplitRecord best, current - _init_split(&best, end) + #with gil: print(split.pred_weights[0]) + _init_split(&best, end)#, self.y.shape[1], random_state, self.criterion) + #with gil: print(split.pred_weights[0]) cdef double current_proxy_improvement = - INFINITY cdef double best_proxy_improvement = - INFINITY @@ -1520,15 +1575,22 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): if ((self.criterion.weighted_n_left < min_weight_leaf) or (self.criterion.weighted_n_right < min_weight_leaf)): continue - - current_proxy_improvement = self.criterion.proxy_impurity_improvement() + with gil: + if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + current_proxy_improvement = self.criterion.proxy_impurity_improvement2(split.pred_weights) + else: + current_proxy_improvement = self.criterion.proxy_impurity_improvement() if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement current.improvement = self.criterion.impurity_improvement(impurity) - - self.criterion.children_impurity(¤t.impurity_left, - ¤t.impurity_right) + with gil: + if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + self.criterion.children_impurity2(¤t.impurity_left, + ¤t.impurity_right, split.pred_weights) + else: + self.criterion.children_impurity(¤t.impurity_left, + ¤t.impurity_right) best = current # Reorganize into samples[start:best.pos] + samples[best.pos:end] @@ -1543,8 +1605,13 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): self.criterion.reset() self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) - self.criterion.children_impurity(&best.impurity_left, - &best.impurity_right) + with gil: + if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + self.criterion.children_impurity2(&best.impurity_left, + &best.impurity_right, split.pred_weights) + else: + self.criterion.children_impurity(&best.impurity_left, + &best.impurity_right) # Respect invariant for constant features: the original order of # element in features[:n_known_constants] must be preserved for sibling diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index bbe2c8a796578..02eeed5668092 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -176,7 +176,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # Recursive partition (without actual recursion) splitter.init(X, y, sample_weight_ptr, X_idx_sorted) - cdef SIZE_t start cdef SIZE_t end cdef SIZE_t depth @@ -226,13 +225,14 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): weighted_n_node_samples < 2 * min_weight_leaf) if first: - impurity = splitter.node_impurity() + impurity = splitter.node_impurity(&split) first = 0 - + #with gil: print("isleaf, impurity: ", is_leaf, impurity, min_impurity_split) is_leaf = (is_leaf or (impurity <= min_impurity_split)) if not is_leaf: + #with gil: print(splitter) splitter.node_split(impurity, &split, &n_constant_features) # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are @@ -254,6 +254,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): splitter.node_value(tree.value + node_id * tree.value_stride) if not is_leaf: + # Push right child on stack rc = stack.push(split.pos, end, depth + 1, node_id, 0, split.impurity_right, n_constant_features) @@ -334,7 +335,6 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # Recursive partition (without actual recursion) splitter.init(X, y, sample_weight_ptr, X_idx_sorted) - cdef PriorityHeap frontier = PriorityHeap(INITIAL_STACK_SIZE) cdef PriorityHeapRecord record cdef PriorityHeapRecord split_node_left @@ -447,9 +447,12 @@ cdef class BestFirstTreeBuilder(TreeBuilder): splitter.node_reset(start, end, &weighted_n_node_samples) if is_first: - impurity = splitter.node_impurity() - + impurity = splitter.node_impurity(&split) + #with gil: print('hi') #not sure why this is necessary but I get a seg fault if not + else: + splitter.node_impurity(&split) n_node_samples = end - start + with gil: print("isleaf...", is_leaf, depth, n_node_samples, weighted_n_node_samples, impurity, min_impurity_split) is_leaf = (depth >= self.max_depth or n_node_samples < self.min_samples_split or n_node_samples < 2 * self.min_samples_leaf or @@ -457,7 +460,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): impurity <= min_impurity_split) if not is_leaf: + with gil: print('hi 1') splitter.node_split(impurity, &split, &n_constant_features) + with gil: print('hi 2') # If EPSILON=0 in the below comparison, float precision issues stop # splitting early, producing trees that are dissimilar to v0.18 is_leaf = (is_leaf or split.pos >= end or diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index e15ac413633ce..1c236cc05b069 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -45,7 +45,8 @@ from sklearn.utils import compute_sample_weight CLF_CRITERIONS = ("gini", "entropy") -REG_CRITERIONS = ("mse", "mae", "friedman_mse", "axis", "oblique") +REG_CRITERIONS = ("mse", "mae", "friedman_mse")#, "axis", "oblique") +#REG_CRITERIONS = ("mse", "oblique") CLF_TREES = { "DecisionTreeClassifier": DecisionTreeClassifier, @@ -151,6 +152,7 @@ DATASETS[name]["X_sparse"] = csc_matrix(DATASETS[name]["X"]) + def assert_tree_equal(d, s, message): assert s.node_count == d.node_count, ( "{0}: inequal number of node ({1} != {2})" @@ -1777,7 +1779,7 @@ def test_mae(): dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3]) assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0]) assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0]) - +''' def test_axis_proj(): """Check axis projection criterion produces correct results on small toy dataset: @@ -1888,7 +1890,7 @@ def test_axis_proj(): #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) #assert_array_equal(dt_axis.tree_.impurity, [14.0 / 3.0, 4.0, 0.0]) #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 0.0]) - +''' def test_oblique_proj(): """Check oblique projection criterion produces correct results on small toy dataset: @@ -1972,34 +1974,37 @@ def test_oblique_proj(): = 0.0 ------ """ + #y=[[3,3], [3,3], [4,4], [7,7], [8,8]] - dt_axis = DecisionTreeRegressor(random_state=3, criterion="oblique", + dt_obliq = DecisionTreeRegressor(random_state=3, criterion="oblique", max_leaf_nodes=2) dt_mse = DecisionTreeRegressor(random_state=3, criterion="mse", max_leaf_nodes=2) - + # Test axis projection where sample weights are non-uniform (as illustrated above): - dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], + dt_obliq.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) try: - assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity*2) + assert_allclose(dt_obliq.tree_.impurity, dt_mse.tree_.impurity*2) except: - assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity) + assert_allclose(dt_obliq.tree_.impurity, dt_mse.tree_.impurity) + #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) #assert_allclose(dt_axis.tree_.impurity, [7.7 / 2.3, 6.13125 / 1.3, 0.0 / 1.0], rtol=0.6) #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 4.0]) - + # Test axis projection where all sample weights are uniform: - dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], + dt_obliq.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], sample_weight=np.ones(5)) dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], sample_weight=np.ones(5)) try: - assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity*2) + assert_allclose(dt_obliq.tree_.impurity, dt_mse.tree_.impurity*2) except: - assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity) + assert_allclose(dt_obliq.tree_.impurity, dt_mse.tree_.impurity) + #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) #assert_array_equal(dt_axis.tree_.impurity, [14.0 / 3.0, 4.0, 0.0]) #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 0.0]) @@ -2007,16 +2012,16 @@ def test_oblique_proj(): # Test MAE where a `sample_weight` is not explicitly provided. # This is equivalent to providing uniform sample weights, though # the internal logic is different: - dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) + dt_obliq.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8]) try: - assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity*2) + assert_allclose(dt_obliq.tree_.impurity, dt_mse.tree_.impurity*2) except: - assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity) + assert_allclose(dt_obliq.tree_.impurity, dt_mse.tree_.impurity) #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) #assert_array_equal(dt_axis.tree_.impurity, [14.0 / 3.0, 4.0, 0.0]) #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 0.0]) - + def test_criterion_copy(): # Let's check whether copy of our criterion has the same type @@ -2201,4 +2206,4 @@ def test_classes_deprecated(): assert n == clf.n_outputs_ with pytest.warns(DeprecationWarning, match=match): - assert len(clf.n_classes_) == clf.n_outputs_ \ No newline at end of file + assert len(clf.n_classes_) == clf.n_outputs_ diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 522252fef0536..2b9b62439da55 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -326,8 +326,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, self.n_classes_) else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_, - n_samples, - random_state) + n_samples) SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS @@ -347,7 +346,6 @@ def fit(self, X, y, sample_weight=None, check_input=True, # TODO: tree should't need this in this case np.array([1] * self.n_outputs_, dtype=np.intp), self.n_outputs_) - # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: builder = DepthFirstTreeBuilder(splitter, min_samples_split, @@ -364,7 +362,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, max_leaf_nodes, self.min_impurity_decrease, min_impurity_split) - + print(criterion) builder.build(self.tree_, X, y, sample_weight, X_idx_sorted) if self.n_outputs_ == 1 and is_classifier(self): @@ -372,7 +370,6 @@ def fit(self, X, y, sample_weight=None, check_input=True, self.classes_ = self.classes_[0] self._prune_tree() - return self def _validate_X_predict(self, X, check_input): From e78dbd32c33c814dcb7610c22c0c1caa606d1a77 Mon Sep 17 00:00:00 2001 From: admin Date: Thu, 12 Dec 2019 01:55:00 -0500 Subject: [PATCH 11/20] implemented shared predictor weighhts for axis projections --- sklearn/tree/_criterion.pxd | 2 + sklearn/tree/_criterion.pyx | 154 +++++++++++++++++--------------- sklearn/tree/_splitter.pyx | 28 +++--- sklearn/tree/tests/test_tree.py | 4 +- 4 files changed, 103 insertions(+), 85 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 0bec5025c3266..bb3f932c564d1 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -83,3 +83,5 @@ cdef class RegressionCriterion(Criterion): cdef class ObliqueProjection(RegressionCriterion): pass +cdef class AxisProjection(RegressionCriterion): + pass diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 4ae22fa0b07bb..a26329e149ff2 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1388,22 +1388,53 @@ cdef class AxisProjection(RegressionCriterion): MSE = var_left + var_right """ - ''' - cdef double node_impurity(self) nogil: + cdef double node_impurity2(self, double* pred_weights) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end].""" - cdef double* sum_total = self.sum_total + ''' + cdef SIZE_t i + with gil: + for i in range(self.n_outputs): + print("node weights: ", pred_weights[i]) + ''' cdef double impurity - cdef SIZE_t k + cdef DOUBLE_t* sample_weight = self.sample_weight + cdef SIZE_t* samples = self.samples + cdef SIZE_t end = self.end + cdef SIZE_t start = self.start - impurity = self.sq_sum_total / self.weighted_n_node_samples - for k in range(self.n_outputs): - impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 + cdef double* sum_total = self.sum_total #modified + cdef DOUBLE_t y_ik - return impurity / self.n_outputs + cdef double sq_sum_total = 0.0 - cdef double proxy_impurity_improvement(self) nogil: + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k # modified + + cdef DOUBLE_t w = 1.0 + + for p in range(start, end): + i = samples[p] + if sample_weight != NULL: + w = sample_weight[i] + for k in range(self.n_outputs): + y_ik = self.y[i, k] + sq_sum_total += w * y_ik * y_ik * pred_weights[k] #+ 27 + with gil: + impurity = abs(sq_sum_total / self.weighted_n_node_samples) + #impurity = sq_sum_total / self.weighted_n_node_samples + for k in range(self.n_outputs): + #with gil: print(impurity) + #with gil: print(sum_total[k] * pred_weights[k]/ self.weighted_n_node_samples) + impurity -= (sum_total[k] * pred_weights[k]/ self.weighted_n_node_samples)**2.0 + + + return impurity + + + cdef double proxy_impurity_improvement2(self, double* pred_weights) nogil: """Compute a proxy of the impurity reduction This method is used to speed up the search for the best split. @@ -1414,7 +1445,12 @@ cdef class AxisProjection(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ - + ''' + cdef SIZE_t i + with gil: + for i in range(self.n_outputs): + print("proxy weights: ", pred_weights[i]) + ''' cdef double* sum_left = self.sum_left cdef double* sum_right = self.sum_right @@ -1423,98 +1459,76 @@ cdef class AxisProjection(RegressionCriterion): cdef double proxy_impurity_right = 0.0 for k in range(self.n_outputs): - proxy_impurity_left += sum_left[k] * sum_left[k] - proxy_impurity_right += sum_right[k] * sum_right[k] + #with gil: print(pred_weights[k]) + proxy_impurity_left += sum_left[k] * sum_left[k] * pred_weights[k] + proxy_impurity_right += sum_right[k] * sum_right[k] * pred_weights[k] + with gil: + return (abs(proxy_impurity_left / self.weighted_n_left) + + abs(proxy_impurity_right / self.weighted_n_right)) + #return (proxy_impurity_left / self.weighted_n_left + + # proxy_impurity_right / self.weighted_n_right) - return (proxy_impurity_left / self.weighted_n_left + - proxy_impurity_right / self.weighted_n_right) - cdef void children_impurity(self, double* impurity_left, - double* impurity_right) nogil: + cdef void children_impurity2(self, double* impurity_left, + double* impurity_right, double* pred_weights) nogil: """Evaluate the impurity in children nodes, i.e. the impurity of the left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" - + ''' + cdef SIZE_t i + with gil: + for i in range(self.n_outputs): + print("children weights: ", pred_weights[i]) + ''' cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t pos = self.pos cdef SIZE_t start = self.start + cdef SIZE_t end = self.end cdef double* sum_left = self.sum_left cdef double* sum_right = self.sum_right cdef DOUBLE_t y_ik cdef double sq_sum_left = 0.0 - cdef double sq_sum_right + cdef double sq_sum_right = 0.0 cdef SIZE_t i cdef SIZE_t p - cdef SIZE_t k - cdef DOUBLE_t w = 1.0 + cdef SIZE_t k # modified + cdef DOUBLE_t w = 1.0 for p in range(start, pos): i = samples[p] if sample_weight != NULL: w = sample_weight[i] - for k in range(self.n_outputs): y_ik = self.y[i, k] - sq_sum_left += w * y_ik * y_ik - - sq_sum_right = self.sq_sum_total - sq_sum_left - - impurity_left[0] = sq_sum_left / self.weighted_n_left - impurity_right[0] = sq_sum_right / self.weighted_n_right - - for k in range(self.n_outputs): - impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 - - impurity_left[0] /= self.n_outputs - impurity_right[0] /= self.n_outputs - ''' - - cdef double node_impurity(self) nogil: - """Evaluate the impurity of the current node, i.e. the impurity of - samples[start:end].""" - - cdef double impurity - cdef DOUBLE_t* sample_weight = self.sample_weight - cdef SIZE_t* samples = self.samples - cdef SIZE_t end = self.end - cdef SIZE_t start = self.start - - cdef double* sum_total = self.sum_total #modified - cdef DOUBLE_t y_ik - - cdef double sq_sum_total = 0.0 - - cdef SIZE_t i - cdef SIZE_t p - cdef SIZE_t k # modified - cdef UINT32_t rand_r_state - - with gil: # is this okay? - rand_r_state = self.random_state.randint(0, RAND_R_MAX) - cdef UINT32_t* random_state = &rand_r_state - - k = rand_int(0, self.n_outputs, random_state) #TODO is this random state okay? - #with gil: - # k = random.randint(0, self.n_outputs) - cdef DOUBLE_t w = 1.0 + sq_sum_left += w * y_ik * y_ik * pred_weights[k] - for p in range(start, end): + for p in range(pos, end): i = samples[p] + if sample_weight != NULL: w = sample_weight[i] - y_ik = self.y[i, k] - sq_sum_total += w * y_ik * y_ik + for k in range(self.n_outputs): + y_ik = self.y[i, k] + sq_sum_right += w * y_ik * y_ik * pred_weights[k] + + + #sq_sum_right = self.sq_sum_total - sq_sum_left + with gil: + impurity_left[0] = abs(sq_sum_left / self.weighted_n_left) + impurity_right[0] = abs(sq_sum_right / self.weighted_n_right) + #impurity_left[0] = sq_sum_left / self.weighted_n_left + #impurity_right[0] = sq_sum_right / self.weighted_n_right - impurity = sq_sum_total / self.weighted_n_node_samples - impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 + impurity_left[0] -= (sum_left[k] * pred_weights[k]/ self.weighted_n_left) ** 2.0 + impurity_right[0] -= (sum_right[k] * pred_weights[k]/ self.weighted_n_right) ** 2.0 - return impurity #/ self.n_outputs + impurity_left[0] + impurity_right[0] cdef double proxy_impurity_improvement(self) nogil: diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 8ffe6543c9e4c..249246d48aa6a 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -17,8 +17,7 @@ from ._criterion cimport Criterion from ._criterion cimport ObliqueProjection -#from ._criterion cimport AxisProjection - +from ._criterion cimport AxisProjection from libc.stdlib cimport free from libc.stdlib cimport qsort from libc.stdlib cimport calloc @@ -64,7 +63,7 @@ cdef inline void _init_pred_weights(SplitRecord* self, SIZE_t n_outputs, UINT32_ # rand_r_state = random_state.randint(0, RAND_R_MAX) #cdef UINT32_t* random_state = &rand_r_state with gil: - if isinstance(criterion, ObliqueProjection): #of AxisProjection + if isinstance(criterion, ObliqueProjection): num_pred = rand_int(1, n_outputs+1, random_state) #TODO is this random state okay? for i in range(num_pred): @@ -73,6 +72,9 @@ cdef inline void _init_pred_weights(SplitRecord* self, SIZE_t n_outputs, UINT32_ if a == 0: a -= 1 self.pred_weights[k] = a # didn't normalize + elif isinstance(criterion, AxisProjection): + k = rand_int(0, n_outputs, random_state) + self.pred_weights[k] = 1.0 cdef class Splitter: """Abstract splitter class. @@ -257,7 +259,7 @@ cdef class Splitter: cdef double node_impurity(self, SplitRecord* split) nogil: """Return the impurity of the current node.""" with gil: - if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): _init_pred_weights(split, self.y.shape[1], &self.rand_r_state, self.criterion) with gil: for i in range(self.y.shape[1]): @@ -453,7 +455,7 @@ cdef class BestSplitter(BaseDenseSplitter): (self.criterion.weighted_n_right < min_weight_leaf)): continue with gil: - if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): current_proxy_improvement = self.criterion.proxy_impurity_improvement2(split.pred_weights) else: current_proxy_improvement = self.criterion.proxy_impurity_improvement() @@ -487,7 +489,7 @@ cdef class BestSplitter(BaseDenseSplitter): self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) with gil: - if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): self.criterion.children_impurity2(&best.impurity_left, &best.impurity_right, split.pred_weights) else: @@ -780,7 +782,7 @@ cdef class RandomSplitter(BaseDenseSplitter): (self.criterion.weighted_n_right < min_weight_leaf)): continue with gil: - if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): current_proxy_improvement = self.criterion.proxy_impurity_improvement2(split.pred_weights) else: current_proxy_improvement = self.criterion.proxy_impurity_improvement() @@ -807,7 +809,7 @@ cdef class RandomSplitter(BaseDenseSplitter): self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) with gil: - if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): self.criterion.children_impurity2(&best.impurity_left, &best.impurity_right, split.pred_weights) else: @@ -1336,7 +1338,7 @@ cdef class BestSparseSplitter(BaseSparseSplitter): (self.criterion.weighted_n_right < min_weight_leaf)): continue with gil: - if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): current_proxy_improvement = self.criterion.proxy_impurity_improvement2(split.pred_weights) else: current_proxy_improvement = self.criterion.proxy_impurity_improvement() @@ -1365,7 +1367,7 @@ cdef class BestSparseSplitter(BaseSparseSplitter): self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) with gil: - if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): self.criterion.children_impurity2(&best.impurity_left, &best.impurity_right, split.pred_weights) else: @@ -1576,7 +1578,7 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): (self.criterion.weighted_n_right < min_weight_leaf)): continue with gil: - if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): current_proxy_improvement = self.criterion.proxy_impurity_improvement2(split.pred_weights) else: current_proxy_improvement = self.criterion.proxy_impurity_improvement() @@ -1585,7 +1587,7 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): best_proxy_improvement = current_proxy_improvement current.improvement = self.criterion.impurity_improvement(impurity) with gil: - if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): self.criterion.children_impurity2(¤t.impurity_left, ¤t.impurity_right, split.pred_weights) else: @@ -1606,7 +1608,7 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) with gil: - if isinstance(self.criterion, ObliqueProjection): #of AxisProjection + if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): self.criterion.children_impurity2(&best.impurity_left, &best.impurity_right, split.pred_weights) else: diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 1c236cc05b069..0dad01461cfa2 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1779,7 +1779,7 @@ def test_mae(): dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3]) assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0]) assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0]) -''' + def test_axis_proj(): """Check axis projection criterion produces correct results on small toy dataset: @@ -1890,7 +1890,7 @@ def test_axis_proj(): #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) #assert_array_equal(dt_axis.tree_.impurity, [14.0 / 3.0, 4.0, 0.0]) #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 0.0]) -''' + def test_oblique_proj(): """Check oblique projection criterion produces correct results on small toy dataset: From c7b5870afefafb823bba715b17b596cc413bde9a Mon Sep 17 00:00:00 2001 From: admin Date: Thu, 12 Dec 2019 02:10:53 -0500 Subject: [PATCH 12/20] removed unnecessary print statements and comments --- sklearn/tree/_criterion.pyx | 189 ++++---------------------------- sklearn/tree/_splitter.pyx | 25 +---- sklearn/tree/_tree.pyx | 1 - sklearn/tree/tests/test_tree.py | 22 +--- 4 files changed, 28 insertions(+), 209 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index a26329e149ff2..e612d77498b94 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -24,11 +24,8 @@ from libc.math cimport fabs import numpy as np cimport numpy as np -import random #added by morgan np.import_array() -from ._utils cimport rand_int #added by Morgan -from ._utils cimport RAND_R_MAX # added by morgan from ._utils cimport log from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray @@ -121,7 +118,7 @@ cdef class Criterion: pass - cdef double node_impurity2(self, double* pred_weights) nogil: #TODO + cdef double node_impurity2(self, double* pred_weights) nogil: """Placeholder for calculating the impurity of the node. Placeholder for a method which will evaluate the impurity of @@ -152,7 +149,7 @@ cdef class Criterion: pass cdef void children_impurity2(self, double* impurity_left, - double* impurity_right, double* pred_weights) nogil: #TODO + double* impurity_right, double* pred_weights) nogil: """Placeholder for calculating the impurity of children. Placeholder for a method which evaluates the impurity in @@ -1031,7 +1028,7 @@ cdef class MAE(RegressionCriterion): cdef np.ndarray right_child cdef DOUBLE_t* node_medians - def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): #TODO do I need to modify this? + def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. Parameters @@ -1379,7 +1376,7 @@ cdef class FriedmanMSE(MSE): cdef class AxisProjection(RegressionCriterion): - r"""Mean absolute error impurity criterion + r"""Mean squared error impurity criterion of axis-aligned projections of high dimensional y Algorithm: @@ -1392,26 +1389,20 @@ cdef class AxisProjection(RegressionCriterion): """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end].""" - ''' - cdef SIZE_t i - with gil: - for i in range(self.n_outputs): - print("node weights: ", pred_weights[i]) - ''' cdef double impurity cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t end = self.end cdef SIZE_t start = self.start - cdef double* sum_total = self.sum_total #modified + cdef double* sum_total = self.sum_total cdef DOUBLE_t y_ik cdef double sq_sum_total = 0.0 cdef SIZE_t i cdef SIZE_t p - cdef SIZE_t k # modified + cdef SIZE_t k cdef DOUBLE_t w = 1.0 @@ -1421,13 +1412,10 @@ cdef class AxisProjection(RegressionCriterion): w = sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] - sq_sum_total += w * y_ik * y_ik * pred_weights[k] #+ 27 + sq_sum_total += w * y_ik * y_ik * pred_weights[k] with gil: impurity = abs(sq_sum_total / self.weighted_n_node_samples) - #impurity = sq_sum_total / self.weighted_n_node_samples for k in range(self.n_outputs): - #with gil: print(impurity) - #with gil: print(sum_total[k] * pred_weights[k]/ self.weighted_n_node_samples) impurity -= (sum_total[k] * pred_weights[k]/ self.weighted_n_node_samples)**2.0 @@ -1445,12 +1433,7 @@ cdef class AxisProjection(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ - ''' - cdef SIZE_t i - with gil: - for i in range(self.n_outputs): - print("proxy weights: ", pred_weights[i]) - ''' + cdef double* sum_left = self.sum_left cdef double* sum_right = self.sum_right @@ -1459,27 +1442,18 @@ cdef class AxisProjection(RegressionCriterion): cdef double proxy_impurity_right = 0.0 for k in range(self.n_outputs): - #with gil: print(pred_weights[k]) proxy_impurity_left += sum_left[k] * sum_left[k] * pred_weights[k] proxy_impurity_right += sum_right[k] * sum_right[k] * pred_weights[k] with gil: return (abs(proxy_impurity_left / self.weighted_n_left) + abs(proxy_impurity_right / self.weighted_n_right)) - #return (proxy_impurity_left / self.weighted_n_left + - # proxy_impurity_right / self.weighted_n_right) - cdef void children_impurity2(self, double* impurity_left, double* impurity_right, double* pred_weights) nogil: """Evaluate the impurity in children nodes, i.e. the impurity of the left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" - ''' - cdef SIZE_t i - with gil: - for i in range(self.n_outputs): - print("children weights: ", pred_weights[i]) - ''' + cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t pos = self.pos @@ -1495,7 +1469,7 @@ cdef class AxisProjection(RegressionCriterion): cdef SIZE_t i cdef SIZE_t p - cdef SIZE_t k # modified + cdef SIZE_t k cdef DOUBLE_t w = 1.0 for p in range(start, pos): @@ -1516,13 +1490,9 @@ cdef class AxisProjection(RegressionCriterion): y_ik = self.y[i, k] sq_sum_right += w * y_ik * y_ik * pred_weights[k] - - #sq_sum_right = self.sq_sum_total - sq_sum_left with gil: impurity_left[0] = abs(sq_sum_left / self.weighted_n_left) impurity_right[0] = abs(sq_sum_right / self.weighted_n_right) - #impurity_left[0] = sq_sum_left / self.weighted_n_left - #impurity_right[0] = sq_sum_right / self.weighted_n_right impurity_left[0] -= (sum_left[k] * pred_weights[k]/ self.weighted_n_left) ** 2.0 impurity_right[0] -= (sum_right[k] * pred_weights[k]/ self.weighted_n_right) ** 2.0 @@ -1531,136 +1501,36 @@ cdef class AxisProjection(RegressionCriterion): impurity_right[0] - cdef double proxy_impurity_improvement(self) nogil: - """Compute a proxy of the impurity reduction - - This method is used to speed up the search for the best split. - It is a proxy quantity such that the split that maximizes this value - also maximizes the impurity improvement. It neglects all constant terms - of the impurity decrease for a given split. - - The absolute impurity improvement is only computed by the - impurity_improvement method once the best split has been found. - """ - - cdef double* sum_left = self.sum_left - cdef double* sum_right = self.sum_right - - cdef SIZE_t k - cdef double proxy_impurity_left = 0.0 - cdef double proxy_impurity_right = 0.0 - - cdef UINT32_t rand_r_state - - with gil: # is this okay? - rand_r_state = self.random_state.randint(0, RAND_R_MAX) - cdef UINT32_t* random_state = &rand_r_state - - k = rand_int(0, self.n_outputs, random_state) #TODO is this random state okay? - - proxy_impurity_left += sum_left[k] * sum_left[k] - proxy_impurity_right += sum_right[k] * sum_right[k] - - - return (proxy_impurity_left / self.weighted_n_left + - proxy_impurity_right / self.weighted_n_right) - - cdef void children_impurity(self, double* impurity_left, - double* impurity_right) nogil: - """Evaluate the impurity in children nodes, i.e. the impurity of the - left child (samples[start:pos]) and the impurity the right child - (samples[pos:end]).""" - """ - """ - - cdef DOUBLE_t* sample_weight = self.sample_weight - cdef SIZE_t* samples = self.samples - cdef SIZE_t pos = self.pos - cdef SIZE_t start = self.start - cdef SIZE_t end = self.end - - cdef double* sum_left = self.sum_left - cdef double* sum_right = self.sum_right - cdef DOUBLE_t y_ik - - cdef double sq_sum_left = 0.0 - cdef double sq_sum_right = 0.0 - - cdef SIZE_t i - cdef SIZE_t p - cdef SIZE_t k - cdef DOUBLE_t w = 1.0 - cdef UINT32_t rand_r_state - - with gil: # is this okay? - rand_r_state = self.random_state.randint(0, RAND_R_MAX) - cdef UINT32_t* random_state = &rand_r_state - - k = rand_int(0, self.n_outputs, random_state) #TODO is this random state okay? - - for p in range(start, pos): - i = samples[p] - - if sample_weight != NULL: - w = sample_weight[i] - y_ik = self.y[i, k] - sq_sum_left += w * y_ik * y_ik - - for p in range(pos, end): - i = samples[p] - - if sample_weight != NULL: - w = sample_weight[i] - y_ik = self.y[i, k] - sq_sum_right += w * y_ik * y_ik - - #sq_sum_right = self.sq_sum_total - sq_sum_left - - impurity_left[0] = sq_sum_left / self.weighted_n_left - impurity_right[0] = sq_sum_right / self.weighted_n_right - - impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 - - impurity_left[0] - impurity_right[0] - - #with gil: - # print(impurity_left[0], impurity_right[0]) - - cdef class ObliqueProjection(RegressionCriterion): - r"""Mean absolute error impurity criterion - + r"""Mean squared error impurity criterion + of oblique projections of high dimensional y + Algorithm: + 1. select a random predictors from [0,n_outputs] + 2. Set weights of chosen predictors to -1 or 1 + 3. compute mse on the values of those predictors for all samples - MAE = (1 / n)*(\sum_i |y_i - f_i|), where y_i is the true - value and f_i is the predicted value.""" + MSE = var_left + var_right + """ cdef double node_impurity2(self, double* pred_weights) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end].""" - ''' - cdef SIZE_t i - with gil: - for i in range(self.n_outputs): - print("node weights: ", pred_weights[i]) - ''' cdef double impurity cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t end = self.end cdef SIZE_t start = self.start - cdef double* sum_total = self.sum_total #modified + cdef double* sum_total = self.sum_total cdef DOUBLE_t y_ik cdef double sq_sum_total = 0.0 cdef SIZE_t i cdef SIZE_t p - cdef SIZE_t k # modified + cdef SIZE_t k cdef DOUBLE_t w = 1.0 @@ -1670,13 +1540,10 @@ cdef class ObliqueProjection(RegressionCriterion): w = sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] - sq_sum_total += w * y_ik * y_ik * pred_weights[k] #+ 27 + sq_sum_total += w * y_ik * y_ik * pred_weights[k] with gil: impurity = abs(sq_sum_total / self.weighted_n_node_samples) - #impurity = sq_sum_total / self.weighted_n_node_samples for k in range(self.n_outputs): - #with gil: print(impurity) - #with gil: print(sum_total[k] * pred_weights[k]/ self.weighted_n_node_samples) impurity -= (sum_total[k] * pred_weights[k]/ self.weighted_n_node_samples)**2.0 @@ -1714,14 +1581,11 @@ cdef class ObliqueProjection(RegressionCriterion): cdef double proxy_impurity_right = 0.0 for k in range(self.n_outputs): - #with gil: print(pred_weights[k]) proxy_impurity_left += sum_left[k] * sum_left[k] * pred_weights[k] proxy_impurity_right += sum_right[k] * sum_right[k] * pred_weights[k] with gil: return (abs(proxy_impurity_left / self.weighted_n_left) + abs(proxy_impurity_right / self.weighted_n_right)) - #return (proxy_impurity_left / self.weighted_n_left + - # proxy_impurity_right / self.weighted_n_right) cdef void children_impurity2(self, double* impurity_left, @@ -1729,12 +1593,7 @@ cdef class ObliqueProjection(RegressionCriterion): """Evaluate the impurity in children nodes, i.e. the impurity of the left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" - ''' - cdef SIZE_t i - with gil: - for i in range(self.n_outputs): - print("children weights: ", pred_weights[i]) - ''' + cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t pos = self.pos @@ -1771,13 +1630,9 @@ cdef class ObliqueProjection(RegressionCriterion): y_ik = self.y[i, k] sq_sum_right += w * y_ik * y_ik * pred_weights[k] - - #sq_sum_right = self.sq_sum_total - sq_sum_left with gil: impurity_left[0] = abs(sq_sum_left / self.weighted_n_left) impurity_right[0] = abs(sq_sum_right / self.weighted_n_right) - #impurity_left[0] = sq_sum_left / self.weighted_n_left - #impurity_right[0] = sq_sum_right / self.weighted_n_right impurity_left[0] -= (sum_left[k] * pred_weights[k]/ self.weighted_n_left) ** 2.0 impurity_right[0] -= (sum_right[k] * pred_weights[k]/ self.weighted_n_right) ** 2.0 diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 249246d48aa6a..f6c38a628f953 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -54,17 +54,13 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) nogil: self.improvement = -INFINITY cdef inline void _init_pred_weights(SplitRecord* self, SIZE_t n_outputs, UINT32_t* random_state, Criterion criterion) nogil: - #cdef UINT32_t rand_r_state cdef SIZE_t num_pred cdef SIZE_t a cdef SIZE_t k self.pred_weights = calloc(n_outputs, sizeof(double)) - #with gil: # is this okay? - # rand_r_state = random_state.randint(0, RAND_R_MAX) - #cdef UINT32_t* random_state = &rand_r_state with gil: if isinstance(criterion, ObliqueProjection): - num_pred = rand_int(1, n_outputs+1, random_state) #TODO is this random state okay? + num_pred = rand_int(1, n_outputs+1, random_state) for i in range(num_pred): k = rand_int(0, n_outputs, random_state) @@ -261,9 +257,6 @@ cdef class Splitter: with gil: if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): _init_pred_weights(split, self.y.shape[1], &self.rand_r_state, self.criterion) - with gil: - for i in range(self.y.shape[1]): - pass#print("weight: ", i, split.pred_weights[i]) return self.criterion.node_impurity2(split.pred_weights) else: return self.criterion.node_impurity() @@ -359,9 +352,7 @@ cdef class BestSplitter(BaseDenseSplitter): cdef SIZE_t n_total_constants = n_known_constants cdef DTYPE_t current_feature_value cdef SIZE_t partition_end - #with gil: print(split.pred_weights[0]) - _init_split(&best, end)#, self.y.shape[1], random_state, self.criterion) - #with gil: print(split.pred_weights[0]) + _init_split(&best, end) # Sample up to max_features without replacement using a # Fisher-Yates-based algorithm (using the local variables `f_i` and # `f_j` to compute a permutation of the `features` array). @@ -677,9 +668,7 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef DTYPE_t min_feature_value cdef DTYPE_t max_feature_value cdef DTYPE_t current_feature_value - #with gil: print(split.pred_weights[0]) - _init_split(&best, end)#, self.y.shape[1], random_state, self.criterion) - #with gil: print(split.pred_weights[0]) + _init_split(&best, end) # Sample up to max_features without replacement using a # Fisher-Yates-based algorithm (using the local variables `f_i` and # `f_j` to compute a permutation of the `features` array). @@ -1190,9 +1179,7 @@ cdef class BestSparseSplitter(BaseSparseSplitter): cdef UINT32_t* random_state = &self.rand_r_state cdef SplitRecord best, current - #with gil: print(split.pred_weights[0]) - _init_split(&best, end)#, self.y.shape[1], random_state, self.criterion) - #with gil: print(split.pred_weights[0]) + _init_split(&best, end) cdef double current_proxy_improvement = - INFINITY cdef double best_proxy_improvement = - INFINITY @@ -1428,9 +1415,7 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): cdef UINT32_t* random_state = &self.rand_r_state cdef SplitRecord best, current - #with gil: print(split.pred_weights[0]) - _init_split(&best, end)#, self.y.shape[1], random_state, self.criterion) - #with gil: print(split.pred_weights[0]) + _init_split(&best, end) cdef double current_proxy_improvement = - INFINITY cdef double best_proxy_improvement = - INFINITY diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 02eeed5668092..b4023be704307 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -448,7 +448,6 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if is_first: impurity = splitter.node_impurity(&split) - #with gil: print('hi') #not sure why this is necessary but I get a seg fault if not else: splitter.node_impurity(&split) n_node_samples = end - start diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 0dad01461cfa2..abc7d332fa4ee 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -45,8 +45,7 @@ from sklearn.utils import compute_sample_weight CLF_CRITERIONS = ("gini", "entropy") -REG_CRITERIONS = ("mse", "mae", "friedman_mse")#, "axis", "oblique") -#REG_CRITERIONS = ("mse", "oblique") +REG_CRITERIONS = ("mse", "mae", "friedman_mse") CLF_TREES = { "DecisionTreeClassifier": DecisionTreeClassifier, @@ -1867,9 +1866,6 @@ def test_axis_proj(): dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity) - #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) - #assert_allclose(dt_axis.tree_.impurity, [7.7 / 2.3, 6.13125 / 1.3, 0.0 / 1.0], rtol=0.6) - #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 4.0]) # Test axis projection where all sample weights are uniform: dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], @@ -1877,9 +1873,6 @@ def test_axis_proj(): dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], sample_weight=np.ones(5)) assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity) - #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) - #assert_array_equal(dt_axis.tree_.impurity, [14.0 / 3.0, 4.0, 0.0]) - #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 0.0]) # Test axis projection where a `sample_weight` is not explicitly provided. # This is equivalent to providing uniform sample weights, though @@ -1887,9 +1880,6 @@ def test_axis_proj(): dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8]) assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity) - #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) - #assert_array_equal(dt_axis.tree_.impurity, [14.0 / 3.0, 4.0, 0.0]) - #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 0.0]) def test_oblique_proj(): """Check oblique projection criterion produces correct results on small toy dataset: @@ -1991,10 +1981,6 @@ def test_oblique_proj(): except: assert_allclose(dt_obliq.tree_.impurity, dt_mse.tree_.impurity) - #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) - #assert_allclose(dt_axis.tree_.impurity, [7.7 / 2.3, 6.13125 / 1.3, 0.0 / 1.0], rtol=0.6) - #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 4.0]) - # Test axis projection where all sample weights are uniform: dt_obliq.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], sample_weight=np.ones(5)) @@ -2005,9 +1991,6 @@ def test_oblique_proj(): except: assert_allclose(dt_obliq.tree_.impurity, dt_mse.tree_.impurity) - #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) - #assert_array_equal(dt_axis.tree_.impurity, [14.0 / 3.0, 4.0, 0.0]) - #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 0.0]) # Test MAE where a `sample_weight` is not explicitly provided. # This is equivalent to providing uniform sample weights, though @@ -2018,9 +2001,6 @@ def test_oblique_proj(): assert_allclose(dt_obliq.tree_.impurity, dt_mse.tree_.impurity*2) except: assert_allclose(dt_obliq.tree_.impurity, dt_mse.tree_.impurity) - #assert_array_equal(dt_axis.tree_.value.flat, dt_mse.tree_.value.flat) - #assert_array_equal(dt_axis.tree_.impurity, [14.0 / 3.0, 4.0, 0.0]) - #assert_array_equal(dt_axis.tree_.value.flat, [5.0, 5.25, 0.0]) def test_criterion_copy(): From e960ddf03b54a53a2a5490ee1772e0ac04c7da5f Mon Sep 17 00:00:00 2001 From: admin Date: Thu, 12 Dec 2019 02:22:28 -0500 Subject: [PATCH 13/20] removed excess changes --- sklearn/tree/_criterion.pyx | 4 ++-- sklearn/tree/_splitter.pyx | 9 +++++++++ sklearn/tree/_tree.pyx | 7 +------ sklearn/tree/tests/test_tree.py | 1 - sklearn/tree/tree.py | 3 ++- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index e612d77498b94..a3ee5e7cd80a4 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -74,6 +74,7 @@ cdef class Criterion: The first sample to be used on this node end : SIZE_t The last sample used on this node + """ pass @@ -746,7 +747,6 @@ cdef class RegressionCriterion(Criterion): n_samples : SIZE_t The total number of samples to fit on - """ # Default values @@ -782,7 +782,7 @@ cdef class RegressionCriterion(Criterion): self.sum_right == NULL): raise MemoryError() - def __reduce__(self): + def __reduce__(self): return (type(self), (self.n_outputs, self.n_samples), self.__getstate__()) cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index f6c38a628f953..8167da4ec3001 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -334,6 +334,7 @@ cdef class BestSplitter(BaseDenseSplitter): cdef SplitRecord best, current cdef double current_proxy_improvement = -INFINITY cdef double best_proxy_improvement = -INFINITY + cdef SIZE_t f_i = n_features cdef SIZE_t f_j cdef SIZE_t p @@ -352,7 +353,9 @@ cdef class BestSplitter(BaseDenseSplitter): cdef SIZE_t n_total_constants = n_known_constants cdef DTYPE_t current_feature_value cdef SIZE_t partition_end + _init_split(&best, end) + # Sample up to max_features without replacement using a # Fisher-Yates-based algorithm (using the local variables `f_i` and # `f_j` to compute a permutation of the `features` array). @@ -668,7 +671,9 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef DTYPE_t min_feature_value cdef DTYPE_t max_feature_value cdef DTYPE_t current_feature_value + _init_split(&best, end) + # Sample up to max_features without replacement using a # Fisher-Yates-based algorithm (using the local variables `f_i` and # `f_j` to compute a permutation of the `features` array). @@ -1179,7 +1184,9 @@ cdef class BestSparseSplitter(BaseSparseSplitter): cdef UINT32_t* random_state = &self.rand_r_state cdef SplitRecord best, current + _init_split(&best, end) + cdef double current_proxy_improvement = - INFINITY cdef double best_proxy_improvement = - INFINITY @@ -1415,7 +1422,9 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): cdef UINT32_t* random_state = &self.rand_r_state cdef SplitRecord best, current + _init_split(&best, end) + cdef double current_proxy_improvement = - INFINITY cdef double best_proxy_improvement = - INFINITY diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index b4023be704307..e85af8f9588d6 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -176,6 +176,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # Recursive partition (without actual recursion) splitter.init(X, y, sample_weight_ptr, X_idx_sorted) + cdef SIZE_t start cdef SIZE_t end cdef SIZE_t depth @@ -227,12 +228,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if first: impurity = splitter.node_impurity(&split) first = 0 - #with gil: print("isleaf, impurity: ", is_leaf, impurity, min_impurity_split) is_leaf = (is_leaf or (impurity <= min_impurity_split)) if not is_leaf: - #with gil: print(splitter) splitter.node_split(impurity, &split, &n_constant_features) # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are @@ -254,7 +253,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): splitter.node_value(tree.value + node_id * tree.value_stride) if not is_leaf: - # Push right child on stack rc = stack.push(split.pos, end, depth + 1, node_id, 0, split.impurity_right, n_constant_features) @@ -451,7 +449,6 @@ cdef class BestFirstTreeBuilder(TreeBuilder): else: splitter.node_impurity(&split) n_node_samples = end - start - with gil: print("isleaf...", is_leaf, depth, n_node_samples, weighted_n_node_samples, impurity, min_impurity_split) is_leaf = (depth >= self.max_depth or n_node_samples < self.min_samples_split or n_node_samples < 2 * self.min_samples_leaf or @@ -459,9 +456,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): impurity <= min_impurity_split) if not is_leaf: - with gil: print('hi 1') splitter.node_split(impurity, &split, &n_constant_features) - with gil: print('hi 2') # If EPSILON=0 in the below comparison, float precision issues stop # splitting early, producing trees that are dissimilar to v0.18 is_leaf = (is_leaf or split.pos >= end or diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index abc7d332fa4ee..f662e92752a3a 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -151,7 +151,6 @@ DATASETS[name]["X_sparse"] = csc_matrix(DATASETS[name]["X"]) - def assert_tree_equal(d, s, message): assert s.node_count == d.node_count, ( "{0}: inequal number of node ({1} != {2})" diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 2b9b62439da55..40eb6ec07bad8 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -346,6 +346,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, # TODO: tree should't need this in this case np.array([1] * self.n_outputs_, dtype=np.intp), self.n_outputs_) + # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: builder = DepthFirstTreeBuilder(splitter, min_samples_split, @@ -362,7 +363,6 @@ def fit(self, X, y, sample_weight=None, check_input=True, max_leaf_nodes, self.min_impurity_decrease, min_impurity_split) - print(criterion) builder.build(self.tree_, X, y, sample_weight, X_idx_sorted) if self.n_outputs_ == 1 and is_classifier(self): @@ -370,6 +370,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, self.classes_ = self.classes_[0] self._prune_tree() + return self def _validate_X_predict(self, X, check_input): From 1799ef069d3cff8f40414c743145f79b0e858196 Mon Sep 17 00:00:00 2001 From: admin Date: Thu, 12 Dec 2019 20:07:46 -0500 Subject: [PATCH 14/20] adjusted mse calculation in projection criteria to match hand calculations --- sklearn/tree/_criterion.pyx | 194 +++++++++++-------- sklearn/tree/_splitter.pyx | 20 +- sklearn/tree/tests/test_proj_criteria.py | 232 +++++++++++++++++++++++ sklearn/tree/tests/test_tree.py | 47 +++-- 4 files changed, 388 insertions(+), 105 deletions(-) create mode 100644 sklearn/tree/tests/test_proj_criteria.py diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index a3ee5e7cd80a4..9760e5e25fe82 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1388,37 +1388,40 @@ cdef class AxisProjection(RegressionCriterion): cdef double node_impurity2(self, double* pred_weights) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end].""" - - cdef double impurity + cdef double impurity = 0.0 #TODO cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t end = self.end cdef SIZE_t start = self.start + cdef double* pred = calloc(end-start, sizeof(double)) - cdef double* sum_total = self.sum_total + cdef double mean_pred = 0.0 #TODO cdef DOUBLE_t y_ik - cdef double sq_sum_total = 0.0 - cdef SIZE_t i cdef SIZE_t p cdef SIZE_t k cdef DOUBLE_t w = 1.0 + for p in range(start, end): i = samples[p] if sample_weight != NULL: w = sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] - sq_sum_total += w * y_ik * y_ik * pred_weights[k] - with gil: - impurity = abs(sq_sum_total / self.weighted_n_node_samples) - for k in range(self.n_outputs): - impurity -= (sum_total[k] * pred_weights[k]/ self.weighted_n_node_samples)**2.0 - - + # sum over all predictors with pred weights + pred[p] += y_ik * pred_weights[k] + # sum over all samples to get mean of new predictor + mean_pred += pred[p] / (end - start) + + for p in range(start, end): + i = samples[p] + if sample_weight != NULL: + w = sample_weight[i] + impurity += (mean_pred - pred[p]) * (mean_pred - pred[p]) * w + impurity /= self.weighted_n_node_samples return impurity @@ -1433,7 +1436,12 @@ cdef class AxisProjection(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ - + ''' + cdef SIZE_t i + with gil: + for i in range(self.n_outputs): + print("proxy weights: ", pred_weights[i]) + ''' cdef double* sum_left = self.sum_left cdef double* sum_right = self.sum_right @@ -1441,62 +1449,79 @@ cdef class AxisProjection(RegressionCriterion): cdef double proxy_impurity_left = 0.0 cdef double proxy_impurity_right = 0.0 - for k in range(self.n_outputs): - proxy_impurity_left += sum_left[k] * sum_left[k] * pred_weights[k] - proxy_impurity_right += sum_right[k] * sum_right[k] * pred_weights[k] with gil: - return (abs(proxy_impurity_left / self.weighted_n_left) + - abs(proxy_impurity_right / self.weighted_n_right)) + for k in range(self.n_outputs): + proxy_impurity_left += sum_left[k] * sum_left[k] * abs(pred_weights[k]) + proxy_impurity_right += sum_right[k] * sum_right[k] * abs(pred_weights[k]) + #with gil: + # return (abs(proxy_impurity_left / self.weighted_n_left) + + # abs(proxy_impurity_right / self.weighted_n_right)) + return (proxy_impurity_left / self.weighted_n_left + + proxy_impurity_right / self.weighted_n_right) + cdef void children_impurity2(self, double* impurity_left, double* impurity_right, double* pred_weights) nogil: """Evaluate the impurity in children nodes, i.e. the impurity of the left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" - cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t pos = self.pos cdef SIZE_t start = self.start cdef SIZE_t end = self.end - cdef double* sum_left = self.sum_left - cdef double* sum_right = self.sum_right + impurity_left[0] = 0.0 + impurity_right[0] = 0.0 + cdef double* pred_left = calloc(pos-start, sizeof(double)) + cdef double* pred_right = calloc(end-pos, sizeof(double)) + cdef double mean_pred_left = 0.0 #TODO + cdef double mean_pred_right = 0.0 #TODO cdef DOUBLE_t y_ik - cdef double sq_sum_left = 0.0 - cdef double sq_sum_right = 0.0 - cdef SIZE_t i cdef SIZE_t p - cdef SIZE_t k + cdef SIZE_t k # modified cdef DOUBLE_t w = 1.0 + for p in range(start, pos): i = samples[p] - if sample_weight != NULL: w = sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] - sq_sum_left += w * y_ik * y_ik * pred_weights[k] - + # sum over all predictors with pred weights + pred_left[p] += y_ik * pred_weights[k] + # sum over all samples to get mean of new predictor + mean_pred_left += pred_left[p] / (pos - start) + w = 1.0 + for p in range(start, pos): + i = samples[p] + if sample_weight != NULL: + w = sample_weight[i] + impurity_left[0] += ((mean_pred_left - pred_left[p]) + * (mean_pred_left - pred_left[p]) * w)/self.weighted_n_left + w = 1.0 for p in range(pos, end): i = samples[p] - if sample_weight != NULL: w = sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] - sq_sum_right += w * y_ik * y_ik * pred_weights[k] - - with gil: - impurity_left[0] = abs(sq_sum_left / self.weighted_n_left) - impurity_right[0] = abs(sq_sum_right / self.weighted_n_right) - - impurity_left[0] -= (sum_left[k] * pred_weights[k]/ self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] * pred_weights[k]/ self.weighted_n_right) ** 2.0 + # sum over all predictors with pred weights + pred_right[p - pos] += y_ik * pred_weights[k] + # sum over all samples to get mean of new predictor + for p in range(pos, end): + mean_pred_right += pred_right[p-pos] / (end - pos) + w = 1.0 + for p in range(pos, end): + i = samples[p] + if sample_weight != NULL: + w = sample_weight[i] + impurity_right[0] += ((mean_pred_right - pred_right[p - pos]) * (mean_pred_right - pred_right[p-pos]) * w) / self.weighted_n_right + impurity_left[0] impurity_right[0] @@ -1516,44 +1541,41 @@ cdef class ObliqueProjection(RegressionCriterion): cdef double node_impurity2(self, double* pred_weights) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end].""" - - cdef double impurity + cdef double impurity = 0.0 #TODO cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t end = self.end cdef SIZE_t start = self.start + cdef double* pred = calloc(end-start, sizeof(double)) - cdef double* sum_total = self.sum_total + cdef double mean_pred = 0.0 #TODO cdef DOUBLE_t y_ik - cdef double sq_sum_total = 0.0 - cdef SIZE_t i cdef SIZE_t p cdef SIZE_t k cdef DOUBLE_t w = 1.0 + for p in range(start, end): i = samples[p] if sample_weight != NULL: w = sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] - sq_sum_total += w * y_ik * y_ik * pred_weights[k] - with gil: - impurity = abs(sq_sum_total / self.weighted_n_node_samples) - for k in range(self.n_outputs): - impurity -= (sum_total[k] * pred_weights[k]/ self.weighted_n_node_samples)**2.0 - + # sum over all predictors with pred weights + pred[p] += y_ik * pred_weights[k] + # sum over all samples to get mean of new predictor + mean_pred += pred[p] / (end - start) - cdef SIZE_t num_pred = 0 - for k in range(self.n_outputs): - with gil: - if abs(pred_weights[k]) > 0.5: - num_pred += 1 - - return impurity #/ num_pred + for p in range(start, end): + i = samples[p] + if sample_weight != NULL: + w = sample_weight[i] + impurity += (mean_pred - pred[p]) * (mean_pred - pred[p]) * w + impurity /= self.weighted_n_node_samples + return impurity cdef double proxy_impurity_improvement2(self, double* pred_weights) nogil: @@ -1580,12 +1602,15 @@ cdef class ObliqueProjection(RegressionCriterion): cdef double proxy_impurity_left = 0.0 cdef double proxy_impurity_right = 0.0 - for k in range(self.n_outputs): - proxy_impurity_left += sum_left[k] * sum_left[k] * pred_weights[k] - proxy_impurity_right += sum_right[k] * sum_right[k] * pred_weights[k] with gil: - return (abs(proxy_impurity_left / self.weighted_n_left) + - abs(proxy_impurity_right / self.weighted_n_right)) + for k in range(self.n_outputs): + proxy_impurity_left += sum_left[k] * sum_left[k] * abs(pred_weights[k]) + proxy_impurity_right += sum_right[k] * sum_right[k] * abs(pred_weights[k]) + #with gil: + # return (abs(proxy_impurity_left / self.weighted_n_left) + + # abs(proxy_impurity_right / self.weighted_n_right)) + return (proxy_impurity_left / self.weighted_n_left + + proxy_impurity_right / self.weighted_n_right) cdef void children_impurity2(self, double* impurity_left, @@ -1593,49 +1618,62 @@ cdef class ObliqueProjection(RegressionCriterion): """Evaluate the impurity in children nodes, i.e. the impurity of the left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" - cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t pos = self.pos cdef SIZE_t start = self.start cdef SIZE_t end = self.end - cdef double* sum_left = self.sum_left - cdef double* sum_right = self.sum_right + impurity_left[0] = 0.0 + impurity_right[0] = 0.0 + cdef double* pred_left = calloc(pos-start, sizeof(double)) + cdef double* pred_right = calloc(end-pos, sizeof(double)) + cdef double mean_pred_left = 0.0 #TODO + cdef double mean_pred_right = 0.0 #TODO cdef DOUBLE_t y_ik - cdef double sq_sum_left = 0.0 - cdef double sq_sum_right = 0.0 - cdef SIZE_t i cdef SIZE_t p cdef SIZE_t k # modified cdef DOUBLE_t w = 1.0 + for p in range(start, pos): i = samples[p] - if sample_weight != NULL: w = sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] - sq_sum_left += w * y_ik * y_ik * pred_weights[k] - + # sum over all predictors with pred weights + pred_left[p] += y_ik * pred_weights[k] + # sum over all samples to get mean of new predictor + mean_pred_left += pred_left[p] / (pos - start) + w = 1.0 + for p in range(start, pos): + i = samples[p] + if sample_weight != NULL: + w = sample_weight[i] + impurity_left[0] += ((mean_pred_left - pred_left[p]) + * (mean_pred_left - pred_left[p]) * w)/self.weighted_n_left + w = 1.0 for p in range(pos, end): i = samples[p] - if sample_weight != NULL: w = sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] - sq_sum_right += w * y_ik * y_ik * pred_weights[k] - - with gil: - impurity_left[0] = abs(sq_sum_left / self.weighted_n_left) - impurity_right[0] = abs(sq_sum_right / self.weighted_n_right) - - impurity_left[0] -= (sum_left[k] * pred_weights[k]/ self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] * pred_weights[k]/ self.weighted_n_right) ** 2.0 + # sum over all predictors with pred weights + pred_right[p - pos] += y_ik * pred_weights[k] + # sum over all samples to get mean of new predictor + for p in range(pos, end): + mean_pred_right += pred_right[p-pos] / (end - pos) + w = 1.0 + for p in range(pos, end): + i = samples[p] + if sample_weight != NULL: + w = sample_weight[i] + impurity_right[0] += ((mean_pred_right - pred_right[p - pos]) * (mean_pred_right - pred_right[p-pos]) * w) / self.weighted_n_right + impurity_left[0] impurity_right[0] \ No newline at end of file diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 8167da4ec3001..56c5b0e54e631 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -45,18 +45,21 @@ cdef DTYPE_t FEATURE_THRESHOLD = 1e-7 # in SparseSplitter cdef DTYPE_t EXTRACT_NNZ_SWITCH = 0.1 -cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) nogil: +cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos, SIZE_t n_outputs, UINT32_t* random_state, Criterion criterion) nogil: self.impurity_left = INFINITY self.impurity_right = INFINITY self.pos = start_pos self.feature = 0 self.threshold = 0. self.improvement = -INFINITY + if (self.pred_weights): + _init_pred_weights(self, n_outputs, random_state, criterion) cdef inline void _init_pred_weights(SplitRecord* self, SIZE_t n_outputs, UINT32_t* random_state, Criterion criterion) nogil: cdef SIZE_t num_pred cdef SIZE_t a cdef SIZE_t k + #with gil: __dealloc__(self) self.pred_weights = calloc(n_outputs, sizeof(double)) with gil: if isinstance(criterion, ObliqueProjection): @@ -71,7 +74,12 @@ cdef inline void _init_pred_weights(SplitRecord* self, SIZE_t n_outputs, UINT32_ elif isinstance(criterion, AxisProjection): k = rand_int(0, n_outputs, random_state) self.pred_weights[k] = 1.0 - +''' +cdef __dealloc__(SplitRecord* self): + if not (self.pred_weights): + print("freeeee") + free(self.pred_weights) +''' cdef class Splitter: """Abstract splitter class. @@ -354,7 +362,7 @@ cdef class BestSplitter(BaseDenseSplitter): cdef DTYPE_t current_feature_value cdef SIZE_t partition_end - _init_split(&best, end) + _init_split(&best, end, self.y.shape[1], random_state, self.criterion) # Sample up to max_features without replacement using a # Fisher-Yates-based algorithm (using the local variables `f_i` and @@ -672,7 +680,7 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef DTYPE_t max_feature_value cdef DTYPE_t current_feature_value - _init_split(&best, end) + _init_split(&best, end, self.y.shape[1], random_state, self.criterion) # Sample up to max_features without replacement using a # Fisher-Yates-based algorithm (using the local variables `f_i` and @@ -1185,7 +1193,7 @@ cdef class BestSparseSplitter(BaseSparseSplitter): cdef SplitRecord best, current - _init_split(&best, end) + _init_split(&best, end, self.y.shape[1], random_state, self.criterion) cdef double current_proxy_improvement = - INFINITY cdef double best_proxy_improvement = - INFINITY @@ -1423,7 +1431,7 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): cdef SplitRecord best, current - _init_split(&best, end) + _init_split(&best, end, self.y.shape[1], random_state, self.criterion) cdef double current_proxy_improvement = - INFINITY cdef double best_proxy_improvement = - INFINITY diff --git a/sklearn/tree/tests/test_proj_criteria.py b/sklearn/tree/tests/test_proj_criteria.py new file mode 100644 index 0000000000000..74f3f11c8a41a --- /dev/null +++ b/sklearn/tree/tests/test_proj_criteria.py @@ -0,0 +1,232 @@ +from sklearn.utils.testing import assert_allclose +from sklearn.tree import DecisionTreeRegressor +import numpy as np + +def test_axis_proj(): + """Check axis projection criterion produces correct results on small toy dataset: + + ------------------ + | X | y1 y2 | weight | + ------------------ + | 3 | 3 3 | 0.1 | + | 5 | 3 3 | 0.3 | + | 8 | 4 4 | 1.0 | + | 3 | 7 7 | 0.6 | + | 5 | 8 8 | 0.3 | + ------------------ + |sum wt:| 2.3 | + ------------------ + + Mean1 = 5 + Mean2 = 5 + + For all the samples, we can get the total error by summing: + (Mean1 - y1)^2 * weight or (Mean2 - y2)^2 * weight + + I.e., total error = (5 - 3)^2 * 0.1) + + (5 - 3)^2 * 0.3) + + (5 - 4)^2 * 1.0) + + (5 - 7)^2 * 0.6) + + (5 - 8)^2 * 0.3) + = 0.4 + 1.2 + 1.0 + 2.4 + 2.7 + = 7.7 + + Impurity = Total error / total weight + = 7.7 / 2.3 + = 3.3478260869565 + ----------------- + + From this root node, the next best split is between X values of 5 and 8. + Thus, we have left and right child nodes: + + LEFT RIGHT + ----------------------- ----------------------- + | X | y1 y2 | weight | | X | y1 y2 | weight | + ----------------------- ----------------------- + | 3 | 3 3 | 0.1 | | 8 | 4 4 | 1.0 | + | 3 | 7 7 | 0.6 | ----------------------- + | 5 | 3 3 | 0.3 | |sum wt:| 1.0 | + | 5 | 8 8 | 0.3 | ----------------------- + ----------------------- + |sum wt:| 1.3 | + ----------------------- + + 5.0625 + 3.0625 + 5.0625 + 7.5625 / 4 + 0 = 5.1875 + 4 + 4.667 = 8.667 + + Impurity is found in the same way: + Left node Mean1 = Mean2 = 5.25 + Total error = ((5.25 - 3)^2 * 0.1) + + ((5.25 - 7)^2 * 0.6) + + ((5.25 - 3)^2 * 0.3) + + ((5.25 - 8)^2 * 0.3) + = 6.13125 + + Left Impurity = Total error / total weight + = 6.13125 / 1.3 + = 4.716346153846154 + ------------------- + + Likewise for Right node: + Right node Mean1 = Mean2 = 4 + Total error = ((4 - 4)^2 * 1.0) + = 0 + + Right Impurity = Total error / total weight + = 0 / 1.0 + = 0.0 + ------ + """ + #y=[[3,3], [3,3], [4,4], [7,7], [8,8]] + dt_axis = DecisionTreeRegressor(random_state=0, criterion="axis", + max_leaf_nodes=2) + + # Test axis projection where sample weights are non-uniform (as illustrated above): + dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], + sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) + assert_allclose(dt_axis.tree_.impurity, [7.7 / 2.3, 6.13125 / 1.3, 0.0 / 1.0], rtol=0.6) + + # Test axis projection where all sample weights are uniform: + dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], + sample_weight=np.ones(5)) + assert_allclose(dt_axis.tree_.impurity, [22.0 / 5.0, 20.75 / 4.0, 0.0 / 1.0], rtol=0.6) + + # Test axis projection where a `sample_weight` is not explicitly provided. + # This is equivalent to providing uniform sample weights, though + # the internal logic is different: + dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) + assert_allclose(dt_axis.tree_.impurity, [22.0 / 5.0, 20.75 / 4.0, 0.0 / 1.0], rtol=0.6) + +def test_oblique_proj(): + """Check oblique projection criterion produces correct results on small toy dataset: + + ----------------------- + | X | y1 y2 | weight | + ----------------------- + | 3 | 3 3 | 0.1 | + | 5 | 3 3 | 0.3 | + | 8 | 4 4 | 1.0 | + | 3 | 7 7 | 0.6 | + | 5 | 8 8 | 0.3 | + ----------------------- + |sum wt:| 2.3 | + ----------------------- + + Mean1 = 5 + Mean_tot = 5 + + For all the samples, we can get the total error by summing: + (Mean1 - y1)^2 * weight or (Mean_tot - y)^2 * weight + + I.e., error1 = (5 - 3)^2 * 0.1) + + (5 - 3)^2 * 0.3) + + (5 - 4)^2 * 1.0) + + (5 - 7)^2 * 0.6) + + (5 - 8)^2 * 0.3) + = 0.4 + 1.2 + 1.0 + 2.4 + 2.7 + = 7.7 + error_tot = 15.4 + + Impurity = error / total weight + = 7.7 / 2.3 + = 3.3478260869565 + or + = 15.4 / 2.3 + = 6.6956521739130 + or + = 0.0 / 2.3 + = 0.0 + ----------------- + + From this root node, the next best split is between X values of 5 and 8. + Thus, we have left and right child nodes: + + LEFT RIGHT + ----------------------- ----------------------- + | X | y1 y2 | weight | | X | y1 y2 | weight | + ----------------------- ----------------------- + | 3 | 3 3 | 0.1 | | 8 | 4 4 | 1.0 | + | 3 | 7 7 | 0.6 | ----------------------- + | 5 | 3 3 | 0.3 | |sum wt:| 1.0 | + | 5 | 8 8 | 0.3 | ----------------------- + ----------------------- + |sum wt:| 1.3 | + ----------------------- + + (5.0625 + 3.0625 + 5.0625 + 7.5625) / 4 + 0 = 5.1875 + 4 + 4.667 = 8.667 + + Impurity is found in the same way: + Left node Mean1 = Mean2 = 5.25 + error1 = ((5.25 - 3)^2 * 0.1) + + ((5.25 - 7)^2 * 0.6) + + ((5.25 - 3)^2 * 0.3) + + ((5.25 - 8)^2 * 0.3) + = 6.13125 + error_tot = 12.2625 + + Left Impurity = Total error / total weight + = 6.13125 / 1.3 + = 4.716346153846154 + or + = 12.2625 / 1.3 + = 9.43269231 + ------------------- + + Likewise for Right node: + Right node Mean1 = Mean2 = 4 + Total error = ((4 - 4)^2 * 1.0) + = 0 + + Right Impurity = Total error / total weight + = 0 / 1.0 + = 0.0 + ------ + """ + + dt_obliq = DecisionTreeRegressor(random_state=3, criterion="oblique", + max_leaf_nodes=2) + + # Test axis projection where sample weights are non-uniform (as illustrated above): + dt_obliq.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], + sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) + try: + assert_allclose(dt_obliq.tree_.impurity, [7.7 / 2.3, 6.13125 / 1.3, 0.0 / 1.0], rtol=0.6) + except: + try: + assert_allclose(dt_obliq.tree_.impurity, [2.0*7.7 / 2.3, 2.0*6.13125 / 1.3, 2.0*0.0 / 1.0], rtol=0.6) + except: + assert_allclose(dt_obliq.tree_.impurity, [0.0, 0.0, 0.0], rtol=0.6) + + # Test axis projection where all sample weights are uniform: + dt_obliq.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], + sample_weight=np.ones(5)) + + try: + assert_allclose(dt_obliq.tree_.impurity, [22.0 / 5.0, 20.75 / 4.0, 0.0 / 1.0], rtol=0.6) + except: + try: + assert_allclose(dt_obliq.tree_.impurity, [2.0*22.0 / 5.0, 2.0*20.75 / 4.0, 2.0*0.0 / 1.0], rtol=0.6) + except: + assert_allclose(dt_obliq.tree_.impurity, [0.0, 0.0, 0.0], rtol=0.6) + + # Test MAE where a `sample_weight` is not explicitly provided. + # This is equivalent to providing uniform sample weights, though + # the internal logic is different: + dt_obliq.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) + try: + assert_allclose(dt_obliq.tree_.impurity, [22.0 / 5.0, 20.75 / 4.0, 0.0 / 1.0], rtol=0.6) + except: + try: + assert_allclose(dt_obliq.tree_.impurity, [2.0*22.0 / 5.0, 2.0*20.75 / 4.0, 2.0*0.0 / 1.0], rtol=0.6) + except: + assert_allclose(dt_obliq.tree_.impurity, [0.0, 0.0, 0.0], rtol=0.6) + +#def test(): + #print("hi") + +if __name__=="__main__": + test_axis_proj() + print("axis passed") + test_oblique_proj() + print("oblique passed") \ No newline at end of file diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index f662e92752a3a..4017a755203e2 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1778,6 +1778,7 @@ def test_mae(): assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0]) assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0]) +''' def test_axis_proj(): """Check axis projection criterion produces correct results on small toy dataset: @@ -1864,21 +1865,21 @@ def test_axis_proj(): sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) - assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity) + assert_allclose(dt_axis.tree_.impurity, [7.7 / 2.3, 6.13125 / 1.3, 0.0 / 1.0], rtol=0.6) # Test axis projection where all sample weights are uniform: dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], sample_weight=np.ones(5)) dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], sample_weight=np.ones(5)) - assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity) + assert_allclose(dt_axis.tree_.impurity, [22.0 / 5.0, 20.75 / 4.0, 0.0 / 1.0], rtol=0.6) # Test axis projection where a `sample_weight` is not explicitly provided. # This is equivalent to providing uniform sample weights, though # the internal logic is different: dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8]) - assert_allclose(dt_axis.tree_.impurity, dt_mse.tree_.impurity) + assert_allclose(dt_axis.tree_.impurity, [22.0 / 5.0, 20.75 / 4.0, 0.0 / 1.0], rtol=0.6) def test_oblique_proj(): """Check oblique projection criterion produces correct results on small toy dataset: @@ -1916,6 +1917,9 @@ def test_oblique_proj(): or = 15.4 / 2.3 = 6.6956521739130 + or + = 0.0 / 2.3 + = 0.0 ----------------- From this root node, the next best split is between X values of 5 and 8. @@ -1964,44 +1968,45 @@ def test_oblique_proj(): ------ """ - #y=[[3,3], [3,3], [4,4], [7,7], [8,8]] dt_obliq = DecisionTreeRegressor(random_state=3, criterion="oblique", max_leaf_nodes=2) - dt_mse = DecisionTreeRegressor(random_state=3, criterion="mse", - max_leaf_nodes=2) # Test axis projection where sample weights are non-uniform (as illustrated above): dt_obliq.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) - dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], - sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) try: - assert_allclose(dt_obliq.tree_.impurity, dt_mse.tree_.impurity*2) + assert_allclose(dt_obliq.tree_.impurity, [7.7 / 2.3, 6.13125 / 1.3, 0.0 / 1.0], rtol=0.6) except: - assert_allclose(dt_obliq.tree_.impurity, dt_mse.tree_.impurity) - + try: + assert_allclose(dt_obliq.tree_.impurity, [2.0*7.7 / 2.3, 2.0*6.13125 / 1.3, 2.0*0.0 / 1.0], rtol=0.6) + except: + assert_allclose(dt_obliq.tree_.impurity, [0.0, 0.0, 0.0], rtol=0.6) + # Test axis projection where all sample weights are uniform: dt_obliq.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], sample_weight=np.ones(5)) - dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], - sample_weight=np.ones(5)) + try: - assert_allclose(dt_obliq.tree_.impurity, dt_mse.tree_.impurity*2) + assert_allclose(dt_obliq.tree_.impurity, [22.0 / 5.0, 20.75 / 4.0, 0.0 / 1.0], rtol=0.6) except: - assert_allclose(dt_obliq.tree_.impurity, dt_mse.tree_.impurity) + try: + assert_allclose(dt_obliq.tree_.impurity, [2.0*22.0 / 5.0, 2.0*20.75 / 4.0, 2.0*0.0 / 1.0], rtol=0.6) + except: + assert_allclose(dt_obliq.tree_.impurity, [0.0, 0.0, 0.0], rtol=0.6) - # Test MAE where a `sample_weight` is not explicitly provided. # This is equivalent to providing uniform sample weights, though # the internal logic is different: dt_obliq.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) - dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8]) try: - assert_allclose(dt_obliq.tree_.impurity, dt_mse.tree_.impurity*2) + assert_allclose(dt_obliq.tree_.impurity, [22.0 / 5.0, 20.75 / 4.0, 0.0 / 1.0], rtol=0.6) except: - assert_allclose(dt_obliq.tree_.impurity, dt_mse.tree_.impurity) - - + try: + assert_allclose(dt_obliq.tree_.impurity, [2.0*22.0 / 5.0, 2.0*20.75 / 4.0, 2.0*0.0 / 1.0], rtol=0.6) + except: + assert_allclose(dt_obliq.tree_.impurity, [0.0, 0.0, 0.0], rtol=0.6) + +''' def test_criterion_copy(): # Let's check whether copy of our criterion has the same type # and properties as original From 6510d0befce17ab2a85ff92518d71a51abee0af0 Mon Sep 17 00:00:00 2001 From: admin Date: Thu, 12 Dec 2019 20:12:51 -0500 Subject: [PATCH 15/20] removed unnecessary comments --- sklearn/tree/tests/test_proj_criteria.py | 6 +- sklearn/tree/tests/test_tree.py | 229 ----------------------- 2 files changed, 2 insertions(+), 233 deletions(-) diff --git a/sklearn/tree/tests/test_proj_criteria.py b/sklearn/tree/tests/test_proj_criteria.py index 74f3f11c8a41a..71c48214a6472 100644 --- a/sklearn/tree/tests/test_proj_criteria.py +++ b/sklearn/tree/tests/test_proj_criteria.py @@ -222,11 +222,9 @@ def test_oblique_proj(): except: assert_allclose(dt_obliq.tree_.impurity, [0.0, 0.0, 0.0], rtol=0.6) -#def test(): - #print("hi") if __name__=="__main__": test_axis_proj() - print("axis passed") + print("axis passed!") test_oblique_proj() - print("oblique passed") \ No newline at end of file + print("oblique passed!") \ No newline at end of file diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 4017a755203e2..1b508b52d9501 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1778,235 +1778,6 @@ def test_mae(): assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0]) assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0]) -''' -def test_axis_proj(): - """Check axis projection criterion produces correct results on small toy dataset: - - ------------------ - | X | y1 y2 | weight | - ------------------ - | 3 | 3 3 | 0.1 | - | 5 | 3 3 | 0.3 | - | 8 | 4 4 | 1.0 | - | 3 | 7 7 | 0.6 | - | 5 | 8 8 | 0.3 | - ------------------ - |sum wt:| 2.3 | - ------------------ - - Mean1 = 5 - Mean2 = 5 - - For all the samples, we can get the total error by summing: - (Mean1 - y1)^2 * weight or (Mean2 - y2)^2 * weight - - I.e., total error = (5 - 3)^2 * 0.1) - + (5 - 3)^2 * 0.3) - + (5 - 4)^2 * 1.0) - + (5 - 7)^2 * 0.6) - + (5 - 8)^2 * 0.3) - = 0.4 + 1.2 + 1.0 + 2.4 + 2.7 - = 7.7 - - Impurity = Total error / total weight - = 7.7 / 2.3 - = 3.3478260869565 - ----------------- - - From this root node, the next best split is between X values of 5 and 8. - Thus, we have left and right child nodes: - - LEFT RIGHT - ----------------------- ----------------------- - | X | y1 y2 | weight | | X | y1 y2 | weight | - ----------------------- ----------------------- - | 3 | 3 3 | 0.1 | | 8 | 4 4 | 1.0 | - | 3 | 7 7 | 0.6 | ----------------------- - | 5 | 3 3 | 0.3 | |sum wt:| 1.0 | - | 5 | 8 8 | 0.3 | ----------------------- - ----------------------- - |sum wt:| 1.3 | - ----------------------- - - 5.0625 + 3.0625 + 5.0625 + 7.5625 / 4 + 0 = 5.1875 - 4 + 4.667 = 8.667 - - Impurity is found in the same way: - Left node Mean1 = Mean2 = 5.25 - Total error = ((5.25 - 3)^2 * 0.1) - + ((5.25 - 7)^2 * 0.6) - + ((5.25 - 3)^2 * 0.3) - + ((5.25 - 8)^2 * 0.3) - = 6.13125 - - Left Impurity = Total error / total weight - = 6.13125 / 1.3 - = 4.716346153846154 - ------------------- - - Likewise for Right node: - Right node Mean1 = Mean2 = 4 - Total error = ((4 - 4)^2 * 1.0) - = 0 - - Right Impurity = Total error / total weight - = 0 / 1.0 - = 0.0 - ------ - """ - #y=[[3,3], [3,3], [4,4], [7,7], [8,8]] - dt_axis = DecisionTreeRegressor(random_state=0, criterion="axis", - max_leaf_nodes=2) - dt_mse = DecisionTreeRegressor(random_state=0, criterion="mse", - max_leaf_nodes=2) - - # Test axis projection where sample weights are non-uniform (as illustrated above): - dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], - sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) - dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], - sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) - assert_allclose(dt_axis.tree_.impurity, [7.7 / 2.3, 6.13125 / 1.3, 0.0 / 1.0], rtol=0.6) - - # Test axis projection where all sample weights are uniform: - dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], - sample_weight=np.ones(5)) - dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], - sample_weight=np.ones(5)) - assert_allclose(dt_axis.tree_.impurity, [22.0 / 5.0, 20.75 / 4.0, 0.0 / 1.0], rtol=0.6) - - # Test axis projection where a `sample_weight` is not explicitly provided. - # This is equivalent to providing uniform sample weights, though - # the internal logic is different: - dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) - dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8]) - assert_allclose(dt_axis.tree_.impurity, [22.0 / 5.0, 20.75 / 4.0, 0.0 / 1.0], rtol=0.6) - -def test_oblique_proj(): - """Check oblique projection criterion produces correct results on small toy dataset: - - ----------------------- - | X | y1 y2 | weight | - ----------------------- - | 3 | 3 3 | 0.1 | - | 5 | 3 3 | 0.3 | - | 8 | 4 4 | 1.0 | - | 3 | 7 7 | 0.6 | - | 5 | 8 8 | 0.3 | - ----------------------- - |sum wt:| 2.3 | - ----------------------- - - Mean1 = 5 - Mean_tot = 5 - - For all the samples, we can get the total error by summing: - (Mean1 - y1)^2 * weight or (Mean_tot - y)^2 * weight - - I.e., error1 = (5 - 3)^2 * 0.1) - + (5 - 3)^2 * 0.3) - + (5 - 4)^2 * 1.0) - + (5 - 7)^2 * 0.6) - + (5 - 8)^2 * 0.3) - = 0.4 + 1.2 + 1.0 + 2.4 + 2.7 - = 7.7 - error_tot = 15.4 - - Impurity = error / total weight - = 7.7 / 2.3 - = 3.3478260869565 - or - = 15.4 / 2.3 - = 6.6956521739130 - or - = 0.0 / 2.3 - = 0.0 - ----------------- - - From this root node, the next best split is between X values of 5 and 8. - Thus, we have left and right child nodes: - - LEFT RIGHT - ----------------------- ----------------------- - | X | y1 y2 | weight | | X | y1 y2 | weight | - ----------------------- ----------------------- - | 3 | 3 3 | 0.1 | | 8 | 4 4 | 1.0 | - | 3 | 7 7 | 0.6 | ----------------------- - | 5 | 3 3 | 0.3 | |sum wt:| 1.0 | - | 5 | 8 8 | 0.3 | ----------------------- - ----------------------- - |sum wt:| 1.3 | - ----------------------- - - (5.0625 + 3.0625 + 5.0625 + 7.5625) / 4 + 0 = 5.1875 - 4 + 4.667 = 8.667 - - Impurity is found in the same way: - Left node Mean1 = Mean2 = 5.25 - error1 = ((5.25 - 3)^2 * 0.1) - + ((5.25 - 7)^2 * 0.6) - + ((5.25 - 3)^2 * 0.3) - + ((5.25 - 8)^2 * 0.3) - = 6.13125 - error_tot = 12.2625 - - Left Impurity = Total error / total weight - = 6.13125 / 1.3 - = 4.716346153846154 - or - = 12.2625 / 1.3 - = 9.43269231 - ------------------- - - Likewise for Right node: - Right node Mean1 = Mean2 = 4 - Total error = ((4 - 4)^2 * 1.0) - = 0 - - Right Impurity = Total error / total weight - = 0 / 1.0 - = 0.0 - ------ - """ - - dt_obliq = DecisionTreeRegressor(random_state=3, criterion="oblique", - max_leaf_nodes=2) - - # Test axis projection where sample weights are non-uniform (as illustrated above): - dt_obliq.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], - sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) - try: - assert_allclose(dt_obliq.tree_.impurity, [7.7 / 2.3, 6.13125 / 1.3, 0.0 / 1.0], rtol=0.6) - except: - try: - assert_allclose(dt_obliq.tree_.impurity, [2.0*7.7 / 2.3, 2.0*6.13125 / 1.3, 2.0*0.0 / 1.0], rtol=0.6) - except: - assert_allclose(dt_obliq.tree_.impurity, [0.0, 0.0, 0.0], rtol=0.6) - - # Test axis projection where all sample weights are uniform: - dt_obliq.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], - sample_weight=np.ones(5)) - - try: - assert_allclose(dt_obliq.tree_.impurity, [22.0 / 5.0, 20.75 / 4.0, 0.0 / 1.0], rtol=0.6) - except: - try: - assert_allclose(dt_obliq.tree_.impurity, [2.0*22.0 / 5.0, 2.0*20.75 / 4.0, 2.0*0.0 / 1.0], rtol=0.6) - except: - assert_allclose(dt_obliq.tree_.impurity, [0.0, 0.0, 0.0], rtol=0.6) - - # Test MAE where a `sample_weight` is not explicitly provided. - # This is equivalent to providing uniform sample weights, though - # the internal logic is different: - dt_obliq.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) - try: - assert_allclose(dt_obliq.tree_.impurity, [22.0 / 5.0, 20.75 / 4.0, 0.0 / 1.0], rtol=0.6) - except: - try: - assert_allclose(dt_obliq.tree_.impurity, [2.0*22.0 / 5.0, 2.0*20.75 / 4.0, 2.0*0.0 / 1.0], rtol=0.6) - except: - assert_allclose(dt_obliq.tree_.impurity, [0.0, 0.0, 0.0], rtol=0.6) - -''' def test_criterion_copy(): # Let's check whether copy of our criterion has the same type # and properties as original From 40a17a6e41c65d39d04f4988dc1c40048097d77c Mon Sep 17 00:00:00 2001 From: admin Date: Thu, 12 Dec 2019 20:15:15 -0500 Subject: [PATCH 16/20] removed unnecessary changes --- sklearn/tree/tests/test_tree.py | 1 + sklearn/tree/tree.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 1b508b52d9501..193b459b93b38 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1778,6 +1778,7 @@ def test_mae(): assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0]) assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0]) + def test_criterion_copy(): # Let's check whether copy of our criterion has the same type # and properties as original diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 40eb6ec07bad8..e5c08f9619824 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -363,6 +363,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, max_leaf_nodes, self.min_impurity_decrease, min_impurity_split) + builder.build(self.tree_, X, y, sample_weight, X_idx_sorted) if self.n_outputs_ == 1 and is_classifier(self): @@ -370,7 +371,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, self.classes_ = self.classes_[0] self._prune_tree() - + return self def _validate_X_predict(self, X, check_input): From 7c023ed0c27cfb7807438f1a5601d6d3039ecc40 Mon Sep 17 00:00:00 2001 From: Morgan Sanchez Date: Fri, 13 Dec 2019 21:52:08 -0500 Subject: [PATCH 17/20] reverting to unshared weights due to memory errors --- sklearn/tree/_criterion.pxd | 11 +- sklearn/tree/_criterion.pyx | 298 ++++++++++++++++---------------- sklearn/tree/_splitter.pxd | 3 +- sklearn/tree/_splitter.pyx | 133 ++++---------- sklearn/tree/tests/test_tree.py | 231 ++++++++++++++++++++++++- sklearn/tree/tree.py | 3 +- 6 files changed, 408 insertions(+), 271 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index bb3f932c564d1..425607342e1a7 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -65,11 +65,6 @@ cdef class Criterion: cdef double impurity_improvement(self, double impurity) nogil cdef double proxy_impurity_improvement(self) nogil - cdef double node_impurity2(self, double* pred_weights) nogil - cdef void children_impurity2(self, double* impurity_left, - double* impurity_right, double* pred_weights) nogil - cdef double proxy_impurity_improvement2(self, double* pred_weights) nogil - cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" @@ -80,8 +75,4 @@ cdef class RegressionCriterion(Criterion): """Abstract regression criterion.""" cdef double sq_sum_total - -cdef class ObliqueProjection(RegressionCriterion): - pass -cdef class AxisProjection(RegressionCriterion): - pass + cdef object random_state # Random state for predictor weights (Projection-Based Splitters) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 9760e5e25fe82..874083b8d6112 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -26,6 +26,8 @@ import numpy as np cimport numpy as np np.import_array() +from ._utils cimport rand_int +from ._utils cimport RAND_R_MAX from ._utils cimport log from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray @@ -74,7 +76,6 @@ cdef class Criterion: The first sample to be used on this node end : SIZE_t The last sample used on this node - """ pass @@ -119,16 +120,6 @@ cdef class Criterion: pass - cdef double node_impurity2(self, double* pred_weights) nogil: - """Placeholder for calculating the impurity of the node. - - Placeholder for a method which will evaluate the impurity of - the current node, i.e. the impurity of samples[start:end]. This is the - primary function of the criterion class. - """ - - pass - cdef void children_impurity(self, double* impurity_left, double* impurity_right) nogil: """Placeholder for calculating the impurity of children. @@ -149,26 +140,6 @@ cdef class Criterion: pass - cdef void children_impurity2(self, double* impurity_left, - double* impurity_right, double* pred_weights) nogil: - """Placeholder for calculating the impurity of children. - - Placeholder for a method which evaluates the impurity in - children nodes, i.e. the impurity of samples[start:pos] + the impurity - of samples[pos:end]. - - Parameters - ---------- - impurity_left : double pointer - The memory address where the impurity of the left child should be - stored. - impurity_right : double pointer - The memory address where the impurity of the right child should be - stored - """ - - pass - cdef void node_value(self, double* dest) nogil: """Placeholder for storing the node value. @@ -201,24 +172,6 @@ cdef class Criterion: return (- self.weighted_n_right * impurity_right - self.weighted_n_left * impurity_left) - cdef double proxy_impurity_improvement2(self, double* pred_weights) nogil: - """Compute a proxy of the impurity reduction - - This method is used to speed up the search for the best split. - It is a proxy quantity such that the split that maximizes this value - also maximizes the impurity improvement. It neglects all constant terms - of the impurity decrease for a given split. - - The absolute impurity improvement is only computed by the - impurity_improvement method once the best split has been found. - """ - cdef double impurity_left - cdef double impurity_right - self.children_impurity2(&impurity_left, &impurity_right, pred_weights) - - return (- self.weighted_n_right * impurity_right - - self.weighted_n_left * impurity_left) - cdef double impurity_improvement(self, double impurity) nogil: """Compute the improvement in impurity @@ -737,7 +690,7 @@ cdef class RegressionCriterion(Criterion): = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ - def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): + def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples, object random_state=None): """Initialize parameters for this criterion. Parameters @@ -747,11 +700,17 @@ cdef class RegressionCriterion(Criterion): n_samples : SIZE_t The total number of samples to fit on + + random_state : object + Random State from splitter class + """ # Default values self.sample_weight = NULL + self.random_state = random_state + self.samples = NULL self.start = 0 self.pos = 0 @@ -1028,7 +987,7 @@ cdef class MAE(RegressionCriterion): cdef np.ndarray right_child cdef DOUBLE_t* node_medians - def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): + def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples, object random_state = None): """Initialize parameters for this criterion. Parameters @@ -1385,47 +1344,49 @@ cdef class AxisProjection(RegressionCriterion): MSE = var_left + var_right """ - cdef double node_impurity2(self, double* pred_weights) nogil: + cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end].""" - cdef double impurity = 0.0 #TODO + cdef double impurity = 0.0 cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t end = self.end cdef SIZE_t start = self.start - cdef double* pred = calloc(end-start, sizeof(double)) - cdef double mean_pred = 0.0 #TODO + cdef double mean_pred = 0.0 cdef DOUBLE_t y_ik cdef SIZE_t i cdef SIZE_t p - cdef SIZE_t k + cdef SIZE_t k + cdef UINT32_t rand_r_state + + with gil: + rand_r_state = self.random_state.randint(0, RAND_R_MAX) + cdef UINT32_t* random_state = &rand_r_state - cdef DOUBLE_t w = 1.0 + k = rand_int(0, self.n_outputs, random_state) + cdef DOUBLE_t w = 1.0 for p in range(start, end): i = samples[p] if sample_weight != NULL: w = sample_weight[i] - for k in range(self.n_outputs): - y_ik = self.y[i, k] - # sum over all predictors with pred weights - pred[p] += y_ik * pred_weights[k] - # sum over all samples to get mean of new predictor - mean_pred += pred[p] / (end - start) - + y_ik = self.y[i, k] + mean_pred += y_ik / (end - start) + for p in range(start, end): i = samples[p] if sample_weight != NULL: w = sample_weight[i] - impurity += (mean_pred - pred[p]) * (mean_pred - pred[p]) * w + impurity += (mean_pred - self.y[i, k]) * (mean_pred - self.y[i, k]) * w impurity /= self.weighted_n_node_samples + return impurity - cdef double proxy_impurity_improvement2(self, double* pred_weights) nogil: + cdef double proxy_impurity_improvement(self) nogil: """Compute a proxy of the impurity reduction This method is used to speed up the search for the best split. @@ -1436,12 +1397,7 @@ cdef class AxisProjection(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ - ''' - cdef SIZE_t i - with gil: - for i in range(self.n_outputs): - print("proxy weights: ", pred_weights[i]) - ''' + cdef double* sum_left = self.sum_left cdef double* sum_right = self.sum_right @@ -1449,114 +1405,130 @@ cdef class AxisProjection(RegressionCriterion): cdef double proxy_impurity_left = 0.0 cdef double proxy_impurity_right = 0.0 + cdef UINT32_t rand_r_state + with gil: - for k in range(self.n_outputs): - proxy_impurity_left += sum_left[k] * sum_left[k] * abs(pred_weights[k]) - proxy_impurity_right += sum_right[k] * sum_right[k] * abs(pred_weights[k]) - #with gil: - # return (abs(proxy_impurity_left / self.weighted_n_left) + - # abs(proxy_impurity_right / self.weighted_n_right)) + rand_r_state = self.random_state.randint(0, RAND_R_MAX) + cdef UINT32_t* random_state = &rand_r_state + + k = rand_int(0, self.n_outputs, random_state) + + proxy_impurity_left += sum_left[k] * sum_left[k] + proxy_impurity_right += sum_right[k] * sum_right[k] + return (proxy_impurity_left / self.weighted_n_left + proxy_impurity_right / self.weighted_n_right) - - cdef void children_impurity2(self, double* impurity_left, - double* impurity_right, double* pred_weights) nogil: + cdef void children_impurity(self, double* impurity_left, + double* impurity_right) nogil: """Evaluate the impurity in children nodes, i.e. the impurity of the left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" + cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t pos = self.pos cdef SIZE_t start = self.start cdef SIZE_t end = self.end + cdef DOUBLE_t y_ik + impurity_left[0] = 0.0 impurity_right[0] = 0.0 - cdef double* pred_left = calloc(pos-start, sizeof(double)) - cdef double* pred_right = calloc(end-pos, sizeof(double)) - cdef double mean_pred_left = 0.0 #TODO - cdef double mean_pred_right = 0.0 #TODO - cdef DOUBLE_t y_ik + cdef double mean_pred_left = 0.0 + cdef double mean_pred_right = 0.0 cdef SIZE_t i cdef SIZE_t p - cdef SIZE_t k # modified + cdef SIZE_t k + cdef UINT32_t rand_r_state + + with gil: + rand_r_state = self.random_state.randint(0, RAND_R_MAX) + cdef UINT32_t* random_state = &rand_r_state + + k = rand_int(0, self.n_outputs, random_state) cdef DOUBLE_t w = 1.0 - for p in range(start, pos): i = samples[p] if sample_weight != NULL: w = sample_weight[i] - for k in range(self.n_outputs): - y_ik = self.y[i, k] - # sum over all predictors with pred weights - pred_left[p] += y_ik * pred_weights[k] - # sum over all samples to get mean of new predictor - mean_pred_left += pred_left[p] / (pos - start) - w = 1.0 + y_ik = self.y[i, k] + mean_pred_left += y_ik / (pos - start) + for p in range(start, pos): i = samples[p] if sample_weight != NULL: w = sample_weight[i] - impurity_left[0] += ((mean_pred_left - pred_left[p]) - * (mean_pred_left - pred_left[p]) * w)/self.weighted_n_left - w = 1.0 + impurity_left[0] += ((mean_pred_left - self.y[i, k]) + * (mean_pred_left - self.y[i, k]) * w)/self.weighted_n_left + for p in range(pos, end): i = samples[p] if sample_weight != NULL: w = sample_weight[i] - for k in range(self.n_outputs): - y_ik = self.y[i, k] - # sum over all predictors with pred weights - pred_right[p - pos] += y_ik * pred_weights[k] - # sum over all samples to get mean of new predictor - for p in range(pos, end): - mean_pred_right += pred_right[p-pos] / (end - pos) + y_ik = self.y[i, k] + mean_pred_right += y_ik / (end - pos) - w = 1.0 for p in range(pos, end): i = samples[p] if sample_weight != NULL: w = sample_weight[i] - impurity_right[0] += ((mean_pred_right - pred_right[p - pos]) * (mean_pred_right - pred_right[p-pos]) * w) / self.weighted_n_right - + impurity_right[0] += ((mean_pred_right - self.y[i, k]) + * (mean_pred_right - self.y[i, k]) * w)/self.weighted_n_right + impurity_left[0] impurity_right[0] - cdef class ObliqueProjection(RegressionCriterion): r"""Mean squared error impurity criterion of oblique projections of high dimensional y Algorithm: - 1. select a random predictors from [0,n_outputs] - 2. Set weights of chosen predictors to -1 or 1 - 3. compute mse on the values of those predictors for all samples + 1. Select a random number of random predictors from [0,n_outputs] + 2. Assign weights (-1 or 1) to all chosen predictors + 3. Assign weight of 0 to all unchosen predictors + 4. Compute new predictor (linear combination of all predictors) + 5. Compute mse on new predictor MSE = var_left + var_right """ - - cdef double node_impurity2(self, double* pred_weights) nogil: + cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end].""" - cdef double impurity = 0.0 #TODO + cdef double impurity = 0.0 cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t end = self.end cdef SIZE_t start = self.start cdef double* pred = calloc(end-start, sizeof(double)) - cdef double mean_pred = 0.0 #TODO + cdef double mean_pred = 0.0 cdef DOUBLE_t y_ik cdef SIZE_t i cdef SIZE_t p - cdef SIZE_t k + cdef SIZE_t k + cdef UINT32_t rand_r_state + cdef SIZE_t num_pred + cdef SIZE_t a + pred_weights = calloc(self.n_outputs, sizeof(double)) + + with gil: + rand_r_state = self.random_state.randint(0, RAND_R_MAX) + cdef UINT32_t* random_state = &rand_r_state - cdef DOUBLE_t w = 1.0 + num_pred = rand_int(1, self.n_outputs+1, random_state) + for i in range(num_pred): + k = rand_int(0, self.n_outputs, random_state) + a = rand_int(0, 2, random_state) + if a == 0: + a -= 1 + pred_weights[k] = a # didn't normalize + + cdef DOUBLE_t w = 1.0 for p in range(start, end): i = samples[p] @@ -1566,19 +1538,23 @@ cdef class ObliqueProjection(RegressionCriterion): y_ik = self.y[i, k] # sum over all predictors with pred weights pred[p] += y_ik * pred_weights[k] - # sum over all samples to get mean of new predictor - mean_pred += pred[p] / (end - start) + for p in range(start, end): + # sum over all samples to get mean of new predictor + with gil: mean_pred += pred[p] / (end - start) + for p in range(start, end): i = samples[p] if sample_weight != NULL: w = sample_weight[i] - impurity += (mean_pred - pred[p]) * (mean_pred - pred[p]) * w + with gil: impurity += (mean_pred - pred[p]) * (mean_pred - pred[p]) * w impurity /= self.weighted_n_node_samples + + free(pred_weights) + free(pred) return impurity - - cdef double proxy_impurity_improvement2(self, double* pred_weights) nogil: + cdef double proxy_impurity_improvement(self) nogil: """Compute a proxy of the impurity reduction This method is used to speed up the search for the best split. @@ -1589,12 +1565,7 @@ cdef class ObliqueProjection(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ - ''' - cdef SIZE_t i - with gil: - for i in range(self.n_outputs): - print("proxy weights: ", pred_weights[i]) - ''' + cdef double* sum_left = self.sum_left cdef double* sum_right = self.sum_right @@ -1602,42 +1573,56 @@ cdef class ObliqueProjection(RegressionCriterion): cdef double proxy_impurity_left = 0.0 cdef double proxy_impurity_right = 0.0 - with gil: - for k in range(self.n_outputs): - proxy_impurity_left += sum_left[k] * sum_left[k] * abs(pred_weights[k]) - proxy_impurity_right += sum_right[k] * sum_right[k] * abs(pred_weights[k]) - #with gil: - # return (abs(proxy_impurity_left / self.weighted_n_left) + - # abs(proxy_impurity_right / self.weighted_n_right)) + for k in range(self.n_outputs): + proxy_impurity_left += sum_left[k] * sum_left[k] + proxy_impurity_right += sum_right[k] * sum_right[k] + return (proxy_impurity_left / self.weighted_n_left + proxy_impurity_right / self.weighted_n_right) - - cdef void children_impurity2(self, double* impurity_left, - double* impurity_right, double* pred_weights) nogil: + cdef void children_impurity(self, double* impurity_left, + double* impurity_right) nogil: """Evaluate the impurity in children nodes, i.e. the impurity of the left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" + cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t pos = self.pos cdef SIZE_t start = self.start cdef SIZE_t end = self.end + cdef DOUBLE_t y_ik + impurity_left[0] = 0.0 impurity_right[0] = 0.0 cdef double* pred_left = calloc(pos-start, sizeof(double)) cdef double* pred_right = calloc(end-pos, sizeof(double)) - cdef double mean_pred_left = 0.0 #TODO - cdef double mean_pred_right = 0.0 #TODO - cdef DOUBLE_t y_ik + cdef double mean_pred_left = 0.0 + cdef double mean_pred_right = 0.0 cdef SIZE_t i cdef SIZE_t p - cdef SIZE_t k # modified + cdef SIZE_t k + cdef UINT32_t rand_r_state + cdef SIZE_t num_pred + cdef SIZE_t a + pred_weights = calloc(self.n_outputs, sizeof(double)) + + with gil: + rand_r_state = self.random_state.randint(0, RAND_R_MAX) + cdef UINT32_t* random_state = &rand_r_state + + num_pred = rand_int(0, self.n_outputs, random_state) + + for i in range(num_pred): + k = rand_int(0, self.n_outputs, random_state) + a = rand_int(0, 2, random_state) + if a == 0: + a -= 1 + pred_weights[k] = a # didn't normalize cdef DOUBLE_t w = 1.0 - for p in range(start, pos): i = samples[p] if sample_weight != NULL: @@ -1646,16 +1631,18 @@ cdef class ObliqueProjection(RegressionCriterion): y_ik = self.y[i, k] # sum over all predictors with pred weights pred_left[p] += y_ik * pred_weights[k] - # sum over all samples to get mean of new predictor - mean_pred_left += pred_left[p] / (pos - start) - w = 1.0 + + for p in range(start, pos): + # sum over all samples to get mean of new predictor + mean_pred_left += pred_left[p] / (pos - start) + for p in range(start, pos): i = samples[p] if sample_weight != NULL: w = sample_weight[i] impurity_left[0] += ((mean_pred_left - pred_left[p]) - * (mean_pred_left - pred_left[p]) * w)/self.weighted_n_left - w = 1.0 + * (mean_pred_left - pred_left[p]) * w)/self.weighted_n_left + for p in range(pos, end): i = samples[p] if sample_weight != NULL: @@ -1665,15 +1652,20 @@ cdef class ObliqueProjection(RegressionCriterion): # sum over all predictors with pred weights pred_right[p - pos] += y_ik * pred_weights[k] # sum over all samples to get mean of new predictor + for p in range(pos, end): mean_pred_right += pred_right[p-pos] / (end - pos) - w = 1.0 for p in range(pos, end): i = samples[p] if sample_weight != NULL: w = sample_weight[i] - impurity_right[0] += ((mean_pred_right - pred_right[p - pos]) * (mean_pred_right - pred_right[p-pos]) * w) / self.weighted_n_right - + impurity_right[0] += ((mean_pred_right - pred_right[p - pos]) + * (mean_pred_right - pred_right[p-pos]) * w) / self.weighted_n_right + impurity_left[0] - impurity_right[0] \ No newline at end of file + impurity_right[0] + + free(pred_weights) + free(pred_left) + free(pred_right) \ No newline at end of file diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index a192cd595b6e2..7404c071048bb 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -30,7 +30,6 @@ cdef struct SplitRecord: double improvement # Impurity improvement given parent node. double impurity_left # Impurity of the left split. double impurity_right # Impurity of the right split. - double* pred_weights # predictor weights for Oblique/Axis Projections cdef class Splitter: # The splitter searches in the input space for a feature and a threshold @@ -92,4 +91,4 @@ cdef class Splitter: cdef void node_value(self, double* dest) nogil - cdef double node_impurity(self, SplitRecord* split) nogil + cdef double node_impurity(self) nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 56c5b0e54e631..ec9a087c00878 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -16,11 +16,9 @@ # License: BSD 3 clause from ._criterion cimport Criterion -from ._criterion cimport ObliqueProjection -from ._criterion cimport AxisProjection + from libc.stdlib cimport free from libc.stdlib cimport qsort -from libc.stdlib cimport calloc from libc.string cimport memcpy from libc.string cimport memset @@ -45,41 +43,14 @@ cdef DTYPE_t FEATURE_THRESHOLD = 1e-7 # in SparseSplitter cdef DTYPE_t EXTRACT_NNZ_SWITCH = 0.1 -cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos, SIZE_t n_outputs, UINT32_t* random_state, Criterion criterion) nogil: +cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) nogil: self.impurity_left = INFINITY self.impurity_right = INFINITY self.pos = start_pos self.feature = 0 self.threshold = 0. self.improvement = -INFINITY - if (self.pred_weights): - _init_pred_weights(self, n_outputs, random_state, criterion) -cdef inline void _init_pred_weights(SplitRecord* self, SIZE_t n_outputs, UINT32_t* random_state, Criterion criterion) nogil: - cdef SIZE_t num_pred - cdef SIZE_t a - cdef SIZE_t k - #with gil: __dealloc__(self) - self.pred_weights = calloc(n_outputs, sizeof(double)) - with gil: - if isinstance(criterion, ObliqueProjection): - num_pred = rand_int(1, n_outputs+1, random_state) - - for i in range(num_pred): - k = rand_int(0, n_outputs, random_state) - a = rand_int(0, 2, random_state) - if a == 0: - a -= 1 - self.pred_weights[k] = a # didn't normalize - elif isinstance(criterion, AxisProjection): - k = rand_int(0, n_outputs, random_state) - self.pred_weights[k] = 1.0 -''' -cdef __dealloc__(SplitRecord* self): - if not (self.pred_weights): - print("freeeee") - free(self.pred_weights) -''' cdef class Splitter: """Abstract splitter class. @@ -260,14 +231,10 @@ cdef class Splitter: self.criterion.node_value(dest) - cdef double node_impurity(self, SplitRecord* split) nogil: + cdef double node_impurity(self) nogil: """Return the impurity of the current node.""" - with gil: - if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): - _init_pred_weights(split, self.y.shape[1], &self.rand_r_state, self.criterion) - return self.criterion.node_impurity2(split.pred_weights) - else: - return self.criterion.node_impurity() + + return self.criterion.node_impurity() cdef class BaseDenseSplitter(Splitter): @@ -342,7 +309,7 @@ cdef class BestSplitter(BaseDenseSplitter): cdef SplitRecord best, current cdef double current_proxy_improvement = -INFINITY cdef double best_proxy_improvement = -INFINITY - + cdef SIZE_t f_i = n_features cdef SIZE_t f_j cdef SIZE_t p @@ -362,7 +329,7 @@ cdef class BestSplitter(BaseDenseSplitter): cdef DTYPE_t current_feature_value cdef SIZE_t partition_end - _init_split(&best, end, self.y.shape[1], random_state, self.criterion) + _init_split(&best, end) # Sample up to max_features without replacement using a # Fisher-Yates-based algorithm (using the local variables `f_i` and @@ -456,11 +423,8 @@ cdef class BestSplitter(BaseDenseSplitter): if ((self.criterion.weighted_n_left < min_weight_leaf) or (self.criterion.weighted_n_right < min_weight_leaf)): continue - with gil: - if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): - current_proxy_improvement = self.criterion.proxy_impurity_improvement2(split.pred_weights) - else: - current_proxy_improvement = self.criterion.proxy_impurity_improvement() + + current_proxy_improvement = self.criterion.proxy_impurity_improvement() if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement @@ -471,6 +435,7 @@ cdef class BestSplitter(BaseDenseSplitter): (current.threshold == INFINITY) or (current.threshold == -INFINITY)): current.threshold = Xf[p - 1] + best = current # copy # Reorganize into samples[start:best.pos] + samples[best.pos:end] @@ -490,12 +455,7 @@ cdef class BestSplitter(BaseDenseSplitter): self.criterion.reset() self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) - with gil: - if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): - self.criterion.children_impurity2(&best.impurity_left, - &best.impurity_right, split.pred_weights) - else: - self.criterion.children_impurity(&best.impurity_left, + self.criterion.children_impurity(&best.impurity_left, &best.impurity_right) # Respect invariant for constant features: the original order of @@ -680,7 +640,7 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef DTYPE_t max_feature_value cdef DTYPE_t current_feature_value - _init_split(&best, end, self.y.shape[1], random_state, self.criterion) + _init_split(&best, end) # Sample up to max_features without replacement using a # Fisher-Yates-based algorithm (using the local variables `f_i` and @@ -783,12 +743,8 @@ cdef class RandomSplitter(BaseDenseSplitter): if ((self.criterion.weighted_n_left < min_weight_leaf) or (self.criterion.weighted_n_right < min_weight_leaf)): continue - with gil: - if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): - current_proxy_improvement = self.criterion.proxy_impurity_improvement2(split.pred_weights) - else: - current_proxy_improvement = self.criterion.proxy_impurity_improvement() - + + current_proxy_improvement = self.criterion.proxy_impurity_improvement() if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement @@ -810,13 +766,9 @@ cdef class RandomSplitter(BaseDenseSplitter): self.criterion.reset() self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) - with gil: - if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): - self.criterion.children_impurity2(&best.impurity_left, - &best.impurity_right, split.pred_weights) - else: - self.criterion.children_impurity(&best.impurity_left, - &best.impurity_right) + self.criterion.children_impurity(&best.impurity_left, + &best.impurity_right) + # Respect invariant for constant features: the original order of # element in features[:n_known_constants] must be preserved for sibling # and child nodes @@ -1192,9 +1144,7 @@ cdef class BestSparseSplitter(BaseSparseSplitter): cdef UINT32_t* random_state = &self.rand_r_state cdef SplitRecord best, current - - _init_split(&best, end, self.y.shape[1], random_state, self.criterion) - + _init_split(&best, end) cdef double current_proxy_improvement = - INFINITY cdef double best_proxy_improvement = - INFINITY @@ -1339,11 +1289,8 @@ cdef class BestSparseSplitter(BaseSparseSplitter): if ((self.criterion.weighted_n_left < min_weight_leaf) or (self.criterion.weighted_n_right < min_weight_leaf)): continue - with gil: - if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): - current_proxy_improvement = self.criterion.proxy_impurity_improvement2(split.pred_weights) - else: - current_proxy_improvement = self.criterion.proxy_impurity_improvement() + + current_proxy_improvement = self.criterion.proxy_impurity_improvement() if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement @@ -1368,13 +1315,9 @@ cdef class BestSparseSplitter(BaseSparseSplitter): self.criterion.reset() self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) - with gil: - if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): - self.criterion.children_impurity2(&best.impurity_left, - &best.impurity_right, split.pred_weights) - else: - self.criterion.children_impurity(&best.impurity_left, - &best.impurity_right) + self.criterion.children_impurity(&best.impurity_left, + &best.impurity_right) + # Respect invariant for constant features: the original order of # element in features[:n_known_constants] must be preserved for sibling # and child nodes @@ -1430,9 +1373,7 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): cdef UINT32_t* random_state = &self.rand_r_state cdef SplitRecord best, current - - _init_split(&best, end, self.y.shape[1], random_state, self.criterion) - + _init_split(&best, end) cdef double current_proxy_improvement = - INFINITY cdef double best_proxy_improvement = - INFINITY @@ -1579,22 +1520,15 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): if ((self.criterion.weighted_n_left < min_weight_leaf) or (self.criterion.weighted_n_right < min_weight_leaf)): continue - with gil: - if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): - current_proxy_improvement = self.criterion.proxy_impurity_improvement2(split.pred_weights) - else: - current_proxy_improvement = self.criterion.proxy_impurity_improvement() + + current_proxy_improvement = self.criterion.proxy_impurity_improvement() if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement current.improvement = self.criterion.impurity_improvement(impurity) - with gil: - if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): - self.criterion.children_impurity2(¤t.impurity_left, - ¤t.impurity_right, split.pred_weights) - else: - self.criterion.children_impurity(¤t.impurity_left, - ¤t.impurity_right) + + self.criterion.children_impurity(¤t.impurity_left, + ¤t.impurity_right) best = current # Reorganize into samples[start:best.pos] + samples[best.pos:end] @@ -1609,13 +1543,8 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): self.criterion.reset() self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) - with gil: - if isinstance(self.criterion, ObliqueProjection) or isinstance(self.criterion, AxisProjection): - self.criterion.children_impurity2(&best.impurity_left, - &best.impurity_right, split.pred_weights) - else: - self.criterion.children_impurity(&best.impurity_left, - &best.impurity_right) + self.criterion.children_impurity(&best.impurity_left, + &best.impurity_right) # Respect invariant for constant features: the original order of # element in features[:n_known_constants] must be preserved for sibling diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 193b459b93b38..b2bec2ec42b90 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -45,7 +45,7 @@ from sklearn.utils import compute_sample_weight CLF_CRITERIONS = ("gini", "entropy") -REG_CRITERIONS = ("mse", "mae", "friedman_mse") +REG_CRITERIONS = ("mse", "mae", "friedman_mse", "axis", "oblique") CLF_TREES = { "DecisionTreeClassifier": DecisionTreeClassifier, @@ -260,11 +260,12 @@ def test_iris(): "Failed with {0}, criterion = {1} and score = {2}" "".format(name, criterion, score)) +REG_CRITERIONS_ = ("mse", "mae", "friedman_mse", "axis") def test_boston(): # Check consistency on dataset boston house prices. - for (name, Tree), criterion in product(REG_TREES.items(), REG_CRITERIONS): + for (name, Tree), criterion in product(REG_TREES.items(), REG_CRITERIONS_): reg = Tree(criterion=criterion, random_state=0) reg.fit(boston.data, boston.target) score = mean_squared_error(boston.target, reg.predict(boston.data)) @@ -281,7 +282,6 @@ def test_boston(): "Failed with {0}, criterion = {1} and score = {2}" "".format(name, criterion, score)) - def test_probability(): # Predict probabilities using DecisionTreeClassifier. @@ -1778,6 +1778,231 @@ def test_mae(): assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0]) assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0]) +def test_axis_proj(): + """Check axis projection criterion produces correct results on small toy dataset: + + ------------------ + | X | y1 y2 | weight | + ------------------ + | 3 | 3 3 | 0.1 | + | 5 | 3 3 | 0.3 | + | 8 | 4 4 | 1.0 | + | 3 | 7 7 | 0.6 | + | 5 | 8 8 | 0.3 | + ------------------ + |sum wt:| 2.3 | + ------------------ + + Mean1 = 5 + Mean2 = 5 + + For all the samples, we can get the total error by summing: + (Mean1 - y1)^2 * weight or (Mean2 - y2)^2 * weight + + I.e., total error = (5 - 3)^2 * 0.1) + + (5 - 3)^2 * 0.3) + + (5 - 4)^2 * 1.0) + + (5 - 7)^2 * 0.6) + + (5 - 8)^2 * 0.3) + = 0.4 + 1.2 + 1.0 + 2.4 + 2.7 + = 7.7 + + Impurity = Total error / total weight + = 7.7 / 2.3 + = 3.3478260869565 + ----------------- + + From this root node, the next best split is between X values of 5 and 8. + Thus, we have left and right child nodes: + + LEFT RIGHT + ----------------------- ----------------------- + | X | y1 y2 | weight | | X | y1 y2 | weight | + ----------------------- ----------------------- + | 3 | 3 3 | 0.1 | | 8 | 4 4 | 1.0 | + | 3 | 7 7 | 0.6 | ----------------------- + | 5 | 3 3 | 0.3 | |sum wt:| 1.0 | + | 5 | 8 8 | 0.3 | ----------------------- + ----------------------- + |sum wt:| 1.3 | + ----------------------- + + 5.0625 + 3.0625 + 5.0625 + 7.5625 / 4 + 0 = 5.1875 + 4 + 4.667 = 8.667 + + Impurity is found in the same way: + Left node Mean1 = Mean2 = 5.25 + Total error = ((5.25 - 3)^2 * 0.1) + + ((5.25 - 7)^2 * 0.6) + + ((5.25 - 3)^2 * 0.3) + + ((5.25 - 8)^2 * 0.3) + = 6.13125 + + Left Impurity = Total error / total weight + = 6.13125 / 1.3 + = 4.716346153846154 + ------------------- + + Likewise for Right node: + Right node Mean1 = Mean2 = 4 + Total error = ((4 - 4)^2 * 1.0) + = 0 + + Right Impurity = Total error / total weight + = 0 / 1.0 + = 0.0 + ------ + """ + dt_axis = DecisionTreeRegressor(random_state=0, criterion="axis", + max_leaf_nodes=2) + # Test axis projection where sample weights are non-uniform (as illustrated above): + dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3], [3], [4], [7], [8]], + sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) + assert(abs(7.7 / 2.3 - dt_axis.tree_.impurity[0]) < 0.01) + assert(abs(6.13125 / 1.3 - dt_axis.tree_.impurity[1]) < 0.01) + assert(abs(dt_axis.tree_.impurity[2]) < 0.01) + + # Test axis projection where all sample weights are uniform: + dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], + sample_weight=np.ones(5)) + assert(abs(22.0 / 5.0 - dt_axis.tree_.impurity[0]) < 0.01) + assert(abs(20.75 / 4.0 - dt_axis.tree_.impurity[1]) < 0.01) + assert(abs(dt_axis.tree_.impurity[2]) < 0.01) + + # Test axis projections where a `sample_weight` is not explicitly provided. + # This is equivalent to providing uniform sample weights, though + # the internal logic is different: + dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) + assert(abs(22.0 / 5.0 - dt_axis.tree_.impurity[0]) < 0.01) + assert(abs(20.75 / 4.0 - dt_axis.tree_.impurity[1]) < 0.01) + assert(abs(dt_axis.tree_.impurity[2]) < 0.01) + +def test_oblique_proj(): + """Check oblique projection criterion produces correct results on small toy dataset: + + ----------------------- + | X | y1 y2 | weight | + ----------------------- + | 3 | 3 3 | 0.1 | + | 5 | 3 3 | 0.3 | + | 8 | 4 4 | 1.0 | + | 3 | 7 7 | 0.6 | + | 5 | 8 8 | 0.3 | + ----------------------- + |sum wt:| 2.3 | + ----------------------- + + Mean1 = 5 + Mean2 = 5 + + For all the samples, we can get the total error by summing: + (Mean1 - y1)^2 * weight or (Mean2 - y)^2 * weight + + I.e., error1 = (5 - 3)^2 * 0.1) + + (5 - 3)^2 * 0.3) + + (5 - 4)^2 * 1.0) + + (5 - 7)^2 * 0.6) + + (5 - 8)^2 * 0.3) + = 0.4 + 1.2 + 1.0 + 2.4 + 2.7 + = 7.7 + error_tot = 15.4 + + Impurity = error / total weight + = 7.7 / 2.3 + = 3.3478260869565 + or + = 15.4 / 2.3 + = 6.6956521739130 + or + = 0.0 + ----------------- + + From this root node, the next best split is between X values of 5 and 8. + Thus, we have left and right child nodes: + + LEFT RIGHT + ----------------------- ----------------------- + | X | y1 y2 | weight | | X | y1 y2 | weight | + ----------------------- ----------------------- + | 3 | 3 3 | 0.1 | | 8 | 4 4 | 1.0 | + | 3 | 7 7 | 0.6 | ----------------------- + | 5 | 3 3 | 0.3 | |sum wt:| 1.0 | + | 5 | 8 8 | 0.3 | ----------------------- + ----------------------- + |sum wt:| 1.3 | + ----------------------- + + (5.0625 + 3.0625 + 5.0625 + 7.5625) / 4 + 0 = 5.1875 + 4 + 4.667 = 8.667 + + Impurity is found in the same way: + Left node Mean1 = Mean2 = 5.25 + error1 = ((5.25 - 3)^2 * 0.1) + + ((5.25 - 7)^2 * 0.6) + + ((5.25 - 3)^2 * 0.3) + + ((5.25 - 8)^2 * 0.3) + = 6.13125 + error_tot = 12.2625 + + Left Impurity = Total error / total weight + = 6.13125 / 1.3 + = 4.716346153846154 + or + = 12.2625 / 1.3 + = 9.43269231 + or + = 0.0 + ------------------- + + Likewise for Right node: + Right node Mean1 = Mean2 = 4 + Total error = ((4 - 4)^2 * 1.0) + = 0 + + Right Impurity = Total error / total weight + = 0 / 1.0 + = 0.0 + ------ + """ + dt_oblique = DecisionTreeRegressor(random_state=3, criterion="oblique", + max_leaf_nodes=2) + + # Test oblique projection where sample weights are non-uniform (as illustrated above): + dt_oblique.fit(X=[[3], [5], [8], [3], [5]], y=[[3, 3], [3, 3], [4, 4], [7, 7], [8, 8]], + sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) + print(dt_oblique.tree_.impurity) + assert(abs(7.7 / 2.3 - dt_oblique.tree_.impurity[0]) < 0.01 + or abs(2.0 * 7.7 / 2.3 - dt_oblique.tree_.impurity[0]) < 0.01 + or abs(dt_oblique.tree_.impurity[0]) < 0.01) + assert(abs(6.13125 / 1.3 - dt_oblique.tree_.impurity[1]) < 0.01 + or abs(2.0 * 6.13125 / 1.3 - dt_oblique.tree_.impurity[1]) < 0.01 + or abs(dt_oblique.tree_.impurity[1]) < 0.01) + assert(abs(dt_oblique.tree_.impurity[2]) < 0.01) + + # Test oblique projection where all sample weights are uniform: + dt_oblique.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], + sample_weight=np.ones(5)) + + assert(abs(22.0 / 5.0 - dt_oblique.tree_.impurity[0]) < 0.01 + or abs(2.0 * 22.0 / 5.0 - dt_oblique.tree_.impurity[0]) < 0.01 + or abs(dt_oblique.tree_.impurity[0]) < 0.01) + assert(abs(20.75 / 4.0 - dt_oblique.tree_.impurity[1]) < 0.01 + or abs(2.0 * 20.75 / 4.0 - dt_oblique.tree_.impurity[1]) < 0.01 + or abs(dt_oblique.tree_.impurity[1]) < 0.01) + assert(abs(dt_oblique.tree_.impurity[2]) < 0.01) + + # Test oblique projections where a `sample_weight` is not explicitly provided. + # This is equivalent to providing uniform sample weights, though + # the internal logic is different: + dt_oblique.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) + assert(abs(22.0 / 5.0 - dt_oblique.tree_.impurity[0]) < 0.01 + or abs(2.0 * 22.0 / 5.0 - dt_oblique.tree_.impurity[0]) < 0.01 + or abs(dt_oblique.tree_.impurity[0]) < 0.01) + assert(abs(20.75 / 4.0 - dt_oblique.tree_.impurity[1]) < 0.01 + or abs(2.0 * 20.75 / 4.0 - dt_oblique.tree_.impurity[1]) < 0.01 + or abs(dt_oblique.tree_.impurity[1]) < 0.01) + assert(abs(dt_oblique.tree_.impurity[2]) < 0.01) + def test_criterion_copy(): # Let's check whether copy of our criterion has the same type diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index e5c08f9619824..522252fef0536 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -326,7 +326,8 @@ def fit(self, X, y, sample_weight=None, check_input=True, self.n_classes_) else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_, - n_samples) + n_samples, + random_state) SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS From a50062bd4a6c257ad84a35d443375a7dbdfc14ca Mon Sep 17 00:00:00 2001 From: Morgan Sanchez <39866994+morgsmss7@users.noreply.github.com> Date: Sat, 14 Dec 2019 14:08:29 -0500 Subject: [PATCH 18/20] remove proj criteria test file. (tests are now in test_tree.py) --- sklearn/tree/tests/test_proj_criteria.py | 230 ----------------------- 1 file changed, 230 deletions(-) delete mode 100644 sklearn/tree/tests/test_proj_criteria.py diff --git a/sklearn/tree/tests/test_proj_criteria.py b/sklearn/tree/tests/test_proj_criteria.py deleted file mode 100644 index 71c48214a6472..0000000000000 --- a/sklearn/tree/tests/test_proj_criteria.py +++ /dev/null @@ -1,230 +0,0 @@ -from sklearn.utils.testing import assert_allclose -from sklearn.tree import DecisionTreeRegressor -import numpy as np - -def test_axis_proj(): - """Check axis projection criterion produces correct results on small toy dataset: - - ------------------ - | X | y1 y2 | weight | - ------------------ - | 3 | 3 3 | 0.1 | - | 5 | 3 3 | 0.3 | - | 8 | 4 4 | 1.0 | - | 3 | 7 7 | 0.6 | - | 5 | 8 8 | 0.3 | - ------------------ - |sum wt:| 2.3 | - ------------------ - - Mean1 = 5 - Mean2 = 5 - - For all the samples, we can get the total error by summing: - (Mean1 - y1)^2 * weight or (Mean2 - y2)^2 * weight - - I.e., total error = (5 - 3)^2 * 0.1) - + (5 - 3)^2 * 0.3) - + (5 - 4)^2 * 1.0) - + (5 - 7)^2 * 0.6) - + (5 - 8)^2 * 0.3) - = 0.4 + 1.2 + 1.0 + 2.4 + 2.7 - = 7.7 - - Impurity = Total error / total weight - = 7.7 / 2.3 - = 3.3478260869565 - ----------------- - - From this root node, the next best split is between X values of 5 and 8. - Thus, we have left and right child nodes: - - LEFT RIGHT - ----------------------- ----------------------- - | X | y1 y2 | weight | | X | y1 y2 | weight | - ----------------------- ----------------------- - | 3 | 3 3 | 0.1 | | 8 | 4 4 | 1.0 | - | 3 | 7 7 | 0.6 | ----------------------- - | 5 | 3 3 | 0.3 | |sum wt:| 1.0 | - | 5 | 8 8 | 0.3 | ----------------------- - ----------------------- - |sum wt:| 1.3 | - ----------------------- - - 5.0625 + 3.0625 + 5.0625 + 7.5625 / 4 + 0 = 5.1875 - 4 + 4.667 = 8.667 - - Impurity is found in the same way: - Left node Mean1 = Mean2 = 5.25 - Total error = ((5.25 - 3)^2 * 0.1) - + ((5.25 - 7)^2 * 0.6) - + ((5.25 - 3)^2 * 0.3) - + ((5.25 - 8)^2 * 0.3) - = 6.13125 - - Left Impurity = Total error / total weight - = 6.13125 / 1.3 - = 4.716346153846154 - ------------------- - - Likewise for Right node: - Right node Mean1 = Mean2 = 4 - Total error = ((4 - 4)^2 * 1.0) - = 0 - - Right Impurity = Total error / total weight - = 0 / 1.0 - = 0.0 - ------ - """ - #y=[[3,3], [3,3], [4,4], [7,7], [8,8]] - dt_axis = DecisionTreeRegressor(random_state=0, criterion="axis", - max_leaf_nodes=2) - - # Test axis projection where sample weights are non-uniform (as illustrated above): - dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], - sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) - assert_allclose(dt_axis.tree_.impurity, [7.7 / 2.3, 6.13125 / 1.3, 0.0 / 1.0], rtol=0.6) - - # Test axis projection where all sample weights are uniform: - dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], - sample_weight=np.ones(5)) - assert_allclose(dt_axis.tree_.impurity, [22.0 / 5.0, 20.75 / 4.0, 0.0 / 1.0], rtol=0.6) - - # Test axis projection where a `sample_weight` is not explicitly provided. - # This is equivalent to providing uniform sample weights, though - # the internal logic is different: - dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) - assert_allclose(dt_axis.tree_.impurity, [22.0 / 5.0, 20.75 / 4.0, 0.0 / 1.0], rtol=0.6) - -def test_oblique_proj(): - """Check oblique projection criterion produces correct results on small toy dataset: - - ----------------------- - | X | y1 y2 | weight | - ----------------------- - | 3 | 3 3 | 0.1 | - | 5 | 3 3 | 0.3 | - | 8 | 4 4 | 1.0 | - | 3 | 7 7 | 0.6 | - | 5 | 8 8 | 0.3 | - ----------------------- - |sum wt:| 2.3 | - ----------------------- - - Mean1 = 5 - Mean_tot = 5 - - For all the samples, we can get the total error by summing: - (Mean1 - y1)^2 * weight or (Mean_tot - y)^2 * weight - - I.e., error1 = (5 - 3)^2 * 0.1) - + (5 - 3)^2 * 0.3) - + (5 - 4)^2 * 1.0) - + (5 - 7)^2 * 0.6) - + (5 - 8)^2 * 0.3) - = 0.4 + 1.2 + 1.0 + 2.4 + 2.7 - = 7.7 - error_tot = 15.4 - - Impurity = error / total weight - = 7.7 / 2.3 - = 3.3478260869565 - or - = 15.4 / 2.3 - = 6.6956521739130 - or - = 0.0 / 2.3 - = 0.0 - ----------------- - - From this root node, the next best split is between X values of 5 and 8. - Thus, we have left and right child nodes: - - LEFT RIGHT - ----------------------- ----------------------- - | X | y1 y2 | weight | | X | y1 y2 | weight | - ----------------------- ----------------------- - | 3 | 3 3 | 0.1 | | 8 | 4 4 | 1.0 | - | 3 | 7 7 | 0.6 | ----------------------- - | 5 | 3 3 | 0.3 | |sum wt:| 1.0 | - | 5 | 8 8 | 0.3 | ----------------------- - ----------------------- - |sum wt:| 1.3 | - ----------------------- - - (5.0625 + 3.0625 + 5.0625 + 7.5625) / 4 + 0 = 5.1875 - 4 + 4.667 = 8.667 - - Impurity is found in the same way: - Left node Mean1 = Mean2 = 5.25 - error1 = ((5.25 - 3)^2 * 0.1) - + ((5.25 - 7)^2 * 0.6) - + ((5.25 - 3)^2 * 0.3) - + ((5.25 - 8)^2 * 0.3) - = 6.13125 - error_tot = 12.2625 - - Left Impurity = Total error / total weight - = 6.13125 / 1.3 - = 4.716346153846154 - or - = 12.2625 / 1.3 - = 9.43269231 - ------------------- - - Likewise for Right node: - Right node Mean1 = Mean2 = 4 - Total error = ((4 - 4)^2 * 1.0) - = 0 - - Right Impurity = Total error / total weight - = 0 / 1.0 - = 0.0 - ------ - """ - - dt_obliq = DecisionTreeRegressor(random_state=3, criterion="oblique", - max_leaf_nodes=2) - - # Test axis projection where sample weights are non-uniform (as illustrated above): - dt_obliq.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], - sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) - try: - assert_allclose(dt_obliq.tree_.impurity, [7.7 / 2.3, 6.13125 / 1.3, 0.0 / 1.0], rtol=0.6) - except: - try: - assert_allclose(dt_obliq.tree_.impurity, [2.0*7.7 / 2.3, 2.0*6.13125 / 1.3, 2.0*0.0 / 1.0], rtol=0.6) - except: - assert_allclose(dt_obliq.tree_.impurity, [0.0, 0.0, 0.0], rtol=0.6) - - # Test axis projection where all sample weights are uniform: - dt_obliq.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], - sample_weight=np.ones(5)) - - try: - assert_allclose(dt_obliq.tree_.impurity, [22.0 / 5.0, 20.75 / 4.0, 0.0 / 1.0], rtol=0.6) - except: - try: - assert_allclose(dt_obliq.tree_.impurity, [2.0*22.0 / 5.0, 2.0*20.75 / 4.0, 2.0*0.0 / 1.0], rtol=0.6) - except: - assert_allclose(dt_obliq.tree_.impurity, [0.0, 0.0, 0.0], rtol=0.6) - - # Test MAE where a `sample_weight` is not explicitly provided. - # This is equivalent to providing uniform sample weights, though - # the internal logic is different: - dt_obliq.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) - try: - assert_allclose(dt_obliq.tree_.impurity, [22.0 / 5.0, 20.75 / 4.0, 0.0 / 1.0], rtol=0.6) - except: - try: - assert_allclose(dt_obliq.tree_.impurity, [2.0*22.0 / 5.0, 2.0*20.75 / 4.0, 2.0*0.0 / 1.0], rtol=0.6) - except: - assert_allclose(dt_obliq.tree_.impurity, [0.0, 0.0, 0.0], rtol=0.6) - - -if __name__=="__main__": - test_axis_proj() - print("axis passed!") - test_oblique_proj() - print("oblique passed!") \ No newline at end of file From d71724167e6952e677173b825d65e1bd289304e3 Mon Sep 17 00:00:00 2001 From: Morgan Sanchez <39866994+morgsmss7@users.noreply.github.com> Date: Sat, 14 Dec 2019 14:23:15 -0500 Subject: [PATCH 19/20] revert this file to original version --- sklearn/tree/_tree.pyx | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index e85af8f9588d6..bbe2c8a796578 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -176,7 +176,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # Recursive partition (without actual recursion) splitter.init(X, y, sample_weight_ptr, X_idx_sorted) - + cdef SIZE_t start cdef SIZE_t end cdef SIZE_t depth @@ -226,8 +226,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): weighted_n_node_samples < 2 * min_weight_leaf) if first: - impurity = splitter.node_impurity(&split) + impurity = splitter.node_impurity() first = 0 + is_leaf = (is_leaf or (impurity <= min_impurity_split)) @@ -333,6 +334,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # Recursive partition (without actual recursion) splitter.init(X, y, sample_weight_ptr, X_idx_sorted) + cdef PriorityHeap frontier = PriorityHeap(INITIAL_STACK_SIZE) cdef PriorityHeapRecord record cdef PriorityHeapRecord split_node_left @@ -445,9 +447,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): splitter.node_reset(start, end, &weighted_n_node_samples) if is_first: - impurity = splitter.node_impurity(&split) - else: - splitter.node_impurity(&split) + impurity = splitter.node_impurity() + n_node_samples = end - start is_leaf = (depth >= self.max_depth or n_node_samples < self.min_samples_split or From 0c9335da006adeb8c5d3ce9d61b4eb47fa4ae5fa Mon Sep 17 00:00:00 2001 From: Morgan Sanchez Date: Thu, 19 Dec 2019 03:02:31 -0500 Subject: [PATCH 20/20] make criterion more memory efficient and adjust tests accordingly --- sklearn/tree/_criterion.pyx | 208 +++++++++++++++----------------- sklearn/tree/tests/test_tree.py | 205 +++++++------------------------ 2 files changed, 141 insertions(+), 272 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 874083b8d6112..e2ea3ac1b3aac 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -168,7 +168,6 @@ cdef class Criterion: cdef double impurity_left cdef double impurity_right self.children_impurity(&impurity_left, &impurity_right) - return (- self.weighted_n_right * impurity_right - self.weighted_n_left * impurity_left) @@ -1333,7 +1332,6 @@ cdef class FriedmanMSE(MSE): return (diff * diff / (self.weighted_n_left * self.weighted_n_right * self.weighted_n_node_samples)) - cdef class AxisProjection(RegressionCriterion): r"""Mean squared error impurity criterion of axis-aligned projections of high dimensional y @@ -1344,24 +1342,28 @@ cdef class AxisProjection(RegressionCriterion): MSE = var_left + var_right """ + cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end].""" - cdef double impurity = 0.0 + + cdef double impurity cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t end = self.end cdef SIZE_t start = self.start - cdef double mean_pred = 0.0 + cdef double* sum_total = self.sum_total cdef DOUBLE_t y_ik + cdef double sq_sum_total = 0.0 + cdef SIZE_t i cdef SIZE_t p - cdef SIZE_t k + cdef SIZE_t k cdef UINT32_t rand_r_state - with gil: + with gil: rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef UINT32_t* random_state = &rand_r_state @@ -1374,26 +1376,20 @@ cdef class AxisProjection(RegressionCriterion): if sample_weight != NULL: w = sample_weight[i] y_ik = self.y[i, k] - mean_pred += y_ik / (end - start) - - for p in range(start, end): - i = samples[p] - if sample_weight != NULL: - w = sample_weight[i] - impurity += (mean_pred - self.y[i, k]) * (mean_pred - self.y[i, k]) * w - impurity /= self.weighted_n_node_samples + sq_sum_total += w * y_ik * y_ik + + impurity = sq_sum_total / self.weighted_n_node_samples + impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 return impurity cdef double proxy_impurity_improvement(self) nogil: """Compute a proxy of the impurity reduction - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -1407,15 +1403,16 @@ cdef class AxisProjection(RegressionCriterion): cdef UINT32_t rand_r_state - with gil: + with gil: rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef UINT32_t* random_state = &rand_r_state - k = rand_int(0, self.n_outputs, random_state) + k = rand_int(0, self.n_outputs, random_state) proxy_impurity_left += sum_left[k] * sum_left[k] proxy_impurity_right += sum_right[k] * sum_right[k] - + + return (proxy_impurity_left / self.weighted_n_left + proxy_impurity_right / self.weighted_n_right) @@ -1424,62 +1421,57 @@ cdef class AxisProjection(RegressionCriterion): """Evaluate the impurity in children nodes, i.e. the impurity of the left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" - + cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t pos = self.pos cdef SIZE_t start = self.start cdef SIZE_t end = self.end + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right cdef DOUBLE_t y_ik - impurity_left[0] = 0.0 - impurity_right[0] = 0.0 - cdef double mean_pred_left = 0.0 - cdef double mean_pred_right = 0.0 + cdef double sq_sum_left = 0.0 + cdef double sq_sum_right = 0.0 cdef SIZE_t i cdef SIZE_t p cdef SIZE_t k + cdef DOUBLE_t w = 1.0 cdef UINT32_t rand_r_state - with gil: + with gil: rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef UINT32_t* random_state = &rand_r_state - k = rand_int(0, self.n_outputs, random_state) + k = rand_int(0, self.n_outputs, random_state) - cdef DOUBLE_t w = 1.0 for p in range(start, pos): i = samples[p] - if sample_weight != NULL: - w = sample_weight[i] - y_ik = self.y[i, k] - mean_pred_left += y_ik / (pos - start) - for p in range(start, pos): - i = samples[p] if sample_weight != NULL: w = sample_weight[i] - impurity_left[0] += ((mean_pred_left - self.y[i, k]) - * (mean_pred_left - self.y[i, k]) * w)/self.weighted_n_left + y_ik = self.y[i, k] + sq_sum_left += w * y_ik * y_ik for p in range(pos, end): i = samples[p] + if sample_weight != NULL: w = sample_weight[i] y_ik = self.y[i, k] - mean_pred_right += y_ik / (end - pos) + sq_sum_right += w * y_ik * y_ik - for p in range(pos, end): - i = samples[p] - if sample_weight != NULL: - w = sample_weight[i] - impurity_right[0] += ((mean_pred_right - self.y[i, k]) - * (mean_pred_right - self.y[i, k]) * w)/self.weighted_n_right + impurity_left[0] = sq_sum_left / self.weighted_n_left + impurity_right[0] = sq_sum_right / self.weighted_n_right + + impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 + impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 impurity_left[0] impurity_right[0] + cdef class ObliqueProjection(RegressionCriterion): r"""Mean squared error impurity criterion @@ -1497,24 +1489,26 @@ cdef class ObliqueProjection(RegressionCriterion): cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end].""" - cdef double impurity = 0.0 + + cdef double impurity cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t end = self.end cdef SIZE_t start = self.start - cdef double* pred = calloc(end-start, sizeof(double)) - cdef double mean_pred = 0.0 + cdef double* sum_total = self.sum_total cdef DOUBLE_t y_ik + cdef double sq_sum_total = 0.0 + cdef SIZE_t i cdef SIZE_t p - cdef SIZE_t k + cdef SIZE_t k cdef UINT32_t rand_r_state - cdef SIZE_t num_pred + cdef SIZE_t num_pred cdef SIZE_t a pred_weights = calloc(self.n_outputs, sizeof(double)) - + with gil: rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef UINT32_t* random_state = &rand_r_state @@ -1526,42 +1520,34 @@ cdef class ObliqueProjection(RegressionCriterion): a = rand_int(0, 2, random_state) if a == 0: a -= 1 - pred_weights[k] = a # didn't normalize + pred_weights[k] = a cdef DOUBLE_t w = 1.0 + for p in range(start, end): i = samples[p] if sample_weight != NULL: w = sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] - # sum over all predictors with pred weights - pred[p] += y_ik * pred_weights[k] + sq_sum_total += w * y_ik * y_ik * pred_weights[k] - for p in range(start, end): - # sum over all samples to get mean of new predictor - with gil: mean_pred += pred[p] / (end - start) - - for p in range(start, end): - i = samples[p] - if sample_weight != NULL: - w = sample_weight[i] - with gil: impurity += (mean_pred - pred[p]) * (mean_pred - pred[p]) * w - impurity /= self.weighted_n_node_samples + impurity = sq_sum_total / self.weighted_n_node_samples + for k in range(self.n_outputs): + impurity -= (sum_total[k]* pred_weights[k]/ self.weighted_n_node_samples)**2.0 + with gil: impurity = fabs(impurity) free(pred_weights) - free(pred) - return impurity + return impurity / num_pred + cdef double proxy_impurity_improvement(self) nogil: """Compute a proxy of the impurity reduction - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -1573,10 +1559,31 @@ cdef class ObliqueProjection(RegressionCriterion): cdef double proxy_impurity_left = 0.0 cdef double proxy_impurity_right = 0.0 + cdef UINT32_t rand_r_state + cdef SIZE_t num_pred + cdef SIZE_t a + pred_weights = calloc(self.n_outputs, sizeof(double)) + + with gil: + rand_r_state = self.random_state.randint(0, RAND_R_MAX) + cdef UINT32_t* random_state = &rand_r_state + + num_pred = rand_int(1, self.n_outputs + 1, random_state) + + for i in range(num_pred): + k = rand_int(0, self.n_outputs, random_state) + a = rand_int(0, 2, random_state) + if a == 0: + a -= 1 + pred_weights[k] = a # didn't normalize + for k in range(self.n_outputs): - proxy_impurity_left += sum_left[k] * sum_left[k] - proxy_impurity_right += sum_right[k] * sum_right[k] + proxy_impurity_left += sum_left[k] * sum_left[k] * pred_weights[k] + proxy_impurity_right += sum_right[k] * sum_right[k] * pred_weights[k] + proxy_impurity_left = fabs(proxy_impurity_left) + proxy_impurity_right = fabs(proxy_impurity_right) + free(pred_weights) return (proxy_impurity_left / self.weighted_n_left + proxy_impurity_right / self.weighted_n_right) @@ -1585,87 +1592,68 @@ cdef class ObliqueProjection(RegressionCriterion): """Evaluate the impurity in children nodes, i.e. the impurity of the left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" - + cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t pos = self.pos cdef SIZE_t start = self.start cdef SIZE_t end = self.end + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right cdef DOUBLE_t y_ik - impurity_left[0] = 0.0 - impurity_right[0] = 0.0 - cdef double* pred_left = calloc(pos-start, sizeof(double)) - cdef double* pred_right = calloc(end-pos, sizeof(double)) - cdef double mean_pred_left = 0.0 - cdef double mean_pred_right = 0.0 + cdef double sq_sum_left = 0.0 + cdef double sq_sum_right = 0.0 cdef SIZE_t i cdef SIZE_t p - cdef SIZE_t k + cdef SIZE_t k cdef UINT32_t rand_r_state - cdef SIZE_t num_pred - cdef SIZE_t a + cdef SIZE_t num_pred + cdef SIZE_t a pred_weights = calloc(self.n_outputs, sizeof(double)) - with gil: + with gil: rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef UINT32_t* random_state = &rand_r_state - num_pred = rand_int(0, self.n_outputs, random_state) + num_pred = rand_int(1, self.n_outputs + 1, random_state) for i in range(num_pred): k = rand_int(0, self.n_outputs, random_state) a = rand_int(0, 2, random_state) if a == 0: a -= 1 - pred_weights[k] = a # didn't normalize + pred_weights[k] = a cdef DOUBLE_t w = 1.0 for p in range(start, pos): i = samples[p] + if sample_weight != NULL: w = sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] - # sum over all predictors with pred weights - pred_left[p] += y_ik * pred_weights[k] - - for p in range(start, pos): - # sum over all samples to get mean of new predictor - mean_pred_left += pred_left[p] / (pos - start) - - for p in range(start, pos): - i = samples[p] - if sample_weight != NULL: - w = sample_weight[i] - impurity_left[0] += ((mean_pred_left - pred_left[p]) - * (mean_pred_left - pred_left[p]) * w)/self.weighted_n_left - + sq_sum_left += w * y_ik * y_ik * pred_weights[k] + for p in range(pos, end): i = samples[p] + if sample_weight != NULL: w = sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] - # sum over all predictors with pred weights - pred_right[p - pos] += y_ik * pred_weights[k] - # sum over all samples to get mean of new predictor + sq_sum_right += w * y_ik * y_ik * pred_weights[k] - for p in range(pos, end): - mean_pred_right += pred_right[p-pos] / (end - pos) - - for p in range(pos, end): - i = samples[p] - if sample_weight != NULL: - w = sample_weight[i] - impurity_right[0] += ((mean_pred_right - pred_right[p - pos]) - * (mean_pred_right - pred_right[p-pos]) * w) / self.weighted_n_right + impurity_left[0] = sq_sum_left / self.weighted_n_left + impurity_right[0] = sq_sum_right / self.weighted_n_right - impurity_left[0] - impurity_right[0] + for k in range(self.n_outputs): + impurity_left[0] -= pred_weights[k] * (sum_left[k]/ self.weighted_n_left) ** 2.0 + impurity_right[0] -= pred_weights[k] * (sum_right[k]/ self.weighted_n_right) ** 2.0 + impurity_left[0] = fabs(impurity_left[0]) + impurity_right[0] = fabs(impurity_right[0]) free(pred_weights) - free(pred_left) - free(pred_right) \ No newline at end of file + \ No newline at end of file diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index b2bec2ec42b90..d53f29585e177 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1779,7 +1779,8 @@ def test_mae(): assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0]) def test_axis_proj(): - """Check axis projection criterion produces correct results on small toy dataset: + """Check axis projection criterion produces correct results on + small toy dataset: ------------------ | X | y1 y2 | weight | @@ -1790,204 +1791,83 @@ def test_axis_proj(): | 3 | 7 7 | 0.6 | | 5 | 8 8 | 0.3 | ------------------ - |sum wt:| 2.3 | - ------------------ - - Mean1 = 5 - Mean2 = 5 - - For all the samples, we can get the total error by summing: - (Mean1 - y1)^2 * weight or (Mean2 - y2)^2 * weight - - I.e., total error = (5 - 3)^2 * 0.1) - + (5 - 3)^2 * 0.3) - + (5 - 4)^2 * 1.0) - + (5 - 7)^2 * 0.6) - + (5 - 8)^2 * 0.3) - = 0.4 + 1.2 + 1.0 + 2.4 + 2.7 - = 7.7 - - Impurity = Total error / total weight - = 7.7 / 2.3 - = 3.3478260869565 - ----------------- - - From this root node, the next best split is between X values of 5 and 8. - Thus, we have left and right child nodes: - - LEFT RIGHT - ----------------------- ----------------------- - | X | y1 y2 | weight | | X | y1 y2 | weight | - ----------------------- ----------------------- - | 3 | 3 3 | 0.1 | | 8 | 4 4 | 1.0 | - | 3 | 7 7 | 0.6 | ----------------------- - | 5 | 3 3 | 0.3 | |sum wt:| 1.0 | - | 5 | 8 8 | 0.3 | ----------------------- - ----------------------- - |sum wt:| 1.3 | - ----------------------- - - 5.0625 + 3.0625 + 5.0625 + 7.5625 / 4 + 0 = 5.1875 - 4 + 4.667 = 8.667 - - Impurity is found in the same way: - Left node Mean1 = Mean2 = 5.25 - Total error = ((5.25 - 3)^2 * 0.1) - + ((5.25 - 7)^2 * 0.6) - + ((5.25 - 3)^2 * 0.3) - + ((5.25 - 8)^2 * 0.3) - = 6.13125 - - Left Impurity = Total error / total weight - = 6.13125 / 1.3 - = 4.716346153846154 - ------------------- - - Likewise for Right node: - Right node Mean1 = Mean2 = 4 - Total error = ((4 - 4)^2 * 1.0) - = 0 - - Right Impurity = Total error / total weight - = 0 / 1.0 - = 0.0 - ------ """ dt_axis = DecisionTreeRegressor(random_state=0, criterion="axis", max_leaf_nodes=2) + dt_mse = DecisionTreeRegressor(random_state=0, criterion="mse", + max_leaf_nodes=2) + # Test axis projection where sample weights are non-uniform (as illustrated above): dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3], [3], [4], [7], [8]], sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) - assert(abs(7.7 / 2.3 - dt_axis.tree_.impurity[0]) < 0.01) - assert(abs(6.13125 / 1.3 - dt_axis.tree_.impurity[1]) < 0.01) + dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], + sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) + assert(abs(dt_mse.tree_.impurity[0] - dt_axis.tree_.impurity[0]) < 0.01) + assert(abs(dt_mse.tree_.impurity[1] - dt_axis.tree_.impurity[1]) < 0.01) assert(abs(dt_axis.tree_.impurity[2]) < 0.01) # Test axis projection where all sample weights are uniform: dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], sample_weight=np.ones(5)) - assert(abs(22.0 / 5.0 - dt_axis.tree_.impurity[0]) < 0.01) - assert(abs(20.75 / 4.0 - dt_axis.tree_.impurity[1]) < 0.01) + dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], + sample_weight=np.ones(5)) + assert(abs(dt_mse.tree_.impurity[0] - dt_axis.tree_.impurity[0]) < 0.01) + assert(abs(dt_mse.tree_.impurity[1] - dt_axis.tree_.impurity[1]) < 0.01) assert(abs(dt_axis.tree_.impurity[2]) < 0.01) # Test axis projections where a `sample_weight` is not explicitly provided. # This is equivalent to providing uniform sample weights, though # the internal logic is different: dt_axis.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) - assert(abs(22.0 / 5.0 - dt_axis.tree_.impurity[0]) < 0.01) - assert(abs(20.75 / 4.0 - dt_axis.tree_.impurity[1]) < 0.01) + dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8]) + assert(abs(dt_mse.tree_.impurity[0] - dt_axis.tree_.impurity[0]) < 0.01) + assert(abs(dt_mse.tree_.impurity[1] - dt_axis.tree_.impurity[1]) < 0.01) assert(abs(dt_axis.tree_.impurity[2]) < 0.01) def test_oblique_proj(): - """Check oblique projection criterion produces correct results on small toy dataset: - - ----------------------- + """Check oblique projection criterion produces correct results on + small toy dataset + + ------------------ | X | y1 y2 | weight | - ----------------------- + ------------------ | 3 | 3 3 | 0.1 | | 5 | 3 3 | 0.3 | | 8 | 4 4 | 1.0 | | 3 | 7 7 | 0.6 | | 5 | 8 8 | 0.3 | - ----------------------- - |sum wt:| 2.3 | - ----------------------- - - Mean1 = 5 - Mean2 = 5 - - For all the samples, we can get the total error by summing: - (Mean1 - y1)^2 * weight or (Mean2 - y)^2 * weight - - I.e., error1 = (5 - 3)^2 * 0.1) - + (5 - 3)^2 * 0.3) - + (5 - 4)^2 * 1.0) - + (5 - 7)^2 * 0.6) - + (5 - 8)^2 * 0.3) - = 0.4 + 1.2 + 1.0 + 2.4 + 2.7 - = 7.7 - error_tot = 15.4 - - Impurity = error / total weight - = 7.7 / 2.3 - = 3.3478260869565 - or - = 15.4 / 2.3 - = 6.6956521739130 - or - = 0.0 - ----------------- - - From this root node, the next best split is between X values of 5 and 8. - Thus, we have left and right child nodes: - - LEFT RIGHT - ----------------------- ----------------------- - | X | y1 y2 | weight | | X | y1 y2 | weight | - ----------------------- ----------------------- - | 3 | 3 3 | 0.1 | | 8 | 4 4 | 1.0 | - | 3 | 7 7 | 0.6 | ----------------------- - | 5 | 3 3 | 0.3 | |sum wt:| 1.0 | - | 5 | 8 8 | 0.3 | ----------------------- - ----------------------- - |sum wt:| 1.3 | - ----------------------- - - (5.0625 + 3.0625 + 5.0625 + 7.5625) / 4 + 0 = 5.1875 - 4 + 4.667 = 8.667 - - Impurity is found in the same way: - Left node Mean1 = Mean2 = 5.25 - error1 = ((5.25 - 3)^2 * 0.1) - + ((5.25 - 7)^2 * 0.6) - + ((5.25 - 3)^2 * 0.3) - + ((5.25 - 8)^2 * 0.3) - = 6.13125 - error_tot = 12.2625 - - Left Impurity = Total error / total weight - = 6.13125 / 1.3 - = 4.716346153846154 - or - = 12.2625 / 1.3 - = 9.43269231 - or - = 0.0 - ------------------- - - Likewise for Right node: - Right node Mean1 = Mean2 = 4 - Total error = ((4 - 4)^2 * 1.0) - = 0 - - Right Impurity = Total error / total weight - = 0 / 1.0 - = 0.0 - ------ + ------------------ """ dt_oblique = DecisionTreeRegressor(random_state=3, criterion="oblique", max_leaf_nodes=2) + dt_mse = DecisionTreeRegressor(random_state=3, criterion="mse", + max_leaf_nodes=2) # Test oblique projection where sample weights are non-uniform (as illustrated above): dt_oblique.fit(X=[[3], [5], [8], [3], [5]], y=[[3, 3], [3, 3], [4, 4], [7, 7], [8, 8]], sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) - print(dt_oblique.tree_.impurity) - assert(abs(7.7 / 2.3 - dt_oblique.tree_.impurity[0]) < 0.01 - or abs(2.0 * 7.7 / 2.3 - dt_oblique.tree_.impurity[0]) < 0.01 + + dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], + sample_weight=[0.1, 0.3, 1.0, 0.6, 0.3]) + + assert(abs(dt_mse.tree_.impurity[0] - dt_oblique.tree_.impurity[0]) < 0.01 + or abs(2.0 * dt_mse.tree_.impurity[0] - dt_oblique.tree_.impurity[0]) < 0.01 or abs(dt_oblique.tree_.impurity[0]) < 0.01) - assert(abs(6.13125 / 1.3 - dt_oblique.tree_.impurity[1]) < 0.01 - or abs(2.0 * 6.13125 / 1.3 - dt_oblique.tree_.impurity[1]) < 0.01 + assert(abs(dt_mse.tree_.impurity[1] - dt_oblique.tree_.impurity[1]) < 0.01 + or abs(2.0 * dt_mse.tree_.impurity[1]- dt_oblique.tree_.impurity[1]) < 0.01 or abs(dt_oblique.tree_.impurity[1]) < 0.01) assert(abs(dt_oblique.tree_.impurity[2]) < 0.01) # Test oblique projection where all sample weights are uniform: dt_oblique.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]], sample_weight=np.ones(5)) - - assert(abs(22.0 / 5.0 - dt_oblique.tree_.impurity[0]) < 0.01 - or abs(2.0 * 22.0 / 5.0 - dt_oblique.tree_.impurity[0]) < 0.01 + dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8], + sample_weight=np.ones(5)) + assert(abs(dt_mse.tree_.impurity[0] - dt_oblique.tree_.impurity[0]) < 0.01 + or abs(2.0 * dt_mse.tree_.impurity[0] - dt_oblique.tree_.impurity[0]) < 0.01 or abs(dt_oblique.tree_.impurity[0]) < 0.01) - assert(abs(20.75 / 4.0 - dt_oblique.tree_.impurity[1]) < 0.01 - or abs(2.0 * 20.75 / 4.0 - dt_oblique.tree_.impurity[1]) < 0.01 + assert(abs(dt_mse.tree_.impurity[1] - dt_oblique.tree_.impurity[1]) < 0.01 + or abs(2.0 * dt_mse.tree_.impurity[1]- dt_oblique.tree_.impurity[1]) < 0.01 or abs(dt_oblique.tree_.impurity[1]) < 0.01) assert(abs(dt_oblique.tree_.impurity[2]) < 0.01) @@ -1995,11 +1875,12 @@ def test_oblique_proj(): # This is equivalent to providing uniform sample weights, though # the internal logic is different: dt_oblique.fit(X=[[3], [5], [8], [3], [5]], y=[[3,3], [3,3], [4,4], [7,7], [8,8]]) - assert(abs(22.0 / 5.0 - dt_oblique.tree_.impurity[0]) < 0.01 - or abs(2.0 * 22.0 / 5.0 - dt_oblique.tree_.impurity[0]) < 0.01 + dt_mse.fit(X=[[3], [5], [8], [3], [5]], y=[3, 3, 4, 7, 8]) + assert(abs(dt_mse.tree_.impurity[0] - dt_oblique.tree_.impurity[0]) < 0.01 + or abs(2.0 * dt_mse.tree_.impurity[0] - dt_oblique.tree_.impurity[0]) < 0.01 or abs(dt_oblique.tree_.impurity[0]) < 0.01) - assert(abs(20.75 / 4.0 - dt_oblique.tree_.impurity[1]) < 0.01 - or abs(2.0 * 20.75 / 4.0 - dt_oblique.tree_.impurity[1]) < 0.01 + assert(abs(dt_mse.tree_.impurity[1] - dt_oblique.tree_.impurity[1]) < 0.01 + or abs(2.0 * dt_mse.tree_.impurity[1]- dt_oblique.tree_.impurity[1]) < 0.01 or abs(dt_oblique.tree_.impurity[1]) < 0.01) assert(abs(dt_oblique.tree_.impurity[2]) < 0.01)