Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Add projection split criteria (see issue #4) #10

Merged
merged 20 commits into from
Dec 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sklearn/tree/_criterion.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,4 @@ cdef class RegressionCriterion(Criterion):
"""Abstract regression criterion."""

cdef double sq_sum_total
cdef object random_state # Random state for predictor weights (Projection-Based Splitters)
340 changes: 336 additions & 4 deletions sklearn/tree/_criterion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ import numpy as np
cimport numpy as np
np.import_array()

from ._utils cimport rand_int
from ._utils cimport RAND_R_MAX
from ._utils cimport log
from ._utils cimport safe_realloc
from ._utils cimport sizet_ptr_to_ndarray
Expand Down Expand Up @@ -74,7 +76,6 @@ cdef class Criterion:
The first sample to be used on this node
end : SIZE_t
The last sample used on this node

"""

pass
Expand Down Expand Up @@ -167,7 +168,6 @@ cdef class Criterion:
cdef double impurity_left
cdef double impurity_right
self.children_impurity(&impurity_left, &impurity_right)

return (- self.weighted_n_right * impurity_right
- self.weighted_n_left * impurity_left)

Expand Down Expand Up @@ -689,7 +689,7 @@ cdef class RegressionCriterion(Criterion):
= (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2
"""

def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples):
def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples, object random_state=None):
"""Initialize parameters for this criterion.

Parameters
Expand All @@ -699,11 +699,17 @@ cdef class RegressionCriterion(Criterion):

n_samples : SIZE_t
The total number of samples to fit on

random_state : object
Random State from splitter class

"""

# Default values
self.sample_weight = NULL

self.random_state = random_state

self.samples = NULL
self.start = 0
self.pos = 0
Expand Down Expand Up @@ -980,7 +986,7 @@ cdef class MAE(RegressionCriterion):
cdef np.ndarray right_child
cdef DOUBLE_t* node_medians

def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples):
def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples, object random_state = None):
"""Initialize parameters for this criterion.

Parameters
Expand Down Expand Up @@ -1325,3 +1331,329 @@ cdef class FriedmanMSE(MSE):

return (diff * diff / (self.weighted_n_left * self.weighted_n_right *
self.weighted_n_node_samples))

cdef class AxisProjection(RegressionCriterion):
r"""Mean squared error impurity criterion
of axis-aligned projections of high dimensional y

Algorithm:
1. select a random predictor from [0,n_outputs]
2. compute mse on the values of that predictor for all samples

MSE = var_left + var_right
"""

cdef double node_impurity(self) nogil:
"""Evaluate the impurity of the current node, i.e. the impurity of
samples[start:end]."""

cdef double impurity
cdef DOUBLE_t* sample_weight = self.sample_weight
cdef SIZE_t* samples = self.samples
cdef SIZE_t end = self.end
cdef SIZE_t start = self.start

cdef double* sum_total = self.sum_total
cdef DOUBLE_t y_ik

cdef double sq_sum_total = 0.0

cdef SIZE_t i
cdef SIZE_t p
cdef SIZE_t k
cdef UINT32_t rand_r_state

with gil:
rand_r_state = self.random_state.randint(0, RAND_R_MAX)
cdef UINT32_t* random_state = &rand_r_state

k = rand_int(0, self.n_outputs, random_state)

cdef DOUBLE_t w = 1.0

for p in range(start, end):
i = samples[p]
if sample_weight != NULL:
w = sample_weight[i]
y_ik = self.y[i, k]
sq_sum_total += w * y_ik * y_ik

impurity = sq_sum_total / self.weighted_n_node_samples
impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0

return impurity


cdef double proxy_impurity_improvement(self) nogil:
"""Compute a proxy of the impurity reduction
This method is used to speed up the search for the best split.
It is a proxy quantity such that the split that maximizes this value
also maximizes the impurity improvement. It neglects all constant terms
of the impurity decrease for a given split.
The absolute impurity improvement is only computed by the
impurity_improvement method once the best split has been found.
"""

cdef double* sum_left = self.sum_left
cdef double* sum_right = self.sum_right

cdef SIZE_t k
cdef double proxy_impurity_left = 0.0
cdef double proxy_impurity_right = 0.0

cdef UINT32_t rand_r_state

with gil:
rand_r_state = self.random_state.randint(0, RAND_R_MAX)
cdef UINT32_t* random_state = &rand_r_state

k = rand_int(0, self.n_outputs, random_state)

proxy_impurity_left += sum_left[k] * sum_left[k]
proxy_impurity_right += sum_right[k] * sum_right[k]


return (proxy_impurity_left / self.weighted_n_left +
proxy_impurity_right / self.weighted_n_right)

cdef void children_impurity(self, double* impurity_left,
double* impurity_right) nogil:
"""Evaluate the impurity in children nodes, i.e. the impurity of the
left child (samples[start:pos]) and the impurity the right child
(samples[pos:end])."""

cdef DOUBLE_t* sample_weight = self.sample_weight
cdef SIZE_t* samples = self.samples
cdef SIZE_t pos = self.pos
cdef SIZE_t start = self.start
cdef SIZE_t end = self.end

cdef double* sum_left = self.sum_left
cdef double* sum_right = self.sum_right
cdef DOUBLE_t y_ik

cdef double sq_sum_left = 0.0
cdef double sq_sum_right = 0.0

cdef SIZE_t i
cdef SIZE_t p
cdef SIZE_t k
cdef DOUBLE_t w = 1.0
cdef UINT32_t rand_r_state

with gil:
rand_r_state = self.random_state.randint(0, RAND_R_MAX)
cdef UINT32_t* random_state = &rand_r_state

k = rand_int(0, self.n_outputs, random_state)

for p in range(start, pos):
i = samples[p]

if sample_weight != NULL:
w = sample_weight[i]
y_ik = self.y[i, k]
sq_sum_left += w * y_ik * y_ik

for p in range(pos, end):
i = samples[p]

if sample_weight != NULL:
w = sample_weight[i]
y_ik = self.y[i, k]
sq_sum_right += w * y_ik * y_ik

impurity_left[0] = sq_sum_left / self.weighted_n_left
impurity_right[0] = sq_sum_right / self.weighted_n_right

impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0
impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0

impurity_left[0]
impurity_right[0]


cdef class ObliqueProjection(RegressionCriterion):
r"""Mean squared error impurity criterion
of oblique projections of high dimensional y

Algorithm:
1. Select a random number of random predictors from [0,n_outputs]
2. Assign weights (-1 or 1) to all chosen predictors
3. Assign weight of 0 to all unchosen predictors
4. Compute new predictor (linear combination of all predictors)
5. Compute mse on new predictor

MSE = var_left + var_right
"""
cdef double node_impurity(self) nogil:
"""Evaluate the impurity of the current node, i.e. the impurity of
samples[start:end]."""

cdef double impurity
cdef DOUBLE_t* sample_weight = self.sample_weight
cdef SIZE_t* samples = self.samples
cdef SIZE_t end = self.end
cdef SIZE_t start = self.start

cdef double* sum_total = self.sum_total
cdef DOUBLE_t y_ik

cdef double sq_sum_total = 0.0

cdef SIZE_t i
cdef SIZE_t p
cdef SIZE_t k
cdef UINT32_t rand_r_state
cdef SIZE_t num_pred
cdef SIZE_t a
pred_weights = <double*> calloc(self.n_outputs, sizeof(double))

with gil:
rand_r_state = self.random_state.randint(0, RAND_R_MAX)
cdef UINT32_t* random_state = &rand_r_state

num_pred = rand_int(1, self.n_outputs+1, random_state)

for i in range(num_pred):
k = rand_int(0, self.n_outputs, random_state)
a = rand_int(0, 2, random_state)
if a == 0:
a -= 1
pred_weights[k] = a

cdef DOUBLE_t w = 1.0


for p in range(start, end):
i = samples[p]
if sample_weight != NULL:
w = sample_weight[i]
for k in range(self.n_outputs):
y_ik = self.y[i, k]
sq_sum_total += w * y_ik * y_ik * pred_weights[k]

impurity = sq_sum_total / self.weighted_n_node_samples
for k in range(self.n_outputs):
impurity -= (sum_total[k]* pred_weights[k]/ self.weighted_n_node_samples)**2.0

with gil: impurity = fabs(impurity)
free(pred_weights)
return impurity / num_pred


cdef double proxy_impurity_improvement(self) nogil:
"""Compute a proxy of the impurity reduction
This method is used to speed up the search for the best split.
It is a proxy quantity such that the split that maximizes this value
also maximizes the impurity improvement. It neglects all constant terms
of the impurity decrease for a given split.
The absolute impurity improvement is only computed by the
impurity_improvement method once the best split has been found.
"""

cdef double* sum_left = self.sum_left
cdef double* sum_right = self.sum_right

cdef SIZE_t k
cdef double proxy_impurity_left = 0.0
cdef double proxy_impurity_right = 0.0

cdef UINT32_t rand_r_state
cdef SIZE_t num_pred
cdef SIZE_t a
pred_weights = <double*> calloc(self.n_outputs, sizeof(double))

with gil:
rand_r_state = self.random_state.randint(0, RAND_R_MAX)
cdef UINT32_t* random_state = &rand_r_state

num_pred = rand_int(1, self.n_outputs + 1, random_state)

for i in range(num_pred):
k = rand_int(0, self.n_outputs, random_state)
a = rand_int(0, 2, random_state)
if a == 0:
a -= 1
pred_weights[k] = a # didn't normalize

for k in range(self.n_outputs):
proxy_impurity_left += sum_left[k] * sum_left[k] * pred_weights[k]
proxy_impurity_right += sum_right[k] * sum_right[k] * pred_weights[k]

proxy_impurity_left = fabs(proxy_impurity_left)
proxy_impurity_right = fabs(proxy_impurity_right)
free(pred_weights)
return (proxy_impurity_left / self.weighted_n_left +
proxy_impurity_right / self.weighted_n_right)

cdef void children_impurity(self, double* impurity_left,
double* impurity_right) nogil:
"""Evaluate the impurity in children nodes, i.e. the impurity of the
left child (samples[start:pos]) and the impurity the right child
(samples[pos:end])."""

cdef DOUBLE_t* sample_weight = self.sample_weight
cdef SIZE_t* samples = self.samples
cdef SIZE_t pos = self.pos
cdef SIZE_t start = self.start
cdef SIZE_t end = self.end

cdef double* sum_left = self.sum_left
cdef double* sum_right = self.sum_right
cdef DOUBLE_t y_ik

cdef double sq_sum_left = 0.0
cdef double sq_sum_right = 0.0

cdef SIZE_t i
cdef SIZE_t p
cdef SIZE_t k
cdef UINT32_t rand_r_state
cdef SIZE_t num_pred
cdef SIZE_t a
pred_weights = <double*> calloc(self.n_outputs, sizeof(double))

with gil:
rand_r_state = self.random_state.randint(0, RAND_R_MAX)
cdef UINT32_t* random_state = &rand_r_state

num_pred = rand_int(1, self.n_outputs + 1, random_state)

for i in range(num_pred):
k = rand_int(0, self.n_outputs, random_state)
a = rand_int(0, 2, random_state)
if a == 0:
a -= 1
pred_weights[k] = a

cdef DOUBLE_t w = 1.0
for p in range(start, pos):
i = samples[p]

if sample_weight != NULL:
w = sample_weight[i]
for k in range(self.n_outputs):
y_ik = self.y[i, k]
sq_sum_left += w * y_ik * y_ik * pred_weights[k]

for p in range(pos, end):
i = samples[p]

if sample_weight != NULL:
w = sample_weight[i]
for k in range(self.n_outputs):
y_ik = self.y[i, k]
sq_sum_right += w * y_ik * y_ik * pred_weights[k]

impurity_left[0] = sq_sum_left / self.weighted_n_left
impurity_right[0] = sq_sum_right / self.weighted_n_right

for k in range(self.n_outputs):
impurity_left[0] -= pred_weights[k] * (sum_left[k]/ self.weighted_n_left) ** 2.0
impurity_right[0] -= pred_weights[k] * (sum_right[k]/ self.weighted_n_right) ** 2.0

impurity_left[0] = fabs(impurity_left[0])
impurity_right[0] = fabs(impurity_right[0])
free(pred_weights)

Loading