
Scarliles/defuse partitioner #70

Draft: wants to merge 10 commits into base: submodulev3
45 changes: 44 additions & 1 deletion asv_benchmarks/benchmarks/ensemble.py
@@ -2,15 +2,58 @@
GradientBoostingClassifier,
HistGradientBoostingClassifier,
RandomForestClassifier,
RandomForestRegressor,
)

from .common import Benchmark, Estimator, Predictor
from .datasets import (
_20newsgroups_highdim_dataset,
_20newsgroups_lowdim_dataset,
_synth_classification_dataset,
_synth_regression_dataset,
_synth_regression_sparse_dataset,
)
from .utils import make_gen_classif_scorers
from .utils import make_gen_classif_scorers, make_gen_reg_scorers


class RandomForestRegressorBenchmark(Predictor, Estimator, Benchmark):
"""
Benchmarks for RandomForestRegressor.
"""

param_names = ["representation", "n_jobs"]
params = (["dense", "sparse"], Benchmark.n_jobs_vals)

def setup_cache(self):
super().setup_cache()

def make_data(self, params):
representation, n_jobs = params

if representation == "sparse":
data = _synth_regression_sparse_dataset()
else:
data = _synth_regression_dataset()

return data

def make_estimator(self, params):
representation, n_jobs = params

n_estimators = 500 if Benchmark.data_size == "large" else 100

estimator = RandomForestRegressor(
n_estimators=n_estimators,
min_samples_split=10,
max_features="log2",
n_jobs=n_jobs,
random_state=0,
)

return estimator

def make_scorers(self):
make_gen_reg_scorers(self)


class RandomForestClassifierBenchmark(Predictor, Estimator, Benchmark):
114 changes: 114 additions & 0 deletions sklearn/tree/_partitioner.pxd
@@ -0,0 +1,114 @@
from ..utils._typedefs cimport float32_t, float64_t, intp_t, int8_t, int32_t, uint32_t

# Constant used to choose between the two non-zero value extraction
# algorithms in SparsePartitioner
cdef float32_t EXTRACT_NNZ_SWITCH = 0.1


# A fused type was originally used here to share the split implementation
# between the dense and sparse cases in the node_split_best and node_split_random
# functions. The alternative would have been to use inheritance-based
# polymorphism, but it would have resulted in a ~10% overall tree fitting
# performance degradation caused by the overhead of frequent virtual method
# lookups. The fused type is kept below, commented out, for reference; this
# file instead declares a cdef base class that dispatches through explicit
# function pointers.
# ctypedef fused Partitioner:
#     DensePartitioner
#     SparsePartitioner
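The declarations that follow replace the fused type with a single cdef base
class holding one C function pointer per operation, so dispatch stays a cheap
indirect call through a pointer rather than a virtual method lookup. As a
rough illustration of the pattern (the names and bodies below are
hypothetical, not code from this PR), a subclass installs its implementations
once at construction time:

# Hypothetical sketch of the function-pointer pattern declared below.
cdef void _dense_init_node_split(
    Partitioner partitioner, intp_t start, intp_t end
) noexcept nogil:
    # A plain C function: called through a stored pointer, with no
    # vtable lookup on the hot path.
    partitioner.start = start
    partitioner.end = end
    partitioner.n_missing = 0

cdef class DensePartitioner(Partitioner):
    def __cinit__(self):
        # Wire up the dense implementation once; callers then invoke
        # self._init_node_split(self, start, end) for every node.
        self._init_node_split = _dense_init_node_split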


ctypedef void (*InitNodeSplitFunction)(
Partitioner partitioner, intp_t start, intp_t end
) noexcept nogil

ctypedef void (*SortSamplesAndFeatureValuesFunction)(
Partitioner partitioner, intp_t current_feature
) noexcept nogil

ctypedef void (*FindMinMaxFunction)(
Partitioner partitioner,
intp_t current_feature,
float32_t* min_feature_value_out,
float32_t* max_feature_value_out,
) noexcept nogil

ctypedef void (*NextPFunction)(
Partitioner partitioner, intp_t* p_prev, intp_t* p
) noexcept nogil

ctypedef intp_t (*PartitionSamplesFunction)(
Partitioner partitioner, float64_t current_threshold
) noexcept nogil

ctypedef void (*PartitionSamplesFinalFunction)(
Partitioner partitioner,
intp_t best_pos,
float64_t best_threshold,
intp_t best_feature,
intp_t best_n_missing,
) noexcept nogil


cdef class Partitioner:
cdef:
intp_t[::1] samples
float32_t[::1] feature_values
intp_t start
intp_t end
intp_t n_missing
const unsigned char[::1] missing_values_in_feature_mask

inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil
inline void sort_samples_and_feature_values(
self,
intp_t current_feature
) noexcept nogil
inline void find_min_max(
self,
intp_t current_feature,
float32_t* min_feature_value_out,
float32_t* max_feature_value_out,
) noexcept nogil
inline void next_p(self, intp_t* p_prev, intp_t* p) noexcept nogil
inline intp_t partition_samples(self, float64_t current_threshold) noexcept nogil
inline void partition_samples_final(
self,
intp_t best_pos,
float64_t best_threshold,
intp_t best_feature,
intp_t best_n_missing,
) noexcept nogil

InitNodeSplitFunction _init_node_split
SortSamplesAndFeatureValuesFunction _sort_samples_and_feature_values
FindMinMaxFunction _find_min_max
NextPFunction _next_p
PartitionSamplesFunction _partition_samples
PartitionSamplesFinalFunction _partition_samples_final
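The method bodies live in the matching .pyx file, which is not part of this
diff; presumably each inline method is a one-line trampoline through the
corresponding stored pointer. A sketch of what those definitions might look
like (inferred from the signatures above, not taken from the PR):

cdef inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil:
    # Single indirect call through a C function pointer; the same call site
    # serves both the dense and the sparse partitioner.
    self._init_node_split(self, start, end)

cdef inline intp_t partition_samples(self, float64_t current_threshold) noexcept nogil:
    return self._partition_samples(self, current_threshold)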


cdef class DensePartitioner(Partitioner):
"""Partitioner specialized for dense data.

Note that this partitioner is agnostic to the splitting strategy (best vs. random).
"""
cdef:
const float32_t[:, :] X
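For context on how the dense specialization likely uses this memoryview (a
sketch; the body is inferred from the declared fields, not taken from the PR),
sort_samples_and_feature_values has to gather the active feature column in the
current sample order before co-sorting samples and feature_values:

cdef void _dense_sort_samples_and_feature_values(
    Partitioner partitioner, intp_t current_feature
) noexcept nogil:
    cdef DensePartitioner self = <DensePartitioner> partitioner
    cdef intp_t i
    # Gather the feature column for the node's samples into the scratch
    # buffer; the subsequent co-sort of samples/feature_values is omitted.
    for i in range(self.start, self.end):
        self.feature_values[i] = self.X[self.samples[i], current_feature]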


cdef class SparsePartitioner(Partitioner):
"""Partitioner specialized for sparse CSC data.

Note that this partitioner is agnostic to the splitting strategy (best vs. random).
"""
cdef:
const float32_t[::1] X_data
const int32_t[::1] X_indices
const int32_t[::1] X_indptr

intp_t n_total_samples

intp_t[::1] index_to_samples
intp_t[::1] sorted_samples

intp_t start_positive
intp_t end_negative
bint is_samples_sorted
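These fields support two strategies for extracting a feature's non-zero values
for the samples in [start, end): a linear pass over the CSC column combined
with index_to_samples, or binary searches against sorted_samples (built
lazily, hence is_samples_sorted). EXTRACT_NNZ_SWITCH at the top of the file is
the crossover constant between them. A hedged sketch of the cost comparison
(the helper name and the exact expression are assumptions, not the PR's code):

from libc.math cimport log

cdef bint _use_binary_search(SparsePartitioner self, intp_t feature) noexcept nogil:
    # Stored (non-zero) entries in this CSC column vs. samples in the node.
    cdef intp_t n_indices = self.X_indptr[feature + 1] - self.X_indptr[feature]
    cdef intp_t n_samples = self.end - self.start
    # Binary search costs roughly O(n_samples * log(n_indices)), plus a
    # one-off O(n_samples * log(n_samples)) sort that is amortized via
    # is_samples_sorted; the index_to_samples scan costs O(n_indices).
    # EXTRACT_NNZ_SWITCH biases the choice toward the linear scan.
    return ((1 - self.is_samples_sorted) * n_samples * log(n_samples)
            + n_samples * log(n_indices)) < EXTRACT_NNZ_SWITCH * n_indices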