diff --git a/.appveyor.yml b/.appveyor.yml index dc431ded5018..4cff03d571a1 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,4 +1,4 @@ -version: 4.0.0.99.{build} +version: 4.1.0.99.{build} image: Visual Studio 2015 platform: x64 diff --git a/.ci/lint_r_code.R b/.ci/lint_r_code.R index c13471ca8fb1..12116104ef6d 100755 --- a/.ci/lint_r_code.R +++ b/.ci/lint_r_code.R @@ -33,29 +33,37 @@ LINTERS_TO_USE <- list( , "any_duplicated" = lintr::any_duplicated_linter() , "any_is_na" = lintr::any_is_na_linter() , "assignment" = lintr::assignment_linter() + , "boolean_arithmetic" = lintr::boolean_arithmetic_linter() , "braces" = lintr::brace_linter() , "class_equals" = lintr::class_equals_linter() , "commas" = lintr::commas_linter() , "duplicate_argument" = lintr::duplicate_argument_linter() + , "empty_assignment" = lintr::empty_assignment_linter() , "equals_na" = lintr::equals_na_linter() + , "for_loop_index" = lintr::for_loop_index_linter() , "function_left" = lintr::function_left_parentheses_linter() , "implicit_integers" = lintr::implicit_integer_linter() , "infix_spaces" = lintr::infix_spaces_linter() , "inner_combine" = lintr::inner_combine_linter() + , "is_numeric" = lintr::is_numeric_linter() , "fixed_regex" = lintr::fixed_regex_linter() + , "function_return" = lintr::function_return_linter() + , "lengths" = lintr::lengths_linter() , "literal_coercion" = lintr::literal_coercion_linter() , "long_lines" = lintr::line_length_linter(length = 120L) + , "matrix" = lintr::matrix_apply_linter() , "missing_argument" = lintr::missing_argument_linter() - , "no_tabs" = lintr::no_tab_linter() , "non_portable_path" = lintr::nonportable_path_linter() , "numeric_leading_zero" = lintr::numeric_leading_zero_linter() , "outer_negation" = lintr::outer_negation_linter() , "package_hooks" = lintr::package_hooks_linter() , "paste" = lintr::paste_linter() + , "quotes" = lintr::quotes_linter() + , "redundant_equals" = lintr::redundant_equals_linter() , "regex_subset" = 
lintr::regex_subset_linter() + , "routine_registration" = lintr::routine_registration_linter() , "semicolon" = lintr::semicolon_linter() , "seq" = lintr::seq_linter() - , "single_quotes" = lintr::single_quotes_linter() , "spaces_inside" = lintr::spaces_inside_linter() , "spaces_left_parens" = lintr::spaces_left_parentheses_linter() , "sprintf" = lintr::sprintf_linter() @@ -96,9 +104,11 @@ LINTERS_TO_USE <- list( , "??" = interactive_text ) ) - , "unneeded_concatenation" = lintr::unneeded_concatenation_linter() - , "unreachable_code" = lintr::unreachable_code_linter() - , "vector_logic" = lintr::vector_logic_linter() + , "unnecessary_concatenation" = lintr::unnecessary_concatenation_linter() + , "unnecessary_lambda" = lintr::unnecessary_lambda_linter() + , "unreachable_code" = lintr::unreachable_code_linter() + , "vector_logic" = lintr::vector_logic_linter() + , "whitespace" = lintr::whitespace_linter() ) noquote(paste0(length(FILES_TO_LINT), " R files need linting")) diff --git a/.ci/test-python-oldest.sh b/.ci/test-python-oldest.sh index 09cc24633e15..3a0ea08dddda 100644 --- a/.ci/test-python-oldest.sh +++ b/.ci/test-python-oldest.sh @@ -7,6 +7,7 @@ # echo "installing lightgbm's dependencies" pip install \ + 'dataclasses' \ 'numpy==1.12.0' \ 'pandas==0.24.0' \ 'scikit-learn==0.18.2' \ diff --git a/.ci/test.sh b/.ci/test.sh index 665e7f6546ec..af7cae2e3858 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -73,7 +73,7 @@ if [[ $TASK == "lint" ]]; then cpplint \ isort \ mypy \ - 'r-lintr>=3.0' \ + 'r-lintr>=3.1' \ ruff source activate $CONDA_ENV echo "Linting Python code" @@ -119,15 +119,21 @@ if [[ $TASK == "check-docs" ]] || [[ $TASK == "check-links" ]]; then exit 0 fi +# older versions of Dask are incompatible with pandas>=2.0, but not all conda packages' metadata accurately reflects that +# +# ref: https://github.com/microsoft/LightGBM/issues/6030 +CONSTRAINED_DEPENDENCIES="'dask-core>=2023.5.0' 'distributed>=2023.5.0' 'pandas>=2.0'" +if [[ $PYTHON_VERSION == "3.7" 
]]; then + CONSTRAINED_DEPENDENCIES="'dask-core' 'distributed' 'pandas<2.0'" +fi + # including python=version[build=*cpython] to ensure that conda doesn't fall back to pypy conda create -q -y -n $CONDA_ENV \ + ${CONSTRAINED_DEPENDENCIES} \ cloudpickle \ - dask-core \ - distributed \ joblib \ matplotlib \ numpy \ - pandas \ psutil \ pytest \ ${CONDA_PYTHON_REQUIREMENT} \ diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh index fe435ce11382..e4d70261aa36 100755 --- a/.ci/test_r_package.sh +++ b/.ci/test_r_package.sh @@ -21,9 +21,9 @@ if [[ "${R_MAJOR_VERSION}" == "3" ]]; then export R_LINUX_VERSION="3.6.3-1bionic" export R_APT_REPO="bionic-cran35/" elif [[ "${R_MAJOR_VERSION}" == "4" ]]; then - export R_MAC_VERSION=4.2.2 - export R_MAC_PKG_URL=${CRAN_MIRROR}/bin/macosx/base/R-${R_MAC_VERSION}.pkg - export R_LINUX_VERSION="4.2.2-1.2204.0" + export R_MAC_VERSION=4.3.1 + export R_MAC_PKG_URL=${CRAN_MIRROR}/bin/macosx/big-sur-x86_64/base/R-${R_MAC_VERSION}-x86_64.pkg + export R_LINUX_VERSION="4.3.1-1.2204.0" export R_APT_REPO="jammy-cran40/" else echo "Unrecognized R version: ${R_VERSION}" @@ -36,7 +36,10 @@ fi # # `devscripts` is required for 'checkbashisms' (https://github.com/r-lib/actions/issues/111) if [[ $OS_NAME == "linux" ]]; then + mkdir -p ~/.gnupg + echo "disable-ipv6" >> ~/.gnupg/dirmngr.conf sudo apt-key adv \ + --homedir ~/.gnupg \ --keyserver keyserver.ubuntu.com \ --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 || exit -1 sudo add-apt-repository \ @@ -53,6 +56,7 @@ if [[ $OS_NAME == "linux" ]]; then texlive-latex-recommended \ texlive-fonts-recommended \ texlive-fonts-extra \ + tidy \ qpdf \ || exit -1 diff --git a/.ci/test_r_package_windows.ps1 b/.ci/test_r_package_windows.ps1 index e4d20de50b90..52d47effcad7 100644 --- a/.ci/test_r_package_windows.ps1 +++ b/.ci/test_r_package_windows.ps1 @@ -203,6 +203,19 @@ if ($env:COMPILER -ne "MSVC") { } } +# Checking that the correct R version was used +if ($env:TOOLCHAIN -ne "MSVC") { + $checks = 
Select-String -Path "${LOG_FILE_NAME}" -Pattern "using R version $env:R_WINDOWS_VERSION" + $checks_cnt = $checks.Matches.length +} else { + $checks = Select-String -Path "${INSTALL_LOG_FILE_NAME}" -Pattern "R version passed into FindLibR.* $env:R_WINDOWS_VERSION" + $checks_cnt = $checks.Matches.length +} +if ($checks_cnt -eq 0) { + Write-Output "Wrong R version was found (expected '$env:R_WINDOWS_VERSION'). Check the build logs." + Check-Output $False +} + # Checking that we actually got the expected compiler. The R package has some logic # to fail back to MinGW if MSVC fails, but for CI builds we need to check that the correct # compiler was used. diff --git a/.ci/test_windows.ps1 b/.ci/test_windows.ps1 index 5962a9441346..413af821e065 100644 --- a/.ci/test_windows.ps1 +++ b/.ci/test_windows.ps1 @@ -124,7 +124,7 @@ if (($env:TASK -eq "regular") -or (($env:APPVEYOR -eq "true") -and ($env:TASK -e cd $env:BUILD_SOURCESDIRECTORY/examples/python-guide @("import matplotlib", "matplotlib.use('Agg')") + (Get-Content "plot_example.py") | Set-Content "plot_example.py" (Get-Content "plot_example.py").replace('graph.render(view=True)', 'graph.render(view=False)') | Set-Content "plot_example.py" # prevent interactive window mode - conda install -q -y -n $env:CONDA_ENV h5py ipywidgets notebook + conda install -q -y -n $env:CONDA_ENV "h5py>3.0" ipywidgets notebook foreach ($file in @(Get-ChildItem *.py)) { @("import sys, warnings", "warnings.showwarning = lambda message, category, filename, lineno, file=None, line=None: sys.stdout.write(warnings.formatwarning(message, category, filename, lineno, line))") + (Get-Content $file) | Set-Content $file python $file ; Check-Output $? 
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index ce6da9f6e7fb..02b5cfbdae23 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -7,56 +7,4 @@ # offer a reasonable automatic best-guess # catch-all rule (this only gets matched if no rules below match) -* @guolinke @StrikerRUS @jameslamb @shiyu1994 - -# other catch-alls that will get matched if specific rules below are not matched -*.R @jameslamb @jmoralez -*.py @StrikerRUS @jmoralez @jameslamb @shiyu1994 -*.cpp @guolinke @shiyu1994 -*.h @guolinke @shiyu1994 - -# main C++ code -include/ @guolinke @shiyu1994 -src/ @guolinke @shiyu1994 -CMakeLists.txt @guolinke @jameslamb @StrikerRUS @shiyu1994 -tests/c_api_test/ @guolinke @shiyu1994 -tests/cpp_tests/ @guolinke @shiyu1994 -tests/data/ @guolinke @shiyu1994 -windows/ @guolinke @StrikerRUS @shiyu1994 - -# R code -build_r.R @jameslamb @StrikerRUS @jmoralez -build-cran-package.sh @jameslamb @StrikerRUS @jmoralez -R-package/ @jameslamb @jmoralez - -# Python code -python-package/ @StrikerRUS @shiyu1994 @jameslamb @jmoralez - -# Dask integration -python-package/lightgbm/dask.py @jameslamb @jmoralez -tests/python_package_test/test_dask.py @jameslamb @jmoralez - -# helpers -helpers/ @StrikerRUS @guolinke - -# CI administrative stuff -.ci/ @StrikerRUS @jameslamb -docs/ @StrikerRUS @jameslamb -examples/ @StrikerRUS @jameslamb @guolinke @jmoralez -*.yml @StrikerRUS @jameslamb -.vsts-ci.yml @StrikerRUS @jameslamb - -# docker setup -docker/ @StrikerRUS @jameslamb -docker/dockerfile-cli @guolinke @shiyu1994 @StrikerRUS @jameslamb -docker/gpu/ @StrikerRUS @jameslamb -docker/dockerfile-python @StrikerRUS @shiyu1994 @jameslamb @jmoralez -docker/dockerfile-r @jameslamb @jmoralez - -# GPU code -docs/GPU-*.rst @shiyu1994 @guolinke -src/treelearner/gpu_tree_learner.cpp @guolinke @shiyu1994 -src/treelearner/tree_learner.cpp @guolinke @shiyu1994 - -# JAVA code -swig/ @guolinke @shiyu1994 +* @guolinke @jameslamb @shiyu1994 @jmoralez diff --git a/.github/workflows/lock.yml 
b/.github/workflows/lock.yml new file mode 100644 index 000000000000..72d8b7c2f585 --- /dev/null +++ b/.github/workflows/lock.yml @@ -0,0 +1,44 @@ +name: 'Lock Inactive Threads' + +on: + schedule: + # midnight UTC, every Wednesday + - cron: '0 0 * * 3' + # allow manual triggering from GitHub UI + workflow_dispatch: + +permissions: + issues: write + pull-requests: write + +concurrency: + group: lock + +jobs: + action: + runs-on: ubuntu-latest + steps: + - uses: dessant/lock-threads@v4 + with: + github-token: ${{ github.token }} + # after how many days of inactivity should a closed issue/PR be locked? + issue-inactive-days: '90' + pr-inactive-days: '90' + # do not close feature request issues... + # we close those but track them in https://github.com/microsoft/LightGBM/issues/2302 + exclude-any-issue-labels: '"feature request"' + # what labels should be removed prior to locking? + remove-issue-labels: 'awaiting response,awaiting review,blocking,in progress' + remove-pr-labels: 'awaiting response,awaiting review,blocking,in progress' + # what message should be posted prior to locking? + issue-comment: > + This issue has been automatically locked since there has not been any recent activity since it was closed. + To start a new related discussion, open a new issue at https://github.com/microsoft/LightGBM/issues + including a reference to this. + pr-comment: > + This pull request has been automatically locked since there has not been any recent activity since it was closed. + To start a new related discussion, open a new issue at https://github.com/microsoft/LightGBM/issues + including a reference to this. + # what should the locking status be? 
+ issue-lock-reason: 'resolved' + pr-lock-reason: 'resolved' diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml index eb2cb90a424e..838528617143 100644 --- a/.github/workflows/r_package.yml +++ b/.github/workflows/r_package.yml @@ -48,7 +48,7 @@ jobs: - os: ubuntu-latest task: r-package compiler: gcc - r_version: 4.2 + r_version: 4.3 build_type: cmake container: 'ubuntu:22.04' - os: ubuntu-latest @@ -60,19 +60,19 @@ jobs: - os: ubuntu-latest task: r-package compiler: clang - r_version: 4.2 + r_version: 4.3 build_type: cmake container: 'ubuntu:22.04' - os: macOS-latest task: r-package compiler: gcc - r_version: 4.2 + r_version: 4.3 build_type: cmake container: null - os: macOS-latest task: r-package compiler: clang - r_version: 4.2 + r_version: 4.3 build_type: cmake container: null - os: windows-latest @@ -125,13 +125,13 @@ jobs: - os: ubuntu-latest task: r-package compiler: gcc - r_version: 4.2 + r_version: 4.3 build_type: cran container: 'ubuntu:22.04' - os: macOS-latest task: r-package compiler: clang - r_version: 4.2 + r_version: 4.3 build_type: cran container: null ################ @@ -140,7 +140,7 @@ jobs: - os: ubuntu-latest task: r-rchk compiler: gcc - r_version: 4.2 + r_version: 4.3 build_type: cran container: 'ubuntu:22.04' steps: diff --git a/.gitignore b/.gitignore index d4045d9a4798..bcf6f48b4cea 100644 --- a/.gitignore +++ b/.gitignore @@ -139,8 +139,6 @@ publish/ # Publish Web Output *.[Pp]ublish.xml *.azurePubxml -# TODO: Comment the next line if you want to checkin your web deploy settings -# but database connection strings (with potential passwords) will be unencrypted *.pubxml *.publishproj diff --git a/CMakeLists.txt b/CMakeLists.txt index 5087d6a8fddb..6705ef130052 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -326,6 +326,13 @@ if(UNIX OR MINGW OR CYGWIN) CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type" ) + if(MINGW) + # ignore 
this warning: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95353 + set( + CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -Wno-stringop-overflow" + ) + endif() if(USE_DEBUG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0") else() diff --git a/R-package/R/callback.R b/R-package/R/callback.R index 50f36be4a2be..e428dfb79eea 100644 --- a/R-package/R/callback.R +++ b/R-package/R/callback.R @@ -24,7 +24,7 @@ CB_ENV <- R6::R6Class( ) # Format the evaluation metric string -format.eval.string <- function(eval_res, eval_err) { +.format_eval_string <- function(eval_res, eval_err) { # Check for empty evaluation string if (is.null(eval_res) || length(eval_res) == 0L) { @@ -40,7 +40,7 @@ format.eval.string <- function(eval_res, eval_err) { } -merge.eval.string <- function(env) { +.merge_eval_string <- function(env) { # Check length of evaluation list if (length(env$eval_list) <= 0L) { @@ -63,7 +63,7 @@ merge.eval.string <- function(env) { } # Set error message - msg <- c(msg, format.eval.string(eval_res = env$eval_list[[j]], eval_err = eval_err)) + msg <- c(msg, .format_eval_string(eval_res = env$eval_list[[j]], eval_err = eval_err)) } @@ -86,11 +86,11 @@ cb_print_evaluation <- function(period) { if ((i - 1L) %% period == 0L || is.element(i, c(env$begin_iteration, env$end_iteration))) { # Merge evaluation string - msg <- merge.eval.string(env = env) + msg <- .merge_eval_string(env = env) # Check if message is existing if (nchar(msg) > 0L) { - print(merge.eval.string(env = env)) + print(.merge_eval_string(env = env)) } } @@ -270,7 +270,7 @@ cb_early_stop <- function(stopping_rounds, first_metric_only, verbose) { # Prepare to print if verbose if (verbose) { - best_msg[[i]] <<- as.character(merge.eval.string(env = env)) + best_msg[[i]] <<- as.character(.merge_eval_string(env = env)) } } else { diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 949038fde622..755b171724f9 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -928,6 +928,7 @@ NULL #' , 
metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' valids <- list(test = dtest) #' model <- lgb.train( @@ -1086,7 +1087,10 @@ predict.lgb.Booster <- function(object, #' X <- as.matrix(mtcars[, -1L]) #' y <- mtcars[, 1L] #' dtrain <- lgb.Dataset(X, label = y, params = list(max_bin = 5L)) -#' params <- list(min_data_in_leaf = 2L) +#' params <- list( +#' min_data_in_leaf = 2L +#' , num_threads = 2L +#' ) #' model <- lgb.train( #' params = params #' , data = dtrain @@ -1231,6 +1235,7 @@ summary.lgb.Booster <- function(object, ...) { #' , metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' valids <- list(test = dtest) #' model <- lgb.train( @@ -1296,6 +1301,7 @@ lgb.load <- function(filename = NULL, model_str = NULL) { #' , metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' valids <- list(test = dtest) #' model <- lgb.train( @@ -1351,6 +1357,7 @@ lgb.save <- function(booster, filename, num_iteration = NULL) { #' , metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' valids <- list(test = dtest) #' model <- lgb.train( @@ -1401,6 +1408,7 @@ lgb.dump <- function(booster, num_iteration = NULL) { #' , metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' valids <- list(test = dtest) #' model <- lgb.train( diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index 4df0acbdf005..e2892ea4bae0 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -494,11 +494,10 @@ Dataset <- R6::R6Class( if (info_len > 0L) { # Get back fields - ret <- NULL - ret <- if (field_name == "group") { - integer(info_len) + if (field_name == "group") { + ret <- integer(info_len) } else { - numeric(info_len) + ret <- numeric(info_len) } .Call( diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index 7f036c9726b6..0b7b39e2d8c2 100644 --- a/R-package/R/lgb.Predictor.R +++ 
b/R-package/R/lgb.Predictor.R @@ -98,8 +98,6 @@ Predictor <- R6::R6Class( start_iteration <- 0L } - num_row <- 0L - # Check if data is a file name and not a matrix if (identical(class(data), "character") && length(data) == 1L) { diff --git a/R-package/R/lgb.convert_with_rules.R b/R-package/R/lgb.convert_with_rules.R index f282fa3152fc..f024e9dfe6e9 100644 --- a/R-package/R/lgb.convert_with_rules.R +++ b/R-package/R/lgb.convert_with_rules.R @@ -116,10 +116,6 @@ lgb.convert_with_rules <- function(data, rules = NULL) { column_classes <- .get_column_classes(df = data) - is_char <- which(column_classes == "character") - is_factor <- which(column_classes == "factor") - is_logical <- which(column_classes == "logical") - is_data_table <- data.table::is.data.table(x = data) is_data_frame <- is.data.frame(data) diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index cf88100db399..f81026fe673f 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -59,6 +59,7 @@ CVBooster <- R6::R6Class( #' , metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' model <- lgb.cv( #' params = params @@ -224,8 +225,6 @@ lgb.cv <- function(params = list() stop(sQuote("folds"), " must be a list with 2 or more elements that are vectors of indices for each CV-fold") } - nfold <- length(folds) - } else { if (nfold <= 1L) { diff --git a/R-package/R/lgb.importance.R b/R-package/R/lgb.importance.R index c05c6628be34..5a58770553f9 100644 --- a/R-package/R/lgb.importance.R +++ b/R-package/R/lgb.importance.R @@ -24,6 +24,7 @@ #' , max_depth = -1L #' , min_data_in_leaf = 1L #' , min_sum_hessian_in_leaf = 1.0 +#' , num_threads = 2L #' ) #' model <- lgb.train( #' params = params diff --git a/R-package/R/lgb.interprete.R b/R-package/R/lgb.interprete.R index 70aac8760485..7de772664d8b 100644 --- a/R-package/R/lgb.interprete.R +++ b/R-package/R/lgb.interprete.R @@ -35,6 +35,7 @@ #' , max_depth = -1L #' , min_data_in_leaf = 1L #' , min_sum_hessian_in_leaf = 1.0 
+#' , num_threads = 2L #' ) #' model <- lgb.train( #' params = params @@ -71,7 +72,9 @@ lgb.interprete <- function(model, leaf_index_dt <- data.table::as.data.table(x = pred_mat) leaf_index_mat_list <- lapply( X = leaf_index_dt - , FUN = function(x) matrix(x, ncol = num_class, byrow = TRUE) + , FUN = matrix + , ncol = num_class + , byrow = TRUE ) # Get list of trees diff --git a/R-package/R/lgb.model.dt.tree.R b/R-package/R/lgb.model.dt.tree.R index 871f8f1d24bf..8b0d8d81e2e8 100644 --- a/R-package/R/lgb.model.dt.tree.R +++ b/R-package/R/lgb.model.dt.tree.R @@ -40,6 +40,7 @@ #' , max_depth = -1L #' , min_data_in_leaf = 1L #' , min_sum_hessian_in_leaf = 1.0 +#' , num_threads = 2L #' ) #' model <- lgb.train(params, dtrain, 10L) #' diff --git a/R-package/R/lgb.plot.importance.R b/R-package/R/lgb.plot.importance.R index 66eafd73a731..fc59ebd0efec 100644 --- a/R-package/R/lgb.plot.importance.R +++ b/R-package/R/lgb.plot.importance.R @@ -28,6 +28,7 @@ #' , learning_rate = 0.1 #' , min_data_in_leaf = 1L #' , min_sum_hessian_in_leaf = 1.0 +#' , num_threads = 2L #' ) #' #' model <- lgb.train( diff --git a/R-package/R/lgb.plot.interpretation.R b/R-package/R/lgb.plot.interpretation.R index 86d8b682725f..a88f14bf83f0 100644 --- a/R-package/R/lgb.plot.interpretation.R +++ b/R-package/R/lgb.plot.interpretation.R @@ -39,6 +39,7 @@ #' , max_depth = -1L #' , min_data_in_leaf = 1L #' , min_sum_hessian_in_leaf = 1.0 +#' , num_threads = 2L #' ) #' model <- lgb.train( #' params = params diff --git a/R-package/R/lgb.restore_handle.R b/R-package/R/lgb.restore_handle.R index dcb167608888..4de93d46c96a 100644 --- a/R-package/R/lgb.restore_handle.R +++ b/R-package/R/lgb.restore_handle.R @@ -23,7 +23,9 @@ #' , agaricus.train$label #' , params = list(objective = "binary") #' , nrounds = 5L -#' , verbose = 0) +#' , verbose = 0 +#' , num_threads = 2L +#' ) #' fname <- tempfile(fileext="rds") #' saveRDS(model, fname) #' diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index 
8cf3a95eaf2e..20916c9844b5 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -30,6 +30,7 @@ #' , metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' valids <- list(test = dtest) #' model <- lgb.train( @@ -153,6 +154,9 @@ lgb.train <- function(params = list(), # Construct datasets, if needed data$update_params(params = params) + if (!is.null(categorical_feature)) { + data$set_categorical_feature(categorical_feature) + } data$construct() # Check interaction constraints @@ -178,11 +182,6 @@ lgb.train <- function(params = list(), data$set_colnames(colnames) } - # Write categorical features - if (!is.null(categorical_feature)) { - data$set_categorical_feature(categorical_feature) - } - valid_contain_train <- FALSE train_data_name <- "train" reduced_valid_sets <- list() diff --git a/R-package/R/lightgbm.R b/R-package/R/lightgbm.R index cb3ef31e8afa..711b3ef0dc38 100644 --- a/R-package/R/lightgbm.R +++ b/R-package/R/lightgbm.R @@ -116,7 +116,7 @@ NULL #' \item If passing a factor with more than two variables, will use objective \code{"multiclass"} #' (note that parameter \code{num_class} in this case will also be determined automatically from #' \code{label}). -#' \item Otherwise, will use objective \code{"regression"}. +#' \item Otherwise (or if passing \code{lgb.Dataset} as input), will use objective \code{"regression"}. 
#' } #' #' \emph{New in version 4.0.0} @@ -211,6 +211,9 @@ lightgbm <- function(data, rm(temp) } else { data_processor <- NULL + if (objective == "auto") { + objective <- "regression" + } } # Set data to a temporary variable diff --git a/R-package/R/readRDS.lgb.Booster.R b/R-package/R/readRDS.lgb.Booster.R index a995d804adc5..a8abac642c24 100644 --- a/R-package/R/readRDS.lgb.Booster.R +++ b/R-package/R/readRDS.lgb.Booster.R @@ -23,6 +23,7 @@ #' , metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' valids <- list(test = dtest) #' model <- lgb.train( diff --git a/R-package/R/saveRDS.lgb.Booster.R b/R-package/R/saveRDS.lgb.Booster.R index eb71e7a2f08c..5d3af097301f 100644 --- a/R-package/R/saveRDS.lgb.Booster.R +++ b/R-package/R/saveRDS.lgb.Booster.R @@ -33,6 +33,7 @@ #' , metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' valids <- list(test = dtest) #' model <- lgb.train( diff --git a/R-package/configure b/R-package/configure index 867ef2d395a6..39a18d669833 100755 --- a/R-package/configure +++ b/R-package/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for lightgbm 4.0.0.99. +# Generated by GNU Autoconf 2.71 for lightgbm 4.1.0.99. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -607,8 +607,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='lightgbm' PACKAGE_TARNAME='lightgbm' -PACKAGE_VERSION='4.0.0.99' -PACKAGE_STRING='lightgbm 4.0.0.99' +PACKAGE_VERSION='4.1.0.99' +PACKAGE_STRING='lightgbm 4.1.0.99' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1211,7 +1211,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures lightgbm 4.0.0.99 to adapt to many kinds of systems. 
+\`configure' configures lightgbm 4.1.0.99 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1273,7 +1273,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of lightgbm 4.0.0.99:";; + short | recursive ) echo "Configuration of lightgbm 4.1.0.99:";; esac cat <<\_ACEOF @@ -1341,7 +1341,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -lightgbm configure 4.0.0.99 +lightgbm configure 4.1.0.99 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1378,7 +1378,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by lightgbm $as_me 4.0.0.99, which was +It was created by lightgbm $as_me 4.1.0.99, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -2454,7 +2454,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by lightgbm $as_me 4.0.0.99, which was +This file was extended by lightgbm $as_me 4.1.0.99, which was generated by GNU Autoconf 2.71. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -2509,7 +2509,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -lightgbm config.status 4.0.0.99 +lightgbm config.status 4.1.0.99 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/R-package/cran-comments.md b/R-package/cran-comments.md index 6fa74cdac4cb..44b8ed391bfc 100644 --- a/R-package/cran-comments.md +++ b/R-package/cran-comments.md @@ -1,5 +1,66 @@ # CRAN Submission History +## v4.1.0 - not submitted + +v4.1.0 was not submitted to CRAN, because https://github.com/microsoft/LightGBM/issues/5987 had not been resolved. + +## v4.0.0 - Submission 2 - (July 19, 2023) + +### CRAN response + +> Dear maintainer, +> package lightgbm_4.0.0.tar.gz does not pass the incoming checks automatically. + +The logs linked from those messages showed one issue remaining on Debian (0 on Windows). + +```text +* checking examples ... [7s/4s] NOTE +Examples with CPU time > 2.5 times elapsed time + user system elapsed ratio +lgb.restore_handle 1.206 0.085 0.128 10.08 +``` + +### Maintainer Notes + +Chose to document the issue and need for a fix in https://github.com/microsoft/LightGBM/issues/5987, but not resubmit, +to avoid annoying CRAN maintainers. + +## v4.0.0 - Submission 1 - (July 16, 2023) + +### CRAN response + +> Dear maintainer, +> package lightgbm_4.0.0.tar.gz does not pass the incoming checks automatically. + +The logs linked from those messages showed the following issues from `R CMD check`. + +```text +* checking S3 generic/method consistency ... NOTE +Mismatches for apparent methods not registered: +merge: + function(x, y, ...) +merge.eval.string: + function(env) + +format: + function(x, ...) +format.eval.string: + function(eval_res, eval_err) +See section 'Registering S3 methods' in the 'Writing R Extensions' +manual. 
+``` + +```text +* checking examples ... [8s/4s] NOTE +Examples with CPU time > 2.5 times elapsed time + user system elapsed ratio +lgb.restore_handle 1.819 0.128 0.165 11.8 +``` + +### Maintainer Notes + +Attempted to fix these with https://github.com/microsoft/LightGBM/pull/5988 and resubmitted. + ## v3.3.5 - Submission 2 - (January 16, 2023) ### CRAN response diff --git a/R-package/man/lgb.configure_fast_predict.Rd b/R-package/man/lgb.configure_fast_predict.Rd index a228aad42e21..39fe6afa6b18 100644 --- a/R-package/man/lgb.configure_fast_predict.Rd +++ b/R-package/man/lgb.configure_fast_predict.Rd @@ -119,7 +119,10 @@ data(mtcars) X <- as.matrix(mtcars[, -1L]) y <- mtcars[, 1L] dtrain <- lgb.Dataset(X, label = y, params = list(max_bin = 5L)) -params <- list(min_data_in_leaf = 2L) +params <- list( + min_data_in_leaf = 2L + , num_threads = 2L +) model <- lgb.train( params = params , data = dtrain diff --git a/R-package/man/lgb.cv.Rd b/R-package/man/lgb.cv.Rd index 0e6db2e2cb0f..555cb11c7bb3 100644 --- a/R-package/man/lgb.cv.Rd +++ b/R-package/man/lgb.cv.Rd @@ -160,6 +160,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) model <- lgb.cv( params = params diff --git a/R-package/man/lgb.dump.Rd b/R-package/man/lgb.dump.Rd index c9b242a812e3..f4e90242fd75 100644 --- a/R-package/man/lgb.dump.Rd +++ b/R-package/man/lgb.dump.Rd @@ -31,6 +31,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) valids <- list(test = dtest) model <- lgb.train( diff --git a/R-package/man/lgb.get.eval.result.Rd b/R-package/man/lgb.get.eval.result.Rd index cb54217bc42d..9c2293a0f909 100644 --- a/R-package/man/lgb.get.eval.result.Rd +++ b/R-package/man/lgb.get.eval.result.Rd @@ -45,6 +45,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) valids <- list(test = dtest) model <- lgb.train( diff --git a/R-package/man/lgb.importance.Rd 
b/R-package/man/lgb.importance.Rd index 2fd5d4938de5..89a3d4e6b5b7 100644 --- a/R-package/man/lgb.importance.Rd +++ b/R-package/man/lgb.importance.Rd @@ -35,6 +35,7 @@ params <- list( , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 + , num_threads = 2L ) model <- lgb.train( params = params diff --git a/R-package/man/lgb.interprete.Rd b/R-package/man/lgb.interprete.Rd index 6431a5011f48..c1166b2c1cc9 100644 --- a/R-package/man/lgb.interprete.Rd +++ b/R-package/man/lgb.interprete.Rd @@ -48,6 +48,7 @@ params <- list( , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 + , num_threads = 2L ) model <- lgb.train( params = params diff --git a/R-package/man/lgb.load.Rd b/R-package/man/lgb.load.Rd index 6031ff8e55bb..c1a00a20974b 100644 --- a/R-package/man/lgb.load.Rd +++ b/R-package/man/lgb.load.Rd @@ -31,6 +31,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) valids <- list(test = dtest) model <- lgb.train( diff --git a/R-package/man/lgb.model.dt.tree.Rd b/R-package/man/lgb.model.dt.tree.Rd index c5c88156ff4d..4d02ede9a001 100644 --- a/R-package/man/lgb.model.dt.tree.Rd +++ b/R-package/man/lgb.model.dt.tree.Rd @@ -51,6 +51,7 @@ params <- list( , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 + , num_threads = 2L ) model <- lgb.train(params, dtrain, 10L) diff --git a/R-package/man/lgb.plot.importance.Rd b/R-package/man/lgb.plot.importance.Rd index 4b915e35fc86..302f46460e3f 100644 --- a/R-package/man/lgb.plot.importance.Rd +++ b/R-package/man/lgb.plot.importance.Rd @@ -47,6 +47,7 @@ params <- list( , learning_rate = 0.1 , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 + , num_threads = 2L ) model <- lgb.train( diff --git a/R-package/man/lgb.plot.interpretation.Rd b/R-package/man/lgb.plot.interpretation.Rd index 2d7416561f23..a914071e896f 100644 --- a/R-package/man/lgb.plot.interpretation.Rd +++ b/R-package/man/lgb.plot.interpretation.Rd @@ -58,6 
+58,7 @@ params <- list( , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 + , num_threads = 2L ) model <- lgb.train( params = params diff --git a/R-package/man/lgb.restore_handle.Rd b/R-package/man/lgb.restore_handle.Rd index bbe6f70c85de..95cbdc64485d 100644 --- a/R-package/man/lgb.restore_handle.Rd +++ b/R-package/man/lgb.restore_handle.Rd @@ -34,7 +34,9 @@ model <- lightgbm( , agaricus.train$label , params = list(objective = "binary") , nrounds = 5L - , verbose = 0) + , verbose = 0 + , num_threads = 2L +) fname <- tempfile(fileext="rds") saveRDS(model, fname) diff --git a/R-package/man/lgb.save.Rd b/R-package/man/lgb.save.Rd index 0736c26ab3f6..efd110c7d816 100644 --- a/R-package/man/lgb.save.Rd +++ b/R-package/man/lgb.save.Rd @@ -33,6 +33,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) valids <- list(test = dtest) model <- lgb.train( diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd index 30589ef34e54..0f2961edc415 100644 --- a/R-package/man/lgb.train.Rd +++ b/R-package/man/lgb.train.Rd @@ -141,6 +141,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) valids <- list(test = dtest) model <- lgb.train( diff --git a/R-package/man/lightgbm.Rd b/R-package/man/lightgbm.Rd index 88f3e3188fec..09d7704605c1 100644 --- a/R-package/man/lightgbm.Rd +++ b/R-package/man/lightgbm.Rd @@ -68,7 +68,7 @@ set to the iteration number of the best iteration.} \item If passing a factor with more than two variables, will use objective \code{"multiclass"} (note that parameter \code{num_class} in this case will also be determined automatically from \code{label}). - \item Otherwise, will use objective \code{"regression"}. + \item Otherwise (or if passing \code{lgb.Dataset} as input), will use objective \code{"regression"}. 
} \emph{New in version 4.0.0}} diff --git a/R-package/man/predict.lgb.Booster.Rd b/R-package/man/predict.lgb.Booster.Rd index f8043767be43..2df13b9bc374 100644 --- a/R-package/man/predict.lgb.Booster.Rd +++ b/R-package/man/predict.lgb.Booster.Rd @@ -132,6 +132,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) valids <- list(test = dtest) model <- lgb.train( diff --git a/R-package/man/readRDS.lgb.Booster.Rd b/R-package/man/readRDS.lgb.Booster.Rd index 5a1c647a0f74..6a8e4c80ca91 100644 --- a/R-package/man/readRDS.lgb.Booster.Rd +++ b/R-package/man/readRDS.lgb.Booster.Rd @@ -34,6 +34,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) valids <- list(test = dtest) model <- lgb.train( diff --git a/R-package/man/saveRDS.lgb.Booster.Rd b/R-package/man/saveRDS.lgb.Booster.Rd index e730f36b2caf..a8664243dce2 100644 --- a/R-package/man/saveRDS.lgb.Booster.Rd +++ b/R-package/man/saveRDS.lgb.Booster.Rd @@ -57,6 +57,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) valids <- list(test = dtest) model <- lgb.train( diff --git a/R-package/tests/testthat/helper.R b/R-package/tests/testthat/helper.R new file mode 100644 index 000000000000..9c928c1f71d1 --- /dev/null +++ b/R-package/tests/testthat/helper.R @@ -0,0 +1,48 @@ +# ref for this file: +# +# * https://r-pkgs.org/testing-design.html#testthat-helper-files +# * https://r-pkgs.org/testing-design.html#testthat-setup-files + +# LightGBM-internal fix to comply with CRAN policy of only using up to 2 threads in tests and example. +# +# per https://cran.r-project.org/web/packages/policies.html +# +# > If running a package uses multiple threads/cores it must never use more than two simultaneously: +# the check farm is a shared resource and will typically be running many checks simultaneously. 
+# +.LGB_MAX_THREADS <- 2L + +# by default, how much should results in tests be allowed to differ from hard-coded expected numbers? +.LGB_NUMERIC_TOLERANCE <- 1e-6 + +# are the tests running on Windows? +.LGB_ON_WINDOWS <- .Platform$OS.type == "windows" +.LGB_ON_32_BIT_WINDOWS <- .LGB_ON_WINDOWS && .Machine$sizeof.pointer != 8L + +# are the tests running in a UTF-8 locale? +.LGB_UTF8_LOCALE <- all(endsWith( + Sys.getlocale(category = "LC_CTYPE") + , "UTF-8" +)) + +# control how many loud LightGBM's logger is in tests +.LGB_VERBOSITY <- as.integer( + Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") +) + +# [description] +# test that every element of 'x' is in 'y' +# +# testthat::expect_in() is not available in version of {testthat} +# built for R 3.6, this is here to support a similar interface on R 3.6 +.expect_in <- function(x, y) { + if (exists("expect_in")) { + expect_in(x, y) + } else { + missing_items <- x[!(x %in% y)] + if (length(missing_items) != 0L) { + error_msg <- paste0("Some expected items not found: ", toString(missing_items)) + stop(error_msg) + } + } +} diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index a5003f086cbd..90be1d08cf67 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -1,11 +1,5 @@ library(Matrix) -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - -TOLERANCE <- 1e-6 - test_that("Predictor$finalize() should not fail", { X <- as.matrix(as.integer(iris[, "Species"]), ncol = 1L) y <- iris[["Sepal.Length"]] @@ -14,8 +8,9 @@ test_that("Predictor$finalize() should not fail", { data = dtrain , params = list( objective = "regression" + , num_threads = .LGB_MAX_THREADS ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 3L ) model_file <- tempfile(fileext = ".model") @@ -42,8 +37,9 @@ test_that("predictions do not fail for integer input", { data = dtrain , params = list( objective = "regression" + , 
num_threads = .LGB_MAX_THREADS ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 3L ) X_double <- X[c(1L, 51L, 101L), , drop = FALSE] @@ -76,7 +72,8 @@ test_that("start_iteration works correctly", { num_leaves = 4L , learning_rate = 0.6 , objective = "binary" - , verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 50L , valids = list("test" = dtest) @@ -125,8 +122,8 @@ test_that("Feature contributions from sparse inputs produce sparse outputs", { data = dtrain , obj = "regression" , nrounds = 5L - , verbose = VERBOSITY - , params = list(min_data_in_leaf = 5L) + , verbose = .LGB_VERBOSITY + , params = list(min_data_in_leaf = 5L, num_threads = .LGB_MAX_THREADS) ) pred_dense <- predict(bst, X, type = "contrib") @@ -156,8 +153,8 @@ test_that("Sparse feature contribution predictions do not take inputs with wrong data = dtrain , obj = "regression" , nrounds = 5L - , verbose = VERBOSITY - , params = list(min_data_in_leaf = 5L) + , verbose = .LGB_VERBOSITY + , params = list(min_data_in_leaf = 5L, num_threads = .LGB_MAX_THREADS) ) X_wrong <- X[, c(1L:10L, 1L:10L)] @@ -186,8 +183,8 @@ test_that("Feature contribution predictions do not take non-general CSR or CSC i data = dtrain , obj = "regression" , nrounds = 5L - , verbose = VERBOSITY - , params = list(min_data_in_leaf = 5L) + , verbose = .LGB_VERBOSITY + , params = list(min_data_in_leaf = 5L, num_threads = .LGB_MAX_THREADS) ) expect_error(predict(bst, SmatC, type = "contrib")) @@ -211,16 +208,17 @@ test_that("predict() params should override keyword argument for raw-score predi objective = "binary" , min_data_in_leaf = 1L , seed = 708L + , num_threads = .LGB_MAX_THREADS ) , nrounds = 10L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) # check that the predictions from predict.lgb.Booster() really look like raw score predictions preds_prob <- predict(bst, X) preds_raw_s3_keyword <- predict(bst, X, type = "raw") preds_prob_from_raw <- 1.0 / (1.0 + 
exp(-preds_raw_s3_keyword)) - expect_equal(preds_prob, preds_prob_from_raw, tolerance = TOLERANCE) + expect_equal(preds_prob, preds_prob_from_raw, tolerance = .LGB_NUMERIC_TOLERANCE) accuracy <- sum(as.integer(preds_prob_from_raw > 0.5) == y) / length(y) expect_equal(accuracy, 1.0) @@ -262,9 +260,10 @@ test_that("predict() params should override keyword argument for leaf-index pred objective = "regression" , min_data_in_leaf = 1L , seed = 708L + , num_threads = .LGB_MAX_THREADS ) , nrounds = 10L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) # check that predictions really look like leaf index predictions @@ -315,9 +314,10 @@ test_that("predict() params should override keyword argument for feature contrib objective = "regression" , min_data_in_leaf = 1L , seed = 708L + , num_threads = .LGB_MAX_THREADS ) , nrounds = 10L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) # check that predictions really look like feature contributions @@ -425,8 +425,8 @@ test_that("predict() keeps row names from data (regression)", { data = dtrain , obj = "regression" , nrounds = 5L - , verbose = VERBOSITY - , params = list(min_data_in_leaf = 1L) + , verbose = .LGB_VERBOSITY + , params = list(min_data_in_leaf = 1L, num_threads = .LGB_MAX_THREADS) ) .check_all_row_name_expectations(bst, X) }) @@ -441,7 +441,8 @@ test_that("predict() keeps row names from data (binary classification)", { data = dtrain , obj = "binary" , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , params = list(num_threads = .LGB_MAX_THREADS) ) .check_all_row_name_expectations(bst, X) }) @@ -455,9 +456,9 @@ test_that("predict() keeps row names from data (multi-class classification)", { bst <- lgb.train( data = dtrain , obj = "multiclass" - , params = list(num_class = 3L) + , params = list(num_class = 3L, num_threads = .LGB_MAX_THREADS) , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) .check_all_row_name_expectations(bst, X) }) @@ -478,8 +479,8 @@ 
test_that("predictions for regression and binary classification are returned as data = dtrain , obj = "regression" , nrounds = 5L - , verbose = VERBOSITY - , params = list(min_data_in_leaf = 1L) + , verbose = .LGB_VERBOSITY + , params = list(min_data_in_leaf = 1L, num_threads = .LGB_MAX_THREADS) ) pred <- predict(model, X) expect_true(is.vector(pred)) @@ -496,7 +497,8 @@ test_that("predictions for regression and binary classification are returned as data = dtrain , obj = "binary" , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , params = list(num_threads = .LGB_MAX_THREADS) ) pred <- predict(model, X) expect_true(is.vector(pred)) @@ -515,8 +517,8 @@ test_that("predictions for multiclass classification are returned as matrix", { data = dtrain , obj = "multiclass" , nrounds = 5L - , verbose = VERBOSITY - , params = list(num_class = 3L) + , verbose = .LGB_VERBOSITY + , params = list(num_class = 3L, num_threads = .LGB_MAX_THREADS) ) pred <- predict(model, X) expect_true(is.matrix(pred)) @@ -533,7 +535,7 @@ test_that("Single-row predictions are identical to multi-row ones", { X <- as.matrix(mtcars[, -1L]) y <- mtcars[, 1L] dtrain <- lgb.Dataset(X, label = y, params = list(max_bin = 5L)) - params <- list(min_data_in_leaf = 2L) + params <- list(min_data_in_leaf = 2L, num_threads = .LGB_MAX_THREADS) model <- lgb.train( params = params , data = dtrain @@ -594,7 +596,7 @@ test_that("Fast-predict configuration accepts non-default prediction types", { X <- as.matrix(mtcars[, -1L]) y <- mtcars[, 1L] dtrain <- lgb.Dataset(X, label = y, params = list(max_bin = 5L)) - params <- list(min_data_in_leaf = 2L) + params <- list(min_data_in_leaf = 2L, num_threads = .LGB_MAX_THREADS) model <- lgb.train( params = params , data = dtrain @@ -624,7 +626,7 @@ test_that("Fast-predict configuration does not block other prediction types", { X <- as.matrix(mtcars[, -1L]) y <- mtcars[, 1L] dtrain <- lgb.Dataset(X, label = y, params = list(max_bin = 5L)) - params <- 
list(min_data_in_leaf = 2L) + params <- list(min_data_in_leaf = 2L, num_threads = .LGB_MAX_THREADS) model <- lgb.train( params = params , data = dtrain @@ -660,7 +662,8 @@ test_that("predict type='class' returns predicted class for classification objec data = dtrain , obj = "binary" , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , params = list(num_threads = .LGB_MAX_THREADS) ) pred <- predict(bst, X, type = "class") expect_true(all(pred %in% c(0L, 1L))) @@ -673,8 +676,8 @@ test_that("predict type='class' returns predicted class for classification objec data = dtrain , obj = "multiclass" , nrounds = 5L - , verbose = VERBOSITY - , params = list(num_class = 3L) + , verbose = .LGB_VERBOSITY + , params = list(num_class = 3L, num_threads = .LGB_MAX_THREADS) ) pred <- predict(model, X, type = "class") expect_true(all(pred %in% c(0L, 1L, 2L))) @@ -689,7 +692,8 @@ test_that("predict type='class' returns values in the target's range for regress data = dtrain , obj = "regression" , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , params = list(num_threads = .LGB_MAX_THREADS) ) pred <- predict(bst, X, type = "class") expect_true(!any(pred %in% c(0.0, 1.0))) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index b2ce39f18816..57c33c35dfee 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -1,20 +1,8 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - -ON_WINDOWS <- .Platform$OS.type == "windows" - -UTF8_LOCALE <- all(endsWith( - Sys.getlocale(category = "LC_CTYPE") - , "UTF-8" -)) - data(agaricus.train, package = "lightgbm") data(agaricus.test, package = "lightgbm") train <- agaricus.train test <- agaricus.test -TOLERANCE <- 1e-6 set.seed(708L) # [description] Every time this function is called, it adds 0.1 @@ -55,18 +43,22 @@ CONSTANT_METRIC_VALUE <- 0.2 DTRAIN_RANDOM_REGRESSION <- lgb.Dataset( data = 
as.matrix(rnorm(100L), ncol = 1L, drop = FALSE) , label = rnorm(100L) + , params = list(num_threads = .LGB_MAX_THREADS) ) DVALID_RANDOM_REGRESSION <- lgb.Dataset( data = as.matrix(rnorm(50L), ncol = 1L, drop = FALSE) , label = rnorm(50L) + , params = list(num_threads = .LGB_MAX_THREADS) ) DTRAIN_RANDOM_CLASSIFICATION <- lgb.Dataset( data = as.matrix(rnorm(120L), ncol = 1L, drop = FALSE) , label = sample(c(0L, 1L), size = 120L, replace = TRUE) + , params = list(num_threads = .LGB_MAX_THREADS) ) DVALID_RANDOM_CLASSIFICATION <- lgb.Dataset( data = as.matrix(rnorm(37L), ncol = 1L, drop = FALSE) , label = sample(c(0L, 1L), size = 37L, replace = TRUE) + , params = list(num_threads = .LGB_MAX_THREADS) ) test_that("train and predict binary classification", { @@ -78,7 +70,8 @@ test_that("train and predict binary classification", { num_leaves = 5L , objective = "binary" , metric = "binary_error" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds , valids = list( @@ -99,7 +92,7 @@ test_that("train and predict binary classification", { expect_equal(length(pred1), 6513L) err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label) err_log <- record_results[1L] - expect_lt(abs(err_pred1 - err_log), TOLERANCE) + expect_lt(abs(err_pred1 - err_log), .LGB_NUMERIC_TOLERANCE) }) @@ -119,7 +112,8 @@ test_that("train and predict softmax", { , objective = "multiclass" , metric = "multi_error" , num_class = 3L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 20L , valids = list( @@ -149,13 +143,15 @@ test_that("use of multiple eval metrics works", { , learning_rate = 1.0 , objective = "binary" , metric = metrics - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 10L , valids = list( "train" = lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) ) ) @@ -178,12 +174,13 @@ 
test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expec num_leaves = 5L , objective = "binary" , metric = "binary_error" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds ) - expect_true(abs(bst$lower_bound() - -1.590853) < TOLERANCE) - expect_true(abs(bst$upper_bound() - 1.871015) < TOLERANCE) + expect_true(abs(bst$lower_bound() - -1.590853) < .LGB_NUMERIC_TOLERANCE) + expect_true(abs(bst$upper_bound() - 1.871015) < .LGB_NUMERIC_TOLERANCE) }) test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expected for regression", { @@ -196,17 +193,18 @@ test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expec num_leaves = 5L , objective = "regression" , metric = "l2" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds ) - expect_true(abs(bst$lower_bound() - 0.1513859) < TOLERANCE) - expect_true(abs(bst$upper_bound() - 0.9080349) < TOLERANCE) + expect_true(abs(bst$lower_bound() - 0.1513859) < .LGB_NUMERIC_TOLERANCE) + expect_true(abs(bst$upper_bound() - 0.9080349) < .LGB_NUMERIC_TOLERANCE) }) test_that("lightgbm() rejects negative or 0 value passed to nrounds", { dtrain <- lgb.Dataset(train$data, label = train$label) - params <- list(objective = "regression", metric = "l2,l1") + params <- list(objective = "regression", metric = "l2,l1", num_threads = .LGB_MAX_THREADS) for (nround_value in c(-10L, 0L)) { expect_error({ bst <- lightgbm( @@ -230,7 +228,8 @@ test_that("lightgbm() accepts nrounds as either a top-level argument or paramete objective = "regression" , metric = "l2" , num_leaves = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) @@ -243,7 +242,8 @@ test_that("lightgbm() accepts nrounds as either a top-level argument or paramete , metric = "l2" , num_leaves = 5L , nrounds = nrounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , 
num_threads = .LGB_MAX_THREADS ) ) @@ -257,7 +257,8 @@ test_that("lightgbm() accepts nrounds as either a top-level argument or paramete , metric = "l2" , num_leaves = 5L , nrounds = nrounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) @@ -285,10 +286,12 @@ test_that("lightgbm() performs evaluation on validation sets if they are provide dvalid1 <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid2 <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L bst <- lightgbm( @@ -301,7 +304,8 @@ test_that("lightgbm() performs evaluation on validation sets if they are provide "binary_error" , "auc" ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds , valids = list( @@ -310,6 +314,7 @@ test_that("lightgbm() performs evaluation on validation sets if they are provide , "train" = lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) ) ) @@ -324,9 +329,9 @@ test_that("lightgbm() performs evaluation on validation sets if they are provide eval_results <- bst$record_evals[[valid_name]][["binary_error"]] expect_length(eval_results[["eval"]], nrounds) } - expect_true(abs(bst$record_evals[["train"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) - expect_true(abs(bst$record_evals[["valid1"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) - expect_true(abs(bst$record_evals[["valid2"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) + expect_true(abs(bst$record_evals[["train"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < .LGB_NUMERIC_TOLERANCE) + expect_true(abs(bst$record_evals[["valid1"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < .LGB_NUMERIC_TOLERANCE) + expect_true(abs(bst$record_evals[["valid2"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < 
.LGB_NUMERIC_TOLERANCE) }) test_that("training continuation works", { @@ -334,6 +339,7 @@ test_that("training continuation works", { train$data , label = train$label , free_raw_data = FALSE + , params = list(num_threads = .LGB_MAX_THREADS) ) watchlist <- list(train = dtrain) param <- list( @@ -341,7 +347,8 @@ test_that("training continuation works", { , metric = "binary_logloss" , num_leaves = 5L , learning_rate = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) # train for 10 consecutive iterations @@ -367,7 +374,8 @@ test_that("cv works", { , metric = "l2,l1" , min_data = 1L , learning_rate = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- lgb.cv( params @@ -387,7 +395,8 @@ test_that("CVBooster$reset_parameter() works as expected", { objective = "regression" , min_data = 1L , num_leaves = 7L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = 3L @@ -405,11 +414,12 @@ test_that("CVBooster$reset_parameter() works as expected", { }) test_that("lgb.cv() rejects negative or 0 value passed to nrounds", { - dtrain <- lgb.Dataset(train$data, label = train$label) + dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = 2L)) params <- list( objective = "regression" , metric = "l2,l1" , min_data = 1L + , num_threads = .LGB_MAX_THREADS ) for (nround_value in c(-10L, 0L)) { expect_error({ @@ -453,6 +463,7 @@ test_that("lightgbm.cv() gives the correct best_score and best_iter for a metric dtrain <- lgb.Dataset( data = as.matrix(runif(n = 500L, min = 0.0, max = 15.0), drop = FALSE) , label = rep(c(0L, 1L), 250L) + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L cv_bst <- lgb.cv( @@ -464,7 +475,8 @@ test_that("lightgbm.cv() gives the correct best_score and best_iter for a metric , metric = "auc,binary_error" , learning_rate = 1.5 , num_leaves = 5L - , verbose = VERBOSITY + 
, verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) expect_true(methods::is(cv_bst, "lgb.CVBooster")) @@ -487,6 +499,7 @@ test_that("lgb.cv() fit on linearly-relatead data improves when using linear lea return(lgb.Dataset( data = X , label = 2L * X + runif(nrow(X), 0L, 0.1) + , params = list(num_threads = .LGB_MAX_THREADS) )) } @@ -496,6 +509,7 @@ test_that("lgb.cv() fit on linearly-relatead data improves when using linear lea , metric = "mse" , seed = 0L , num_leaves = 2L + , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() @@ -520,12 +534,13 @@ test_that("lgb.cv() fit on linearly-relatead data improves when using linear lea }) test_that("lgb.cv() respects showsd argument", { - dtrain <- lgb.Dataset(train$data, label = train$label) + dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = .LGB_MAX_THREADS)) params <- list( objective = "regression" , metric = "l2" , min_data = 1L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) nrounds <- 5L set.seed(708L) @@ -559,6 +574,7 @@ test_that("lgb.cv() raises an informative error for unrecognized objectives", { dtrain <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) expect_error({ capture.output({ @@ -566,7 +582,8 @@ test_that("lgb.cv() raises an informative error for unrecognized objectives", { data = dtrain , params = list( objective_type = "not_a_real_objective" - , verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) }, type = "message") @@ -579,6 +596,7 @@ test_that("lgb.cv() respects parameter aliases for objective", { dtrain <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) cv_bst <- lgb.cv( data = dtrain @@ -586,7 +604,8 @@ test_that("lgb.cv() respects parameter aliases for objective", { num_leaves = 5L , application = "binary" , num_iterations = nrounds - , verbose = 
VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nfold = nfold ) @@ -602,10 +621,12 @@ test_that("lgb.cv() prefers objective in params to keyword argument", { data = lgb.Dataset( data = EuStockMarkets[, c("SMI", "CAC", "FTSE")] , label = EuStockMarkets[, "DAX"] + , params = list(num_threads = .LGB_MAX_THREADS) ) , params = list( application = "regression_l1" - , verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 5L , obj = "regression_l2" @@ -631,6 +652,7 @@ test_that("lgb.cv() respects parameter aliases for metric", { dtrain <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) cv_bst <- lgb.cv( data = dtrain @@ -639,7 +661,8 @@ test_that("lgb.cv() respects parameter aliases for metric", { , objective = "binary" , num_iterations = nrounds , metric_types = c("auc", "binary_logloss") - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nfold = nfold ) @@ -656,7 +679,8 @@ test_that("lgb.cv() respects eval_train_metric argument", { objective = "regression" , metric = "l2" , min_data = 1L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) nrounds <- 5L set.seed(708L) @@ -696,18 +720,21 @@ test_that("lgb.train() works as expected with multiple eval metrics", { data = lgb.Dataset( train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) , nrounds = 10L , params = list( objective = "binary" , metric = metrics , learning_rate = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , valids = list( "train" = lgb.Dataset( train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) ) ) @@ -731,7 +758,7 @@ test_that("lgb.train() raises an informative error for unrecognized objectives", data = dtrain , params = list( objective_type = "not_a_real_objective" - , verbosity = 
VERBOSITY + , verbosity = .LGB_VERBOSITY ) ) }, type = "message") @@ -743,6 +770,7 @@ test_that("lgb.train() respects parameter aliases for objective", { dtrain <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) bst <- lgb.train( data = dtrain @@ -750,7 +778,8 @@ test_that("lgb.train() respects parameter aliases for objective", { num_leaves = 5L , application = "binary" , num_iterations = nrounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , valids = list( "the_training_data" = dtrain @@ -767,10 +796,12 @@ test_that("lgb.train() prefers objective in params to keyword argument", { data = lgb.Dataset( data = EuStockMarkets[, c("SMI", "CAC", "FTSE")] , label = EuStockMarkets[, "DAX"] + , params = list(num_threads = .LGB_MAX_THREADS) ) , params = list( loss = "regression_l1" - , verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 5L , obj = "regression_l2" @@ -792,6 +823,7 @@ test_that("lgb.train() respects parameter aliases for metric", { dtrain <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) bst <- lgb.train( data = dtrain @@ -800,7 +832,8 @@ test_that("lgb.train() respects parameter aliases for metric", { , objective = "binary" , num_iterations = nrounds , metric_types = c("auc", "binary_logloss") - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , valids = list( "train" = dtrain @@ -814,11 +847,12 @@ test_that("lgb.train() respects parameter aliases for metric", { }) test_that("lgb.train() rejects negative or 0 value passed to nrounds", { - dtrain <- lgb.Dataset(train$data, label = train$label) + dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = .LGB_MAX_THREADS)) params <- list( objective = "regression" , metric = "l2,l1" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , 
num_threads = .LGB_MAX_THREADS ) for (nround_value in c(-10L, 0L)) { expect_error({ @@ -840,13 +874,15 @@ test_that("lgb.train() accepts nrounds as either a top-level argument or paramet data = lgb.Dataset( train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) , nrounds = nrounds , params = list( objective = "regression" , metric = "l2" , num_leaves = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) @@ -855,13 +891,14 @@ test_that("lgb.train() accepts nrounds as either a top-level argument or paramet data = lgb.Dataset( train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) , params = list( objective = "regression" , metric = "l2" , num_leaves = 5L , nrounds = nrounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) ) @@ -870,6 +907,7 @@ test_that("lgb.train() accepts nrounds as either a top-level argument or paramet data = lgb.Dataset( train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) , nrounds = 20L , params = list( @@ -877,7 +915,8 @@ test_that("lgb.train() accepts nrounds as either a top-level argument or paramet , metric = "l2" , num_leaves = 5L , nrounds = nrounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) @@ -916,7 +955,7 @@ test_that("lgb.train() throws an informative error if 'data' is not an lgb.Datas params = list( objective = "regression" , metric = "l2,l1" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) , data = val , 10L @@ -935,7 +974,7 @@ test_that("lgb.train() throws an informative error if 'valids' is not a list of params = list( objective = "regression" , metric = "l2,l1" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) , data = lgb.Dataset(train$data, label = train$label) , 10L @@ -954,7 +993,7 @@ test_that("lgb.train() errors if 'valids' is a list of lgb.Dataset objects but s params = list( objective = "regression" , metric = "l2,l1" - , 
verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) , data = lgb.Dataset(train$data, label = train$label) , 10L @@ -973,7 +1012,7 @@ test_that("lgb.train() throws an informative error if 'valids' contains lgb.Data params = list( objective = "regression" , metric = "l2,l1" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) , data = lgb.Dataset(train$data, label = train$label) , 10L @@ -988,12 +1027,14 @@ test_that("lgb.train() works with force_col_wise and force_row_wise", { dtrain <- lgb.Dataset( train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) params <- list( objective = "binary" , metric = "binary_error" , force_col_wise = TRUE - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst_col_wise <- lgb.train( params = params @@ -1005,7 +1046,8 @@ test_that("lgb.train() works with force_col_wise and force_row_wise", { objective = "binary" , metric = "binary_error" , force_row_wise = TRUE - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst_row_wise <- lgb.train( params = params @@ -1037,6 +1079,7 @@ test_that("lgb.train() works as expected with sparse features", { dtrain <- lgb.Dataset( data = as.matrix(trainDF[["x"]], drop = FALSE) , label = trainDF[["y"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 1L bst <- lgb.train( @@ -1044,7 +1087,8 @@ test_that("lgb.train() works as expected with sparse features", { objective = "binary" , min_data = 1L , min_data_in_bin = 1L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1056,7 +1100,7 @@ test_that("lgb.train() works as expected with sparse features", { expect_equal(parsed_model$objective, "binary sigmoid:1") expect_false(parsed_model$average_output) expected_error <- 0.6931268 - expect_true(abs(bst$eval_train()[[1L]][["value"]] - expected_error) < TOLERANCE) + expect_true(abs(bst$eval_train()[[1L]][["value"]] 
- expected_error) < .LGB_NUMERIC_TOLERANCE) }) test_that("lgb.train() works with early stopping for classification", { @@ -1071,10 +1115,12 @@ test_that("lgb.train() works with early stopping for classification", { dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid <- lgb.Dataset( data = as.matrix(validDF[["feat1"]], drop = FALSE) , label = validDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L @@ -1085,7 +1131,8 @@ test_that("lgb.train() works with early stopping for classification", { params = list( objective = "binary" , metric = "binary_error" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1109,7 +1156,8 @@ test_that("lgb.train() works with early stopping for classification", { objective = "binary" , metric = "binary_error" , early_stopping_rounds = early_stopping_rounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1142,10 +1190,12 @@ test_that("lgb.train() treats early_stopping_rounds<=0 as disabling early stoppi dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid <- lgb.Dataset( data = as.matrix(validDF[["feat1"]], drop = FALSE) , label = validDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 5L @@ -1158,7 +1208,8 @@ test_that("lgb.train() treats early_stopping_rounds<=0 as disabling early stoppi params = list( objective = "binary" , metric = "binary_error" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1182,7 +1233,8 @@ test_that("lgb.train() treats early_stopping_rounds<=0 as disabling early stoppi objective = "binary" , metric = 
"binary_error" , n_iter_no_change = value - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1204,10 +1256,12 @@ test_that("lgb.train() works with early stopping for classification with a metri dtrain <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid <- lgb.Dataset( data = test$data , label = test$label + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L @@ -1222,7 +1276,8 @@ test_that("lgb.train() works with early stopping for classification with a metri , metric = "auc" , max_depth = 3L , early_stopping_rounds = early_stopping_rounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1236,7 +1291,8 @@ test_that("lgb.train() works with early stopping for classification with a metri , metric = "binary_error" , max_depth = 3L , early_stopping_rounds = early_stopping_rounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1254,7 +1310,7 @@ test_that("lgb.train() works with early stopping for classification with a metri ) expect_identical(bst_binary_error$best_iter, 1L) expect_identical(bst_binary_error$current_iter(), early_stopping_rounds + 1L) - expect_true(abs(bst_binary_error$best_score - 0.01613904) < TOLERANCE) + expect_true(abs(bst_binary_error$best_score - 0.01613904) < .LGB_NUMERIC_TOLERANCE) # early stopping should not have been hit for AUC (higher_better = TRUE) eval_info <- bst_auc$.__enclos_env__$private$get_eval_info() @@ -1265,7 +1321,7 @@ test_that("lgb.train() works with early stopping for classification with a metri ) expect_identical(bst_auc$best_iter, 9L) expect_identical(bst_auc$current_iter(), nrounds) - expect_true(abs(bst_auc$best_score - 0.9999969) < TOLERANCE) + expect_true(abs(bst_auc$best_score - 0.9999969) < 
.LGB_NUMERIC_TOLERANCE) }) test_that("lgb.train() works with early stopping for regression", { @@ -1281,10 +1337,12 @@ test_that("lgb.train() works with early stopping for regression", { dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid <- lgb.Dataset( data = as.matrix(validDF[["feat1"]], drop = FALSE) , label = validDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L @@ -1295,7 +1353,8 @@ test_that("lgb.train() works with early stopping for regression", { params = list( objective = "regression" , metric = "rmse" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1319,7 +1378,8 @@ test_that("lgb.train() works with early stopping for regression", { objective = "regression" , metric = "rmse" , early_stopping_rounds = early_stopping_rounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1354,7 +1414,8 @@ test_that("lgb.train() does not stop early if early_stopping_rounds is not given params = list( objective = "regression" , metric = "None" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_REGRESSION , nrounds = nrounds @@ -1398,14 +1459,16 @@ test_that("If first_metric_only is not given or is FALSE, lgb.train() decides to objective = "regression" , metric = "None" , early_stopping_rounds = early_stopping_rounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , list( objective = "regression" , metric = "None" , early_stopping_rounds = early_stopping_rounds , first_metric_only = FALSE - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) @@ -1468,7 +1531,8 @@ test_that("If first_metric_only is TRUE, lgb.train() decides to stop 
early based , metric = "None" , early_stopping_rounds = early_stopping_rounds , first_metric_only = TRUE - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_REGRESSION , nrounds = nrounds @@ -1514,7 +1578,8 @@ test_that("lgb.train() works when a mixture of functions and strings are passed params = list( objective = "regression" , metric = "None" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_REGRESSION , nrounds = nrounds @@ -1539,15 +1604,15 @@ test_that("lgb.train() works when a mixture of functions and strings are passed # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 1.105012) < TOLERANCE) - expect_true(abs(results[["l2"]][["eval"]][[1L]] - 1.221051) < TOLERANCE) + expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 1.105012) < .LGB_NUMERIC_TOLERANCE) + expect_true(abs(results[["l2"]][["eval"]][[1L]] - 1.221051) < .LGB_NUMERIC_TOLERANCE) expected_increasing_metric <- increasing_metric_starting_value + 0.1 expect_true( abs( results[["increasing_metric"]][["eval"]][[1L]] - expected_increasing_metric - ) < TOLERANCE + ) < .LGB_NUMERIC_TOLERANCE ) - expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) + expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < .LGB_NUMERIC_TOLERANCE) }) @@ -1570,7 +1635,8 @@ test_that("lgb.train() works when a list of strings or a character vector is pas params = list( objective = "binary" , metric = "None" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds @@ -1591,10 +1657,10 @@ test_that("lgb.train() works when a list of strings or a character vector is pas # the difference metrics shouldn't have been mixed up with each 
other results <- bst$record_evals[["valid1"]] if ("binary_error" %in% unlist(eval_variation)) { - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < .LGB_NUMERIC_TOLERANCE) } if ("binary_logloss" %in% unlist(eval_variation)) { - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < .LGB_NUMERIC_TOLERANCE) } } }) @@ -1607,7 +1673,8 @@ test_that("lgb.train() works when you specify both 'metric' and 'eval' with stri params = list( objective = "binary" , metric = "binary_error" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds @@ -1627,8 +1694,8 @@ test_that("lgb.train() works when you specify both 'metric' and 'eval' with stri # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < .LGB_NUMERIC_TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < .LGB_NUMERIC_TOLERANCE) }) test_that("lgb.train() works when you give a function for eval", { @@ -1639,7 +1706,8 @@ test_that("lgb.train() works when you give a function for eval", { params = list( objective = "binary" , metric = "None" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds @@ -1651,7 +1719,7 @@ test_that("lgb.train() works when you give a function for eval", { # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] - 
expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) + expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < .LGB_NUMERIC_TOLERANCE) }) test_that("lgb.train() works with early stopping for regression with a metric that should be minimized", { @@ -1667,10 +1735,12 @@ test_that("lgb.train() works with early stopping for regression with a metric th dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid <- lgb.Dataset( data = as.matrix(validDF[["feat1"]], drop = FALSE) , label = validDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L @@ -1688,7 +1758,8 @@ test_that("lgb.train() works with early stopping for regression with a metric th ) , min_data_in_bin = 5L , early_stopping_rounds = early_stopping_rounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1720,6 +1791,7 @@ test_that("lgb.train() supports non-ASCII feature names", { dtrain <- lgb.Dataset( data = matrix(rnorm(400L), ncol = 4L) , label = rnorm(100L) + , params = list(num_threads = .LGB_MAX_THREADS) ) # content below is equivalent to # @@ -1739,7 +1811,8 @@ test_that("lgb.train() supports non-ASCII feature names", { , obj = "regression" , params = list( metric = "rmse" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , colnames = feature_names ) @@ -1749,7 +1822,7 @@ test_that("lgb.train() supports non-ASCII feature names", { # UTF-8 strings are not well-supported on Windows # * https://developer.r-project.org/Blog/public/2020/05/02/utf-8-support-on-windows/ # * https://developer.r-project.org/Blog/public/2020/07/30/windows/utf-8-build-of-r-and-cran-packages/index.html - if (UTF8_LOCALE && !ON_WINDOWS) { + if (.LGB_UTF8_LOCALE && !.LGB_ON_WINDOWS) { expect_identical( 
dumped_model[["feature_names"]] , feature_names @@ -1779,7 +1852,7 @@ test_that("lgb.train() works with integer, double, and numeric data", { , min_data_in_leaf = 1L , learning_rate = 0.01 , seed = 708L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) , nrounds = nrounds ) @@ -1792,7 +1865,7 @@ test_that("lgb.train() works with integer, double, and numeric data", { # should have achieved expected performance preds <- predict(bst, X) mae <- mean(abs(y - preds)) - expect_true(abs(mae - expected_mae) < TOLERANCE) + expect_true(abs(mae - expected_mae) < .LGB_NUMERIC_TOLERANCE) } }) @@ -1800,6 +1873,7 @@ test_that("lgb.train() updates params based on keyword arguments", { dtrain <- lgb.Dataset( data = matrix(rnorm(400L), ncol = 4L) , label = rnorm(100L) + , params = list(num_threads = .LGB_MAX_THREADS) ) # defaults from keyword arguments should be used if not specified in params @@ -1808,7 +1882,7 @@ test_that("lgb.train() updates params based on keyword arguments", { bst <- lgb.train( data = dtrain , obj = "regression" - , params = list() + , params = list(num_threads = .LGB_MAX_THREADS) ) }) ) @@ -1824,6 +1898,7 @@ test_that("lgb.train() updates params based on keyword arguments", { , params = list( "verbosity" = 5L , "num_iterations" = 2L + , num_threads = .LGB_MAX_THREADS ) ) }) @@ -1840,6 +1915,7 @@ test_that("lgb.train() updates params based on keyword arguments", { , params = list( "verbose" = 5L , "num_boost_round" = 2L + , num_threads = .LGB_MAX_THREADS ) ) }) @@ -1863,14 +1939,17 @@ test_that("when early stopping is not activated, best_iter and best_score come f dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid1 <- lgb.Dataset( data = as.matrix(validDF[["feat1"]], drop = FALSE) , label = validDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid2 <- lgb.Dataset( data = as.matrix(validDF[1L:10L, "feat1"], drop = FALSE) , 
label = validDF[1L:10L, "target"] + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L train_params <- list( @@ -1878,7 +1957,8 @@ test_that("when early stopping is not activated, best_iter and best_score come f , metric = "rmse" , learning_rate = 1.5 , num_leaves = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) # example 1: two valids, neither are the training data @@ -2020,10 +2100,12 @@ test_that("lightgbm.train() gives the correct best_score and best_iter for a met dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid1 <- lgb.Dataset( data = as.matrix(validDF[1L:25L, "feat1"], drop = FALSE) , label = validDF[1L:25L, "target"] + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L bst <- lgb.train( @@ -2038,7 +2120,8 @@ test_that("lightgbm.train() gives the correct best_score and best_iter for a met , metric = "auc" , learning_rate = 1.5 , num_leaves = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) # note that "something-random-we-would-not-hardcode" was recognized as the training @@ -2070,14 +2153,17 @@ test_that("using lightgbm() without early stopping, best_iter and best_score com dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid1 <- lgb.Dataset( data = as.matrix(validDF[1L:25L, "feat1"], drop = FALSE) , label = validDF[1L:25L, "target"] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid2 <- lgb.Dataset( data = as.matrix(validDF[26L:50L, "feat1"], drop = FALSE) , label = validDF[26L:50L, "target"] + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L bst <- lightgbm( @@ -2093,6 +2179,7 @@ test_that("using lightgbm() without early stopping, best_iter and best_score com , metric = "auc" , learning_rate = 
1.5 , num_leaves = 5L + , num_threads = .LGB_MAX_THREADS ) , verbose = -7L ) @@ -2119,7 +2206,8 @@ test_that("lgb.cv() works when you specify both 'metric' and 'eval' with strings params = list( objective = "binary" , metric = "binary_error" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds @@ -2137,8 +2225,8 @@ test_that("lgb.cv() works when you specify both 'metric' and 'eval' with strings # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid"]] - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5005654) < TOLERANCE) - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.7011232) < TOLERANCE) + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5005654) < .LGB_NUMERIC_TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.7011232) < .LGB_NUMERIC_TOLERANCE) # all boosters should have been created expect_length(bst$boosters, nfolds) @@ -2153,7 +2241,8 @@ test_that("lgb.cv() works when you give a function for eval", { params = list( objective = "binary" , metric = "None" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_CLASSIFICATION , nfold = nfolds @@ -2163,7 +2252,7 @@ test_that("lgb.cv() works when you give a function for eval", { # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid"]] - expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) + expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < .LGB_NUMERIC_TOLERANCE) expect_named(results, "constant_metric") }) @@ -2179,7 +2268,8 @@ test_that("If first_metric_only is TRUE, lgb.cv() decides to stop early based on , metric = "None" , early_stopping_rounds = early_stopping_rounds , first_metric_only = TRUE - , verbose 
= VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_REGRESSION , nfold = nfolds @@ -2236,7 +2326,8 @@ test_that("early stopping works with lgb.cv()", { , metric = "None" , early_stopping_rounds = early_stopping_rounds , first_metric_only = TRUE - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_REGRESSION , nfold = nfolds @@ -2286,11 +2377,12 @@ test_that("lgb.cv() respects changes to logging verbosity", { dtrain <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) # (verbose = 1) should be INFO and WARNING level logs lgb_cv_logs <- capture.output({ cv_bst <- lgb.cv( - params = list() + params = list(num_threads = .LGB_MAX_THREADS) , nfold = 2L , nrounds = 5L , data = dtrain @@ -2304,7 +2396,7 @@ test_that("lgb.cv() respects changes to logging verbosity", { # (verbose = 0) should be WARNING level logs only lgb_cv_logs <- capture.output({ cv_bst <- lgb.cv( - params = list() + params = list(num_threads = .LGB_MAX_THREADS) , nfold = 2L , nrounds = 5L , data = dtrain @@ -2318,7 +2410,7 @@ test_that("lgb.cv() respects changes to logging verbosity", { # (verbose = -1) no logs lgb_cv_logs <- capture.output({ cv_bst <- lgb.cv( - params = list() + params = list(num_threads = .LGB_MAX_THREADS) , nfold = 2L , nrounds = 5L , data = dtrain @@ -2336,6 +2428,7 @@ test_that("lgb.cv() updates params based on keyword arguments", { dtrain <- lgb.Dataset( data = matrix(rnorm(400L), ncol = 4L) , label = rnorm(100L) + , params = list(num_threads = .LGB_MAX_THREADS) ) # defaults from keyword arguments should be used if not specified in params @@ -2344,7 +2437,7 @@ test_that("lgb.cv() updates params based on keyword arguments", { cv_bst <- lgb.cv( data = dtrain , obj = "regression" - , params = list() + , params = list(num_threads = .LGB_MAX_THREADS) , nfold = 2L ) }) @@ -2365,6 +2458,7 @@ test_that("lgb.cv() updates params 
based on keyword arguments", { , params = list( "verbosity" = 5L , "num_iterations" = 2L + , num_threads = .LGB_MAX_THREADS ) , nfold = 2L ) @@ -2385,6 +2479,7 @@ test_that("lgb.cv() updates params based on keyword arguments", { , params = list( "verbose" = 5L , "num_boost_round" = 2L + , num_threads = .LGB_MAX_THREADS ) , nfold = 2L ) @@ -2407,15 +2502,17 @@ test_that("lgb.train() fit on linearly-relatead data improves when using linear return(lgb.Dataset( data = X , label = 2L * X + runif(nrow(X), 0L, 0.1) + , params = list(num_threads = .LGB_MAX_THREADS) )) } params <- list( objective = "regression" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L + , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() @@ -2446,15 +2543,17 @@ test_that("lgb.train() with linear learner fails already-constructed dataset wit set.seed(708L) params <- list( objective = "regression" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L + , num_threads = .LGB_MAX_THREADS ) dtrain <- lgb.Dataset( data = matrix(rnorm(100L), ncol = 1L) , label = rnorm(100L) + , params = list(num_threads = .LGB_MAX_THREADS) ) dtrain$construct() expect_error({ @@ -2480,15 +2579,17 @@ test_that("lgb.train() works with linear learners even if Dataset has missing va return(lgb.Dataset( data = X , label = 2L * X + runif(nrow(X), 0L, 0.1) + , params = list(num_threads = .LGB_MAX_THREADS) )) } params <- list( objective = "regression" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L + , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() @@ -2526,17 +2627,19 @@ test_that("lgb.train() works with linear learners, bagging, and a Dataset that h return(lgb.Dataset( data = X , label = 2L * X + runif(nrow(X), 0L, 0.1) + , params = list(num_threads = .LGB_MAX_THREADS) )) } params <- list( objective = "regression" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY 
, metric = "mse" , seed = 0L , num_leaves = 2L , bagging_freq = 1L , subsample = 0.8 + , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() @@ -2576,6 +2679,7 @@ test_that("lgb.train() works with linear learners and data where a feature has o , label = 2L * X[, 1L] + runif(nrow(X), 0L, 0.1) , params = list( feature_pre_filter = FALSE + , num_threads = .LGB_MAX_THREADS ) )) } @@ -2586,6 +2690,7 @@ test_that("lgb.train() works with linear learners and data where a feature has o , metric = "mse" , seed = 0L , num_leaves = 2L + , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() @@ -2606,6 +2711,7 @@ test_that("lgb.train() works with linear learners when Dataset has categorical f return(lgb.Dataset( data = X , label = 2L * X[, 1L] + runif(nrow(X), 0L, 0.1) + , params = list(num_threads = .LGB_MAX_THREADS) )) } @@ -2616,6 +2722,7 @@ test_that("lgb.train() works with linear learners when Dataset has categorical f , seed = 0L , num_leaves = 2L , categorical_feature = 1L + , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() @@ -2682,12 +2789,13 @@ test_that("lgb.train() throws an informative error if interaction_constraints co test_that(paste0("lgb.train() gives same result when interaction_constraints is specified as a list of ", "character vectors, numeric vectors, or a combination"), { set.seed(1L) - dtrain <- lgb.Dataset(train$data, label = train$label) + dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = .LGB_MAX_THREADS)) params <- list( objective = "regression" , interaction_constraints = list(c(1L, 2L), 3L) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- lightgbm( data = dtrain @@ -2700,7 +2808,8 @@ test_that(paste0("lgb.train() gives same result when interaction_constraints is params <- list( objective = "regression" , interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), cnames[[3L]]) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads 
= .LGB_MAX_THREADS ) bst <- lightgbm( data = dtrain @@ -2712,7 +2821,8 @@ test_that(paste0("lgb.train() gives same result when interaction_constraints is params <- list( objective = "regression" , interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), 3L) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- lightgbm( data = dtrain @@ -2728,12 +2838,13 @@ test_that(paste0("lgb.train() gives same result when interaction_constraints is test_that(paste0("lgb.train() gives same results when using interaction_constraints and specifying colnames"), { set.seed(1L) - dtrain <- lgb.Dataset(train$data, label = train$label) + dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = .LGB_MAX_THREADS)) params <- list( objective = "regression" , interaction_constraints = list(c(1L, 2L), 3L) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- lightgbm( data = dtrain @@ -2746,7 +2857,8 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai params <- list( objective = "regression" , interaction_constraints = list(c(new_colnames[1L], new_colnames[2L]), new_colnames[3L]) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- lightgbm( data = dtrain @@ -2796,6 +2908,7 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai , categorical_feature = categorical_features , free_raw_data = FALSE , colnames = c("feature_1", "feature_2", "feature_3") + , params = list(num_threads = .LGB_MAX_THREADS) )) } @@ -2890,7 +3003,8 @@ for (x3_to_categorical in c(TRUE, FALSE)) { , monotone_constraints = c(1L, -1L, 0L) , monotone_constraints_method = monotone_constraints_method , use_missing = FALSE - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) constrained_model <- lgb.train( params = params @@ -2912,9 +3026,9 @@ test_that("lightgbm() accepts 
objective as function argument and under params", bst1 <- lightgbm( data = train$data , label = train$label - , params = list(objective = "regression_l1") + , params = list(objective = "regression_l1", num_threads = .LGB_MAX_THREADS) , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) expect_equal(bst1$params$objective, "regression_l1") model_txt_lines <- strsplit( @@ -2930,7 +3044,7 @@ test_that("lightgbm() accepts objective as function argument and under params", , label = train$label , objective = "regression_l1" , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) expect_equal(bst2$params$objective, "regression_l1") model_txt_lines <- strsplit( @@ -2947,9 +3061,9 @@ test_that("lightgbm() prioritizes objective under params over objective as funct data = train$data , label = train$label , objective = "regression" - , params = list(objective = "regression_l1") + , params = list(objective = "regression_l1", num_threads = .LGB_MAX_THREADS) , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) expect_equal(bst1$params$objective, "regression_l1") model_txt_lines <- strsplit( @@ -2964,9 +3078,9 @@ test_that("lightgbm() prioritizes objective under params over objective as funct data = train$data , label = train$label , objective = "regression" - , params = list(loss = "regression_l1") + , params = list(loss = "regression_l1", num_threads = .LGB_MAX_THREADS) , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) expect_equal(bst2$params$objective, "regression_l1") model_txt_lines <- strsplit( @@ -2984,7 +3098,8 @@ test_that("lightgbm() accepts init_score as function argument", { , label = train$label , objective = "binary" , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , params = list(num_threads = .LGB_MAX_THREADS) ) pred1 <- predict(bst1, train$data, type = "raw") @@ -2994,7 +3109,8 @@ test_that("lightgbm() accepts init_score as function argument", { , init_score = pred1 , 
objective = "binary" , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , params = list(num_threads = .LGB_MAX_THREADS) ) pred2 <- predict(bst2, train$data, type = "raw") @@ -3006,7 +3122,8 @@ test_that("lightgbm() defaults to 'regression' objective if objective not otherw data = train$data , label = train$label , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , params = list(num_threads = .LGB_MAX_THREADS) ) expect_equal(bst$params$objective, "regression") model_txt_lines <- strsplit( @@ -3023,7 +3140,7 @@ test_that("lightgbm() accepts 'num_threads' as either top-level argument or unde data = train$data , label = train$label , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , num_threads = 1L ) expect_equal(bst$params$num_threads, 1L) @@ -3038,7 +3155,7 @@ test_that("lightgbm() accepts 'num_threads' as either top-level argument or unde data = train$data , label = train$label , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , params = list(num_threads = 1L) ) expect_equal(bst$params$num_threads, 1L) @@ -3053,7 +3170,7 @@ test_that("lightgbm() accepts 'num_threads' as either top-level argument or unde data = train$data , label = train$label , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , num_threads = 10L , params = list(num_threads = 1L) ) @@ -3077,10 +3194,11 @@ test_that("lightgbm() accepts 'weight' and 'weights'", { , weights = w , obj = "regression" , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , params = list( min_data_in_bin = 1L , min_data_in_leaf = 1L + , num_threads = .LGB_MAX_THREADS ) ) expect_equal(model$.__enclos_env__$private$train_set$get_field("weight"), w) @@ -3132,7 +3250,7 @@ test_that("lightgbm() accepts 'weight' and 'weights'", { expect_equal( object = unlist(record_evals[["valid"]][["auc"]][["eval"]]) , expected = expected_valid_auc - , tolerance = TOLERANCE + , tolerance = .LGB_NUMERIC_TOLERANCE ) expect_named(record_evals, 
c("start_iter", "valid"), ignore.order = TRUE, ignore.case = FALSE) expect_equal(record_evals[["valid"]][["auc"]][["eval_err"]], list()) @@ -3146,6 +3264,9 @@ test_that("lightgbm() accepts 'weight' and 'weights'", { , objective = "binary" , metric = "auc" , early_stopping_round = nrounds + , num_threads = .LGB_MAX_THREADS + # include a nonsense parameter just to trigger a WARN-level log + , nonsense_param = 1.0 ) if (!is.null(verbose_param)) { params[["verbose"]] <- verbose_param @@ -3162,6 +3283,7 @@ test_that("lightgbm() accepts 'weight' and 'weights'", { train_kwargs[["data"]] <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) train_kwargs[["valids"]] <- list( "valid" = lgb.Dataset(data = test$data, label = test$label) @@ -3535,7 +3657,7 @@ test_that("lightgbm() changes objective='auto' appropriately", { data("mtcars") y <- mtcars$mpg x <- as.matrix(mtcars[, -1L]) - model <- lightgbm(x, y, objective = "auto", verbose = VERBOSITY, nrounds = 5L) + model <- lightgbm(x, y, objective = "auto", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) expect_equal(model$params$objective, "regression") model_txt_lines <- strsplit( x = model$save_model_to_string() @@ -3548,7 +3670,7 @@ test_that("lightgbm() changes objective='auto' appropriately", { # Binary classification x <- train$data y <- factor(train$label) - model <- lightgbm(x, y, objective = "auto", verbose = VERBOSITY, nrounds = 5L) + model <- lightgbm(x, y, objective = "auto", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) expect_equal(model$params$objective, "binary") model_txt_lines <- strsplit( x = model$save_model_to_string() @@ -3561,7 +3683,7 @@ test_that("lightgbm() changes objective='auto' appropriately", { data("iris") y <- factor(iris$Species) x <- as.matrix(iris[, -5L]) - model <- lightgbm(x, y, objective = "auto", verbose = VERBOSITY, nrounds = 5L) + model <- lightgbm(x, y, objective = "auto", verbose = 
.LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) expect_equal(model$params$objective, "multiclass") expect_equal(model$params$num_class, 3L) model_txt_lines <- strsplit( @@ -3576,7 +3698,14 @@ test_that("lightgbm() determines number of classes for non-default multiclass ob data("iris") y <- factor(iris$Species) x <- as.matrix(iris[, -5L]) - model <- lightgbm(x, y, objective = "multiclassova", verbose = VERBOSITY, nrounds = 5L) + model <- lightgbm( + x + , y + , objective = "multiclassova" + , verbose = .LGB_VERBOSITY + , nrounds = 5L + , num_threads = .LGB_MAX_THREADS + ) expect_equal(model$params$objective, "multiclassova") expect_equal(model$params$num_class, 3L) model_txt_lines <- strsplit( @@ -3592,7 +3721,7 @@ test_that("lightgbm() doesn't accept binary classification with non-binary facto y <- factor(iris$Species) x <- as.matrix(iris[, -5L]) expect_error({ - lightgbm(x, y, objective = "binary", verbose = VERBOSITY, nrounds = 5L) + lightgbm(x, y, objective = "binary", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) }, regexp = "Factors with >2 levels as labels only allowed for multi-class objectives") }) @@ -3603,7 +3732,7 @@ test_that("lightgbm() doesn't accept multi-class classification with binary fact y <- factor(y) x <- as.matrix(iris[, -5L]) expect_error({ - lightgbm(x, y, objective = "multiclass", verbose = VERBOSITY, nrounds = 5L) + lightgbm(x, y, objective = "multiclass", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) }, regexp = "Two-level factors as labels only allowed for objective='binary'") }) @@ -3611,7 +3740,7 @@ test_that("lightgbm() model predictions retain factor levels for multiclass clas data("iris") y <- factor(iris$Species) x <- as.matrix(iris[, -5L]) - model <- lightgbm(x, y, objective = "auto", verbose = VERBOSITY, nrounds = 5L) + model <- lightgbm(x, y, objective = "auto", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) pred <- predict(model, x, type 
= "class") expect_true(is.factor(pred)) @@ -3630,7 +3759,7 @@ test_that("lightgbm() model predictions retain factor levels for binary classifi y[y == "setosa"] <- "versicolor" y <- factor(y) x <- as.matrix(iris[, -5L]) - model <- lightgbm(x, y, objective = "auto", verbose = VERBOSITY, nrounds = 5L) + model <- lightgbm(x, y, objective = "auto", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) pred <- predict(model, x, type = "class") expect_true(is.factor(pred)) @@ -3646,3 +3775,33 @@ test_that("lightgbm() model predictions retain factor levels for binary classifi expect_true(is.numeric(pred)) expect_false(any(pred %in% y)) }) + +test_that("lightgbm() accepts named categorical_features", { + data(mtcars) + y <- mtcars$mpg + x <- as.matrix(mtcars[, -1L]) + model <- lightgbm( + x + , y + , categorical_feature = "cyl" + , verbose = .LGB_VERBOSITY + , nrounds = 5L + , num_threads = .LGB_MAX_THREADS + ) + expect_true(length(model$params$categorical_feature) > 0L) +}) + +test_that("lightgbm() correctly sets objective when passing lgb.Dataset as input", { + data(mtcars) + y <- mtcars$mpg + x <- as.matrix(mtcars[, -1L]) + ds <- lgb.Dataset(x, label = y) + model <- lightgbm( + ds + , objective = "auto" + , verbose = .LGB_VERBOSITY + , nrounds = 5L + , num_threads = .LGB_MAX_THREADS + ) + expect_equal(model$params$objective, "regression") +}) diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R index 974430e1ab41..2c10b9d571dc 100644 --- a/R-package/tests/testthat/test_custom_objective.R +++ b/R-package/tests/testthat/test_custom_objective.R @@ -1,15 +1,9 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - data(agaricus.train, package = "lightgbm") data(agaricus.test, package = "lightgbm") dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label) dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label) watchlist <- list(eval = dtest, train 
= dtrain) -TOLERANCE <- 1e-6 - logregobj <- function(preds, dtrain) { labels <- get_field(dtrain, "label") preds <- 1.0 / (1.0 + exp(-preds)) @@ -38,7 +32,8 @@ param <- list( , learning_rate = 1.0 , objective = logregobj , metric = "auc" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) num_round <- 10L @@ -53,7 +48,8 @@ test_that("using a custom objective, custom eval, and no other metrics works", { params = list( num_leaves = 8L , learning_rate = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = 4L @@ -63,11 +59,11 @@ test_that("using a custom objective, custom eval, and no other metrics works", { ) expect_false(is.null(bst$record_evals)) expect_equal(bst$best_iter, 4L) - expect_true(abs(bst$best_score - 0.000621) < TOLERANCE) + expect_true(abs(bst$best_score - 0.000621) < .LGB_NUMERIC_TOLERANCE) eval_results <- bst$eval_valid(feval = evalerror)[[1L]] expect_true(eval_results[["data_name"]] == "eval") - expect_true(abs(eval_results[["value"]] - 0.0006207325) < TOLERANCE) + expect_true(abs(eval_results[["value"]] - 0.0006207325) < .LGB_NUMERIC_TOLERANCE) expect_true(eval_results[["name"]] == "error") expect_false(eval_results[["higher_better"]]) }) @@ -79,7 +75,7 @@ test_that("using a custom objective that returns wrong shape grad or hess raises bad_hess <- function(preds, dtrain) { return(list(grad = rep(1.0, length(preds)), hess = numeric(0L))) } - params <- list(num_leaves = 3L, verbose = VERBOSITY) + params <- list(num_leaves = 3L, verbose = .LGB_VERBOSITY) expect_error({ lgb.train(params = params, data = dtrain, obj = bad_grad) }, sprintf("Expected custom objective function to return grad with length %d, got 0.", nrow(dtrain))) diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index bf4c826ef448..cf68ce9262a3 100644 --- a/R-package/tests/testthat/test_dataset.R +++ 
b/R-package/tests/testthat/test_dataset.R @@ -1,7 +1,3 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - data(agaricus.train, package = "lightgbm") train_data <- agaricus.train$data[seq_len(1000L), ] train_label <- agaricus.train$label[seq_len(1000L)] @@ -16,7 +12,7 @@ test_that("lgb.Dataset: basic construction, saving, loading", { test_data , label = test_label , params = list( - verbose = VERBOSITY + verbose = .LGB_VERBOSITY ) ) # from dense matrix @@ -30,7 +26,7 @@ test_that("lgb.Dataset: basic construction, saving, loading", { dtest3 <- lgb.Dataset( tmp_file , params = list( - verbose = VERBOSITY + verbose = .LGB_VERBOSITY ) ) lgb.Dataset.construct(dtest3) @@ -133,7 +129,7 @@ test_that("Dataset$set_reference() updates categorical_feature, colnames, and pr dtrain$construct() bst <- Booster$new( train_set = dtrain - , params = list(verbose = -1L) + , params = list(verbose = -1L, num_threads = .LGB_MAX_THREADS) ) dtrain$.__enclos_env__$private$predictor <- bst$to_predictor() @@ -376,7 +372,7 @@ test_that("lgb.Dataset: should be able to run lgb.train() immediately after usin data = test_data , label = test_label , params = list( - verbose = VERBOSITY + verbose = .LGB_VERBOSITY ) ) tmp_file <- tempfile(pattern = "lgb.Dataset_") @@ -393,7 +389,8 @@ test_that("lgb.Dataset: should be able to run lgb.train() immediately after usin , metric = "binary_logloss" , num_leaves = 5L , learning_rate = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) # should be able to train right away @@ -410,7 +407,7 @@ test_that("lgb.Dataset: should be able to run lgb.cv() immediately after using l data = test_data , label = test_label , params = list( - verbosity = VERBOSITY + verbosity = .LGB_VERBOSITY ) ) tmp_file <- tempfile(pattern = "lgb.Dataset_") @@ -428,7 +425,8 @@ test_that("lgb.Dataset: should be able to run lgb.cv() immediately after using l , num_leaves = 5L , learning_rate = 1.0 , num_iterations = 5L - , 
verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) # should be able to train right away @@ -444,7 +442,7 @@ test_that("lgb.Dataset: should be able to use and retrieve long feature names", # set one feature to a value longer than the default buffer size used # in LGBM_DatasetGetFeatureNames_R feature_names <- names(iris) - long_name <- paste0(rep("a", 1000L), collapse = "") + long_name <- strrep("a", 1000L) feature_names[1L] <- long_name names(iris) <- feature_names # check that feature name survived the trip from R to C++ and back @@ -473,7 +471,7 @@ test_that("lgb.Dataset: should be able to create a Dataset from a text file with data = train_file , params = list( header = TRUE - , verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY ) ) dtrain$construct() @@ -497,7 +495,7 @@ test_that("lgb.Dataset: should be able to create a Dataset from a text file with data = train_file , params = list( header = FALSE - , verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY ) ) dtrain$construct() diff --git a/R-package/tests/testthat/test_learning_to_rank.R b/R-package/tests/testthat/test_learning_to_rank.R index c1c7768dac7d..6868794cf8ec 100644 --- a/R-package/tests/testthat/test_learning_to_rank.R +++ b/R-package/tests/testthat/test_learning_to_rank.R @@ -1,12 +1,3 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - -# numerical tolerance to use when checking metric values -TOLERANCE <- 1e-06 - -ON_32_BIT_WINDOWS <- .Platform$OS.type == "windows" && .Machine$sizeof.pointer != 8L - test_that("learning-to-rank with lgb.train() works as expected", { set.seed(708L) data(agaricus.train, package = "lightgbm") @@ -26,7 +17,8 @@ test_that("learning-to-rank with lgb.train() works as expected", { , ndcg_at = ndcg_at , lambdarank_truncation_level = 3L , learning_rate = 0.001 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) model <- lgb.train( params = params @@ -59,15 +51,15 @@ 
test_that("learning-to-rank with lgb.train() works as expected", { , eval_names ) expect_equal(eval_results[[1L]][["value"]], 0.775) - if (!ON_32_BIT_WINDOWS) { - expect_true(abs(eval_results[[2L]][["value"]] - 0.745986) < TOLERANCE) - expect_true(abs(eval_results[[3L]][["value"]] - 0.7351959) < TOLERANCE) + if (!.LGB_ON_32_BIT_WINDOWS) { + expect_true(abs(eval_results[[2L]][["value"]] - 0.745986) < .LGB_NUMERIC_TOLERANCE) + expect_true(abs(eval_results[[3L]][["value"]] - 0.7351959) < .LGB_NUMERIC_TOLERANCE) } }) test_that("learning-to-rank with lgb.cv() works as expected", { testthat::skip_if( - ON_32_BIT_WINDOWS + .LGB_ON_32_BIT_WINDOWS , message = "Skipping on 32-bit Windows" ) set.seed(708L) @@ -90,7 +82,8 @@ test_that("learning-to-rank with lgb.cv() works as expected", { , label_gain = "0,1,3" , min_data = 1L , learning_rate = 0.01 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) nfold <- 4L nrounds <- 10L @@ -113,7 +106,7 @@ test_that("learning-to-rank with lgb.cv() works as expected", { best_score <- cv_bst$best_score expect_true(best_iter > 0L && best_iter <= nrounds) expect_true(best_score > 0.0 && best_score < 1.0) - expect_true(abs(best_score - 0.75) < TOLERANCE) + expect_true(abs(best_score - 0.75) < .LGB_NUMERIC_TOLERANCE) # best_score should be set for the first metric first_metric <- eval_names[[1L]] @@ -136,19 +129,19 @@ test_that("learning-to-rank with lgb.cv() works as expected", { # first and last value of each metric should be as expected ndcg1_values <- c(0.675, 0.725, 0.65, 0.725, 0.75, 0.725, 0.75, 0.725, 0.75, 0.75) - expect_true(all(abs(unlist(eval_results[["ndcg@1"]][["eval"]]) - ndcg1_values) < TOLERANCE)) + expect_true(all(abs(unlist(eval_results[["ndcg@1"]][["eval"]]) - ndcg1_values) < .LGB_NUMERIC_TOLERANCE)) ndcg2_values <- c( 0.6556574, 0.6669721, 0.6306574, 0.6476294, 0.6629581, 0.6476294, 0.6629581, 0.6379581, 0.7113147, 0.6823008 ) - 
expect_true(all(abs(unlist(eval_results[["ndcg@2"]][["eval"]]) - ndcg2_values) < TOLERANCE)) + expect_true(all(abs(unlist(eval_results[["ndcg@2"]][["eval"]]) - ndcg2_values) < .LGB_NUMERIC_TOLERANCE)) ndcg3_values <- c( 0.6484639, 0.6571238, 0.6469279, 0.6540516, 0.6481857, 0.6481857, 0.6481857, 0.6466496, 0.7027939, 0.6629898 ) - expect_true(all(abs(unlist(eval_results[["ndcg@3"]][["eval"]]) - ndcg3_values) < TOLERANCE)) + expect_true(all(abs(unlist(eval_results[["ndcg@3"]][["eval"]]) - ndcg3_values) < .LGB_NUMERIC_TOLERANCE)) # check details of each booster for (bst in cv_bst$boosters) { diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 8ccfdaa336ae..5f398f1c081d 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -1,10 +1,3 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - -ON_WINDOWS <- .Platform$OS.type == "windows" -TOLERANCE <- 1e-6 - test_that("Booster$finalize() should not fail", { X <- as.matrix(as.integer(iris[, "Species"]), ncol = 1L) y <- iris[["Sepal.Length"]] @@ -13,8 +6,9 @@ test_that("Booster$finalize() should not fail", { data = dtrain , params = list( objective = "regression" + , num_threads = .LGB_MAX_THREADS ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 3L ) expect_true(lgb.is.Booster(bst)) @@ -65,7 +59,8 @@ test_that("lgb.get.eval.result() should throw an informative error for incorrect , metric = "l2" , min_data = 1L , learning_rate = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = 5L @@ -99,7 +94,7 @@ test_that("lgb.get.eval.result() should throw an informative error for incorrect , metric = "l2" , min_data = 1L , learning_rate = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) , data = dtrain , nrounds = 5L @@ -133,7 +128,7 @@ test_that("lgb.load() gives the expected error messages given 
different incorrec objective = "binary" , num_leaves = 4L , learning_rate = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) , nrounds = 2L ) @@ -184,7 +179,8 @@ test_that("Loading a Booster from a text file works", { , metric = c("mape", "average_precision") , learning_rate = 1.0 , objective = "binary" - , verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- lightgbm( data = as.matrix(train$data) @@ -227,13 +223,14 @@ test_that("boosters with linear models at leaves can be written to text file and , metric = "mse" , seed = 0L , num_leaves = 2L + , num_threads = .LGB_MAX_THREADS ) bst <- lgb.train( data = dtrain , nrounds = 10L , params = params - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) expect_true(lgb.is.Booster(bst)) @@ -267,7 +264,8 @@ test_that("Loading a Booster from a string works", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 2L ) @@ -299,9 +297,10 @@ test_that("Saving a large model to string should work", { num_leaves = 100L , learning_rate = 0.01 , objective = "binary" + , num_threads = .LGB_MAX_THREADS ) , nrounds = 500L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) pred <- predict(bst, train$data) @@ -342,9 +341,10 @@ test_that("Saving a large model to JSON should work", { num_leaves = 100L , learning_rate = 0.01 , objective = "binary" + , num_threads = .LGB_MAX_THREADS ) , nrounds = 200L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) model_json <- bst$dump_model() @@ -371,7 +371,8 @@ test_that("If a string and a file are both passed to lgb.load() the file is used num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 2L ) @@ -405,7 +406,8 @@ test_that("Creating a Booster from a Dataset should work", { bst <- Booster$new( params = list( objective = 
"binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ), train_set = dtrain ) @@ -426,7 +428,8 @@ test_that("Creating a Booster from a Dataset with an existing predictor should w num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds ) @@ -439,7 +442,8 @@ test_that("Creating a Booster from a Dataset with an existing predictor should w bst_from_ds <- Booster$new( train_set = dtest , params = list( - verbose = VERBOSITY + verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) expect_true(lgb.is.Booster(bst)) @@ -462,7 +466,8 @@ test_that("Booster$eval() should work on a Dataset stored in a binary file", { objective = "regression" , metric = "l2" , num_leaves = 4L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = 2L @@ -492,14 +497,14 @@ test_that("Booster$eval() should work on a Dataset stored in a binary file", { eval_from_file <- bst$eval( data = lgb.Dataset( data = test_file - , params = list(verbose = VERBOSITY) + , params = list(verbose = .LGB_VERBOSITY, num_threads = .LGB_MAX_THREADS) )$construct() , name = "test" ) - expect_true(abs(eval_in_mem[[1L]][["value"]] - 0.1744423) < TOLERANCE) + expect_true(abs(eval_in_mem[[1L]][["value"]] - 0.1744423) < .LGB_NUMERIC_TOLERANCE) # refer to https://github.com/microsoft/LightGBM/issues/4680 - if (isTRUE(ON_WINDOWS)) { + if (isTRUE(.LGB_ON_WINDOWS)) { expect_equal(eval_in_mem, eval_from_file) } else { expect_identical(eval_in_mem, eval_from_file) @@ -520,7 +525,8 @@ test_that("Booster$rollback_one_iter() should work as expected", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds ) @@ -554,7 +560,8 @@ test_that("Booster$update() passing a train_set works as expected", { 
num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds ) @@ -564,7 +571,7 @@ test_that("Booster$update() passing a train_set works as expected", { train_set = Dataset$new( data = agaricus.train$data , label = agaricus.train$label - , params = list(verbose = VERBOSITY) + , params = list(verbose = .LGB_VERBOSITY) ) ) expect_true(lgb.is.Booster(bst)) @@ -578,7 +585,8 @@ test_that("Booster$update() passing a train_set works as expected", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds + 1L ) @@ -603,7 +611,8 @@ test_that("Booster$update() throws an informative error if you provide a non-Dat num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds ) @@ -630,7 +639,8 @@ test_that("Booster should store parameters and Booster$reset_parameter() should , metric = c("multi_logloss", "multi_error") , boosting = "gbdt" , num_class = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- Booster$new( params = params @@ -657,7 +667,8 @@ test_that("Booster$params should include dataset params, before and after Booste objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- Booster$new( params = params @@ -669,7 +680,8 @@ test_that("Booster$params should include dataset params, before and after Booste objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS , max_bin = 17L ) ) @@ -680,7 +692,8 @@ test_that("Booster$params should include dataset params, before and after Booste objective = "binary" 
, max_depth = 4L , bagging_fraction = 0.9 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS , max_bin = 17L ) expect_identical(ret_bst$params, expected_params) @@ -698,7 +711,8 @@ test_that("Saving a model with different feature importance types works", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 2L ) @@ -753,7 +767,8 @@ test_that("Saving a model with unknown importance type fails", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 2L ) @@ -784,36 +799,166 @@ test_that("all parameters are stored correctly with save_model_to_string()", { data = matrix(rnorm(500L), nrow = 100L) , label = rnorm(100L) ) - nrounds <- 4L bst <- lgb.train( params = list( - objective = "regression" - , metric = "l2" + objective = "mape" + , metric = c("l2", "mae") + , num_threads = .LGB_MAX_THREADS + , seed = 708L + , data_sample_strategy = "bagging" + , sub_row = 0.8234 ) , data = dtrain - , nrounds = nrounds - , verbose = VERBOSITY - ) + , nrounds = 3L + , verbose = .LGB_VERBOSITY + ) + + # entries whose values should reflect params passed to lgb.train() + non_default_param_entries <- c( + "[objective: mape]" + # 'l1' was passed in with alias 'mae' + , "[metric: l2,l1]" + , "[data_sample_strategy: bagging]" + , "[seed: 708]" + # this was passed in with alias 'sub_row' + , "[bagging_fraction: 0.8234]" + , "[num_iterations: 3]" + ) + + # entries with default values of params + default_param_entries <- c( + "[boosting: gbdt]" + , "[tree_learner: serial]" + , "[device_type: cpu]" + , "[data: ]" + , "[valid: ]" + , "[learning_rate: 0.1]" + , "[num_leaves: 31]" + , sprintf("[num_threads: %i]", .LGB_MAX_THREADS) + , "[deterministic: 0]" + , "[histogram_pool_size: -1]" + , "[max_depth: -1]" + , "[min_data_in_leaf: 20]" + , 
"[min_sum_hessian_in_leaf: 0.001]" + , "[pos_bagging_fraction: 1]" + , "[neg_bagging_fraction: 1]" + , "[bagging_freq: 0]" + , "[bagging_seed: 15415]" + , "[feature_fraction: 1]" + , "[feature_fraction_bynode: 1]" + , "[feature_fraction_seed: 32671]" + , "[extra_trees: 0]" + , "[extra_seed: 6642]" + , "[early_stopping_round: 0]" + , "[first_metric_only: 0]" + , "[max_delta_step: 0]" + , "[lambda_l1: 0]" + , "[lambda_l2: 0]" + , "[linear_lambda: 0]" + , "[min_gain_to_split: 0]" + , "[drop_rate: 0.1]" + , "[max_drop: 50]" + , "[skip_drop: 0.5]" + , "[xgboost_dart_mode: 0]" + , "[uniform_drop: 0]" + , "[drop_seed: 20623]" + , "[top_rate: 0.2]" + , "[other_rate: 0.1]" + , "[min_data_per_group: 100]" + , "[max_cat_threshold: 32]" + , "[cat_l2: 10]" + , "[cat_smooth: 10]" + , "[max_cat_to_onehot: 4]" + , "[top_k: 20]" + , "[monotone_constraints: ]" + , "[monotone_constraints_method: basic]" + , "[monotone_penalty: 0]" + , "[feature_contri: ]" + , "[forcedsplits_filename: ]" + , "[force_col_wise: 0]" + , "[force_row_wise: 0]" + , "[refit_decay_rate: 0.9]" + , "[cegb_tradeoff: 1]" + , "[cegb_penalty_split: 0]" + , "[cegb_penalty_feature_lazy: ]" + , "[cegb_penalty_feature_coupled: ]" + , "[path_smooth: 0]" + , "[interaction_constraints: ]" + , sprintf("[verbosity: %i]", .LGB_VERBOSITY) + , "[saved_feature_importance_type: 0]" + , "[use_quantized_grad: 0]" + , "[num_grad_quant_bins: 4]" + , "[quant_train_renew_leaf: 0]" + , "[stochastic_rounding: 1]" + , "[linear_tree: 0]" + , "[max_bin: 255]" + , "[max_bin_by_feature: ]" + , "[min_data_in_bin: 3]" + , "[bin_construct_sample_cnt: 200000]" + , "[data_random_seed: 2350]" + , "[is_enable_sparse: 1]" + , "[enable_bundle: 1]" + , "[use_missing: 1]" + , "[zero_as_missing: 0]" + , "[feature_pre_filter: 1]" + , "[pre_partition: 0]" + , "[two_round: 0]" + , "[header: 0]" + , "[label_column: ]" + , "[weight_column: ]" + , "[group_column: ]" + , "[ignore_column: ]" + , "[categorical_feature: ]" + , "[forcedbins_filename: ]" + , 
"[precise_float_parser: 0]" + , "[parser_config_file: ]" + , "[objective_seed: 4309]" + , "[num_class: 1]" + , "[is_unbalance: 0]" + , "[scale_pos_weight: 1]" + , "[sigmoid: 1]" + , "[boost_from_average: 1]" + , "[reg_sqrt: 0]" + , "[alpha: 0.9]" + , "[fair_c: 1]" + , "[poisson_max_delta_step: 0.7]" + , "[tweedie_variance_power: 1.5]" + , "[lambdarank_truncation_level: 30]" + , "[lambdarank_norm: 1]" + , "[label_gain: ]" + , "[lambdarank_position_bias_regularization: 0]" + , "[eval_at: ]" + , "[multi_error_top_k: 1]" + , "[auc_mu_weights: ]" + , "[num_machines: 1]" + , "[local_listen_port: 12400]" + , "[time_out: 120]" + , "[machine_list_filename: ]" + , "[machines: ]" + , "[gpu_platform_id: -1]" + , "[gpu_device_id: -1]" + , "[gpu_use_dp: 0]" + , "[num_gpu: 1]" + ) + all_param_entries <- c(non_default_param_entries, default_param_entries) + # parameters should match what was passed from the R package model_str <- bst$save_model_to_string() params_in_file <- .params_from_model_string(model_str = model_str) - - # parameters should match what was passed from the R package - expect_equal(sum(startsWith(params_in_file, "[metric:")), 1L) - expect_equal(sum(params_in_file == "[metric: l2]"), 1L) - - expect_equal(sum(startsWith(params_in_file, "[num_iterations:")), 1L) - expect_equal(sum(params_in_file == "[num_iterations: 4]"), 1L) - - expect_equal(sum(startsWith(params_in_file, "[objective:")), 1L) - expect_equal(sum(params_in_file == "[objective: regression]"), 1L) - - expect_equal(sum(startsWith(params_in_file, "[verbosity:")), 1L) - expect_equal(sum(params_in_file == sprintf("[verbosity: %i]", VERBOSITY)), 1L) + .expect_in(all_param_entries, params_in_file) # early stopping should be off by default expect_equal(sum(startsWith(params_in_file, "[early_stopping_round:")), 1L) expect_equal(sum(params_in_file == "[early_stopping_round: 0]"), 1L) + + # since save_model_to_string() is used when serializing with saveRDS(), check that parameters all + # roundtrip 
saveRDS()/loadRDS() successfully + rds_file <- tempfile() + saveRDS(bst, rds_file) + bst_rds <- readRDS(rds_file) + model_str <- bst_rds$save_model_to_string() + params_in_file <- .params_from_model_string(model_str = model_str) + .expect_in(all_param_entries, params_in_file) }) test_that("early_stopping, num_iterations are stored correctly in model string even with aliases", { @@ -845,6 +990,7 @@ test_that("early_stopping, num_iterations are stored correctly in model string e , n_iter = n_iter , early_stopping_round = early_stopping_round , n_iter_no_change = n_iter_no_change + , num_threads = .LGB_MAX_THREADS ) bst <- lgb.train( @@ -855,7 +1001,7 @@ test_that("early_stopping, num_iterations are stored correctly in model string e , valids = list( "random_valid" = dvalid ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) model_str <- bst$save_model_to_string() @@ -884,9 +1030,10 @@ test_that("Booster: method calls Booster with a null handle should raise an info objective = "regression" , metric = "l2" , num_leaves = 8L + , num_threads = .LGB_MAX_THREADS ) , data = dtrain - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 5L , valids = list( train = dtrain @@ -962,7 +1109,7 @@ test_that("Booster$new() using a Dataset with a null handle should raise an info bst <- Booster$new( train_set = dtrain , params = list( - verbose = VERBOSITY + verbose = .LGB_VERBOSITY ) ) }, regexp = "Attempting to create a Dataset without any raw data") @@ -1073,7 +1220,8 @@ test_that("lgb.cv() correctly handles passing through params to the model file", , n_iter = n_iter , early_stopping_round = early_stopping_round , n_iter_no_change = n_iter_no_change - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) cv_bst <- lgb.cv( @@ -1082,7 +1230,7 @@ test_that("lgb.cv() correctly handles passing through params to the model file", , nrounds = nrounds_kwarg , early_stopping_rounds = early_stopping_round_kwarg , nfold = 3L - , verbose = 
VERBOSITY + , verbose = .LGB_VERBOSITY ) for (bst in cv_bst$boosters) { @@ -1117,7 +1265,8 @@ test_that("params (including dataset params) should be stored in .rds file for B objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- Booster$new( params = params @@ -1133,7 +1282,8 @@ test_that("params (including dataset params) should be stored in .rds file for B objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS , max_bin = 17L ) ) @@ -1152,7 +1302,8 @@ test_that("params (including dataset params) should be stored in .rds file for B objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- Booster$new( params = params @@ -1168,7 +1319,8 @@ test_that("params (including dataset params) should be stored in .rds file for B objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS , max_bin = 17L ) ) @@ -1182,8 +1334,9 @@ test_that("Handle is automatically restored when calling predict", { , nrounds = 5L , obj = "binary" , params = list( - verbose = VERBOSITY + verbose = .LGB_VERBOSITY ) + , num_threads = .LGB_MAX_THREADS ) bst_file <- tempfile(fileext = ".rds") saveRDS(bst, file = bst_file) @@ -1205,10 +1358,11 @@ test_that("boosters with linear models at leaves work with saveRDS.lgb.Booster a params <- list( objective = "regression" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L + , num_threads = .LGB_MAX_THREADS ) bst <- lgb.train( @@ -1244,10 +1398,11 @@ test_that("boosters with linear models at leaves can be written to RDS and re-lo params <- list( objective = "regression" - , verbose = VERBOSITY + , verbose = 
.LGB_VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L + , num_threads = .LGB_MAX_THREADS ) bst <- lgb.train( @@ -1308,7 +1463,7 @@ test_that("Booster's print, show, and summary work correctly", { .has_expected_content_for_fitted_model(log_txt) # summary() - log_text <- capture.output({ + log_txt <- capture.output({ ret <- summary(model) }) .have_same_handle(ret, model) @@ -1344,6 +1499,7 @@ test_that("Booster's print, show, and summary work correctly", { params = list( objective = "regression" , min_data_in_leaf = 1L + , num_threads = .LGB_MAX_THREADS ) , data = lgb.Dataset( as.matrix(mtcars[, -1L]) @@ -1352,19 +1508,19 @@ test_that("Booster's print, show, and summary work correctly", { min_data_in_bin = 1L ) ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 5L ) .check_methods_work(model) data("iris") model <- lgb.train( - params = list(objective = "multiclass", num_class = 3L) + params = list(objective = "multiclass", num_class = 3L, num_threads = .LGB_MAX_THREADS) , data = lgb.Dataset( as.matrix(iris[, -5L]) , label = as.numeric(factor(iris$Species)) - 1.0 ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 5L ) .check_methods_work(model) @@ -1397,8 +1553,9 @@ test_that("Booster's print, show, and summary work correctly", { ) , obj = .logregobj , eval = .evalerror - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 5L + , params = list(num_threads = .LGB_MAX_THREADS) ) .check_methods_work(model) @@ -1410,6 +1567,7 @@ test_that("LGBM_BoosterGetNumFeature_R returns correct outputs", { params = list( objective = "regression" , min_data_in_leaf = 1L + , num_threads = .LGB_MAX_THREADS ) , data = lgb.Dataset( as.matrix(mtcars[, -1L]) @@ -1418,7 +1576,7 @@ test_that("LGBM_BoosterGetNumFeature_R returns correct outputs", { min_data_in_bin = 1L ) ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 5L ) ncols <- .Call(LGBM_BoosterGetNumFeature_R, model$.__enclos_env__$private$handle) @@ -1431,7 +1589,7 @@ 
test_that("LGBM_BoosterGetNumFeature_R returns correct outputs", { as.matrix(iris[, -5L]) , label = as.numeric(factor(iris$Species)) - 1.0 ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 5L ) ncols <- .Call(LGBM_BoosterGetNumFeature_R, model$.__enclos_env__$private$handle) diff --git a/R-package/tests/testthat/test_lgb.interprete.R b/R-package/tests/testthat/test_lgb.interprete.R index 29ac110accbc..322a80a55bc5 100644 --- a/R-package/tests/testthat/test_lgb.interprete.R +++ b/R-package/tests/testthat/test_lgb.interprete.R @@ -1,7 +1,3 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - .sigmoid <- function(x) { 1.0 / (1.0 + exp(-x)) } @@ -30,7 +26,8 @@ test_that("lgb.intereprete works as expected for binary classification", { , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) model <- lgb.train( params = params @@ -82,7 +79,8 @@ test_that("lgb.intereprete works as expected for multiclass classification", { , num_class = 3L , learning_rate = 0.00001 , min_data = 1L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) model <- lgb.train( params = params diff --git a/R-package/tests/testthat/test_lgb.plot.importance.R b/R-package/tests/testthat/test_lgb.plot.importance.R index 1a1e2b0d5398..e7ff63facde5 100644 --- a/R-package/tests/testthat/test_lgb.plot.importance.R +++ b/R-package/tests/testthat/test_lgb.plot.importance.R @@ -1,7 +1,3 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - test_that("lgb.plot.importance() should run without error for well-formed inputs", { data(agaricus.train, package = "lightgbm") train <- agaricus.train @@ -13,7 +9,8 @@ test_that("lgb.plot.importance() should run without error for well-formed inputs , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 - , verbosity = VERBOSITY + , verbosity = 
.LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) model <- lgb.train(params, dtrain, 3L) tree_imp <- lgb.importance(model, percentage = TRUE) diff --git a/R-package/tests/testthat/test_lgb.plot.interpretation.R b/R-package/tests/testthat/test_lgb.plot.interpretation.R index bb8009d3595b..6cba9927942a 100644 --- a/R-package/tests/testthat/test_lgb.plot.interpretation.R +++ b/R-package/tests/testthat/test_lgb.plot.interpretation.R @@ -1,7 +1,3 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - .sigmoid <- function(x) { 1.0 / (1.0 + exp(-x)) } @@ -30,7 +26,8 @@ test_that("lgb.plot.interepretation works as expected for binary classification" , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 - , verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) model <- lgb.train( params = params @@ -80,12 +77,13 @@ test_that("lgb.plot.interepretation works as expected for multiclass classificat , num_class = 3L , learning_rate = 0.00001 , min_data = 1L + , num_threads = .LGB_MAX_THREADS ) model <- lgb.train( params = params , data = dtrain , nrounds = 3L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) num_trees <- 5L tree_interpretation <- lgb.interprete( diff --git a/R-package/tests/testthat/test_parameters.R b/R-package/tests/testthat/test_parameters.R index 3f98f8d2907e..367f01af817c 100644 --- a/R-package/tests/testthat/test_parameters.R +++ b/R-package/tests/testthat/test_parameters.R @@ -20,6 +20,7 @@ test_that("Feature penalties work properly", { , objective = "binary" , feature_penalty = paste0(feature_penalties, collapse = ",") , metric = "binary_error" + , num_threads = .LGB_MAX_THREADS ) , nrounds = 5L , verbose = -1L @@ -97,6 +98,7 @@ test_that("training should warn if you use 'dart' boosting, specified with 'boos , learning_rate = 0.05 , objective = "binary" , metric = "binary_error" + , num_threads = .LGB_MAX_THREADS ) params[[boosting_param]] <- "dart" expect_warning({ 
diff --git a/R-package/tests/testthat/test_weighted_loss.R b/R-package/tests/testthat/test_weighted_loss.R index d00399548560..f9f9675c3bb9 100644 --- a/R-package/tests/testthat/test_weighted_loss.R +++ b/R-package/tests/testthat/test_weighted_loss.R @@ -1,7 +1,3 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - test_that("Gamma regression reacts on 'weight'", { n <- 100L set.seed(87L) @@ -9,7 +5,7 @@ test_that("Gamma regression reacts on 'weight'", { y <- X[, 1L] + X[, 2L] + runif(n) X_pred <- X[1L:5L, ] - params <- list(objective = "gamma") + params <- list(objective = "gamma", num_threads = .LGB_MAX_THREADS) # Unweighted dtrain <- lgb.Dataset(X, label = y) @@ -17,7 +13,7 @@ test_that("Gamma regression reacts on 'weight'", { params = params , data = dtrain , nrounds = 4L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) pred_unweighted <- predict(bst, X_pred) @@ -31,7 +27,7 @@ test_that("Gamma regression reacts on 'weight'", { params = params , data = dtrain , nrounds = 4L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) pred_weighted_1 <- predict(bst, X_pred) @@ -45,7 +41,7 @@ test_that("Gamma regression reacts on 'weight'", { params = params , data = dtrain , nrounds = 4L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) pred_weighted_2 <- predict(bst, X_pred) @@ -59,7 +55,7 @@ test_that("Gamma regression reacts on 'weight'", { params = params , data = dtrain , nrounds = 4L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) pred_weighted <- predict(bst, X_pred) diff --git a/README.md b/README.md index a44d557f058b..f6f4e8c570e0 100644 --- a/README.md +++ b/README.md @@ -126,6 +126,8 @@ lightgbm-transform (feature transformation binding): https://github.com/microsof `postgresml` (LightGBM training and prediction in SQL, via a Postgres extension): https://github.com/postgresml/postgresml +`vaex-ml` (Python DataFrame library with its own interface to LightGBM): https://github.com/vaexio/vaex + Support ------- diff 
--git a/VERSION.txt b/VERSION.txt index 200681852af8..1f06da0058c9 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -4.0.0.99 +4.1.0.99 diff --git a/build-python.sh b/build-python.sh index e535eeb06abc..8892ca2e936f 100755 --- a/build-python.sh +++ b/build-python.sh @@ -48,6 +48,9 @@ # Compile with MinGW. # --mpi # Compile MPI version. +# --no-isolation +# Assume all build and install dependencies are already installed, +# don't go to the internet to get them. # --nomp # Compile version without OpenMP support. # --precompile @@ -159,6 +162,10 @@ while [ $# -gt 0 ]; do --mpi) BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.USE_MPI=ON" ;; + --no-isolation) + BUILD_ARGS="${BUILD_ARGS} --no-isolation" + PIP_INSTALL_ARGS="${PIP_INSTALL_ARGS} --no-build-isolation" + ;; --nomp) BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.USE_OPENMP=OFF" ;; @@ -337,6 +344,7 @@ if test "${BUILD_SDIST}" = true; then python -m build \ --sdist \ --outdir ../dist \ + ${BUILD_ARGS} \ . fi diff --git a/docker/dockerfile-python b/docker/dockerfile-python index 541884811a0b..900d05c30012 100644 --- a/docker/dockerfile-python +++ b/docker/dockerfile-python @@ -26,6 +26,7 @@ RUN apt-get update && \ # lightgbm conda install -q -y numpy scipy scikit-learn pandas && \ git clone --recursive --branch stable --depth 1 https://github.com/Microsoft/LightGBM && \ + cd ./LightGBM && \ sh ./build-python.sh install && \ # clean apt-get autoremove -y && apt-get clean && \ diff --git a/docs/.linkcheckerrc b/docs/.linkcheckerrc index e6ab4ea1a5df..96fdcbd08157 100644 --- a/docs/.linkcheckerrc +++ b/docs/.linkcheckerrc @@ -9,6 +9,7 @@ threads=1 ignore= pythonapi/lightgbm\..*\.html.* http.*amd.com/.* + https.*dl.acm.org/doi/.* https.*tandfonline.com/.* ignorewarnings=http-robots-denied,https-certificate-error checkextern=1 diff --git a/docs/Advanced-Topics.rst b/docs/Advanced-Topics.rst index d1787b998479..345a1361bfa9 100644 --- a/docs/Advanced-Topics.rst +++ b/docs/Advanced-Topics.rst @@ 
-77,3 +77,44 @@ Recommendations for gcc Users (MinGW, \*nix) -------------------------------------------- - Refer to `gcc Tips <./gcc-Tips.rst>`__. + +Support for Position Bias Treatment +------------------------------------ + +Often the relevance labels provided in Learning-to-Rank tasks might be derived from implicit user feedback (e.g., clicks) and therefore might be biased due to their position/location on the screen when having been presented to a user. +LightGBM can make use of positional data. + +For example, consider the case where you expect that the first 3 results from a search engine will be visible in users' browsers without scrolling, and all other results for a query would require scrolling. + +LightGBM could be told to account for the position bias from results being "above the fold" by providing a ``positions`` array encoded as follows: + +:: + + 0 + 0 + 0 + 1 + 1 + 0 + 0 + 0 + 1 + ... + +Where ``0 = "above the fold"`` and ``1 = "requires scrolling"``. +The specific values are not important, as long as they are consistent across all observations in the training data. +An encoding like ``100 = "above the fold"`` and ``17 = "requires scrolling"`` would result in exactly the same trained model. + +In that way, ``positions`` in LightGBM's API are similar to a categorical feature. +Just as with non-ordinal categorical features, an integer representation is just used for memory and computational efficiency... LightGBM does not care about the absolute or relative magnitude of the values. + +Unlike a categorical feature, however, ``positions`` are used to adjust the target to reduce the bias in predictions made by the trained model. + +The position file corresponds with training data file line by line, and has one position per line. And if the name of training data file is ``train.txt``, the position file should be named as ``train.txt.position`` and placed in the same folder as the data file. 
+In this case, LightGBM will load the position file automatically if it exists. The positions can also be specified through the ``Dataset`` constructor when using Python API. If the positions are specified in both approaches, the ``.position`` file will be ignored. + +Currently, implemented is an approach to model position bias by using an idea of Generalized Additive Models (`GAM <https://en.wikipedia.org/wiki/Generalized_additive_model>`_) to linearly decompose the document score ``s`` into the sum of a relevance component ``f`` and a positional component ``g``: ``s(x, pos) = f(x) + g(pos)`` where the former component depends on the original query-document features and the latter depends on the position of an item. +During the training, the compound scoring function ``s(x, pos)`` is fit with a standard ranking algorithm (e.g., LambdaMART) which boils down to jointly learning the relevance component ``f(x)`` (it is later returned as an unbiased model) and the position factors ``g(pos)`` that help better explain the observed (biased) labels. +Similar score decomposition ideas have previously been applied for classification & pointwise ranking tasks with assumptions of binary labels and binary relevance (a.k.a. "two-tower" models, refer to the papers: `Towards Disentangling Relevance and Bias in Unbiased Learning to Rank `_, `PAL: a position-bias aware learning framework for CTR prediction in live recommender systems `_, `A General Framework for Debiasing in CTR Prediction `_). +In LightGBM, we adapt this idea to general pairwise Learning-to-Rank with arbitrary ordinal relevance labels. +Besides, GAMs have been used in the context of explainable ML (`Accurate Intelligible Models with Pairwise Interactions `_) to linearly decompose the contribution of each feature (and possibly their pairwise interactions) to the overall score, for subsequent analysis and interpretation of their effects in the trained models. 
diff --git a/docs/GPU-Windows.rst b/docs/GPU-Windows.rst index aa1cb2036f4e..36e657e5801b 100644 --- a/docs/GPU-Windows.rst +++ b/docs/GPU-Windows.rst @@ -602,9 +602,9 @@ And open an issue in GitHub `here`_ with that log. .. _Boost: https://www.boost.org/users/history/ -.. _Prebuilt Boost x86_64: https://www.rpmfind.net/linux/fedora/linux/releases/36/Everything/x86_64/os/Packages/m/mingw64-boost-static-1.75.0-6.fc36.noarch.rpm +.. _Prebuilt Boost x86_64: https://www.rpmfind.net/linux/fedora/linux/releases/38/Everything/x86_64/os/Packages/m/mingw64-boost-static-1.78.0-4.fc38.noarch.rpm -.. _Prebuilt Boost i686: https://www.rpmfind.net/linux/fedora/linux/releases/36/Everything/x86_64/os/Packages/m/mingw32-boost-static-1.75.0-6.fc36.noarch.rpm +.. _Prebuilt Boost i686: https://www.rpmfind.net/linux/fedora/linux/releases/38/Everything/x86_64/os/Packages/m/mingw32-boost-static-1.78.0-4.fc38.noarch.rpm .. _7zip: https://www.7-zip.org/download.html diff --git a/docs/Parallel-Learning-Guide.rst b/docs/Parallel-Learning-Guide.rst index 438fd3f9ee0c..e1857034e499 100644 --- a/docs/Parallel-Learning-Guide.rst +++ b/docs/Parallel-Learning-Guide.rst @@ -514,7 +514,7 @@ See `the mars documentation`_ for usage examples. .. _SynapseML: https://aka.ms/spark -.. _this SynapseML example: https://github.com/microsoft/SynapseML/blob/master/notebooks/features/lightgbm/LightGBM%20-%20Overview.ipynb +.. _this SynapseML example: https://github.com/microsoft/SynapseML/tree/master/docs/Explore%20Algorithms/LightGBM .. 
_the Dask Array documentation: https://docs.dask.org/en/latest/array.html diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 5eecc27889b6..86104ba5be55 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -1137,6 +1137,12 @@ Objective Parameters - separate by ``,`` +- ``lambdarank_position_bias_regularization`` :raw-html:`🔗︎`, default = ``0.0``, type = double, constraints: ``lambdarank_position_bias_regularization >= 0.0`` + + - used only in ``lambdarank`` application when positional information is provided and position bias is modeled. Larger values reduce the inferred position bias factors. + + - *New in version 4.1.0* + Metric Parameters ----------------- diff --git a/external_libs/fast_double_parser b/external_libs/fast_double_parser index ace60646c02d..efec03532ef6 160000 --- a/external_libs/fast_double_parser +++ b/external_libs/fast_double_parser @@ -1 +1 @@ -Subproject commit ace60646c02dc54c57f19d644e49a61e7e7758ec +Subproject commit efec03532ef65984786e5e32dbc81f6e6a55a115 diff --git a/external_libs/fmt b/external_libs/fmt index b6f4ceaed0a0..f5e54359df4c 160000 --- a/external_libs/fmt +++ b/external_libs/fmt @@ -1 +1 @@ -Subproject commit b6f4ceaed0a0a24ccf575fab6c56dd50ccf6f1a9 +Subproject commit f5e54359df4c26b6230fc61d38aa294581393084 diff --git a/helpers/parameter_generator.py b/helpers/parameter_generator.py index 407f2c73e1e3..a554ee60b6c9 100644 --- a/helpers/parameter_generator.py +++ b/helpers/parameter_generator.py @@ -330,7 +330,7 @@ def gen_parameter_code( str_to_write += ' std::string tmp_str = "";\n' for x in infos: for y in x: - if "[doc-only]" in y: + if "[no-automatically-extract]" in y: continue param_type = y["inner_type"][0] name = y["name"][0] @@ -345,7 +345,7 @@ def gen_parameter_code( str_to_write += " std::stringstream str_buf;\n" for x in infos: for y in x: - if "[doc-only]" in y or "[no-save]" in y: + if "[no-save]" in y: continue param_type = y["inner_type"][0] name = y["name"][0] diff --git 
a/include/LightGBM/config.h b/include/LightGBM/config.h index e01578396259..6d61bc764924 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -5,8 +5,13 @@ * \note * - desc and descl2 fields must be written in reStructuredText format; * - nested sections can be placed only at the bottom of parent's section; - * - [doc-only] tag indicates that only documentation for this param should be generated and all other actions are performed manually; - * - [no-save] tag indicates that this param should not be saved into a model text representation. + * - [no-automatically-extract] + * - do not automatically extract this parameter into a Config property with the same name in Config::GetMembersFromString(). Use if: + * - specialized extraction logic for this param exists in Config::GetMembersFromString() + * - [no-save] + * - this param should not be saved into a model text representation via Config::SaveMembersToString(). Use if: + * - param is only used by the CLI (especially the "predict" and "convert_model" tasks) + * - param is related to LightGBM writing files (e.g. 
"output_model", "save_binary") */ #ifndef LIGHTGBM_CONFIG_H_ #define LIGHTGBM_CONFIG_H_ @@ -97,15 +102,15 @@ struct Config { #pragma region Core Parameters #endif // __NVCC__ + // [no-automatically-extract] // [no-save] - // [doc-only] // alias = config_file // desc = path of config file // desc = **Note**: can be used only in CLI version std::string config = ""; + // [no-automatically-extract] // [no-save] - // [doc-only] // type = enum // default = train // options = train, predict, convert_model, refit @@ -118,7 +123,8 @@ struct Config { // desc = **Note**: can be used only in CLI version; for language-specific packages you can use the correspondent functions TaskType task = TaskType::kTrain; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // options = regression, regression_l1, huber, fair, poisson, quantile, mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy, cross_entropy_lambda, lambdarank, rank_xendcg // alias = objective_type, app, application, loss @@ -150,7 +156,8 @@ struct Config { // descl2 = label should be ``int`` type, and larger number represents the higher relevance (e.g. 
0:bad, 1:fair, 2:good, 3:perfect) std::string objective = "regression"; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // alias = boosting_type, boost // options = gbdt, rf, dart @@ -160,7 +167,7 @@ struct Config { // descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations std::string boosting = "gbdt"; - // [doc-only] + // [no-automatically-extract] // type = enum // options = bagging, goss // desc = ``bagging``, Randomly Bagging Sampling @@ -200,7 +207,8 @@ struct Config { // desc = max number of leaves in one tree int num_leaves = kDefaultNumLeaves; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // options = serial, feature, data, voting // alias = tree, tree_type, tree_learner_type @@ -222,7 +230,8 @@ struct Config { // desc = **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors int num_threads = 0; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // options = cpu, gpu, cuda // alias = device @@ -235,7 +244,7 @@ struct Config { // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support std::string device_type = "cpu"; - // [doc-only] + // [no-automatically-extract] // alias = random_seed, random_state // default = None // desc = this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``, etc. 
@@ -593,7 +602,6 @@ struct Config { // desc = **Note**: can be used only in CLI version int snapshot_freq = -1; - // [no-save] // desc = whether to use gradient quantization when training // desc = enabling this will discretize (quantize) the gradients and hessians into bins of ``num_grad_quant_bins`` // desc = with quantized training, most arithmetics in the training process will be integer operations @@ -602,21 +610,18 @@ struct Config { // desc = *New in version 4.0.0* bool use_quantized_grad = false; - // [no-save] // desc = number of bins to quantization gradients and hessians // desc = with more bins, the quantized training will be closer to full precision training // desc = **Note**: can be used only with ``device_type = cpu`` // desc = *New in 4.0.0* int num_grad_quant_bins = 4; - // [no-save] // desc = whether to renew the leaf values with original gradients when quantized training // desc = renewing is very helpful for good quantized training accuracy for ranking objectives // desc = **Note**: can be used only with ``device_type = cpu`` // desc = *New in 4.0.0* bool quant_train_renew_leaf = false; - // [no-save] // desc = whether to use stochastic rounding in gradient quantization // desc = *New in 4.0.0* bool stochastic_rounding = true; @@ -965,13 +970,19 @@ struct Config { // desc = separate by ``,`` std::vector label_gain; + // check = >=0.0 + // desc = used only in ``lambdarank`` application when positional information is provided and position bias is modeled. Larger values reduce the inferred position bias factors. 
+ // desc = *New in version 4.1.0* + double lambdarank_position_bias_regularization = 0.0; + #ifndef __NVCC__ #pragma endregion #pragma region Metric Parameters #endif // __NVCC__ - // [doc-only] + // [no-automatically-extract] + // [no-save] // alias = metrics, metric_types // default = "" // type = multi-enum diff --git a/include/LightGBM/cuda/cuda_column_data.hpp b/include/LightGBM/cuda/cuda_column_data.hpp index 6668c92f2921..5b2301ac8de3 100644 --- a/include/LightGBM/cuda/cuda_column_data.hpp +++ b/include/LightGBM/cuda/cuda_column_data.hpp @@ -98,6 +98,7 @@ class CUDAColumnData { void ResizeWhenCopySubrow(const data_size_t num_used_indices); + int gpu_device_id_; int num_threads_; data_size_t num_data_; int num_columns_; diff --git a/include/LightGBM/cuda/cuda_metric.hpp b/include/LightGBM/cuda/cuda_metric.hpp index 5eb04c81c777..9186ceea160b 100644 --- a/include/LightGBM/cuda/cuda_metric.hpp +++ b/include/LightGBM/cuda/cuda_metric.hpp @@ -9,6 +9,7 @@ #ifdef USE_CUDA +#include #include namespace LightGBM { @@ -19,6 +20,8 @@ class CUDAMetricInterface: public HOST_METRIC { explicit CUDAMetricInterface(const Config& config): HOST_METRIC(config) { cuda_labels_ = nullptr; cuda_weights_ = nullptr; + const int gpu_device_id = config.gpu_device_id >= 0 ? 
config.gpu_device_id : 0; + SetCUDADevice(gpu_device_id, __FILE__, __LINE__); } void Init(const Metadata& metadata, data_size_t num_data) override { diff --git a/include/LightGBM/cuda/cuda_objective_function.hpp b/include/LightGBM/cuda/cuda_objective_function.hpp index dacaf252f8e6..fae8aa7ec643 100644 --- a/include/LightGBM/cuda/cuda_objective_function.hpp +++ b/include/LightGBM/cuda/cuda_objective_function.hpp @@ -21,7 +21,10 @@ namespace LightGBM { template class CUDAObjectiveInterface: public HOST_OBJECTIVE { public: - explicit CUDAObjectiveInterface(const Config& config): HOST_OBJECTIVE(config) {} + explicit CUDAObjectiveInterface(const Config& config): HOST_OBJECTIVE(config) { + const int gpu_device_id = config.gpu_device_id >= 0 ? config.gpu_device_id : 0; + SetCUDADevice(gpu_device_id, __FILE__, __LINE__); + } explicit CUDAObjectiveInterface(const std::vector& strs): HOST_OBJECTIVE(strs) {} diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 46ac5a9149d7..953bf9f12e88 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -28,6 +28,8 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = void SetCUDADevice(int gpu_device_id, const char* file, int line); +int GetCUDADevice(const char* file, int line); + template void AllocateCUDAMemory(T** out_ptr, size_t size, const char* file, const int line) { void* tmp_ptr = nullptr; diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 825c5c6ebcf8..e7baa42dc2e6 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -114,6 +114,8 @@ class Metadata { void SetQuery(const data_size_t* query, data_size_t len); + void SetPosition(const data_size_t* position, data_size_t len); + /*! * \brief Set initial scores * \param init_score Initial scores, this class will manage memory for init_score. @@ -213,6 +215,38 @@ class Metadata { } } + /*! 
+ * \brief Get positions, if does not exist then return nullptr + * \return Pointer of positions + */ + inline const data_size_t* positions() const { + if (!positions_.empty()) { + return positions_.data(); + } else { + return nullptr; + } + } + + /*! + * \brief Get position IDs, if does not exist then return nullptr + * \return Pointer of position IDs + */ + inline const std::string* position_ids() const { + if (!position_ids_.empty()) { + return position_ids_.data(); + } else { + return nullptr; + } + } + + /*! + * \brief Get Number of different position IDs + * \return number of different position IDs + */ + inline size_t num_position_ids() const { + return position_ids_.size(); + } + /*! * \brief Get data boundaries on queries, if not exists, will return nullptr * we assume data will order by query, @@ -289,6 +323,8 @@ class Metadata { private: /*! \brief Load wights from file */ void LoadWeights(); + /*! \brief Load positions from file */ + void LoadPositions(); /*! \brief Load query boundaries from file */ void LoadQueryBoundaries(); /*! \brief Calculate query weights from queries */ @@ -309,10 +345,16 @@ class Metadata { data_size_t num_data_; /*! \brief Number of weights, used to check correct weight file */ data_size_t num_weights_; + /*! \brief Number of positions, used to check correct position file */ + data_size_t num_positions_; /*! \brief Label data */ std::vector label_; /*! \brief Weights data */ std::vector weights_; + /*! \brief Positions data */ + std::vector positions_; + /*! \brief Position identifiers */ + std::vector position_ids_; /*! \brief Query boundaries */ std::vector query_boundaries_; /*! \brief Query weights */ @@ -328,6 +370,7 @@ class Metadata { /*! 
\brief mutex for threading safe call */ std::mutex mutex_; bool weight_load_from_file_; + bool position_load_from_file_; bool query_load_from_file_; bool init_score_load_from_file_; #ifdef USE_CUDA diff --git a/python-package/README.rst b/python-package/README.rst index c83307916163..bf9874e1227c 100644 --- a/python-package/README.rst +++ b/python-package/README.rst @@ -256,7 +256,14 @@ If you get any errors during installation or due to any other reasons, you may w Build Wheel File **************** -You can use ``sh ./build-python.sh install bdist_wheel`` instead of ``sh ./build-python.sh install`` to build wheel file and use it for installation later. This might be useful for systems with restricted or completely without network access. +You can use ``sh ./build-python.sh install bdist_wheel`` to build a wheel file but not install it. + +That script requires some dependencies like ``build``, ``scikit-build-core``, and ``wheel``. +In environments with restricted or no internet access, install those tools and then pass ``--no-isolation``. + +.. 
code:: sh + + sh ./build-python.sh bdist_wheel --no-isolation Build With MSBuild ****************** diff --git a/python-package/lightgbm/__init__.py b/python-package/lightgbm/__init__.py index 5815bc602bde..0dc5b75cfdf2 100644 --- a/python-package/lightgbm/__init__.py +++ b/python-package/lightgbm/__init__.py @@ -6,7 +6,7 @@ from pathlib import Path from .basic import Booster, Dataset, Sequence, register_logger -from .callback import early_stopping, log_evaluation, record_evaluation, reset_parameter +from .callback import EarlyStopException, early_stopping, log_evaluation, record_evaluation, reset_parameter from .engine import CVBooster, cv, train try: @@ -32,5 +32,5 @@ 'train', 'cv', 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker', 'DaskLGBMRegressor', 'DaskLGBMClassifier', 'DaskLGBMRanker', - 'log_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', + 'log_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', 'EarlyStopException', 'plot_importance', 'plot_split_value_histogram', 'plot_metric', 'plot_tree', 'create_tree_digraph'] diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index caa71fed47e5..3dfa583a62bb 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -24,6 +24,13 @@ if TYPE_CHECKING: from typing import Literal + # typing.TypeGuard was only introduced in Python 3.10 + try: + from typing import TypeGuard + except ImportError: + from typing_extensions import TypeGuard + + __all__ = [ 'Booster', 'Dataset', @@ -54,6 +61,7 @@ _LGBM_EvalFunctionResultType = Tuple[str, float, bool] _LGBM_BoosterBestScoreType = Dict[str, Dict[str, float]] _LGBM_BoosterEvalMethodResultType = Tuple[str, str, float, bool] +_LGBM_BoosterEvalMethodResultWithStandardDeviationType = Tuple[str, str, float, bool, float] _LGBM_CategoricalFeatureConfiguration = Union[List[str], List[int], "Literal['auto']"] _LGBM_FeatureNameConfiguration = Union[List[str], "Literal['auto']"] 
_LGBM_GroupType = Union[ @@ -62,6 +70,10 @@ np.ndarray, pd_Series ] +_LGBM_PositionType = Union[ + np.ndarray, + pd_Series +] _LGBM_InitScoreType = Union[ List[float], List[List[float]], @@ -126,7 +138,7 @@ class _MissingType(Enum): class _DummyLogger: def info(self, msg: str) -> None: - print(msg) + print(msg) # noqa: T201 def warning(self, msg: str) -> None: warnings.warn(msg, stacklevel=3) @@ -274,6 +286,20 @@ def _is_1d_list(data: Any) -> bool: return isinstance(data, list) and (not data or _is_numeric(data[0])) +def _is_list_of_numpy_arrays(data: Any) -> "TypeGuard[List[np.ndarray]]": + return ( + isinstance(data, list) + and all(isinstance(x, np.ndarray) for x in data) + ) + + +def _is_list_of_sequences(data: Any) -> "TypeGuard[List[Sequence]]": + return ( + isinstance(data, list) + and all(isinstance(x, Sequence) for x in data) + ) + + def _is_1d_collection(data: Any) -> bool: """Check whether data is a 1-D collection.""" return ( @@ -453,7 +479,7 @@ def _get_all_param_aliases() -> Dict[str, List[str]]: buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_DumpParamAliases( ctypes.c_int64(buffer_len), ctypes.byref(tmp_out_len), @@ -462,16 +488,15 @@ def _get_all_param_aliases() -> Dict[str, List[str]]: # if buffer length is not long enough, re-allocate a buffer if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_DumpParamAliases( ctypes.c_int64(actual_len), ctypes.byref(tmp_out_len), ptr_string_buffer)) - aliases = json.loads( + return json.loads( string_buffer.value.decode('utf-8'), object_hook=lambda obj: {k: [k] + v for k, v in obj.items()} ) - 
return aliases @classmethod def get(cls, *args) -> Set[str]: @@ -578,7 +603,8 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va "label": _C_API_DTYPE_FLOAT32, "weight": _C_API_DTYPE_FLOAT32, "init_score": _C_API_DTYPE_FLOAT64, - "group": _C_API_DTYPE_INT32 + "group": _C_API_DTYPE_INT32, + "position": _C_API_DTYPE_INT32 } """String name to int feature importance type mapper""" @@ -664,57 +690,52 @@ def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None: def _data_from_pandas( - data, - feature_name: Optional[_LGBM_FeatureNameConfiguration], - categorical_feature: Optional[_LGBM_CategoricalFeatureConfiguration], + data: pd_DataFrame, + feature_name: _LGBM_FeatureNameConfiguration, + categorical_feature: _LGBM_CategoricalFeatureConfiguration, pandas_categorical: Optional[List[List]] -): - if isinstance(data, pd_DataFrame): - if len(data.shape) != 2 or data.shape[0] < 1: - raise ValueError('Input data must be 2 dimensional and non empty.') - if feature_name == 'auto' or feature_name is None: - data = data.rename(columns=str, copy=False) - cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)] - cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered] - if pandas_categorical is None: # train dataset - pandas_categorical = [list(data[col].cat.categories) for col in cat_cols] - else: - if len(cat_cols) != len(pandas_categorical): - raise ValueError('train and valid dataset categorical_feature do not match.') - for col, category in zip(cat_cols, pandas_categorical): - if list(data[col].cat.categories) != list(category): - data[col] = data[col].cat.set_categories(category) - if len(cat_cols): # cat_cols is list - data = data.copy(deep=False) # not alter origin DataFrame - data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan}) - if categorical_feature is not None: - if feature_name is None: - feature_name = 
list(data.columns) - if categorical_feature == 'auto': # use cat cols from DataFrame - categorical_feature = cat_cols_not_ordered - else: # use cat cols specified by user - categorical_feature = list(categorical_feature) # type: ignore[assignment] - if feature_name == 'auto': - feature_name = list(data.columns) - _check_for_bad_pandas_dtypes(data.dtypes) - df_dtypes = [dtype.type for dtype in data.dtypes] - df_dtypes.append(np.float32) # so that the target dtype considers floats - target_dtype = np.find_common_type(df_dtypes, []) - try: - # most common case (no nullable dtypes) - data = data.to_numpy(dtype=target_dtype, copy=False) - except TypeError: - # 1.0 <= pd version < 1.1 and nullable dtypes, least common case - # raises error because array is casted to type(pd.NA) and there's no na_value argument - data = data.astype(target_dtype, copy=False).values - except ValueError: - # data has nullable dtypes, but we can specify na_value argument and copy will be made - data = data.to_numpy(dtype=target_dtype, na_value=np.nan) +) -> Tuple[np.ndarray, List[str], List[str], List[List]]: + if len(data.shape) != 2 or data.shape[0] < 1: + raise ValueError('Input data must be 2 dimensional and non empty.') + + # determine feature names + if feature_name == 'auto': + feature_name = [str(col) for col in data.columns] + + # determine categorical features + cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)] + cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered] + if pandas_categorical is None: # train dataset + pandas_categorical = [list(data[col].cat.categories) for col in cat_cols] else: - if feature_name == 'auto': - feature_name = None - if categorical_feature == 'auto': - categorical_feature = None + if len(cat_cols) != len(pandas_categorical): + raise ValueError('train and valid dataset categorical_feature do not match.') + for col, category in zip(cat_cols, pandas_categorical): + if 
list(data[col].cat.categories) != list(category): + data[col] = data[col].cat.set_categories(category) + if len(cat_cols): # cat_cols is list + data = data.copy(deep=False) # not alter origin DataFrame + data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan}) + if categorical_feature == 'auto': # use cat cols from DataFrame + categorical_feature = cat_cols_not_ordered + else: # use cat cols specified by user + categorical_feature = list(categorical_feature) # type: ignore[assignment] + + # get numpy representation of the data + _check_for_bad_pandas_dtypes(data.dtypes) + df_dtypes = [dtype.type for dtype in data.dtypes] + df_dtypes.append(np.float32) # so that the target dtype considers floats + target_dtype = np.result_type(*df_dtypes) + try: + # most common case (no nullable dtypes) + data = data.to_numpy(dtype=target_dtype, copy=False) + except TypeError: + # 1.0 <= pd version < 1.1 and nullable dtypes, least common case + # raises error because array is casted to type(pd.NA) and there's no na_value argument + data = data.astype(target_dtype, copy=False).values + except ValueError: + # data has nullable dtypes, but we can specify na_value argument and copy will be made + data = data.to_numpy(dtype=target_dtype, na_value=np.nan) return data, feature_name, categorical_feature, pandas_categorical @@ -1000,7 +1021,15 @@ def predict( ctypes.c_int(len(data_names)), ) ) - data = _data_from_pandas(data, None, None, self.pandas_categorical)[0] + + if isinstance(data, pd_DataFrame): + data = _data_from_pandas( + data=data, + feature_name="auto", + categorical_feature="auto", + pandas_categorical=self.pandas_categorical + )[0] + predict_type = _C_API_PREDICT_NORMAL if raw_score: predict_type = _C_API_PREDICT_RAW_SCORE @@ -1526,7 +1555,8 @@ def __init__( feature_name: _LGBM_FeatureNameConfiguration = 'auto', categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', params: Optional[Dict[str, Any]] = None, - free_raw_data: bool = True + 
free_raw_data: bool = True, + position: Optional[_LGBM_PositionType] = None, ): """Initialize Dataset. @@ -1566,6 +1596,8 @@ def __init__( Other parameters for Dataset. free_raw_data : bool, optional (default=True) If True, raw data is freed after constructing inner Dataset. + position : numpy 1-D array, pandas Series or None, optional (default=None) + Position of items used in unbiased learning-to-rank task. """ self._handle: Optional[_DatasetHandle] = None self.data = data @@ -1573,6 +1605,7 @@ def __init__( self.reference = reference self.weight = weight self.group = group + self.position = position self.init_score = init_score self.feature_name: _LGBM_FeatureNameConfiguration = feature_name self.categorical_feature: _LGBM_CategoricalFeatureConfiguration = categorical_feature @@ -1581,7 +1614,7 @@ def __init__( self.used_indices: Optional[List[int]] = None self._need_slice = True self._predictor: Optional[_InnerPredictor] = None - self.pandas_categorical = None + self.pandas_categorical: Optional[List[List]] = None self._params_back_up = None self.version = 0 self._start_row = 0 # Used when pushing rows one by one. 
@@ -1837,7 +1870,8 @@ def _lazy_init( predictor: Optional[_InnerPredictor], feature_name: _LGBM_FeatureNameConfiguration, categorical_feature: _LGBM_CategoricalFeatureConfiguration, - params: Optional[Dict[str, Any]] + params: Optional[Dict[str, Any]], + position: Optional[_LGBM_PositionType] ) -> "Dataset": if data is None: self._handle = None @@ -1845,10 +1879,13 @@ def _lazy_init( if reference is not None: self.pandas_categorical = reference.pandas_categorical categorical_feature = reference.categorical_feature - data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data=data, - feature_name=feature_name, - categorical_feature=categorical_feature, - pandas_categorical=self.pandas_categorical) + if isinstance(data, pd_DataFrame): + data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas( + data=data, + feature_name=feature_name, + categorical_feature=categorical_feature, + pandas_categorical=self.pandas_categorical + ) # process for args params = {} if params is None else params @@ -1858,10 +1895,10 @@ def _lazy_init( _log_warning(f'{key} keyword has been found in `params` and will be ignored.\n' f'Please use {key} argument of the Dataset constructor to pass this parameter.') # get categorical features - if categorical_feature is not None: + if isinstance(categorical_feature, list): categorical_indices = set() feature_dict = {} - if feature_name is not None: + if isinstance(feature_name, list): feature_dict = {name: i for i, name in enumerate(feature_name)} for name in categorical_feature: if isinstance(name, str) and name in feature_dict: @@ -1902,9 +1939,9 @@ def _lazy_init( elif isinstance(data, np.ndarray): self.__init_from_np2d(data, params_str, ref_dataset) elif isinstance(data, list) and len(data) > 0: - if all(isinstance(x, np.ndarray) for x in data): + if _is_list_of_numpy_arrays(data): self.__init_from_list_np2d(data, params_str, ref_dataset) - elif all(isinstance(x, Sequence) for x in data): + 
elif _is_list_of_sequences(data): self.__init_from_seqs(data, ref_dataset) else: raise TypeError('Data list can only be of ndarray or Sequence') @@ -1926,6 +1963,8 @@ def _lazy_init( self.set_weight(weight) if group is not None: self.set_group(group) + if position is not None: + self.set_position(position) if isinstance(predictor, _InnerPredictor): if self._predictor is None and init_score is not None: _log_warning("The init_score will be overridden by the prediction of init_model.") @@ -2220,7 +2259,7 @@ def construct(self) -> "Dataset": if self.used_indices is None: # create valid self._lazy_init(data=self.data, label=self.label, reference=self.reference, - weight=self.weight, group=self.group, + weight=self.weight, group=self.group, position=self.position, init_score=self.init_score, predictor=self._predictor, feature_name=self.feature_name, categorical_feature='auto', params=self.params) else: @@ -2243,6 +2282,8 @@ def construct(self) -> "Dataset": self.get_data() if self.group is not None: self.set_group(self.group) + if self.position is not None: + self.set_position(self.position) if self.get_label() is None: raise ValueError("Label should not be None.") if isinstance(self._predictor, _InnerPredictor) and self._predictor is not self.reference._predictor: @@ -2257,7 +2298,8 @@ def construct(self) -> "Dataset": self._lazy_init(data=self.data, label=self.label, reference=None, weight=self.weight, group=self.group, init_score=self.init_score, predictor=self._predictor, - feature_name=self.feature_name, categorical_feature=self.categorical_feature, params=self.params) + feature_name=self.feature_name, categorical_feature=self.categorical_feature, + params=self.params, position=self.position) if self.free_raw_data: self.data = None self.feature_name = self.get_feature_name() @@ -2270,7 +2312,8 @@ def create_valid( weight: Optional[_LGBM_WeightType] = None, group: Optional[_LGBM_GroupType] = None, init_score: Optional[_LGBM_InitScoreType] = None, - params: 
Optional[Dict[str, Any]] = None + params: Optional[Dict[str, Any]] = None, + position: Optional[_LGBM_PositionType] = None ) -> "Dataset": """Create validation data align with current Dataset. @@ -2293,6 +2336,8 @@ def create_valid( Init score for Dataset. params : dict or None, optional (default=None) Other parameters for validation Dataset. + position : numpy 1-D array, pandas Series or None, optional (default=None) + Position of items used in unbiased learning-to-rank task. Returns ------- @@ -2300,7 +2345,7 @@ def create_valid( Validation Dataset with reference to self. """ ret = Dataset(data, label=label, reference=self, - weight=weight, group=group, init_score=init_score, + weight=weight, group=group, position=position, init_score=init_score, params=params, free_raw_data=self.free_raw_data) ret._predictor = self._predictor ret.pandas_categorical = self.pandas_categorical @@ -2435,7 +2480,7 @@ def set_field( 'In multiclass classification init_score can also be a list of lists, numpy 2-D array or pandas DataFrame.' ) else: - dtype = np.int32 if field_name == 'group' else np.float32 + dtype = np.int32 if (field_name == 'group' or field_name == 'position') else np.float32 data = _list_to_1d_numpy(data, dtype=dtype, name=field_name) ptr_data: Union[_ctypes_float_ptr, _ctypes_int_ptr] @@ -2728,6 +2773,28 @@ def set_group( self.set_field('group', group) return self + def set_position( + self, + position: Optional[_LGBM_PositionType] + ) -> "Dataset": + """Set position of Dataset (used for ranking). + + Parameters + ---------- + position : numpy 1-D array, pandas Series or None, optional (default=None) + Position of items used in unbiased learning-to-rank task. + + Returns + ------- + self : Dataset + Dataset with set position. 
+ """ + self.position = position + if self._handle is not None and position is not None: + position = _list_to_1d_numpy(position, dtype=np.int32, name='position') + self.set_field('position', position) + return self + def get_feature_name(self) -> List[str]: """Get the names of columns (features) in the Dataset. @@ -2824,7 +2891,7 @@ def get_data(self) -> Optional[_LGBM_TrainDataType]: self.data = self.data[self.used_indices, :] elif isinstance(self.data, Sequence): self.data = self.data[self.used_indices] - elif isinstance(self.data, list) and len(self.data) > 0 and all(isinstance(x, Sequence) for x in self.data): + elif _is_list_of_sequences(self.data) and len(self.data) > 0: self.data = np.array(list(self._yield_row_from_seqlist(self.data, self.used_indices))) else: _log_warning(f"Cannot subset {type(self.data).__name__} type of raw data.\n" @@ -2854,6 +2921,18 @@ def get_group(self) -> Optional[np.ndarray]: self.group = np.diff(self.group) return self.group + def get_position(self) -> Optional[np.ndarray]: + """Get the position of the Dataset. + + Returns + ------- + position : numpy 1-D array or None + Position of items used in unbiased learning-to-rank task. + """ + if self.position is None: + self.position = self.get_field('position') + return self.position + def num_data(self) -> int: """Get the number of rows in the Dataset. 
@@ -3209,8 +3288,7 @@ def __copy__(self) -> "Booster": def __deepcopy__(self, _) -> "Booster": model_str = self.model_to_string(num_iteration=-1) - booster = Booster(model_str=model_str) - return booster + return Booster(model_str=model_str) def __getstate__(self) -> Dict[str, Any]: this = self.__dict__.copy() @@ -3237,7 +3315,7 @@ def _get_loaded_param(self) -> Dict[str, Any]: buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterGetLoadedParam( self._handle, ctypes.c_int64(buffer_len), @@ -3247,7 +3325,7 @@ def _get_loaded_param(self) -> Dict[str, Any]: # if buffer length is not long enough, re-allocate a buffer if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterGetLoadedParam( self._handle, ctypes.c_int64(actual_len), @@ -4000,7 +4078,7 @@ def model_to_string( buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterSaveModelToString( self._handle, ctypes.c_int(start_iteration), @@ -4013,7 +4091,7 @@ def model_to_string( # if buffer length is not long enough, re-allocate a buffer if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterSaveModelToString( self._handle, ctypes.c_int(start_iteration), @@ -4068,7 +4146,7 @@ 
def dump_model( buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterDumpModel( self._handle, ctypes.c_int(start_iteration), @@ -4081,7 +4159,7 @@ def dump_model( # if buffer length is not long enough, reallocate a buffer if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterDumpModel( self._handle, ctypes.c_int(start_iteration), diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py index 77856f5bdab6..2f77ee740c75 100644 --- a/python-package/lightgbm/callback.py +++ b/python-package/lightgbm/callback.py @@ -1,12 +1,18 @@ # coding: utf-8 """Callbacks library.""" -import collections +from collections import OrderedDict +from dataclasses import dataclass from functools import partial -from typing import Any, Callable, Dict, List, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union -from .basic import _ConfigAliases, _LGBM_BoosterEvalMethodResultType, _log_info, _log_warning +from .basic import (Booster, _ConfigAliases, _LGBM_BoosterEvalMethodResultType, + _LGBM_BoosterEvalMethodResultWithStandardDeviationType, _log_info, _log_warning) + +if TYPE_CHECKING: + from .engine import CVBooster __all__ = [ + 'EarlyStopException', 'early_stopping', 'log_evaluation', 'record_evaluation', @@ -16,16 +22,20 @@ _EvalResultDict = Dict[str, Dict[str, List[Any]]] _EvalResultTuple = Union[ _LGBM_BoosterEvalMethodResultType, - Tuple[str, str, float, bool, float] + _LGBM_BoosterEvalMethodResultWithStandardDeviationType ] _ListOfEvalResultTuples = Union[ 
List[_LGBM_BoosterEvalMethodResultType], - List[Tuple[str, str, float, bool, float]] + List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType] ] class EarlyStopException(Exception): - """Exception of early stopping.""" + """Exception of early stopping. + + Raise this from a callback passed in via keyword argument ``callbacks`` + in ``cv()`` or ``train()`` to trigger early stopping. + """ def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) -> None: """Create early stopping exception. @@ -34,6 +44,7 @@ def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) -> ---------- best_iteration : int The best iteration stopped. + 0-based... pass ``best_iteration=2`` to indicate that the third iteration was the best one. best_score : list of (eval_name, metric_name, eval_result, is_higher_better) tuple or (eval_name, metric_name, eval_result, is_higher_better, stdv) tuple Scores for each metric, on each validation set, as of the best iteration. """ @@ -43,14 +54,14 @@ def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) -> # Callback environment used by callbacks -CallbackEnv = collections.namedtuple( - "CallbackEnv", - ["model", - "params", - "iteration", - "begin_iteration", - "end_iteration", - "evaluation_result_list"]) +@dataclass +class CallbackEnv: + model: Union[Booster, "CVBooster"] + params: Dict[str, Any] + iteration: int + begin_iteration: int + end_iteration: int + evaluation_result_list: Optional[_ListOfEvalResultTuples] def _format_eval_result(value: _EvalResultTuple, show_stdv: bool) -> str: @@ -120,13 +131,18 @@ def __init__(self, eval_result: _EvalResultDict) -> None: self.eval_result = eval_result def _init(self, env: CallbackEnv) -> None: + if env.evaluation_result_list is None: + raise RuntimeError( + "record_evaluation() callback enabled but no evaluation results found. This is probably a bug in LightGBM. 
" + "Please report it at https://github.com/microsoft/LightGBM/issues" + ) self.eval_result.clear() for item in env.evaluation_result_list: if len(item) == 4: # regular train data_name, eval_name = item[:2] else: # cv data_name, eval_name = item[1].split() - self.eval_result.setdefault(data_name, collections.OrderedDict()) + self.eval_result.setdefault(data_name, OrderedDict()) if len(item) == 4: self.eval_result[data_name].setdefault(eval_name, []) else: @@ -136,6 +152,11 @@ def _init(self, env: CallbackEnv) -> None: def __call__(self, env: CallbackEnv) -> None: if env.iteration == env.begin_iteration: self._init(env) + if env.evaluation_result_list is None: + raise RuntimeError( + "record_evaluation() callback enabled but no evaluation results found. This is probably a bug in LightGBM. " + "Please report it at https://github.com/microsoft/LightGBM/issues" + ) for item in env.evaluation_result_list: if len(item) == 4: data_name, eval_name, result = item[:3] @@ -143,7 +164,7 @@ def __call__(self, env: CallbackEnv) -> None: else: data_name, eval_name = item[1].split() res_mean = item[2] - res_stdv = item[4] + res_stdv = item[4] # type: ignore[misc] self.eval_result[data_name][f'{eval_name}-mean'].append(res_mean) self.eval_result[data_name][f'{eval_name}-stdv'].append(res_stdv) @@ -274,6 +295,10 @@ def _is_train_set(self, ds_name: str, eval_name: str, train_name: str) -> bool: return (ds_name == "cv_agg" and eval_name == "train") or ds_name == train_name def _init(self, env: CallbackEnv) -> None: + if env.evaluation_result_list is None or env.evaluation_result_list == []: + raise ValueError( + "For early stopping, at least one dataset and eval metric is required for evaluation" + ) is_dart = any(env.params.get(alias, "") == 'dart' for alias in _ConfigAliases.get("boosting")) only_train_set = ( len(env.evaluation_result_list) == 1 @@ -289,9 +314,6 @@ def _init(self, env: CallbackEnv) -> None: elif only_train_set: _log_warning('Only training set found, disabling early 
stopping.') return - if not env.evaluation_result_list: - raise ValueError('For early stopping, ' - 'at least one dataset and eval metric is required for evaluation') if self.stopping_rounds <= 0: raise ValueError("stopping_rounds should be greater than zero.") @@ -353,6 +375,11 @@ def __call__(self, env: CallbackEnv) -> None: self._init(env) if not self.enabled: return + if env.evaluation_result_list is None: + raise RuntimeError( + "early_stopping() callback enabled but no evaluation results found. This is probably a bug in LightGBM. " + "Please report it at https://github.com/microsoft/LightGBM/issues" + ) # self.best_score_list is initialized to an empty list first_time_updating_best_score_list = (self.best_score_list == []) for i in range(len(env.evaluation_result_list)): diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 2d640d741629..822aa3b35017 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -1,8 +1,8 @@ # coding: utf-8 """Library with training routines of LightGBM.""" -import collections import copy import json +from collections import OrderedDict, defaultdict from operator import attrgetter from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union @@ -11,9 +11,9 @@ from .
import callback from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _InnerPredictor, - _LGBM_BoosterEvalMethodResultType, _LGBM_CategoricalFeatureConfiguration, - _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration, - _log_warning) + _LGBM_BoosterEvalMethodResultType, _LGBM_BoosterEvalMethodResultWithStandardDeviationType, + _LGBM_CategoricalFeatureConfiguration, _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType, + _LGBM_FeatureNameConfiguration, _log_warning) from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold __all__ = [ @@ -293,7 +293,7 @@ def train( booster.best_iteration = earlyStopException.best_iteration + 1 evaluation_result_list = earlyStopException.best_score break - booster.best_score = collections.defaultdict(collections.OrderedDict) + booster.best_score = defaultdict(OrderedDict) for dataset_name, eval_name, score, _ in evaluation_result_list: booster.best_score[dataset_name][eval_name] = score if not keep_training_booster: @@ -339,16 +339,12 @@ def __init__( with open(model_file, "r") as file: self._from_dict(json.load(file)) - def _append(self, booster: Booster) -> None: - """Add a booster to CVBooster.""" - self.boosters.append(booster) - def _from_dict(self, models: Dict[str, Any]) -> None: """Load CVBooster from dict.""" self.best_iteration = models["best_iteration"] self.boosters = [] for model_str in models["boosters"]: - self._append(Booster(model_str=model_str)) + self.boosters.append(Booster(model_str=model_str)) def _to_dict(self, num_iteration: Optional[int], start_iteration: int, importance_type: str) -> Dict[str, Any]: """Serialize CVBooster to dict.""" @@ -514,19 +510,19 @@ def _make_n_folds( train_set, valid_set, tparam = fpreproc(train_set, valid_set, params.copy()) else: tparam = params - cvbooster = Booster(tparam, train_set) + booster_for_fold = Booster(tparam, train_set) if eval_train_metric: - 
cvbooster.add_valid(train_set, 'train') - cvbooster.add_valid(valid_set, 'valid') - ret._append(cvbooster) + booster_for_fold.add_valid(train_set, 'train') + booster_for_fold.add_valid(valid_set, 'valid') + ret.boosters.append(booster_for_fold) return ret def _agg_cv_result( - raw_results: List[List[Tuple[str, str, float, bool]]] -) -> List[Tuple[str, str, float, bool, float]]: + raw_results: List[List[_LGBM_BoosterEvalMethodResultType]] +) -> List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType]: """Aggregate cross-validation results.""" - cvmap: Dict[str, List[float]] = collections.OrderedDict() + cvmap: Dict[str, List[float]] = OrderedDict() metric_type: Dict[str, bool] = {} for one_result in raw_results: for one_line in one_result: @@ -534,7 +530,7 @@ def _agg_cv_result( metric_type[key] = one_line[3] cvmap.setdefault(key, []) cvmap[key].append(one_line[2]) - return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()] + return [('cv_agg', k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()] def cv( @@ -655,13 +651,18 @@ def cv( Returns ------- - eval_hist : dict - Evaluation history. + eval_results : dict + History of evaluation results of each metric. The dictionary has the following format: - {'metric1-mean': [values], 'metric1-stdv': [values], - 'metric2-mean': [values], 'metric2-stdv': [values], + {'valid metric1-mean': [values], 'valid metric1-stdv': [values], + 'valid metric2-mean': [values], 'valid metric2-stdv': [values], ...}. If ``return_cvbooster=True``, also returns trained boosters wrapped in a ``CVBooster`` object via ``cvbooster`` key. + If ``eval_train_metric=True``, also returns the train metric history. + In this case, the dictionary has the following format: + {'train metric1-mean': [values], 'valid metric1-mean': [values], + 'train metric2-mean': [values], 'valid metric2-mean': [values], + ...}. 
""" if not isinstance(train_set, Dataset): raise TypeError(f"cv() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.") @@ -717,7 +718,7 @@ def cv( .set_feature_name(feature_name) \ .set_categorical_feature(categorical_feature) - results = collections.defaultdict(list) + results = defaultdict(list) cvfolds = _make_n_folds(full_data=train_set, folds=folds, nfold=nfold, params=params, seed=seed, fpreproc=fpreproc, stratified=stratified, shuffle=shuffle, diff --git a/python-package/lightgbm/plotting.py b/python-package/lightgbm/plotting.py index 0f9bcd5f8ccb..85b245c187ef 100644 --- a/python-package/lightgbm/plotting.py +++ b/python-package/lightgbm/plotting.py @@ -693,11 +693,7 @@ def create_tree_digraph( model = booster.dump_model() tree_infos = model['tree_info'] - if 'feature_names' in model: - feature_names = model['feature_names'] - else: - feature_names = None - + feature_names = model.get('feature_names', None) monotone_constraints = model.get('monotone_constraints', None) if tree_index < len(tree_infos): @@ -716,13 +712,13 @@ def create_tree_digraph( if isinstance(example_case, pd_DataFrame): example_case = _data_from_pandas( data=example_case, - feature_name=None, - categorical_feature=None, + feature_name="auto", + categorical_feature="auto", pandas_categorical=booster.pandas_categorical )[0] example_case = example_case[0] - graph = _to_graphviz( + return _to_graphviz( tree_info=tree_info, show_info=show_info, feature_names=feature_names, @@ -734,8 +730,6 @@ def create_tree_digraph( **kwargs ) - return graph - def plot_tree( booster: Union[Booster, LGBMModel], diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 7e909342c01f..c71c233df908 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -1103,6 +1103,8 @@ def fit( # type: ignore[override] self._classes = self._le.classes_ self._n_classes = len(self._classes) # type: ignore[arg-type] + if self.objective 
is None: + self._objective = None # adjust eval metrics to match whether binary or multiclass # classification is being performed diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index d3ff28286bb9..6e43dc242d1b 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -18,6 +18,7 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence" ] dependencies = [ + "dataclasses ; python_version < '3.7'", "numpy", "scipy" ] @@ -29,7 +30,7 @@ maintainers = [ name = "lightgbm" readme = "README.rst" requires-python = ">=3.6" -version = "4.0.0.99" +version = "4.1.0.99" [project.optional-dependencies] dask = [ @@ -111,7 +112,13 @@ select = [ # pycodestyle "E", # pyflakes - "F" + "F", + # flake8-return: unnecessary assignment before return + "RET504", + # flake8-simplify: use dict.get() instead of an if-else block + "SIM401", + # flake8-print + "T", ] # this should be set to the oldest version of python LightGBM supports @@ -120,13 +127,17 @@ target-version = "py37" [tool.ruff.per-file-ignores] "examples/*" = [ # pydocstyle - "D" + "D", + # flake8-print + "T" ] "tests/*" = [ # (flake8-bugbear) Found useless expression "B018", # pydocstyle - "D" + "D", + # flake8-print + "T" ] [tool.ruff.pydocstyle] diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index 9a87e982483e..88ece154e432 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -115,6 +115,12 @@ class RF : public GBDT { const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); const std::vector>& bag_data_indices = data_sample_strategy_->bag_data_indices(); + // GOSSStrategy->Bagging may modify value of bag_data_cnt_ + if (is_use_subset && bag_data_cnt < num_data_) { + tmp_grad_.resize(num_data_); + tmp_hess_.resize(num_data_); + } + CHECK_EQ(gradients, nullptr); CHECK_EQ(hessians, nullptr); diff --git a/src/c_api.cpp b/src/c_api.cpp index 442247d7a9dd..8c4eee96b4c9 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -150,7 
+150,7 @@ class Booster { objective_fun_.reset(ObjectiveFunction::CreateObjectiveFunction(config_.objective, config_)); if (objective_fun_ == nullptr) { - Log::Warning("Using self-defined objective function"); + Log::Info("Using self-defined objective function"); } // initialize the objective function if (objective_fun_ != nullptr) { @@ -320,7 +320,7 @@ class Booster { objective_fun_.reset(ObjectiveFunction::CreateObjectiveFunction(config_.objective, config_)); if (objective_fun_ == nullptr) { - Log::Warning("Using self-defined objective function"); + Log::Info("Using self-defined objective function"); } // initialize the objective function if (objective_fun_ != nullptr) { diff --git a/src/cuda/cuda_utils.cpp b/src/cuda/cuda_utils.cpp index fd4abcf25e79..a7d0df697e24 100644 --- a/src/cuda/cuda_utils.cpp +++ b/src/cuda/cuda_utils.cpp @@ -26,6 +26,12 @@ void SetCUDADevice(int gpu_device_id, const char* file, int line) { } } +int GetCUDADevice(const char* file, int line) { + int cur_gpu_device_id = 0; + CUDASUCCESS_OR_FATAL_OUTER(cudaGetDevice(&cur_gpu_device_id)); + return cur_gpu_device_id; +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 0906ba4b6439..394614af3f33 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -304,6 +304,7 @@ const std::unordered_set& Config::parameter_set() { "lambdarank_truncation_level", "lambdarank_norm", "label_gain", + "lambdarank_position_bias_regularization", "metric", "metric_freq", "is_provide_training_metric", @@ -619,6 +620,9 @@ void Config::GetMembersFromString(const std::unordered_map(tmp_str, ','); } + GetDouble(params, "lambdarank_position_bias_regularization", &lambdarank_position_bias_regularization); + CHECK_GE(lambdarank_position_bias_regularization, 0.0); + GetInt(params, "metric_freq", &metric_freq); CHECK_GT(metric_freq, 0); @@ -660,12 +664,14 @@ void Config::GetMembersFromString(const std::unordered_map>& Config::paramet 
{"lambdarank_truncation_level", {}}, {"lambdarank_norm", {}}, {"label_gain", {}}, + {"lambdarank_position_bias_regularization", {}}, {"metric", {"metrics", "metric_types"}}, {"metric_freq", {"output_freq"}}, {"is_provide_training_metric", {"training_metric", "is_training_metric", "train_metric"}}, @@ -1035,6 +1047,7 @@ const std::unordered_map& Config::ParameterTypes() { {"lambdarank_truncation_level", "int"}, {"lambdarank_norm", "bool"}, {"label_gain", "vector"}, + {"lambdarank_position_bias_regularization", "double"}, {"metric", "vector"}, {"metric_freq", "int"}, {"is_provide_training_metric", "bool"}, diff --git a/src/io/cuda/cuda_column_data.cpp b/src/io/cuda/cuda_column_data.cpp index a1080cb2b902..eb0938c01225 100644 --- a/src/io/cuda/cuda_column_data.cpp +++ b/src/io/cuda/cuda_column_data.cpp @@ -12,11 +12,8 @@ namespace LightGBM { CUDAColumnData::CUDAColumnData(const data_size_t num_data, const int gpu_device_id) { num_threads_ = OMP_NUM_THREADS(); num_data_ = num_data; - if (gpu_device_id >= 0) { - SetCUDADevice(gpu_device_id, __FILE__, __LINE__); - } else { - SetCUDADevice(0, __FILE__, __LINE__); - } + gpu_device_id_ = gpu_device_id >= 0 ? 
gpu_device_id : 0; + SetCUDADevice(gpu_device_id_, __FILE__, __LINE__); cuda_used_indices_ = nullptr; cuda_data_by_column_ = nullptr; cuda_column_bit_type_ = nullptr; @@ -117,37 +114,41 @@ void CUDAColumnData::Init(const int num_columns, feature_mfb_is_na_ = feature_mfb_is_na; data_by_column_.resize(num_columns_, nullptr); OMP_INIT_EX(); - #pragma omp parallel for schedule(static) num_threads(num_threads_) - for (int column_index = 0; column_index < num_columns_; ++column_index) { - OMP_LOOP_EX_BEGIN(); - const int8_t bit_type = column_bit_type[column_index]; - if (column_data[column_index] != nullptr) { - // is dense column - if (bit_type == 4) { - column_bit_type_[column_index] = 8; - InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); - } else if (bit_type == 8) { - InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); - } else if (bit_type == 16) { - InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); - } else if (bit_type == 32) { - InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); - } else { - Log::Fatal("Unknow column bit type %d", bit_type); - } - } else { - // is sparse column - if (bit_type == 8) { - InitOneColumnData(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]); - } else if (bit_type == 16) { - InitOneColumnData(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]); - } else if (bit_type == 32) { - InitOneColumnData(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]); + #pragma omp parallel num_threads(num_threads_) + { + SetCUDADevice(gpu_device_id_, __FILE__, __LINE__); + #pragma omp for schedule(static) + for (int column_index = 0; column_index < num_columns_; ++column_index) { + OMP_LOOP_EX_BEGIN(); + const int8_t bit_type = column_bit_type[column_index]; + if (column_data[column_index] != nullptr) { + // is dense column + if (bit_type 
== 4) { + column_bit_type_[column_index] = 8; + InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); + } else if (bit_type == 8) { + InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); + } else if (bit_type == 16) { + InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); + } else if (bit_type == 32) { + InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); + } else { + Log::Fatal("Unknow column bit type %d", bit_type); + } } else { - Log::Fatal("Unknow column bit type %d", bit_type); + // is sparse column + if (bit_type == 8) { + InitOneColumnData(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]); + } else if (bit_type == 16) { + InitOneColumnData(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]); + } else if (bit_type == 32) { + InitOneColumnData(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]); + } else { + Log::Fatal("Unknow column bit type %d", bit_type); + } } + OMP_LOOP_EX_END(); } - OMP_LOOP_EX_END(); } OMP_THROW_EX(); feature_to_column_ = feature_to_column; @@ -182,24 +183,28 @@ void CUDAColumnData::CopySubrow( AllocateCUDAMemory(&cuda_used_indices_, num_used_indices_size, __FILE__, __LINE__); data_by_column_.resize(num_columns_, nullptr); OMP_INIT_EX(); - #pragma omp parallel for schedule(static) num_threads(num_threads_) - for (int column_index = 0; column_index < num_columns_; ++column_index) { - OMP_LOOP_EX_BEGIN(); - const uint8_t bit_type = column_bit_type_[column_index]; - if (bit_type == 8) { - uint8_t* column_data = nullptr; - AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); - data_by_column_[column_index] = reinterpret_cast(column_data); - } else if (bit_type == 16) { - uint16_t* column_data = nullptr; - AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); - 
data_by_column_[column_index] = reinterpret_cast(column_data); - } else if (bit_type == 32) { - uint32_t* column_data = nullptr; - AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); - data_by_column_[column_index] = reinterpret_cast(column_data); + #pragma omp parallel num_threads(num_threads_) + { + SetCUDADevice(gpu_device_id_, __FILE__, __LINE__); + #pragma omp for schedule(static) + for (int column_index = 0; column_index < num_columns_; ++column_index) { + OMP_LOOP_EX_BEGIN(); + const uint8_t bit_type = column_bit_type_[column_index]; + if (bit_type == 8) { + uint8_t* column_data = nullptr; + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } else if (bit_type == 16) { + uint16_t* column_data = nullptr; + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } else if (bit_type == 32) { + uint32_t* column_data = nullptr; + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } + OMP_LOOP_EX_END(); } - OMP_LOOP_EX_END(); } OMP_THROW_EX(); InitCUDAMemoryFromHostMemory(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, __LINE__); @@ -221,27 +226,31 @@ void CUDAColumnData::ResizeWhenCopySubrow(const data_size_t num_used_indices) { DeallocateCUDAMemory(&cuda_used_indices_, __FILE__, __LINE__); AllocateCUDAMemory(&cuda_used_indices_, num_used_indices_size, __FILE__, __LINE__); OMP_INIT_EX(); - #pragma omp parallel for schedule(static) num_threads(num_threads_) - for (int column_index = 0; column_index < num_columns_; ++column_index) { - OMP_LOOP_EX_BEGIN(); - const uint8_t bit_type = column_bit_type_[column_index]; - if (bit_type == 8) { - uint8_t* column_data = reinterpret_cast(data_by_column_[column_index]); - 
DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); - AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); - data_by_column_[column_index] = reinterpret_cast(column_data); - } else if (bit_type == 16) { - uint16_t* column_data = reinterpret_cast(data_by_column_[column_index]); - DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); - AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); - data_by_column_[column_index] = reinterpret_cast(column_data); - } else if (bit_type == 32) { - uint32_t* column_data = reinterpret_cast(data_by_column_[column_index]); - DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); - AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); - data_by_column_[column_index] = reinterpret_cast(column_data); + #pragma omp parallel num_threads(num_threads_) + { + SetCUDADevice(gpu_device_id_, __FILE__, __LINE__); + #pragma omp for schedule(static) + for (int column_index = 0; column_index < num_columns_; ++column_index) { + OMP_LOOP_EX_BEGIN(); + const uint8_t bit_type = column_bit_type_[column_index]; + if (bit_type == 8) { + uint8_t* column_data = reinterpret_cast(data_by_column_[column_index]); + DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } else if (bit_type == 16) { + uint16_t* column_data = reinterpret_cast(data_by_column_[column_index]); + DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } else if (bit_type == 32) { + uint32_t* column_data = reinterpret_cast(data_by_column_[column_index]); + DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = 
reinterpret_cast(column_data); + } + OMP_LOOP_EX_END(); } - OMP_LOOP_EX_END(); } OMP_THROW_EX(); DeallocateCUDAMemory(&cuda_data_by_column_, __FILE__, __LINE__); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 5b23f01ec3a0..cd692afb031a 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -699,7 +699,7 @@ TrainingShareStates* Dataset::GetShareStates( if (col_wise_time < row_wise_time) { auto overhead_cost = row_wise_init_time + row_wise_time + col_wise_time; - Log::Warning( + Log::Info( "Auto-choosing col-wise multi-threading, the overhead of testing was " "%f seconds.\n" "You can set `force_col_wise=true` to remove the overhead.", @@ -707,7 +707,7 @@ TrainingShareStates* Dataset::GetShareStates( return col_wise_state.release(); } else { auto overhead_cost = col_wise_init_time + row_wise_time + col_wise_time; - Log::Warning( + Log::Info( "Auto-choosing row-wise multi-threading, the overhead of testing was " "%f seconds.\n" "You can set `force_row_wise=true` to remove the overhead.\n" @@ -937,6 +937,8 @@ bool Dataset::SetIntField(const char* field_name, const int* field_data, name = Common::Trim(name); if (name == std::string("query") || name == std::string("group")) { metadata_.SetQuery(field_data, num_element); + } else if (name == std::string("position")) { + metadata_.SetPosition(field_data, num_element); } else { return false; } @@ -987,6 +989,9 @@ bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, if (name == std::string("query") || name == std::string("group")) { *out_ptr = metadata_.query_boundaries(); *out_len = metadata_.num_queries() + 1; + } else if (name == std::string("position")) { + *out_ptr = metadata_.positions(); + *out_len = num_data_; } else { return false; } @@ -1273,21 +1278,34 @@ void Dataset::ConstructHistogramsInner( auto ptr_ordered_grad = gradients; auto ptr_ordered_hess = hessians; if (num_used_dense_group > 0) { - if (USE_INDICES) { - if (USE_HESSIAN) { -#pragma omp parallel for schedule(static, 
512) if (num_data >= 1024) + if (USE_QUANT_GRAD) { + int16_t* ordered_gradients_and_hessians = reinterpret_cast(ordered_gradients); + const int16_t* gradients_and_hessians = reinterpret_cast(gradients); + if (USE_INDICES) { + #pragma omp parallel for schedule(static, 512) if (num_data >= 1024) for (data_size_t i = 0; i < num_data; ++i) { - ordered_gradients[i] = gradients[data_indices[i]]; - ordered_hessians[i] = hessians[data_indices[i]]; + ordered_gradients_and_hessians[i] = gradients_and_hessians[data_indices[i]]; } - ptr_ordered_grad = ordered_gradients; - ptr_ordered_hess = ordered_hessians; - } else { -#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) - for (data_size_t i = 0; i < num_data; ++i) { - ordered_gradients[i] = gradients[data_indices[i]]; + ptr_ordered_grad = reinterpret_cast(ordered_gradients); + ptr_ordered_hess = nullptr; + } + } else { + if (USE_INDICES) { + if (USE_HESSIAN) { + #pragma omp parallel for schedule(static, 512) if (num_data >= 1024) + for (data_size_t i = 0; i < num_data; ++i) { + ordered_gradients[i] = gradients[data_indices[i]]; + ordered_hessians[i] = hessians[data_indices[i]]; + } + ptr_ordered_grad = ordered_gradients; + ptr_ordered_hess = ordered_hessians; + } else { + #pragma omp parallel for schedule(static, 512) if (num_data >= 1024) + for (data_size_t i = 0; i < num_data; ++i) { + ordered_gradients[i] = gradients[data_indices[i]]; + } + ptr_ordered_grad = ordered_gradients; } - ptr_ordered_grad = ordered_gradients; } } OMP_INIT_EX(); diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 2a589fa24ef8..1fc47c46787f 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -15,7 +16,9 @@ Metadata::Metadata() { num_init_score_ = 0; num_data_ = 0; num_queries_ = 0; + num_positions_ = 0; weight_load_from_file_ = false; + position_load_from_file_ = false; query_load_from_file_ = false; init_score_load_from_file_ = false; #ifdef USE_CUDA 
@@ -28,6 +31,7 @@ void Metadata::Init(const char* data_filename) { // for lambdarank, it needs query data for partition data in distributed learning LoadQueryBoundaries(); LoadWeights(); + LoadPositions(); CalculateQueryWeights(); LoadInitialScore(data_filename_); } @@ -214,6 +218,13 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector 0 && num_positions_ != num_all_data) { + positions_.clear(); + num_positions_ = 0; + Log::Fatal("Positions size (%i) doesn't match data size (%i)", num_positions_, num_data_); + } + // get local positions + if (!positions_.empty()) { + auto old_positions = positions_; + num_positions_ = num_data_; + positions_ = std::vector(num_data_); + #pragma omp parallel for schedule(static, 512) + for (int i = 0; i < static_cast(used_data_indices.size()); ++i) { + positions_[i] = old_positions[used_data_indices[i]]; + } + old_positions.clear(); + } + } if (query_load_from_file_) { // check query boundries if (!query_boundaries_.empty() && query_boundaries_[num_queries_] != num_all_data) { @@ -489,6 +519,47 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) { #endif // USE_CUDA } +void Metadata::SetPosition(const data_size_t* positions, data_size_t len) { + std::lock_guard lock(mutex_); + // save to nullptr + if (positions == nullptr || len == 0) { + positions_.clear(); + num_positions_ = 0; + return; + } + #ifdef USE_CUDA + Log::Fatal("Positions in learning to rank is not supported in CUDA version yet."); + #endif // USE_CUDA + if (num_data_ != len) { + Log::Fatal("Positions size (%i) doesn't match data size (%i)", len, num_data_); + } + if (positions_.empty()) { + positions_.resize(num_data_); + } else { + Log::Warning("Overwritting positions in dataset."); + } + num_positions_ = num_data_; + + position_load_from_file_ = false; + + position_ids_.clear(); + std::unordered_map map_id2pos; + for (data_size_t i = 0; i < num_positions_; ++i) { + if (map_id2pos.count(positions[i]) == 0) { + int pos = 
static_cast(map_id2pos.size()); + map_id2pos[positions[i]] = pos; + position_ids_.push_back(std::to_string(positions[i])); + } + } + + Log::Debug("number of unique positions found = %ld", position_ids_.size()); + + #pragma omp parallel for schedule(static, 512) if (num_positions_ >= 1024) + for (data_size_t i = 0; i < num_positions_; ++i) { + positions_[i] = map_id2pos.at(positions[i]); + } +} + void Metadata::InsertQueries(const data_size_t* queries, data_size_t start_index, data_size_t len) { if (!queries) { Log::Fatal("Passed null queries"); @@ -528,6 +599,32 @@ void Metadata::LoadWeights() { weight_load_from_file_ = true; } +void Metadata::LoadPositions() { + num_positions_ = 0; + std::string position_filename(data_filename_); + // default position file name + position_filename.append(".position"); + TextReader reader(position_filename.c_str(), false); + reader.ReadAllLines(); + if (reader.Lines().empty()) { + return; + } + Log::Info("Loading positions from %s ...", position_filename.c_str()); + num_positions_ = static_cast(reader.Lines().size()); + positions_ = std::vector(num_positions_); + position_ids_ = std::vector(); + std::unordered_map map_id2pos; + for (data_size_t i = 0; i < num_positions_; ++i) { + std::string& line = reader.Lines()[i]; + if (map_id2pos.count(line) == 0) { + map_id2pos[line] = static_cast(position_ids_.size()); + position_ids_.push_back(line); + } + positions_[i] = map_id2pos.at(line); + } + position_load_from_file_ = true; +} + void Metadata::LoadInitialScore(const std::string& data_filename) { num_init_score_ = 0; std::string init_score_filename(data_filename); diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 653fc6e8609a..6bd5324812f8 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -25,7 +25,10 @@ namespace LightGBM { class RankingObjective : public ObjectiveFunction { public: explicit RankingObjective(const Config& config) - : 
seed_(config.objective_seed) {} + : seed_(config.objective_seed) { + learning_rate_ = config.learning_rate; + position_bias_regularization_ = config.lambdarank_position_bias_regularization; + } explicit RankingObjective(const std::vector&) : seed_(0) {} @@ -37,12 +40,20 @@ class RankingObjective : public ObjectiveFunction { label_ = metadata.label(); // get weights weights_ = metadata.weights(); + // get positions + positions_ = metadata.positions(); + // get position ids + position_ids_ = metadata.position_ids(); + // get number of different position ids + num_position_ids_ = static_cast(metadata.num_position_ids()); // get boundries query_boundaries_ = metadata.query_boundaries(); if (query_boundaries_ == nullptr) { Log::Fatal("Ranking tasks require query information"); } num_queries_ = metadata.num_queries(); + // initialize position bias vectors + pos_biases_.resize(num_position_ids_, 0.0); } void GetGradients(const double* score, score_t* gradients, @@ -51,7 +62,13 @@ class RankingObjective : public ObjectiveFunction { for (data_size_t i = 0; i < num_queries_; ++i) { const data_size_t start = query_boundaries_[i]; const data_size_t cnt = query_boundaries_[i + 1] - query_boundaries_[i]; - GetGradientsForOneQuery(i, cnt, label_ + start, score + start, + std::vector score_adjusted; + if (num_position_ids_ > 0) { + for (data_size_t j = 0; j < cnt; ++j) { + score_adjusted.push_back(score[start + j] + pos_biases_[positions_[start + j]]); + } + } + GetGradientsForOneQuery(i, cnt, label_ + start, num_position_ids_ > 0 ? 
score_adjusted.data() : score + start, gradients + start, hessians + start); if (weights_ != nullptr) { for (data_size_t j = 0; j < cnt; ++j) { @@ -62,6 +79,9 @@ class RankingObjective : public ObjectiveFunction { } } } + if (num_position_ids_ > 0) { + UpdatePositionBiasFactors(gradients, hessians); + } } virtual void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt, @@ -69,6 +89,8 @@ class RankingObjective : public ObjectiveFunction { const double* score, score_t* lambdas, score_t* hessians) const = 0; + virtual void UpdatePositionBiasFactors(const score_t* /*lambdas*/, const score_t* /*hessians*/) const {} + const char* GetName() const override = 0; std::string ToString() const override { @@ -88,8 +110,20 @@ class RankingObjective : public ObjectiveFunction { const label_t* label_; /*! \brief Pointer of weights */ const label_t* weights_; + /*! \brief Pointer of positions */ + const data_size_t* positions_; + /*! \brief Pointer of position IDs */ + const std::string* position_ids_; + /*! \brief Pointer of label */ + data_size_t num_position_ids_; /*! \brief Query boundaries */ const data_size_t* query_boundaries_; + /*! \brief Position bias factors */ + mutable std::vector pos_biases_; + /*! \brief Learning rate to update position bias factors */ + double learning_rate_; + /*! \brief Position bias regularization */ + double position_bias_regularization_; }; /*! @@ -253,9 +287,67 @@ class LambdarankNDCG : public RankingObjective { } } + void UpdatePositionBiasFactors(const score_t* lambdas, const score_t* hessians) const override { + /// get number of threads + int num_threads = 1; + #pragma omp parallel + #pragma omp master + { + num_threads = omp_get_num_threads(); + } + // create per-thread buffers for first and second derivatives of utility w.r.t. 
position bias factors + std::vector bias_first_derivatives(num_position_ids_ * num_threads, 0.0); + std::vector bias_second_derivatives(num_position_ids_ * num_threads, 0.0); + std::vector instance_counts(num_position_ids_ * num_threads, 0); + #pragma omp parallel for schedule(guided) + for (data_size_t i = 0; i < num_data_; i++) { + // get thread ID + const int tid = omp_get_thread_num(); + size_t offset = static_cast(positions_[i] + tid * num_position_ids_); + // accumulate first derivatives of utility w.r.t. position bias factors, for each position + bias_first_derivatives[offset] -= lambdas[i]; + // accumulate second derivatives of utility w.r.t. position bias factors, for each position + bias_second_derivatives[offset] -= hessians[i]; + instance_counts[offset]++; + } + #pragma omp parallel for schedule(guided) + for (data_size_t i = 0; i < num_position_ids_; i++) { + double bias_first_derivative = 0.0; + double bias_second_derivative = 0.0; + int instance_count = 0; + // aggregate derivatives from per-thread buffers + for (int tid = 0; tid < num_threads; tid++) { + size_t offset = static_cast(i + tid * num_position_ids_); + bias_first_derivative += bias_first_derivatives[offset]; + bias_second_derivative += bias_second_derivatives[offset]; + instance_count += instance_counts[offset]; + } + // L2 regularization on position bias factors + bias_first_derivative -= pos_biases_[i] * position_bias_regularization_ * instance_count; + bias_second_derivative -= position_bias_regularization_ * instance_count; + // do Newton-Raphson step to update position bias factors + pos_biases_[i] += learning_rate_ * bias_first_derivative / (std::abs(bias_second_derivative) + 0.001); + } + LogDebugPositionBiasFactors(); + } + const char* GetName() const override { return "lambdarank"; } protected: + void LogDebugPositionBiasFactors() const { + std::stringstream message_stream; + message_stream << std::setw(15) << "position" + << std::setw(15) << "bias_factor" + << std::endl; + 
Log::Debug(message_stream.str().c_str()); + message_stream.str(""); + for (int i = 0; i < num_position_ids_; ++i) { + message_stream << std::setw(15) << position_ids_[i] + << std::setw(15) << pos_biases_[i]; + Log::Debug(message_stream.str().c_str()); + message_stream.str(""); + } + } /*! \brief Sigmoid param */ double sigmoid_; /*! \brief Normalize the lambdas or not */ diff --git a/src/objective/regression_objective.hpp b/src/objective/regression_objective.hpp index 71c1a6d7cdfe..eb149756c205 100644 --- a/src/objective/regression_objective.hpp +++ b/src/objective/regression_objective.hpp @@ -24,14 +24,14 @@ namespace LightGBM { for (data_size_t i = 0; i < cnt_data; ++i) { \ ref_data[i] = data_reader(i); \ } \ - const double float_pos = static_cast(1.0 - alpha) * cnt_data; \ - const data_size_t pos = static_cast(float_pos); \ + const double float_pos = static_cast(cnt_data - 1) * (1.0 - alpha); \ + const data_size_t pos = static_cast(float_pos) + 1; \ if (pos < 1) { \ return ref_data[ArrayArgs::ArgMax(ref_data)]; \ } else if (pos >= cnt_data) { \ return ref_data[ArrayArgs::ArgMin(ref_data)]; \ } else { \ - const double bias = float_pos - pos; \ + const double bias = float_pos - (pos - 1); \ if (pos > cnt_data / 2) { \ ArrayArgs::ArgMaxAtK(&ref_data, 0, cnt_data, pos - 1); \ T v1 = ref_data[pos - 1]; \ diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index 163bfc4df9ca..fdf55693a0e9 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -53,6 +53,25 @@ class LeafSplits { weight_ = weight; } + /*! + * \brief Init split on current leaf on partial data. 
+ * \param leaf Index of current leaf + * \param data_partition current data partition + * \param sum_gradients + * \param sum_hessians + * \param sum_gradients_and_hessians + * \param weight + */ + void Init(int leaf, const DataPartition* data_partition, double sum_gradients, + double sum_hessians, int64_t sum_gradients_and_hessians, double weight) { + leaf_index_ = leaf; + data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); + sum_gradients_ = sum_gradients; + sum_hessians_ = sum_hessians; + int_sum_gradients_and_hessians_ = sum_gradients_and_hessians; + weight_ = weight; + } + /*! * \brief Init split on current leaf on partial data. * \param leaf Index of current leaf diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index c322c1a796c2..37d9a2a50713 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -841,32 +841,65 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, #endif // init the leaves that used on next iteration - if (best_split_info.left_count < best_split_info.right_count) { - CHECK_GT(best_split_info.left_count, 0); - smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), - best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian, - best_split_info.left_output); - larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), - best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian, - best_split_info.right_output); + if (!config_->use_quantized_grad) { + if (best_split_info.left_count < best_split_info.right_count) { + CHECK_GT(best_split_info.left_count, 0); + smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, + best_split_info.left_output); + larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, + 
best_split_info.right_output); + } else { + CHECK_GT(best_split_info.right_count, 0); + smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, + best_split_info.right_output); + larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, + best_split_info.left_output); + } } else { - CHECK_GT(best_split_info.right_count, 0); - smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), - best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian, - best_split_info.right_output); - larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), - best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian, - best_split_info.left_output); + if (best_split_info.left_count < best_split_info.right_count) { + CHECK_GT(best_split_info.left_count, 0); + smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, + best_split_info.left_sum_gradient_and_hessian, + best_split_info.left_output); + larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, + best_split_info.right_sum_gradient_and_hessian, + best_split_info.right_output); + } else { + CHECK_GT(best_split_info.right_count, 0); + smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, + best_split_info.right_sum_gradient_and_hessian, + best_split_info.right_output); + larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, + best_split_info.left_sum_gradient_and_hessian, + best_split_info.left_output); + } } if (config_->use_quantized_grad && config_->tree_learner != std::string("data")) { 
gradient_discretizer_->SetNumBitsInHistogramBin(*left_leaf, *right_leaf, data_partition_->leaf_count(*left_leaf), data_partition_->leaf_count(*right_leaf)); } + + #ifdef DEBUG + CheckSplit(best_split_info, *left_leaf, *right_leaf); + #endif + auto leaves_need_update = constraints_->Update( is_numerical_split, *left_leaf, *right_leaf, best_split_info.monotone_type, best_split_info.right_output, @@ -1024,4 +1057,48 @@ std::vector node_used_features = col_sampler_.GetByNode(tree, leaf); *split = bests[best_idx]; } +#ifdef DEBUG +void SerialTreeLearner::CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index) { + data_size_t num_data_in_left = 0; + data_size_t num_data_in_right = 0; + const data_size_t* data_indices_in_left = data_partition_->GetIndexOnLeaf(left_leaf_index, &num_data_in_left); + const data_size_t* data_indices_in_right = data_partition_->GetIndexOnLeaf(right_leaf_index, &num_data_in_right); + if (config_->use_quantized_grad) { + int32_t sum_left_gradient = 0; + int32_t sum_left_hessian = 0; + int32_t sum_right_gradient = 0; + int32_t sum_right_hessian = 0; + const int8_t* discretized_grad_and_hess = gradient_discretizer_->discretized_gradients_and_hessians(); + for (data_size_t i = 0; i < num_data_in_left; ++i) { + const data_size_t index = data_indices_in_left[i]; + sum_left_gradient += discretized_grad_and_hess[2 * index + 1]; + sum_left_hessian += discretized_grad_and_hess[2 * index]; + } + for (data_size_t i = 0; i < num_data_in_right; ++i) { + const data_size_t index = data_indices_in_right[i]; + sum_right_gradient += discretized_grad_and_hess[2 * index + 1]; + sum_right_hessian += discretized_grad_and_hess[2 * index]; + } + Log::Warning("============================ start leaf split info ============================"); + Log::Warning("left_leaf_index = %d, right_leaf_index = %d", left_leaf_index, right_leaf_index); + Log::Warning("num_data_in_left = %d, num_data_in_right = %d", num_data_in_left, 
num_data_in_right); + Log::Warning("sum_left_gradient = %d, best_split_info->left_sum_gradient_and_hessian.gradient = %d", sum_left_gradient, + static_cast(best_split_info.left_sum_gradient_and_hessian >> 32)); + Log::Warning("sum_left_hessian = %d, best_split_info->left_sum_gradient_and_hessian.hessian = %d", sum_left_hessian, + static_cast(best_split_info.left_sum_gradient_and_hessian & 0x00000000ffffffff)); + Log::Warning("sum_right_gradient = %d, best_split_info->right_sum_gradient_and_hessian.gradient = %d", sum_right_gradient, + static_cast(best_split_info.right_sum_gradient_and_hessian >> 32)); + Log::Warning("sum_right_hessian = %d, best_split_info->right_sum_gradient_and_hessian.hessian = %d", sum_right_hessian, + static_cast(best_split_info.right_sum_gradient_and_hessian & 0x00000000ffffffff)); + CHECK_EQ(num_data_in_left, best_split_info.left_count); + CHECK_EQ(num_data_in_right, best_split_info.right_count); + CHECK_EQ(sum_left_gradient, static_cast(best_split_info.left_sum_gradient_and_hessian >> 32)) + CHECK_EQ(sum_left_hessian, static_cast(best_split_info.left_sum_gradient_and_hessian & 0x00000000ffffffff)); + CHECK_EQ(sum_right_gradient, static_cast(best_split_info.right_sum_gradient_and_hessian >> 32)); + CHECK_EQ(sum_right_hessian, static_cast(best_split_info.right_sum_gradient_and_hessian & 0x00000000ffffffff)); + Log::Warning("============================ end leaf split info ============================"); + } +} +#endif + } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index d815d265c0d2..93e0787a90cf 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -171,7 +171,9 @@ class SerialTreeLearner: public TreeLearner { std::set FindAllForceFeatures(Json force_split_leaf_setting); + #ifdef DEBUG void CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index); + #endif /*! 
* \brief Get the number of data in a leaf diff --git a/tests/distributed/_test_distributed.py b/tests/distributed/_test_distributed.py index 9ede4e0800fb..e37dafee6393 100644 --- a/tests/distributed/_test_distributed.py +++ b/tests/distributed/_test_distributed.py @@ -25,7 +25,7 @@ def _find_random_open_port() -> int: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(('', 0)) port = s.getsockname()[1] - return port + return port # noqa: RET504 def _generate_n_ports(n: int) -> Generator[int, None, None]: @@ -47,8 +47,7 @@ def create_data(task: str, n_samples: int = 1_000) -> np.ndarray: X, y = make_blobs(n_samples, centers=centers, random_state=42) elif task == 'regression': X, y = make_regression(n_samples, n_features=4, n_informative=2, random_state=42) - dataset = np.hstack([y.reshape(-1, 1), X]) - return dataset + return np.hstack([y.reshape(-1, 1), X]) class DistributedMockup: @@ -149,8 +148,7 @@ def predict(self, predict_config: Dict[str, Any]) -> np.ndarray: result = subprocess.run(cmd) if result.returncode != 0: raise RuntimeError('Error in prediction') - y_pred = np.loadtxt(str(TESTS_DIR / 'predictions.txt')) - return y_pred + return np.loadtxt(str(TESTS_DIR / 'predictions.txt')) def write_train_config(self, i: int) -> None: """Create a file train{i}.conf with the required configuration to train. 
diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 267041eae2e4..7f8980c271f7 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -723,7 +723,12 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name): pd = pytest.importorskip('pandas') X = np.random.rand(10, 2).astype(dtype) df = pd.DataFrame(X) - built_data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0] + built_data = lgb.basic._data_from_pandas( + data=df, + feature_name=feature_name, + categorical_feature="auto", + pandas_categorical=None + )[0] assert built_data.dtype == dtype assert np.shares_memory(X, built_data) @@ -734,7 +739,12 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name): X = np.random.choice(['a', 'b'], 100).reshape(-1, 1) column_name = 'a' if feature_name == 'auto' else feature_name[0] df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category') - data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0] + data = lgb.basic._data_from_pandas( + data=df, + feature_name=feature_name, + categorical_feature="auto", + pandas_categorical=None + )[0] # check that the original data wasn't modified np.testing.assert_equal(df[column_name], X[:, 0]) # check that the built data has the codes @@ -806,3 +816,10 @@ def test_set_leaf_output(): leaf_output = bst.get_leaf_output(tree_id=0, leaf_id=leaf_id) bst.set_leaf_output(tree_id=0, leaf_id=leaf_id, value=leaf_output + 1) np.testing.assert_allclose(bst.predict(X), y_pred + 1) + + +def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Dataset(): + ds = lgb.Dataset( + data=np.random.randn(100, 3), + ) + assert ds.construct().feature_name == ["Column_0", "Column_1", "Column_2"] diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index cb69440b3cde..9da50945385c 100644 --- a/tests/python_package_test/test_dask.py +++ 
b/tests/python_package_test/test_dask.py @@ -1838,7 +1838,6 @@ def test_distributed_quantized_training(cluster): 'num_grad_quant_bins': 30, 'quant_train_renew_leaf': True, 'verbose': -1, - 'force_row_wise': True, } quant_dask_classifier = lgb.DaskLGBMRegressor( diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index bf2bd6a8b01d..4ef72888e767 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -9,6 +9,7 @@ import re from os import getenv from pathlib import Path +from shutil import copyfile import numpy as np import psutil @@ -19,7 +20,7 @@ from sklearn.model_selection import GroupKFold, TimeSeriesSplit, train_test_split import lightgbm as lgb -from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame +from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame, pd_Series from .utils import (SERIALIZERS, dummy_obj, load_breast_cancer, load_digits, load_iris, logistic_sigmoid, make_synthetic_regression, mse_obj, pickle_and_unpickle_object, sklearn_multiclass_custom_objective, @@ -142,7 +143,7 @@ def test_regression(objective): elif objective == 'quantile': assert ret < 1311 else: - assert ret < 338 + assert ret < 343 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) @@ -747,6 +748,171 @@ def test_ranking_prediction_early_stopping(): np.testing.assert_allclose(ret_early, ret_early_more_strict) +# Simulates position bias for a given ranking dataset. +# The ouput dataset is identical to the input one with the exception for the relevance labels. 
+# The new labels are generated according to an instance of a cascade user model: +# for each query, the user is simulated to be traversing the list of documents ranked by a baseline ranker +# (in our example it is simply the ordering by some feature correlated with relevance, e.g., 34) +# and clicks on that document (new_label=1) with some probability 'pclick' depending on its true relevance; +# at each position the user may stop the traversal with some probability pstop. For the non-clicked documents, +# new_label=0. Thus the generated new labels are biased towards the baseline ranker. +# The positions of the documents in the ranked lists produced by the baseline, are returned. +def simulate_position_bias(file_dataset_in, file_query_in, file_dataset_out, baseline_feature): + # a mapping of a document's true relevance (defined on a 5-grade scale) into the probability of clicking it + def get_pclick(label): + if label == 0: + return 0.4 + elif label == 1: + return 0.6 + elif label == 2: + return 0.7 + elif label == 3: + return 0.8 + else: + return 0.9 + # an instantiation of a cascade model where the user stops with probability 0.2 after observing each document + pstop = 0.2 + + f_dataset_in = open(file_dataset_in, 'r') + f_dataset_out = open(file_dataset_out, 'w') + random.seed(10) + positions_all = [] + for line in open(file_query_in): + docs_num = int (line) + lines = [] + index_values = [] + positions = [0] * docs_num + for index in range(docs_num): + features = f_dataset_in.readline().split() + lines.append(features) + val = 0.0 + for feature_val in features: + feature_val_split = feature_val.split(":") + if int(feature_val_split[0]) == baseline_feature: + val = float(feature_val_split[1]) + index_values.append([index, val]) + index_values.sort(key=lambda x: -x[1]) + stop = False + for pos in range(docs_num): + index = index_values[pos][0] + new_label = 0 + if not stop: + label = int(lines[index][0]) + pclick = get_pclick(label) + if random.random() < pclick: 
+ new_label = 1 + stop = random.random() < pstop + lines[index][0] = str(new_label) + positions[index] = pos + for features in lines: + f_dataset_out.write(' '.join(features) + '\n') + positions_all.extend(positions) + f_dataset_out.close() + return positions_all + + +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Positions in learning to rank is not supported in CUDA version yet') +def test_ranking_with_position_information_with_file(tmp_path): + rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' + params = { + 'objective': 'lambdarank', + 'verbose': -1, + 'eval_at': [3], + 'metric': 'ndcg', + 'bagging_freq': 1, + 'bagging_fraction': 0.9, + 'min_data_in_leaf': 50, + 'min_sum_hessian_in_leaf': 5.0 + } + + # simulate position bias for the train dataset and put the train dataset with biased labels to temp directory + positions = simulate_position_bias(str(rank_example_dir / 'rank.train'), str(rank_example_dir / 'rank.train.query'), str(tmp_path / 'rank.train'), baseline_feature=34) + copyfile(str(rank_example_dir / 'rank.train.query'), str(tmp_path / 'rank.train.query')) + copyfile(str(rank_example_dir / 'rank.test'), str(tmp_path / 'rank.test')) + copyfile(str(rank_example_dir / 'rank.test.query'), str(tmp_path / 'rank.test.query')) + + lgb_train = lgb.Dataset(str(tmp_path / 'rank.train'), params=params) + lgb_valid = [lgb_train.create_valid(str(tmp_path / 'rank.test'))] + gbm_baseline = lgb.train(params, lgb_train, valid_sets = lgb_valid, num_boost_round=50) + + f_positions_out = open(str(tmp_path / 'rank.train.position'), 'w') + for pos in positions: + f_positions_out.write(str(pos) + '\n') + f_positions_out.close() + + lgb_train = lgb.Dataset(str(tmp_path / 'rank.train'), params=params) + lgb_valid = [lgb_train.create_valid(str(tmp_path / 'rank.test'))] + gbm_unbiased_with_file = lgb.train(params, lgb_train, valid_sets = lgb_valid, num_boost_round=50) + + # the performance of the unbiased LambdaMART should outperform 
the plain LambdaMART on the dataset with position bias + assert gbm_baseline.best_score['valid_0']['ndcg@3'] + 0.03 <= gbm_unbiased_with_file.best_score['valid_0']['ndcg@3'] + + # add extra row to position file + with open(str(tmp_path / 'rank.train.position'), 'a') as file: + file.write('pos_1000\n') + file.close() + lgb_train = lgb.Dataset(str(tmp_path / 'rank.train'), params=params) + lgb_valid = [lgb_train.create_valid(str(tmp_path / 'rank.test'))] + with pytest.raises(lgb.basic.LightGBMError, match=r"Positions size \(3006\) doesn't match data size"): + lgb.train(params, lgb_train, valid_sets = lgb_valid, num_boost_round=50) + + +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Positions in learning to rank is not supported in CUDA version yet') +def test_ranking_with_position_information_with_dataset_constructor(tmp_path): + rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' + params = { + 'objective': 'lambdarank', + 'verbose': -1, + 'eval_at': [3], + 'metric': 'ndcg', + 'bagging_freq': 1, + 'bagging_fraction': 0.9, + 'min_data_in_leaf': 50, + 'min_sum_hessian_in_leaf': 5.0, + 'num_threads': 1, + 'deterministic': True, + 'seed': 0 + } + + # simulate position bias for the train dataset and put the train dataset with biased labels to temp directory + positions = simulate_position_bias(str(rank_example_dir / 'rank.train'), str(rank_example_dir / 'rank.train.query'), str(tmp_path / 'rank.train'), baseline_feature=34) + copyfile(str(rank_example_dir / 'rank.train.query'), str(tmp_path / 'rank.train.query')) + copyfile(str(rank_example_dir / 'rank.test'), str(tmp_path / 'rank.test')) + copyfile(str(rank_example_dir / 'rank.test.query'), str(tmp_path / 'rank.test.query')) + + lgb_train = lgb.Dataset(str(tmp_path / 'rank.train'), params=params) + lgb_valid = [lgb_train.create_valid(str(tmp_path / 'rank.test'))] + gbm_baseline = lgb.train(params, lgb_train, valid_sets = lgb_valid, num_boost_round=50) + + positions = 
np.array(positions) + + # test setting positions through Dataset constructor with numpy array + lgb_train = lgb.Dataset(str(tmp_path / 'rank.train'), params=params, position=positions) + lgb_valid = [lgb_train.create_valid(str(tmp_path / 'rank.test'))] + gbm_unbiased = lgb.train(params, lgb_train, valid_sets = lgb_valid, num_boost_round=50) + + # the performance of the unbiased LambdaMART should outperform the plain LambdaMART on the dataset with position bias + assert gbm_baseline.best_score['valid_0']['ndcg@3'] + 0.03 <= gbm_unbiased.best_score['valid_0']['ndcg@3'] + + if PANDAS_INSTALLED: + # test setting positions through Dataset constructor with pandas Series + lgb_train = lgb.Dataset(str(tmp_path / 'rank.train'), params=params, position=pd_Series(positions)) + lgb_valid = [lgb_train.create_valid(str(tmp_path / 'rank.test'))] + gbm_unbiased_pandas_series = lgb.train(params, lgb_train, valid_sets = lgb_valid, num_boost_round=50) + assert gbm_unbiased.best_score['valid_0']['ndcg@3'] == gbm_unbiased_pandas_series.best_score['valid_0']['ndcg@3'] + + # test setting positions through set_position + lgb_train = lgb.Dataset(str(tmp_path / 'rank.train'), params=params) + lgb_valid = [lgb_train.create_valid(str(tmp_path / 'rank.test'))] + lgb_train.set_position(positions) + gbm_unbiased_set_position = lgb.train(params, lgb_train, valid_sets = lgb_valid, num_boost_round=50) + assert gbm_unbiased.best_score['valid_0']['ndcg@3'] == gbm_unbiased_set_position.best_score['valid_0']['ndcg@3'] + + # test get_position works + positions_from_get = lgb_train.get_position() + np.testing.assert_array_equal(positions_from_get, positions) + + def test_early_stopping(): X, y = load_breast_cancer(return_X_y=True) params = { @@ -926,6 +1092,33 @@ def test_early_stopping_min_delta(first_only, single_metric, greater_is_better): assert np.greater_equal(last_score, best_score - min_delta).any() +def test_early_stopping_can_be_triggered_via_custom_callback(): + X, y = 
make_synthetic_regression() + + def _early_stop_after_seventh_iteration(env): + if env.iteration == 6: + exc = lgb.EarlyStopException( + best_iteration=6, + best_score=[("some_validation_set", "some_metric", 0.708, True)] + ) + raise exc + + bst = lgb.train( + params={ + "objective": "regression", + "verbose": -1, + "num_leaves": 2 + }, + train_set=lgb.Dataset(X, label=y), + num_boost_round=23, + callbacks=[_early_stop_after_seventh_iteration] + ) + assert bst.num_trees() == 7 + assert bst.best_score["some_validation_set"]["some_metric"] == 0.708 + assert bst.best_iteration == 7 + assert bst.current_iteration() == 7 + + def test_continue_train(): X, y = make_synthetic_regression() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -1341,6 +1534,203 @@ def train_and_predict(init_model=None, return_model=False): assert ret_origin == pytest.approx(ret) +def test_all_expected_params_are_written_out_to_model_text(tmp_path): + X, y = make_synthetic_regression() + params = { + 'objective': 'mape', + 'metric': ['l2', 'mae'], + 'seed': 708, + 'data_sample_strategy': 'bagging', + 'sub_row': 0.8234, + 'verbose': -1 + } + dtrain = lgb.Dataset(data=X, label=y) + gbm = lgb.train( + params=params, + train_set=dtrain, + num_boost_round=3 + ) + + model_txt_from_memory = gbm.model_to_string() + model_file = tmp_path / "out.model" + gbm.save_model(filename=model_file) + with open(model_file, "r") as f: + model_txt_from_file = f.read() + + assert model_txt_from_memory == model_txt_from_file + + # entries whose values should reflect params passed to lgb.train() + non_default_param_entries = [ + "[objective: mape]", + # 'l1' was passed in with alias 'mae' + "[metric: l2,l1]", + "[data_sample_strategy: bagging]", + "[seed: 708]", + # NOTE: this was passed in with alias 'sub_row' + "[bagging_fraction: 0.8234]", + "[num_iterations: 3]", + ] + + # entries with default values of params + default_param_entries = [ + "[boosting: gbdt]", + 
"[tree_learner: serial]", + "[data: ]", + "[valid: ]", + "[learning_rate: 0.1]", + "[num_leaves: 31]", + "[num_threads: 0]", + "[deterministic: 0]", + "[histogram_pool_size: -1]", + "[max_depth: -1]", + "[min_data_in_leaf: 20]", + "[min_sum_hessian_in_leaf: 0.001]", + "[pos_bagging_fraction: 1]", + "[neg_bagging_fraction: 1]", + "[bagging_freq: 0]", + "[bagging_seed: 15415]", + "[feature_fraction: 1]", + "[feature_fraction_bynode: 1]", + "[feature_fraction_seed: 32671]", + "[extra_trees: 0]", + "[extra_seed: 6642]", + "[early_stopping_round: 0]", + "[first_metric_only: 0]", + "[max_delta_step: 0]", + "[lambda_l1: 0]", + "[lambda_l2: 0]", + "[linear_lambda: 0]", + "[min_gain_to_split: 0]", + "[drop_rate: 0.1]", + "[max_drop: 50]", + "[skip_drop: 0.5]", + "[xgboost_dart_mode: 0]", + "[uniform_drop: 0]", + "[drop_seed: 20623]", + "[top_rate: 0.2]", + "[other_rate: 0.1]", + "[min_data_per_group: 100]", + "[max_cat_threshold: 32]", + "[cat_l2: 10]", + "[cat_smooth: 10]", + "[max_cat_to_onehot: 4]", + "[top_k: 20]", + "[monotone_constraints: ]", + "[monotone_constraints_method: basic]", + "[monotone_penalty: 0]", + "[feature_contri: ]", + "[forcedsplits_filename: ]", + "[refit_decay_rate: 0.9]", + "[cegb_tradeoff: 1]", + "[cegb_penalty_split: 0]", + "[cegb_penalty_feature_lazy: ]", + "[cegb_penalty_feature_coupled: ]", + "[path_smooth: 0]", + "[interaction_constraints: ]", + "[verbosity: -1]", + "[saved_feature_importance_type: 0]", + "[use_quantized_grad: 0]", + "[num_grad_quant_bins: 4]", + "[quant_train_renew_leaf: 0]", + "[stochastic_rounding: 1]", + "[linear_tree: 0]", + "[max_bin: 255]", + "[max_bin_by_feature: ]", + "[min_data_in_bin: 3]", + "[bin_construct_sample_cnt: 200000]", + "[data_random_seed: 2350]", + "[is_enable_sparse: 1]", + "[enable_bundle: 1]", + "[use_missing: 1]", + "[zero_as_missing: 0]", + "[feature_pre_filter: 1]", + "[pre_partition: 0]", + "[two_round: 0]", + "[header: 0]", + "[label_column: ]", + "[weight_column: ]", + "[group_column: ]", + 
"[ignore_column: ]", + "[categorical_feature: ]", + "[forcedbins_filename: ]", + "[precise_float_parser: 0]", + "[parser_config_file: ]", + "[objective_seed: 4309]", + "[num_class: 1]", + "[is_unbalance: 0]", + "[scale_pos_weight: 1]", + "[sigmoid: 1]", + "[boost_from_average: 1]", + "[reg_sqrt: 0]", + "[alpha: 0.9]", + "[fair_c: 1]", + "[poisson_max_delta_step: 0.7]", + "[tweedie_variance_power: 1.5]", + "[lambdarank_truncation_level: 30]", + "[lambdarank_norm: 1]", + "[label_gain: ]", + "[lambdarank_position_bias_regularization: 0]", + "[eval_at: ]", + "[multi_error_top_k: 1]", + "[auc_mu_weights: ]", + "[num_machines: 1]", + "[local_listen_port: 12400]", + "[time_out: 120]", + "[machine_list_filename: ]", + "[machines: ]", + "[gpu_platform_id: -1]", + "[gpu_device_id: -1]", + "[num_gpu: 1]", + ] + all_param_entries = non_default_param_entries + default_param_entries + + # add device-specific entries + # + # passed-in force_col_wise / force_row_wise parameters are ignored on CUDA and GPU builds... 
+ # https://github.com/microsoft/LightGBM/blob/1d7ee63686272bceffd522284127573b511df6be/src/io/config.cpp#L375-L377 + if getenv('TASK', '') == 'cuda': + device_entries = [ + "[force_col_wise: 0]", + "[force_row_wise: 1]", + "[device_type: cuda]", + "[gpu_use_dp: 1]" + ] + elif getenv('TASK', '') == 'gpu': + device_entries = [ + "[force_col_wise: 1]", + "[force_row_wise: 0]", + "[device_type: gpu]", + "[gpu_use_dp: 0]" + ] + else: + device_entries = [ + "[force_col_wise: 0]", + "[force_row_wise: 0]", + "[device_type: cpu]", + "[gpu_use_dp: 0]" + ] + + all_param_entries += device_entries + + # check that model text has all expected param entries + for param_str in all_param_entries: + assert param_str in model_txt_from_file + assert param_str in model_txt_from_memory + + # since Booster.model_to_string() is used when pickling, check that parameters all + # roundtrip pickling successfully too + gbm_pkl = pickle_and_unpickle_object(gbm, serializer="joblib") + model_txt_from_memory = gbm_pkl.model_to_string() + model_file = tmp_path / "out-pkl.model" + gbm_pkl.save_model(filename=model_file) + with open(model_file, "r") as f: + model_txt_from_file = f.read() + + for param_str in all_param_entries: + assert param_str in model_txt_from_file + assert param_str in model_txt_from_memory + + def test_pandas_categorical(): pd = pytest.importorskip("pandas") np.random.seed(42) # sometimes there is no difference how cols are treated (cat or not cat) @@ -1720,8 +2110,7 @@ def generate_trainset_for_monotone_constraints_tests(x3_to_category=True): categorical_features = [] if x3_to_category: categorical_features = [2] - trainset = lgb.Dataset(x, label=y, categorical_feature=categorical_features, free_raw_data=False) - return trainset + return lgb.Dataset(x, label=y, categorical_feature=categorical_features, free_raw_data=False) @pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Monotone constraints are not yet supported by CUDA version') diff --git 
a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index e41719845c0a..2247c9a512d2 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -1561,3 +1561,20 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type ) preds = model.predict(X) assert spearmanr(preds, y).correlation >= 0.99 + + +def test_classifier_fit_detects_classes_every_time(): + rng = np.random.default_rng(seed=123) + nrows = 1000 + ncols = 20 + + X = rng.standard_normal(size=(nrows, ncols)) + y_bin = (rng.random(size=nrows) <= .3).astype(np.float64) + y_multi = rng.integers(4, size=nrows) + + model = lgb.LGBMClassifier(verbose=-1) + for _ in range(2): + model.fit(X, y_multi) + assert model.objective_ == "multiclass" + model.fit(X, y_bin) + assert model.objective_ == "binary" diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py index 29183713d714..df01e29852e7 100644 --- a/tests/python_package_test/utils.py +++ b/tests/python_package_test/utils.py @@ -192,4 +192,4 @@ def pickle_and_unpickle_object(obj, serializer): filepath=tmp_file.name, serializer=serializer ) - return obj_from_disk + return obj_from_disk # noqa: RET504