diff --git a/.appveyor.yml b/.appveyor.yml index dc431ded5018..4cff03d571a1 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,4 +1,4 @@ -version: 4.0.0.99.{build} +version: 4.1.0.99.{build} image: Visual Studio 2015 platform: x64 diff --git a/.ci/lint_r_code.R b/.ci/lint_r_code.R index c13471ca8fb1..12116104ef6d 100755 --- a/.ci/lint_r_code.R +++ b/.ci/lint_r_code.R @@ -33,29 +33,37 @@ LINTERS_TO_USE <- list( , "any_duplicated" = lintr::any_duplicated_linter() , "any_is_na" = lintr::any_is_na_linter() , "assignment" = lintr::assignment_linter() + , "boolean_arithmetic" = lintr::boolean_arithmetic_linter() , "braces" = lintr::brace_linter() , "class_equals" = lintr::class_equals_linter() , "commas" = lintr::commas_linter() , "duplicate_argument" = lintr::duplicate_argument_linter() + , "empty_assignment" = lintr::empty_assignment_linter() , "equals_na" = lintr::equals_na_linter() + , "for_loop_index" = lintr::for_loop_index_linter() , "function_left" = lintr::function_left_parentheses_linter() , "implicit_integers" = lintr::implicit_integer_linter() , "infix_spaces" = lintr::infix_spaces_linter() , "inner_combine" = lintr::inner_combine_linter() + , "is_numeric" = lintr::is_numeric_linter() , "fixed_regex" = lintr::fixed_regex_linter() + , "function_return" = lintr::function_return_linter() + , "lengths" = lintr::lengths_linter() , "literal_coercion" = lintr::literal_coercion_linter() , "long_lines" = lintr::line_length_linter(length = 120L) + , "matrix" = lintr::matrix_apply_linter() , "missing_argument" = lintr::missing_argument_linter() - , "no_tabs" = lintr::no_tab_linter() , "non_portable_path" = lintr::nonportable_path_linter() , "numeric_leading_zero" = lintr::numeric_leading_zero_linter() , "outer_negation" = lintr::outer_negation_linter() , "package_hooks" = lintr::package_hooks_linter() , "paste" = lintr::paste_linter() + , "quotes" = lintr::quotes_linter() + , "redundant_equals" = lintr::redundant_equals_linter() , "regex_subset" = 
lintr::regex_subset_linter() + , "routine_registration" = lintr::routine_registration_linter() , "semicolon" = lintr::semicolon_linter() , "seq" = lintr::seq_linter() - , "single_quotes" = lintr::single_quotes_linter() , "spaces_inside" = lintr::spaces_inside_linter() , "spaces_left_parens" = lintr::spaces_left_parentheses_linter() , "sprintf" = lintr::sprintf_linter() @@ -96,9 +104,11 @@ LINTERS_TO_USE <- list( , "??" = interactive_text ) ) - , "unneeded_concatenation" = lintr::unneeded_concatenation_linter() - , "unreachable_code" = lintr::unreachable_code_linter() - , "vector_logic" = lintr::vector_logic_linter() + , "unnecessary_concatenation" = lintr::unnecessary_concatenation_linter() + , "unnecessary_lambda" = lintr::unnecessary_lambda_linter() + , "unreachable_code" = lintr::unreachable_code_linter() + , "vector_logic" = lintr::vector_logic_linter() + , "whitespace" = lintr::whitespace_linter() ) noquote(paste0(length(FILES_TO_LINT), " R files need linting")) diff --git a/.ci/test-python-oldest.sh b/.ci/test-python-oldest.sh index 09cc24633e15..3a0ea08dddda 100644 --- a/.ci/test-python-oldest.sh +++ b/.ci/test-python-oldest.sh @@ -7,6 +7,7 @@ # echo "installing lightgbm's dependencies" pip install \ + 'dataclasses' \ 'numpy==1.12.0' \ 'pandas==0.24.0' \ 'scikit-learn==0.18.2' \ diff --git a/.ci/test.sh b/.ci/test.sh index 665e7f6546ec..af7cae2e3858 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -73,7 +73,7 @@ if [[ $TASK == "lint" ]]; then cpplint \ isort \ mypy \ - 'r-lintr>=3.0' \ + 'r-lintr>=3.1' \ ruff source activate $CONDA_ENV echo "Linting Python code" @@ -119,15 +119,21 @@ if [[ $TASK == "check-docs" ]] || [[ $TASK == "check-links" ]]; then exit 0 fi +# older versions of Dask are incompatible with pandas>=2.0, but not all conda packages' metadata accurately reflects that +# +# ref: https://github.com/microsoft/LightGBM/issues/6030 +CONSTRAINED_DEPENDENCIES="'dask-core>=2023.5.0' 'distributed>=2023.5.0' 'pandas>=2.0'" +if [[ $PYTHON_VERSION == "3.7" 
]]; then + CONSTRAINED_DEPENDENCIES="'dask-core' 'distributed' 'pandas<2.0'" +fi + # including python=version[build=*cpython] to ensure that conda doesn't fall back to pypy conda create -q -y -n $CONDA_ENV \ + ${CONSTRAINED_DEPENDENCIES} \ cloudpickle \ - dask-core \ - distributed \ joblib \ matplotlib \ numpy \ - pandas \ psutil \ pytest \ ${CONDA_PYTHON_REQUIREMENT} \ diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh index fe435ce11382..e4d70261aa36 100755 --- a/.ci/test_r_package.sh +++ b/.ci/test_r_package.sh @@ -21,9 +21,9 @@ if [[ "${R_MAJOR_VERSION}" == "3" ]]; then export R_LINUX_VERSION="3.6.3-1bionic" export R_APT_REPO="bionic-cran35/" elif [[ "${R_MAJOR_VERSION}" == "4" ]]; then - export R_MAC_VERSION=4.2.2 - export R_MAC_PKG_URL=${CRAN_MIRROR}/bin/macosx/base/R-${R_MAC_VERSION}.pkg - export R_LINUX_VERSION="4.2.2-1.2204.0" + export R_MAC_VERSION=4.3.1 + export R_MAC_PKG_URL=${CRAN_MIRROR}/bin/macosx/big-sur-x86_64/base/R-${R_MAC_VERSION}-x86_64.pkg + export R_LINUX_VERSION="4.3.1-1.2204.0" export R_APT_REPO="jammy-cran40/" else echo "Unrecognized R version: ${R_VERSION}" @@ -36,7 +36,10 @@ fi # # `devscripts` is required for 'checkbashisms' (https://github.com/r-lib/actions/issues/111) if [[ $OS_NAME == "linux" ]]; then + mkdir -p ~/.gnupg + echo "disable-ipv6" >> ~/.gnupg/dirmngr.conf sudo apt-key adv \ + --homedir ~/.gnupg \ --keyserver keyserver.ubuntu.com \ --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 || exit -1 sudo add-apt-repository \ @@ -53,6 +56,7 @@ if [[ $OS_NAME == "linux" ]]; then texlive-latex-recommended \ texlive-fonts-recommended \ texlive-fonts-extra \ + tidy \ qpdf \ || exit -1 diff --git a/.ci/test_r_package_windows.ps1 b/.ci/test_r_package_windows.ps1 index e4d20de50b90..52d47effcad7 100644 --- a/.ci/test_r_package_windows.ps1 +++ b/.ci/test_r_package_windows.ps1 @@ -203,6 +203,19 @@ if ($env:COMPILER -ne "MSVC") { } } +# Checking that the correct R version was used +if ($env:TOOLCHAIN -ne "MSVC") { + $checks = 
Select-String -Path "${LOG_FILE_NAME}" -Pattern "using R version $env:R_WINDOWS_VERSION" + $checks_cnt = $checks.Matches.length +} else { + $checks = Select-String -Path "${INSTALL_LOG_FILE_NAME}" -Pattern "R version passed into FindLibR.* $env:R_WINDOWS_VERSION" + $checks_cnt = $checks.Matches.length +} +if ($checks_cnt -eq 0) { + Write-Output "Wrong R version was found (expected '$env:R_WINDOWS_VERSION'). Check the build logs." + Check-Output $False +} + # Checking that we actually got the expected compiler. The R package has some logic # to fail back to MinGW if MSVC fails, but for CI builds we need to check that the correct # compiler was used. diff --git a/.ci/test_windows.ps1 b/.ci/test_windows.ps1 index 5962a9441346..413af821e065 100644 --- a/.ci/test_windows.ps1 +++ b/.ci/test_windows.ps1 @@ -124,7 +124,7 @@ if (($env:TASK -eq "regular") -or (($env:APPVEYOR -eq "true") -and ($env:TASK -e cd $env:BUILD_SOURCESDIRECTORY/examples/python-guide @("import matplotlib", "matplotlib.use('Agg')") + (Get-Content "plot_example.py") | Set-Content "plot_example.py" (Get-Content "plot_example.py").replace('graph.render(view=True)', 'graph.render(view=False)') | Set-Content "plot_example.py" # prevent interactive window mode - conda install -q -y -n $env:CONDA_ENV h5py ipywidgets notebook + conda install -q -y -n $env:CONDA_ENV "h5py>3.0" ipywidgets notebook foreach ($file in @(Get-ChildItem *.py)) { @("import sys, warnings", "warnings.showwarning = lambda message, category, filename, lineno, file=None, line=None: sys.stdout.write(warnings.formatwarning(message, category, filename, lineno, line))") + (Get-Content $file) | Set-Content $file python $file ; Check-Output $? 
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index ce6da9f6e7fb..02b5cfbdae23 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -7,56 +7,4 @@ # offer a reasonable automatic best-guess # catch-all rule (this only gets matched if no rules below match) -* @guolinke @StrikerRUS @jameslamb @shiyu1994 - -# other catch-alls that will get matched if specific rules below are not matched -*.R @jameslamb @jmoralez -*.py @StrikerRUS @jmoralez @jameslamb @shiyu1994 -*.cpp @guolinke @shiyu1994 -*.h @guolinke @shiyu1994 - -# main C++ code -include/ @guolinke @shiyu1994 -src/ @guolinke @shiyu1994 -CMakeLists.txt @guolinke @jameslamb @StrikerRUS @shiyu1994 -tests/c_api_test/ @guolinke @shiyu1994 -tests/cpp_tests/ @guolinke @shiyu1994 -tests/data/ @guolinke @shiyu1994 -windows/ @guolinke @StrikerRUS @shiyu1994 - -# R code -build_r.R @jameslamb @StrikerRUS @jmoralez -build-cran-package.sh @jameslamb @StrikerRUS @jmoralez -R-package/ @jameslamb @jmoralez - -# Python code -python-package/ @StrikerRUS @shiyu1994 @jameslamb @jmoralez - -# Dask integration -python-package/lightgbm/dask.py @jameslamb @jmoralez -tests/python_package_test/test_dask.py @jameslamb @jmoralez - -# helpers -helpers/ @StrikerRUS @guolinke - -# CI administrative stuff -.ci/ @StrikerRUS @jameslamb -docs/ @StrikerRUS @jameslamb -examples/ @StrikerRUS @jameslamb @guolinke @jmoralez -*.yml @StrikerRUS @jameslamb -.vsts-ci.yml @StrikerRUS @jameslamb - -# docker setup -docker/ @StrikerRUS @jameslamb -docker/dockerfile-cli @guolinke @shiyu1994 @StrikerRUS @jameslamb -docker/gpu/ @StrikerRUS @jameslamb -docker/dockerfile-python @StrikerRUS @shiyu1994 @jameslamb @jmoralez -docker/dockerfile-r @jameslamb @jmoralez - -# GPU code -docs/GPU-*.rst @shiyu1994 @guolinke -src/treelearner/gpu_tree_learner.cpp @guolinke @shiyu1994 -src/treelearner/tree_learner.cpp @guolinke @shiyu1994 - -# JAVA code -swig/ @guolinke @shiyu1994 +* @guolinke @jameslamb @shiyu1994 @jmoralez diff --git a/.github/workflows/lock.yml 
b/.github/workflows/lock.yml new file mode 100644 index 000000000000..72d8b7c2f585 --- /dev/null +++ b/.github/workflows/lock.yml @@ -0,0 +1,44 @@ +name: 'Lock Inactive Threads' + +on: + schedule: + # midnight UTC, every Wednesday + - cron: '0 0 * * 3' + # allow manual triggering from GitHub UI + workflow_dispatch: + +permissions: + issues: write + pull-requests: write + +concurrency: + group: lock + +jobs: + action: + runs-on: ubuntu-latest + steps: + - uses: dessant/lock-threads@v4 + with: + github-token: ${{ github.token }} + # after how many days of inactivity should a closed issue/PR be locked? + issue-inactive-days: '90' + pr-inactive-days: '90' + # do not close feature request issues... + # we close those but track them in https://github.com/microsoft/LightGBM/issues/2302 + exclude-any-issue-labels: '"feature request"' + # what labels should be removed prior to locking? + remove-issue-labels: 'awaiting response,awaiting review,blocking,in progress' + remove-pr-labels: 'awaiting response,awaiting review,blocking,in progress' + # what message should be posted prior to locking? + issue-comment: > + This issue has been automatically locked since there has not been any recent activity since it was closed. + To start a new related discussion, open a new issue at https://github.com/microsoft/LightGBM/issues + including a reference to this. + pr-comment: > + This pull request has been automatically locked since there has not been any recent activity since it was closed. + To start a new related discussion, open a new issue at https://github.com/microsoft/LightGBM/issues + including a reference to this. + # what should the locking status be? 
+ issue-lock-reason: 'resolved' + pr-lock-reason: 'resolved' diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml index eb2cb90a424e..838528617143 100644 --- a/.github/workflows/r_package.yml +++ b/.github/workflows/r_package.yml @@ -48,7 +48,7 @@ jobs: - os: ubuntu-latest task: r-package compiler: gcc - r_version: 4.2 + r_version: 4.3 build_type: cmake container: 'ubuntu:22.04' - os: ubuntu-latest @@ -60,19 +60,19 @@ jobs: - os: ubuntu-latest task: r-package compiler: clang - r_version: 4.2 + r_version: 4.3 build_type: cmake container: 'ubuntu:22.04' - os: macOS-latest task: r-package compiler: gcc - r_version: 4.2 + r_version: 4.3 build_type: cmake container: null - os: macOS-latest task: r-package compiler: clang - r_version: 4.2 + r_version: 4.3 build_type: cmake container: null - os: windows-latest @@ -125,13 +125,13 @@ jobs: - os: ubuntu-latest task: r-package compiler: gcc - r_version: 4.2 + r_version: 4.3 build_type: cran container: 'ubuntu:22.04' - os: macOS-latest task: r-package compiler: clang - r_version: 4.2 + r_version: 4.3 build_type: cran container: null ################ @@ -140,7 +140,7 @@ jobs: - os: ubuntu-latest task: r-rchk compiler: gcc - r_version: 4.2 + r_version: 4.3 build_type: cran container: 'ubuntu:22.04' steps: diff --git a/.gitignore b/.gitignore index d4045d9a4798..bcf6f48b4cea 100644 --- a/.gitignore +++ b/.gitignore @@ -139,8 +139,6 @@ publish/ # Publish Web Output *.[Pp]ublish.xml *.azurePubxml -# TODO: Comment the next line if you want to checkin your web deploy settings -# but database connection strings (with potential passwords) will be unencrypted *.pubxml *.publishproj diff --git a/CMakeLists.txt b/CMakeLists.txt index 5087d6a8fddb..6705ef130052 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -326,6 +326,13 @@ if(UNIX OR MINGW OR CYGWIN) CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type" ) + if(MINGW) + # ignore 
this warning: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95353 + set( + CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -Wno-stringop-overflow" + ) + endif() if(USE_DEBUG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0") else() diff --git a/R-package/R/callback.R b/R-package/R/callback.R index 50f36be4a2be..e428dfb79eea 100644 --- a/R-package/R/callback.R +++ b/R-package/R/callback.R @@ -24,7 +24,7 @@ CB_ENV <- R6::R6Class( ) # Format the evaluation metric string -format.eval.string <- function(eval_res, eval_err) { +.format_eval_string <- function(eval_res, eval_err) { # Check for empty evaluation string if (is.null(eval_res) || length(eval_res) == 0L) { @@ -40,7 +40,7 @@ format.eval.string <- function(eval_res, eval_err) { } -merge.eval.string <- function(env) { +.merge_eval_string <- function(env) { # Check length of evaluation list if (length(env$eval_list) <= 0L) { @@ -63,7 +63,7 @@ merge.eval.string <- function(env) { } # Set error message - msg <- c(msg, format.eval.string(eval_res = env$eval_list[[j]], eval_err = eval_err)) + msg <- c(msg, .format_eval_string(eval_res = env$eval_list[[j]], eval_err = eval_err)) } @@ -86,11 +86,11 @@ cb_print_evaluation <- function(period) { if ((i - 1L) %% period == 0L || is.element(i, c(env$begin_iteration, env$end_iteration))) { # Merge evaluation string - msg <- merge.eval.string(env = env) + msg <- .merge_eval_string(env = env) # Check if message is existing if (nchar(msg) > 0L) { - print(merge.eval.string(env = env)) + print(.merge_eval_string(env = env)) } } @@ -270,7 +270,7 @@ cb_early_stop <- function(stopping_rounds, first_metric_only, verbose) { # Prepare to print if verbose if (verbose) { - best_msg[[i]] <<- as.character(merge.eval.string(env = env)) + best_msg[[i]] <<- as.character(.merge_eval_string(env = env)) } } else { diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 949038fde622..755b171724f9 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -928,6 +928,7 @@ NULL #' , 
metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' valids <- list(test = dtest) #' model <- lgb.train( @@ -1086,7 +1087,10 @@ predict.lgb.Booster <- function(object, #' X <- as.matrix(mtcars[, -1L]) #' y <- mtcars[, 1L] #' dtrain <- lgb.Dataset(X, label = y, params = list(max_bin = 5L)) -#' params <- list(min_data_in_leaf = 2L) +#' params <- list( +#' min_data_in_leaf = 2L +#' , num_threads = 2L +#' ) #' model <- lgb.train( #' params = params #' , data = dtrain @@ -1231,6 +1235,7 @@ summary.lgb.Booster <- function(object, ...) { #' , metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' valids <- list(test = dtest) #' model <- lgb.train( @@ -1296,6 +1301,7 @@ lgb.load <- function(filename = NULL, model_str = NULL) { #' , metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' valids <- list(test = dtest) #' model <- lgb.train( @@ -1351,6 +1357,7 @@ lgb.save <- function(booster, filename, num_iteration = NULL) { #' , metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' valids <- list(test = dtest) #' model <- lgb.train( @@ -1401,6 +1408,7 @@ lgb.dump <- function(booster, num_iteration = NULL) { #' , metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' valids <- list(test = dtest) #' model <- lgb.train( diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index 4df0acbdf005..e2892ea4bae0 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -494,11 +494,10 @@ Dataset <- R6::R6Class( if (info_len > 0L) { # Get back fields - ret <- NULL - ret <- if (field_name == "group") { - integer(info_len) + if (field_name == "group") { + ret <- integer(info_len) } else { - numeric(info_len) + ret <- numeric(info_len) } .Call( diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index 7f036c9726b6..0b7b39e2d8c2 100644 --- a/R-package/R/lgb.Predictor.R +++ 
b/R-package/R/lgb.Predictor.R @@ -98,8 +98,6 @@ Predictor <- R6::R6Class( start_iteration <- 0L } - num_row <- 0L - # Check if data is a file name and not a matrix if (identical(class(data), "character") && length(data) == 1L) { diff --git a/R-package/R/lgb.convert_with_rules.R b/R-package/R/lgb.convert_with_rules.R index f282fa3152fc..f024e9dfe6e9 100644 --- a/R-package/R/lgb.convert_with_rules.R +++ b/R-package/R/lgb.convert_with_rules.R @@ -116,10 +116,6 @@ lgb.convert_with_rules <- function(data, rules = NULL) { column_classes <- .get_column_classes(df = data) - is_char <- which(column_classes == "character") - is_factor <- which(column_classes == "factor") - is_logical <- which(column_classes == "logical") - is_data_table <- data.table::is.data.table(x = data) is_data_frame <- is.data.frame(data) diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index cf88100db399..f81026fe673f 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -59,6 +59,7 @@ CVBooster <- R6::R6Class( #' , metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' model <- lgb.cv( #' params = params @@ -224,8 +225,6 @@ lgb.cv <- function(params = list() stop(sQuote("folds"), " must be a list with 2 or more elements that are vectors of indices for each CV-fold") } - nfold <- length(folds) - } else { if (nfold <= 1L) { diff --git a/R-package/R/lgb.importance.R b/R-package/R/lgb.importance.R index c05c6628be34..5a58770553f9 100644 --- a/R-package/R/lgb.importance.R +++ b/R-package/R/lgb.importance.R @@ -24,6 +24,7 @@ #' , max_depth = -1L #' , min_data_in_leaf = 1L #' , min_sum_hessian_in_leaf = 1.0 +#' , num_threads = 2L #' ) #' model <- lgb.train( #' params = params diff --git a/R-package/R/lgb.interprete.R b/R-package/R/lgb.interprete.R index 70aac8760485..7de772664d8b 100644 --- a/R-package/R/lgb.interprete.R +++ b/R-package/R/lgb.interprete.R @@ -35,6 +35,7 @@ #' , max_depth = -1L #' , min_data_in_leaf = 1L #' , min_sum_hessian_in_leaf = 1.0 
+#' , num_threads = 2L #' ) #' model <- lgb.train( #' params = params @@ -71,7 +72,9 @@ lgb.interprete <- function(model, leaf_index_dt <- data.table::as.data.table(x = pred_mat) leaf_index_mat_list <- lapply( X = leaf_index_dt - , FUN = function(x) matrix(x, ncol = num_class, byrow = TRUE) + , FUN = matrix + , ncol = num_class + , byrow = TRUE ) # Get list of trees diff --git a/R-package/R/lgb.model.dt.tree.R b/R-package/R/lgb.model.dt.tree.R index 871f8f1d24bf..8b0d8d81e2e8 100644 --- a/R-package/R/lgb.model.dt.tree.R +++ b/R-package/R/lgb.model.dt.tree.R @@ -40,6 +40,7 @@ #' , max_depth = -1L #' , min_data_in_leaf = 1L #' , min_sum_hessian_in_leaf = 1.0 +#' , num_threads = 2L #' ) #' model <- lgb.train(params, dtrain, 10L) #' diff --git a/R-package/R/lgb.plot.importance.R b/R-package/R/lgb.plot.importance.R index 66eafd73a731..fc59ebd0efec 100644 --- a/R-package/R/lgb.plot.importance.R +++ b/R-package/R/lgb.plot.importance.R @@ -28,6 +28,7 @@ #' , learning_rate = 0.1 #' , min_data_in_leaf = 1L #' , min_sum_hessian_in_leaf = 1.0 +#' , num_threads = 2L #' ) #' #' model <- lgb.train( diff --git a/R-package/R/lgb.plot.interpretation.R b/R-package/R/lgb.plot.interpretation.R index 86d8b682725f..a88f14bf83f0 100644 --- a/R-package/R/lgb.plot.interpretation.R +++ b/R-package/R/lgb.plot.interpretation.R @@ -39,6 +39,7 @@ #' , max_depth = -1L #' , min_data_in_leaf = 1L #' , min_sum_hessian_in_leaf = 1.0 +#' , num_threads = 2L #' ) #' model <- lgb.train( #' params = params diff --git a/R-package/R/lgb.restore_handle.R b/R-package/R/lgb.restore_handle.R index dcb167608888..4de93d46c96a 100644 --- a/R-package/R/lgb.restore_handle.R +++ b/R-package/R/lgb.restore_handle.R @@ -23,7 +23,9 @@ #' , agaricus.train$label #' , params = list(objective = "binary") #' , nrounds = 5L -#' , verbose = 0) +#' , verbose = 0 +#' , num_threads = 2L +#' ) #' fname <- tempfile(fileext="rds") #' saveRDS(model, fname) #' diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index 
8cf3a95eaf2e..20916c9844b5 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -30,6 +30,7 @@ #' , metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' valids <- list(test = dtest) #' model <- lgb.train( @@ -153,6 +154,9 @@ lgb.train <- function(params = list(), # Construct datasets, if needed data$update_params(params = params) + if (!is.null(categorical_feature)) { + data$set_categorical_feature(categorical_feature) + } data$construct() # Check interaction constraints @@ -178,11 +182,6 @@ lgb.train <- function(params = list(), data$set_colnames(colnames) } - # Write categorical features - if (!is.null(categorical_feature)) { - data$set_categorical_feature(categorical_feature) - } - valid_contain_train <- FALSE train_data_name <- "train" reduced_valid_sets <- list() diff --git a/R-package/R/lightgbm.R b/R-package/R/lightgbm.R index cb3ef31e8afa..711b3ef0dc38 100644 --- a/R-package/R/lightgbm.R +++ b/R-package/R/lightgbm.R @@ -116,7 +116,7 @@ NULL #' \item If passing a factor with more than two variables, will use objective \code{"multiclass"} #' (note that parameter \code{num_class} in this case will also be determined automatically from #' \code{label}). -#' \item Otherwise, will use objective \code{"regression"}. +#' \item Otherwise (or if passing \code{lgb.Dataset} as input), will use objective \code{"regression"}. 
#' } #' #' \emph{New in version 4.0.0} @@ -211,6 +211,9 @@ lightgbm <- function(data, rm(temp) } else { data_processor <- NULL + if (objective == "auto") { + objective <- "regression" + } } # Set data to a temporary variable diff --git a/R-package/R/readRDS.lgb.Booster.R b/R-package/R/readRDS.lgb.Booster.R index a995d804adc5..a8abac642c24 100644 --- a/R-package/R/readRDS.lgb.Booster.R +++ b/R-package/R/readRDS.lgb.Booster.R @@ -23,6 +23,7 @@ #' , metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' valids <- list(test = dtest) #' model <- lgb.train( diff --git a/R-package/R/saveRDS.lgb.Booster.R b/R-package/R/saveRDS.lgb.Booster.R index eb71e7a2f08c..5d3af097301f 100644 --- a/R-package/R/saveRDS.lgb.Booster.R +++ b/R-package/R/saveRDS.lgb.Booster.R @@ -33,6 +33,7 @@ #' , metric = "l2" #' , min_data = 1L #' , learning_rate = 1.0 +#' , num_threads = 2L #' ) #' valids <- list(test = dtest) #' model <- lgb.train( diff --git a/R-package/configure b/R-package/configure index 867ef2d395a6..39a18d669833 100755 --- a/R-package/configure +++ b/R-package/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for lightgbm 4.0.0.99. +# Generated by GNU Autoconf 2.71 for lightgbm 4.1.0.99. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -607,8 +607,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='lightgbm' PACKAGE_TARNAME='lightgbm' -PACKAGE_VERSION='4.0.0.99' -PACKAGE_STRING='lightgbm 4.0.0.99' +PACKAGE_VERSION='4.1.0.99' +PACKAGE_STRING='lightgbm 4.1.0.99' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1211,7 +1211,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures lightgbm 4.0.0.99 to adapt to many kinds of systems. 
+\`configure' configures lightgbm 4.1.0.99 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1273,7 +1273,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of lightgbm 4.0.0.99:";; + short | recursive ) echo "Configuration of lightgbm 4.1.0.99:";; esac cat <<\_ACEOF @@ -1341,7 +1341,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -lightgbm configure 4.0.0.99 +lightgbm configure 4.1.0.99 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1378,7 +1378,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by lightgbm $as_me 4.0.0.99, which was +It was created by lightgbm $as_me 4.1.0.99, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -2454,7 +2454,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by lightgbm $as_me 4.0.0.99, which was +This file was extended by lightgbm $as_me 4.1.0.99, which was generated by GNU Autoconf 2.71. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -2509,7 +2509,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -lightgbm config.status 4.0.0.99 +lightgbm config.status 4.1.0.99 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/R-package/cran-comments.md b/R-package/cran-comments.md index 6fa74cdac4cb..44b8ed391bfc 100644 --- a/R-package/cran-comments.md +++ b/R-package/cran-comments.md @@ -1,5 +1,66 @@ # CRAN Submission History +## v4.1.0 - not submitted + +v4.1.0 was not submitted to CRAN, because https://github.com/microsoft/LightGBM/issues/5987 had not been resolved. + +## v4.0.0 - Submission 2 - (July 19, 2023) + +### CRAN response + +> Dear maintainer, +> package lightgbm_4.0.0.tar.gz does not pass the incoming checks automatically. + +The logs linked from those messages showed one issue remaining on Debian (0 on Windows). + +```text +* checking examples ... [7s/4s] NOTE +Examples with CPU time > 2.5 times elapsed time + user system elapsed ratio +lgb.restore_handle 1.206 0.085 0.128 10.08 +``` + +### Maintainer Notes + +Chose to document the issue and need for a fix in https://github.com/microsoft/LightGBM/issues/5987, but not resubmit, +to avoid annoying CRAN maintainers. + +## v4.0.0 - Submission 1 - (July 16, 2023) + +### CRAN response + +> Dear maintainer, +> package lightgbm_4.0.0.tar.gz does not pass the incoming checks automatically. + +The logs linked from those messages showed the following issues from `R CMD check`. + +```text +* checking S3 generic/method consistency ... NOTE +Mismatches for apparent methods not registered: +merge: + function(x, y, ...) +merge.eval.string: + function(env) + +format: + function(x, ...) +format.eval.string: + function(eval_res, eval_err) +See section 'Registering S3 methods' in the 'Writing R Extensions' +manual. 
+``` + +```text +* checking examples ... [8s/4s] NOTE +Examples with CPU time > 2.5 times elapsed time + user system elapsed ratio +lgb.restore_handle 1.819 0.128 0.165 11.8 +``` + +### Maintainer Notes + +Attempted to fix these with https://github.com/microsoft/LightGBM/pull/5988 and resubmitted. + ## v3.3.5 - Submission 2 - (January 16, 2023) ### CRAN response diff --git a/R-package/man/lgb.configure_fast_predict.Rd b/R-package/man/lgb.configure_fast_predict.Rd index a228aad42e21..39fe6afa6b18 100644 --- a/R-package/man/lgb.configure_fast_predict.Rd +++ b/R-package/man/lgb.configure_fast_predict.Rd @@ -119,7 +119,10 @@ data(mtcars) X <- as.matrix(mtcars[, -1L]) y <- mtcars[, 1L] dtrain <- lgb.Dataset(X, label = y, params = list(max_bin = 5L)) -params <- list(min_data_in_leaf = 2L) +params <- list( + min_data_in_leaf = 2L + , num_threads = 2L +) model <- lgb.train( params = params , data = dtrain diff --git a/R-package/man/lgb.cv.Rd b/R-package/man/lgb.cv.Rd index 0e6db2e2cb0f..555cb11c7bb3 100644 --- a/R-package/man/lgb.cv.Rd +++ b/R-package/man/lgb.cv.Rd @@ -160,6 +160,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) model <- lgb.cv( params = params diff --git a/R-package/man/lgb.dump.Rd b/R-package/man/lgb.dump.Rd index c9b242a812e3..f4e90242fd75 100644 --- a/R-package/man/lgb.dump.Rd +++ b/R-package/man/lgb.dump.Rd @@ -31,6 +31,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) valids <- list(test = dtest) model <- lgb.train( diff --git a/R-package/man/lgb.get.eval.result.Rd b/R-package/man/lgb.get.eval.result.Rd index cb54217bc42d..9c2293a0f909 100644 --- a/R-package/man/lgb.get.eval.result.Rd +++ b/R-package/man/lgb.get.eval.result.Rd @@ -45,6 +45,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) valids <- list(test = dtest) model <- lgb.train( diff --git a/R-package/man/lgb.importance.Rd 
b/R-package/man/lgb.importance.Rd index 2fd5d4938de5..89a3d4e6b5b7 100644 --- a/R-package/man/lgb.importance.Rd +++ b/R-package/man/lgb.importance.Rd @@ -35,6 +35,7 @@ params <- list( , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 + , num_threads = 2L ) model <- lgb.train( params = params diff --git a/R-package/man/lgb.interprete.Rd b/R-package/man/lgb.interprete.Rd index 6431a5011f48..c1166b2c1cc9 100644 --- a/R-package/man/lgb.interprete.Rd +++ b/R-package/man/lgb.interprete.Rd @@ -48,6 +48,7 @@ params <- list( , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 + , num_threads = 2L ) model <- lgb.train( params = params diff --git a/R-package/man/lgb.load.Rd b/R-package/man/lgb.load.Rd index 6031ff8e55bb..c1a00a20974b 100644 --- a/R-package/man/lgb.load.Rd +++ b/R-package/man/lgb.load.Rd @@ -31,6 +31,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) valids <- list(test = dtest) model <- lgb.train( diff --git a/R-package/man/lgb.model.dt.tree.Rd b/R-package/man/lgb.model.dt.tree.Rd index c5c88156ff4d..4d02ede9a001 100644 --- a/R-package/man/lgb.model.dt.tree.Rd +++ b/R-package/man/lgb.model.dt.tree.Rd @@ -51,6 +51,7 @@ params <- list( , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 + , num_threads = 2L ) model <- lgb.train(params, dtrain, 10L) diff --git a/R-package/man/lgb.plot.importance.Rd b/R-package/man/lgb.plot.importance.Rd index 4b915e35fc86..302f46460e3f 100644 --- a/R-package/man/lgb.plot.importance.Rd +++ b/R-package/man/lgb.plot.importance.Rd @@ -47,6 +47,7 @@ params <- list( , learning_rate = 0.1 , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 + , num_threads = 2L ) model <- lgb.train( diff --git a/R-package/man/lgb.plot.interpretation.Rd b/R-package/man/lgb.plot.interpretation.Rd index 2d7416561f23..a914071e896f 100644 --- a/R-package/man/lgb.plot.interpretation.Rd +++ b/R-package/man/lgb.plot.interpretation.Rd @@ -58,6 
+58,7 @@ params <- list( , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 + , num_threads = 2L ) model <- lgb.train( params = params diff --git a/R-package/man/lgb.restore_handle.Rd b/R-package/man/lgb.restore_handle.Rd index bbe6f70c85de..95cbdc64485d 100644 --- a/R-package/man/lgb.restore_handle.Rd +++ b/R-package/man/lgb.restore_handle.Rd @@ -34,7 +34,9 @@ model <- lightgbm( , agaricus.train$label , params = list(objective = "binary") , nrounds = 5L - , verbose = 0) + , verbose = 0 + , num_threads = 2L +) fname <- tempfile(fileext="rds") saveRDS(model, fname) diff --git a/R-package/man/lgb.save.Rd b/R-package/man/lgb.save.Rd index 0736c26ab3f6..efd110c7d816 100644 --- a/R-package/man/lgb.save.Rd +++ b/R-package/man/lgb.save.Rd @@ -33,6 +33,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) valids <- list(test = dtest) model <- lgb.train( diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd index 30589ef34e54..0f2961edc415 100644 --- a/R-package/man/lgb.train.Rd +++ b/R-package/man/lgb.train.Rd @@ -141,6 +141,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) valids <- list(test = dtest) model <- lgb.train( diff --git a/R-package/man/lightgbm.Rd b/R-package/man/lightgbm.Rd index 88f3e3188fec..09d7704605c1 100644 --- a/R-package/man/lightgbm.Rd +++ b/R-package/man/lightgbm.Rd @@ -68,7 +68,7 @@ set to the iteration number of the best iteration.} \item If passing a factor with more than two variables, will use objective \code{"multiclass"} (note that parameter \code{num_class} in this case will also be determined automatically from \code{label}). - \item Otherwise, will use objective \code{"regression"}. + \item Otherwise (or if passing \code{lgb.Dataset} as input), will use objective \code{"regression"}. 
} \emph{New in version 4.0.0}} diff --git a/R-package/man/predict.lgb.Booster.Rd b/R-package/man/predict.lgb.Booster.Rd index f8043767be43..2df13b9bc374 100644 --- a/R-package/man/predict.lgb.Booster.Rd +++ b/R-package/man/predict.lgb.Booster.Rd @@ -132,6 +132,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) valids <- list(test = dtest) model <- lgb.train( diff --git a/R-package/man/readRDS.lgb.Booster.Rd b/R-package/man/readRDS.lgb.Booster.Rd index 5a1c647a0f74..6a8e4c80ca91 100644 --- a/R-package/man/readRDS.lgb.Booster.Rd +++ b/R-package/man/readRDS.lgb.Booster.Rd @@ -34,6 +34,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) valids <- list(test = dtest) model <- lgb.train( diff --git a/R-package/man/saveRDS.lgb.Booster.Rd b/R-package/man/saveRDS.lgb.Booster.Rd index e730f36b2caf..a8664243dce2 100644 --- a/R-package/man/saveRDS.lgb.Booster.Rd +++ b/R-package/man/saveRDS.lgb.Booster.Rd @@ -57,6 +57,7 @@ params <- list( , metric = "l2" , min_data = 1L , learning_rate = 1.0 + , num_threads = 2L ) valids <- list(test = dtest) model <- lgb.train( diff --git a/R-package/tests/testthat/helper.R b/R-package/tests/testthat/helper.R new file mode 100644 index 000000000000..9c928c1f71d1 --- /dev/null +++ b/R-package/tests/testthat/helper.R @@ -0,0 +1,48 @@ +# ref for this file: +# +# * https://r-pkgs.org/testing-design.html#testthat-helper-files +# * https://r-pkgs.org/testing-design.html#testthat-setup-files + +# LightGBM-internal fix to comply with CRAN policy of only using up to 2 threads in tests and example. +# +# per https://cran.r-project.org/web/packages/policies.html +# +# > If running a package uses multiple threads/cores it must never use more than two simultaneously: +# the check farm is a shared resource and will typically be running many checks simultaneously. 
+# +.LGB_MAX_THREADS <- 2L + +# by default, how much should results in tests be allowed to differ from hard-coded expected numbers? +.LGB_NUMERIC_TOLERANCE <- 1e-6 + +# are the tests running on Windows? +.LGB_ON_WINDOWS <- .Platform$OS.type == "windows" +.LGB_ON_32_BIT_WINDOWS <- .LGB_ON_WINDOWS && .Machine$sizeof.pointer != 8L + +# are the tests running in a UTF-8 locale? +.LGB_UTF8_LOCALE <- all(endsWith( + Sys.getlocale(category = "LC_CTYPE") + , "UTF-8" +)) + +# control how many loud LightGBM's logger is in tests +.LGB_VERBOSITY <- as.integer( + Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") +) + +# [description] +# test that every element of 'x' is in 'y' +# +# testthat::expect_in() is not available in version of {testthat} +# built for R 3.6, this is here to support a similar interface on R 3.6 +.expect_in <- function(x, y) { + if (exists("expect_in")) { + expect_in(x, y) + } else { + missing_items <- x[!(x %in% y)] + if (length(missing_items) != 0L) { + error_msg <- paste0("Some expected items not found: ", toString(missing_items)) + stop(error_msg) + } + } +} diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index a5003f086cbd..90be1d08cf67 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -1,11 +1,5 @@ library(Matrix) -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - -TOLERANCE <- 1e-6 - test_that("Predictor$finalize() should not fail", { X <- as.matrix(as.integer(iris[, "Species"]), ncol = 1L) y <- iris[["Sepal.Length"]] @@ -14,8 +8,9 @@ test_that("Predictor$finalize() should not fail", { data = dtrain , params = list( objective = "regression" + , num_threads = .LGB_MAX_THREADS ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 3L ) model_file <- tempfile(fileext = ".model") @@ -42,8 +37,9 @@ test_that("predictions do not fail for integer input", { data = dtrain , params = list( objective = "regression" + , 
num_threads = .LGB_MAX_THREADS ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 3L ) X_double <- X[c(1L, 51L, 101L), , drop = FALSE] @@ -76,7 +72,8 @@ test_that("start_iteration works correctly", { num_leaves = 4L , learning_rate = 0.6 , objective = "binary" - , verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 50L , valids = list("test" = dtest) @@ -125,8 +122,8 @@ test_that("Feature contributions from sparse inputs produce sparse outputs", { data = dtrain , obj = "regression" , nrounds = 5L - , verbose = VERBOSITY - , params = list(min_data_in_leaf = 5L) + , verbose = .LGB_VERBOSITY + , params = list(min_data_in_leaf = 5L, num_threads = .LGB_MAX_THREADS) ) pred_dense <- predict(bst, X, type = "contrib") @@ -156,8 +153,8 @@ test_that("Sparse feature contribution predictions do not take inputs with wrong data = dtrain , obj = "regression" , nrounds = 5L - , verbose = VERBOSITY - , params = list(min_data_in_leaf = 5L) + , verbose = .LGB_VERBOSITY + , params = list(min_data_in_leaf = 5L, num_threads = .LGB_MAX_THREADS) ) X_wrong <- X[, c(1L:10L, 1L:10L)] @@ -186,8 +183,8 @@ test_that("Feature contribution predictions do not take non-general CSR or CSC i data = dtrain , obj = "regression" , nrounds = 5L - , verbose = VERBOSITY - , params = list(min_data_in_leaf = 5L) + , verbose = .LGB_VERBOSITY + , params = list(min_data_in_leaf = 5L, num_threads = .LGB_MAX_THREADS) ) expect_error(predict(bst, SmatC, type = "contrib")) @@ -211,16 +208,17 @@ test_that("predict() params should override keyword argument for raw-score predi objective = "binary" , min_data_in_leaf = 1L , seed = 708L + , num_threads = .LGB_MAX_THREADS ) , nrounds = 10L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) # check that the predictions from predict.lgb.Booster() really look like raw score predictions preds_prob <- predict(bst, X) preds_raw_s3_keyword <- predict(bst, X, type = "raw") preds_prob_from_raw <- 1.0 / (1.0 + 
exp(-preds_raw_s3_keyword)) - expect_equal(preds_prob, preds_prob_from_raw, tolerance = TOLERANCE) + expect_equal(preds_prob, preds_prob_from_raw, tolerance = .LGB_NUMERIC_TOLERANCE) accuracy <- sum(as.integer(preds_prob_from_raw > 0.5) == y) / length(y) expect_equal(accuracy, 1.0) @@ -262,9 +260,10 @@ test_that("predict() params should override keyword argument for leaf-index pred objective = "regression" , min_data_in_leaf = 1L , seed = 708L + , num_threads = .LGB_MAX_THREADS ) , nrounds = 10L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) # check that predictions really look like leaf index predictions @@ -315,9 +314,10 @@ test_that("predict() params should override keyword argument for feature contrib objective = "regression" , min_data_in_leaf = 1L , seed = 708L + , num_threads = .LGB_MAX_THREADS ) , nrounds = 10L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) # check that predictions really look like feature contributions @@ -425,8 +425,8 @@ test_that("predict() keeps row names from data (regression)", { data = dtrain , obj = "regression" , nrounds = 5L - , verbose = VERBOSITY - , params = list(min_data_in_leaf = 1L) + , verbose = .LGB_VERBOSITY + , params = list(min_data_in_leaf = 1L, num_threads = .LGB_MAX_THREADS) ) .check_all_row_name_expectations(bst, X) }) @@ -441,7 +441,8 @@ test_that("predict() keeps row names from data (binary classification)", { data = dtrain , obj = "binary" , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , params = list(num_threads = .LGB_MAX_THREADS) ) .check_all_row_name_expectations(bst, X) }) @@ -455,9 +456,9 @@ test_that("predict() keeps row names from data (multi-class classification)", { bst <- lgb.train( data = dtrain , obj = "multiclass" - , params = list(num_class = 3L) + , params = list(num_class = 3L, num_threads = .LGB_MAX_THREADS) , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) .check_all_row_name_expectations(bst, X) }) @@ -478,8 +479,8 @@ 
test_that("predictions for regression and binary classification are returned as data = dtrain , obj = "regression" , nrounds = 5L - , verbose = VERBOSITY - , params = list(min_data_in_leaf = 1L) + , verbose = .LGB_VERBOSITY + , params = list(min_data_in_leaf = 1L, num_threads = .LGB_MAX_THREADS) ) pred <- predict(model, X) expect_true(is.vector(pred)) @@ -496,7 +497,8 @@ test_that("predictions for regression and binary classification are returned as data = dtrain , obj = "binary" , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , params = list(num_threads = .LGB_MAX_THREADS) ) pred <- predict(model, X) expect_true(is.vector(pred)) @@ -515,8 +517,8 @@ test_that("predictions for multiclass classification are returned as matrix", { data = dtrain , obj = "multiclass" , nrounds = 5L - , verbose = VERBOSITY - , params = list(num_class = 3L) + , verbose = .LGB_VERBOSITY + , params = list(num_class = 3L, num_threads = .LGB_MAX_THREADS) ) pred <- predict(model, X) expect_true(is.matrix(pred)) @@ -533,7 +535,7 @@ test_that("Single-row predictions are identical to multi-row ones", { X <- as.matrix(mtcars[, -1L]) y <- mtcars[, 1L] dtrain <- lgb.Dataset(X, label = y, params = list(max_bin = 5L)) - params <- list(min_data_in_leaf = 2L) + params <- list(min_data_in_leaf = 2L, num_threads = .LGB_MAX_THREADS) model <- lgb.train( params = params , data = dtrain @@ -594,7 +596,7 @@ test_that("Fast-predict configuration accepts non-default prediction types", { X <- as.matrix(mtcars[, -1L]) y <- mtcars[, 1L] dtrain <- lgb.Dataset(X, label = y, params = list(max_bin = 5L)) - params <- list(min_data_in_leaf = 2L) + params <- list(min_data_in_leaf = 2L, num_threads = .LGB_MAX_THREADS) model <- lgb.train( params = params , data = dtrain @@ -624,7 +626,7 @@ test_that("Fast-predict configuration does not block other prediction types", { X <- as.matrix(mtcars[, -1L]) y <- mtcars[, 1L] dtrain <- lgb.Dataset(X, label = y, params = list(max_bin = 5L)) - params <- 
list(min_data_in_leaf = 2L) + params <- list(min_data_in_leaf = 2L, num_threads = .LGB_MAX_THREADS) model <- lgb.train( params = params , data = dtrain @@ -660,7 +662,8 @@ test_that("predict type='class' returns predicted class for classification objec data = dtrain , obj = "binary" , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , params = list(num_threads = .LGB_MAX_THREADS) ) pred <- predict(bst, X, type = "class") expect_true(all(pred %in% c(0L, 1L))) @@ -673,8 +676,8 @@ test_that("predict type='class' returns predicted class for classification objec data = dtrain , obj = "multiclass" , nrounds = 5L - , verbose = VERBOSITY - , params = list(num_class = 3L) + , verbose = .LGB_VERBOSITY + , params = list(num_class = 3L, num_threads = .LGB_MAX_THREADS) ) pred <- predict(model, X, type = "class") expect_true(all(pred %in% c(0L, 1L, 2L))) @@ -689,7 +692,8 @@ test_that("predict type='class' returns values in the target's range for regress data = dtrain , obj = "regression" , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , params = list(num_threads = .LGB_MAX_THREADS) ) pred <- predict(bst, X, type = "class") expect_true(!any(pred %in% c(0.0, 1.0))) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index b2ce39f18816..57c33c35dfee 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -1,20 +1,8 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - -ON_WINDOWS <- .Platform$OS.type == "windows" - -UTF8_LOCALE <- all(endsWith( - Sys.getlocale(category = "LC_CTYPE") - , "UTF-8" -)) - data(agaricus.train, package = "lightgbm") data(agaricus.test, package = "lightgbm") train <- agaricus.train test <- agaricus.test -TOLERANCE <- 1e-6 set.seed(708L) # [description] Every time this function is called, it adds 0.1 @@ -55,18 +43,22 @@ CONSTANT_METRIC_VALUE <- 0.2 DTRAIN_RANDOM_REGRESSION <- lgb.Dataset( data = 
as.matrix(rnorm(100L), ncol = 1L, drop = FALSE) , label = rnorm(100L) + , params = list(num_threads = .LGB_MAX_THREADS) ) DVALID_RANDOM_REGRESSION <- lgb.Dataset( data = as.matrix(rnorm(50L), ncol = 1L, drop = FALSE) , label = rnorm(50L) + , params = list(num_threads = .LGB_MAX_THREADS) ) DTRAIN_RANDOM_CLASSIFICATION <- lgb.Dataset( data = as.matrix(rnorm(120L), ncol = 1L, drop = FALSE) , label = sample(c(0L, 1L), size = 120L, replace = TRUE) + , params = list(num_threads = .LGB_MAX_THREADS) ) DVALID_RANDOM_CLASSIFICATION <- lgb.Dataset( data = as.matrix(rnorm(37L), ncol = 1L, drop = FALSE) , label = sample(c(0L, 1L), size = 37L, replace = TRUE) + , params = list(num_threads = .LGB_MAX_THREADS) ) test_that("train and predict binary classification", { @@ -78,7 +70,8 @@ test_that("train and predict binary classification", { num_leaves = 5L , objective = "binary" , metric = "binary_error" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds , valids = list( @@ -99,7 +92,7 @@ test_that("train and predict binary classification", { expect_equal(length(pred1), 6513L) err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label) err_log <- record_results[1L] - expect_lt(abs(err_pred1 - err_log), TOLERANCE) + expect_lt(abs(err_pred1 - err_log), .LGB_NUMERIC_TOLERANCE) }) @@ -119,7 +112,8 @@ test_that("train and predict softmax", { , objective = "multiclass" , metric = "multi_error" , num_class = 3L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 20L , valids = list( @@ -149,13 +143,15 @@ test_that("use of multiple eval metrics works", { , learning_rate = 1.0 , objective = "binary" , metric = metrics - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 10L , valids = list( "train" = lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) ) ) @@ -178,12 +174,13 @@ 
test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expec num_leaves = 5L , objective = "binary" , metric = "binary_error" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds ) - expect_true(abs(bst$lower_bound() - -1.590853) < TOLERANCE) - expect_true(abs(bst$upper_bound() - 1.871015) < TOLERANCE) + expect_true(abs(bst$lower_bound() - -1.590853) < .LGB_NUMERIC_TOLERANCE) + expect_true(abs(bst$upper_bound() - 1.871015) < .LGB_NUMERIC_TOLERANCE) }) test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expected for regression", { @@ -196,17 +193,18 @@ test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expec num_leaves = 5L , objective = "regression" , metric = "l2" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds ) - expect_true(abs(bst$lower_bound() - 0.1513859) < TOLERANCE) - expect_true(abs(bst$upper_bound() - 0.9080349) < TOLERANCE) + expect_true(abs(bst$lower_bound() - 0.1513859) < .LGB_NUMERIC_TOLERANCE) + expect_true(abs(bst$upper_bound() - 0.9080349) < .LGB_NUMERIC_TOLERANCE) }) test_that("lightgbm() rejects negative or 0 value passed to nrounds", { dtrain <- lgb.Dataset(train$data, label = train$label) - params <- list(objective = "regression", metric = "l2,l1") + params <- list(objective = "regression", metric = "l2,l1", num_threads = .LGB_MAX_THREADS) for (nround_value in c(-10L, 0L)) { expect_error({ bst <- lightgbm( @@ -230,7 +228,8 @@ test_that("lightgbm() accepts nrounds as either a top-level argument or paramete objective = "regression" , metric = "l2" , num_leaves = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) @@ -243,7 +242,8 @@ test_that("lightgbm() accepts nrounds as either a top-level argument or paramete , metric = "l2" , num_leaves = 5L , nrounds = nrounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , 
num_threads = .LGB_MAX_THREADS ) ) @@ -257,7 +257,8 @@ test_that("lightgbm() accepts nrounds as either a top-level argument or paramete , metric = "l2" , num_leaves = 5L , nrounds = nrounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) @@ -285,10 +286,12 @@ test_that("lightgbm() performs evaluation on validation sets if they are provide dvalid1 <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid2 <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L bst <- lightgbm( @@ -301,7 +304,8 @@ test_that("lightgbm() performs evaluation on validation sets if they are provide "binary_error" , "auc" ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds , valids = list( @@ -310,6 +314,7 @@ test_that("lightgbm() performs evaluation on validation sets if they are provide , "train" = lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) ) ) @@ -324,9 +329,9 @@ test_that("lightgbm() performs evaluation on validation sets if they are provide eval_results <- bst$record_evals[[valid_name]][["binary_error"]] expect_length(eval_results[["eval"]], nrounds) } - expect_true(abs(bst$record_evals[["train"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) - expect_true(abs(bst$record_evals[["valid1"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) - expect_true(abs(bst$record_evals[["valid2"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) + expect_true(abs(bst$record_evals[["train"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < .LGB_NUMERIC_TOLERANCE) + expect_true(abs(bst$record_evals[["valid1"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < .LGB_NUMERIC_TOLERANCE) + expect_true(abs(bst$record_evals[["valid2"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < 
.LGB_NUMERIC_TOLERANCE) }) test_that("training continuation works", { @@ -334,6 +339,7 @@ test_that("training continuation works", { train$data , label = train$label , free_raw_data = FALSE + , params = list(num_threads = .LGB_MAX_THREADS) ) watchlist <- list(train = dtrain) param <- list( @@ -341,7 +347,8 @@ test_that("training continuation works", { , metric = "binary_logloss" , num_leaves = 5L , learning_rate = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) # train for 10 consecutive iterations @@ -367,7 +374,8 @@ test_that("cv works", { , metric = "l2,l1" , min_data = 1L , learning_rate = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- lgb.cv( params @@ -387,7 +395,8 @@ test_that("CVBooster$reset_parameter() works as expected", { objective = "regression" , min_data = 1L , num_leaves = 7L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = 3L @@ -405,11 +414,12 @@ test_that("CVBooster$reset_parameter() works as expected", { }) test_that("lgb.cv() rejects negative or 0 value passed to nrounds", { - dtrain <- lgb.Dataset(train$data, label = train$label) + dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = 2L)) params <- list( objective = "regression" , metric = "l2,l1" , min_data = 1L + , num_threads = .LGB_MAX_THREADS ) for (nround_value in c(-10L, 0L)) { expect_error({ @@ -453,6 +463,7 @@ test_that("lightgbm.cv() gives the correct best_score and best_iter for a metric dtrain <- lgb.Dataset( data = as.matrix(runif(n = 500L, min = 0.0, max = 15.0), drop = FALSE) , label = rep(c(0L, 1L), 250L) + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L cv_bst <- lgb.cv( @@ -464,7 +475,8 @@ test_that("lightgbm.cv() gives the correct best_score and best_iter for a metric , metric = "auc,binary_error" , learning_rate = 1.5 , num_leaves = 5L - , verbose = VERBOSITY + 
, verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) expect_true(methods::is(cv_bst, "lgb.CVBooster")) @@ -487,6 +499,7 @@ test_that("lgb.cv() fit on linearly-relatead data improves when using linear lea return(lgb.Dataset( data = X , label = 2L * X + runif(nrow(X), 0L, 0.1) + , params = list(num_threads = .LGB_MAX_THREADS) )) } @@ -496,6 +509,7 @@ test_that("lgb.cv() fit on linearly-relatead data improves when using linear lea , metric = "mse" , seed = 0L , num_leaves = 2L + , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() @@ -520,12 +534,13 @@ test_that("lgb.cv() fit on linearly-relatead data improves when using linear lea }) test_that("lgb.cv() respects showsd argument", { - dtrain <- lgb.Dataset(train$data, label = train$label) + dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = .LGB_MAX_THREADS)) params <- list( objective = "regression" , metric = "l2" , min_data = 1L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) nrounds <- 5L set.seed(708L) @@ -559,6 +574,7 @@ test_that("lgb.cv() raises an informative error for unrecognized objectives", { dtrain <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) expect_error({ capture.output({ @@ -566,7 +582,8 @@ test_that("lgb.cv() raises an informative error for unrecognized objectives", { data = dtrain , params = list( objective_type = "not_a_real_objective" - , verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) }, type = "message") @@ -579,6 +596,7 @@ test_that("lgb.cv() respects parameter aliases for objective", { dtrain <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) cv_bst <- lgb.cv( data = dtrain @@ -586,7 +604,8 @@ test_that("lgb.cv() respects parameter aliases for objective", { num_leaves = 5L , application = "binary" , num_iterations = nrounds - , verbose = 
VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nfold = nfold ) @@ -602,10 +621,12 @@ test_that("lgb.cv() prefers objective in params to keyword argument", { data = lgb.Dataset( data = EuStockMarkets[, c("SMI", "CAC", "FTSE")] , label = EuStockMarkets[, "DAX"] + , params = list(num_threads = .LGB_MAX_THREADS) ) , params = list( application = "regression_l1" - , verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 5L , obj = "regression_l2" @@ -631,6 +652,7 @@ test_that("lgb.cv() respects parameter aliases for metric", { dtrain <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) cv_bst <- lgb.cv( data = dtrain @@ -639,7 +661,8 @@ test_that("lgb.cv() respects parameter aliases for metric", { , objective = "binary" , num_iterations = nrounds , metric_types = c("auc", "binary_logloss") - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nfold = nfold ) @@ -656,7 +679,8 @@ test_that("lgb.cv() respects eval_train_metric argument", { objective = "regression" , metric = "l2" , min_data = 1L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) nrounds <- 5L set.seed(708L) @@ -696,18 +720,21 @@ test_that("lgb.train() works as expected with multiple eval metrics", { data = lgb.Dataset( train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) , nrounds = 10L , params = list( objective = "binary" , metric = metrics , learning_rate = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , valids = list( "train" = lgb.Dataset( train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) ) ) @@ -731,7 +758,7 @@ test_that("lgb.train() raises an informative error for unrecognized objectives", data = dtrain , params = list( objective_type = "not_a_real_objective" - , verbosity = 
VERBOSITY + , verbosity = .LGB_VERBOSITY ) ) }, type = "message") @@ -743,6 +770,7 @@ test_that("lgb.train() respects parameter aliases for objective", { dtrain <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) bst <- lgb.train( data = dtrain @@ -750,7 +778,8 @@ test_that("lgb.train() respects parameter aliases for objective", { num_leaves = 5L , application = "binary" , num_iterations = nrounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , valids = list( "the_training_data" = dtrain @@ -767,10 +796,12 @@ test_that("lgb.train() prefers objective in params to keyword argument", { data = lgb.Dataset( data = EuStockMarkets[, c("SMI", "CAC", "FTSE")] , label = EuStockMarkets[, "DAX"] + , params = list(num_threads = .LGB_MAX_THREADS) ) , params = list( loss = "regression_l1" - , verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 5L , obj = "regression_l2" @@ -792,6 +823,7 @@ test_that("lgb.train() respects parameter aliases for metric", { dtrain <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) bst <- lgb.train( data = dtrain @@ -800,7 +832,8 @@ test_that("lgb.train() respects parameter aliases for metric", { , objective = "binary" , num_iterations = nrounds , metric_types = c("auc", "binary_logloss") - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , valids = list( "train" = dtrain @@ -814,11 +847,12 @@ test_that("lgb.train() respects parameter aliases for metric", { }) test_that("lgb.train() rejects negative or 0 value passed to nrounds", { - dtrain <- lgb.Dataset(train$data, label = train$label) + dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = .LGB_MAX_THREADS)) params <- list( objective = "regression" , metric = "l2,l1" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , 
num_threads = .LGB_MAX_THREADS ) for (nround_value in c(-10L, 0L)) { expect_error({ @@ -840,13 +874,15 @@ test_that("lgb.train() accepts nrounds as either a top-level argument or paramet data = lgb.Dataset( train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) , nrounds = nrounds , params = list( objective = "regression" , metric = "l2" , num_leaves = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) @@ -855,13 +891,14 @@ test_that("lgb.train() accepts nrounds as either a top-level argument or paramet data = lgb.Dataset( train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) , params = list( objective = "regression" , metric = "l2" , num_leaves = 5L , nrounds = nrounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) ) @@ -870,6 +907,7 @@ test_that("lgb.train() accepts nrounds as either a top-level argument or paramet data = lgb.Dataset( train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) , nrounds = 20L , params = list( @@ -877,7 +915,8 @@ test_that("lgb.train() accepts nrounds as either a top-level argument or paramet , metric = "l2" , num_leaves = 5L , nrounds = nrounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) @@ -916,7 +955,7 @@ test_that("lgb.train() throws an informative error if 'data' is not an lgb.Datas params = list( objective = "regression" , metric = "l2,l1" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) , data = val , 10L @@ -935,7 +974,7 @@ test_that("lgb.train() throws an informative error if 'valids' is not a list of params = list( objective = "regression" , metric = "l2,l1" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) , data = lgb.Dataset(train$data, label = train$label) , 10L @@ -954,7 +993,7 @@ test_that("lgb.train() errors if 'valids' is a list of lgb.Dataset objects but s params = list( objective = "regression" , metric = "l2,l1" - , 
verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) , data = lgb.Dataset(train$data, label = train$label) , 10L @@ -973,7 +1012,7 @@ test_that("lgb.train() throws an informative error if 'valids' contains lgb.Data params = list( objective = "regression" , metric = "l2,l1" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) , data = lgb.Dataset(train$data, label = train$label) , 10L @@ -988,12 +1027,14 @@ test_that("lgb.train() works with force_col_wise and force_row_wise", { dtrain <- lgb.Dataset( train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) params <- list( objective = "binary" , metric = "binary_error" , force_col_wise = TRUE - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst_col_wise <- lgb.train( params = params @@ -1005,7 +1046,8 @@ test_that("lgb.train() works with force_col_wise and force_row_wise", { objective = "binary" , metric = "binary_error" , force_row_wise = TRUE - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst_row_wise <- lgb.train( params = params @@ -1037,6 +1079,7 @@ test_that("lgb.train() works as expected with sparse features", { dtrain <- lgb.Dataset( data = as.matrix(trainDF[["x"]], drop = FALSE) , label = trainDF[["y"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 1L bst <- lgb.train( @@ -1044,7 +1087,8 @@ test_that("lgb.train() works as expected with sparse features", { objective = "binary" , min_data = 1L , min_data_in_bin = 1L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1056,7 +1100,7 @@ test_that("lgb.train() works as expected with sparse features", { expect_equal(parsed_model$objective, "binary sigmoid:1") expect_false(parsed_model$average_output) expected_error <- 0.6931268 - expect_true(abs(bst$eval_train()[[1L]][["value"]] - expected_error) < TOLERANCE) + expect_true(abs(bst$eval_train()[[1L]][["value"]] 
- expected_error) < .LGB_NUMERIC_TOLERANCE) }) test_that("lgb.train() works with early stopping for classification", { @@ -1071,10 +1115,12 @@ test_that("lgb.train() works with early stopping for classification", { dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid <- lgb.Dataset( data = as.matrix(validDF[["feat1"]], drop = FALSE) , label = validDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L @@ -1085,7 +1131,8 @@ test_that("lgb.train() works with early stopping for classification", { params = list( objective = "binary" , metric = "binary_error" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1109,7 +1156,8 @@ test_that("lgb.train() works with early stopping for classification", { objective = "binary" , metric = "binary_error" , early_stopping_rounds = early_stopping_rounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1142,10 +1190,12 @@ test_that("lgb.train() treats early_stopping_rounds<=0 as disabling early stoppi dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid <- lgb.Dataset( data = as.matrix(validDF[["feat1"]], drop = FALSE) , label = validDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 5L @@ -1158,7 +1208,8 @@ test_that("lgb.train() treats early_stopping_rounds<=0 as disabling early stoppi params = list( objective = "binary" , metric = "binary_error" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1182,7 +1233,8 @@ test_that("lgb.train() treats early_stopping_rounds<=0 as disabling early stoppi objective = "binary" , metric = 
"binary_error" , n_iter_no_change = value - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1204,10 +1256,12 @@ test_that("lgb.train() works with early stopping for classification with a metri dtrain <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid <- lgb.Dataset( data = test$data , label = test$label + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L @@ -1222,7 +1276,8 @@ test_that("lgb.train() works with early stopping for classification with a metri , metric = "auc" , max_depth = 3L , early_stopping_rounds = early_stopping_rounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1236,7 +1291,8 @@ test_that("lgb.train() works with early stopping for classification with a metri , metric = "binary_error" , max_depth = 3L , early_stopping_rounds = early_stopping_rounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1254,7 +1310,7 @@ test_that("lgb.train() works with early stopping for classification with a metri ) expect_identical(bst_binary_error$best_iter, 1L) expect_identical(bst_binary_error$current_iter(), early_stopping_rounds + 1L) - expect_true(abs(bst_binary_error$best_score - 0.01613904) < TOLERANCE) + expect_true(abs(bst_binary_error$best_score - 0.01613904) < .LGB_NUMERIC_TOLERANCE) # early stopping should not have been hit for AUC (higher_better = TRUE) eval_info <- bst_auc$.__enclos_env__$private$get_eval_info() @@ -1265,7 +1321,7 @@ test_that("lgb.train() works with early stopping for classification with a metri ) expect_identical(bst_auc$best_iter, 9L) expect_identical(bst_auc$current_iter(), nrounds) - expect_true(abs(bst_auc$best_score - 0.9999969) < TOLERANCE) + expect_true(abs(bst_auc$best_score - 0.9999969) < 
.LGB_NUMERIC_TOLERANCE) }) test_that("lgb.train() works with early stopping for regression", { @@ -1281,10 +1337,12 @@ test_that("lgb.train() works with early stopping for regression", { dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid <- lgb.Dataset( data = as.matrix(validDF[["feat1"]], drop = FALSE) , label = validDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L @@ -1295,7 +1353,8 @@ test_that("lgb.train() works with early stopping for regression", { params = list( objective = "regression" , metric = "rmse" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1319,7 +1378,8 @@ test_that("lgb.train() works with early stopping for regression", { objective = "regression" , metric = "rmse" , early_stopping_rounds = early_stopping_rounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1354,7 +1414,8 @@ test_that("lgb.train() does not stop early if early_stopping_rounds is not given params = list( objective = "regression" , metric = "None" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_REGRESSION , nrounds = nrounds @@ -1398,14 +1459,16 @@ test_that("If first_metric_only is not given or is FALSE, lgb.train() decides to objective = "regression" , metric = "None" , early_stopping_rounds = early_stopping_rounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , list( objective = "regression" , metric = "None" , early_stopping_rounds = early_stopping_rounds , first_metric_only = FALSE - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) @@ -1468,7 +1531,8 @@ test_that("If first_metric_only is TRUE, lgb.train() decides to stop 
early based , metric = "None" , early_stopping_rounds = early_stopping_rounds , first_metric_only = TRUE - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_REGRESSION , nrounds = nrounds @@ -1514,7 +1578,8 @@ test_that("lgb.train() works when a mixture of functions and strings are passed params = list( objective = "regression" , metric = "None" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_REGRESSION , nrounds = nrounds @@ -1539,15 +1604,15 @@ test_that("lgb.train() works when a mixture of functions and strings are passed # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 1.105012) < TOLERANCE) - expect_true(abs(results[["l2"]][["eval"]][[1L]] - 1.221051) < TOLERANCE) + expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 1.105012) < .LGB_NUMERIC_TOLERANCE) + expect_true(abs(results[["l2"]][["eval"]][[1L]] - 1.221051) < .LGB_NUMERIC_TOLERANCE) expected_increasing_metric <- increasing_metric_starting_value + 0.1 expect_true( abs( results[["increasing_metric"]][["eval"]][[1L]] - expected_increasing_metric - ) < TOLERANCE + ) < .LGB_NUMERIC_TOLERANCE ) - expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) + expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < .LGB_NUMERIC_TOLERANCE) }) @@ -1570,7 +1635,8 @@ test_that("lgb.train() works when a list of strings or a character vector is pas params = list( objective = "binary" , metric = "None" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds @@ -1591,10 +1657,10 @@ test_that("lgb.train() works when a list of strings or a character vector is pas # the difference metrics shouldn't have been mixed up with each 
other results <- bst$record_evals[["valid1"]] if ("binary_error" %in% unlist(eval_variation)) { - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < .LGB_NUMERIC_TOLERANCE) } if ("binary_logloss" %in% unlist(eval_variation)) { - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < .LGB_NUMERIC_TOLERANCE) } } }) @@ -1607,7 +1673,8 @@ test_that("lgb.train() works when you specify both 'metric' and 'eval' with stri params = list( objective = "binary" , metric = "binary_error" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds @@ -1627,8 +1694,8 @@ test_that("lgb.train() works when you specify both 'metric' and 'eval' with stri # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < .LGB_NUMERIC_TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < .LGB_NUMERIC_TOLERANCE) }) test_that("lgb.train() works when you give a function for eval", { @@ -1639,7 +1706,8 @@ test_that("lgb.train() works when you give a function for eval", { params = list( objective = "binary" , metric = "None" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds @@ -1651,7 +1719,7 @@ test_that("lgb.train() works when you give a function for eval", { # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] - 
expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) + expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < .LGB_NUMERIC_TOLERANCE) }) test_that("lgb.train() works with early stopping for regression with a metric that should be minimized", { @@ -1667,10 +1735,12 @@ test_that("lgb.train() works with early stopping for regression with a metric th dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid <- lgb.Dataset( data = as.matrix(validDF[["feat1"]], drop = FALSE) , label = validDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L @@ -1688,7 +1758,8 @@ test_that("lgb.train() works with early stopping for regression with a metric th ) , min_data_in_bin = 5L , early_stopping_rounds = early_stopping_rounds - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds @@ -1720,6 +1791,7 @@ test_that("lgb.train() supports non-ASCII feature names", { dtrain <- lgb.Dataset( data = matrix(rnorm(400L), ncol = 4L) , label = rnorm(100L) + , params = list(num_threads = .LGB_MAX_THREADS) ) # content below is equivalent to # @@ -1739,7 +1811,8 @@ test_that("lgb.train() supports non-ASCII feature names", { , obj = "regression" , params = list( metric = "rmse" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , colnames = feature_names ) @@ -1749,7 +1822,7 @@ test_that("lgb.train() supports non-ASCII feature names", { # UTF-8 strings are not well-supported on Windows # * https://developer.r-project.org/Blog/public/2020/05/02/utf-8-support-on-windows/ # * https://developer.r-project.org/Blog/public/2020/07/30/windows/utf-8-build-of-r-and-cran-packages/index.html - if (UTF8_LOCALE && !ON_WINDOWS) { + if (.LGB_UTF8_LOCALE && !.LGB_ON_WINDOWS) { expect_identical( 
dumped_model[["feature_names"]] , feature_names @@ -1779,7 +1852,7 @@ test_that("lgb.train() works with integer, double, and numeric data", { , min_data_in_leaf = 1L , learning_rate = 0.01 , seed = 708L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) , nrounds = nrounds ) @@ -1792,7 +1865,7 @@ test_that("lgb.train() works with integer, double, and numeric data", { # should have achieved expected performance preds <- predict(bst, X) mae <- mean(abs(y - preds)) - expect_true(abs(mae - expected_mae) < TOLERANCE) + expect_true(abs(mae - expected_mae) < .LGB_NUMERIC_TOLERANCE) } }) @@ -1800,6 +1873,7 @@ test_that("lgb.train() updates params based on keyword arguments", { dtrain <- lgb.Dataset( data = matrix(rnorm(400L), ncol = 4L) , label = rnorm(100L) + , params = list(num_threads = .LGB_MAX_THREADS) ) # defaults from keyword arguments should be used if not specified in params @@ -1808,7 +1882,7 @@ test_that("lgb.train() updates params based on keyword arguments", { bst <- lgb.train( data = dtrain , obj = "regression" - , params = list() + , params = list(num_threads = .LGB_MAX_THREADS) ) }) ) @@ -1824,6 +1898,7 @@ test_that("lgb.train() updates params based on keyword arguments", { , params = list( "verbosity" = 5L , "num_iterations" = 2L + , num_threads = .LGB_MAX_THREADS ) ) }) @@ -1840,6 +1915,7 @@ test_that("lgb.train() updates params based on keyword arguments", { , params = list( "verbose" = 5L , "num_boost_round" = 2L + , num_threads = .LGB_MAX_THREADS ) ) }) @@ -1863,14 +1939,17 @@ test_that("when early stopping is not activated, best_iter and best_score come f dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid1 <- lgb.Dataset( data = as.matrix(validDF[["feat1"]], drop = FALSE) , label = validDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid2 <- lgb.Dataset( data = as.matrix(validDF[1L:10L, "feat1"], drop = FALSE) , 
label = validDF[1L:10L, "target"] + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L train_params <- list( @@ -1878,7 +1957,8 @@ test_that("when early stopping is not activated, best_iter and best_score come f , metric = "rmse" , learning_rate = 1.5 , num_leaves = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) # example 1: two valids, neither are the training data @@ -2020,10 +2100,12 @@ test_that("lightgbm.train() gives the correct best_score and best_iter for a met dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid1 <- lgb.Dataset( data = as.matrix(validDF[1L:25L, "feat1"], drop = FALSE) , label = validDF[1L:25L, "target"] + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L bst <- lgb.train( @@ -2038,7 +2120,8 @@ test_that("lightgbm.train() gives the correct best_score and best_iter for a met , metric = "auc" , learning_rate = 1.5 , num_leaves = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) # note that "something-random-we-would-not-hardcode" was recognized as the training @@ -2070,14 +2153,17 @@ test_that("using lightgbm() without early stopping, best_iter and best_score com dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid1 <- lgb.Dataset( data = as.matrix(validDF[1L:25L, "feat1"], drop = FALSE) , label = validDF[1L:25L, "target"] + , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid2 <- lgb.Dataset( data = as.matrix(validDF[26L:50L, "feat1"], drop = FALSE) , label = validDF[26L:50L, "target"] + , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L bst <- lightgbm( @@ -2093,6 +2179,7 @@ test_that("using lightgbm() without early stopping, best_iter and best_score com , metric = "auc" , learning_rate = 
1.5 , num_leaves = 5L + , num_threads = .LGB_MAX_THREADS ) , verbose = -7L ) @@ -2119,7 +2206,8 @@ test_that("lgb.cv() works when you specify both 'metric' and 'eval' with strings params = list( objective = "binary" , metric = "binary_error" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds @@ -2137,8 +2225,8 @@ test_that("lgb.cv() works when you specify both 'metric' and 'eval' with strings # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid"]] - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5005654) < TOLERANCE) - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.7011232) < TOLERANCE) + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5005654) < .LGB_NUMERIC_TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.7011232) < .LGB_NUMERIC_TOLERANCE) # all boosters should have been created expect_length(bst$boosters, nfolds) @@ -2153,7 +2241,8 @@ test_that("lgb.cv() works when you give a function for eval", { params = list( objective = "binary" , metric = "None" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_CLASSIFICATION , nfold = nfolds @@ -2163,7 +2252,7 @@ test_that("lgb.cv() works when you give a function for eval", { # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid"]] - expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) + expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < .LGB_NUMERIC_TOLERANCE) expect_named(results, "constant_metric") }) @@ -2179,7 +2268,8 @@ test_that("If first_metric_only is TRUE, lgb.cv() decides to stop early based on , metric = "None" , early_stopping_rounds = early_stopping_rounds , first_metric_only = TRUE - , verbose 
= VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_REGRESSION , nfold = nfolds @@ -2236,7 +2326,8 @@ test_that("early stopping works with lgb.cv()", { , metric = "None" , early_stopping_rounds = early_stopping_rounds , first_metric_only = TRUE - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_REGRESSION , nfold = nfolds @@ -2286,11 +2377,12 @@ test_that("lgb.cv() respects changes to logging verbosity", { dtrain <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) # (verbose = 1) should be INFO and WARNING level logs lgb_cv_logs <- capture.output({ cv_bst <- lgb.cv( - params = list() + params = list(num_threads = .LGB_MAX_THREADS) , nfold = 2L , nrounds = 5L , data = dtrain @@ -2304,7 +2396,7 @@ test_that("lgb.cv() respects changes to logging verbosity", { # (verbose = 0) should be WARNING level logs only lgb_cv_logs <- capture.output({ cv_bst <- lgb.cv( - params = list() + params = list(num_threads = .LGB_MAX_THREADS) , nfold = 2L , nrounds = 5L , data = dtrain @@ -2318,7 +2410,7 @@ test_that("lgb.cv() respects changes to logging verbosity", { # (verbose = -1) no logs lgb_cv_logs <- capture.output({ cv_bst <- lgb.cv( - params = list() + params = list(num_threads = .LGB_MAX_THREADS) , nfold = 2L , nrounds = 5L , data = dtrain @@ -2336,6 +2428,7 @@ test_that("lgb.cv() updates params based on keyword arguments", { dtrain <- lgb.Dataset( data = matrix(rnorm(400L), ncol = 4L) , label = rnorm(100L) + , params = list(num_threads = .LGB_MAX_THREADS) ) # defaults from keyword arguments should be used if not specified in params @@ -2344,7 +2437,7 @@ test_that("lgb.cv() updates params based on keyword arguments", { cv_bst <- lgb.cv( data = dtrain , obj = "regression" - , params = list() + , params = list(num_threads = .LGB_MAX_THREADS) , nfold = 2L ) }) @@ -2365,6 +2458,7 @@ test_that("lgb.cv() updates params 
based on keyword arguments", { , params = list( "verbosity" = 5L , "num_iterations" = 2L + , num_threads = .LGB_MAX_THREADS ) , nfold = 2L ) @@ -2385,6 +2479,7 @@ test_that("lgb.cv() updates params based on keyword arguments", { , params = list( "verbose" = 5L , "num_boost_round" = 2L + , num_threads = .LGB_MAX_THREADS ) , nfold = 2L ) @@ -2407,15 +2502,17 @@ test_that("lgb.train() fit on linearly-relatead data improves when using linear return(lgb.Dataset( data = X , label = 2L * X + runif(nrow(X), 0L, 0.1) + , params = list(num_threads = .LGB_MAX_THREADS) )) } params <- list( objective = "regression" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L + , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() @@ -2446,15 +2543,17 @@ test_that("lgb.train() with linear learner fails already-constructed dataset wit set.seed(708L) params <- list( objective = "regression" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L + , num_threads = .LGB_MAX_THREADS ) dtrain <- lgb.Dataset( data = matrix(rnorm(100L), ncol = 1L) , label = rnorm(100L) + , params = list(num_threads = .LGB_MAX_THREADS) ) dtrain$construct() expect_error({ @@ -2480,15 +2579,17 @@ test_that("lgb.train() works with linear learners even if Dataset has missing va return(lgb.Dataset( data = X , label = 2L * X + runif(nrow(X), 0L, 0.1) + , params = list(num_threads = .LGB_MAX_THREADS) )) } params <- list( objective = "regression" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L + , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() @@ -2526,17 +2627,19 @@ test_that("lgb.train() works with linear learners, bagging, and a Dataset that h return(lgb.Dataset( data = X , label = 2L * X + runif(nrow(X), 0L, 0.1) + , params = list(num_threads = .LGB_MAX_THREADS) )) } params <- list( objective = "regression" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY 
, metric = "mse" , seed = 0L , num_leaves = 2L , bagging_freq = 1L , subsample = 0.8 + , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() @@ -2576,6 +2679,7 @@ test_that("lgb.train() works with linear learners and data where a feature has o , label = 2L * X[, 1L] + runif(nrow(X), 0L, 0.1) , params = list( feature_pre_filter = FALSE + , num_threads = .LGB_MAX_THREADS ) )) } @@ -2586,6 +2690,7 @@ test_that("lgb.train() works with linear learners and data where a feature has o , metric = "mse" , seed = 0L , num_leaves = 2L + , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() @@ -2606,6 +2711,7 @@ test_that("lgb.train() works with linear learners when Dataset has categorical f return(lgb.Dataset( data = X , label = 2L * X[, 1L] + runif(nrow(X), 0L, 0.1) + , params = list(num_threads = .LGB_MAX_THREADS) )) } @@ -2616,6 +2722,7 @@ test_that("lgb.train() works with linear learners when Dataset has categorical f , seed = 0L , num_leaves = 2L , categorical_feature = 1L + , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() @@ -2682,12 +2789,13 @@ test_that("lgb.train() throws an informative error if interaction_constraints co test_that(paste0("lgb.train() gives same result when interaction_constraints is specified as a list of ", "character vectors, numeric vectors, or a combination"), { set.seed(1L) - dtrain <- lgb.Dataset(train$data, label = train$label) + dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = .LGB_MAX_THREADS)) params <- list( objective = "regression" , interaction_constraints = list(c(1L, 2L), 3L) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- lightgbm( data = dtrain @@ -2700,7 +2808,8 @@ test_that(paste0("lgb.train() gives same result when interaction_constraints is params <- list( objective = "regression" , interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), cnames[[3L]]) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads 
= .LGB_MAX_THREADS ) bst <- lightgbm( data = dtrain @@ -2712,7 +2821,8 @@ test_that(paste0("lgb.train() gives same result when interaction_constraints is params <- list( objective = "regression" , interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), 3L) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- lightgbm( data = dtrain @@ -2728,12 +2838,13 @@ test_that(paste0("lgb.train() gives same result when interaction_constraints is test_that(paste0("lgb.train() gives same results when using interaction_constraints and specifying colnames"), { set.seed(1L) - dtrain <- lgb.Dataset(train$data, label = train$label) + dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = .LGB_MAX_THREADS)) params <- list( objective = "regression" , interaction_constraints = list(c(1L, 2L), 3L) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- lightgbm( data = dtrain @@ -2746,7 +2857,8 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai params <- list( objective = "regression" , interaction_constraints = list(c(new_colnames[1L], new_colnames[2L]), new_colnames[3L]) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- lightgbm( data = dtrain @@ -2796,6 +2908,7 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai , categorical_feature = categorical_features , free_raw_data = FALSE , colnames = c("feature_1", "feature_2", "feature_3") + , params = list(num_threads = .LGB_MAX_THREADS) )) } @@ -2890,7 +3003,8 @@ for (x3_to_categorical in c(TRUE, FALSE)) { , monotone_constraints = c(1L, -1L, 0L) , monotone_constraints_method = monotone_constraints_method , use_missing = FALSE - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) constrained_model <- lgb.train( params = params @@ -2912,9 +3026,9 @@ test_that("lightgbm() accepts 
objective as function argument and under params", bst1 <- lightgbm( data = train$data , label = train$label - , params = list(objective = "regression_l1") + , params = list(objective = "regression_l1", num_threads = .LGB_MAX_THREADS) , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) expect_equal(bst1$params$objective, "regression_l1") model_txt_lines <- strsplit( @@ -2930,7 +3044,7 @@ test_that("lightgbm() accepts objective as function argument and under params", , label = train$label , objective = "regression_l1" , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) expect_equal(bst2$params$objective, "regression_l1") model_txt_lines <- strsplit( @@ -2947,9 +3061,9 @@ test_that("lightgbm() prioritizes objective under params over objective as funct data = train$data , label = train$label , objective = "regression" - , params = list(objective = "regression_l1") + , params = list(objective = "regression_l1", num_threads = .LGB_MAX_THREADS) , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) expect_equal(bst1$params$objective, "regression_l1") model_txt_lines <- strsplit( @@ -2964,9 +3078,9 @@ test_that("lightgbm() prioritizes objective under params over objective as funct data = train$data , label = train$label , objective = "regression" - , params = list(loss = "regression_l1") + , params = list(loss = "regression_l1", num_threads = .LGB_MAX_THREADS) , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) expect_equal(bst2$params$objective, "regression_l1") model_txt_lines <- strsplit( @@ -2984,7 +3098,8 @@ test_that("lightgbm() accepts init_score as function argument", { , label = train$label , objective = "binary" , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , params = list(num_threads = .LGB_MAX_THREADS) ) pred1 <- predict(bst1, train$data, type = "raw") @@ -2994,7 +3109,8 @@ test_that("lightgbm() accepts init_score as function argument", { , init_score = pred1 , 
objective = "binary" , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , params = list(num_threads = .LGB_MAX_THREADS) ) pred2 <- predict(bst2, train$data, type = "raw") @@ -3006,7 +3122,8 @@ test_that("lightgbm() defaults to 'regression' objective if objective not otherw data = train$data , label = train$label , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , params = list(num_threads = .LGB_MAX_THREADS) ) expect_equal(bst$params$objective, "regression") model_txt_lines <- strsplit( @@ -3023,7 +3140,7 @@ test_that("lightgbm() accepts 'num_threads' as either top-level argument or unde data = train$data , label = train$label , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , num_threads = 1L ) expect_equal(bst$params$num_threads, 1L) @@ -3038,7 +3155,7 @@ test_that("lightgbm() accepts 'num_threads' as either top-level argument or unde data = train$data , label = train$label , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , params = list(num_threads = 1L) ) expect_equal(bst$params$num_threads, 1L) @@ -3053,7 +3170,7 @@ test_that("lightgbm() accepts 'num_threads' as either top-level argument or unde data = train$data , label = train$label , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , num_threads = 10L , params = list(num_threads = 1L) ) @@ -3077,10 +3194,11 @@ test_that("lightgbm() accepts 'weight' and 'weights'", { , weights = w , obj = "regression" , nrounds = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , params = list( min_data_in_bin = 1L , min_data_in_leaf = 1L + , num_threads = .LGB_MAX_THREADS ) ) expect_equal(model$.__enclos_env__$private$train_set$get_field("weight"), w) @@ -3132,7 +3250,7 @@ test_that("lightgbm() accepts 'weight' and 'weights'", { expect_equal( object = unlist(record_evals[["valid"]][["auc"]][["eval"]]) , expected = expected_valid_auc - , tolerance = TOLERANCE + , tolerance = .LGB_NUMERIC_TOLERANCE ) expect_named(record_evals, 
c("start_iter", "valid"), ignore.order = TRUE, ignore.case = FALSE) expect_equal(record_evals[["valid"]][["auc"]][["eval_err"]], list()) @@ -3146,6 +3264,9 @@ test_that("lightgbm() accepts 'weight' and 'weights'", { , objective = "binary" , metric = "auc" , early_stopping_round = nrounds + , num_threads = .LGB_MAX_THREADS + # include a nonsense parameter just to trigger a WARN-level log + , nonsense_param = 1.0 ) if (!is.null(verbose_param)) { params[["verbose"]] <- verbose_param @@ -3162,6 +3283,7 @@ test_that("lightgbm() accepts 'weight' and 'weights'", { train_kwargs[["data"]] <- lgb.Dataset( data = train$data , label = train$label + , params = list(num_threads = .LGB_MAX_THREADS) ) train_kwargs[["valids"]] <- list( "valid" = lgb.Dataset(data = test$data, label = test$label) @@ -3535,7 +3657,7 @@ test_that("lightgbm() changes objective='auto' appropriately", { data("mtcars") y <- mtcars$mpg x <- as.matrix(mtcars[, -1L]) - model <- lightgbm(x, y, objective = "auto", verbose = VERBOSITY, nrounds = 5L) + model <- lightgbm(x, y, objective = "auto", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) expect_equal(model$params$objective, "regression") model_txt_lines <- strsplit( x = model$save_model_to_string() @@ -3548,7 +3670,7 @@ test_that("lightgbm() changes objective='auto' appropriately", { # Binary classification x <- train$data y <- factor(train$label) - model <- lightgbm(x, y, objective = "auto", verbose = VERBOSITY, nrounds = 5L) + model <- lightgbm(x, y, objective = "auto", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) expect_equal(model$params$objective, "binary") model_txt_lines <- strsplit( x = model$save_model_to_string() @@ -3561,7 +3683,7 @@ test_that("lightgbm() changes objective='auto' appropriately", { data("iris") y <- factor(iris$Species) x <- as.matrix(iris[, -5L]) - model <- lightgbm(x, y, objective = "auto", verbose = VERBOSITY, nrounds = 5L) + model <- lightgbm(x, y, objective = "auto", verbose = 
.LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) expect_equal(model$params$objective, "multiclass") expect_equal(model$params$num_class, 3L) model_txt_lines <- strsplit( @@ -3576,7 +3698,14 @@ test_that("lightgbm() determines number of classes for non-default multiclass ob data("iris") y <- factor(iris$Species) x <- as.matrix(iris[, -5L]) - model <- lightgbm(x, y, objective = "multiclassova", verbose = VERBOSITY, nrounds = 5L) + model <- lightgbm( + x + , y + , objective = "multiclassova" + , verbose = .LGB_VERBOSITY + , nrounds = 5L + , num_threads = .LGB_MAX_THREADS + ) expect_equal(model$params$objective, "multiclassova") expect_equal(model$params$num_class, 3L) model_txt_lines <- strsplit( @@ -3592,7 +3721,7 @@ test_that("lightgbm() doesn't accept binary classification with non-binary facto y <- factor(iris$Species) x <- as.matrix(iris[, -5L]) expect_error({ - lightgbm(x, y, objective = "binary", verbose = VERBOSITY, nrounds = 5L) + lightgbm(x, y, objective = "binary", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) }, regexp = "Factors with >2 levels as labels only allowed for multi-class objectives") }) @@ -3603,7 +3732,7 @@ test_that("lightgbm() doesn't accept multi-class classification with binary fact y <- factor(y) x <- as.matrix(iris[, -5L]) expect_error({ - lightgbm(x, y, objective = "multiclass", verbose = VERBOSITY, nrounds = 5L) + lightgbm(x, y, objective = "multiclass", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) }, regexp = "Two-level factors as labels only allowed for objective='binary'") }) @@ -3611,7 +3740,7 @@ test_that("lightgbm() model predictions retain factor levels for multiclass clas data("iris") y <- factor(iris$Species) x <- as.matrix(iris[, -5L]) - model <- lightgbm(x, y, objective = "auto", verbose = VERBOSITY, nrounds = 5L) + model <- lightgbm(x, y, objective = "auto", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) pred <- predict(model, x, type 
= "class") expect_true(is.factor(pred)) @@ -3630,7 +3759,7 @@ test_that("lightgbm() model predictions retain factor levels for binary classifi y[y == "setosa"] <- "versicolor" y <- factor(y) x <- as.matrix(iris[, -5L]) - model <- lightgbm(x, y, objective = "auto", verbose = VERBOSITY, nrounds = 5L) + model <- lightgbm(x, y, objective = "auto", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) pred <- predict(model, x, type = "class") expect_true(is.factor(pred)) @@ -3646,3 +3775,33 @@ test_that("lightgbm() model predictions retain factor levels for binary classifi expect_true(is.numeric(pred)) expect_false(any(pred %in% y)) }) + +test_that("lightgbm() accepts named categorical_features", { + data(mtcars) + y <- mtcars$mpg + x <- as.matrix(mtcars[, -1L]) + model <- lightgbm( + x + , y + , categorical_feature = "cyl" + , verbose = .LGB_VERBOSITY + , nrounds = 5L + , num_threads = .LGB_MAX_THREADS + ) + expect_true(length(model$params$categorical_feature) > 0L) +}) + +test_that("lightgbm() correctly sets objective when passing lgb.Dataset as input", { + data(mtcars) + y <- mtcars$mpg + x <- as.matrix(mtcars[, -1L]) + ds <- lgb.Dataset(x, label = y) + model <- lightgbm( + ds + , objective = "auto" + , verbose = .LGB_VERBOSITY + , nrounds = 5L + , num_threads = .LGB_MAX_THREADS + ) + expect_equal(model$params$objective, "regression") +}) diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R index 974430e1ab41..2c10b9d571dc 100644 --- a/R-package/tests/testthat/test_custom_objective.R +++ b/R-package/tests/testthat/test_custom_objective.R @@ -1,15 +1,9 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - data(agaricus.train, package = "lightgbm") data(agaricus.test, package = "lightgbm") dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label) dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label) watchlist <- list(eval = dtest, train 
= dtrain) -TOLERANCE <- 1e-6 - logregobj <- function(preds, dtrain) { labels <- get_field(dtrain, "label") preds <- 1.0 / (1.0 + exp(-preds)) @@ -38,7 +32,8 @@ param <- list( , learning_rate = 1.0 , objective = logregobj , metric = "auc" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) num_round <- 10L @@ -53,7 +48,8 @@ test_that("using a custom objective, custom eval, and no other metrics works", { params = list( num_leaves = 8L , learning_rate = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = 4L @@ -63,11 +59,11 @@ test_that("using a custom objective, custom eval, and no other metrics works", { ) expect_false(is.null(bst$record_evals)) expect_equal(bst$best_iter, 4L) - expect_true(abs(bst$best_score - 0.000621) < TOLERANCE) + expect_true(abs(bst$best_score - 0.000621) < .LGB_NUMERIC_TOLERANCE) eval_results <- bst$eval_valid(feval = evalerror)[[1L]] expect_true(eval_results[["data_name"]] == "eval") - expect_true(abs(eval_results[["value"]] - 0.0006207325) < TOLERANCE) + expect_true(abs(eval_results[["value"]] - 0.0006207325) < .LGB_NUMERIC_TOLERANCE) expect_true(eval_results[["name"]] == "error") expect_false(eval_results[["higher_better"]]) }) @@ -79,7 +75,7 @@ test_that("using a custom objective that returns wrong shape grad or hess raises bad_hess <- function(preds, dtrain) { return(list(grad = rep(1.0, length(preds)), hess = numeric(0L))) } - params <- list(num_leaves = 3L, verbose = VERBOSITY) + params <- list(num_leaves = 3L, verbose = .LGB_VERBOSITY) expect_error({ lgb.train(params = params, data = dtrain, obj = bad_grad) }, sprintf("Expected custom objective function to return grad with length %d, got 0.", nrow(dtrain))) diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index bf4c826ef448..cf68ce9262a3 100644 --- a/R-package/tests/testthat/test_dataset.R +++ 
b/R-package/tests/testthat/test_dataset.R @@ -1,7 +1,3 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - data(agaricus.train, package = "lightgbm") train_data <- agaricus.train$data[seq_len(1000L), ] train_label <- agaricus.train$label[seq_len(1000L)] @@ -16,7 +12,7 @@ test_that("lgb.Dataset: basic construction, saving, loading", { test_data , label = test_label , params = list( - verbose = VERBOSITY + verbose = .LGB_VERBOSITY ) ) # from dense matrix @@ -30,7 +26,7 @@ test_that("lgb.Dataset: basic construction, saving, loading", { dtest3 <- lgb.Dataset( tmp_file , params = list( - verbose = VERBOSITY + verbose = .LGB_VERBOSITY ) ) lgb.Dataset.construct(dtest3) @@ -133,7 +129,7 @@ test_that("Dataset$set_reference() updates categorical_feature, colnames, and pr dtrain$construct() bst <- Booster$new( train_set = dtrain - , params = list(verbose = -1L) + , params = list(verbose = -1L, num_threads = .LGB_MAX_THREADS) ) dtrain$.__enclos_env__$private$predictor <- bst$to_predictor() @@ -376,7 +372,7 @@ test_that("lgb.Dataset: should be able to run lgb.train() immediately after usin data = test_data , label = test_label , params = list( - verbose = VERBOSITY + verbose = .LGB_VERBOSITY ) ) tmp_file <- tempfile(pattern = "lgb.Dataset_") @@ -393,7 +389,8 @@ test_that("lgb.Dataset: should be able to run lgb.train() immediately after usin , metric = "binary_logloss" , num_leaves = 5L , learning_rate = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) # should be able to train right away @@ -410,7 +407,7 @@ test_that("lgb.Dataset: should be able to run lgb.cv() immediately after using l data = test_data , label = test_label , params = list( - verbosity = VERBOSITY + verbosity = .LGB_VERBOSITY ) ) tmp_file <- tempfile(pattern = "lgb.Dataset_") @@ -428,7 +425,8 @@ test_that("lgb.Dataset: should be able to run lgb.cv() immediately after using l , num_leaves = 5L , learning_rate = 1.0 , num_iterations = 5L - , 
verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) # should be able to train right away @@ -444,7 +442,7 @@ test_that("lgb.Dataset: should be able to use and retrieve long feature names", # set one feature to a value longer than the default buffer size used # in LGBM_DatasetGetFeatureNames_R feature_names <- names(iris) - long_name <- paste0(rep("a", 1000L), collapse = "") + long_name <- strrep("a", 1000L) feature_names[1L] <- long_name names(iris) <- feature_names # check that feature name survived the trip from R to C++ and back @@ -473,7 +471,7 @@ test_that("lgb.Dataset: should be able to create a Dataset from a text file with data = train_file , params = list( header = TRUE - , verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY ) ) dtrain$construct() @@ -497,7 +495,7 @@ test_that("lgb.Dataset: should be able to create a Dataset from a text file with data = train_file , params = list( header = FALSE - , verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY ) ) dtrain$construct() diff --git a/R-package/tests/testthat/test_learning_to_rank.R b/R-package/tests/testthat/test_learning_to_rank.R index c1c7768dac7d..6868794cf8ec 100644 --- a/R-package/tests/testthat/test_learning_to_rank.R +++ b/R-package/tests/testthat/test_learning_to_rank.R @@ -1,12 +1,3 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - -# numerical tolerance to use when checking metric values -TOLERANCE <- 1e-06 - -ON_32_BIT_WINDOWS <- .Platform$OS.type == "windows" && .Machine$sizeof.pointer != 8L - test_that("learning-to-rank with lgb.train() works as expected", { set.seed(708L) data(agaricus.train, package = "lightgbm") @@ -26,7 +17,8 @@ test_that("learning-to-rank with lgb.train() works as expected", { , ndcg_at = ndcg_at , lambdarank_truncation_level = 3L , learning_rate = 0.001 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) model <- lgb.train( params = params @@ -59,15 +51,15 @@ 
test_that("learning-to-rank with lgb.train() works as expected", { , eval_names ) expect_equal(eval_results[[1L]][["value"]], 0.775) - if (!ON_32_BIT_WINDOWS) { - expect_true(abs(eval_results[[2L]][["value"]] - 0.745986) < TOLERANCE) - expect_true(abs(eval_results[[3L]][["value"]] - 0.7351959) < TOLERANCE) + if (!.LGB_ON_32_BIT_WINDOWS) { + expect_true(abs(eval_results[[2L]][["value"]] - 0.745986) < .LGB_NUMERIC_TOLERANCE) + expect_true(abs(eval_results[[3L]][["value"]] - 0.7351959) < .LGB_NUMERIC_TOLERANCE) } }) test_that("learning-to-rank with lgb.cv() works as expected", { testthat::skip_if( - ON_32_BIT_WINDOWS + .LGB_ON_32_BIT_WINDOWS , message = "Skipping on 32-bit Windows" ) set.seed(708L) @@ -90,7 +82,8 @@ test_that("learning-to-rank with lgb.cv() works as expected", { , label_gain = "0,1,3" , min_data = 1L , learning_rate = 0.01 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) nfold <- 4L nrounds <- 10L @@ -113,7 +106,7 @@ test_that("learning-to-rank with lgb.cv() works as expected", { best_score <- cv_bst$best_score expect_true(best_iter > 0L && best_iter <= nrounds) expect_true(best_score > 0.0 && best_score < 1.0) - expect_true(abs(best_score - 0.75) < TOLERANCE) + expect_true(abs(best_score - 0.75) < .LGB_NUMERIC_TOLERANCE) # best_score should be set for the first metric first_metric <- eval_names[[1L]] @@ -136,19 +129,19 @@ test_that("learning-to-rank with lgb.cv() works as expected", { # first and last value of each metric should be as expected ndcg1_values <- c(0.675, 0.725, 0.65, 0.725, 0.75, 0.725, 0.75, 0.725, 0.75, 0.75) - expect_true(all(abs(unlist(eval_results[["ndcg@1"]][["eval"]]) - ndcg1_values) < TOLERANCE)) + expect_true(all(abs(unlist(eval_results[["ndcg@1"]][["eval"]]) - ndcg1_values) < .LGB_NUMERIC_TOLERANCE)) ndcg2_values <- c( 0.6556574, 0.6669721, 0.6306574, 0.6476294, 0.6629581, 0.6476294, 0.6629581, 0.6379581, 0.7113147, 0.6823008 ) - 
expect_true(all(abs(unlist(eval_results[["ndcg@2"]][["eval"]]) - ndcg2_values) < TOLERANCE)) + expect_true(all(abs(unlist(eval_results[["ndcg@2"]][["eval"]]) - ndcg2_values) < .LGB_NUMERIC_TOLERANCE)) ndcg3_values <- c( 0.6484639, 0.6571238, 0.6469279, 0.6540516, 0.6481857, 0.6481857, 0.6481857, 0.6466496, 0.7027939, 0.6629898 ) - expect_true(all(abs(unlist(eval_results[["ndcg@3"]][["eval"]]) - ndcg3_values) < TOLERANCE)) + expect_true(all(abs(unlist(eval_results[["ndcg@3"]][["eval"]]) - ndcg3_values) < .LGB_NUMERIC_TOLERANCE)) # check details of each booster for (bst in cv_bst$boosters) { diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 8ccfdaa336ae..5f398f1c081d 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -1,10 +1,3 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - -ON_WINDOWS <- .Platform$OS.type == "windows" -TOLERANCE <- 1e-6 - test_that("Booster$finalize() should not fail", { X <- as.matrix(as.integer(iris[, "Species"]), ncol = 1L) y <- iris[["Sepal.Length"]] @@ -13,8 +6,9 @@ test_that("Booster$finalize() should not fail", { data = dtrain , params = list( objective = "regression" + , num_threads = .LGB_MAX_THREADS ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 3L ) expect_true(lgb.is.Booster(bst)) @@ -65,7 +59,8 @@ test_that("lgb.get.eval.result() should throw an informative error for incorrect , metric = "l2" , min_data = 1L , learning_rate = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = 5L @@ -99,7 +94,7 @@ test_that("lgb.get.eval.result() should throw an informative error for incorrect , metric = "l2" , min_data = 1L , learning_rate = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) , data = dtrain , nrounds = 5L @@ -133,7 +128,7 @@ test_that("lgb.load() gives the expected error messages given 
different incorrec objective = "binary" , num_leaves = 4L , learning_rate = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) , nrounds = 2L ) @@ -184,7 +179,8 @@ test_that("Loading a Booster from a text file works", { , metric = c("mape", "average_precision") , learning_rate = 1.0 , objective = "binary" - , verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- lightgbm( data = as.matrix(train$data) @@ -227,13 +223,14 @@ test_that("boosters with linear models at leaves can be written to text file and , metric = "mse" , seed = 0L , num_leaves = 2L + , num_threads = .LGB_MAX_THREADS ) bst <- lgb.train( data = dtrain , nrounds = 10L , params = params - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) expect_true(lgb.is.Booster(bst)) @@ -267,7 +264,8 @@ test_that("Loading a Booster from a string works", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 2L ) @@ -299,9 +297,10 @@ test_that("Saving a large model to string should work", { num_leaves = 100L , learning_rate = 0.01 , objective = "binary" + , num_threads = .LGB_MAX_THREADS ) , nrounds = 500L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) pred <- predict(bst, train$data) @@ -342,9 +341,10 @@ test_that("Saving a large model to JSON should work", { num_leaves = 100L , learning_rate = 0.01 , objective = "binary" + , num_threads = .LGB_MAX_THREADS ) , nrounds = 200L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) model_json <- bst$dump_model() @@ -371,7 +371,8 @@ test_that("If a string and a file are both passed to lgb.load() the file is used num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 2L ) @@ -405,7 +406,8 @@ test_that("Creating a Booster from a Dataset should work", { bst <- Booster$new( params = list( objective = 
"binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ), train_set = dtrain ) @@ -426,7 +428,8 @@ test_that("Creating a Booster from a Dataset with an existing predictor should w num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds ) @@ -439,7 +442,8 @@ test_that("Creating a Booster from a Dataset with an existing predictor should w bst_from_ds <- Booster$new( train_set = dtest , params = list( - verbose = VERBOSITY + verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) ) expect_true(lgb.is.Booster(bst)) @@ -462,7 +466,8 @@ test_that("Booster$eval() should work on a Dataset stored in a binary file", { objective = "regression" , metric = "l2" , num_leaves = 4L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = 2L @@ -492,14 +497,14 @@ test_that("Booster$eval() should work on a Dataset stored in a binary file", { eval_from_file <- bst$eval( data = lgb.Dataset( data = test_file - , params = list(verbose = VERBOSITY) + , params = list(verbose = .LGB_VERBOSITY, num_threads = .LGB_MAX_THREADS) )$construct() , name = "test" ) - expect_true(abs(eval_in_mem[[1L]][["value"]] - 0.1744423) < TOLERANCE) + expect_true(abs(eval_in_mem[[1L]][["value"]] - 0.1744423) < .LGB_NUMERIC_TOLERANCE) # refer to https://github.com/microsoft/LightGBM/issues/4680 - if (isTRUE(ON_WINDOWS)) { + if (isTRUE(.LGB_ON_WINDOWS)) { expect_equal(eval_in_mem, eval_from_file) } else { expect_identical(eval_in_mem, eval_from_file) @@ -520,7 +525,8 @@ test_that("Booster$rollback_one_iter() should work as expected", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds ) @@ -554,7 +560,8 @@ test_that("Booster$update() passing a train_set works as expected", { 
num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds ) @@ -564,7 +571,7 @@ test_that("Booster$update() passing a train_set works as expected", { train_set = Dataset$new( data = agaricus.train$data , label = agaricus.train$label - , params = list(verbose = VERBOSITY) + , params = list(verbose = .LGB_VERBOSITY) ) ) expect_true(lgb.is.Booster(bst)) @@ -578,7 +585,8 @@ test_that("Booster$update() passing a train_set works as expected", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds + 1L ) @@ -603,7 +611,8 @@ test_that("Booster$update() throws an informative error if you provide a non-Dat num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds ) @@ -630,7 +639,8 @@ test_that("Booster should store parameters and Booster$reset_parameter() should , metric = c("multi_logloss", "multi_error") , boosting = "gbdt" , num_class = 5L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- Booster$new( params = params @@ -657,7 +667,8 @@ test_that("Booster$params should include dataset params, before and after Booste objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- Booster$new( params = params @@ -669,7 +680,8 @@ test_that("Booster$params should include dataset params, before and after Booste objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS , max_bin = 17L ) ) @@ -680,7 +692,8 @@ test_that("Booster$params should include dataset params, before and after Booste objective = "binary" 
, max_depth = 4L , bagging_fraction = 0.9 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS , max_bin = 17L ) expect_identical(ret_bst$params, expected_params) @@ -698,7 +711,8 @@ test_that("Saving a model with different feature importance types works", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 2L ) @@ -753,7 +767,8 @@ test_that("Saving a model with unknown importance type fails", { num_leaves = 4L , learning_rate = 1.0 , objective = "binary" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) , nrounds = 2L ) @@ -784,36 +799,166 @@ test_that("all parameters are stored correctly with save_model_to_string()", { data = matrix(rnorm(500L), nrow = 100L) , label = rnorm(100L) ) - nrounds <- 4L bst <- lgb.train( params = list( - objective = "regression" - , metric = "l2" + objective = "mape" + , metric = c("l2", "mae") + , num_threads = .LGB_MAX_THREADS + , seed = 708L + , data_sample_strategy = "bagging" + , sub_row = 0.8234 ) , data = dtrain - , nrounds = nrounds - , verbose = VERBOSITY - ) + , nrounds = 3L + , verbose = .LGB_VERBOSITY + ) + + # entries whose values should reflect params passed to lgb.train() + non_default_param_entries <- c( + "[objective: mape]" + # 'l1' was passed in with alias 'mae' + , "[metric: l2,l1]" + , "[data_sample_strategy: bagging]" + , "[seed: 708]" + # this was passed in with alias 'sub_row' + , "[bagging_fraction: 0.8234]" + , "[num_iterations: 3]" + ) + + # entries with default values of params + default_param_entries <- c( + "[boosting: gbdt]" + , "[tree_learner: serial]" + , "[device_type: cpu]" + , "[data: ]" + , "[valid: ]" + , "[learning_rate: 0.1]" + , "[num_leaves: 31]" + , sprintf("[num_threads: %i]", .LGB_MAX_THREADS) + , "[deterministic: 0]" + , "[histogram_pool_size: -1]" + , "[max_depth: -1]" + , "[min_data_in_leaf: 20]" + , 
"[min_sum_hessian_in_leaf: 0.001]" + , "[pos_bagging_fraction: 1]" + , "[neg_bagging_fraction: 1]" + , "[bagging_freq: 0]" + , "[bagging_seed: 15415]" + , "[feature_fraction: 1]" + , "[feature_fraction_bynode: 1]" + , "[feature_fraction_seed: 32671]" + , "[extra_trees: 0]" + , "[extra_seed: 6642]" + , "[early_stopping_round: 0]" + , "[first_metric_only: 0]" + , "[max_delta_step: 0]" + , "[lambda_l1: 0]" + , "[lambda_l2: 0]" + , "[linear_lambda: 0]" + , "[min_gain_to_split: 0]" + , "[drop_rate: 0.1]" + , "[max_drop: 50]" + , "[skip_drop: 0.5]" + , "[xgboost_dart_mode: 0]" + , "[uniform_drop: 0]" + , "[drop_seed: 20623]" + , "[top_rate: 0.2]" + , "[other_rate: 0.1]" + , "[min_data_per_group: 100]" + , "[max_cat_threshold: 32]" + , "[cat_l2: 10]" + , "[cat_smooth: 10]" + , "[max_cat_to_onehot: 4]" + , "[top_k: 20]" + , "[monotone_constraints: ]" + , "[monotone_constraints_method: basic]" + , "[monotone_penalty: 0]" + , "[feature_contri: ]" + , "[forcedsplits_filename: ]" + , "[force_col_wise: 0]" + , "[force_row_wise: 0]" + , "[refit_decay_rate: 0.9]" + , "[cegb_tradeoff: 1]" + , "[cegb_penalty_split: 0]" + , "[cegb_penalty_feature_lazy: ]" + , "[cegb_penalty_feature_coupled: ]" + , "[path_smooth: 0]" + , "[interaction_constraints: ]" + , sprintf("[verbosity: %i]", .LGB_VERBOSITY) + , "[saved_feature_importance_type: 0]" + , "[use_quantized_grad: 0]" + , "[num_grad_quant_bins: 4]" + , "[quant_train_renew_leaf: 0]" + , "[stochastic_rounding: 1]" + , "[linear_tree: 0]" + , "[max_bin: 255]" + , "[max_bin_by_feature: ]" + , "[min_data_in_bin: 3]" + , "[bin_construct_sample_cnt: 200000]" + , "[data_random_seed: 2350]" + , "[is_enable_sparse: 1]" + , "[enable_bundle: 1]" + , "[use_missing: 1]" + , "[zero_as_missing: 0]" + , "[feature_pre_filter: 1]" + , "[pre_partition: 0]" + , "[two_round: 0]" + , "[header: 0]" + , "[label_column: ]" + , "[weight_column: ]" + , "[group_column: ]" + , "[ignore_column: ]" + , "[categorical_feature: ]" + , "[forcedbins_filename: ]" + , 
"[precise_float_parser: 0]" + , "[parser_config_file: ]" + , "[objective_seed: 4309]" + , "[num_class: 1]" + , "[is_unbalance: 0]" + , "[scale_pos_weight: 1]" + , "[sigmoid: 1]" + , "[boost_from_average: 1]" + , "[reg_sqrt: 0]" + , "[alpha: 0.9]" + , "[fair_c: 1]" + , "[poisson_max_delta_step: 0.7]" + , "[tweedie_variance_power: 1.5]" + , "[lambdarank_truncation_level: 30]" + , "[lambdarank_norm: 1]" + , "[label_gain: ]" + , "[lambdarank_position_bias_regularization: 0]" + , "[eval_at: ]" + , "[multi_error_top_k: 1]" + , "[auc_mu_weights: ]" + , "[num_machines: 1]" + , "[local_listen_port: 12400]" + , "[time_out: 120]" + , "[machine_list_filename: ]" + , "[machines: ]" + , "[gpu_platform_id: -1]" + , "[gpu_device_id: -1]" + , "[gpu_use_dp: 0]" + , "[num_gpu: 1]" + ) + all_param_entries <- c(non_default_param_entries, default_param_entries) + # parameters should match what was passed from the R package model_str <- bst$save_model_to_string() params_in_file <- .params_from_model_string(model_str = model_str) - - # parameters should match what was passed from the R package - expect_equal(sum(startsWith(params_in_file, "[metric:")), 1L) - expect_equal(sum(params_in_file == "[metric: l2]"), 1L) - - expect_equal(sum(startsWith(params_in_file, "[num_iterations:")), 1L) - expect_equal(sum(params_in_file == "[num_iterations: 4]"), 1L) - - expect_equal(sum(startsWith(params_in_file, "[objective:")), 1L) - expect_equal(sum(params_in_file == "[objective: regression]"), 1L) - - expect_equal(sum(startsWith(params_in_file, "[verbosity:")), 1L) - expect_equal(sum(params_in_file == sprintf("[verbosity: %i]", VERBOSITY)), 1L) + .expect_in(all_param_entries, params_in_file) # early stopping should be off by default expect_equal(sum(startsWith(params_in_file, "[early_stopping_round:")), 1L) expect_equal(sum(params_in_file == "[early_stopping_round: 0]"), 1L) + + # since save_model_to_string() is used when serializing with saveRDS(), check that parameters all + # roundtrip 
saveRDS()/loadRDS() successfully + rds_file <- tempfile() + saveRDS(bst, rds_file) + bst_rds <- readRDS(rds_file) + model_str <- bst_rds$save_model_to_string() + params_in_file <- .params_from_model_string(model_str = model_str) + .expect_in(all_param_entries, params_in_file) }) test_that("early_stopping, num_iterations are stored correctly in model string even with aliases", { @@ -845,6 +990,7 @@ test_that("early_stopping, num_iterations are stored correctly in model string e , n_iter = n_iter , early_stopping_round = early_stopping_round , n_iter_no_change = n_iter_no_change + , num_threads = .LGB_MAX_THREADS ) bst <- lgb.train( @@ -855,7 +1001,7 @@ test_that("early_stopping, num_iterations are stored correctly in model string e , valids = list( "random_valid" = dvalid ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) model_str <- bst$save_model_to_string() @@ -884,9 +1030,10 @@ test_that("Booster: method calls Booster with a null handle should raise an info objective = "regression" , metric = "l2" , num_leaves = 8L + , num_threads = .LGB_MAX_THREADS ) , data = dtrain - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 5L , valids = list( train = dtrain @@ -962,7 +1109,7 @@ test_that("Booster$new() using a Dataset with a null handle should raise an info bst <- Booster$new( train_set = dtrain , params = list( - verbose = VERBOSITY + verbose = .LGB_VERBOSITY ) ) }, regexp = "Attempting to create a Dataset without any raw data") @@ -1073,7 +1220,8 @@ test_that("lgb.cv() correctly handles passing through params to the model file", , n_iter = n_iter , early_stopping_round = early_stopping_round , n_iter_no_change = n_iter_no_change - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) cv_bst <- lgb.cv( @@ -1082,7 +1230,7 @@ test_that("lgb.cv() correctly handles passing through params to the model file", , nrounds = nrounds_kwarg , early_stopping_rounds = early_stopping_round_kwarg , nfold = 3L - , verbose = 
VERBOSITY + , verbose = .LGB_VERBOSITY ) for (bst in cv_bst$boosters) { @@ -1117,7 +1265,8 @@ test_that("params (including dataset params) should be stored in .rds file for B objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- Booster$new( params = params @@ -1133,7 +1282,8 @@ test_that("params (including dataset params) should be stored in .rds file for B objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS , max_bin = 17L ) ) @@ -1152,7 +1302,8 @@ test_that("params (including dataset params) should be stored in .rds file for B objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) bst <- Booster$new( params = params @@ -1168,7 +1319,8 @@ test_that("params (including dataset params) should be stored in .rds file for B objective = "binary" , max_depth = 4L , bagging_fraction = 0.8 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS , max_bin = 17L ) ) @@ -1182,8 +1334,9 @@ test_that("Handle is automatically restored when calling predict", { , nrounds = 5L , obj = "binary" , params = list( - verbose = VERBOSITY + verbose = .LGB_VERBOSITY ) + , num_threads = .LGB_MAX_THREADS ) bst_file <- tempfile(fileext = ".rds") saveRDS(bst, file = bst_file) @@ -1205,10 +1358,11 @@ test_that("boosters with linear models at leaves work with saveRDS.lgb.Booster a params <- list( objective = "regression" - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L + , num_threads = .LGB_MAX_THREADS ) bst <- lgb.train( @@ -1244,10 +1398,11 @@ test_that("boosters with linear models at leaves can be written to RDS and re-lo params <- list( objective = "regression" - , verbose = VERBOSITY + , verbose = 
.LGB_VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L + , num_threads = .LGB_MAX_THREADS ) bst <- lgb.train( @@ -1308,7 +1463,7 @@ test_that("Booster's print, show, and summary work correctly", { .has_expected_content_for_fitted_model(log_txt) # summary() - log_text <- capture.output({ + log_txt <- capture.output({ ret <- summary(model) }) .have_same_handle(ret, model) @@ -1344,6 +1499,7 @@ test_that("Booster's print, show, and summary work correctly", { params = list( objective = "regression" , min_data_in_leaf = 1L + , num_threads = .LGB_MAX_THREADS ) , data = lgb.Dataset( as.matrix(mtcars[, -1L]) @@ -1352,19 +1508,19 @@ test_that("Booster's print, show, and summary work correctly", { min_data_in_bin = 1L ) ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 5L ) .check_methods_work(model) data("iris") model <- lgb.train( - params = list(objective = "multiclass", num_class = 3L) + params = list(objective = "multiclass", num_class = 3L, num_threads = .LGB_MAX_THREADS) , data = lgb.Dataset( as.matrix(iris[, -5L]) , label = as.numeric(factor(iris$Species)) - 1.0 ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 5L ) .check_methods_work(model) @@ -1397,8 +1553,9 @@ test_that("Booster's print, show, and summary work correctly", { ) , obj = .logregobj , eval = .evalerror - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 5L + , params = list(num_threads = .LGB_MAX_THREADS) ) .check_methods_work(model) @@ -1410,6 +1567,7 @@ test_that("LGBM_BoosterGetNumFeature_R returns correct outputs", { params = list( objective = "regression" , min_data_in_leaf = 1L + , num_threads = .LGB_MAX_THREADS ) , data = lgb.Dataset( as.matrix(mtcars[, -1L]) @@ -1418,7 +1576,7 @@ test_that("LGBM_BoosterGetNumFeature_R returns correct outputs", { min_data_in_bin = 1L ) ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 5L ) ncols <- .Call(LGBM_BoosterGetNumFeature_R, model$.__enclos_env__$private$handle) @@ -1431,7 +1589,7 @@ 
test_that("LGBM_BoosterGetNumFeature_R returns correct outputs", { as.matrix(iris[, -5L]) , label = as.numeric(factor(iris$Species)) - 1.0 ) - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY , nrounds = 5L ) ncols <- .Call(LGBM_BoosterGetNumFeature_R, model$.__enclos_env__$private$handle) diff --git a/R-package/tests/testthat/test_lgb.interprete.R b/R-package/tests/testthat/test_lgb.interprete.R index 29ac110accbc..322a80a55bc5 100644 --- a/R-package/tests/testthat/test_lgb.interprete.R +++ b/R-package/tests/testthat/test_lgb.interprete.R @@ -1,7 +1,3 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - .sigmoid <- function(x) { 1.0 / (1.0 + exp(-x)) } @@ -30,7 +26,8 @@ test_that("lgb.intereprete works as expected for binary classification", { , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) model <- lgb.train( params = params @@ -82,7 +79,8 @@ test_that("lgb.intereprete works as expected for multiclass classification", { , num_class = 3L , learning_rate = 0.00001 , min_data = 1L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) model <- lgb.train( params = params diff --git a/R-package/tests/testthat/test_lgb.plot.importance.R b/R-package/tests/testthat/test_lgb.plot.importance.R index 1a1e2b0d5398..e7ff63facde5 100644 --- a/R-package/tests/testthat/test_lgb.plot.importance.R +++ b/R-package/tests/testthat/test_lgb.plot.importance.R @@ -1,7 +1,3 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - test_that("lgb.plot.importance() should run without error for well-formed inputs", { data(agaricus.train, package = "lightgbm") train <- agaricus.train @@ -13,7 +9,8 @@ test_that("lgb.plot.importance() should run without error for well-formed inputs , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 - , verbosity = VERBOSITY + , verbosity = 
.LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) model <- lgb.train(params, dtrain, 3L) tree_imp <- lgb.importance(model, percentage = TRUE) diff --git a/R-package/tests/testthat/test_lgb.plot.interpretation.R b/R-package/tests/testthat/test_lgb.plot.interpretation.R index bb8009d3595b..6cba9927942a 100644 --- a/R-package/tests/testthat/test_lgb.plot.interpretation.R +++ b/R-package/tests/testthat/test_lgb.plot.interpretation.R @@ -1,7 +1,3 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - .sigmoid <- function(x) { 1.0 / (1.0 + exp(-x)) } @@ -30,7 +26,8 @@ test_that("lgb.plot.interepretation works as expected for binary classification" , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 - , verbosity = VERBOSITY + , verbosity = .LGB_VERBOSITY + , num_threads = .LGB_MAX_THREADS ) model <- lgb.train( params = params @@ -80,12 +77,13 @@ test_that("lgb.plot.interepretation works as expected for multiclass classificat , num_class = 3L , learning_rate = 0.00001 , min_data = 1L + , num_threads = .LGB_MAX_THREADS ) model <- lgb.train( params = params , data = dtrain , nrounds = 3L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) num_trees <- 5L tree_interpretation <- lgb.interprete( diff --git a/R-package/tests/testthat/test_parameters.R b/R-package/tests/testthat/test_parameters.R index 3f98f8d2907e..367f01af817c 100644 --- a/R-package/tests/testthat/test_parameters.R +++ b/R-package/tests/testthat/test_parameters.R @@ -20,6 +20,7 @@ test_that("Feature penalties work properly", { , objective = "binary" , feature_penalty = paste0(feature_penalties, collapse = ",") , metric = "binary_error" + , num_threads = .LGB_MAX_THREADS ) , nrounds = 5L , verbose = -1L @@ -97,6 +98,7 @@ test_that("training should warn if you use 'dart' boosting, specified with 'boos , learning_rate = 0.05 , objective = "binary" , metric = "binary_error" + , num_threads = .LGB_MAX_THREADS ) params[[boosting_param]] <- "dart" expect_warning({ 
diff --git a/R-package/tests/testthat/test_weighted_loss.R b/R-package/tests/testthat/test_weighted_loss.R index d00399548560..f9f9675c3bb9 100644 --- a/R-package/tests/testthat/test_weighted_loss.R +++ b/R-package/tests/testthat/test_weighted_loss.R @@ -1,7 +1,3 @@ -VERBOSITY <- as.integer( - Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") -) - test_that("Gamma regression reacts on 'weight'", { n <- 100L set.seed(87L) @@ -9,7 +5,7 @@ test_that("Gamma regression reacts on 'weight'", { y <- X[, 1L] + X[, 2L] + runif(n) X_pred <- X[1L:5L, ] - params <- list(objective = "gamma") + params <- list(objective = "gamma", num_threads = .LGB_MAX_THREADS) # Unweighted dtrain <- lgb.Dataset(X, label = y) @@ -17,7 +13,7 @@ test_that("Gamma regression reacts on 'weight'", { params = params , data = dtrain , nrounds = 4L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) pred_unweighted <- predict(bst, X_pred) @@ -31,7 +27,7 @@ test_that("Gamma regression reacts on 'weight'", { params = params , data = dtrain , nrounds = 4L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) pred_weighted_1 <- predict(bst, X_pred) @@ -45,7 +41,7 @@ test_that("Gamma regression reacts on 'weight'", { params = params , data = dtrain , nrounds = 4L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) pred_weighted_2 <- predict(bst, X_pred) @@ -59,7 +55,7 @@ test_that("Gamma regression reacts on 'weight'", { params = params , data = dtrain , nrounds = 4L - , verbose = VERBOSITY + , verbose = .LGB_VERBOSITY ) pred_weighted <- predict(bst, X_pred) diff --git a/README.md b/README.md index a44d557f058b..f6f4e8c570e0 100644 --- a/README.md +++ b/README.md @@ -126,6 +126,8 @@ lightgbm-transform (feature transformation binding): https://github.com/microsof `postgresml` (LightGBM training and prediction in SQL, via a Postgres extension): https://github.com/postgresml/postgresml +`vaex-ml` (Python DataFrame library with its own interface to LightGBM): https://github.com/vaexio/vaex + Support ------- diff 
--git a/VERSION.txt b/VERSION.txt index 200681852af8..1f06da0058c9 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -4.0.0.99 +4.1.0.99 diff --git a/build-python.sh b/build-python.sh index e535eeb06abc..8892ca2e936f 100755 --- a/build-python.sh +++ b/build-python.sh @@ -48,6 +48,9 @@ # Compile with MinGW. # --mpi # Compile MPI version. +# --no-isolation +# Assume all build and install dependencies are already installed, +# don't go to the internet to get them. # --nomp # Compile version without OpenMP support. # --precompile @@ -159,6 +162,10 @@ while [ $# -gt 0 ]; do --mpi) BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.USE_MPI=ON" ;; + --no-isolation) + BUILD_ARGS="${BUILD_ARGS} --no-isolation" + PIP_INSTALL_ARGS="${PIP_INSTALL_ARGS} --no-build-isolation" + ;; --nomp) BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.USE_OPENMP=OFF" ;; @@ -337,6 +344,7 @@ if test "${BUILD_SDIST}" = true; then python -m build \ --sdist \ --outdir ../dist \ + ${BUILD_ARGS} \ . fi diff --git a/docker/dockerfile-python b/docker/dockerfile-python index 541884811a0b..900d05c30012 100644 --- a/docker/dockerfile-python +++ b/docker/dockerfile-python @@ -26,6 +26,7 @@ RUN apt-get update && \ # lightgbm conda install -q -y numpy scipy scikit-learn pandas && \ git clone --recursive --branch stable --depth 1 https://github.com/Microsoft/LightGBM && \ + cd ./LightGBM && \ sh ./build-python.sh install && \ # clean apt-get autoremove -y && apt-get clean && \ diff --git a/docs/.linkcheckerrc b/docs/.linkcheckerrc index e6ab4ea1a5df..96fdcbd08157 100644 --- a/docs/.linkcheckerrc +++ b/docs/.linkcheckerrc @@ -9,6 +9,7 @@ threads=1 ignore= pythonapi/lightgbm\..*\.html.* http.*amd.com/.* + https.*dl.acm.org/doi/.* https.*tandfonline.com/.* ignorewarnings=http-robots-denied,https-certificate-error checkextern=1 diff --git a/docs/Advanced-Topics.rst b/docs/Advanced-Topics.rst index d1787b998479..345a1361bfa9 100644 --- a/docs/Advanced-Topics.rst +++ b/docs/Advanced-Topics.rst @@ 
-77,3 +77,44 @@ Recommendations for gcc Users (MinGW, \*nix) -------------------------------------------- - Refer to `gcc Tips <./gcc-Tips.rst>`__. + +Support for Position Bias Treatment +------------------------------------ + +Often the relevance labels provided in Learning-to-Rank tasks might be derived from implicit user feedback (e.g., clicks) and therefore might be biased due to their position/location on the screen when having been presented to a user. +LightGBM can make use of positional data. + +For example, consider the case where you expect that the first 3 results from a search engine will be visible in users' browsers without scrolling, and all other results for a query would require scrolling. + +LightGBM could be told to account for the position bias from results being "above the fold" by providing a ``positions`` array encoded as follows: + +:: + + 0 + 0 + 0 + 1 + 1 + 0 + 0 + 0 + 1 + ... + +Where ``0 = "above the fold"`` and ``1 = "requires scrolling"``. +The specific values are not important, as long as they are consistent across all observations in the training data. +An encoding like ``100 = "above the fold"`` and ``17 = "requires scrolling"`` would result in exactly the same trained model. + +In that way, ``positions`` in LightGBM's API are similar to a categorical feature. +Just as with non-ordinal categorical features, an integer representation is just used for memory and computational efficiency... LightGBM does not care about the absolute or relative magnitude of the values. + +Unlike a categorical feature, however, ``positions`` are used to adjust the target to reduce the bias in predictions made by the trained model. + +The position file corresponds with training data file line by line, and has one position per line. And if the name of training data file is ``train.txt``, the position file should be named as ``train.txt.position`` and placed in the same folder as the data file. 
+In this case, LightGBM will load the position file automatically if it exists. The positions can also be specified through the ``Dataset`` constructor when using Python API. If the positions are specified in both approaches, the ``.position`` file will be ignored. + +Currently, implemented is an approach to model position bias by using an idea of Generalized Additive Models (`GAM <https://en.wikipedia.org/wiki/Generalized_additive_model>`_) to linearly decompose the document score ``s`` into the sum of a relevance component ``f`` and a positional component ``g``: ``s(x, pos) = f(x) + g(pos)`` where the former component depends on the original query-document features and the latter depends on the position of an item. +During the training, the compound scoring function ``s(x, pos)`` is fit with a standard ranking algorithm (e.g., LambdaMART) which boils down to jointly learning the relevance component ``f(x)`` (it is later returned as an unbiased model) and the position factors ``g(pos)`` that help better explain the observed (biased) labels. +Similar score decomposition ideas have previously been applied for classification & pointwise ranking tasks with assumptions of binary labels and binary relevance (a.k.a. "two-tower" models, refer to the papers: `Towards Disentangling Relevance and Bias in Unbiased Learning to Rank `_, `PAL: a position-bias aware learning framework for CTR prediction in live recommender systems `_, `A General Framework for Debiasing in CTR Prediction `_). +In LightGBM, we adapt this idea to general pairwise Learning-to-Rank with arbitrary ordinal relevance labels. +Besides, GAMs have been used in the context of explainable ML (`Accurate Intelligible Models with Pairwise Interactions `_) to linearly decompose the contribution of each feature (and possibly their pairwise interactions) to the overall score, for subsequent analysis and interpretation of their effects in the trained models. 
diff --git a/docs/GPU-Windows.rst b/docs/GPU-Windows.rst index aa1cb2036f4e..36e657e5801b 100644 --- a/docs/GPU-Windows.rst +++ b/docs/GPU-Windows.rst @@ -602,9 +602,9 @@ And open an issue in GitHub `here`_ with that log. .. _Boost: https://www.boost.org/users/history/ -.. _Prebuilt Boost x86_64: https://www.rpmfind.net/linux/fedora/linux/releases/36/Everything/x86_64/os/Packages/m/mingw64-boost-static-1.75.0-6.fc36.noarch.rpm +.. _Prebuilt Boost x86_64: https://www.rpmfind.net/linux/fedora/linux/releases/38/Everything/x86_64/os/Packages/m/mingw64-boost-static-1.78.0-4.fc38.noarch.rpm -.. _Prebuilt Boost i686: https://www.rpmfind.net/linux/fedora/linux/releases/36/Everything/x86_64/os/Packages/m/mingw32-boost-static-1.75.0-6.fc36.noarch.rpm +.. _Prebuilt Boost i686: https://www.rpmfind.net/linux/fedora/linux/releases/38/Everything/x86_64/os/Packages/m/mingw32-boost-static-1.78.0-4.fc38.noarch.rpm .. _7zip: https://www.7-zip.org/download.html diff --git a/docs/Parallel-Learning-Guide.rst b/docs/Parallel-Learning-Guide.rst index 438fd3f9ee0c..e1857034e499 100644 --- a/docs/Parallel-Learning-Guide.rst +++ b/docs/Parallel-Learning-Guide.rst @@ -514,7 +514,7 @@ See `the mars documentation`_ for usage examples. .. _SynapseML: https://aka.ms/spark -.. _this SynapseML example: https://github.com/microsoft/SynapseML/blob/master/notebooks/features/lightgbm/LightGBM%20-%20Overview.ipynb +.. _this SynapseML example: https://github.com/microsoft/SynapseML/tree/master/docs/Explore%20Algorithms/LightGBM .. 
_the Dask Array documentation: https://docs.dask.org/en/latest/array.html diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 5eecc27889b6..86104ba5be55 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -1137,6 +1137,12 @@ Objective Parameters - separate by ``,`` +- ``lambdarank_position_bias_regularization`` :raw-html:`🔗︎`, default = ``0.0``, type = double, constraints: ``lambdarank_position_bias_regularization >= 0.0`` + + - used only in ``lambdarank`` application when positional information is provided and position bias is modeled. Larger values reduce the inferred position bias factors. + + - *New in version 4.1.0* + Metric Parameters ----------------- diff --git a/external_libs/fast_double_parser b/external_libs/fast_double_parser index ace60646c02d..efec03532ef6 160000 --- a/external_libs/fast_double_parser +++ b/external_libs/fast_double_parser @@ -1 +1 @@ -Subproject commit ace60646c02dc54c57f19d644e49a61e7e7758ec +Subproject commit efec03532ef65984786e5e32dbc81f6e6a55a115 diff --git a/external_libs/fmt b/external_libs/fmt index b6f4ceaed0a0..f5e54359df4c 160000 --- a/external_libs/fmt +++ b/external_libs/fmt @@ -1 +1 @@ -Subproject commit b6f4ceaed0a0a24ccf575fab6c56dd50ccf6f1a9 +Subproject commit f5e54359df4c26b6230fc61d38aa294581393084 diff --git a/helpers/parameter_generator.py b/helpers/parameter_generator.py index 407f2c73e1e3..a554ee60b6c9 100644 --- a/helpers/parameter_generator.py +++ b/helpers/parameter_generator.py @@ -330,7 +330,7 @@ def gen_parameter_code( str_to_write += ' std::string tmp_str = "";\n' for x in infos: for y in x: - if "[doc-only]" in y: + if "[no-automatically-extract]" in y: continue param_type = y["inner_type"][0] name = y["name"][0] @@ -345,7 +345,7 @@ def gen_parameter_code( str_to_write += " std::stringstream str_buf;\n" for x in infos: for y in x: - if "[doc-only]" in y or "[no-save]" in y: + if "[no-save]" in y: continue param_type = y["inner_type"][0] name = y["name"][0] diff --git 
a/include/LightGBM/config.h b/include/LightGBM/config.h index e01578396259..6d61bc764924 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -5,8 +5,13 @@ * \note * - desc and descl2 fields must be written in reStructuredText format; * - nested sections can be placed only at the bottom of parent's section; - * - [doc-only] tag indicates that only documentation for this param should be generated and all other actions are performed manually; - * - [no-save] tag indicates that this param should not be saved into a model text representation. + * - [no-automatically-extract] + * - do not automatically extract this parameter into a Config property with the same name in Config::GetMembersFromString(). Use if: + * - specialized extraction logic for this param exists in Config::GetMembersFromString() + * - [no-save] + * - this param should not be saved into a model text representation via Config::SaveMembersToString(). Use if: + * - param is only used by the CLI (especially the "predict" and "convert_model" tasks) + * - param is related to LightGBM writing files (e.g. 
"output_model", "save_binary") */ #ifndef LIGHTGBM_CONFIG_H_ #define LIGHTGBM_CONFIG_H_ @@ -97,15 +102,15 @@ struct Config { #pragma region Core Parameters #endif // __NVCC__ + // [no-automatically-extract] // [no-save] - // [doc-only] // alias = config_file // desc = path of config file // desc = **Note**: can be used only in CLI version std::string config = ""; + // [no-automatically-extract] // [no-save] - // [doc-only] // type = enum // default = train // options = train, predict, convert_model, refit @@ -118,7 +123,8 @@ struct Config { // desc = **Note**: can be used only in CLI version; for language-specific packages you can use the correspondent functions TaskType task = TaskType::kTrain; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // options = regression, regression_l1, huber, fair, poisson, quantile, mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy, cross_entropy_lambda, lambdarank, rank_xendcg // alias = objective_type, app, application, loss @@ -150,7 +156,8 @@ struct Config { // descl2 = label should be ``int`` type, and larger number represents the higher relevance (e.g. 
0:bad, 1:fair, 2:good, 3:perfect) std::string objective = "regression"; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // alias = boosting_type, boost // options = gbdt, rf, dart @@ -160,7 +167,7 @@ struct Config { // descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations std::string boosting = "gbdt"; - // [doc-only] + // [no-automatically-extract] // type = enum // options = bagging, goss // desc = ``bagging``, Randomly Bagging Sampling @@ -200,7 +207,8 @@ struct Config { // desc = max number of leaves in one tree int num_leaves = kDefaultNumLeaves; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // options = serial, feature, data, voting // alias = tree, tree_type, tree_learner_type @@ -222,7 +230,8 @@ struct Config { // desc = **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors int num_threads = 0; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // options = cpu, gpu, cuda // alias = device @@ -235,7 +244,7 @@ struct Config { // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support std::string device_type = "cpu"; - // [doc-only] + // [no-automatically-extract] // alias = random_seed, random_state // default = None // desc = this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``, etc. 
@@ -593,7 +602,6 @@ struct Config { // desc = **Note**: can be used only in CLI version int snapshot_freq = -1; - // [no-save] // desc = whether to use gradient quantization when training // desc = enabling this will discretize (quantize) the gradients and hessians into bins of ``num_grad_quant_bins`` // desc = with quantized training, most arithmetics in the training process will be integer operations @@ -602,21 +610,18 @@ struct Config { // desc = *New in version 4.0.0* bool use_quantized_grad = false; - // [no-save] // desc = number of bins to quantization gradients and hessians // desc = with more bins, the quantized training will be closer to full precision training // desc = **Note**: can be used only with ``device_type = cpu`` // desc = *New in 4.0.0* int num_grad_quant_bins = 4; - // [no-save] // desc = whether to renew the leaf values with original gradients when quantized training // desc = renewing is very helpful for good quantized training accuracy for ranking objectives // desc = **Note**: can be used only with ``device_type = cpu`` // desc = *New in 4.0.0* bool quant_train_renew_leaf = false; - // [no-save] // desc = whether to use stochastic rounding in gradient quantization // desc = *New in 4.0.0* bool stochastic_rounding = true; @@ -965,13 +970,19 @@ struct Config { // desc = separate by ``,`` std::vector label_gain; + // check = >=0.0 + // desc = used only in ``lambdarank`` application when positional information is provided and position bias is modeled. Larger values reduce the inferred position bias factors. 
+ // desc = *New in version 4.1.0* + double lambdarank_position_bias_regularization = 0.0; + #ifndef __NVCC__ #pragma endregion #pragma region Metric Parameters #endif // __NVCC__ - // [doc-only] + // [no-automatically-extract] + // [no-save] // alias = metrics, metric_types // default = "" // type = multi-enum diff --git a/include/LightGBM/cuda/cuda_column_data.hpp b/include/LightGBM/cuda/cuda_column_data.hpp index 6668c92f2921..5b2301ac8de3 100644 --- a/include/LightGBM/cuda/cuda_column_data.hpp +++ b/include/LightGBM/cuda/cuda_column_data.hpp @@ -98,6 +98,7 @@ class CUDAColumnData { void ResizeWhenCopySubrow(const data_size_t num_used_indices); + int gpu_device_id_; int num_threads_; data_size_t num_data_; int num_columns_; diff --git a/include/LightGBM/cuda/cuda_metric.hpp b/include/LightGBM/cuda/cuda_metric.hpp index 5eb04c81c777..9186ceea160b 100644 --- a/include/LightGBM/cuda/cuda_metric.hpp +++ b/include/LightGBM/cuda/cuda_metric.hpp @@ -9,6 +9,7 @@ #ifdef USE_CUDA +#include #include namespace LightGBM { @@ -19,6 +20,8 @@ class CUDAMetricInterface: public HOST_METRIC { explicit CUDAMetricInterface(const Config& config): HOST_METRIC(config) { cuda_labels_ = nullptr; cuda_weights_ = nullptr; + const int gpu_device_id = config.gpu_device_id >= 0 ? 
config.gpu_device_id : 0; + SetCUDADevice(gpu_device_id, __FILE__, __LINE__); } void Init(const Metadata& metadata, data_size_t num_data) override { diff --git a/include/LightGBM/cuda/cuda_objective_function.hpp b/include/LightGBM/cuda/cuda_objective_function.hpp index dacaf252f8e6..fae8aa7ec643 100644 --- a/include/LightGBM/cuda/cuda_objective_function.hpp +++ b/include/LightGBM/cuda/cuda_objective_function.hpp @@ -21,7 +21,10 @@ namespace LightGBM { template class CUDAObjectiveInterface: public HOST_OBJECTIVE { public: - explicit CUDAObjectiveInterface(const Config& config): HOST_OBJECTIVE(config) {} + explicit CUDAObjectiveInterface(const Config& config): HOST_OBJECTIVE(config) { + const int gpu_device_id = config.gpu_device_id >= 0 ? config.gpu_device_id : 0; + SetCUDADevice(gpu_device_id, __FILE__, __LINE__); + } explicit CUDAObjectiveInterface(const std::vector& strs): HOST_OBJECTIVE(strs) {} diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 46ac5a9149d7..953bf9f12e88 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -28,6 +28,8 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = void SetCUDADevice(int gpu_device_id, const char* file, int line); +int GetCUDADevice(const char* file, int line); + template void AllocateCUDAMemory(T** out_ptr, size_t size, const char* file, const int line) { void* tmp_ptr = nullptr; diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 825c5c6ebcf8..e7baa42dc2e6 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -114,6 +114,8 @@ class Metadata { void SetQuery(const data_size_t* query, data_size_t len); + void SetPosition(const data_size_t* position, data_size_t len); + /*! * \brief Set initial scores * \param init_score Initial scores, this class will manage memory for init_score. @@ -213,6 +215,38 @@ class Metadata { } } + /*! 
+ * \brief Get positions, if does not exist then return nullptr + * \return Pointer of positions + */ + inline const data_size_t* positions() const { + if (!positions_.empty()) { + return positions_.data(); + } else { + return nullptr; + } + } + + /*! + * \brief Get position IDs, if does not exist then return nullptr + * \return Pointer of position IDs + */ + inline const std::string* position_ids() const { + if (!position_ids_.empty()) { + return position_ids_.data(); + } else { + return nullptr; + } + } + + /*! + * \brief Get Number of different position IDs + * \return number of different position IDs + */ + inline size_t num_position_ids() const { + return position_ids_.size(); + } + /*! * \brief Get data boundaries on queries, if not exists, will return nullptr * we assume data will order by query, @@ -289,6 +323,8 @@ class Metadata { private: /*! \brief Load wights from file */ void LoadWeights(); + /*! \brief Load positions from file */ + void LoadPositions(); /*! \brief Load query boundaries from file */ void LoadQueryBoundaries(); /*! \brief Calculate query weights from queries */ @@ -309,10 +345,16 @@ class Metadata { data_size_t num_data_; /*! \brief Number of weights, used to check correct weight file */ data_size_t num_weights_; + /*! \brief Number of positions, used to check correct position file */ + data_size_t num_positions_; /*! \brief Label data */ std::vector label_; /*! \brief Weights data */ std::vector weights_; + /*! \brief Positions data */ + std::vector positions_; + /*! \brief Position identifiers */ + std::vector position_ids_; /*! \brief Query boundaries */ std::vector query_boundaries_; /*! \brief Query weights */ @@ -328,6 +370,7 @@ class Metadata { /*! 
\brief mutex for threading safe call */ std::mutex mutex_; bool weight_load_from_file_; + bool position_load_from_file_; bool query_load_from_file_; bool init_score_load_from_file_; #ifdef USE_CUDA diff --git a/python-package/README.rst b/python-package/README.rst index c83307916163..bf9874e1227c 100644 --- a/python-package/README.rst +++ b/python-package/README.rst @@ -256,7 +256,14 @@ If you get any errors during installation or due to any other reasons, you may w Build Wheel File **************** -You can use ``sh ./build-python.sh install bdist_wheel`` instead of ``sh ./build-python.sh install`` to build wheel file and use it for installation later. This might be useful for systems with restricted or completely without network access. +You can use ``sh ./build-python.sh install bdist_wheel`` to build a wheel file but not install it. + +That script requires some dependencies like ``build``, ``scikit-build-core``, and ``wheel``. +In environments with restricted or no internet access, install those tools and then pass ``--no-isolation``. + +.. 
code:: sh + + sh ./build-python.sh bdist_wheel --no-isolation Build With MSBuild ****************** diff --git a/python-package/lightgbm/__init__.py b/python-package/lightgbm/__init__.py index 5815bc602bde..0dc5b75cfdf2 100644 --- a/python-package/lightgbm/__init__.py +++ b/python-package/lightgbm/__init__.py @@ -6,7 +6,7 @@ from pathlib import Path from .basic import Booster, Dataset, Sequence, register_logger -from .callback import early_stopping, log_evaluation, record_evaluation, reset_parameter +from .callback import EarlyStopException, early_stopping, log_evaluation, record_evaluation, reset_parameter from .engine import CVBooster, cv, train try: @@ -32,5 +32,5 @@ 'train', 'cv', 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker', 'DaskLGBMRegressor', 'DaskLGBMClassifier', 'DaskLGBMRanker', - 'log_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', + 'log_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', 'EarlyStopException', 'plot_importance', 'plot_split_value_histogram', 'plot_metric', 'plot_tree', 'create_tree_digraph'] diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index caa71fed47e5..3dfa583a62bb 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -24,6 +24,13 @@ if TYPE_CHECKING: from typing import Literal + # typing.TypeGuard was only introduced in Python 3.10 + try: + from typing import TypeGuard + except ImportError: + from typing_extensions import TypeGuard + + __all__ = [ 'Booster', 'Dataset', @@ -54,6 +61,7 @@ _LGBM_EvalFunctionResultType = Tuple[str, float, bool] _LGBM_BoosterBestScoreType = Dict[str, Dict[str, float]] _LGBM_BoosterEvalMethodResultType = Tuple[str, str, float, bool] +_LGBM_BoosterEvalMethodResultWithStandardDeviationType = Tuple[str, str, float, bool, float] _LGBM_CategoricalFeatureConfiguration = Union[List[str], List[int], "Literal['auto']"] _LGBM_FeatureNameConfiguration = Union[List[str], "Literal['auto']"] 
_LGBM_GroupType = Union[ @@ -62,6 +70,10 @@ np.ndarray, pd_Series ] +_LGBM_PositionType = Union[ + np.ndarray, + pd_Series +] _LGBM_InitScoreType = Union[ List[float], List[List[float]], @@ -126,7 +138,7 @@ class _MissingType(Enum): class _DummyLogger: def info(self, msg: str) -> None: - print(msg) + print(msg) # noqa: T201 def warning(self, msg: str) -> None: warnings.warn(msg, stacklevel=3) @@ -274,6 +286,20 @@ def _is_1d_list(data: Any) -> bool: return isinstance(data, list) and (not data or _is_numeric(data[0])) +def _is_list_of_numpy_arrays(data: Any) -> "TypeGuard[List[np.ndarray]]": + return ( + isinstance(data, list) + and all(isinstance(x, np.ndarray) for x in data) + ) + + +def _is_list_of_sequences(data: Any) -> "TypeGuard[List[Sequence]]": + return ( + isinstance(data, list) + and all(isinstance(x, Sequence) for x in data) + ) + + def _is_1d_collection(data: Any) -> bool: """Check whether data is a 1-D collection.""" return ( @@ -453,7 +479,7 @@ def _get_all_param_aliases() -> Dict[str, List[str]]: buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_DumpParamAliases( ctypes.c_int64(buffer_len), ctypes.byref(tmp_out_len), @@ -462,16 +488,15 @@ def _get_all_param_aliases() -> Dict[str, List[str]]: # if buffer length is not long enough, re-allocate a buffer if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_DumpParamAliases( ctypes.c_int64(actual_len), ctypes.byref(tmp_out_len), ptr_string_buffer)) - aliases = json.loads( + return json.loads( string_buffer.value.decode('utf-8'), object_hook=lambda obj: {k: [k] + v for k, v in obj.items()} ) - 
return aliases @classmethod def get(cls, *args) -> Set[str]: @@ -578,7 +603,8 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va "label": _C_API_DTYPE_FLOAT32, "weight": _C_API_DTYPE_FLOAT32, "init_score": _C_API_DTYPE_FLOAT64, - "group": _C_API_DTYPE_INT32 + "group": _C_API_DTYPE_INT32, + "position": _C_API_DTYPE_INT32 } """String name to int feature importance type mapper""" @@ -664,57 +690,52 @@ def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None: def _data_from_pandas( - data, - feature_name: Optional[_LGBM_FeatureNameConfiguration], - categorical_feature: Optional[_LGBM_CategoricalFeatureConfiguration], + data: pd_DataFrame, + feature_name: _LGBM_FeatureNameConfiguration, + categorical_feature: _LGBM_CategoricalFeatureConfiguration, pandas_categorical: Optional[List[List]] -): - if isinstance(data, pd_DataFrame): - if len(data.shape) != 2 or data.shape[0] < 1: - raise ValueError('Input data must be 2 dimensional and non empty.') - if feature_name == 'auto' or feature_name is None: - data = data.rename(columns=str, copy=False) - cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)] - cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered] - if pandas_categorical is None: # train dataset - pandas_categorical = [list(data[col].cat.categories) for col in cat_cols] - else: - if len(cat_cols) != len(pandas_categorical): - raise ValueError('train and valid dataset categorical_feature do not match.') - for col, category in zip(cat_cols, pandas_categorical): - if list(data[col].cat.categories) != list(category): - data[col] = data[col].cat.set_categories(category) - if len(cat_cols): # cat_cols is list - data = data.copy(deep=False) # not alter origin DataFrame - data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan}) - if categorical_feature is not None: - if feature_name is None: - feature_name = 
list(data.columns) - if categorical_feature == 'auto': # use cat cols from DataFrame - categorical_feature = cat_cols_not_ordered - else: # use cat cols specified by user - categorical_feature = list(categorical_feature) # type: ignore[assignment] - if feature_name == 'auto': - feature_name = list(data.columns) - _check_for_bad_pandas_dtypes(data.dtypes) - df_dtypes = [dtype.type for dtype in data.dtypes] - df_dtypes.append(np.float32) # so that the target dtype considers floats - target_dtype = np.find_common_type(df_dtypes, []) - try: - # most common case (no nullable dtypes) - data = data.to_numpy(dtype=target_dtype, copy=False) - except TypeError: - # 1.0 <= pd version < 1.1 and nullable dtypes, least common case - # raises error because array is casted to type(pd.NA) and there's no na_value argument - data = data.astype(target_dtype, copy=False).values - except ValueError: - # data has nullable dtypes, but we can specify na_value argument and copy will be made - data = data.to_numpy(dtype=target_dtype, na_value=np.nan) +) -> Tuple[np.ndarray, List[str], List[str], List[List]]: + if len(data.shape) != 2 or data.shape[0] < 1: + raise ValueError('Input data must be 2 dimensional and non empty.') + + # determine feature names + if feature_name == 'auto': + feature_name = [str(col) for col in data.columns] + + # determine categorical features + cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)] + cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered] + if pandas_categorical is None: # train dataset + pandas_categorical = [list(data[col].cat.categories) for col in cat_cols] else: - if feature_name == 'auto': - feature_name = None - if categorical_feature == 'auto': - categorical_feature = None + if len(cat_cols) != len(pandas_categorical): + raise ValueError('train and valid dataset categorical_feature do not match.') + for col, category in zip(cat_cols, pandas_categorical): + if 
list(data[col].cat.categories) != list(category): + data[col] = data[col].cat.set_categories(category) + if len(cat_cols): # cat_cols is list + data = data.copy(deep=False) # not alter origin DataFrame + data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan}) + if categorical_feature == 'auto': # use cat cols from DataFrame + categorical_feature = cat_cols_not_ordered + else: # use cat cols specified by user + categorical_feature = list(categorical_feature) # type: ignore[assignment] + + # get numpy representation of the data + _check_for_bad_pandas_dtypes(data.dtypes) + df_dtypes = [dtype.type for dtype in data.dtypes] + df_dtypes.append(np.float32) # so that the target dtype considers floats + target_dtype = np.result_type(*df_dtypes) + try: + # most common case (no nullable dtypes) + data = data.to_numpy(dtype=target_dtype, copy=False) + except TypeError: + # 1.0 <= pd version < 1.1 and nullable dtypes, least common case + # raises error because array is casted to type(pd.NA) and there's no na_value argument + data = data.astype(target_dtype, copy=False).values + except ValueError: + # data has nullable dtypes, but we can specify na_value argument and copy will be made + data = data.to_numpy(dtype=target_dtype, na_value=np.nan) return data, feature_name, categorical_feature, pandas_categorical @@ -1000,7 +1021,15 @@ def predict( ctypes.c_int(len(data_names)), ) ) - data = _data_from_pandas(data, None, None, self.pandas_categorical)[0] + + if isinstance(data, pd_DataFrame): + data = _data_from_pandas( + data=data, + feature_name="auto", + categorical_feature="auto", + pandas_categorical=self.pandas_categorical + )[0] + predict_type = _C_API_PREDICT_NORMAL if raw_score: predict_type = _C_API_PREDICT_RAW_SCORE @@ -1526,7 +1555,8 @@ def __init__( feature_name: _LGBM_FeatureNameConfiguration = 'auto', categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', params: Optional[Dict[str, Any]] = None, - free_raw_data: bool = True + 
free_raw_data: bool = True, + position: Optional[_LGBM_PositionType] = None, ): """Initialize Dataset. @@ -1566,6 +1596,8 @@ def __init__( Other parameters for Dataset. free_raw_data : bool, optional (default=True) If True, raw data is freed after constructing inner Dataset. + position : numpy 1-D array, pandas Series or None, optional (default=None) + Position of items used in unbiased learning-to-rank task. """ self._handle: Optional[_DatasetHandle] = None self.data = data @@ -1573,6 +1605,7 @@ def __init__( self.reference = reference self.weight = weight self.group = group + self.position = position self.init_score = init_score self.feature_name: _LGBM_FeatureNameConfiguration = feature_name self.categorical_feature: _LGBM_CategoricalFeatureConfiguration = categorical_feature @@ -1581,7 +1614,7 @@ def __init__( self.used_indices: Optional[List[int]] = None self._need_slice = True self._predictor: Optional[_InnerPredictor] = None - self.pandas_categorical = None + self.pandas_categorical: Optional[List[List]] = None self._params_back_up = None self.version = 0 self._start_row = 0 # Used when pushing rows one by one. 
@@ -1837,7 +1870,8 @@ def _lazy_init( predictor: Optional[_InnerPredictor], feature_name: _LGBM_FeatureNameConfiguration, categorical_feature: _LGBM_CategoricalFeatureConfiguration, - params: Optional[Dict[str, Any]] + params: Optional[Dict[str, Any]], + position: Optional[_LGBM_PositionType] ) -> "Dataset": if data is None: self._handle = None @@ -1845,10 +1879,13 @@ def _lazy_init( if reference is not None: self.pandas_categorical = reference.pandas_categorical categorical_feature = reference.categorical_feature - data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data=data, - feature_name=feature_name, - categorical_feature=categorical_feature, - pandas_categorical=self.pandas_categorical) + if isinstance(data, pd_DataFrame): + data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas( + data=data, + feature_name=feature_name, + categorical_feature=categorical_feature, + pandas_categorical=self.pandas_categorical + ) # process for args params = {} if params is None else params @@ -1858,10 +1895,10 @@ def _lazy_init( _log_warning(f'{key} keyword has been found in `params` and will be ignored.\n' f'Please use {key} argument of the Dataset constructor to pass this parameter.') # get categorical features - if categorical_feature is not None: + if isinstance(categorical_feature, list): categorical_indices = set() feature_dict = {} - if feature_name is not None: + if isinstance(feature_name, list): feature_dict = {name: i for i, name in enumerate(feature_name)} for name in categorical_feature: if isinstance(name, str) and name in feature_dict: @@ -1902,9 +1939,9 @@ def _lazy_init( elif isinstance(data, np.ndarray): self.__init_from_np2d(data, params_str, ref_dataset) elif isinstance(data, list) and len(data) > 0: - if all(isinstance(x, np.ndarray) for x in data): + if _is_list_of_numpy_arrays(data): self.__init_from_list_np2d(data, params_str, ref_dataset) - elif all(isinstance(x, Sequence) for x in data): + 
elif _is_list_of_sequences(data): self.__init_from_seqs(data, ref_dataset) else: raise TypeError('Data list can only be of ndarray or Sequence') @@ -1926,6 +1963,8 @@ def _lazy_init( self.set_weight(weight) if group is not None: self.set_group(group) + if position is not None: + self.set_position(position) if isinstance(predictor, _InnerPredictor): if self._predictor is None and init_score is not None: _log_warning("The init_score will be overridden by the prediction of init_model.") @@ -2220,7 +2259,7 @@ def construct(self) -> "Dataset": if self.used_indices is None: # create valid self._lazy_init(data=self.data, label=self.label, reference=self.reference, - weight=self.weight, group=self.group, + weight=self.weight, group=self.group, position=self.position, init_score=self.init_score, predictor=self._predictor, feature_name=self.feature_name, categorical_feature='auto', params=self.params) else: @@ -2243,6 +2282,8 @@ def construct(self) -> "Dataset": self.get_data() if self.group is not None: self.set_group(self.group) + if self.position is not None: + self.set_position(self.position) if self.get_label() is None: raise ValueError("Label should not be None.") if isinstance(self._predictor, _InnerPredictor) and self._predictor is not self.reference._predictor: @@ -2257,7 +2298,8 @@ def construct(self) -> "Dataset": self._lazy_init(data=self.data, label=self.label, reference=None, weight=self.weight, group=self.group, init_score=self.init_score, predictor=self._predictor, - feature_name=self.feature_name, categorical_feature=self.categorical_feature, params=self.params) + feature_name=self.feature_name, categorical_feature=self.categorical_feature, + params=self.params, position=self.position) if self.free_raw_data: self.data = None self.feature_name = self.get_feature_name() @@ -2270,7 +2312,8 @@ def create_valid( weight: Optional[_LGBM_WeightType] = None, group: Optional[_LGBM_GroupType] = None, init_score: Optional[_LGBM_InitScoreType] = None, - params: 
Optional[Dict[str, Any]] = None + params: Optional[Dict[str, Any]] = None, + position: Optional[_LGBM_PositionType] = None ) -> "Dataset": """Create validation data align with current Dataset. @@ -2293,6 +2336,8 @@ def create_valid( Init score for Dataset. params : dict or None, optional (default=None) Other parameters for validation Dataset. + position : numpy 1-D array, pandas Series or None, optional (default=None) + Position of items used in unbiased learning-to-rank task. Returns ------- @@ -2300,7 +2345,7 @@ def create_valid( Validation Dataset with reference to self. """ ret = Dataset(data, label=label, reference=self, - weight=weight, group=group, init_score=init_score, + weight=weight, group=group, position=position, init_score=init_score, params=params, free_raw_data=self.free_raw_data) ret._predictor = self._predictor ret.pandas_categorical = self.pandas_categorical @@ -2435,7 +2480,7 @@ def set_field( 'In multiclass classification init_score can also be a list of lists, numpy 2-D array or pandas DataFrame.' ) else: - dtype = np.int32 if field_name == 'group' else np.float32 + dtype = np.int32 if (field_name == 'group' or field_name == 'position') else np.float32 data = _list_to_1d_numpy(data, dtype=dtype, name=field_name) ptr_data: Union[_ctypes_float_ptr, _ctypes_int_ptr] @@ -2728,6 +2773,28 @@ def set_group( self.set_field('group', group) return self + def set_position( + self, + position: Optional[_LGBM_PositionType] + ) -> "Dataset": + """Set position of Dataset (used for ranking). + + Parameters + ---------- + position : numpy 1-D array, pandas Series or None, optional (default=None) + Position of items used in unbiased learning-to-rank task. + + Returns + ------- + self : Dataset + Dataset with set position. 
+ """ + self.position = position + if self._handle is not None and position is not None: + position = _list_to_1d_numpy(position, dtype=np.int32, name='position') + self.set_field('position', position) + return self + def get_feature_name(self) -> List[str]: """Get the names of columns (features) in the Dataset. @@ -2824,7 +2891,7 @@ def get_data(self) -> Optional[_LGBM_TrainDataType]: self.data = self.data[self.used_indices, :] elif isinstance(self.data, Sequence): self.data = self.data[self.used_indices] - elif isinstance(self.data, list) and len(self.data) > 0 and all(isinstance(x, Sequence) for x in self.data): + elif _is_list_of_sequences(self.data) and len(self.data) > 0: self.data = np.array(list(self._yield_row_from_seqlist(self.data, self.used_indices))) else: _log_warning(f"Cannot subset {type(self.data).__name__} type of raw data.\n" @@ -2854,6 +2921,18 @@ def get_group(self) -> Optional[np.ndarray]: self.group = np.diff(self.group) return self.group + def get_position(self) -> Optional[np.ndarray]: + """Get the position of the Dataset. + + Returns + ------- + position : numpy 1-D array or None + Position of items used in unbiased learning-to-rank task. + """ + if self.position is None: + self.position = self.get_field('position') + return self.position + def num_data(self) -> int: """Get the number of rows in the Dataset. 
@@ -3209,8 +3288,7 @@ def __copy__(self) -> "Booster": def __deepcopy__(self, _) -> "Booster": model_str = self.model_to_string(num_iteration=-1) - booster = Booster(model_str=model_str) - return booster + return Booster(model_str=model_str) def __getstate__(self) -> Dict[str, Any]: this = self.__dict__.copy() @@ -3237,7 +3315,7 @@ def _get_loaded_param(self) -> Dict[str, Any]: buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterGetLoadedParam( self._handle, ctypes.c_int64(buffer_len), @@ -3247,7 +3325,7 @@ def _get_loaded_param(self) -> Dict[str, Any]: # if buffer length is not long enough, re-allocate a buffer if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterGetLoadedParam( self._handle, ctypes.c_int64(actual_len), @@ -4000,7 +4078,7 @@ def model_to_string( buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterSaveModelToString( self._handle, ctypes.c_int(start_iteration), @@ -4013,7 +4091,7 @@ def model_to_string( # if buffer length is not long enough, re-allocate a buffer if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterSaveModelToString( self._handle, ctypes.c_int(start_iteration), @@ -4068,7 +4146,7 @@ 
def dump_model( buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterDumpModel( self._handle, ctypes.c_int(start_iteration), @@ -4081,7 +4159,7 @@ def dump_model( # if buffer length is not long enough, reallocate a buffer if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterDumpModel( self._handle, ctypes.c_int(start_iteration), diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py index 77856f5bdab6..2f77ee740c75 100644 --- a/python-package/lightgbm/callback.py +++ b/python-package/lightgbm/callback.py @@ -1,12 +1,18 @@ # coding: utf-8 """Callbacks library.""" -import collections +from collections import OrderedDict +from dataclasses import dataclass from functools import partial -from typing import Any, Callable, Dict, List, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union -from .basic import _ConfigAliases, _LGBM_BoosterEvalMethodResultType, _log_info, _log_warning +from .basic import (Booster, _ConfigAliases, _LGBM_BoosterEvalMethodResultType, + _LGBM_BoosterEvalMethodResultWithStandardDeviationType, _log_info, _log_warning) + +if TYPE_CHECKING: + from .engine import CVBooster __all__ = [ + 'EarlyStopException', 'early_stopping', 'log_evaluation', 'record_evaluation', @@ -16,16 +22,20 @@ _EvalResultDict = Dict[str, Dict[str, List[Any]]] _EvalResultTuple = Union[ _LGBM_BoosterEvalMethodResultType, - Tuple[str, str, float, bool, float] + _LGBM_BoosterEvalMethodResultWithStandardDeviationType ] _ListOfEvalResultTuples = Union[ 
List[_LGBM_BoosterEvalMethodResultType], - List[Tuple[str, str, float, bool, float]] + List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType] ] class EarlyStopException(Exception): - """Exception of early stopping.""" + """Exception of early stopping. + + Raise this from a callback passed in via keyword argument ``callbacks`` + in ``cv()`` or ``train()`` to trigger early stopping. + """ def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) -> None: """Create early stopping exception. @@ -34,6 +44,7 @@ def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) -> ---------- best_iteration : int The best iteration stopped. + 0-based... pass ``best_iteration=2`` to indicate that the third iteration was the best one. best_score : list of (eval_name, metric_name, eval_result, is_higher_better) tuple or (eval_name, metric_name, eval_result, is_higher_better, stdv) tuple Scores for each metric, on each validation set, as of the best iteration. """ @@ -43,14 +54,14 @@ def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) -> # Callback environment used by callbacks -CallbackEnv = collections.namedtuple( - "CallbackEnv", - ["model", - "params", - "iteration", - "begin_iteration", - "end_iteration", - "evaluation_result_list"]) +@dataclass +class CallbackEnv: + model: Union[Booster, "CVBooster"] + params: Dict[str, Any] + iteration: int + begin_iteration: int + end_iteration: int + evaluation_result_list: Optional[_ListOfEvalResultTuples] def _format_eval_result(value: _EvalResultTuple, show_stdv: bool) -> str: @@ -120,13 +131,18 @@ def __init__(self, eval_result: _EvalResultDict) -> None: self.eval_result = eval_result def _init(self, env: CallbackEnv) -> None: + if env.evaluation_result_list is None: + raise RuntimeError( + "record_evaluation() callback enabled but no evaluation results found. This is probably a bug in LightGBM. 
" + "Please report it at https://github.com/microsoft/LightGBM/issues" + ) self.eval_result.clear() for item in env.evaluation_result_list: if len(item) == 4: # regular train data_name, eval_name = item[:2] else: # cv data_name, eval_name = item[1].split() - self.eval_result.setdefault(data_name, collections.OrderedDict()) + self.eval_result.setdefault(data_name, OrderedDict()) if len(item) == 4: self.eval_result[data_name].setdefault(eval_name, []) else: @@ -136,6 +152,11 @@ def _init(self, env: CallbackEnv) -> None: def __call__(self, env: CallbackEnv) -> None: if env.iteration == env.begin_iteration: self._init(env) + if env.evaluation_result_list is None: + raise RuntimeError( + "record_evaluation() callback enabled but no evaluation results found. This is probably a bug in LightGBM. " + "Please report it at https://github.com/microsoft/LightGBM/issues" + ) for item in env.evaluation_result_list: if len(item) == 4: data_name, eval_name, result = item[:3] @@ -143,7 +164,7 @@ def __call__(self, env: CallbackEnv) -> None: else: data_name, eval_name = item[1].split() res_mean = item[2] - res_stdv = item[4] + res_stdv = item[4] # type: ignore[misc] self.eval_result[data_name][f'{eval_name}-mean'].append(res_mean) self.eval_result[data_name][f'{eval_name}-stdv'].append(res_stdv) @@ -274,6 +295,10 @@ def _is_train_set(self, ds_name: str, eval_name: str, train_name: str) -> bool: return (ds_name == "cv_agg" and eval_name == "train") or ds_name == train_name def _init(self, env: CallbackEnv) -> None: + if env.evaluation_result_list is None or env.evaluation_result_list == []: + raise ValueError( + "For early stopping, at least one dataset and eval metric is required for evaluation" + ) is_dart = any(env.params.get(alias, "") == 'dart' for alias in _ConfigAliases.get("boosting")) only_train_set = ( len(env.evaluation_result_list) == 1 @@ -289,9 +314,6 @@ def _init(self, env: CallbackEnv) -> None: elif only_train_set: _log_warning('Only training set found, disabling early 
stopping.') return - if not env.evaluation_result_list: - raise ValueError('For early stopping, ' - 'at least one dataset and eval metric is required for evaluation') if self.stopping_rounds <= 0: raise ValueError("stopping_rounds should be greater than zero.") @@ -353,6 +375,11 @@ def __call__(self, env: CallbackEnv) -> None: self._init(env) if not self.enabled: return + if env.evaluation_result_list is None: + raise RuntimeError( + "early_stopping() callback enabled but no evaluation results found. This is probably a bug in LightGBM. " + "Please report it at https://github.com/microsoft/LightGBM/issues" + ) # self.best_score_list is initialized to an empty list first_time_updating_best_score_list = (self.best_score_list == []) for i in range(len(env.evaluation_result_list)): diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 2d640d741629..822aa3b35017 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -1,8 +1,8 @@ # coding: utf-8 """Library with training routines of LightGBM.""" -import collections import copy import json +from collections import OrderedDict, defaultdict from operator import attrgetter from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union @@ -11,9 +11,9 @@ from .
import callback from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _InnerPredictor, - _LGBM_BoosterEvalMethodResultType, _LGBM_CategoricalFeatureConfiguration, - _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration, - _log_warning) + _LGBM_BoosterEvalMethodResultType, _LGBM_BoosterEvalMethodResultWithStandardDeviationType, + _LGBM_CategoricalFeatureConfiguration, _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType, + _LGBM_FeatureNameConfiguration, _log_warning) from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold __all__ = [ @@ -293,7 +293,7 @@ def train( booster.best_iteration = earlyStopException.best_iteration + 1 evaluation_result_list = earlyStopException.best_score break - booster.best_score = collections.defaultdict(collections.OrderedDict) + booster.best_score = defaultdict(OrderedDict) for dataset_name, eval_name, score, _ in evaluation_result_list: booster.best_score[dataset_name][eval_name] = score if not keep_training_booster: @@ -339,16 +339,12 @@ def __init__( with open(model_file, "r") as file: self._from_dict(json.load(file)) - def _append(self, booster: Booster) -> None: - """Add a booster to CVBooster.""" - self.boosters.append(booster) - def _from_dict(self, models: Dict[str, Any]) -> None: """Load CVBooster from dict.""" self.best_iteration = models["best_iteration"] self.boosters = [] for model_str in models["boosters"]: - self._append(Booster(model_str=model_str)) + self.boosters.append(Booster(model_str=model_str)) def _to_dict(self, num_iteration: Optional[int], start_iteration: int, importance_type: str) -> Dict[str, Any]: """Serialize CVBooster to dict.""" @@ -514,19 +510,19 @@ def _make_n_folds( train_set, valid_set, tparam = fpreproc(train_set, valid_set, params.copy()) else: tparam = params - cvbooster = Booster(tparam, train_set) + booster_for_fold = Booster(tparam, train_set) if eval_train_metric: - 
cvbooster.add_valid(train_set, 'train') - cvbooster.add_valid(valid_set, 'valid') - ret._append(cvbooster) + booster_for_fold.add_valid(train_set, 'train') + booster_for_fold.add_valid(valid_set, 'valid') + ret.boosters.append(booster_for_fold) return ret def _agg_cv_result( - raw_results: List[List[Tuple[str, str, float, bool]]] -) -> List[Tuple[str, str, float, bool, float]]: + raw_results: List[List[_LGBM_BoosterEvalMethodResultType]] +) -> List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType]: """Aggregate cross-validation results.""" - cvmap: Dict[str, List[float]] = collections.OrderedDict() + cvmap: Dict[str, List[float]] = OrderedDict() metric_type: Dict[str, bool] = {} for one_result in raw_results: for one_line in one_result: @@ -534,7 +530,7 @@ def _agg_cv_result( metric_type[key] = one_line[3] cvmap.setdefault(key, []) cvmap[key].append(one_line[2]) - return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()] + return [('cv_agg', k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()] def cv( @@ -655,13 +651,18 @@ def cv( Returns ------- - eval_hist : dict - Evaluation history. + eval_results : dict + History of evaluation results of each metric. The dictionary has the following format: - {'metric1-mean': [values], 'metric1-stdv': [values], - 'metric2-mean': [values], 'metric2-stdv': [values], + {'valid metric1-mean': [values], 'valid metric1-stdv': [values], + 'valid metric2-mean': [values], 'valid metric2-stdv': [values], ...}. If ``return_cvbooster=True``, also returns trained boosters wrapped in a ``CVBooster`` object via ``cvbooster`` key. + If ``eval_train_metric=True``, also returns the train metric history. + In this case, the dictionary has the following format: + {'train metric1-mean': [values], 'valid metric1-mean': [values], + 'train metric2-mean': [values], 'valid metric2-mean': [values], + ...}. 
""" if not isinstance(train_set, Dataset): raise TypeError(f"cv() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.") @@ -717,7 +718,7 @@ def cv( .set_feature_name(feature_name) \ .set_categorical_feature(categorical_feature) - results = collections.defaultdict(list) + results = defaultdict(list) cvfolds = _make_n_folds(full_data=train_set, folds=folds, nfold=nfold, params=params, seed=seed, fpreproc=fpreproc, stratified=stratified, shuffle=shuffle, diff --git a/python-package/lightgbm/plotting.py b/python-package/lightgbm/plotting.py index 0f9bcd5f8ccb..85b245c187ef 100644 --- a/python-package/lightgbm/plotting.py +++ b/python-package/lightgbm/plotting.py @@ -693,11 +693,7 @@ def create_tree_digraph( model = booster.dump_model() tree_infos = model['tree_info'] - if 'feature_names' in model: - feature_names = model['feature_names'] - else: - feature_names = None - + feature_names = model.get('feature_names', None) monotone_constraints = model.get('monotone_constraints', None) if tree_index < len(tree_infos): @@ -716,13 +712,13 @@ def create_tree_digraph( if isinstance(example_case, pd_DataFrame): example_case = _data_from_pandas( data=example_case, - feature_name=None, - categorical_feature=None, + feature_name="auto", + categorical_feature="auto", pandas_categorical=booster.pandas_categorical )[0] example_case = example_case[0] - graph = _to_graphviz( + return _to_graphviz( tree_info=tree_info, show_info=show_info, feature_names=feature_names, @@ -734,8 +730,6 @@ def create_tree_digraph( **kwargs ) - return graph - def plot_tree( booster: Union[Booster, LGBMModel], diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 7e909342c01f..c71c233df908 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -1103,6 +1103,8 @@ def fit( # type: ignore[override] self._classes = self._le.classes_ self._n_classes = len(self._classes) # type: ignore[arg-type] + if self.objective 
is None: + self._objective = None # adjust eval metrics to match whether binary or multiclass # classification is being performed diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index d3ff28286bb9..6e43dc242d1b 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -18,6 +18,7 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence" ] dependencies = [ + "dataclasses ; python_version < '3.7'", "numpy", "scipy" ] @@ -29,7 +30,7 @@ maintainers = [ name = "lightgbm" readme = "README.rst" requires-python = ">=3.6" -version = "4.0.0.99" +version = "4.1.0.99" [project.optional-dependencies] dask = [ @@ -111,7 +112,13 @@ select = [ # pycodestyle "E", # pyflakes - "F" + "F", + # flake8-return: unnecessary assignment before return + "RET504", + # flake8-simplify: use dict.get() instead of an if-else block + "SIM401", + # flake8-print + "T", ] # this should be set to the oldest version of python LightGBM supports @@ -120,13 +127,17 @@ target-version = "py37" [tool.ruff.per-file-ignores] "examples/*" = [ # pydocstyle - "D" + "D", + # flake8-print + "T" ] "tests/*" = [ # (flake8-bugbear) Found useless expression "B018", # pydocstyle - "D" + "D", + # flake8-print + "T" ] [tool.ruff.pydocstyle] diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index 9a87e982483e..88ece154e432 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -115,6 +115,12 @@ class RF : public GBDT { const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); const std::vector>& bag_data_indices = data_sample_strategy_->bag_data_indices(); + // GOSSStrategy->Bagging may modify value of bag_data_cnt_ + if (is_use_subset && bag_data_cnt < num_data_) { + tmp_grad_.resize(num_data_); + tmp_hess_.resize(num_data_); + } + CHECK_EQ(gradients, nullptr); CHECK_EQ(hessians, nullptr); diff --git a/src/c_api.cpp b/src/c_api.cpp index 442247d7a9dd..8c4eee96b4c9 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -150,7 
+150,7 @@ class Booster { objective_fun_.reset(ObjectiveFunction::CreateObjectiveFunction(config_.objective, config_)); if (objective_fun_ == nullptr) { - Log::Warning("Using self-defined objective function"); + Log::Info("Using self-defined objective function"); } // initialize the objective function if (objective_fun_ != nullptr) { @@ -320,7 +320,7 @@ class Booster { objective_fun_.reset(ObjectiveFunction::CreateObjectiveFunction(config_.objective, config_)); if (objective_fun_ == nullptr) { - Log::Warning("Using self-defined objective function"); + Log::Info("Using self-defined objective function"); } // initialize the objective function if (objective_fun_ != nullptr) { diff --git a/src/cuda/cuda_utils.cpp b/src/cuda/cuda_utils.cpp index fd4abcf25e79..a7d0df697e24 100644 --- a/src/cuda/cuda_utils.cpp +++ b/src/cuda/cuda_utils.cpp @@ -26,6 +26,12 @@ void SetCUDADevice(int gpu_device_id, const char* file, int line) { } } +int GetCUDADevice(const char* file, int line) { + int cur_gpu_device_id = 0; + CUDASUCCESS_OR_FATAL_OUTER(cudaGetDevice(&cur_gpu_device_id)); + return cur_gpu_device_id; +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 0906ba4b6439..394614af3f33 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -304,6 +304,7 @@ const std::unordered_set& Config::parameter_set() { "lambdarank_truncation_level", "lambdarank_norm", "label_gain", + "lambdarank_position_bias_regularization", "metric", "metric_freq", "is_provide_training_metric", @@ -619,6 +620,9 @@ void Config::GetMembersFromString(const std::unordered_map(tmp_str, ','); } + GetDouble(params, "lambdarank_position_bias_regularization", &lambdarank_position_bias_regularization); + CHECK_GE(lambdarank_position_bias_regularization, 0.0); + GetInt(params, "metric_freq", &metric_freq); CHECK_GT(metric_freq, 0); @@ -660,12 +664,14 @@ void Config::GetMembersFromString(const std::unordered_map>& Config::paramet 
{"lambdarank_truncation_level", {}}, {"lambdarank_norm", {}}, {"label_gain", {}}, + {"lambdarank_position_bias_regularization", {}}, {"metric", {"metrics", "metric_types"}}, {"metric_freq", {"output_freq"}}, {"is_provide_training_metric", {"training_metric", "is_training_metric", "train_metric"}}, @@ -1035,6 +1047,7 @@ const std::unordered_map& Config::ParameterTypes() { {"lambdarank_truncation_level", "int"}, {"lambdarank_norm", "bool"}, {"label_gain", "vector"}, + {"lambdarank_position_bias_regularization", "double"}, {"metric", "vector"}, {"metric_freq", "int"}, {"is_provide_training_metric", "bool"}, diff --git a/src/io/cuda/cuda_column_data.cpp b/src/io/cuda/cuda_column_data.cpp index a1080cb2b902..eb0938c01225 100644 --- a/src/io/cuda/cuda_column_data.cpp +++ b/src/io/cuda/cuda_column_data.cpp @@ -12,11 +12,8 @@ namespace LightGBM { CUDAColumnData::CUDAColumnData(const data_size_t num_data, const int gpu_device_id) { num_threads_ = OMP_NUM_THREADS(); num_data_ = num_data; - if (gpu_device_id >= 0) { - SetCUDADevice(gpu_device_id, __FILE__, __LINE__); - } else { - SetCUDADevice(0, __FILE__, __LINE__); - } + gpu_device_id_ = gpu_device_id >= 0 ? 
gpu_device_id : 0; + SetCUDADevice(gpu_device_id_, __FILE__, __LINE__); cuda_used_indices_ = nullptr; cuda_data_by_column_ = nullptr; cuda_column_bit_type_ = nullptr; @@ -117,37 +114,41 @@ void CUDAColumnData::Init(const int num_columns, feature_mfb_is_na_ = feature_mfb_is_na; data_by_column_.resize(num_columns_, nullptr); OMP_INIT_EX(); - #pragma omp parallel for schedule(static) num_threads(num_threads_) - for (int column_index = 0; column_index < num_columns_; ++column_index) { - OMP_LOOP_EX_BEGIN(); - const int8_t bit_type = column_bit_type[column_index]; - if (column_data[column_index] != nullptr) { - // is dense column - if (bit_type == 4) { - column_bit_type_[column_index] = 8; - InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); - } else if (bit_type == 8) { - InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); - } else if (bit_type == 16) { - InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); - } else if (bit_type == 32) { - InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); - } else { - Log::Fatal("Unknow column bit type %d", bit_type); - } - } else { - // is sparse column - if (bit_type == 8) { - InitOneColumnData(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]); - } else if (bit_type == 16) { - InitOneColumnData(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]); - } else if (bit_type == 32) { - InitOneColumnData(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]); + #pragma omp parallel num_threads(num_threads_) + { + SetCUDADevice(gpu_device_id_, __FILE__, __LINE__); + #pragma omp for schedule(static) + for (int column_index = 0; column_index < num_columns_; ++column_index) { + OMP_LOOP_EX_BEGIN(); + const int8_t bit_type = column_bit_type[column_index]; + if (column_data[column_index] != nullptr) { + // is dense column + if (bit_type 
== 4) { + column_bit_type_[column_index] = 8; + InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); + } else if (bit_type == 8) { + InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); + } else if (bit_type == 16) { + InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); + } else if (bit_type == 32) { + InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); + } else { + Log::Fatal("Unknow column bit type %d", bit_type); + } } else { - Log::Fatal("Unknow column bit type %d", bit_type); + // is sparse column + if (bit_type == 8) { + InitOneColumnData(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]); + } else if (bit_type == 16) { + InitOneColumnData(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]); + } else if (bit_type == 32) { + InitOneColumnData(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]); + } else { + Log::Fatal("Unknow column bit type %d", bit_type); + } } + OMP_LOOP_EX_END(); } - OMP_LOOP_EX_END(); } OMP_THROW_EX(); feature_to_column_ = feature_to_column; @@ -182,24 +183,28 @@ void CUDAColumnData::CopySubrow( AllocateCUDAMemory(&cuda_used_indices_, num_used_indices_size, __FILE__, __LINE__); data_by_column_.resize(num_columns_, nullptr); OMP_INIT_EX(); - #pragma omp parallel for schedule(static) num_threads(num_threads_) - for (int column_index = 0; column_index < num_columns_; ++column_index) { - OMP_LOOP_EX_BEGIN(); - const uint8_t bit_type = column_bit_type_[column_index]; - if (bit_type == 8) { - uint8_t* column_data = nullptr; - AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); - data_by_column_[column_index] = reinterpret_cast(column_data); - } else if (bit_type == 16) { - uint16_t* column_data = nullptr; - AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); - 
data_by_column_[column_index] = reinterpret_cast(column_data); - } else if (bit_type == 32) { - uint32_t* column_data = nullptr; - AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); - data_by_column_[column_index] = reinterpret_cast(column_data); + #pragma omp parallel num_threads(num_threads_) + { + SetCUDADevice(gpu_device_id_, __FILE__, __LINE__); + #pragma omp for schedule(static) + for (int column_index = 0; column_index < num_columns_; ++column_index) { + OMP_LOOP_EX_BEGIN(); + const uint8_t bit_type = column_bit_type_[column_index]; + if (bit_type == 8) { + uint8_t* column_data = nullptr; + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } else if (bit_type == 16) { + uint16_t* column_data = nullptr; + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } else if (bit_type == 32) { + uint32_t* column_data = nullptr; + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } + OMP_LOOP_EX_END(); } - OMP_LOOP_EX_END(); } OMP_THROW_EX(); InitCUDAMemoryFromHostMemory(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, __LINE__); @@ -221,27 +226,31 @@ void CUDAColumnData::ResizeWhenCopySubrow(const data_size_t num_used_indices) { DeallocateCUDAMemory(&cuda_used_indices_, __FILE__, __LINE__); AllocateCUDAMemory(&cuda_used_indices_, num_used_indices_size, __FILE__, __LINE__); OMP_INIT_EX(); - #pragma omp parallel for schedule(static) num_threads(num_threads_) - for (int column_index = 0; column_index < num_columns_; ++column_index) { - OMP_LOOP_EX_BEGIN(); - const uint8_t bit_type = column_bit_type_[column_index]; - if (bit_type == 8) { - uint8_t* column_data = reinterpret_cast(data_by_column_[column_index]); - 
DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); - AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); - data_by_column_[column_index] = reinterpret_cast(column_data); - } else if (bit_type == 16) { - uint16_t* column_data = reinterpret_cast(data_by_column_[column_index]); - DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); - AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); - data_by_column_[column_index] = reinterpret_cast(column_data); - } else if (bit_type == 32) { - uint32_t* column_data = reinterpret_cast(data_by_column_[column_index]); - DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); - AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); - data_by_column_[column_index] = reinterpret_cast(column_data); + #pragma omp parallel num_threads(num_threads_) + { + SetCUDADevice(gpu_device_id_, __FILE__, __LINE__); + #pragma omp for schedule(static) + for (int column_index = 0; column_index < num_columns_; ++column_index) { + OMP_LOOP_EX_BEGIN(); + const uint8_t bit_type = column_bit_type_[column_index]; + if (bit_type == 8) { + uint8_t* column_data = reinterpret_cast(data_by_column_[column_index]); + DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } else if (bit_type == 16) { + uint16_t* column_data = reinterpret_cast(data_by_column_[column_index]); + DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } else if (bit_type == 32) { + uint32_t* column_data = reinterpret_cast(data_by_column_[column_index]); + DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = 
reinterpret_cast(column_data); + } + OMP_LOOP_EX_END(); } - OMP_LOOP_EX_END(); } OMP_THROW_EX(); DeallocateCUDAMemory(&cuda_data_by_column_, __FILE__, __LINE__); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 5b23f01ec3a0..cd692afb031a 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -699,7 +699,7 @@ TrainingShareStates* Dataset::GetShareStates( if (col_wise_time < row_wise_time) { auto overhead_cost = row_wise_init_time + row_wise_time + col_wise_time; - Log::Warning( + Log::Info( "Auto-choosing col-wise multi-threading, the overhead of testing was " "%f seconds.\n" "You can set `force_col_wise=true` to remove the overhead.", @@ -707,7 +707,7 @@ TrainingShareStates* Dataset::GetShareStates( return col_wise_state.release(); } else { auto overhead_cost = col_wise_init_time + row_wise_time + col_wise_time; - Log::Warning( + Log::Info( "Auto-choosing row-wise multi-threading, the overhead of testing was " "%f seconds.\n" "You can set `force_row_wise=true` to remove the overhead.\n" @@ -937,6 +937,8 @@ bool Dataset::SetIntField(const char* field_name, const int* field_data, name = Common::Trim(name); if (name == std::string("query") || name == std::string("group")) { metadata_.SetQuery(field_data, num_element); + } else if (name == std::string("position")) { + metadata_.SetPosition(field_data, num_element); } else { return false; } @@ -987,6 +989,9 @@ bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, if (name == std::string("query") || name == std::string("group")) { *out_ptr = metadata_.query_boundaries(); *out_len = metadata_.num_queries() + 1; + } else if (name == std::string("position")) { + *out_ptr = metadata_.positions(); + *out_len = num_data_; } else { return false; } @@ -1273,21 +1278,34 @@ void Dataset::ConstructHistogramsInner( auto ptr_ordered_grad = gradients; auto ptr_ordered_hess = hessians; if (num_used_dense_group > 0) { - if (USE_INDICES) { - if (USE_HESSIAN) { -#pragma omp parallel for schedule(static, 
512) if (num_data >= 1024) + if (USE_QUANT_GRAD) { + int16_t* ordered_gradients_and_hessians = reinterpret_cast(ordered_gradients); + const int16_t* gradients_and_hessians = reinterpret_cast(gradients); + if (USE_INDICES) { + #pragma omp parallel for schedule(static, 512) if (num_data >= 1024) for (data_size_t i = 0; i < num_data; ++i) { - ordered_gradients[i] = gradients[data_indices[i]]; - ordered_hessians[i] = hessians[data_indices[i]]; + ordered_gradients_and_hessians[i] = gradients_and_hessians[data_indices[i]]; } - ptr_ordered_grad = ordered_gradients; - ptr_ordered_hess = ordered_hessians; - } else { -#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) - for (data_size_t i = 0; i < num_data; ++i) { - ordered_gradients[i] = gradients[data_indices[i]]; + ptr_ordered_grad = reinterpret_cast(ordered_gradients); + ptr_ordered_hess = nullptr; + } + } else { + if (USE_INDICES) { + if (USE_HESSIAN) { + #pragma omp parallel for schedule(static, 512) if (num_data >= 1024) + for (data_size_t i = 0; i < num_data; ++i) { + ordered_gradients[i] = gradients[data_indices[i]]; + ordered_hessians[i] = hessians[data_indices[i]]; + } + ptr_ordered_grad = ordered_gradients; + ptr_ordered_hess = ordered_hessians; + } else { + #pragma omp parallel for schedule(static, 512) if (num_data >= 1024) + for (data_size_t i = 0; i < num_data; ++i) { + ordered_gradients[i] = gradients[data_indices[i]]; + } + ptr_ordered_grad = ordered_gradients; } - ptr_ordered_grad = ordered_gradients; } } OMP_INIT_EX(); diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 2a589fa24ef8..1fc47c46787f 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -15,7 +16,9 @@ Metadata::Metadata() { num_init_score_ = 0; num_data_ = 0; num_queries_ = 0; + num_positions_ = 0; weight_load_from_file_ = false; + position_load_from_file_ = false; query_load_from_file_ = false; init_score_load_from_file_ = false; #ifdef USE_CUDA 
@@ -28,6 +31,7 @@ void Metadata::Init(const char* data_filename) { // for lambdarank, it needs query data for partition data in distributed learning LoadQueryBoundaries(); LoadWeights(); + LoadPositions(); CalculateQueryWeights(); LoadInitialScore(data_filename_); } @@ -214,6 +218,13 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector 0 && num_positions_ != num_all_data) { + positions_.clear(); + num_positions_ = 0; + Log::Fatal("Positions size (%i) doesn't match data size (%i)", num_positions_, num_data_); + } + // get local positions + if (!positions_.empty()) { + auto old_positions = positions_; + num_positions_ = num_data_; + positions_ = std::vector(num_data_); + #pragma omp parallel for schedule(static, 512) + for (int i = 0; i < static_cast(used_data_indices.size()); ++i) { + positions_[i] = old_positions[used_data_indices[i]]; + } + old_positions.clear(); + } + } if (query_load_from_file_) { // check query boundries if (!query_boundaries_.empty() && query_boundaries_[num_queries_] != num_all_data) { @@ -489,6 +519,47 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) { #endif // USE_CUDA } +void Metadata::SetPosition(const data_size_t* positions, data_size_t len) { + std::lock_guard lock(mutex_); + // save to nullptr + if (positions == nullptr || len == 0) { + positions_.clear(); + num_positions_ = 0; + return; + } + #ifdef USE_CUDA + Log::Fatal("Positions in learning to rank is not supported in CUDA version yet."); + #endif // USE_CUDA + if (num_data_ != len) { + Log::Fatal("Positions size (%i) doesn't match data size (%i)", len, num_data_); + } + if (positions_.empty()) { + positions_.resize(num_data_); + } else { + Log::Warning("Overwritting positions in dataset."); + } + num_positions_ = num_data_; + + position_load_from_file_ = false; + + position_ids_.clear(); + std::unordered_map map_id2pos; + for (data_size_t i = 0; i < num_positions_; ++i) { + if (map_id2pos.count(positions[i]) == 0) { + int pos = 
static_cast(map_id2pos.size()); + map_id2pos[positions[i]] = pos; + position_ids_.push_back(std::to_string(positions[i])); + } + } + + Log::Debug("number of unique positions found = %ld", position_ids_.size()); + + #pragma omp parallel for schedule(static, 512) if (num_positions_ >= 1024) + for (data_size_t i = 0; i < num_positions_; ++i) { + positions_[i] = map_id2pos.at(positions[i]); + } +} + void Metadata::InsertQueries(const data_size_t* queries, data_size_t start_index, data_size_t len) { if (!queries) { Log::Fatal("Passed null queries"); @@ -528,6 +599,32 @@ void Metadata::LoadWeights() { weight_load_from_file_ = true; } +void Metadata::LoadPositions() { + num_positions_ = 0; + std::string position_filename(data_filename_); + // default position file name + position_filename.append(".position"); + TextReader reader(position_filename.c_str(), false); + reader.ReadAllLines(); + if (reader.Lines().empty()) { + return; + } + Log::Info("Loading positions from %s ...", position_filename.c_str()); + num_positions_ = static_cast(reader.Lines().size()); + positions_ = std::vector(num_positions_); + position_ids_ = std::vector(); + std::unordered_map map_id2pos; + for (data_size_t i = 0; i < num_positions_; ++i) { + std::string& line = reader.Lines()[i]; + if (map_id2pos.count(line) == 0) { + map_id2pos[line] = static_cast(position_ids_.size()); + position_ids_.push_back(line); + } + positions_[i] = map_id2pos.at(line); + } + position_load_from_file_ = true; +} + void Metadata::LoadInitialScore(const std::string& data_filename) { num_init_score_ = 0; std::string init_score_filename(data_filename); diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 653fc6e8609a..6bd5324812f8 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -25,7 +25,10 @@ namespace LightGBM { class RankingObjective : public ObjectiveFunction { public: explicit RankingObjective(const Config& config) - : 
seed_(config.objective_seed) {} + : seed_(config.objective_seed) { + learning_rate_ = config.learning_rate; + position_bias_regularization_ = config.lambdarank_position_bias_regularization; + } explicit RankingObjective(const std::vector&) : seed_(0) {} @@ -37,12 +40,20 @@ class RankingObjective : public ObjectiveFunction { label_ = metadata.label(); // get weights weights_ = metadata.weights(); + // get positions + positions_ = metadata.positions(); + // get position ids + position_ids_ = metadata.position_ids(); + // get number of different position ids + num_position_ids_ = static_cast(metadata.num_position_ids()); // get boundries query_boundaries_ = metadata.query_boundaries(); if (query_boundaries_ == nullptr) { Log::Fatal("Ranking tasks require query information"); } num_queries_ = metadata.num_queries(); + // initialize position bias vectors + pos_biases_.resize(num_position_ids_, 0.0); } void GetGradients(const double* score, score_t* gradients, @@ -51,7 +62,13 @@ class RankingObjective : public ObjectiveFunction { for (data_size_t i = 0; i < num_queries_; ++i) { const data_size_t start = query_boundaries_[i]; const data_size_t cnt = query_boundaries_[i + 1] - query_boundaries_[i]; - GetGradientsForOneQuery(i, cnt, label_ + start, score + start, + std::vector score_adjusted; + if (num_position_ids_ > 0) { + for (data_size_t j = 0; j < cnt; ++j) { + score_adjusted.push_back(score[start + j] + pos_biases_[positions_[start + j]]); + } + } + GetGradientsForOneQuery(i, cnt, label_ + start, num_position_ids_ > 0 ? 
score_adjusted.data() : score + start, gradients + start, hessians + start); if (weights_ != nullptr) { for (data_size_t j = 0; j < cnt; ++j) { @@ -62,6 +79,9 @@ class RankingObjective : public ObjectiveFunction { } } } + if (num_position_ids_ > 0) { + UpdatePositionBiasFactors(gradients, hessians); + } } virtual void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt, @@ -69,6 +89,8 @@ class RankingObjective : public ObjectiveFunction { const double* score, score_t* lambdas, score_t* hessians) const = 0; + virtual void UpdatePositionBiasFactors(const score_t* /*lambdas*/, const score_t* /*hessians*/) const {} + const char* GetName() const override = 0; std::string ToString() const override { @@ -88,8 +110,20 @@ class RankingObjective : public ObjectiveFunction { const label_t* label_; /*! \brief Pointer of weights */ const label_t* weights_; + /*! \brief Pointer of positions */ + const data_size_t* positions_; + /*! \brief Pointer of position IDs */ + const std::string* position_ids_; + /*! \brief Pointer of label */ + data_size_t num_position_ids_; /*! \brief Query boundaries */ const data_size_t* query_boundaries_; + /*! \brief Position bias factors */ + mutable std::vector pos_biases_; + /*! \brief Learning rate to update position bias factors */ + double learning_rate_; + /*! \brief Position bias regularization */ + double position_bias_regularization_; }; /*! @@ -253,9 +287,67 @@ class LambdarankNDCG : public RankingObjective { } } + void UpdatePositionBiasFactors(const score_t* lambdas, const score_t* hessians) const override { + /// get number of threads + int num_threads = 1; + #pragma omp parallel + #pragma omp master + { + num_threads = omp_get_num_threads(); + } + // create per-thread buffers for first and second derivatives of utility w.r.t. 
position bias factors + std::vector bias_first_derivatives(num_position_ids_ * num_threads, 0.0); + std::vector bias_second_derivatives(num_position_ids_ * num_threads, 0.0); + std::vector instance_counts(num_position_ids_ * num_threads, 0); + #pragma omp parallel for schedule(guided) + for (data_size_t i = 0; i < num_data_; i++) { + // get thread ID + const int tid = omp_get_thread_num(); + size_t offset = static_cast(positions_[i] + tid * num_position_ids_); + // accumulate first derivatives of utility w.r.t. position bias factors, for each position + bias_first_derivatives[offset] -= lambdas[i]; + // accumulate second derivatives of utility w.r.t. position bias factors, for each position + bias_second_derivatives[offset] -= hessians[i]; + instance_counts[offset]++; + } + #pragma omp parallel for schedule(guided) + for (data_size_t i = 0; i < num_position_ids_; i++) { + double bias_first_derivative = 0.0; + double bias_second_derivative = 0.0; + int instance_count = 0; + // aggregate derivatives from per-thread buffers + for (int tid = 0; tid < num_threads; tid++) { + size_t offset = static_cast(i + tid * num_position_ids_); + bias_first_derivative += bias_first_derivatives[offset]; + bias_second_derivative += bias_second_derivatives[offset]; + instance_count += instance_counts[offset]; + } + // L2 regularization on position bias factors + bias_first_derivative -= pos_biases_[i] * position_bias_regularization_ * instance_count; + bias_second_derivative -= position_bias_regularization_ * instance_count; + // do Newton-Raphson step to update position bias factors + pos_biases_[i] += learning_rate_ * bias_first_derivative / (std::abs(bias_second_derivative) + 0.001); + } + LogDebugPositionBiasFactors(); + } + const char* GetName() const override { return "lambdarank"; } protected: + void LogDebugPositionBiasFactors() const { + std::stringstream message_stream; + message_stream << std::setw(15) << "position" + << std::setw(15) << "bias_factor" + << std::endl; + 
Log::Debug(message_stream.str().c_str()); + message_stream.str(""); + for (int i = 0; i < num_position_ids_; ++i) { + message_stream << std::setw(15) << position_ids_[i] + << std::setw(15) << pos_biases_[i]; + Log::Debug(message_stream.str().c_str()); + message_stream.str(""); + } + } /*! \brief Sigmoid param */ double sigmoid_; /*! \brief Normalize the lambdas or not */ diff --git a/src/objective/regression_objective.hpp b/src/objective/regression_objective.hpp index 71c1a6d7cdfe..eb149756c205 100644 --- a/src/objective/regression_objective.hpp +++ b/src/objective/regression_objective.hpp @@ -24,14 +24,14 @@ namespace LightGBM { for (data_size_t i = 0; i < cnt_data; ++i) { \ ref_data[i] = data_reader(i); \ } \ - const double float_pos = static_cast(1.0 - alpha) * cnt_data; \ - const data_size_t pos = static_cast(float_pos); \ + const double float_pos = static_cast(cnt_data - 1) * (1.0 - alpha); \ + const data_size_t pos = static_cast(float_pos) + 1; \ if (pos < 1) { \ return ref_data[ArrayArgs::ArgMax(ref_data)]; \ } else if (pos >= cnt_data) { \ return ref_data[ArrayArgs::ArgMin(ref_data)]; \ } else { \ - const double bias = float_pos - pos; \ + const double bias = float_pos - (pos - 1); \ if (pos > cnt_data / 2) { \ ArrayArgs::ArgMaxAtK(&ref_data, 0, cnt_data, pos - 1); \ T v1 = ref_data[pos - 1]; \ diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index 163bfc4df9ca..fdf55693a0e9 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -53,6 +53,25 @@ class LeafSplits { weight_ = weight; } + /*! + * \brief Init split on current leaf on partial data. 
+ * \param leaf Index of current leaf + * \param data_partition current data partition + * \param sum_gradients + * \param sum_hessians + * \param sum_gradients_and_hessians + * \param weight + */ + void Init(int leaf, const DataPartition* data_partition, double sum_gradients, + double sum_hessians, int64_t sum_gradients_and_hessians, double weight) { + leaf_index_ = leaf; + data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); + sum_gradients_ = sum_gradients; + sum_hessians_ = sum_hessians; + int_sum_gradients_and_hessians_ = sum_gradients_and_hessians; + weight_ = weight; + } + /*! * \brief Init split on current leaf on partial data. * \param leaf Index of current leaf diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index c322c1a796c2..37d9a2a50713 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -841,32 +841,65 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, #endif // init the leaves that used on next iteration - if (best_split_info.left_count < best_split_info.right_count) { - CHECK_GT(best_split_info.left_count, 0); - smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), - best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian, - best_split_info.left_output); - larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), - best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian, - best_split_info.right_output); + if (!config_->use_quantized_grad) { + if (best_split_info.left_count < best_split_info.right_count) { + CHECK_GT(best_split_info.left_count, 0); + smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, + best_split_info.left_output); + larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, + 
best_split_info.right_output); + } else { + CHECK_GT(best_split_info.right_count, 0); + smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, + best_split_info.right_output); + larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, + best_split_info.left_output); + } } else { - CHECK_GT(best_split_info.right_count, 0); - smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), - best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian, - best_split_info.right_output); - larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), - best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian, - best_split_info.left_output); + if (best_split_info.left_count < best_split_info.right_count) { + CHECK_GT(best_split_info.left_count, 0); + smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, + best_split_info.left_sum_gradient_and_hessian, + best_split_info.left_output); + larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, + best_split_info.right_sum_gradient_and_hessian, + best_split_info.right_output); + } else { + CHECK_GT(best_split_info.right_count, 0); + smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, + best_split_info.right_sum_gradient_and_hessian, + best_split_info.right_output); + larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, + best_split_info.left_sum_gradient_and_hessian, + best_split_info.left_output); + } } if (config_->use_quantized_grad && config_->tree_learner != std::string("data")) { 
gradient_discretizer_->SetNumBitsInHistogramBin(*left_leaf, *right_leaf, data_partition_->leaf_count(*left_leaf), data_partition_->leaf_count(*right_leaf)); } + + #ifdef DEBUG + CheckSplit(best_split_info, *left_leaf, *right_leaf); + #endif + auto leaves_need_update = constraints_->Update( is_numerical_split, *left_leaf, *right_leaf, best_split_info.monotone_type, best_split_info.right_output, @@ -1024,4 +1057,48 @@ std::vector node_used_features = col_sampler_.GetByNode(tree, leaf); *split = bests[best_idx]; } +#ifdef DEBUG +void SerialTreeLearner::CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index) { + data_size_t num_data_in_left = 0; + data_size_t num_data_in_right = 0; + const data_size_t* data_indices_in_left = data_partition_->GetIndexOnLeaf(left_leaf_index, &num_data_in_left); + const data_size_t* data_indices_in_right = data_partition_->GetIndexOnLeaf(right_leaf_index, &num_data_in_right); + if (config_->use_quantized_grad) { + int32_t sum_left_gradient = 0; + int32_t sum_left_hessian = 0; + int32_t sum_right_gradient = 0; + int32_t sum_right_hessian = 0; + const int8_t* discretized_grad_and_hess = gradient_discretizer_->discretized_gradients_and_hessians(); + for (data_size_t i = 0; i < num_data_in_left; ++i) { + const data_size_t index = data_indices_in_left[i]; + sum_left_gradient += discretized_grad_and_hess[2 * index + 1]; + sum_left_hessian += discretized_grad_and_hess[2 * index]; + } + for (data_size_t i = 0; i < num_data_in_right; ++i) { + const data_size_t index = data_indices_in_right[i]; + sum_right_gradient += discretized_grad_and_hess[2 * index + 1]; + sum_right_hessian += discretized_grad_and_hess[2 * index]; + } + Log::Warning("============================ start leaf split info ============================"); + Log::Warning("left_leaf_index = %d, right_leaf_index = %d", left_leaf_index, right_leaf_index); + Log::Warning("num_data_in_left = %d, num_data_in_right = %d", num_data_in_left, 
num_data_in_right); + Log::Warning("sum_left_gradient = %d, best_split_info->left_sum_gradient_and_hessian.gradient = %d", sum_left_gradient, + static_cast(best_split_info.left_sum_gradient_and_hessian >> 32)); + Log::Warning("sum_left_hessian = %d, best_split_info->left_sum_gradient_and_hessian.hessian = %d", sum_left_hessian, + static_cast(best_split_info.left_sum_gradient_and_hessian & 0x00000000ffffffff)); + Log::Warning("sum_right_gradient = %d, best_split_info->right_sum_gradient_and_hessian.gradient = %d", sum_right_gradient, + static_cast(best_split_info.right_sum_gradient_and_hessian >> 32)); + Log::Warning("sum_right_hessian = %d, best_split_info->right_sum_gradient_and_hessian.hessian = %d", sum_right_hessian, + static_cast(best_split_info.right_sum_gradient_and_hessian & 0x00000000ffffffff)); + CHECK_EQ(num_data_in_left, best_split_info.left_count); + CHECK_EQ(num_data_in_right, best_split_info.right_count); + CHECK_EQ(sum_left_gradient, static_cast(best_split_info.left_sum_gradient_and_hessian >> 32)) + CHECK_EQ(sum_left_hessian, static_cast(best_split_info.left_sum_gradient_and_hessian & 0x00000000ffffffff)); + CHECK_EQ(sum_right_gradient, static_cast(best_split_info.right_sum_gradient_and_hessian >> 32)); + CHECK_EQ(sum_right_hessian, static_cast(best_split_info.right_sum_gradient_and_hessian & 0x00000000ffffffff)); + Log::Warning("============================ end leaf split info ============================"); + } +} +#endif + } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index d815d265c0d2..93e0787a90cf 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -171,7 +171,9 @@ class SerialTreeLearner: public TreeLearner { std::set FindAllForceFeatures(Json force_split_leaf_setting); + #ifdef DEBUG void CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index); + #endif /*! 
* \brief Get the number of data in a leaf diff --git a/tests/distributed/_test_distributed.py b/tests/distributed/_test_distributed.py index 9ede4e0800fb..e37dafee6393 100644 --- a/tests/distributed/_test_distributed.py +++ b/tests/distributed/_test_distributed.py @@ -25,7 +25,7 @@ def _find_random_open_port() -> int: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(('', 0)) port = s.getsockname()[1] - return port + return port # noqa: RET504 def _generate_n_ports(n: int) -> Generator[int, None, None]: @@ -47,8 +47,7 @@ def create_data(task: str, n_samples: int = 1_000) -> np.ndarray: X, y = make_blobs(n_samples, centers=centers, random_state=42) elif task == 'regression': X, y = make_regression(n_samples, n_features=4, n_informative=2, random_state=42) - dataset = np.hstack([y.reshape(-1, 1), X]) - return dataset + return np.hstack([y.reshape(-1, 1), X]) class DistributedMockup: @@ -149,8 +148,7 @@ def predict(self, predict_config: Dict[str, Any]) -> np.ndarray: result = subprocess.run(cmd) if result.returncode != 0: raise RuntimeError('Error in prediction') - y_pred = np.loadtxt(str(TESTS_DIR / 'predictions.txt')) - return y_pred + return np.loadtxt(str(TESTS_DIR / 'predictions.txt')) def write_train_config(self, i: int) -> None: """Create a file train{i}.conf with the required configuration to train. 
diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 267041eae2e4..7f8980c271f7 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -723,7 +723,12 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name): pd = pytest.importorskip('pandas') X = np.random.rand(10, 2).astype(dtype) df = pd.DataFrame(X) - built_data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0] + built_data = lgb.basic._data_from_pandas( + data=df, + feature_name=feature_name, + categorical_feature="auto", + pandas_categorical=None + )[0] assert built_data.dtype == dtype assert np.shares_memory(X, built_data) @@ -734,7 +739,12 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name): X = np.random.choice(['a', 'b'], 100).reshape(-1, 1) column_name = 'a' if feature_name == 'auto' else feature_name[0] df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category') - data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0] + data = lgb.basic._data_from_pandas( + data=df, + feature_name=feature_name, + categorical_feature="auto", + pandas_categorical=None + )[0] # check that the original data wasn't modified np.testing.assert_equal(df[column_name], X[:, 0]) # check that the built data has the codes @@ -806,3 +816,10 @@ def test_set_leaf_output(): leaf_output = bst.get_leaf_output(tree_id=0, leaf_id=leaf_id) bst.set_leaf_output(tree_id=0, leaf_id=leaf_id, value=leaf_output + 1) np.testing.assert_allclose(bst.predict(X), y_pred + 1) + + +def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Dataset(): + ds = lgb.Dataset( + data=np.random.randn(100, 3), + ) + assert ds.construct().feature_name == ["Column_0", "Column_1", "Column_2"] diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index cb69440b3cde..9da50945385c 100644 --- a/tests/python_package_test/test_dask.py +++ 
b/tests/python_package_test/test_dask.py @@ -1838,7 +1838,6 @@ def test_distributed_quantized_training(cluster): 'num_grad_quant_bins': 30, 'quant_train_renew_leaf': True, 'verbose': -1, - 'force_row_wise': True, } quant_dask_classifier = lgb.DaskLGBMRegressor( diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index bf2bd6a8b01d..4ef72888e767 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -9,6 +9,7 @@ import re from os import getenv from pathlib import Path +from shutil import copyfile import numpy as np import psutil @@ -19,7 +20,7 @@ from sklearn.model_selection import GroupKFold, TimeSeriesSplit, train_test_split import lightgbm as lgb -from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame +from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame, pd_Series from .utils import (SERIALIZERS, dummy_obj, load_breast_cancer, load_digits, load_iris, logistic_sigmoid, make_synthetic_regression, mse_obj, pickle_and_unpickle_object, sklearn_multiclass_custom_objective, @@ -142,7 +143,7 @@ def test_regression(objective): elif objective == 'quantile': assert ret < 1311 else: - assert ret < 338 + assert ret < 343 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) @@ -747,6 +748,171 @@ def test_ranking_prediction_early_stopping(): np.testing.assert_allclose(ret_early, ret_early_more_strict) +# Simulates position bias for a given ranking dataset. +# The ouput dataset is identical to the input one with the exception for the relevance labels. 
+# The new labels are generated according to an instance of a cascade user model: +# for each query, the user is simulated to be traversing the list of documents ranked by a baseline ranker +# (in our example it is simply the ordering by some feature correlated with relevance, e.g., 34) +# and clicks on that document (new_label=1) with some probability 'pclick' depending on its true relevance; +# at each position the user may stop the traversal with some probability pstop. For the non-clicked documents, +# new_label=0. Thus the generated new labels are biased towards the baseline ranker. +# The positions of the documents in the ranked lists produced by the baseline, are returned. +def simulate_position_bias(file_dataset_in, file_query_in, file_dataset_out, baseline_feature): + # a mapping of a document's true relevance (defined on a 5-grade scale) into the probability of clicking it + def get_pclick(label): + if label == 0: + return 0.4 + elif label == 1: + return 0.6 + elif label == 2: + return 0.7 + elif label == 3: + return 0.8 + else: + return 0.9 + # an instantiation of a cascade model where the user stops with probability 0.2 after observing each document + pstop = 0.2 + + f_dataset_in = open(file_dataset_in, 'r') + f_dataset_out = open(file_dataset_out, 'w') + random.seed(10) + positions_all = [] + for line in open(file_query_in): + docs_num = int (line) + lines = [] + index_values = [] + positions = [0] * docs_num + for index in range(docs_num): + features = f_dataset_in.readline().split() + lines.append(features) + val = 0.0 + for feature_val in features: + feature_val_split = feature_val.split(":") + if int(feature_val_split[0]) == baseline_feature: + val = float(feature_val_split[1]) + index_values.append([index, val]) + index_values.sort(key=lambda x: -x[1]) + stop = False + for pos in range(docs_num): + index = index_values[pos][0] + new_label = 0 + if not stop: + label = int(lines[index][0]) + pclick = get_pclick(label) + if random.random() < pclick: 
+ new_label = 1 + stop = random.random() < pstop + lines[index][0] = str(new_label) + positions[index] = pos + for features in lines: + f_dataset_out.write(' '.join(features) + '\n') + positions_all.extend(positions) + f_dataset_out.close() + return positions_all + + +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Positions in learning to rank is not supported in CUDA version yet') +def test_ranking_with_position_information_with_file(tmp_path): + rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' + params = { + 'objective': 'lambdarank', + 'verbose': -1, + 'eval_at': [3], + 'metric': 'ndcg', + 'bagging_freq': 1, + 'bagging_fraction': 0.9, + 'min_data_in_leaf': 50, + 'min_sum_hessian_in_leaf': 5.0 + } + + # simulate position bias for the train dataset and put the train dataset with biased labels to temp directory + positions = simulate_position_bias(str(rank_example_dir / 'rank.train'), str(rank_example_dir / 'rank.train.query'), str(tmp_path / 'rank.train'), baseline_feature=34) + copyfile(str(rank_example_dir / 'rank.train.query'), str(tmp_path / 'rank.train.query')) + copyfile(str(rank_example_dir / 'rank.test'), str(tmp_path / 'rank.test')) + copyfile(str(rank_example_dir / 'rank.test.query'), str(tmp_path / 'rank.test.query')) + + lgb_train = lgb.Dataset(str(tmp_path / 'rank.train'), params=params) + lgb_valid = [lgb_train.create_valid(str(tmp_path / 'rank.test'))] + gbm_baseline = lgb.train(params, lgb_train, valid_sets = lgb_valid, num_boost_round=50) + + f_positions_out = open(str(tmp_path / 'rank.train.position'), 'w') + for pos in positions: + f_positions_out.write(str(pos) + '\n') + f_positions_out.close() + + lgb_train = lgb.Dataset(str(tmp_path / 'rank.train'), params=params) + lgb_valid = [lgb_train.create_valid(str(tmp_path / 'rank.test'))] + gbm_unbiased_with_file = lgb.train(params, lgb_train, valid_sets = lgb_valid, num_boost_round=50) + + # the performance of the unbiased LambdaMART should outperform 
the plain LambdaMART on the dataset with position bias + assert gbm_baseline.best_score['valid_0']['ndcg@3'] + 0.03 <= gbm_unbiased_with_file.best_score['valid_0']['ndcg@3'] + + # add extra row to position file + with open(str(tmp_path / 'rank.train.position'), 'a') as file: + file.write('pos_1000\n') + file.close() + lgb_train = lgb.Dataset(str(tmp_path / 'rank.train'), params=params) + lgb_valid = [lgb_train.create_valid(str(tmp_path / 'rank.test'))] + with pytest.raises(lgb.basic.LightGBMError, match=r"Positions size \(3006\) doesn't match data size"): + lgb.train(params, lgb_train, valid_sets = lgb_valid, num_boost_round=50) + + +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Positions in learning to rank is not supported in CUDA version yet') +def test_ranking_with_position_information_with_dataset_constructor(tmp_path): + rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' + params = { + 'objective': 'lambdarank', + 'verbose': -1, + 'eval_at': [3], + 'metric': 'ndcg', + 'bagging_freq': 1, + 'bagging_fraction': 0.9, + 'min_data_in_leaf': 50, + 'min_sum_hessian_in_leaf': 5.0, + 'num_threads': 1, + 'deterministic': True, + 'seed': 0 + } + + # simulate position bias for the train dataset and put the train dataset with biased labels to temp directory + positions = simulate_position_bias(str(rank_example_dir / 'rank.train'), str(rank_example_dir / 'rank.train.query'), str(tmp_path / 'rank.train'), baseline_feature=34) + copyfile(str(rank_example_dir / 'rank.train.query'), str(tmp_path / 'rank.train.query')) + copyfile(str(rank_example_dir / 'rank.test'), str(tmp_path / 'rank.test')) + copyfile(str(rank_example_dir / 'rank.test.query'), str(tmp_path / 'rank.test.query')) + + lgb_train = lgb.Dataset(str(tmp_path / 'rank.train'), params=params) + lgb_valid = [lgb_train.create_valid(str(tmp_path / 'rank.test'))] + gbm_baseline = lgb.train(params, lgb_train, valid_sets = lgb_valid, num_boost_round=50) + + positions = 
np.array(positions) + + # test setting positions through Dataset constructor with numpy array + lgb_train = lgb.Dataset(str(tmp_path / 'rank.train'), params=params, position=positions) + lgb_valid = [lgb_train.create_valid(str(tmp_path / 'rank.test'))] + gbm_unbiased = lgb.train(params, lgb_train, valid_sets = lgb_valid, num_boost_round=50) + + # the performance of the unbiased LambdaMART should outperform the plain LambdaMART on the dataset with position bias + assert gbm_baseline.best_score['valid_0']['ndcg@3'] + 0.03 <= gbm_unbiased.best_score['valid_0']['ndcg@3'] + + if PANDAS_INSTALLED: + # test setting positions through Dataset constructor with pandas Series + lgb_train = lgb.Dataset(str(tmp_path / 'rank.train'), params=params, position=pd_Series(positions)) + lgb_valid = [lgb_train.create_valid(str(tmp_path / 'rank.test'))] + gbm_unbiased_pandas_series = lgb.train(params, lgb_train, valid_sets = lgb_valid, num_boost_round=50) + assert gbm_unbiased.best_score['valid_0']['ndcg@3'] == gbm_unbiased_pandas_series.best_score['valid_0']['ndcg@3'] + + # test setting positions through set_position + lgb_train = lgb.Dataset(str(tmp_path / 'rank.train'), params=params) + lgb_valid = [lgb_train.create_valid(str(tmp_path / 'rank.test'))] + lgb_train.set_position(positions) + gbm_unbiased_set_position = lgb.train(params, lgb_train, valid_sets = lgb_valid, num_boost_round=50) + assert gbm_unbiased.best_score['valid_0']['ndcg@3'] == gbm_unbiased_set_position.best_score['valid_0']['ndcg@3'] + + # test get_position works + positions_from_get = lgb_train.get_position() + np.testing.assert_array_equal(positions_from_get, positions) + + def test_early_stopping(): X, y = load_breast_cancer(return_X_y=True) params = { @@ -926,6 +1092,33 @@ def test_early_stopping_min_delta(first_only, single_metric, greater_is_better): assert np.greater_equal(last_score, best_score - min_delta).any() +def test_early_stopping_can_be_triggered_via_custom_callback(): + X, y = 
make_synthetic_regression() + + def _early_stop_after_seventh_iteration(env): + if env.iteration == 6: + exc = lgb.EarlyStopException( + best_iteration=6, + best_score=[("some_validation_set", "some_metric", 0.708, True)] + ) + raise exc + + bst = lgb.train( + params={ + "objective": "regression", + "verbose": -1, + "num_leaves": 2 + }, + train_set=lgb.Dataset(X, label=y), + num_boost_round=23, + callbacks=[_early_stop_after_seventh_iteration] + ) + assert bst.num_trees() == 7 + assert bst.best_score["some_validation_set"]["some_metric"] == 0.708 + assert bst.best_iteration == 7 + assert bst.current_iteration() == 7 + + def test_continue_train(): X, y = make_synthetic_regression() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -1341,6 +1534,203 @@ def train_and_predict(init_model=None, return_model=False): assert ret_origin == pytest.approx(ret) +def test_all_expected_params_are_written_out_to_model_text(tmp_path): + X, y = make_synthetic_regression() + params = { + 'objective': 'mape', + 'metric': ['l2', 'mae'], + 'seed': 708, + 'data_sample_strategy': 'bagging', + 'sub_row': 0.8234, + 'verbose': -1 + } + dtrain = lgb.Dataset(data=X, label=y) + gbm = lgb.train( + params=params, + train_set=dtrain, + num_boost_round=3 + ) + + model_txt_from_memory = gbm.model_to_string() + model_file = tmp_path / "out.model" + gbm.save_model(filename=model_file) + with open(model_file, "r") as f: + model_txt_from_file = f.read() + + assert model_txt_from_memory == model_txt_from_file + + # entries whose values should reflect params passed to lgb.train() + non_default_param_entries = [ + "[objective: mape]", + # 'l1' was passed in with alias 'mae' + "[metric: l2,l1]", + "[data_sample_strategy: bagging]", + "[seed: 708]", + # NOTE: this was passed in with alias 'sub_row' + "[bagging_fraction: 0.8234]", + "[num_iterations: 3]", + ] + + # entries with default values of params + default_param_entries = [ + "[boosting: gbdt]", + 
"[tree_learner: serial]", + "[data: ]", + "[valid: ]", + "[learning_rate: 0.1]", + "[num_leaves: 31]", + "[num_threads: 0]", + "[deterministic: 0]", + "[histogram_pool_size: -1]", + "[max_depth: -1]", + "[min_data_in_leaf: 20]", + "[min_sum_hessian_in_leaf: 0.001]", + "[pos_bagging_fraction: 1]", + "[neg_bagging_fraction: 1]", + "[bagging_freq: 0]", + "[bagging_seed: 15415]", + "[feature_fraction: 1]", + "[feature_fraction_bynode: 1]", + "[feature_fraction_seed: 32671]", + "[extra_trees: 0]", + "[extra_seed: 6642]", + "[early_stopping_round: 0]", + "[first_metric_only: 0]", + "[max_delta_step: 0]", + "[lambda_l1: 0]", + "[lambda_l2: 0]", + "[linear_lambda: 0]", + "[min_gain_to_split: 0]", + "[drop_rate: 0.1]", + "[max_drop: 50]", + "[skip_drop: 0.5]", + "[xgboost_dart_mode: 0]", + "[uniform_drop: 0]", + "[drop_seed: 20623]", + "[top_rate: 0.2]", + "[other_rate: 0.1]", + "[min_data_per_group: 100]", + "[max_cat_threshold: 32]", + "[cat_l2: 10]", + "[cat_smooth: 10]", + "[max_cat_to_onehot: 4]", + "[top_k: 20]", + "[monotone_constraints: ]", + "[monotone_constraints_method: basic]", + "[monotone_penalty: 0]", + "[feature_contri: ]", + "[forcedsplits_filename: ]", + "[refit_decay_rate: 0.9]", + "[cegb_tradeoff: 1]", + "[cegb_penalty_split: 0]", + "[cegb_penalty_feature_lazy: ]", + "[cegb_penalty_feature_coupled: ]", + "[path_smooth: 0]", + "[interaction_constraints: ]", + "[verbosity: -1]", + "[saved_feature_importance_type: 0]", + "[use_quantized_grad: 0]", + "[num_grad_quant_bins: 4]", + "[quant_train_renew_leaf: 0]", + "[stochastic_rounding: 1]", + "[linear_tree: 0]", + "[max_bin: 255]", + "[max_bin_by_feature: ]", + "[min_data_in_bin: 3]", + "[bin_construct_sample_cnt: 200000]", + "[data_random_seed: 2350]", + "[is_enable_sparse: 1]", + "[enable_bundle: 1]", + "[use_missing: 1]", + "[zero_as_missing: 0]", + "[feature_pre_filter: 1]", + "[pre_partition: 0]", + "[two_round: 0]", + "[header: 0]", + "[label_column: ]", + "[weight_column: ]", + "[group_column: ]", + 
"[ignore_column: ]", + "[categorical_feature: ]", + "[forcedbins_filename: ]", + "[precise_float_parser: 0]", + "[parser_config_file: ]", + "[objective_seed: 4309]", + "[num_class: 1]", + "[is_unbalance: 0]", + "[scale_pos_weight: 1]", + "[sigmoid: 1]", + "[boost_from_average: 1]", + "[reg_sqrt: 0]", + "[alpha: 0.9]", + "[fair_c: 1]", + "[poisson_max_delta_step: 0.7]", + "[tweedie_variance_power: 1.5]", + "[lambdarank_truncation_level: 30]", + "[lambdarank_norm: 1]", + "[label_gain: ]", + "[lambdarank_position_bias_regularization: 0]", + "[eval_at: ]", + "[multi_error_top_k: 1]", + "[auc_mu_weights: ]", + "[num_machines: 1]", + "[local_listen_port: 12400]", + "[time_out: 120]", + "[machine_list_filename: ]", + "[machines: ]", + "[gpu_platform_id: -1]", + "[gpu_device_id: -1]", + "[num_gpu: 1]", + ] + all_param_entries = non_default_param_entries + default_param_entries + + # add device-specific entries + # + # passed-in force_col_wise / force_row_wise parameters are ignored on CUDA and GPU builds... 
+ # https://github.com/microsoft/LightGBM/blob/1d7ee63686272bceffd522284127573b511df6be/src/io/config.cpp#L375-L377 + if getenv('TASK', '') == 'cuda': + device_entries = [ + "[force_col_wise: 0]", + "[force_row_wise: 1]", + "[device_type: cuda]", + "[gpu_use_dp: 1]" + ] + elif getenv('TASK', '') == 'gpu': + device_entries = [ + "[force_col_wise: 1]", + "[force_row_wise: 0]", + "[device_type: gpu]", + "[gpu_use_dp: 0]" + ] + else: + device_entries = [ + "[force_col_wise: 0]", + "[force_row_wise: 0]", + "[device_type: cpu]", + "[gpu_use_dp: 0]" + ] + + all_param_entries += device_entries + + # check that model text has all expected param entries + for param_str in all_param_entries: + assert param_str in model_txt_from_file + assert param_str in model_txt_from_memory + + # since Booster.model_to_string() is used when pickling, check that parameters all + # roundtrip pickling successfully too + gbm_pkl = pickle_and_unpickle_object(gbm, serializer="joblib") + model_txt_from_memory = gbm_pkl.model_to_string() + model_file = tmp_path / "out-pkl.model" + gbm_pkl.save_model(filename=model_file) + with open(model_file, "r") as f: + model_txt_from_file = f.read() + + for param_str in all_param_entries: + assert param_str in model_txt_from_file + assert param_str in model_txt_from_memory + + def test_pandas_categorical(): pd = pytest.importorskip("pandas") np.random.seed(42) # sometimes there is no difference how cols are treated (cat or not cat) @@ -1720,8 +2110,7 @@ def generate_trainset_for_monotone_constraints_tests(x3_to_category=True): categorical_features = [] if x3_to_category: categorical_features = [2] - trainset = lgb.Dataset(x, label=y, categorical_feature=categorical_features, free_raw_data=False) - return trainset + return lgb.Dataset(x, label=y, categorical_feature=categorical_features, free_raw_data=False) @pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Monotone constraints are not yet supported by CUDA version') diff --git 
a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index e41719845c0a..2247c9a512d2 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -1561,3 +1561,20 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type ) preds = model.predict(X) assert spearmanr(preds, y).correlation >= 0.99 + + +def test_classifier_fit_detects_classes_every_time(): + rng = np.random.default_rng(seed=123) + nrows = 1000 + ncols = 20 + + X = rng.standard_normal(size=(nrows, ncols)) + y_bin = (rng.random(size=nrows) <= .3).astype(np.float64) + y_multi = rng.integers(4, size=nrows) + + model = lgb.LGBMClassifier(verbose=-1) + for _ in range(2): + model.fit(X, y_multi) + assert model.objective_ == "multiclass" + model.fit(X, y_bin) + assert model.objective_ == "binary" diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py index 29183713d714..df01e29852e7 100644 --- a/tests/python_package_test/utils.py +++ b/tests/python_package_test/utils.py @@ -192,4 +192,4 @@ def pickle_and_unpickle_object(obj, serializer): filepath=tmp_file.name, serializer=serializer ) - return obj_from_disk + return obj_from_disk # noqa: RET504