[python-package] [R-package] include more params in model text representation (fixes #6010) #6077

Merged: 17 commits, Sep 13, 2023

Changes from all commits
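The user-facing effect of this change: the model text representation written by save_model_to_string() (and therefore by lgb.save() and by saveRDS()-based serialization) now records nearly the full set of Config parameters, with aliases resolved to their main names, rather than only a handful. A minimal sketch of how to inspect this from R (hypothetical usage mirroring the tests below, not code from this diff):

library(lightgbm)

dtrain <- lgb.Dataset(
  data = matrix(rnorm(500L), nrow = 100L)
  , label = rnorm(100L)
)
bst <- lgb.train(
  params = list(objective = "regression")
  , data = dtrain
  , nrounds = 3L
  , verbose = -1L
)

# the "parameters:" section of the model string now contains one
# "[name: value]" line for (almost) every parameter, e.g.
# "[objective: regression]", "[bagging_fraction: 1]", "[use_quantized_grad: 0]"
model_str <- bst$save_model_to_string()
param_lines <- grep("^\\[", strsplit(model_str, "\n", fixed = TRUE)[[1L]], value = TRUE)
head(param_lines)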
17 changes: 17 additions & 0 deletions R-package/tests/testthat/helper.R
@@ -29,3 +29,20 @@
.LGB_VERBOSITY <- as.integer(
Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1")
)

# [description]
# test that every element of 'x' is in 'y'
#
# testthat::expect_in() is not available in the version of {testthat}
# built for R 3.6; this helper provides a similar interface on R 3.6
.expect_in <- function(x, y) {
  if (exists("expect_in")) {
    expect_in(x, y)
  } else {
    missing_items <- x[!(x %in% y)]
    if (length(missing_items) != 0L) {
      error_msg <- paste0("Some expected items not found: ", toString(missing_items))
      stop(error_msg)
    }
  }
}
@jameslamb (Collaborator, Author) commented on Sep 12, 2023:
I tested both sides of this, like this:

.expect_in <- function(x, y) {
  if (exists("expect_in")) {
    expect_in(x, y)
  } else {
    missing_items <- x[!(x %in% y)]
    if (length(missing_items) != 0L) {
      error_msg <- paste0("Some expected items not found: ", toString(missing_items))
      stop(error_msg)
    }
  }
}
# testthat not loaded
.expect_in(c("a", "b"), c("d", "a", "a"))

Error in .expect_in(c("a", "b"), c("d", "a", "a")) :
Some expected items not found: b

# testthat loaded
library(testthat)
.expect_in(c("a", "b"), c("d", "a", "a"))

Error: x (actual) isn't fully contained within y (expected).

  • Missing from expected: "b"
  • Present in expected: "d", "a", "a"

163 changes: 146 additions & 17 deletions R-package/tests/testthat/test_lgb.Booster.R
@@ -799,37 +799,166 @@ test_that("all parameters are stored correctly with save_model_to_string()", {
data = matrix(rnorm(500L), nrow = 100L)
, label = rnorm(100L)
)
-nrounds <- 4L
bst <- lgb.train(
  params = list(
-    objective = "regression"
-    , metric = "l2"
+    objective = "mape"
+    , metric = c("l2", "mae")
    , num_threads = .LGB_MAX_THREADS
+    , seed = 708L
+    , data_sample_strategy = "bagging"
+    , sub_row = 0.8234
  )
  , data = dtrain
-  , nrounds = nrounds
+  , nrounds = 3L
  , verbose = .LGB_VERBOSITY
)

-model_str <- bst$save_model_to_string()
-params_in_file <- .params_from_model_string(model_str = model_str)
# entries whose values should reflect params passed to lgb.train()
non_default_param_entries <- c(
"[objective: mape]"
# 'l1' was passed in with alias 'mae'
, "[metric: l2,l1]"
, "[data_sample_strategy: bagging]"
, "[seed: 708]"
# this was passed in with alias 'sub_row'
, "[bagging_fraction: 0.8234]"
, "[num_iterations: 3]"
)

# entries with default values of params
default_param_entries <- c(
"[boosting: gbdt]"
, "[tree_learner: serial]"
, "[device_type: cpu]"
, "[data: ]"
, "[valid: ]"
, "[learning_rate: 0.1]"
, "[num_leaves: 31]"
, sprintf("[num_threads: %i]", .LGB_MAX_THREADS)
, "[deterministic: 0]"
, "[histogram_pool_size: -1]"
, "[max_depth: -1]"
, "[min_data_in_leaf: 20]"
, "[min_sum_hessian_in_leaf: 0.001]"
, "[pos_bagging_fraction: 1]"
, "[neg_bagging_fraction: 1]"
, "[bagging_freq: 0]"
, "[bagging_seed: 15415]"
, "[feature_fraction: 1]"
, "[feature_fraction_bynode: 1]"
, "[feature_fraction_seed: 32671]"
, "[extra_trees: 0]"
, "[extra_seed: 6642]"
, "[early_stopping_round: 0]"
, "[first_metric_only: 0]"
, "[max_delta_step: 0]"
, "[lambda_l1: 0]"
, "[lambda_l2: 0]"
, "[linear_lambda: 0]"
, "[min_gain_to_split: 0]"
, "[drop_rate: 0.1]"
, "[max_drop: 50]"
, "[skip_drop: 0.5]"
, "[xgboost_dart_mode: 0]"
, "[uniform_drop: 0]"
, "[drop_seed: 20623]"
, "[top_rate: 0.2]"
, "[other_rate: 0.1]"
, "[min_data_per_group: 100]"
, "[max_cat_threshold: 32]"
, "[cat_l2: 10]"
, "[cat_smooth: 10]"
, "[max_cat_to_onehot: 4]"
, "[top_k: 20]"
, "[monotone_constraints: ]"
, "[monotone_constraints_method: basic]"
, "[monotone_penalty: 0]"
, "[feature_contri: ]"
, "[forcedsplits_filename: ]"
, "[force_col_wise: 0]"
, "[force_row_wise: 0]"
, "[refit_decay_rate: 0.9]"
, "[cegb_tradeoff: 1]"
, "[cegb_penalty_split: 0]"
, "[cegb_penalty_feature_lazy: ]"
, "[cegb_penalty_feature_coupled: ]"
, "[path_smooth: 0]"
, "[interaction_constraints: ]"
, sprintf("[verbosity: %i]", .LGB_VERBOSITY)
, "[saved_feature_importance_type: 0]"
, "[use_quantized_grad: 0]"
, "[num_grad_quant_bins: 4]"
, "[quant_train_renew_leaf: 0]"
, "[stochastic_rounding: 1]"
, "[linear_tree: 0]"
, "[max_bin: 255]"
, "[max_bin_by_feature: ]"
, "[min_data_in_bin: 3]"
, "[bin_construct_sample_cnt: 200000]"
, "[data_random_seed: 2350]"
, "[is_enable_sparse: 1]"
, "[enable_bundle: 1]"
, "[use_missing: 1]"
, "[zero_as_missing: 0]"
, "[feature_pre_filter: 1]"
, "[pre_partition: 0]"
, "[two_round: 0]"
, "[header: 0]"
, "[label_column: ]"
, "[weight_column: ]"
, "[group_column: ]"
, "[ignore_column: ]"
, "[categorical_feature: ]"
, "[forcedbins_filename: ]"
, "[precise_float_parser: 0]"
, "[parser_config_file: ]"
, "[objective_seed: 4309]"
, "[num_class: 1]"
, "[is_unbalance: 0]"
, "[scale_pos_weight: 1]"
, "[sigmoid: 1]"
, "[boost_from_average: 1]"
, "[reg_sqrt: 0]"
, "[alpha: 0.9]"
, "[fair_c: 1]"
, "[poisson_max_delta_step: 0.7]"
, "[tweedie_variance_power: 1.5]"
, "[lambdarank_truncation_level: 30]"
, "[lambdarank_norm: 1]"
, "[label_gain: ]"
, "[lambdarank_position_bias_regularization: 0]"
, "[eval_at: ]"
, "[multi_error_top_k: 1]"
, "[auc_mu_weights: ]"
, "[num_machines: 1]"
, "[local_listen_port: 12400]"
, "[time_out: 120]"
, "[machine_list_filename: ]"
, "[machines: ]"
, "[gpu_platform_id: -1]"
, "[gpu_device_id: -1]"
, "[gpu_use_dp: 0]"
, "[num_gpu: 1]"
)
all_param_entries <- c(non_default_param_entries, default_param_entries)

# parameters should match what was passed from the R package
-expect_equal(sum(startsWith(params_in_file, "[metric:")), 1L)
-expect_equal(sum(params_in_file == "[metric: l2]"), 1L)
-
-expect_equal(sum(startsWith(params_in_file, "[num_iterations:")), 1L)
-expect_equal(sum(params_in_file == "[num_iterations: 4]"), 1L)
-
-expect_equal(sum(startsWith(params_in_file, "[objective:")), 1L)
-expect_equal(sum(params_in_file == "[objective: regression]"), 1L)
-
-expect_equal(sum(startsWith(params_in_file, "[verbosity:")), 1L)
-expect_equal(sum(params_in_file == sprintf("[verbosity: %i]", .LGB_VERBOSITY)), 1L)
+model_str <- bst$save_model_to_string()
+params_in_file <- .params_from_model_string(model_str = model_str)
+.expect_in(all_param_entries, params_in_file)

-# early stopping should be off by default
-expect_equal(sum(startsWith(params_in_file, "[early_stopping_round:")), 1L)
-expect_equal(sum(params_in_file == "[early_stopping_round: 0]"), 1L)

+# since save_model_to_string() is used when serializing with saveRDS(), check that parameters all
+# roundtrip saveRDS()/readRDS() successfully
+rds_file <- tempfile()
+saveRDS(bst, rds_file)
+bst_rds <- readRDS(rds_file)
+model_str <- bst_rds$save_model_to_string()
+params_in_file <- .params_from_model_string(model_str = model_str)
+.expect_in(all_param_entries, params_in_file)
})

test_that("early_stopping, num_iterations are stored correctly in model string even with aliases", {
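The assertions above rely on a .params_from_model_string() helper defined elsewhere in the R test suite; it is not part of this diff. A rough sketch of the behavior the tests assume (an assumption, not the actual definition):

# Hypothetical sketch: pull the "[name: value]" lines out of the
# "parameters:" section of a LightGBM model string. The real helper
# lives elsewhere in the test suite and may differ in detail.
.params_from_model_string <- function(model_str) {
  file_lines <- strsplit(model_str, "\n", fixed = TRUE)[[1L]]
  start_indx <- which(file_lines == "parameters:") + 1L
  end_indx <- which(file_lines == "end of parameters") - 1L
  file_lines[start_indx:end_indx]
}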
4 changes: 2 additions & 2 deletions helpers/parameter_generator.py
@@ -330,7 +330,7 @@ def gen_parameter_code(
    str_to_write += '  std::string tmp_str = "";\n'
    for x in infos:
        for y in x:
-            if "[doc-only]" in y:
+            if "[no-automatically-extract]" in y:
                continue
            param_type = y["inner_type"][0]
            name = y["name"][0]
@@ -345,7 +345,7 @@
str_to_write += " std::stringstream str_buf;\n"
for x in infos:
for y in x:
if "[doc-only]" in y or "[no-save]" in y:
if "[no-save]" in y:
continue
param_type = y["inner_type"][0]
name = y["name"][0]
36 changes: 21 additions & 15 deletions include/LightGBM/config.h
@@ -5,8 +5,13 @@
* \note
* - desc and descl2 fields must be written in reStructuredText format;
* - nested sections can be placed only at the bottom of parent's section;
-* - [doc-only] tag indicates that only documentation for this param should be generated and all other actions are performed manually;
-* - [no-save] tag indicates that this param should not be saved into a model text representation.
+* - [no-automatically-extract]
+*   - do not automatically extract this parameter into a Config property with the same name in Config::GetMembersFromString(). Use if:
+*     - specialized extraction logic for this param exists in Config::GetMembersFromString()
+* - [no-save]
+*   - this param should not be saved into a model text representation via Config::SaveMembersToString(). Use if:
+*     - param is only used by the CLI (especially the "predict" and "convert_model" tasks)
+*     - param is related to LightGBM writing files (e.g. "output_model", "save_binary")
*/
#ifndef LIGHTGBM_CONFIG_H_
#define LIGHTGBM_CONFIG_H_
@@ -97,15 +102,15 @@ struct Config {
#pragma region Core Parameters
#endif // __NVCC__

+// [no-automatically-extract]
// [no-save]
-// [doc-only]
// alias = config_file
// desc = path of config file
// desc = **Note**: can be used only in CLI version
std::string config = "";

+// [no-automatically-extract]
// [no-save]
-// [doc-only]
// type = enum
// default = train
// options = train, predict, convert_model, refit
@@ -118,7 +123,8 @@
// desc = **Note**: can be used only in CLI version; for language-specific packages you can use the correspondent functions
TaskType task = TaskType::kTrain;

-// [doc-only]
+// [no-automatically-extract]
+// [no-save]
// type = enum
// options = regression, regression_l1, huber, fair, poisson, quantile, mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy, cross_entropy_lambda, lambdarank, rank_xendcg
// alias = objective_type, app, application, loss
@@ -150,7 +156,8 @@
// descl2 = label should be ``int`` type, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect)
std::string objective = "regression";

-// [doc-only]
+// [no-automatically-extract]
+// [no-save]
// type = enum
// alias = boosting_type, boost
// options = gbdt, rf, dart
Expand All @@ -160,7 +167,7 @@ struct Config {
// descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations
std::string boosting = "gbdt";

-// [doc-only]
+// [no-automatically-extract]
// type = enum
// options = bagging, goss
// desc = ``bagging``, Randomly Bagging Sampling
@@ -200,7 +207,8 @@
// desc = max number of leaves in one tree
int num_leaves = kDefaultNumLeaves;

-// [doc-only]
+// [no-automatically-extract]
+// [no-save]
// type = enum
// options = serial, feature, data, voting
// alias = tree, tree_type, tree_learner_type
@@ -222,7 +230,8 @@
// desc = **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors
int num_threads = 0;

-// [doc-only]
+// [no-automatically-extract]
+// [no-save]
// type = enum
// options = cpu, gpu, cuda
// alias = device
@@ -235,7 +244,7 @@
// desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support
std::string device_type = "cpu";

-// [doc-only]
+// [no-automatically-extract]
// alias = random_seed, random_state
// default = None
// desc = this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``, etc.
@@ -593,7 +602,6 @@
// desc = **Note**: can be used only in CLI version
int snapshot_freq = -1;

-// [no-save]
// desc = whether to use gradient quantization when training
// desc = enabling this will discretize (quantize) the gradients and hessians into bins of ``num_grad_quant_bins``
// desc = with quantized training, most arithmetics in the training process will be integer operations
@@ -602,21 +610,18 @@
// desc = *New in version 4.0.0*
bool use_quantized_grad = false;

-// [no-save]
// desc = number of bins to quantization gradients and hessians
// desc = with more bins, the quantized training will be closer to full precision training
// desc = **Note**: can be used only with ``device_type = cpu``
// desc = *New in 4.0.0*
int num_grad_quant_bins = 4;

-// [no-save]
// desc = whether to renew the leaf values with original gradients when quantized training
// desc = renewing is very helpful for good quantized training accuracy for ranking objectives
// desc = **Note**: can be used only with ``device_type = cpu``
// desc = *New in 4.0.0*
bool quant_train_renew_leaf = false;

-// [no-save]
// desc = whether to use stochastic rounding in gradient quantization
// desc = *New in 4.0.0*
bool stochastic_rounding = true;
@@ -976,7 +981,8 @@
#pragma region Metric Parameters
#endif // __NVCC__

-// [doc-only]
+// [no-automatically-extract]
+// [no-save]
// alias = metrics, metric_types
// default = ""
// type = multi-enum
6 changes: 6 additions & 0 deletions src/io/config_auto.cpp
@@ -664,12 +664,14 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::string>& params) {

std::string Config::SaveMembersToString() const {
std::stringstream str_buf;
str_buf << "[data_sample_strategy: " << data_sample_strategy << "]\n";
str_buf << "[data: " << data << "]\n";
str_buf << "[valid: " << Common::Join(valid, ",") << "]\n";
str_buf << "[num_iterations: " << num_iterations << "]\n";
str_buf << "[learning_rate: " << learning_rate << "]\n";
str_buf << "[num_leaves: " << num_leaves << "]\n";
str_buf << "[num_threads: " << num_threads << "]\n";
str_buf << "[seed: " << seed << "]\n";
str_buf << "[deterministic: " << deterministic << "]\n";
str_buf << "[force_col_wise: " << force_col_wise << "]\n";
str_buf << "[force_row_wise: " << force_row_wise << "]\n";
@@ -722,6 +724,10 @@ std::string Config::SaveMembersToString() const {
str_buf << "[interaction_constraints: " << interaction_constraints << "]\n";
str_buf << "[verbosity: " << verbosity << "]\n";
str_buf << "[saved_feature_importance_type: " << saved_feature_importance_type << "]\n";
str_buf << "[use_quantized_grad: " << use_quantized_grad << "]\n";
str_buf << "[num_grad_quant_bins: " << num_grad_quant_bins << "]\n";
str_buf << "[quant_train_renew_leaf: " << quant_train_renew_leaf << "]\n";
str_buf << "[stochastic_rounding: " << stochastic_rounding << "]\n";
str_buf << "[linear_tree: " << linear_tree << "]\n";
str_buf << "[max_bin: " << max_bin << "]\n";
str_buf << "[max_bin_by_feature: " << Common::Join(max_bin_by_feature, ",") << "]\n";
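Together with the include/LightGBM/config.h changes above, these additions mean data_sample_strategy, seed, and the quantized-training parameters are now written by Config::SaveMembersToString() and so survive a save/load round trip. A quick check from R, reusing the helpers shown earlier (a sketch; expected values assume the model trained in the test above):

model_str <- bst$save_model_to_string()
params_in_file <- .params_from_model_string(model_str = model_str)
.expect_in(
  c("[data_sample_strategy: bagging]", "[use_quantized_grad: 0]", "[num_grad_quant_bins: 4]")
  , params_in_file
)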