diff --git a/R-package/tests/testthat/helper.R b/R-package/tests/testthat/helper.R
index 9da2f9bd7167..9c928c1f71d1 100644
--- a/R-package/tests/testthat/helper.R
+++ b/R-package/tests/testthat/helper.R
@@ -29,3 +29,20 @@
 .LGB_VERBOSITY <- as.integer(
   Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1")
 )
+
+# [description]
+#     test that every element of 'x' is in 'y'
+#
+#     testthat::expect_in() is not available in the versions of {testthat}
+#     built for R 3.6, so this is here to support a similar interface on R 3.6
+.expect_in <- function(x, y) {
+  if (exists("expect_in")) {
+    expect_in(x, y)
+  } else {
+    missing_items <- x[!(x %in% y)]
+    if (length(missing_items) != 0L) {
+      error_msg <- paste0("Some expected items not found: ", toString(missing_items))
+      stop(error_msg)
+    }
+  }
+}
diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R
index 1ff038598db1..5f398f1c081d 100644
--- a/R-package/tests/testthat/test_lgb.Booster.R
+++ b/R-package/tests/testthat/test_lgb.Booster.R
@@ -799,37 +799,166 @@ test_that("all parameters are stored correctly with save_model_to_string()", {
     data = matrix(rnorm(500L), nrow = 100L)
     , label = rnorm(100L)
   )
 
-  nrounds <- 4L
   bst <- lgb.train(
     params = list(
-      objective = "regression"
-      , metric = "l2"
+      objective = "mape"
+      , metric = c("l2", "mae")
       , num_threads = .LGB_MAX_THREADS
+      , seed = 708L
+      , data_sample_strategy = "bagging"
+      , sub_row = 0.8234
     )
     , data = dtrain
-    , nrounds = nrounds
+    , nrounds = 3L
     , verbose = .LGB_VERBOSITY
   )
-  model_str <- bst$save_model_to_string()
-  params_in_file <- .params_from_model_string(model_str = model_str)
+  # entries whose values should reflect params passed to lgb.train()
+  non_default_param_entries <- c(
+    "[objective: mape]"
+    # 'l1' was passed in with alias 'mae'
+    , "[metric: l2,l1]"
+    , "[data_sample_strategy: bagging]"
+    , "[seed: 708]"
+    # this was passed in with alias 'sub_row'
+    , "[bagging_fraction: 0.8234]"
+    , "[num_iterations: 3]"
+  )
+
+  # entries with default values of params
+  default_param_entries <- c(
+    "[boosting: gbdt]"
+    , "[tree_learner: serial]"
+    , "[device_type: cpu]"
+    , "[data: ]"
+    , "[valid: ]"
+    , "[learning_rate: 0.1]"
+    , "[num_leaves: 31]"
+    , sprintf("[num_threads: %i]", .LGB_MAX_THREADS)
+    , "[deterministic: 0]"
+    , "[histogram_pool_size: -1]"
+    , "[max_depth: -1]"
+    , "[min_data_in_leaf: 20]"
+    , "[min_sum_hessian_in_leaf: 0.001]"
+    , "[pos_bagging_fraction: 1]"
+    , "[neg_bagging_fraction: 1]"
+    , "[bagging_freq: 0]"
+    , "[bagging_seed: 15415]"
+    , "[feature_fraction: 1]"
+    , "[feature_fraction_bynode: 1]"
+    , "[feature_fraction_seed: 32671]"
+    , "[extra_trees: 0]"
+    , "[extra_seed: 6642]"
+    , "[early_stopping_round: 0]"
+    , "[first_metric_only: 0]"
+    , "[max_delta_step: 0]"
+    , "[lambda_l1: 0]"
+    , "[lambda_l2: 0]"
+    , "[linear_lambda: 0]"
+    , "[min_gain_to_split: 0]"
+    , "[drop_rate: 0.1]"
+    , "[max_drop: 50]"
+    , "[skip_drop: 0.5]"
+    , "[xgboost_dart_mode: 0]"
+    , "[uniform_drop: 0]"
+    , "[drop_seed: 20623]"
+    , "[top_rate: 0.2]"
+    , "[other_rate: 0.1]"
+    , "[min_data_per_group: 100]"
+    , "[max_cat_threshold: 32]"
+    , "[cat_l2: 10]"
+    , "[cat_smooth: 10]"
+    , "[max_cat_to_onehot: 4]"
+    , "[top_k: 20]"
+    , "[monotone_constraints: ]"
+    , "[monotone_constraints_method: basic]"
+    , "[monotone_penalty: 0]"
+    , "[feature_contri: ]"
+    , "[forcedsplits_filename: ]"
+    , "[force_col_wise: 0]"
+    , "[force_row_wise: 0]"
+    , "[refit_decay_rate: 0.9]"
+    , "[cegb_tradeoff: 1]"
+    , "[cegb_penalty_split: 0]"
+    , "[cegb_penalty_feature_lazy: ]"
+    , "[cegb_penalty_feature_coupled: ]"
+    , "[path_smooth: 0]"
+    , "[interaction_constraints: ]"
+    , sprintf("[verbosity: %i]", .LGB_VERBOSITY)
+    , "[saved_feature_importance_type: 0]"
+    , "[use_quantized_grad: 0]"
+    , "[num_grad_quant_bins: 4]"
+    , "[quant_train_renew_leaf: 0]"
+    , "[stochastic_rounding: 1]"
+    , "[linear_tree: 0]"
+    , "[max_bin: 255]"
+    , "[max_bin_by_feature: ]"
+    , "[min_data_in_bin: 3]"
+    , "[bin_construct_sample_cnt: 200000]"
+    , "[data_random_seed: 2350]"
+    , "[is_enable_sparse: 1]"
+    , "[enable_bundle: 1]"
+    , "[use_missing: 1]"
+    , "[zero_as_missing: 0]"
+    , "[feature_pre_filter: 1]"
+    , "[pre_partition: 0]"
+    , "[two_round: 0]"
+    , "[header: 0]"
+    , "[label_column: ]"
+    , "[weight_column: ]"
+    , "[group_column: ]"
+    , "[ignore_column: ]"
+    , "[categorical_feature: ]"
+    , "[forcedbins_filename: ]"
+    , "[precise_float_parser: 0]"
+    , "[parser_config_file: ]"
+    , "[objective_seed: 4309]"
+    , "[num_class: 1]"
+    , "[is_unbalance: 0]"
+    , "[scale_pos_weight: 1]"
+    , "[sigmoid: 1]"
+    , "[boost_from_average: 1]"
+    , "[reg_sqrt: 0]"
+    , "[alpha: 0.9]"
+    , "[fair_c: 1]"
+    , "[poisson_max_delta_step: 0.7]"
+    , "[tweedie_variance_power: 1.5]"
+    , "[lambdarank_truncation_level: 30]"
+    , "[lambdarank_norm: 1]"
+    , "[label_gain: ]"
+    , "[lambdarank_position_bias_regularization: 0]"
+    , "[eval_at: ]"
+    , "[multi_error_top_k: 1]"
+    , "[auc_mu_weights: ]"
+    , "[num_machines: 1]"
+    , "[local_listen_port: 12400]"
+    , "[time_out: 120]"
+    , "[machine_list_filename: ]"
+    , "[machines: ]"
+    , "[gpu_platform_id: -1]"
+    , "[gpu_device_id: -1]"
+    , "[gpu_use_dp: 0]"
+    , "[num_gpu: 1]"
+  )
+  all_param_entries <- c(non_default_param_entries, default_param_entries)
 
   # parameters should match what was passed from the R package
-  expect_equal(sum(startsWith(params_in_file, "[metric:")), 1L)
-  expect_equal(sum(params_in_file == "[metric: l2]"), 1L)
-
-  expect_equal(sum(startsWith(params_in_file, "[num_iterations:")), 1L)
-  expect_equal(sum(params_in_file == "[num_iterations: 4]"), 1L)
-
-  expect_equal(sum(startsWith(params_in_file, "[objective:")), 1L)
-  expect_equal(sum(params_in_file == "[objective: regression]"), 1L)
-
-  expect_equal(sum(startsWith(params_in_file, "[verbosity:")), 1L)
-  expect_equal(sum(params_in_file == sprintf("[verbosity: %i]", .LGB_VERBOSITY)), 1L)
+  model_str <- bst$save_model_to_string()
+  params_in_file <- .params_from_model_string(model_str = model_str)
+  .expect_in(all_param_entries, params_in_file)
 
   # early stopping should be off by default
   expect_equal(sum(startsWith(params_in_file, "[early_stopping_round:")), 1L)
   expect_equal(sum(params_in_file == "[early_stopping_round: 0]"), 1L)
+
+  # since save_model_to_string() is used when serializing with saveRDS(), check that parameters all
+  # roundtrip saveRDS()/readRDS() successfully
+  rds_file <- tempfile()
+  saveRDS(bst, rds_file)
+  bst_rds <- readRDS(rds_file)
+  model_str <- bst_rds$save_model_to_string()
+  params_in_file <- .params_from_model_string(model_str = model_str)
+  .expect_in(all_param_entries, params_in_file)
 })
 
 test_that("early_stopping, num_iterations are stored correctly in model string even with aliases", {
diff --git a/helpers/parameter_generator.py b/helpers/parameter_generator.py
index 407f2c73e1e3..a554ee60b6c9 100644
--- a/helpers/parameter_generator.py
+++ b/helpers/parameter_generator.py
@@ -330,7 +330,7 @@ def gen_parameter_code(
     str_to_write += '  std::string tmp_str = "";\n'
     for x in infos:
         for y in x:
-            if "[doc-only]" in y:
+            if "[no-automatically-extract]" in y:
                 continue
             param_type = y["inner_type"][0]
             name = y["name"][0]
@@ -345,7 +345,7 @@ def gen_parameter_code(
     str_to_write += "  std::stringstream str_buf;\n"
     for x in infos:
         for y in x:
-            if "[doc-only]" in y or "[no-save]" in y:
+            if "[no-save]" in y:
                 continue
             param_type = y["inner_type"][0]
             name = y["name"][0]
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 187043cc2053..6d61bc764924 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -5,8 +5,13 @@
  * \note
  * - desc and descl2 fields must be written in reStructuredText format;
  * - nested sections can be placed only at the bottom of parent's section;
- * - [doc-only] tag indicates that only documentation for this param should be generated and all other actions are performed manually;
- * - [no-save] tag indicates that this param should not be saved into a model text representation.
+ * - [no-automatically-extract]
+ *   - do not automatically extract this parameter into a Config property with the same name in Config::GetMembersFromString(). Use if:
+ *     - specialized extraction logic for this param exists in Config::GetMembersFromString()
+ * - [no-save]
+ *   - this param should not be saved into a model text representation via Config::SaveMembersToString(). Use if:
+ *     - param is only used by the CLI (especially the "predict" and "convert_model" tasks)
+ *     - param is related to LightGBM writing files (e.g. "output_model", "save_binary")
  */
 #ifndef LIGHTGBM_CONFIG_H_
 #define LIGHTGBM_CONFIG_H_
@@ -97,15 +102,15 @@ struct Config {
   #pragma region Core Parameters
   #endif  // __NVCC__
 
+  // [no-automatically-extract]
   // [no-save]
-  // [doc-only]
   // alias = config_file
   // desc = path of config file
   // desc = **Note**: can be used only in CLI version
   std::string config = "";
 
+  // [no-automatically-extract]
   // [no-save]
-  // [doc-only]
   // type = enum
   // default = train
   // options = train, predict, convert_model, refit
@@ -118,7 +123,8 @@ struct Config {
   // desc = **Note**: can be used only in CLI version; for language-specific packages you can use the correspondent functions
   TaskType task = TaskType::kTrain;
 
-  // [doc-only]
+  // [no-automatically-extract]
+  // [no-save]
   // type = enum
   // options = regression, regression_l1, huber, fair, poisson, quantile, mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy, cross_entropy_lambda, lambdarank, rank_xendcg
   // alias = objective_type, app, application, loss
@@ -150,7 +156,8 @@ struct Config {
   // descl2 = label should be ``int`` type, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect)
   std::string objective = "regression";
 
-  // [doc-only]
+  // [no-automatically-extract]
+  // [no-save]
   // type = enum
   // alias = boosting_type, boost
   // options = gbdt, rf, dart
@@ -160,7 +167,7 @@ struct Config {
   // descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations
   std::string boosting = "gbdt";
 
-  // [doc-only]
+  // [no-automatically-extract]
   // type = enum
   // options = bagging, goss
   // desc = ``bagging``, Randomly Bagging Sampling
@@ -200,7 +207,8 @@ struct Config {
   // desc = max number of leaves in one tree
   int num_leaves = kDefaultNumLeaves;
 
-  // [doc-only]
+  // [no-automatically-extract]
+  // [no-save]
   // type = enum
   // options = serial, feature, data, voting
   // alias = tree, tree_type, tree_learner_type
@@ -222,7 +230,8 @@ struct Config {
   // desc = **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors
   int num_threads = 0;
 
-  // [doc-only]
+  // [no-automatically-extract]
+  // [no-save]
   // type = enum
   // options = cpu, gpu, cuda
   // alias = device
@@ -235,7 +244,7 @@ struct Config {
   // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support
   std::string device_type = "cpu";
 
-  // [doc-only]
+  // [no-automatically-extract]
   // alias = random_seed, random_state
   // default = None
   // desc = this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``, etc.
@@ -593,7 +602,6 @@ struct Config {
   // desc = **Note**: can be used only in CLI version
   int snapshot_freq = -1;
 
-  // [no-save]
   // desc = whether to use gradient quantization when training
   // desc = enabling this will discretize (quantize) the gradients and hessians into bins of ``num_grad_quant_bins``
   // desc = with quantized training, most arithmetics in the training process will be integer operations
@@ -602,21 +610,18 @@ struct Config {
   // desc = *New in version 4.0.0*
   bool use_quantized_grad = false;
 
-  // [no-save]
   // desc = number of bins to quantization gradients and hessians
   // desc = with more bins, the quantized training will be closer to full precision training
   // desc = **Note**: can be used only with ``device_type = cpu``
   // desc = *New in 4.0.0*
   int num_grad_quant_bins = 4;
 
-  // [no-save]
   // desc = whether to renew the leaf values with original gradients when quantized training
   // desc = renewing is very helpful for good quantized training accuracy for ranking objectives
   // desc = **Note**: can be used only with ``device_type = cpu``
   // desc = *New in 4.0.0*
   bool quant_train_renew_leaf = false;
 
-  // [no-save]
   // desc = whether to use stochastic rounding in gradient quantization
   // desc = *New in 4.0.0*
   bool stochastic_rounding = true;
@@ -976,7 +981,8 @@ struct Config {
   #pragma region Metric Parameters
   #endif  // __NVCC__
 
-  // [doc-only]
+  // [no-automatically-extract]
+  // [no-save]
   // alias = metrics, metric_types
   // default = ""
   // type = multi-enum
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index 8182c9b52b93..394614af3f33 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -664,12 +664,14 @@ void Config::GetMembersFromString(const std::unordered_map
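Usage note (illustrative sketch only, not part of the patch): this assumes helper.R above has been sourced, and the parameter entries below are made-up examples rather than output from a real model file. It shows how the new .expect_in() helper behaves both when testthat::expect_in() exists and when the R 3.6 fallback branch runs.

    # all expected entries are present, so this passes
    # (via testthat::expect_in() where available, otherwise via the manual membership check)
    params_in_file <- c("[objective: mape]", "[metric: l2,l1]", "[num_iterations: 3]")
    .expect_in(c("[objective: mape]", "[num_iterations: 3]"), params_in_file)

    # an entry that is absent would fail, e.g. with
    # "Some expected items not found: [bagging_fraction: 0.8234]" in the fallback branch
    # .expect_in("[bagging_fraction: 0.8234]", params_in_file)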