mlr-org · sebffischer · Apr 23, 2024 · Jan 25, 2024 · Jan 25, 2024 · Jan 26, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -74,7 +74,7 @@ Config/testthat/edition: 3
 Config/testthat/parallel: false
 NeedsCompilation: no
 Roxygen: list(markdown = TRUE, r6 = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.2.3.9000
 Collate:
  'mlr_reflections.R'
  'BenchmarkResult.R'
@@ -90,6 +90,7 @@ Collate:
  'mlr_learners.R'
  'LearnerClassifDebug.R'
  'LearnerClassifFeatureless.R'
+ 'LearnerClassifLily.R'
  'LearnerClassifRpart.R'
  'LearnerRegr.R'
  'LearnerRegrDebug.R'
@@ -183,6 +184,7 @@ Collate:
  'helper_hashes.R'
  'helper_print.R'
  'install_pkgs.R'
+ 'marshal.R'
  'mlr_sugar.R'
  'mlr_test_helpers.R'
  'partition.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -87,6 +87,8 @@ S3method(fix_factor_levels,data.table)
 S3method(head,Task)
 S3method(is_missing_prediction_data,PredictionDataClassif)
 S3method(is_missing_prediction_data,PredictionDataRegr)
+S3method(marshal_model,classif_lily_model)
+S3method(marshal_model,default)
 S3method(partition,Task)
 S3method(partition,TaskClassif)
 S3method(partition,TaskRegr)
@@ -104,6 +106,8 @@ S3method(set_threads,default)
 S3method(set_threads,list)
 S3method(summary,Task)
 S3method(tail,Task)
+S3method(unmarshal_model,classif_lily_model_marshalled)
+S3method(unmarshal_model,default)
 export(BenchmarkResult)
 export(DataBackend)
 export(DataBackendDataTable)
@@ -113,6 +117,7 @@ export(Learner)
 export(LearnerClassif)
 export(LearnerClassifDebug)
 export(LearnerClassifFeatureless)
+export(LearnerClassifLily)
 export(LearnerClassifRpart)
 export(LearnerRegr)
 export(LearnerRegrDebug)
@@ -208,8 +213,13 @@ export(extract_pkgs)
 export(filter_prediction_data)
 export(install_pkgs)
 export(is_missing_prediction_data)
+export(learner_marshal)
+export(learner_marshalled)
+export(learner_unmarshal)
 export(lrn)
 export(lrns)
+export(marshal_model)
+export(marshalled_model)
 export(mlr_learners)
 export(mlr_measures)
 export(mlr_reflections)
@@ -227,6 +237,7 @@ export(tgen)
 export(tgens)
 export(tsk)
 export(tsks)
+export(unmarshal_model)
 import(checkmate)
 import(data.table)
 import(mlr3misc)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,9 @@
 # mlr3 (development version)
 
+* Feat: added support for `"marshal"` property, which allows learners to process
+models so they can be serialized. This happens automatically during `resample()`
+and `benchmark()`. The naming was inspired by the {marshal} package.
+
 # mlr3 0.17.2
 
 * Skip new `data.table` tests on mac.

diff --git a/R/BenchmarkResult.R b/R/BenchmarkResult.R
@@ -133,6 +133,16 @@ BenchmarkResult = R6Class("BenchmarkResult",
  invisible(self)
  },
 
+ #' @description
+ #' Marshals all stored models by delegating to the `marshal()` method of the object held in `private$.data`.
+ marshal = function() {
+ private$.data$marshal()
+ },
+ #' @description
+ #' Unmarshals all stored models by delegating to the `unmarshal()` method of the object held in `private$.data`.
+ unmarshal = function() {
+ private$.data$unmarshal()
+ },
 
  #' @description
  #' Returns a table with one row for each resampling iteration, including

diff --git a/R/HotstartStack.R b/R/HotstartStack.R
@@ -85,10 +85,13 @@ HotstartStack = R6Class("HotstartStack",
  add = function(learners) {
  learners = assert_learners(as_learners(learners))
 
- # check for models
- if (any(map_lgl(learners, function(learner) is.null(learner$state$model)))) {
- stopf("Learners must be trained before adding them to the hotstart stack.")
- }
+ walk(learners, function(learner) {
+ if (is.null(learner$model)) {
+ stopf("Learners must be trained before adding them to the hotstart stack.")
+ } else if (marshalled_model(learner$model)) {
+ stopf("Learners must be unmarshalled before adding them to the hotstart stack.")
+ }
+ })
 
  if (!is.null(self$hotstart_threshold)) {
  learners = keep(learners, function(learner) {

diff --git a/R/Learner.R b/R/Learner.R
@@ -184,7 +184,7 @@ Learner = R6Class("Learner",
  #' @param ... (ignored).
  print = function(...) {
  catn(format(self), if (is.null(self$label) || is.na(self$label)) "" else paste0(": ", self$label))
- catn(str_indent("* Model:", if (is.null(self$model)) "-" else class(self$model)[1L]))
+ catn(str_indent("* Model:", if (is.null(self$model)) "-" else if (marshalled_model(self$model)) "<marshalled>" else paste0(class(self$model)[1L])))
  catn(str_indent("* Parameters:", as_short_string(self$param_set$values, 1000L)))
  catn(str_indent("* Packages:", self$packages))
  catn(str_indent("* Predict Types: ", replace(self$predict_types, self$predict_types == self$predict_type, paste0("[", self$predict_type, "]"))))
@@ -279,6 +279,10 @@ Learner = R6Class("Learner",
  stopf("Cannot predict, Learner '%s' has not been trained yet", self$id)
  }
 
+ if (marshalled_model(self$model)) {
+ stopf("Cannot predict, Learner '%s' has not been unmarshalled yet", self$id)
+ }
+
  if (isTRUE(self$parallel_predict) && nbrOfWorkers() > 1L) {
  row_ids = row_ids %??% task$row_ids
  chunked = chunk_vector(row_ids, n_chunks = nbrOfWorkers(), shuffle = FALSE)
@@ -388,7 +392,6 @@ Learner = R6Class("Learner",
  self$state$model
  },
 
-
  #' @field timings (named `numeric(2)`)\cr
  #' Elapsed time in seconds for the steps `"train"` and `"predict"`.
  #' Measured via [mlr3misc::encapsulate()].
@@ -541,7 +544,6 @@ Learner = R6Class("Learner",
  )
 )
 
-
 #' @export
 rd_info.Learner = function(obj, ...) {
  x = c("",

diff --git a/R/LearnerClassifLily.R b/R/LearnerClassifLily.R
@@ -0,0 +1,74 @@
+#' @title Lily and Marshall
+#'
+#' @name mlr_learners_classif.lily
+#' @include LearnerClassifDebug.R
+#'
+#' @description
+#' This learner is just like [`LearnerClassifDebug`], but can be marshalled.
+#' When the `count_marshalling` parameter is `TRUE`, the model contains a `marshal_count` that is increased
+#' by 1 each time `marshal_model()` is called on it.
+#'
+#' @templateVar id classif.lily
+#' @template learner
+#'
+#' @export
+LearnerClassifLily = R6Class("LearnerClassifLily",
+ inherit = LearnerClassifDebug,
+ public = list(
+ #' @description
+ #' Creates a new instance of this [R6][R6::R6Class] class.
+ initialize = function() {
+ super$initialize()
+ self$param_set$add(ps(count_marshalling = p_lgl(tags = c("train", "required"))))
+ self$param_set$values$count_marshalling = FALSE # counting is opt-in
+ self$properties = sort(c("marshal", self$properties))
+ self$man = "mlr3::mlr_learners_classif.lily"
+ self$label = "Lily Learner"
+ self$id = "classif.lily"
+ },
+ #' @description
+ #' Marshals the learner by calling `learner_marshal()` on it.
+ marshal = function() {
+ learner_marshal(self)
+ },
+ #' @description
+ #' Unmarshals the learner by calling `learner_unmarshal()` on it.
+ unmarshal = function() {
+ learner_unmarshal(self)
+ }
+ ),
+ active = list(
+ #' @field marshalled (`logical(1)`)\cr
+ #' Whether the learner has been marshalled.
+ marshalled = function() {
+ learner_marshalled(self)
+ }
+ ),
+ private = list(
+ .train = function(task) {
+ model = super$.train(task)
+ if (self$param_set$values$count_marshalling) {
+ model$marshal_count = 0L # counter incremented by marshal_model.classif_lily_model()
+ }
+ class(model) = "classif_lily_model" # replaces the parent's model class so the lily S3 marshal method is dispatched
+ return(model)
+ }
+ )
+)
+
+#' @include mlr_learners.R
+mlr_learners$add("classif.lily", function() LearnerClassifLily$new()) # register a constructor under the key "classif.lily"
+
+#' @export
+marshal_model.classif_lily_model = function(model, ...) {
+ if (!is.null(model$marshal_count)) { # counter only exists when the learner was trained with count_marshalling = TRUE
+ model$marshal_count = model$marshal_count + 1L # integer step: the counter starts at 0L and must not be promoted to double
+ }
+ newclass = c("classif_lily_model_marshalled", "marshalled")
+ structure(list(model), class = newclass) # wrap the model as the sole list element; the unmarshal method extracts element 1
+}
+
+#' @export
+unmarshal_model.classif_lily_model_marshalled = function(model, ...) {
+ model[[1L]] # the wrapped model is the sole element of the list built by marshal_model.classif_lily_model()
+}
diff --git a/R/Measure.R b/R/Measure.R
@@ -173,6 +173,8 @@ Measure = R6Class("Measure",
  assert_measure(self, task = task, learner = learner)
  assert_prediction(prediction)
 
+ # FIXME: if self has property model check that not marshalled
+
  if ("requires_task" %in% self$properties && is.null(task)) {
  stopf("Measure '%s' requires a task", self$id)
  }
@@ -184,6 +186,9 @@ Measure = R6Class("Measure",
  if ("requires_model" %in% self$properties && (is.null(learner) || is.null(learner$model))) {
  stopf("Measure '%s' requires the trained model", self$id)
  }
+ if ("requires_model" %in% self$properties && marshalled_model(learner$model)) { # model is present but still marshalled
+ stopf("Measure '%s' requires the trained model, but model is in marshalled form", self$id)
+ }
 
  if ("requires_train_set" %in% self$properties && is.null(train_set)) {
  stopf("Measure '%s' requires the train_set", self$id)

diff --git a/R/ResampleResult.R b/R/ResampleResult.R
@@ -222,6 +222,17 @@ ResampleResult = R6Class("ResampleResult",
  #' the object in its previous state.
  discard = function(backends = FALSE, models = FALSE) {
  private$.data$discard(backends = backends, models = models)
+ },
+
+ #' @description
+ #' Marshals all stored learner models by delegating to the `marshal()` method of the object held in `private$.data`.
+ marshal = function() {
+ private$.data$marshal()
+ },
+ #' @description
+ #' Unmarshals all stored learner models by delegating to the `unmarshal()` method of the object held in `private$.data`.
+ unmarshal = function() {
+ private$.data$unmarshal()
  }
  ),
 

diff --git a/R/ResultData.R b/R/ResultData.R
@@ -242,6 +242,21 @@ ResultData = R6Class("ResultData",
  invisible(self)
  },
 
+ #' @description
+ #' Marshals all stored learner models: applies `marshal_state` to every entry of the `learner_state` column of `self$data$fact` by reference and returns the object invisibly.
+ marshal = function() {
+ learner_state = NULL # NOTE(review): presumably a dummy binding so static checks do not flag the column name used below
+ self$data$fact[, learner_state := lapply(learner_state, marshal_state)]
+ invisible(self)
+ },
+ #' @description
+ #' Unmarshals all stored learner models: applies `unmarshal_state` to every entry of the `learner_state` column of `self$data$fact` by reference and returns the object invisibly.
+ unmarshal = function() {
+ learner_state = NULL # NOTE(review): presumably a dummy binding so static checks do not flag the column name used below
+ self$data$fact[, learner_state := lapply(learner_state, unmarshal_state)]
+ invisible(self)
+ },
+
  #' @description
  #' Shrinks the object by discarding parts of the stored data.
  #'

diff --git a/R/benchmark.R b/R/benchmark.R
@@ -14,6 +14,7 @@
 #' @template param_encapsulate
 #' @template param_allow_hotstart
 #' @template param_clone
+#' @template param_unmarshal
 #'
 #' @return [BenchmarkResult].
 #'
@@ -77,7 +78,7 @@
 #' ## Get the training set of the 2nd iteration of the featureless learner on penguins
 #' rr = bmr$aggregate()[learner_id == "classif.featureless"]$resample_result[[1]]
 #' rr$resampling$train_set(2)
-benchmark = function(design, store_models = FALSE, store_backends = TRUE, encapsulate = NA_character_, allow_hotstart = FALSE, clone = c("task", "learner", "resampling")) {
+benchmark = function(design, store_models = FALSE, store_backends = TRUE, encapsulate = NA_character_, allow_hotstart = FALSE, clone = c("task", "learner", "resampling"), unmarshal = TRUE) {
  assert_subset(clone, c("task", "learner", "resampling"))
  assert_data_frame(design, min.rows = 1L)
  assert_names(names(design), must.include = c("task", "learner", "resampling"))
@@ -196,5 +197,12 @@ benchmark = function(design, store_models = FALSE, store_backends = TRUE, encaps
  lg$info("Finished benchmark")
 
  set(grid, j = "mode", value = NULL)
- BenchmarkResult$new(ResultData$new(grid, store_backends = store_backends))
+
+ result_data = ResultData$new(grid, store_backends = store_backends)
+
+ if (unmarshal && store_models) {
+ result_data$unmarshal()
+ }
+
+ BenchmarkResult$new(result_data)
 }
diff --git a/R/helper_exec.R b/R/helper_exec.R
@@ -42,6 +42,7 @@ future_map = function(n, FUN, ..., MoreArgs = list()) {
  future.apply::future_mapply(
  FUN, ..., MoreArgs = MoreArgs, SIMPLIFY = FALSE, USE.NAMES = FALSE,
  future.globals = FALSE, future.packages = "mlr3", future.seed = TRUE,
- future.scheduling = scheduling, future.chunk.size = chunk_size, future.stdout = stdout)
+ future.scheduling = scheduling, future.chunk.size = chunk_size, future.stdout = stdout
+ )
  }
 }