mlr-org · be-marc · Nov 30, 2024 · Jul 31, 2024 · Jul 31, 2024 · Jul 31, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -41,6 +41,7 @@ Suggests:
     mlr3learners,
     mlr3pipelines,
     rpart,
+    fastVoteR,
     testthat (>= 3.0.0)
 Config/testthat/edition: 3
 Config/testthat/parallel: true
@@ -74,6 +75,7 @@ Collate:
     'assertions.R'
     'auto_fselector.R'
     'bibentries.R'
+    'embedded_ensemble_fselect.R'
     'ensemble_fselect.R'
     'extract_inner_fselect_archives.R'
     'extract_inner_fselect_results.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -36,6 +36,7 @@ export(auto_fselector)
 export(callback_batch_fselect)
 export(clbk)
 export(clbks)
+export(embedded_ensemble_fselect)
 export(ensemble_fselect)
 export(extract_inner_fselect_archives)
 export(extract_inner_fselect_results)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,9 @@
 # mlr3fselect (development version)
 
+* Use [fastVoteR](https://github.com/bblodfon/fastVoteR) for feature ranking in `EnsembleFSResult` objects
+* Add embedded ensemble feature selection `embedded_ensemble_fselect()`
+* Refactor `ensemble_fselect()` and `EnsembleFSResult`
+
 # mlr3fselect 1.2.1
 
 * compatibility: mlr3 0.22.0

diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R
diff --git a/R/bibentries.R b/R/bibentries.R
@@ -9,7 +9,6 @@ bibentries = c(
     title       = "ecr 2.0",
     booktitle   = "Proceedings of the Genetic and Evolutionary Computation Conference Companion"
   ),
-
   bergstra_2012 = bibentry("article",
     title       = "Random Search for Hyper-Parameter Optimization",
     author      = "James Bergstra and Yoshua Bengio",
@@ -20,8 +19,7 @@ bibentries = c(
     pages       = "281--305",
     url         = "https://jmlr.csail.mit.edu/papers/v13/bergstra12a.html"
   ),
-
-  thomas2017  = bibentry("article",
+  thomas2017 = bibentry("article",
     doi       = "10.1155/2017/1421409",
     year      = "2017",
     publisher = "Hindawi Limited",
@@ -31,8 +29,7 @@ bibentries = c(
     title     = "Probing for Sparse and Fast Variable Selection with Model-Based Boosting",
     journal   = "Computational and Mathematical Methods in Medicine"
   ),
-
-  wu2007      = bibentry("article",
+  wu2007 = bibentry("article",
     doi       = "10.1198/016214506000000843",
     year      = "2007",
     month     = "3",
@@ -44,8 +41,7 @@ bibentries = c(
     title     = "Controlling Variable Selection by the Addition of Pseudovariables",
     journal   = "Journal of the American Statistical Association"
   ),
-
-  guyon2002     = bibentry("article",
+  guyon2002 = bibentry("article",
     title       = "Gene Selection for Cancer Classification using Support Vector Machines",
     volume      = "46",
     issn        = "1573-0565",
@@ -56,7 +52,6 @@ bibentries = c(
     author      = "Isabelle Guyon and Jason Weston and Stephen Barnhill and Vladimir Vapnik",
     year        = "2002"
   ),
-
   kuhn2013 = bibentry("Inbook",
     author    = "Kuhn, Max and Johnson, Kjell",
     chapter   = "Over-Fitting and Model Tuning",
@@ -67,7 +62,6 @@ bibentries = c(
     pages     = "61--92",
     isbn      = "978-1-4614-6849-3"
   ),
-
   saeys2008 = bibentry("article",
     author      = "Saeys, Yvan and Abeel, Thomas and Van De Peer, Yves",
     doi         = "10.1007/978-3-540-87481-2_21",
@@ -79,7 +73,6 @@ bibentries = c(
     volume      = "5212 LNAI",
     year        = "2008"
   ),
-
   abeel2010 = bibentry("article",
     author    = "Abeel, Thomas and Helleputte, Thibault and Van de Peer, Yves and Dupont, Pierre and Saeys, Yvan",
     doi       = "10.1093/BIOINFORMATICS/BTP630",
@@ -92,7 +85,6 @@ bibentries = c(
     volume    = "26",
     year      = "2010"
   ),
-
   pes2020 = bibentry("article",
     author    = "Pes, Barbara",
     doi       = "10.1007/s00521-019-04082-3",
@@ -106,7 +98,6 @@ bibentries = c(
     volume    = "32",
     year      = "2020"
   ),
-
   das1999 = bibentry("article",
     author    = "Das, I",
     issn      = "09344373",
@@ -118,5 +109,31 @@ bibentries = c(
     title     = "On characterizing the 'knee' of the Pareto curve based on normal-boundary intersection",
     volume    = "18",
     year      = "1999"
+  ),
+  meinshausen2010 = bibentry("article",
+    author    = "Meinshausen, Nicolai and Buhlmann, Peter",
+    doi       = "10.1111/J.1467-9868.2010.00740.X",
+    eprint    = "0809.2932",
+    issn      = "1369-7412",
+    journal   = "Journal of the Royal Statistical Society Series B: Statistical Methodology",
+    month     = "sep",
+    number    = "4",
+    pages     = "417--473",
+    publisher = "Oxford Academic",
+    title     = "Stability Selection",
+    volume    = "72",
+    year      = "2010"
+  ),
+  hedou2024 = bibentry("article",
+    author    = "Hedou, Julien and Maric, Ivana and Bellan, Gregoire and Einhaus, Jakob and Gaudilliere, Dyani K. and Ladant, Francois Xavier and Verdonk, Franck and Stelzer, Ina A. and Feyaerts, Dorien and Tsai, Amy S. and Ganio, Edward A. and Sabayev, Maximilian and Gillard, Joshua and Amar, Jonas and Cambriel, Amelie and Oskotsky, Tomiko T. and Roldan, Alennie and Golob, Jonathan L. and Sirota, Marina and Bonham, Thomas A. and Sato, Masaki and Diop, Maigane and Durand, Xavier and Angst, Martin S. and Stevenson, David K. and Aghaeepour, Nima and Montanari, Andrea and Gaudilliere, Brice", #nolint
+    doi       = "10.1038/s41587-023-02033-x",
+    issn      = "1546-1696",
+    journal   = "Nature Biotechnology", # journal name only; the year is carried by the `year` field
+    month     = "jan",
+    pages     = "1--13",
+    publisher = "Nature Publishing Group",
+    title     = "Discovery of sparse, reliable omic biomarkers with Stabl",
+    url       = "https://www.nature.com/articles/s41587-023-02033-x",
+    year      = "2024"
   )
 )
diff --git a/R/embedded_ensemble_fselect.R b/R/embedded_ensemble_fselect.R
@@ -0,0 +1,110 @@
+#' @title Embedded Ensemble Feature Selection
+#'
+#' @include CallbackBatchFSelect.R
+#'
+#' @description
+#' Ensemble feature selection using multiple learners.
+#' The ensemble feature selection method is designed to identify the most predictive features from a given dataset by leveraging multiple machine learning models and resampling techniques.
+#' Returns an [EnsembleFSResult].
+#'
+#' @details
+#' The method begins by applying an initial resampling technique specified by the user, to create **multiple subsamples** from the original dataset (train/test splits).
+#' This resampling process helps in generating diverse subsets of data for robust feature selection.
+#'
+#' For each subsample (train set) generated in the previous step, the method applies learners
+#' that support **embedded feature selection**.
+#' These learners are then scored on their ability to predict on the resampled
+#' test sets, storing the selected features during training, for each
+#' combination of subsample and learner.
+#'
+#' Results are stored in an [EnsembleFSResult].
+#'
+#' @param learners (list of [mlr3::Learner])\cr
+#'  The learners to be used for feature selection.
+#'  All learners must have the `selected_features` property, i.e. implement
+#'  embedded feature selection (e.g. regularized models).
+#' @param init_resampling ([mlr3::Resampling])\cr
+#'  The initial resampling strategy of the data, from which each train set
+#'  will be passed on to the learners and each test set will be used for
+#'  prediction.
+#'  Can only be [mlr3::ResamplingSubsampling] or [mlr3::ResamplingBootstrap].
+#' @param measure ([mlr3::Measure])\cr
+#'  The measure used to score each learner on the test sets generated by
+#'  `init_resampling`.
+#'  If `NULL` (default), the default measure of the task type is used.
+#' @param store_benchmark_result (`logical(1)`)\cr
+#'  Whether to store the benchmark result in [EnsembleFSResult] or not.
+#'
+#' @template param_task
+#'
+#' @returns an [EnsembleFSResult] object.
+#'
+#' @source
+#' `r format_bib("meinshausen2010", "hedou2024")`
+#' @export
+#' @examples
+#' \donttest{
+#'   eefsr = embedded_ensemble_fselect(
+#'     task = tsk("sonar"),
+#'     learners = lrns(c("classif.rpart", "classif.featureless")),
+#'     init_resampling = rsmp("subsampling", repeats = 5),
+#'     measure = msr("classif.ce")
+#'   )
+#'   eefsr
+#' }
+embedded_ensemble_fselect = function(
+  task,
+  learners,
+  init_resampling,
+  measure = NULL,
+  store_benchmark_result = TRUE
+  ) {
+  assert_task(task)
+  learners = as_learners(learners)
+  assert_learners(learners, task = task, properties = "selected_features")
+  assert_resampling(init_resampling)
+  # exact class match: only bootstrap and subsampling are accepted
+  assert_choice(class(init_resampling)[1], choices = c("ResamplingBootstrap", "ResamplingSubsampling"))
+  # resolve `NULL` to the default measure of the task type, as documented
+  measure = as_measure(measure, task_type = task$task_type)
+  assert_measure(measure, task = task)
+  assert_flag(store_benchmark_result)
+
+  init_resampling$instantiate(task)
+
+  design = benchmark_grid(
+    tasks = task,
+    learners = learners,
+    resamplings = init_resampling
+  )
+
+  # models are stored because the selected features are read from the trained learners
+  bmr = benchmark(design, store_models = TRUE)
+
+  # score once on the test sets and reuse the table for the trained learners
+  scores = bmr$score(measure)
+
+  # extract selected features
+  features = map(scores$learner, function(learner) {
+    learner$selected_features()
+  })
+
+  # extract n_features
+  n_features = map_int(features, length)
+
+  # wrap in list() so `features` is stored as a list column even if `scores` has a single row
+  set(scores, j = "features", value = list(features))
+  set(scores, j = "n_features", value = n_features)
+  setnames(scores, "iteration", "resampling_iteration")
+
+  # remove R6 objects and identifier columns that are not part of the result
+  drop_cols = c("learner", "task", "resampling", "prediction_test", "task_id", "nr", "resampling_id", "uhash")
+  set(scores, j = drop_cols, value = NULL)
+
+  EnsembleFSResult$new(
+    result = scores,
+    features = task$feature_names,
+    benchmark_result = if (store_benchmark_result) bmr,
+    measure = measure
+  )
+}