-
Notifications
You must be signed in to change notification settings - Fork 1
/
41_train_QRF_LLO_KFCV_optimal_model.R
269 lines (223 loc) · 10.9 KB
/
41_train_QRF_LLO_KFCV_optimal_model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
#------------------------------------------------------------------------------
# Name: 41_train_QRF_LLO_KFCV_optimal_model.R
# (QRF = quantile regression forest, LLO = leave-location-out,
# KFCV = k-fold cross-validation)
#
# Content: - read in regression matrix and tuning grid of hyperparameters
# specific to chosen target soil property (response/dependent var)
# - fit QRF model using a cross-validation (CV) grouped by location of
# the training data (requires "caret", "CAST" and "quantregForest" or
# "ranger" pkg ("ranger" is preferable because much faster)) and
# optimal hyperparameters
# - save trained/fitted/calibrated model to disk
#
# Refs: - QRF package and vignettes:
# https://cran.r-project.org/web/packages/quantregForest/quantregForest.pdf
# https://mran.microsoft.com/snapshot/2015-07-15/web/packages/quantregForest/vignettes/quantregForest.pdf
# - Cross validation strategies for spatio-temporal data using "CAST" pkg:
# https://cran.r-project.org/web/packages/CAST/vignettes/CAST-intro.html
# https://cran.r-project.org/web/packages/CAST/vignettes/AOA-tutorial.html
# - Model tuning using "caret" pkg and RF hyperparameters:
# https://topepo.github.io/caret/model-training-and-tuning.html
# https://bradleyboehmke.github.io/HOML/random-forest.html#hyperparameters
#
# Inputs: - regression matrix of chosen target soil property:
# "out/data/model/tbl_regmat_[TARGET].Rds"
# - tuning grid of hyperparameters specific to chosen target soil property:
# "out/data/model/tbl_QRF_hypertuning_[TARGET].Rds"
#
# Output: - saved QRF model fit: into "out/data/model/" directory
#
# Runtime: - approx. 2 hr entire script
# (for 16K obs, 195 p, 48 cores, 10-fold CV, 500 trees)
#
# Project: BIS+
# Author: Anatol Helfenstein
# Updated: April 2021
#------------------------------------------------------------------------------
### empty memory and workspace; load required packages ----------------------
gc()
rm(list=ls())
pkgs <- c("tidyverse", "ranger", "caret", "CAST")
lapply(pkgs, library, character.only = TRUE)
# QRF can be calibrated using "ranger" or "quantregForest" (original Meinhausen pkg),
# but "ranger" implementation much faster because uses C++ and better use of cores
### Prepare modelling data --------------------------------------------------
# 1) Specify target soil property:
TARGET = "pH_KCl"
# 2) Read in regression matrix specific to target soil property
tbl_regmat_target <- read_rds(paste0("out/data/model/tbl_regmat_",
TARGET, ".Rds"))
# 3) Read in tuning grid of hyperparameters specific to target soil property
tbl_tuning_extensive <- read_rds(paste0("out/data/model/tbl_RF_hypertuning_",
TARGET, ".Rds"))
# 4) Specify number of cores/threads to use for parallel computation
THREADS = parallel::detectCores()
# use full computational power of SGL server
### Train optimal QRF model ----------------------------------------------------
# dataframe of covariate/predictors values at training locations
df_cov_train <- tbl_regmat_target %>%
filter(split %in% "train") %>%
dplyr::select(-(split:hor), -all_of(TARGET)) %>%
as.data.frame() # "setting row names on tibble is deprecated"
# vector of response variable values (target soil property) at training locations
v_response <- tbl_regmat_target %>%
filter(split %in% "train") %>%
pull(all_of(TARGET))
# To prevent overfitting the model and improving separate & independent validation
# results later on, we make use of the "CAST" pkg of Hanna Meyer, embedded in a
# "caret" framework. This allows machine learning (ML) for space-time data,
# for which models should be fit differently to non-spatio-temporal data...
# set seed to control randomness of cross-validation (CV)
set.seed(2021)
# Use 10-fold CV grouped by location (site_id) using CAST::CreatSpacetimeFolds()
indices <- CreateSpacetimeFolds(x = tbl_regmat_target %>%
filter(split %in% "train"),
spacevar = "site_id",
k = 10)
# number of predictors/covariates used
P = ncol(df_cov_train)
# choose hyperparameters that performed best (lowest RMSE) from tuning grid
df_tuning_optimal <- expand.grid(mtry = tbl_tuning_extensive$mtry[1],
min.node.size = tbl_tuning_extensive$min.node.size[1],
splitrule = tbl_tuning_extensive$splitrule[1])
# Control randomness of sampling in each fold in multi-core environment
set.seed(2021)
# 10 folds + final model
seeds <- vector(mode = "list", length = length(indices$index) + 1)
for (i in 1:length(indices$index)) {
seeds[[i]] <- sample.int(1000, 1)
# only 1 seed per fold because only 1 model (optimal); see ?trainControl seeds
}
# For the final model:
seeds[[11]] <- sample.int(1000, 1)
# set nuances of model training:
# By default, train chooses tuning parameter with best performance (e.g. lowest RMSE)
# "one standard error" rule of Breiman et al. (1984) suggests that tuning
# parameter associated with the best performance may overfit and so its better
# to choose simplest model within one standard error (SE) of the empirically
# optimal model; useful for simple trees that easily overfit but little or no
# effect for RF...
# trainControl for optimal final model
train_control_optimal <- trainControl(
method = "cv",
returnResamp = "all", # save all hold-out predictions for each resample
savePredictions = "all",
selectionFunction = "best", # or "oneSE"
index = indices$index,
seeds = seeds, # 1 seed for each fold + 1 for final model
allowParallel = THREADS
)
# indexFinal = optional vector of integers indicating which samples are used to
# fit the final model after resampling. If NULL, then entire data set is used.
# set seed to control randomness of model training/fitting
set.seed(2021)
# There are rare cases where the underlying model function does not control the
# random number seed, especially if the computations are conducted in C code!
# Could this be the case for RF using ranger (written in C)?
# train model using caret::train() & "ranger" implementation
system.time(
qrf_fit_optimal <- train(
x = df_cov_train,
y = v_response,
method = "ranger",
metric = "RMSE",
num.trees = tbl_tuning_extensive$num.trees[1],
importance = "permutation", # Not possible to retrieve both importance measures (?)
replace = tbl_tuning_extensive$replace[1],
sample.fraction = tbl_tuning_extensive$sample.fraction[1],
local.importance = TRUE, # Var imp for each predictor & each sample
keep.inbag = TRUE,
quantreg = TRUE, # this changes it to QRF instead of RF
oob.error = TRUE,
num.threads = THREADS, # doesn't seem to work (no status bar)
verbose = TRUE,
# seed = 2021,
tuneGrid = df_tuning_optimal,
trControl = train_control_optimal
)
)
# time elapse "optimal" model on 48 cores: 50 min
# time elapse "optimal" model (ntrees = 250) on 48 cores: 12 min
# time elapse "optimal" model (ntrees = 250, returnResamp TRUE) on 48 cores: 40 min
# summary of calibrated model (model fit)
qrf_fit_optimal
# in case of varying hyperparameters:
# plot(qrf_fit)
# output above are averages of RMSE, Rsquared and MAE over all folds
qrf_fit_optimal$resample
# make sure there are no NAs
qrf_fit_optimal$finalModel$predictions %>%
is.na() %>%
unique() # should be FALSE
# save fitted QRF model
write_rds(qrf_fit_optimal, paste0('out/data/model/QRF_fit_', TARGET,
'_obs', length(qrf_fit_optimal$finalModel$predictions),
'_p', P,
'_LLO_', length(qrf_fit_optimal$control$index),
'FCV_optimal.Rds'))
### Work-around to also save each fold of CV as separate QRF model -------------
# Since we also want the cross-validated predictions for all quantiles, we
# achieve that here by fitting a QRF 10 times using the sampling indices of one
# fold in 10-fold CV; i.e. 1st model is a single training/test split using the
# first fold indices, 2nd model is a single training/test split using second
# fold indices, etc. etc.
# create empty list ready to fill with each model object (10x)
ls_models <- vector(mode = "list", length = length(indices$index))
# prepare and fit each model in loop
system.time(
for(i in 1:length(indices$index)) {
# Control randomness of sampling in single training/test split in multi-core env
set.seed(2021)
# trainControl for 1 training/test split using indices from 1 fold
train_control_single_split <- trainControl(
method = "LGOCV",
number = 1, # 1 training/test split
selectionFunction = "best", # or "oneSE"
index = indices$index[i],
indexOut = indices$indexOut[i],
# fit final model using only training samples of fold i
indexFinal = indices$index[[i]],
# 1 seed for each fold + 1 for final model
seeds = c(seeds[i], seeds[11]),
allowParallel = THREADS
)
# indexFinal = optional vector of integers indicating which samples are used to
# fit the final model after resampling. If NULL, then entire data set is used.
# set seed to control randomness of model training/fitting
set.seed(2021)
# There are rare cases where the underlying model function does not control the
# random number seed, especially if the computations are conducted in C code!
# Could this be the case for RF using ranger (written in C)?
# train model using caret::train() & "ranger" implementation
qrf_fit_single_split <- train(
x = df_cov_train,
y = v_response,
method = "ranger",
metric = "RMSE",
num.trees = tbl_tuning_extensive$num.trees[1],
importance = "permutation", # Not possible to retrieve both importance measures (?)
replace = tbl_tuning_extensive$replace[1],
sample.fraction = tbl_tuning_extensive$sample.fraction[1],
local.importance = TRUE, # Var imp for each predictor & each sample
keep.inbag = TRUE,
quantreg = TRUE, # this changes it to QRF instead of RF
oob.error = TRUE,
num.threads = THREADS, # doesn't seem to work (no status bar)
verbose = TRUE,
# seed = 2021,
tuneGrid = df_tuning_optimal,
trControl = train_control_single_split
)
# time elapse 1 model single split on 48 cores: 8 min
# save each model in list
ls_models[[i]] <- qrf_fit_single_split
}
)
# time elapse 10 models using 48 cores: 1.3 h (78 min)
# save all folds of training data as separate models
write_rds(ls_models, paste0('out/data/model/QRF_fit_', TARGET,
'_obs', length(qrf_fit_optimal$finalModel$predictions),
'_p', P,
'_LLO_', length(qrf_fit_optimal$control$index),
'FCV_all_folds.Rds'))