From 69433f6e743fb1703b9326c0ceff39652f306aa4 Mon Sep 17 00:00:00 2001
From: Jean Cochrane
Date: Thu, 31 Oct 2024 15:17:00 +0000
Subject: [PATCH] Switch from `multisession` parallelization to `multicore` in `evaluate` stage

---
 pipeline/03-evaluate.R | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/pipeline/03-evaluate.R b/pipeline/03-evaluate.R
index 6828aaa..f303185 100644
--- a/pipeline/03-evaluate.R
+++ b/pipeline/03-evaluate.R
@@ -12,8 +12,17 @@ tictoc::tic("Evaluate")
 # Load libraries, helpers, and recipes from files
 purrr::walk(list.files("R/", "\\.R$", full.names = TRUE), source)
 
-# Enable parallel backend for generating stats more quickly
-plan(multisession, workers = num_threads)
+# Enable parallel backend for generating stats faster.
+# In the past we used the 'multisession' parallelization strategy, but this
+# strategy exhibits diminishing returns (and eventually worse performance) past
+# 5 workers on the server, and it's not particularly fast either (~10 mins to
+# complete this stage). The 'multicore' strategy has a higher risk of hogging
+# server resources for the duration of execution, but it executes much faster
+# than the multisession strategy (~80 seconds to complete this stage), so
+# ultimately we think it's worth the risk; plus, we only use half the available
+# cores in order to ensure we don't block execution of other important tasks on
+# the server.
+plan(multicore, workers = ceiling(num_threads / 2))
 
 # Renaming dictionary for input columns. We want the actual value of the column
 # to become geography_id and the NAME of the column to become geography_name