From 69433f6e743fb1703b9326c0ceff39652f306aa4 Mon Sep 17 00:00:00 2001
From: Jean Cochrane <jean.cochrane@cookcountyil.gov>
Date: Thu, 31 Oct 2024 15:17:00 +0000
Subject: [PATCH] Switch from `multisession` parallelization to `multicore` in
 `evaluate` stage

---
 pipeline/03-evaluate.R | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/pipeline/03-evaluate.R b/pipeline/03-evaluate.R
index 6828aaa..f303185 100644
--- a/pipeline/03-evaluate.R
+++ b/pipeline/03-evaluate.R
@@ -12,8 +12,17 @@ tictoc::tic("Evaluate")
 # Load libraries, helpers, and recipes from files
 purrr::walk(list.files("R/", "\\.R$", full.names = TRUE), source)
 
-# Enable parallel backend for generating stats more quickly
-plan(multisession, workers = num_threads)
+# Enable a parallel backend so that stats are generated faster.
+# We used to use the 'multisession' strategy, but it shows diminishing returns
+# (and eventually worse performance) past 5 workers on the server, and it's not
+# particularly fast either (~10 mins to complete this stage). The 'multicore'
+# strategy is more likely to hog server resources while it runs, but it is
+# much faster (~80 seconds to complete this stage), so we think it's worth
+# the risk. To avoid blocking other important tasks on the server, we only
+# use half of `num_threads` (rounded up) as workers. Note that 'multicore'
+# relies on forking, so the future package falls back to sequential
+# execution where forking is unsupported (e.g. Windows) or disabled (RStudio).
+plan(multicore, workers = ceiling(num_threads / 2))
 
 # Renaming dictionary for input columns. We want the actual value of the column
 # to become geography_id and the NAME of the column to become geography_name