diff --git a/CMakeLists.txt b/CMakeLists.txt
index 907557f1df5e..28c59c84fa8c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -202,6 +202,10 @@ endif()
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_STANDARD 17)
 
+#add_compile_options(-fsanitize=thread)
+#add_link_options(-fsanitize=thread)
+add_link_options(-Wl,-no_pie -L/opt/homebrew/lib -lprofiler -ltcmalloc)
+
 option(PARTHENON_IMPORT_KOKKOS "If ON, attempt to link to an external Kokkos library. If OFF, build Kokkos from source and package with Parthenon" OFF)
 if (NOT TARGET Kokkos::kokkos)
   if (PARTHENON_IMPORT_KOKKOS)
diff --git a/example/advection/CMakeLists.txt b/example/advection/CMakeLists.txt
index 97541db4006c..d59e0b04414f 100644
--- a/example/advection/CMakeLists.txt
+++ b/example/advection/CMakeLists.txt
@@ -22,6 +22,9 @@ if( "advection-example" IN_LIST DRIVER_LIST OR NOT PARTHENON_DISABLE_EXAMPLES)
           main.cpp
           parthenon_app_inputs.cpp
   )
+  #add_compile_options(-fsanitize=thread)
+  #add_link_options(-fsanitize=thread)
+  add_link_options(-Wl,-no_pie -L/opt/homebrew/lib -lprofiler -ltcmalloc)
   target_link_libraries(advection-example PRIVATE Parthenon::parthenon)
   lint_target(advection-example)
 endif()
diff --git a/example/advection/advection_driver.cpp b/example/advection/advection_driver.cpp
index c9e2d5a2b435..d64bde97b8ae 100644
--- a/example/advection/advection_driver.cpp
+++ b/example/advection/advection_driver.cpp
@@ -86,8 +86,8 @@ TaskCollection AdvectionDriver::MakeTaskCollection(BlockList_t &blocks, const in
 
     const auto any = parthenon::BoundaryType::any;
 
-    tl.AddTask(none, parthenon::StartReceiveBoundBufs<any>, mc1);
-    tl.AddTask(none, parthenon::StartReceiveFluxCorrections, mc0);
+    //tl.AddTask(none, parthenon::StartReceiveBoundBufs<any>, mc1);
+    //tl.AddTask(none, parthenon::StartReceiveFluxCorrections, mc0);
   }
 
   // Number of task lists that can be executed independently and thus *may*
@@ -124,13 +124,13 @@ TaskCollection AdvectionDriver::MakeTaskCollection(BlockList_t &blocks, const in
     auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
     auto &mdudt = pmesh->mesh_data.GetOrAdd("dUdt", i);
 
-    auto send_flx = tl.AddTask(none, parthenon::LoadAndSendFluxCorrections, mc0);
-    auto recv_flx = tl.AddTask(none, parthenon::ReceiveFluxCorrections, mc0);
-    auto set_flx = tl.AddTask(recv_flx, parthenon::SetFluxCorrections, mc0);
+    //auto send_flx = tl.AddTask(none, parthenon::LoadAndSendFluxCorrections, mc0);
+    //auto recv_flx = tl.AddTask(none, parthenon::ReceiveFluxCorrections, mc0);
+    //auto set_flx = tl.AddTask(recv_flx, parthenon::SetFluxCorrections, mc0);
 
     // compute the divergence of fluxes of conserved variables
     auto flux_div =
-        tl.AddTask(set_flx, FluxDivergence<MeshData<Real>>, mc0.get(), mdudt.get());
+        tl.AddTask(none, FluxDivergence<MeshData<Real>>, mc0.get(), mdudt.get());
 
     auto avg_data = tl.AddTask(flux_div, AverageIndependentData<MeshData<Real>>,
                                mc0.get(), mbase.get(), beta);
@@ -139,7 +139,7 @@ TaskCollection AdvectionDriver::MakeTaskCollection(BlockList_t &blocks, const in
                              mdudt.get(), beta * dt, mc1.get());
 
     // do boundary exchange
-    parthenon::AddBoundaryExchangeTasks(update, tl, mc1, pmesh->multilevel);
+    //parthenon::AddBoundaryExchangeTasks(update, tl, mc1, pmesh->multilevel);
   }
 
   TaskRegion &async_region2 = tc.AddRegion(num_task_lists_executed_independently);
diff --git a/example/advection/advection_package.cpp b/example/advection/advection_package.cpp
index 3008f457d8c9..ddce9effcee5 100644
--- a/example/advection/advection_package.cpp
+++ b/example/advection/advection_package.cpp
@@ -28,6 +28,7 @@
 #include "kokkos_abstraction.hpp"
 #include "reconstruct/dc_inline.hpp"
 #include "utils/error_checking.hpp"
+#include "utils/thread_pool.hpp"
 
 using namespace parthenon::package::prelude;
 
@@ -75,6 +76,7 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin) {
 
   auto fill_derived = pin->GetOrAddBoolean("Advection", "fill_derived", true);
   pkg->AddParam<>("fill_derived", fill_derived);
+  printf("fill_derived = %d\n", fill_derived);
 
   // For wavevector along coordinate axes, set desired values of ang_2/ang_3.
   //    For example, for 1D problem use ang_2 = ang_3 = 0.0
@@ -243,22 +245,23 @@ AmrTag CheckRefinement(MeshBlockData<Real> *rc) {
   for (int var = 1; var < num_vars; ++var) {
     vars.push_back("advected_" + std::to_string(var));
   }
-  // type is parthenon::VariablePack<Variable<Real>>
-  auto v = rc->PackVariables(vars);
+  auto desc = parthenon::MakePackDescriptor(pmb->resolved_packages.get(), vars);
+  auto v = desc.GetPack(rc);
 
   IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::entire);
   IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::entire);
   IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::entire);
 
   typename Kokkos::MinMax<Real>::value_type minmax;
-  pmb->par_reduce(
-      PARTHENON_AUTO_LABEL, 0, v.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+  parthenon::par_reduce(
+    parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), 
+    0, v.GetMaxNumberOfVars() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int n, const int k, const int j, const int i,
                     typename Kokkos::MinMax<Real>::value_type &lminmax) {
         lminmax.min_val =
-            (v(n, k, j, i) < lminmax.min_val ? v(n, k, j, i) : lminmax.min_val);
+            (v(0, n, k, j, i) < lminmax.min_val ? v(0, n, k, j, i) : lminmax.min_val);
         lminmax.max_val =
-            (v(n, k, j, i) > lminmax.max_val ? v(n, k, j, i) : lminmax.max_val);
+            (v(0, n, k, j, i) > lminmax.max_val ? v(0, n, k, j, i) : lminmax.max_val);
       },
       Kokkos::MinMax<Real>(minmax));
 
@@ -283,16 +286,18 @@ void PreFill(MeshBlockData<Real> *rc) {
 
     // packing in principle unnecessary/convoluted here and just done for demonstration
     std::vector<std::string> vars({"advected", "one_minus_advected"});
-    PackIndexMap imap;
-    const auto &v = rc->PackVariables(vars, imap);
+    auto desc = parthenon::MakePackDescriptor(pmb->resolved_packages.get(), vars);
+    auto v = desc.GetPack(rc);
+    auto imap = desc.GetMap();
 
-    const int in = imap.get("advected").first;
-    const int out = imap.get("one_minus_advected").first;
+    const int in = imap["advected"];
+    const int out = imap["one_minus_advected"];
     const auto num_vars = rc->Get("advected").data.GetDim(4);
-    pmb->par_for(
-        PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+    parthenon::par_for(
+        DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(),
+        0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
-          v(out + n, k, j, i) = 1.0 - v(in + n, k, j, i);
+          v(0, out + n, k, j, i) = 1.0 - v(0, in + n, k, j, i);
         });
   }
 }
@@ -307,16 +312,18 @@ void SquareIt(MeshBlockData<Real> *rc) {
 
   // packing in principle unnecessary/convoluted here and just done for demonstration
   std::vector<std::string> vars({"one_minus_advected", "one_minus_advected_sq"});
-  PackIndexMap imap;
-  const auto &v = rc->PackVariables(vars, imap);
+  auto desc = parthenon::MakePackDescriptor(pmb->resolved_packages.get(), vars);
+  auto v = desc.GetPack(rc);
+  auto imap = desc.GetMap();
 
-  const int in = imap.get("one_minus_advected").first;
-  const int out = imap.get("one_minus_advected_sq").first;
+  const int in = imap["one_minus_advected"];
+  const int out = imap["one_minus_advected_sq"];
   const auto num_vars = rc->Get("advected").data.GetDim(4);
-  pmb->par_for(
-      PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+  parthenon::par_for(
+      DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(),
+      0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
-        v(out + n, k, j, i) = v(in + n, k, j, i) * v(in + n, k, j, i);
+        v(0, out + n, k, j, i) = v(0, in + n, k, j, i) * v(0, in + n, k, j, i);
       });
 
   // The following block/logic is also just added for regression testing.
@@ -330,8 +337,9 @@ void SquareIt(MeshBlockData<Real> *rc) {
   const auto &profile = pkg->Param<std::string>("profile");
   if (profile == "smooth_gaussian") {
     const auto &advected = rc->Get("advected").data;
-    pmb->par_for(
-        PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+    parthenon::par_for(
+      DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(),
+      0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
           PARTHENON_REQUIRE(advected(n, k, j, i) != 0.0,
                             "Advected not properly initialized.");
@@ -355,19 +363,21 @@ void PostFill(MeshBlockData<Real> *rc) {
     pmb->AllocSparseID("one_minus_sqrt_one_minus_advected_sq", 37);
 
     // packing in principle unnecessary/convoluted here and just done for demonstration
-    std::vector<std::string> vars(
-        {"one_minus_advected_sq", "one_minus_sqrt_one_minus_advected_sq"});
-    PackIndexMap imap;
-    const auto &v = rc->PackVariables(vars, imap);
+    std::vector<std::pair<std::string, bool>> vars(
+        {{"one_minus_advected_sq", false}, {"one_minus_sqrt_one_minus_advected_sq*", true}});
+    auto desc = parthenon::MakePackDescriptor(pmb->resolved_packages.get(), vars);
+    auto v = desc.GetPack(rc);
+    auto imap = desc.GetMap();
 
-    const int in = imap.get("one_minus_advected_sq").first;
+    const int in = imap["one_minus_advected_sq"];
     // we can get sparse fields either by specifying base name and sparse id, or the full
     // name
-    const int out12 = imap.get("one_minus_sqrt_one_minus_advected_sq", 12).first;
-    const int out37 = imap.get("one_minus_sqrt_one_minus_advected_sq_37").first;
+    const int out12 = imap["one_minus_sqrt_one_minus_advected_sq_12"];
+    const int out37 = imap["one_minus_sqrt_one_minus_advected_sq_37"];
     const auto num_vars = rc->Get("advected").data.GetDim(4);
-    pmb->par_for(
-        PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+    parthenon::par_for(
+        DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(),
+        0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
           v(out12 + n, k, j, i) = 1.0 - sqrt(v(in + n, k, j, i));
           v(out37 + n, k, j, i) = 1.0 - v(out12 + n, k, j, i);
@@ -390,7 +400,8 @@ Real AdvectionHst(MeshData<Real> *md) {
 
   // Packing variable over MeshBlock as the function is called for MeshData, i.e., a
   // collection of blocks
-  const auto &advected_pack = md->PackVariables(std::vector<std::string>{"advected"});
+  auto desc = parthenon::MakePackDescriptor(pmb->resolved_packages.get(), std::vector<std::string>{"advected"});
+  auto advected_pack = desc.GetPack(md);
 
   Real result = 0.0;
   T reducer(result);
@@ -400,11 +411,11 @@ Real AdvectionHst(MeshData<Real> *md) {
   // weighting needs to be applied in the reduction region.
   const bool volume_weighting = std::is_same<T, Kokkos::Sum<Real, HostExecSpace>>::value;
 
-  pmb->par_reduce(
-      PARTHENON_AUTO_LABEL, 0, advected_pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s,
-      ib.e,
+  parthenon::par_reduce(
+      parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(),
+      0, advected_pack.GetNBlocks() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &lresult) {
-        const auto &coords = advected_pack.GetCoords(b);
+        const auto &coords = advected_pack.GetCoordinates(b);
         // `join` is a function of the Kokkos::ReducerConecpt that allows to use the same
         // call for different reductions
         const Real vol = volume_weighting ? coords.CellVolume(k, j, i) : 1.0;
@@ -432,8 +443,9 @@ Real EstimateTimestepBlock(MeshBlockData<Real> *rc) {
 
   // this is obviously overkill for this constant velocity problem
   Real min_dt;
-  pmb->par_reduce(
-      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+  parthenon::par_reduce(
+      parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(),
+      kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int k, const int j, const int i, Real &lmin_dt) {
         if (vx != 0.0)
           lmin_dt = std::min(lmin_dt, coords.Dxc<X1DIR>(k, j, i) / std::abs(vx));
@@ -465,22 +477,30 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
   const auto &vy = pkg->Param<Real>("vy");
   const auto &vz = pkg->Param<Real>("vz");
 
-  PackIndexMap index_map;
-  auto v = rc->PackVariablesAndFluxes(std::vector<MetadataFlag>{Metadata::WithFluxes},
-                                      index_map);
-
-  // For non constant velocity, we need the index of the velocity vector as it's part of
-  // the variable pack.
-  const auto idx_v = index_map["v"].first;
-  const auto v_const = idx_v < 0; // using "at own perill" magic number
+  auto desc = parthenon::MakePackDescriptor<parthenon::variable_names::any>(pmb->resolved_packages.get(),
+                                            std::vector<MetadataFlag>{Metadata::WithFluxes},
+                                            std::set<parthenon::PDOpt>{parthenon::PDOpt::WithFluxes});
+  auto v = desc.GetPack(rc.get());
+  //auto imap = desc.GetMap();
+  int idx_v;
+  bool v_const;
+  //if (imap.count("v") > 0) {
+    //idx_v = imap["v"];
+  //  v_const = false;
+  //} else {
+    v_const = true;
+  //}
 
   const int scratch_level = 1; // 0 is actual scratch (tiny); 1 is HBM
   const int nx1 = pmb->cellbounds.ncellsi(IndexDomain::entire);
-  const int nvar = v.GetDim(4);
+  const int nvar = v.GetMaxNumberOfVars();
   size_t scratch_size_in_bytes = parthenon::ScratchPad2D<Real>::shmem_size(nvar, nx1);
   // get x-fluxes
-  pmb->par_for_outer(
-      PARTHENON_AUTO_LABEL, 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s,
+  //std::cout << "hello from thread " << std::this_thread::get_id() << std::endl;
+  //for (int cnt=0; cnt<300; cnt++) {
+  parthenon::par_for_outer(
+      DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(),
+      2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s,
       jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
         parthenon::ScratchPad2D<Real> ql(member.team_scratch(scratch_level), nvar, nx1);
         parthenon::ScratchPad2D<Real> qr(member.team_scratch(scratch_level), nvar, nx1);
@@ -512,8 +532,9 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
 
   // get y-fluxes
   if (pmb->pmy_mesh->ndim >= 2) {
-    pmb->par_for_outer(
-        PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s,
+    parthenon::par_for_outer(
+        DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(),
+        3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s,
         jb.e + 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
           // the overall algorithm/use of scratch pad here is clear inefficient and kept
           // just for demonstrating purposes. The key point is that we cannot reuse
@@ -555,8 +576,9 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
 
   // get z-fluxes
   if (pmb->pmy_mesh->ndim == 3) {
-    pmb->par_for_outer(
-        PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1,
+    parthenon::par_for_outer(
+        DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(),
+        3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1,
         jb.s, jb.e,
         KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
           // the overall algorithm/use of scratch pad here is clear inefficient and kept
@@ -596,6 +618,8 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
           }
         });
   }
+  //}
+  //std::cout << "done on thread " << std::this_thread::get_id() << std::endl;
 
   return TaskStatus::complete;
 }
diff --git a/external/Kokkos b/external/Kokkos
index 62d2b6c879b7..486cc745cb9a 160000
--- a/external/Kokkos
+++ b/external/Kokkos
@@ -1 +1 @@
-Subproject commit 62d2b6c879b74b6ae7bd06eb3e5e80139c4708e6
+Subproject commit 486cc745cb9a287f3915061455105a3ee588c616
diff --git a/scripts/python/packages/parthenon_tools/parthenon_tools/movie2d.py b/scripts/python/packages/parthenon_tools/parthenon_tools/movie2d.py
index 363f4d00eb66..25bb914efa01 100644
--- a/scripts/python/packages/parthenon_tools/parthenon_tools/movie2d.py
+++ b/scripts/python/packages/parthenon_tools/parthenon_tools/movie2d.py
@@ -247,7 +247,8 @@ def plot_dump(
     for i in range(n_blocks):
         # Plot the actual data, should work if parthenon/output*/ghost_zones = true/false
         # but obviously no ghost data will be shown if ghost_zones = false
-        p.pcolormesh(xf[i, :], yf[i, :], q[i, :, :], vmin=qmin, vmax=qmax)
+        # print(xf[i,:].shape, yf[i,:].shape, q[i,:,:].shape)
+        p.pcolormesh(xf[i, :], yf[i, :], np.squeeze(q[i, :, :]), vmin=qmin, vmax=qmax)
 
         # Print the block gid in the center of the block
         if len(block_ids) > 0:
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4ecd634481f5..b7f06f704d9d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -222,7 +222,6 @@ add_library(parthenon
   solvers/solver_utils.hpp
 
   tasks/tasks.hpp
-  tasks/thread_pool.hpp
 
   time_integration/butcher_integrator.cpp
   time_integration/low_storage_integrator.cpp
@@ -260,6 +259,7 @@ add_library(parthenon
   utils/sort.hpp
   utils/string_utils.cpp
   utils/string_utils.hpp
+  utils/thread_pool.hpp
   utils/unique_id.cpp
   utils/unique_id.hpp
   utils/utils.hpp
@@ -321,6 +321,10 @@ if (Kokkos_ENABLE_CUDA AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
    target_compile_options(parthenon PUBLIC --expt-relaxed-constexpr)
 endif()
 
+#add_compile_options(-fsanitize=thread)
+#add_link_options(-fsanitize=thread)
+add_link_options(-Wl,-no_pie -lprofiler -ltcmalloc)
+
 target_link_libraries(parthenon PUBLIC Kokkos::kokkos Threads::Threads)
 
 if (PARTHENON_ENABLE_ASCENT)
diff --git a/src/basic_types.hpp b/src/basic_types.hpp
index 886788727d48..6d10f4dfa5b0 100644
--- a/src/basic_types.hpp
+++ b/src/basic_types.hpp
@@ -25,6 +25,8 @@
 
 namespace parthenon {
 
+inline thread_local Kokkos::Serial t_exec_space;
+
 // primitive type alias that allows code to run with either floats or doubles
 #if SINGLE_PRECISION_ENABLED
 using Real = float;
diff --git a/src/bvals/comms/bnd_info.cpp b/src/bvals/comms/bnd_info.cpp
index b4a4072758bd..fcf47dd78150 100644
--- a/src/bvals/comms/bnd_info.cpp
+++ b/src/bvals/comms/bnd_info.cpp
@@ -220,6 +220,11 @@ int GetBufferSize(MeshBlock *pmb, const NeighborBlock &nb,
          v->GetDim(4) * topo_comp;
 }
 
+void BndInfo::LockedBufferCopy(CommBuffer<buf_pool_t<Real>::owner_t> *input_buf) {
+  //std::lock_guard<std::mutex> lock(mutex);
+  buf = input_buf->buffer();
+}
+
 BndInfo BndInfo::GetSendBndInfo(MeshBlock *pmb, const NeighborBlock &nb,
                                 std::shared_ptr<Variable<Real>> v,
                                 CommBuffer<buf_pool_t<Real>::owner_t> *buf) {
@@ -229,7 +234,8 @@ BndInfo BndInfo::GetSendBndInfo(MeshBlock *pmb, const NeighborBlock &nb,
   out.alloc_status = v->GetAllocationStatus();
   if (!out.allocated) return out;
 
-  out.buf = buf->buffer();
+  out.LockedBufferCopy(buf);
+  //out.buf = buf->buffer();
 
   int Nv = v->GetDim(4);
   int Nu = v->GetDim(5);
@@ -257,7 +263,8 @@ BndInfo BndInfo::GetSetBndInfo(MeshBlock *pmb, const NeighborBlock &nb,
                                std::shared_ptr<Variable<Real>> v,
                                CommBuffer<buf_pool_t<Real>::owner_t> *buf) {
   BndInfo out;
-  out.buf = buf->buffer();
+  //out.buf = buf->buffer();
+  out.LockedBufferCopy(buf);
   auto buf_state = buf->GetState();
   if (buf_state == BufferState::received) {
     out.buf_allocated = true;
@@ -454,7 +461,8 @@ BndInfo BndInfo::GetSendCCFluxCor(MeshBlock *pmb, const NeighborBlock &nb,
     // Not going to actually do anything with this buffer
     return out;
   }
-  out.buf = buf->buffer();
+  //out.buf = buf->buffer();
+  out.LockedBufferCopy(buf);
 
   IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior);
   IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior);
@@ -514,7 +522,8 @@ BndInfo BndInfo::GetSetCCFluxCor(MeshBlock *pmb, const NeighborBlock &nb,
   }
   out.allocated = true;
   out.alloc_status = v->GetAllocationStatus();
-  out.buf = buf->buffer();
+  //out.buf = buf->buffer();
+  out.LockedBufferCopy(buf);
 
   IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior);
   IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior);
diff --git a/src/bvals/comms/bnd_info.hpp b/src/bvals/comms/bnd_info.hpp
index 4860bdc7245f..bbc74ed67f71 100644
--- a/src/bvals/comms/bnd_info.hpp
+++ b/src/bvals/comms/bnd_info.hpp
@@ -56,6 +56,9 @@ struct BndInfo {
   BndInfo() = default;
   BndInfo(const BndInfo &) = default;
 
+  inline static std::mutex mutex;
+  void LockedBufferCopy(CommBuffer<buf_pool_t<Real>::owner_t> *buf);
+
   // These are are used to generate the BndInfo struct for various
   // kinds of boundary types and operations.
   static BndInfo GetSendBndInfo(MeshBlock *pmb, const NeighborBlock &nb,
diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp
index cc093a34f080..4a84f432a16f 100644
--- a/src/bvals/comms/boundary_communication.cpp
+++ b/src/bvals/comms/boundary_communication.cpp
@@ -44,10 +44,17 @@ namespace parthenon {
 using namespace loops;
 using namespace loops::shorthands;
 
+static std::array<std::mutex, 2 * NUM_BNDRY_TYPES> mutex;
+//static std::mutex mutex;
+
 template <BoundaryType bound_type>
 TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
   PARTHENON_INSTRUMENT
 
+  int mutex_id = 2 * static_cast<int>(bound_type) + 1;
+  //mutex[mutex_id].lock();
+  //mutex.lock();
+
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(bound_type, true);
 
@@ -59,9 +66,13 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
       CheckSendBufferCacheForRebuild<bound_type, true>(md);
 
   if (nbound == 0) {
+    //mutex[mutex_id].unlock();
+    //mutex.unlock();
     return TaskStatus::complete;
   }
   if (other_communication_unfinished) {
+    //mutex[mutex_id].unlock();
+    //mutex.unlock();
     return TaskStatus::incomplete;
   }
 
@@ -77,6 +88,10 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
                                            ProResInfo::GetSend);
     }
   }
+
+  //mutex[mutex_id].unlock();
+  //mutex.unlock();
+
   // Restrict
   auto pmb = md->GetBlockData(0)->GetBlockPointer();
   StateDescriptor *resolved_packages = pmb->resolved_packages.get();
@@ -149,6 +164,7 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
     else
       buf.SendNull();
   }
+  //mutex.unlock();
 
   return TaskStatus::complete;
 }
@@ -165,11 +181,18 @@ SendBoundBufs<BoundaryType::gmg_prolongate_send>(std::shared_ptr<MeshData<Real>>
 template <BoundaryType bound_type>
 TaskStatus StartReceiveBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
   PARTHENON_INSTRUMENT
+
+  int mutex_id = 2 * static_cast<int>(bound_type);
+  //mutex[mutex_id].lock();
+  //mutex.lock();
+
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false);
   if (cache.buf_vec.size() == 0)
     InitializeBufferCache<bound_type>(md, &(pmesh->boundary_comm_map), &cache, ReceiveKey,
                                       false);
+  //mutex[mutex_id].unlock();
+  //mutex.unlock();
 
   std::for_each(std::begin(cache.buf_vec), std::end(cache.buf_vec),
                 [](auto pbuf) { pbuf->TryStartReceive(); });
@@ -192,11 +215,17 @@ template <BoundaryType bound_type>
 TaskStatus ReceiveBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
   PARTHENON_INSTRUMENT
 
+  int mutex_id = 2 * static_cast<int>(bound_type);
+  //mutex[mutex_id].lock();
+  //mutex.lock();
+
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false);
   if (cache.buf_vec.size() == 0)
     InitializeBufferCache<bound_type>(md, &(pmesh->boundary_comm_map), &cache, ReceiveKey,
                                       false);
+  //mutex[mutex_id].unlock();
+  //mutex.unlock();
 
   bool all_received = true;
   std::for_each(
@@ -240,6 +269,10 @@ template <BoundaryType bound_type>
 TaskStatus SetBounds(std::shared_ptr<MeshData<Real>> &md) {
   PARTHENON_INSTRUMENT
 
+  int mutex_id = 2 * static_cast<int>(bound_type);
+  //mutex[mutex_id].lock();
+  //mutex.lock();
+
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false);
 
@@ -257,6 +290,10 @@ TaskStatus SetBounds(std::shared_ptr<MeshData<Real>> &md) {
                                             ProResInfo::GetSet);
     }
   }
+
+  //mutex[mutex_id].unlock();
+  //mutex.unlock();
+
   // const Real threshold = Globals::sparse_config.allocation_threshold;
   auto &bnd_info = cache.bnd_info;
   Kokkos::parallel_for(
@@ -333,6 +370,10 @@ template <BoundaryType bound_type>
 TaskStatus ProlongateBounds(std::shared_ptr<MeshData<Real>> &md) {
   PARTHENON_INSTRUMENT
 
+  int mutex_id = 2 * static_cast<int>(bound_type);
+  //mutex[mutex_id].lock();
+  //mutex.lock();
+
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false);
 
@@ -350,6 +391,8 @@ TaskStatus ProlongateBounds(std::shared_ptr<MeshData<Real>> &md) {
                                             ProResInfo::GetSet);
     }
   }
+  //mutex[mutex_id].unlock();
+  //mutex.unlock();
 
   if (nbound > 0 && pmesh->multilevel) {
     auto pmb = md->GetBlockData(0)->GetBlockPointer();
diff --git a/src/bvals/comms/bvals_utils.hpp b/src/bvals/comms/bvals_utils.hpp
index bdb5e95dc4c2..a9a9b3392d15 100644
--- a/src/bvals/comms/bvals_utils.hpp
+++ b/src/bvals/comms/bvals_utils.hpp
@@ -95,9 +95,9 @@ void InitializeBufferCache(std::shared_ptr<MeshData<Real>> &md, COMM_MAP *comm_m
 
   // Or, what the hell, you could put them in random order if you want, which
   // frighteningly seems to run faster in some cases
-  std::random_device rd;
-  std::mt19937 g(rd());
-  std::shuffle(key_order.begin(), key_order.end(), g);
+  //std::random_device rd;
+  //std::mt19937 g(rd());
+  //std::shuffle(key_order.begin(), key_order.end(), g);
 
   int buff_idx = 0;
   pcache->buf_vec.clear();
diff --git a/src/bvals/comms/flux_correction.cpp b/src/bvals/comms/flux_correction.cpp
index ff76bcba0014..9240be85e5ee 100644
--- a/src/bvals/comms/flux_correction.cpp
+++ b/src/bvals/comms/flux_correction.cpp
@@ -38,9 +38,16 @@
 namespace parthenon {
 using namespace impl;
 
+static std::array<std::mutex, 2 * NUM_BNDRY_TYPES> mutex;
+//static std::mutex mutex;
+
 TaskStatus LoadAndSendFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
   PARTHENON_INSTRUMENT
 
+  int mutex_id = 2 * static_cast<int>(BoundaryType::flxcor_send) + 1;  
+  //mutex[mutex_id].lock();
+  //mutex.lock();
+
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_send, true);
   const int ndim = pmesh->ndim;
@@ -53,16 +60,22 @@ TaskStatus LoadAndSendFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
       CheckSendBufferCacheForRebuild<BoundaryType::flxcor_send, true>(md);
 
   if (nbound == 0) {
+    //mutex[mutex_id].unlock();
+    //mutex.unlock();
     return TaskStatus::complete;
   }
 
   if (other_communication_unfinished) {
+    //mutex[mutex_id].unlock();
+    //mutex.unlock();
     return TaskStatus::incomplete;
   }
 
   if (rebuild)
     RebuildBufferCache<BoundaryType::flxcor_send, true>(
         md, nbound, BndInfo::GetSendCCFluxCor, ProResInfo::GetSend);
+  //mutex[mutex_id].unlock();
+  //mutex.unlock();
 
   auto &bnd_info = cache.bnd_info;
   PARTHENON_REQUIRE(bnd_info.size() == nbound, "Need same size for boundary info");
@@ -114,16 +127,24 @@ TaskStatus LoadAndSendFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
   // Calling Send will send null if the underlying buffer is unallocated
   for (auto &buf : cache.buf_vec)
     buf->Send();
+  //mutex.unlock();
   return TaskStatus::complete;
 }
 
 TaskStatus StartReceiveFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
   PARTHENON_INSTRUMENT
+
+  int mutex_id = 2 * static_cast<int>(BoundaryType::flxcor_recv);
+  //mutex[mutex_id].lock();
+  //mutex.lock();
+
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_recv, false);
   if (cache.buf_vec.size() == 0)
     InitializeBufferCache<BoundaryType::flxcor_recv>(
         md, &(pmesh->boundary_comm_flxcor_map), &cache, ReceiveKey, false);
+  //mutex[mutex_id].unlock();
+  //mutex.unlock();
 
   std::for_each(std::begin(cache.buf_vec), std::end(cache.buf_vec),
                 [](auto pbuf) { pbuf->TryStartReceive(); });
@@ -134,11 +155,17 @@ TaskStatus StartReceiveFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
 TaskStatus ReceiveFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
   PARTHENON_INSTRUMENT
 
+  int mutex_id = 2 * static_cast<int>(BoundaryType::flxcor_recv);
+  //mutex[mutex_id].lock();
+  //mutex.lock();
+
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_recv, false);
   if (cache.buf_vec.size() == 0)
     InitializeBufferCache<BoundaryType::flxcor_recv>(
         md, &(pmesh->boundary_comm_flxcor_map), &cache, ReceiveKey, false);
+  //mutex[mutex_id].unlock();
+  //mutex.unlock();
 
   bool all_received = true;
   std::for_each(
@@ -152,6 +179,10 @@ TaskStatus ReceiveFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
 TaskStatus SetFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
   PARTHENON_INSTRUMENT
 
+  int mutex_id = 2 * static_cast<int>(BoundaryType::flxcor_recv);
+  //mutex[mutex_id].lock();
+  //mutex.lock();
+
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_recv, false);
 
@@ -160,6 +191,8 @@ TaskStatus SetFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
   if (rebuild)
     RebuildBufferCache<BoundaryType::flxcor_recv, false>(
         md, nbound, BndInfo::GetSetCCFluxCor, ProResInfo::GetSend);
+  //mutex[mutex_id].unlock();
+  //mutex.unlock();
 
   auto &bnd_info = cache.bnd_info;
   Kokkos::parallel_for(
@@ -182,6 +215,7 @@ TaskStatus SetFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
   std::for_each(std::begin(cache.buf_vec), std::end(cache.buf_vec),
                 [](auto pbuf) { pbuf->Stale(); });
 
+  //mutex.unlock();
   return TaskStatus::complete;
 }
 
diff --git a/src/driver/driver.hpp b/src/driver/driver.hpp
index 929ea19c3f2b..2fd27f5d21fd 100644
--- a/src/driver/driver.hpp
+++ b/src/driver/driver.hpp
@@ -38,7 +38,9 @@ enum class DriverStatus { complete, timeout, failed };
 class Driver {
  public:
   Driver(ParameterInput *pin, ApplicationInput *app_in, Mesh *pm)
-      : pinput(pin), app_input(app_in), pmesh(pm), mbcnt_prev(), time_LBandAMR() {}
+      : pinput(pin), app_input(app_in), pmesh(pm), mbcnt_prev(), time_LBandAMR() {
+    pool = pm->pool;
+  }
   virtual DriverStatus Execute() = 0;
   void InitializeOutputs() { pouts = std::make_unique<Outputs>(pmesh, pinput); }
 
@@ -46,14 +48,15 @@ class Driver {
   ApplicationInput *app_input;
   Mesh *pmesh;
   std::unique_ptr<Outputs> pouts;
+  std::shared_ptr<ThreadPool> pool;
   static double elapsed_main() { return timer_main.seconds(); }
   static double elapsed_cycle() { return timer_cycle.seconds(); }
   static double elapsed_LBandAMR() { return timer_LBandAMR.seconds(); }
 
  protected:
   static Kokkos::Timer timer_cycle, timer_main, timer_LBandAMR;
-  double time_LBandAMR;
   std::uint64_t mbcnt_prev;
+  double time_LBandAMR;
   virtual void PreExecute();
   virtual void PostExecute(DriverStatus status);
 
@@ -105,7 +108,7 @@ TaskListStatus ConstructAndExecuteBlockTasks(T *driver, Args... args) {
   for (auto &pmb : driver->pmesh->block_list) {
     tr[i++] = driver->MakeTaskList(pmb.get(), std::forward<Args>(args)...);
   }
-  TaskListStatus status = tc.Execute();
+  TaskListStatus status = tc.Execute(*driver->pool);
   return status;
 }
 
@@ -113,7 +116,7 @@ template <typename T, class... Args>
 TaskListStatus ConstructAndExecuteTaskLists(T *driver, Args... args) {
   TaskCollection tc =
       driver->MakeTaskCollection(driver->pmesh->block_list, std::forward<Args>(args)...);
-  TaskListStatus status = tc.Execute();
+  TaskListStatus status = tc.Execute(*driver->pool);
   return status;
 }
 
diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp
index e8250c4b58bc..c57e777fd86b 100644
--- a/src/interface/sparse_pack_base.cpp
+++ b/src/interface/sparse_pack_base.cpp
@@ -287,8 +287,9 @@ template SparsePackBase SparsePackBase::Build<MeshData<Real>>(MeshData<Real> *,
 template <class T>
 SparsePackBase &SparsePackCache::Get(T *pmd, const PackDescriptor &desc,
                                      const std::vector<bool> &include_block) {
-  if (pack_map.count(desc.identifier) > 0) {
-    auto &cache_tuple = pack_map[desc.identifier];
+  auto cache_tuple_itr = GetFromMap(desc.identifier);
+  if (false && cache_tuple_itr != pack_map.end()) {
+    auto &cache_tuple = cache_tuple_itr->second;
     auto &pack = std::get<0>(cache_tuple);
     auto alloc_status_in = SparsePackBase::GetAllocStatus(pmd, desc, include_block);
     auto &alloc_status = std::get<1>(cache_tuple);
@@ -320,8 +321,10 @@ SparsePackCache::Get<MeshBlockData<Real>>(MeshBlockData<Real> *, const PackDescr
 template <class T>
 SparsePackBase &SparsePackCache::BuildAndAdd(T *pmd, const PackDescriptor &desc,
                                              const std::vector<bool> &include_block) {
-  if (pack_map.count(desc.identifier) > 0) pack_map.erase(desc.identifier);
-  pack_map[desc.identifier] = {SparsePackBase::Build(pmd, desc, include_block),
+  auto pack = SparsePackBase::Build(pmd, desc, include_block);
+  std::lock_guard<std::mutex> lock(mutex);
+  pack_map.erase(desc.identifier);
+  pack_map[desc.identifier] = {std::move(pack),
                                SparsePackBase::GetAllocStatus(pmd, desc, include_block),
                                include_block};
   return std::get<0>(pack_map[desc.identifier]);
diff --git a/src/interface/sparse_pack_base.hpp b/src/interface/sparse_pack_base.hpp
index 26a5f3fd7b61..76673a6c2440 100644
--- a/src/interface/sparse_pack_base.hpp
+++ b/src/interface/sparse_pack_base.hpp
@@ -18,6 +18,7 @@
 #include <limits>
 #include <map>
 #include <memory>
+#include <mutex>
 #include <set>
 #include <string>
 #include <tuple>
@@ -121,6 +122,13 @@ class SparsePackCache {
                                              SparsePackBase::include_t>>
       pack_map;
 
+ private:
+  auto GetFromMap(const std::string &key) {
+    std::lock_guard<std::mutex> lock(mutex);
+    return pack_map.find(key);
+  }
+  std::mutex mutex;
+
   friend class SparsePackBase;
 };
 
diff --git a/src/interface/update.cpp b/src/interface/update.cpp
index 0e0d1195943e..69ab99d0990d 100644
--- a/src/interface/update.cpp
+++ b/src/interface/update.cpp
@@ -62,23 +62,37 @@ template <>
 TaskStatus FluxDivergence(MeshData<Real> *in_obj, MeshData<Real> *dudt_obj) {
   const IndexDomain interior = IndexDomain::interior;
 
+  auto pm = in_obj->GetMeshPointer();
   std::vector<MetadataFlag> flags({Metadata::WithFluxes, Metadata::Cell});
-  const auto &vin = in_obj->PackVariablesAndFluxes(flags);
-  auto dudt = dudt_obj->PackVariables(flags);
+  auto desc = MakePackDescriptor<variable_names::any>(pm->resolved_packages.get(),
+                                 flags, std::set<PDOpt>{PDOpt::WithFluxes});
+  auto v = desc.GetPack(in_obj);
+  auto dudt = desc.GetPack(dudt_obj);
+
   const IndexRange ib = in_obj->GetBoundsI(interior);
   const IndexRange jb = in_obj->GetBoundsJ(interior);
   const IndexRange kb = in_obj->GetBoundsK(interior);
 
-  const int ndim = vin.GetNdim();
+  const int nblocks = v.GetNBlocks();
+  const int nvars = v.GetMaxNumberOfVars();
+
+  const int ndim = pm->ndim;
   parthenon::par_for(
-      DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, vin.GetDim(5) - 1, 0,
-      vin.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, nblocks - 1, 0,
+      nvars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int m, const int l, const int k, const int j, const int i) {
-        if (dudt.IsAllocated(m, l) && vin.IsAllocated(m, l)) {
-          const auto &coords = vin.GetCoords(m);
-          const auto &v = vin(m);
-          dudt(m, l, k, j, i) = FluxDivHelper(l, k, j, i, ndim, coords, v);
+        const auto &coords = v.GetCoordinates(m);
+        Real du = (coords.FaceArea<X1DIR>(k, j, i + 1) * v.flux(m, X1DIR, l, k, j, i + 1) -
+                    coords.FaceArea<X1DIR>(k, j, i) * v.flux(m, X1DIR, l, k, j, i));
+        if (ndim >= 2) {
+          du += (coords.FaceArea<X2DIR>(k, j + 1, i) * v.flux(m, X2DIR, l, k, j + 1, i) -
+                  coords.FaceArea<X2DIR>(k, j, i) * v.flux(m, X2DIR, l, k, j, i));
+        }
+        if (ndim == 3) {
+          du += (coords.FaceArea<X3DIR>(k + 1, j, i) * v.flux(m, X3DIR, l, k + 1, j, i) -
+                  coords.FaceArea<X3DIR>(k, j, i) * v.flux(m, X3DIR, l, k, j, i));
         }
+        dudt(m, l, k, j, i) = -du / coords.CellVolume(k, j, i);
       });
   return TaskStatus::complete;
 }
diff --git a/src/interface/update.hpp b/src/interface/update.hpp
index 21f035caefa3..79f660ee6ff1 100644
--- a/src/interface/update.hpp
+++ b/src/interface/update.hpp
@@ -71,19 +71,21 @@ template <typename F, typename T>
 TaskStatus WeightedSumData(const F &flags, T *in1, T *in2, const Real w1, const Real w2,
                            T *out) {
   PARTHENON_INSTRUMENT
-  const auto &x = in1->PackVariables(flags);
-  const auto &y = in2->PackVariables(flags);
-  const auto &z = out->PackVariables(flags);
+  auto pm = in1->GetMeshPointer();
+  auto desc = MakePackDescriptor<variable_names::any>(pm->resolved_packages.get(), flags);
+  auto x = desc.GetPack(in1);
+  auto y = desc.GetPack(in2);
+  auto z = desc.GetPack(out);
+  auto num_var = x.GetMaxNumberOfVars();
+  auto num_blocks = x.GetNBlocks();
+  auto ib = in1->GetBoundsI(IndexDomain::entire);
+  auto jb = in1->GetBoundsJ(IndexDomain::entire);
+  auto kb = in1->GetBoundsK(IndexDomain::entire);
   parthenon::par_for(
-      DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, x.GetDim(5) - 1, 0,
-      x.GetDim(4) - 1, 0, x.GetDim(3) - 1, 0, x.GetDim(2) - 1, 0, x.GetDim(1) - 1,
+      DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, num_blocks - 1, 0,
+      num_var - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) {
-        // TOOD(someone) This is potentially dangerous and/or not intended behavior
-        // as we still may want to update (or populate) z if any of those vars are
-        // not allocated yet.
-        if (x.IsAllocated(b, l) && y.IsAllocated(b, l) && z.IsAllocated(b, l)) {
-          z(b, l, k, j, i) = w1 * x(b, l, k, j, i) + w2 * y(b, l, k, j, i);
-        }
+        z(b, l, k, j, i) = w1 * x(b, l, k, j, i) + w2 * y(b, l, k, j, i);
       });
   return TaskStatus::complete;
 }
diff --git a/src/interface/variable_pack.hpp b/src/interface/variable_pack.hpp
index 924d15166aaf..9fd87cfbb566 100644
--- a/src/interface/variable_pack.hpp
+++ b/src/interface/variable_pack.hpp
@@ -716,6 +716,7 @@ template <typename T>
 VariableFluxPack<T> MakeFluxPack(const VarListWithKeys<T> &var_list,
                                  const VarListWithKeys<T> &flux_var_list,
                                  PackIndexMap *pvmap) {
+  printf("Making a FluxPack\n");
   const auto &vars = var_list.vars();           // for convenience
   const auto &flux_vars = flux_var_list.vars(); // for convenience
 
@@ -777,6 +778,7 @@ VariableFluxPack<T> MakeFluxPack(const VarListWithKeys<T> &var_list,
 template <typename T>
 VariablePack<T> MakePack(const VarListWithKeys<T> &var_list, bool coarse,
                          PackIndexMap *pvmap) {
+  printf("Making a Pack\n");
   const auto &vars = var_list.vars(); // for convenience
 
   if (vars.empty()) {
diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp
index c2cc09ce879f..6a2a5e40c033 100644
--- a/src/kokkos_abstraction.hpp
+++ b/src/kokkos_abstraction.hpp
@@ -43,9 +43,12 @@ using DevExecSpace = Kokkos::Cuda;
 #else
 using DevMemSpace = Kokkos::DefaultExecutionSpace::memory_space;
 using HostMemSpace = Kokkos::HostSpace;
-using DevExecSpace = Kokkos::DefaultExecutionSpace;
+using DevExecSpace_t = Kokkos::DefaultExecutionSpace;
+inline Kokkos::Serial DevExecSpace() {
+  return t_exec_space;
+}
 #endif
-using ScratchMemSpace = DevExecSpace::scratch_memory_space;
+using ScratchMemSpace = DevExecSpace_t::scratch_memory_space;
 
 using HostExecSpace = Kokkos::DefaultHostExecutionSpace;
 using LayoutWrapper = Kokkos::LayoutRight;
@@ -214,10 +217,21 @@ inline void kokkos_dispatch(ParallelScanDispatch, Args &&...args) {
 
 } // namespace dispatch_impl
 
+class ExecSpace {
+ public:
+  explicit ExecSpace(const int npartitions) {
+    std::vector<int> weights(npartitions, 1);
+    partitions = Kokkos::Experimental::partition_space(DevExecSpace_t(), weights);
+  }
+  Kokkos::DefaultExecutionSpace& operator[](const int i) { return partitions[i]; }
+ private:
+  std::vector<Kokkos::DefaultExecutionSpace> partitions;
+};
+
 // 1D loop using RangePolicy loops
 template <typename Tag, typename Function, class... Args>
 inline typename std::enable_if<sizeof...(Args) <= 1, void>::type
-par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_space,
+par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace_t exec_space,
              const int &il, const int &iu, const Function &function, Args &&...args) {
   Tag tag;
   kokkos_dispatch(tag, name,
@@ -230,7 +244,7 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp
 // 2D loop using MDRange loops
 template <typename Tag, typename Function, class... Args>
 inline typename std::enable_if<sizeof...(Args) <= 1, void>::type
-par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space,
+par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace_t exec_space,
              const int jl, const int ju, const int il, const int iu,
              const Function &function, Args &&...args) {
   Tag tag;
@@ -245,7 +259,7 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac
 // 3D loop using Kokkos 1D Range
 template <typename Tag, typename Function, class... Args>
 inline typename std::enable_if<sizeof...(Args) <= 1, void>::type
-par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_space,
+par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace_t exec_space,
              const int kl, const int ku, const int jl, const int ju, const int il,
              const int iu, const Function &function, Args &&...args) {
   Tag tag;
@@ -271,7 +285,7 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp
 // 3D loop using MDRange loops
 template <typename Tag, typename Function, class... Args>
 inline typename std::enable_if<sizeof...(Args) <= 1, void>::type
-par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space,
+par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace_t exec_space,
              const int &kl, const int &ku, const int &jl, const int &ju, const int &il,
              const int &iu, const Function &function, Args &&...args) {
   Tag tag;
@@ -287,7 +301,7 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac
 // 3D loop using TeamPolicy with single inner TeamThreadRange
 template <typename Tag, typename Function>
 inline void par_dispatch(LoopPatternTPTTR, const std::string &name,
-                         DevExecSpace exec_space, const int &kl, const int &ku,
+                         DevExecSpace_t exec_space, const int &kl, const int &ku,
                          const int &jl, const int &ju, const int &il, const int &iu,
                          const Function &function) {
   const int Nk = ku - kl + 1;
@@ -306,7 +320,7 @@ inline void par_dispatch(LoopPatternTPTTR, const std::string &name,
 // 3D loop using TeamPolicy with single inner ThreadVectorRange
 template <typename Tag, typename Function>
 inline void par_dispatch(LoopPatternTPTVR, const std::string &name,
-                         DevExecSpace exec_space, const int &kl, const int &ku,
+                         DevExecSpace_t exec_space, const int &kl, const int &ku,
                          const int &jl, const int &ju, const int &il, const int &iu,
                          const Function &function) {
   // TODO(pgrete) if exec space is Cuda,throw error
@@ -326,7 +340,7 @@ inline void par_dispatch(LoopPatternTPTVR, const std::string &name,
 // 3D loop using TeamPolicy with nested TeamThreadRange and ThreadVectorRange
 template <typename Tag, typename Function>
 inline void par_dispatch(LoopPatternTPTTRTVR, const std::string &name,
-                         DevExecSpace exec_space, const int &kl, const int &ku,
+                         DevExecSpace_t exec_space, const int &kl, const int &ku,
                          const int &jl, const int &ju, const int &il, const int &iu,
                          const Function &function) {
   const int Nk = ku - kl + 1;
@@ -345,7 +359,7 @@ inline void par_dispatch(LoopPatternTPTTRTVR, const std::string &name,
 // 3D loop using SIMD FOR loops
 template <typename Tag, typename Function>
 inline void par_dispatch(LoopPatternSimdFor, const std::string &name,
-                         DevExecSpace exec_space, const int &kl, const int &ku,
+                         DevExecSpace_t exec_space, const int &kl, const int &ku,
                          const int &jl, const int &ju, const int &il, const int &iu,
                          const Function &function) {
   PARTHENON_INSTRUMENT_REGION(name)
@@ -359,7 +373,7 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name,
 // 4D loop using Kokkos 1D Range
 template <typename Tag, typename Function, class... Args>
 inline typename std::enable_if<sizeof...(Args) <= 1, void>::type
-par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_space,
+par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace_t exec_space,
              const int nl, const int nu, const int kl, const int ku, const int jl,
              const int ju, const int il, const int iu, const Function &function,
              Args &&...args) {
@@ -390,7 +404,7 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp
 // 4D loop using MDRange loops
 template <typename Tag, typename Function, class... Args>
 inline typename std::enable_if<sizeof...(Args) <= 1, void>::type
-par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space,
+par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace_t exec_space,
              const int nl, const int nu, const int kl, const int ku, const int jl,
              const int ju, const int il, const int iu, const Function &function,
              Args &&...args) {
@@ -407,7 +421,7 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac
 // 4D loop using TeamPolicy loop with inner TeamThreadRange
 template <typename Tag, typename Function>
 inline void par_dispatch(LoopPatternTPTTR, const std::string &name,
-                         DevExecSpace exec_space, const int nl, const int nu,
+                         DevExecSpace_t exec_space, const int nl, const int nu,
                          const int kl, const int ku, const int jl, const int ju,
                          const int il, const int iu, const Function &function) {
   const int Nn = nu - nl + 1;
@@ -431,7 +445,7 @@ inline void par_dispatch(LoopPatternTPTTR, const std::string &name,
 // 4D loop using TeamPolicy loop with inner ThreadVectorRange
 template <typename Tag, typename Function>
 inline void par_dispatch(LoopPatternTPTVR, const std::string &name,
-                         DevExecSpace exec_space, const int nl, const int nu,
+                         DevExecSpace_t exec_space, const int nl, const int nu,
                          const int kl, const int ku, const int jl, const int ju,
                          const int il, const int iu, const Function &function) {
   // TODO(pgrete) if exec space is Cuda,throw error
@@ -456,7 +470,7 @@ inline void par_dispatch(LoopPatternTPTVR, const std::string &name,
 // 4D loop using TeamPolicy with nested TeamThreadRange and ThreadVectorRange
 template <typename Tag, typename Function>
 inline void par_dispatch(LoopPatternTPTTRTVR, const std::string &name,
-                         DevExecSpace exec_space, const int nl, const int nu,
+                         DevExecSpace_t exec_space, const int nl, const int nu,
                          const int kl, const int ku, const int jl, const int ju,
                          const int il, const int iu, const Function &function) {
   const int Nn = nu - nl + 1;
@@ -478,7 +492,7 @@ inline void par_dispatch(LoopPatternTPTTRTVR, const std::string &name,
 // 4D loop using SIMD FOR loops
 template <typename Tag, typename Function>
 inline void par_dispatch(LoopPatternSimdFor, const std::string &name,
-                         DevExecSpace exec_space, const int nl, const int nu,
+                         DevExecSpace_t exec_space, const int nl, const int nu,
                          const int kl, const int ku, const int jl, const int ju,
                          const int il, const int iu, const Function &function) {
   PARTHENON_INSTRUMENT_REGION(name)
@@ -493,7 +507,7 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name,
 // 5D loop using MDRange loops
 template <typename Tag, typename Function, class... Args>
 inline typename std::enable_if<sizeof...(Args) <= 1, void>::type
-par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space,
+par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace_t exec_space,
              const int ml, const int mu, const int nl, const int nu, const int kl,
              const int ku, const int jl, const int ju, const int il, const int iu,
              const Function &function, Args &&...args) {
@@ -511,7 +525,7 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac
 // 5D loop using Kokkos 1D Range
 template <typename Tag, typename Function>
 inline void par_dispatch(LoopPatternFlatRange, const std::string &name,
-                         DevExecSpace exec_space, const int bl, const int bu,
+                         DevExecSpace_t exec_space, const int bl, const int bu,
                          const int nl, const int nu, const int kl, const int ku,
                          const int jl, const int ju, const int il, const int iu,
                          const Function &function) {
@@ -544,7 +558,7 @@ inline void par_dispatch(LoopPatternFlatRange, const std::string &name,
 // 5D loop using SIMD FOR loops
 template <typename Tag, typename Function>
 inline void par_dispatch(LoopPatternSimdFor, const std::string &name,
-                         DevExecSpace exec_space, const int bl, const int bu,
+                         DevExecSpace_t exec_space, const int bl, const int bu,
                          const int nl, const int nu, const int kl, const int ku,
                          const int jl, const int ju, const int il, const int iu,
                          const Function &function) {
@@ -561,7 +575,7 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name,
 // 6D loop using MDRange loops
 template <typename Tag, typename Function, class... Args>
 inline typename std::enable_if<sizeof...(Args) <= 1, void>::type
-par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space,
+par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace_t exec_space,
              const int ll, const int lu, const int ml, const int mu, const int nl,
              const int nu, const int kl, const int ku, const int jl, const int ju,
              const int il, const int iu, const Function &function, Args &&...args) {
@@ -579,7 +593,7 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac
 // 6D loop using Kokkos 1D Range
 template <typename Tag, typename Function>
 inline void par_dispatch(LoopPatternFlatRange, const std::string &name,
-                         DevExecSpace exec_space, const int ll, const int lu,
+                         DevExecSpace_t exec_space, const int ll, const int lu,
                          const int ml, const int mu, const int nl, const int nu,
                          const int kl, const int ku, const int jl, const int ju,
                          const int il, const int iu, const Function &function) {
@@ -616,7 +630,7 @@ inline void par_dispatch(LoopPatternFlatRange, const std::string &name,
 // 6D loop using SIMD FOR loops
 template <typename Tag, typename Function>
 inline void par_dispatch(LoopPatternSimdFor, const std::string &name,
-                         DevExecSpace exec_space, const int ll, const int lu,
+                         DevExecSpace_t exec_space, const int ll, const int lu,
                          const int ml, const int mu, const int nl, const int nu,
                          const int kl, const int ku, const int jl, const int ju,
                          const int il, const int iu, const Function &function) {
@@ -649,7 +663,7 @@ inline void par_scan(Args &&...args) {
 // 1D  outer parallel loop using Kokkos Teams
 template <typename Function>
 inline void par_for_outer(OuterLoopPatternTeams, const std::string &name,
-                          DevExecSpace exec_space, size_t scratch_size_in_bytes,
+                          DevExecSpace_t exec_space, size_t scratch_size_in_bytes,
                           const int scratch_level, const int kl, const int ku,
                           const Function &function) {
   const int Nk = ku + 1 - kl;
@@ -668,7 +682,7 @@ inline void par_for_outer(OuterLoopPatternTeams, const std::string &name,
 // 2D  outer parallel loop using Kokkos Teams
 template <typename Function>
 inline void par_for_outer(OuterLoopPatternTeams, const std::string &name,
-                          DevExecSpace exec_space, size_t scratch_size_in_bytes,
+                          DevExecSpace_t exec_space, size_t scratch_size_in_bytes,
                           const int scratch_level, const int kl, const int ku,
                           const int jl, const int ju, const Function &function) {
   const int Nk = ku + 1 - kl;
@@ -690,7 +704,7 @@ inline void par_for_outer(OuterLoopPatternTeams, const std::string &name,
 // 3D  outer parallel loop using Kokkos Teams
 template <typename Function>
 inline void par_for_outer(OuterLoopPatternTeams, const std::string &name,
-                          DevExecSpace exec_space, size_t scratch_size_in_bytes,
+                          DevExecSpace_t exec_space, size_t scratch_size_in_bytes,
                           const int scratch_level, const int nl, const int nu,
                           const int kl, const int ku, const int jl, const int ju,
                           const Function &function) {
@@ -715,6 +729,20 @@ inline void par_for_outer(OuterLoopPatternTeams, const std::string &name,
       });
 }
 
+template <typename Function>
+inline void par_for_outer_stupid(OuterLoopPatternTeams, const std::string &name,
+                          DevExecSpace_t exec_space, const int nl, const int nu,
+                          const int kl, const int ku, const int jl, const int ju,
+                          const Function &function) {
+  for (int n = nl; n <= nu; n++) {
+    for (int k = kl; k <= ku; k++) {
+      for (int j = jl; j <= ju; j++) {
+        function(n, k, j);
+      }
+    }
+  }
+}
+
 // Inner parallel loop using TeamThreadRange
 template <typename Function>
 KOKKOS_FORCEINLINE_FUNCTION void
@@ -944,7 +972,7 @@ struct DeviceDeleter {
   }
 };
 
-template <typename T, typename ES = DevExecSpace, typename MS = DevMemSpace>
+template <typename T, typename ES = DevExecSpace_t, typename MS = DevMemSpace>
 std::unique_ptr<T, DeviceDeleter<MS>> DeviceAllocate() {
   static_assert(std::is_trivially_destructible<T>::value,
                 "DeviceAllocate only supports trivially destructible classes!");
@@ -957,7 +985,7 @@ std::unique_ptr<T, DeviceDeleter<MS>> DeviceAllocate() {
   return up;
 }
 
-template <typename T, typename ES = DevExecSpace, typename MS = DevMemSpace>
+template <typename T, typename ES = DevExecSpace_t, typename MS = DevMemSpace>
 std::unique_ptr<T, DeviceDeleter<MS>> DeviceCopy(const T &host_object) {
   static_assert(std::is_trivially_destructible<T>::value,
                 "DeviceCopy only supports trivially destructible classes!");
diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 6359dbee3e89..92b07f283765 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -42,6 +42,7 @@
 #include "parthenon_arrays.hpp"
 #include "utils/buffer_utils.hpp"
 #include "utils/error_checking.hpp"
+#include "utils/thread_pool.hpp"
 
 namespace parthenon {
 
@@ -650,6 +651,12 @@ bool Mesh::GatherCostListAndCheckBalance() {
   return true;
 }
 
+void thread_safe_insert(std::unordered_set<LogicalLocation> &myset, LogicalLocation &myval) {
+  static std::mutex mutex;
+  std::lock_guard<std::mutex> lock(mutex);
+  myset.insert(myval);
+}
+
 //----------------------------------------------------------------------------------------
 // \!fn void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot)
 // \brief redistribute MeshBlocks according to the new load balance
@@ -682,8 +689,10 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
     PARTHENON_INSTRUMENT
     newloc = forest.GetMeshBlockListAndResolveGids();
     nbtotal = newloc.size();
-    for (int ib = 0; ib < nbtotal; ++ib)
-      newtoold[ib] = forest.GetOldGid(newloc[ib]);
+
+    ThreadPoolLoopBlocking(*pool, 0, nbtotal-1, [&](const int i) {
+      newtoold[i] = forest.GetOldGid(newloc[i]);
+    });
 
     // create a list mapping the previous gid to the current one
     oldtonew[0] = 0;
@@ -701,17 +710,19 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
     for (; mb_idx < nbtold; mb_idx++)
       oldtonew[mb_idx] = ntot - 1;
 
-    current_level = 0;
-    for (int n = 0; n < ntot; n++) {
+    //current_level = 0;
+    //for (int n = 0; n < ntot; n++) {
+    current_level = ThreadPoolReduce(*pool, 0, ntot-1, [&](const int n) {
       // "on" = "old n" = "old gid" = "old global MeshBlock ID"
       int on = newtoold[n];
-      if (newloc[n].level() > current_level) // set the current max level
-        current_level = newloc[n].level();
+      //if (newloc[n].level() > current_level) // set the current max level
+      //  current_level = newloc[n].level();
       if (newloc[n].level() >= loclist[on].level()) { // same or refined
         newcost[n] = costlist[on];
         // Keep a list of all blocks refined for below
         if (newloc[n].level() > loclist[on].level()) {
-          newly_refined.insert(newloc[n]);
+          thread_safe_insert(newly_refined, newloc[n]);
+          //newly_refined.insert(newloc[n]);
         }
       } else {
         double acost = 0.0;
@@ -719,7 +730,8 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
           acost += costlist[on + l];
         newcost[n] = acost / nleaf;
       }
-    }
+      return newloc[n].level();
+    }, [](int a, int b) { return std::max(a,b); }, int(0));
   } // Construct new list region
 
   // Calculate new load balance
@@ -729,30 +741,39 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   int nbe = nbs + nblist[Globals::my_rank] - 1;
 
   // Restrict fine to coarse buffers
-  ProResCache_t restriction_cache;
-  int nrestrict = 0;
-  for (int on = onbs; on <= onbe; on++) {
-    int nn = oldtonew[on];
-    auto pmb = FindMeshBlock(on);
-    if (newloc[nn].level() < loclist[on].level()) nrestrict += pmb->vars_cc_.size();
-  }
-  restriction_cache.Initialize(nrestrict, resolved_packages.get());
-  int irestrict = 0;
-  for (int on = onbs; on <= onbe; on++) {
-    int nn = oldtonew[on];
-    if (newloc[nn].level() < loclist[on].level()) {
+  auto restrict_fine_to_coarse_buffers = [&](const int t_onbs, const int t_onbe) {
+    ProResCache_t restriction_cache;
+    int nrestrict = 0;
+    for (int on = t_onbs; on <= t_onbe; on++) {
+      int nn = oldtonew[on];
       auto pmb = FindMeshBlock(on);
-      for (auto &var : pmb->vars_cc_) {
-        restriction_cache.RegisterRegionHost(
-            irestrict++, ProResInfo::GetInteriorRestrict(pmb.get(), NeighborBlock(), var),
-            var.get(), resolved_packages.get());
+      if (newloc[nn].level() < loclist[on].level()) nrestrict += pmb->vars_cc_.size();
+    }
+
+    restriction_cache.Initialize(nrestrict, resolved_packages.get());
+    int irestrict = 0;
+    for (int on = t_onbs; on <= t_onbe; on++) {
+      int nn = oldtonew[on];
+      if (newloc[nn].level() < loclist[on].level()) {
+        auto pmb = FindMeshBlock(on);
+        for (auto &var : pmb->vars_cc_) {
+          restriction_cache.RegisterRegionHost(
+              irestrict++, ProResInfo::GetInteriorRestrict(pmb.get(), NeighborBlock(), var),
+              var.get(), resolved_packages.get());
+        }
       }
     }
+    restriction_cache.CopyToDevice();
+    refinement::Restrict(resolved_packages.get(), restriction_cache,
+                         block_list[0]->cellbounds, block_list[0]->c_cellbounds);
+  };
+  std::vector<int> onstart, onstop;
+  ThreadLoopBounds(*pool, onbs, onbe, onstart, onstop);
+  for (int it = 0; it < pool->size(); it++) {
+    pool->enqueue(restrict_fine_to_coarse_buffers, onstart[it], onstop[it]);
   }
-  restriction_cache.CopyToDevice();
-  refinement::Restrict(resolved_packages.get(), restriction_cache,
-                       block_list[0]->cellbounds, block_list[0]->c_cellbounds);
-
+  //pool->run();
+  pool->wait();
   Kokkos::fence();
 
 #ifdef MPI_PARALLEL
@@ -795,6 +816,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
     RegionSize block_size = GetBlockSize();
 
     for (int n = nbs; n <= nbe; n++) {
+    //ThreadPoolLoopBlocking(*pool, nbs, nbe, [&, this](const int n) {
       int on = newtoold[n];
       if ((ranklist[on] == Globals::my_rank) &&
           (loclist[on].level() == newloc[n].level())) {
@@ -816,7 +838,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
             MeshBlock::Make(n, n - nbs, newloc[n], block_size, block_bcs, this, pin,
                             app_in, packages, resolved_packages, gflag);
       }
-    }
+    }//);
   } // AMR Construct new MeshBlockList region
 
   // Replace the MeshBlock list
diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index d8630d98dcb2..c7a77b17dae5 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -43,6 +43,7 @@
 #include "globals.hpp"
 #include "interface/state_descriptor.hpp"
 #include "interface/update.hpp"
+#include "kokkos_abstraction.hpp"
 #include "mesh/mesh.hpp"
 #include "mesh/mesh_refinement.hpp"
 #include "mesh/meshblock.hpp"
@@ -103,7 +104,8 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages,
       // private members:
       num_mesh_threads_(pin->GetOrAddInteger("parthenon/mesh", "num_threads", 1)),
       use_uniform_meshgen_fn_{true, true, true, true}, lb_flag_(true), lb_automatic_(),
-      lb_manual_(), MeshBndryFnctn{nullptr, nullptr, nullptr, nullptr, nullptr, nullptr} {
+      lb_manual_(), MeshBndryFnctn{nullptr, nullptr, nullptr, nullptr, nullptr, nullptr},
+      pool(std::make_shared<ThreadPool>(pin->GetOrAddInteger("parthenon/execution", "nthreads", 1))) {
   std::stringstream msg;
   RegionSize block_size;
   BoundaryFlag block_bcs[6];
@@ -479,7 +481,8 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, RestartReader &rr,
       // private members:
       num_mesh_threads_(pin->GetOrAddInteger("parthenon/mesh", "num_threads", 1)),
       use_uniform_meshgen_fn_{true, true, true, true}, lb_flag_(true), lb_automatic_(),
-      lb_manual_(), MeshBndryFnctn{nullptr, nullptr, nullptr, nullptr, nullptr, nullptr} {
+      lb_manual_(), MeshBndryFnctn{nullptr, nullptr, nullptr, nullptr, nullptr, nullptr},
+      pool(std::make_shared<ThreadPool>(pin->GetOrAddInteger("parthenon/execution", "nthreads", 1))) {
   std::stringstream msg;
   RegionSize block_size;
   BoundaryFlag block_bcs[6];
diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index 81a2e9673be6..2a5cf3832bec 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -54,6 +54,7 @@
 #include "utils/hash.hpp"
 #include "utils/object_pool.hpp"
 #include "utils/partition_stl_containers.hpp"
+#include "utils/thread_pool.hpp"
 
 namespace parthenon {
 
@@ -259,6 +260,9 @@ class Mesh {
     return resolved_packages->GetVariableNames(std::forward<Args>(args)...);
   }
 
+  std::shared_ptr<ThreadPool> pool;
+  void SetThreadPool(std::shared_ptr<ThreadPool> p) { pool = p; }
+
  private:
   // data
   int root_level, max_level, current_level;
diff --git a/src/mesh/meshblock.hpp b/src/mesh/meshblock.hpp
index 54e11bcff119..cef22be3f9d6 100644
--- a/src/mesh/meshblock.hpp
+++ b/src/mesh/meshblock.hpp
@@ -84,7 +84,7 @@ class MeshBlock : public std::enable_shared_from_this<MeshBlock> {
        int igflag, double icost = 1.0);
 
   // Kokkos execution space for this MeshBlock
-  DevExecSpace exec_space;
+  DevExecSpace_t exec_space;
 
   // data
   Mesh *pmy_mesh = nullptr; // ptr to Mesh containing this MeshBlock
diff --git a/src/parthenon_manager.hpp b/src/parthenon_manager.hpp
index 4dc31f169b1f..516b054fc568 100644
--- a/src/parthenon_manager.hpp
+++ b/src/parthenon_manager.hpp
@@ -24,6 +24,7 @@
 #include "driver/driver.hpp"
 #include "interface/state_descriptor.hpp"
 #include "interface/swarm.hpp"
+#include "kokkos_abstraction.hpp"
 #include "mesh/domain.hpp"
 #include "mesh/mesh.hpp"
 #include "outputs/restart.hpp"
@@ -36,7 +37,7 @@ enum class ParthenonStatus { ok, complete, error };
 
 class ParthenonManager {
  public:
-  ParthenonManager() { app_input.reset(new ApplicationInput()); }
+  ParthenonManager() { t_exec_space = DevExecSpace_t(); app_input.reset(new ApplicationInput()); }
   ParthenonStatus ParthenonInitEnv(int argc, char *argv[]);
   void ParthenonInitPackagesAndMesh();
   ParthenonStatus ParthenonFinalize();
diff --git a/src/reconstruct/dc_inline.hpp b/src/reconstruct/dc_inline.hpp
index bd8401f8ab54..c0dd180da997 100644
--- a/src/reconstruct/dc_inline.hpp
+++ b/src/reconstruct/dc_inline.hpp
@@ -33,13 +33,12 @@ template <typename T>
 KOKKOS_FORCEINLINE_FUNCTION void
 DonorCellX1(parthenon::team_mbr_t const &member, const int k, const int j, const int il,
             const int iu, const T &q, ScratchPad2D<Real> &ql, ScratchPad2D<Real> &qr) {
-  const int nu = q.GetDim(4) - 1;
+  const int nu = q.GetMaxNumberOfVars() - 1;
 
   // compute L/R states for each variable
   for (int n = 0; n <= nu; ++n) {
-    if (!q.IsAllocated(n)) continue;
     parthenon::par_for_inner(
-        member, il, iu, [&](const int i) { ql(n, i + 1) = qr(n, i) = q(n, k, j, i); });
+        member, il, iu, [&](const int i) { ql(n, i + 1) = qr(n, i) = q(0, n, k, j, i); });
   }
 }
 
@@ -50,12 +49,11 @@ template <typename T>
 KOKKOS_FORCEINLINE_FUNCTION void
 DonorCellX2(parthenon::team_mbr_t const &member, const int k, const int j, const int il,
             const int iu, const T &q, ScratchPad2D<Real> &ql, ScratchPad2D<Real> &qr) {
-  const int nu = q.GetDim(4) - 1;
+  const int nu = q.GetMaxNumberOfVars() - 1;
   // compute L/R states for each variable
   for (int n = 0; n <= nu; ++n) {
-    if (!q.IsAllocated(n)) continue;
     parthenon::par_for_inner(member, il, iu,
-                             [&](const int i) { ql(n, i) = qr(n, i) = q(n, k, j, i); });
+                             [&](const int i) { ql(n, i) = qr(n, i) = q(0, n, k, j, i); });
   }
 }
 
@@ -66,12 +64,11 @@ template <typename T>
 KOKKOS_FORCEINLINE_FUNCTION void
 DonorCellX3(parthenon::team_mbr_t const &member, const int k, const int j, const int il,
             const int iu, const T &q, ScratchPad2D<Real> &ql, ScratchPad2D<Real> &qr) {
-  const int nu = q.GetDim(4) - 1;
+  const int nu = q.GetMaxNumberOfVars() - 1;
   // compute L/R states for each variable
   for (int n = 0; n <= nu; ++n) {
-    if (!q.IsAllocated(n)) continue;
     parthenon::par_for_inner(member, il, iu,
-                             [&](const int i) { ql(n, i) = qr(n, i) = q(n, k, j, i); });
+                             [&](const int i) { ql(n, i) = qr(n, i) = q(0, n, k, j, i); });
   }
 }
 } // namespace parthenon
diff --git a/src/tasks/tasks.hpp b/src/tasks/tasks.hpp
index 62b2d75a2ec2..d2f626582f92 100644
--- a/src/tasks/tasks.hpp
+++ b/src/tasks/tasks.hpp
@@ -27,8 +27,8 @@
 #include <basic_types.hpp>
 #include <parthenon_mpi.hpp>
 
-#include "thread_pool.hpp"
 #include "utils/error_checking.hpp"
+#include "utils/thread_pool.hpp"
 
 namespace parthenon {
 
@@ -395,8 +395,8 @@ class TaskRegion {
 
   TaskListStatus Execute(ThreadPool &pool) {
     // for now, require a pool with one thread
-    PARTHENON_REQUIRE_THROWS(pool.size() == 1,
-                             "ThreadPool size != 1 is not currently supported.")
+    //PARTHENON_REQUIRE_THROWS(pool.size() == 1,
+                             //"ThreadPool size != 1 is not currently supported.")
 
     // first, if needed, finish building the graph
     if (!graph_built) BuildGraph();
@@ -420,6 +420,9 @@ class TaskRegion {
       pool.enqueue([t, &ProcessTask]() { return ProcessTask(t); });
     }
 
+    // and run it
+    //pool.run();
+
     // then wait until everything is done
     pool.wait();
 
diff --git a/src/tasks/thread_pool.hpp b/src/tasks/thread_pool.hpp
deleted file mode 100644
index b8f526750230..000000000000
--- a/src/tasks/thread_pool.hpp
+++ /dev/null
@@ -1,139 +0,0 @@
-//========================================================================================
-// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved.
-//
-// This program was produced under U.S. Government contract 89233218CNA000001 for Los
-// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
-// for the U.S. Department of Energy/National Nuclear Security Administration. All rights
-// in the program are reserved by Triad National Security, LLC, and the U.S. Department
-// of Energy/National Nuclear Security Administration. The Government is granted for
-// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide
-// license in this material to reproduce, prepare derivative works, distribute copies to
-// the public, perform publicly and display publicly, and to permit others to do so.
-//========================================================================================
-
-#ifndef TASKS_THREAD_POOL_HPP_
-#define TASKS_THREAD_POOL_HPP_
-
-#include <condition_variable>
-#include <functional>
-#include <future>
-#include <memory>
-#include <mutex>
-#include <queue>
-#include <thread>
-#include <utility>
-#include <vector>
-
-namespace parthenon {
-
-template <typename T>
-class ThreadQueue {
- public:
-  explicit ThreadQueue(const int num_workers) : nworkers(num_workers), nwaiting(0) {}
-  void push(T q) {
-    std::lock_guard<std::mutex> lock(mutex);
-    queue.push(q);
-    cv.notify_one();
-  }
-  bool pop(T &q) {
-    std::unique_lock<std::mutex> lock(mutex);
-    if (queue.empty()) {
-      nwaiting++;
-      if (waiting && nwaiting == nworkers) {
-        complete = true;
-        complete_cv.notify_all();
-      }
-      cv.wait(lock, [this]() { return exit || !queue.empty(); });
-      nwaiting--;
-      if (exit) return true;
-    }
-    q = queue.front();
-    queue.pop();
-    return false;
-  }
-  void signal_kill() {
-    std::lock_guard<std::mutex> lock(mutex);
-    std::queue<T>().swap(queue);
-    complete = true;
-    exit = true;
-    cv.notify_all();
-  }
-  void signal_exit_when_finished() {
-    std::lock_guard<std::mutex> lock(mutex);
-    exit = true;
-    complete = true;
-    cv.notify_all();
-  }
-  void wait_for_complete() {
-    std::unique_lock<std::mutex> lock(mutex);
-    waiting = true;
-    if (queue.empty() && nwaiting == nworkers) {
-      complete = false;
-      waiting = false;
-      return;
-    }
-    complete_cv.wait(lock, [this]() { return complete; });
-    complete = false;
-    waiting = false;
-  }
-
- private:
-  const int nworkers;
-  int nwaiting;
-  std::queue<T> queue;
-  std::mutex mutex;
-  std::condition_variable cv;
-  std::condition_variable complete_cv;
-  bool complete = false;
-  bool exit = false;
-  bool waiting = false;
-};
-
-class ThreadPool {
- public:
-  explicit ThreadPool(const int numthreads = std::thread::hardware_concurrency())
-      : nthreads(numthreads), queue(nthreads) {
-    for (int i = 0; i < nthreads; i++) {
-      auto worker = [&]() {
-        while (true) {
-          std::function<void()> f;
-          auto stop = queue.pop(f);
-          if (stop) break;
-          if (f) f();
-        }
-      };
-      threads.emplace_back(worker);
-    }
-  }
-  ~ThreadPool() {
-    queue.signal_exit_when_finished();
-    for (auto &t : threads) {
-      t.join();
-    }
-  }
-
-  void wait() { queue.wait_for_complete(); }
-
-  void kill() { queue.signal_kill(); }
-
-  template <typename F, class... Args>
-  std::future<typename std::result_of<F(Args...)>::type> enqueue(F &&f, Args &&...args) {
-    using return_t = typename std::result_of<F(Args...)>::type;
-    auto task = std::make_shared<std::packaged_task<return_t()>>(
-        [=, func = std::forward<F>(f)] { return func(std::forward<Args>(args)...); });
-    std::future<return_t> result = task->get_future();
-    queue.push([task]() { (*task)(); });
-    return result;
-  }
-
-  int size() const { return nthreads; }
-
- private:
-  const int nthreads;
-  std::vector<std::thread> threads;
-  ThreadQueue<std::function<void()>> queue;
-};
-
-} // namespace parthenon
-
-#endif // TASKS_THREAD_POOL_HPP_
diff --git a/src/utils/communication_buffer.hpp b/src/utils/communication_buffer.hpp
index edfc04f11c77..eee7ab563fe3 100644
--- a/src/utils/communication_buffer.hpp
+++ b/src/utils/communication_buffer.hpp
@@ -21,6 +21,7 @@
 #include <iostream>
 #include <limits>
 #include <memory>
+#include <shared_mutex>
 #include <type_traits>
 #include <unordered_map>
 #include <utility>
@@ -37,6 +38,32 @@ namespace parthenon {
 // received:     X
 enum class BufferState { stale, sending, sending_null, received, received_null };
 
+class BufferStateThreadSafe {
+ public:
+  BufferStateThreadSafe() = default;
+  BufferStateThreadSafe(BufferState bs) : state_(bs) {}
+  BufferStateThreadSafe(const BufferStateThreadSafe &bs) = delete;
+  BufferStateThreadSafe &operator =(BufferState bs) {
+    mutex.lock();
+    state_ = bs;
+    mutex.unlock();
+    return *this;
+  }
+  bool operator==(BufferState bs) {
+    return (Get() == bs);
+  }
+  BufferState Get() {
+    mutex.lock_shared();
+    auto val = state_;
+    mutex.unlock_shared();
+    return val;
+  }
+
+ private:
+  BufferState state_;
+  std::shared_mutex mutex;
+};
+
 enum class BuffCommType { sender, receiver, both, sparse_receiver };
 
 template <class T>
@@ -46,7 +73,7 @@ class CommBuffer {
   template <typename U>
   friend class CommBuffer;
 
-  std::shared_ptr<BufferState> state_;
+  std::shared_ptr<BufferStateThreadSafe> state_;
   std::shared_ptr<BuffCommType> comm_type_;
   std::shared_ptr<bool> started_irecv_;
   std::shared_ptr<int> nrecv_tries_;
@@ -65,6 +92,7 @@ class CommBuffer {
   std::function<T()> get_resource_;
 
   T buf_;
+  inline static std::mutex mutex;
 
  public:
   CommBuffer()
@@ -81,6 +109,9 @@ class CommBuffer {
 
   ~CommBuffer();
 
+  //CommBuffer(const CommBuffer<T> &in) = delete;
+  //CommBuffer &operator=(const CommBuffer<T> &in) = delete;
+
   template <class U>
   CommBuffer(const CommBuffer<U> &in);
 
@@ -95,19 +126,21 @@ class CommBuffer {
 
   void Allocate() {
     if (!active_) {
+      std::lock_guard<std::mutex> lock(mutex);
       buf_ = get_resource_();
       active_ = true;
     }
   }
 
   void Free() {
+    std::lock_guard<std::mutex> lock(mutex);
     buf_ = T();
     active_ = false;
   }
 
   bool IsActive() const { return active_; }
 
-  BufferState GetState() { return *state_; }
+  BufferState GetState() { return state_->Get(); }
 
   void Send() noexcept;
   void SendNull() noexcept;
@@ -132,7 +165,7 @@ class CommBuffer {
 template <class T>
 CommBuffer<T>::CommBuffer(int tag, int send_rank, int recv_rank, mpi_comm_t comm,
                           std::function<T()> get_resource, bool do_sparse_allocation)
-    : state_(std::make_shared<BufferState>(BufferState::stale)),
+    : state_(std::make_shared<BufferStateThreadSafe>(BufferState::stale)),
       comm_type_(std::make_shared<BuffCommType>(BuffCommType::both)),
       started_irecv_(std::make_shared<bool>(false)),
       nrecv_tries_(std::make_shared<int>(0)),
diff --git a/src/utils/sort.hpp b/src/utils/sort.hpp
index 97e9c77a88e4..c92e80865621 100644
--- a/src/utils/sort.hpp
+++ b/src/utils/sort.hpp
@@ -75,7 +75,7 @@ void sort(ParArray1D<Key> data, KeyComparator comparator, size_t min_idx,
   thrust::sort(first_d, last_d, comparator);
 #endif
 #else
-  if (std::is_same<DevExecSpace, HostExecSpace>::value) {
+  if (std::is_same<DevExecSpace_t, HostExecSpace>::value) {
     std::sort(data.data() + min_idx, data.data() + max_idx + 1, comparator);
   } else {
     PARTHENON_FAIL("sort is not supported outside of CPU or NVIDIA GPU. If you need sort "
@@ -103,7 +103,7 @@ void sort(ParArray1D<Key> data, size_t min_idx, size_t max_idx) {
   thrust::sort(first_d, last_d);
 #endif
 #else
-  if (std::is_same<DevExecSpace, HostExecSpace>::value) {
+  if (std::is_same<DevExecSpace_t, HostExecSpace>::value) {
     std::sort(data.data() + min_idx, data.data() + max_idx + 1);
   } else {
     PARTHENON_FAIL("sort is not supported outside of CPU or NVIDIA GPU. If you need sort "
diff --git a/src/utils/thread_pool.hpp b/src/utils/thread_pool.hpp
new file mode 100644
index 000000000000..6bfeddf6e611
--- /dev/null
+++ b/src/utils/thread_pool.hpp
@@ -0,0 +1,595 @@
+//========================================================================================
+// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved.
+//
+// This program was produced under U.S. Government contract 89233218CNA000001 for Los
+// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
+// for the U.S. Department of Energy/National Nuclear Security Administration. All rights
+// in the program are reserved by Triad National Security, LLC, and the U.S. Department
+// of Energy/National Nuclear Security Administration. The Government is granted for
+// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldmy_tide
+// license in this material to reproduce, prepare derivative works, distribute copies to
+// the public, perform publicly and display publicly, and to permit others to do so.
+//========================================================================================
+
+#ifndef UTILS_THREAD_POOL_HPP_
+#define UTILS_THREAD_POOL_HPP_
+
+
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <map>
+#include <mutex>
+#include <queue>
+#include <shared_mutex>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "basic_types.hpp"
+#include "kokkos_abstraction.hpp"
+
+/*
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <utility>
+#include <vector>
+*/
+
+namespace parthenon {
+
+inline thread_local int my_tid;
+
+/*
+template <typename T>
+class ConcurrentQueue {
+ public:
+  explicit ConcurrentQueue(const int nthreads)
+      : built(false), main_waiting(false), exit(false), nqueues(nthreads+1), nsleep(0),
+        mutex(nqueues), queue(nqueues) {}
+
+  void push(T q) {
+    {
+      std::lock_guard<std::mutex> lock(mutex[my_tid]);
+      queue[my_tid].push(q);
+    }
+    nqueued++;
+    if (nsleep.load() > 0) {
+      std::lock_guard<std::shared_mutex> lock(complete_mutex);
+      sleep_cv.notify_all();
+    }
+  }
+
+  void push(int i, T q) {
+    {
+      std::lock_guard<std::mutex> lock(mutex[i]);
+      queue[i].push(q);
+    }
+    nqueued++;
+    if (nsleep.load() > 0) {
+      std::lock_guard<std::shared_mutex> lock(complete_mutex);
+      sleep_cv.notify_all();
+    }
+  }
+
+  void push_lazy(T q) {
+    assert(my_tid == 0);
+    queue[0].push(q);
+    nqueued++;
+  }
+  void push_lazy(int i, T q) {
+    queue[i].push(q);
+    nqueued++;
+  }
+
+  void go() {
+    sleep_cv.notify_all();
+  }
+
+  void pop(T &q) {
+    if (mutex[my_tid].try_lock()) {
+      if (!queue[my_tid].empty()) {
+        q = queue[my_tid].front();
+        queue[my_tid].pop();
+        mutex[my_tid].unlock();
+        nqueued--;
+        return;
+      }
+      mutex[my_tid].unlock();
+    }
+    for (int i = 0; i < nqueues; i++) {
+      if (mutex[i].try_lock()) {
+        if (!queue[i].empty()) {
+          q = queue[i].front();
+          queue[i].pop();
+          mutex[i].unlock();
+          nqueued--;
+          return;
+        }
+        mutex[i].unlock();
+      }
+    }
+    return;
+  }
+
+  bool thread_sleep() {
+    nsleep++;
+    std::unique_lock<std::shared_mutex> lock(complete_mutex);
+    if (nsleep == nqueues - 1 && main_waiting && nqueued.load() == 0) complete_cv.notify_all();
+    sleep_cv.wait(lock, [this]() { return nqueued.load() > 0 || exit; });
+    nsleep--;
+    return exit && nqueued.load() == 0;
+  }
+
+  bool signal_if_complete() {
+    if (nqueued.load() > 0) return false;
+    bool should_exit = thread_sleep();
+    return should_exit;
+  }
+
+  void signal_exit_when_finished() {
+    std::unique_lock<std::shared_mutex> lock(complete_mutex);
+    exit = true;
+    sleep_cv.notify_all();
+  }
+
+  void wait_for_complete() {
+    std::unique_lock<std::shared_mutex> lock(complete_mutex);
+    main_waiting = true;
+    complete_cv.wait(lock, [this]() { return nqueued == 0; });
+  }
+
+ private:
+  bool built, main_waiting, exit;
+  const int nqueues;
+  std::atomic_int nsleep;
+  std::vector<std::mutex> mutex;
+  std::vector<std::queue<T>> queue;
+  std::map<std::thread::id, int> worker_id;
+  std::mutex built_mutex;
+  std::condition_variable built_cv;
+  std::atomic_int nqueued;
+  std::shared_mutex complete_mutex;
+  std::condition_variable_any complete_cv;
+  std::condition_variable_any sleep_cv;
+};
+
+
+class ThreadPool {
+ public:
+  explicit ThreadPool(const int numthreads = std::thread::hardware_concurrency())
+      : nthreads(numthreads), main_id(std::this_thread::get_id()), thread_idx(0), queue(nthreads), exec_space(nthreads+1) {
+    
+    auto worker = [&](const int i) {
+      my_tid = i;
+      t_exec_space = exec_space[i];
+      while (true) {
+        auto stop = queue.signal_if_complete();
+        if(stop) break;
+        std::function<void()> f;
+        queue.pop(f);
+        if (f) f();
+      }
+    };
+    my_tid = 0;
+    t_exec_space = exec_space[0];
+    threads.reserve(nthreads);
+    for (int i = 0; i < nthreads; i++) {
+      threads.emplace_back(worker, i+1);
+    }
+    //queue.BuildMap(threads);
+  }
+  ~ThreadPool() {
+    queue.signal_exit_when_finished();
+    for (auto &t : threads) {
+      t.join();
+    }
+  }
+  void wait() {
+    queue.wait_for_complete();
+  }
+  template <typename F, class... Args>
+  std::future<typename std::invoke_result<F, Args...>::type> enqueue(F &&f, Args... args) {
+    using return_t = typename std::invoke_result<F, Args...>::type;
+    auto task = std::make_shared<std::packaged_task<return_t()>>(
+        [=, func = std::forward<F>(f)] { return func(args...); });
+    std::future<return_t> result = task->get_future();
+    if (std::this_thread::get_id() == main_id) {
+        queue.push(thread_idx + 1, [task]() { (*task)(); });
+        thread_idx = (thread_idx + 1) % nthreads;
+    } else {
+        queue.push([task]() { (*task)(); });
+    }
+    return result;
+  }
+  template <typename F, class... Args>
+  std::future<typename std::invoke_result<F, Args...>::type> enqueue(const int tid, F &&f, Args... args) {
+    using return_t = typename std::invoke_result<F, Args...>::type;
+    auto task = std::make_shared<std::packaged_task<return_t()>>(
+        [=, func = std::forward<F>(f)] { return func(args...); });
+    std::future<return_t> result = task->get_future();
+    queue.push(tid, [task]() { (*task)(); });
+    return result;
+  }
+  template <typename F, class... Args>
+  std::future<typename std::invoke_result<F, Args...>::type> enqueue_lazy(F &&f, Args... args) {
+    using return_t = typename std::invoke_result<F, Args...>::type;
+    auto task = std::make_shared<std::packaged_task<return_t()>>(
+        [=, func = std::forward<F>(f)] { return func(args...); });
+    std::future<return_t> result = task->get_future();
+    if (std::this_thread::get_id() == main_id) {
+      queue.push_lazy(thread_idx + 1, [task]() { (*task)(); });
+      thread_idx = (thread_idx + 1) % nthreads;
+    } else {
+      printf("THIS IS WRONG!\n");
+    }
+    return result;
+  }
+  template <typename F, class... Args>
+  std::future<typename std::invoke_result<F, Args...>::type> enqueue_lazy(const int tid, F &&f, Args... args) {
+    using return_t = typename std::invoke_result<F, Args...>::type;
+    auto task = std::make_shared<std::packaged_task<return_t()>>(
+        [=, func = std::forward<F>(f)] { return func(args...); });
+    std::future<return_t> result = task->get_future();
+    queue.push_lazy(tid, [task]() { (*task)(); });
+    return result;
+  }
+
+  int size() const { return nthreads; }
+  void run() { queue.go(); }
+ private:
+  const int nthreads;
+  const std::thread::id main_id;
+  int thread_idx;
+  std::vector<std::thread> threads;
+  ConcurrentQueue<std::function<void()>> queue;
+  ExecSpace exec_space;
+};
+
+template <typename F>
+void ThreadPoolLoop(ThreadPool &pool, const int is, const int ie, F &&f) {
+  const int nthreads = pool.size();
+  const double niter = (ie - is + 1) / (1.0*nthreads) + 1.e-10;
+  auto looper = [func = std::forward<F>(f)](int t_is, int t_ie) {
+    for (int i = t_is; i <= t_ie; i++) func(i);
+  };
+  for (int it = 0; it < nthreads; it++) {
+    int start =  is + static_cast<int>(it*niter);
+    int end = is + static_cast<int>((it+1)*niter) - 1;
+    pool.enqueue_lazy(it+1, looper, start, end);
+  }
+  pool.run();
+}
+template <class... Args>
+void ThreadPoolLoopBlocking(ThreadPool &pool, Args &&... args) {
+  ThreadPoolLoop(pool, std::forward<Args>(args)...);
+  pool.wait();
+}
+
+template <typename F, typename Reducer, typename reduce_t>
+reduce_t ThreadPoolReduce(ThreadPool &pool, const int is, const int ie, F &&f, Reducer &&r, reduce_t init_val) {
+  const int nthreads = pool.size();
+  std::vector<std::future<reduce_t>> futures(nthreads);
+  const double niter = (ie - is + 1) / (1.0*nthreads) + 1.e-10;
+  auto looper = [=, func = std::forward<F>(f), red = std::forward<Reducer>(r)](int t_is, int t_ie) {
+    reduce_t rval = 0;
+    for (int i = t_is; i <= t_ie; i++) {
+      reduce_t val = func(i);
+      rval = red(rval, val);
+    }
+    return rval;
+  };
+  for (int it = 0; it < nthreads; it++) {
+    int start =  is + static_cast<int>(it*niter);
+    int end = is + static_cast<int>((it+1)*niter) - 1;
+    futures[it] = pool.enqueue_lazy(it + 1, looper, start, end);
+  }
+  pool.run();
+  pool.wait();
+  reduce_t reduced_val = init_val;
+  for (int it = 0; it < nthreads; it++) {
+    if (!futures[it].valid()) {
+      printf("Got an invalid future somehow!\n");
+    }
+    reduced_val = r(reduced_val, futures[it].get());
+  }
+  return reduced_val;
+}
+
+inline
+void ThreadLoopBounds(ThreadPool &pool, const int is, const int ie, std::vector<int> &start, std::vector<int> &stop) {
+  const int nthreads = pool.size();
+  start.resize(nthreads);
+  stop.resize(nthreads);
+  const double niter = (ie - is + 1) / (1.0 * nthreads) + 1.e-10;
+  for (int it = 0; it < nthreads; it++) {
+    start[it] = is + static_cast<int>(it*niter);
+    stop[it] = is + static_cast<int>((it+1)*niter) - 1;
+  }
+}
+*/
+
+
+
+
+
+template <typename T>
+class ThreadQueue {
+ public:
+  explicit ThreadQueue(const int num_workers)
+    : nworkers(num_workers), nwaiting(0), nqueued(0), nqueued0(0), queue(nworkers+1),
+      mutex(nworkers+1), cv_go(nworkers+1) {}
+  void push(T q) {
+    //std::lock_guard<std::mutex> lock(mutex[my_tid]);
+    mutex[my_tid].lock();
+    queue[my_tid].push(q);
+    mutex[my_tid].unlock();
+    nqueued++;
+    std::lock_guard<std::mutex> lock(mutex[0]);
+    cv.notify_one();
+    //if (my_tid == 0) {
+      //nqueued0++;
+      //cv.notify_one();
+    //}
+  }
+  //void push(int i, T q) {
+    //std::lock_guard<std::mutex> lock(mutex[i]);
+    //queue[i].push(q);
+    //nqueued++;
+    //cv_go[i].notify_one();
+  //}
+  void pop_tag(int tag, T &q) {
+    std::lock_guard<std::mutex> lock(mutex[tag]);
+    if (queue[tag].empty()) return;
+    q = queue[tag].front();
+    queue[tag].pop();
+    nqueued--;
+    return;
+  }
+  bool pop(T &q) {
+    pop_tag(0, q);
+    if (q) return false;
+    pop_tag(my_tid, q);
+    if (q) return false;
+
+    for (int i = 0; i <= nworkers; i++) {
+      if (mutex[i].try_lock()) {
+        if (!queue[i].empty()) {
+          q = queue[i].front();
+          queue[i].pop();
+          mutex[i].unlock();
+          nqueued--;
+          return false;
+        }
+        mutex[i].unlock();
+      }
+    }
+
+    //printf("thread %d going into waiting...\n", my_tid);
+
+    std::unique_lock<std::mutex> lock(mutex[0]);
+    nwaiting++;
+    if (waiting && nwaiting.load() == nworkers && nqueued.load() == 0) {
+      complete = true;
+      complete_cv.notify_all();
+    }
+    cv.wait(lock, [this]() { return exit || nqueued.load() > 0; });
+    nwaiting--;
+    return exit;
+  }
+/*
+    //std::unique_lock<std::mutex> lock(mutex[my_tid]);
+    if (nqueued0.load() > 0) {
+      std::lock_guard<std::mutex> lock(mutex[0]);
+      if (!queue[0].empty()) {
+        q = queue[0].front();
+        queue[0].pop();
+        nqueued--;
+        nqueued0--;
+        return false;
+      }
+    }
+    mutex[my_tid].lock();
+    if (queue[my_tid].empty()) {
+      //mutex[my_tid].unlock();
+      //std::unique_lock<std::mutex> lock(mutex[0]);
+      //if (queue[0].empty()) {
+        nwaiting++;
+        if (waiting && nwaiting.load() == nworkers) {
+          complete = true;
+          complete_cv.notify_all();
+        }
+        cv_go[my_tid].wait(lock, [this]() { return exit || !queue[my_tid].empty(); });
+        nwaiting--;
+        if (exit) return true;
+    }
+    q = queue[my_tid].front();
+    queue[my_tid].pop();
+    mutex[my_tid].unlock();
+    nqueued--;
+    return false;
+  }
+  */
+  //void signal_kill() {
+    //std::lock_guard<std::mutex> lock(mutex[0]);
+    //std::queue<T>().swap(queue);
+    //complete = true;
+    //exit = true;
+    //cv.notify_all();
+  //}
+  void signal_exit_when_finished() {
+    std::lock_guard<std::mutex> lock(mutex[0]);
+    exit = true;
+    complete = true;
+    cv.notify_all();
+  }
+  void wait_for_complete() {
+    std::unique_lock<std::mutex> lock(mutex[0]);
+    waiting = true;
+    if (nqueued.load() == 0 && nwaiting.load() == nworkers) {
+      complete = false;
+      waiting = false;
+      return;
+    }
+    complete_cv.wait(lock, [this]() { return complete; });
+    //printf("got complete!\n");
+    complete = false;
+    waiting = false;
+  }
+  //size_t size() {
+    //std::lock_guard<std::mutex> lock(mutex[0]);
+    //return queue.size();
+  //}
+
+ private:
+  const int nworkers;
+  std::atomic_int nwaiting;
+  std::atomic_int nqueued, nqueued0;
+  std::vector<std::queue<T>> queue;
+  std::vector<std::mutex> mutex;
+  std::vector<std::mutex> cv_go;
+  std::condition_variable cv;
+  std::condition_variable complete_cv;
+  bool complete = false;
+  bool exit = false;
+  bool waiting = false;
+};
+
+class ThreadPool {
+ public:
+  explicit ThreadPool(const int numthreads = std::thread::hardware_concurrency())
+      : nthreads(numthreads), queue(nthreads), exec_space(nthreads+1) {
+    my_tid = 0;
+    t_exec_space = exec_space[0];
+    for (int i = 0; i < nthreads; i++) {
+      auto worker = [&](const int i) {
+        my_tid = i;
+        t_exec_space = exec_space[i];
+        while (true) {
+          std::function<void()> f;
+          auto stop = queue.pop(f);
+          if (stop) break;
+          if (f) f();
+        }
+      };
+      threads.emplace_back(worker, i+1);
+    }
+  }
+  ~ThreadPool() {
+    queue.signal_exit_when_finished();
+    for (auto &t : threads) {
+      t.join();
+    }
+  }
+
+  void wait() { //queue.notify_all(); 
+  queue.wait_for_complete(); }
+
+  //void kill() { queue.signal_kill(); }
+
+  template <typename F, class... Args>
+  std::future<typename std::invoke_result<F, Args...>::type> enqueue(F &&f, Args... args) {
+    using return_t = typename std::invoke_result<F, Args...>::type;
+    auto task = std::make_shared<std::packaged_task<return_t()>>(
+        [=, func = std::forward<F>(f)] { return func(args...); });
+    std::future<return_t> result = task->get_future();
+    queue.push([task]() { (*task)(); });
+    return result;
+  }
+
+  int size() const { return nthreads; }
+
+  //size_t num_queued() { return queue.size(); }
+
+ private:
+  const int nthreads;
+  std::vector<std::thread> threads;
+  ThreadQueue<std::function<void()>> queue;
+  ExecSpace exec_space;
+};
+
+template <typename F>
+void ThreadPoolLoop(ThreadPool &pool, const int is, const int ie, F &&f) {
+  const int nthreads = pool.size();
+  const double niter = (ie - is + 1) / (1.0*nthreads) + 1.e-10;
+  auto looper = [func = std::forward<F>(f)](int t_is, int t_ie) {
+    for (int i = t_is; i <= t_ie; i++) func(i);
+  };
+  for (int it = 0; it < nthreads; it++) {
+    int start =  is + static_cast<int>(it*niter);
+    int end = is + static_cast<int>((it+1)*niter) - 1;
+    pool.enqueue(looper, start, end);
+  }
+}
+template <class... Args>
+void ThreadPoolLoopBlocking(ThreadPool &pool, Args &&... args) {
+  ThreadPoolLoop(pool, std::forward<Args>(args)...);
+  pool.wait();
+}
+
+template <typename F, typename Reducer, typename reduce_t>
+reduce_t ThreadPoolReduce(ThreadPool &pool, const int is, const int ie, F &&f, Reducer &&r, reduce_t init_val) {
+  const int nthreads = pool.size();
+  std::vector<std::future<reduce_t>> futures(nthreads);
+  const double niter = (ie - is + 1) / (1.0*nthreads) + 1.e-10;
+  auto looper = [=, func = std::forward<F>(f), red = std::forward<Reducer>(r)](int t_is, int t_ie) {
+    reduce_t rval = 0;
+    for (int i = t_is; i <= t_ie; i++) {
+      reduce_t val = func(i);
+      rval = red(rval, val);
+    }
+    return rval;
+  };
+  for (int it = 0; it < nthreads; it++) {
+    int start =  is + static_cast<int>(it*niter);
+    int end = is + static_cast<int>((it+1)*niter) - 1;
+    futures[it] = pool.enqueue(looper, start, end);
+  }
+  pool.wait();
+  reduce_t reduced_val = init_val;
+  for (int it = 0; it < nthreads; it++) {
+    reduced_val = r(reduced_val, futures[it].get());
+  }
+  return reduced_val;
+}
+
+template <typename F, typename Reducer, typename reduce_t>
+reduce_t ThreadPoolReduce2(ThreadPool &pool, const int is, const int ie, F &&f, Reducer &&r, reduce_t init_val) {
+  std::vector<std::future<reduce_t>> futures(ie+1);
+  auto flb = [=, func = std::forward<F>(f), red = std::forward<Reducer>(r)](int t_i) {
+    reduce_t rval = init_val;
+    return red(rval, func(t_i));
+  };
+  for (int it = is; it <= ie; it++) {
+    futures[it] = pool.enqueue(flb, it);
+  }
+  pool.wait();
+  reduce_t reduced_val = init_val;
+  for (int it = is; it <= ie; it++) {
+    reduced_val = r(reduced_val, futures[it].get());
+  }
+  return reduced_val;
+}
+
+inline
+void ThreadLoopBounds(ThreadPool &pool, const int is, const int ie, std::vector<int> &start, std::vector<int> &stop) {
+  const int nthreads = pool.size();
+  start.resize(nthreads);
+  stop.resize(nthreads);
+  const double niter = (ie - is + 1) / (1.0 * nthreads) + 1.e-10;
+  for (int it = 0; it < nthreads; it++) {
+    start[it] = is + static_cast<int>(it*niter);
+    stop[it] = is + static_cast<int>((it+1)*niter) - 1;
+  }
+}
+
+
+
+} // namespace parthenon
+
+#endif // TASKS_THREAD_POOL_HPP_
diff --git a/tst/performance/test_meshblock_data_iterator.cpp b/tst/performance/test_meshblock_data_iterator.cpp
index 923899d26e26..f1004ff95888 100644
--- a/tst/performance/test_meshblock_data_iterator.cpp
+++ b/tst/performance/test_meshblock_data_iterator.cpp
@@ -76,7 +76,8 @@ void performance_test_wrapper(const std::string &test_name, InitFunc init_func,
 
 // we need to connect the MeshBlockData to a dummy mesh block, otherwise variables
 // won't be allocated
-static MeshBlockData<Real> createTestContainer(std::shared_ptr<MeshBlock> &dummy_mb) {
+static void createTestContainer(std::shared_ptr<MeshBlock> dummy_mb,
+                                MeshBlockData<Real> &mbd) {
   // Make a container for testing performance
   std::vector<int> scalar_shape{N, N, N};
   std::vector<int> vector_shape{N, N, N, 3};
@@ -93,10 +94,7 @@ static MeshBlockData<Real> createTestContainer(std::shared_ptr<MeshBlock> &dummy
   pkg->AddField("v4", m_in_vec);
   pkg->AddField("v5", m_in);
 
-  MeshBlockData<Real> mbd;
   mbd.Initialize(pkg, dummy_mb);
-
-  return mbd;
 }
 
 // std::function<void()> createLambdaRaw(ParArrayND<Real> &raw_array) {
@@ -125,7 +123,6 @@ std::function<void()> createLambdaContainer(MeshBlockData<Real> &container) {
             v(l, k, j, i) = static_cast<Real>((l + 1) * (k + 1) * (j + 1) * (i + 1));
           });
     }
-    return container;
   };
 }
 
@@ -142,7 +139,6 @@ std::function<void()> createLambdaContainerVar(MeshBlockData<Real> &container,
             data(l, k, j, i) = static_cast<Real>((l + 1) * (k + 1) * (j + 1) * (i + 1));
           });
     }
-    return container;
   };
 }
 
@@ -196,7 +192,8 @@ TEST_CASE("Catch2 Container Iterator Performance",
   SECTION("Iterate Variables") {
     GIVEN("A container.") {
       auto dummy_mb = std::make_shared<MeshBlock>(16, 3);
-      MeshBlockData<Real> container = createTestContainer(dummy_mb);
+      MeshBlockData<Real> container;
+      createTestContainer(dummy_mb, container);
       auto init_container = createLambdaContainer(container);
 
       // Make a function for initializing the container variables
@@ -215,7 +212,8 @@ TEST_CASE("Catch2 Container Iterator Performance",
     } // GIVEN
     GIVEN("A container Var.") {
       auto dummy_mb = std::make_shared<MeshBlock>(16, 3);
-      MeshBlockData<Real> container = createTestContainer(dummy_mb);
+      MeshBlockData<Real> container;
+      createTestContainer(dummy_mb, container);
       std::vector<std::string> names({"v0", "v1", "v2", "v3", "v4", "v5"});
       auto init_container = createLambdaContainerVar(container, names);
 
@@ -238,7 +236,8 @@ TEST_CASE("Catch2 Container Iterator Performance",
   SECTION("View of Views") {
     GIVEN("A container.") {
       auto dummy_mb = std::make_shared<MeshBlock>(16, 3);
-      MeshBlockData<Real> container = createTestContainer(dummy_mb);
+      MeshBlockData<Real> container;
+      createTestContainer(dummy_mb, container);
       WHEN("The view of views does not have any names.") {
         parthenon::VariablePack<Real> var_view =
             container.PackVariables({Metadata::Independent});