diff --git a/CMakeLists.txt b/CMakeLists.txt index 907557f1df5e..28c59c84fa8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -202,6 +202,10 @@ endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD 17) +#add_compile_options(-fsanitize=thread) +#add_link_options(-fsanitize=thread) +add_link_options(-Wl,-no_pie -L/opt/homebrew/lib -lprofiler -ltcmalloc) + option(PARTHENON_IMPORT_KOKKOS "If ON, attempt to link to an external Kokkos library. If OFF, build Kokkos from source and package with Parthenon" OFF) if (NOT TARGET Kokkos::kokkos) if (PARTHENON_IMPORT_KOKKOS) diff --git a/example/advection/CMakeLists.txt b/example/advection/CMakeLists.txt index 97541db4006c..d59e0b04414f 100644 --- a/example/advection/CMakeLists.txt +++ b/example/advection/CMakeLists.txt @@ -22,6 +22,9 @@ if( "advection-example" IN_LIST DRIVER_LIST OR NOT PARTHENON_DISABLE_EXAMPLES) main.cpp parthenon_app_inputs.cpp ) + #add_compile_options(-fsanitize=thread) + #add_link_options(-fsanitize=thread) + add_link_options(-Wl,-no_pie -L/opt/homebrew/lib -lprofiler -ltcmalloc) target_link_libraries(advection-example PRIVATE Parthenon::parthenon) lint_target(advection-example) endif() diff --git a/example/advection/advection_driver.cpp b/example/advection/advection_driver.cpp index c9e2d5a2b435..d64bde97b8ae 100644 --- a/example/advection/advection_driver.cpp +++ b/example/advection/advection_driver.cpp @@ -86,8 +86,8 @@ TaskCollection AdvectionDriver::MakeTaskCollection(BlockList_t &blocks, const in const auto any = parthenon::BoundaryType::any; - tl.AddTask(none, parthenon::StartReceiveBoundBufs, mc1); - tl.AddTask(none, parthenon::StartReceiveFluxCorrections, mc0); + //tl.AddTask(none, parthenon::StartReceiveBoundBufs, mc1); + //tl.AddTask(none, parthenon::StartReceiveFluxCorrections, mc0); } // Number of task lists that can be executed independently and thus *may* @@ -124,13 +124,13 @@ TaskCollection AdvectionDriver::MakeTaskCollection(BlockList_t &blocks, const in auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i); auto &mdudt = pmesh->mesh_data.GetOrAdd("dUdt", i); - auto send_flx = tl.AddTask(none, parthenon::LoadAndSendFluxCorrections, mc0); - auto recv_flx = tl.AddTask(none, parthenon::ReceiveFluxCorrections, mc0); - auto set_flx = tl.AddTask(recv_flx, parthenon::SetFluxCorrections, mc0); + //auto send_flx = tl.AddTask(none, parthenon::LoadAndSendFluxCorrections, mc0); + //auto recv_flx = tl.AddTask(none, parthenon::ReceiveFluxCorrections, mc0); + //auto set_flx = tl.AddTask(recv_flx, parthenon::SetFluxCorrections, mc0); // compute the divergence of fluxes of conserved variables auto flux_div = - tl.AddTask(set_flx, FluxDivergence>, mc0.get(), mdudt.get()); + tl.AddTask(none, FluxDivergence>, mc0.get(), mdudt.get()); auto avg_data = tl.AddTask(flux_div, AverageIndependentData>, mc0.get(), mbase.get(), beta); @@ -139,7 +139,7 @@ TaskCollection AdvectionDriver::MakeTaskCollection(BlockList_t &blocks, const in mdudt.get(), beta * dt, mc1.get()); // do boundary exchange - parthenon::AddBoundaryExchangeTasks(update, tl, mc1, pmesh->multilevel); + //parthenon::AddBoundaryExchangeTasks(update, tl, mc1, pmesh->multilevel); } TaskRegion &async_region2 = tc.AddRegion(num_task_lists_executed_independently); diff --git a/example/advection/advection_package.cpp b/example/advection/advection_package.cpp index 3008f457d8c9..ddce9effcee5 100644 --- a/example/advection/advection_package.cpp +++ b/example/advection/advection_package.cpp @@ -28,6 +28,7 @@ #include "kokkos_abstraction.hpp" #include "reconstruct/dc_inline.hpp" #include "utils/error_checking.hpp" +#include "utils/thread_pool.hpp" using namespace parthenon::package::prelude; @@ -75,6 +76,7 @@ std::shared_ptr Initialize(ParameterInput *pin) { auto fill_derived = pin->GetOrAddBoolean("Advection", "fill_derived", true); pkg->AddParam<>("fill_derived", fill_derived); + printf("fill_derived = %d\n", fill_derived); // For wavevector along coordinate axes, set desired values of ang_2/ang_3. // For example, for 1D problem use ang_2 = ang_3 = 0.0 @@ -243,22 +245,23 @@ AmrTag CheckRefinement(MeshBlockData *rc) { for (int var = 1; var < num_vars; ++var) { vars.push_back("advected_" + std::to_string(var)); } - // type is parthenon::VariablePack> - auto v = rc->PackVariables(vars); + auto desc = parthenon::MakePackDescriptor(pmb->resolved_packages.get(), vars); + auto v = desc.GetPack(rc); IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::entire); IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::entire); IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::entire); typename Kokkos::MinMax::value_type minmax; - pmb->par_reduce( - PARTHENON_AUTO_LABEL, 0, v.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + parthenon::par_reduce( + parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), + 0, v.GetMaxNumberOfVars() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i, typename Kokkos::MinMax::value_type &lminmax) { lminmax.min_val = - (v(n, k, j, i) < lminmax.min_val ? v(n, k, j, i) : lminmax.min_val); + (v(0, n, k, j, i) < lminmax.min_val ? v(0, n, k, j, i) : lminmax.min_val); lminmax.max_val = - (v(n, k, j, i) > lminmax.max_val ? v(n, k, j, i) : lminmax.max_val); + (v(0, n, k, j, i) > lminmax.max_val ? v(0, n, k, j, i) : lminmax.max_val); }, Kokkos::MinMax(minmax)); @@ -283,16 +286,18 @@ void PreFill(MeshBlockData *rc) { // packing in principle unnecessary/convoluted here and just done for demonstration std::vector vars({"advected", "one_minus_advected"}); - PackIndexMap imap; - const auto &v = rc->PackVariables(vars, imap); + auto desc = parthenon::MakePackDescriptor(pmb->resolved_packages.get(), vars); + auto v = desc.GetPack(rc); + auto imap = desc.GetMap(); - const int in = imap.get("advected").first; - const int out = imap.get("one_minus_advected").first; + const int in = imap["advected"]; + const int out = imap["one_minus_advected"]; const auto num_vars = rc->Get("advected").data.GetDim(4); - pmb->par_for( - PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + parthenon::par_for( + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), + 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { - v(out + n, k, j, i) = 1.0 - v(in + n, k, j, i); + v(0, out + n, k, j, i) = 1.0 - v(0, in + n, k, j, i); }); } } @@ -307,16 +312,18 @@ void SquareIt(MeshBlockData *rc) { // packing in principle unnecessary/convoluted here and just done for demonstration std::vector vars({"one_minus_advected", "one_minus_advected_sq"}); - PackIndexMap imap; - const auto &v = rc->PackVariables(vars, imap); + auto desc = parthenon::MakePackDescriptor(pmb->resolved_packages.get(), vars); + auto v = desc.GetPack(rc); + auto imap = desc.GetMap(); - const int in = imap.get("one_minus_advected").first; - const int out = imap.get("one_minus_advected_sq").first; + const int in = imap["one_minus_advected"]; + const int out = imap["one_minus_advected_sq"]; const auto num_vars = rc->Get("advected").data.GetDim(4); - pmb->par_for( - PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + parthenon::par_for( + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), + 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { - v(out + n, k, j, i) = v(in + n, k, j, i) * v(in + n, k, j, i); + v(0, out + n, k, j, i) = v(0, in + n, k, j, i) * v(0, in + n, k, j, i); }); // The following block/logic is also just added for regression testing. @@ -330,8 +337,9 @@ void SquareIt(MeshBlockData *rc) { const auto &profile = pkg->Param("profile"); if (profile == "smooth_gaussian") { const auto &advected = rc->Get("advected").data; - pmb->par_for( - PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + parthenon::par_for( + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), + 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { PARTHENON_REQUIRE(advected(n, k, j, i) != 0.0, "Advected not properly initialized."); @@ -355,19 +363,21 @@ void PostFill(MeshBlockData *rc) { pmb->AllocSparseID("one_minus_sqrt_one_minus_advected_sq", 37); // packing in principle unnecessary/convoluted here and just done for demonstration - std::vector vars( - {"one_minus_advected_sq", "one_minus_sqrt_one_minus_advected_sq"}); - PackIndexMap imap; - const auto &v = rc->PackVariables(vars, imap); + std::vector> vars( + {{"one_minus_advected_sq", false}, {"one_minus_sqrt_one_minus_advected_sq*", true}}); + auto desc = parthenon::MakePackDescriptor(pmb->resolved_packages.get(), vars); + auto v = desc.GetPack(rc); + auto imap = desc.GetMap(); - const int in = imap.get("one_minus_advected_sq").first; + const int in = imap["one_minus_advected_sq"]; // we can get sparse fields either by specifying base name and sparse id, or the full // name - const int out12 = imap.get("one_minus_sqrt_one_minus_advected_sq", 12).first; - const int out37 = imap.get("one_minus_sqrt_one_minus_advected_sq_37").first; + const int out12 = imap["one_minus_sqrt_one_minus_advected_sq_12"]; + const int out37 = imap["one_minus_sqrt_one_minus_advected_sq_37"]; const auto num_vars = rc->Get("advected").data.GetDim(4); - pmb->par_for( - PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + parthenon::par_for( + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), + 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { v(out12 + n, k, j, i) = 1.0 - sqrt(v(in + n, k, j, i)); v(out37 + n, k, j, i) = 1.0 - v(out12 + n, k, j, i); @@ -390,7 +400,8 @@ Real AdvectionHst(MeshData *md) { // Packing variable over MeshBlock as the function is called for MeshData, i.e., a // collection of blocks - const auto &advected_pack = md->PackVariables(std::vector{"advected"}); + auto desc = parthenon::MakePackDescriptor(pmb->resolved_packages.get(), std::vector{"advected"}); + auto advected_pack = desc.GetPack(md); Real result = 0.0; T reducer(result); @@ -400,11 +411,11 @@ Real AdvectionHst(MeshData *md) { // weighting needs to be applied in the reduction region. const bool volume_weighting = std::is_same>::value; - pmb->par_reduce( - PARTHENON_AUTO_LABEL, 0, advected_pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, - ib.e, + parthenon::par_reduce( + parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), + 0, advected_pack.GetNBlocks() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &lresult) { - const auto &coords = advected_pack.GetCoords(b); + const auto &coords = advected_pack.GetCoordinates(b); // `join` is a function of the Kokkos::ReducerConecpt that allows to use the same // call for different reductions const Real vol = volume_weighting ? coords.CellVolume(k, j, i) : 1.0; @@ -432,8 +443,9 @@ Real EstimateTimestepBlock(MeshBlockData *rc) { // this is obviously overkill for this constant velocity problem Real min_dt; - pmb->par_reduce( - PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + parthenon::par_reduce( + parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), + kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i, Real &lmin_dt) { if (vx != 0.0) lmin_dt = std::min(lmin_dt, coords.Dxc(k, j, i) / std::abs(vx)); @@ -465,22 +477,30 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { const auto &vy = pkg->Param("vy"); const auto &vz = pkg->Param("vz"); - PackIndexMap index_map; - auto v = rc->PackVariablesAndFluxes(std::vector{Metadata::WithFluxes}, - index_map); - - // For non constant velocity, we need the index of the velocity vector as it's part of - // the variable pack. - const auto idx_v = index_map["v"].first; - const auto v_const = idx_v < 0; // using "at own perill" magic number + auto desc = parthenon::MakePackDescriptor(pmb->resolved_packages.get(), + std::vector{Metadata::WithFluxes}, + std::set{parthenon::PDOpt::WithFluxes}); + auto v = desc.GetPack(rc.get()); + //auto imap = desc.GetMap(); + int idx_v; + bool v_const; + //if (imap.count("v") > 0) { + //idx_v = imap["v"]; + // v_const = false; + //} else { + v_const = true; + //} const int scratch_level = 1; // 0 is actual scratch (tiny); 1 is HBM const int nx1 = pmb->cellbounds.ncellsi(IndexDomain::entire); - const int nvar = v.GetDim(4); + const int nvar = v.GetMaxNumberOfVars(); size_t scratch_size_in_bytes = parthenon::ScratchPad2D::shmem_size(nvar, nx1); // get x-fluxes - pmb->par_for_outer( - PARTHENON_AUTO_LABEL, 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, + //std::cout << "hello from thread " << std::this_thread::get_id() << std::endl; + //for (int cnt=0; cnt<300; cnt++) { + parthenon::par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), + 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { parthenon::ScratchPad2D ql(member.team_scratch(scratch_level), nvar, nx1); parthenon::ScratchPad2D qr(member.team_scratch(scratch_level), nvar, nx1); @@ -512,8 +532,9 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { // get y-fluxes if (pmb->pmy_mesh->ndim >= 2) { - pmb->par_for_outer( - PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, + parthenon::par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), + 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e + 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { // the overall algorithm/use of scratch pad here is clear inefficient and kept // just for demonstrating purposes. The key point is that we cannot reuse @@ -555,8 +576,9 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { // get z-fluxes if (pmb->pmy_mesh->ndim == 3) { - pmb->par_for_outer( - PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1, + parthenon::par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), + 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1, jb.s, jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { // the overall algorithm/use of scratch pad here is clear inefficient and kept @@ -596,6 +618,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { } }); } + //} + //std::cout << "done on thread " << std::this_thread::get_id() << std::endl; return TaskStatus::complete; } diff --git a/external/Kokkos b/external/Kokkos index 62d2b6c879b7..486cc745cb9a 160000 --- a/external/Kokkos +++ b/external/Kokkos @@ -1 +1 @@ -Subproject commit 62d2b6c879b74b6ae7bd06eb3e5e80139c4708e6 +Subproject commit 486cc745cb9a287f3915061455105a3ee588c616 diff --git a/scripts/python/packages/parthenon_tools/parthenon_tools/movie2d.py b/scripts/python/packages/parthenon_tools/parthenon_tools/movie2d.py index 363f4d00eb66..25bb914efa01 100644 --- a/scripts/python/packages/parthenon_tools/parthenon_tools/movie2d.py +++ b/scripts/python/packages/parthenon_tools/parthenon_tools/movie2d.py @@ -247,7 +247,8 @@ def plot_dump( for i in range(n_blocks): # Plot the actual data, should work if parthenon/output*/ghost_zones = true/false # but obviously no ghost data will be shown if ghost_zones = false - p.pcolormesh(xf[i, :], yf[i, :], q[i, :, :], vmin=qmin, vmax=qmax) + # print(xf[i,:].shape, yf[i,:].shape, q[i,:,:].shape) + p.pcolormesh(xf[i, :], yf[i, :], np.squeeze(q[i, :, :]), vmin=qmin, vmax=qmax) # Print the block gid in the center of the block if len(block_ids) > 0: diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4ecd634481f5..b7f06f704d9d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -222,7 +222,6 @@ add_library(parthenon solvers/solver_utils.hpp tasks/tasks.hpp - tasks/thread_pool.hpp time_integration/butcher_integrator.cpp time_integration/low_storage_integrator.cpp @@ -260,6 +259,7 @@ add_library(parthenon utils/sort.hpp utils/string_utils.cpp utils/string_utils.hpp + utils/thread_pool.hpp utils/unique_id.cpp utils/unique_id.hpp utils/utils.hpp @@ -321,6 +321,10 @@ if (Kokkos_ENABLE_CUDA AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang") target_compile_options(parthenon PUBLIC --expt-relaxed-constexpr) endif() +#add_compile_options(-fsanitize=thread) +#add_link_options(-fsanitize=thread) +add_link_options(-Wl,-no_pie -lprofiler -ltcmalloc) + target_link_libraries(parthenon PUBLIC Kokkos::kokkos Threads::Threads) if (PARTHENON_ENABLE_ASCENT) diff --git a/src/basic_types.hpp b/src/basic_types.hpp index 886788727d48..6d10f4dfa5b0 100644 --- a/src/basic_types.hpp +++ b/src/basic_types.hpp @@ -25,6 +25,8 @@ namespace parthenon { +inline thread_local Kokkos::Serial t_exec_space; + // primitive type alias that allows code to run with either floats or doubles #if SINGLE_PRECISION_ENABLED using Real = float; diff --git a/src/bvals/comms/bnd_info.cpp b/src/bvals/comms/bnd_info.cpp index b4a4072758bd..fcf47dd78150 100644 --- a/src/bvals/comms/bnd_info.cpp +++ b/src/bvals/comms/bnd_info.cpp @@ -220,6 +220,11 @@ int GetBufferSize(MeshBlock *pmb, const NeighborBlock &nb, v->GetDim(4) * topo_comp; } +void BndInfo::LockedBufferCopy(CommBuffer::owner_t> *input_buf) { + //std::lock_guard lock(mutex); + buf = input_buf->buffer(); +} + BndInfo BndInfo::GetSendBndInfo(MeshBlock *pmb, const NeighborBlock &nb, std::shared_ptr> v, CommBuffer::owner_t> *buf) { @@ -229,7 +234,8 @@ BndInfo BndInfo::GetSendBndInfo(MeshBlock *pmb, const NeighborBlock &nb, out.alloc_status = v->GetAllocationStatus(); if (!out.allocated) return out; - out.buf = buf->buffer(); + out.LockedBufferCopy(buf); + //out.buf = buf->buffer(); int Nv = v->GetDim(4); int Nu = v->GetDim(5); @@ -257,7 +263,8 @@ BndInfo BndInfo::GetSetBndInfo(MeshBlock *pmb, const NeighborBlock &nb, std::shared_ptr> v, CommBuffer::owner_t> *buf) { BndInfo out; - out.buf = buf->buffer(); + //out.buf = buf->buffer(); + out.LockedBufferCopy(buf); auto buf_state = buf->GetState(); if (buf_state == BufferState::received) { out.buf_allocated = true; @@ -454,7 +461,8 @@ BndInfo BndInfo::GetSendCCFluxCor(MeshBlock *pmb, const NeighborBlock &nb, // Not going to actually do anything with this buffer return out; } - out.buf = buf->buffer(); + //out.buf = buf->buffer(); + out.LockedBufferCopy(buf); IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior); IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior); @@ -514,7 +522,8 @@ BndInfo BndInfo::GetSetCCFluxCor(MeshBlock *pmb, const NeighborBlock &nb, } out.allocated = true; out.alloc_status = v->GetAllocationStatus(); - out.buf = buf->buffer(); + //out.buf = buf->buffer(); + out.LockedBufferCopy(buf); IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior); IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior); diff --git a/src/bvals/comms/bnd_info.hpp b/src/bvals/comms/bnd_info.hpp index 4860bdc7245f..bbc74ed67f71 100644 --- a/src/bvals/comms/bnd_info.hpp +++ b/src/bvals/comms/bnd_info.hpp @@ -56,6 +56,9 @@ struct BndInfo { BndInfo() = default; BndInfo(const BndInfo &) = default; + inline static std::mutex mutex; + void LockedBufferCopy(CommBuffer::owner_t> *buf); + // These are are used to generate the BndInfo struct for various // kinds of boundary types and operations. static BndInfo GetSendBndInfo(MeshBlock *pmb, const NeighborBlock &nb, diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp index cc093a34f080..4a84f432a16f 100644 --- a/src/bvals/comms/boundary_communication.cpp +++ b/src/bvals/comms/boundary_communication.cpp @@ -44,10 +44,17 @@ namespace parthenon { using namespace loops; using namespace loops::shorthands; +static std::array mutex; +//static std::mutex mutex; + template TaskStatus SendBoundBufs(std::shared_ptr> &md) { PARTHENON_INSTRUMENT + int mutex_id = 2 * static_cast(bound_type) + 1; + //mutex[mutex_id].lock(); + //mutex.lock(); + Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(bound_type, true); @@ -59,9 +66,13 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { CheckSendBufferCacheForRebuild(md); if (nbound == 0) { + //mutex[mutex_id].unlock(); + //mutex.unlock(); return TaskStatus::complete; } if (other_communication_unfinished) { + //mutex[mutex_id].unlock(); + //mutex.unlock(); return TaskStatus::incomplete; } @@ -77,6 +88,10 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { ProResInfo::GetSend); } } + + //mutex[mutex_id].unlock(); + //mutex.unlock(); + // Restrict auto pmb = md->GetBlockData(0)->GetBlockPointer(); StateDescriptor *resolved_packages = pmb->resolved_packages.get(); @@ -149,6 +164,7 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { else buf.SendNull(); } + //mutex.unlock(); return TaskStatus::complete; } @@ -165,11 +181,18 @@ SendBoundBufs(std::shared_ptr> template TaskStatus StartReceiveBoundBufs(std::shared_ptr> &md) { PARTHENON_INSTRUMENT + + int mutex_id = 2 * static_cast(bound_type); + //mutex[mutex_id].lock(); + //mutex.lock(); + Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false); if (cache.buf_vec.size() == 0) InitializeBufferCache(md, &(pmesh->boundary_comm_map), &cache, ReceiveKey, false); + //mutex[mutex_id].unlock(); + //mutex.unlock(); std::for_each(std::begin(cache.buf_vec), std::end(cache.buf_vec), [](auto pbuf) { pbuf->TryStartReceive(); }); @@ -192,11 +215,17 @@ template TaskStatus ReceiveBoundBufs(std::shared_ptr> &md) { PARTHENON_INSTRUMENT + int mutex_id = 2 * static_cast(bound_type); + //mutex[mutex_id].lock(); + //mutex.lock(); + Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false); if (cache.buf_vec.size() == 0) InitializeBufferCache(md, &(pmesh->boundary_comm_map), &cache, ReceiveKey, false); + //mutex[mutex_id].unlock(); + //mutex.unlock(); bool all_received = true; std::for_each( @@ -240,6 +269,10 @@ template TaskStatus SetBounds(std::shared_ptr> &md) { PARTHENON_INSTRUMENT + int mutex_id = 2 * static_cast(bound_type); + //mutex[mutex_id].lock(); + //mutex.lock(); + Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false); @@ -257,6 +290,10 @@ TaskStatus SetBounds(std::shared_ptr> &md) { ProResInfo::GetSet); } } + + //mutex[mutex_id].unlock(); + //mutex.unlock(); + // const Real threshold = Globals::sparse_config.allocation_threshold; auto &bnd_info = cache.bnd_info; Kokkos::parallel_for( @@ -333,6 +370,10 @@ template TaskStatus ProlongateBounds(std::shared_ptr> &md) { PARTHENON_INSTRUMENT + int mutex_id = 2 * static_cast(bound_type); + //mutex[mutex_id].lock(); + //mutex.lock(); + Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false); @@ -350,6 +391,8 @@ TaskStatus ProlongateBounds(std::shared_ptr> &md) { ProResInfo::GetSet); } } + //mutex[mutex_id].unlock(); + //mutex.unlock(); if (nbound > 0 && pmesh->multilevel) { auto pmb = md->GetBlockData(0)->GetBlockPointer(); diff --git a/src/bvals/comms/bvals_utils.hpp b/src/bvals/comms/bvals_utils.hpp index bdb5e95dc4c2..a9a9b3392d15 100644 --- a/src/bvals/comms/bvals_utils.hpp +++ b/src/bvals/comms/bvals_utils.hpp @@ -95,9 +95,9 @@ void InitializeBufferCache(std::shared_ptr> &md, COMM_MAP *comm_m // Or, what the hell, you could put them in random order if you want, which // frighteningly seems to run faster in some cases - std::random_device rd; - std::mt19937 g(rd()); - std::shuffle(key_order.begin(), key_order.end(), g); + //std::random_device rd; + //std::mt19937 g(rd()); + //std::shuffle(key_order.begin(), key_order.end(), g); int buff_idx = 0; pcache->buf_vec.clear(); diff --git a/src/bvals/comms/flux_correction.cpp b/src/bvals/comms/flux_correction.cpp index ff76bcba0014..9240be85e5ee 100644 --- a/src/bvals/comms/flux_correction.cpp +++ b/src/bvals/comms/flux_correction.cpp @@ -38,9 +38,16 @@ namespace parthenon { using namespace impl; +static std::array mutex; +//static std::mutex mutex; + TaskStatus LoadAndSendFluxCorrections(std::shared_ptr> &md) { PARTHENON_INSTRUMENT + int mutex_id = 2 * static_cast(BoundaryType::flxcor_send) + 1; + //mutex[mutex_id].lock(); + //mutex.lock(); + Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_send, true); const int ndim = pmesh->ndim; @@ -53,16 +60,22 @@ TaskStatus LoadAndSendFluxCorrections(std::shared_ptr> &md) { CheckSendBufferCacheForRebuild(md); if (nbound == 0) { + //mutex[mutex_id].unlock(); + //mutex.unlock(); return TaskStatus::complete; } if (other_communication_unfinished) { + //mutex[mutex_id].unlock(); + //mutex.unlock(); return TaskStatus::incomplete; } if (rebuild) RebuildBufferCache( md, nbound, BndInfo::GetSendCCFluxCor, ProResInfo::GetSend); + //mutex[mutex_id].unlock(); + //mutex.unlock(); auto &bnd_info = cache.bnd_info; PARTHENON_REQUIRE(bnd_info.size() == nbound, "Need same size for boundary info"); @@ -114,16 +127,24 @@ TaskStatus LoadAndSendFluxCorrections(std::shared_ptr> &md) { // Calling Send will send null if the underlying buffer is unallocated for (auto &buf : cache.buf_vec) buf->Send(); + //mutex.unlock(); return TaskStatus::complete; } TaskStatus StartReceiveFluxCorrections(std::shared_ptr> &md) { PARTHENON_INSTRUMENT + + int mutex_id = 2 * static_cast(BoundaryType::flxcor_recv); + //mutex[mutex_id].lock(); + //mutex.lock(); + Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_recv, false); if (cache.buf_vec.size() == 0) InitializeBufferCache( md, &(pmesh->boundary_comm_flxcor_map), &cache, ReceiveKey, false); + //mutex[mutex_id].unlock(); + //mutex.unlock(); std::for_each(std::begin(cache.buf_vec), std::end(cache.buf_vec), [](auto pbuf) { pbuf->TryStartReceive(); }); @@ -134,11 +155,17 @@ TaskStatus StartReceiveFluxCorrections(std::shared_ptr> &md) { TaskStatus ReceiveFluxCorrections(std::shared_ptr> &md) { PARTHENON_INSTRUMENT + int mutex_id = 2 * static_cast(BoundaryType::flxcor_recv); + //mutex[mutex_id].lock(); + //mutex.lock(); + Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_recv, false); if (cache.buf_vec.size() == 0) InitializeBufferCache( md, &(pmesh->boundary_comm_flxcor_map), &cache, ReceiveKey, false); + //mutex[mutex_id].unlock(); + //mutex.unlock(); bool all_received = true; std::for_each( @@ -152,6 +179,10 @@ TaskStatus ReceiveFluxCorrections(std::shared_ptr> &md) { TaskStatus SetFluxCorrections(std::shared_ptr> &md) { PARTHENON_INSTRUMENT + int mutex_id = 2 * static_cast(BoundaryType::flxcor_recv); + //mutex[mutex_id].lock(); + //mutex.lock(); + Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_recv, false); @@ -160,6 +191,8 @@ TaskStatus SetFluxCorrections(std::shared_ptr> &md) { if (rebuild) RebuildBufferCache( md, nbound, BndInfo::GetSetCCFluxCor, ProResInfo::GetSend); + //mutex[mutex_id].unlock(); + //mutex.unlock(); auto &bnd_info = cache.bnd_info; Kokkos::parallel_for( @@ -182,6 +215,7 @@ TaskStatus SetFluxCorrections(std::shared_ptr> &md) { std::for_each(std::begin(cache.buf_vec), std::end(cache.buf_vec), [](auto pbuf) { pbuf->Stale(); }); + //mutex.unlock(); return TaskStatus::complete; } diff --git a/src/driver/driver.hpp b/src/driver/driver.hpp index 929ea19c3f2b..2fd27f5d21fd 100644 --- a/src/driver/driver.hpp +++ b/src/driver/driver.hpp @@ -38,7 +38,9 @@ enum class DriverStatus { complete, timeout, failed }; class Driver { public: Driver(ParameterInput *pin, ApplicationInput *app_in, Mesh *pm) - : pinput(pin), app_input(app_in), pmesh(pm), mbcnt_prev(), time_LBandAMR() {} + : pinput(pin), app_input(app_in), pmesh(pm), mbcnt_prev(), time_LBandAMR() { + pool = pm->pool; + } virtual DriverStatus Execute() = 0; void InitializeOutputs() { pouts = std::make_unique(pmesh, pinput); } @@ -46,14 +48,15 @@ class Driver { ApplicationInput *app_input; Mesh *pmesh; std::unique_ptr pouts; + std::shared_ptr pool; static double elapsed_main() { return timer_main.seconds(); } static double elapsed_cycle() { return timer_cycle.seconds(); } static double elapsed_LBandAMR() { return timer_LBandAMR.seconds(); } protected: static Kokkos::Timer timer_cycle, timer_main, timer_LBandAMR; - double time_LBandAMR; std::uint64_t mbcnt_prev; + double time_LBandAMR; virtual void PreExecute(); virtual void PostExecute(DriverStatus status); @@ -105,7 +108,7 @@ TaskListStatus ConstructAndExecuteBlockTasks(T *driver, Args... args) { for (auto &pmb : driver->pmesh->block_list) { tr[i++] = driver->MakeTaskList(pmb.get(), std::forward(args)...); } - TaskListStatus status = tc.Execute(); + TaskListStatus status = tc.Execute(*driver->pool); return status; } @@ -113,7 +116,7 @@ template TaskListStatus ConstructAndExecuteTaskLists(T *driver, Args... args) { TaskCollection tc = driver->MakeTaskCollection(driver->pmesh->block_list, std::forward(args)...); - TaskListStatus status = tc.Execute(); + TaskListStatus status = tc.Execute(*driver->pool); return status; } diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp index e8250c4b58bc..c57e777fd86b 100644 --- a/src/interface/sparse_pack_base.cpp +++ b/src/interface/sparse_pack_base.cpp @@ -287,8 +287,9 @@ template SparsePackBase SparsePackBase::Build>(MeshData *, template SparsePackBase &SparsePackCache::Get(T *pmd, const PackDescriptor &desc, const std::vector &include_block) { - if (pack_map.count(desc.identifier) > 0) { - auto &cache_tuple = pack_map[desc.identifier]; + auto cache_tuple_itr = GetFromMap(desc.identifier); + if (false && cache_tuple_itr != pack_map.end()) { + auto &cache_tuple = cache_tuple_itr->second; auto &pack = std::get<0>(cache_tuple); auto alloc_status_in = SparsePackBase::GetAllocStatus(pmd, desc, include_block); auto &alloc_status = std::get<1>(cache_tuple); @@ -320,8 +321,10 @@ SparsePackCache::Get>(MeshBlockData *, const PackDescr template SparsePackBase &SparsePackCache::BuildAndAdd(T *pmd, const PackDescriptor &desc, const std::vector &include_block) { - if (pack_map.count(desc.identifier) > 0) pack_map.erase(desc.identifier); - pack_map[desc.identifier] = {SparsePackBase::Build(pmd, desc, include_block), + auto pack = SparsePackBase::Build(pmd, desc, include_block); + std::lock_guard lock(mutex); + pack_map.erase(desc.identifier); + pack_map[desc.identifier] = {std::move(pack), SparsePackBase::GetAllocStatus(pmd, desc, include_block), include_block}; return std::get<0>(pack_map[desc.identifier]); diff --git a/src/interface/sparse_pack_base.hpp b/src/interface/sparse_pack_base.hpp index 26a5f3fd7b61..76673a6c2440 100644 --- a/src/interface/sparse_pack_base.hpp +++ b/src/interface/sparse_pack_base.hpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -121,6 +122,13 @@ class SparsePackCache { SparsePackBase::include_t>> pack_map; + private: + auto GetFromMap(const std::string &key) { + std::lock_guard lock(mutex); + return pack_map.find(key); + } + std::mutex mutex; + friend class SparsePackBase; }; diff --git a/src/interface/update.cpp b/src/interface/update.cpp index 0e0d1195943e..69ab99d0990d 100644 --- a/src/interface/update.cpp +++ b/src/interface/update.cpp @@ -62,23 +62,37 @@ template <> TaskStatus FluxDivergence(MeshData *in_obj, MeshData *dudt_obj) { const IndexDomain interior = IndexDomain::interior; + auto pm = in_obj->GetMeshPointer(); std::vector flags({Metadata::WithFluxes, Metadata::Cell}); - const auto &vin = in_obj->PackVariablesAndFluxes(flags); - auto dudt = dudt_obj->PackVariables(flags); + auto desc = MakePackDescriptor(pm->resolved_packages.get(), + flags, std::set{PDOpt::WithFluxes}); + auto v = desc.GetPack(in_obj); + auto dudt = desc.GetPack(dudt_obj); + const IndexRange ib = in_obj->GetBoundsI(interior); const IndexRange jb = in_obj->GetBoundsJ(interior); const IndexRange kb = in_obj->GetBoundsK(interior); - const int ndim = vin.GetNdim(); + const int nblocks = v.GetNBlocks(); + const int nvars = v.GetMaxNumberOfVars(); + + const int ndim = pm->ndim; parthenon::par_for( - DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, vin.GetDim(5) - 1, 0, - vin.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, nblocks - 1, 0, + nvars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int m, const int l, const int k, const int j, const int i) { - if (dudt.IsAllocated(m, l) && vin.IsAllocated(m, l)) { - const auto &coords = vin.GetCoords(m); - const auto &v = vin(m); - dudt(m, l, k, j, i) = FluxDivHelper(l, k, j, i, ndim, coords, v); + const auto &coords = v.GetCoordinates(m); + Real du = (coords.FaceArea(k, j, i + 1) * v.flux(m, X1DIR, l, k, j, i + 1) - + coords.FaceArea(k, j, i) * v.flux(m, X1DIR, l, k, j, i)); + if (ndim >= 2) { + du += (coords.FaceArea(k, j + 1, i) * v.flux(m, X2DIR, l, k, j + 1, i) - + coords.FaceArea(k, j, i) * v.flux(m, X2DIR, l, k, j, i)); + } + if (ndim == 3) { + du += (coords.FaceArea(k + 1, j, i) * v.flux(m, X3DIR, l, k + 1, j, i) - + coords.FaceArea(k, j, i) * v.flux(m, X3DIR, l, k, j, i)); } + dudt(m, l, k, j, i) = -du / coords.CellVolume(k, j, i); }); return TaskStatus::complete; } diff --git a/src/interface/update.hpp b/src/interface/update.hpp index 21f035caefa3..79f660ee6ff1 100644 --- a/src/interface/update.hpp +++ b/src/interface/update.hpp @@ -71,19 +71,21 @@ template TaskStatus WeightedSumData(const F &flags, T *in1, T *in2, const Real w1, const Real w2, T *out) { PARTHENON_INSTRUMENT - const auto &x = in1->PackVariables(flags); - const auto &y = in2->PackVariables(flags); - const auto &z = out->PackVariables(flags); + auto pm = in1->GetMeshPointer(); + auto desc = MakePackDescriptor(pm->resolved_packages.get(), flags); + auto x = desc.GetPack(in1); + auto y = desc.GetPack(in2); + auto z = desc.GetPack(out); + auto num_var = x.GetMaxNumberOfVars(); + auto num_blocks = x.GetNBlocks(); + auto ib = in1->GetBoundsI(IndexDomain::entire); + auto jb = in1->GetBoundsJ(IndexDomain::entire); + auto kb = in1->GetBoundsK(IndexDomain::entire); parthenon::par_for( - DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, x.GetDim(5) - 1, 0, - x.GetDim(4) - 1, 0, x.GetDim(3) - 1, 0, x.GetDim(2) - 1, 0, x.GetDim(1) - 1, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, num_blocks - 1, 0, + num_var - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { - // TOOD(someone) This is potentially dangerous and/or not intended behavior - // as we still may want to update (or populate) z if any of those vars are - // not allocated yet. - if (x.IsAllocated(b, l) && y.IsAllocated(b, l) && z.IsAllocated(b, l)) { - z(b, l, k, j, i) = w1 * x(b, l, k, j, i) + w2 * y(b, l, k, j, i); - } + z(b, l, k, j, i) = w1 * x(b, l, k, j, i) + w2 * y(b, l, k, j, i); }); return TaskStatus::complete; } diff --git a/src/interface/variable_pack.hpp b/src/interface/variable_pack.hpp index 924d15166aaf..9fd87cfbb566 100644 --- a/src/interface/variable_pack.hpp +++ b/src/interface/variable_pack.hpp @@ -716,6 +716,7 @@ template VariableFluxPack MakeFluxPack(const VarListWithKeys &var_list, const VarListWithKeys &flux_var_list, PackIndexMap *pvmap) { + printf("Making a FluxPack\n"); const auto &vars = var_list.vars(); // for convenience const auto &flux_vars = flux_var_list.vars(); // for convenience @@ -777,6 +778,7 @@ VariableFluxPack MakeFluxPack(const VarListWithKeys &var_list, template VariablePack MakePack(const VarListWithKeys &var_list, bool coarse, PackIndexMap *pvmap) { + printf("Making a Pack\n"); const auto &vars = var_list.vars(); // for convenience if (vars.empty()) { diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index c2cc09ce879f..6a2a5e40c033 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -43,9 +43,12 @@ using DevExecSpace = Kokkos::Cuda; #else using DevMemSpace = Kokkos::DefaultExecutionSpace::memory_space; using HostMemSpace = Kokkos::HostSpace; -using DevExecSpace = Kokkos::DefaultExecutionSpace; +using DevExecSpace_t = Kokkos::DefaultExecutionSpace; +inline Kokkos::Serial DevExecSpace() { + return t_exec_space; +} #endif -using ScratchMemSpace = DevExecSpace::scratch_memory_space; +using ScratchMemSpace = DevExecSpace_t::scratch_memory_space; using HostExecSpace = Kokkos::DefaultHostExecutionSpace; using LayoutWrapper = Kokkos::LayoutRight; @@ -214,10 +217,21 @@ inline void kokkos_dispatch(ParallelScanDispatch, Args &&...args) { } // namespace dispatch_impl +class ExecSpace { + public: + explicit ExecSpace(const int npartitions) { + std::vector weights(npartitions, 1); + partitions = Kokkos::Experimental::partition_space(DevExecSpace_t(), weights); + } + Kokkos::DefaultExecutionSpace& operator[](const int i) { return partitions[i]; } + private: + std::vector partitions; +}; + // 1D loop using RangePolicy loops template inline typename std::enable_if::type -par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_space, +par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace_t exec_space, const int &il, const int &iu, const Function &function, Args &&...args) { Tag tag; kokkos_dispatch(tag, name, @@ -230,7 +244,7 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp // 2D loop using MDRange loops template inline typename std::enable_if::type -par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, +par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace_t exec_space, const int jl, const int ju, const int il, const int iu, const Function &function, Args &&...args) { Tag tag; @@ -245,7 +259,7 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac // 3D loop using Kokkos 1D Range template inline typename std::enable_if::type -par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_space, +par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace_t exec_space, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function, Args &&...args) { Tag tag; @@ -271,7 +285,7 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp // 3D loop using MDRange loops template inline typename std::enable_if::type -par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, +par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace_t exec_space, const int &kl, const int &ku, const int &jl, const int &ju, const int &il, const int &iu, const Function &function, Args &&...args) { Tag tag; @@ -287,7 +301,7 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac // 3D loop using TeamPolicy with single inner TeamThreadRange template inline void par_dispatch(LoopPatternTPTTR, const std::string &name, - DevExecSpace exec_space, const int &kl, const int &ku, + DevExecSpace_t exec_space, const int &kl, const int &ku, const int &jl, const int &ju, const int &il, const int &iu, const Function &function) { const int Nk = ku - kl + 1; @@ -306,7 +320,7 @@ inline void par_dispatch(LoopPatternTPTTR, const std::string &name, // 3D loop using TeamPolicy with single inner ThreadVectorRange template inline void par_dispatch(LoopPatternTPTVR, const std::string &name, - DevExecSpace exec_space, const int &kl, const int &ku, + DevExecSpace_t exec_space, const int &kl, const int &ku, const int &jl, const int &ju, const int &il, const int &iu, const Function &function) { // TODO(pgrete) if exec space is Cuda,throw error @@ -326,7 +340,7 @@ inline void par_dispatch(LoopPatternTPTVR, const std::string &name, // 3D loop using TeamPolicy with nested TeamThreadRange and ThreadVectorRange template inline void par_dispatch(LoopPatternTPTTRTVR, const std::string &name, - DevExecSpace exec_space, const int &kl, const int &ku, + DevExecSpace_t exec_space, const int &kl, const int &ku, const int &jl, const int &ju, const int &il, const int &iu, const Function &function) { const int Nk = ku - kl + 1; @@ -345,7 +359,7 @@ inline void par_dispatch(LoopPatternTPTTRTVR, const std::string &name, // 3D loop using SIMD FOR loops template inline void par_dispatch(LoopPatternSimdFor, const std::string &name, - DevExecSpace exec_space, const int &kl, const int &ku, + DevExecSpace_t exec_space, const int &kl, const int &ku, const int &jl, const int &ju, const int &il, const int &iu, const Function &function) { PARTHENON_INSTRUMENT_REGION(name) @@ -359,7 +373,7 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, // 4D loop using Kokkos 1D Range template inline typename std::enable_if::type -par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_space, +par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace_t exec_space, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function, Args &&...args) { @@ -390,7 +404,7 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp // 4D loop using MDRange loops template inline typename std::enable_if::type -par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, +par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace_t exec_space, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function, Args &&...args) { @@ -407,7 +421,7 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac // 4D loop using TeamPolicy loop with inner TeamThreadRange template inline void par_dispatch(LoopPatternTPTTR, const std::string &name, - DevExecSpace exec_space, const int nl, const int nu, + DevExecSpace_t exec_space, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function) { const int Nn = nu - nl + 1; @@ -431,7 +445,7 @@ inline void par_dispatch(LoopPatternTPTTR, const std::string &name, // 4D loop using TeamPolicy loop with inner ThreadVectorRange template inline void par_dispatch(LoopPatternTPTVR, const std::string &name, - DevExecSpace exec_space, const int nl, const int nu, + DevExecSpace_t exec_space, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function) { // TODO(pgrete) if exec space is Cuda,throw error @@ -456,7 +470,7 @@ inline void par_dispatch(LoopPatternTPTVR, const std::string &name, // 4D loop using TeamPolicy with nested TeamThreadRange and ThreadVectorRange template inline void par_dispatch(LoopPatternTPTTRTVR, const std::string &name, - DevExecSpace exec_space, const int nl, const int nu, + DevExecSpace_t exec_space, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function) { const int Nn = nu - nl + 1; @@ -478,7 +492,7 @@ inline void par_dispatch(LoopPatternTPTTRTVR, const std::string &name, // 4D loop using SIMD FOR loops template inline void par_dispatch(LoopPatternSimdFor, const std::string &name, - DevExecSpace exec_space, const int nl, const int nu, + DevExecSpace_t exec_space, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function) { PARTHENON_INSTRUMENT_REGION(name) @@ -493,7 +507,7 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, // 5D loop using MDRange loops template inline typename std::enable_if::type -par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, +par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace_t exec_space, const int ml, const int mu, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function, Args &&...args) { @@ -511,7 +525,7 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac // 5D loop using Kokkos 1D Range template inline void par_dispatch(LoopPatternFlatRange, const std::string &name, - DevExecSpace exec_space, const int bl, const int bu, + DevExecSpace_t exec_space, const int bl, const int bu, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function) { @@ -544,7 +558,7 @@ inline void par_dispatch(LoopPatternFlatRange, const std::string &name, // 5D loop using SIMD FOR loops template inline void par_dispatch(LoopPatternSimdFor, const std::string &name, - DevExecSpace exec_space, const int bl, const int bu, + DevExecSpace_t exec_space, const int bl, const int bu, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function) { @@ -561,7 +575,7 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, // 6D loop using MDRange loops template inline typename std::enable_if::type -par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, +par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace_t exec_space, const int ll, const int lu, const int ml, const int mu, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function, Args &&...args) { @@ -579,7 +593,7 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac // 6D loop using Kokkos 1D Range template inline void par_dispatch(LoopPatternFlatRange, const std::string &name, - DevExecSpace exec_space, const int ll, const int lu, + DevExecSpace_t exec_space, const int ll, const int lu, const int ml, const int mu, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function) { @@ -616,7 +630,7 @@ inline void par_dispatch(LoopPatternFlatRange, const std::string &name, // 6D loop using SIMD FOR loops template inline void par_dispatch(LoopPatternSimdFor, const std::string &name, - DevExecSpace exec_space, const int ll, const int lu, + DevExecSpace_t exec_space, const int ll, const int lu, const int ml, const int mu, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function) { @@ -649,7 +663,7 @@ inline void par_scan(Args &&...args) { // 1D outer parallel loop using Kokkos Teams template inline void par_for_outer(OuterLoopPatternTeams, const std::string &name, - DevExecSpace exec_space, size_t scratch_size_in_bytes, + DevExecSpace_t exec_space, size_t scratch_size_in_bytes, const int scratch_level, const int kl, const int ku, const Function &function) { const int Nk = ku + 1 - kl; @@ -668,7 +682,7 @@ inline void par_for_outer(OuterLoopPatternTeams, const std::string &name, // 2D outer parallel loop using Kokkos Teams template inline void par_for_outer(OuterLoopPatternTeams, const std::string &name, - DevExecSpace exec_space, size_t scratch_size_in_bytes, + DevExecSpace_t exec_space, size_t scratch_size_in_bytes, const int scratch_level, const int kl, const int ku, const int jl, const int ju, const Function &function) { const int Nk = ku + 1 - kl; @@ -690,7 +704,7 @@ inline void par_for_outer(OuterLoopPatternTeams, const std::string &name, // 3D outer parallel loop using Kokkos Teams template inline void par_for_outer(OuterLoopPatternTeams, const std::string &name, - DevExecSpace exec_space, size_t scratch_size_in_bytes, + DevExecSpace_t exec_space, size_t scratch_size_in_bytes, const int scratch_level, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const Function &function) { @@ -715,6 +729,20 @@ inline void par_for_outer(OuterLoopPatternTeams, const std::string &name, }); } +template +inline void par_for_outer_stupid(OuterLoopPatternTeams, const std::string &name, + DevExecSpace_t exec_space, const int nl, const int nu, + const int kl, const int ku, const int jl, const int ju, + const Function &function) { + for (int n = nl; n <= nu; n++) { + for (int k = kl; k <= ku; k++) { + for (int j = jl; j <= ju; j++) { + function(n, k, j); + } + } + } +} + // Inner parallel loop using TeamThreadRange template KOKKOS_FORCEINLINE_FUNCTION void @@ -944,7 +972,7 @@ struct DeviceDeleter { } }; -template +template std::unique_ptr> DeviceAllocate() { static_assert(std::is_trivially_destructible::value, "DeviceAllocate only supports trivially destructible classes!"); @@ -957,7 +985,7 @@ std::unique_ptr> DeviceAllocate() { return up; } -template +template std::unique_ptr> DeviceCopy(const T &host_object) { static_assert(std::is_trivially_destructible::value, "DeviceCopy only supports trivially destructible classes!"); diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp index 6359dbee3e89..92b07f283765 100644 --- a/src/mesh/amr_loadbalance.cpp +++ b/src/mesh/amr_loadbalance.cpp @@ -42,6 +42,7 @@ #include "parthenon_arrays.hpp" #include "utils/buffer_utils.hpp" #include "utils/error_checking.hpp" +#include "utils/thread_pool.hpp" namespace parthenon { @@ -650,6 +651,12 @@ bool Mesh::GatherCostListAndCheckBalance() { return true; } +void thread_safe_insert(std::unordered_set &myset, LogicalLocation &myval) { + static std::mutex mutex; + std::lock_guard lock(mutex); + myset.insert(myval); +} + //---------------------------------------------------------------------------------------- // \!fn void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot) // \brief redistribute MeshBlocks according to the new load balance @@ -682,8 +689,10 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput PARTHENON_INSTRUMENT newloc = forest.GetMeshBlockListAndResolveGids(); nbtotal = newloc.size(); - for (int ib = 0; ib < nbtotal; ++ib) - newtoold[ib] = forest.GetOldGid(newloc[ib]); + + ThreadPoolLoopBlocking(*pool, 0, nbtotal-1, [&](const int i) { + newtoold[i] = forest.GetOldGid(newloc[i]); + }); // create a list mapping the previous gid to the current one oldtonew[0] = 0; @@ -701,17 +710,19 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput for (; mb_idx < nbtold; mb_idx++) oldtonew[mb_idx] = ntot - 1; - current_level = 0; - for (int n = 0; n < ntot; n++) { + //current_level = 0; + //for (int n = 0; n < ntot; n++) { + current_level = ThreadPoolReduce(*pool, 0, ntot-1, [&](const int n) { // "on" = "old n" = "old gid" = "old global MeshBlock ID" int on = newtoold[n]; - if (newloc[n].level() > current_level) // set the current max level - current_level = newloc[n].level(); + //if (newloc[n].level() > current_level) // set the current max level + // current_level = newloc[n].level(); if (newloc[n].level() >= loclist[on].level()) { // same or refined newcost[n] = costlist[on]; // Keep a list of all blocks refined for below if (newloc[n].level() > loclist[on].level()) { - newly_refined.insert(newloc[n]); + thread_safe_insert(newly_refined, newloc[n]); + //newly_refined.insert(newloc[n]); } } else { double acost = 0.0; @@ -719,7 +730,8 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput acost += costlist[on + l]; newcost[n] = acost / nleaf; } - } + return newloc[n].level(); + }, [](int a, int b) { return std::max(a,b); }, int(0)); } // Construct new list region // Calculate new load balance @@ -729,30 +741,39 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput int nbe = nbs + nblist[Globals::my_rank] - 1; // Restrict fine to coarse buffers - ProResCache_t restriction_cache; - int nrestrict = 0; - for (int on = onbs; on <= onbe; on++) { - int nn = oldtonew[on]; - auto pmb = FindMeshBlock(on); - if (newloc[nn].level() < loclist[on].level()) nrestrict += pmb->vars_cc_.size(); - } - restriction_cache.Initialize(nrestrict, resolved_packages.get()); - int irestrict = 0; - for (int on = onbs; on <= onbe; on++) { - int nn = oldtonew[on]; - if (newloc[nn].level() < loclist[on].level()) { + auto restrict_fine_to_coarse_buffers = [&](const int t_onbs, const int t_onbe) { + ProResCache_t restriction_cache; + int nrestrict = 0; + for (int on = t_onbs; on <= t_onbe; on++) { + int nn = oldtonew[on]; auto pmb = FindMeshBlock(on); - for (auto &var : pmb->vars_cc_) { - restriction_cache.RegisterRegionHost( - irestrict++, ProResInfo::GetInteriorRestrict(pmb.get(), NeighborBlock(), var), - var.get(), resolved_packages.get()); + if (newloc[nn].level() < loclist[on].level()) nrestrict += pmb->vars_cc_.size(); + } + + restriction_cache.Initialize(nrestrict, resolved_packages.get()); + int irestrict = 0; + for (int on = t_onbs; on <= t_onbe; on++) { + int nn = oldtonew[on]; + if (newloc[nn].level() < loclist[on].level()) { + auto pmb = FindMeshBlock(on); + for (auto &var : pmb->vars_cc_) { + restriction_cache.RegisterRegionHost( + irestrict++, ProResInfo::GetInteriorRestrict(pmb.get(), NeighborBlock(), var), + var.get(), resolved_packages.get()); + } } } + restriction_cache.CopyToDevice(); + refinement::Restrict(resolved_packages.get(), restriction_cache, + block_list[0]->cellbounds, block_list[0]->c_cellbounds); + }; + std::vector onstart, onstop; + ThreadLoopBounds(*pool, onbs, onbe, onstart, onstop); + for (int it = 0; it < pool->size(); it++) { + pool->enqueue(restrict_fine_to_coarse_buffers, onstart[it], onstop[it]); } - restriction_cache.CopyToDevice(); - refinement::Restrict(resolved_packages.get(), restriction_cache, - block_list[0]->cellbounds, block_list[0]->c_cellbounds); - + //pool->run(); + pool->wait(); Kokkos::fence(); #ifdef MPI_PARALLEL @@ -795,6 +816,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput RegionSize block_size = GetBlockSize(); for (int n = nbs; n <= nbe; n++) { + //ThreadPoolLoopBlocking(*pool, nbs, nbe, [&, this](const int n) { int on = newtoold[n]; if ((ranklist[on] == Globals::my_rank) && (loclist[on].level() == newloc[n].level())) { @@ -816,7 +838,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput MeshBlock::Make(n, n - nbs, newloc[n], block_size, block_bcs, this, pin, app_in, packages, resolved_packages, gflag); } - } + }//); } // AMR Construct new MeshBlockList region // Replace the MeshBlock list diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp index d8630d98dcb2..c7a77b17dae5 100644 --- a/src/mesh/mesh.cpp +++ b/src/mesh/mesh.cpp @@ -43,6 +43,7 @@ #include "globals.hpp" #include "interface/state_descriptor.hpp" #include "interface/update.hpp" +#include "kokkos_abstraction.hpp" #include "mesh/mesh.hpp" #include "mesh/mesh_refinement.hpp" #include "mesh/meshblock.hpp" @@ -103,7 +104,8 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages, // private members: num_mesh_threads_(pin->GetOrAddInteger("parthenon/mesh", "num_threads", 1)), use_uniform_meshgen_fn_{true, true, true, true}, lb_flag_(true), lb_automatic_(), - lb_manual_(), MeshBndryFnctn{nullptr, nullptr, nullptr, nullptr, nullptr, nullptr} { + lb_manual_(), MeshBndryFnctn{nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}, + pool(std::make_shared(pin->GetOrAddInteger("parthenon/execution", "nthreads", 1))) { std::stringstream msg; RegionSize block_size; BoundaryFlag block_bcs[6]; @@ -479,7 +481,8 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, RestartReader &rr, // private members: num_mesh_threads_(pin->GetOrAddInteger("parthenon/mesh", "num_threads", 1)), use_uniform_meshgen_fn_{true, true, true, true}, lb_flag_(true), lb_automatic_(), - lb_manual_(), MeshBndryFnctn{nullptr, nullptr, nullptr, nullptr, nullptr, nullptr} { + lb_manual_(), MeshBndryFnctn{nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}, + pool(std::make_shared(pin->GetOrAddInteger("parthenon/execution", "nthreads", 1))) { std::stringstream msg; RegionSize block_size; BoundaryFlag block_bcs[6]; diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp index 81a2e9673be6..2a5cf3832bec 100644 --- a/src/mesh/mesh.hpp +++ b/src/mesh/mesh.hpp @@ -54,6 +54,7 @@ #include "utils/hash.hpp" #include "utils/object_pool.hpp" #include "utils/partition_stl_containers.hpp" +#include "utils/thread_pool.hpp" namespace parthenon { @@ -259,6 +260,9 @@ class Mesh { return resolved_packages->GetVariableNames(std::forward(args)...); } + std::shared_ptr pool; + void SetThreadPool(std::shared_ptr p) { pool = p; } + private: // data int root_level, max_level, current_level; diff --git a/src/mesh/meshblock.hpp b/src/mesh/meshblock.hpp index 54e11bcff119..cef22be3f9d6 100644 --- a/src/mesh/meshblock.hpp +++ b/src/mesh/meshblock.hpp @@ -84,7 +84,7 @@ class MeshBlock : public std::enable_shared_from_this { int igflag, double icost = 1.0); // Kokkos execution space for this MeshBlock - DevExecSpace exec_space; + DevExecSpace_t exec_space; // data Mesh *pmy_mesh = nullptr; // ptr to Mesh containing this MeshBlock diff --git a/src/parthenon_manager.hpp b/src/parthenon_manager.hpp index 4dc31f169b1f..516b054fc568 100644 --- a/src/parthenon_manager.hpp +++ b/src/parthenon_manager.hpp @@ -24,6 +24,7 @@ #include "driver/driver.hpp" #include "interface/state_descriptor.hpp" #include "interface/swarm.hpp" +#include "kokkos_abstraction.hpp" #include "mesh/domain.hpp" #include "mesh/mesh.hpp" #include "outputs/restart.hpp" @@ -36,7 +37,7 @@ enum class ParthenonStatus { ok, complete, error }; class ParthenonManager { public: - ParthenonManager() { app_input.reset(new ApplicationInput()); } + ParthenonManager() { t_exec_space = DevExecSpace_t(); app_input.reset(new ApplicationInput()); } ParthenonStatus ParthenonInitEnv(int argc, char *argv[]); void ParthenonInitPackagesAndMesh(); ParthenonStatus ParthenonFinalize(); diff --git a/src/reconstruct/dc_inline.hpp b/src/reconstruct/dc_inline.hpp index bd8401f8ab54..c0dd180da997 100644 --- a/src/reconstruct/dc_inline.hpp +++ b/src/reconstruct/dc_inline.hpp @@ -33,13 +33,12 @@ template KOKKOS_FORCEINLINE_FUNCTION void DonorCellX1(parthenon::team_mbr_t const &member, const int k, const int j, const int il, const int iu, const T &q, ScratchPad2D &ql, ScratchPad2D &qr) { - const int nu = q.GetDim(4) - 1; + const int nu = q.GetMaxNumberOfVars() - 1; // compute L/R states for each variable for (int n = 0; n <= nu; ++n) { - if (!q.IsAllocated(n)) continue; parthenon::par_for_inner( - member, il, iu, [&](const int i) { ql(n, i + 1) = qr(n, i) = q(n, k, j, i); }); + member, il, iu, [&](const int i) { ql(n, i + 1) = qr(n, i) = q(0, n, k, j, i); }); } } @@ -50,12 +49,11 @@ template KOKKOS_FORCEINLINE_FUNCTION void DonorCellX2(parthenon::team_mbr_t const &member, const int k, const int j, const int il, const int iu, const T &q, ScratchPad2D &ql, ScratchPad2D &qr) { - const int nu = q.GetDim(4) - 1; + const int nu = q.GetMaxNumberOfVars() - 1; // compute L/R states for each variable for (int n = 0; n <= nu; ++n) { - if (!q.IsAllocated(n)) continue; parthenon::par_for_inner(member, il, iu, - [&](const int i) { ql(n, i) = qr(n, i) = q(n, k, j, i); }); + [&](const int i) { ql(n, i) = qr(n, i) = q(0, n, k, j, i); }); } } @@ -66,12 +64,11 @@ template KOKKOS_FORCEINLINE_FUNCTION void DonorCellX3(parthenon::team_mbr_t const &member, const int k, const int j, const int il, const int iu, const T &q, ScratchPad2D &ql, ScratchPad2D &qr) { - const int nu = q.GetDim(4) - 1; + const int nu = q.GetMaxNumberOfVars() - 1; // compute L/R states for each variable for (int n = 0; n <= nu; ++n) { - if (!q.IsAllocated(n)) continue; parthenon::par_for_inner(member, il, iu, - [&](const int i) { ql(n, i) = qr(n, i) = q(n, k, j, i); }); + [&](const int i) { ql(n, i) = qr(n, i) = q(0, n, k, j, i); }); } } } // namespace parthenon diff --git a/src/tasks/tasks.hpp b/src/tasks/tasks.hpp index 62b2d75a2ec2..d2f626582f92 100644 --- a/src/tasks/tasks.hpp +++ b/src/tasks/tasks.hpp @@ -27,8 +27,8 @@ #include #include -#include "thread_pool.hpp" #include "utils/error_checking.hpp" +#include "utils/thread_pool.hpp" namespace parthenon { @@ -395,8 +395,8 @@ class TaskRegion { TaskListStatus Execute(ThreadPool &pool) { // for now, require a pool with one thread - PARTHENON_REQUIRE_THROWS(pool.size() == 1, - "ThreadPool size != 1 is not currently supported.") + //PARTHENON_REQUIRE_THROWS(pool.size() == 1, + //"ThreadPool size != 1 is not currently supported.") // first, if needed, finish building the graph if (!graph_built) BuildGraph(); @@ -420,6 +420,9 @@ class TaskRegion { pool.enqueue([t, &ProcessTask]() { return ProcessTask(t); }); } + // and run it + //pool.run(); + // then wait until everything is done pool.wait(); diff --git a/src/tasks/thread_pool.hpp b/src/tasks/thread_pool.hpp deleted file mode 100644 index b8f526750230..000000000000 --- a/src/tasks/thread_pool.hpp +++ /dev/null @@ -1,139 +0,0 @@ -//======================================================================================== -// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved. -// -// This program was produced under U.S. Government contract 89233218CNA000001 for Los -// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC -// for the U.S. Department of Energy/National Nuclear Security Administration. All rights -// in the program are reserved by Triad National Security, LLC, and the U.S. Department -// of Energy/National Nuclear Security Administration. The Government is granted for -// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide -// license in this material to reproduce, prepare derivative works, distribute copies to -// the public, perform publicly and display publicly, and to permit others to do so. -//======================================================================================== - -#ifndef TASKS_THREAD_POOL_HPP_ -#define TASKS_THREAD_POOL_HPP_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace parthenon { - -template -class ThreadQueue { - public: - explicit ThreadQueue(const int num_workers) : nworkers(num_workers), nwaiting(0) {} - void push(T q) { - std::lock_guard lock(mutex); - queue.push(q); - cv.notify_one(); - } - bool pop(T &q) { - std::unique_lock lock(mutex); - if (queue.empty()) { - nwaiting++; - if (waiting && nwaiting == nworkers) { - complete = true; - complete_cv.notify_all(); - } - cv.wait(lock, [this]() { return exit || !queue.empty(); }); - nwaiting--; - if (exit) return true; - } - q = queue.front(); - queue.pop(); - return false; - } - void signal_kill() { - std::lock_guard lock(mutex); - std::queue().swap(queue); - complete = true; - exit = true; - cv.notify_all(); - } - void signal_exit_when_finished() { - std::lock_guard lock(mutex); - exit = true; - complete = true; - cv.notify_all(); - } - void wait_for_complete() { - std::unique_lock lock(mutex); - waiting = true; - if (queue.empty() && nwaiting == nworkers) { - complete = false; - waiting = false; - return; - } - complete_cv.wait(lock, [this]() { return complete; }); - complete = false; - waiting = false; - } - - private: - const int nworkers; - int nwaiting; - std::queue queue; - std::mutex mutex; - std::condition_variable cv; - std::condition_variable complete_cv; - bool complete = false; - bool exit = false; - bool waiting = false; -}; - -class ThreadPool { - public: - explicit ThreadPool(const int numthreads = std::thread::hardware_concurrency()) - : nthreads(numthreads), queue(nthreads) { - for (int i = 0; i < nthreads; i++) { - auto worker = [&]() { - while (true) { - std::function f; - auto stop = queue.pop(f); - if (stop) break; - if (f) f(); - } - }; - threads.emplace_back(worker); - } - } - ~ThreadPool() { - queue.signal_exit_when_finished(); - for (auto &t : threads) { - t.join(); - } - } - - void wait() { queue.wait_for_complete(); } - - void kill() { queue.signal_kill(); } - - template - std::future::type> enqueue(F &&f, Args &&...args) { - using return_t = typename std::result_of::type; - auto task = std::make_shared>( - [=, func = std::forward(f)] { return func(std::forward(args)...); }); - std::future result = task->get_future(); - queue.push([task]() { (*task)(); }); - return result; - } - - int size() const { return nthreads; } - - private: - const int nthreads; - std::vector threads; - ThreadQueue> queue; -}; - -} // namespace parthenon - -#endif // TASKS_THREAD_POOL_HPP_ diff --git a/src/utils/communication_buffer.hpp b/src/utils/communication_buffer.hpp index edfc04f11c77..eee7ab563fe3 100644 --- a/src/utils/communication_buffer.hpp +++ b/src/utils/communication_buffer.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -37,6 +38,32 @@ namespace parthenon { // received: X enum class BufferState { stale, sending, sending_null, received, received_null }; +class BufferStateThreadSafe { + public: + BufferStateThreadSafe() = default; + BufferStateThreadSafe(BufferState bs) : state_(bs) {} + BufferStateThreadSafe(const BufferStateThreadSafe &bs) = delete; + BufferStateThreadSafe &operator =(BufferState bs) { + mutex.lock(); + state_ = bs; + mutex.unlock(); + return *this; + } + bool operator==(BufferState bs) { + return (Get() == bs); + } + BufferState Get() { + mutex.lock_shared(); + auto val = state_; + mutex.unlock_shared(); + return val; + } + + private: + BufferState state_; + std::shared_mutex mutex; +}; + enum class BuffCommType { sender, receiver, both, sparse_receiver }; template @@ -46,7 +73,7 @@ class CommBuffer { template friend class CommBuffer; - std::shared_ptr state_; + std::shared_ptr state_; std::shared_ptr comm_type_; std::shared_ptr started_irecv_; std::shared_ptr nrecv_tries_; @@ -65,6 +92,7 @@ class CommBuffer { std::function get_resource_; T buf_; + inline static std::mutex mutex; public: CommBuffer() @@ -81,6 +109,9 @@ class CommBuffer { ~CommBuffer(); + //CommBuffer(const CommBuffer &in) = delete; + //CommBuffer &operator=(const CommBuffer &in) = delete; + template CommBuffer(const CommBuffer &in); @@ -95,19 +126,21 @@ class CommBuffer { void Allocate() { if (!active_) { + std::lock_guard lock(mutex); buf_ = get_resource_(); active_ = true; } } void Free() { + std::lock_guard lock(mutex); buf_ = T(); active_ = false; } bool IsActive() const { return active_; } - BufferState GetState() { return *state_; } + BufferState GetState() { return state_->Get(); } void Send() noexcept; void SendNull() noexcept; @@ -132,7 +165,7 @@ class CommBuffer { template CommBuffer::CommBuffer(int tag, int send_rank, int recv_rank, mpi_comm_t comm, std::function get_resource, bool do_sparse_allocation) - : state_(std::make_shared(BufferState::stale)), + : state_(std::make_shared(BufferState::stale)), comm_type_(std::make_shared(BuffCommType::both)), started_irecv_(std::make_shared(false)), nrecv_tries_(std::make_shared(0)), diff --git a/src/utils/sort.hpp b/src/utils/sort.hpp index 97e9c77a88e4..c92e80865621 100644 --- a/src/utils/sort.hpp +++ b/src/utils/sort.hpp @@ -75,7 +75,7 @@ void sort(ParArray1D data, KeyComparator comparator, size_t min_idx, thrust::sort(first_d, last_d, comparator); #endif #else - if (std::is_same::value) { + if (std::is_same::value) { std::sort(data.data() + min_idx, data.data() + max_idx + 1, comparator); } else { PARTHENON_FAIL("sort is not supported outside of CPU or NVIDIA GPU. If you need sort " @@ -103,7 +103,7 @@ void sort(ParArray1D data, size_t min_idx, size_t max_idx) { thrust::sort(first_d, last_d); #endif #else - if (std::is_same::value) { + if (std::is_same::value) { std::sort(data.data() + min_idx, data.data() + max_idx + 1); } else { PARTHENON_FAIL("sort is not supported outside of CPU or NVIDIA GPU. If you need sort " diff --git a/src/utils/thread_pool.hpp b/src/utils/thread_pool.hpp new file mode 100644 index 000000000000..6bfeddf6e611 --- /dev/null +++ b/src/utils/thread_pool.hpp @@ -0,0 +1,595 @@ +//======================================================================================== +// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved. +// +// This program was produced under U.S. Government contract 89233218CNA000001 for Los +// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC +// for the U.S. Department of Energy/National Nuclear Security Administration. All rights +// in the program are reserved by Triad National Security, LLC, and the U.S. Department +// of Energy/National Nuclear Security Administration. The Government is granted for +// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldmy_tide +// license in this material to reproduce, prepare derivative works, distribute copies to +// the public, perform publicly and display publicly, and to permit others to do so. +//======================================================================================== + +#ifndef UTILS_THREAD_POOL_HPP_ +#define UTILS_THREAD_POOL_HPP_ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "basic_types.hpp" +#include "kokkos_abstraction.hpp" + +/* +#include +#include +#include +#include +#include +#include +#include +#include +#include +*/ + +namespace parthenon { + +inline thread_local int my_tid; + +/* +template +class ConcurrentQueue { + public: + explicit ConcurrentQueue(const int nthreads) + : built(false), main_waiting(false), exit(false), nqueues(nthreads+1), nsleep(0), + mutex(nqueues), queue(nqueues) {} + + void push(T q) { + { + std::lock_guard lock(mutex[my_tid]); + queue[my_tid].push(q); + } + nqueued++; + if (nsleep.load() > 0) { + std::lock_guard lock(complete_mutex); + sleep_cv.notify_all(); + } + } + + void push(int i, T q) { + { + std::lock_guard lock(mutex[i]); + queue[i].push(q); + } + nqueued++; + if (nsleep.load() > 0) { + std::lock_guard lock(complete_mutex); + sleep_cv.notify_all(); + } + } + + void push_lazy(T q) { + assert(my_tid == 0); + queue[0].push(q); + nqueued++; + } + void push_lazy(int i, T q) { + queue[i].push(q); + nqueued++; + } + + void go() { + sleep_cv.notify_all(); + } + + void pop(T &q) { + if (mutex[my_tid].try_lock()) { + if (!queue[my_tid].empty()) { + q = queue[my_tid].front(); + queue[my_tid].pop(); + mutex[my_tid].unlock(); + nqueued--; + return; + } + mutex[my_tid].unlock(); + } + for (int i = 0; i < nqueues; i++) { + if (mutex[i].try_lock()) { + if (!queue[i].empty()) { + q = queue[i].front(); + queue[i].pop(); + mutex[i].unlock(); + nqueued--; + return; + } + mutex[i].unlock(); + } + } + return; + } + + bool thread_sleep() { + nsleep++; + std::unique_lock lock(complete_mutex); + if (nsleep == nqueues - 1 && main_waiting && nqueued.load() == 0) complete_cv.notify_all(); + sleep_cv.wait(lock, [this]() { return nqueued.load() > 0 || exit; }); + nsleep--; + return exit && nqueued.load() == 0; + } + + bool signal_if_complete() { + if (nqueued.load() > 0) return false; + bool should_exit = thread_sleep(); + return should_exit; + } + + void signal_exit_when_finished() { + std::unique_lock lock(complete_mutex); + exit = true; + sleep_cv.notify_all(); + } + + void wait_for_complete() { + std::unique_lock lock(complete_mutex); + main_waiting = true; + complete_cv.wait(lock, [this]() { return nqueued == 0; }); + } + + private: + bool built, main_waiting, exit; + const int nqueues; + std::atomic_int nsleep; + std::vector mutex; + std::vector> queue; + std::map worker_id; + std::mutex built_mutex; + std::condition_variable built_cv; + std::atomic_int nqueued; + std::shared_mutex complete_mutex; + std::condition_variable_any complete_cv; + std::condition_variable_any sleep_cv; +}; + + +class ThreadPool { + public: + explicit ThreadPool(const int numthreads = std::thread::hardware_concurrency()) + : nthreads(numthreads), main_id(std::this_thread::get_id()), thread_idx(0), queue(nthreads), exec_space(nthreads+1) { + + auto worker = [&](const int i) { + my_tid = i; + t_exec_space = exec_space[i]; + while (true) { + auto stop = queue.signal_if_complete(); + if(stop) break; + std::function f; + queue.pop(f); + if (f) f(); + } + }; + my_tid = 0; + t_exec_space = exec_space[0]; + threads.reserve(nthreads); + for (int i = 0; i < nthreads; i++) { + threads.emplace_back(worker, i+1); + } + //queue.BuildMap(threads); + } + ~ThreadPool() { + queue.signal_exit_when_finished(); + for (auto &t : threads) { + t.join(); + } + } + void wait() { + queue.wait_for_complete(); + } + template + std::future::type> enqueue(F &&f, Args... args) { + using return_t = typename std::invoke_result::type; + auto task = std::make_shared>( + [=, func = std::forward(f)] { return func(args...); }); + std::future result = task->get_future(); + if (std::this_thread::get_id() == main_id) { + queue.push(thread_idx + 1, [task]() { (*task)(); }); + thread_idx = (thread_idx + 1) % nthreads; + } else { + queue.push([task]() { (*task)(); }); + } + return result; + } + template + std::future::type> enqueue(const int tid, F &&f, Args... args) { + using return_t = typename std::invoke_result::type; + auto task = std::make_shared>( + [=, func = std::forward(f)] { return func(args...); }); + std::future result = task->get_future(); + queue.push(tid, [task]() { (*task)(); }); + return result; + } + template + std::future::type> enqueue_lazy(F &&f, Args... args) { + using return_t = typename std::invoke_result::type; + auto task = std::make_shared>( + [=, func = std::forward(f)] { return func(args...); }); + std::future result = task->get_future(); + if (std::this_thread::get_id() == main_id) { + queue.push_lazy(thread_idx + 1, [task]() { (*task)(); }); + thread_idx = (thread_idx + 1) % nthreads; + } else { + printf("THIS IS WRONG!\n"); + } + return result; + } + template + std::future::type> enqueue_lazy(const int tid, F &&f, Args... args) { + using return_t = typename std::invoke_result::type; + auto task = std::make_shared>( + [=, func = std::forward(f)] { return func(args...); }); + std::future result = task->get_future(); + queue.push_lazy(tid, [task]() { (*task)(); }); + return result; + } + + int size() const { return nthreads; } + void run() { queue.go(); } + private: + const int nthreads; + const std::thread::id main_id; + int thread_idx; + std::vector threads; + ConcurrentQueue> queue; + ExecSpace exec_space; +}; + +template +void ThreadPoolLoop(ThreadPool &pool, const int is, const int ie, F &&f) { + const int nthreads = pool.size(); + const double niter = (ie - is + 1) / (1.0*nthreads) + 1.e-10; + auto looper = [func = std::forward(f)](int t_is, int t_ie) { + for (int i = t_is; i <= t_ie; i++) func(i); + }; + for (int it = 0; it < nthreads; it++) { + int start = is + static_cast(it*niter); + int end = is + static_cast((it+1)*niter) - 1; + pool.enqueue_lazy(it+1, looper, start, end); + } + pool.run(); +} +template +void ThreadPoolLoopBlocking(ThreadPool &pool, Args &&... args) { + ThreadPoolLoop(pool, std::forward(args)...); + pool.wait(); +} + +template +reduce_t ThreadPoolReduce(ThreadPool &pool, const int is, const int ie, F &&f, Reducer &&r, reduce_t init_val) { + const int nthreads = pool.size(); + std::vector> futures(nthreads); + const double niter = (ie - is + 1) / (1.0*nthreads) + 1.e-10; + auto looper = [=, func = std::forward(f), red = std::forward(r)](int t_is, int t_ie) { + reduce_t rval = 0; + for (int i = t_is; i <= t_ie; i++) { + reduce_t val = func(i); + rval = red(rval, val); + } + return rval; + }; + for (int it = 0; it < nthreads; it++) { + int start = is + static_cast(it*niter); + int end = is + static_cast((it+1)*niter) - 1; + futures[it] = pool.enqueue_lazy(it + 1, looper, start, end); + } + pool.run(); + pool.wait(); + reduce_t reduced_val = init_val; + for (int it = 0; it < nthreads; it++) { + if (!futures[it].valid()) { + printf("Got an invalid future somehow!\n"); + } + reduced_val = r(reduced_val, futures[it].get()); + } + return reduced_val; +} + +inline +void ThreadLoopBounds(ThreadPool &pool, const int is, const int ie, std::vector &start, std::vector &stop) { + const int nthreads = pool.size(); + start.resize(nthreads); + stop.resize(nthreads); + const double niter = (ie - is + 1) / (1.0 * nthreads) + 1.e-10; + for (int it = 0; it < nthreads; it++) { + start[it] = is + static_cast(it*niter); + stop[it] = is + static_cast((it+1)*niter) - 1; + } +} +*/ + + + + + +template +class ThreadQueue { + public: + explicit ThreadQueue(const int num_workers) + : nworkers(num_workers), nwaiting(0), nqueued(0), nqueued0(0), queue(nworkers+1), + mutex(nworkers+1), cv_go(nworkers+1) {} + void push(T q) { + //std::lock_guard lock(mutex[my_tid]); + mutex[my_tid].lock(); + queue[my_tid].push(q); + mutex[my_tid].unlock(); + nqueued++; + std::lock_guard lock(mutex[0]); + cv.notify_one(); + //if (my_tid == 0) { + //nqueued0++; + //cv.notify_one(); + //} + } + //void push(int i, T q) { + //std::lock_guard lock(mutex[i]); + //queue[i].push(q); + //nqueued++; + //cv_go[i].notify_one(); + //} + void pop_tag(int tag, T &q) { + std::lock_guard lock(mutex[tag]); + if (queue[tag].empty()) return; + q = queue[tag].front(); + queue[tag].pop(); + nqueued--; + return; + } + bool pop(T &q) { + pop_tag(0, q); + if (q) return false; + pop_tag(my_tid, q); + if (q) return false; + + for (int i = 0; i <= nworkers; i++) { + if (mutex[i].try_lock()) { + if (!queue[i].empty()) { + q = queue[i].front(); + queue[i].pop(); + mutex[i].unlock(); + nqueued--; + return false; + } + mutex[i].unlock(); + } + } + + //printf("thread %d going into waiting...\n", my_tid); + + std::unique_lock lock(mutex[0]); + nwaiting++; + if (waiting && nwaiting.load() == nworkers && nqueued.load() == 0) { + complete = true; + complete_cv.notify_all(); + } + cv.wait(lock, [this]() { return exit || nqueued.load() > 0; }); + nwaiting--; + return exit; + } +/* + //std::unique_lock lock(mutex[my_tid]); + if (nqueued0.load() > 0) { + std::lock_guard lock(mutex[0]); + if (!queue[0].empty()) { + q = queue[0].front(); + queue[0].pop(); + nqueued--; + nqueued0--; + return false; + } + } + mutex[my_tid].lock(); + if (queue[my_tid].empty()) { + //mutex[my_tid].unlock(); + //std::unique_lock lock(mutex[0]); + //if (queue[0].empty()) { + nwaiting++; + if (waiting && nwaiting.load() == nworkers) { + complete = true; + complete_cv.notify_all(); + } + cv_go[my_tid].wait(lock, [this]() { return exit || !queue[my_tid].empty(); }); + nwaiting--; + if (exit) return true; + } + q = queue[my_tid].front(); + queue[my_tid].pop(); + mutex[my_tid].unlock(); + nqueued--; + return false; + } + */ + //void signal_kill() { + //std::lock_guard lock(mutex[0]); + //std::queue().swap(queue); + //complete = true; + //exit = true; + //cv.notify_all(); + //} + void signal_exit_when_finished() { + std::lock_guard lock(mutex[0]); + exit = true; + complete = true; + cv.notify_all(); + } + void wait_for_complete() { + std::unique_lock lock(mutex[0]); + waiting = true; + if (nqueued.load() == 0 && nwaiting.load() == nworkers) { + complete = false; + waiting = false; + return; + } + complete_cv.wait(lock, [this]() { return complete; }); + //printf("got complete!\n"); + complete = false; + waiting = false; + } + //size_t size() { + //std::lock_guard lock(mutex[0]); + //return queue.size(); + //} + + private: + const int nworkers; + std::atomic_int nwaiting; + std::atomic_int nqueued, nqueued0; + std::vector> queue; + std::vector mutex; + std::vector cv_go; + std::condition_variable cv; + std::condition_variable complete_cv; + bool complete = false; + bool exit = false; + bool waiting = false; +}; + +class ThreadPool { + public: + explicit ThreadPool(const int numthreads = std::thread::hardware_concurrency()) + : nthreads(numthreads), queue(nthreads), exec_space(nthreads+1) { + my_tid = 0; + t_exec_space = exec_space[0]; + for (int i = 0; i < nthreads; i++) { + auto worker = [&](const int i) { + my_tid = i; + t_exec_space = exec_space[i]; + while (true) { + std::function f; + auto stop = queue.pop(f); + if (stop) break; + if (f) f(); + } + }; + threads.emplace_back(worker, i+1); + } + } + ~ThreadPool() { + queue.signal_exit_when_finished(); + for (auto &t : threads) { + t.join(); + } + } + + void wait() { //queue.notify_all(); + queue.wait_for_complete(); } + + //void kill() { queue.signal_kill(); } + + template + std::future::type> enqueue(F &&f, Args... args) { + using return_t = typename std::invoke_result::type; + auto task = std::make_shared>( + [=, func = std::forward(f)] { return func(args...); }); + std::future result = task->get_future(); + queue.push([task]() { (*task)(); }); + return result; + } + + int size() const { return nthreads; } + + //size_t num_queued() { return queue.size(); } + + private: + const int nthreads; + std::vector threads; + ThreadQueue> queue; + ExecSpace exec_space; +}; + +template +void ThreadPoolLoop(ThreadPool &pool, const int is, const int ie, F &&f) { + const int nthreads = pool.size(); + const double niter = (ie - is + 1) / (1.0*nthreads) + 1.e-10; + auto looper = [func = std::forward(f)](int t_is, int t_ie) { + for (int i = t_is; i <= t_ie; i++) func(i); + }; + for (int it = 0; it < nthreads; it++) { + int start = is + static_cast(it*niter); + int end = is + static_cast((it+1)*niter) - 1; + pool.enqueue(looper, start, end); + } +} +template +void ThreadPoolLoopBlocking(ThreadPool &pool, Args &&... args) { + ThreadPoolLoop(pool, std::forward(args)...); + pool.wait(); +} + +template +reduce_t ThreadPoolReduce(ThreadPool &pool, const int is, const int ie, F &&f, Reducer &&r, reduce_t init_val) { + const int nthreads = pool.size(); + std::vector> futures(nthreads); + const double niter = (ie - is + 1) / (1.0*nthreads) + 1.e-10; + auto looper = [=, func = std::forward(f), red = std::forward(r)](int t_is, int t_ie) { + reduce_t rval = 0; + for (int i = t_is; i <= t_ie; i++) { + reduce_t val = func(i); + rval = red(rval, val); + } + return rval; + }; + for (int it = 0; it < nthreads; it++) { + int start = is + static_cast(it*niter); + int end = is + static_cast((it+1)*niter) - 1; + futures[it] = pool.enqueue(looper, start, end); + } + pool.wait(); + reduce_t reduced_val = init_val; + for (int it = 0; it < nthreads; it++) { + reduced_val = r(reduced_val, futures[it].get()); + } + return reduced_val; +} + +template +reduce_t ThreadPoolReduce2(ThreadPool &pool, const int is, const int ie, F &&f, Reducer &&r, reduce_t init_val) { + std::vector> futures(ie+1); + auto flb = [=, func = std::forward(f), red = std::forward(r)](int t_i) { + reduce_t rval = init_val; + return red(rval, func(t_i)); + }; + for (int it = is; it <= ie; it++) { + futures[it] = pool.enqueue(flb, it); + } + pool.wait(); + reduce_t reduced_val = init_val; + for (int it = is; it <= ie; it++) { + reduced_val = r(reduced_val, futures[it].get()); + } + return reduced_val; +} + +inline +void ThreadLoopBounds(ThreadPool &pool, const int is, const int ie, std::vector &start, std::vector &stop) { + const int nthreads = pool.size(); + start.resize(nthreads); + stop.resize(nthreads); + const double niter = (ie - is + 1) / (1.0 * nthreads) + 1.e-10; + for (int it = 0; it < nthreads; it++) { + start[it] = is + static_cast(it*niter); + stop[it] = is + static_cast((it+1)*niter) - 1; + } +} + + + +} // namespace parthenon + +#endif // TASKS_THREAD_POOL_HPP_ diff --git a/tst/performance/test_meshblock_data_iterator.cpp b/tst/performance/test_meshblock_data_iterator.cpp index 923899d26e26..f1004ff95888 100644 --- a/tst/performance/test_meshblock_data_iterator.cpp +++ b/tst/performance/test_meshblock_data_iterator.cpp @@ -76,7 +76,8 @@ void performance_test_wrapper(const std::string &test_name, InitFunc init_func, // we need to connect the MeshBlockData to a dummy mesh block, otherwise variables // won't be allocated -static MeshBlockData createTestContainer(std::shared_ptr &dummy_mb) { +static void createTestContainer(std::shared_ptr dummy_mb, + MeshBlockData &mbd) { // Make a container for testing performance std::vector scalar_shape{N, N, N}; std::vector vector_shape{N, N, N, 3}; @@ -93,10 +94,7 @@ static MeshBlockData createTestContainer(std::shared_ptr &dummy pkg->AddField("v4", m_in_vec); pkg->AddField("v5", m_in); - MeshBlockData mbd; mbd.Initialize(pkg, dummy_mb); - - return mbd; } // std::function createLambdaRaw(ParArrayND &raw_array) { @@ -125,7 +123,6 @@ std::function createLambdaContainer(MeshBlockData &container) { v(l, k, j, i) = static_cast((l + 1) * (k + 1) * (j + 1) * (i + 1)); }); } - return container; }; } @@ -142,7 +139,6 @@ std::function createLambdaContainerVar(MeshBlockData &container, data(l, k, j, i) = static_cast((l + 1) * (k + 1) * (j + 1) * (i + 1)); }); } - return container; }; } @@ -196,7 +192,8 @@ TEST_CASE("Catch2 Container Iterator Performance", SECTION("Iterate Variables") { GIVEN("A container.") { auto dummy_mb = std::make_shared(16, 3); - MeshBlockData container = createTestContainer(dummy_mb); + MeshBlockData container; + createTestContainer(dummy_mb, container); auto init_container = createLambdaContainer(container); // Make a function for initializing the container variables @@ -215,7 +212,8 @@ TEST_CASE("Catch2 Container Iterator Performance", } // GIVEN GIVEN("A container Var.") { auto dummy_mb = std::make_shared(16, 3); - MeshBlockData container = createTestContainer(dummy_mb); + MeshBlockData container; + createTestContainer(dummy_mb, container); std::vector names({"v0", "v1", "v2", "v3", "v4", "v5"}); auto init_container = createLambdaContainerVar(container, names); @@ -238,7 +236,8 @@ TEST_CASE("Catch2 Container Iterator Performance", SECTION("View of Views") { GIVEN("A container.") { auto dummy_mb = std::make_shared(16, 3); - MeshBlockData container = createTestContainer(dummy_mb); + MeshBlockData container; + createTestContainer(dummy_mb, container); WHEN("The view of views does not have any names.") { parthenon::VariablePack var_view = container.PackVariables({Metadata::Independent});